Send To Self

Recently I was handed one of the best 10GE NICs out there, a Mellanox Technologies MT26448 [ConnectX EN 10GigE, PCIe 2.0 5GT/s] (rev a0). I was asked to experiment with it and see what sort of performance was achievable. The problem was that I only had one card to play with. Since it was a dual-port card, the call was obvious: connect the two ports to each other in a loop. I already knew this wasn't going to be all that straightforward....

The machine I was going to use to test-drive the NIC, an Intel i7 rig with 6GB of RAM, was running Linux. Fedora Core 12 to be precise, not that this matters. Anyway, the Linux kernel by design will route all local traffic through itself and not the network. So when I configured the interface for each of the ports, the kernel would know those IPs were local and would therefore not send the traffic through the actual network. Not a great thing when you want to test network performance :)

At first I thought perhaps disabling the loopback interface, setting up a manual route from one of the interfaces to the other, and ensuring the ARP table had the corresponding MAC/IP pairs for each of the ports would suffice to get the actual packets out on the physical network. But Nay!!!! The kernel always knows the interface is local and sends the packets through itself rather than through the network.

A little googling led me to Julian Anastasov's site and his excellent collection of Linux kernel networking patches. One of his works includes a patch to achieve exactly what I needed: sending to yourself, but through the network. You may not have the patch for the exact version of your kernel, but the code is actually quite straightforward and quite simple to adapt to your kernel version/flavor.

diff -urp v2.6.34/linux/Documentation/networking/ip-sysctl.txt linux/Documentation/networking/ip-sysctl.txt
--- v2.6.34/linux/Documentation/networking/ip-sysctl.txt	2010-05-17 10:48:46.000000000 +0300
+++ linux/Documentation/networking/ip-sysctl.txt	2010-05-19 00:01:30.000000000 +0300
@@ -693,6 +693,13 @@ accept_redirects - BOOLEAN
forwarding - BOOLEAN
Enable IP forwarding on this interface.
+loop - BOOLEAN
+	By default (loop=0) the traffic between local IP addresses
+	is routed via interface "lo". Setting this flag for two
+	interfaces allows traffic between their IP addresses to
+	be looped externally. This is useful for setups where the
+	interfaces are attached to same broadcast medium.
+
mc_forwarding - BOOLEAN
Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE
and a multicast routing daemon is required.
diff -urp v2.6.34/linux/include/linux/inetdevice.h linux/include/linux/inetdevice.h
--- v2.6.34/linux/include/linux/inetdevice.h	2010-05-17 10:49:00.000000000 +0300
+++ linux/include/linux/inetdevice.h	2010-05-19 00:02:09.000000000 +0300
@@ -29,6 +29,7 @@ enum
IPV4_DEVCONF_NOXFRM,
IPV4_DEVCONF_NOPOLICY,
IPV4_DEVCONF_FORCE_IGMP_VERSION,
+	IPV4_DEVCONF_LOOP,
IPV4_DEVCONF_ARP_ANNOUNCE,
IPV4_DEVCONF_ARP_IGNORE,
IPV4_DEVCONF_PROMOTE_SECONDARIES,
@@ -137,6 +138,7 @@ static inline void ipv4_devconf_setall(s
IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS)))
#define IN_DEV_ARPFILTER(in_dev)	IN_DEV_ORCONF((in_dev), ARPFILTER)
+#define IN_DEV_LOOP(in_dev)		IN_DEV_CONF_GET(in_dev, LOOP)
#define IN_DEV_ARP_ANNOUNCE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)
#define IN_DEV_ARP_IGNORE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_IGNORE)
#define IN_DEV_ARP_NOTIFY(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_NOTIFY)
diff -urp v2.6.34/linux/net/ipv4/devinet.c linux/net/ipv4/devinet.c
--- v2.6.34/linux/net/ipv4/devinet.c	2010-05-17 10:49:01.000000000 +0300
+++ linux/net/ipv4/devinet.c	2010-05-19 00:02:56.000000000 +0300
@@ -1410,6 +1410,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
+		DEVINET_SYSCTL_RW_ENTRY(LOOP, "loop"),
DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
diff -urp v2.6.34/linux/net/ipv4/fib_frontend.c linux/net/ipv4/fib_frontend.c
--- v2.6.34/linux/net/ipv4/fib_frontend.c	2010-05-17 10:49:01.000000000 +0300
+++ linux/net/ipv4/fib_frontend.c	2010-05-19 00:01:30.000000000 +0300
@@ -243,6 +243,7 @@ int fib_validate_source(__be32 src, __be
struct fib_result res;
int no_addr, rpf, accept_local;
+	int loop = 0;
int ret;
struct net *net;
@@ -255,6 +256,7 @@ int fib_validate_source(__be32 src, __be
accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
if (mark && !IN_DEV_SRC_VMARK(in_dev))
fl.mark = 0;
+		loop = IN_DEV_LOOP(in_dev);
}
rcu_read_unlock();
@@ -264,6 +266,11 @@ int fib_validate_source(__be32 src, __be
net = dev_net(dev);
if (fib_lookup(net, &fl, &res))
goto last_resort;
+	if (loop && res.type == RTN_LOCAL) {
+		*spec_dst = FIB_RES_PREFSRC(res);
+		fib_res_put(&res);
+		return 0;
+	}
if (res.type != RTN_UNICAST) {
if (res.type != RTN_LOCAL || !accept_local)
goto e_inval_res;
diff -urp v2.6.34/linux/net/ipv4/route.c linux/net/ipv4/route.c
--- v2.6.34/linux/net/ipv4/route.c	2010-05-17 10:49:01.000000000 +0300
+++ linux/net/ipv4/route.c	2010-05-19 00:01:30.000000000 +0300
@@ -2609,6 +2609,11 @@ static int ip_route_output_slow(struct n
dev_put(dev_out);
goto out;	/* Wrong error code */
}
+		err = -ENETDOWN;
+		if (!(dev_out->flags&IFF_UP)) {
+			dev_put(dev_out);
+			goto out;
+		}
if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
@@ -2676,10 +2681,41 @@ static int ip_route_output_slow(struct n
free_res = 1;

 if (res.type == RTN_LOCAL) {
-		if (!fl.fl4_src)
-			fl.fl4_src = fl.fl4_dst;
+		struct in_device *in_dev;
+		__be32 src;
+
if (dev_out)
dev_put(dev_out);
+		dev_out = FIB_RES_DEV(res);
+		in_dev = in_dev_get(dev_out);
+		src = fl.fl4_src? : FIB_RES_PREFSRC(res);
+		if (in_dev && IN_DEV_LOOP(in_dev) && src) {
+			struct net_device *dev_src;
+
+			in_dev_put(in_dev);
+			in_dev = NULL;
+			dev_src = ip_dev_find(net, src);
+			if (dev_src && dev_src != dev_out &&
+			    (in_dev = in_dev_get(dev_src)) &&
+			    IN_DEV_LOOP(in_dev)) {
+				in_dev_put(in_dev);
+				dev_out = dev_src;
+				fl.fl4_src = src;
+				fl.oif = dev_out->ifindex;
+				res.type = RTN_UNICAST;
+				if (res.fi) {
+					fib_info_put(res.fi);
+					res.fi = NULL;
+				}
+				goto make_route;
+			}
+			if (dev_src)
+				dev_put(dev_src);
+		}
+		if (in_dev)
+			in_dev_put(in_dev);
+		if (!fl.fl4_src)
+			fl.fl4_src = fl.fl4_dst;
dev_out = net->loopback_dev;
dev_hold(dev_out);
fl.oif = dev_out->ifindex;

You need to apply the patch to your kernel sources. It's not a good idea to patch the original sources, so get yourself a copy of the sources you wish to patch. It's usually wise to do a dry run, to see if all hunks are successfully applied.


# patch -p1 --dry-run < patch.diff

Once the dry run succeeds (note that you may have to modify the patch slightly depending on the kernel sources you're attempting to patch), just go ahead and apply it:


# patch -p1 < patch.diff

Once that's done, recompile your kernel!! You don't really need to do much, probably your old .config should be good to go. If you're using Fedora or Ubuntu you may need to compile your kernel the Fedora/Ubuntu way.... I use Gentoo, so all I need to do is the usual


# make clean
# make oldconfig
# make menuconfig
# make && make modules_install

Then set up your kernel with grub, and reboot! You should be pretty much ready to go. Once you boot into the new kernel, you need to configure the kernel via the proc interface to send to yourself via the network. This is done as follows (for instance):


# echo 1 > /proc/sys/net/ipv4/conf/eth0/loop
# echo 1 > /proc/sys/net/ipv4/conf/eth1/loop
# ip address add 192.168.1.1 dev eth0
# ip address add 192.168.2.1 dev eth1
# ip route replace local 192.168.2.1 dev eth1 scope host src 192.168.1.1 proto kernel
# ip route replace local 192.168.1.1 dev eth0 scope host src 192.168.2.1 proto kernel

You should now be ready to evaluate your dual-port card. Beware, however: if you're trying to evaluate a 10GE card such as the Mellanox card described above, the kernel can only process so many interrupts per unit of time. If you try to both TX and RX at 10GE on the same machine, the kernel might not be able to cope with the volume of interrupts, memory copies, etc., so the performance figures will most likely be inferior to those you would obtain using two machines.

Written on August 25, 2010