Intel 82599 Linux driver Pause & Pace (PAP) Patch

Lately, while carrying out some work with the great little piece of hardware that the Intel 82599 10GE NICs are, we came across the need to actually pace connections. We could obviously negotiate different speeds: 100Mbit, 1Gbit and 10Gbit. That wasn't bad, but it didn't give us the timing granularity we wanted. We really wanted to set up paces in 1 Gbit steps from 1 to 10 Gbit. Anyway, the reason for the pacing was that we have a resource-intensive (particularly network-intensive) application which could lead to packet loss when computational resources were scarce on the receiving side. We could obviously go the leaky bucket way, which would fill at the corresponding rate, allowing us to throttle the pace. That was cool, but added to the computational cost of the transmission protocol. In fact, it was implemented, because not everyone will have an 82599-based NIC at their disposal. But the leaky bucket also had other limitations and inconveniences, e.g. timer granularity: at 5-10 Gbps speeds, when dealing with small packets (<100 bytes), it enters the nanosecond scale. Not good: sleeping precisely at that timescale is just not trivial. How about doing all of this in hardware!? Enter the Intel 82599 datasheet.

In the intro we can read that these NICs allow for pacing via TRS (transmit rate limiting), which is achieved by introducing an IPG (Inter-Packet Gap), using the MAUI interface (gotta love that for an interface name!). On pages 122 and 544 of the datasheet you will find more detailed information regarding how the pacing is achieved and the corresponding register in the hardware: the PAP register. As described in the documentation you may achieve:

1-10 Gbps granularity (0001b, 0010b, 0011b, 0100b, 0101b, 0110b, 0111b, 1000b, 1001b, 1111b, 0000b). The PAP register takes values of the form 0x000YFFFF, where Y is the rate of pacing you wish to achieve. You may have noticed that there are in fact 11 values described above: 0000b is for 10Gbps, 0001b is for 1Gbps, and they increase in steps of 1Gbps up to the value 1001b, which is 9 Gbps. And then we have one "rogue" value, 1111b, which is for 9.294 Gbps (don't ask :P ).

Well, as great as this is and sounds, it wasn't implemented in the driver. I took a quick look at ethtool's interface, but unfortunately there was no way of adding pacing to the driver via ethtool. So I had to resort to a custom ioctl, which allows adjusting the pace through the SIOCDEVPRIVATE ioctl command, with the requested rate in Gbps passed in the ifr_bandwidth field of the ifreq.

Now, without further ado, here is the ugly patch (beware of endianness!).

diff -urNp ixgbe-3.3.9//src/Makefile ixgbe-3.3.9-pap//src/Makefile
--- ixgbe-3.3.9//src/Makefile	2011-04-20 01:06:31.000000000 +0200
+++ ixgbe-3.3.9-pap//src/Makefile	2011-06-06 17:23:14.422000009 +0200
@@ -36,11 +36,13 @@ CFILES = ixgbe_main.c ixgbe_common.c ixg
          ixgbe_sriov.c ixgbe_mbx.c \
          ixgbe_dcb.c ixgbe_dcb_82598.c \
          ixgbe_dcb_82599.c \
-         ixgbe_phy.c
+         ixgbe_phy.c \
+	 ixgbe_pap.c
 HFILES = ixgbe.h ixgbe_common.h ixgbe_api.h ixgbe_osdep.h kcompat.h \
          ixgbe_sriov.h ixgbe_mbx.h \
          ixgbe_dcb.h \
-         ixgbe_phy.h
+         ixgbe_phy.h \
+	 ixgbe_pap.h
 ifeq (,$(BUILD_KERNEL))
 BUILD_KERNEL=$(shell uname -r)
 endif
@@ -146,6 +148,7 @@ ifeq ($(ARCH),ppc64)
 endif
 
 # extra flags for module builds
+EXTRA_CFLAGS += -DPAP_IOCTL
 EXTRA_CFLAGS += -DDRIVER_$(shell echo $(DRIVER_NAME) | tr '[a-z]' '[A-Z]')
 EXTRA_CFLAGS += -DDRIVER_NAME=$(DRIVER_NAME)
 EXTRA_CFLAGS += -DDRIVER_NAME_CAPS=$(shell echo $(DRIVER_NAME) | tr '[a-z]' '[A-Z]')
diff -urNp ixgbe-3.3.9//src/ixgbe_main.c ixgbe-3.3.9-pap//src/ixgbe_main.c
--- ixgbe-3.3.9//src/ixgbe_main.c	2011-04-20 01:06:31.000000000 +0200
+++ ixgbe-3.3.9-pap//src/ixgbe_main.c	2011-06-06 17:20:54.661000009 +0200
@@ -52,6 +52,9 @@
 #ifdef SIOCETHTOOL
 #include <linux/ethtool.h>
 #endif
+#ifdef PAP_IOCTL
+#include "ixgbe_pap.h"
+#endif
 
 #include "ixgbe.h"
 
@@ -7837,9 +7840,14 @@ static int ixgbe_del_sanmac_netdev(struc
  **/
 static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 {
 	switch (cmd) {
 	case SIOCETHTOOL:
 		return ethtool_ioctl(ifr);
+#ifdef PAP_IOCTL
+	case SIOCDEVPRIVATE:
+		/* Private ioctl: pace in Gbps is in ifr->ifr_bandwidth. */
+		return pap_rate_adjust_ioctl(netdev_priv(netdev), ifr);
+#endif
 	default:
 		return -EOPNOTSUPP;
 	}
diff -urNp ixgbe-3.3.9//src/ixgbe_pap.c ixgbe-3.3.9-pap//src/ixgbe_pap.c
--- ixgbe-3.3.9//src/ixgbe_pap.c	1970-01-01 01:00:00.000000000 +0100
+++ ixgbe-3.3.9-pap//src/ixgbe_pap.c	2011-06-06 17:31:42.683000010 +0200
@@ -0,0 +1,61 @@
+/*
+ *
+ *
+ *       Filename:  ixgbe_pap.c
+ *
+ *    Description:  Private ioctl handler that programs the 82599 PAP
+ *                  register to pace transmission.
+ *
+ *        Version:  1.0
+ *        Created:  06/06/11 16:42:59
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Jaime Fullaondo,
+ *        Company:  HPCN at UAM
+ *
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/netdevice.h>
+
+#include "ixgbe_pap.h"
+#include "ixgbe_type.h"
+
+/**
+ * pap_rate_adjust_ioctl - set the transmit pace from a private ioctl request
+ * @adapter: board private structure; its hw member is the device written to
+ * @ifr: interface request; ifr->ifr_bandwidth carries the pace in Gbps
+ *
+ * Accepts 1..9 for a paced link and 0 or 10 for full line rate (pace field
+ * left at zero).  The low 16 bits written are always IXGBE_PAP_REG_TEMPLATE.
+ *
+ * Returns 0 on success, -EINVAL for any other requested value.
+ **/
+int pap_rate_adjust_ioctl(struct ixgbe_adapter *adapter, struct ifreq *ifr)
+{
+	struct ixgbe_hw *hw = &adapter->hw;
+	int gbps = ifr->ifr_bandwidth;
+	u32 pap_rate = IXGBE_PAP_REG_TEMPLATE;
+
+	if (gbps < 0 || gbps > 10)
+		return -EINVAL;
+
+	/* 0 and 10 both select full line rate: the pace field stays zero. */
+	if (gbps != 0 && gbps != 10)
+		pap_rate |= (u32)gbps << IXGBE_PAP_OFFSET;
+
+	/*
+	 * NOTE(review): pap_rate is built in CPU byte order; IXGBE_WRITE_REG is
+	 * presumably responsible for any conversion - confirm in ixgbe_osdep.h.
+	 */
+	IXGBE_WRITE_REG(hw, IXGBE_PAP, pap_rate);
+	IXGBE_WRITE_FLUSH(hw); /* NOTE(review): may be unnecessary - confirm */
+
+	/* Short delay after reprogramming the register. */
+	msleep(50);
+	return 0;
+}
diff -urNp ixgbe-3.3.9//src/ixgbe_pap.h ixgbe-3.3.9-pap//src/ixgbe_pap.h
--- ixgbe-3.3.9//src/ixgbe_pap.h	1970-01-01 01:00:00.000000000 +0100
+++ ixgbe-3.3.9-pap//src/ixgbe_pap.h	2011-06-06 17:28:27.587000010 +0200
@@ -0,0 +1,31 @@
+/*
+ *
+ *
+ *       Filename:  ixgbe_pap.h
+ *
+ *    Description:  Constants and prototype for the PAP pacing ioctl helper.
+ *
+ *        Version:  1.0
+ *        Created:  06/06/11 16:38:39
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Jaime Fullaondo (),
+ *        Company:  HPCN - High Performance Computing and Networking Lab at UAM
+ *
+ *
+ */
+
+#ifndef _IXGBE_PAP_H
+#define _IXGBE_PAP_H
+
+#include <linux/if.h>
+
+#include "ixgbe.h"
+
+#define IXGBE_PAP_REG_TEMPLATE 0x0000FFFF /* base value; pace is OR-ed in */
+#define IXGBE_PAP_OFFSET 16 /* left shift applied to the pace value */
+
+int pap_rate_adjust_ioctl(struct ixgbe_adapter *adapter, struct ifreq *ifr);
+
+#endif /* _IXGBE_PAP_H  */
Written on June 6, 2011