DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH v5 0/6] Interrupt mode PMD
@ 2015-02-23 16:55 Zhou Danny
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 1/6] ethdev: add rx interrupt enable/disable functions Zhou Danny
                   ` (6 more replies)
  0 siblings, 7 replies; 242+ messages in thread
From: Zhou Danny @ 2015-02-23 16:55 UTC (permalink / raw)
  To: dev

v5 changes
- Rebase the patchset onto the HEAD
- Isolate ethdev from EAL for new-added wait-for-rx interrupt function
- Export wait-for-rx interrupt function for shared libraries
- Split-off a new patch file for changed struct rte_intr_handle that
other patches depend on, to avoid breaking git bisect
- Change sample application to accommodate EAL function spec change
accordingly

v4 changes
- Export interrupt enable/disable functions for shared libraries
- Adjust position of new-added structure fields and functions to
avoid breaking ABI
 
v3 changes
- Add return value for interrupt enable/disable functions
- Move spinlock from PMD to L3fwd-power
- Remove unnecessary variables in e1000_mac_info
- Fix miscellaneous review comments
 
v2 changes
- Fix compilation issue in Makefile for missed header file.
- Consolidate internal and community review comments of v1 patch set.
 
The patch series introduces low-latency one-shot rx interrupt into DPDK with
polling and interrupt mode switch control example.
 
DPDK userspace interrupt notification and handling mechanism is based on UIO
with below limitation:
1) It is designed to handle LSC interrupt only with inefficient suspended
pthread wakeup procedure (e.g. UIO wakes up LSC interrupt handling thread
which then wakes up DPDK polling thread). In this way, it introduces
non-deterministic wakeup latency for DPDK polling thread as well as packet
latency if it is used to handle Rx interrupt.
2) UIO only supports a single interrupt vector which has to be shared by
LSC interrupt and interrupts assigned to dedicated rx queues.
 
This patchset includes below features:
1) Enable one-shot rx queue interrupt in ixgbe PMD(PF & VF) and igb PMD(PF only).
2) Build on top of the VFIO mechanism instead of UIO, so it could support
up to 64 interrupt vectors for rx queue interrupts.
3) Have 1 DPDK polling thread handle per Rx queue interrupt with a dedicated
VFIO eventfd, which eliminates non-deterministic pthread wakeup latency in
user space.
4) Demonstrate interrupts control APIs and userspace NAPI-like polling/interrupt
switch algorithms in L3fwd-power example.

Known limitations:
1) It does not work for UIO, because a single interrupt eventfd shared by the
LSC and rx queue interrupt handlers causes a mess.
2) LSC interrupt is not supported by VF driver, so it is by default disabled
in L3fwd-power now. Feel free to turn it on if you want to support both LSC
and rx queue interrupts on a PF.

Danny Zhou (6):
  ethdev: add rx interrupt enable/disable functions
  eal: add rx queue interrupt FDs to intr handle struct
  ixgbe: enable rx queue interrupts for both PF and VF
  igb: enable rx queue interrupts for PF
  eal: add per rx queue interrupt handling based on VFIO
  l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode  
      switch

 examples/l3fwd-power/main.c                        | 155 ++++++---
 lib/librte_eal/bsdapp/eal/rte_eal_version.map      |   1 +
 lib/librte_eal/common/include/rte_eal.h            |   1 +
 lib/librte_eal/common/include/rte_interrupts.h     |  12 +
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 191 ++++++++---
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c         |  12 +-
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |   4 +
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   1 +
 lib/librte_ether/rte_ethdev.c                      |  43 +++
 lib/librte_ether/rte_ethdev.h                      |  59 ++++
 lib/librte_ether/rte_ether_version.map             |   2 +
 lib/librte_pmd_e1000/e1000_ethdev.h                |   3 +
 lib/librte_pmd_e1000/igb_ethdev.c                  | 228 +++++++++++--
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c                | 365 ++++++++++++++++++++-
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h                |   7 +
 15 files changed, 970 insertions(+), 114 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v5 1/6] ethdev: add rx interrupt enable/disable functions
  2015-02-23 16:55 [dpdk-dev] [PATCH v5 0/6] Interrupt mode PMD Zhou Danny
@ 2015-02-23 16:55 ` Zhou Danny
  2015-02-23 16:59   ` Stephen Hemminger
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 2/6] eal: add rx queue interrupt FDs to intr handle struct Zhou Danny
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 242+ messages in thread
From: Zhou Danny @ 2015-02-23 16:55 UTC (permalink / raw)
  To: dev

v5 changes
- Rebase the patchset onto the HEAD

v4 changes
- Export interrupt enable/disable functions for shared libraries
- Put new functions at the end of eth_dev_ops to avoid breaking ABI

v3 changes
- Add return value for interrupt enable/disable functions

Add two dev_ops functions to enable and disable rx queue interrupts

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Tested-by: Yong Liu <yong.liu@intel.com>
---
 lib/librte_ether/rte_ethdev.c          | 43 +++++++++++++++++++++++++
 lib/librte_ether/rte_ethdev.h          | 59 ++++++++++++++++++++++++++++++++++
 lib/librte_ether/rte_ether_version.map |  2 ++
 3 files changed, 104 insertions(+)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 27bbb0b..eaf29de 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -2830,6 +2830,49 @@ _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 	}
 	rte_spinlock_unlock(&rte_eth_dev_cb_lock);
 }
+
+int
+rte_eth_dev_rx_queue_intr_enable(uint8_t port_id,
+				uint16_t queue_id)
+{
+	struct rte_eth_dev *dev;
+
+	if (port_id >= nb_ports) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return (-ENODEV);
+	}
+
+	/*
+	 * port_id was range-checked above and &rte_eth_devices[port_id] is
+	 * the address of an array element, so it can never be NULL.
+	 */
+	dev = &rte_eth_devices[port_id];
+
+	FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_enable, -ENOTSUP);
+	return (*dev->dev_ops->rx_queue_intr_enable)(dev, queue_id);
+}
+
+int
+rte_eth_dev_rx_queue_intr_disable(uint8_t port_id,
+				uint16_t queue_id)
+{
+	struct rte_eth_dev *dev;
+
+	if (port_id >= nb_ports) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return (-ENODEV);
+	}
+
+	/*
+	 * port_id was range-checked above and &rte_eth_devices[port_id] is
+	 * the address of an array element, so it can never be NULL.
+	 */
+	dev = &rte_eth_devices[port_id];
+
+	FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_disable, -ENOTSUP);
+	return (*dev->dev_ops->rx_queue_intr_disable)(dev, queue_id);
+}
+
 #ifdef RTE_NIC_BYPASS
 int rte_eth_dev_bypass_init(uint8_t port_id)
 {
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 4acd595..7aa6c81 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -823,6 +823,8 @@ struct rte_eth_fdir {
 struct rte_intr_conf {
 	/** enable/disable lsc interrupt. 0 (default) - disable, 1 enable */
 	uint16_t lsc;
+	/** enable/disable rxq interrupt. 0 (default) - disable, 1 enable */
+	uint16_t rxq;
 };
 
 /**
@@ -1028,6 +1030,14 @@ typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
 				    const struct rte_eth_txconf *tx_conf);
 /**< @internal Setup a transmit queue of an Ethernet device. */
 
+typedef int (*eth_rx_enable_intr_t)(struct rte_eth_dev *dev,
+				    uint16_t rx_queue_id);
+/**< @internal Enable interrupt of a receive queue of an Ethernet device. */
+
+typedef int (*eth_rx_disable_intr_t)(struct rte_eth_dev *dev,
+				    uint16_t rx_queue_id);
+/**< @internal Disable interrupt of a receive queue of an Ethernet device. */
+
 typedef void (*eth_queue_release_t)(void *queue);
 /**< @internal Release memory resources allocated by given RX/TX queue. */
 
@@ -1379,6 +1389,10 @@ struct eth_dev_ops {
 	/** Get current RSS hash configuration. */
 	rss_hash_conf_get_t rss_hash_conf_get;
 	eth_filter_ctrl_t              filter_ctrl;          /**< common filter control*/
+
+	/** Enable/disable Rx queue interrupt. */
+	eth_rx_enable_intr_t       rx_queue_intr_enable; /**< Enable Rx queue interrupt. */
+	eth_rx_disable_intr_t      rx_queue_intr_disable; /**< Disable Rx queue interrupt.*/
 };
 
 /**
@@ -2672,6 +2686,51 @@ void _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 				enum rte_eth_event_type event);
 
 /**
+ * When there is no rx packet coming in Rx Queue for a long time, we can
+ * sleep lcore related to RX Queue for power saving, and enable rx interrupt
+ * to be triggered when rx packet arrives.
+ *
+ * The rte_eth_dev_rx_queue_intr_enable() function enables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue whose interrupt is to be enabled.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ *     that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_queue_intr_enable(uint8_t port_id,
+				uint16_t queue_id);
+
+/**
+ * When the lcore is woken up by an rx interrupt indicating packet arrival,
+ * disable the rx interrupt and return to polling mode.
+ *
+ * The rte_eth_dev_rx_queue_intr_disable() function disables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue whose interrupt is to be disabled.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ *     that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_queue_intr_disable(uint8_t port_id,
+				uint16_t queue_id);
+
+/**
  * Turn on the LED on the Ethernet device.
  * This function turns on the LED on the Ethernet device.
  *
diff --git a/lib/librte_ether/rte_ether_version.map b/lib/librte_ether/rte_ether_version.map
index f66fd2d..6fef09e 100644
--- a/lib/librte_ether/rte_ether_version.map
+++ b/lib/librte_ether/rte_ether_version.map
@@ -42,6 +42,8 @@ DPDK_2.0 {
 	rte_eth_dev_rss_hash_update;
 	rte_eth_dev_rss_reta_query;
 	rte_eth_dev_rss_reta_update;
+	rte_eth_dev_rx_queue_intr_disable;
+	rte_eth_dev_rx_queue_intr_enable;
 	rte_eth_dev_rx_queue_start;
 	rte_eth_dev_rx_queue_stop;
 	rte_eth_dev_set_link_down;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v5 2/6] eal: add rx queue interrupt FDs to intr handle struct
  2015-02-23 16:55 [dpdk-dev] [PATCH v5 0/6] Interrupt mode PMD Zhou Danny
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 1/6] ethdev: add rx interrupt enable/disable functions Zhou Danny
@ 2015-02-23 16:55 ` Zhou Danny
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 3/6] ixgbe: enable rx queue interrupts for both PF and VF Zhou Danny
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 242+ messages in thread
From: Zhou Danny @ 2015-02-23 16:55 UTC (permalink / raw)
  To: dev

v5 changes:
- Create this new patch file for changed struct rte_intr_handle that
other patches depend on, to avoid breaking git bisect.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Tested-by: Yong Liu <yong.liu@intel.com>
---
 lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 6a159c7..9924124 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,8 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_
 
+#define VFIO_MAX_QUEUE_ID 32
+
 enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_UNKNOWN = 0,
 	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
@@ -57,6 +59,8 @@ struct rte_intr_handle {
 	};
 	int fd;	 /**< interrupt event file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+	int max_intr;                    /**< max interrupt requested */
+	int queue_fd[VFIO_MAX_QUEUE_ID]; /**< rx and tx queue interrupt file descriptor */
 };
 
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v5 3/6] ixgbe: enable rx queue interrupts for both PF and VF
  2015-02-23 16:55 [dpdk-dev] [PATCH v5 0/6] Interrupt mode PMD Zhou Danny
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 1/6] ethdev: add rx interrupt enable/disable functions Zhou Danny
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 2/6] eal: add rx queue interrupt FDs to intr handle struct Zhou Danny
@ 2015-02-23 16:55 ` Zhou Danny
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 4/6] igb: enable rx queue interrupts for PF Zhou Danny
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 242+ messages in thread
From: Zhou Danny @ 2015-02-23 16:55 UTC (permalink / raw)
  To: dev

v5 changes
- Rebase the patchset onto the HEAD

v3 changes
- Remove spinlock from PMD

v2 changes
- Consolidate review comments related to coding style

The patch does below things for ixgbe PF and VF:
- Setup NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Yong Liu <yong.liu@intel.com>
Tested-by: Yong Liu <yong.liu@intel.com>
---
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c | 365 +++++++++++++++++++++++++++++++++++-
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h |   7 +
 2 files changed, 368 insertions(+), 4 deletions(-)

diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
index 6e7a1ab..9de9cbf 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
@@ -83,6 +83,9 @@
  */
 #define IXGBE_FC_LO    0x40
 
+/* Default minimum inter-interrupt interval for EITR configuration */
+#define IXGBE_MIN_INTER_INTERRUPT_INTERVAL_DEFAULT    0x79E
+
 /* Timer value included in XOFF frames. */
 #define IXGBE_FC_PAUSE 0x680
 
@@ -173,6 +176,7 @@ static int ixgbe_dev_rss_reta_query(struct rte_eth_dev *dev,
 			uint16_t reta_size);
 static void ixgbe_dev_link_status_print(struct rte_eth_dev *dev);
 static int ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_action(struct rte_eth_dev *dev);
 static void ixgbe_dev_interrupt_handler(struct rte_intr_handle *handle,
@@ -186,11 +190,14 @@ static void ixgbe_dcb_init(struct ixgbe_hw *hw,struct ixgbe_dcb_config *dcb_conf
 /* For Virtual Function support */
 static int eth_ixgbevf_dev_init(struct eth_driver *eth_drv,
 		struct rte_eth_dev *eth_dev);
+static int ixgbevf_dev_interrupt_get_status(struct rte_eth_dev *dev);
+static int ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev);
 static int  ixgbevf_dev_configure(struct rte_eth_dev *dev);
 static int  ixgbevf_dev_start(struct rte_eth_dev *dev);
 static void ixgbevf_dev_stop(struct rte_eth_dev *dev);
 static void ixgbevf_dev_close(struct rte_eth_dev *dev);
 static void ixgbevf_intr_disable(struct ixgbe_hw *hw);
+static void ixgbevf_intr_enable(struct ixgbe_hw *hw);
 static void ixgbevf_dev_stats_get(struct rte_eth_dev *dev,
 		struct rte_eth_stats *stats);
 static void ixgbevf_dev_stats_reset(struct rte_eth_dev *dev);
@@ -200,6 +207,15 @@ static void ixgbevf_vlan_strip_queue_set(struct rte_eth_dev *dev,
 		uint16_t queue, int on);
 static void ixgbevf_vlan_offload_set(struct rte_eth_dev *dev, int mask);
 static void ixgbevf_set_vfta_all(struct rte_eth_dev *dev, bool on);
+static void ixgbevf_dev_interrupt_handler(struct rte_intr_handle *handle,
+		void *param);
+static int ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
+		uint16_t queue_id);
+static int ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
+		 uint16_t queue_id);
+static void ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+		 uint8_t queue, uint8_t msix_vector);
+static void ixgbevf_configure_msix(struct  ixgbe_hw *hw);
 
 /* For Eth VMDQ APIs support */
 static int ixgbe_uc_hash_table_set(struct rte_eth_dev *dev, struct
@@ -217,6 +233,14 @@ static int ixgbe_mirror_rule_set(struct rte_eth_dev *dev,
 static int ixgbe_mirror_rule_reset(struct rte_eth_dev *dev,
 		uint8_t	rule_id);
 
+static int ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static int ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static void ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void ixgbe_configure_msix(struct  ixgbe_hw *hw);
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 		uint16_t queue_idx, uint16_t tx_rate);
 static int ixgbe_set_vf_rate_limit(struct rte_eth_dev *dev, uint16_t vf,
@@ -265,7 +289,7 @@ static int ixgbevf_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu);
  */
 #define UPDATE_VF_STAT(reg, last, cur)	                        \
 {                                                               \
-	u32 latest = IXGBE_READ_REG(hw, reg);                   \
+	uint32_t latest = IXGBE_READ_REG(hw, reg);                   \
 	cur += latest - last;                                   \
 	last = latest;                                          \
 }
@@ -346,6 +370,8 @@ static struct eth_dev_ops ixgbe_eth_dev_ops = {
 	.tx_queue_start	      = ixgbe_dev_tx_queue_start,
 	.tx_queue_stop        = ixgbe_dev_tx_queue_stop,
 	.rx_queue_setup       = ixgbe_dev_rx_queue_setup,
+	.rx_queue_intr_enable = ixgbe_dev_rx_queue_intr_enable,
+	.rx_queue_intr_disable = ixgbe_dev_rx_queue_intr_disable,
 	.rx_queue_release     = ixgbe_dev_rx_queue_release,
 	.rx_queue_count       = ixgbe_dev_rx_queue_count,
 	.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
@@ -406,8 +432,11 @@ static struct eth_dev_ops ixgbevf_eth_dev_ops = {
 	.vlan_offload_set     = ixgbevf_vlan_offload_set,
 	.rx_queue_setup       = ixgbe_dev_rx_queue_setup,
 	.rx_queue_release     = ixgbe_dev_rx_queue_release,
+	.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
 	.tx_queue_setup       = ixgbe_dev_tx_queue_setup,
 	.tx_queue_release     = ixgbe_dev_tx_queue_release,
+	.rx_queue_intr_enable = ixgbevf_dev_rx_queue_intr_enable,
+	.rx_queue_intr_disable = ixgbevf_dev_rx_queue_intr_disable,
 	.mac_addr_add         = ixgbevf_add_mac_addr,
 	.mac_addr_remove      = ixgbevf_remove_mac_addr,
 };
@@ -904,6 +933,10 @@ eth_ixgbe_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
 			eth_dev->data->port_id, pci_dev->id.vendor_id,
 			pci_dev->id.device_id);
 
+	/* set max interrupt vfio request */
+	pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
+						IXGBE_MAX_OTHER_INTR;
+
 	rte_intr_callback_register(&(pci_dev->intr_handle),
 		ixgbe_dev_interrupt_handler, (void *)eth_dev);
 
@@ -1085,6 +1118,15 @@ eth_ixgbevf_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
 			return (-EIO);
 	}
 
+	/* set max interrupt vfio request */
+	pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
+						IXGBEVF_MAX_OTHER_INTR;
+
+	rte_intr_callback_register(&(pci_dev->intr_handle),
+		ixgbevf_dev_interrupt_handler, (void *)eth_dev);
+
+	rte_intr_enable(&(pci_dev->intr_handle));
+
 	PMD_INIT_LOG(DEBUG, "port %d vendorID=0x%x deviceID=0x%x mac.type=%s",
 		     eth_dev->data->port_id, pci_dev->id.vendor_id,
 		     pci_dev->id.device_id, "ixgbe_mac_82599_vf");
@@ -1486,6 +1528,9 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	ixgbe_pf_host_configure(dev);
 
+	/* configure msix for sleep until rx interrupt */
+	ixgbe_configure_msix(hw);
+
 	/* initialize transmission unit */
 	ixgbe_dev_tx_init(dev);
 
@@ -1561,6 +1606,10 @@ skip_link_setup:
 	if (dev->data->dev_conf.intr_conf.lsc != 0)
 		ixgbe_dev_lsc_interrupt_setup(dev);
 
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		ixgbe_dev_rxq_interrupt_setup(dev);
+
 	/* resume enabled intr since hw reset */
 	ixgbe_enable_intr(dev);
 
@@ -2238,6 +2287,28 @@ ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev)
 	return 0;
 }
 
+/**
+ * Add the queue interrupt cause bits (IXGBE_EICR_RTX_QUEUE) to the
+ * interrupt mask kept in the device private data (intr->mask).
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - Zero, always: only the stored mask is updated here, so there is
+ *    no failure path.
+ */
+static int
+ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	struct ixgbe_interrupt *intr;
+
+	intr = IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+	intr->mask |= IXGBE_EICR_RTX_QUEUE;
+
+	return 0;
+}
+
 /*
  * It reads ICR and sets flag (IXGBE_EICR_LSC) for the link_update.
  *
@@ -2264,10 +2335,10 @@ ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev)
 	PMD_DRV_LOG(INFO, "eicr %x", eicr);
 
 	intr->flags = 0;
-	if (eicr & IXGBE_EICR_LSC) {
-		/* set flag for async link update */
+
+	/* set flag for async link update */
+	if (eicr & IXGBE_EICR_LSC)
 		intr->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
-	}
 
 	if (eicr & IXGBE_EICR_MAILBOX)
 		intr->flags |= IXGBE_FLAG_MAILBOX;
@@ -2275,6 +2346,30 @@ ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int
+ixgbevf_dev_interrupt_get_status(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+	uint32_t cause;
+
+	/* disable the VF interrupts before looking at the cause register */
+	ixgbevf_intr_disable(hw);
+
+	/* fetch the interrupt causes reported in VTEICR */
+	cause = IXGBE_READ_REG(hw, IXGBE_VTEICR);
+	PMD_DRV_LOG(INFO, "eicr %x", cause);
+
+	/* flags are rebuilt from scratch: only a link update can be pending */
+	if (cause & IXGBE_EICR_LSC)
+		intr->flags = IXGBE_FLAG_NEED_LINK_UPDATE;
+	else
+		intr->flags = 0;
+
+	return 0;
+}
+
 /**
  * It gets and then prints the link status.
  *
@@ -2370,6 +2465,18 @@ ixgbe_dev_interrupt_action(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int
+ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev)
+{
+	struct rte_intr_handle *handle = &dev->pci_dev->intr_handle;
+
+	/* re-enable the VF interrupts first, then the interrupt handle */
+	PMD_DRV_LOG(DEBUG, "enable intr immediately");
+	ixgbevf_intr_enable(IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private));
+	rte_intr_enable(handle);
+	return 0;
+}
+
 /**
  * Interrupt handler which shall be registered for alarm callback for delayed
  * handling specific interrupt to wait for the stable nic state. As the
@@ -2431,6 +2538,15 @@ ixgbe_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
 	ixgbe_dev_interrupt_action(dev);
 }
 
+static void
+ixgbevf_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
+							void *param)
+{
+	struct rte_eth_dev *eth_dev = param; /* callback argument: the port */
+	ixgbevf_dev_interrupt_get_status(eth_dev);
+	ixgbevf_dev_interrupt_action(eth_dev);
+}
+
 static int
 ixgbe_dev_led_on(struct rte_eth_dev *dev)
 {
@@ -2929,6 +3045,19 @@ ixgbevf_intr_disable(struct ixgbe_hw *hw)
 	IXGBE_WRITE_FLUSH(hw);
 }
 
+static void
+ixgbevf_intr_enable(struct ixgbe_hw *hw)
+{
+	const uint32_t irq_mask = IXGBE_VF_IRQ_ENABLE_MASK;
+
+	PMD_INIT_FUNC_TRACE();
+	/* write the enable mask to VTEIAM, VTEIAC and VTEIMS, in that order */
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIAM, irq_mask);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIAC, irq_mask);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, irq_mask);
+	IXGBE_WRITE_FLUSH(hw);
+}
+
 static int
 ixgbevf_dev_configure(struct rte_eth_dev *dev)
 {
@@ -2991,6 +3120,11 @@ ixgbevf_dev_start(struct rte_eth_dev *dev)
 
 	ixgbevf_dev_rxtx_start(dev);
 
+	ixgbevf_configure_msix(hw);
+
+	/* Re-enable interrupt for VF */
+	ixgbevf_intr_enable(hw);
+
 	return 0;
 }
 
@@ -3528,6 +3662,229 @@ ixgbe_mirror_rule_reset(struct rte_eth_dev *dev, uint8_t rule_id)
 	return 0;
 }
 
+
+static int
+ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t eims = IXGBE_READ_REG(hw, IXGBE_VTEIMS);
+
+	/* read-modify-write of VTEIMS: set the bit numbered queue_id */
+	eims |= (1 << queue_id);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, eims);
+
+	return 0;
+}
+
+static int
+ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t eims = IXGBE_READ_REG(hw, IXGBE_VTEIMS);
+
+	/* read-modify-write of VTEIMS: clear the bit numbered queue_id */
+	eims &= ~(1 << queue_id);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, eims);
+
+	return 0;
+}
+
+static int
+ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	if (queue_id < 16) {	/* queues 0-15: bit is kept in intr->mask */
+		ixgbe_disable_intr(hw);
+		intr->mask |= (1 << queue_id);
+		ixgbe_enable_intr(dev);
+	} else if (queue_id < 32) {	/* queues 16-31: bit in EIMS_EX(0) */
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(0));
+		mask |= (1U << queue_id);	/* OR in: keep other queues' bits */
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+	} else if (queue_id < 64) {	/* queues 32-63: bit in EIMS_EX(1) */
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(1));
+		mask |= (1U << (queue_id - 32));
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+	}
+
+	return 0;
+}
+
+static int
+ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+	uint32_t eims;
+	uint32_t reg;
+
+	if (queue_id < 16) {
+		/* queues 0-15: the bit is kept in intr->mask */
+		ixgbe_disable_intr(hw);
+		intr->mask &= ~(1 << queue_id);
+		ixgbe_enable_intr(dev);
+	} else if (queue_id < 64) {
+		/* queues 16-31 use EIMS_EX(0), queues 32-63 use EIMS_EX(1) */
+		reg = queue_id / 32;
+		eims = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(reg));
+		eims &= ~(1 << (queue_id % 32));
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(reg), eims);
+	}
+
+	return 0;
+}
+
+static void
+ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+			uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t ivar, shift;
+
+	/* every entry is written with IXGBE_IVAR_ALLOC_VAL set */
+	msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+	if (direction == -1) {
+		/* other causes: low byte of VTIVAR_MISC */
+		ivar = IXGBE_READ_REG(hw, IXGBE_VTIVAR_MISC);
+		ivar = (ivar & ~0xFF) | msix_vector;
+		IXGBE_WRITE_REG(hw, IXGBE_VTIVAR_MISC, ivar);
+		return;
+	}
+
+	/* rx or tx cause: one byte per (queue, direction) in VTIVAR */
+	shift = ((16 * (queue & 1)) + (8 * direction));
+	ivar = IXGBE_READ_REG(hw, IXGBE_VTIVAR(queue >> 1));
+	ivar = (ivar & ~(0xFF << shift)) | (msix_vector << shift);
+	IXGBE_WRITE_REG(hw, IXGBE_VTIVAR(queue >> 1), ivar);
+}
+
+/**
+ * set the IVAR registers, mapping interrupt causes to vectors
+ * @param hw
+ *  pointer to ixgbe_hw struct
+ * @param direction
+ *  0 for Rx, 1 for Tx, -1 for other causes
+ * @param queue
+ *  queue to map the corresponding interrupt to
+ * @param msix_vector
+ *  the vector to map to the corresponding queue
+ */
+static void
+ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+			   uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t ivar, idx, shift, reg;
+
+	msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+	if (hw->mac.type == ixgbe_mac_82598EB) {
+		if (direction == -1)
+			direction = 0;
+		idx = (((direction * 64) + queue) >> 2) & 0x1F;
+		shift = 8 * (queue & 0x3);
+		ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(idx));
+		ivar = (ivar & ~(0xFF << shift)) | (msix_vector << shift);
+		IXGBE_WRITE_REG(hw, IXGBE_IVAR(idx), ivar);
+		return;
+	}
+
+	if ((hw->mac.type != ixgbe_mac_82599EB) &&
+			(hw->mac.type != ixgbe_mac_X540))
+		return;
+	if (direction == -1) {
+		/* other causes */
+		reg = IXGBE_IVAR_MISC;
+		shift = ((queue & 1) * 8);
+	} else {
+		/* rx or tx causes */
+		reg = IXGBE_IVAR(queue >> 1);
+		shift = ((16 * (queue & 1)) + (8 * direction));
+	}
+	ivar = IXGBE_READ_REG(hw, reg);
+	ivar = (ivar & ~(0xFF << shift)) | (msix_vector << shift);
+	IXGBE_WRITE_REG(hw, reg, ivar);
+}
+
+static void
+ixgbevf_configure_msix(struct ixgbe_hw *hw)
+{
+	uint32_t q_idx, vector_idx;
+	/* Map VF Rx queues to vectors; NOTE(review): last queue is skipped? */
+	for (vector_idx = 0; vector_idx < IXGBE_VF_MAXMSIVECTOR; vector_idx++) {
+		for (q_idx = 0; q_idx < (hw->mac.max_rx_queues - 1); q_idx++)
+			ixgbevf_set_ivar_map(hw, 0, q_idx, vector_idx);
+	}
+
+	/* Map the other-causes entry (direction -1) to the following vector */
+	ixgbevf_set_ivar_map(hw, -1, 1, vector_idx);
+}
+
+/**
+ * Sets up the hardware to properly generate MSI-X interrupts
+ * @param hw
+ *  pointer to ixgbe_hw struct
+ */
+static void
+ixgbe_configure_msix(struct ixgbe_hw *hw)
+{
+	int queue_id;
+	uint32_t mask;
+	uint32_t gpie;
+
+	/* setup GPIE for MSI-x mode */
+	gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
+	gpie |= IXGBE_GPIE_MSIX_MODE | IXGBE_GPIE_PBA_SUPPORT |
+		   IXGBE_GPIE_OCD | IXGBE_GPIE_EIAME;
+	/*
+	* auto clearing and auto setting corresponding bits in EIMS
+	* when MSI-X interrupt is triggered
+	*/
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE);
+	else {
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(0), 0xFFFFFFFF);
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(1), 0xFFFFFFFF);
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie);
+
+	/*
+	* Populate the IVAR table (Rx queue N -> vector N). The loop leaves
+	* queue_id == VFIO_MAX_QUEUE_ID, which selects the EITR written below.
+	*/
+	for (queue_id = 0; queue_id < VFIO_MAX_QUEUE_ID; queue_id++)
+		ixgbe_set_ivar_map(hw, 0, queue_id, queue_id);
+
+	switch (hw->mac.type) {
+	case ixgbe_mac_82598EB:
+		ixgbe_set_ivar_map(hw, -1, IXGBE_IVAR_OTHER_CAUSES_INDEX,
+			       VFIO_MAX_QUEUE_ID);
+		break;
+	case ixgbe_mac_82599EB:
+	case ixgbe_mac_X540:
+		ixgbe_set_ivar_map(hw, -1, 1, VFIO_MAX_QUEUE_ID);
+		break;
+	default:
+		break;
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_EITR(queue_id),
+			 IXGBE_MIN_INTER_INTERRUPT_INTERVAL_DEFAULT & 0xFFF);
+
+	/* set up to autoclear timer, and the vectors */
+	mask = IXGBE_EIMS_ENABLE_MASK;
+	mask &= ~(IXGBE_EIMS_OTHER |
+		  IXGBE_EIMS_MAILBOX |
+		  IXGBE_EIMS_LSC);
+
+	IXGBE_WRITE_REG(hw, IXGBE_EIAC, mask);
+}
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 	uint16_t queue_idx, uint16_t tx_rate)
 {
diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.h b/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
index a549f5c..1bdfbce 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
@@ -109,6 +109,12 @@
 	ETH_RSS_IPV6_TCP_EX | \
 	ETH_RSS_IPV6_UDP_EX)
 
+#define IXGBE_VF_IRQ_ENABLE_MASK        3          /* vf irq enable mask */
+#define IXGBE_VF_MAXMSIVECTOR           1
+/* maximum other interrupts besides rx&tx*/
+#define IXGBE_MAX_OTHER_INTR            1
+#define IXGBEVF_MAX_OTHER_INTR          1
+
 /*
  * Information about the fdir mode.
  */
@@ -317,6 +323,7 @@ uint32_t ixgbe_dev_rx_queue_count(struct rte_eth_dev *dev,
 		uint16_t rx_queue_id);
 
 int ixgbe_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);
+int ixgbevf_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);
 
 int ixgbe_dev_rx_init(struct rte_eth_dev *dev);
 
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v5 4/6] igb: enable rx queue interrupts for PF
  2015-02-23 16:55 [dpdk-dev] [PATCH v5 0/6] Interrupt mode PMD Zhou Danny
                   ` (2 preceding siblings ...)
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 3/6] ixgbe: enable rx queue interrupts for both PF and VF Zhou Danny
@ 2015-02-23 16:55 ` Zhou Danny
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO Zhou Danny
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 242+ messages in thread
From: Zhou Danny @ 2015-02-23 16:55 UTC (permalink / raw)
  To: dev

v5 changes
- Rebase the patchset onto the HEAD

v3 changes
- Remove unnecessary variables in e1000_mac_info
- Remove spinlock from PMD

v2 changes
- Consolidate review comments related to coding style

This patch does the following for igb PF:
- Setup NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Tested-by: Yong Liu <yong.liu@intel.com>
---
 lib/librte_pmd_e1000/e1000_ethdev.h |   3 +
 lib/librte_pmd_e1000/igb_ethdev.c   | 228 ++++++++++++++++++++++++++++++++----
 2 files changed, 206 insertions(+), 25 deletions(-)

diff --git a/lib/librte_pmd_e1000/e1000_ethdev.h b/lib/librte_pmd_e1000/e1000_ethdev.h
index c451faa..13c4cad 100644
--- a/lib/librte_pmd_e1000/e1000_ethdev.h
+++ b/lib/librte_pmd_e1000/e1000_ethdev.h
@@ -108,6 +108,9 @@
 	ETH_RSS_IPV6_TCP_EX | \
 	ETH_RSS_IPV6_UDP_EX)
 
+/* maximum number of other interrupts besides Rx & Tx interrupts */
+#define E1000_MAX_OTHER_INTR		1
+
 /* structure for interrupt relative data */
 struct e1000_interrupt {
 	uint32_t flags;
diff --git a/lib/librte_pmd_e1000/igb_ethdev.c b/lib/librte_pmd_e1000/igb_ethdev.c
index 504ae74..1eb6fbd 100644
--- a/lib/librte_pmd_e1000/igb_ethdev.c
+++ b/lib/librte_pmd_e1000/igb_ethdev.c
@@ -97,6 +97,7 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
 static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
 				struct rte_eth_fc_conf *fc_conf);
 static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
 static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
@@ -195,6 +196,16 @@ static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
 		     enum rte_filter_op filter_op,
 		     void *arg);
 
+static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void eth_igb_configure_msix_intr(struct rte_eth_dev *dev);
+static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
+				uint8_t index, uint8_t offset);
+
 /*
  * Define VF Stats MACRO for Non "cleared on read" register
  */
@@ -254,6 +265,8 @@ static struct eth_dev_ops eth_igb_ops = {
 	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
 	.vlan_offload_set     = eth_igb_vlan_offload_set,
 	.rx_queue_setup       = eth_igb_rx_queue_setup,
+	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
+	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
 	.rx_queue_release     = eth_igb_rx_queue_release,
 	.rx_queue_count       = eth_igb_rx_queue_count,
 	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
@@ -465,6 +478,7 @@ eth_igb_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(eth_dev->data->dev_private);
 	uint32_t ctrl_ext;
+	struct rte_eth_dev_info dev_info;
 
 	pci_dev = eth_dev->pci_dev;
 	eth_dev->dev_ops = &eth_igb_ops;
@@ -586,6 +600,13 @@ eth_igb_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
 		     eth_dev->data->port_id, pci_dev->id.vendor_id,
 		     pci_dev->id.device_id);
 
+	/* set the maximum number of interrupts to request through VFIO */
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(eth_dev, &dev_info);
+
+	pci_dev->intr_handle.max_intr = dev_info.max_rx_queues +
+						E1000_MAX_OTHER_INTR;
+
 	rte_intr_callback_register(&(pci_dev->intr_handle),
 		eth_igb_interrupt_handler, (void *)eth_dev);
 
@@ -755,7 +776,7 @@ eth_igb_start(struct rte_eth_dev *dev)
 {
 	struct e1000_hw *hw =
 		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	int ret, i, mask;
+	int ret, mask;
 	uint32_t ctrl_ext;
 
 	PMD_INIT_FUNC_TRACE();
@@ -795,6 +816,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	igb_pf_host_configure(dev);
 
+	/* configure MSI-X for rx interrupt */
+	eth_igb_configure_msix_intr(dev);
+
 	/* Configure for OS presence */
 	igb_init_manageability(hw);
 
@@ -822,33 +846,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 		igb_vmdq_vlan_hw_filter_enable(dev);
 	}
 
-	/*
-	 * Configure the Interrupt Moderation register (EITR) with the maximum
-	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
-	 * spurious [DMA] memory updates of RX and TX ring descriptors.
-	 *
-	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
-	 * spurious memory updates per second should be expected.
-	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
-	 *
-	 * Because interrupts are not used at all, the MSI-X is not activated
-	 * and interrupt moderation is controlled by EITR[0].
-	 *
-	 * Note that having [almost] disabled memory updates of RX and TX ring
-	 * descriptors through the Interrupt Moderation mechanism, memory
-	 * updates of ring descriptors are now moderated by the configurable
-	 * value of Write-Back Threshold registers.
-	 */
 	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580) ||
 		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210) ||
 		(hw->mac.type == e1000_i211)) {
-		uint32_t ivar;
-
-		/* Enable all RX & TX queues in the IVAR registers */
-		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) | E1000_IVAR_VALID);
-		for (i = 0; i < 8; i++)
-			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
-
 		/* Configure EITR with the maximum possible value (0xFFFF) */
 		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
 	}
@@ -902,6 +902,10 @@ eth_igb_start(struct rte_eth_dev *dev)
 	if (dev->data->dev_conf.intr_conf.lsc != 0)
 		ret = eth_igb_lsc_interrupt_setup(dev);
 
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		eth_igb_rxq_interrupt_setup(dev);
+
 	/* resume enabled intr since hw reset */
 	igb_intr_enable(dev);
 
@@ -1828,6 +1832,34 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
 }
 
 /*
+ * It enables the interrupts of the rx queues by setting their EIMS bits.
+ * It is called from eth_igb_start() when rx queue interrupts are configured.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	uint32_t mask, regval;
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_eth_dev_info dev_info;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(dev, &dev_info);
+
+	mask = 0xFFFFFFFF >> (32 - dev_info.max_rx_queues);
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
+
+	return 0;
+}
+
+/*
  * It reads ICR and gets interrupt causes, check it and set a bit flag
  * to update link status.
  *
@@ -3652,5 +3684,151 @@ static struct rte_driver pmd_igbvf_drv = {
 	.init = rte_igbvf_pmd_init,
 };
 
+static int
+eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t mask = 1 << queue_id;
+
+	E1000_WRITE_REG(hw, E1000_EIMC, mask);
+	E1000_WRITE_FLUSH(hw);
+
+	return 0;
+}
+
+static int
+eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t mask = 1 << queue_id;
+	uint32_t regval;
+
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
+	E1000_WRITE_FLUSH(hw);
+
+	return 0;
+}
+
+static void
+eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
+			uint8_t index, uint8_t offset)
+{
+	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
+
+	/* clear bits */
+	val &= ~((uint32_t)0xFF << offset);
+
+	/* write vector and valid bit */
+	val |= (msix_vector | E1000_IVAR_VALID) << offset;
+
+	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
+}
+
+static void
+eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				 uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp = 0;
+	if (hw->mac.type == e1000_82575) {
+		if (direction == 0)
+			tmp = E1000_EICR_RX_QUEUE0 << queue;
+		else if (direction == 1)
+			tmp = E1000_EICR_TX_QUEUE0 << queue;
+		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), tmp);
+	} else if (hw->mac.type == e1000_82576) {
+		if ((direction == 0) || (direction == 1))
+			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
+					((queue & 0x8) << 1) + 8 * direction);
+	} else if ((hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		if ((direction == 0) || (direction == 1))
+			eth_igb_write_ivar(hw, msix_vector,
+					queue >> 1,
+					((queue & 0x1) << 4) + 8 * direction);
+	}
+}
+
+/*
+ * Sets up the hardware to generate MSI-X interrupts properly
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ */
+static void
+eth_igb_configure_msix_intr(struct rte_eth_dev *dev)
+{
+	int queue_id;
+	uint32_t tmpval, regval, intr_mask;
+	uint32_t max_rx_queues;
+	struct rte_eth_dev_info dev_info;
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(dev, &dev_info);
+	max_rx_queues = dev_info.max_rx_queues;
+
+	/* set interrupt vector for other causes */
+	if (hw->mac.type == e1000_82575) {
+		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
+		/* enable MSI-X PBA support */
+		tmpval |= E1000_CTRL_EXT_PBA_CLR;
+
+		/* Auto-Mask interrupts upon ICR read */
+		tmpval |= E1000_CTRL_EXT_EIAME;
+		tmpval |= E1000_CTRL_EXT_IRCA;
+
+		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
+
+		/* enable msix_other interrupt */
+		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0, E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAM);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | E1000_EIMS_OTHER);
+	} else if ((hw->mac.type == e1000_82576) ||
+			(hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		/* turn on MSI-X capability first */
+		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
+					E1000_GPIE_PBA | E1000_GPIE_EIAME |
+					E1000_GPIE_NSICR);
+
+		/* enable msix_other interrupt */
+		intr_mask = 1 << max_rx_queues;
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+		regval = E1000_READ_REG(hw, E1000_EIMS);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
+		tmpval = (max_rx_queues | E1000_IVAR_VALID) << 8;
+
+		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
+	}
+
+	/*
+	* use EIAM and EIAC to auto-mask and auto-clear when MSI-X interrupt
+	* is asserted, this saves a register write for every interrupt
+	*/
+	intr_mask = 0xFFFFFFFF >> (32 - max_rx_queues);
+	regval = E1000_READ_REG(hw, E1000_EIAC);
+	E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+	regval = E1000_READ_REG(hw, E1000_EIAM);
+	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
+
+	for (queue_id = 0; queue_id < VFIO_MAX_QUEUE_ID; queue_id++)
+		eth_igb_assign_msix_vector(hw, 0, queue_id, queue_id);
+
+	E1000_WRITE_FLUSH(hw);
+}
+
+
 PMD_REGISTER_DRIVER(pmd_igb_drv);
 PMD_REGISTER_DRIVER(pmd_igbvf_drv);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO
  2015-02-23 16:55 [dpdk-dev] [PATCH v5 0/6] Interrupt mode PMD Zhou Danny
                   ` (3 preceding siblings ...)
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 4/6] igb: enable rx queue interrupts for PF Zhou Danny
@ 2015-02-23 16:55 ` Zhou Danny
  2015-02-24 10:42   ` David Marchand
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 6/6] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Zhou Danny
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
  6 siblings, 1 reply; 242+ messages in thread
From: Zhou Danny @ 2015-02-23 16:55 UTC (permalink / raw)
  To: dev

v5 changes
- Rebase the patchset onto the HEAD
- Isolate ethdev from EAL for new-added wait-for-rx interrupt function
- Export wait-for-rx interrupt function for shared libraries

v4 changes:
- Adjust position of new-added structure fields

v3 changes:
- Fix review comments

v2 changes:
- Fix compilation issue for a missed header file
- Bug fix: free unreleased resources on the exception path before return
- Consolidate coding style related review comments

This patch does the following:
- Create multiple VFIO eventfds for rx queues.
- Handle per rx queue interrupt.
- Eliminate the unnecessary wakeup mechanism for a suspended DPDK polling
thread on rx interrupt by allowing the polling thread to epoll_wait
directly on the rx queue interrupt notification.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Tested-by: Yong Liu <yong.liu@intel.com>
---
 lib/librte_eal/bsdapp/eal/rte_eal_version.map   |   1 +
 lib/librte_eal/common/include/rte_eal.h         |   1 +
 lib/librte_eal/common/include/rte_interrupts.h  |  12 ++
 lib/librte_eal/linuxapp/eal/eal_interrupts.c    | 191 ++++++++++++++++++------
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c      |  12 +-
 lib/librte_eal/linuxapp/eal/rte_eal_version.map |   1 +
 6 files changed, 174 insertions(+), 44 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
index 5ed6e4d..dd300ea 100644
--- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
@@ -57,6 +57,7 @@ DPDK_2.0 {
 	rte_intr_callback_unregister;
 	rte_intr_disable;
 	rte_intr_enable;
+	rte_intr_wait_rx_pkt;
 	rte_log;
 	rte_log_add_in_history;
 	rte_log_cur_msg_loglevel;
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index f4ecd2e..b9d5230 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -150,6 +150,7 @@ int rte_eal_iopl_init(void);
  *   - On failure, a negative error value.
  */
 int rte_eal_init(int argc, char **argv);
+
 /**
  * Usage function typedef used by the application usage function.
  *
diff --git a/lib/librte_eal/common/include/rte_interrupts.h b/lib/librte_eal/common/include/rte_interrupts.h
index 609c34b..1ba2421 100644
--- a/lib/librte_eal/common/include/rte_interrupts.h
+++ b/lib/librte_eal/common/include/rte_interrupts.h
@@ -113,6 +113,18 @@ int rte_intr_enable(struct rte_intr_handle *intr_handle);
  */
 int rte_intr_disable(struct rte_intr_handle *intr_handle);
 
+/**
+ * @param intr_handle
+ *   pointer to the interrupt handle.
+ * @param queue_id
+ *   the queue id
+ * @return
+ *   - On success, return 0
+ *   - On failure, returns -1.
+ */
+int rte_intr_wait_rx_pkt(struct rte_intr_handle *intr_handle,
+			uint8_t queue_id);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 8c5b834..ee0f019 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -127,6 +127,9 @@ static pthread_t intr_thread;
 #ifdef VFIO_PRESENT
 
 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
+/* irq set buffer length for queue interrupts and LSC interrupt */
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+				sizeof(int) * (VFIO_MAX_QUEUE_ID + 1))
 
 /* enable legacy (INTx) interrupts */
 static int
@@ -218,10 +221,10 @@ vfio_disable_intx(struct rte_intr_handle *intr_handle) {
 	return 0;
 }
 
-/* enable MSI-X interrupts */
+/* enable MSI interrupts */
 static int
 vfio_enable_msi(struct rte_intr_handle *intr_handle) {
-	int len, ret;
+	int len, ret, max_intr;
 	char irq_set_buf[IRQ_SET_BUF_LEN];
 	struct vfio_irq_set *irq_set;
 	int *fd_ptr;
@@ -230,12 +233,19 @@ vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 
 	irq_set = (struct vfio_irq_set *) irq_set_buf;
 	irq_set->argsz = len;
-	irq_set->count = 1;
+	if ((!intr_handle->max_intr) ||
+		(intr_handle->max_intr > VFIO_MAX_QUEUE_ID))
+		max_intr = VFIO_MAX_QUEUE_ID + 1;
+	else
+		max_intr = intr_handle->max_intr;
+
+	irq_set->count = max_intr;
 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
 	irq_set->start = 0;
 	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
+	memcpy(fd_ptr, intr_handle->queue_fd, sizeof(intr_handle->queue_fd));
+	fd_ptr[max_intr - 1] = intr_handle->fd;
 
 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 
@@ -244,27 +254,10 @@ vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 						intr_handle->fd);
 		return -1;
 	}
-
-	/* manually trigger interrupt to enable it */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error triggering MSI interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
-/* disable MSI-X interrupts */
+/* disable MSI interrupts */
 static int
 vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 	struct vfio_irq_set *irq_set;
@@ -292,8 +285,8 @@ vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 /* enable MSI-X interrupts */
 static int
 vfio_enable_msix(struct rte_intr_handle *intr_handle) {
-	int len, ret;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
+	int len, ret, max_intr;
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	struct vfio_irq_set *irq_set;
 	int *fd_ptr;
 
@@ -301,12 +294,19 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 
 	irq_set = (struct vfio_irq_set *) irq_set_buf;
 	irq_set->argsz = len;
-	irq_set->count = 1;
+	if ((!intr_handle->max_intr) ||
+		(intr_handle->max_intr > VFIO_MAX_QUEUE_ID))
+		max_intr = VFIO_MAX_QUEUE_ID + 1;
+	else
+		max_intr = intr_handle->max_intr;
+
+	irq_set->count = max_intr;
 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 	irq_set->start = 0;
 	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
+	memcpy(fd_ptr, intr_handle->queue_fd, sizeof(intr_handle->queue_fd));
+	fd_ptr[max_intr - 1] = intr_handle->fd;
 
 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 
@@ -316,22 +316,6 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 		return -1;
 	}
 
-	/* manually trigger interrupt to enable it */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error triggering MSI-X interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
@@ -339,7 +323,7 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 static int
 vfio_disable_msix(struct rte_intr_handle *intr_handle) {
 	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	int len, ret;
 
 	len = sizeof(struct vfio_irq_set);
@@ -860,3 +844,124 @@ rte_eal_intr_init(void)
 	return -ret;
 }
 
+static void
+eal_intr_process_rx_interrupts(struct rte_intr_handle *intr_handle,
+			struct epoll_event *events, int nfds)
+{
+	int n, bytes_read;
+	union rte_intr_read_buffer buf;
+
+	for (n = 0; n < nfds; n++) {
+		/* set the length to be read for different handle type */
+		switch (intr_handle->type) {
+		case RTE_INTR_HANDLE_UIO:
+			bytes_read = sizeof(buf.uio_intr_count);
+			break;
+		case RTE_INTR_HANDLE_ALARM:
+			bytes_read = sizeof(buf.timerfd_num);
+			break;
+#ifdef VFIO_PRESENT
+		case RTE_INTR_HANDLE_VFIO_MSIX:
+		case RTE_INTR_HANDLE_VFIO_MSI:
+		case RTE_INTR_HANDLE_VFIO_LEGACY:
+			bytes_read = sizeof(buf.vfio_intr_count);
+			break;
+#endif
+		default:
+			bytes_read = 1;
+			break;
+		}
+
+		/**
+		* read out to clear the ready-to-be-read flag
+		* for epoll_wait.
+		*/
+		bytes_read = read(events[n].data.fd, &buf, bytes_read);
+		if (bytes_read < 0)
+			RTE_LOG(ERR, EAL, "Error reading from file "
+				"descriptor %d: %s\n", events[n].data.fd,
+							strerror(errno));
+		else if (bytes_read == 0)
+			RTE_LOG(ERR, EAL, "Read nothing from file "
+				"descriptor %d\n", events[n].data.fd);
+	}
+}
+
+static void
+eal_intr_handle_rx_interrupts(struct rte_intr_handle *intr_handle,
+			int pfd, unsigned totalfds)
+{
+	struct epoll_event events[totalfds];
+	int nfds = 0;
+
+	do {
+		nfds = epoll_wait(pfd, events, totalfds,
+				EAL_INTR_EPOLL_WAIT_FOREVER);
+		/* epoll_wait fail */
+		if (nfds < 0) {
+			RTE_LOG(ERR, EAL,
+				"epoll_wait returns with fail\n");
+			return;
+		}
+	} while (nfds == 0);
+
+	/* epoll_wait has at least one fd ready to read */
+	eal_intr_process_rx_interrupts(intr_handle, events, nfds);
+}
+
+int
+rte_intr_wait_rx_pkt(struct rte_intr_handle *intr_handle, uint8_t queue_id)
+{
+	struct epoll_event ev;
+	unsigned numfds = 0;
+
+	if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+		return -1;
+	if (queue_id >= VFIO_MAX_QUEUE_ID)
+		return -1;
+
+	/* create epoll fd */
+	int pfd = epoll_create(1);
+	if (pfd < 0) {
+		RTE_LOG(ERR, EAL, "Cannot create epoll instance\n");
+		return -1;
+	}
+
+	rte_spinlock_lock(&intr_lock);
+
+	ev.events = EPOLLIN | EPOLLPRI;
+	switch (intr_handle->type) {
+	case RTE_INTR_HANDLE_UIO:
+		ev.data.fd = intr_handle->fd;
+		break;
+#ifdef VFIO_PRESENT
+	case RTE_INTR_HANDLE_VFIO_MSIX:
+	case RTE_INTR_HANDLE_VFIO_MSI:
+	case RTE_INTR_HANDLE_VFIO_LEGACY:
+		ev.data.fd = intr_handle->queue_fd[queue_id];
+		break;
+#endif
+	default:
+		rte_spinlock_unlock(&intr_lock);
+		close(pfd);
+		return -1;
+	}
+
+	if (epoll_ctl(pfd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
+		RTE_LOG(ERR, EAL, "Error adding fd %d epoll_ctl, %s\n",
+			intr_handle->queue_fd[queue_id], strerror(errno));
+	} else
+		numfds++;
+
+	rte_spinlock_unlock(&intr_lock);
+	/* serve the interrupt */
+	eal_intr_handle_rx_interrupts(intr_handle, pfd, numfds);
+
+	/**
+	* when we return, we need to rebuild the
+	* list of fds to monitor.
+	*/
+	close(pfd);
+
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 20e0977..0e5fa76 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -283,11 +283,21 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
 
 		dev->intr_handle.fd = fd;
 		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
-
 		switch (i) {
 		case VFIO_PCI_MSIX_IRQ_INDEX:
 			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
 			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
+			for (i = 0; i < VFIO_MAX_QUEUE_ID; i++) {
+				fd = eventfd(0, 0);
+				if (fd < 0) {
+					RTE_LOG(ERR, EAL,
+					"cannot setup eventfd,"
+					"error %i (%s)\n",
+					errno, strerror(errno));
+					return -1;
+				}
+				dev->intr_handle.queue_fd[i] = fd;
+			}
 			break;
 		case VFIO_PCI_MSI_IRQ_INDEX:
 			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSI;
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 5ed6e4d..dd300ea 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -57,6 +57,7 @@ DPDK_2.0 {
 	rte_intr_callback_unregister;
 	rte_intr_disable;
 	rte_intr_enable;
+	rte_intr_wait_rx_pkt;
 	rte_log;
 	rte_log_add_in_history;
 	rte_log_cur_msg_loglevel;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v5 6/6] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch
  2015-02-23 16:55 [dpdk-dev] [PATCH v5 0/6] Interrupt mode PMD Zhou Danny
                   ` (4 preceding siblings ...)
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO Zhou Danny
@ 2015-02-23 16:55 ` Zhou Danny
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
  6 siblings, 0 replies; 242+ messages in thread
From: Zhou Danny @ 2015-02-23 16:55 UTC (permalink / raw)
  To: dev

v5 changes
- Change invoked function name and parameter to accommodate EAL change

v3 changes
- Add spinlock to ensure thread safety when accessing interrupt mask
  register

v2 changes
- Remove unused function which is for debug purpose

Demonstrate how to handle per rx queue interrupts in a NAPI-like
implementation in userspace. The DPDK polling thread mainly works in
polling mode and switches to interrupt mode only if no packet has
been received in recent polls.
Userspace interrupt notification generally takes a lot more cycles
than in the kernel, so a one-shot interrupt is used here to guarantee
minimum overhead, and the DPDK polling thread returns to polling mode
immediately once it receives an interrupt notification for an incoming
packet.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Tested-by: Yong Liu <yong.liu@intel.com>
---
 examples/l3fwd-power/main.c | 155 ++++++++++++++++++++++++++++++++------------
 1 file changed, 114 insertions(+), 41 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index f6b55b9..c8b3d6b 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -75,12 +75,14 @@
 #include <rte_string_fns.h>
 #include <rte_timer.h>
 #include <rte_power.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
 
 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1
 
 #define MAX_PKT_BURST 32
 
-#define MIN_ZERO_POLL_COUNT 5
+#define MIN_ZERO_POLL_COUNT 10
 
 /* around 100ms at 2 Ghz */
 #define TIMER_RESOLUTION_CYCLES           200000000ULL
@@ -156,6 +158,9 @@ static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
 /* ethernet addresses of ports */
 static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
 
+/* per-port spinlocks serializing rx interrupt enable/disable calls */
+static rte_spinlock_t locks[RTE_MAX_ETHPORTS];
+
 /* mask of enabled ports */
 static uint32_t enabled_port_mask = 0;
 /* Ports set in promiscuous mode off by default. */
@@ -188,6 +193,9 @@ struct lcore_rx_queue {
 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
 #define MAX_RX_QUEUE_PER_PORT 128
 
+#define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16
+
+
 #define MAX_LCORE_PARAMS 1024
 struct lcore_params {
 	uint8_t port_id;
@@ -214,7 +222,7 @@ static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) /
 
 static struct rte_eth_conf port_conf = {
 	.rxmode = {
-		.mq_mode	= ETH_MQ_RX_RSS,
+		.mq_mode = ETH_MQ_RX_RSS,
 		.max_rx_pkt_len = ETHER_MAX_LEN,
 		.split_hdr_size = 0,
 		.header_split   = 0, /**< Header Split disabled */
@@ -226,11 +234,14 @@ static struct rte_eth_conf port_conf = {
 	.rx_adv_conf = {
 		.rss_conf = {
 			.rss_key = NULL,
-			.rss_hf = ETH_RSS_IP,
+			.rss_hf = ETH_RSS_UDP,
 		},
 	},
 	.txmode = {
-		.mq_mode = ETH_DCB_NONE,
+		.mq_mode = ETH_MQ_TX_NONE,
+	},
+	.intr_conf = {
+		.rxq = 1, /**< rxq interrupt feature enabled */
 	},
 };
 
@@ -402,19 +413,22 @@ power_timer_cb(__attribute__((unused)) struct rte_timer *tim,
 	/* accumulate total execution time in us when callback is invoked */
 	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
 					(float)SCALING_PERIOD;
-
 	/**
 	 * check whether need to scale down frequency a step if it sleep a lot.
 	 */
-	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD)
-		rte_power_freq_down(lcore_id);
+	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 	else if ( (unsigned)(stats[lcore_id].nb_rx_processed /
-		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST)
+		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
 		/**
 		 * scale down a step if average packet per iteration less
 		 * than expectation.
 		 */
-		rte_power_freq_down(lcore_id);
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 
 	/**
 	 * initialize another timer according to current frequency to ensure
@@ -707,22 +721,20 @@ l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid,
 
 }
 
-#define SLEEP_GEAR1_THRESHOLD            100
-#define SLEEP_GEAR2_THRESHOLD            1000
+#define MINIMUM_SLEEP_TIME         1
+#define SUSPEND_THRESHOLD          300
 
 static inline uint32_t
 power_idle_heuristic(uint32_t zero_rx_packet_count)
 {
-	/* If zero count is less than 100, use it as the sleep time in us */
-	if (zero_rx_packet_count < SLEEP_GEAR1_THRESHOLD)
-		return zero_rx_packet_count;
-	/* If zero count is less than 1000, sleep time should be 100 us */
-	else if ((zero_rx_packet_count >= SLEEP_GEAR1_THRESHOLD) &&
-			(zero_rx_packet_count < SLEEP_GEAR2_THRESHOLD))
-		return SLEEP_GEAR1_THRESHOLD;
-	/* If zero count is greater than 1000, sleep time should be 1000 us */
-	else if (zero_rx_packet_count >= SLEEP_GEAR2_THRESHOLD)
-		return SLEEP_GEAR2_THRESHOLD;
+	/* If zero count is less than SUSPEND_THRESHOLD, sleep 1 us */
+	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
+		return MINIMUM_SLEEP_TIME;
+	/* Otherwise return SUSPEND_THRESHOLD, which makes main_loop suspend
+		until an rx interrupt (C3/C6 to C0 switching latency is ~100 us)
+	*/
+	else
+		return SUSPEND_THRESHOLD;
 
 	return 0;
 }
@@ -762,6 +774,42 @@ power_freq_scaleup_heuristic(unsigned lcore_id,
 	return FREQ_CURRENT;
 }
 
+/**
+ * force polling thread sleep until one-shot rx interrupt triggers
+ * @param port_id
+ *  Port id.
+ * @param queue_id
+ *  Rx queue id.
+ * @return
+ *  0 on success
+ */
+static int
+sleep_until_rx_interrupt(uint8_t port_id, uint8_t queue_id)
+{
+	struct rte_eth_dev *eth_dev = &rte_eth_devices[port_id];
+
+	/* Enable one-shot rx interrupt */
+	rte_spinlock_lock(&(locks[port_id]));
+	rte_eth_dev_rx_queue_intr_enable(port_id, queue_id);
+	rte_spinlock_unlock(&(locks[port_id]));
+
+	RTE_LOG(INFO, L3FWD_POWER,
+		"lcore %u sleeps until interrupt on port%d,rxq%d triggers\n",
+		rte_lcore_id(), port_id, queue_id);
+
+	rte_intr_wait_rx_pkt(&(eth_dev->pci_dev->intr_handle), queue_id);
+	RTE_LOG(INFO, L3FWD_POWER,
+		"lcore %u is waked up from rx interrupt on port%d,rxq%d\n",
+		rte_lcore_id(), port_id, queue_id);
+
+	/* Disable one-shot rx interrupt */
+	rte_spinlock_lock(&(locks[port_id]));
+	rte_eth_dev_rx_queue_intr_disable(port_id, queue_id);
+	rte_spinlock_unlock(&(locks[port_id]));
+
+	return 0;
+}
+
 /* main processing loop */
 static int
 main_loop(__attribute__((unused)) void *dummy)
@@ -775,7 +823,6 @@ main_loop(__attribute__((unused)) void *dummy)
 	struct lcore_conf *qconf;
 	struct lcore_rx_queue *rx_queue;
 	enum freq_scale_hint_t lcore_scaleup_hint;
-
 	uint32_t lcore_rx_idle_count = 0;
 	uint32_t lcore_idle_hint = 0;
 
@@ -835,6 +882,8 @@ main_loop(__attribute__((unused)) void *dummy)
 			prev_tsc_power = cur_tsc_power;
 		}
 
+
+start_rx:
 		/*
 		 * Read packet from RX queues
 		 */
@@ -848,6 +897,7 @@ main_loop(__attribute__((unused)) void *dummy)
 
 			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
 								MAX_PKT_BURST);
+
 			stats[lcore_id].nb_rx_processed += nb_rx;
 			if (unlikely(nb_rx == 0)) {
 				/**
@@ -910,10 +960,13 @@ main_loop(__attribute__((unused)) void *dummy)
 						rx_queue->freq_up_hint;
 			}
 
-			if (lcore_scaleup_hint == FREQ_HIGHEST)
-				rte_power_freq_max(lcore_id);
-			else if (lcore_scaleup_hint == FREQ_HIGHER)
-				rte_power_freq_up(lcore_id);
+			if (lcore_scaleup_hint == FREQ_HIGHEST) {
+				if (rte_power_freq_max)
+					rte_power_freq_max(lcore_id);
+			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
+				if (rte_power_freq_up)
+					rte_power_freq_up(lcore_id);
+			}
 		} else {
 			/**
 			 * All Rx queues empty in recent consecutive polls,
@@ -928,16 +981,21 @@ main_loop(__attribute__((unused)) void *dummy)
 					lcore_idle_hint = rx_queue->idle_hint;
 			}
 
-			if ( lcore_idle_hint < SLEEP_GEAR1_THRESHOLD)
+			if (lcore_idle_hint < SUSPEND_THRESHOLD)
 				/**
-				 * execute "pause" instruction to avoid context
-				 * switch for short sleep.
- 				 */
+				* execute "pause" instruction to avoid context
+				* switch which generally take hundres of
+				* microsecond for short sleep.
+				*/
 				rte_delay_us(lcore_idle_hint);
-			else
-				/* long sleep force runing thread to suspend */
-				usleep(lcore_idle_hint);
-
+			else {
+				/* suspend untill rx interrupt trigges */
+				sleep_until_rx_interrupt(
+					qconf->rx_queue_list[0].port_id,
+					qconf->rx_queue_list[0].queue_id);
+				/* start receiving packets immediately */
+				goto start_rx;
+			}
 			stats[lcore_id].sleep_time += lcore_idle_hint;
 		}
 	}
@@ -1270,7 +1328,7 @@ setup_hash(int socketid)
 	char s[64];
 
 	/* create ipv4 hash */
-	snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
 	ipv4_l3fwd_hash_params.name = s;
 	ipv4_l3fwd_hash_params.socket_id = socketid;
 	ipv4_l3fwd_lookup_struct[socketid] =
@@ -1280,7 +1338,7 @@ setup_hash(int socketid)
 				"socket %d\n", socketid);
 
 	/* create ipv6 hash */
-	snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
 	ipv6_l3fwd_hash_params.name = s;
 	ipv6_l3fwd_hash_params.socket_id = socketid;
 	ipv6_l3fwd_lookup_struct[socketid] =
@@ -1476,6 +1534,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint64_t hz;
 	uint32_t n_tx_queue, nb_lcores;
+	uint32_t dev_rxq_num, dev_txq_num;
 	uint8_t portid, nb_rx_queue, queue, socketid;
 
 	/* catch SIGINT and restore cpufreq governor to ondemand */
@@ -1525,10 +1584,19 @@ main(int argc, char **argv)
 		printf("Initializing port %d ... ", portid );
 		fflush(stdout);
 
+		rte_eth_dev_info_get(portid, &dev_info);
+		dev_rxq_num = dev_info.max_rx_queues;
+		dev_txq_num = dev_info.max_tx_queues;
+
 		nb_rx_queue = get_port_n_rx_queues(portid);
+		if (nb_rx_queue > dev_rxq_num)
+			rte_exit(EXIT_FAILURE,
+				"Cannot configure not existed rxq: "
+				"port=%d\n", portid);
+
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
+		if (n_tx_queue > dev_txq_num)
+			n_tx_queue = dev_txq_num;
 		printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
 			nb_rx_queue, (unsigned)n_tx_queue );
 		ret = rte_eth_dev_configure(portid, nb_rx_queue,
@@ -1552,6 +1620,9 @@ main(int argc, char **argv)
 			if (rte_lcore_is_enabled(lcore_id) == 0)
 				continue;
 
+			if (queueid >= dev_txq_num)
+				continue;
+
 			if (numa_on)
 				socketid = \
 				(uint8_t)rte_lcore_to_socket_id(lcore_id);
@@ -1586,8 +1657,9 @@ main(int argc, char **argv)
 		/* init power management library */
 		ret = rte_power_init(lcore_id);
 		if (ret)
-			rte_exit(EXIT_FAILURE, "Power management library "
-				"initialization failed on core%u\n", lcore_id);
+			rte_log(RTE_LOG_ERR, RTE_LOGTYPE_POWER,
+				"Power management library initialization "
+				"failed on core%u", lcore_id);
 
 		/* init timer structures for each enabled lcore */
 		rte_timer_init(&power_timers[lcore_id]);
@@ -1635,7 +1707,6 @@ main(int argc, char **argv)
 		if (ret < 0)
 			rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, "
 						"port=%d\n", ret, portid);
-
 		/*
 		 * If enabled, put device in promiscuous mode.
 		 * This allows IO forwarding mode to forward packets
@@ -1644,6 +1715,8 @@ main(int argc, char **argv)
 		 */
 		if (promiscuous_on)
 			rte_eth_promiscuous_enable(portid);
+		/* initialize spinlock for each port */
+		rte_spinlock_init(&(locks[portid]));
 	}
 
 	check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/6] ethdev: add rx interrupt enable/disable functions
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 1/6] ethdev: add rx interrupt enable/disable functions Zhou Danny
@ 2015-02-23 16:59   ` Stephen Hemminger
  2015-02-23 17:17     ` Zhou, Danny
  0 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-02-23 16:59 UTC (permalink / raw)
  To: Zhou Danny; +Cc: dev

On Tue, 24 Feb 2015 00:55:37 +0800
Zhou Danny <danny.zhou@intel.com> wrote:

> +int
> +rte_eth_dev_rx_queue_intr_enable(uint8_t port_id,
> +				uint16_t queue_id)
> +{
> +	struct rte_eth_dev *dev;
> +
> +	if (port_id >= nb_ports) {
> +		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
> +		return (-ENODEV);

Please don't use the BSD style of extra useless paren's around
returns.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v5 1/6] ethdev: add rx interrupt enable/disable functions
  2015-02-23 16:59   ` Stephen Hemminger
@ 2015-02-23 17:17     ` Zhou, Danny
  2015-05-11 14:10       ` [dpdk-dev] [PATCH] lib: syntax cleanup Ferruh Yigit
  0 siblings, 1 reply; 242+ messages in thread
From: Zhou, Danny @ 2015-02-23 17:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev


> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Tuesday, February 24, 2015 12:59 AM
> To: Zhou, Danny
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v5 1/6] ethdev: add rx interrupt enable/disable functions
> 
> On Tue, 24 Feb 2015 00:55:37 +0800
> Zhou Danny <danny.zhou@intel.com> wrote:
> 
> > +int
> > +rte_eth_dev_rx_queue_intr_enable(uint8_t port_id,
> > +				uint16_t queue_id)
> > +{
> > +	struct rte_eth_dev *dev;
> > +
> > +	if (port_id >= nb_ports) {
> > +		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
> > +		return (-ENODEV);
> 
> Please don't use the BSD style of extra useless paren's around
> returns.

This code snippet doing the sanity check is copied from other functions defined in the same file
lib/librte_ether/rte_ethdev.c, and there is plenty of legacy code in this file doing the same
BSD-style thing. I'd suggest that somebody clean up all such BSD-style code in DPDK
in a separate patchset.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO Zhou Danny
@ 2015-02-24 10:42   ` David Marchand
  2015-02-25  6:58     ` Zhou, Danny
  0 siblings, 1 reply; 242+ messages in thread
From: David Marchand @ 2015-02-24 10:42 UTC (permalink / raw)
  To: Zhou Danny; +Cc: dev

Hello Danny,

On Mon, Feb 23, 2015 at 5:55 PM, Zhou Danny <danny.zhou@intel.com> wrote:

[snip]

+/**
> + * @param intr_handle
> + *   pointer to the interrupt handle.
> + * @param queue_id
> + *   the queue id
> + * @return
> + *   - On success, return 0
> + *   - On failure, returns -1.
> + */
> +int rte_intr_wait_rx_pkt(struct rte_intr_handle *intr_handle,
> +                       uint8_t queue_id);
> +
>

From my point of view, the queue_id (just like port_id) is something that
should be handled by ethdev, not eal.

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> index 8c5b834..ee0f019 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
>

 [snip]

+int
> +rte_intr_wait_rx_pkt(struct rte_intr_handle *intr_handle, uint8_t
> queue_id)
> +{
> +       struct epoll_event ev;
> +       unsigned numfds = 0;
> +
> +       if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd
> < 0)
> +               return -1;
> +       if (queue_id >= VFIO_MAX_QUEUE_ID)
> +               return -1;
> +
> +       /* create epoll fd */
> +       int pfd = epoll_create(1);
> +       if (pfd < 0) {
> +               RTE_LOG(ERR, EAL, "Cannot create epoll instance\n");
> +               return -1;
> +       }
>

Why recreate the epoll instance at each call to this function ?

+
> +       rte_spinlock_lock(&intr_lock);
> +
> +       ev.events = EPOLLIN | EPOLLPRI;
> +       switch (intr_handle->type) {
> +       case RTE_INTR_HANDLE_UIO:
> +               ev.data.fd = intr_handle->fd;
> +               break;
> +#ifdef VFIO_PRESENT
> +       case RTE_INTR_HANDLE_VFIO_MSIX:
> +       case RTE_INTR_HANDLE_VFIO_MSI:
> +       case RTE_INTR_HANDLE_VFIO_LEGACY:
> +               ev.data.fd = intr_handle->queue_fd[queue_id];
> +               break;
> +#endif
> +       default:
> +               rte_spinlock_unlock(&intr_lock);
> +               close(pfd);
> +               return -1;
> +       }
> +
> +       if (epoll_ctl(pfd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
> +               RTE_LOG(ERR, EAL, "Error adding fd %d epoll_ctl, %s\n",
> +                       intr_handle->queue_fd[queue_id], strerror(errno));
> +       } else
> +               numfds++;
> +
> +       rte_spinlock_unlock(&intr_lock);
> +       /* serve the interrupt */
> +       eal_intr_handle_rx_interrupts(intr_handle, pfd, numfds);
> +
> +       /**
> +       * when we return, we need to rebuild the
> +       * list of fds to monitor.
> +       */
> +       close(pfd);
>

Why do we need to rebuild this "list of fds" ?
Afaics, the fds we want to observe are not supposed to change in the
meantime.
epoll maintains this list, you don't have to care about this.


Looking at this patchset, I think there is a design issue.
eal does not need to know about portid neither queueid.

eal can provide an api to retrieve the interrupt fds, configure an epoll
instance, wait on an epoll instance etc...
ethdev is then responsible to setup the mapping between port id / queue id
and interrupt fds by asking the eal about those fds.

This would result in an eal api even simpler and we could add other fds in
a single epoll fd for other uses.


-- 
David Marchand

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO
  2015-02-24 10:42   ` David Marchand
@ 2015-02-25  6:58     ` Zhou, Danny
  2015-02-25 10:22       ` David Marchand
  0 siblings, 1 reply; 242+ messages in thread
From: Zhou, Danny @ 2015-02-25  6:58 UTC (permalink / raw)
  To: David Marchand; +Cc: dev

Thanks for comments and please see my answers inline.

From: David Marchand [mailto:david.marchand@6wind.com]
Sent: Tuesday, February 24, 2015 6:42 PM
To: Zhou, Danny
Cc: dev@dpdk.org
Subject: Re: [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO

Hello Danny,

On Mon, Feb 23, 2015 at 5:55 PM, Zhou Danny <danny.zhou@intel.com<mailto:danny.zhou@intel.com>> wrote:

[snip]

+/**
+ * @param intr_handle
+ *   pointer to the interrupt handle.
+ * @param queue_id
+ *   the queue id
+ * @return
+ *   - On success, return 0
+ *   - On failure, returns -1.
+ */
+int rte_intr_wait_rx_pkt(struct rte_intr_handle *intr_handle,
+                       uint8_t queue_id);
+

From my point of view, the queue_id (just like port_id) is something that should be handled by ethdev, not eal.

DZ: See comments below.

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 8c5b834..ee0f019 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c

 [snip]

+int
+rte_intr_wait_rx_pkt(struct rte_intr_handle *intr_handle, uint8_t queue_id)
+{
+       struct epoll_event ev;
+       unsigned numfds = 0;
+
+       if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+               return -1;
+       if (queue_id >= VFIO_MAX_QUEUE_ID)
+               return -1;
+
+       /* create epoll fd */
+       int pfd = epoll_create(1);
+       if (pfd < 0) {
+               RTE_LOG(ERR, EAL, "Cannot create epoll instance\n");
+               return -1;
+       }

Why recreate the epoll instance at each call to this function ?

DZ: To avoid recreating the epoll instance for each queue, the struct rte_intr_handle (or a new structure added to ethdev)
should be extended with fields storing a per-queue pfd. This would avoid the user/kernel context switch overhead
of calling epoll_create() each time.

Sounds good?

+
+       rte_spinlock_lock(&intr_lock);
+
+       ev.events = EPOLLIN | EPOLLPRI;
+       switch (intr_handle->type) {
+       case RTE_INTR_HANDLE_UIO:
+               ev.data.fd = intr_handle->fd;
+               break;
+#ifdef VFIO_PRESENT
+       case RTE_INTR_HANDLE_VFIO_MSIX:
+       case RTE_INTR_HANDLE_VFIO_MSI:
+       case RTE_INTR_HANDLE_VFIO_LEGACY:
+               ev.data.fd = intr_handle->queue_fd[queue_id];
+               break;
+#endif
+       default:
+               rte_spinlock_unlock(&intr_lock);
+               close(pfd);
+               return -1;
+       }
+
+       if (epoll_ctl(pfd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
+               RTE_LOG(ERR, EAL, "Error adding fd %d epoll_ctl, %s\n",
+                       intr_handle->queue_fd[queue_id], strerror(errno));
+       } else
+               numfds++;
+
+       rte_spinlock_unlock(&intr_lock);
+       /* serve the interrupt */
+       eal_intr_handle_rx_interrupts(intr_handle, pfd, numfds);
+
+       /**
+       * when we return, we need to rebuild the
+       * list of fds to monitor.
+       */
+       close(pfd);

Why do we need to rebuild this "list of fds" ?
Afaics, the fds we want to observe are not supposed to change in the meantime.
epoll maintains this list, you don't have to care about this.

Agreed, it is not needed.

Looking at this patchset, I think there is a design issue.
eal does not need to know about portid neither queueid.

eal can provide an api to retrieve the interrupt fds, configure an epoll instance, wait on an epoll instance etc...
ethdev is then responsible to setup the mapping between port id / queue id and interrupt fds by asking the eal about those fds.

This would result in an eal api even simpler and we could add other fds in a single epoll fd for other uses.

DZ: The queueid is just an index to the queue related eventfd array stored in EAL. If this array is still in the EAL and ethdev can apply for it and setup mapping for certain queue, there
might be issue for multiple-process use case where the fd resources allocated for secondary process are not freed if the secondary process exits unexpectedly.

Probably we can set up the eventfd array inside ethdev, and we just need an EAL API to wait on ethdev's fd. So the application invokes an ethdev API with portid and queueid, and ethdev calls the EAL
API to wait on an ethdev fd which corresponds to the specified portid and queueid.

Sounds ok to you?

I am going to travel tomorrow and Steve Liang might follow up on V6 patch submission when I am absent. Thanks Steve!

--
David Marchand


^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO
  2015-02-25  6:58     ` Zhou, Danny
@ 2015-02-25 10:22       ` David Marchand
  2015-02-25 15:29         ` Zhou, Danny
  0 siblings, 1 reply; 242+ messages in thread
From: David Marchand @ 2015-02-25 10:22 UTC (permalink / raw)
  To: Zhou, Danny; +Cc: dev

Hello Danny,

On Wed, Feb 25, 2015 at 7:58 AM, Zhou, Danny <danny.zhou@intel.com> wrote:

>
> +int
> +rte_intr_wait_rx_pkt(struct rte_intr_handle *intr_handle, uint8_t
> queue_id)
> +{
> +       struct epoll_event ev;
> +       unsigned numfds = 0;
> +
> +       if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd
> < 0)
> +               return -1;
> +       if (queue_id >= VFIO_MAX_QUEUE_ID)
> +               return -1;
> +
> +       /* create epoll fd */
> +       int pfd = epoll_create(1);
> +       if (pfd < 0) {
> +               RTE_LOG(ERR, EAL, "Cannot create epoll instance\n");
> +               return -1;
> +       }
>
>
>
> Why recreate the epoll instance at each call to this function ?
>
>
>
> DZ: To avoid recreating the epoll instance for each queue, the struct
> rte_intr_handle(or a new structure added to ethdev)
>
> should be extended by adding fields storing per-queue pfd. This way, it
> could reduce user/kernel context  switch overhead
>
> when calling epoll_create() each time.
>
>
>
> Sounds good?
>

You don't need a epfd per queue. And hardcoding epfd == eventfd will give a
not very usable api.

Plus, epoll is something linux-specific, so you can't move it out of
eal/linux.
I suppose you need an abstraction here (and in the future we could add
something for bsd ?).



>
> Looking at this patchset, I think there is a design issue.
>
> eal does not need to know about portid neither queueid.
>
>
>
> eal can provide an api to retrieve the interrupt fds, configure an epoll
> instance, wait on an epoll instance etc...
>
> ethdev is then responsible to setup the mapping between port id / queue id
> and interrupt fds by asking the eal about those fds.
>
>
>
> This would result in an eal api even simpler and we could add other fds in
> a single epoll fd for other uses.
>
>
>
> DZ: The queueid is just an index to the queue related eventfd array stored
> in EAL. If this array is still in the EAL and ethdev can apply for it and
> setup mapping for certain queue, there
>
> might be issue for multiple-process use case where the fd resources
> allocated for secondary process are not freed if the secondary process
> exits unexpectedly.
>

Not sure I follow you.
If a secondary process exits, the eventfds created in primary process
should still be valid and reusable.
Why would you need to free them ? Something to do with vfio ?



>
> Probably we can setup the eventfd array inside ethdev,  and we just need
> EAL API to wait for ethdev’fd. So application invokes ethdev API with
> portid and queueid, and ethdev calls eal
>
> API to wait on a ethdev fd which correlates with the specified portid and
> queueid.
>
>
>
> Sounds ok to you?
>

eventfds creation can not be handled by ethdev, since it needs
infrastructure and informations from within the eal/linux.
Again, do we need an abstraction ?

ethdev must be the one that does the mappings between port/queue and
eventfds (or any object that represents a way to wake up for a given
port/queue).


-- 
David Marchand

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO
  2015-02-25 10:22       ` David Marchand
@ 2015-02-25 15:29         ` Zhou, Danny
  2015-02-25 15:44           ` Thomas Monjalon
  2015-02-25 15:52           ` David Marchand
  0 siblings, 2 replies; 242+ messages in thread
From: Zhou, Danny @ 2015-02-25 15:29 UTC (permalink / raw)
  To: David Marchand; +Cc: dev



From: David Marchand [mailto:david.marchand@6wind.com]
Sent: Wednesday, February 25, 2015 6:22 PM
To: Zhou, Danny
Cc: dev@dpdk.org; Liang, Cunming
Subject: Re: [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO

Hello Danny,

On Wed, Feb 25, 2015 at 7:58 AM, Zhou, Danny <danny.zhou@intel.com<mailto:danny.zhou@intel.com>> wrote:

+int
+rte_intr_wait_rx_pkt(struct rte_intr_handle *intr_handle, uint8_t queue_id)
+{
+       struct epoll_event ev;
+       unsigned numfds = 0;
+
+       if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+               return -1;
+       if (queue_id >= VFIO_MAX_QUEUE_ID)
+               return -1;
+
+       /* create epoll fd */
+       int pfd = epoll_create(1);
+       if (pfd < 0) {
+               RTE_LOG(ERR, EAL, "Cannot create epoll instance\n");
+               return -1;
+       }

Why recreate the epoll instance at each call to this function ?

DZ: To avoid recreating the epoll instance for each queue, the struct rte_intr_handle(or a new structure added to ethdev)
should be extended by adding fields storing per-queue pfd. This way, it could reduce user/kernel context  switch overhead
when calling epoll_create() each time.

Sounds good?

You don't need a epfd per queue. And hardcoding epfd == eventfd will give a not very usable api.

Plus, epoll is something linux-specific, so you can't move it out of eal/linux.
I suppose you need an abstraction here (and in the future we could add something for bsd ?).

DZ: libev provides an abstraction layer which is a good candidate to integrate, rather than
reinventing one, I think. The BSD support can be implemented in the files under the
lib/librte_eal/bsdapp/eal folder by calling BSD-specific APIs. Maybe it is a good idea to introduce
a separate component like an OS Adaptation Layer into EAL in the future, once DPDK is widely adopted on
BSD as well as Windows; then all DPDK components that invoke Linux-specific APIs could instead call abstraction APIs.

Adding an abstraction here specifically for epoll does not resolve all the porting/migration problem in my mind.


Looking at this patchset, I think there is a design issue.
eal does not need to know about portid neither queueid.

eal can provide an api to retrieve the interrupt fds, configure an epoll instance, wait on an epoll instance etc...
ethdev is then responsible to setup the mapping between port id / queue id and interrupt fds by asking the eal about those fds.

This would result in an eal api even simpler and we could add other fds in a single epoll fd for other uses.

DZ: The queueid is just an index to the queue related eventfd array stored in EAL. If this array is still in the EAL and ethdev can apply for it and setup mapping for certain queue, there
might be issue for multiple-process use case where the fd resources allocated for secondary process are not freed if the secondary process exits unexpectedly.

Not sure I follow you.
If a secondary process exits, the eventfds created in primary process should still be valid and reusable.
Why would you need to free them ? Something to do with vfio ?

DZ: See below.

Probably we can setup the eventfd array inside ethdev,  and we just need EAL API to wait for ethdev’fd. So application invokes ethdev API with portid and queueid, and ethdev calls eal
API to wait on a ethdev fd which correlates with the specified portid and queueid.

Sounds ok to you?

eventfds creation can not be handled by ethdev, since it needs infrastructure and informations from within the eal/linux.
Again, do we need an abstraction ?

ethdev must be the one that does the mappings between port/queue and eventfds (or any object that represents a way to wake up for a given port/queue).

DZ: Agreed after revisiting the code. Let us follow your direction and create an ethdev API, similar to rte_eth_dev_rx_queue_intr_enable()/rte_eth_dev_rx_queue_intr_disable(), that takes portid and queueid as arguments. This ethdev API then uses the mapped eventfds to invoke the corresponding EAL API, waiting for interrupt event notification from the kernel. A V6 patchset will be created for this.

--
David Marchand

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO
  2015-02-25 15:29         ` Zhou, Danny
@ 2015-02-25 15:44           ` Thomas Monjalon
  2015-02-25 15:52           ` David Marchand
  1 sibling, 0 replies; 242+ messages in thread
From: Thomas Monjalon @ 2015-02-25 15:44 UTC (permalink / raw)
  To: Zhou, Danny; +Cc: dev

Please Danny, click on the button "uninstall Outlook"
or configure it to have quote marks.
This email is really hard to read.

2015-02-25 15:29, Zhou, Danny:
> From: David Marchand [mailto:david.marchand@6wind.com]
> Sent: Wednesday, February 25, 2015 6:22 PM
> To: Zhou, Danny
> Cc: dev@dpdk.org; Liang, Cunming
> Subject: Re: [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO
> 
> Hello Danny,
> 
> On Wed, Feb 25, 2015 at 7:58 AM, Zhou, Danny <danny.zhou@intel.com<mailto:danny.zhou@intel.com>> wrote:
> 
> +int
> +rte_intr_wait_rx_pkt(struct rte_intr_handle *intr_handle, uint8_t queue_id)
> +{
> +       struct epoll_event ev;
> +       unsigned numfds = 0;
> +
> +       if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
> +               return -1;
> +       if (queue_id >= VFIO_MAX_QUEUE_ID)
> +               return -1;
> +
> +       /* create epoll fd */
> +       int pfd = epoll_create(1);
> +       if (pfd < 0) {
> +               RTE_LOG(ERR, EAL, "Cannot create epoll instance\n");
> +               return -1;
> +       }
> 
> Why recreate the epoll instance at each call to this function ?
> 
> DZ: To avoid recreating the epoll instance for each queue, the struct rte_intr_handle(or a new structure added to ethdev)
> should be extended by adding fields storing per-queue pfd. This way, it could reduce user/kernel context  switch overhead
> when calling epoll_create() each time.
> 
> Sounds good?
> 
> You don't need a epfd per queue. And hardcoding epfd == eventfd will give a not very usable api.
> 
> Plus, epoll is something linux-specific, so you can't move it out of eal/linux.
> I suppose you need an abstraction here (and in the future we could add something for bsd ?).
> 
> DZ: libev provides abstraction layer which is a good candidate to integrate, rather than
> reinventing one I think. The BSD support can be implemented in the files under
> lib\librte_eal\bsdapp\eal folder by calling BSD specific APIs. Maybe it is a good idea to introduce
> a separated component like OS Adaption Layer into EAL in the future once DPDK is widely adopted in
> BSD as well as Windows, then all DPDK components invoke Linux specific APIs could instead calling abstraction APIs.

EAL means Environment Abstraction Layer.
In my mind, OS is part of the environment.
DPDK components don't invoke Linux specific APIs, they use EAL!
What are you thinking about?

> Adding an abstraction here specifically for epoll does not resolve all the porting/migration problem in my mind.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt handling based on VFIO
  2015-02-25 15:29         ` Zhou, Danny
  2015-02-25 15:44           ` Thomas Monjalon
@ 2015-02-25 15:52           ` David Marchand
  1 sibling, 0 replies; 242+ messages in thread
From: David Marchand @ 2015-02-25 15:52 UTC (permalink / raw)
  To: Zhou, Danny; +Cc: dev

On Wed, Feb 25, 2015 at 4:29 PM, Zhou, Danny <danny.zhou@intel.com> wrote:

>
>
>
>
> *From:* David Marchand [mailto:david.marchand@6wind.com]
> *Sent:* Wednesday, February 25, 2015 6:22 PM
> *To:* Zhou, Danny
> *Cc:* dev@dpdk.org; Liang, Cunming
> *Subject:* Re: [dpdk-dev] [PATCH v5 5/6] eal: add per rx queue interrupt
> handling based on VFIO
>
>
>
>
>
> DZ: To avoid recreating the epoll instance for each queue, the struct
> rte_intr_handle(or a new structure added to ethdev)
>
> should be extended by adding fields storing per-queue pfd. This way, it
> could reduce user/kernel context  switch overhead
>
> when calling epoll_create() each time.
>
>
>
> Sounds good?
>
>
>
> You don't need a epfd per queue. And hardcoding epfd == eventfd will give
> a not very usable api.
>
>
>
> Plus, epoll is something linux-specific, so you can't move it out of
> eal/linux.
>
> I suppose you need an abstraction here (and in the future we could add
> something for bsd ?).
>
>
>
> DZ: libev provides abstraction layer which is a good candidate to
> integrate, rather than
>
> reinventing one I think. The BSD support can be implemented in the files
> under
>
> lib\librte_eal\bsdapp\eal folder by calling BSD specific APIs. Maybe it is
> a good idea to introduce
>
> a separated component like OS Adaption Layer into EAL in the future once
> DPDK is widely adopted in
>
> BSD as well as Windows, then all DPDK components invoke Linux specific
> APIs could instead calling abstraction APIs.
>
>
>
> Adding an abstraction here specifically for epoll does not resolve all the
> porting/migration problem in my mind.
>

Yes, reusing this kind of library (or libevent) looks like a good idea.

Hum, I would say eal/common is there for the common part and for the
different abstractions.
Do you see anything that would not fit in ?



>  eventfds creation can not be handled by ethdev, since it needs
> infrastructure and informations from within the eal/linux.
>
> Again, do we need an abstraction ?
>
>
>
> ethdev must be the one that does the mappings between port/queue and
> eventfds (or any object that represents a way to wake up for a given
> port/queue).
>
>
>
> DZ: agreed after revisiting code. Let us follow your direction to create a
> ethdev API, similar to
> rte_eth_dev_rx_queue_intr_enable()/rte_eth_dev_rx_queue_intr_disable(), to
> use portiid and queueid as arguments. Then this ethdev API uses the mapped
> eventfds to invoke corresponding EAL API, waiting for interrupt event
> notification from kernel.  A V6 patchset will be created for this.
>

Ok, I will look at it when available.


-- 
David Marchand

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD
  2015-02-23 16:55 [dpdk-dev] [PATCH v5 0/6] Interrupt mode PMD Zhou Danny
                   ` (5 preceding siblings ...)
  2015-02-23 16:55 ` [dpdk-dev] [PATCH v5 6/6] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Zhou Danny
@ 2015-02-27  4:56 ` Cunming Liang
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 1/8] eal: declare new interrupt api Cunming Liang
                     ` (11 more replies)
  6 siblings, 12 replies; 242+ messages in thread
From: Cunming Liang @ 2015-02-27  4:56 UTC (permalink / raw)
  To: dev

v6 changes
 - split rte_intr_wait_rx_pkt into two APIs 'wait' and 'set'.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set.
 - using vector number instead of queue_id as interrupt API params.
 - patch reorder and split.

v5 changes
 - Rebase the patchset onto the HEAD
 - Isolate ethdev from EAL for new-added wait-for-rx interrupt function
 - Export wait-for-rx interrupt function for shared libraries
 - Split-off a new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect
 - Change sample application to accommodate EAL function spec change
   accordingly

v4 changes
 - Export interrupt enable/disable functions for shared libraries
 - Adjust position of new-added structure fields and functions to
   avoid breaking ABI
 
v3 changes
 - Add return value for interrupt enable/disable functions
 - Move spinlock from PMD to L3fwd-power
 - Remove unnecessary variables in e1000_mac_info
 - Fix miscellaneous review comments
 
v2 changes
 - Fix compilation issue in Makefile for missed header file.
 - Consolidate internal and community review comments of v1 patch set.
 
This patch series introduces a low-latency one-shot rx interrupt into DPDK,
with an example of polling and interrupt mode switch control.
 
DPDK userspace interrupt notification and handling mechanism is based on UIO
with the limitations below:
1) It is designed to handle LSC interrupt only with inefficient suspended
   pthread wakeup procedure (e.g. UIO wakes up LSC interrupt handling thread
   which then wakes up DPDK polling thread). In this way, it introduces
   non-deterministic wakeup latency for DPDK polling thread as well as packet
   latency if it is used to handle Rx interrupt.
2) UIO only supports a single interrupt vector which has to be shared by
   LSC interrupt and interrupts assigned to dedicated rx queues.
 
This patchset includes below features:
1) Enable one-shot rx queue interrupt in ixgbe PMD(PF & VF) and igb PMD(PF only).
2) Build on top of the VFIO mechanism instead of UIO, so it could support
   up to 64 interrupt vectors for rx queue interrupts.
3) Have one DPDK polling thread handle each Rx queue interrupt with a dedicated
   VFIO eventfd, which eliminates non-deterministic pthread wakeup latency in
   user space.
4) Demonstrate interrupt control APIs and userspace NAPI-like polling/interrupt
   switch algorithms in L3fwd-power example.

Known limitations:
1) It does not work for UIO, because sharing a single interrupt eventfd between
   the LSC and rx queue interrupt handlers causes a mess.
2) LSC interrupt is not supported by VF driver, so it is by default disabled
   in L3fwd-power now. Feel free to turn it on if you want to support both LSC
   and rx queue interrupts on a PF.

Cunming Liang (5):
  eal: declare new interrupt api
  eal/linux: add rx queue interrupt FDs to intr handle struct
  eal/bsd: dummy for new intr definition
  eal/linux: add per rx queue interrupt handling based on VFIO
  ethdev: add rx interrupt enable/disable functions

Zhou, Danny (3):
  ixgbe: enable rx queue interrupts for both PF and VF
  igb: enable rx queue interrupts for PF
  l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode
    switch

 examples/l3fwd-power/main.c                        | 194 ++++++++---
 lib/librte_eal/bsdapp/eal/eal_interrupts.c         |  15 +
 .../bsdapp/eal/include/exec-env/rte_interrupts.h   |   4 +
 lib/librte_eal/bsdapp/eal/rte_eal_version.map      |   2 +
 lib/librte_eal/common/include/rte_interrupts.h     |  38 +++
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 224 +++++++++---
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c         |  23 +-
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |   9 +
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   2 +
 lib/librte_ether/rte_ethdev.c                      |  66 ++++
 lib/librte_ether/rte_ethdev.h                      |  77 +++++
 lib/librte_ether/rte_ether_version.map             |   3 +
 lib/librte_pmd_e1000/e1000_ethdev.h                |   3 +
 lib/librte_pmd_e1000/igb_ethdev.c                  | 231 +++++++++++--
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c                | 377 ++++++++++++++++++++-
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h                |   7 +
 16 files changed, 1156 insertions(+), 119 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v6 1/8] eal: declare new interrupt api
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
@ 2015-02-27  4:56   ` Cunming Liang
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle struct Cunming Liang
                     ` (10 subsequent siblings)
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-02-27  4:56 UTC (permalink / raw)
  To: dev

Add two APIs, *rte_intr_rx_wait* and *rte_intr_rx_set*, for RX interrupt.
They are only available with VFIO_MSIX.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
 lib/librte_eal/common/include/rte_interrupts.h | 38 ++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/lib/librte_eal/common/include/rte_interrupts.h b/lib/librte_eal/common/include/rte_interrupts.h
index 609c34b..6e31f1d 100644
--- a/lib/librte_eal/common/include/rte_interrupts.h
+++ b/lib/librte_eal/common/include/rte_interrupts.h
@@ -45,6 +45,8 @@
 extern "C" {
 #endif
 
+#include <stdint.h>
+
 /** Interrupt handle */
 struct rte_intr_handle;
 
@@ -54,6 +56,8 @@ typedef void (*rte_intr_callback_fn)(struct rte_intr_handle *intr_handle,
 
 #include <exec-env/rte_interrupts.h>
 
+#define RTE_EPOLL_FD_ANY        -1  /**< to hint using per thread epfd */
+
 /**
  * It registers the callback for the specific interrupt. Multiple
  * callbacks cal be registered at the same time.
@@ -113,6 +117,40 @@ int rte_intr_enable(struct rte_intr_handle *intr_handle);
  */
 int rte_intr_disable(struct rte_intr_handle *intr_handle);
 
+/**
+ * @param intr_handle
+ *   pointer to the interrupt handle.
+ * @param fd
+ *   epoll fd the thread waits on, or RTE_EPOLL_FD_ANY for the per-thread epfd.
+ * @param vec
+ *   array for all the ready vectors.
+ * @param num
+ *   the max number of vector array.
+ * @return
+ *   - On success, returns the number of available vectors.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_rx_wait(struct rte_intr_handle *intr_handle,
+		 int fd, uint32_t *vec, uint16_t num);
+
+/**
+ * @param intr_handle
+ *   pointer to the interrupt handle.
+ * @param fd
+ *   epoll fd to operate on, or RTE_EPOLL_FD_ANY for the per-thread epfd.
+ * @param op
+ *   operation type of {ADD, DEL}.
+ * @param vec
+ *   vector number added to the event wait list.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_rx_set(struct rte_intr_handle *intr_handle,
+		int fd, int op, uint32_t vec);
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle struct
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 1/8] eal: declare new interrupt api Cunming Liang
@ 2015-02-27  4:56   ` Cunming Liang
  2015-02-27 10:33     ` David Marchand
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 3/8] eal/bsd: dummy for new intr definition Cunming Liang
                     ` (9 subsequent siblings)
  11 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-02-27  4:56 UTC (permalink / raw)
  To: dev

The per-vector event fds are stored in rte_intr_handle during init.
Device drivers are responsible for filling the queue-to-vector mapping
table (vec_num[]).

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v6 changes:
 - add mapping table between irq vector number and queue id.

v5 changes:
 - Create this new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect.

 lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 6a159c7..9f45377 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,9 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_
 
+#define VFIO_MAX_RXTX_INTR_ID        32
+#define VFIO_MAX_QUEUE_ID            VFIO_MAX_RXTX_INTR_ID
+
 enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_UNKNOWN = 0,
 	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
@@ -48,6 +51,9 @@ enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_MAX
 };
 
+#define RTE_INTR_EVENT_ADD            1UL
+#define	RTE_INTR_EVENT_DEL            2UL
+
 /** Handle for interrupts. */
 struct rte_intr_handle {
 	union {
@@ -57,6 +63,9 @@ struct rte_intr_handle {
 	};
 	int fd;	 /**< interrupt event file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+	int max_intr;                    /**< max interrupt requested */
+	int efds[VFIO_MAX_RXTX_INTR_ID]; /**< rxtx intr event fd for vfio */
+	uint32_t vec_num[VFIO_MAX_QUEUE_ID]; /**< rxtx intr vector number */
 };
 
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v6 3/8] eal/bsd: dummy for new intr definition
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 1/8] eal: declare new interrupt api Cunming Liang
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle struct Cunming Liang
@ 2015-02-27  4:56   ` Cunming Liang
  2015-02-27  9:59     ` David Marchand
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 4/8] eal/linux: add per rx queue interrupt handling based on VFIO Cunming Liang
                     ` (8 subsequent siblings)
  11 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-02-27  4:56 UTC (permalink / raw)
  To: dev

Add dummy definitions so that the BSD build compiles with the new intr changes.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
 lib/librte_eal/bsdapp/eal/eal_interrupts.c                | 15 +++++++++++++++
 .../bsdapp/eal/include/exec-env/rte_interrupts.h          |  4 ++++
 lib/librte_eal/bsdapp/eal/rte_eal_version.map             |  2 ++
 3 files changed, 21 insertions(+)

diff --git a/lib/librte_eal/bsdapp/eal/eal_interrupts.c b/lib/librte_eal/bsdapp/eal/eal_interrupts.c
index cb7d4f1..21fab1a 100644
--- a/lib/librte_eal/bsdapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/bsdapp/eal/eal_interrupts.c
@@ -69,3 +69,18 @@ rte_eal_intr_init(void)
 	return 0;
 }
 
+int
+rte_intr_rx_wait(struct rte_intr_handle *intr_handle __rte_unused,
+		 int fd __rte_unused, uint32_t *vec __rte_unused,
+		 uint16_t num __rte_unused)
+{
+	return -ENOTSUP;
+}
+
+int
+rte_intr_rx_set(struct rte_intr_handle *intr_handle __rte_unused,
+		int fd __rte_unused, int op __rte_unused,
+		uint32_t vec __rte_unused)
+{
+	return -ENOTSUP;
+}
diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
index 87a9cf6..b114aac 100644
--- a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,8 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_
 
+#define VFIO_MAX_RXTX_INTR_ID        32
+
 enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_UNKNOWN = 0,
 	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
@@ -49,6 +51,8 @@ enum rte_intr_handle_type {
 struct rte_intr_handle {
 	int fd;                          /**< file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+	int max_intr;                    /**< max interrupt requested */
+	uint32_t vec_num[VFIO_MAX_QUEUE_ID]; /**< rxtx intr vector number */
 };
 
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
index 17515a9..bf92029 100644
--- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
@@ -58,6 +58,8 @@ DPDK_2.0 {
 	rte_intr_callback_unregister;
 	rte_intr_disable;
 	rte_intr_enable;
+	rte_intr_rx_set;
+	rte_intr_rx_wait;
 	rte_log;
 	rte_log_add_in_history;
 	rte_log_cur_msg_loglevel;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v6 4/8] eal/linux: add per rx queue interrupt handling based on VFIO
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
                     ` (2 preceding siblings ...)
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 3/8] eal/bsd: dummy for new intr definition Cunming Liang
@ 2015-02-27  4:56   ` Cunming Liang
  2015-02-27 10:33     ` David Marchand
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 5/8] ethdev: add rx interrupt enable/disable functions Cunming Liang
                     ` (7 subsequent siblings)
  11 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-02-27  4:56 UTC (permalink / raw)
  To: dev

This patch does the following:
 - Create multiple VFIO eventfds for rx queues.
 - Handle per rx queue interrupt.
 - Eliminate the unnecessary wakeup mechanism for a suspended DPDK polling
   thread on rx interrupt by allowing the polling thread to epoll_wait on
   the rx queue interrupt notification.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v6 changes
 - split rte_intr_wait_rx_pkt into two functions, wait and set.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set to remove queue visibility on eal.
 - rte_intr_rx_wait to support multiplexing.
 - allow epfd as input to support flexible event fd combination.

v5 changes
 - Rebase the patchset onto the HEAD
 - Isolate ethdev from EAL for new-added wait-for-rx interrupt function
 - Export wait-for-rx interrupt function for shared libraries

v4 changes:
 - Adjust position of new-added structure fields

v3 changes:
 - Fix review comments

v2 changes:
 - Fix compilation issue for a missed header file
 - Bug fix: free unreleased resources on the exception path before return
 - Consolidate coding style related review comments

 lib/librte_eal/linuxapp/eal/eal_interrupts.c    | 224 +++++++++++++++++++-----
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c      |  23 ++-
 lib/librte_eal/linuxapp/eal/rte_eal_version.map |   2 +
 3 files changed, 201 insertions(+), 48 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 8c5b834..f90c2b4 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -44,6 +44,7 @@
 #include <sys/epoll.h>
 #include <sys/signalfd.h>
 #include <sys/ioctl.h>
+#include <assert.h>
 
 #include <rte_common.h>
 #include <rte_interrupts.h>
@@ -70,6 +71,8 @@
 
 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
 
+static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
+
 /**
  * union for pipe fds.
  */
@@ -127,6 +130,9 @@ static pthread_t intr_thread;
 #ifdef VFIO_PRESENT
 
 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
+/* irq set buffer length for queue interrupts and LSC interrupt */
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+			      sizeof(int) * (VFIO_MAX_RXTX_INTR_ID + 1))
 
 /* enable legacy (INTx) interrupts */
 static int
@@ -218,10 +224,10 @@ vfio_disable_intx(struct rte_intr_handle *intr_handle) {
 	return 0;
 }
 
-/* enable MSI-X interrupts */
+/* enable MSI interrupts */
 static int
 vfio_enable_msi(struct rte_intr_handle *intr_handle) {
-	int len, ret;
+	int len, ret, max_intr;
 	char irq_set_buf[IRQ_SET_BUF_LEN];
 	struct vfio_irq_set *irq_set;
 	int *fd_ptr;
@@ -230,12 +236,19 @@ vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 
 	irq_set = (struct vfio_irq_set *) irq_set_buf;
 	irq_set->argsz = len;
-	irq_set->count = 1;
+	if ((!intr_handle->max_intr) ||
+		(intr_handle->max_intr > VFIO_MAX_RXTX_INTR_ID))
+		max_intr = VFIO_MAX_RXTX_INTR_ID + 1;
+	else
+		max_intr = intr_handle->max_intr;
+
+	irq_set->count = max_intr;
 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
 	irq_set->start = 0;
 	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
+	memcpy(fd_ptr, intr_handle->efds, sizeof(intr_handle->efds));
+	fd_ptr[max_intr - 1] = intr_handle->fd;
 
 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 
@@ -244,27 +257,10 @@ vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 						intr_handle->fd);
 		return -1;
 	}
-
-	/* manually trigger interrupt to enable it */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error triggering MSI interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
-/* disable MSI-X interrupts */
+/* disable MSI interrupts */
 static int
 vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 	struct vfio_irq_set *irq_set;
@@ -292,8 +288,8 @@ vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 /* enable MSI-X interrupts */
 static int
 vfio_enable_msix(struct rte_intr_handle *intr_handle) {
-	int len, ret;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
+	int len, ret, max_intr;
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	struct vfio_irq_set *irq_set;
 	int *fd_ptr;
 
@@ -301,12 +297,19 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 
 	irq_set = (struct vfio_irq_set *) irq_set_buf;
 	irq_set->argsz = len;
-	irq_set->count = 1;
+	if ((!intr_handle->max_intr) ||
+		(intr_handle->max_intr > VFIO_MAX_RXTX_INTR_ID))
+		max_intr = VFIO_MAX_RXTX_INTR_ID + 1;
+	else
+		max_intr = intr_handle->max_intr;
+
+	irq_set->count = max_intr;
 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 	irq_set->start = 0;
 	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
+	memcpy(fd_ptr, intr_handle->efds, sizeof(intr_handle->efds));
+	fd_ptr[max_intr - 1] = intr_handle->fd;
 
 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 
@@ -316,22 +319,6 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 		return -1;
 	}
 
-	/* manually trigger interrupt to enable it */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error triggering MSI-X interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
@@ -339,7 +326,7 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 static int
 vfio_disable_msix(struct rte_intr_handle *intr_handle) {
 	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	int len, ret;
 
 	len = sizeof(struct vfio_irq_set);
@@ -860,3 +847,154 @@ rte_eal_intr_init(void)
 	return -ret;
 }
 
+static void
+eal_intr_process_rxtx_interrupts(struct rte_intr_handle *intr_handle,
+				 struct epoll_event *events,
+				 uint32_t *vec, int nfds)
+{
+	int i, bytes_read;
+	union rte_intr_read_buffer buf;
+	int fd;
+
+	for (i = 0; i < nfds; i++) {
+		/* set the length to be read for different handle type */
+		switch (intr_handle->type) {
+		case RTE_INTR_HANDLE_UIO:
+			bytes_read = sizeof(buf.uio_intr_count);
+			break;
+		case RTE_INTR_HANDLE_ALARM:
+			bytes_read = sizeof(buf.timerfd_num);
+			break;
+#ifdef VFIO_PRESENT
+		case RTE_INTR_HANDLE_VFIO_MSIX:
+		case RTE_INTR_HANDLE_VFIO_MSI:
+		case RTE_INTR_HANDLE_VFIO_LEGACY:
+			bytes_read = sizeof(buf.vfio_intr_count);
+			break;
+#endif
+		default:
+			bytes_read = 1;
+			break;
+		}
+
+		/**
+		* read out to clear the ready-to-be-read flag
+		* for epoll_wait.
+		*/
+		vec[i] = events[i].data.u32;
+		assert(vec[i] < VFIO_MAX_RXTX_INTR_ID);
+
+		fd = intr_handle->efds[vec[i]];
+		bytes_read = read(fd, &buf, bytes_read);
+		if (bytes_read < 0)
+			RTE_LOG(ERR, EAL, "Error reading from file "
+				"descriptor %d: %s\n", fd, strerror(errno));
+		else if (bytes_read == 0)
+			RTE_LOG(ERR, EAL, "Read nothing from file "
+				"descriptor %d\n", fd);
+	}
+}
+
+static int init_tls_epfd(void)
+{
+	int pfd = epoll_create(1);
+	if (pfd < 0) {
+		RTE_LOG(ERR, EAL,
+			"Cannot create epoll instance\n");
+		return -1;
+	}
+	return pfd;
+}
+
+int
+rte_intr_rx_wait(struct rte_intr_handle *intr_handle, int epfd,
+		 uint32_t *vec, uint16_t num)
+{
+#define MAX_EVENTS      8
+	struct epoll_event events[MAX_EVENTS];
+	int ret, nfds = 0;
+
+	if (!intr_handle || !vec) {
+		RTE_LOG(ERR, EAL, "invalid input parameter\n");
+		return -1;
+	}
+
+	if (intr_handle->type != RTE_INTR_HANDLE_VFIO_MSIX) {
+		RTE_LOG(ERR, EAL, "intr type should be VFIO_MSIX\n");
+		return -1;
+	}
+
+	if (epfd == RTE_EPOLL_FD_ANY) {
+		/* using per thread epoll fd */
+		if (unlikely(RTE_PER_LCORE(_epfd) == -1))
+			RTE_PER_LCORE(_epfd) = init_tls_epfd();
+		epfd = RTE_PER_LCORE(_epfd);
+	}
+
+	do {
+		ret = epoll_wait(epfd, events,
+				 RTE_MIN(num, MAX_EVENTS),
+				 EAL_INTR_EPOLL_WAIT_FOREVER);
+		if (unlikely(ret < 0)) {
+			/* epoll_wait fail */
+			RTE_LOG(ERR, EAL, "epoll_wait returns with fail\n");
+			return -1;
+		} else if (ret > 0) {
+			/* epoll_wait has at least one fd ready to read */
+			eal_intr_process_rxtx_interrupts(intr_handle, events,
+							 vec, ret);
+			num -= ret;
+			vec += ret;
+			nfds += ret;
+		} else if (nfds > 0)
+			break;
+	} while (num > 0);
+
+	return nfds;
+}
+
+int
+rte_intr_rx_set(struct rte_intr_handle *intr_handle, int epfd,
+		int op, uint32_t vec)
+{
+	struct epoll_event ev;
+
+	if (!intr_handle || vec >= VFIO_MAX_RXTX_INTR_ID) {
+		RTE_LOG(ERR, EAL, "invalid input parameter\n");
+		return -1;
+	}
+
+	if (intr_handle->type != RTE_INTR_HANDLE_VFIO_MSIX) {
+		RTE_LOG(ERR, EAL, "intr type should be VFIO_MSIX\n");
+		return -1;
+	}
+
+	switch (op) {
+	case RTE_INTR_EVENT_ADD:
+		op = EPOLL_CTL_ADD;
+		break;
+	case RTE_INTR_EVENT_DEL:
+		op = EPOLL_CTL_DEL;
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "event op type mismatch\n");
+		return -1;
+	}
+
+	if (epfd == RTE_EPOLL_FD_ANY) {
+		/* using per thread epoll fd */
+		if (RTE_PER_LCORE(_epfd) == -1)
+			RTE_PER_LCORE(_epfd) = init_tls_epfd();
+		epfd = RTE_PER_LCORE(_epfd);
+	}
+
+	ev.data.u32 = vec;
+	ev.events = EPOLLIN | EPOLLPRI;
+	if (epoll_ctl(epfd, op, intr_handle->efds[vec], &ev) < 0) {
+		RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
+			op, intr_handle->efds[vec], strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index ee9660f..d90d23c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -38,6 +38,7 @@
 #include <sys/socket.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/epoll.h>
 
 #include <rte_log.h>
 #include <rte_pci.h>
@@ -274,16 +275,18 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
 		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
 		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "  cannot get IRQ info, "
-					"error %i (%s)\n", errno, strerror(errno));
+				"error %i (%s)\n", errno, strerror(errno));
 			return -1;
 		}
 
 		/* if this vector cannot be used with eventfd, fail if we explicitly
 		 * specified interrupt type, otherwise continue */
 		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
-			if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE) {
+			if (internal_config.vfio_intr_mode !=
+			    RTE_INTR_MODE_NONE) {
 				RTE_LOG(ERR, EAL,
-						"  interrupt vector does not support eventfd!\n");
+					"  interrupt vector "
+					"does not support eventfd!\n");
 				return -1;
 			} else
 				continue;
@@ -293,17 +296,27 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
 		fd = eventfd(0, 0);
 		if (fd < 0) {
 			RTE_LOG(ERR, EAL, "  cannot set up eventfd, "
-					"error %i (%s)\n", errno, strerror(errno));
+				"error %i (%s)\n", errno, strerror(errno));
 			return -1;
 		}
 
 		dev->intr_handle.fd = fd;
 		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
-
 		switch (i) {
 		case VFIO_PCI_MSIX_IRQ_INDEX:
 			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
 			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
+			for (i = 0; i < VFIO_MAX_RXTX_INTR_ID; i++) {
+				fd = eventfd(0, 0);
+				if (fd < 0) {
+					RTE_LOG(ERR, EAL,
+						"cannot setup eventfd,"
+						"error %i (%s)\n",
+						errno, strerror(errno));
+					return -1;
+				}
+				dev->intr_handle.efds[i] = fd;
+			}
 			break;
 		case VFIO_PCI_MSI_IRQ_INDEX:
 			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSI;
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 5f1857d..892a452 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -64,6 +64,8 @@ DPDK_2.0 {
 	rte_intr_callback_unregister;
 	rte_intr_disable;
 	rte_intr_enable;
+	rte_intr_rx_set;
+	rte_intr_rx_wait;
 	rte_log;
 	rte_log_add_in_history;
 	rte_log_cur_msg_loglevel;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v6 5/8] ethdev: add rx interrupt enable/disable functions
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
                     ` (3 preceding siblings ...)
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 4/8] eal/linux: add per rx queue interrupt handling based on VFIO Cunming Liang
@ 2015-02-27  4:56   ` Cunming Liang
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 6/8] ixgbe: enable rx queue interrupts for both PF and VF Cunming Liang
                     ` (6 subsequent siblings)
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-02-27  4:56 UTC (permalink / raw)
  To: dev

Add three ethdev functions: two to enable and disable rx queue interrupts
(backed by new dev_ops), and one to retrieve the vector number which the
specified queue is associated with.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v6 changes
 - add rx_intr_vec_get to retrieve the vector num of the queue.

v5 changes
 - Rebase the patchset onto the HEAD

v4 changes
 - Export interrupt enable/disable functions for shared libraries
 - Put new functions at the end of eth_dev_ops to avoid breaking ABI

v3 changes
 - Add return value for interrupt enable/disable functions

 lib/librte_ether/rte_ethdev.c          | 66 +++++++++++++++++++++++++++++
 lib/librte_ether/rte_ethdev.h          | 77 ++++++++++++++++++++++++++++++++++
 lib/librte_ether/rte_ether_version.map |  3 ++
 3 files changed, 146 insertions(+)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index bb94ccb..6654917 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -3320,6 +3320,72 @@ _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 	}
 	rte_spinlock_unlock(&rte_eth_dev_cb_lock);
 }
+
+int
+rte_eth_dev_rx_intr_vec_get(uint8_t port_id, uint16_t queue_id,
+			    uint32_t *vec)
+{
+	struct rte_eth_dev *dev;
+	struct rte_intr_handle *intr_handle;
+
+	if (port_id >= nb_ports) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) {
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	intr_handle = &dev->pci_dev->intr_handle;
+	*vec = intr_handle->vec_num[queue_id];
+	return 0;
+}
+
+int
+rte_eth_dev_rx_intr_enable(uint8_t port_id,
+			   uint16_t queue_id)
+{
+	struct rte_eth_dev *dev;
+
+	if (port_id >= nb_ports) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) {
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_enable, -ENOTSUP);
+	return (*dev->dev_ops->rx_queue_intr_enable)(dev, queue_id);
+}
+
+int
+rte_eth_dev_rx_intr_disable(uint8_t port_id,
+			    uint16_t queue_id)
+{
+	struct rte_eth_dev *dev;
+
+	if (port_id >= nb_ports) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) {
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_disable, -ENOTSUP);
+	return (*dev->dev_ops->rx_queue_intr_disable)(dev, queue_id);
+}
+
 #ifdef RTE_NIC_BYPASS
 int rte_eth_dev_bypass_init(uint8_t port_id)
 {
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 8db3127..9cdde82 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -825,6 +825,8 @@ struct rte_eth_fdir {
 struct rte_intr_conf {
 	/** enable/disable lsc interrupt. 0 (default) - disable, 1 enable */
 	uint16_t lsc;
+	/** enable/disable rxq interrupt. 0 (default) - disable, 1 enable */
+	uint16_t rxq;
 };
 
 /**
@@ -1030,6 +1032,14 @@ typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
 				    const struct rte_eth_txconf *tx_conf);
 /**< @internal Setup a transmit queue of an Ethernet device. */
 
+typedef int (*eth_rx_enable_intr_t)(struct rte_eth_dev *dev,
+				    uint16_t rx_queue_id);
+/**< @internal Enable interrupt of a receive queue of an Ethernet device. */
+
+typedef int (*eth_rx_disable_intr_t)(struct rte_eth_dev *dev,
+				    uint16_t rx_queue_id);
+/**< @internal Disable interrupt of a receive queue of an Ethernet device. */
+
 typedef void (*eth_queue_release_t)(void *queue);
 /**< @internal Release memory resources allocated by given RX/TX queue. */
 
@@ -1381,6 +1391,10 @@ struct eth_dev_ops {
 	/** Get current RSS hash configuration. */
 	rss_hash_conf_get_t rss_hash_conf_get;
 	eth_filter_ctrl_t              filter_ctrl;          /**< common filter control*/
+
+	/** Enable/disable Rx queue interrupt. */
+	eth_rx_enable_intr_t       rx_queue_intr_enable; /**< Enable Rx queue interrupt. */
+	eth_rx_disable_intr_t      rx_queue_intr_disable; /**< Disable Rx queue interrupt.*/
 };
 
 /**
@@ -2846,6 +2860,69 @@ void _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 				enum rte_eth_event_type event);
 
 /**
+ * When no rx packet has arrived on an Rx queue for a long time, the lcore
+ * polling that queue can sleep to save power, with the rx interrupt enabled
+ * so that it is triggered when an rx packet arrives.
+ *
+ * The rte_eth_dev_rx_intr_enable() function enables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue whose rx interrupt is to be enabled.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ *     that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_intr_enable(uint8_t port_id,
+			       uint16_t queue_id);
+
+/**
+ * When an lcore is woken by an rx interrupt signalling incoming packets,
+ * disable the rx interrupt and return to polling mode.
+ *
+ * The rte_eth_dev_rx_intr_disable() function disables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue whose rx interrupt is to be disabled.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ *     that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_intr_disable(uint8_t port_id,
+				uint16_t queue_id);
+
+/**
+ * It retrieves the interrupt vector number on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue whose interrupt vector is queried.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @param vec
+ *   The interrupt vector number of the specified queue.
+ * @return
+ *   - (0) if successful.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_intr_vec_get(uint8_t port_id, uint16_t queue_id,
+				uint32_t *vec);
+
+/**
  * Turn on the LED on the Ethernet device.
  * This function turns on the LED on the Ethernet device.
  *
diff --git a/lib/librte_ether/rte_ether_version.map b/lib/librte_ether/rte_ether_version.map
index 0d46578..7f93156 100644
--- a/lib/librte_ether/rte_ether_version.map
+++ b/lib/librte_ether/rte_ether_version.map
@@ -47,6 +47,9 @@ DPDK_2.0 {
 	rte_eth_dev_rss_hash_update;
 	rte_eth_dev_rss_reta_query;
 	rte_eth_dev_rss_reta_update;
+	rte_eth_dev_rx_intr_disable;
+	rte_eth_dev_rx_intr_enable;
+	rte_eth_dev_rx_intr_vec_get;
 	rte_eth_dev_rx_queue_start;
 	rte_eth_dev_rx_queue_stop;
 	rte_eth_dev_set_link_down;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v6 6/8] ixgbe: enable rx queue interrupts for both PF and VF
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
                     ` (4 preceding siblings ...)
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 5/8] ethdev: add rx interrupt enable/disable functions Cunming Liang
@ 2015-02-27  4:56   ` Cunming Liang
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 7/8] igb: enable rx queue interrupts for PF Cunming Liang
                     ` (5 subsequent siblings)
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-02-27  4:56 UTC (permalink / raw)
  To: dev

From: "Zhou, Danny" <danny.zhou@intel.com>

This patch does the following for the ixgbe PF and VF:
- Setup NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Yong Liu <yong.liu@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v6 changes
 - fill queue-vector mapping table

v5 changes
 - Rebase the patchset onto the HEAD

v3 changes
 - Remove spinlock from PMD

v2 changes
 - Consolidate review comments related to coding style

 lib/librte_pmd_ixgbe/ixgbe_ethdev.c | 377 +++++++++++++++++++++++++++++++++++-
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h |   7 +
 2 files changed, 380 insertions(+), 4 deletions(-)

diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
index 9bdc046..f5a0edd 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
@@ -83,6 +83,9 @@
  */
 #define IXGBE_FC_LO    0x40
 
+/* Default minimum inter-interrupt interval for EITR configuration */
+#define IXGBE_MIN_INTER_INTERRUPT_INTERVAL_DEFAULT    0x79E
+
 /* Timer value included in XOFF frames. */
 #define IXGBE_FC_PAUSE 0x680
 
@@ -173,6 +176,7 @@ static int ixgbe_dev_rss_reta_query(struct rte_eth_dev *dev,
 			uint16_t reta_size);
 static void ixgbe_dev_link_status_print(struct rte_eth_dev *dev);
 static int ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_action(struct rte_eth_dev *dev);
 static void ixgbe_dev_interrupt_handler(struct rte_intr_handle *handle,
@@ -186,11 +190,14 @@ static void ixgbe_dcb_init(struct ixgbe_hw *hw,struct ixgbe_dcb_config *dcb_conf
 /* For Virtual Function support */
 static int eth_ixgbevf_dev_init(struct eth_driver *eth_drv,
 		struct rte_eth_dev *eth_dev);
+static int ixgbevf_dev_interrupt_get_status(struct rte_eth_dev *dev);
+static int ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev);
 static int  ixgbevf_dev_configure(struct rte_eth_dev *dev);
 static int  ixgbevf_dev_start(struct rte_eth_dev *dev);
 static void ixgbevf_dev_stop(struct rte_eth_dev *dev);
 static void ixgbevf_dev_close(struct rte_eth_dev *dev);
 static void ixgbevf_intr_disable(struct ixgbe_hw *hw);
+static void ixgbevf_intr_enable(struct ixgbe_hw *hw);
 static void ixgbevf_dev_stats_get(struct rte_eth_dev *dev,
 		struct rte_eth_stats *stats);
 static void ixgbevf_dev_stats_reset(struct rte_eth_dev *dev);
@@ -200,6 +207,15 @@ static void ixgbevf_vlan_strip_queue_set(struct rte_eth_dev *dev,
 		uint16_t queue, int on);
 static void ixgbevf_vlan_offload_set(struct rte_eth_dev *dev, int mask);
 static void ixgbevf_set_vfta_all(struct rte_eth_dev *dev, bool on);
+static void ixgbevf_dev_interrupt_handler(struct rte_intr_handle *handle,
+		void *param);
+static int ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
+		uint16_t queue_id);
+static int ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
+		 uint16_t queue_id);
+static void ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+		 uint8_t queue, uint8_t msix_vector);
+static void ixgbevf_configure_msix(struct rte_eth_dev *dev);
 
 /* For Eth VMDQ APIs support */
 static int ixgbe_uc_hash_table_set(struct rte_eth_dev *dev, struct
@@ -217,6 +233,14 @@ static int ixgbe_mirror_rule_set(struct rte_eth_dev *dev,
 static int ixgbe_mirror_rule_reset(struct rte_eth_dev *dev,
 		uint8_t	rule_id);
 
+static int ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static int ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static void ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void ixgbe_configure_msix(struct rte_eth_dev *dev);
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 		uint16_t queue_idx, uint16_t tx_rate);
 static int ixgbe_set_vf_rate_limit(struct rte_eth_dev *dev, uint16_t vf,
@@ -265,7 +289,7 @@ static int ixgbevf_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu);
  */
 #define UPDATE_VF_STAT(reg, last, cur)	                        \
 {                                                               \
-	u32 latest = IXGBE_READ_REG(hw, reg);                   \
+	uint32_t latest = IXGBE_READ_REG(hw, reg);                   \
 	cur += latest - last;                                   \
 	last = latest;                                          \
 }
@@ -346,6 +370,8 @@ static struct eth_dev_ops ixgbe_eth_dev_ops = {
 	.tx_queue_start	      = ixgbe_dev_tx_queue_start,
 	.tx_queue_stop        = ixgbe_dev_tx_queue_stop,
 	.rx_queue_setup       = ixgbe_dev_rx_queue_setup,
+	.rx_queue_intr_enable = ixgbe_dev_rx_queue_intr_enable,
+	.rx_queue_intr_disable = ixgbe_dev_rx_queue_intr_disable,
 	.rx_queue_release     = ixgbe_dev_rx_queue_release,
 	.rx_queue_count       = ixgbe_dev_rx_queue_count,
 	.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
@@ -406,8 +432,11 @@ static struct eth_dev_ops ixgbevf_eth_dev_ops = {
 	.vlan_offload_set     = ixgbevf_vlan_offload_set,
 	.rx_queue_setup       = ixgbe_dev_rx_queue_setup,
 	.rx_queue_release     = ixgbe_dev_rx_queue_release,
+	.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
 	.tx_queue_setup       = ixgbe_dev_tx_queue_setup,
 	.tx_queue_release     = ixgbe_dev_tx_queue_release,
+	.rx_queue_intr_enable = ixgbevf_dev_rx_queue_intr_enable,
+	.rx_queue_intr_disable = ixgbevf_dev_rx_queue_intr_disable,
 	.mac_addr_add         = ixgbevf_add_mac_addr,
 	.mac_addr_remove      = ixgbevf_remove_mac_addr,
 };
@@ -904,6 +933,10 @@ eth_ixgbe_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
 			eth_dev->data->port_id, pci_dev->id.vendor_id,
 			pci_dev->id.device_id);
 
+	/* set max interrupt vfio request */
+	pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
+						IXGBE_MAX_OTHER_INTR;
+
 	rte_intr_callback_register(&(pci_dev->intr_handle),
 		ixgbe_dev_interrupt_handler, (void *)eth_dev);
 
@@ -1085,6 +1118,15 @@ eth_ixgbevf_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
 			return (-EIO);
 	}
 
+	/* set max interrupt vfio request */
+	pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
+						IXGBEVF_MAX_OTHER_INTR;
+
+	rte_intr_callback_register(&(pci_dev->intr_handle),
+		ixgbevf_dev_interrupt_handler, (void *)eth_dev);
+
+	rte_intr_enable(&(pci_dev->intr_handle));
+
 	PMD_INIT_LOG(DEBUG, "port %d vendorID=0x%x deviceID=0x%x mac.type=%s",
 		     eth_dev->data->port_id, pci_dev->id.vendor_id,
 		     pci_dev->id.device_id, "ixgbe_mac_82599_vf");
@@ -1486,6 +1528,9 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	ixgbe_pf_host_configure(dev);
 
+	/* configure msix for sleep until rx interrupt */
+	ixgbe_configure_msix(dev);
+
 	/* initialize transmission unit */
 	ixgbe_dev_tx_init(dev);
 
@@ -1561,6 +1606,10 @@ skip_link_setup:
 	if (dev->data->dev_conf.intr_conf.lsc != 0)
 		ixgbe_dev_lsc_interrupt_setup(dev);
 
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		ixgbe_dev_rxq_interrupt_setup(dev);
+
 	/* resume enabled intr since hw reset */
 	ixgbe_enable_intr(dev);
 
@@ -2238,6 +2287,28 @@ ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev)
 	return 0;
 }
 
+/**
+ * It adds IXGBE_EICR_RTX_QUEUE to the saved interrupt mask; no register is
+ * written here. Called when dev_conf.intr_conf.rxq is non-zero.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+static int
+ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	intr->mask |= IXGBE_EICR_RTX_QUEUE;
+
+	return 0;
+}
+
 /*
  * It reads ICR and sets flag (IXGBE_EICR_LSC) for the link_update.
  *
@@ -2264,10 +2335,10 @@ ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev)
 	PMD_DRV_LOG(INFO, "eicr %x", eicr);
 
 	intr->flags = 0;
-	if (eicr & IXGBE_EICR_LSC) {
-		/* set flag for async link update */
+
+	/* set flag for async link update */
+	if (eicr & IXGBE_EICR_LSC)
 		intr->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
-	}
 
 	if (eicr & IXGBE_EICR_MAILBOX)
 		intr->flags |= IXGBE_FLAG_MAILBOX;
@@ -2275,6 +2346,30 @@ ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int
+ixgbevf_dev_interrupt_get_status(struct rte_eth_dev *dev)
+{
+	uint32_t eicr;
+	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	/* clear all cause mask */
+	ixgbevf_intr_disable(hw);
+
+	/* read-on-clear nic registers here */
+	eicr = IXGBE_READ_REG(hw, IXGBE_VTEICR);
+	PMD_DRV_LOG(INFO, "eicr %x", eicr);
+
+	intr->flags = 0;
+
+	/* set flag for async link update */
+	if (eicr & IXGBE_EICR_LSC)
+		intr->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
+
+	return 0;
+}
+
 /**
  * It gets and then prints the link status.
  *
@@ -2370,6 +2465,18 @@ ixgbe_dev_interrupt_action(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int
+ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	PMD_DRV_LOG(DEBUG, "enable intr immediately");
+	ixgbevf_intr_enable(hw);
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
+	return 0;
+}
+
 /**
  * Interrupt handler which shall be registered for alarm callback for delayed
  * handling specific interrupt to wait for the stable nic state. As the
@@ -2431,6 +2538,15 @@ ixgbe_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
 	ixgbe_dev_interrupt_action(dev);
 }
 
+static void
+ixgbevf_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
+							void *param)
+{
+	struct rte_eth_dev *dev = (struct rte_eth_dev *)param;
+	ixgbevf_dev_interrupt_get_status(dev);
+	ixgbevf_dev_interrupt_action(dev);
+}
+
 static int
 ixgbe_dev_led_on(struct rte_eth_dev *dev)
 {
@@ -2929,6 +3045,19 @@ ixgbevf_intr_disable(struct ixgbe_hw *hw)
 	IXGBE_WRITE_FLUSH(hw);
 }
 
+static void
+ixgbevf_intr_enable(struct ixgbe_hw *hw)
+{
+	PMD_INIT_FUNC_TRACE();
+
+	/* VF enable interrupt autoclear */
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIAM, IXGBE_VF_IRQ_ENABLE_MASK);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIAC, IXGBE_VF_IRQ_ENABLE_MASK);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, IXGBE_VF_IRQ_ENABLE_MASK);
+
+	IXGBE_WRITE_FLUSH(hw);
+}
+
 static int
 ixgbevf_dev_configure(struct rte_eth_dev *dev)
 {
@@ -2991,6 +3120,11 @@ ixgbevf_dev_start(struct rte_eth_dev *dev)
 
 	ixgbevf_dev_rxtx_start(dev);
 
+	ixgbevf_configure_msix(dev);
+
+	/* Re-enable interrupt for VF */
+	ixgbevf_intr_enable(hw);
+
 	return 0;
 }
 
@@ -3528,6 +3662,241 @@ ixgbe_mirror_rule_reset(struct rte_eth_dev *dev, uint8_t rule_id)
 	return 0;
 }
 
+
+static int
+ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	mask = IXGBE_READ_REG(hw, IXGBE_VTEIMS);
+	mask |= (1 << queue_id);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask);
+
+	return 0;
+}
+
+static int
+ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	mask = IXGBE_READ_REG(hw, IXGBE_VTEIMS);
+	mask &= ~(1 << queue_id);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask);
+
+	return 0;
+}
+
+static int
+ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+	/* Enable the interrupt bit of rx queue queue_id; always returns 0. */
+	if (queue_id < 16) {
+		ixgbe_disable_intr(hw);
+		intr->mask |= (1 << queue_id);
+		ixgbe_enable_intr(dev);
+	} else if (queue_id < 32) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(0));
+		mask |= (1U << queue_id); /* OR the bit in, keep the others */
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+	} else if (queue_id < 64) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(1));
+		mask |= (1U << (queue_id - 32)); /* 1U: bit 31 is valid */
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+	}
+	/* queue_id >= 64 falls through: nothing is written */
+	return 0;
+}
+
+static int
+ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+	/* Drop the interrupt bit of rx queue queue_id; always returns 0. */
+	if (queue_id < 16) {
+		ixgbe_disable_intr(hw);
+		intr->mask &= ~(1 << queue_id);
+		ixgbe_enable_intr(dev);
+	} else if (queue_id < 32) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(0));
+		mask &= ~(1U << queue_id); /* 1U: bit 31 must not overflow */
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+	} else if (queue_id < 64) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(1));
+		mask &= ~(1U << (queue_id - 32));
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+	}
+	/* NOTE(review): check whether clearing needs EIMC_EX, not EIMS_EX */
+	return 0;
+}
+
+static void
+ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+			uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp, idx;
+	if (direction == -1) {
+		/* other causes */
+		msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+		tmp = IXGBE_READ_REG(hw, IXGBE_VTIVAR_MISC);
+		tmp &= ~0xFF;
+		tmp |= msix_vector;
+		IXGBE_WRITE_REG(hw, IXGBE_VTIVAR_MISC, tmp);
+	} else {
+		/* rx or tx cause */
+		msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+		idx = ((16 * (queue & 1)) + (8 * direction));
+		tmp = IXGBE_READ_REG(hw, IXGBE_VTIVAR(queue >> 1));
+		tmp &= ~(0xFF << idx);
+		tmp |= (msix_vector << idx);
+		IXGBE_WRITE_REG(hw, IXGBE_VTIVAR(queue >> 1), tmp);
+	}
+}
+
+/**
+ * set the IVAR registers, mapping interrupt causes to vectors
+ * @param hw
+ *  pointer to ixgbe_hw struct
+ * @direction
+ *  0 for Rx, 1 for Tx, -1 for other causes
+ * @queue
+ *  queue to map the corresponding interrupt to
+ * @msix_vector
+ *  the vector to map to the corresponding queue
+ */
+static void
+ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+			   uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp, idx;
+
+	msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+	if (hw->mac.type == ixgbe_mac_82598EB) {
+		if (direction == -1)
+			direction = 0;
+		idx = (((direction * 64) + queue) >> 2) & 0x1F;
+		tmp = IXGBE_READ_REG(hw, IXGBE_IVAR(idx));
+		tmp &= ~(0xFF << (8 * (queue & 0x3)));
+		tmp |= (msix_vector << (8 * (queue & 0x3)));
+		IXGBE_WRITE_REG(hw, IXGBE_IVAR(idx), tmp);
+	} else if ((hw->mac.type == ixgbe_mac_82599EB) ||
+			(hw->mac.type == ixgbe_mac_X540)) {
+		if (direction == -1) {
+			/* other causes */
+			idx = ((queue & 1) * 8);
+			tmp = IXGBE_READ_REG(hw, IXGBE_IVAR_MISC);
+			tmp &= ~(0xFF << idx);
+			tmp |= (msix_vector << idx);
+			IXGBE_WRITE_REG(hw, IXGBE_IVAR_MISC, tmp);
+		} else {
+			/* rx or tx causes */
+			idx = ((16 * (queue & 1)) + (8 * direction));
+			tmp = IXGBE_READ_REG(hw, IXGBE_IVAR(queue >> 1));
+			tmp &= ~(0xFF << idx);
+			tmp |= (msix_vector << idx);
+			IXGBE_WRITE_REG(hw, IXGBE_IVAR(queue >> 1), tmp);
+		}
+	}
+}
+
+static void
+ixgbevf_configure_msix(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t q_idx;
+	uint32_t vector_idx = 0;
+	/* Route every VF rx queue and the "other" cause to vector 0. */
+	/* Configure all RX queues of VF, including the last one */
+	for (q_idx = 0; q_idx < hw->mac.max_rx_queues; q_idx++) {
+		/* Force all queues to use vector 0,
+		 * as IXGBE_VF_MAXMSIVECTOR = 1 */
+		ixgbevf_set_ivar_map(hw, 0, q_idx, vector_idx);
+		intr_handle->vec_num[q_idx] = vector_idx;
+	}
+
+	/* Map the VF "other causes" entry (direction -1) to vector 0 */
+	ixgbevf_set_ivar_map(hw, -1, 1, vector_idx);
+}
+
+/**
+ * Sets up the hardware to properly generate MSI-X interrupts
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ */
+static void
+ixgbe_configure_msix(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	int queue_id;
+	uint32_t mask;
+	uint32_t gpie;
+
+	/* setup GPIE for MSI-x mode */
+	gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
+	gpie |= IXGBE_GPIE_MSIX_MODE | IXGBE_GPIE_PBA_SUPPORT |
+		   IXGBE_GPIE_OCD | IXGBE_GPIE_EIAME;
+	/*
+	* auto clearing and auto setting corresponding bits in EIMS
+	* when MSI-X interrupt is triggered
+	*/
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE);
+	else {
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(0), 0xFFFFFFFF);
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(1), 0xFFFFFFFF);
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie);
+
+	/*
+	* Populate the IVAR table and set the ITR values to the
+	* corresponding register.
+	*/
+	for (queue_id = 0; queue_id < VFIO_MAX_QUEUE_ID; queue_id++) {
+		ixgbe_set_ivar_map(hw, 0, queue_id, queue_id);
+		intr_handle->vec_num[queue_id] = queue_id;
+	}
+
+	switch (hw->mac.type) {
+	case ixgbe_mac_82598EB:
+		ixgbe_set_ivar_map(hw, -1, IXGBE_IVAR_OTHER_CAUSES_INDEX,
+			       VFIO_MAX_QUEUE_ID);
+		break;
+	case ixgbe_mac_82599EB:
+	case ixgbe_mac_X540:
+		ixgbe_set_ivar_map(hw, -1, 1, VFIO_MAX_QUEUE_ID);
+		break;
+	default:
+		break;
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_EITR(queue_id),
+			 IXGBE_MIN_INTER_INTERRUPT_INTERVAL_DEFAULT & 0xFFF);
+
+	/* set up to autoclear timer, and the vectors */
+	mask = IXGBE_EIMS_ENABLE_MASK;
+	mask &= ~(IXGBE_EIMS_OTHER |
+		  IXGBE_EIMS_MAILBOX |
+		  IXGBE_EIMS_LSC);
+
+	IXGBE_WRITE_REG(hw, IXGBE_EIAC, mask);
+}
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 	uint16_t queue_idx, uint16_t tx_rate)
 {
diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.h b/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
index a549f5c..1bdfbce 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
@@ -109,6 +109,12 @@
 	ETH_RSS_IPV6_TCP_EX | \
 	ETH_RSS_IPV6_UDP_EX)
 
+#define IXGBE_VF_IRQ_ENABLE_MASK        3          /* vf irq enable mask */
+#define IXGBE_VF_MAXMSIVECTOR           1
+/* maximum other interrupts besides rx&tx*/
+#define IXGBE_MAX_OTHER_INTR            1
+#define IXGBEVF_MAX_OTHER_INTR          1
+
 /*
  * Information about the fdir mode.
  */
@@ -317,6 +323,7 @@ uint32_t ixgbe_dev_rx_queue_count(struct rte_eth_dev *dev,
 		uint16_t rx_queue_id);
 
 int ixgbe_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);
+int ixgbevf_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);
 
 int ixgbe_dev_rx_init(struct rte_eth_dev *dev);
 
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v6 7/8] igb: enable rx queue interrupts for PF
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
                     ` (5 preceding siblings ...)
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 6/8] ixgbe: enable rx queue interrupts for both PF and VF Cunming Liang
@ 2015-02-27  4:56   ` Cunming Liang
  2015-03-20 20:51     ` Stephen Hemminger
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 8/8] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Cunming Liang
                     ` (4 subsequent siblings)
  11 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-02-27  4:56 UTC (permalink / raw)
  To: dev

From: "Zhou, Danny" <danny.zhou@intel.com>

This patch does the following for the igb PF:
- Setup NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v6 changes
 - fill queue-vector mapping table

v5 changes
 - Rebase the patchset onto the HEAD

v3 changes
 - Remove unnecessary variables in e1000_mac_info
 - Remove spinlock from PMD

v2 changes
 - Consolidate review comments related to coding style

 lib/librte_pmd_e1000/e1000_ethdev.h |   3 +
 lib/librte_pmd_e1000/igb_ethdev.c   | 231 ++++++++++++++++++++++++++++++++----
 2 files changed, 209 insertions(+), 25 deletions(-)

diff --git a/lib/librte_pmd_e1000/e1000_ethdev.h b/lib/librte_pmd_e1000/e1000_ethdev.h
index c451faa..13c4cad 100644
--- a/lib/librte_pmd_e1000/e1000_ethdev.h
+++ b/lib/librte_pmd_e1000/e1000_ethdev.h
@@ -108,6 +108,9 @@
 	ETH_RSS_IPV6_TCP_EX | \
 	ETH_RSS_IPV6_UDP_EX)
 
+/* maximum number of other interrupts besides Rx & Tx interrupts */
+#define E1000_MAX_OTHER_INTR		1
+
 /* structure for interrupt relative data */
 struct e1000_interrupt {
 	uint32_t flags;
diff --git a/lib/librte_pmd_e1000/igb_ethdev.c b/lib/librte_pmd_e1000/igb_ethdev.c
index 504ae74..7fe2e62 100644
--- a/lib/librte_pmd_e1000/igb_ethdev.c
+++ b/lib/librte_pmd_e1000/igb_ethdev.c
@@ -97,6 +97,7 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
 static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
 				struct rte_eth_fc_conf *fc_conf);
 static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
 static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
@@ -195,6 +196,16 @@ static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
 		     enum rte_filter_op filter_op,
 		     void *arg);
 
+static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void eth_igb_configure_msix_intr(struct rte_eth_dev *dev);
+static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
+				uint8_t index, uint8_t offset);
+
 /*
  * Define VF Stats MACRO for Non "cleared on read" register
  */
@@ -254,6 +265,8 @@ static struct eth_dev_ops eth_igb_ops = {
 	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
 	.vlan_offload_set     = eth_igb_vlan_offload_set,
 	.rx_queue_setup       = eth_igb_rx_queue_setup,
+	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
+	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
 	.rx_queue_release     = eth_igb_rx_queue_release,
 	.rx_queue_count       = eth_igb_rx_queue_count,
 	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
@@ -465,6 +478,7 @@ eth_igb_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(eth_dev->data->dev_private);
 	uint32_t ctrl_ext;
+	struct rte_eth_dev_info dev_info;
 
 	pci_dev = eth_dev->pci_dev;
 	eth_dev->dev_ops = &eth_igb_ops;
@@ -586,6 +600,13 @@ eth_igb_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
 		     eth_dev->data->port_id, pci_dev->id.vendor_id,
 		     pci_dev->id.device_id);
 
+	/* set max interrupt vfio request */
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(eth_dev, &dev_info);
+
+	pci_dev->intr_handle.max_intr = dev_info.max_rx_queues +
+						E1000_MAX_OTHER_INTR;
+
 	rte_intr_callback_register(&(pci_dev->intr_handle),
 		eth_igb_interrupt_handler, (void *)eth_dev);
 
@@ -755,7 +776,7 @@ eth_igb_start(struct rte_eth_dev *dev)
 {
 	struct e1000_hw *hw =
 		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	int ret, i, mask;
+	int ret, mask;
 	uint32_t ctrl_ext;
 
 	PMD_INIT_FUNC_TRACE();
@@ -795,6 +816,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	igb_pf_host_configure(dev);
 
+	/* configure msix for rx interrupt */
+	eth_igb_configure_msix_intr(dev);
+
 	/* Configure for OS presence */
 	igb_init_manageability(hw);
 
@@ -822,33 +846,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 		igb_vmdq_vlan_hw_filter_enable(dev);
 	}
 
-	/*
-	 * Configure the Interrupt Moderation register (EITR) with the maximum
-	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
-	 * spurious [DMA] memory updates of RX and TX ring descriptors.
-	 *
-	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
-	 * spurious memory updates per second should be expected.
-	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
-	 *
-	 * Because interrupts are not used at all, the MSI-X is not activated
-	 * and interrupt moderation is controlled by EITR[0].
-	 *
-	 * Note that having [almost] disabled memory updates of RX and TX ring
-	 * descriptors through the Interrupt Moderation mechanism, memory
-	 * updates of ring descriptors are now moderated by the configurable
-	 * value of Write-Back Threshold registers.
-	 */
 	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580) ||
 		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210) ||
 		(hw->mac.type == e1000_i211)) {
-		uint32_t ivar;
-
-		/* Enable all RX & TX queues in the IVAR registers */
-		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) | E1000_IVAR_VALID);
-		for (i = 0; i < 8; i++)
-			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
-
 		/* Configure EITR with the maximum possible value (0xFFFF) */
 		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
 	}
@@ -902,6 +902,10 @@ eth_igb_start(struct rte_eth_dev *dev)
 	if (dev->data->dev_conf.intr_conf.lsc != 0)
 		ret = eth_igb_lsc_interrupt_setup(dev);
 
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		eth_igb_rxq_interrupt_setup(dev);
+
 	/* resume enabled intr since hw reset */
 	igb_intr_enable(dev);
 
@@ -1828,6 +1832,34 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
 }
 
 /*
+ * It ORs one bit per rx queue (bits 0 .. max_rx_queues - 1) into EIMS.
+ * It is called from eth_igb_start() when intr_conf.rxq is non-zero.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - Zero. This function has no failure path;
+ *    the int return mirrors eth_igb_lsc_interrupt_setup().
+ */
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	uint32_t mask, regval;
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_eth_dev_info dev_info;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(dev, &dev_info);
+	/* shifting by 32 is undefined: zero queues must give an empty mask */
+	mask = dev_info.max_rx_queues == 0 ? 0 :
+		0xFFFFFFFF >> (32 - dev_info.max_rx_queues);
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
+	return 0;
+}
+
+/*
  * It reads ICR and gets interrupt causes, check it and set a bit flag
  * to update link status.
  *
@@ -3652,5 +3684,154 @@ static struct rte_driver pmd_igbvf_drv = {
 	.init = rte_igbvf_pmd_init,
 };
 
+static int
+eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t mask = 1 << queue_id;
+
+	E1000_WRITE_REG(hw, E1000_EIMC, mask);
+	E1000_WRITE_FLUSH(hw);
+
+	return 0;
+}
+
+static int
+eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t mask = 1 << queue_id;
+	uint32_t regval;
+
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
+	E1000_WRITE_FLUSH(hw);
+
+	return 0;
+}
+
+static void
+eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
+			uint8_t index, uint8_t offset)
+{
+	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
+
+	/* clear bits */
+	val &= ~((uint32_t)0xFF << offset);
+
+	/* write vector and valid bit */
+	val |= (msix_vector | E1000_IVAR_VALID) << offset;
+
+	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
+}
+
+static void
+eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				 uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp = 0;
+	if (hw->mac.type == e1000_82575) {
+		if (direction == 0)
+			tmp = E1000_EICR_RX_QUEUE0 << queue;
+		else if (direction == 1)
+			tmp = E1000_EICR_TX_QUEUE0 << queue;
+		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), tmp);
+	} else if (hw->mac.type == e1000_82576) {
+		if ((direction == 0) || (direction == 1))
+			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
+					((queue & 0x8) << 1) + 8 * direction);
+	} else if ((hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		if ((direction == 0) || (direction == 1))
+			eth_igb_write_ivar(hw, msix_vector,
+					queue >> 1,
+					((queue & 0x1) << 4) + 8 * direction);
+	}
+}
+
+/*
+ * Sets up the hardware to generate MSI-X interrupts properly
+ * @dev
+ *  Pointer to struct rte_eth_dev.
+ */
+static void
+eth_igb_configure_msix_intr(struct rte_eth_dev *dev)
+{
+	int queue_id;
+	uint32_t tmpval, regval, intr_mask;
+	uint32_t max_rx_queues;
+	struct rte_eth_dev_info dev_info;
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(dev, &dev_info);
+	max_rx_queues = dev_info.max_rx_queues;
+
+	/* set interrupt vector for other causes */
+	if (hw->mac.type == e1000_82575) {
+		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
+		/* enable MSI-X PBA support */
+		tmpval |= E1000_CTRL_EXT_PBA_CLR;
+
+		/* Auto-Mask interrupts upon ICR read */
+		tmpval |= E1000_CTRL_EXT_EIAME;
+		tmpval |= E1000_CTRL_EXT_IRCA;
+
+		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
+
+		/* enable msix_other interrupt */
+		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0, E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAM);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | E1000_EIMS_OTHER);
+	} else if ((hw->mac.type == e1000_82576) ||
+			(hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		/* turn on MSI-X capability first */
+		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
+					E1000_GPIE_PBA | E1000_GPIE_EIAME |
+					E1000_GPIE_NSICR);
+
+		/* enable msix_other interrupt */
+		intr_mask = 1 << max_rx_queues;
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+		regval = E1000_READ_REG(hw, E1000_EIMS);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
+		tmpval = (max_rx_queues | E1000_IVAR_VALID) << 8;
+
+		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
+	}
+
+	/*
+	* use EIAM and EIAC to auto-mask and auto-clear when MSI-X interrupt
+	* is asserted, this saves a register write for every interrupt
+	*/
+	intr_mask = 0xFFFFFFFF >> (32 - max_rx_queues);
+	regval = E1000_READ_REG(hw, E1000_EIAC);
+	E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+	regval = E1000_READ_REG(hw, E1000_EIAM);
+	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
+
+	for (queue_id = 0; queue_id < VFIO_MAX_QUEUE_ID; queue_id++) {
+		eth_igb_assign_msix_vector(hw, 0, queue_id, queue_id);
+		intr_handle->vec_num[queue_id] = queue_id;
+	}
+
+	E1000_WRITE_FLUSH(hw);
+}
+
+
 PMD_REGISTER_DRIVER(pmd_igb_drv);
 PMD_REGISTER_DRIVER(pmd_igbvf_drv);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v6 8/8] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
                     ` (6 preceding siblings ...)
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 7/8] igb: enable rx queue interrupts for PF Cunming Liang
@ 2015-02-27  4:56   ` Cunming Liang
  2015-02-28 22:57     ` Stephen Hemminger
  2015-02-28 23:00     ` Stephen Hemminger
  2015-02-27  8:00   ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Liu, Yong
                     ` (3 subsequent siblings)
  11 siblings, 2 replies; 242+ messages in thread
From: Cunming Liang @ 2015-02-27  4:56 UTC (permalink / raw)
  To: dev

From: "Zhou, Danny" <danny.zhou@intel.com>

Demonstrate how to handle per rx queue interrupts in a NAPI-like
implementation in userspace. The DPDK polling thread mainly works in
polling mode and switches to interrupt mode only if no packet has
been received in recent polls.
Userspace interrupt notification generally takes a lot more cycles
than in the kernel, so a one-shot interrupt is used here to guarantee
minimum overhead, and the DPDK polling thread returns to polling mode
immediately once it receives an interrupt notification for an incoming
packet.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v6 changes
 - Split event fd add and wait

v5 changes
 - Change invoked function name and parameter to accommodate EAL change

v3 changes
 - Add spinlock to ensure thread safe when accessing interrupt mask
   register

v2 changes
 - Remove unused function which is for debug purpose

 examples/l3fwd-power/main.c | 194 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 152 insertions(+), 42 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index f6b55b9..9a920f2 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -75,12 +75,14 @@
 #include <rte_string_fns.h>
 #include <rte_timer.h>
 #include <rte_power.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
 
 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1
 
 #define MAX_PKT_BURST 32
 
-#define MIN_ZERO_POLL_COUNT 5
+#define MIN_ZERO_POLL_COUNT 10
 
 /* around 100ms at 2 Ghz */
 #define TIMER_RESOLUTION_CYCLES           200000000ULL
@@ -156,6 +158,9 @@ static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
 /* ethernet addresses of ports */
 static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
 
+/* per-port spinlocks serializing rx interrupt enable/disable calls */
+static rte_spinlock_t locks[RTE_MAX_ETHPORTS];
+
 /* mask of enabled ports */
 static uint32_t enabled_port_mask = 0;
 /* Ports set in promiscuous mode off by default. */
@@ -188,6 +193,9 @@ struct lcore_rx_queue {
 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
 #define MAX_RX_QUEUE_PER_PORT 128
 
+#define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16
+
+
 #define MAX_LCORE_PARAMS 1024
 struct lcore_params {
 	uint8_t port_id;
@@ -214,7 +222,7 @@ static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) /
 
 static struct rte_eth_conf port_conf = {
 	.rxmode = {
-		.mq_mode	= ETH_MQ_RX_RSS,
+		.mq_mode = ETH_MQ_RX_RSS,
 		.max_rx_pkt_len = ETHER_MAX_LEN,
 		.split_hdr_size = 0,
 		.header_split   = 0, /**< Header Split disabled */
@@ -226,11 +234,14 @@ static struct rte_eth_conf port_conf = {
 	.rx_adv_conf = {
 		.rss_conf = {
 			.rss_key = NULL,
-			.rss_hf = ETH_RSS_IP,
+			.rss_hf = ETH_RSS_UDP,
 		},
 	},
 	.txmode = {
-		.mq_mode = ETH_DCB_NONE,
+		.mq_mode = ETH_MQ_TX_NONE,
+	},
+	.intr_conf = {
+		.rxq = 1, /**< rxq interrupt feature enabled */
 	},
 };
 
@@ -402,19 +413,22 @@ power_timer_cb(__attribute__((unused)) struct rte_timer *tim,
 	/* accumulate total execution time in us when callback is invoked */
 	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
 					(float)SCALING_PERIOD;
-
 	/**
 	 * check whether need to scale down frequency a step if it sleep a lot.
 	 */
-	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD)
-		rte_power_freq_down(lcore_id);
+	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 	else if ( (unsigned)(stats[lcore_id].nb_rx_processed /
-		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST)
+		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
 		/**
 		 * scale down a step if average packet per iteration less
 		 * than expectation.
 		 */
-		rte_power_freq_down(lcore_id);
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 
 	/**
 	 * initialize another timer according to current frequency to ensure
@@ -707,22 +721,20 @@ l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid,
 
 }
 
-#define SLEEP_GEAR1_THRESHOLD            100
-#define SLEEP_GEAR2_THRESHOLD            1000
+#define MINIMUM_SLEEP_TIME         1
+#define SUSPEND_THRESHOLD          300
 
 static inline uint32_t
 power_idle_heuristic(uint32_t zero_rx_packet_count)
 {
-	/* If zero count is less than 100, use it as the sleep time in us */
-	if (zero_rx_packet_count < SLEEP_GEAR1_THRESHOLD)
-		return zero_rx_packet_count;
-	/* If zero count is less than 1000, sleep time should be 100 us */
-	else if ((zero_rx_packet_count >= SLEEP_GEAR1_THRESHOLD) &&
-			(zero_rx_packet_count < SLEEP_GEAR2_THRESHOLD))
-		return SLEEP_GEAR1_THRESHOLD;
-	/* If zero count is greater than 1000, sleep time should be 1000 us */
-	else if (zero_rx_packet_count >= SLEEP_GEAR2_THRESHOLD)
-		return SLEEP_GEAR2_THRESHOLD;
+	/* If zero count is below SUSPEND_THRESHOLD, sleep for 1us */
+	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
+		return MINIMUM_SLEEP_TIME;
+	/* Otherwise return SUSPEND_THRESHOLD, which the main loop treats
+		as the hint to suspend until an rx interrupt arrives
+	*/
+	else
+		return SUSPEND_THRESHOLD;
 
 	return 0;
 }
@@ -762,6 +774,72 @@ power_freq_scaleup_heuristic(unsigned lcore_id,
 	return FREQ_CURRENT;
 }
 
+/**
+ * force polling thread sleep until one-shot rx interrupt triggers
+ * @param port_id
+ *  Port id.
+ * @param queue_id
+ *  Rx queue id.
+ * @return
+ *  0 on success
+ */
+static int
+sleep_until_rx_interrupt(uint8_t port_id, uint8_t queue_id)
+{
+	struct rte_eth_dev *eth_dev = &rte_eth_devices[port_id];
+	struct rte_intr_handle *intr_handle = &(eth_dev->pci_dev->intr_handle);
+	uint32_t vec;
+
+	/* Enable one-shot rx interrupt */
+	rte_spinlock_lock(&(locks[port_id]));
+	rte_eth_dev_rx_intr_enable(port_id, queue_id);
+	rte_spinlock_unlock(&(locks[port_id]));
+
+	RTE_LOG(INFO, L3FWD_POWER,
+		"lcore %u sleeps until interrupt on port%d,rxq%d triggers\n",
+		rte_lcore_id(), port_id, queue_id);
+
+	rte_intr_rx_wait(intr_handle, RTE_EPOLL_FD_ANY, &vec, 1);
+	RTE_LOG(INFO, L3FWD_POWER,
+		"lcore %u is waked up from rx interrupt on port%d,rxq%d\n",
+		rte_lcore_id(), port_id, queue_id);
+
+	/* Disable one-shot rx interrupt */
+	rte_spinlock_lock(&(locks[port_id]));
+	rte_eth_dev_rx_intr_disable(port_id, queue_id);
+	rte_spinlock_unlock(&(locks[port_id]));
+
+	return 0;
+}
+
+static int event_register(struct lcore_conf *qconf)
+{
+	struct rte_eth_dev *eth_dev;
+	struct rte_intr_handle *intr_handle;
+	struct lcore_rx_queue *rx_queue;
+	uint8_t portid, queueid;
+	uint32_t vec;
+	int ret;
+	int i;
+
+	for (i = 0; i < qconf->n_rx_queue; ++i) {
+		rx_queue = &(qconf->rx_queue_list[i]);
+		portid = rx_queue->port_id;
+		queueid = rx_queue->queue_id;
+
+		eth_dev = &rte_eth_devices[portid];
+		intr_handle = &(eth_dev->pci_dev->intr_handle);
+
+		rte_eth_dev_rx_intr_vec_get(portid, queueid, &vec);
+		ret = rte_intr_rx_set(intr_handle, RTE_EPOLL_FD_ANY,
+				      RTE_INTR_EVENT_ADD, vec);
+		if (ret < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
 /* main processing loop */
 static int
 main_loop(__attribute__((unused)) void *dummy)
@@ -775,9 +853,9 @@ main_loop(__attribute__((unused)) void *dummy)
 	struct lcore_conf *qconf;
 	struct lcore_rx_queue *rx_queue;
 	enum freq_scale_hint_t lcore_scaleup_hint;
-
 	uint32_t lcore_rx_idle_count = 0;
 	uint32_t lcore_idle_hint = 0;
+	int intr_en = 0;
 
 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
 
@@ -794,13 +872,18 @@ main_loop(__attribute__((unused)) void *dummy)
 	RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n", lcore_id);
 
 	for (i = 0; i < qconf->n_rx_queue; i++) {
-
 		portid = qconf->rx_queue_list[i].port_id;
 		queueid = qconf->rx_queue_list[i].queue_id;
 		RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%hhu "
 			"rxqueueid=%hhu\n", lcore_id, portid, queueid);
 	}
 
+	/* add into event wait list */
+	if (event_register(qconf) == 0)
+		intr_en = 1;
+	else
+		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n");
+
 	while (1) {
 		stats[lcore_id].nb_iteration_looped++;
 
@@ -835,6 +918,7 @@ main_loop(__attribute__((unused)) void *dummy)
 			prev_tsc_power = cur_tsc_power;
 		}
 
+start_rx:
 		/*
 		 * Read packet from RX queues
 		 */
@@ -848,6 +932,7 @@ main_loop(__attribute__((unused)) void *dummy)
 
 			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
 								MAX_PKT_BURST);
+
 			stats[lcore_id].nb_rx_processed += nb_rx;
 			if (unlikely(nb_rx == 0)) {
 				/**
@@ -910,10 +995,13 @@ main_loop(__attribute__((unused)) void *dummy)
 						rx_queue->freq_up_hint;
 			}
 
-			if (lcore_scaleup_hint == FREQ_HIGHEST)
-				rte_power_freq_max(lcore_id);
-			else if (lcore_scaleup_hint == FREQ_HIGHER)
-				rte_power_freq_up(lcore_id);
+			if (lcore_scaleup_hint == FREQ_HIGHEST) {
+				if (rte_power_freq_max)
+					rte_power_freq_max(lcore_id);
+			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
+				if (rte_power_freq_up)
+					rte_power_freq_up(lcore_id);
+			}
 		} else {
 			/**
 			 * All Rx queues empty in recent consecutive polls,
@@ -928,16 +1016,23 @@ main_loop(__attribute__((unused)) void *dummy)
 					lcore_idle_hint = rx_queue->idle_hint;
 			}
 
-			if ( lcore_idle_hint < SLEEP_GEAR1_THRESHOLD)
+			if (lcore_idle_hint < SUSPEND_THRESHOLD)
 				/**
-				 * execute "pause" instruction to avoid context
-				 * switch for short sleep.
- 				 */
+				* execute "pause" instruction to avoid context
+				* switch which generally takes hundreds of
+				* microseconds for short sleep.
+				*/
 				rte_delay_us(lcore_idle_hint);
-			else
-				/* long sleep force runing thread to suspend */
-				usleep(lcore_idle_hint);
-
+			else {
+				/* suspend until rx interrupt triggers */
+				if (intr_en)
+					sleep_until_rx_interrupt(
+						qconf->rx_queue_list[0].port_id,
+						qconf->rx_queue_list[0].queue_id
+						);
+				/* start receiving packets immediately */
+				goto start_rx;
+			}
 			stats[lcore_id].sleep_time += lcore_idle_hint;
 		}
 	}
@@ -1270,7 +1365,7 @@ setup_hash(int socketid)
 	char s[64];
 
 	/* create ipv4 hash */
-	snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
 	ipv4_l3fwd_hash_params.name = s;
 	ipv4_l3fwd_hash_params.socket_id = socketid;
 	ipv4_l3fwd_lookup_struct[socketid] =
@@ -1280,7 +1375,7 @@ setup_hash(int socketid)
 				"socket %d\n", socketid);
 
 	/* create ipv6 hash */
-	snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
 	ipv6_l3fwd_hash_params.name = s;
 	ipv6_l3fwd_hash_params.socket_id = socketid;
 	ipv6_l3fwd_lookup_struct[socketid] =
@@ -1476,6 +1571,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint64_t hz;
 	uint32_t n_tx_queue, nb_lcores;
+	uint32_t dev_rxq_num, dev_txq_num;
 	uint8_t portid, nb_rx_queue, queue, socketid;
 
 	/* catch SIGINT and restore cpufreq governor to ondemand */
@@ -1525,10 +1621,19 @@ main(int argc, char **argv)
 		printf("Initializing port %d ... ", portid );
 		fflush(stdout);
 
+		rte_eth_dev_info_get(portid, &dev_info);
+		dev_rxq_num = dev_info.max_rx_queues;
+		dev_txq_num = dev_info.max_tx_queues;
+
 		nb_rx_queue = get_port_n_rx_queues(portid);
+		if (nb_rx_queue > dev_rxq_num)
+			rte_exit(EXIT_FAILURE,
+				"Cannot configure not existed rxq: "
+				"port=%d\n", portid);
+
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
+		if (n_tx_queue > dev_txq_num)
+			n_tx_queue = dev_txq_num;
 		printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
 			nb_rx_queue, (unsigned)n_tx_queue );
 		ret = rte_eth_dev_configure(portid, nb_rx_queue,
@@ -1552,6 +1657,9 @@ main(int argc, char **argv)
 			if (rte_lcore_is_enabled(lcore_id) == 0)
 				continue;
 
+			if (queueid >= dev_txq_num)
+				continue;
+
 			if (numa_on)
 				socketid = \
 				(uint8_t)rte_lcore_to_socket_id(lcore_id);
@@ -1586,8 +1694,9 @@ main(int argc, char **argv)
 		/* init power management library */
 		ret = rte_power_init(lcore_id);
 		if (ret)
-			rte_exit(EXIT_FAILURE, "Power management library "
-				"initialization failed on core%u\n", lcore_id);
+			rte_log(RTE_LOG_ERR, RTE_LOGTYPE_POWER,
+				"Power management library initialization "
+				"failed on core%u", lcore_id);
 
 		/* init timer structures for each enabled lcore */
 		rte_timer_init(&power_timers[lcore_id]);
@@ -1635,7 +1744,6 @@ main(int argc, char **argv)
 		if (ret < 0)
 			rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, "
 						"port=%d\n", ret, portid);
-
 		/*
 		 * If enabled, put device in promiscuous mode.
 		 * This allows IO forwarding mode to forward packets
@@ -1644,6 +1752,8 @@ main(int argc, char **argv)
 		 */
 		if (promiscuous_on)
 			rte_eth_promiscuous_enable(portid);
+		/* initialize spinlock for each port */
+		rte_spinlock_init(&(locks[portid]));
 	}
 
 	check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
                     ` (7 preceding siblings ...)
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 8/8] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Cunming Liang
@ 2015-02-27  8:00   ` Liu, Yong
  2015-02-27 10:38   ` David Marchand
                     ` (2 subsequent siblings)
  11 siblings, 0 replies; 242+ messages in thread
From: Liu, Yong @ 2015-02-27  8:00 UTC (permalink / raw)
  To: Liang, Cunming, dev

Tested-by: Yong Liu <yong.liu@intel.com>

- Tested Commit: 00c685634b8a43e4594e26949a6c4f1cf5b67047
- OS: Fedora20 3.15.8-200.fc20.x86_64
- GCC: gcc version 4.8.3 20140911 (Red Hat 4.8.3-7) (GCC)
- CPU: Intel(R) Xeon(R) CPU E5-2680 v2 @ 2.80GHz
- NIC: Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection
- Default x86_64-native-linuxapp-gcc configuration
- Total 4 cases, 4 passed, 0 failed

- Case: interrupt pmd on PF with single queue
  Description: Check interrupt pmd work with single queue
  Command / instruction:
    Bind ports to vfio-pci.
      modprobe vfio
      modprobe vfio-pci
      ./tools/dpdk_nic_bind.py --bind=vfio-pci 0000:08:00.0 0000:08:00.1
    Start l3fwd-power with one queue per port.
      l3fwd-power -c 7 -n 4 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Send one packet to Port0 and Port1, check that thread on core1 and core2 
    waked up.
      L3FWD_POWER: lcore 1 is waked up from rx interrupt on port1,rxq0
      L3FWD_POWER: lcore 2 is waked up from rx interrupt on port1,rxq0
  Expected test result:
    l3fwd-power can forward packets normally and thread on core1 and core2 
    will sleep when there's no packet received.

- Case: interrupt pmd on PF with multi queue
  Description: Check interrupt pmd work with multiple queues
  Command / instruction:
    Start l3fwd-power with two queues per port.
      l3fwd-power -c 1f -n 4 -- -p 0x3 \
      --config="(0,0,1),(0,1,2)(1,0,3),(1,1,4)"
    Send packet with increased dest IP to Port0 and Port1, check that thread 
    on core1,core2,core3,core4 waked up.
      L3FWD_POWER: lcore 1 is waked up from rx interrupt on port1,rxq0
      L3FWD_POWER: lcore 2 is waked up from rx interrupt on port1,rxq1
      L3FWD_POWER: lcore 3 is waked up from rx interrupt on port1,rxq0
      L3FWD_POWER: lcore 4 is waked up from rx interrupt on port1,rxq1
  Expected test result:
    l3fwd-power can forward packets normally and thread on core1-core4 will 
	sleep when there's no packet received.

- Case: interrupt pmd on PF with max Rx queues
  Description: Check interrupt pmd work with maximum queues
  Command / instruction:
    Start l3fwd-power with 32 queues per port.
    l3fwd-power -c ffffffff -n 4 -- -p 0x3 -P --config="(0,0,0),(0,1,1),\
      (0,2,2),(0,3,3),(0,4,4),(0,5,5),(0,6,6),(0,7,7),(0,8,8),
      (0,9,9),(0,10,10),(0,11,11),(0,12,12),(0,13,13),(0,14,14),\
      (0,15,15),\
      (1,0,16),(1,1,17),(1,2,18),(1,3,19),(1,4,20),(1,5,21),(1,6,22),\
      (1,7,23),(1,8,24),(1,9,25),(1,10,26),(1,11,27),(1,12,28),\
      (1,13,29),(1,14,30),\(1,15,31)"
    Send packet with increased dest IP to Port0 and Port1, check that all 
    threads waked up.
  Expected test result:
    l3fwd-power can forward packets normally and thread on core1-core31
    will sleep when there's no packet received.
		
- Case: interrupt pmd on VF with single queue
  Description: Check interrupt pmd work on VF device
  Command / instruction:
    Bind ports back to the ixgbe driver.
      ./tools/dpdk_nic_bind.py --bind=ixgbe 0000:08:00.0 0000:08:00.1
    Create one VF per Port in host and make sure PF interface up
	  echo 1 > /sys/bus/pci/devices/0000\:08\:00.0/sriov_numvfs
      echo 1 > /sys/bus/pci/devices/0000\:08\:00.1/sriov_numvfs
      ifconfig p786p1 up
      ifconfig p786p2 up	  
    Bind VF device to vfio-pci.
      ./tools/dpdk_nic_bind.py --bind=vfio-pci 0000:08:10.0 0000:08:10.1
    Start l3fwd-power on host with one queue per port.
	  l3fwd-power -c 1f -n 4 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Send one packet to Port0 and Port1, check that thread on core1 and core2 
    waked up.
      L3FWD_POWER: lcore 1 is waked up from rx interrupt on port1,rxq0
      L3FWD_POWER: lcore 2 is waked up from rx interrupt on port1,rxq0
  Expected test result:
    l3fwd-power can forward packets normally on VF and thread on core1 and 
    core2 will sleep when there's no packet received.

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Cunming Liang
> Sent: Friday, February 27, 2015 12:56 PM
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD
> 
> v6 changes
>  - split rte_intr_wait_rx_pkt into two APIs 'wait' and 'set'.
>  - rewrite rte_intr_rx_wait/rte_intr_rx_set.
>  - using vector number instead of queue_id as interrupt API params.
>  - patch reorder and split.
> 
> v5 changes
>  - Rebase the patchset onto the HEAD
>  - Isolate ethdev from EAL for new-added wait-for-rx interrupt function
>  - Export wait-for-rx interrupt function for shared libraries
>  - Split-off a new patch file for changed struct rte_intr_handle that
>    other patches depend on, to avoid breaking git bisect
>  - Change sample application to accommodate EAL function spec change
>    accordingly
> 
> v4 changes
>  - Export interrupt enable/disable functions for shared libraries
>  - Adjust position of new-added structure fields and functions to
>    avoid breaking ABI
> 
> v3 changes
>  - Add return value for interrupt enable/disable functions
>  - Move spinlock from PMD to L3fwd-power
>  - Remove unnecessary variables in e1000_mac_info
>  - Fix miscellaneous review comments
> 
> v2 changes
>  - Fix compilation issue in Makefile for missed header file.
>  - Consolidate internal and community review comments of v1 patch set.
> 
> The patch series introduce low-latency one-shot rx interrupt into DPDK
> with
> polling and interrupt mode switch control example.
> 
> DPDK userspace interrupt notification and handling mechanism is based on
> UIO
> with below limitation:
> 1) It is designed to handle LSC interrupt only with inefficient suspended
>    pthread wakeup procedure (e.g. UIO wakes up LSC interrupt handling
> thread
>    which then wakes up DPDK polling thread). In this way, it introduces
>    non-deterministic wakeup latency for DPDK polling thread as well as
> packet
>    latency if it is used to handle Rx interrupt.
> 2) UIO only supports a single interrupt vector which has to be shared by
>    LSC interrupt and interrupts assigned to dedicated rx queues.
> 
> This patchset includes below features:
> 1) Enable one-shot rx queue interrupt in ixgbe PMD(PF & VF) and igb PMD(PF
> only).
> 2) Build on top of the VFIO mechanism instead of UIO, so it could support
>    up to 64 interrupt vectors for rx queue interrupts.
> 3) Have 1 DPDK polling thread handle per Rx queue interrupt with a
> dedicated
>    VFIO eventfd, which eliminates non-deterministic pthread wakeup latency
> in
>    user space.
> 4) Demonstrate interrupts control APIs and userspace NAPI-like
> polling/interrupt
>    switch algorithms in L3fwd-power example.
> 
> Known limitations:
> 1) It does not work for UIO due to a single interrupt eventfd shared by
> LSC
>    and rx queue interrupt handlers causes a mess.
> 2) LSC interrupt is not supported by VF driver, so it is by default
> disabled
>    in L3fwd-power now. Feel free to turn it on if you want to support both
> LSC
>    and rx queue interrupts on a PF.
> 
> Cunming Liang (5):
>   eal: declare new interrupt api
>   eal/linux: add rx queue interrupt FDs to intr handle struct
>   eal/bsd: dummy for new intr definition
>   eal/linux: add per rx queue interrupt handling based on VFIO
>   ethdev: add rx interrupt enable/disable functions
> 
> Zhou, Danny (3):
>   ixgbe: enable rx queue interrupts for both PF and VF
>   igb: enable rx queue interrupts for PF
>   l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode
>     switch
> 
>  examples/l3fwd-power/main.c                        | 194 ++++++++---
>  lib/librte_eal/bsdapp/eal/eal_interrupts.c         |  15 +
>  .../bsdapp/eal/include/exec-env/rte_interrupts.h   |   4 +
>  lib/librte_eal/bsdapp/eal/rte_eal_version.map      |   2 +
>  lib/librte_eal/common/include/rte_interrupts.h     |  38 +++
>  lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 224 +++++++++---
>  lib/librte_eal/linuxapp/eal/eal_pci_vfio.c         |  23 +-
>  .../linuxapp/eal/include/exec-env/rte_interrupts.h |   9 +
>  lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   2 +
>  lib/librte_ether/rte_ethdev.c                      |  66 ++++
>  lib/librte_ether/rte_ethdev.h                      |  77 +++++
>  lib/librte_ether/rte_ether_version.map             |   3 +
>  lib/librte_pmd_e1000/e1000_ethdev.h                |   3 +
>  lib/librte_pmd_e1000/igb_ethdev.c                  | 231 +++++++++++--
>  lib/librte_pmd_ixgbe/ixgbe_ethdev.c                | 377
> ++++++++++++++++++++-
>  lib/librte_pmd_ixgbe/ixgbe_ethdev.h                |   7 +
>  16 files changed, 1156 insertions(+), 119 deletions(-)
> 
> --
> 1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 3/8] eal/bsd: dummy for new intr definition
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 3/8] eal/bsd: dummy for new intr definition Cunming Liang
@ 2015-02-27  9:59     ` David Marchand
  2015-02-27 11:21       ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: David Marchand @ 2015-02-27  9:59 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

Hello,

On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang <cunming.liang@intel.com>
wrote:
>
> diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
> b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
> index 87a9cf6..b114aac 100644
> --- a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
> +++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
> @@ -38,6 +38,8 @@
>  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
>  #define _RTE_LINUXAPP_INTERRUPTS_H_
>
> +#define VFIO_MAX_RXTX_INTR_ID        32
> +
>  enum rte_intr_handle_type {
>         RTE_INTR_HANDLE_UNKNOWN = 0,
>         RTE_INTR_HANDLE_UIO,      /**< uio device handle */
> @@ -49,6 +51,8 @@ enum rte_intr_handle_type {
>  struct rte_intr_handle {
>         int fd;                          /**< file descriptor */
>         enum rte_intr_handle_type type;  /**< handle type */
> +       int max_intr;                    /**< max interrupt requested */
> +       uint32_t vec_num[VFIO_MAX_QUEUE_ID]; /**< rxtx intr vector number
> */
>  };
>

No need to add those since this is not supported for bsd.


-- 
David Marchand

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle struct
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle struct Cunming Liang
@ 2015-02-27 10:33     ` David Marchand
  2015-02-27 11:28       ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: David Marchand @ 2015-02-27 10:33 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

Hello,

On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang <cunming.liang@intel.com>
wrote:

> Per vector event fd will store in rte_intr_handle during init.
> Device drivers take responsibility to fill queue-vec mapping
> table(vec_num[]).
>
> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> ---
> v6 changes:
>  - add mapping table between irq vector number and queue id.
>


> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> index 6a159c7..9f45377 100644
> --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> @@ -38,6 +38,9 @@
>  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
>  #define _RTE_LINUXAPP_INTERRUPTS_H_
>
> +#define VFIO_MAX_RXTX_INTR_ID        32
> +#define VFIO_MAX_QUEUE_ID            VFIO_MAX_RXTX_INTR_ID
> +
>

This is a little weird to talk about vfio here.
This file is "generic".

Ok, you will store vfio eventfds here, but vfio is an implementation, not
the abstraction.


-- 
David Marchand

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 4/8] eal/linux: add per rx queue interrupt handling based on VFIO
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 4/8] eal/linux: add per rx queue interrupt handling based on VFIO Cunming Liang
@ 2015-02-27 10:33     ` David Marchand
  2015-02-27 12:22       ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: David Marchand @ 2015-02-27 10:33 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

I am not really comfortable with this api.

This is just creating something on top of the standard epoll api with
limitations.
In the end, we could just use an external lib that does this already.

So ok, this will work for your limited use case, but this will not be
really useful for anything else.
Not sure it has its place in eal, this is more an example to me.


On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang <cunming.liang@intel.com>
wrote:

> This patch does below:
>  - Create multiple VFIO eventfd for rx queues.
>  - Handle per rx queue interrupt.
>  - Eliminate unnecessary suspended DPDK polling thread wakeup mechanism
>    for rx interrupt by allowing polling thread epoll_wait rx queue
>    interrupt notification.
>
> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> ---
> v6 changes
>  - split rte_intr_wait_rx_pkt into two function, wait and set.
>  - rewrite rte_intr_rx_wait/rte_intr_rx_set to remove queue visibility on
> eal.
>  - rte_intr_rx_wait to support multiplexing.
>  - allow epfd as input to support flexible event fd combination.
>
>
>  lib/librte_eal/linuxapp/eal/eal_interrupts.c    | 224
> +++++++++++++++++++-----
>  lib/librte_eal/linuxapp/eal/eal_pci_vfio.c      |  23 ++-
>  lib/librte_eal/linuxapp/eal/rte_eal_version.map |   2 +
>  3 files changed, 201 insertions(+), 48 deletions(-)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> index 8c5b834..f90c2b4 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
>
>
[snip]


>
> +static void
> +eal_intr_process_rxtx_interrupts(struct rte_intr_handle *intr_handle,
> +                                struct epoll_event *events,
> +                                uint32_t *vec, int nfds)
> +{
> +       int i, bytes_read;
> +       union rte_intr_read_buffer buf;
> +       int fd;
> +
> +       for (i = 0; i < nfds; i++) {
> +               /* set the length to be read for different handle type */
> +               switch (intr_handle->type) {
> +               case RTE_INTR_HANDLE_UIO:
> +                       bytes_read = sizeof(buf.uio_intr_count);
> +                       break;
> +               case RTE_INTR_HANDLE_ALARM:
> +                       bytes_read = sizeof(buf.timerfd_num);
> +                       break;
> +#ifdef VFIO_PRESENT
> +               case RTE_INTR_HANDLE_VFIO_MSIX:
> +               case RTE_INTR_HANDLE_VFIO_MSI:
> +               case RTE_INTR_HANDLE_VFIO_LEGACY:
> +                       bytes_read = sizeof(buf.vfio_intr_count);
> +                       break;
> +#endif
> +               default:
> +                       bytes_read = 1;
> +                       break;
> +               }
> +
> +               /**
> +               * read out to clear the ready-to-be-read flag
> +               * for epoll_wait.
> +               */
> +               vec[i] = events[i].data.u32;
> +               assert(vec[i] < VFIO_MAX_RXTX_INTR_ID);
> +
> +               fd = intr_handle->efds[vec[i]];
> +               bytes_read = read(fd, &buf, bytes_read);
> +               if (bytes_read < 0)
> +                       RTE_LOG(ERR, EAL, "Error reading from file "
> +                               "descriptor %d: %s\n", fd,
> strerror(errno));
> +               else if (bytes_read == 0)
> +                       RTE_LOG(ERR, EAL, "Read nothing from file "
> +                               "descriptor %d\n", fd);
> +       }
> +}
>

Why unconditionally read?
You are absorbing events from the application if the application gave you
an external epfd and populated it with its own fds.


> +
> +static int init_tls_epfd(void)
> +{
> +       int pfd = epoll_create(1);
> +       if (pfd < 0) {
> +               RTE_LOG(ERR, EAL,
> +                       "Cannot create epoll instance\n");
> +               return -1;
> +       }
> +       return pfd;
> +}
> +
> +int
> +rte_intr_rx_wait(struct rte_intr_handle *intr_handle, int epfd,
> +                uint32_t *vec, uint16_t num)
> +{
>

In the end, this "rx" does not mean anything to eal.


+#define MAX_EVENTS      8
> +       struct epoll_event events[MAX_EVENTS];
> +       int ret, nfds = 0;
> +
> +       if (!intr_handle || !vec) {
> +               RTE_LOG(ERR, EAL, "invalid input parameter\n");
> +               return -1;
> +       }
> +
> +       if (intr_handle->type != RTE_INTR_HANDLE_VFIO_MSIX) {
> +               RTE_LOG(ERR, EAL, "intr type should be VFIO_MSIX\n");
> +               return -1;
> +       }
> +
> +       if (epfd == RTE_EPOLL_FD_ANY) {
> +               /* using per thread epoll fd */
> +               if (unlikely(RTE_PER_LCORE(_epfd) == -1))
> +                       RTE_PER_LCORE(_epfd) = init_tls_epfd();
> +               epfd = RTE_PER_LCORE(_epfd);
> +       }
>

Rather than testing every time, this should be set by the caller, i.e. epfd
is always valid.
If the application does not want to create an epfd, then it calls
rte_intr_rx_wait with RTE_EPOLL_FD_ANY (this name is not well chosen), which
is a macro wrapping RTE_PER_LCORE(_epfd).

init_tls_epfd() should be called only once at init time.
No need to check every time.

+
> +       do {
> +               ret = epoll_wait(epfd, events,
> +                                RTE_MIN(num, MAX_EVENTS),
> +                                EAL_INTR_EPOLL_WAIT_FOREVER);
> +               if (unlikely(ret < 0)) {
> +                       /* epoll_wait fail */
> +                       RTE_LOG(ERR, EAL, "epoll_wait returns with
> fail\n");
> +                       return -1;
> +               } else if (ret > 0) {
> +                       /* epoll_wait has at least one fd ready to read */
> +                       eal_intr_process_rxtx_interrupts(intr_handle,
> events,
> +                                                        vec, ret);
> +                       num -= ret;
> +                       vec += ret;
> +                       nfds += ret;
> +               } else if (nfds > 0)
> +                       break;
> +       } while (num > 0);
> +
> +       return nfds;
> +}
>

You are blocking unless all fds have been set, so you are serialising all
events.

+
> +int
> +rte_intr_rx_set(struct rte_intr_handle *intr_handle, int epfd,
> +               int op, uint32_t vec)
> +{
> +       struct epoll_event ev;
> +
> +       if (!intr_handle || vec >= VFIO_MAX_RXTX_INTR_ID) {
> +               RTE_LOG(ERR, EAL, "invalid input parameter\n");
> +               return -1;
> +       }
> +
> +       if (intr_handle->type != RTE_INTR_HANDLE_VFIO_MSIX) {
> +               RTE_LOG(ERR, EAL, "intr type should be VFIO_MSIX\n");
> +               return -1;
> +       }
> +
> +       switch (op) {
> +       case RTE_INTR_EVENT_ADD:
> +               op = EPOLL_CTL_ADD;
> +               break;
> +       case RTE_INTR_EVENT_DEL:
> +               op = EPOLL_CTL_DEL;
> +               break;
> +       default:
> +               RTE_LOG(ERR, EAL, "event op type mismatch\n");
> +               return -1;
> +       }
> +
> +       if (epfd == RTE_EPOLL_FD_ANY) {
> +               /* using per thread epoll fd */
> +               if (RTE_PER_LCORE(_epfd) == -1)
> +                       RTE_PER_LCORE(_epfd) = init_tls_epfd();
> +               epfd = RTE_PER_LCORE(_epfd);
> +       }
> +
> +       ev.data.u32 = vec;
> +       ev.events = EPOLLIN | EPOLLPRI;
> +       if (epoll_ctl(epfd, op, intr_handle->efds[vec], &ev) < 0) {
> +               RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
> +                       op, intr_handle->efds[vec], strerror(errno));
> +               return -1;
> +       }
> +
> +       return 0;
> +}
>



> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> index ee9660f..d90d23c 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> @@ -38,6 +38,7 @@
>  #include <sys/socket.h>
>  #include <sys/ioctl.h>
>  #include <sys/mman.h>
> +#include <sys/epoll.h>
>
>  #include <rte_log.h>
>  #include <rte_pci.h>
> @@ -274,16 +275,18 @@ pci_vfio_setup_interrupts(struct rte_pci_device
> *dev, int vfio_dev_fd)
>                 ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
>                 if (ret < 0) {
>                         RTE_LOG(ERR, EAL, "  cannot get IRQ info, "
> -                                       "error %i (%s)\n", errno,
> strerror(errno));
> +                               "error %i (%s)\n", errno, strerror(errno));
>                         return -1;
>                 }
>

Garbage, this has nothing to do with the patch.


>
>                 /* if this vector cannot be used with eventfd, fail if we
> explicitly
>                  * specified interrupt type, otherwise continue */
>                 if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
> -                       if (internal_config.vfio_intr_mode !=
> RTE_INTR_MODE_NONE) {
> +                       if (internal_config.vfio_intr_mode !=
> +                           RTE_INTR_MODE_NONE) {
>                                 RTE_LOG(ERR, EAL,
> -                                               "  interrupt vector does
> not support eventfd!\n");
> +                                       "  interrupt vector "
> +                                       "does not support eventfd!\n");
>                                 return -1;
>                         } else
>                                 continue;
>

Idem.



> @@ -293,17 +296,27 @@ pci_vfio_setup_interrupts(struct rte_pci_device
> *dev, int vfio_dev_fd)
>                 fd = eventfd(0, 0);
>                 if (fd < 0) {
>                         RTE_LOG(ERR, EAL, "  cannot set up eventfd, "
> -                                       "error %i (%s)\n", errno,
> strerror(errno));
> +                               "error %i (%s)\n", errno, strerror(errno));
>

Idem.


>                         return -1;
>                 }
>
>                 dev->intr_handle.fd = fd;
>                 dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
> -
>

Idem.


>                 switch (i) {
>                 case VFIO_PCI_MSIX_IRQ_INDEX:
>                         internal_config.vfio_intr_mode =
> RTE_INTR_MODE_MSIX;
>                         dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
> +                       for (i = 0; i < VFIO_MAX_RXTX_INTR_ID; i++) {
> +                               fd = eventfd(0, 0);
> +                               if (fd < 0) {
> +                                       RTE_LOG(ERR, EAL,
> +                                               "cannot setup eventfd,"
> +                                               "error %i (%s)\n",
> +                                               errno, strerror(errno));
> +                                       return -1;
> +                               }
> +                               dev->intr_handle.efds[i] = fd;
> +                       }
>                         break;
>                 case VFIO_PCI_MSI_IRQ_INDEX:
>                         internal_config.vfio_intr_mode = RTE_INTR_MODE_MSI;
> diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> index 5f1857d..892a452 100644
> --- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> +++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> @@ -64,6 +64,8 @@ DPDK_2.0 {
>         rte_intr_callback_unregister;
>         rte_intr_disable;
>         rte_intr_enable;
> +       rte_intr_rx_set;
> +       rte_intr_rx_wait;
>         rte_log;
>         rte_log_add_in_history;
>         rte_log_cur_msg_loglevel;
> --
> 1.8.1.4
>




-- 
David Marchand

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
                     ` (8 preceding siblings ...)
  2015-02-27  8:00   ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Liu, Yong
@ 2015-02-27 10:38   ` David Marchand
  2015-02-28 22:38     ` Stephen Hemminger
  2015-03-04  0:52     ` Stephen Hemminger
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
  2015-05-05  5:53   ` [dpdk-dev] [PATCH v7 00/10] " Cunming Liang
  11 siblings, 2 replies; 242+ messages in thread
From: David Marchand @ 2015-02-27 10:38 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang <cunming.liang@intel.com>
wrote:

> v6 changes
>  - split rte_intr_wait_rx_pkt into two APIs 'wait' and 'set'.
>  - rewrite rte_intr_rx_wait/rte_intr_rx_set.
>  - using vector number instead of queue_id as interrupt API params.
>  - patch reorder and split.
>
>
Ok, so after looking at this patchset, I would say this is the right
direction, but still this is too limited.
The ethdev part and the vfio eventfds part look acceptable to me.
But thinking about it, I could just reuse a standard event library with the
eventfds I would get from ethdev without a need for a new eal api.


-- 
David Marchand

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 3/8] eal/bsd: dummy for new intr definition
  2015-02-27  9:59     ` David Marchand
@ 2015-02-27 11:21       ` Liang, Cunming
  2015-02-27 14:22         ` Thomas Monjalon
  0 siblings, 1 reply; 242+ messages in thread
From: Liang, Cunming @ 2015-02-27 11:21 UTC (permalink / raw)
  To: David Marchand; +Cc: dev



From: David Marchand [mailto:david.marchand@6wind.com]
Sent: Friday, February 27, 2015 6:00 PM
To: Liang, Cunming
Cc: dev@dpdk.org; Stephen Hemminger; Thomas Monjalon
Subject: Re: [PATCH v6 3/8] eal/bsd: dummy for new intr definition

Hello,

On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang <cunming.liang@intel.com<mailto:cunming.liang@intel.com>> wrote:
diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
index 87a9cf6..b114aac 100644
--- a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,8 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_

+#define VFIO_MAX_RXTX_INTR_ID        32
+
 enum rte_intr_handle_type {
        RTE_INTR_HANDLE_UNKNOWN = 0,
        RTE_INTR_HANDLE_UIO,      /**< uio device handle */
@@ -49,6 +51,8 @@ enum rte_intr_handle_type {
 struct rte_intr_handle {
        int fd;                          /**< file descriptor */
        enum rte_intr_handle_type type;  /**< handle type */
+       int max_intr;                    /**< max interrupt requested */
+       uint32_t vec_num[VFIO_MAX_QUEUE_ID]; /**< rxtx intr vector number */
 };

No need to add those since this is not supported for bsd.
[Liang, Cunming] max_intr is used in dev_init for the pci_dev->intr_handle init.
vec_num is used in the ethdev API rx_intr_vec_get. Without them, a BSD macro would have to be used at each place that references them.
As they’re quite generic, even bsd will require either max_intr or a vec mapping table.



--
David Marchand

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle struct
  2015-02-27 10:33     ` David Marchand
@ 2015-02-27 11:28       ` Liang, Cunming
  2015-02-27 14:42         ` Thomas Monjalon
  2015-02-27 14:52         ` Thomas Monjalon
  0 siblings, 2 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-02-27 11:28 UTC (permalink / raw)
  To: David Marchand; +Cc: dev



From: David Marchand [mailto:david.marchand@6wind.com]
Sent: Friday, February 27, 2015 6:33 PM
To: Liang, Cunming
Cc: dev@dpdk.org; Stephen Hemminger; Thomas Monjalon; Zhou, Danny
Subject: Re: [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle struct

Hello,

On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang <cunming.liang@intel.com<mailto:cunming.liang@intel.com>> wrote:
Per vector event fd will store in rte_intr_handle during init.
Device drivers take responsibility to fill queue-vec mapping table(vec_num[]).

Signed-off-by: Danny Zhou <danny.zhou@intel.com<mailto:danny.zhou@intel.com>>
Signed-off-by: Cunming Liang <cunming.liang@intel.com<mailto:cunming.liang@intel.com>>
---
v6 changes:
 - add mapping table between irq vector number and queue id.

diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 6a159c7..9f45377 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,9 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_

+#define VFIO_MAX_RXTX_INTR_ID        32
+#define VFIO_MAX_QUEUE_ID            VFIO_MAX_RXTX_INTR_ID
+

This is a little weird to talk about vfio here.
This file is "generic".

Ok, you will store vfio eventfds here, but vfio is an implementation, not the abstraction.
[Liang, Cunming] Looking at rte_intr_handle_type, it includes UIO/VFIO_LEGACY/VFIO_MSI/VFIO_MSIX.
I agree that VFIO is an implementation, but the combination of different types is a kind of ‘abstraction’.
So in rte_intr_handle (which acts like a multiplexer), I feel it’s reasonable to have some fields specific to vfio interrupt mapping.


--
David Marchand

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 4/8] eal/linux: add per rx queue interrupt handling based on VFIO
  2015-02-27 10:33     ` David Marchand
@ 2015-02-27 12:22       ` Liang, Cunming
  2015-02-27 14:13         ` Thomas Monjalon
  0 siblings, 1 reply; 242+ messages in thread
From: Liang, Cunming @ 2015-02-27 12:22 UTC (permalink / raw)
  To: David Marchand; +Cc: dev

Hi,

From: David Marchand [mailto:david.marchand@6wind.com]
Sent: Friday, February 27, 2015 6:34 PM
To: Liang, Cunming
Cc: dev@dpdk.org; Stephen Hemminger; Thomas Monjalon; Zhou, Danny
Subject: Re: [PATCH v6 4/8] eal/linux: add per rx queue interrupt handling based on VFIO

I am not really comfortable with this api.

This is just creating something on top of the standard epoll api with limitations.
In the end, we could just use an external lib that does this already.
[Liang, Cunming] Not really, I think. We try to protect the data inside ‘rte_intr_handle’; the user is not expected to understand the things defined inside ‘rte_intr_handle’.
It would be better to typedef ‘rte_intr_handle’ as a raw integer ID, with a function to get it from an ethdev. Then all of the interrupt api is built around it.
It provides a common rxtx interrupt processing approach for pci NIC devices. As for the limitations, we can fix them step by step.

So ok, this will work for your limited use case, but this will not be really useful for anything else.
Not sure it has its place in eal, this is more an example to me.
[Liang, Cunming] By ‘limited use case’, do you mean only for rxtx? It isn’t expected to provide a generic event mechanism (like libev/libevent does), but a simple way to allow a PMD to work with DMA interrupts. It is mainly an abstraction for rx interrupt purposes. I would appreciate it if you could help list more useful cases.


On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang <cunming.liang@intel.com<mailto:cunming.liang@intel.com>> wrote:
This patch does below:
 - Create multiple VFIO eventfd for rx queues.
 - Handle per rx queue interrupt.
 - Eliminate unnecessary suspended DPDK polling thread wakeup mechanism
   for rx interrupt by allowing polling thread epoll_wait rx queue
   interrupt notification.

Signed-off-by: Danny Zhou <danny.zhou@intel.com<mailto:danny.zhou@intel.com>>
Signed-off-by: Cunming Liang <cunming.liang@intel.com<mailto:cunming.liang@intel.com>>
---
v6 changes
 - split rte_intr_wait_rx_pkt into two function, wait and set.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set to remove queue visibility on eal.
 - rte_intr_rx_wait to support multiplexing.
 - allow epfd as input to support flexible event fd combination.


 lib/librte_eal/linuxapp/eal/eal_interrupts.c    | 224 +++++++++++++++++++-----
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c      |  23 ++-
 lib/librte_eal/linuxapp/eal/rte_eal_version.map |   2 +
 3 files changed, 201 insertions(+), 48 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 8c5b834..f90c2b4 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c

[snip]


+static void
+eal_intr_process_rxtx_interrupts(struct rte_intr_handle *intr_handle,
+                                struct epoll_event *events,
+                                uint32_t *vec, int nfds)
+{
+       int i, bytes_read;
+       union rte_intr_read_buffer buf;
+       int fd;
+
+       for (i = 0; i < nfds; i++) {
+               /* set the length to be read for different handle type */
+               switch (intr_handle->type) {
+               case RTE_INTR_HANDLE_UIO:
+                       bytes_read = sizeof(buf.uio_intr_count);
+                       break;
+               case RTE_INTR_HANDLE_ALARM:
+                       bytes_read = sizeof(buf.timerfd_num);
+                       break;
+#ifdef VFIO_PRESENT
+               case RTE_INTR_HANDLE_VFIO_MSIX:
+               case RTE_INTR_HANDLE_VFIO_MSI:
+               case RTE_INTR_HANDLE_VFIO_LEGACY:
+                       bytes_read = sizeof(buf.vfio_intr_count);
+                       break;
+#endif
+               default:
+                       bytes_read = 1;
+                       break;
+               }
+
+               /**
+               * read out to clear the ready-to-be-read flag
+               * for epoll_wait.
+               */
+               vec[i] = events[i].data.u32;
+               assert(vec[i] < VFIO_MAX_RXTX_INTR_ID);
+
+               fd = intr_handle->efds[vec[i]];
+               bytes_read = read(fd, &buf, bytes_read);
+               if (bytes_read < 0)
+                       RTE_LOG(ERR, EAL, "Error reading from file "
+                               "descriptor %d: %s\n", fd, strerror(errno));
+               else if (bytes_read == 0)
+                       RTE_LOG(ERR, EAL, "Read nothing from file "
+                               "descriptor %d\n", fd);
+       }
+}

Why unconditionally read?
You are absorbing events from the application if the application gave you an external epfd and populated it with its own fds.
[Liang, Cunming] The vector number is checked. If an external epfd was populated with an event that carries an fd rather than a data.u32, but the value is inside the valid range, it is considered a valid vector number. Whether the read succeeds or not, the event is always notified. Do you have any suggestion for a condition to check here?

+
+static int init_tls_epfd(void)
+{
+       int pfd = epoll_create(1);
+       if (pfd < 0) {
+               RTE_LOG(ERR, EAL,
+                       "Cannot create epoll instance\n");
+               return -1;
+       }
+       return pfd;
+}
+
+int
+rte_intr_rx_wait(struct rte_intr_handle *intr_handle, int epfd,
+                uint32_t *vec, uint16_t num)
+{

In the end, this "rx" does not mean anything to eal.
[Liang, Cunming] That’s a good point. I tried to remove ‘rx’ and use a generic word here.
‘rte_intr_wait’ looks too generic, and ‘rte_intr_epfd_wait’ does not look abstract enough for bsd.
As the function only serves rxtx vectors, I used the rx prefix. Which name do you prefer?


+#define MAX_EVENTS      8
+       struct epoll_event events[MAX_EVENTS];
+       int ret, nfds = 0;
+
+       if (!intr_handle || !vec) {
+               RTE_LOG(ERR, EAL, "invalid input parameter\n");
+               return -1;
+       }
+
+       if (intr_handle->type != RTE_INTR_HANDLE_VFIO_MSIX) {
+               RTE_LOG(ERR, EAL, "intr type should be VFIO_MSIX\n");
+               return -1;
+       }
+
+       if (epfd == RTE_EPOLL_FD_ANY) {
+               /* using per thread epoll fd */
+               if (unlikely(RTE_PER_LCORE(_epfd) == -1))
+                       RTE_PER_LCORE(_epfd) = init_tls_epfd();
+               epfd = RTE_PER_LCORE(_epfd);
+       }

Rather than testing every time, this should be set by the caller, i.e. epfd is always valid.
If application does not want to create a epfd, then it calls  rte_intr_rx_wait with RTE_EPOLL_FD_ANY (this name is not well chosen) that is a macro wrapped to RTE_PER_LCORE(_epfd).
[Liang, Cunming] It sounds good to me. As we don’t expect to expose *rte_per_lcore__epfd* as a public symbol, I will define rte_epfd() instead.
Within rte_epfd(), if RTE_PER_LCORE(_epfd) is not assigned, then init_tls_epfd() is called once.

init_tls_epfd() should be called only once at init time.
No need to check every time.
[Liang, Cunming] The per-thread epfd may not be needed at all, so I prefer to create it only when it is really needed, as I mentioned above.

+
+       do {
+               ret = epoll_wait(epfd, events,
+                                RTE_MIN(num, MAX_EVENTS),
+                                EAL_INTR_EPOLL_WAIT_FOREVER);
+               if (unlikely(ret < 0)) {
+                       /* epoll_wait fail */
+                       RTE_LOG(ERR, EAL, "epoll_wait returns with fail\n");
+                       return -1;
+               } else if (ret > 0) {
+                       /* epoll_wait has at least one fd ready to read */
+                       eal_intr_process_rxtx_interrupts(intr_handle, events,
+                                                        vec, ret);
+                       num -= ret;
+                       vec += ret;
+                       nfds += ret;
+               } else if (nfds > 0)
+                       break;
+       } while (num > 0);
+
+       return nfds;
+}

You are blocking unless all fds have been set, so you are serialising all events.
[Liang, Cunming] I’m not sure I fully got your point. If any event arrives, it returns. Do you mean that if no fds are added, it always blocks?
Do you expect it to return on a timeout?

+
+int
+rte_intr_rx_set(struct rte_intr_handle *intr_handle, int epfd,
+               int op, uint32_t vec)
+{
+       struct epoll_event ev;
+
+       if (!intr_handle || vec >= VFIO_MAX_RXTX_INTR_ID) {
+               RTE_LOG(ERR, EAL, "invalid input parameter\n");
+               return -1;
+       }
+
+       if (intr_handle->type != RTE_INTR_HANDLE_VFIO_MSIX) {
+               RTE_LOG(ERR, EAL, "intr type should be VFIO_MSIX\n");
+               return -1;
+       }
+
+       switch (op) {
+       case RTE_INTR_EVENT_ADD:
+               op = EPOLL_CTL_ADD;
+               break;
+       case RTE_INTR_EVENT_DEL:
+               op = EPOLL_CTL_DEL;
+               break;
+       default:
+               RTE_LOG(ERR, EAL, "event op type mismatch\n");
+               return -1;
+       }
+
+       if (epfd == RTE_EPOLL_FD_ANY) {
+               /* using per thread epoll fd */
+               if (RTE_PER_LCORE(_epfd) == -1)
+                       RTE_PER_LCORE(_epfd) = init_tls_epfd();
+               epfd = RTE_PER_LCORE(_epfd);
+       }
+
+       ev.data.u32 = vec;
+       ev.events = EPOLLIN | EPOLLPRI;
+       if (epoll_ctl(epfd, op, intr_handle->efds[vec], &ev) < 0) {
+               RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
+                       op, intr_handle->efds[vec], strerror(errno));
+               return -1;
+       }
+
+       return 0;
+}


diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index ee9660f..d90d23c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -38,6 +38,7 @@
 #include <sys/socket.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/epoll.h>

 #include <rte_log.h>
 #include <rte_pci.h>
@@ -274,16 +275,18 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
                ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
                if (ret < 0) {
                        RTE_LOG(ERR, EAL, "  cannot get IRQ info, "
-                                       "error %i (%s)\n", errno, strerror(errno));
+                               "error %i (%s)\n", errno, strerror(errno));
                        return -1;
                }

Garbage, this has nothing to do with the patch.
[Liang, Cunming] It’s to fix the complaint about lines exceeding the 80-column margin.


                /* if this vector cannot be used with eventfd, fail if we explicitly
                 * specified interrupt type, otherwise continue */
                if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
-                       if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE) {
+                       if (internal_config.vfio_intr_mode !=
+                           RTE_INTR_MODE_NONE) {
                                RTE_LOG(ERR, EAL,
-                                               "  interrupt vector does not support eventfd!\n");
+                                       "  interrupt vector "
+                                       "does not support eventfd!\n");
                                return -1;
                        } else
                                continue;

Idem.
[Liang, Cunming] The same.


@@ -293,17 +296,27 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
                fd = eventfd(0, 0);
                if (fd < 0) {
                        RTE_LOG(ERR, EAL, "  cannot set up eventfd, "
-                                       "error %i (%s)\n", errno, strerror(errno));
+                               "error %i (%s)\n", errno, strerror(errno));

Idem.
[Liang, Cunming] The same.


                        return -1;
                }

                dev->intr_handle.fd = fd;
                dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
-

Idem.
[Liang, Cunming] Accept.

                switch (i) {
                case VFIO_PCI_MSIX_IRQ_INDEX:
                        internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
                        dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
+                       for (i = 0; i < VFIO_MAX_RXTX_INTR_ID; i++) {
+                               fd = eventfd(0, 0);
+                               if (fd < 0) {
+                                       RTE_LOG(ERR, EAL,
+                                               "cannot setup eventfd,"
+                                               "error %i (%s)\n",
+                                               errno, strerror(errno));
+                                       return -1;
+                               }
+                               dev->intr_handle.efds[i] = fd;
+                       }
                        break;
                case VFIO_PCI_MSI_IRQ_INDEX:
                        internal_config.vfio_intr_mode = RTE_INTR_MODE_MSI;
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 5f1857d..892a452 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -64,6 +64,8 @@ DPDK_2.0 {
        rte_intr_callback_unregister;
        rte_intr_disable;
        rte_intr_enable;
+       rte_intr_rx_set;
+       rte_intr_rx_wait;
        rte_log;
        rte_log_add_in_history;
        rte_log_cur_msg_loglevel;
--
1.8.1.4




--
David Marchand


^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 4/8] eal/linux: add per rx queue interrupt handling based on VFIO
  2015-02-27 12:22       ` Liang, Cunming
@ 2015-02-27 14:13         ` Thomas Monjalon
  2015-02-28  1:45           ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: Thomas Monjalon @ 2015-02-27 14:13 UTC (permalink / raw)
  To: Liang, Cunming; +Cc: dev

Hi Cunming,

First, sorry to have to say that, but it is not easy to read discussions
where quote marks are not used. I re-insert them for clarity.

Comments below.

2015-02-27 12:22, Liang, Cunming:
> From: David Marchand [mailto:david.marchand@6wind.com]
> Sent: Friday, February 27, 2015 6:34 PM
> 
> > I am not really comfortable with this api.
> > 
> > This is just creating something on top of the standard epoll api with
> > limitations. In the end, we could just use an external lib that does this
> > already.
> 
> [Liang, Cunming] Not really, I think. We try to protect the data inside
> ‘rte_intr_handle’, it doesn’t expect user to understand the things defined
> inside ‘rte_intr_handle’.
> It’s better typedef ‘rte_intr_handle’ as a raw integer ID, having a function
> to get it from a ethdev. Then all the interrupt api is around it.
> It provides the common pci NIC devices rxtx interrupt processing approach.
> For the limitations, we can fix it step by step.
> 
> > So ok, this will work for your limited use case, but this will not be
> > really useful for anything else.
> > Not sure it has its place in eal, this is more an example to me.
> 
> [Liang, Cunming] ‘limited use case’ do you means only for rxtx ?
> It don’t expect to provide a generic event mechanism (like libev/libevent
> does), but a simple way to allow PMD work with DMA interrupt. It mainly
> abstract for rx interrupt purpose. I appreciate if you could help to list
> more useful cases.

You don't expect to provide a generic event mechanism, but application
developers might need to wait for many events at once, not only Rx ones.
That's why it's better to provide only the needed parts to use something
generic like libevent.
And we should avoid reinventing the wheel.

> > > +static void
> > > +eal_intr_process_rxtx_interrupts(struct rte_intr_handle *intr_handle,
> > > +                                struct epoll_event *events,
> > > +                                uint32_t *vec, int nfds)
> > > +{
> > > +       int i, bytes_read;
> > > +       union rte_intr_read_buffer buf;
> > > +       int fd;
> > > +
> > > +       for (i = 0; i < nfds; i++) {
> > > +               /* set the length to be read for different handle type */
> > > +               switch (intr_handle->type) {
> > > +               case RTE_INTR_HANDLE_UIO:
> > > +                       bytes_read = sizeof(buf.uio_intr_count);
> > > +                       break;
> > > +               case RTE_INTR_HANDLE_ALARM:
> > > +                       bytes_read = sizeof(buf.timerfd_num);
> > > +                       break;
> > > +#ifdef VFIO_PRESENT
> > > +               case RTE_INTR_HANDLE_VFIO_MSIX:
> > > +               case RTE_INTR_HANDLE_VFIO_MSI:
> > > +               case RTE_INTR_HANDLE_VFIO_LEGACY:
> > > +                       bytes_read = sizeof(buf.vfio_intr_count);
> > > +                       break;
> > > +#endif
> > > +               default:
> > > +                       bytes_read = 1;
> > > +                       break;
> > > +               }
> > > +
> > > +               /**
> > > +               * read out to clear the ready-to-be-read flag
> > > +               * for epoll_wait.
> > > +               */
> > > +               vec[i] = events[i].data.u32;
> > > +               assert(vec[i] < VFIO_MAX_RXTX_INTR_ID);
> > > +
> > > +               fd = intr_handle->efds[vec[i]];
> > > +               bytes_read = read(fd, &buf, bytes_read);
> > > +               if (bytes_read < 0)
> > > +                       RTE_LOG(ERR, EAL, "Error reading from file "
> > > +                               "descriptor %d: %s\n", fd, strerror(errno));
> > > +               else if (bytes_read == 0)
> > > +                       RTE_LOG(ERR, EAL, "Read nothing from file "
> > > +                               "descriptor %d\n", fd);
> > > +       }
> > > +}
> > 
> > > Why unconditionally read?
> > You are absorbing events from the application if the application gave you
> > an external epfd and populated it with its own fds.
> 
> [Liang, Cunming] The vector number was checked. If an external epfd
> populated some event carry fd rather than a data.u32 but the value
> inside the valid range, it considers as a valid vector number. No matter
> the read success or not, it always notify the event. Do you have any
> suggestion used here to check the condition ?
> 
> > > +static int init_tls_epfd(void)
> > > +{
> > > +       int pfd = epoll_create(1);
> > > +       if (pfd < 0) {
> > > +               RTE_LOG(ERR, EAL,
> > > +                       "Cannot create epoll instance\n");
> > > +               return -1;
> > > +       }
> > > +       return pfd;
> > > +}
> > > +
> > > +int
> > > +rte_intr_rx_wait(struct rte_intr_handle *intr_handle, int epfd,
> > > +                uint32_t *vec, uint16_t num)
> > > +{
> > 
> > In the end, this "rx" does not mean anything to eal.
> 
> [Liang, Cunming] That’s a good point. I tried to remove ‘rx’ and use a
> generic word here. ‘rte_intr_wait’ looks like too generic,
> ‘rte_intr_epfd_wait’ looks not abstract with bsd.
> As the function only serves for rxtx vector, so using the rx prefix.
> Which name do you prefer ?

You should understand that you are trying to wrongly replace a generic lib.
The best name is probably /dev/null.

> > > +#define MAX_EVENTS      8
> > > +       struct epoll_event events[MAX_EVENTS];
> > > +       int ret, nfds = 0;
> > > +
> > > +       if (!intr_handle || !vec) {
> > > +               RTE_LOG(ERR, EAL, "invalid input parameter\n");
> > > +               return -1;
> > > +       }
> > > +
> > > +       if (intr_handle->type != RTE_INTR_HANDLE_VFIO_MSIX) {
> > > +               RTE_LOG(ERR, EAL, "intr type should be VFIO_MSIX\n");
> > > +               return -1;
> > > +       }
> > > +
> > > +       if (epfd == RTE_EPOLL_FD_ANY) {
> > > +               /* using per thread epoll fd */
> > > +               if (unlikely(RTE_PER_LCORE(_epfd) == -1))
> > > +                       RTE_PER_LCORE(_epfd) = init_tls_epfd();
> > > +               epfd = RTE_PER_LCORE(_epfd);
> > > +       }
> > 
> > Rather than testing every time, this should be set by the caller,
> > i.e. epfd is always valid.
> > If application does not want to create a epfd, then it calls
> > rte_intr_rx_wait with RTE_EPOLL_FD_ANY (this name is not well chosen)
> > that is a macro wrapped to RTE_PER_LCORE(_epfd).
> 
> [Liang, Cunming] It sounds good to me. As we don’t expect to expose
> *rte_per_lcore__epfd* as an public symbol, so will define rte_epfd()
> instead.
> Within rte_epfd(), if RTE_PER_LCORE(_epfd) not assigned, then
> init_tls_epfd() once.
> 
> > init_tls_epfd() should be called only once at init time.
> > No need to check every time.
> 
> [Liang, Cunming] As it probably not need per thread epfd at all.
> So I prefer to create it when it real needed as above I mentioned.

> > > +       do {
> > > +               ret = epoll_wait(epfd, events,
> > > +                                RTE_MIN(num, MAX_EVENTS),
> > > +                                EAL_INTR_EPOLL_WAIT_FOREVER);
> > > +               if (unlikely(ret < 0)) {
> > > +                       /* epoll_wait fail */
> > > +                       RTE_LOG(ERR, EAL, "epoll_wait returns with fail\n");
> > > +                       return -1;
> > > +               } else if (ret > 0) {
> > > +                       /* epoll_wait has at least one fd ready to read */
> > > +                       eal_intr_process_rxtx_interrupts(intr_handle, events,
> > > +                                                        vec, ret);
> > > +                       num -= ret;
> > > +                       vec += ret;
> > > +                       nfds += ret;
> > > +               } else if (nfds > 0)
> > > +                       break;
> > > +       } while (num > 0);
> > > +
> > > +       return nfds;
> > > +}
> >
> > You are blocking unless all fds have been set, so you are serialising
> > all events.
> 
> [Liang, Cunming] I’m not sure fully got your point. If any event arrives,
> it gets back. Do you means if no fds added in, it’s always blocking.
> You expect to have a timeout return ?

> > >                         RTE_LOG(ERR, EAL, "  cannot get IRQ info, "
> > > 
> > > -                                       "error %i (%s)\n", errno, strerror(errno));
> > > +                               "error %i (%s)\n", errno, strerror(errno));
> > 
> > Garbage, this has nothing to do with the patch.
> 
> [Liang, Cunming] It’s for line number exceed 80 margin complain.

The title of the patch is "add per rx queue interrupt handling based on VFIO".
So this kind of modification is garbage.
Sorry, I won't play the game "idem / the same" below ;)

> > > -                       if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE) {
> > > +                       if (internal_config.vfio_intr_mode !=
> > > +                           RTE_INTR_MODE_NONE) {
> > >                                 RTE_LOG(ERR, EAL,
> > > -                                               "  interrupt vector does not support eventfd!\n");
> > > +                                       "  interrupt vector "
> > > +                                       "does not support eventfd!\n");
> > 
> > Idem.
> 
> [Liang, Cunming] The same.

> > > -                                       "error %i (%s)\n", errno, strerror(errno));
> > > +                               "error %i (%s)\n", errno, strerror(errno));
> > 
> > Idem.
> 
> [Liang, Cunming] The same.

> > >                 dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
> > > -
> > 
> > Idem.
> 
> [Liang, Cunming] Accept.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 3/8] eal/bsd: dummy for new intr definition
  2015-02-27 11:21       ` Liang, Cunming
@ 2015-02-27 14:22         ` Thomas Monjalon
  2015-02-28  0:37           ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: Thomas Monjalon @ 2015-02-27 14:22 UTC (permalink / raw)
  To: Liang, Cunming; +Cc: dev

2015-02-27 11:21, Liang, Cunming:
> From: David Marchand [mailto:david.marchand@6wind.com]
> > On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang  wrote:
> > > @@ -49,6 +51,8 @@ enum rte_intr_handle_type {
> > > 
> > >  struct rte_intr_handle {
> > >  
> > >         int fd;                          /**< file descriptor */
> > >         enum rte_intr_handle_type type;  /**< handle type */
> > > 
> > > +       int max_intr;                    /**< max interrupt requested */
> > > +       uint32_t vec_num[VFIO_MAX_QUEUE_ID]; /**< rxtx intr vector number */
> > > };
> > 
> > No need to add those since this is not supported for bsd.
> 
> [Liang, Cunming] max_intr is used in dev_init for pci_dev->intr_handle init.
> Vec_num is used in ethdev API rx_intr_vec_get. Without it, BSD macro will
> used for each of the reference place.
> As they’re quite generic, even bsd will require either max_intr or vec
> mapping table.

Is it needed to build and run DPDK on FreeBSD?

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle struct
  2015-02-27 11:28       ` Liang, Cunming
@ 2015-02-27 14:42         ` Thomas Monjalon
  2015-02-27 14:52         ` Thomas Monjalon
  1 sibling, 0 replies; 242+ messages in thread
From: Thomas Monjalon @ 2015-02-27 14:42 UTC (permalink / raw)
  To: Liang, Cunming; +Cc: dev

2015-02-27 11:28, Liang, Cunming:
> From: David Marchand [mailto:david.marchand@6wind.com]
> Sent: Friday, February 27, 2015 6:33 PM
> On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang <cunming.liang@intel.com<mailto:cunming.liang@intel.com>> wrote:
> > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > @@ -38,6 +38,9 @@
> > 
> >  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
> >  #define _RTE_LINUXAPP_INTERRUPTS_H_
> > 
> > +#define VFIO_MAX_RXTX_INTR_ID        32
> > +#define VFIO_MAX_QUEUE_ID            VFIO_MAX_RXTX_INTR_ID
> 
> This is a little weird to talk about vfio here.
> This file is "generic".
> 
> Ok, you will store vfio eventfds here, but vfio is an implementation, not the abstraction.
> [Liang, Cunming] If looking at the rte_intr_handle_type, it includes UIO/VFIO_LEGACY/VFIO_MSI/VFIO_MSIX.
> I agree, VFIO is an implementation, but the different type combination is a kind of ‘abstraction’.
> So in rte_intr_handle (like a multiplexing), some specified field for vfio interrupter mapping, I feel it’s reasonable.
> 
> 
> --
> David Marchand
> 

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle struct
  2015-02-27 11:28       ` Liang, Cunming
  2015-02-27 14:42         ` Thomas Monjalon
@ 2015-02-27 14:52         ` Thomas Monjalon
  2015-02-28  0:32           ` Liang, Cunming
  1 sibling, 1 reply; 242+ messages in thread
From: Thomas Monjalon @ 2015-02-27 14:52 UTC (permalink / raw)
  To: Liang, Cunming; +Cc: dev

2015-02-27 11:28, Liang, Cunming:
> From: David Marchand [mailto:david.marchand@6wind.com]
> Sent: Friday, February 27, 2015 6:33 PM
> > On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang wrote:
> > > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > @@ -38,6 +38,9 @@
> > > 
> > >  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
> > >  #define _RTE_LINUXAPP_INTERRUPTS_H_
> > > 
> > > +#define VFIO_MAX_RXTX_INTR_ID        32
> > > +#define VFIO_MAX_QUEUE_ID            VFIO_MAX_RXTX_INTR_ID
> > 
> > This is a little weird to talk about vfio here.
> > This file is "generic".
> > 
> > Ok, you will store vfio eventfds here, but vfio is an implementation,
> > not the abstraction.
> 
> [Liang, Cunming] If looking at the rte_intr_handle_type, it includes UIO/VFIO_LEGACY/VFIO_MSI/VFIO_MSIX.
> I agree, VFIO is an implementation, but the different type combination is a kind of ‘abstraction’.
> So in rte_intr_handle (like a multiplexing), some specified field for vfio interrupter mapping, I feel it’s reasonable.

I'm not sure I understand. Are we trying to mask the different kernel drivers
from an application point of view, and provide a generic interrupt mechanism?
If yes, why are some VFIO constants needed?
I'm not saying that the current implementation is perfect, but we should try
to improve it.

Thanks

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle struct
  2015-02-27 14:52         ` Thomas Monjalon
@ 2015-02-28  0:32           ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-02-28  0:32 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev



> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Friday, February 27, 2015 10:52 PM
> To: Liang, Cunming
> Cc: David Marchand; dev@dpdk.org; Stephen Hemminger; Zhou, Danny
> Subject: Re: [PATCH v6 2/8] eal/linux: add rx queue interrupt FDs to intr handle
> struct
> 
> 2015-02-27 11:28, Liang, Cunming:
> > From: David Marchand [mailto:david.marchand@6wind.com]
> > Sent: Friday, February 27, 2015 6:33 PM
> > > On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang wrote:
> > > > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > @@ -38,6 +38,9 @@
> > > >
> > > >  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
> > > >  #define _RTE_LINUXAPP_INTERRUPTS_H_
> > > >
> > > > +#define VFIO_MAX_RXTX_INTR_ID        32
> > > > +#define VFIO_MAX_QUEUE_ID            VFIO_MAX_RXTX_INTR_ID
> > >
> > > This is a little weird to talk about vfio here.
> > > This file is "generic".
> > >
> > > Ok, you will store vfio eventfds here, but vfio is an implementation,
> > > not the abstraction.
> >
> > [Liang, Cunming] If looking at the rte_intr_hanle_type, it includes
> UIO/VFIO_LEGACY/VFIO_MSI/VFIO_MSIX.
> > I agree, VFIO is an implementation, but the different type combination is a kind
> of ‘abstraction’.
> > So in rte_intr_handle (like a multiplexing), some specified field for vfio
> interrupter mapping, I feel it’s reasonable.
> 
> Not sure to understand. Are we trying to mask the different kernel drivers
> from an application point of view, and provide a generic interrupt mechanism?
> If yes, why some VFIO constants are needed?
> I'm not saying that the current implementation is perfect, but we should try
> to improve it.
[LCM] VFIO_MAX_RXTX_INTR_ID is easy to fix: it can move to a private interrupt header file, as it is only used inside EAL.
VFIO_MAX_QUEUE_ID can be removed, with vec_num[] created dynamically by the device driver. Sounds good?
> 
> Thanks

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 3/8] eal/bsd: dummy for new intr definition
  2015-02-27 14:22         ` Thomas Monjalon
@ 2015-02-28  0:37           ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-02-28  0:37 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev



> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Friday, February 27, 2015 10:22 PM
> To: Liang, Cunming
> Cc: David Marchand; dev@dpdk.org; Stephen Hemminger
> Subject: Re: [PATCH v6 3/8] eal/bsd: dummy for new intr definition
> 
> 2015-02-27 11:21, Liang, Cunming:
> > From: David Marchand [mailto:david.marchand@6wind.com]
> > > On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang  wrote:
> > > > @@ -49,6 +51,8 @@ enum rte_intr_handle_type {
> > > >
> > > >  struct rte_intr_handle {
> > > >
> > > >         int fd;                          /**< file descriptor */
> > > >         enum rte_intr_handle_type type;  /**< handle type */
> > > >
> > > > +       int max_intr;                    /**< max interrupt requested */
> > > > +       uint32_t vec_num[VFIO_MAX_QUEUE_ID]; /**< rxtx intr vector
> number */
> > > > };
> > >
> > > No need to add those since this is not supported for bsd.
> >
> > [Liang, Cunming] max_intr is used in dev_init for pci_dev->intr_handle init.
> > Vec_num is used in ethdev API rx_intr_vec_get. Without it, BSD macro will
> > used for each of the reference place.
> > As they’re quite generic, even bsd will require either max_intr or vec
> > mapping table.
> 
> Is it needed to build and run DPDK on FreeBSD?
[LCM] As it's an EAL change, I try to make sure FreeBSD can still build and run as normal.


^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 4/8] eal/linux: add per rx queue interrupt handling based on VFIO
  2015-02-27 14:13         ` Thomas Monjalon
@ 2015-02-28  1:45           ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-02-28  1:45 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Thanks Thomas.
It's my fault for replying directly to David's mail; I hadn't noticed his mail wasn't in plain text mode.

> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Friday, February 27, 2015 10:13 PM
> To: Liang, Cunming
> Cc: David Marchand; dev@dpdk.org; Stephen Hemminger; Zhou, Danny
> Subject: Re: [PATCH v6 4/8] eal/linux: add per rx queue interrupt handling based
> on VFIO
> 
> Hi Cunming,
> 
> First, sorry to have to say that, but it is not easy to read discussions
> where quote marks are not used. I re-insert them for clarity.
> 
> Comments below.
> 
> 2015-02-27 12:22, Liang, Cunming:
> > From: David Marchand [mailto:david.marchand@6wind.com]
> > Sent: Friday, February 27, 2015 6:34 PM
> >
> > > I am not really comfortable with this api.
> > >
> > > This is just creating something on top of the standard epoll api with
> > > limitations. In the end, we could just use an external lib that does this
> > > already.
> >
> > [Liang, Cunming] Not really, I think. We try to protect the data inside
> > ‘rte_intr_handle’, it doesn’t expect user to understand the things defined
> > inside ‘rte_intr_handle’.
> > It’s better typedef ‘rte_intr_handle’ as a raw integer ID, having a function
> > to get it from a ethdev. Then all the interrupt api is around it.
> > It provides the common pci NIC devices rxtx interrupt processing approach.
> > For the limitations, we can fix it step by step.
> >
> > > So ok, this will work for your limited use case, but this will not be
> > > really useful for anything else.
> > > Not sure it has its place in eal, this is more an example to me.
> >
> > [Liang, Cunming] ‘limited use case’ do you means only for rxtx ?
> > It don’t expect to provide a generic event mechanism (like libev/libevent
> > does), but a simple way to allow PMD work with DMA interrupt. It mainly
> > abstract for rx interrupt purpose. I appreciate if you could help to list
> > more useful cases.
> 
> You don't expect to provide a generic event mechanism but application
> developpers could need to wait for many events at once, not only Rx ones.
> That's why it's better to provide only the needed parts to use something
> generic like libevent.
> And we should avoid reinventing the wheel.
[LCM] Ok, I get you. I have a simple proposal to allow both RX events and other events to be handled in rte_intr_wait().
For the input data 'epoll_data', instead of using 'u32', let's keep using 'int fd'.
If the most significant bit is 0, event[n] stands for an fd. If it's 1, event[n]&0xFFFF stands for a vector number.
So during 'rte_intr_set', it gets the 16-bit vector number and encodes it as a 32-bit int with the most significant bit set to 1.
Then in 'rte_intr_wait', only the data.fd values with the most significant bit set to 1 are processed, and user fds are bypassed.
The prototype would be 'rte_intr_wait(struct rte_intr_handle *intr_handle, int epfd, int *event, uint16_t num)'.
As the user can already assign an epfd, they can add any normal event fd into the epfd. Does that make sense?
> 
> > > > +static void
> > > > +eal_intr_process_rxtx_interrupts(struct rte_intr_handle *intr_handle,
> > > > +                                struct epoll_event *events,
> > > > +                                uint32_t *vec, int nfds)
> > > > +{
> > > > +       int i, bytes_read;
> > > > +       union rte_intr_read_buffer buf;
> > > > +       int fd;
> > > > +
> > > > +       for (i = 0; i < nfds; i++) {
> > > > +               /* set the length to be read for different handle type */
> > > > +               switch (intr_handle->type) {
> > > > +               case RTE_INTR_HANDLE_UIO:
> > > > +                       bytes_read = sizeof(buf.uio_intr_count);
> > > > +                       break;
> > > > +               case RTE_INTR_HANDLE_ALARM:
> > > > +                       bytes_read = sizeof(buf.timerfd_num);
> > > > +                       break;
> > > > +#ifdef VFIO_PRESENT
> > > > +               case RTE_INTR_HANDLE_VFIO_MSIX:
> > > > +               case RTE_INTR_HANDLE_VFIO_MSI:
> > > > +               case RTE_INTR_HANDLE_VFIO_LEGACY:
> > > > +                       bytes_read = sizeof(buf.vfio_intr_count);
> > > > +                       break;
> > > > +#endif
> > > > +               default:
> > > > +                       bytes_read = 1;
> > > > +                       break;
> > > > +               }
> > > > +
> > > > +               /**
> > > > +               * read out to clear the ready-to-be-read flag
> > > > +               * for epoll_wait.
> > > > +               */
> > > > +               vec[i] = events[i].data.u32;
> > > > +               assert(vec[i] < VFIO_MAX_RXTX_INTR_ID);
> > > > +
> > > > +               fd = intr_handle->efds[vec[i]];
> > > > +               bytes_read = read(fd, &buf, bytes_read);
> > > > +               if (bytes_read < 0)
> > > > +                       RTE_LOG(ERR, EAL, "Error reading from file "
> > > > +                               "descriptor %d: %s\n", fd, strerror(errno));
> > > > +               else if (bytes_read == 0)
> > > > +                       RTE_LOG(ERR, EAL, "Read nothing from file "
> > > > +                               "descriptor %d\n", fd);
> > > > +       }
> > > > +}
> > >
> > > Why unconditionnally read ?
> > > You are absorbing events from the application if the application gave you
> > > an external epfd and populated it with its own fds.
> >
> > [Liang, Cunming] The vector number was checked. If an external epfd
> > populated some event carry fd rather than a data.u32 but the value
> > inside the valid range, it considers as a valid vector number. No matter
> > the read success or not, it always notify the event. Do you have any
> > suggestion used here to check the condition ?
> >
> > > > +static int init_tls_epfd(void)
> > > > +{
> > > > +       int pfd = epoll_create(1);
> > > > +       if (pfd < 0) {
> > > > +               RTE_LOG(ERR, EAL,
> > > > +                       "Cannot create epoll instance\n");
> > > > +               return -1;
> > > > +       }
> > > > +       return pfd;
> > > > +}
> > > > +
> > > > +int
> > > > +rte_intr_rx_wait(struct rte_intr_handle *intr_handle, int epfd,
> > > > +                uint32_t *vec, uint16_t num)
> > > > +{
> > >
> > > In the end, this "rx" does not mean anything to eal.
> >
> > [Liang, Cunming] That’s a good point. I tried to remove ‘rx’ and use a
> > generic word here. ‘rte_intr_wait’ looks like too generic,
> > ‘rte_intr_epfd_wait’ looks not abstract with bsd.
> > As the function only serves for rxtx vector, so using the rx prefix.
> > Which name do you prefer ?
> 
> You should understand that you are trying to wrongly replace a generic lib.
> The best name is probably /dev/null.
[Liang, Cunming] If other user fds are allowed to be added into the epfd instance, I feel OK renaming it to rte_intr_wait().
> 
> > > > +#define MAX_EVENTS      8
> > > > +       struct epoll_event events[MAX_EVENTS];
> > > > +       int ret, nfds = 0;
> > > > +
> > > > +       if (!intr_handle || !vec) {
> > > > +               RTE_LOG(ERR, EAL, "invalid input parameter\n");
> > > > +               return -1;
> > > > +       }
> > > > +
> > > > +       if (intr_handle->type != RTE_INTR_HANDLE_VFIO_MSIX) {
> > > > +               RTE_LOG(ERR, EAL, "intr type should be VFIO_MSIX\n");
> > > > +               return -1;
> > > > +       }
> > > > +
> > > > +       if (epfd == RTE_EPOLL_FD_ANY) {
> > > > +               /* using per thread epoll fd */
> > > > +               if (unlikely(RTE_PER_LCORE(_epfd) == -1))
> > > > +                       RTE_PER_LCORE(_epfd) = init_tls_epfd();
> > > > +               epfd = RTE_PER_LCORE(_epfd);
> > > > +       }
> > >
> > > Rather than testing every time, this should be set by the caller,
> > > i.e. epfd is always valid.
> > > If application does not want to create a epfd, then it calls
> > > rte_intr_rx_wait with RTE_EPOLL_FD_ANY (this name is not well chosen)
> > > that is a macro wrapped to RTE_PER_LCORE(_epfd).
> >
> > [Liang, Cunming] It sounds good to me. As we don’t expect to expose
> > *rte_per_lcore__epfd* as an public symbol, so will define rte_epfd()
> > instread.
> > Within rte_epfd(), if RTE_PER_LCORE(_epfd) not assigned, then
> > init_tls_epfd() once.
> >
> > > init_tls_epfd() should be called only once at init time.
> > > No need to check every time.
> >
> > [Liang, Cunming] As it probably not need per thread epfd at all.
> > So I prefer to create it when it real needed as above I mentioned.
> 
> > > > +       do {
> > > > +               ret = epoll_wait(epfd, events,
> > > > +                                RTE_MIN(num, MAX_EVENTS),
> > > > +                                EAL_INTR_EPOLL_WAIT_FOREVER);
> > > > +               if (unlikely(ret < 0)) {
> > > > +                       /* epoll_wait fail */
> > > > +                       RTE_LOG(ERR, EAL, "epoll_wait returns with fail\n");
> > > > +                       return -1;
> > > > +               } else if (ret > 0) {
> > > > +                       /* epoll_wait has at least one fd ready to read */
> > > > +                       eal_intr_process_rxtx_interrupts(intr_handle, events,
> > > > +                                                        vec, ret);
> > > > +                       num -= ret;
> > > > +                       vec += ret;
> > > > +                       nfds += ret;
> > > > +               } else if (nfds > 0)
> > > > +                       break;
> > > > +       } while (num > 0);
> > > > +
> > > > +       return nfds;
> > > > +}
> > >
> > > You are blocking unless all fds have been set, so you are serialising
> > > all events.
> >
> > [Liang, Cunming] I’m not sure fully got your point. If any event arrives,
> > it gets back. Do you means if no fds added in, it’s always blocking.
> > You expect to have a timeout return ?
> 
> > > >                         RTE_LOG(ERR, EAL, "  cannot get IRQ info, "
> > > >
> > > > -                                       "error %i (%s)\n", errno, strerror(errno));
> > > > +                               "error %i (%s)\n", errno, strerror(errno));
> > >
> > > Garbage, this has nothing to do with the patch.
> >
> > [Liang, Cunming] It’s for line number exceed 80 margin complain.
> 
> The title of the patch is "add per rx queue interrupt handling based on VFIO".
> So this kind of modification is a garbage.
> Sorry, I won't play the game "idem / the same" below ;)
[Liang, Cunming] That's actually a question I have had in mind.
For example, say there's a one-line change in a diff file. It usually has several lines of code around it.
If those lines of code can't pass the checkpatch rules, what's the correct way to handle it?
Do you suggest having a first patch to correct the format, followed by a patch with the actual content?

> 
> > > > -                       if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE)
> {
> > > > +                       if (internal_config.vfio_intr_mode !=
> > > > +                           RTE_INTR_MODE_NONE) {
> > > >                                 RTE_LOG(ERR, EAL,
> > > > -                                               "  interrupt vector does not support eventfd!\n");
> > > > +                                       "  interrupt vector "
> > > > +                                       "does not support eventfd!\n");
> > >
> > > Idem.
> >
> > [Liang, Cunming] The same.
> 
> > > > -                                       "error %i (%s)\n", errno, strerror(errno));
> > > > +                               "error %i (%s)\n", errno, strerror(errno));
> > >
> > > Idem.
> >
> > [Liang, Cunming] The same.
> 
> > > >                 dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
> > > > -
> > >
> > > Idem.
> >
> > [Liang, Cunming] Accept.


^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD
  2015-02-27 10:38   ` David Marchand
@ 2015-02-28 22:38     ` Stephen Hemminger
  2015-03-04  0:52     ` Stephen Hemminger
  1 sibling, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-02-28 22:38 UTC (permalink / raw)
  To: David Marchand; +Cc: dev

On Fri, 27 Feb 2015 11:38:25 +0100
David Marchand <david.marchand@6wind.com> wrote:

> On Fri, Feb 27, 2015 at 5:56 AM, Cunming Liang <cunming.liang@intel.com> wrote:
> v6 changes
>  - split rte_intr_wait_rx_pkt into two APIs 'wait' and 'set'.
>  - rewrite rte_intr_rx_wait/rte_intr_rx_set.
>  - using vector number instead of queue_id as interrupt API params.
>  - patch reorder and split.
> 
> 
> Ok, so after looking at this patchset, I would say this is the right direction, but still this is too limited.
> The ethdev part and the vfio eventfds part look acceptable to me.
> But thinking about it, I could just reuse a standard event library with the eventfds I would get from ethdev without a need for a new eal api.

Also, you need to introduce a flag (in pci drv_flags?) so that application can
know if poll mode interrupt will work or not on the given device before
configuring it.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 8/8] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 8/8] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Cunming Liang
@ 2015-02-28 22:57     ` Stephen Hemminger
  2015-02-28 23:00     ` Stephen Hemminger
  1 sibling, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-02-28 22:57 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Fri, 27 Feb 2015 12:56:16 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> +	/* Enable one-shot rx interrupt */
> +	rte_spinlock_lock(&(locks[port_id]));
> +	rte_eth_dev_rx_intr_enable(port_id, queue_id);
> +	rte_spinlock_unlock(&(locks[port_id]));
> +

If it always requires locks like this, then the API should
do the locking internally.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 8/8] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 8/8] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Cunming Liang
  2015-02-28 22:57     ` Stephen Hemminger
@ 2015-02-28 23:00     ` Stephen Hemminger
  1 sibling, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-02-28 23:00 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Fri, 27 Feb 2015 12:56:16 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> +/* ethernet addresses of ports */
> +static rte_spinlock_t locks[RTE_MAX_ETHPORTS];

Comment is incorrect; this is a lock array, not an address array.


>  static struct rte_eth_conf port_conf = {
>  	.rxmode = {
> -		.mq_mode	= ETH_MQ_RX_RSS,
> +		.mq_mode = ETH_MQ_RX_RSS,

Please don't mix white space changes with code changes

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD
  2015-02-27 10:38   ` David Marchand
  2015-02-28 22:38     ` Stephen Hemminger
@ 2015-03-04  0:52     ` Stephen Hemminger
  2015-03-04  3:20       ` Liang, Cunming
  1 sibling, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-03-04  0:52 UTC (permalink / raw)
  To: David Marchand; +Cc: dev

On Fri, 27 Feb 2015 11:38:25 +0100
David Marchand <david.marchand@6wind.com> wrote:

> Ok, so after looking at this patchset, I would say this is the right direction, but still this is too limited.
> The ethdev part and the vfio eventfds part look acceptable to me.
> But thinking about it, I could just reuse a standard event library with the eventfds I would get from ethdev without a need for a new eal api.

I would prefer that there was just an fd and a callback.
An application should be able to use what ever event model or library it wants.

IMHO the existing interrupt thread model is incorrectly designed and creates
lots of opportunities for races because of that. Look at the effort it has to
use to pass the event back to link state code.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD
  2015-03-04  0:52     ` Stephen Hemminger
@ 2015-03-04  3:20       ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-03-04  3:20 UTC (permalink / raw)
  To: Stephen Hemminger, David Marchand; +Cc: dev

Hi Stephen,

On 3/4/2015 8:52 AM, Stephen Hemminger wrote:
> On Fri, 27 Feb 2015 11:38:25 +0100
> David Marchand <david.marchand@6wind.com> wrote:
>
>> Ok, so after looking at this patchset, I would say this is the right direction, but still this is too limited.
>> The ethdev part and the vfio eventfds part look acceptable to me.
>> But thinking about it, I could just reuse a standard event library with the eventfds I would get from ethdev without a need for a new eal api.
> I would prefer that there was just an fd and a callback.
> An application should be able to use what ever event model or library it wants.
[LCM] I agree, on application perspective it is.
As it's easy to get RX/TX interrupt fd, there's no limit for application 
to do all the things with the 3rd party event library.
The improvement probably be 1) a rte_intr_vec_to_fd() API; 2) expose 
eal_intr_process_rxtx_interrupts() as a public API for RX/TX interrupt 
callback.
However, it should still be possible to use the packet interrupt feature in case 
the application doesn't choose any 3rd party event library.
That's the motivation to give a very lightweight 'wait' EAL API. Sounds 
reasonable ?
>
> IMHO the existing interrupt thread model is incorrectly designed and creates
> lots of opportunities for races because of that. Look at the effort it has to
> use to pass the event back to link state code.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 7/8] igb: enable rx queue interrupts for PF
  2015-02-27  4:56   ` [dpdk-dev] [PATCH v6 7/8] igb: enable rx queue interrupts for PF Cunming Liang
@ 2015-03-20 20:51     ` Stephen Hemminger
  2015-05-11  5:16       ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-03-20 20:51 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Fri, 27 Feb 2015 12:56:15 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

>  
>  /*
> + * It clears the interrupt causes and enables the interrupt.
> + * It will be called once only during nic initialized.
> + *
> + * @param dev
> + *  Pointer to struct rte_eth_dev.
> + *
> + * @return
> + *  - On success, zero.
> + *  - On failure, a negative value.
> + */
> +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
> +{
> +

This function should be void
It always succeeds and the caller does not check the return value.

If you did this in one driver, I bet other drivers have same problem.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com>
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
                     ` (9 preceding siblings ...)
  2015-02-27 10:38   ` David Marchand
@ 2015-05-05  5:39   ` Cunming Liang
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 01/10] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
                       ` (10 more replies)
  2015-05-05  5:53   ` [dpdk-dev] [PATCH v7 00/10] " Cunming Liang
  11 siblings, 11 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

v7 changes
 - decouple epoll event and intr operation
 - add condition check in the case intr vector is disabled
 - renaming some APIs

v6 changes
 - split rte_intr_wait_rx_pkt into two APIs 'wait' and 'set'.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set.
 - using vector number instead of queue_id as interrupt API params.
 - patch reorder and split.

v5 changes
 - Rebase the patchset onto the HEAD
 - Isolate ethdev from EAL for new-added wait-for-rx interrupt function
 - Export wait-for-rx interrupt function for shared libraries
 - Split-off a new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect
 - Change sample application to accommodate EAL function spec change
   accordingly

v4 changes
 - Export interrupt enable/disable functions for shared libraries
 - Adjust position of new-added structure fields and functions to
   avoid breaking ABI
 
v3 changes
 - Add return value for interrupt enable/disable functions
 - Move spinlock from PMD to L3fwd-power
 - Remove unnecessary variables in e1000_mac_info
 - Fix miscellaneous review comments
 
v2 changes
 - Fix compilation issue in Makefile for missed header file.
 - Consolidate internal and community review comments of v1 patch set.
 
The patch series introduce low-latency one-shot rx interrupt into DPDK with
polling and interrupt mode switch control example.
 
DPDK userspace interrupt notification and handling mechanism is based on UIO
with below limitation:
1) It is designed to handle LSC interrupt only with inefficient suspended
   pthread wakeup procedure (e.g. UIO wakes up LSC interrupt handling thread
   which then wakes up DPDK polling thread). In this way, it introduces
   non-deterministic wakeup latency for DPDK polling thread as well as packet
   latency if it is used to handle Rx interrupt.
2) UIO only supports a single interrupt vector which has to be shared by
   LSC interrupt and interrupts assigned to dedicated rx queues.
 
This patchset includes below features:
1) Enable one-shot rx queue interrupt in ixgbe PMD(PF & VF) and igb PMD(PF only).
2) Build on top of the VFIO mechanism instead of UIO, so it could support
   up to 64 interrupt vectors for rx queue interrupts.
3) Have 1 DPDK polling thread handle per Rx queue interrupt with a dedicated
   VFIO eventfd, which eliminates non-deterministic pthread wakeup latency in
   user space.
4) Demonstrate interrupts control APIs and userspace NAPI-like polling/interrupt
   switch algorithms in L3fwd-power example.

Known limitations:
1) It does not work for UIO because a single interrupt eventfd shared by LSC
   and rx queue interrupt handlers causes a mess.
2) LSC interrupt is not supported by VF driver, so it is by default disabled
   in L3fwd-power now. Feel free to turn it on if you want to support both LSC
   and rx queue interrupts on a PF.

Cunming Liang (10):
  eal/linux: add interrupt vectors support in intr_handle
  eal/linux: add rte_epoll_wait/ctl support
  eal/linux: add API to set rx interrupt event monitor
  eal/bsd: dummy for new intr definition
  eal/linux: fix comments typo on vfio msi
  eal/linux: add interrupt vectors handling on VFIO
  ethdev: add rx intr enable, disable and ctl functions
  ixgbe: enable rx queue interrupts for both PF and VF
  igb: enable rx queue interrupts for PF
  l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode
    switch

 examples/l3fwd-power/main.c                        | 206 ++++++++--
 .../bsdapp/eal/include/exec-env/rte_interrupts.h   |   6 +
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 232 +++++++++--
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c         |  12 +
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |  97 +++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   4 +
 lib/librte_ether/rte_ethdev.c                      | 132 +++++++
 lib/librte_ether/rte_ethdev.h                      | 104 +++++
 lib/librte_ether/rte_ether_version.map             |   4 +
 lib/librte_pmd_e1000/e1000_ethdev.h                |   3 +
 lib/librte_pmd_e1000/igb_ethdev.c                  | 256 +++++++++++--
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c                | 425 ++++++++++++++++++++-
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h                |   7 +
 13 files changed, 1394 insertions(+), 94 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 01/10] eal/linux: add interrupt vectors support in intr_handle
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
@ 2015-05-05  5:39     ` Cunming Liang
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 02/10] eal/linux: add rte_epoll_wait/ctl support Cunming Liang
                       ` (9 subsequent siblings)
  10 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

The patch adds interrupt vectors support in rte_intr_handle.
'vec_en' is set when interrupt vectors are detected and associated event fds are set.
Those event fds are stored in efds[].
'intr_vec' is reserved for device driver to initialize the vector mapping table.
When the event fds are added to a specified epoll instance, 'eptrs' will hold the rte_epoll_event object pointer.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes:
 - add eptrs[], it's used to store the register rte_epoll_event instances.
 - add vec_en, to log the vector capability status.

v6 changes:
 - add mapping table between irq vector number and queue id.

v5 changes:
 - Create this new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect.

 lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 6a159c7..e1f4a7a 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,8 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_
 
+#define RTE_MAX_RXTX_INTR_VEC_ID     32
+
 enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_UNKNOWN = 0,
 	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
@@ -48,6 +50,8 @@ enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_MAX
 };
 
+struct rte_epoll_event;
+
 /** Handle for interrupts. */
 struct rte_intr_handle {
 	union {
@@ -57,6 +61,12 @@ struct rte_intr_handle {
 	};
 	int fd;	 /**< interrupt event file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+	int max_intr;                    /**< max interrupt requested */
+	int vec_en;                      /**< intr vectors enabled */
+	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
+	struct rte_epoll_event *eptrs[RTE_MAX_RXTX_INTR_VEC_ID];
+					 /**< intr vector epoll event ptr */
+	int *intr_vec;                   /**< intr vector number array */
 };
 
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 02/10] eal/linux: add rte_epoll_wait/ctl support
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 01/10] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
@ 2015-05-05  5:39     ` Cunming Liang
  2015-05-08  2:57       ` Stephen Hemminger
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 03/10] eal/linux: add API to set rx interrupt event monitor Cunming Liang
                       ` (8 subsequent siblings)
  10 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

The patch adds 'rte_epoll_wait' and 'rte_epoll_ctl' for async event wakeup.
It defines 'struct rte_epoll_event' as the event param.
The 'op' uses the same enum as epoll_wait/ctl does.
The epoll event supports carrying raw user data and registering a callback which is executed during wakeup.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes
 - split v6[4/8] into two patches, one for epoll event(this one)
   another for rx intr(next patch)
 - introduce rte_epoll_event definition
 - rte_epoll_wait/ctl for more generic RTE epoll API

v6 changes
 - split rte_intr_wait_rx_pkt into two function, wait and set.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set to remove queue visibility on eal.
 - rte_intr_rx_wait to support multiplexing.
 - allow epfd as input to support flexible event fd combination.

 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 97 ++++++++++++++++++++++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h | 66 ++++++++++++++-
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |  3 +
 3 files changed, 165 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 66deda2..b641745 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -69,6 +69,8 @@
 
 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
 
+static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
+
 /**
  * union for pipe fds.
  */
@@ -859,3 +861,98 @@ rte_eal_intr_init(void)
 	return -ret;
 }
 
+static void
+eal_epoll_process_event(struct epoll_event *evs, int n,
+			struct rte_epoll_event *events)
+{
+	int i;
+	struct rte_epoll_event *rev;
+	for (i = 0; i < n; i++) {
+		rev = (struct rte_epoll_event *)evs[i].data.ptr;
+		if (rev) {
+			events[i].fd    = rev->fd;
+			events[i].event = rev->event;
+			events[i].data  = rev->data;
+			if (rev->cb_fun)
+				rev->cb_fun(rev->fd, rev->cb_arg);
+		}
+	}
+}
+
+static inline int
+eal_init_tls_epfd(void)
+{
+	int pfd = epoll_create(255);
+	if (pfd < 0) {
+		RTE_LOG(ERR, EAL,
+			"Cannot create epoll instance\n");
+		return -1;
+	}
+	return pfd;
+}
+
+int
+rte_intr_tls_epfd(void)
+{
+	if (RTE_PER_LCORE(_epfd) == -1)
+		RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
+
+	return RTE_PER_LCORE(_epfd);
+}
+
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout)
+{
+	struct epoll_event evs[maxevents];
+	int rc;
+
+	if (!events) {
+		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+		return -1;
+	}
+
+	/* using per thread epoll fd */
+	if (epfd == RTE_EPOLL_PER_THREAD)
+		epfd = rte_intr_tls_epfd();
+
+	rc = epoll_wait(epfd, evs, maxevents, timeout);
+	if (likely(rc > 0))
+		/* epoll_wait has at least one fd ready to read */
+		eal_epoll_process_event(evs, rc, events);
+	else if (rc < 0) {
+		/* epoll_wait fail */
+		RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
+			strerror(errno));
+		rc = -1;
+	}
+
+	return rc;
+}
+
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+	      struct rte_epoll_event *event)
+{
+	struct epoll_event ev;
+
+	if (!event) {
+		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+		return -1;
+	}
+
+	/* using per thread epoll fd */
+	if (epfd == RTE_EPOLL_PER_THREAD)
+		epfd = rte_intr_tls_epfd();
+
+	event->fd   = fd;  /* ignore fd in rev */
+	ev.data.ptr = (void *)event;
+	ev.events   = event->event;
+	if (epoll_ctl(epfd, op, fd, &ev) < 0) {
+		RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
+			op, fd, strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index e1f4a7a..af405cf 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -50,7 +50,19 @@ enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_MAX
 };
 
-struct rte_epoll_event;
+#define RTE_INTR_EVENT_ADD            1UL
+#define	RTE_INTR_EVENT_DEL            2UL
+
+typedef void (*rte_intr_event_cb_t)(int fd, void *arg);
+
+/** interrupt epoll event obj, taken by epoll_event.ptr */
+struct rte_epoll_event {
+	int fd;                       /**< OUT: event fd */
+	uint32_t event;               /**< event type */
+	void *data;                   /**< User data */
+	rte_intr_event_cb_t cb_fun;   /**< IN: callback fun */
+	void *cb_arg;	              /**< IN: callback arg */
+};
 
 /** Handle for interrupts. */
 struct rte_intr_handle {
@@ -69,4 +81,56 @@ struct rte_intr_handle {
 	int *intr_vec;                   /**< intr vector number array */
 };
 
+#define RTE_EPOLL_PER_THREAD        -1  /**< to hint using per thread epfd */
+
+/**
+ * It waits for events on the epoll instance.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller wait for events.
+ * @param events
+ *   Memory area contains the events that will be available for the caller.
+ * @param maxevents
+ *   Up to maxevents are returned, must greater than zero.
+ * @param timeout
+ *   Specifying a timeout of -1 causes a block indefinitely.
+ *   Specifying a timeout equal to zero cause to return immediately.
+ * @return
+ *   - On success, returns the number of available event.
+ *   - On failure, a negative value.
+ */
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout);
+
+/**
+ * It performs control operations on epoll instance referred by the epfd.
+ * It requests that the operation op be performed for the target fd.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller perform control operations.
+ * @param op
+ *   The operation be performed for the target fd.
+ * @param fd
+ *   The target fd on which the control ops perform.
+ * @param event
+ *   Describes the object linked to the fd.
+ *   Note: The caller must take care the object deletion after CTL_DEL.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+	      struct rte_epoll_event *event);
+
+/**
+ * The function returns the per thread epoll instance.
+ *
+ * @return
+ *   epfd the epoll instance refered to.
+ */
+int
+rte_intr_tls_epfd(void);
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 7e850a9..840002e 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -52,6 +52,8 @@ DPDK_2.0 {
 	rte_eal_vdev_init;
 	rte_eal_vdev_uninit;
 	rte_eal_wait_lcore;
+	rte_epoll_ctl;
+	rte_epoll_wait;
 	rte_exit;
 	rte_get_hpet_cycles;
 	rte_get_hpet_hz;
@@ -61,6 +63,7 @@ DPDK_2.0 {
 	rte_intr_callback_unregister;
 	rte_intr_disable;
 	rte_intr_enable;
+	rte_intr_tls_epfd;
 	rte_log;
 	rte_log_add_in_history;
 	rte_log_cur_msg_loglevel;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 03/10] eal/linux: add API to set rx interrupt event monitor
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 01/10] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 02/10] eal/linux: add rte_epoll_wait/ctl support Cunming Liang
@ 2015-05-05  5:39     ` Cunming Liang
  2015-05-05 18:34       ` Stephen Hemminger
  2015-05-08  2:58       ` Stephen Hemminger
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 04/10] eal/bsd: dummy for new intr definition Cunming Liang
                       ` (7 subsequent siblings)
  10 siblings, 2 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

The patch adds 'rte_intr_rx_ctl' to add or delete interrupt vector events monitor on specified epoll instance.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes
 - rename rte_intr_rx_set to rte_intr_rx_ctl.
 - rte_intr_rx_ctl uses rte_epoll_ctl to register epoll event instance.
 - the intr rx event instance includes a intr process callback.

v6 changes
 - split rte_intr_wait_rx_pkt into two function, wait and set.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set to remove queue visibility on eal.
 - rte_intr_rx_wait to support multiplexing.
 - allow epfd as input to support flexible event fd combination.

 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 95 ++++++++++++++++++++++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h | 23 ++++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |  1 +
 3 files changed, 119 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index b641745..1090d7b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -862,6 +862,35 @@ rte_eal_intr_init(void)
 }
 
 static void
+eal_intr_proc_rxtx_intr(int fd, struct rte_intr_handle *intr_handle)
+{
+	union rte_intr_read_buffer buf;
+	int bytes_read = 1;
+
+	if (intr_handle->type != RTE_INTR_HANDLE_VFIO_MSIX) {
+		RTE_LOG(ERR, EAL, "intr type should be VFIO_MSIX\n");
+		return;
+	}
+
+#ifdef VFIO_PRESENT
+	bytes_read = sizeof(buf.vfio_intr_count);
+#endif
+
+	/**
+	 * read out to clear the ready-to-be-read flag
+	 * for epoll_wait.
+	 */
+	bytes_read = read(fd, &buf, bytes_read);
+	if (bytes_read < 0)
+		RTE_LOG(ERR, EAL, "Error reading from file "
+			"descriptor %d: %s\n", fd,
+			strerror(errno));
+	else if (bytes_read == 0)
+		RTE_LOG(ERR, EAL, "Read nothing from file "
+			"descriptor %d\n", fd);
+}
+
+static void
 eal_epoll_process_event(struct epoll_event *evs, int n,
 			struct rte_epoll_event *events)
 {
@@ -956,3 +985,69 @@ rte_epoll_ctl(int epfd, int op, int fd,
 
 	return 0;
 }
+
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
+		int op, unsigned int vec, void *data, int socket)
+{
+	struct rte_epoll_event *rev;
+	int epfd_op;
+	int rc = 0;
+
+	if (!intr_handle || vec >= RTE_MAX_RXTX_INTR_VEC_ID ||
+	    !intr_handle->vec_en) {
+		RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
+		return -1;
+	}
+
+	if (socket == SOCKET_ID_ANY)
+		socket = rte_socket_id();
+
+	switch (op) {
+	case RTE_INTR_EVENT_ADD:
+		epfd_op = EPOLL_CTL_ADD;
+		if (intr_handle->eptrs[vec] != NULL) {
+			RTE_LOG(ERR, EAL, "Event already been added.\n");
+			return -1;
+		}
+
+		/* new event */
+		rev = rte_zmalloc_socket("eptrs", sizeof(*rev),
+					 RTE_CACHE_LINE_SIZE, socket);
+		if (rev == NULL) {
+			RTE_LOG(ERR, EAL, "event obj alloc fail\n");
+			return -1;
+		}
+
+		/* attach to intr vector fd */
+		rev->fd     = intr_handle->efds[vec];
+		rev->event  = EPOLLIN | EPOLLPRI | EPOLLET;
+		rev->data   = data;
+		rev->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
+		rev->cb_arg = (void *)intr_handle;
+
+		rc = rte_epoll_ctl(epfd, epfd_op, rev->fd, rev);
+		if (!rc)
+			intr_handle->eptrs[vec] = rev;
+		else
+			rte_free(rev);
+
+		break;
+	case RTE_INTR_EVENT_DEL:
+		epfd_op = EPOLL_CTL_DEL;
+		if (intr_handle->eptrs[vec] != NULL) {
+			rev = intr_handle->eptrs[vec];
+			rc = rte_epoll_ctl(epfd, epfd_op, rev->fd, rev);
+			if (!rc) {
+				rte_free(rev);
+				intr_handle->eptrs[vec] = NULL;
+			}
+		}
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "event op type mismatch\n");
+		rc = -1;
+	}
+
+	return rc;
+}
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index af405cf..3d9f6d7 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -133,4 +133,27 @@ rte_epoll_ctl(int epfd, int op, int fd,
 int
 rte_intr_tls_epfd(void);
 
+/**
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {ADD, DEL}.
+ * @param vec
+ *   RX intr vector number added to the epoll instance wait list.
+ * @param data
+ *   User raw data.
+ * @param socket
+ *   Specifying the socket id.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+		int epfd, int op, unsigned int vec,
+		void *data, int socket);
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 840002e..65b5ed2 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -63,6 +63,7 @@ DPDK_2.0 {
 	rte_intr_callback_unregister;
 	rte_intr_disable;
 	rte_intr_enable;
+	rte_intr_rx_ctl;
 	rte_intr_tls_epfd;
 	rte_log;
 	rte_log_add_in_history;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 04/10] eal/bsd: dummy for new intr definition
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
                       ` (2 preceding siblings ...)
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 03/10] eal/linux: add API to set rx interrupt event monitor Cunming Liang
@ 2015-05-05  5:39     ` Cunming Liang
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 05/10] eal/linux: fix comments typo on vfio msi Cunming Liang
                       ` (6 subsequent siblings)
  10 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

To make bsd compiling happy with new intr changes.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes
 - remove stub 'linux only' function from source file

 lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
index 87a9cf6..f7fb6af 100644
--- a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,8 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_
 
+#define RTE_MAX_RXTX_INTR_VEC_ID        32
+
 enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_UNKNOWN = 0,
 	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
@@ -49,6 +51,10 @@ enum rte_intr_handle_type {
 struct rte_intr_handle {
 	int fd;                          /**< file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+	int max_intr;                    /**< max interrupt requested */
+	int vec_en;                      /**< intr vectors enabled */
+	int efds[RTE_MAX_RXTX_INTR_VEC_ID]; /**< intr vectors/efds mapping */
+	uint16_t *intr_vec;               /**< intr vector number array */
 };
 
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 05/10] eal/linux: fix comments typo on vfio msi
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
                       ` (3 preceding siblings ...)
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 04/10] eal/bsd: dummy for new intr definition Cunming Liang
@ 2015-05-05  5:39     ` Cunming Liang
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 06/10] eal/linux: add interrupt vectors handling on VFIO Cunming Liang
                       ` (5 subsequent siblings)
  10 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal_interrupts.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 1090d7b..178a88e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -219,7 +219,7 @@ vfio_disable_intx(struct rte_intr_handle *intr_handle) {
 	return 0;
 }
 
-/* enable MSI-X interrupts */
+/* enable MSI interrupts */
 static int
 vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 	int len, ret;
@@ -265,7 +265,7 @@ vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 	return 0;
 }
 
-/* disable MSI-X interrupts */
+/* disable MSI interrupts */
 static int
 vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 	struct vfio_irq_set *irq_set;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 06/10] eal/linux: add interrupt vectors handling on VFIO
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
                       ` (4 preceding siblings ...)
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 05/10] eal/linux: fix comments typo on vfio msi Cunming Liang
@ 2015-05-05  5:39     ` Cunming Liang
  2015-05-05 18:38       ` Stephen Hemminger
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 07/10] ethdev: add rx intr enable, disable and ctl functions Cunming Liang
                       ` (4 subsequent siblings)
  10 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

This patch does below:
 - Create VFIO eventfds for each interrupt vector
 - Assign per interrupt vector's eventfd to VFIO by ioctl

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes
 - cleanup unnecessary code change
 - split event and intr operation to other patches

 lib/librte_eal/linuxapp/eal/eal_interrupts.c | 36 ++++++++++++----------------
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c   | 12 ++++++++++
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 178a88e..dfe857e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -128,6 +128,9 @@ static pthread_t intr_thread;
 #ifdef VFIO_PRESENT
 
 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
+/* irq set buffer length for queue interrupts and LSC interrupt */
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+			      sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
 
 /* enable legacy (INTx) interrupts */
 static int
@@ -293,8 +296,8 @@ vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 /* enable MSI-X interrupts */
 static int
 vfio_enable_msix(struct rte_intr_handle *intr_handle) {
-	int len, ret;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
+	int len, ret, max_intr;
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	struct vfio_irq_set *irq_set;
 	int *fd_ptr;
 
@@ -302,12 +305,19 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 
 	irq_set = (struct vfio_irq_set *) irq_set_buf;
 	irq_set->argsz = len;
-	irq_set->count = 1;
+	if ((!intr_handle->max_intr) ||
+		(intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID))
+		max_intr = RTE_MAX_RXTX_INTR_VEC_ID + 1;
+	else
+		max_intr = intr_handle->max_intr;
+
+	irq_set->count = max_intr;
 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 	irq_set->start = 0;
 	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
+	memcpy(fd_ptr, intr_handle->efds, sizeof(intr_handle->efds));
+	fd_ptr[max_intr - 1] = intr_handle->fd;
 
 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 
@@ -317,22 +327,6 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 		return -1;
 	}
 
-	/* manually trigger interrupt to enable it */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error triggering MSI-X interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
@@ -340,7 +334,7 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 static int
 vfio_disable_msix(struct rte_intr_handle *intr_handle) {
 	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	int len, ret;
 
 	len = sizeof(struct vfio_irq_set);
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index aea1fb1..387f54c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -308,6 +308,18 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
 		case VFIO_PCI_MSIX_IRQ_INDEX:
 			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
 			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
+			for (i = 0; i < RTE_MAX_RXTX_INTR_VEC_ID; i++) {
+				fd = eventfd(0, 0);
+				if (fd < 0) {
+					RTE_LOG(ERR, EAL,
+						"cannot setup eventfd,"
+						"error %i (%s)\n",
+						errno, strerror(errno));
+					return -1;
+				}
+				dev->intr_handle.efds[i] = fd;
+			}
+			dev->intr_handle.vec_en = 1;
 			break;
 		case VFIO_PCI_MSI_IRQ_INDEX:
 			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSI;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 07/10] ethdev: add rx intr enable, disable and ctl functions
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
                       ` (5 preceding siblings ...)
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 06/10] eal/linux: add interrupt vectors handling on VFIO Cunming Liang
@ 2015-05-05  5:39     ` Cunming Liang
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 08/10] ixgbe: enable rx queue interrupts for both PF and VF Cunming Liang
                       ` (3 subsequent siblings)
  10 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

The patch adds two dev_ops functions to enable and disable rx queue interrupts.
In addition, it adds rte_eth_dev_rx_intr_ctl/rx_intr_ctl_q to support per port or per queue rx intr event set.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes
 - remove rx_intr_vec_get
 - add rx_intr_ctl and rx_intr_ctl_q

v6 changes
 - add rx_intr_vec_get to retrieve the vector num of the queue.

v5 changes
 - Rebase the patchset onto the HEAD

v4 changes
 - Export interrupt enable/disable functions for shared libraries
 - Put new functions at the end of eth_dev_ops to avoid breaking ABI

v3 changes
 - Add return value for interrupt enable/disable functions

 lib/librte_ether/rte_ethdev.c          | 132 +++++++++++++++++++++++++++++++++
 lib/librte_ether/rte_ethdev.h          | 104 ++++++++++++++++++++++++++
 lib/librte_ether/rte_ether_version.map |   4 +
 3 files changed, 240 insertions(+)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 024fe8b..cdde14c 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -3281,6 +3281,138 @@ _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 	}
 	rte_spinlock_unlock(&rte_eth_dev_cb_lock);
 }
+
+int
+rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data)
+{
+	uint32_t vec;
+	struct rte_eth_dev *dev;
+	struct rte_intr_handle *intr_handle;
+	uint16_t qid;
+	int rc;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) {
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	intr_handle = &dev->pci_dev->intr_handle;
+	if (!intr_handle->intr_vec) {
+		PMD_DEBUG_TRACE("RX Intr vector unset\n");
+		return -EPERM;
+	}
+
+	for (qid = 0; qid < dev->data->nb_rx_queues; qid++) {
+		if (intr_handle->intr_vec[qid] < 0) {
+			PMD_DEBUG_TRACE("RX Intr vector invalid on %d\n", qid);
+			continue;
+		}
+
+		vec = intr_handle->intr_vec[qid];
+		rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec,
+				     data, rte_eth_dev_socket_id(port_id));
+		if (rc) {
+			PMD_DEBUG_TRACE("p %d q %d rx ctl error"
+					" op %d epfd %d vec %u\n",
+					port_id, qid, op, epfd, vec);
+		}
+	}
+
+	return 0;
+}
+
+int
+rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
+			  int epfd, int op, void *data)
+{
+	uint32_t vec;
+	struct rte_eth_dev *dev;
+	struct rte_intr_handle *intr_handle;
+	int rc;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) { /* defensive only: &array[i] is never NULL */
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	if (queue_id >= dev->data->nb_rx_queues) {
+		PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id);
+		return -EINVAL;
+	}
+
+	intr_handle = &dev->pci_dev->intr_handle;
+	if (!intr_handle->intr_vec || intr_handle->intr_vec[queue_id] < 0) {
+		PMD_DEBUG_TRACE("RX Intr vector unset on %d\n", queue_id);
+		return -EPERM;
+	}
+
+	vec = intr_handle->intr_vec[queue_id]; /* vector mapped to this queue */
+	rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec,
+			     data, rte_eth_dev_socket_id(port_id));
+	if (rc) {
+		PMD_DEBUG_TRACE("p %d q %d rx ctl error"
+				" op %d epfd %d vec %u\n",
+				port_id, queue_id, op, epfd, vec);
+		return rc;
+	}
+
+	return 0;
+}
+
+int
+rte_eth_dev_rx_intr_enable(uint8_t port_id,
+			   uint16_t queue_id)
+{
+	struct rte_eth_dev *dev;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) {
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_enable, -ENOTSUP);
+	return (*dev->dev_ops->rx_queue_intr_enable)(dev, queue_id);
+}
+
+int
+rte_eth_dev_rx_intr_disable(uint8_t port_id,
+			    uint16_t queue_id)
+{
+	struct rte_eth_dev *dev;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) {
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_disable, -ENOTSUP);
+	return (*dev->dev_ops->rx_queue_intr_disable)(dev, queue_id);
+}
+
 #ifdef RTE_NIC_BYPASS
 int rte_eth_dev_bypass_init(uint8_t port_id)
 {
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 4648290..e5efec0 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -829,6 +829,8 @@ struct rte_eth_fdir {
 struct rte_intr_conf {
 	/** enable/disable lsc interrupt. 0 (default) - disable, 1 enable */
 	uint16_t lsc;
+	/** enable/disable rxq interrupt. 0 (default) - disable, 1 enable */
+	uint16_t rxq;
 };
 
 /**
@@ -1034,6 +1036,14 @@ typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
 				    const struct rte_eth_txconf *tx_conf);
 /**< @internal Setup a transmit queue of an Ethernet device. */
 
+typedef int (*eth_rx_enable_intr_t)(struct rte_eth_dev *dev,
+				    uint16_t rx_queue_id);
+/**< @internal Enable interrupt of a receive queue of an Ethernet device. */
+
+typedef int (*eth_rx_disable_intr_t)(struct rte_eth_dev *dev,
+				    uint16_t rx_queue_id);
+/**< @internal Disable interrupt of a receive queue of an Ethernet device. */
+
 typedef void (*eth_queue_release_t)(void *queue);
 /**< @internal Release memory resources allocated by given RX/TX queue. */
 
@@ -1385,6 +1395,10 @@ struct eth_dev_ops {
 	/** Get current RSS hash configuration. */
 	rss_hash_conf_get_t rss_hash_conf_get;
 	eth_filter_ctrl_t              filter_ctrl;          /**< common filter control*/
+
+	/** Enable/disable Rx queue interrupt. */
+	eth_rx_enable_intr_t       rx_queue_intr_enable; /**< Enable Rx queue interrupt. */
+	eth_rx_disable_intr_t      rx_queue_intr_disable; /**< Disable Rx queue interrupt.*/
 };
 
 /**
@@ -2867,6 +2881,96 @@ void _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 				enum rte_eth_event_type event);
 
 /**
+ * When there is no rx packet coming in Rx Queue for a long time, we can
+ * sleep lcore related to RX Queue for power saving, and enable rx interrupt
+ * to be triggered when rx packet arrives.
+ *
+ * The rte_eth_dev_rx_intr_enable() function enables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue from which to retrieve input packets.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ *     that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_intr_enable(uint8_t port_id,
+			       uint16_t queue_id);
+
+/**
+ * When lcore wakes up from rx interrupt indicating packet coming, disable rx
+ * interrupt and returns to polling mode.
+ *
+ * The rte_eth_dev_rx_intr_disable() function disables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue from which to retrieve input packets.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ *     that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_intr_disable(uint8_t port_id,
+				uint16_t queue_id);
+
+/**
+ * RX Interrupt control per port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ *   Using RTE_EPOLL_PER_THREAD allows to use per thread epoll instance.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {RTE_INTR_EVENT_ADD, RTE_INTR_EVENT_DEL}.
+ * @param data
+ *   User raw data.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data);
+
+/**
+ * RX Interrupt control per queue.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue from which to retrieve input packets.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ *   Using RTE_EPOLL_PER_THREAD allows to use per thread epoll instance.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {RTE_INTR_EVENT_ADD, RTE_INTR_EVENT_DEL}.
+ * @param data
+ *   User raw data.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
+			  int epfd, int op, void *data);
+
+/**
  * Turn on the LED on the Ethernet device.
  * This function turns on the LED on the Ethernet device.
  *
diff --git a/lib/librte_ether/rte_ether_version.map b/lib/librte_ether/rte_ether_version.map
index a2d25a6..2799b99 100644
--- a/lib/librte_ether/rte_ether_version.map
+++ b/lib/librte_ether/rte_ether_version.map
@@ -48,6 +48,10 @@ DPDK_2.0 {
 	rte_eth_dev_rss_hash_update;
 	rte_eth_dev_rss_reta_query;
 	rte_eth_dev_rss_reta_update;
+	rte_eth_dev_rx_intr_ctl;
+	rte_eth_dev_rx_intr_ctl_q;
+	rte_eth_dev_rx_intr_disable;
+	rte_eth_dev_rx_intr_enable;
 	rte_eth_dev_rx_queue_start;
 	rte_eth_dev_rx_queue_stop;
 	rte_eth_dev_set_link_down;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 08/10] ixgbe: enable rx queue interrupts for both PF and VF
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
                       ` (6 preceding siblings ...)
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 07/10] ethdev: add rx intr enable, disable and ctl functions Cunming Liang
@ 2015-05-05  5:39     ` Cunming Liang
  2015-05-05 18:36       ` Stephen Hemminger
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 09/10] igb: enable rx queue interrupts for PF Cunming Liang
                       ` (2 subsequent siblings)
  10 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

The patch does below things for ixgbe PF and VF:
- Setup NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Yong Liu <yong.liu@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes
 - add condition check when intr vector is not enabled

v6 changes
 - fill queue-vector mapping table

v5 changes
 - Rebase the patchset onto the HEAD

v3 changes
 - Remove spinlock from PMD

v2 changes
 - Consolidate review comments related to coding style

 lib/librte_pmd_ixgbe/ixgbe_ethdev.c | 425 +++++++++++++++++++++++++++++++++++-
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h |   7 +
 2 files changed, 428 insertions(+), 4 deletions(-)

diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
index 366aa45..ee0e10b 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
@@ -82,6 +82,9 @@
  */
 #define IXGBE_FC_LO    0x40
 
+/* Default minimum inter-interrupt interval for EITR configuration */
+#define IXGBE_MIN_INTER_INTERRUPT_INTERVAL_DEFAULT    0x79E
+
 /* Timer value included in XOFF frames. */
 #define IXGBE_FC_PAUSE 0x680
 
@@ -171,6 +174,7 @@ static int ixgbe_dev_rss_reta_query(struct rte_eth_dev *dev,
 			uint16_t reta_size);
 static void ixgbe_dev_link_status_print(struct rte_eth_dev *dev);
 static int ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_action(struct rte_eth_dev *dev);
 static void ixgbe_dev_interrupt_handler(struct rte_intr_handle *handle,
@@ -183,11 +187,14 @@ static void ixgbe_dcb_init(struct ixgbe_hw *hw,struct ixgbe_dcb_config *dcb_conf
 
 /* For Virtual Function support */
 static int eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev);
+static int ixgbevf_dev_interrupt_get_status(struct rte_eth_dev *dev);
+static int ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev);
 static int  ixgbevf_dev_configure(struct rte_eth_dev *dev);
 static int  ixgbevf_dev_start(struct rte_eth_dev *dev);
 static void ixgbevf_dev_stop(struct rte_eth_dev *dev);
 static void ixgbevf_dev_close(struct rte_eth_dev *dev);
 static void ixgbevf_intr_disable(struct ixgbe_hw *hw);
+static void ixgbevf_intr_enable(struct ixgbe_hw *hw);
 static void ixgbevf_dev_stats_get(struct rte_eth_dev *dev,
 		struct rte_eth_stats *stats);
 static void ixgbevf_dev_stats_reset(struct rte_eth_dev *dev);
@@ -197,6 +204,15 @@ static void ixgbevf_vlan_strip_queue_set(struct rte_eth_dev *dev,
 		uint16_t queue, int on);
 static void ixgbevf_vlan_offload_set(struct rte_eth_dev *dev, int mask);
 static void ixgbevf_set_vfta_all(struct rte_eth_dev *dev, bool on);
+static void ixgbevf_dev_interrupt_handler(struct rte_intr_handle *handle,
+		void *param);
+static int ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
+		uint16_t queue_id);
+static int ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
+		 uint16_t queue_id);
+static void ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+		 uint8_t queue, uint8_t msix_vector);
+static void ixgbevf_configure_msix(struct rte_eth_dev *dev);
 
 /* For Eth VMDQ APIs support */
 static int ixgbe_uc_hash_table_set(struct rte_eth_dev *dev, struct
@@ -214,6 +230,14 @@ static int ixgbe_mirror_rule_set(struct rte_eth_dev *dev,
 static int ixgbe_mirror_rule_reset(struct rte_eth_dev *dev,
 		uint8_t	rule_id);
 
+static int ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static int ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static void ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void ixgbe_configure_msix(struct rte_eth_dev *dev);
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 		uint16_t queue_idx, uint16_t tx_rate);
 static int ixgbe_set_vf_rate_limit(struct rte_eth_dev *dev, uint16_t vf,
@@ -262,7 +286,7 @@ static int ixgbevf_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu);
  */
 #define UPDATE_VF_STAT(reg, last, cur)	                        \
 {                                                               \
-	u32 latest = IXGBE_READ_REG(hw, reg);                   \
+	uint32_t latest = IXGBE_READ_REG(hw, reg);                   \
 	cur += latest - last;                                   \
 	last = latest;                                          \
 }
@@ -343,6 +367,8 @@ static const struct eth_dev_ops ixgbe_eth_dev_ops = {
 	.tx_queue_start	      = ixgbe_dev_tx_queue_start,
 	.tx_queue_stop        = ixgbe_dev_tx_queue_stop,
 	.rx_queue_setup       = ixgbe_dev_rx_queue_setup,
+	.rx_queue_intr_enable = ixgbe_dev_rx_queue_intr_enable,
+	.rx_queue_intr_disable = ixgbe_dev_rx_queue_intr_disable,
 	.rx_queue_release     = ixgbe_dev_rx_queue_release,
 	.rx_queue_count       = ixgbe_dev_rx_queue_count,
 	.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
@@ -402,8 +428,11 @@ static const struct eth_dev_ops ixgbevf_eth_dev_ops = {
 	.vlan_offload_set     = ixgbevf_vlan_offload_set,
 	.rx_queue_setup       = ixgbe_dev_rx_queue_setup,
 	.rx_queue_release     = ixgbe_dev_rx_queue_release,
+	.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
 	.tx_queue_setup       = ixgbe_dev_tx_queue_setup,
 	.tx_queue_release     = ixgbe_dev_tx_queue_release,
+	.rx_queue_intr_enable = ixgbevf_dev_rx_queue_intr_enable,
+	.rx_queue_intr_disable = ixgbevf_dev_rx_queue_intr_disable,
 	.mac_addr_add         = ixgbevf_add_mac_addr,
 	.mac_addr_remove      = ixgbevf_remove_mac_addr,
 };
@@ -899,9 +928,24 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev)
 			eth_dev->data->port_id, pci_dev->id.vendor_id,
 			pci_dev->id.device_id);
 
+	/* set max interrupt vfio request */
+	if (pci_dev->intr_handle.vec_en) {
+		pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
+			IXGBE_MAX_OTHER_INTR;
+		pci_dev->intr_handle.intr_vec =
+			rte_zmalloc("intr_vec",
+				    hw->mac.max_rx_queues * sizeof(int), 0);
+		if (pci_dev->intr_handle.intr_vec == NULL) {
+			PMD_INIT_LOG(ERR, "Failed to allocate %d rx_queues"
+				     "intr_vec\n", hw->mac.max_rx_queues);
+			return -ENOMEM;
+		}
+	}
+
 	rte_intr_callback_register(&(pci_dev->intr_handle),
 		ixgbe_dev_interrupt_handler, (void *)eth_dev);
 
+
 	/* enable uio intr after callback register */
 	rte_intr_enable(&(pci_dev->intr_handle));
 
@@ -1079,6 +1123,25 @@ eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev)
 			return (-EIO);
 	}
 
+	/* set max interrupt vfio request */
+	if (pci_dev->intr_handle.vec_en) {
+		pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
+			IXGBEVF_MAX_OTHER_INTR;
+		pci_dev->intr_handle.intr_vec =
+			rte_zmalloc("intr_vec",
+				    hw->mac.max_rx_queues * sizeof(int), 0);
+		if (pci_dev->intr_handle.intr_vec == NULL) {
+			PMD_INIT_LOG(ERR, "Failed to allocate %d rx_queues"
+				     " intr_vec\n", hw->mac.max_rx_queues);
+			return -ENOMEM;
+		}
+	}
+
+	rte_intr_callback_register(&(pci_dev->intr_handle),
+		ixgbevf_dev_interrupt_handler, (void *)eth_dev);
+
+	rte_intr_enable(&(pci_dev->intr_handle));
+
 	PMD_INIT_LOG(DEBUG, "port %d vendorID=0x%x deviceID=0x%x mac.type=%s",
 		     eth_dev->data->port_id, pci_dev->id.vendor_id,
 		     pci_dev->id.device_id, "ixgbe_mac_82599_vf");
@@ -1489,6 +1552,9 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	ixgbe_pf_host_configure(dev);
 
+	/* configure msix for sleep until rx interrupt */
+	ixgbe_configure_msix(dev);
+
 	/* initialize transmission unit */
 	ixgbe_dev_tx_init(dev);
 
@@ -1564,6 +1630,10 @@ skip_link_setup:
 	if (dev->data->dev_conf.intr_conf.lsc != 0)
 		ixgbe_dev_lsc_interrupt_setup(dev);
 
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		ixgbe_dev_rxq_interrupt_setup(dev);
+
 	/* resume enabled intr since hw reset */
 	ixgbe_enable_intr(dev);
 
@@ -1725,6 +1795,7 @@ ixgbe_dev_close(struct rte_eth_dev *dev)
 {
 	struct ixgbe_hw *hw =
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_pci_device *pci_dev;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -1737,6 +1808,12 @@ ixgbe_dev_close(struct rte_eth_dev *dev)
 
 	/* reprogram the RAR[0] in case user changed it. */
 	ixgbe_set_rar(hw, 0, hw->mac.addr, 0, IXGBE_RAH_AV);
+
+	pci_dev = dev->pci_dev;
+	if (pci_dev->intr_handle.intr_vec) {
+		rte_free(pci_dev->intr_handle.intr_vec);
+		pci_dev->intr_handle.intr_vec = NULL;
+	}
 }
 
 /*
@@ -2252,6 +2329,28 @@ ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev)
 	return 0;
 }
 
+/**
+ * It clears the interrupt causes and enables the interrupt.
+ * It will be called once only during nic initialized.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+static int
+ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	intr->mask |= IXGBE_EICR_RTX_QUEUE;
+
+	return 0;
+}
+
 /*
  * It reads ICR and sets flag (IXGBE_EICR_LSC) for the link_update.
  *
@@ -2278,10 +2377,10 @@ ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev)
 	PMD_DRV_LOG(INFO, "eicr %x", eicr);
 
 	intr->flags = 0;
-	if (eicr & IXGBE_EICR_LSC) {
-		/* set flag for async link update */
+
+	/* set flag for async link update */
+	if (eicr & IXGBE_EICR_LSC)
 		intr->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
-	}
 
 	if (eicr & IXGBE_EICR_MAILBOX)
 		intr->flags |= IXGBE_FLAG_MAILBOX;
@@ -2289,6 +2388,30 @@ ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int
+ixgbevf_dev_interrupt_get_status(struct rte_eth_dev *dev)
+{
+	uint32_t eicr;
+	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	/* clear all cause mask */
+	ixgbevf_intr_disable(hw);
+
+	/* read-on-clear nic registers here */
+	eicr = IXGBE_READ_REG(hw, IXGBE_VTEICR);
+	PMD_DRV_LOG(INFO, "eicr %x", eicr);
+
+	intr->flags = 0;
+
+	/* set flag for async link update */
+	if (eicr & IXGBE_EICR_LSC)
+		intr->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
+
+	return 0;
+}
+
 /**
  * It gets and then prints the link status.
  *
@@ -2384,6 +2507,18 @@ ixgbe_dev_interrupt_action(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int
+ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	PMD_DRV_LOG(DEBUG, "enable intr immediately");
+	ixgbevf_intr_enable(hw);
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
+	return 0;
+}
+
 /**
  * Interrupt handler which shall be registered for alarm callback for delayed
  * handling specific interrupt to wait for the stable nic state. As the
@@ -2445,6 +2580,15 @@ ixgbe_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
 	ixgbe_dev_interrupt_action(dev);
 }
 
+static void
+ixgbevf_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
+							void *param)
+{
+	struct rte_eth_dev *dev = (struct rte_eth_dev *)param;
+	ixgbevf_dev_interrupt_get_status(dev);
+	ixgbevf_dev_interrupt_action(dev);
+}
+
 static int
 ixgbe_dev_led_on(struct rte_eth_dev *dev)
 {
@@ -2943,6 +3087,19 @@ ixgbevf_intr_disable(struct ixgbe_hw *hw)
 	IXGBE_WRITE_FLUSH(hw);
 }
 
+static void
+ixgbevf_intr_enable(struct ixgbe_hw *hw)
+{
+	PMD_INIT_FUNC_TRACE();
+
+	/* VF enable interrupt autoclean */
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIAM, IXGBE_VF_IRQ_ENABLE_MASK);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIAC, IXGBE_VF_IRQ_ENABLE_MASK);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, IXGBE_VF_IRQ_ENABLE_MASK);
+
+	IXGBE_WRITE_FLUSH(hw);
+}
+
 static int
 ixgbevf_dev_configure(struct rte_eth_dev *dev)
 {
@@ -3005,6 +3162,11 @@ ixgbevf_dev_start(struct rte_eth_dev *dev)
 
 	ixgbevf_dev_rxtx_start(dev);
 
+	ixgbevf_configure_msix(dev);
+
+	/* Re-enable interrupt for VF */
+	ixgbevf_intr_enable(hw);
+
 	return 0;
 }
 
@@ -3034,6 +3196,7 @@ static void
 ixgbevf_dev_close(struct rte_eth_dev *dev)
 {
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_pci_device *pci_dev;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -3043,6 +3206,12 @@ ixgbevf_dev_close(struct rte_eth_dev *dev)
 
 	/* reprogram the RAR[0] in case user changed it. */
 	ixgbe_set_rar(hw, 0, hw->mac.addr, 0, IXGBE_RAH_AV);
+
+	pci_dev = dev->pci_dev;
+	if (pci_dev->intr_handle.intr_vec) {
+		rte_free(pci_dev->intr_handle.intr_vec);
+		pci_dev->intr_handle.intr_vec = NULL;
+	}
 }
 
 static void ixgbevf_set_vfta_all(struct rte_eth_dev *dev, bool on)
@@ -3542,6 +3711,254 @@ ixgbe_mirror_rule_reset(struct rte_eth_dev *dev, uint8_t rule_id)
 	return 0;
 }
 
+
+static int
+ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	mask = IXGBE_READ_REG(hw, IXGBE_VTEIMS);
+	mask |= (1 << queue_id);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask);
+
+	return 0;
+}
+
+static int
+ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	mask = IXGBE_READ_REG(hw, IXGBE_VTEIMS);
+	mask &= ~(1 << queue_id);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask);
+
+	return 0;
+}
+
+static int
+ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	if (queue_id < 16) { /* queues 0..15: go through the cached intr mask */
+		ixgbe_disable_intr(hw);
+		intr->mask |= (1 << queue_id);
+		ixgbe_enable_intr(dev);
+	} else if (queue_id < 32) { /* queues 16..31: EIMS_EX(0) */
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(0));
+		mask |= (1U << queue_id); /* OR the bit in; '&=' never set it */
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+	} else if (queue_id < 64) { /* queues 32..63: EIMS_EX(1) */
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(1));
+		mask |= (1U << (queue_id - 32)); /* unsigned: bit 31 is valid */
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+	}
+
+	return 0; /* queue_id >= 64 is silently ignored */
+}
+
+static int
+ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	if (queue_id < 16) { /* queues 0..15: go through the cached intr mask */
+		ixgbe_disable_intr(hw);
+		intr->mask &= ~(1 << queue_id);
+		ixgbe_enable_intr(dev);
+	} else if (queue_id < 32) { /* queues 16..31: EIMS_EX(0) */
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(0));
+		mask &= ~(1U << queue_id); /* unsigned: bit 31 is valid */
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+	} else if (queue_id < 64) { /* queues 32..63: EIMS_EX(1) */
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(1));
+		mask &= ~(1U << (queue_id - 32)); /* unsigned: bit 31 is valid */
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+	}
+
+	return 0; /* queue_id >= 64 is silently ignored */
+}
+
+static void
+ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+			uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp, idx;
+	if (direction == -1) {
+		/* other causes */
+		msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+		tmp = IXGBE_READ_REG(hw, IXGBE_VTIVAR_MISC);
+		tmp &= ~0xFF;
+		tmp |= msix_vector;
+		IXGBE_WRITE_REG(hw, IXGBE_VTIVAR_MISC, tmp);
+	} else {
+		/* rx or tx cause */
+		msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+		idx = ((16 * (queue & 1)) + (8 * direction));
+		tmp = IXGBE_READ_REG(hw, IXGBE_VTIVAR(queue >> 1));
+		tmp &= ~(0xFF << idx);
+		tmp |= (msix_vector << idx);
+		IXGBE_WRITE_REG(hw, IXGBE_VTIVAR(queue >> 1), tmp);
+	}
+}
+
+/**
+ * set the IVAR registers, mapping interrupt causes to vectors
+ * @param hw
+ *  pointer to ixgbe_hw struct
+ * @direction
+ *  0 for Rx, 1 for Tx, -1 for other causes
+ * @queue
+ *  queue to map the corresponding interrupt to
+ * @msix_vector
+ *  the vector to map to the corresponding queue
+ */
+static void
+ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+			   uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp, idx;
+
+	msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+	if (hw->mac.type == ixgbe_mac_82598EB) {
+		if (direction == -1)
+			direction = 0;
+		idx = (((direction * 64) + queue) >> 2) & 0x1F;
+		tmp = IXGBE_READ_REG(hw, IXGBE_IVAR(idx));
+		tmp &= ~(0xFF << (8 * (queue & 0x3)));
+		tmp |= (msix_vector << (8 * (queue & 0x3)));
+		IXGBE_WRITE_REG(hw, IXGBE_IVAR(idx), tmp);
+	} else if ((hw->mac.type == ixgbe_mac_82599EB) ||
+			(hw->mac.type == ixgbe_mac_X540)) {
+		if (direction == -1) {
+			/* other causes */
+			idx = ((queue & 1) * 8);
+			tmp = IXGBE_READ_REG(hw, IXGBE_IVAR_MISC);
+			tmp &= ~(0xFF << idx);
+			tmp |= (msix_vector << idx);
+			IXGBE_WRITE_REG(hw, IXGBE_IVAR_MISC, tmp);
+		} else {
+			/* rx or tx causes */
+			idx = ((16 * (queue & 1)) + (8 * direction));
+			tmp = IXGBE_READ_REG(hw, IXGBE_IVAR(queue >> 1));
+			tmp &= ~(0xFF << idx);
+			tmp |= (msix_vector << idx);
+			IXGBE_WRITE_REG(hw, IXGBE_IVAR(queue >> 1), tmp);
+		}
+	}
+}
+
+static void
+ixgbevf_configure_msix(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t q_idx;
+	uint32_t vector_idx = 0;
+
+	/* Skip MSI-X register setup when no mapping has been done
+	 * between interrupt vectors and event fds (vec_en unset) */
+	if (!intr_handle->vec_en)
+		return;
+
+	/* Map VF RX queues; NOTE(review): bound skips the last queue - confirm */
+	for (q_idx = 0; q_idx < (hw->mac.max_rx_queues - 1); q_idx++) {
+		/* Force all queues to use vector 0,
+		 * as IXGBE_VF_MAXMSIVECTOR = 1 */
+		ixgbevf_set_ivar_map(hw, 0, q_idx, vector_idx);
+		intr_handle->intr_vec[q_idx] = vector_idx;
+	}
+
+	/* Configure VF other-cause ivar (direction -1), also on vector 0 */
+	ixgbevf_set_ivar_map(hw, -1, 1, vector_idx);
+}
+
+/**
+ * Sets up the hardware to properly generate MSI-X interrupts
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ */
+static void
+ixgbe_configure_msix(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	int queue_id, vec = 0;
+	uint32_t mask;
+	uint32_t gpie;
+
+	/* won't configure msix register if no mapping is done
+	 * between intr vector and event fd */
+	if (!intr_handle->vec_en)
+		return;
+
+	/* setup GPIE for MSI-x mode */
+	gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
+	gpie |= IXGBE_GPIE_MSIX_MODE | IXGBE_GPIE_PBA_SUPPORT |
+		   IXGBE_GPIE_OCD | IXGBE_GPIE_EIAME;
+	/*
+	* auto clearing and auto setting corresponding bits in EIMS
+	* when MSI-X interrupt is triggered
+	*/
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE);
+	else {
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(0), 0xFFFFFFFF);
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(1), 0xFFFFFFFF);
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie);
+
+	/*
+	* Populate the IVAR table and set the ITR values to the
+	* corresponding register.
+	*/
+	for (queue_id = 0; queue_id < dev->data->nb_rx_queues; queue_id++) {
+		/* by default, 1:1 mapping */
+		ixgbe_set_ivar_map(hw, 0, queue_id, vec);
+		intr_handle->intr_vec[queue_id] = vec;
+		if (vec < RTE_MAX_RXTX_INTR_VEC_ID)
+			vec++;
+	}
+
+	switch (hw->mac.type) {
+	case ixgbe_mac_82598EB:
+		ixgbe_set_ivar_map(hw, -1, IXGBE_IVAR_OTHER_CAUSES_INDEX,
+				   RTE_MAX_RXTX_INTR_VEC_ID);
+		break;
+	case ixgbe_mac_82599EB:
+	case ixgbe_mac_X540:
+		ixgbe_set_ivar_map(hw, -1, 1, RTE_MAX_RXTX_INTR_VEC_ID);
+		break;
+	default:
+		break;
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_EITR(queue_id),
+			 IXGBE_MIN_INTER_INTERRUPT_INTERVAL_DEFAULT & 0xFFF);
+
+	/* set up to autoclear timer, and the vectors */
+	mask = IXGBE_EIMS_ENABLE_MASK;
+	mask &= ~(IXGBE_EIMS_OTHER |
+		  IXGBE_EIMS_MAILBOX |
+		  IXGBE_EIMS_LSC);
+
+	IXGBE_WRITE_REG(hw, IXGBE_EIAC, mask);
+}
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 	uint16_t queue_idx, uint16_t tx_rate)
 {
diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.h b/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
index e45e727..836101c 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
@@ -117,6 +117,12 @@
 	ETH_RSS_IPV6_TCP_EX | \
 	ETH_RSS_IPV6_UDP_EX)
 
+#define IXGBE_VF_IRQ_ENABLE_MASK        3          /* vf irq enable mask */
+#define IXGBE_VF_MAXMSIVECTOR           1
+/* maximum other interrupts besides rx&tx*/
+#define IXGBE_MAX_OTHER_INTR            1
+#define IXGBEVF_MAX_OTHER_INTR          1
+
 /*
  * Information about the fdir mode.
  */
@@ -325,6 +331,7 @@ uint32_t ixgbe_dev_rx_queue_count(struct rte_eth_dev *dev,
 		uint16_t rx_queue_id);
 
 int ixgbe_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);
+int ixgbevf_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);
 
 int ixgbe_dev_rx_init(struct rte_eth_dev *dev);
 
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 09/10] igb: enable rx queue interrupts for PF
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
                       ` (7 preceding siblings ...)
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 08/10] ixgbe: enable rx queue interrupts for both PF and VF Cunming Liang
@ 2015-05-05  5:39     ` Cunming Liang
  2015-05-05 23:16       ` Stephen Hemminger
  2015-05-28 21:25       ` Stephen Hemminger
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 10/10] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Cunming Liang
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
  10 siblings, 2 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

The patch does below for igb PF:
- Setup NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes
 - add condition check when intr vector is not enabled

v6 changes
 - fill queue-vector mapping table

v5 changes
 - Rebase the patchset onto the HEAD

v3 changes
 - Remove unnecessary variables in e1000_mac_info
 - Remove spinlock from PMD

v2 changes
 - Consolidate review comments related to coding style

 lib/librte_pmd_e1000/e1000_ethdev.h |   3 +
 lib/librte_pmd_e1000/igb_ethdev.c   | 256 ++++++++++++++++++++++++++++++++----
 2 files changed, 234 insertions(+), 25 deletions(-)

diff --git a/lib/librte_pmd_e1000/e1000_ethdev.h b/lib/librte_pmd_e1000/e1000_ethdev.h
index c451faa..13c4cad 100644
--- a/lib/librte_pmd_e1000/e1000_ethdev.h
+++ b/lib/librte_pmd_e1000/e1000_ethdev.h
@@ -108,6 +108,9 @@
 	ETH_RSS_IPV6_TCP_EX | \
 	ETH_RSS_IPV6_UDP_EX)
 
+/* maximum number of other interrupts besides Rx & Tx interrupts */
+#define E1000_MAX_OTHER_INTR		1
+
 /* structure for interrupt relative data */
 struct e1000_interrupt {
 	uint32_t flags;
diff --git a/lib/librte_pmd_e1000/igb_ethdev.c b/lib/librte_pmd_e1000/igb_ethdev.c
index 4415155..d7ec696 100644
--- a/lib/librte_pmd_e1000/igb_ethdev.c
+++ b/lib/librte_pmd_e1000/igb_ethdev.c
@@ -96,6 +96,7 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
 static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
 				struct rte_eth_fc_conf *fc_conf);
 static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
 static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
@@ -194,6 +195,16 @@ static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
 		     enum rte_filter_op filter_op,
 		     void *arg);
 
+static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void eth_igb_configure_msix_intr(struct rte_eth_dev *dev);
+static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
+				uint8_t index, uint8_t offset);
+
 /*
  * Define VF Stats MACRO for Non "cleared on read" register
  */
@@ -253,6 +264,8 @@ static const struct eth_dev_ops eth_igb_ops = {
 	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
 	.vlan_offload_set     = eth_igb_vlan_offload_set,
 	.rx_queue_setup       = eth_igb_rx_queue_setup,
+	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
+	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
 	.rx_queue_release     = eth_igb_rx_queue_release,
 	.rx_queue_count       = eth_igb_rx_queue_count,
 	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
@@ -463,6 +476,7 @@ eth_igb_dev_init(struct rte_eth_dev *eth_dev)
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(eth_dev->data->dev_private);
 	uint32_t ctrl_ext;
+	struct rte_eth_dev_info dev_info;
 
 	pci_dev = eth_dev->pci_dev;
 	eth_dev->dev_ops = &eth_igb_ops;
@@ -584,6 +598,23 @@ eth_igb_dev_init(struct rte_eth_dev *eth_dev)
 		     eth_dev->data->port_id, pci_dev->id.vendor_id,
 		     pci_dev->id.device_id);
 
+	/* set max interrupt vfio request */
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(eth_dev, &dev_info);
+
+	if (pci_dev->intr_handle.vec_en) {
+		pci_dev->intr_handle.max_intr = dev_info.max_rx_queues +
+			E1000_MAX_OTHER_INTR;
+		pci_dev->intr_handle.intr_vec =
+			rte_zmalloc("intr_vec",
+				    dev_info.max_rx_queues * sizeof(int), 0);
+		if (pci_dev->intr_handle.intr_vec == NULL) {
+			PMD_INIT_LOG(ERR, "Failed to allocate %d rx_queues"
+				     " intr_vec\n", dev_info.max_rx_queues);
+			return -ENOMEM;
+		}
+	}
+
 	rte_intr_callback_register(&(pci_dev->intr_handle),
 		eth_igb_interrupt_handler, (void *)eth_dev);
 
@@ -752,7 +783,7 @@ eth_igb_start(struct rte_eth_dev *dev)
 {
 	struct e1000_hw *hw =
 		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	int ret, i, mask;
+	int ret, mask;
 	uint32_t ctrl_ext;
 
 	PMD_INIT_FUNC_TRACE();
@@ -792,6 +823,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	igb_pf_host_configure(dev);
 
+	/* configure msix for rx interrupt */
+	eth_igb_configure_msix_intr(dev);
+
 	/* Configure for OS presence */
 	igb_init_manageability(hw);
 
@@ -819,33 +853,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 		igb_vmdq_vlan_hw_filter_enable(dev);
 	}
 
-	/*
-	 * Configure the Interrupt Moderation register (EITR) with the maximum
-	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
-	 * spurious [DMA] memory updates of RX and TX ring descriptors.
-	 *
-	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
-	 * spurious memory updates per second should be expected.
-	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
-	 *
-	 * Because interrupts are not used at all, the MSI-X is not activated
-	 * and interrupt moderation is controlled by EITR[0].
-	 *
-	 * Note that having [almost] disabled memory updates of RX and TX ring
-	 * descriptors through the Interrupt Moderation mechanism, memory
-	 * updates of ring descriptors are now moderated by the configurable
-	 * value of Write-Back Threshold registers.
-	 */
 	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580) ||
 		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210) ||
 		(hw->mac.type == e1000_i211)) {
-		uint32_t ivar;
-
-		/* Enable all RX & TX queues in the IVAR registers */
-		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) | E1000_IVAR_VALID);
-		for (i = 0; i < 8; i++)
-			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
-
 		/* Configure EITR with the maximum possible value (0xFFFF) */
 		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
 	}
@@ -899,6 +909,10 @@ eth_igb_start(struct rte_eth_dev *dev)
 	if (dev->data->dev_conf.intr_conf.lsc != 0)
 		ret = eth_igb_lsc_interrupt_setup(dev);
 
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		eth_igb_rxq_interrupt_setup(dev);
+
 	/* resume enabled intr since hw reset */
 	igb_intr_enable(dev);
 
@@ -987,6 +1001,7 @@ eth_igb_close(struct rte_eth_dev *dev)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 	struct rte_eth_link link;
+	struct rte_pci_device *pci_dev;
 
 	eth_igb_stop(dev);
 	e1000_phy_hw_reset(hw);
@@ -1004,6 +1019,12 @@ eth_igb_close(struct rte_eth_dev *dev)
 
 	igb_dev_clear_queues(dev);
 
+	pci_dev = dev->pci_dev;
+	if (pci_dev->intr_handle.intr_vec) {
+		rte_free(pci_dev->intr_handle.intr_vec);
+		pci_dev->intr_handle.intr_vec = NULL;
+	}
+
 	memset(&link, 0, sizeof(link));
 	rte_igb_dev_atomic_write_link_status(dev, &link);
 }
@@ -1828,6 +1849,34 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
 }
 
 /*
+ * Enables the rx queue interrupts by setting the low max_rx_queues bits
+ * of the EIMS register on top of whatever is already set there.
+ * Invoked from eth_igb_start() when intr_conf.rxq is non-zero.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - Always zero; this function has no failure path.
+ */
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	struct rte_eth_dev_info info;
+	struct e1000_hw *hw;
+	uint32_t rxq_bits;
+	uint32_t eims;
+
+	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	memset(&info, 0, sizeof(info));
+	eth_igb_infos_get(dev, &info);
+
+	/* one bit per rx queue the device reports, starting at bit 0 */
+	rxq_bits = 0xFFFFFFFF >> (32 - info.max_rx_queues);
+
+	/* read-modify-write so bits that are already set stay set */
+	eims = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, eims | rxq_bits);
+
+	return 0;
+}
+
+/*
  * It reads ICR and gets interrupt causes, check it and set a bit flag
  * to update link status.
  *
@@ -3652,5 +3701,162 @@ static struct rte_driver pmd_igbvf_drv = {
 	.init = rte_igbvf_pmd_init,
 };
 
+/*
+ * Disables the rx interrupt of one queue by writing its bit to EIMC.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ * @param queue_id
+ *  Rx queue index; used directly as the bit position in the register.
+ *  NOTE(review): assumed to be below 32 - confirm callers bound it.
+ *
+ * @return
+ *  - Always zero.
+ */
+static int
+eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	/* unsigned literal: shifting a signed 1 into bit 31 is undefined */
+	uint32_t mask = 1U << queue_id;
+
+	E1000_WRITE_REG(hw, E1000_EIMC, mask);
+	E1000_WRITE_FLUSH(hw);
+
+	return 0;
+}
+
+/*
+ * Enables the rx interrupt of one queue by setting its bit in EIMS.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ * @param queue_id
+ *  Rx queue index; used directly as the bit position in the register.
+ *  NOTE(review): assumed to be below 32 - confirm callers bound it.
+ *
+ * @return
+ *  - Always zero.
+ */
+static int
+eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	/* unsigned literal: shifting a signed 1 into bit 31 is undefined */
+	uint32_t mask = 1U << queue_id;
+	uint32_t regval;
+
+	/* read-modify-write so bits that are already set stay set */
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
+	E1000_WRITE_FLUSH(hw);
+
+	return 0;
+}
+
+/*
+ * Programs one 8-bit field of an IVAR register with an MSI-X vector
+ * number and the valid flag, leaving the other fields untouched.
+ *
+ * @param hw
+ *  Pointer to struct e1000_hw.
+ * @param msix_vector
+ *  Vector number to store in the field.
+ * @param index
+ *  Index of the register within the E1000_IVAR0 register array.
+ * @param offset
+ *  Bit offset of the 8-bit field inside that register.
+ */
+static void
+eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
+			uint8_t index, uint8_t offset)
+{
+	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
+
+	/* clear bits */
+	val &= ~((uint32_t)0xFF << offset);
+
+	/*
+	 * write vector and valid bit; widen to uint32_t before shifting so
+	 * that a field at offset 24 cannot shift into the sign bit of an int
+	 */
+	val |= (uint32_t)(msix_vector | E1000_IVAR_VALID) << offset;
+
+	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
+}
+
+/*
+ * Routes the interrupt cause of one queue to an MSI-X vector.
+ *
+ * @param hw
+ *  Pointer to struct e1000_hw.
+ * @param direction
+ *  0 selects the rx cause of the queue, 1 the tx cause. Any other value
+ *  makes the 82575 path write an empty bitmap and the IVAR paths do nothing.
+ * @param queue
+ *  Queue index.
+ * @param msix_vector
+ *  Vector the cause is mapped to.
+ */
+static void
+eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				 uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t bitmap = 0;
+	int is_rxtx = (direction == 0) || (direction == 1);
+
+	switch (hw->mac.type) {
+	case e1000_82575:
+		/* cause bit goes into the MSIXBM register of the vector */
+		if (direction == 0)
+			bitmap = E1000_EICR_RX_QUEUE0 << queue;
+		else if (direction == 1)
+			bitmap = E1000_EICR_TX_QUEUE0 << queue;
+		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), bitmap);
+		break;
+	case e1000_82576:
+		/* IVAR index from the low 3 queue bits, offset from bit 3 */
+		if (is_rxtx)
+			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
+					((queue & 0x8) << 1) + 8 * direction);
+		break;
+	case e1000_82580:
+	case e1000_i350:
+	case e1000_i354:
+	case e1000_i210:
+	case e1000_i211:
+		/* two queues per IVAR register, offset from the low bit */
+		if (is_rxtx)
+			eth_igb_write_ivar(hw, msix_vector,
+					queue >> 1,
+					((queue & 0x1) << 4) + 8 * direction);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Sets up the hardware to generate MSI-X interrupts properly.
+ *
+ * Returns immediately when intr_handle->vec_en is not set. Otherwise it
+ * programs the MSI-X control register for the detected MAC type (CTRL_EXT
+ * on 82575, GPIE on the other listed types), adds the rx queue bits to
+ * EIAC and EIAM, and maps every configured rx queue to a vector, recording
+ * the mapping in intr_handle->intr_vec[]. Vectors are handed out 1:1 from
+ * zero and stop increasing once RTE_MAX_RXTX_INTR_VEC_ID is reached, so
+ * any further queues reuse that last vector number.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ */
+static void
+eth_igb_configure_msix_intr(struct rte_eth_dev *dev)
+{
+	int queue_id;
+	uint32_t tmpval, regval, intr_mask;
+	uint32_t max_rx_queues;
+	struct rte_eth_dev_info dev_info;
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	int vec = 0;
+
+	/* won't configure msix register if no mapping is done
+	 * between intr vector and event fd */
+	if (!intr_handle->vec_en)
+		return;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(dev, &dev_info);
+	max_rx_queues = dev_info.max_rx_queues;
+
+	/* set interrupt vector for other causes */
+	if (hw->mac.type == e1000_82575) {
+		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
+		/* enable MSI-X PBA support */
+		tmpval |= E1000_CTRL_EXT_PBA_CLR;
+
+		/* Auto-Mask interrupts upon ICR read */
+		tmpval |= E1000_CTRL_EXT_EIAME;
+		tmpval |= E1000_CTRL_EXT_IRCA;
+
+		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
+
+		/* enable msix_other interrupt
+		 * NOTE(review): the last register pair below reads EIAM but
+		 * writes EIMS - confirm whether EIMS was the intended source. */
+		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0, E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAM);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | E1000_EIMS_OTHER);
+	} else if ((hw->mac.type == e1000_82576) ||
+			(hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		/* turn on MSI-X capability first */
+		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
+					E1000_GPIE_PBA | E1000_GPIE_EIAME |
+					E1000_GPIE_NSICR);
+
+		/* enable msix_other interrupt: it takes the bit right above
+		 * the rx queue bits (1 << max_rx_queues), and the same number
+		 * is written as the vector into IVAR_MISC */
+		intr_mask = 1 << max_rx_queues;
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+		regval = E1000_READ_REG(hw, E1000_EIMS);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
+		tmpval = (max_rx_queues | E1000_IVAR_VALID) << 8;
+
+		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
+	}
+
+	/*
+	* use EIAM and EIAC to auto-mask and auto-clear when MSI-X interrupt
+	* is asserted, this saves a register write for every interrupt
+	*/
+	intr_mask = 0xFFFFFFFF >> (32 - max_rx_queues);
+	regval = E1000_READ_REG(hw, E1000_EIAC);
+	E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+	regval = E1000_READ_REG(hw, E1000_EIAM);
+	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
+
+	for (queue_id = 0; queue_id < dev->data->nb_rx_queues; queue_id++) {
+		eth_igb_assign_msix_vector(hw, 0, queue_id, vec);
+		intr_handle->intr_vec[queue_id] = vec;
+		if (vec < RTE_MAX_RXTX_INTR_VEC_ID)
+			vec++;
+	}
+
+	E1000_WRITE_FLUSH(hw);
+}
+
+
 PMD_REGISTER_DRIVER(pmd_igb_drv);
 PMD_REGISTER_DRIVER(pmd_igbvf_drv);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 10/10] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
                       ` (8 preceding siblings ...)
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 09/10] igb: enable rx queue interrupts for PF Cunming Liang
@ 2015-05-05  5:39     ` Cunming Liang
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
  10 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:39 UTC (permalink / raw)
  To: dev; +Cc: shemming

Demonstrate how to handle per rx queue interrupt in a NAPI-like
implementation in userspace. The DPDK polling thread mainly works in
polling mode and switches to interrupt mode only if no packet has
been received in recent polls.
Userspace interrupt notification generally takes a lot more cycles
than in the kernel, so a one-shot interrupt is used here to guarantee
minimum overhead, and the DPDK polling thread returns to polling mode
immediately once it receives an interrupt notification for an incoming packet.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes
 - using new APIs
 - demo multiple port/queue pair wait on the same epoll instance

v6 changes
 - Split event fd add and wait

v5 changes
 - Change invoked function name and parameter to accommodate EAL change

v3 changes
 - Add spinlock to ensure thread safe when accessing interrupt mask
   register

v2 changes
 - Remove unused function which is for debug purpose

 examples/l3fwd-power/main.c | 206 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 164 insertions(+), 42 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index bb0b66f..08b36e0 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -74,12 +74,14 @@
 #include <rte_string_fns.h>
 #include <rte_timer.h>
 #include <rte_power.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
 
 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1
 
 #define MAX_PKT_BURST 32
 
-#define MIN_ZERO_POLL_COUNT 5
+#define MIN_ZERO_POLL_COUNT 10
 
 /* around 100ms at 2 Ghz */
 #define TIMER_RESOLUTION_CYCLES           200000000ULL
@@ -155,6 +157,9 @@ static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
 /* ethernet addresses of ports */
 static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
 
+/* per-port spinlocks taken around rx interrupt enable calls */
+static rte_spinlock_t locks[RTE_MAX_ETHPORTS];
+
 /* mask of enabled ports */
 static uint32_t enabled_port_mask = 0;
 /* Ports set in promiscuous mode off by default. */
@@ -187,6 +192,9 @@ struct lcore_rx_queue {
 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
 #define MAX_RX_QUEUE_PER_PORT 128
 
+#define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16
+
+
 #define MAX_LCORE_PARAMS 1024
 struct lcore_params {
 	uint8_t port_id;
@@ -213,7 +221,7 @@ static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) /
 
 static struct rte_eth_conf port_conf = {
 	.rxmode = {
-		.mq_mode	= ETH_MQ_RX_RSS,
+		.mq_mode = ETH_MQ_RX_RSS,
 		.max_rx_pkt_len = ETHER_MAX_LEN,
 		.split_hdr_size = 0,
 		.header_split   = 0, /**< Header Split disabled */
@@ -225,11 +233,14 @@ static struct rte_eth_conf port_conf = {
 	.rx_adv_conf = {
 		.rss_conf = {
 			.rss_key = NULL,
-			.rss_hf = ETH_RSS_IP,
+			.rss_hf = ETH_RSS_UDP,
 		},
 	},
 	.txmode = {
-		.mq_mode = ETH_DCB_NONE,
+		.mq_mode = ETH_MQ_TX_NONE,
+	},
+	.intr_conf = {
+		.rxq = 1, /**< rxq interrupt feature enabled */
 	},
 };
 
@@ -401,19 +412,22 @@ power_timer_cb(__attribute__((unused)) struct rte_timer *tim,
 	/* accumulate total execution time in us when callback is invoked */
 	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
 					(float)SCALING_PERIOD;
-
 	/**
 	 * check whether need to scale down frequency a step if it sleep a lot.
 	 */
-	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD)
-		rte_power_freq_down(lcore_id);
+	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 	else if ( (unsigned)(stats[lcore_id].nb_rx_processed /
-		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST)
+		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
 		/**
 		 * scale down a step if average packet per iteration less
 		 * than expectation.
 		 */
-		rte_power_freq_down(lcore_id);
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 
 	/**
 	 * initialize another timer according to current frequency to ensure
@@ -706,22 +720,20 @@ l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid,
 
 }
 
-#define SLEEP_GEAR1_THRESHOLD            100
-#define SLEEP_GEAR2_THRESHOLD            1000
+#define MINIMUM_SLEEP_TIME         1
+#define SUSPEND_THRESHOLD          300
 
 static inline uint32_t
 power_idle_heuristic(uint32_t zero_rx_packet_count)
 {
-	/* If zero count is less than 100, use it as the sleep time in us */
-	if (zero_rx_packet_count < SLEEP_GEAR1_THRESHOLD)
-		return zero_rx_packet_count;
-	/* If zero count is less than 1000, sleep time should be 100 us */
-	else if ((zero_rx_packet_count >= SLEEP_GEAR1_THRESHOLD) &&
-			(zero_rx_packet_count < SLEEP_GEAR2_THRESHOLD))
-		return SLEEP_GEAR1_THRESHOLD;
-	/* If zero count is greater than 1000, sleep time should be 1000 us */
-	else if (zero_rx_packet_count >= SLEEP_GEAR2_THRESHOLD)
-		return SLEEP_GEAR2_THRESHOLD;
+	/* If zero count is less than SUSPEND_THRESHOLD, sleep 1us */
+	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
+		return MINIMUM_SLEEP_TIME;
+	/* Otherwise return SUSPEND_THRESHOLD, a value chosen with the
+		latency of switching from C3/C6 to C0 in mind
+	*/
+	else
+		return SUSPEND_THRESHOLD;
 
 	return 0;
 }
@@ -761,6 +773,84 @@ power_freq_scaleup_heuristic(unsigned lcore_id,
 	return FREQ_CURRENT;
 }
 
+/**
+ * force polling thread sleep until one-shot rx interrupt triggers
+ * @param num
+ *  Maximum number of events fetched by the wait; the caller passes the
+ *  number of rx queues polled by the lcore.
+ * @return
+ *  0 always; the result of rte_epoll_wait() only bounds the log loop
+ */
+static int
+sleep_until_rx_interrupt(int num)
+{
+	struct rte_epoll_event event[num];
+	int nb_events, idx;
+
+	RTE_LOG(INFO, L3FWD_POWER,
+		"lcore %u sleeps until interrupt triggers\n",
+		rte_lcore_id());
+
+	/* timeout -1: presumably waits indefinitely, as with epoll_wait(2) */
+	nb_events = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, num, -1);
+	for (idx = 0; idx < nb_events; idx++) {
+		/* event data holds the port id above the low CHAR_BIT bits
+		 * and the queue id in those low bits */
+		uintptr_t cookie = (uintptr_t)event[idx].data;
+		uint8_t port_id = cookie >> CHAR_BIT;
+		uint8_t queue_id = cookie & RTE_LEN2MASK(CHAR_BIT, uint8_t);
+
+		RTE_LOG(INFO, L3FWD_POWER,
+			"lcore %u is waked up from rx interrupt on"
+			" port %d queue %d\n",
+			rte_lcore_id(), port_id, queue_id);
+	}
+
+	return 0;
+}
+
+/*
+ * Enables the rx interrupt of every queue in qconf->rx_queue_list. Each
+ * rte_eth_dev_rx_intr_enable() call is made while holding the spinlock
+ * of the queue's port.
+ *
+ * @param qconf
+ *  Per-lcore configuration listing the rx queues to arm.
+ * @return
+ *  0 always; the results of rte_eth_dev_rx_intr_enable() are not checked.
+ */
+static int turn_on_intr(struct lcore_conf *qconf)
+{
+	int i;
+	struct lcore_rx_queue *rx_queue;
+	uint8_t port_id, queue_id;
+
+	for (i = 0; i < qconf->n_rx_queue; ++i) {
+		rx_queue = &(qconf->rx_queue_list[i]);
+		port_id = rx_queue->port_id;
+		queue_id = rx_queue->queue_id;
+
+		rte_spinlock_lock(&(locks[port_id]));
+		rte_eth_dev_rx_intr_enable(port_id, queue_id);
+		rte_spinlock_unlock(&(locks[port_id]));
+	}
+
+	/* the function is declared int, so it must not fall off the end */
+	return 0;
+}
+
+/*
+ * Registers every rx queue in qconf->rx_queue_list for interrupt events
+ * through rte_eth_dev_rx_intr_ctl_q(), using RTE_EPOLL_PER_THREAD and
+ * RTE_INTR_EVENT_ADD. The event data packs the port id above the low
+ * CHAR_BIT bits and the queue id in those low bits.
+ *
+ * @param qconf
+ *  Per-lcore configuration listing the rx queues to register.
+ * @return
+ *  0 when all queues were registered, otherwise the first non-zero value
+ *  returned by rte_eth_dev_rx_intr_ctl_q(); queues registered before the
+ *  failure are left registered.
+ */
+static int event_register(struct lcore_conf *qconf)
+{
+	int idx;
+
+	for (idx = 0; idx < qconf->n_rx_queue; ++idx) {
+		struct lcore_rx_queue *rxq = &(qconf->rx_queue_list[idx]);
+		uint8_t port = rxq->port_id;
+		uint8_t queue = rxq->queue_id;
+		uint32_t cookie = port << CHAR_BIT | queue;
+		int rc;
+
+		rc = rte_eth_dev_rx_intr_ctl_q(port, queue,
+						RTE_EPOLL_PER_THREAD,
+						RTE_INTR_EVENT_ADD,
+						(void *)((uintptr_t)cookie));
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
 /* main processing loop */
 static int
 main_loop(__attribute__((unused)) void *dummy)
@@ -774,9 +864,9 @@ main_loop(__attribute__((unused)) void *dummy)
 	struct lcore_conf *qconf;
 	struct lcore_rx_queue *rx_queue;
 	enum freq_scale_hint_t lcore_scaleup_hint;
-
 	uint32_t lcore_rx_idle_count = 0;
 	uint32_t lcore_idle_hint = 0;
+	int intr_en = 0;
 
 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
 
@@ -793,13 +883,18 @@ main_loop(__attribute__((unused)) void *dummy)
 	RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n", lcore_id);
 
 	for (i = 0; i < qconf->n_rx_queue; i++) {
-
 		portid = qconf->rx_queue_list[i].port_id;
 		queueid = qconf->rx_queue_list[i].queue_id;
 		RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%hhu "
 			"rxqueueid=%hhu\n", lcore_id, portid, queueid);
 	}
 
+	/* add into event wait list */
+	if (event_register(qconf) == 0)
+		intr_en = 1;
+	else
+		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n");
+
 	while (1) {
 		stats[lcore_id].nb_iteration_looped++;
 
@@ -834,6 +929,7 @@ main_loop(__attribute__((unused)) void *dummy)
 			prev_tsc_power = cur_tsc_power;
 		}
 
+start_rx:
 		/*
 		 * Read packet from RX queues
 		 */
@@ -847,6 +943,7 @@ main_loop(__attribute__((unused)) void *dummy)
 
 			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
 								MAX_PKT_BURST);
+
 			stats[lcore_id].nb_rx_processed += nb_rx;
 			if (unlikely(nb_rx == 0)) {
 				/**
@@ -909,10 +1006,13 @@ main_loop(__attribute__((unused)) void *dummy)
 						rx_queue->freq_up_hint;
 			}
 
-			if (lcore_scaleup_hint == FREQ_HIGHEST)
-				rte_power_freq_max(lcore_id);
-			else if (lcore_scaleup_hint == FREQ_HIGHER)
-				rte_power_freq_up(lcore_id);
+			if (lcore_scaleup_hint == FREQ_HIGHEST) {
+				if (rte_power_freq_max)
+					rte_power_freq_max(lcore_id);
+			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
+				if (rte_power_freq_up)
+					rte_power_freq_up(lcore_id);
+			}
 		} else {
 			/**
 			 * All Rx queues empty in recent consecutive polls,
@@ -927,16 +1027,23 @@ main_loop(__attribute__((unused)) void *dummy)
 					lcore_idle_hint = rx_queue->idle_hint;
 			}
 
-			if ( lcore_idle_hint < SLEEP_GEAR1_THRESHOLD)
+			if (lcore_idle_hint < SUSPEND_THRESHOLD)
 				/**
-				 * execute "pause" instruction to avoid context
-				 * switch for short sleep.
- 				 */
+				* execute "pause" instruction to avoid context
+				* switch, which generally takes hundreds of
+				* microseconds, for short sleep.
+				*/
 				rte_delay_us(lcore_idle_hint);
-			else
-				/* long sleep force runing thread to suspend */
-				usleep(lcore_idle_hint);
-
+			else {
+				/* suspend until rx interrupt triggers */
+				if (intr_en) {
+					turn_on_intr(qconf);
+					sleep_until_rx_interrupt(
+						qconf->n_rx_queue);
+				}
+				/* start receiving packets immediately */
+				goto start_rx;
+			}
 			stats[lcore_id].sleep_time += lcore_idle_hint;
 		}
 	}
@@ -1269,7 +1376,7 @@ setup_hash(int socketid)
 	char s[64];
 
 	/* create ipv4 hash */
-	snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
 	ipv4_l3fwd_hash_params.name = s;
 	ipv4_l3fwd_hash_params.socket_id = socketid;
 	ipv4_l3fwd_lookup_struct[socketid] =
@@ -1279,7 +1386,7 @@ setup_hash(int socketid)
 				"socket %d\n", socketid);
 
 	/* create ipv6 hash */
-	snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
 	ipv6_l3fwd_hash_params.name = s;
 	ipv6_l3fwd_hash_params.socket_id = socketid;
 	ipv6_l3fwd_lookup_struct[socketid] =
@@ -1472,6 +1579,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint64_t hz;
 	uint32_t n_tx_queue, nb_lcores;
+	uint32_t dev_rxq_num, dev_txq_num;
 	uint8_t portid, nb_rx_queue, queue, socketid;
 
 	/* catch SIGINT and restore cpufreq governor to ondemand */
@@ -1521,10 +1629,19 @@ main(int argc, char **argv)
 		printf("Initializing port %d ... ", portid );
 		fflush(stdout);
 
+		rte_eth_dev_info_get(portid, &dev_info);
+		dev_rxq_num = dev_info.max_rx_queues;
+		dev_txq_num = dev_info.max_tx_queues;
+
 		nb_rx_queue = get_port_n_rx_queues(portid);
+		if (nb_rx_queue > dev_rxq_num)
+			rte_exit(EXIT_FAILURE,
+				"Cannot configure not existed rxq: "
+				"port=%d\n", portid);
+
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
+		if (n_tx_queue > dev_txq_num)
+			n_tx_queue = dev_txq_num;
 		printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
 			nb_rx_queue, (unsigned)n_tx_queue );
 		ret = rte_eth_dev_configure(portid, nb_rx_queue,
@@ -1548,6 +1665,9 @@ main(int argc, char **argv)
 			if (rte_lcore_is_enabled(lcore_id) == 0)
 				continue;
 
+			if (queueid >= dev_txq_num)
+				continue;
+
 			if (numa_on)
 				socketid = \
 				(uint8_t)rte_lcore_to_socket_id(lcore_id);
@@ -1582,8 +1702,9 @@ main(int argc, char **argv)
 		/* init power management library */
 		ret = rte_power_init(lcore_id);
 		if (ret)
-			rte_exit(EXIT_FAILURE, "Power management library "
-				"initialization failed on core%u\n", lcore_id);
+			rte_log(RTE_LOG_ERR, RTE_LOGTYPE_POWER,
+				"Power management library initialization "
+				"failed on core%u", lcore_id);
 
 		/* init timer structures for each enabled lcore */
 		rte_timer_init(&power_timers[lcore_id]);
@@ -1631,7 +1752,6 @@ main(int argc, char **argv)
 		if (ret < 0)
 			rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, "
 						"port=%d\n", ret, portid);
-
 		/*
 		 * If enabled, put device in promiscuous mode.
 		 * This allows IO forwarding mode to forward packets
@@ -1640,6 +1760,8 @@ main(int argc, char **argv)
 		 */
 		if (promiscuous_on)
 			rte_eth_promiscuous_enable(portid);
+		/* initialize spinlock for each port */
+		rte_spinlock_init(&(locks[portid]));
 	}
 
 	check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v7 00/10] Interrupt mode PMD
  2015-02-27  4:56 ` [dpdk-dev] [PATCH v6 0/8] Interrupt mode PMD Cunming Liang
                     ` (10 preceding siblings ...)
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
@ 2015-05-05  5:53   ` Cunming Liang
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-05  5:53 UTC (permalink / raw)
  To: dev; +Cc: shemming

v7 changes
 - decouple epoll event and intr operation
 - add condition check in the case intr vector is disabled
 - renaming some APIs

v6 changes
 - split rte_intr_wait_rx_pkt into two APIs 'wait' and 'set'.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set.
 - using vector number instead of queue_id as interrupt API params.
 - patch reorder and split.

v5 changes
 - Rebase the patchset onto the HEAD
 - Isolate ethdev from EAL for new-added wait-for-rx interrupt function
 - Export wait-for-rx interrupt function for shared libraries
 - Split-off a new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect
 - Change sample application to accommodate EAL function spec change
   accordingly

v4 changes
 - Export interrupt enable/disable functions for shared libraries
 - Adjust position of new-added structure fields and functions to
   avoid breaking ABI
 
v3 changes
 - Add return value for interrupt enable/disable functions
 - Move spinlock from PMD to L3fwd-power
 - Remove unnecessary variables in e1000_mac_info
 - Fix miscellaneous review comments
 
v2 changes
 - Fix compilation issue in Makefile for missed header file.
 - Consolidate internal and community review comments of v1 patch set.
 
The patch series introduce low-latency one-shot rx interrupt into DPDK with
polling and interrupt mode switch control example.
 
DPDK userspace interrupt notification and handling mechanism is based on UIO
with below limitation:
1) It is designed to handle LSC interrupt only with inefficient suspended
   pthread wakeup procedure (e.g. UIO wakes up LSC interrupt handling thread
   which then wakes up DPDK polling thread). In this way, it introduces
   non-deterministic wakeup latency for DPDK polling thread as well as packet
   latency if it is used to handle Rx interrupt.
2) UIO only supports a single interrupt vector which has to be shared by
   LSC interrupt and interrupts assigned to dedicated rx queues.
 
This patchset includes below features:
1) Enable one-shot rx queue interrupt in ixgbe PMD(PF & VF) and igb PMD(PF only).
2) Build on top of the VFIO mechanism instead of UIO, so it could support
   up to 64 interrupt vectors for rx queue interrupts.
3) Have 1 DPDK polling thread handle per Rx queue interrupt with a dedicated
   VFIO eventfd, which eliminates non-deterministic pthread wakeup latency in
   user space.
4) Demonstrate interrupts control APIs and userspace NAPI-like polling/interrupt
   switch algorithms in L3fwd-power example.

Known limitations:
1) It does not work for UIO, because a single interrupt eventfd shared by the
   LSC and rx queue interrupt handlers causes a mess.
2) LSC interrupt is not supported by VF driver, so it is by default disabled
   in L3fwd-power now. Feel free to turn it on if you want to support both LSC
   and rx queue interrupts on a PF.

Cunming Liang (10):
  eal/linux: add interrupt vectors support in intr_handle
  eal/linux: add rte_epoll_wait/ctl support
  eal/linux: add API to set rx interrupt event monitor
  eal/bsd: dummy for new intr definition
  eal/linux: fix comments typo on vfio msi
  eal/linux: add interrupt vectors handling on VFIO
  ethdev: add rx intr enable, disable and ctl functions
  ixgbe: enable rx queue interrupts for both PF and VF
  igb: enable rx queue interrupts for PF
  l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode
    switch

 examples/l3fwd-power/main.c                        | 206 ++++++++--
 .../bsdapp/eal/include/exec-env/rte_interrupts.h   |   6 +
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 232 +++++++++--
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c         |  12 +
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |  97 +++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   4 +
 lib/librte_ether/rte_ethdev.c                      | 132 +++++++
 lib/librte_ether/rte_ethdev.h                      | 104 +++++
 lib/librte_ether/rte_ether_version.map             |   4 +
 lib/librte_pmd_e1000/e1000_ethdev.h                |   3 +
 lib/librte_pmd_e1000/igb_ethdev.c                  | 256 +++++++++++--
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c                | 425 ++++++++++++++++++++-
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h                |   7 +
 13 files changed, 1394 insertions(+), 94 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 03/10] eal/linux: add API to set rx interrupt event monitor
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 03/10] eal/linux: add API to set rx interrupt event monitor Cunming Liang
@ 2015-05-05 18:34       ` Stephen Hemminger
  2015-05-07  6:29         ` Liang, Cunming
  2015-05-08  2:58       ` Stephen Hemminger
  1 sibling, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-05 18:34 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Tue,  5 May 2015 13:39:39 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

>  static void
> +eal_intr_proc_rxtx_intr(int fd, struct rte_intr_handle *intr_handle)
> +{

This should be const, since intr_handle is not modified.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 08/10] ixgbe: enable rx queue interrupts for both PF and VF
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 08/10] ixgbe: enable rx queue interrupts for both PF and VF Cunming Liang
@ 2015-05-05 18:36       ` Stephen Hemminger
  2015-05-11  5:31         ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-05 18:36 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Tue,  5 May 2015 13:39:44 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

>  
> +	/* set max interrupt vfio request */
> +	if (pci_dev->intr_handle.vec_en) {
> +		pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
> +			IXGBEVF_MAX_OTHER_INTR;
> +		pci_dev->intr_handle.intr_vec =
> +			rte_zmalloc("intr_vec",
> +				    hw->mac.max_rx_queues * sizeof(int), 0);
> +	

Since MSI-X vectors are limited on many hardware platforms, this whole API
should be changed so that max_intr is based on the number of rx_queues actually
used by the application.  That means the setup needs to move from init to configure.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 06/10] eal/linux: add interrupt vectors handling on VFIO
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 06/10] eal/linux: add interrupt vectors handling on VFIO Cunming Liang
@ 2015-05-05 18:38       ` Stephen Hemminger
  2015-05-07  6:29         ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-05 18:38 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Tue,  5 May 2015 13:39:42 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> index aea1fb1..387f54c 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> @@ -308,6 +308,18 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
>  		case VFIO_PCI_MSIX_IRQ_INDEX:
>  			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
>  			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
> +			for (i = 0; i < RTE_MAX_RXTX_INTR_VEC_ID; i++) {
> +				fd = eventfd(0, 0);
> +				if (fd < 0) {
> +

You should pass EFD_NONBLOCK and EFD_CLOEXEC as flags to any eventfds created
internally.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 09/10] igb: enable rx queue interrupts for PF
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 09/10] igb: enable rx queue interrupts for PF Cunming Liang
@ 2015-05-05 23:16       ` Stephen Hemminger
  2015-05-11  5:05         ` Liang, Cunming
  2015-05-28 21:25       ` Stephen Hemminger
  1 sibling, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-05 23:16 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Tue,  5 May 2015 13:39:45 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> The patch does below for igb PF:
> - Setup NIC to generate MSI-X interrupts
> - Set the IVAR register to map interrupt causes to vectors
> - Implement interrupt enable/disable functions
> 
> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> Signed-off-by: Cunming Liang <cunming.liang@intel.com>

What about E1000?

This is only usable if it works on all devices.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 03/10] eal/linux: add API to set rx interrupt event monitor
  2015-05-05 18:34       ` Stephen Hemminger
@ 2015-05-07  6:29         ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-05-07  6:29 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev



On 5/6/2015 2:34 AM, Stephen Hemminger wrote:
> On Tue,  5 May 2015 13:39:39 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
>>   static void
>> +eal_intr_proc_rxtx_intr(int fd, struct rte_intr_handle *intr_handle)
>> +{
> Should be const intr_handle is not modified
[LCM] accept.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 06/10] eal/linux: add interrupt vectors handling on VFIO
  2015-05-05 18:38       ` Stephen Hemminger
@ 2015-05-07  6:29         ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-05-07  6:29 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev



On 5/6/2015 2:38 AM, Stephen Hemminger wrote:
> On Tue,  5 May 2015 13:39:42 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
>> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
>> index aea1fb1..387f54c 100644
>> --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
>> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
>> @@ -308,6 +308,18 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
>>   		case VFIO_PCI_MSIX_IRQ_INDEX:
>>   			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
>>   			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
>> +			for (i = 0; i < RTE_MAX_RXTX_INTR_VEC_ID; i++) {
>> +				fd = eventfd(0, 0);
>> +				if (fd < 0) {
>> +
> You should pass EFD_NONBLOCK and EFD_CLOEXEC as flags to any eventfd's created
> internally.
[LCM] Agree, make sense.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 02/10] eal/linux: add rte_epoll_wait/ctl support
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 02/10] eal/linux: add rte_epoll_wait/ctl support Cunming Liang
@ 2015-05-08  2:57       ` Stephen Hemminger
  2015-05-11  3:32         ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-08  2:57 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Tue,  5 May 2015 13:39:38 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> +	else if (rc < 0) {
> +		/* epoll_wait fail */
> +		RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
> +			strerror(errno));

In a real application there may be other random signals.
Therefore the code should ignore the error and return in the EWOULDBLOCK and EINTR cases.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 03/10] eal/linux: add API to set rx interrupt event monitor
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 03/10] eal/linux: add API to set rx interrupt event monitor Cunming Liang
  2015-05-05 18:34       ` Stephen Hemminger
@ 2015-05-08  2:58       ` Stephen Hemminger
  1 sibling, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-08  2:58 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Tue,  5 May 2015 13:39:39 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> +	bytes_read = read(fd, &buf, bytes_read);
> +	if (bytes_read < 0)
> +		RTE_LOG(ERR, EAL, "Error reading from file "
> +			"descriptor %d: %s\n", fd,
> +			strerror(errno)

The read could be interrupted (EINTR) or there could be a race (EWOULDBLOCK).
In those cases the code should not log anything.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 02/10] eal/linux: add rte_epoll_wait/ctl support
  2015-05-08  2:57       ` Stephen Hemminger
@ 2015-05-11  3:32         ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-05-11  3:32 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev



On 5/8/2015 10:57 AM, Stephen Hemminger wrote:
> On Tue,  5 May 2015 13:39:38 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
>> +	else if (rc < 0) {
>> +		/* epoll_wait fail */
>> +		RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
>> +			strerror(errno));
> In real application there maybe other random signals.
> Therefore the code should ignore and return for case of EWOULDBLOCK and EINTR
[LCM] Thanks, you're right: when EINTR happens, we should continue the
epoll_wait instead of returning.
As for EWOULDBLOCK, it seems epoll_wait won't return it, so I assume your
comment is about the epoll event read.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 09/10] igb: enable rx queue interrupts for PF
  2015-05-05 23:16       ` Stephen Hemminger
@ 2015-05-11  5:05         ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-05-11  5:05 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev



On 5/6/2015 7:16 AM, Stephen Hemminger wrote:
> On Tue,  5 May 2015 13:39:45 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
>> The patch does below for igb PF:
>> - Setup NIC to generate MSI-X interrupts
>> - Set the IVAR register to map interrupt causes to vectors
>> - Implement interrupt enable/disable functions
>>
>> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
>> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> What about E1000?
>
> This only usable if it works on all devices.
[LCM] Agreed; I will send a separate patch for e1000 after this patch
series is closed.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v6 7/8] igb: enable rx queue interrupts for PF
  2015-03-20 20:51     ` Stephen Hemminger
@ 2015-05-11  5:16       ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-05-11  5:16 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev



On 3/21/2015 4:51 AM, Stephen Hemminger wrote:
> On Fri, 27 Feb 2015 12:56:15 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
>>   
>>   /*
>> + * It clears the interrupt causes and enables the interrupt.
>> + * It will be called once only during nic initialized.
>> + *
>> + * @param dev
>> + *  Pointer to struct rte_eth_dev.
>> + *
>> + * @return
>> + *  - On success, zero.
>> + *  - On failure, a negative value.
>> + */
>> +static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
>> +{
>> +
> This function should be void
> It always succeeds and the caller just not check the return value.
>
> If you did this in one driver, I bet other drivers have same problem.
[LCM] The original reason was probably to keep it consistent with
lsc_interrupt_setup, but I think it's reasonable to change it to void.
Another thing I'm considering is whether it is necessary to make
rxq_interrupt_setup conditional on intr_conf.rxq, since even without
it we can manually turn on the rxq interrupt through the API
rte_eth_dev_rx_intr_enable.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 08/10] ixgbe: enable rx queue interrupts for both PF and VF
  2015-05-05 18:36       ` Stephen Hemminger
@ 2015-05-11  5:31         ` Liang, Cunming
  2015-05-11 15:00           ` Stephen Hemminger
  0 siblings, 1 reply; 242+ messages in thread
From: Liang, Cunming @ 2015-05-11  5:31 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev



On 5/6/2015 2:36 AM, Stephen Hemminger wrote:
> On Tue,  5 May 2015 13:39:44 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
>>   
>> +	/* set max interrupt vfio request */
>> +	if (pci_dev->intr_handle.vec_en) {
>> +		pci_dev->intr_handle.max_intr = hw->mac.max_rx_queues +
>> +			IXGBEVF_MAX_OTHER_INTR;
>> +		pci_dev->intr_handle.intr_vec =
>> +			rte_zmalloc("intr_vec",
>> +				    hw->mac.max_rx_queues * sizeof(int), 0);
>> +	
> Since MSI-X vectors are limited on many hardware platforms, this whole API
> should be changed so that max_intr is based on number of rx_queues actually
> used by the application.  That means the setup needs to move from init to configure.
[LCM] When MSI-X is not used, intr_vec and max_intr are unused, so this
does not matter in non-MSI-X mode.
Because the sequence "dev_stop->dev_reconfig->dev_start" is allowed, the
number of queues actually in use may change.
So allocating only in dev_init and releasing only in dev_close just keeps
it simple. During configure_msix, the number of queues actually in use is
used to set the queue/vector mapping; refer to xxx_configure_msix().

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH] lib: syntax cleanup
  2015-02-23 17:17     ` Zhou, Danny
@ 2015-05-11 14:10       ` Ferruh Yigit
  2015-06-23 14:28         ` Thomas Monjalon
  0 siblings, 1 reply; 242+ messages in thread
From: Ferruh Yigit @ 2015-05-11 14:10 UTC (permalink / raw)
  To: dev

Remove extra parentheses from return statements.

Signed-off-by: Ferruh Yigit <ferruhy@gmail.com>
---
 lib/librte_cmdline/cmdline_parse_etheraddr.c       | 10 ++---
 lib/librte_cmdline/cmdline_parse_ipaddr.c          | 38 ++++++++---------
 lib/librte_cmdline/cmdline_parse_num.c             | 24 +++++------
 lib/librte_cmdline/cmdline_parse_portlist.c        | 14 +++----
 lib/librte_cmdline/cmdline_socket.c                |  2 +-
 lib/librte_eal/bsdapp/contigmem/contigmem.c        | 22 +++++-----
 lib/librte_eal/bsdapp/eal/eal.c                    |  4 +-
 lib/librte_eal/bsdapp/eal/eal_pci.c                | 12 +++---
 lib/librte_eal/bsdapp/nic_uio/nic_uio.c            | 10 ++---
 lib/librte_eal/common/eal_common_memzone.c         |  2 +-
 lib/librte_eal/common/include/rte_common.h         |  2 +-
 lib/librte_eal/common/include/rte_pci.h            |  6 +--
 lib/librte_eal/linuxapp/eal/eal.c                  |  4 +-
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       |  4 +-
 lib/librte_eal/linuxapp/eal/eal_memory.c           |  4 +-
 .../linuxapp/kni/ethtool/igb/igb_procfs.c          |  2 +-
 lib/librte_eal/linuxapp/kni/kni_net.c              |  8 ++--
 lib/librte_ip_frag/ip_frag_internal.c              | 12 +++---
 lib/librte_ip_frag/rte_ip_frag_common.c            |  6 +--
 lib/librte_ip_frag/rte_ipv4_reassembly.c           |  8 ++--
 lib/librte_ip_frag/rte_ipv6_fragmentation.c        |  8 ++--
 lib/librte_lpm/rte_lpm.c                           |  2 +-
 lib/librte_mbuf/rte_mbuf.h                         | 16 ++++----
 lib/librte_mempool/rte_dom0_mempool.c              |  2 +-
 lib/librte_mempool/rte_mempool.c                   | 18 ++++----
 lib/librte_net/rte_ip.h                            |  2 +-
 lib/librte_pmd_e1000/em_ethdev.c                   | 36 ++++++++--------
 lib/librte_pmd_e1000/em_rxtx.c                     | 48 +++++++++++-----------
 lib/librte_pmd_e1000/igb_ethdev.c                  | 26 ++++++------
 lib/librte_pmd_e1000/igb_rxtx.c                    | 30 +++++++-------
 lib/librte_pmd_fm10k/fm10k_ethdev.c                | 30 +++++++-------
 lib/librte_pmd_i40e/i40e_rxtx.c                    | 14 +++----
 lib/librte_pmd_ixgbe/ixgbe_82599_bypass.c          |  4 +-
 lib/librte_pmd_ixgbe/ixgbe_bypass.c                |  2 +-
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c                | 42 +++++++++----------
 lib/librte_pmd_ixgbe/ixgbe_rxtx.c                  | 36 ++++++++--------
 lib/librte_pmd_virtio/virtio_ethdev.c              |  4 +-
 37 files changed, 257 insertions(+), 257 deletions(-)

diff --git a/lib/librte_cmdline/cmdline_parse_etheraddr.c b/lib/librte_cmdline/cmdline_parse_etheraddr.c
index 64ae86c..dbfe4a6 100644
--- a/lib/librte_cmdline/cmdline_parse_etheraddr.c
+++ b/lib/librte_cmdline/cmdline_parse_etheraddr.c
@@ -105,32 +105,32 @@ my_ether_aton(const char *a)
 		errno = 0;
 		o[i] = strtoul(a, &end, 16);
 		if (errno != 0 || end == a || (end[0] != ':' && end[0] != 0))
-			return (NULL);
+			return NULL;
 		a = end + 1;
 	} while (++i != sizeof (o) / sizeof (o[0]) && end[0] != 0);
 
 	/* Junk at the end of line */
 	if (end[0] != 0)
-		return (NULL);
+		return NULL;
 
 	/* Support the format XX:XX:XX:XX:XX:XX */
 	if (i == ETHER_ADDR_LEN) {
 		while (i-- != 0) {
 			if (o[i] > UINT8_MAX)
-				return (NULL);
+				return NULL;
 			ether_addr.ea_oct[i] = (uint8_t)o[i];
 		}
 	/* Support the format XXXX:XXXX:XXXX */
 	} else if (i == ETHER_ADDR_LEN / 2) {
 		while (i-- != 0) {
 			if (o[i] > UINT16_MAX)
-				return (NULL);
+				return NULL;
 			ether_addr.ea_oct[i * 2] = (uint8_t)(o[i] >> 8);
 			ether_addr.ea_oct[i * 2 + 1] = (uint8_t)(o[i] & 0xff);
 		}
 	/* unknown format */
 	} else
-		return (NULL);
+		return NULL;
 
 	return (struct ether_addr *)&ether_addr;
 }
diff --git a/lib/librte_cmdline/cmdline_parse_ipaddr.c b/lib/librte_cmdline/cmdline_parse_ipaddr.c
index 7f33599..d3d3e04 100644
--- a/lib/librte_cmdline/cmdline_parse_ipaddr.c
+++ b/lib/librte_cmdline/cmdline_parse_ipaddr.c
@@ -135,12 +135,12 @@ my_inet_pton(int af, const char *src, void *dst)
 {
 	switch (af) {
 		case AF_INET:
-			return (inet_pton4(src, dst));
+			return inet_pton4(src, dst);
 		case AF_INET6:
-			return (inet_pton6(src, dst));
+			return inet_pton6(src, dst);
 		default:
 			errno = EAFNOSUPPORT;
-			return (-1);
+			return -1;
 	}
 	/* NOTREACHED */
 }
@@ -172,26 +172,26 @@ inet_pton4(const char *src, unsigned char *dst)
 			unsigned int new = *tp * 10 + (pch - digits);
 
 			if (new > 255)
-				return (0);
+				return 0;
 			if (! saw_digit) {
 				if (++octets > 4)
-					return (0);
+					return 0;
 				saw_digit = 1;
 			}
 			*tp = (unsigned char)new;
 		} else if (ch == '.' && saw_digit) {
 			if (octets == 4)
-				return (0);
+				return 0;
 			*++tp = 0;
 			saw_digit = 0;
 		} else
-			return (0);
+			return 0;
 	}
 	if (octets < 4)
-		return (0);
+		return 0;
 
 	memcpy(dst, tmp, INADDRSZ);
-	return (1);
+	return 1;
 }
 
 /* int
@@ -224,7 +224,7 @@ inet_pton6(const char *src, unsigned char *dst)
 	/* Leading :: requires some special handling. */
 	if (*src == ':')
 		if (*++src != ':')
-			return (0);
+			return 0;
 	curtok = src;
 	saw_xdigit = count_xdigit = 0;
 	val = 0;
@@ -236,11 +236,11 @@ inet_pton6(const char *src, unsigned char *dst)
 			pch = strchr((xdigits = xdigits_u), ch);
 		if (pch != NULL) {
 			if (count_xdigit >= 4)
-				return (0);
+				return 0;
 			val <<= 4;
 			val |= (pch - xdigits);
 			if (val > 0xffff)
-				return (0);
+				return 0;
 			saw_xdigit = 1;
 			count_xdigit++;
 			continue;
@@ -249,14 +249,14 @@ inet_pton6(const char *src, unsigned char *dst)
 			curtok = src;
 			if (!saw_xdigit) {
 				if (colonp)
-					return (0);
+					return 0;
 				colonp = tp;
 				continue;
 			} else if (*src == '\0') {
-				return (0);
+				return 0;
 			}
 			if (tp + sizeof(int16_t) > endp)
-				return (0);
+				return 0;
 			*tp++ = (unsigned char) ((val >> 8) & 0xff);
 			*tp++ = (unsigned char) (val & 0xff);
 			saw_xdigit = 0;
@@ -272,11 +272,11 @@ inet_pton6(const char *src, unsigned char *dst)
 			dbloct_count += 2;
 			break;  /* '\0' was seen by inet_pton4(). */
 		}
-		return (0);
+		return 0;
 	}
 	if (saw_xdigit) {
 		if (tp + sizeof(int16_t) > endp)
-			return (0);
+			return 0;
 		*tp++ = (unsigned char) ((val >> 8) & 0xff);
 		*tp++ = (unsigned char) (val & 0xff);
 		dbloct_count++;
@@ -300,9 +300,9 @@ inet_pton6(const char *src, unsigned char *dst)
 		tp = endp;
 	}
 	if (tp != endp)
-		return (0);
+		return 0;
 	memcpy(dst, tmp, IN6ADDRSZ);
-	return (1);
+	return 1;
 }
 
 int
diff --git a/lib/librte_cmdline/cmdline_parse_num.c b/lib/librte_cmdline/cmdline_parse_num.c
index d8cf37f..b0f9a35 100644
--- a/lib/librte_cmdline/cmdline_parse_num.c
+++ b/lib/librte_cmdline/cmdline_parse_num.c
@@ -315,35 +315,35 @@ cmdline_parse_num(cmdline_parse_token_hdr_t *tk, const char *srcbuf, void *res,
 	case BIN_OK:
 		if ( nd.type == INT8 && res1 <= INT8_MAX ) {
 			if (res) *(int8_t *)res = (int8_t) res1;
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else if ( nd.type == INT16 && res1 <= INT16_MAX ) {
 			if (res) *(int16_t *)res = (int16_t) res1;
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else if ( nd.type == INT32 && res1 <= INT32_MAX ) {
 			if (res) *(int32_t *)res = (int32_t) res1;
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else if ( nd.type == INT64 && res1 <= INT64_MAX ) {
 			if (res) *(int64_t *)res = (int64_t) res1;
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else if ( nd.type == UINT8 && res1 <= UINT8_MAX ) {
 			if (res) *(uint8_t *)res = (uint8_t) res1;
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else if (nd.type == UINT16  && res1 <= UINT16_MAX ) {
 			if (res) *(uint16_t *)res = (uint16_t) res1;
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else if ( nd.type == UINT32 && res1 <= UINT32_MAX ) {
 			if (res) *(uint32_t *)res = (uint32_t) res1;
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else if ( nd.type == UINT64 ) {
 			if (res) *(uint64_t *)res = res1;
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else {
 			return -1;
@@ -353,19 +353,19 @@ cmdline_parse_num(cmdline_parse_token_hdr_t *tk, const char *srcbuf, void *res,
 	case DEC_NEG_OK:
 		if ( nd.type == INT8 && res1 <= INT8_MAX + 1 ) {
 			if (res) *(int8_t *)res = (int8_t) (-res1);
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else if ( nd.type == INT16 && res1 <= (uint16_t)INT16_MAX + 1 ) {
 			if (res) *(int16_t *)res = (int16_t) (-res1);
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else if ( nd.type == INT32 && res1 <= (uint32_t)INT32_MAX + 1 ) {
 			if (res) *(int32_t *)res = (int32_t) (-res1);
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else if ( nd.type == INT64 && res1 <= (uint64_t)INT64_MAX + 1 ) {
 			if (res) *(int64_t *)res = (int64_t) (-res1);
-			return (buf-srcbuf);
+			return buf-srcbuf;
 		}
 		else {
 			return -1;
diff --git a/lib/librte_cmdline/cmdline_parse_portlist.c b/lib/librte_cmdline/cmdline_parse_portlist.c
index 9c1fe3e..f11bdf0 100644
--- a/lib/librte_cmdline/cmdline_parse_portlist.c
+++ b/lib/librte_cmdline/cmdline_parse_portlist.c
@@ -102,7 +102,7 @@ parse_ports(cmdline_portlist_t *pl, const char *str)
 		ps = strtoul(first, &end, 10);
 		if (errno != 0 || end == first ||
 		    (end[0] != '-' && end[0] != 0 && end != last))
-			return (-1);
+			return -1;
 
 		/* Support for N-M portlist format */
 		if (end[0] == '-') {
@@ -111,18 +111,18 @@ parse_ports(cmdline_portlist_t *pl, const char *str)
 			pe = strtoul(first, &end, 10);
 			if (errno != 0 || end == first ||
 			    (end[0] != 0 && end != last))
-				return (-1);
+				return -1;
 		} else {
 			pe = ps;
 		}
 
 		if (ps > pe || pe >= sizeof (pl->map) * 8)
-			return (-1);
+			return -1;
 
 		parse_set_list(pl, ps, pe);
 	}
 
-	return (0);
+	return 0;
 }
 
 int
@@ -134,7 +134,7 @@ cmdline_parse_portlist(__attribute__((unused)) cmdline_parse_token_hdr_t *tk,
 	cmdline_portlist_t *pl;
 
 	if (!buf || ! *buf)
-		return (-1);
+		return -1;
 
 	if (res && ressize < sizeof(cmdline_portlist_t))
 		return -1;
@@ -146,7 +146,7 @@ cmdline_parse_portlist(__attribute__((unused)) cmdline_parse_token_hdr_t *tk,
 		token_len++;
 
 	if (token_len >= PORTLIST_TOKEN_SIZE)
-		return (-1);
+		return -1;
 
 	snprintf(portlist_str, token_len+1, "%s", buf);
 
@@ -155,7 +155,7 @@ cmdline_parse_portlist(__attribute__((unused)) cmdline_parse_token_hdr_t *tk,
 		if (strcmp("all", portlist_str) == 0)
 			pl->map	= UINT32_MAX;
 		else if (parse_ports(pl, portlist_str) != 0)
-			return (-1);
+			return -1;
 	}
 
 	return token_len;
diff --git a/lib/librte_cmdline/cmdline_socket.c b/lib/librte_cmdline/cmdline_socket.c
index 6820b6d..3fc243b 100644
--- a/lib/librte_cmdline/cmdline_socket.c
+++ b/lib/librte_cmdline/cmdline_socket.c
@@ -86,7 +86,7 @@ cmdline_file_new(cmdline_parse_ctx_t *ctx, const char *prompt, const char *path)
 		dprintf("open() failed\n");
 		return NULL;
 	}
-	return (cmdline_new(ctx, prompt, fd, -1));
+	return cmdline_new(ctx, prompt, fd, -1);
 }
 
 struct cmdline *
diff --git a/lib/librte_eal/bsdapp/contigmem/contigmem.c b/lib/librte_eal/bsdapp/contigmem/contigmem.c
index 6634daa..5de57fc 100644
--- a/lib/librte_eal/bsdapp/contigmem/contigmem.c
+++ b/lib/librte_eal/bsdapp/contigmem/contigmem.c
@@ -99,7 +99,7 @@ static int contigmem_modevent(module_t mod, int type, void *arg)
 		break;
 	}
 
-	return (error);
+	return error;
 }
 
 moduledata_t contigmem_mod = {
@@ -128,14 +128,14 @@ contigmem_load()
 	if (contigmem_num_buffers > RTE_CONTIGMEM_MAX_NUM_BUFS) {
 		printf("%d buffers requested is greater than %d allowed\n",
 				contigmem_num_buffers, RTE_CONTIGMEM_MAX_NUM_BUFS);
-		return (EINVAL);
+		return EINVAL;
 	}
 
 	if (contigmem_buffer_size < PAGE_SIZE ||
 			(contigmem_buffer_size & (contigmem_buffer_size - 1)) != 0) {
 		printf("buffer size 0x%lx is not greater than PAGE_SIZE and "
 				"power of two\n", contigmem_buffer_size);
-		return (EINVAL);
+		return EINVAL;
 	}
 
 	for (i = 0; i < contigmem_num_buffers; i++) {
@@ -145,7 +145,7 @@ contigmem_load()
 
 		if (contigmem_buffers[i] == NULL) {
 			printf("contigmalloc failed for buffer %d\n", i);
-			return (ENOMEM);
+			return ENOMEM;
 		}
 
 		printf("%2u: virt=%p phys=%p\n", i, contigmem_buffers[i],
@@ -164,7 +164,7 @@ contigmem_load()
 	contigmem_cdev = make_dev_credf(0, &contigmem_ops, 0, NULL, UID_ROOT,
 			GID_WHEEL, 0600, "contigmem");
 
-	return (0);
+	return 0;
 }
 
 static int
@@ -183,7 +183,7 @@ contigmem_unload()
 			contigfree(contigmem_buffers[i], contigmem_buffer_size,
 					M_CONTIGMEM);
 
-	return (0);
+	return 0;
 }
 
 static int
@@ -193,14 +193,14 @@ contigmem_physaddr(SYSCTL_HANDLER_ARGS)
 	int		index = (int)(uintptr_t)arg1;
 
 	physaddr = (uint64_t)vtophys(contigmem_buffers[index]);
-	return (sysctl_handle_64(oidp, &physaddr, 0, req));
+	return sysctl_handle_64(oidp, &physaddr, 0, req);
 }
 
 static int
 contigmem_open(struct cdev *cdev, int fflags, int devtype,
 		struct thread *td)
 {
-	return (0);
+	return 0;
 }
 
 static int
@@ -209,7 +209,7 @@ contigmem_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
 {
 
 	*paddr = offset;
-	return (0);
+	return 0;
 }
 
 static int
@@ -222,12 +222,12 @@ contigmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size,
 	 *  app.
 	 */
 	if ((*offset/PAGE_SIZE) >= contigmem_num_buffers)
-		return (EINVAL);
+		return EINVAL;
 
 	*offset = (vm_ooffset_t)vtophys(contigmem_buffers[*offset/PAGE_SIZE]);
 	*obj = vm_pager_allocate(OBJT_DEVICE, cdev, size, nprot, *offset,
 			curthread->td_ucred);
 
-	return (0);
+	return 0;
 }
 
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 43e8a47..cbc7b46 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -561,12 +561,12 @@ rte_eal_init(int argc, char **argv)
 enum rte_lcore_role_t
 rte_eal_lcore_role(unsigned lcore_id)
 {
-	return (rte_config.lcore_role[lcore_id]);
+	return rte_config.lcore_role[lcore_id];
 }
 
 enum rte_proc_type_t
 rte_eal_process_type(void)
 {
-	return (rte_config.process_type);
+	return rte_config.process_type;
 }
 
diff --git a/lib/librte_eal/bsdapp/eal/eal_pci.c b/lib/librte_eal/bsdapp/eal/eal_pci.c
index 30f0232..9be6983 100644
--- a/lib/librte_eal/bsdapp/eal/eal_pci.c
+++ b/lib/librte_eal/bsdapp/eal/eal_pci.c
@@ -179,10 +179,10 @@ pci_uio_map_secondary(struct rte_pci_device *dev)
 			    != uio_res->maps[i].addr) {
 				RTE_LOG(ERR, EAL,
 					"Cannot mmap device resource\n");
-				return (-1);
+				return -1;
 			}
 		}
-		return (0);
+		return 0;
 	}
 
 	RTE_LOG(ERR, EAL, "Cannot find resource for device\n");
@@ -209,7 +209,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
 
 	/* secondary processes - use already recorded details */
 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
-		return (pci_uio_map_secondary(dev));
+		return pci_uio_map_secondary(dev);
 
 	snprintf(devname, sizeof(devname), "/dev/uio@pci:%u:%u:%u",
 			dev->addr.bus, dev->addr.devid, dev->addr.function);
@@ -233,7 +233,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
 	if ((uio_res = rte_zmalloc("UIO_RES", sizeof (*uio_res), 0)) == NULL) {
 		RTE_LOG(ERR, EAL,
 			"%s(): cannot store uio mmap details\n", __func__);
-		return (-1);
+		return -1;
 	}
 
 	snprintf(uio_res->path, sizeof(uio_res->path), "%s", devname);
@@ -261,7 +261,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
 						(size_t)maps[j].size)
 		    ) == NULL) {
 			rte_free(uio_res);
-			return (-1);
+			return -1;
 		}
 
 		maps[j].addr = mapaddr;
@@ -271,7 +271,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
 
 	TAILQ_INSERT_TAIL(uio_res_list, uio_res, next);
 
-	return (0);
+	return 0;
 }
 
 /* Scan one pci sysfs entry, and fill the devices list from it. */
diff --git a/lib/librte_eal/bsdapp/nic_uio/nic_uio.c b/lib/librte_eal/bsdapp/nic_uio/nic_uio.c
index e649e32..720ceed 100644
--- a/lib/librte_eal/bsdapp/nic_uio/nic_uio.c
+++ b/lib/librte_eal/bsdapp/nic_uio/nic_uio.c
@@ -131,7 +131,7 @@ nic_uio_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
 		int prot, vm_memattr_t *memattr)
 {
 	*paddr = offset;
-	return (0);
+	return 0;
 }
 
 static int
@@ -197,10 +197,10 @@ nic_uio_probe (device_t dev)
 			pci_get_device(dev) == devices[i].dev) {
 
 			device_set_desc(dev, "Intel(R) DPDK PCI Device");
-			return (BUS_PROBE_SPECIFIC);
+			return BUS_PROBE_SPECIFIC;
 		}
 
-	return (ENXIO);
+	return ENXIO;
 }
 
 static int
@@ -305,7 +305,7 @@ nic_uio_unload(void)
 static int
 nic_uio_shutdown(void)
 {
-	return (0);
+	return 0;
 }
 
 static int
@@ -326,5 +326,5 @@ nic_uio_modevent(module_t mod, int type, void *arg)
 		break;
 	}
 
-	return (0);
+	return 0;
 }
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index 888f9e5..aee184a 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -120,7 +120,7 @@ align_phys_boundary(const struct rte_memseg *ms, size_t len, size_t align,
 		addr_offset = start - ms->phys_addr;
 	}
 
-	return (addr_offset);
+	return addr_offset;
 }
 
 static const struct rte_memzone *
diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index c0ab8b4..9c1c238 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -291,7 +291,7 @@ rte_pause(void) {}
 static inline uint32_t
 rte_bsf32(uint32_t v)
 {
-	return (__builtin_ctz(v));
+	return __builtin_ctz(v);
 }
 
 #ifndef offsetof
diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h
index 223d3cd..a9008cf 100644
--- a/lib/librte_eal/common/include/rte_pci.h
+++ b/lib/librte_eal/common/include/rte_pci.h
@@ -227,7 +227,7 @@ do {                                                               \
 	errno = 0;                                              \
 	val = strtoul((in), &end, 16);                          \
 	if (errno != 0 || end[0] != (dlm) || val > (lim))       \
-		return (-EINVAL);                               \
+		return -EINVAL;                                 \
 	(fd) = (typeof (fd))val;                                \
 	(in) = end + 1;                                         \
 } while(0)
@@ -252,7 +252,7 @@ eal_parse_pci_BDF(const char *input, struct rte_pci_addr *dev_addr)
 	GET_PCIADDR_FIELD(input, dev_addr->bus, UINT8_MAX, ':');
 	GET_PCIADDR_FIELD(input, dev_addr->devid, UINT8_MAX, '.');
 	GET_PCIADDR_FIELD(input, dev_addr->function, UINT8_MAX, 0);
-	return (0);
+	return 0;
 }
 
 /**
@@ -274,7 +274,7 @@ eal_parse_pci_DomBDF(const char *input, struct rte_pci_addr *dev_addr)
 	GET_PCIADDR_FIELD(input, dev_addr->bus, UINT8_MAX, ':');
 	GET_PCIADDR_FIELD(input, dev_addr->devid, UINT8_MAX, '.');
 	GET_PCIADDR_FIELD(input, dev_addr->function, UINT8_MAX, 0);
-	return (0);
+	return 0;
 }
 #undef GET_PCIADDR_FIELD
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index bd770cf..6334bc8 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -852,13 +852,13 @@ rte_eal_init(int argc, char **argv)
 enum rte_lcore_role_t
 rte_eal_lcore_role(unsigned lcore_id)
 {
-	return (rte_config.lcore_role[lcore_id]);
+	return rte_config.lcore_role[lcore_id];
 }
 
 enum rte_proc_type_t
 rte_eal_process_type(void)
 {
-	return (rte_config.process_type);
+	return rte_config.process_type;
 }
 
 int rte_eal_has_hugepages(void)
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 66deda2..e9c5af3 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -477,7 +477,7 @@ rte_intr_callback_register(struct rte_intr_handle *intr_handle,
 		if (write(intr_pipe.writefd, "1", 1) < 0)
 			return -EPIPE;
 
-	return (ret);
+	return ret;
 }
 
 int
@@ -541,7 +541,7 @@ rte_intr_callback_unregister(struct rte_intr_handle *intr_handle,
 		ret = -EPIPE;
 	}
 
-	return (ret);
+	return ret;
 }
 
 int
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5f9f92e..f7d1957 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -880,7 +880,7 @@ get_socket_mem_size(int socket)
 			size += hpi->hugepage_sz * hpi->num_pages[socket];
 	}
 
-	return (size);
+	return size;
 }
 
 /*
@@ -1339,7 +1339,7 @@ rte_eal_hugepage_init(void)
 			"of memory.\n",
 			i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
 			RTE_MAX_MEMSEG);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	return 0;
diff --git a/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_procfs.c b/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_procfs.c
index 2e7850c..f0e7309 100644
--- a/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_procfs.c
+++ b/lib/librte_eal/linuxapp/kni/ethtool/igb/igb_procfs.c
@@ -262,7 +262,7 @@ int igb_procfs_topdir_init(void)
 {
 	igb_top_dir = proc_mkdir("driver/igb", NULL);
 	if (igb_top_dir == NULL)
-		return (-ENOMEM);
+		return -ENOMEM;
 
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/kni/kni_net.c b/lib/librte_eal/linuxapp/kni/kni_net.c
index dd95db5..2c3c2d8 100644
--- a/lib/librte_eal/linuxapp/kni/kni_net.c
+++ b/lib/librte_eal/linuxapp/kni/kni_net.c
@@ -88,7 +88,7 @@ kni_net_open(struct net_device *dev)
 	req.if_up = 1;
 	ret = kni_net_process_request(kni, &req);
 
-	return (ret == 0 ? req.result : ret);
+	return (ret == 0) ? req.result : ret;
 }
 
 static int
@@ -107,7 +107,7 @@ kni_net_release(struct net_device *dev)
 	req.if_up = 0;
 	ret = kni_net_process_request(kni, &req);
 
-	return (ret == 0 ? req.result : ret);
+	return (ret == 0) ? req.result : ret;
 }
 
 /*
@@ -511,7 +511,7 @@ kni_net_change_mtu(struct net_device *dev, int new_mtu)
 	if (ret == 0 && req.result == 0)
 		dev->mtu = new_mtu;
 
-	return (ret == 0 ? req.result : ret);
+	return (ret == 0) ? req.result : ret;
 }
 
 /*
@@ -597,7 +597,7 @@ kni_net_header(struct sk_buff *skb, struct net_device *dev,
 	memcpy(eth->h_dest,   daddr ? daddr : dev->dev_addr, dev->addr_len);
 	eth->h_proto = htons(type);
 
-	return (dev->hard_header_len);
+	return dev->hard_header_len;
 }
 
 
diff --git a/lib/librte_ip_frag/ip_frag_internal.c b/lib/librte_ip_frag/ip_frag_internal.c
index a2c645b..3d36b7d 100644
--- a/lib/librte_ip_frag/ip_frag_internal.c
+++ b/lib/librte_ip_frag/ip_frag_internal.c
@@ -200,7 +200,7 @@ ip_frag_process(struct ip_frag_pkt *fp, struct rte_ip_frag_death_row *dr,
 		ip_frag_key_invalidate(&fp->key);
 		IP_FRAG_MBUF2DR(dr, mb);
 
-		return (NULL);
+		return NULL;
 	}
 
 	fp->frags[idx].ofs = ofs;
@@ -211,7 +211,7 @@ ip_frag_process(struct ip_frag_pkt *fp, struct rte_ip_frag_death_row *dr,
 
 	/* not all fragments are collected yet. */
 	if (likely (fp->frag_size < fp->total_size)) {
-		return (mb);
+		return mb;
 
 	/* if we collected all fragments, then try to reassemble. */
 	} else if (fp->frag_size == fp->total_size &&
@@ -259,7 +259,7 @@ ip_frag_process(struct ip_frag_pkt *fp, struct rte_ip_frag_death_row *dr,
 
 	/* we are done with that entry, invalidate it. */
 	ip_frag_key_invalidate(&fp->key);
-	return (mb);
+	return mb;
 }
 
 
@@ -327,7 +327,7 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
 	IP_FRAG_TBL_STAT_UPDATE(&tbl->stat, fail_total, (pkt == NULL));
 
 	tbl->last = pkt;
-	return (pkt);
+	return pkt;
 }
 
 struct ip_frag_pkt *
@@ -347,7 +347,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	assoc = tbl->bucket_entries;
 
 	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
-		return (tbl->last);
+		return tbl->last;
 
 	/* different hashing methods for IPv4 and IPv6 */
 	if (key->key_len == IPV4_KEYLEN)
@@ -414,5 +414,5 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 
 	*free = empty;
 	*stale = old;
-	return (NULL);
+	return NULL;
 }
diff --git a/lib/librte_ip_frag/rte_ip_frag_common.c b/lib/librte_ip_frag/rte_ip_frag_common.c
index c982d8c..6176ff4 100644
--- a/lib/librte_ip_frag/rte_ip_frag_common.c
+++ b/lib/librte_ip_frag/rte_ip_frag_common.c
@@ -83,7 +83,7 @@ rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
 			nb_entries > UINT32_MAX || nb_entries == 0 ||
 			nb_entries < max_entries) {
 		RTE_LOG(ERR, USER1, "%s: invalid input parameter\n", __func__);
-		return (NULL);
+		return NULL;
 	}
 
 	sz = sizeof (*tbl) + nb_entries * sizeof (tbl->pkt[0]);
@@ -92,7 +92,7 @@ rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
 		RTE_LOG(ERR, USER1,
 			"%s: allocation of %zu bytes at socket %d failed do\n",
 			__func__, sz, socket_id);
-		return (NULL);
+		return NULL;
 	}
 
 	RTE_LOG(INFO, USER1, "%s: allocated of %zu bytes at socket %d\n",
@@ -106,7 +106,7 @@ rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
 	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);
 
 	TAILQ_INIT(&(tbl->lru));
-	return (tbl);
+	return tbl;
 }
 
 /* dump frag table statistics to file */
diff --git a/lib/librte_ip_frag/rte_ipv4_reassembly.c b/lib/librte_ip_frag/rte_ipv4_reassembly.c
index 841ac14..a52f549 100644
--- a/lib/librte_ip_frag/rte_ipv4_reassembly.c
+++ b/lib/librte_ip_frag/rte_ipv4_reassembly.c
@@ -73,7 +73,7 @@ ipv4_frag_reassemble(const struct ip_frag_pkt *fp)
 
 		/* error - hole in the packet. */
 		if (m == prev) {
-			return (NULL);
+			return NULL;
 		}
 	}
 
@@ -94,7 +94,7 @@ ipv4_frag_reassemble(const struct ip_frag_pkt *fp)
 		rte_cpu_to_be_16(IPV4_HDR_DF_FLAG));
 	ip_hdr->hdr_checksum = 0;
 
-	return (m);
+	return m;
 }
 
 /*
@@ -151,7 +151,7 @@ rte_ipv4_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl,
 	/* try to find/add entry into the fragment's table. */
 	if ((fp = ip_frag_find(tbl, dr, &key, tms)) == NULL) {
 		IP_FRAG_MBUF2DR(dr, mb);
-		return (NULL);
+		return NULL;
 	}
 
 	IP_FRAG_LOG(DEBUG, "%s:%d:\n"
@@ -178,5 +178,5 @@ rte_ipv4_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl,
 		fp, fp->key.src_dst[0], fp->key.id, fp->start,
 		fp->total_size, fp->frag_size, fp->last_idx);
 
-	return (mb);
+	return mb;
 }
diff --git a/lib/librte_ip_frag/rte_ipv6_fragmentation.c b/lib/librte_ip_frag/rte_ipv6_fragmentation.c
index 4ffcc7c..0e32aa8 100644
--- a/lib/librte_ip_frag/rte_ipv6_fragmentation.c
+++ b/lib/librte_ip_frag/rte_ipv6_fragmentation.c
@@ -123,7 +123,7 @@ rte_ipv6_fragment_packet(struct rte_mbuf *pkt_in,
 	/* Check that pkts_out is big enough to hold all fragments */
 	if (unlikely (frag_size * nb_pkts_out <
 	    (uint16_t)(pkt_in->pkt_len - sizeof (struct ipv6_hdr))))
-		return (-EINVAL);
+		return -EINVAL;
 
 	in_hdr = rte_pktmbuf_mtod(pkt_in, struct ipv6_hdr *);
 
@@ -142,7 +142,7 @@ rte_ipv6_fragment_packet(struct rte_mbuf *pkt_in,
 		out_pkt = rte_pktmbuf_alloc(pool_direct);
 		if (unlikely(out_pkt == NULL)) {
 			__free_fragments(pkts_out, out_pkt_pos);
-			return (-ENOMEM);
+			return -ENOMEM;
 		}
 
 		/* Reserve space for the IP header that will be built later */
@@ -160,7 +160,7 @@ rte_ipv6_fragment_packet(struct rte_mbuf *pkt_in,
 			if (unlikely(out_seg == NULL)) {
 				rte_pktmbuf_free(out_pkt);
 				__free_fragments(pkts_out, out_pkt_pos);
-				return (-ENOMEM);
+				return -ENOMEM;
 			}
 			out_seg_prev->next = out_seg;
 			out_seg_prev = out_seg;
@@ -211,5 +211,5 @@ rte_ipv6_fragment_packet(struct rte_mbuf *pkt_in,
 		out_pkt_pos ++;
 	}
 
-	return (out_pkt_pos);
+	return out_pkt_pos;
 }
diff --git a/lib/librte_lpm/rte_lpm.c b/lib/librte_lpm/rte_lpm.c
index 0945b81..a99384b 100644
--- a/lib/librte_lpm/rte_lpm.c
+++ b/lib/librte_lpm/rte_lpm.c
@@ -371,7 +371,7 @@ rule_find(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth)
 	for (rule_index = rule_gindex; rule_index < last_rule; rule_index++) {
 		/* If rule is found return the rule index. */
 		if (lpm->rules_tbl[rule_index].ip == ip_masked)
-			return (rule_index);
+			return rule_index;
 	}
 
 	/* If rule is not found return -EINVAL. */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 70b0987..0922644 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -530,7 +530,7 @@ static inline struct rte_mbuf *__rte_mbuf_raw_alloc(struct rte_mempool *mp)
 	m = (struct rte_mbuf *)mb;
 	RTE_MBUF_ASSERT(rte_mbuf_refcnt_read(m) == 0);
 	rte_mbuf_refcnt_set(m, 1);
-	return (m);
+	return m;
 }
 
 /**
@@ -626,7 +626,7 @@ void rte_ctrlmbuf_init(struct rte_mempool *mp, void *opaque_arg,
 static inline int
 rte_is_ctrlmbuf(struct rte_mbuf *m)
 {
-	return (!!(m->ol_flags & CTRL_MBUF_FLAG));
+	return !!(m->ol_flags & CTRL_MBUF_FLAG);
 }
 
 /* Operations on pkt mbuf */
@@ -797,7 +797,7 @@ static inline struct rte_mbuf *rte_pktmbuf_alloc(struct rte_mempool *mp)
 	struct rte_mbuf *m;
 	if ((m = __rte_mbuf_raw_alloc(mp)) != NULL)
 		rte_pktmbuf_reset(m);
-	return (m);
+	return m;
 }
 
 /**
@@ -910,9 +910,9 @@ __rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
 			if (rte_mbuf_refcnt_update(md, -1) == 0)
 				__rte_mbuf_raw_free(md);
 		}
-		return(m);
+		return m;
 	}
-	return (NULL);
+	return NULL;
 }
 
 /**
@@ -980,7 +980,7 @@ static inline struct rte_mbuf *rte_pktmbuf_clone(struct rte_mbuf *md,
 	uint8_t nseg;
 
 	if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
-		return (NULL);
+		return NULL;
 
 	mi = mc;
 	prev = &mi->next;
@@ -1002,11 +1002,11 @@ static inline struct rte_mbuf *rte_pktmbuf_clone(struct rte_mbuf *md,
 	/* Allocation of new indirect segment failed */
 	if (unlikely (mi == NULL)) {
 		rte_pktmbuf_free(mc);
-		return (NULL);
+		return NULL;
 	}
 
 	__rte_mbuf_sanity_check(mc, 1);
-	return (mc);
+	return mc;
 }
 
 /**
diff --git a/lib/librte_mempool/rte_dom0_mempool.c b/lib/librte_mempool/rte_dom0_mempool.c
index 8900171..4d84b17 100644
--- a/lib/librte_mempool/rte_dom0_mempool.c
+++ b/lib/librte_mempool/rte_dom0_mempool.c
@@ -129,5 +129,5 @@ rte_dom0_mempool_create(const char *name, unsigned elt_num, unsigned elt_size,
 
 	free(pa);
 
-	return (mp);
+	return mp;
 }
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index cf7ed76..c105dc8 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -200,7 +200,7 @@ rte_mempool_obj_iter(void *vaddr, uint32_t elt_num, size_t elt_sz, size_t align,
 		}
 	}
 
-	return (i);
+	return i;
 }
 
 /*
@@ -309,7 +309,7 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 	/* this is the size of an object, including header and trailer */
 	sz->total_size = sz->header_size + sz->elt_size + sz->trailer_size;
 
-	return (sz->total_size);
+	return sz->total_size;
 }
 
 
@@ -330,7 +330,7 @@ rte_mempool_xmem_size(uint32_t elt_num, size_t elt_sz, uint32_t pg_shift)
 		sz = RTE_ALIGN_CEIL(elt_sz, pg_sz) * elt_num;
 	}
 
-	return (sz);
+	return sz;
 }
 
 /*
@@ -359,12 +359,12 @@ rte_mempool_xmem_usage(void *vaddr, uint32_t elt_num, size_t elt_sz,
 	if ((n = rte_mempool_obj_iter(vaddr, elt_num, elt_sz, 1,
 			paddr, pg_num, pg_shift, mempool_lelem_iter,
 			&uv)) != elt_num) {
-		return (-n);
+		return -n;
 	}
 
 	uv = RTE_ALIGN_CEIL(uv, pg_sz);
 	usz = uv - va;
-	return (usz);
+	return usz;
 }
 
 /* create the mempool */
@@ -376,18 +376,18 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
 		   int socket_id, unsigned flags)
 {
 #ifdef RTE_LIBRTE_XEN_DOM0
-	return (rte_dom0_mempool_create(name, n, elt_size,
+	return rte_dom0_mempool_create(name, n, elt_size,
 		cache_size, private_data_size,
 		mp_init, mp_init_arg,
 		obj_init, obj_init_arg,
-		socket_id, flags));
+		socket_id, flags);
 #else
-	return (rte_mempool_xmem_create(name, n, elt_size,
+	return rte_mempool_xmem_create(name, n, elt_size,
 		cache_size, private_data_size,
 		mp_init, mp_init_arg,
 		obj_init, obj_init_arg,
 		socket_id, flags,
-		NULL, NULL, MEMPOOL_PG_NUM_DEFAULT, MEMPOOL_PG_SHIFT_MAX));
+		NULL, NULL, MEMPOOL_PG_NUM_DEFAULT, MEMPOOL_PG_SHIFT_MAX);
 #endif
 }
 
diff --git a/lib/librte_net/rte_ip.h b/lib/librte_net/rte_ip.h
index cdbce20..71c519a 100644
--- a/lib/librte_net/rte_ip.h
+++ b/lib/librte_net/rte_ip.h
@@ -243,7 +243,7 @@ rte_ipv4_cksum(const struct ipv4_hdr *ipv4_hdr)
 {
 	uint16_t cksum;
 	cksum = rte_raw_cksum(ipv4_hdr, sizeof(struct ipv4_hdr));
-	return ((cksum == 0xffff) ? cksum : ~cksum);
+	return (cksum == 0xffff) ? cksum : ~cksum;
 }
 
 /**
diff --git a/lib/librte_pmd_e1000/em_ethdev.c b/lib/librte_pmd_e1000/em_ethdev.c
index da02988..8b24e8b 100644
--- a/lib/librte_pmd_e1000/em_ethdev.c
+++ b/lib/librte_pmd_e1000/em_ethdev.c
@@ -277,7 +277,7 @@ eth_em_dev_init(struct rte_eth_dev *eth_dev)
 	rte_intr_callback_register(&(pci_dev->intr_handle),
 		eth_em_interrupt_handler, (void *)eth_dev);
 
-	return (0);
+	return 0;
 }
 
 static struct eth_driver rte_em_pmd = {
@@ -375,11 +375,11 @@ em_hw_init(struct e1000_hw *hw)
 		PMD_INIT_LOG(ERR, "PHY reset is blocked due to "
 			"SOL/IDER session");
 	}
-	return (0);
+	return 0;
 
 error:
 	em_hw_control_release(hw);
-	return (diag);
+	return diag;
 }
 
 static int
@@ -392,7 +392,7 @@ eth_em_configure(struct rte_eth_dev *dev)
 	intr->flags |= E1000_FLAG_NEED_LINK_UPDATE;
 	PMD_INIT_FUNC_TRACE();
 
-	return (0);
+	return 0;
 }
 
 static void
@@ -476,7 +476,7 @@ eth_em_start(struct rte_eth_dev *dev)
 	/* Initialize the hardware */
 	if (em_hardware_init(hw)) {
 		PMD_INIT_LOG(ERR, "Unable to initialize the hardware");
-		return (-EIO);
+		return -EIO;
 	}
 
 	E1000_WRITE_REG(hw, E1000_VET, ETHER_TYPE_VLAN);
@@ -567,14 +567,14 @@ eth_em_start(struct rte_eth_dev *dev)
 
 	PMD_INIT_LOG(DEBUG, "<<");
 
-	return (0);
+	return 0;
 
 error_invalid_config:
 	PMD_INIT_LOG(ERR, "Invalid link_speed/link_duplex (%u/%u) for port %u",
 		     dev->data->dev_conf.link_speed,
 		     dev->data->dev_conf.link_duplex, dev->data->port_id);
 	em_dev_clear_queues(dev);
-	return (-EINVAL);
+	return -EINVAL;
 }
 
 /*********************************************************************
@@ -687,9 +687,9 @@ em_hardware_init(struct e1000_hw *hw)
 
 	diag = e1000_init_hw(hw);
 	if (diag < 0)
-		return (diag);
+		return diag;
 	e1000_check_for_link(hw);
-	return (0);
+	return 0;
 }
 
 /* This function is based on em_update_stats_counters() in e1000/if_em.c */
@@ -843,15 +843,15 @@ em_get_max_pktlen(const struct e1000_hw *hw)
 	case e1000_pch2lan:
 	case e1000_82574:
 	case e1000_80003es2lan: /* 9K Jumbo Frame size */
-		return (0x2412);
+		return 0x2412;
 	case e1000_pchlan:
-		return (0x1000);
+		return 0x1000;
 	/* Adapters that do not support jumbo frames */
 	case e1000_82583:
 	case e1000_ich8lan:
-		return (ETHER_MAX_LEN);
+		return ETHER_MAX_LEN;
 	default:
-		return (MAX_JUMBO_FRAME_SIZE);
+		return MAX_JUMBO_FRAME_SIZE;
 	}
 }
 
@@ -1223,7 +1223,7 @@ eth_em_interrupt_setup(struct rte_eth_dev *dev)
 
 	E1000_WRITE_REG(hw, E1000_IMS, E1000_ICR_LSC);
 	rte_intr_enable(&(dev->pci_dev->intr_handle));
-	return (0);
+	return 0;
 }
 
 /*
@@ -1349,7 +1349,7 @@ eth_em_led_on(struct rte_eth_dev *dev)
 	struct e1000_hw *hw;
 
 	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	return (e1000_led_on(hw) == E1000_SUCCESS ? 0 : -ENOTSUP);
+	return (e1000_led_on(hw) == E1000_SUCCESS) ? 0 : -ENOTSUP;
 }
 
 static int
@@ -1358,7 +1358,7 @@ eth_em_led_off(struct rte_eth_dev *dev)
 	struct e1000_hw *hw;
 
 	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	return (e1000_led_off(hw) == E1000_SUCCESS ? 0 : -ENOTSUP);
+	return (e1000_led_off(hw) == E1000_SUCCESS) ? 0 : -ENOTSUP;
 }
 
 static int
@@ -1430,7 +1430,7 @@ eth_em_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 	    (fc_conf->high_water < fc_conf->low_water)) {
 		PMD_INIT_LOG(ERR, "e1000 incorrect high/low water value");
 		PMD_INIT_LOG(ERR, "high water must <= 0x%x", max_high_water);
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	hw->fc.requested_mode = rte_fcmode_2_e1000_fcmode[fc_conf->mode];
@@ -1460,7 +1460,7 @@ eth_em_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 	}
 
 	PMD_INIT_LOG(ERR, "e1000_setup_link_generic = 0x%x", err);
-	return (-EIO);
+	return -EIO;
 }
 
 static void
diff --git a/lib/librte_pmd_e1000/em_rxtx.c b/lib/librte_pmd_e1000/em_rxtx.c
index 64d067c..fd21bd1 100644
--- a/lib/librte_pmd_e1000/em_rxtx.c
+++ b/lib/librte_pmd_e1000/em_rxtx.c
@@ -85,7 +85,7 @@ rte_rxmbuf_alloc(struct rte_mempool *mp)
 
 	m = __rte_mbuf_raw_alloc(mp);
 	__rte_mbuf_sanity_check_raw(m, 0);
-	return (m);
+	return m;
 }
 
 #define RTE_MBUF_DATA_DMA_ADDR(mb)             \
@@ -310,10 +310,10 @@ what_ctx_update(struct em_tx_queue *txq, uint64_t flags,
 	if (likely (txq->ctx_cache.flags == flags &&
 			((txq->ctx_cache.hdrlen.data ^ hdrlen.data) &
 			txq->ctx_cache.cmp_mask) == 0))
-		return (EM_CTX_0);
+		return EM_CTX_0;
 
 	/* Mismatch */
-	return (EM_CTX_NUM);
+	return EM_CTX_NUM;
 }
 
 /* Reset transmit descriptors after they have been used */
@@ -371,7 +371,7 @@ em_xmit_cleanup(struct em_tx_queue *txq)
 	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + nb_tx_to_clean);
 
 	/* No Error */
-	return (0);
+	return 0;
 }
 
 static inline uint32_t
@@ -383,7 +383,7 @@ tx_desc_cksum_flags_to_upper(uint64_t ol_flags)
 
 	tmp = l4_olinfo[(ol_flags & PKT_TX_L4_MASK) != PKT_TX_L4_NO_CKSUM];
 	tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
-	return (tmp);
+	return tmp;
 }
 
 uint16_t
@@ -492,7 +492,7 @@ eth_em_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 			if (em_xmit_cleanup(txq) != 0) {
 				/* Could not clean any descriptors */
 				if (nb_tx == 0)
-					return (0);
+					return 0;
 				goto end_of_tx;
 			}
 		}
@@ -629,7 +629,7 @@ end_of_tx:
 	E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
 	txq->tx_tail = tx_id;
 
-	return (nb_tx);
+	return nb_tx;
 }
 
 /*********************************************************************
@@ -658,7 +658,7 @@ rx_desc_error_to_pkt_flags(uint32_t rx_error)
 		pkt_flags |= PKT_RX_IP_CKSUM_BAD;
 	if (rx_error & E1000_RXD_ERR_TCPE)
 		pkt_flags |= PKT_RX_L4_CKSUM_BAD;
-	return (pkt_flags);
+	return pkt_flags;
 }
 
 uint16_t
@@ -832,7 +832,7 @@ eth_em_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
-	return (nb_rx);
+	return nb_rx;
 }
 
 uint16_t
@@ -1077,7 +1077,7 @@ eth_em_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
-	return (nb_rx);
+	return nb_rx;
 }
 
 /*
@@ -1115,7 +1115,7 @@ ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
 		queue_id);
 
 	if ((mz = rte_memzone_lookup(z_name)) != 0)
-		return (mz);
+		return mz;
 
 #ifdef RTE_LIBRTE_XEN_DOM0
 	return rte_memzone_reserve_bounded(z_name, ring_size,
@@ -1274,19 +1274,19 @@ eth_em_tx_queue_setup(struct rte_eth_dev *dev,
 	tsize = sizeof (txq->tx_ring[0]) * EM_MAX_RING_DESC;
 	if ((tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx, tsize,
 			socket_id)) == NULL)
-		return (-ENOMEM);
+		return -ENOMEM;
 
 	/* Allocate the tx queue data structure. */
 	if ((txq = rte_zmalloc("ethdev TX queue", sizeof(*txq),
 			RTE_CACHE_LINE_SIZE)) == NULL)
-		return (-ENOMEM);
+		return -ENOMEM;
 
 	/* Allocate software ring */
 	if ((txq->sw_ring = rte_zmalloc("txq->sw_ring",
 			sizeof(txq->sw_ring[0]) * nb_desc,
 			RTE_CACHE_LINE_SIZE)) == NULL) {
 		em_tx_queue_release(txq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	txq->nb_tx_desc = nb_desc;
@@ -1312,7 +1312,7 @@ eth_em_tx_queue_setup(struct rte_eth_dev *dev,
 	em_reset_tx_queue(txq);
 
 	dev->data->tx_queues[queue_idx] = txq;
-	return (0);
+	return 0;
 }
 
 static void
@@ -1379,7 +1379,7 @@ eth_em_rx_queue_setup(struct rte_eth_dev *dev,
 	if (((nb_desc * sizeof(rxq->rx_ring[0])) % EM_ALIGN) != 0 ||
 			(nb_desc > EM_MAX_RING_DESC) ||
 			(nb_desc < EM_MIN_RING_DESC)) {
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	/*
@@ -1388,7 +1388,7 @@ eth_em_rx_queue_setup(struct rte_eth_dev *dev,
 	if (rx_conf->rx_drop_en) {
 		PMD_INIT_LOG(ERR, "drop_en functionality not supported by "
 			     "device");
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	/* Free memory prior to re-allocation if needed. */
@@ -1401,19 +1401,19 @@ eth_em_rx_queue_setup(struct rte_eth_dev *dev,
 	rsize = sizeof (rxq->rx_ring[0]) * EM_MAX_RING_DESC;
 	if ((rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx, rsize,
 			socket_id)) == NULL)
-		return (-ENOMEM);
+		return -ENOMEM;
 
 	/* Allocate the RX queue data structure. */
 	if ((rxq = rte_zmalloc("ethdev RX queue", sizeof(*rxq),
 			RTE_CACHE_LINE_SIZE)) == NULL)
-		return (-ENOMEM);
+		return -ENOMEM;
 
 	/* Allocate software ring. */
 	if ((rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
 			sizeof (rxq->sw_ring[0]) * nb_desc,
 			RTE_CACHE_LINE_SIZE)) == NULL) {
 		em_rx_queue_release(rxq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	rxq->mb_pool = mp;
@@ -1442,7 +1442,7 @@ eth_em_rx_queue_setup(struct rte_eth_dev *dev,
 	dev->data->rx_queues[queue_idx] = rxq;
 	em_reset_rx_queue(rxq);
 
-	return (0);
+	return 0;
 }
 
 uint32_t
@@ -1575,12 +1575,12 @@ em_rctl_bsize(__rte_unused enum e1000_mac_type hwtyp, uint32_t *bufsz)
 			i++) {
 		if (rctl_bsize >= bufsz_to_rctl[i].bufsz) {
 			*bufsz = bufsz_to_rctl[i].bufsz;
-			return (bufsz_to_rctl[i].rctl);
+			return bufsz_to_rctl[i].rctl;
 		}
 	}
 
 	/* Should never happen. */
-	return (-EINVAL);
+	return -EINVAL;
 }
 
 static int
@@ -1601,7 +1601,7 @@ em_alloc_rx_queue_mbufs(struct em_rx_queue *rxq)
 		if (mbuf == NULL) {
 			PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
 				     "queue_id=%hu", rxq->queue_id);
-			return (-ENOMEM);
+			return -ENOMEM;
 		}
 
 		dma_addr = rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));
diff --git a/lib/librte_pmd_e1000/igb_ethdev.c b/lib/librte_pmd_e1000/igb_ethdev.c
index 4415155..c5ce6ba 100644
--- a/lib/librte_pmd_e1000/igb_ethdev.c
+++ b/lib/librte_pmd_e1000/igb_ethdev.c
@@ -605,7 +605,7 @@ eth_igb_dev_init(struct rte_eth_dev *eth_dev)
 err_late:
 	igb_hw_control_release(hw);
 
-	return (error);
+	return error;
 }
 
 /*
@@ -731,7 +731,7 @@ rte_igbvf_pmd_init(const char *name __rte_unused, const char *params __rte_unuse
 	PMD_INIT_FUNC_TRACE();
 
 	rte_eth_driver_register(&rte_igbvf_pmd);
-	return (0);
+	return 0;
 }
 
 static int
@@ -744,7 +744,7 @@ eth_igb_configure(struct rte_eth_dev *dev)
 	intr->flags |= E1000_FLAG_NEED_LINK_UPDATE;
 	PMD_INIT_FUNC_TRACE();
 
-	return (0);
+	return 0;
 }
 
 static int
@@ -778,7 +778,7 @@ eth_igb_start(struct rte_eth_dev *dev)
 	/* Initialize the hardware */
 	if (igb_hardware_init(hw)) {
 		PMD_INIT_LOG(ERR, "Unable to initialize the hardware");
-		return (-EIO);
+		return -EIO;
 	}
 
 	E1000_WRITE_REG(hw, E1000_VET, ETHER_TYPE_VLAN << 16 | ETHER_TYPE_VLAN);
@@ -904,14 +904,14 @@ eth_igb_start(struct rte_eth_dev *dev)
 
 	PMD_INIT_LOG(DEBUG, "<<");
 
-	return (0);
+	return 0;
 
 error_invalid_config:
 	PMD_INIT_LOG(ERR, "Invalid link_speed/link_duplex (%u/%u) for port %u",
 		     dev->data->dev_conf.link_speed,
 		     dev->data->dev_conf.link_duplex, dev->data->port_id);
 	igb_dev_clear_queues(dev);
-	return (-EINVAL);
+	return -EINVAL;
 }
 
 /*********************************************************************
@@ -1075,13 +1075,13 @@ igb_hardware_init(struct e1000_hw *hw)
 
 	diag = e1000_init_hw(hw);
 	if (diag < 0)
-		return (diag);
+		return diag;
 
 	E1000_WRITE_REG(hw, E1000_VET, ETHER_TYPE_VLAN << 16 | ETHER_TYPE_VLAN);
 	e1000_get_phy_info(hw);
 	e1000_check_for_link(hw);
 
-	return (0);
+	return 0;
 }
 
 /* This function is based on igb_update_stats_counters() in igb/if_igb.c */
@@ -1968,7 +1968,7 @@ eth_igb_led_on(struct rte_eth_dev *dev)
 	struct e1000_hw *hw;
 
 	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	return (e1000_led_on(hw) == E1000_SUCCESS ? 0 : -ENOTSUP);
+	return (e1000_led_on(hw) == E1000_SUCCESS) ? 0 : -ENOTSUP;
 }
 
 static int
@@ -1977,7 +1977,7 @@ eth_igb_led_off(struct rte_eth_dev *dev)
 	struct e1000_hw *hw;
 
 	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	return (e1000_led_off(hw) == E1000_SUCCESS ? 0 : -ENOTSUP);
+	return (e1000_led_off(hw) == E1000_SUCCESS) ? 0 : -ENOTSUP;
 }
 
 static int
@@ -2049,7 +2049,7 @@ eth_igb_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 	    (fc_conf->high_water < fc_conf->low_water)) {
 		PMD_INIT_LOG(ERR, "e1000 incorrect high/low water value");
 		PMD_INIT_LOG(ERR, "high water must <=  0x%x", max_high_water);
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	hw->fc.requested_mode = rte_fcmode_2_e1000_fcmode[fc_conf->mode];
@@ -2079,7 +2079,7 @@ eth_igb_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 	}
 
 	PMD_INIT_LOG(ERR, "e1000_setup_link_generic = 0x%x", err);
-	return (-EIO);
+	return -EIO;
 }
 
 #define E1000_RAH_POOLSEL_SHIFT      (18)
@@ -2284,7 +2284,7 @@ static int igbvf_set_vfta(struct e1000_hw *hw, uint16_t vid, bool on)
 	if (on)
 		msgbuf[0] |= E1000_VF_SET_VLAN_ADD;
 
-	return (mbx->ops.write_posted(hw, msgbuf, 2, 0));
+	return mbx->ops.write_posted(hw, msgbuf, 2, 0);
 }
 
 static void igbvf_set_vfta_all(struct rte_eth_dev *dev, bool on)
diff --git a/lib/librte_pmd_e1000/igb_rxtx.c b/lib/librte_pmd_e1000/igb_rxtx.c
index 80d05c0..1a61e4a 100644
--- a/lib/librte_pmd_e1000/igb_rxtx.c
+++ b/lib/librte_pmd_e1000/igb_rxtx.c
@@ -85,7 +85,7 @@ rte_rxmbuf_alloc(struct rte_mempool *mp)
 
 	m = __rte_mbuf_raw_alloc(mp);
 	__rte_mbuf_sanity_check_raw(m, 0);
-	return (m);
+	return m;
 }
 
 #define RTE_MBUF_DATA_DMA_ADDR(mb) \
@@ -321,7 +321,7 @@ what_advctx_update(struct igb_tx_queue *txq, uint64_t flags,
 	}
 
 	/* Mismatch, use the previous context */
-	return (IGB_CTX_NUM);
+	return IGB_CTX_NUM;
 }
 
 static inline uint32_t
@@ -473,7 +473,7 @@ eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		 */
 		if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
 			if (nb_tx == 0)
-				return (0);
+				return 0;
 			goto end_of_tx;
 		}
 
@@ -582,7 +582,7 @@ eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		   (unsigned) tx_id, (unsigned) nb_tx);
 	txq->tx_tail = tx_id;
 
-	return (nb_tx);
+	return nb_tx;
 }
 
 /*********************************************************************
@@ -821,7 +821,7 @@ eth_igb_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
-	return (nb_rx);
+	return nb_rx;
 }
 
 uint16_t
@@ -1074,7 +1074,7 @@ eth_igb_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
-	return (nb_rx);
+	return nb_rx;
 }
 
 /*
@@ -1244,7 +1244,7 @@ eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
 	txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
 							RTE_CACHE_LINE_SIZE);
 	if (txq == NULL)
-		return (-ENOMEM);
+		return -ENOMEM;
 
 	/*
 	 * Allocate TX ring hardware descriptors. A memzone large enough to
@@ -1256,7 +1256,7 @@ eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
 					size, socket_id);
 	if (tz == NULL) {
 		igb_tx_queue_release(txq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	txq->nb_tx_desc = nb_desc;
@@ -1283,7 +1283,7 @@ eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
 				   RTE_CACHE_LINE_SIZE);
 	if (txq->sw_ring == NULL) {
 		igb_tx_queue_release(txq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 	PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
 		     txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
@@ -1292,7 +1292,7 @@ eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
 	dev->tx_pkt_burst = eth_igb_xmit_pkts;
 	dev->data->tx_queues[queue_idx] = txq;
 
-	return (0);
+	return 0;
 }
 
 static void
@@ -1364,7 +1364,7 @@ eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
 	 */
 	if (((nb_desc * sizeof(union e1000_adv_rx_desc)) % IGB_ALIGN) != 0 ||
 	    (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	/* Free memory prior to re-allocation if needed */
@@ -1377,7 +1377,7 @@ eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
 	rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
 			  RTE_CACHE_LINE_SIZE);
 	if (rxq == NULL)
-		return (-ENOMEM);
+		return -ENOMEM;
 	rxq->mb_pool = mp;
 	rxq->nb_rx_desc = nb_desc;
 	rxq->pthresh = rx_conf->rx_thresh.pthresh;
@@ -1403,7 +1403,7 @@ eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
 	rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx, size, socket_id);
 	if (rz == NULL) {
 		igb_rx_queue_release(rxq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 	rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
 	rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
@@ -1420,7 +1420,7 @@ eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
 				   RTE_CACHE_LINE_SIZE);
 	if (rxq->sw_ring == NULL) {
 		igb_rx_queue_release(rxq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 	PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
 		     rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
@@ -1863,7 +1863,7 @@ igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
 		if (mbuf == NULL) {
 			PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
 				     "queue_id=%hu", rxq->queue_id);
-			return (-ENOMEM);
+			return -ENOMEM;
 		}
 		dma_addr =
 			rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));
diff --git a/lib/librte_pmd_fm10k/fm10k_ethdev.c b/lib/librte_pmd_fm10k/fm10k_ethdev.c
index 275c19c..c6a4dd2 100644
--- a/lib/librte_pmd_fm10k/fm10k_ethdev.c
+++ b/lib/librte_pmd_fm10k/fm10k_ethdev.c
@@ -876,7 +876,7 @@ handle_rxconf(struct fm10k_rx_queue *q, const struct rte_eth_rxconf *conf)
 			rx_free_thresh, FM10K_RX_FREE_THRESH_MAX(q),
 			FM10K_RX_FREE_THRESH_MIN(q),
 			FM10K_RX_FREE_THRESH_DIV(q));
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	q->alloc_thresh = rx_free_thresh;
@@ -936,7 +936,7 @@ fm10k_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 	/* make sure the mempool element size can account for alignment. */
 	if (!mempool_element_size_valid(mp)) {
 		PMD_INIT_LOG(ERR, "Error : Mempool element size is too small");
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	/* make sure a valid number of descriptors have been requested */
@@ -948,7 +948,7 @@ fm10k_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 			"and a multiple of %u",
 			nb_desc, (uint32_t)FM10K_MAX_RX_DESC, FM10K_MIN_RX_DESC,
 			FM10K_MULT_RX_DESC);
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	/*
@@ -966,7 +966,7 @@ fm10k_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 				socket_id);
 	if (q == NULL) {
 		PMD_INIT_LOG(ERR, "Cannot allocate queue structure");
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	/* setup queue */
@@ -977,7 +977,7 @@ fm10k_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 	q->tail_ptr = (volatile uint32_t *)
 		&((uint32_t *)hw->hw_addr)[FM10K_RDT(queue_id)];
 	if (handle_rxconf(q, conf))
-		return (-EINVAL);
+		return -EINVAL;
 
 	/* allocate memory for the software ring */
 	q->sw_ring = rte_zmalloc_socket("fm10k sw ring",
@@ -986,7 +986,7 @@ fm10k_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 	if (q->sw_ring == NULL) {
 		PMD_INIT_LOG(ERR, "Cannot allocate software ring");
 		rte_free(q);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	/*
@@ -1001,7 +1001,7 @@ fm10k_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 		PMD_INIT_LOG(ERR, "Cannot allocate hardware ring");
 		rte_free(q->sw_ring);
 		rte_free(q);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 	q->hw_ring = mz->addr;
 	q->hw_ring_phys_addr = mz->phys_addr;
@@ -1043,7 +1043,7 @@ handle_txconf(struct fm10k_tx_queue *q, const struct rte_eth_txconf *conf)
 			tx_free_thresh, FM10K_TX_FREE_THRESH_MAX(q),
 			FM10K_TX_FREE_THRESH_MIN(q),
 			FM10K_TX_FREE_THRESH_DIV(q));
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	q->free_thresh = tx_free_thresh;
@@ -1067,7 +1067,7 @@ handle_txconf(struct fm10k_tx_queue *q, const struct rte_eth_txconf *conf)
 			tx_rs_thresh, FM10K_TX_RS_THRESH_MAX(q),
 			FM10K_TX_RS_THRESH_MIN(q),
 			FM10K_TX_RS_THRESH_DIV(q));
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	q->rs_thresh = tx_rs_thresh;
@@ -1095,7 +1095,7 @@ fm10k_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 			"and a multiple of %u",
 			nb_desc, (uint32_t)FM10K_MAX_TX_DESC, FM10K_MIN_TX_DESC,
 			FM10K_MULT_TX_DESC);
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	/*
@@ -1113,7 +1113,7 @@ fm10k_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 				socket_id);
 	if (q == NULL) {
 		PMD_INIT_LOG(ERR, "Cannot allocate queue structure");
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	/* setup queue */
@@ -1123,7 +1123,7 @@ fm10k_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 	q->tail_ptr = (volatile uint32_t *)
 		&((uint32_t *)hw->hw_addr)[FM10K_TDT(queue_id)];
 	if (handle_txconf(q, conf))
-		return (-EINVAL);
+		return -EINVAL;
 
 	/* allocate memory for the software ring */
 	q->sw_ring = rte_zmalloc_socket("fm10k sw ring",
@@ -1132,7 +1132,7 @@ fm10k_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 	if (q->sw_ring == NULL) {
 		PMD_INIT_LOG(ERR, "Cannot allocate software ring");
 		rte_free(q);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	/*
@@ -1147,7 +1147,7 @@ fm10k_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 		PMD_INIT_LOG(ERR, "Cannot allocate hardware ring");
 		rte_free(q->sw_ring);
 		rte_free(q);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 	q->hw_ring = mz->addr;
 	q->hw_ring_phys_addr = mz->phys_addr;
@@ -1164,7 +1164,7 @@ fm10k_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id,
 		PMD_INIT_LOG(ERR, "Cannot allocate RS bit tracker");
 		rte_free(q->sw_ring);
 		rte_free(q);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	dev->data->tx_queues[queue_id] = q;
diff --git a/lib/librte_pmd_i40e/i40e_rxtx.c b/lib/librte_pmd_i40e/i40e_rxtx.c
index 493cfa3..38a6536 100644
--- a/lib/librte_pmd_i40e/i40e_rxtx.c
+++ b/lib/librte_pmd_i40e/i40e_rxtx.c
@@ -1647,7 +1647,7 @@ i40e_get_queue_offset_by_qindex(struct i40e_pf *pf, uint16_t queue_idx)
 		return queue_idx % pf->vmdq_nb_qps;
 	else {
 		PMD_INIT_LOG(ERR, "Fail to get queue offset");
-		return (uint16_t)(-1);
+		return (uint16_t)-1;
 	}
 }
 
@@ -1821,7 +1821,7 @@ i40e_dev_rx_queue_setup(struct rte_eth_dev *dev,
 	if (!rxq) {
 		PMD_DRV_LOG(ERR, "Failed to allocate memory for "
 			    "rx queue data structure");
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 	rxq->mp = mp;
 	rxq->nb_rx_desc = nb_desc;
@@ -1851,7 +1851,7 @@ i40e_dev_rx_queue_setup(struct rte_eth_dev *dev,
 	if (!rz) {
 		i40e_dev_rx_queue_release(rxq);
 		PMD_DRV_LOG(ERR, "Failed to reserve DMA memory for RX");
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	/* Zero all the descriptors in the ring. */
@@ -1880,7 +1880,7 @@ i40e_dev_rx_queue_setup(struct rte_eth_dev *dev,
 	if (!rxq->sw_ring) {
 		i40e_dev_rx_queue_release(rxq);
 		PMD_DRV_LOG(ERR, "Failed to allocate memory for SW ring");
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	i40e_reset_rx_queue(rxq);
@@ -2105,7 +2105,7 @@ i40e_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	if (!txq) {
 		PMD_DRV_LOG(ERR, "Failed to allocate memory for "
 			    "tx queue structure");
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	/* Allocate TX hardware ring descriptors. */
@@ -2119,7 +2119,7 @@ i40e_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	if (!tz) {
 		i40e_dev_tx_queue_release(txq);
 		PMD_DRV_LOG(ERR, "Failed to reserve DMA memory for TX");
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	txq->nb_tx_desc = nb_desc;
@@ -2156,7 +2156,7 @@ i40e_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	if (!txq->sw_ring) {
 		i40e_dev_tx_queue_release(txq);
 		PMD_DRV_LOG(ERR, "Failed to allocate memory for SW TX ring");
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	i40e_reset_tx_queue(txq);
diff --git a/lib/librte_pmd_ixgbe/ixgbe_82599_bypass.c b/lib/librte_pmd_ixgbe/ixgbe_82599_bypass.c
index 12cc01d..0d7081d 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_82599_bypass.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_82599_bypass.c
@@ -268,7 +268,7 @@ ixgbe_bypass_get_media_type(struct ixgbe_hw *hw)
 	} else {
 		media_type = ixgbe_get_media_type_82599(hw);
 	}
-	return (media_type);
+	return media_type;
 }
 
 /*
@@ -310,5 +310,5 @@ ixgbe_bypass_init_hw(struct ixgbe_hw *hw)
                 hw->mac.ops.flap_tx_laser = NULL;
 	}
 
-	return (rc);
+	return rc;
 }
diff --git a/lib/librte_pmd_ixgbe/ixgbe_bypass.c b/lib/librte_pmd_ixgbe/ixgbe_bypass.c
index 832f415..e846946 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_bypass.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_bypass.c
@@ -153,7 +153,7 @@ ixgbe_bypass_state_show(struct rte_eth_dev *dev, u32 *state)
 	 */
 	*state = (by_ctl >> BYPASS_STATUS_OFF_SHIFT) &  BYPASS_STATUS_OFF_MASK;
 
-	return (ret_val);
+	return ret_val;
 }
 
 
diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
index 5f9a1cf..aad7b27 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
@@ -1025,7 +1025,7 @@ eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev)
 	 */
 	if ((diag != IXGBE_SUCCESS) && (diag != IXGBE_ERR_INVALID_MAC_ADDR)) {
 		PMD_INIT_LOG(ERR, "VF Initialization Failure: %d", diag);
-		return (diag);
+		return diag;
 	}
 
 	/* negotiate mailbox API version to use with the PF. */
@@ -1076,7 +1076,7 @@ eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev)
 
 		default:
 			PMD_INIT_LOG(ERR, "VF Initialization Failure: %d", diag);
-			return (-EIO);
+			return -EIO;
 	}
 
 	PMD_INIT_LOG(DEBUG, "port %d vendorID=0x%x deviceID=0x%x mac.type=%s",
@@ -1134,7 +1134,7 @@ rte_ixgbevf_pmd_init(const char *name __rte_unused, const char *param __rte_unus
 	PMD_INIT_FUNC_TRACE();
 
 	rte_eth_driver_register(&rte_ixgbevf_pmd);
-	return (0);
+	return 0;
 }
 
 static int
@@ -1597,7 +1597,7 @@ skip_link_setup:
 
 	ixgbe_restore_statistics_mapping(dev);
 
-	return (0);
+	return 0;
 
 error:
 	PMD_INIT_LOG(ERR, "failure in ixgbe_dev_start(): %d", err);
@@ -2451,7 +2451,7 @@ ixgbe_dev_led_on(struct rte_eth_dev *dev)
 	struct ixgbe_hw *hw;
 
 	hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	return (ixgbe_led_on(hw, 0) == IXGBE_SUCCESS ? 0 : -ENOTSUP);
+	return (ixgbe_led_on(hw, 0) == IXGBE_SUCCESS) ? 0 : -ENOTSUP;
 }
 
 static int
@@ -2460,7 +2460,7 @@ ixgbe_dev_led_off(struct rte_eth_dev *dev)
 	struct ixgbe_hw *hw;
 
 	hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	return (ixgbe_led_off(hw, 0) == IXGBE_SUCCESS ? 0 : -ENOTSUP);
+	return (ixgbe_led_off(hw, 0) == IXGBE_SUCCESS) ? 0 : -ENOTSUP;
 }
 
 static int
@@ -2544,7 +2544,7 @@ ixgbe_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 		(fc_conf->high_water < fc_conf->low_water)) {
 		PMD_INIT_LOG(ERR, "Invalid high/low water setup value in KB");
 		PMD_INIT_LOG(ERR, "High_water must <= 0x%x", max_high_water);
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	hw->fc.requested_mode = rte_fcmode_2_ixgbe_fcmode[fc_conf->mode];
@@ -2765,7 +2765,7 @@ ixgbe_priority_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_pfc_conf *p
 	    (pfc_conf->fc.high_water <= pfc_conf->fc.low_water)) {
 		PMD_INIT_LOG(ERR, "Invalid high/low water setup value in KB");
 		PMD_INIT_LOG(ERR, "High_water must <= 0x%x", max_high_water);
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	hw->fc.requested_mode = rte_fcmode_2_ixgbe_fcmode[pfc_conf->fc.mode];
@@ -3145,7 +3145,7 @@ ixgbe_vmdq_mode_check(struct ixgbe_hw *hw)
 	reg_val = IXGBE_READ_REG(hw, IXGBE_VT_CTL);
 	if (!(reg_val & IXGBE_VT_CTL_VT_ENABLE)) {
 		PMD_INIT_LOG(ERR, "VMDq must be enabled for this setting");
-		return (-1);
+		return -1;
 	}
 
 	return 0;
@@ -3202,7 +3202,7 @@ ixgbe_uc_hash_table_set(struct rte_eth_dev *dev,struct ether_addr* mac_addr,
 
 	/* The UTA table only exists on 82599 hardware and newer */
 	if (hw->mac.type < ixgbe_mac_82599EB)
-		return (-ENOTSUP);
+		return -ENOTSUP;
 
 	vector = ixgbe_uta_vector(hw,mac_addr);
 	uta_idx = (vector >> ixgbe_uta_bit_shift) & ixgbe_uta_idx_mask;
@@ -3245,7 +3245,7 @@ ixgbe_uc_all_hash_table_set(struct rte_eth_dev *dev, uint8_t on)
 
 	/* The UTA table only exists on 82599 hardware and newer */
 	if (hw->mac.type < ixgbe_mac_82599EB)
-		return (-ENOTSUP);
+		return -ENOTSUP;
 
 	if(on) {
 		for (i = 0; i < ETH_VMDQ_NUM_UC_HASH_ARRAY; i++) {
@@ -3294,10 +3294,10 @@ ixgbe_set_pool_rx_mode(struct rte_eth_dev *dev, uint16_t pool,
 	if (hw->mac.type == ixgbe_mac_82598EB) {
 		PMD_INIT_LOG(ERR, "setting VF receive mode set should be done"
 			     " on 82599 hardware and newer");
-		return (-ENOTSUP);
+		return -ENOTSUP;
 	}
 	if (ixgbe_vmdq_mode_check(hw) < 0)
-		return (-ENOTSUP);
+		return -ENOTSUP;
 
 	val = ixgbe_convert_vm_rx_mask_to_val(rx_mask, val);
 
@@ -3322,7 +3322,7 @@ ixgbe_set_pool_rx(struct rte_eth_dev *dev, uint16_t pool, uint8_t on)
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
 	if (ixgbe_vmdq_mode_check(hw) < 0)
-		return (-ENOTSUP);
+		return -ENOTSUP;
 
 	addr = IXGBE_VFRE(pool >= ETH_64_POOLS/2);
 	reg = IXGBE_READ_REG(hw, addr);
@@ -3349,7 +3349,7 @@ ixgbe_set_pool_tx(struct rte_eth_dev *dev, uint16_t pool, uint8_t on)
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
 	if (ixgbe_vmdq_mode_check(hw) < 0)
-		return (-ENOTSUP);
+		return -ENOTSUP;
 
 	addr = IXGBE_VFTE(pool >= ETH_64_POOLS/2);
 	reg = IXGBE_READ_REG(hw, addr);
@@ -3375,7 +3375,7 @@ ixgbe_set_pool_vlan_filter(struct rte_eth_dev *dev, uint16_t vlan,
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
 	if (ixgbe_vmdq_mode_check(hw) < 0)
-		return (-ENOTSUP);
+		return -ENOTSUP;
 	for (pool_idx = 0; pool_idx < ETH_64_POOLS; pool_idx++) {
 		if (pool_mask & ((uint64_t)(1ULL << pool_idx)))
 			ret = hw->mac.ops.set_vfta(hw,vlan,pool_idx,vlan_on);
@@ -3412,12 +3412,12 @@ ixgbe_mirror_rule_set(struct rte_eth_dev *dev,
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
 	if (ixgbe_vmdq_mode_check(hw) < 0)
-		return (-ENOTSUP);
+		return -ENOTSUP;
 
 	/* Check if vlan mask is valid */
 	if ((mirror_conf->rule_type_mask & ETH_VMDQ_VLAN_MIRROR) && (on)) {
 		if (mirror_conf->vlan.vlan_mask == 0)
-			return (-EINVAL);
+			return -EINVAL;
 	}
 
 	/* Check if vlan id is valid and find conresponding VLAN ID index in VLVF */
@@ -3428,14 +3428,14 @@ ixgbe_mirror_rule_set(struct rte_eth_dev *dev,
 				reg_index = ixgbe_find_vlvf_slot(hw,
 						mirror_conf->vlan.vlan_id[i]);
 				if(reg_index < 0)
-					return (-EINVAL);
+					return -EINVAL;
 				vlvf = IXGBE_READ_REG(hw, IXGBE_VLVF(reg_index));
 				if ((vlvf & IXGBE_VLVF_VIEN) &&
 					((vlvf & IXGBE_VLVF_VLANID_MASK)
 						== mirror_conf->vlan.vlan_id[i]))
 					vlan_mask |= (1ULL << reg_index);
 				else
-					return (-EINVAL);
+					return -EINVAL;
 			}
 		}
 
@@ -3523,7 +3523,7 @@ ixgbe_mirror_rule_reset(struct rte_eth_dev *dev, uint8_t rule_id)
 		(IXGBE_DEV_PRIVATE_TO_PFDATA(dev->data->dev_private));
 
 	if (ixgbe_vmdq_mode_check(hw) < 0)
-		return (-ENOTSUP);
+		return -ENOTSUP;
 
 	memset(&mr_info->mr_conf[rule_id], 0,
 		sizeof(struct rte_eth_vmdq_mirror_conf));
diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
index 57c9430..8323bf9 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
@@ -94,7 +94,7 @@ rte_rxmbuf_alloc(struct rte_mempool *mp)
 
 	m = __rte_mbuf_raw_alloc(mp);
 	__rte_mbuf_sanity_check_raw(m, 0);
-	return (m);
+	return m;
 }
 
 
@@ -461,7 +461,7 @@ what_advctx_update(struct ixgbe_tx_queue *txq, uint64_t flags,
 	}
 
 	/* Mismatch, use the previous context */
-	return (IXGBE_CTX_NUM);
+	return IXGBE_CTX_NUM;
 }
 
 static inline uint32_t
@@ -552,7 +552,7 @@ ixgbe_xmit_cleanup(struct ixgbe_tx_queue *txq)
 	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + nb_tx_to_clean);
 
 	/* No Error */
-	return (0);
+	return 0;
 }
 
 uint16_t
@@ -668,7 +668,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 			if (ixgbe_xmit_cleanup(txq) != 0) {
 				/* Could not clean any descriptors */
 				if (nb_tx == 0)
-					return (0);
+					return 0;
 				goto end_of_tx;
 			}
 
@@ -697,7 +697,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 						 * descriptors
 						 */
 						if (nb_tx == 0)
-							return (0);
+							return 0;
 						goto end_of_tx;
 					}
 				}
@@ -847,7 +847,7 @@ end_of_tx:
 	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
 	txq->tx_tail = tx_id;
 
-	return (nb_tx);
+	return nb_tx;
 }
 
 /*********************************************************************
@@ -1037,7 +1037,7 @@ ixgbe_rx_alloc_bufs(struct ixgbe_rx_queue *rxq, bool reset_mbuf)
 	diag = rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
 				    rxq->rx_free_thresh);
 	if (unlikely(diag != 0))
-		return (-ENOMEM);
+		return -ENOMEM;
 
 	rxdp = &rxq->rx_ring[alloc_idx];
 	for (i = 0; i < rxq->rx_free_thresh; ++i) {
@@ -1372,7 +1372,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
-	return (nb_rx);
+	return nb_rx;
 }
 
 /**
@@ -2003,7 +2003,7 @@ ixgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	txq = rte_zmalloc_socket("ethdev TX queue", sizeof(struct ixgbe_tx_queue),
 				 RTE_CACHE_LINE_SIZE, socket_id);
 	if (txq == NULL)
-		return (-ENOMEM);
+		return -ENOMEM;
 
 	/*
 	 * Allocate TX ring hardware descriptors. A memzone large enough to
@@ -2015,7 +2015,7 @@ ixgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
 			socket_id);
 	if (tz == NULL) {
 		ixgbe_tx_queue_release(txq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	txq->nb_tx_desc = nb_desc;
@@ -2055,7 +2055,7 @@ ixgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
 				RTE_CACHE_LINE_SIZE, socket_id);
 	if (txq->sw_ring == NULL) {
 		ixgbe_tx_queue_release(txq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 	PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
 		     txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
@@ -2068,7 +2068,7 @@ ixgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	dev->data->tx_queues[queue_idx] = txq;
 
 
-	return (0);
+	return 0;
 }
 
 /**
@@ -2284,7 +2284,7 @@ ixgbe_dev_rx_queue_setup(struct rte_eth_dev *dev,
 	if (((nb_desc * sizeof(union ixgbe_adv_rx_desc)) % IXGBE_ALIGN) != 0 ||
 	    (nb_desc > IXGBE_MAX_RING_DESC) ||
 	    (nb_desc < IXGBE_MIN_RING_DESC)) {
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	/* Free memory prior to re-allocation if needed... */
@@ -2297,7 +2297,7 @@ ixgbe_dev_rx_queue_setup(struct rte_eth_dev *dev,
 	rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct ixgbe_rx_queue),
 				 RTE_CACHE_LINE_SIZE, socket_id);
 	if (rxq == NULL)
-		return (-ENOMEM);
+		return -ENOMEM;
 	rxq->mb_pool = mp;
 	rxq->nb_rx_desc = nb_desc;
 	rxq->rx_free_thresh = rx_conf->rx_free_thresh;
@@ -2319,7 +2319,7 @@ ixgbe_dev_rx_queue_setup(struct rte_eth_dev *dev,
 				   RX_RING_SZ, socket_id);
 	if (rz == NULL) {
 		ixgbe_rx_queue_release(rxq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	/*
@@ -2379,7 +2379,7 @@ ixgbe_dev_rx_queue_setup(struct rte_eth_dev *dev,
 					  RTE_CACHE_LINE_SIZE, socket_id);
 	if (!rxq->sw_ring) {
 		ixgbe_rx_queue_release(rxq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	/*
@@ -2396,7 +2396,7 @@ ixgbe_dev_rx_queue_setup(struct rte_eth_dev *dev,
 				   RTE_CACHE_LINE_SIZE, socket_id);
 	if (!rxq->sw_sc_ring) {
 		ixgbe_rx_queue_release(rxq);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	PMD_INIT_LOG(DEBUG, "sw_ring=%p sw_sc_ring=%p hw_ring=%p "
@@ -3452,7 +3452,7 @@ ixgbe_alloc_rx_queue_mbufs(struct ixgbe_rx_queue *rxq)
 		if (mbuf == NULL) {
 			PMD_INIT_LOG(ERR, "RX mbuf alloc failed queue_id=%u",
 				     (unsigned) rxq->queue_id);
-			return (-ENOMEM);
+			return -ENOMEM;
 		}
 
 		rte_mbuf_refcnt_set(mbuf, 1);
diff --git a/lib/librte_pmd_virtio/virtio_ethdev.c b/lib/librte_pmd_virtio/virtio_ethdev.c
index e63dbfb..5eac60a 100644
--- a/lib/librte_pmd_virtio/virtio_ethdev.c
+++ b/lib/librte_pmd_virtio/virtio_ethdev.c
@@ -296,7 +296,7 @@ int virtio_dev_queue_setup(struct rte_eth_dev *dev,
 	}
 	if (vq == NULL) {
 		PMD_INIT_LOG(ERR, "%s: Can not allocate virtqueue", __func__);
-		return (-ENOMEM);
+		return -ENOMEM;
 	}
 
 	vq->hw = hw;
@@ -1293,7 +1293,7 @@ virtio_dev_configure(struct rte_eth_dev *dev)
 
 	if (rxmode->hw_ip_checksum) {
 		PMD_DRV_LOG(ERR, "HW IP checksum not supported");
-		return (-EINVAL);
+		return -EINVAL;
 	}
 
 	hw->vlan_strip = rxmode->hw_vlan_strip;
-- 
2.4.0

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 08/10] ixgbe: enable rx queue interrupts for both PF and VF
  2015-05-11  5:31         ` Liang, Cunming
@ 2015-05-11 15:00           ` Stephen Hemminger
  2015-05-12  1:07             ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-11 15:00 UTC (permalink / raw)
  To: Liang, Cunming; +Cc: dev

On Mon, 11 May 2015 13:31:04 +0800
"Liang, Cunming" <cunming.liang@intel.com> wrote:

> > Since MSI-X vectors are limited on many hardware platforms, this whole API
> > should be changed so that max_intr is based on number of rx_queues actually
> > used by the application.  That means the setup needs to move from init to configure.  
> [LCM] When MSI-X is not used, intr_vec and set max_intr are useless. It 
> doesn't matter to non MSI-X mode.
> As it allows the sequence "dev_stop->dev_reconfig->dev_start", the real 
> used number of queue may change.
> So allocation only on dev_init and release only on dev_close, just make 
> it simple. During configure_msix, it do use the real useful queue number 
> to set queue/vector mapping, refer xxx_configure_msix().

The problem is that if a customer has 16 NICs with 32 MSI vectors per NIC,
it may be that the MSI table in the south bridge gets full. That is why the ixgbe
driver for Linux limits itself to num_online_cpu() + 1 MSI interrupts.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 08/10] ixgbe: enable rx queue interrupts for both PF and VF
  2015-05-11 15:00           ` Stephen Hemminger
@ 2015-05-12  1:07             ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-05-12  1:07 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev



On 5/11/2015 11:00 PM, Stephen Hemminger wrote:
> On Mon, 11 May 2015 13:31:04 +0800
> "Liang, Cunming" <cunming.liang@intel.com> wrote:
>
>>> Since MSI-X vectors are limited on many hardware platforms, this whole API
>>> should be changed so that max_intr is based on number of rx_queues actually
>>> used by the application.  That means the setup needs to move from init to configure.
>> [LCM] When MSI-X is not used, intr_vec and set max_intr are useless. It
>> doesn't matter to non MSI-X mode.
>> As it allows the sequence "dev_stop->dev_reconfig->dev_start", the real
>> used number of queue may change.
>> So allocation only on dev_init and release only on dev_close, just make
>> it simple. During configure_msix, it do use the real useful queue number
>> to set queue/vector mapping, refer xxx_configure_msix().
> The problem is that if a customer has 16 NICs with 32 MSI vectors per NIC,
> it may be that the MSI table in the south bridge gets full. That is why the ixgbe
> driver for Linux limits itself to num_online_cpu() + 1 MSI interrupts.
>
[LCM] So your actual concern is not about when to allocate the queue/vec
mapping table, but about the number of vectors enabled per NIC.
But even using num_online_cpu()+1, the number of online CPUs on a 2U
system can easily exceed 32. So it is probably better to define the limit
with an MSI_VECTOR_NB_MAX option in the config.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD
  2015-05-05  5:39   ` [dpdk-dev] From: Cunming Liang <cunming.liang@intel.com> Cunming Liang
                       ` (9 preceding siblings ...)
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 10/10] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Cunming Liang
@ 2015-05-21  8:55     ` Cunming Liang
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
                         ` (11 more replies)
  10 siblings, 12 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:55 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

v8 changes
 - remove condition check for only vfio-msix
 - add multiplex intr support when only one intr vector allowed
 - lsc and rxq interrupt runtime enable decision
 - add safe event delete while the event wakeup execution happens

v7 changes
 - decouple epoll event and intr operation
 - add condition check in the case intr vector is disabled
 - renaming some APIs

v6 changes
 - split rte_intr_wait_rx_pkt into two APIs 'wait' and 'set'.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set.
 - using vector number instead of queue_id as interrupt API params.
 - patch reorder and split.

v5 changes
 - Rebase the patchset onto the HEAD
 - Isolate ethdev from EAL for new-added wait-for-rx interrupt function
 - Export wait-for-rx interrupt function for shared libraries
 - Split-off a new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect
 - Change sample application to accommodate EAL function spec change
   accordingly

v4 changes
 - Export interrupt enable/disable functions for shared libraries
 - Adjust position of new-added structure fields and functions to
   avoid breaking ABI
 
v3 changes
 - Add return value for interrupt enable/disable functions
 - Move spinlock from PMD to L3fwd-power
 - Remove unnecessary variables in e1000_mac_info
 - Fix miscellaneous review comments
 
v2 changes
 - Fix compilation issue in Makefile for missed header file.
 - Consolidate internal and community review comments of v1 patch set.
 
The patch series introduces a low-latency one-shot rx interrupt into DPDK, with
an example of polling/interrupt mode switch control.
 
The DPDK userspace interrupt notification and handling mechanism is based on UIO,
with the following limitations:
1) It is designed to handle the LSC interrupt only, with an inefficient suspended
   pthread wakeup procedure (e.g. UIO wakes up the LSC interrupt handling thread,
   which then wakes up the DPDK polling thread). In this way, it introduces
   non-deterministic wakeup latency for the DPDK polling thread as well as packet
   latency if it is used to handle Rx interrupts.
2) UIO only supports a single interrupt vector, which has to be shared by the
   LSC interrupt and the interrupts assigned to dedicated rx queues.
 
This patchset includes the following features:
1) Enable one-shot rx queue interrupts in the ixgbe PMD (PF & VF) and the igb PMD (PF only).
2) Build on top of the VFIO mechanism instead of UIO, so it can support
   up to 64 interrupt vectors for rx queue interrupts.
3) Have one DPDK polling thread handle per-Rx-queue interrupts with a dedicated
   VFIO eventfd, which eliminates non-deterministic pthread wakeup latency in
   user space.
4) Demonstrate the interrupt control APIs and a userspace NAPI-like polling/interrupt
   switch algorithm in the L3fwd-power example.

Known limitations:
1) It does not work for UIO, because a single interrupt eventfd shared by the LSC
   and rx queue interrupt handlers causes a mess.
2) The LSC interrupt is not supported by the VF driver, so it is disabled by default
   in L3fwd-power for now. Feel free to turn it on if you want to support both LSC
   and rx queue interrupts on a PF.

Cunming Liang (11):
  eal/linux: add interrupt vectors support in intr_handle
  eal/linux: add rte_epoll_wait/ctl support
  eal/linux: add API to set rx interrupt event monitor
  eal/linux: fix comments typo on vfio msi
  eal/linux: add interrupt vectors handling on VFIO
  eal/linux: standalone intr event fd create support
  eal/bsd: dummy for new intr definition
  ethdev: add rx intr enable, disable and ctl functions
  ixgbe: enable rx queue interrupts for both PF and VF
  igb: enable rx queue interrupts for PF
  l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode
    switch

 examples/l3fwd-power/main.c                        | 207 +++++++--
 lib/librte_eal/bsdapp/eal/eal_interrupts.c         |  20 +
 .../bsdapp/eal/include/exec-env/rte_interrupts.h   |  77 ++++
 lib/librte_eal/bsdapp/eal/rte_eal_version.map      |   5 +
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 351 +++++++++++++--
 .../linuxapp/eal/include/exec-env/rte_interrupts.h | 160 +++++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   8 +
 lib/librte_ether/rte_ethdev.c                      | 127 ++++++
 lib/librte_ether/rte_ethdev.h                      | 104 +++++
 lib/librte_ether/rte_ether_version.map             |   4 +
 lib/librte_pmd_e1000/igb_ethdev.c                  | 292 +++++++++++--
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c                | 482 ++++++++++++++++++++-
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h                |   4 +
 13 files changed, 1715 insertions(+), 126 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
@ 2015-05-21  8:55       ` Cunming Liang
  2015-05-21 10:32         ` Neil Horman
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 02/11] eal/linux: add rte_epoll_wait/ctl support Cunming Liang
                         ` (10 subsequent siblings)
  11 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:55 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch adds interrupt vector support to rte_intr_handle.
'vec_en' is set when interrupt vectors are detected and the associated event fds are set.
Those event fds are stored in efds[].
'intr_vec' is reserved for the device driver to initialize the vector mapping table.
When the event fds are added to a specified epoll instance, 'elist' holds the rte_epoll_event object pointers.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes:
 - add eptrs[], it's used to store the register rte_epoll_event instances.
 - add vec_en, to log the vector capability status.

v6 changes:
 - add mapping table between irq vector number and queue id.

v5 changes:
 - Create this new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect.

 lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 6a159c7..27174df 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,8 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_
 
+#define RTE_MAX_RXTX_INTR_VEC_ID     32
+
 enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_UNKNOWN = 0,
 	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
@@ -48,6 +50,8 @@ enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_MAX
 };
 
+struct rte_epoll_event;
+
 /** Handle for interrupts. */
 struct rte_intr_handle {
 	union {
@@ -57,6 +61,12 @@ struct rte_intr_handle {
 	};
 	int fd;	 /**< interrupt event file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+	uint32_t max_intr;               /**< max interrupt requested */
+	uint32_t nb_efd;                 /**< number of available efds */
+	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
+	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
+					 /**< intr vector epoll event ptr */
+	int *intr_vec;                   /**< intr vector number array */
 };
 
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 02/11] eal/linux: add rte_epoll_wait/ctl support
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
@ 2015-05-21  8:55       ` Cunming Liang
  2015-05-21 18:22         ` Stephen Hemminger
       [not found]         ` <20150521111704.727cf3a1@urahara>
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 03/11] eal/linux: add API to set rx interrupt event monitor Cunming Liang
                         ` (9 subsequent siblings)
  11 siblings, 2 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:55 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch adds 'rte_epoll_wait' and 'rte_epoll_ctl' for async event wakeup.
It defines 'struct rte_epoll_event' as the event param.
The 'op' uses the same enum as epoll_wait/ctl does.
The epoll event supports carrying raw user data and registering a callback which is executed during wakeup.


Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v8 changes
 - support safely deleting an event during the wakeup execution
 - add EINTR handling during epoll_wait

v7 changes
 - split v6[4/8] into two patches, one for epoll event(this one)
   another for rx intr(next patch)
 - introduce rte_epoll_event definition
 - rte_epoll_wait/ctl for more generic RTE epoll API

v6 changes
 - split rte_intr_wait_rx_pkt into two functions, wait and set.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set to remove queue visibility on eal.
 - rte_intr_rx_wait to support multiplexing.
 - allow epfd as input to support flexible event fd combination.

 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 137 +++++++++++++++++++++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |  82 +++++++++++-
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   3 +
 3 files changed, 219 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 66deda2..129fd1d 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -69,6 +69,8 @@
 
 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
 
+static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
+
 /**
  * union for pipe fds.
  */
@@ -859,3 +861,138 @@ rte_eal_intr_init(void)
 	return -ret;
 }
 
+static int
+eal_epoll_process_event(struct epoll_event *evs, int n,
+			struct rte_epoll_event *events)
+{
+	int i;
+	int count = 0;
+	struct rte_epoll_event *rev;
+	for (i = 0; i < n; i++) {
+		rev = (struct rte_epoll_event *)evs[i].data.ptr;
+		if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
+						 RTE_EPOLL_EXEC))
+			continue;
+
+		events[count].status        = RTE_EPOLL_VALID;
+		events[count].fd            = rev->fd;
+		events[count].epfd          = rev->epfd;
+		events[count].epdata.event  = rev->epdata.event;
+		events[count].epdata.data   = rev->epdata.data;
+		if (rev->epdata.cb_fun)
+			rev->epdata.cb_fun(rev->fd,
+					   rev->epdata.cb_arg);
+
+		rte_compiler_barrier();
+		rev->status = RTE_EPOLL_VALID;
+		count++;
+	}
+	return count;
+}
+
+static inline int
+eal_init_tls_epfd(void)
+{
+	int pfd = epoll_create(255);
+	if (pfd < 0) {
+		RTE_LOG(ERR, EAL,
+			"Cannot create epoll instance\n");
+		return -1;
+	}
+	return pfd;
+}
+
+int
+rte_intr_tls_epfd(void)
+{
+	if (RTE_PER_LCORE(_epfd) == -1)
+		RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
+
+	return RTE_PER_LCORE(_epfd);
+}
+
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout)
+{
+	struct epoll_event evs[maxevents];
+	int rc;
+
+	if (!events) {
+		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+		return -1;
+	}
+
+	/* using per thread epoll fd */
+	if (epfd == RTE_EPOLL_PER_THREAD)
+		epfd = rte_intr_tls_epfd();
+
+	while (1) {
+		rc = epoll_wait(epfd, evs, maxevents, timeout);
+		if (likely(rc > 0)) {
+			/* epoll_wait has at least one fd ready to read */
+			rc = eal_epoll_process_event(evs, rc, events);
+			break;
+		} else if (rc < 0) {
+			if (errno == EINTR)
+				continue;
+			/* epoll_wait fail */
+			RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
+				strerror(errno));
+			rc = -1;
+			break;
+		}
+	}
+
+	return rc;
+}
+
+static inline void
+eal_epoll_data_safe_free(struct rte_epoll_event *ev)
+{
+	while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
+				    RTE_EPOLL_INVALID))
+		while (ev->status != RTE_EPOLL_VALID)
+			rte_pause();
+	memset(&ev->epdata, 0, sizeof(ev->epdata));
+	ev->fd = -1;
+	ev->epfd = -1;
+}
+
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+	      struct rte_epoll_event *event)
+{
+	struct epoll_event ev;
+
+	if (!event) {
+		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+		return -1;
+	}
+
+	/* using per thread epoll fd */
+	if (epfd == RTE_EPOLL_PER_THREAD)
+		epfd = rte_intr_tls_epfd();
+
+	if (op == EPOLL_CTL_ADD) {
+		event->status = RTE_EPOLL_VALID;
+		event->fd = fd;  /* ignore fd in event */
+		event->epfd = epfd;
+		ev.data.ptr = (void *)event;
+	}
+
+	ev.events = event->epdata.event;
+	if (epoll_ctl(epfd, op, fd, &ev) < 0) {
+		RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
+			op, fd, strerror(errno));
+		if (op == EPOLL_CTL_ADD)
+			/* rollback status when CTL_ADD fail */
+			event->status = RTE_EPOLL_INVALID;
+		return -1;
+	}
+
+	if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
+		eal_epoll_data_safe_free(event);
+
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 27174df..98d9a48 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -50,7 +50,31 @@ enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_MAX
 };
 
-struct rte_epoll_event;
+#define RTE_INTR_EVENT_ADD            1UL
+#define RTE_INTR_EVENT_DEL            2UL
+
+typedef void (*rte_intr_event_cb_t)(int fd, void *arg);
+
+struct rte_epoll_data {
+	uint32_t event;               /**< event type */
+	void *data;                   /**< User data */
+	rte_intr_event_cb_t cb_fun;   /**< IN: callback fun */
+	void *cb_arg;	              /**< IN: callback arg */
+};
+
+enum {
+	RTE_EPOLL_INVALID = 0,
+	RTE_EPOLL_VALID,
+	RTE_EPOLL_EXEC,
+};
+
+/** interrupt epoll event obj, taken by epoll_event.ptr */
+struct rte_epoll_event {
+	volatile uint32_t status;  /**< OUT: event status */
+	int fd;                    /**< OUT: event fd */
+	int epfd;       /**< OUT: epoll instance the ev associated with */
+	struct rte_epoll_data epdata;
+};
 
 /** Handle for interrupts. */
 struct rte_intr_handle {
@@ -64,9 +88,61 @@ struct rte_intr_handle {
 	uint32_t max_intr;               /**< max interrupt requested */
 	uint32_t nb_efd;                 /**< number of available efds */
 	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
-	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
-					 /**< intr vector epoll event ptr */
+	struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
+					 /**< intr vector epoll event */
 	int *intr_vec;                   /**< intr vector number array */
 };
 
+#define RTE_EPOLL_PER_THREAD        -1  /**< to hint using per thread epfd */
+
+/**
+ * It waits for events on the epoll instance.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller wait for events.
+ * @param events
+ *   Memory area contains the events that will be available for the caller.
+ * @param maxevents
+ *   Up to maxevents are returned, must be greater than zero.
+ * @param timeout
+ *   Specifying a timeout of -1 causes a block indefinitely.
+ *   Specifying a timeout equal to zero cause to return immediately.
+ * @return
+ *   - On success, returns the number of available event.
+ *   - On failure, a negative value.
+ */
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout);
+
+/**
+ * It performs control operations on epoll instance referred by the epfd.
+ * It requests that the operation op be performed for the target fd.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller perform control operations.
+ * @param op
+ *   The operation be performed for the target fd.
+ * @param fd
+ *   The target fd on which the control ops perform.
+ * @param event
+ *   Describes the object linked to the fd.
+ *   Note: The caller must take care the object deletion after CTL_DEL.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+	      struct rte_epoll_event *event);
+
+/**
+ * The function returns the per thread epoll instance.
+ *
+ * @return
+ *   epfd the epoll instance referred to.
+ */
+int
+rte_intr_tls_epfd(void);
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 7e850a9..840002e 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -52,6 +52,8 @@ DPDK_2.0 {
 	rte_eal_vdev_init;
 	rte_eal_vdev_uninit;
 	rte_eal_wait_lcore;
+	rte_epoll_ctl;
+	rte_epoll_wait;
 	rte_exit;
 	rte_get_hpet_cycles;
 	rte_get_hpet_hz;
@@ -61,6 +63,7 @@ DPDK_2.0 {
 	rte_intr_callback_unregister;
 	rte_intr_disable;
 	rte_intr_enable;
+	rte_intr_tls_epfd;
 	rte_log;
 	rte_log_add_in_history;
 	rte_log_cur_msg_loglevel;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 03/11] eal/linux: add API to set rx interrupt event monitor
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 02/11] eal/linux: add rte_epoll_wait/ctl support Cunming Liang
@ 2015-05-21  8:55       ` Cunming Liang
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 04/11] eal/linux: fix comments typo on vfio msi Cunming Liang
                         ` (8 subsequent siblings)
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:55 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch adds 'rte_intr_rx_ctl' to add or delete interrupt vector events monitor on specified epoll instance.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v8 changes
 - fix EWOULDBLOCK and EINTR processing
 - add event status check

v7 changes
 - rename rte_intr_rx_set to rte_intr_rx_ctl.
 - rte_intr_rx_ctl uses rte_epoll_ctl to register epoll event instance.
 - the intr rx event instance includes a intr process callback.

v6 changes
 - split rte_intr_wait_rx_pkt into two function, wait and set.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set to remove queue visibility on eal.
 - rte_intr_rx_wait to support multiplexing.
 - allow epfd as input to support flexible event fd combination.

 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 103 +++++++++++++++++++++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |  23 +++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   1 +
 3 files changed, 127 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 129fd1d..6fb7fc7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -861,6 +861,49 @@ rte_eal_intr_init(void)
 	return -ret;
 }
 
+static void
+eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
+{
+	union rte_intr_read_buffer buf;
+	int bytes_read = 1;
+
+	switch (intr_handle->type) {
+	case RTE_INTR_HANDLE_UIO:
+		bytes_read = sizeof(buf.uio_intr_count);
+		break;
+#ifdef VFIO_PRESENT
+	case RTE_INTR_HANDLE_VFIO_MSIX:
+	case RTE_INTR_HANDLE_VFIO_MSI:
+	case RTE_INTR_HANDLE_VFIO_LEGACY:
+		bytes_read = sizeof(buf.vfio_intr_count);
+		break;
+#endif
+	default:
+		bytes_read = 1;
+		RTE_LOG(INFO, EAL, "unexpected intr type\n");
+		break;
+	}
+
+	/**
+	 * read out to clear the ready-to-be-read flag
+	 * for epoll_wait.
+	 */
+	do {
+		bytes_read = read(fd, &buf, bytes_read);
+		if (bytes_read < 0) {
+			if (errno == EINTR || errno == EWOULDBLOCK ||
+			    errno == EAGAIN)
+				continue;
+			RTE_LOG(ERR, EAL, "Error reading from file "
+				"descriptor %d: %s\n", fd,
+				strerror(errno));
+		} else if (bytes_read == 0)
+			RTE_LOG(ERR, EAL, "Read nothing from file "
+				"descriptor %d\n", fd);
+		return;
+	} while (1);
+}
+
 static int
 eal_epoll_process_event(struct epoll_event *evs, int n,
 			struct rte_epoll_event *events)
@@ -996,3 +1039,63 @@ rte_epoll_ctl(int epfd, int op, int fd,
 
 	return 0;
 }
+
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
+		int op, unsigned int vec, void *data, int socket)
+{
+	struct rte_epoll_event *rev;
+	struct rte_epoll_data *epdata;
+	int epfd_op;
+	int rc = 0;
+
+	if (!intr_handle || intr_handle->nb_efd == 0 ||
+	    vec >= intr_handle->nb_efd) {
+		RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
+		return -EPERM;
+	}
+
+	if (socket == SOCKET_ID_ANY)
+		socket = rte_socket_id();
+
+	switch (op) {
+	case RTE_INTR_EVENT_ADD:
+		epfd_op = EPOLL_CTL_ADD;
+		rev = &intr_handle->elist[vec];
+		if (rev->status != RTE_EPOLL_INVALID) {
+			RTE_LOG(INFO, EAL, "Event already been added.\n");
+			return -EEXIST;
+		}
+
+		/* attach to intr vector fd */
+		epdata = &rev->epdata;
+		epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
+		epdata->data   = data;
+		epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
+		epdata->cb_arg = (void *)intr_handle;
+		rc = rte_epoll_ctl(epfd, epfd_op, intr_handle->efds[vec], rev);
+		if (!rc)
+			RTE_LOG(DEBUG, EAL, "eventfd %d associated with vec %d"
+				" is added on epfd %d\n", rev->fd, vec, epfd);
+		else
+			rc = -EPERM;
+		break;
+	case RTE_INTR_EVENT_DEL:
+		epfd_op = EPOLL_CTL_DEL;
+		rev = &intr_handle->elist[vec];
+		if (rev->status == RTE_EPOLL_INVALID) {
+			RTE_LOG(INFO, EAL, "Event does not exist.\n");
+			return -EPERM;
+		}
+
+		rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
+		if (rc)
+			rc = -EPERM;
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "event op type mismatch\n");
+		rc = -EPERM;
+	}
+
+	return rc;
+}
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 98d9a48..41753e8 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -145,4 +145,27 @@ rte_epoll_ctl(int epfd, int op, int fd,
 int
 rte_intr_tls_epfd(void);
 
+/**
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {ADD, DEL}.
+ * @param vec
+ *   RX intr vector number added to the epoll instance wait list.
+ * @param data
+ *   User raw data.
+ * @param socket
+ *   Specifying the socket id.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+		int epfd, int op, unsigned int vec,
+		void *data, int socket);
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 840002e..65b5ed2 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -63,6 +63,7 @@ DPDK_2.0 {
 	rte_intr_callback_unregister;
 	rte_intr_disable;
 	rte_intr_enable;
+	rte_intr_rx_ctl;
 	rte_intr_tls_epfd;
 	rte_log;
 	rte_log_add_in_history;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 04/11] eal/linux: fix comments typo on vfio msi
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
                         ` (2 preceding siblings ...)
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 03/11] eal/linux: add API to set rx interrupt event monitor Cunming Liang
@ 2015-05-21  8:55       ` Cunming Liang
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 05/11] eal/linux: add interrupt vectors handling on VFIO Cunming Liang
                         ` (7 subsequent siblings)
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:55 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang


Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal_interrupts.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 6fb7fc7..59f4214 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -219,7 +219,7 @@ vfio_disable_intx(struct rte_intr_handle *intr_handle) {
 	return 0;
 }
 
-/* enable MSI-X interrupts */
+/* enable MSI interrupts */
 static int
 vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 	int len, ret;
@@ -265,7 +265,7 @@ vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 	return 0;
 }
 
-/* disable MSI-X interrupts */
+/* disable MSI interrupts */
 static int
 vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 	struct vfio_irq_set *irq_set;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 05/11] eal/linux: add interrupt vectors handling on VFIO
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
                         ` (3 preceding siblings ...)
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 04/11] eal/linux: fix comments typo on vfio msi Cunming Liang
@ 2015-05-21  8:55       ` Cunming Liang
  2015-05-22 20:21         ` Stephen Hemminger
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 06/11] eal/linux: standalone intr event fd create support Cunming Liang
                         ` (6 subsequent siblings)
  11 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:55 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

This patch does below:
 - Create VFIO eventfds for each interrupt vector (move to next)
 - Assign per interrupt vector's eventfd to VFIO by ioctl

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v8 changes
 - move eventfd creation out of the setup_interrupts to a standalone function

v7 changes
 - cleanup unnecessary code change
 - split event and intr operation to other patches

 lib/librte_eal/linuxapp/eal/eal_interrupts.c | 50 ++++++++--------------------
 1 file changed, 13 insertions(+), 37 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 59f4214..d1e9013 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -128,6 +128,9 @@ static pthread_t intr_thread;
 #ifdef VFIO_PRESENT
 
 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
+/* irq set buffer length for queue interrupts and LSC interrupt */
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+			      sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
 
 /* enable legacy (INTx) interrupts */
 static int
@@ -245,23 +248,6 @@ vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 						intr_handle->fd);
 		return -1;
 	}
-
-	/* manually trigger interrupt to enable it */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error triggering MSI interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
@@ -294,7 +280,7 @@ vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 static int
 vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 	int len, ret;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	struct vfio_irq_set *irq_set;
 	int *fd_ptr;
 
@@ -302,12 +288,18 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 
 	irq_set = (struct vfio_irq_set *) irq_set_buf;
 	irq_set->argsz = len;
-	irq_set->count = 1;
+	if (!intr_handle->max_intr)
+		intr_handle->max_intr = 1;
+	else if (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID)
+		intr_handle->max_intr = RTE_MAX_RXTX_INTR_VEC_ID + 1;
+
+	irq_set->count = intr_handle->max_intr;
 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 	irq_set->start = 0;
 	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
+	memcpy(fd_ptr, intr_handle->efds, sizeof(intr_handle->efds));
+	fd_ptr[intr_handle->max_intr - 1] = intr_handle->fd;
 
 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 
@@ -317,22 +309,6 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 		return -1;
 	}
 
-	/* manually trigger interrupt to enable it */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error triggering MSI-X interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
@@ -340,7 +316,7 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 static int
 vfio_disable_msix(struct rte_intr_handle *intr_handle) {
 	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	int len, ret;
 
 	len = sizeof(struct vfio_irq_set);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 06/11] eal/linux: standalone intr event fd create support
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
                         ` (4 preceding siblings ...)
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 05/11] eal/linux: add interrupt vectors handling on VFIO Cunming Liang
@ 2015-05-21  8:55       ` Cunming Liang
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 07/11] eal/bsd: dummy for new intr definition Cunming Liang
                         ` (5 subsequent siblings)
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:55 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch exposes intr event fd create and release for PMD.
The device driver can assign the number of event associated with interrupt vector.
It also provides misc functions to check 1) whether other slowpath intr (e.g. lsc) is allowed;
2) whether intr event on fastpath is enabled or not.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 57 ++++++++++++++++++++++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h | 51 +++++++++++++++++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |  4 ++
 3 files changed, 112 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index d1e9013..742fdab 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -44,6 +44,7 @@
 #include <sys/epoll.h>
 #include <sys/signalfd.h>
 #include <sys/ioctl.h>
+#include <sys/eventfd.h>
 
 #include <rte_common.h>
 #include <rte_interrupts.h>
@@ -68,6 +69,7 @@
 #include "eal_vfio.h"
 
 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
+#define NB_OTHER_INTR               1
 
 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
 
@@ -1075,3 +1077,58 @@ rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
 
 	return rc;
 }
+
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
+{
+	uint32_t i;
+	int fd;
+	uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+
+	if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
+		for (i = 0; i < n; i++) {
+			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+			if (fd < 0) {
+				RTE_LOG(ERR, EAL,
+					"cannot setup eventfd,"
+					"error %i (%s)\n",
+					errno, strerror(errno));
+				return -1;
+			}
+			intr_handle->efds[i] = fd;
+		}
+		intr_handle->nb_efd   = n;
+		intr_handle->max_intr = NB_OTHER_INTR + n;
+	} else {
+		intr_handle->efds[0]  = intr_handle->fd;
+		intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
+		intr_handle->max_intr = NB_OTHER_INTR;
+	}
+
+	return 0;
+}
+
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
+{
+	uint32_t i;
+	struct rte_epoll_event *rev;
+
+	for (i = 0; i < intr_handle->nb_efd; i++) {
+		rev = &intr_handle->elist[i];
+		if (rev->status == RTE_EPOLL_INVALID)
+			continue;
+		if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
+			/* force free if the entry valid */
+			eal_epoll_data_safe_free(rev);
+			rev->status = RTE_EPOLL_INVALID;
+		}
+	}
+
+	if (intr_handle->max_intr > intr_handle->nb_efd) {
+		for (i = 0; i < intr_handle->nb_efd; i++)
+			close(intr_handle->efds[i]);
+	}
+	intr_handle->nb_efd = 0;
+	intr_handle->max_intr = 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 41753e8..46b1113 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -168,4 +168,55 @@ rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
 		int epfd, int op, unsigned int vec,
 		void *data, int socket);
 
+/**
+ * It enables the fastpath event fds if it's necessary.
+ * It creates event fds when multi-vectors allowed,
+ * otherwise it multiplexes the single event fds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param nb_efd
+ *   Number of interrupt vectors trying to enable.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd);
+
+/**
+ * It disables the fastpath event fds.
+ * It deletes registered eventfds and closes the open fds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle);
+
+/**
+ * The fastpath interrupt is enabled or not.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+static inline int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
+{
+	return !(!intr_handle->nb_efd);
+}
+
+/**
+ * The interrupt handle instance allows other cause or not.
+ * Other cause stands for none fastpath interrupt.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+static inline int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle)
+{
+	return !!(intr_handle->max_intr - intr_handle->nb_efd);
+}
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 65b5ed2..d0df6b4 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -61,7 +61,11 @@ DPDK_2.0 {
 	rte_hexdump;
 	rte_intr_callback_register;
 	rte_intr_callback_unregister;
+	rte_intr_allow_others;
 	rte_intr_disable;
+	rte_intr_dp_is_en;
+	rte_intr_efd_enable;
+	rte_intr_efd_disable;
 	rte_intr_enable;
 	rte_intr_rx_ctl;
 	rte_intr_tls_epfd;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 07/11] eal/bsd: dummy for new intr definition
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
                         ` (5 preceding siblings ...)
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 06/11] eal/linux: standalone intr event fd create support Cunming Liang
@ 2015-05-21  8:55       ` Cunming Liang
  2015-05-21  8:56       ` [dpdk-dev] [PATCH v8 08/11] ethdev: add rx intr enable, disable and ctl functions Cunming Liang
                         ` (4 subsequent siblings)
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:55 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

To make bsd compiling happy with new intr changes.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v8 changes
 - add stub for new function

v7 changes
 - remove stub 'linux only' function from source file

 lib/librte_eal/bsdapp/eal/eal_interrupts.c         | 20 ++++++
 .../bsdapp/eal/include/exec-env/rte_interrupts.h   | 77 ++++++++++++++++++++++
 lib/librte_eal/bsdapp/eal/rte_eal_version.map      |  5 ++
 3 files changed, 102 insertions(+)

diff --git a/lib/librte_eal/bsdapp/eal/eal_interrupts.c b/lib/librte_eal/bsdapp/eal/eal_interrupts.c
index cb7d4f1..d63d82e 100644
--- a/lib/librte_eal/bsdapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/bsdapp/eal/eal_interrupts.c
@@ -69,3 +69,23 @@ rte_eal_intr_init(void)
 	return 0;
 }
 
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+		int epfd, int op, unsigned int vec,
+		void *data, int socket)
+{
+	return -ENOTSUP;
+}
+
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
+{
+	return 0;
+}
+
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
+{
+	return;
+}
+
diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
index 87a9cf6..30ec4d1 100644
--- a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
@@ -49,6 +49,83 @@ enum rte_intr_handle_type {
 struct rte_intr_handle {
 	int fd;                          /**< file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+	int max_intr;                    /**< max interrupt requested */
+	uint32_t nb_efd;                 /**< number of available efds */
+	int *intr_vec;               /**< intr vector number array */
 };
 
+/**
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {ADD, DEL}.
+ * @param vec
+ *   RX intr vector number added to the epoll instance wait list.
+ * @param data
+ *   User raw data.
+ * @param socket
+ *   Specifying the socket id.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+		int epfd, int op, unsigned int vec,
+		void *data, int socket);
+
+/**
+ * It enables the fastpath event fds if it's necessary.
+ * It creates event fds when multi-vectors allowed,
+ * otherwise it multiplexes the single event fds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param nb_efd
+ *   Number of interrupt vectors trying to enable.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd);
+
+/**
+ * It disables the fastpath event fds.
+ * It deletes registered eventfds and closes the open fds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle);
+
+/**
+ * The fastpath interrupt is enabled or not.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+static inline int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
+{
+	return 0;
+}
+
+/**
+ * The interrupt handle instance allows other cause or not.
+ * Other cause stands for none fastpath interrupt.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+static inline int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle)
+{
+	return 1;
+}
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
index 67b6a6c..a74671b 100644
--- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
@@ -53,8 +53,13 @@ DPDK_2.0 {
 	rte_hexdump;
 	rte_intr_callback_register;
 	rte_intr_callback_unregister;
+	rte_intr_allow_others;
 	rte_intr_disable;
+	rte_intr_dp_is_en;
+	rte_intr_efd_enable;
+	rte_intr_efd_disable;
 	rte_intr_enable;
+	rte_intr_rx_ctl;
 	rte_log;
 	rte_log_add_in_history;
 	rte_log_cur_msg_loglevel;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 08/11] ethdev: add rx intr enable, disable and ctl functions
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
                         ` (6 preceding siblings ...)
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 07/11] eal/bsd: dummy for new intr definition Cunming Liang
@ 2015-05-21  8:56       ` Cunming Liang
  2015-05-21 18:22         ` Stephen Hemminger
                           ` (2 more replies)
  2015-05-21  8:56       ` [dpdk-dev] [PATCH v8 09/11] ixgbe: enable rx queue interrupts for both PF and VF Cunming Liang
                         ` (3 subsequent siblings)
  11 siblings, 3 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:56 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch adds two dev_ops functions to enable and disable rx queue interrupts.
In addition, it adds rte_eth_dev_rx_intr_ctl/rx_intr_ctl_q to support per port or per queue rx intr event set.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v8 changes
 - add additional check for EEXIST

v7 changes
 - remove rx_intr_vec_get
 - add rx_intr_ctl and rx_intr_ctl_q

v6 changes
 - add rx_intr_vec_get to retrieve the vector num of the queue.

v5 changes
 - Rebase the patchset onto the HEAD

v4 changes
 - Export interrupt enable/disable functions for shared libraries
 - Put new functions at the end of eth_dev_ops to avoid breaking ABI

v3 changes
 - Add return value for interrupt enable/disable functions

 lib/librte_ether/rte_ethdev.c          | 127 +++++++++++++++++++++++++++++++++
 lib/librte_ether/rte_ethdev.h          | 104 +++++++++++++++++++++++++++
 lib/librte_ether/rte_ether_version.map |   4 ++
 3 files changed, 235 insertions(+)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 024fe8b..1a47d9a 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -3281,6 +3281,133 @@ _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 	}
 	rte_spinlock_unlock(&rte_eth_dev_cb_lock);
 }
+
+int
+rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data)
+{
+	uint32_t vec;
+	struct rte_eth_dev *dev;
+	struct rte_intr_handle *intr_handle;
+	uint16_t qid;
+	int rc;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) {
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	intr_handle = &dev->pci_dev->intr_handle;
+	if (!intr_handle->intr_vec) {
+		PMD_DEBUG_TRACE("RX Intr vector unset\n");
+		return -EPERM;
+	}
+
+	for (qid = 0; qid < dev->data->nb_rx_queues; qid++) {
+		vec = intr_handle->intr_vec[qid];
+		rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec,
+				     data, rte_eth_dev_socket_id(port_id));
+		if (rc && rc != -EEXIST) {
+			PMD_DEBUG_TRACE("p %d q %d rx ctl error"
+					" op %d epfd %d vec %u\n",
+					port_id, qid, op, epfd, vec);
+		}
+	}
+
+	return 0;
+}
+
+int
+rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
+			  int epfd, int op, void *data)
+{
+	uint32_t vec;
+	struct rte_eth_dev *dev;
+	struct rte_intr_handle *intr_handle;
+	int rc;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) {
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	if (queue_id >= dev->data->nb_rx_queues) {
+		PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id);
+		return -EINVAL;
+	}
+
+	intr_handle = &dev->pci_dev->intr_handle;
+	if (!intr_handle->intr_vec) {
+		PMD_DEBUG_TRACE("RX Intr vector unset\n");
+		return -EPERM;
+	}
+
+	vec = intr_handle->intr_vec[queue_id];
+	rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec,
+			     data, rte_eth_dev_socket_id(port_id));
+	if (rc && rc != -EEXIST) {
+		PMD_DEBUG_TRACE("p %d q %d rx ctl error"
+				" op %d epfd %d vec %u\n",
+				port_id, queue_id, op, epfd, vec);
+		return rc;
+	}
+
+	return 0;
+}
+
+int
+rte_eth_dev_rx_intr_enable(uint8_t port_id,
+			   uint16_t queue_id)
+{
+	struct rte_eth_dev *dev;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) {
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_enable, -ENOTSUP);
+	return (*dev->dev_ops->rx_queue_intr_enable)(dev, queue_id);
+}
+
+int
+rte_eth_dev_rx_intr_disable(uint8_t port_id,
+			    uint16_t queue_id)
+{
+	struct rte_eth_dev *dev;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (dev == NULL) {
+		PMD_DEBUG_TRACE("Invalid port device\n");
+		return -ENODEV;
+	}
+
+	FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_disable, -ENOTSUP);
+	return (*dev->dev_ops->rx_queue_intr_disable)(dev, queue_id);
+}
+
 #ifdef RTE_NIC_BYPASS
 int rte_eth_dev_bypass_init(uint8_t port_id)
 {
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 4648290..e5efec0 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -829,6 +829,8 @@ struct rte_eth_fdir {
 struct rte_intr_conf {
 	/** enable/disable lsc interrupt. 0 (default) - disable, 1 enable */
 	uint16_t lsc;
+	/** enable/disable rxq interrupt. 0 (default) - disable, 1 enable */
+	uint16_t rxq;
 };
 
 /**
@@ -1034,6 +1036,14 @@ typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
 				    const struct rte_eth_txconf *tx_conf);
 /**< @internal Setup a transmit queue of an Ethernet device. */
 
+typedef int (*eth_rx_enable_intr_t)(struct rte_eth_dev *dev,
+				    uint16_t rx_queue_id);
+/**< @internal Enable interrupt of a receive queue of an Ethernet device. */
+
+typedef int (*eth_rx_disable_intr_t)(struct rte_eth_dev *dev,
+				    uint16_t rx_queue_id);
+/**< @internal Disable interrupt of a receive queue of an Ethernet device. */
+
 typedef void (*eth_queue_release_t)(void *queue);
 /**< @internal Release memory resources allocated by given RX/TX queue. */
 
@@ -1385,6 +1395,10 @@ struct eth_dev_ops {
 	/** Get current RSS hash configuration. */
 	rss_hash_conf_get_t rss_hash_conf_get;
 	eth_filter_ctrl_t              filter_ctrl;          /**< common filter control*/
+
+	/** Enable/disable Rx queue interrupt. */
+	eth_rx_enable_intr_t       rx_queue_intr_enable; /**< Enable Rx queue interrupt. */
+	eth_rx_disable_intr_t      rx_queue_intr_disable; /**< Disable Rx queue interrupt.*/
 };
 
 /**
@@ -2867,6 +2881,96 @@ void _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 				enum rte_eth_event_type event);
 
 /**
+ * When there is no rx packet coming in Rx Queue for a long time, we can
+ * sleep lcore related to RX Queue for power saving, and enable rx interrupt
+ * to be triggered when rx packect arrives.
+ *
+ * The rte_eth_dev_rx_intr_enable() function enables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue from which to retrieve input packets.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ *     that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_intr_enable(uint8_t port_id,
+			       uint16_t queue_id);
+
+/**
+ * When lcore wakes up from rx interrupt indicating packet coming, disable rx
+ * interrupt and returns to polling mode.
+ *
+ * The rte_eth_dev_rx_intr_disable() function disables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue from which to retrieve input packets.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ *     that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_intr_disable(uint8_t port_id,
+				uint16_t queue_id);
+
+/**
+ * RX Interrupt control per port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ *   Using RTE_EPOLL_PER_THREAD allows to use per thread epoll instance.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {RTE_INTR_EVENT_ADD, RTE_INTR_EVENT_DEL}.
+ * @param data
+ *   User raw data.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data);
+
+/**
+ * RX Interrupt control per queue.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue from which to retrieve input packets.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ *   Using RTE_EPOLL_PER_THREAD allows to use per thread epoll instance.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {RTE_INTR_EVENT_ADD, RTE_INTR_EVENT_DEL}.
+ * @param data
+ *   User raw data.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
+			  int epfd, int op, void *data);
+
+/**
  * Turn on the LED on the Ethernet device.
  * This function turns on the LED on the Ethernet device.
  *
diff --git a/lib/librte_ether/rte_ether_version.map b/lib/librte_ether/rte_ether_version.map
index a2d25a6..2799b99 100644
--- a/lib/librte_ether/rte_ether_version.map
+++ b/lib/librte_ether/rte_ether_version.map
@@ -48,6 +48,10 @@ DPDK_2.0 {
 	rte_eth_dev_rss_hash_update;
 	rte_eth_dev_rss_reta_query;
 	rte_eth_dev_rss_reta_update;
+	rte_eth_dev_rx_intr_ctl;
+	rte_eth_dev_rx_intr_ctl_q;
+	rte_eth_dev_rx_intr_disable;
+	rte_eth_dev_rx_intr_enable;
 	rte_eth_dev_rx_queue_start;
 	rte_eth_dev_rx_queue_stop;
 	rte_eth_dev_set_link_down;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 09/11] ixgbe: enable rx queue interrupts for both PF and VF
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
                         ` (7 preceding siblings ...)
  2015-05-21  8:56       ` [dpdk-dev] [PATCH v8 08/11] ethdev: add rx intr enable, disable and ctl functions Cunming Liang
@ 2015-05-21  8:56       ` Cunming Liang
  2015-05-21  8:56       ` [dpdk-dev] [PATCH v8 10/11] igb: enable rx queue interrupts for PF Cunming Liang
                         ` (2 subsequent siblings)
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:56 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch does the following for ixgbe PF and VF:
- Setup NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Yong Liu <yong.liu@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v8 changes
 - add vfio-msi/vfio-legacy and uio-legacy support

v7 changes
 - add condition check when intr vector is not enabled

v6 changes
 - fill queue-vector mapping table

v5 changes
 - Rebase the patchset onto the HEAD

v3 changes
 - Remove spinlock from PMD

v2 changes
 - Consolidate review comments related to coding style

 lib/librte_pmd_ixgbe/ixgbe_ethdev.c | 482 +++++++++++++++++++++++++++++++++++-
 lib/librte_pmd_ixgbe/ixgbe_ethdev.h |   4 +
 2 files changed, 474 insertions(+), 12 deletions(-)

diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
index 366aa45..97ed5b7 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
@@ -82,6 +82,9 @@
  */
 #define IXGBE_FC_LO    0x40
 
+/* Default minimum inter-interrupt interval for EITR configuration */
+#define IXGBE_MIN_INTER_INTERRUPT_INTERVAL_DEFAULT    0x79E
+
 /* Timer value included in XOFF frames. */
 #define IXGBE_FC_PAUSE 0x680
 
@@ -171,6 +174,7 @@ static int ixgbe_dev_rss_reta_query(struct rte_eth_dev *dev,
 			uint16_t reta_size);
 static void ixgbe_dev_link_status_print(struct rte_eth_dev *dev);
 static int ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_action(struct rte_eth_dev *dev);
 static void ixgbe_dev_interrupt_handler(struct rte_intr_handle *handle,
@@ -183,11 +187,14 @@ static void ixgbe_dcb_init(struct ixgbe_hw *hw,struct ixgbe_dcb_config *dcb_conf
 
 /* For Virtual Function support */
 static int eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev);
+static int ixgbevf_dev_interrupt_get_status(struct rte_eth_dev *dev);
+static int ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev);
 static int  ixgbevf_dev_configure(struct rte_eth_dev *dev);
 static int  ixgbevf_dev_start(struct rte_eth_dev *dev);
 static void ixgbevf_dev_stop(struct rte_eth_dev *dev);
 static void ixgbevf_dev_close(struct rte_eth_dev *dev);
 static void ixgbevf_intr_disable(struct ixgbe_hw *hw);
+static void ixgbevf_intr_enable(struct ixgbe_hw *hw);
 static void ixgbevf_dev_stats_get(struct rte_eth_dev *dev,
 		struct rte_eth_stats *stats);
 static void ixgbevf_dev_stats_reset(struct rte_eth_dev *dev);
@@ -197,6 +204,15 @@ static void ixgbevf_vlan_strip_queue_set(struct rte_eth_dev *dev,
 		uint16_t queue, int on);
 static void ixgbevf_vlan_offload_set(struct rte_eth_dev *dev, int mask);
 static void ixgbevf_set_vfta_all(struct rte_eth_dev *dev, bool on);
+static void ixgbevf_dev_interrupt_handler(struct rte_intr_handle *handle,
+		void *param);
+static int ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
+		uint16_t queue_id);
+static int ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
+		 uint16_t queue_id);
+static void ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+		 uint8_t queue, uint8_t msix_vector);
+static void ixgbevf_configure_msix(struct rte_eth_dev *dev);
 
 /* For Eth VMDQ APIs support */
 static int ixgbe_uc_hash_table_set(struct rte_eth_dev *dev, struct
@@ -214,6 +230,14 @@ static int ixgbe_mirror_rule_set(struct rte_eth_dev *dev,
 static int ixgbe_mirror_rule_reset(struct rte_eth_dev *dev,
 		uint8_t	rule_id);
 
+static int ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static int ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static void ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void ixgbe_configure_msix(struct rte_eth_dev *dev);
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 		uint16_t queue_idx, uint16_t tx_rate);
 static int ixgbe_set_vf_rate_limit(struct rte_eth_dev *dev, uint16_t vf,
@@ -262,7 +286,7 @@ static int ixgbevf_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu);
  */
 #define UPDATE_VF_STAT(reg, last, cur)	                        \
 {                                                               \
-	u32 latest = IXGBE_READ_REG(hw, reg);                   \
+	uint32_t latest = IXGBE_READ_REG(hw, reg);                   \
 	cur += latest - last;                                   \
 	last = latest;                                          \
 }
@@ -343,6 +367,8 @@ static const struct eth_dev_ops ixgbe_eth_dev_ops = {
 	.tx_queue_start	      = ixgbe_dev_tx_queue_start,
 	.tx_queue_stop        = ixgbe_dev_tx_queue_stop,
 	.rx_queue_setup       = ixgbe_dev_rx_queue_setup,
+	.rx_queue_intr_enable = ixgbe_dev_rx_queue_intr_enable,
+	.rx_queue_intr_disable = ixgbe_dev_rx_queue_intr_disable,
 	.rx_queue_release     = ixgbe_dev_rx_queue_release,
 	.rx_queue_count       = ixgbe_dev_rx_queue_count,
 	.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
@@ -402,8 +428,11 @@ static const struct eth_dev_ops ixgbevf_eth_dev_ops = {
 	.vlan_offload_set     = ixgbevf_vlan_offload_set,
 	.rx_queue_setup       = ixgbe_dev_rx_queue_setup,
 	.rx_queue_release     = ixgbe_dev_rx_queue_release,
+	.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
 	.tx_queue_setup       = ixgbe_dev_tx_queue_setup,
 	.tx_queue_release     = ixgbe_dev_tx_queue_release,
+	.rx_queue_intr_enable = ixgbevf_dev_rx_queue_intr_enable,
+	.rx_queue_intr_disable = ixgbevf_dev_rx_queue_intr_disable,
 	.mac_addr_add         = ixgbevf_add_mac_addr,
 	.mac_addr_remove      = ixgbevf_remove_mac_addr,
 };
@@ -899,12 +928,6 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev)
 			eth_dev->data->port_id, pci_dev->id.vendor_id,
 			pci_dev->id.device_id);
 
-	rte_intr_callback_register(&(pci_dev->intr_handle),
-		ixgbe_dev_interrupt_handler, (void *)eth_dev);
-
-	/* enable uio intr after callback register */
-	rte_intr_enable(&(pci_dev->intr_handle));
-
 	/* enable support intr */
 	ixgbe_enable_intr(eth_dev);
 
@@ -1430,6 +1453,8 @@ ixgbe_dev_configure(struct rte_eth_dev *dev)
 		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
 	struct ixgbe_hw *hw =
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t intr_vector = 0;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -1443,6 +1468,24 @@ ixgbe_dev_configure(struct rte_eth_dev *dev)
 	hw->rx_bulk_alloc_allowed = true;
 	hw->rx_vec_allowed = true;
 
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		intr_vector = dev->data->nb_rx_queues;
+
+	if (rte_intr_efd_enable(intr_handle, intr_vector))
+		return -1;
+
+	if (rte_intr_dp_is_en(intr_handle)) {
+		intr_handle->intr_vec =
+			rte_zmalloc("intr_vec",
+				    dev->data->nb_rx_queues * sizeof(int),
+				    0);
+		if (intr_handle->intr_vec == NULL) {
+			PMD_INIT_LOG(ERR, "Failed to allocate %d rx_queues"
+				     "intr_vec\n", dev->data->nb_rx_queues);
+			return -1;
+		}
+	}
+
 	return 0;
 }
 
@@ -1489,6 +1532,9 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	ixgbe_pf_host_configure(dev);
 
+	/* confiugre msix for sleep until rx interrupt */
+	ixgbe_configure_msix(dev);
+
 	/* initialize transmission unit */
 	ixgbe_dev_tx_init(dev);
 
@@ -1561,8 +1607,23 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 skip_link_setup:
 
 	/* check if lsc interrupt is enabled */
-	if (dev->data->dev_conf.intr_conf.lsc != 0)
-		ixgbe_dev_lsc_interrupt_setup(dev);
+	if (dev->data->dev_conf.intr_conf.lsc != 0) {
+		if (rte_intr_allow_others(&dev->pci_dev->intr_handle)) {
+			rte_intr_callback_register(&(dev->pci_dev->intr_handle),
+						   ixgbe_dev_interrupt_handler,
+						   (void *)dev);
+			ixgbe_dev_lsc_interrupt_setup(dev);
+		} else
+			PMD_INIT_LOG(INFO, "lsc won't enable because of"
+				     " no intr multiplex\n");
+	}
+
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		ixgbe_dev_rxq_interrupt_setup(dev);
+
+	/* enable uio/vfio intr/eventfd mapping */
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
 
 	/* resume enabled intr since hw reset */
 	ixgbe_enable_intr(dev);
@@ -1619,6 +1680,7 @@ ixgbe_dev_stop(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 	struct ixgbe_5tuple_filter *p_5tuple, *p_5tuple_next;
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 	int vf;
 
 	PMD_INIT_FUNC_TRACE();
@@ -1626,6 +1688,9 @@ ixgbe_dev_stop(struct rte_eth_dev *dev)
 	/* disable interrupts */
 	ixgbe_disable_intr(hw);
 
+	/* disable intr eventfd mapping */
+	rte_intr_disable(intr_handle);
+
 	/* reset the NIC */
 	ixgbe_pf_reset_hw(hw);
 	hw->adapter_stopped = FALSE;
@@ -1661,6 +1726,12 @@ ixgbe_dev_stop(struct rte_eth_dev *dev)
 	memset(filter_info->fivetuple_mask, 0,
 		sizeof(uint32_t) * IXGBE_5TUPLE_ARRAY_SIZE);
 
+	/* Clean datapath event and queue/vec mapping */
+	rte_intr_efd_disable(intr_handle);
+	if (intr_handle->intr_vec != NULL) {
+		rte_free(intr_handle->intr_vec);
+		intr_handle->intr_vec = NULL;
+	}
 }
 
 /*
@@ -2252,6 +2323,28 @@ ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev)
 	return 0;
 }
 
+/**
+ * It clears the interrupt causes and enables the interrupt.
+ * It will be called once only during nic initialized.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+static int
+ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	intr->mask |= IXGBE_EICR_RTX_QUEUE;
+
+	return 0;
+}
+
 /*
  * It reads ICR and sets flag (IXGBE_EICR_LSC) for the link_update.
  *
@@ -2278,10 +2371,10 @@ ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev)
 	PMD_DRV_LOG(INFO, "eicr %x", eicr);
 
 	intr->flags = 0;
-	if (eicr & IXGBE_EICR_LSC) {
-		/* set flag for async link update */
+
+	/* set flag for async link update */
+	if (eicr & IXGBE_EICR_LSC)
 		intr->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
-	}
 
 	if (eicr & IXGBE_EICR_MAILBOX)
 		intr->flags |= IXGBE_FLAG_MAILBOX;
@@ -2289,6 +2382,30 @@ ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int
+ixgbevf_dev_interrupt_get_status(struct rte_eth_dev *dev)
+{
+	uint32_t eicr;
+	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	/* clear all cause mask */
+	ixgbevf_intr_disable(hw);
+
+	/* read-on-clear nic registers here */
+	eicr = IXGBE_READ_REG(hw, IXGBE_VTEICR);
+	PMD_DRV_LOG(INFO, "eicr %x", eicr);
+
+	intr->flags = 0;
+
+	/* set flag for async link update */
+	if (eicr & IXGBE_EICR_LSC)
+		intr->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
+
+	return 0;
+}
+
 /**
  * It gets and then prints the link status.
  *
@@ -2384,6 +2501,18 @@ ixgbe_dev_interrupt_action(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int
+ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	PMD_DRV_LOG(DEBUG, "enable intr immediately");
+	ixgbevf_intr_enable(hw);
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
+	return 0;
+}
+
 /**
  * Interrupt handler which shall be registered for alarm callback for delayed
  * handling specific interrupt to wait for the stable nic state. As the
@@ -2445,6 +2574,15 @@ ixgbe_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
 	ixgbe_dev_interrupt_action(dev);
 }
 
+static void
+ixgbevf_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
+							void *param)
+{
+	struct rte_eth_dev *dev = (struct rte_eth_dev *)param;
+	ixgbevf_dev_interrupt_get_status(dev);
+	ixgbevf_dev_interrupt_action(dev);
+}
+
 static int
 ixgbe_dev_led_on(struct rte_eth_dev *dev)
 {
@@ -2943,10 +3081,25 @@ ixgbevf_intr_disable(struct ixgbe_hw *hw)
 	IXGBE_WRITE_FLUSH(hw);
 }
 
+static void
+ixgbevf_intr_enable(struct ixgbe_hw *hw)
+{
+	PMD_INIT_FUNC_TRACE();
+
+	/* VF enable interrupt autoclean */
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIAM, IXGBE_VF_IRQ_ENABLE_MASK);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIAC, IXGBE_VF_IRQ_ENABLE_MASK);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, IXGBE_VF_IRQ_ENABLE_MASK);
+
+	IXGBE_WRITE_FLUSH(hw);
+}
+
 static int
 ixgbevf_dev_configure(struct rte_eth_dev *dev)
 {
 	struct rte_eth_conf* conf = &dev->data->dev_conf;
+	uint32_t intr_vector = 0;
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 
 	PMD_INIT_LOG(DEBUG, "Configured Virtual Function port id: %d",
 		     dev->data->port_id);
@@ -2967,6 +3120,24 @@ ixgbevf_dev_configure(struct rte_eth_dev *dev)
 	}
 #endif
 
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		intr_vector = dev->data->nb_rx_queues;
+
+	if (rte_intr_efd_enable(intr_handle, intr_vector))
+		return -1;
+
+	/* set max interrupt vfio request */
+	if (rte_intr_dp_is_en(intr_handle)) {
+		intr_handle->intr_vec =
+			rte_zmalloc("intr_vec",
+				    dev->data->nb_rx_queues * sizeof(int), 0);
+		if (intr_handle->intr_vec == NULL) {
+			PMD_INIT_LOG(ERR, "Failed to allocate %d rx_queues"
+				     " intr_vec\n", dev->data->nb_rx_queues);
+			return -ENOMEM;
+		}
+	}
+
 	return 0;
 }
 
@@ -3005,6 +3176,23 @@ ixgbevf_dev_start(struct rte_eth_dev *dev)
 
 	ixgbevf_dev_rxtx_start(dev);
 
+	ixgbevf_configure_msix(dev);
+
+	if (dev->data->dev_conf.intr_conf.lsc != 0) {
+		if (rte_intr_allow_others(&dev->pci_dev->intr_handle))
+			rte_intr_callback_register(&dev->pci_dev->intr_handle,
+					ixgbevf_dev_interrupt_handler,
+					(void *)dev);
+		else
+			PMD_INIT_LOG(INFO, "lsc won't enable because of"
+				     " no intr multiplex\n");
+	}
+
+	rte_intr_enable(&dev->pci_dev->intr_handle);
+
+	/* Re-enable interrupt for VF */
+	ixgbevf_intr_enable(hw);
+
 	return 0;
 }
 
@@ -3012,6 +3200,7 @@ static void
 ixgbevf_dev_stop(struct rte_eth_dev *dev)
 {
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -3028,12 +3217,23 @@ ixgbevf_dev_stop(struct rte_eth_dev *dev)
 	dev->data->scattered_rx = 0;
 
 	ixgbe_dev_clear_queues(dev);
+
+	/* disable intr eventfd mapping */
+	rte_intr_disable(intr_handle);
+
+	/* Clean datapath event and queue/vec mapping */
+	rte_intr_efd_disable(intr_handle);
+	if (intr_handle->intr_vec != NULL) {
+		rte_free(intr_handle->intr_vec);
+		intr_handle->intr_vec = NULL;
+	}
 }
 
 static void
 ixgbevf_dev_close(struct rte_eth_dev *dev)
 {
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_pci_device *pci_dev;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -3043,6 +3243,12 @@ ixgbevf_dev_close(struct rte_eth_dev *dev)
 
 	/* reprogram the RAR[0] in case user changed it. */
 	ixgbe_set_rar(hw, 0, hw->mac.addr, 0, IXGBE_RAH_AV);
+
+	pci_dev = dev->pci_dev;
+	if (pci_dev->intr_handle.intr_vec) {
+		rte_free(pci_dev->intr_handle.intr_vec);
+		pci_dev->intr_handle.intr_vec = NULL;
+	}
 }
 
 static void ixgbevf_set_vfta_all(struct rte_eth_dev *dev, bool on)
@@ -3542,6 +3748,258 @@ ixgbe_mirror_rule_reset(struct rte_eth_dev *dev, uint8_t rule_id)
 	return 0;
 }
 
+
+static int
+ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	mask = IXGBE_READ_REG(hw, IXGBE_VTEIMS);
+	mask |= (1 << queue_id);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask);
+
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
+
+	return 0;
+}
+
+static int
+ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	mask = IXGBE_READ_REG(hw, IXGBE_VTEIMS);
+	mask &= ~(1 << queue_id);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask);
+
+	return 0;
+}
+
+static int
+ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	if (queue_id < 16) {
+		ixgbe_disable_intr(hw);
+		intr->mask |= (1 << queue_id);
+		ixgbe_enable_intr(dev);
+	} else if (queue_id < 32) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(0));
+		mask &= (1 << queue_id);
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+	} else if (queue_id < 64) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(1));
+		mask &= (1 << (queue_id - 32));
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+	}
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
+
+	return 0;
+}
+
+static int
+ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	if (queue_id < 16) {
+		ixgbe_disable_intr(hw);
+		intr->mask &= ~(1 << queue_id);
+		ixgbe_enable_intr(dev);
+	} else if (queue_id < 32) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(0));
+		mask &= ~(1 << queue_id);
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+	} else if (queue_id < 64) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(1));
+		mask &= ~(1 << (queue_id - 32));
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+	}
+
+	return 0;
+}
+
+static void
+ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+			uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp, idx;
+	if (direction == -1) {
+		/* other causes */
+		msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+		tmp = IXGBE_READ_REG(hw, IXGBE_VTIVAR_MISC);
+		tmp &= ~0xFF;
+		tmp |= msix_vector;
+		IXGBE_WRITE_REG(hw, IXGBE_VTIVAR_MISC, tmp);
+	} else {
+		/* rx or tx cause */
+		msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+		idx = ((16 * (queue & 1)) + (8 * direction));
+		tmp = IXGBE_READ_REG(hw, IXGBE_VTIVAR(queue >> 1));
+		tmp &= ~(0xFF << idx);
+		tmp |= (msix_vector << idx);
+		IXGBE_WRITE_REG(hw, IXGBE_VTIVAR(queue >> 1), tmp);
+	}
+}
+
+/**
+ * set the IVAR registers, mapping interrupt causes to vectors
+ * @param hw
+ *  pointer to ixgbe_hw struct
+ * @direction
+ *  0 for Rx, 1 for Tx, -1 for other causes
+ * @queue
+ *  queue to map the corresponding interrupt to
+ * @msix_vector
+ *  the vector to map to the corresponding queue
+ */
+static void
+ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+			   uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp, idx;
+
+	msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+	if (hw->mac.type == ixgbe_mac_82598EB) {
+		if (direction == -1)
+			direction = 0;
+		idx = (((direction * 64) + queue) >> 2) & 0x1F;
+		tmp = IXGBE_READ_REG(hw, IXGBE_IVAR(idx));
+		tmp &= ~(0xFF << (8 * (queue & 0x3)));
+		tmp |= (msix_vector << (8 * (queue & 0x3)));
+		IXGBE_WRITE_REG(hw, IXGBE_IVAR(idx), tmp);
+	} else if ((hw->mac.type == ixgbe_mac_82599EB) ||
+			(hw->mac.type == ixgbe_mac_X540)) {
+		if (direction == -1) {
+			/* other causes */
+			idx = ((queue & 1) * 8);
+			tmp = IXGBE_READ_REG(hw, IXGBE_IVAR_MISC);
+			tmp &= ~(0xFF << idx);
+			tmp |= (msix_vector << idx);
+			IXGBE_WRITE_REG(hw, IXGBE_IVAR_MISC, tmp);
+		} else {
+			/* rx or tx causes */
+			idx = ((16 * (queue & 1)) + (8 * direction));
+			tmp = IXGBE_READ_REG(hw, IXGBE_IVAR(queue >> 1));
+			tmp &= ~(0xFF << idx);
+			tmp |= (msix_vector << idx);
+			IXGBE_WRITE_REG(hw, IXGBE_IVAR(queue >> 1), tmp);
+		}
+	}
+}
+
+static void
+ixgbevf_configure_msix(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t q_idx;
+	uint32_t vector_idx = 0;
+
+	/* won't configure msix register if no mapping is done
+	 * between intr vector and event fd */
+	if (!rte_intr_dp_is_en(intr_handle))
+		return;
+
+	/* Configure all RX queues of VF */
+	for (q_idx = 0; q_idx < dev->data->nb_rx_queues; q_idx++) {
+		/* Force all queue use vector 0,
+		 * as IXGBE_VF_MAXMSIVECOTR = 1 */
+		ixgbevf_set_ivar_map(hw, 0, q_idx, vector_idx);
+		intr_handle->intr_vec[q_idx] = vector_idx;
+	}
+
+	/* Configure VF Rx queue ivar */
+	ixgbevf_set_ivar_map(hw, -1, 1, vector_idx);
+}
+
+/**
+ * Sets up the hardware to properly generate MSI-X interrupts
+ * @hw
+ *  board private structure
+ */
+static void
+ixgbe_configure_msix(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t queue_id, vec = 0;
+	uint32_t mask;
+	uint32_t gpie;
+
+	/* won't configure msix register if no mapping is done
+	 * between intr vector and event fd */
+	if (!rte_intr_dp_is_en(intr_handle))
+		return;
+
+	/* setup GPIE for MSI-x mode */
+	gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
+	gpie |= IXGBE_GPIE_MSIX_MODE | IXGBE_GPIE_PBA_SUPPORT |
+		IXGBE_GPIE_OCD | IXGBE_GPIE_EIAME;
+	/*
+	* auto clearing and auto setting corresponding bits in EIMS
+	* when MSI-X interrupt is triggered
+	*/
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE);
+	else {
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(0), 0xFFFFFFFF);
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(1), 0xFFFFFFFF);
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie);
+
+	/*
+	 * Populate the IVAR table and set the ITR values to the
+	 * corresponding register.
+	 */
+	for (queue_id = 0; queue_id < dev->data->nb_rx_queues;
+	     queue_id++) {
+		/* by default, 1:1 mapping */
+		ixgbe_set_ivar_map(hw, 0, queue_id, vec);
+		intr_handle->intr_vec[queue_id] = vec;
+		if (vec < intr_handle->nb_efd - 1)
+			vec++;
+	}
+
+	switch (hw->mac.type) {
+	case ixgbe_mac_82598EB:
+		ixgbe_set_ivar_map(hw, -1, IXGBE_IVAR_OTHER_CAUSES_INDEX,
+				   intr_handle->max_intr - 1);
+		break;
+	case ixgbe_mac_82599EB:
+	case ixgbe_mac_X540:
+		ixgbe_set_ivar_map(hw, -1, 1, intr_handle->max_intr - 1);
+		break;
+	default:
+		break;
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_EITR(queue_id),
+			 IXGBE_MIN_INTER_INTERRUPT_INTERVAL_DEFAULT & 0xFFF);
+
+	/* set up to autoclear timer, and the vectors */
+	mask = IXGBE_EIMS_ENABLE_MASK;
+	mask &= ~(IXGBE_EIMS_OTHER |
+		  IXGBE_EIMS_MAILBOX |
+		  IXGBE_EIMS_LSC);
+
+	IXGBE_WRITE_REG(hw, IXGBE_EIAC, mask);
+}
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 	uint16_t queue_idx, uint16_t tx_rate)
 {
diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.h b/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
index e45e727..a0359eb 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
@@ -117,6 +117,9 @@
 	ETH_RSS_IPV6_TCP_EX | \
 	ETH_RSS_IPV6_UDP_EX)
 
+#define IXGBE_VF_IRQ_ENABLE_MASK        3          /* vf irq enable mask */
+#define IXGBE_VF_MAXMSIVECTOR           1
+
 /*
  * Information about the fdir mode.
  */
@@ -325,6 +328,7 @@ uint32_t ixgbe_dev_rx_queue_count(struct rte_eth_dev *dev,
 		uint16_t rx_queue_id);
 
 int ixgbe_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);
+int ixgbevf_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);
 
 int ixgbe_dev_rx_init(struct rte_eth_dev *dev);
 
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 10/11] igb: enable rx queue interrupts for PF
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
                         ` (8 preceding siblings ...)
  2015-05-21  8:56       ` [dpdk-dev] [PATCH v8 09/11] ixgbe: enable rx queue interrupts for both PF and VF Cunming Liang
@ 2015-05-21  8:56       ` Cunming Liang
  2015-05-21  8:56       ` [dpdk-dev] [PATCH v8 11/11] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Cunming Liang
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:56 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch does below for igb PF:
- Setup NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v8 changes
 - add vfio-msi/vfio-legacy and uio-legacy support

v7 changes
 - add condition check when intr vector is not enabled

v6 changes
 - fill queue-vector mapping table

v5 changes
 - Rebase the patchset onto the HEAD

v3 changes
 - Remove unnecessary variables in e1000_mac_info
 - Remove spinlock from PMD

v2 changes
 - Consolidate review comments related to coding style

 lib/librte_pmd_e1000/igb_ethdev.c | 292 +++++++++++++++++++++++++++++++++-----
 1 file changed, 259 insertions(+), 33 deletions(-)

diff --git a/lib/librte_pmd_e1000/igb_ethdev.c b/lib/librte_pmd_e1000/igb_ethdev.c
index 4415155..e0925ce 100644
--- a/lib/librte_pmd_e1000/igb_ethdev.c
+++ b/lib/librte_pmd_e1000/igb_ethdev.c
@@ -96,6 +96,7 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
 static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
 				struct rte_eth_fc_conf *fc_conf);
 static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
 static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
@@ -194,6 +195,16 @@ static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
 		     enum rte_filter_op filter_op,
 		     void *arg);
 
+static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void eth_igb_configure_msix_intr(struct rte_eth_dev *dev);
+static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
+				uint8_t index, uint8_t offset);
+
 /*
  * Define VF Stats MACRO for Non "cleared on read" register
  */
@@ -253,6 +264,8 @@ static const struct eth_dev_ops eth_igb_ops = {
 	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
 	.vlan_offload_set     = eth_igb_vlan_offload_set,
 	.rx_queue_setup       = eth_igb_rx_queue_setup,
+	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
+	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
 	.rx_queue_release     = eth_igb_rx_queue_release,
 	.rx_queue_count       = eth_igb_rx_queue_count,
 	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
@@ -584,12 +597,6 @@ eth_igb_dev_init(struct rte_eth_dev *eth_dev)
 		     eth_dev->data->port_id, pci_dev->id.vendor_id,
 		     pci_dev->id.device_id);
 
-	rte_intr_callback_register(&(pci_dev->intr_handle),
-		eth_igb_interrupt_handler, (void *)eth_dev);
-
-	/* enable uio intr after callback register */
-	rte_intr_enable(&(pci_dev->intr_handle));
-
 	/* enable support intr */
 	igb_intr_enable(eth_dev);
 
@@ -739,11 +746,30 @@ eth_igb_configure(struct rte_eth_dev *dev)
 {
 	struct e1000_interrupt *intr =
 		E1000_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t intr_vector = 0;
 
 	PMD_INIT_FUNC_TRACE();
 	intr->flags |= E1000_FLAG_NEED_LINK_UPDATE;
 	PMD_INIT_FUNC_TRACE();
 
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		intr_vector = dev->data->nb_rx_queues;
+
+	if (rte_intr_efd_enable(intr_handle, intr_vector))
+		return -1;
+
+	if (rte_intr_dp_is_en(intr_handle)) {
+		intr_handle->intr_vec =
+			rte_zmalloc("intr_vec",
+				    dev->data->nb_rx_queues * sizeof(int), 0);
+		if (intr_handle->intr_vec == NULL) {
+			PMD_INIT_LOG(ERR, "Failed to allocate %d rx_queues"
+				     " intr_vec\n", dev->data->nb_rx_queues);
+			return -ENOMEM;
+		}
+	}
+
 	return (0);
 }
 
@@ -752,7 +778,7 @@ eth_igb_start(struct rte_eth_dev *dev)
 {
 	struct e1000_hw *hw =
 		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	int ret, i, mask;
+	int ret, mask;
 	uint32_t ctrl_ext;
 
 	PMD_INIT_FUNC_TRACE();
@@ -792,6 +818,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	igb_pf_host_configure(dev);
 
+	/* configure msix for rx interrupt */
+	eth_igb_configure_msix_intr(dev);
+
 	/* Configure for OS presence */
 	igb_init_manageability(hw);
 
@@ -819,33 +848,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 		igb_vmdq_vlan_hw_filter_enable(dev);
 	}
 
-	/*
-	 * Configure the Interrupt Moderation register (EITR) with the maximum
-	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
-	 * spurious [DMA] memory updates of RX and TX ring descriptors.
-	 *
-	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
-	 * spurious memory updates per second should be expected.
-	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
-	 *
-	 * Because interrupts are not used at all, the MSI-X is not activated
-	 * and interrupt moderation is controlled by EITR[0].
-	 *
-	 * Note that having [almost] disabled memory updates of RX and TX ring
-	 * descriptors through the Interrupt Moderation mechanism, memory
-	 * updates of ring descriptors are now moderated by the configurable
-	 * value of Write-Back Threshold registers.
-	 */
 	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580) ||
 		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210) ||
 		(hw->mac.type == e1000_i211)) {
-		uint32_t ivar;
-
-		/* Enable all RX & TX queues in the IVAR registers */
-		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) | E1000_IVAR_VALID);
-		for (i = 0; i < 8; i++)
-			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
-
 		/* Configure EITR with the maximum possible value (0xFFFF) */
 		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
 	}
@@ -896,8 +901,23 @@ eth_igb_start(struct rte_eth_dev *dev)
 	e1000_setup_link(hw);
 
 	/* check if lsc interrupt feature is enabled */
-	if (dev->data->dev_conf.intr_conf.lsc != 0)
-		ret = eth_igb_lsc_interrupt_setup(dev);
+	if (dev->data->dev_conf.intr_conf.lsc != 0) {
+		if (rte_intr_allow_others(&dev->pci_dev->intr_handle)) {
+			rte_intr_callback_register(&(dev->pci_dev->intr_handle),
+						   eth_igb_interrupt_handler,
+						   (void *)dev);
+			eth_igb_lsc_interrupt_setup(dev);
+		} else
+			PMD_INIT_LOG(INFO, "lsc won't enable because of"
+				     " no intr multiplex\n");
+	}
+
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		eth_igb_rxq_interrupt_setup(dev);
+
+	/* enable uio/vfio intr/eventfd mapping */
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
 
 	/* resume enabled intr since hw reset */
 	igb_intr_enable(dev);
@@ -930,8 +950,13 @@ eth_igb_stop(struct rte_eth_dev *dev)
 	struct e1000_flex_filter *p_flex;
 	struct e1000_5tuple_filter *p_5tuple, *p_5tuple_next;
 	struct e1000_2tuple_filter *p_2tuple, *p_2tuple_next;
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 
 	igb_intr_disable(hw);
+
+	/* disable intr eventfd mapping */
+	rte_intr_disable(intr_handle);
+
 	igb_pf_reset_hw(hw);
 	E1000_WRITE_REG(hw, E1000_WUC, 0);
 
@@ -980,6 +1005,13 @@ eth_igb_stop(struct rte_eth_dev *dev)
 		rte_free(p_2tuple);
 	}
 	filter_info->twotuple_mask = 0;
+
+	/* Clean datapath event and queue/vec mapping */
+	rte_intr_efd_disable(intr_handle);
+	if (intr_handle->intr_vec != NULL) {
+		rte_free(intr_handle->intr_vec);
+		intr_handle->intr_vec = NULL;
+	}
 }
 
 static void
@@ -987,6 +1019,7 @@ eth_igb_close(struct rte_eth_dev *dev)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 	struct rte_eth_link link;
+	struct rte_pci_device *pci_dev;
 
 	eth_igb_stop(dev);
 	e1000_phy_hw_reset(hw);
@@ -1004,6 +1037,12 @@ eth_igb_close(struct rte_eth_dev *dev)
 
 	igb_dev_clear_queues(dev);
 
+	pci_dev = dev->pci_dev;
+	if (pci_dev->intr_handle.intr_vec) {
+		rte_free(pci_dev->intr_handle.intr_vec);
+		pci_dev->intr_handle.intr_vec = NULL;
+	}
+
 	memset(&link, 0, sizeof(link));
 	rte_igb_dev_atomic_write_link_status(dev, &link);
 }
@@ -1828,6 +1867,34 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
 }
 
 /*
+ * It enables the rx queue interrupts by setting their bits in EIMS.
+ * It is called from eth_igb_start() when rxq interrupts are configured.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	uint32_t mask, regval;
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_eth_dev_info dev_info;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(dev, &dev_info);
+
+	mask = 0xFFFFFFFF >> (32 - dev_info.max_rx_queues);
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
+
+	return 0;
+}
+
+/*
  * It reads ICR and gets interrupt causes, check it and set a bit flag
  * to update link status.
  *
@@ -3652,5 +3719,164 @@ static struct rte_driver pmd_igbvf_drv = {
 	.init = rte_igbvf_pmd_init,
 };
 
+static int
+eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t mask = 1 << queue_id;
+
+	E1000_WRITE_REG(hw, E1000_EIMC, mask);
+	E1000_WRITE_FLUSH(hw);
+
+	return 0;
+}
+
+static int
+eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t mask = 1 << queue_id;
+	uint32_t regval;
+
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
+	E1000_WRITE_FLUSH(hw);
+
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
+
+	return 0;
+}
+
+static void
+eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
+			uint8_t index, uint8_t offset)
+{
+	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
+
+	/* clear bits */
+	val &= ~((uint32_t)0xFF << offset);
+
+	/* write vector and valid bit */
+	val |= (msix_vector | E1000_IVAR_VALID) << offset;
+
+	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
+}
+
+static void
+eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				 uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp = 0;
+	if (hw->mac.type == e1000_82575) {
+		if (direction == 0)
+			tmp = E1000_EICR_RX_QUEUE0 << queue;
+		else if (direction == 1)
+			tmp = E1000_EICR_TX_QUEUE0 << queue;
+		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), tmp);
+	} else if (hw->mac.type == e1000_82576) {
+		if ((direction == 0) || (direction == 1))
+			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
+					((queue & 0x8) << 1) + 8 * direction);
+	} else if ((hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		if ((direction == 0) || (direction == 1))
+			eth_igb_write_ivar(hw, msix_vector,
+					queue >> 1,
+					((queue & 0x1) << 4) + 8 * direction);
+	}
+}
+
+/*
+ * Sets up the hardware to generate MSI-X interrupts properly
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ */
+static void
+eth_igb_configure_msix_intr(struct rte_eth_dev *dev)
+{
+	int queue_id;
+	uint32_t tmpval, regval, intr_mask;
+	uint32_t max_rx_queues;
+	struct rte_eth_dev_info dev_info;
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t vec = 0;
+
+	/* won't configure msix register if no mapping is done
+	 * between intr vector and event fd */
+	if (!rte_intr_dp_is_en(intr_handle))
+		return;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(dev, &dev_info);
+	max_rx_queues = dev_info.max_rx_queues;
+
+	/* set interrupt vector for other causes */
+	if (hw->mac.type == e1000_82575) {
+		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
+		/* enable MSI-X PBA support */
+		tmpval |= E1000_CTRL_EXT_PBA_CLR;
+
+		/* Auto-Mask interrupts upon ICR read */
+		tmpval |= E1000_CTRL_EXT_EIAME;
+		tmpval |= E1000_CTRL_EXT_IRCA;
+
+		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
+
+		/* enable msix_other interrupt */
+		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0, E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAM);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | E1000_EIMS_OTHER);
+	} else if ((hw->mac.type == e1000_82576) ||
+			(hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		/* turn on MSI-X capability first */
+		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
+					E1000_GPIE_PBA | E1000_GPIE_EIAME |
+					E1000_GPIE_NSICR);
+
+		/* enable msix_other interrupt */
+		intr_mask = 1 << max_rx_queues;
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+		regval = E1000_READ_REG(hw, E1000_EIMS);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
+		tmpval = (max_rx_queues | E1000_IVAR_VALID) << 8;
+
+		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
+	}
+
+	/*
+	* use EIAM and EIAC to auto-mask and auto-clear when MSI-X interrupt
+	* is asserted, this saves a register write for every interrupt
+	*/
+	intr_mask = 0xFFFFFFFF >> (32 - max_rx_queues);
+	regval = E1000_READ_REG(hw, E1000_EIAC);
+	E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+	regval = E1000_READ_REG(hw, E1000_EIAM);
+	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
+
+	for (queue_id = 0; queue_id < dev->data->nb_rx_queues; queue_id++) {
+		eth_igb_assign_msix_vector(hw, 0, queue_id, vec);
+		intr_handle->intr_vec[queue_id] = vec;
+		if (vec < intr_handle->nb_efd - 1)
+			vec++;
+	}
+
+	E1000_WRITE_FLUSH(hw);
+}
+
+
 PMD_REGISTER_DRIVER(pmd_igb_drv);
 PMD_REGISTER_DRIVER(pmd_igbvf_drv);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v8 11/11] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
                         ` (9 preceding siblings ...)
  2015-05-21  8:56       ` [dpdk-dev] [PATCH v8 10/11] igb: enable rx queue interrupts for PF Cunming Liang
@ 2015-05-21  8:56       ` Cunming Liang
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
  11 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-21  8:56 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

Demonstrate how to handle per rx queue interrupt in a NAPI-like
implementation in userspace. The DPDK polling thread mainly works in
polling mode and switches to interrupt mode only if no packet has
been received in recent polls.
Userspace interrupt notification generally takes a lot more cycles
than kernel, so one-shot interrupt is used here to guarantee minimum
overhead and DPDK polling thread returns to polling mode immediately
once it receives an interrupt notification for incoming packet.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes
 - using new APIs
 - demo multiple port/queue pair wait on the same epoll instance

v6 changes
 - Split event fd add and wait

v5 changes
 - Change invoked function name and parameter to accommodate EAL change

v3 changes
 - Add spinlock to ensure thread safe when accessing interrupt mask
   register

v2 changes
 - Remove unused function which is for debug purpose

 examples/l3fwd-power/main.c | 207 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 165 insertions(+), 42 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index bb0b66f..6dcc0b7 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -74,12 +74,14 @@
 #include <rte_string_fns.h>
 #include <rte_timer.h>
 #include <rte_power.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
 
 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1
 
 #define MAX_PKT_BURST 32
 
-#define MIN_ZERO_POLL_COUNT 5
+#define MIN_ZERO_POLL_COUNT 10
 
 /* around 100ms at 2 Ghz */
 #define TIMER_RESOLUTION_CYCLES           200000000ULL
@@ -155,6 +157,9 @@ static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
 /* ethernet addresses of ports */
 static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
 
+/* per-port spinlocks serializing rx interrupt enable calls */
+static rte_spinlock_t locks[RTE_MAX_ETHPORTS];
+
 /* mask of enabled ports */
 static uint32_t enabled_port_mask = 0;
 /* Ports set in promiscuous mode off by default. */
@@ -187,6 +192,9 @@ struct lcore_rx_queue {
 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
 #define MAX_RX_QUEUE_PER_PORT 128
 
+#define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16
+
+
 #define MAX_LCORE_PARAMS 1024
 struct lcore_params {
 	uint8_t port_id;
@@ -213,7 +221,7 @@ static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) /
 
 static struct rte_eth_conf port_conf = {
 	.rxmode = {
-		.mq_mode	= ETH_MQ_RX_RSS,
+		.mq_mode = ETH_MQ_RX_RSS,
 		.max_rx_pkt_len = ETHER_MAX_LEN,
 		.split_hdr_size = 0,
 		.header_split   = 0, /**< Header Split disabled */
@@ -225,11 +233,15 @@ static struct rte_eth_conf port_conf = {
 	.rx_adv_conf = {
 		.rss_conf = {
 			.rss_key = NULL,
-			.rss_hf = ETH_RSS_IP,
+			.rss_hf = ETH_RSS_UDP,
 		},
 	},
 	.txmode = {
-		.mq_mode = ETH_DCB_NONE,
+		.mq_mode = ETH_MQ_TX_NONE,
+	},
+	.intr_conf = {
+		.lsc = 1,
+		.rxq = 1, /**< rxq interrupt feature enabled */
 	},
 };
 
@@ -401,19 +413,22 @@ power_timer_cb(__attribute__((unused)) struct rte_timer *tim,
 	/* accumulate total execution time in us when callback is invoked */
 	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
 					(float)SCALING_PERIOD;
-
 	/**
 	 * check whether need to scale down frequency a step if it sleep a lot.
 	 */
-	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD)
-		rte_power_freq_down(lcore_id);
+	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 	else if ( (unsigned)(stats[lcore_id].nb_rx_processed /
-		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST)
+		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
 		/**
 		 * scale down a step if average packet per iteration less
 		 * than expectation.
 		 */
-		rte_power_freq_down(lcore_id);
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 
 	/**
 	 * initialize another timer according to current frequency to ensure
@@ -706,22 +721,20 @@ l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid,
 
 }
 
-#define SLEEP_GEAR1_THRESHOLD            100
-#define SLEEP_GEAR2_THRESHOLD            1000
+#define MINIMUM_SLEEP_TIME         1
+#define SUSPEND_THRESHOLD          300
 
 static inline uint32_t
 power_idle_heuristic(uint32_t zero_rx_packet_count)
 {
-	/* If zero count is less than 100, use it as the sleep time in us */
-	if (zero_rx_packet_count < SLEEP_GEAR1_THRESHOLD)
-		return zero_rx_packet_count;
-	/* If zero count is less than 1000, sleep time should be 100 us */
-	else if ((zero_rx_packet_count >= SLEEP_GEAR1_THRESHOLD) &&
-			(zero_rx_packet_count < SLEEP_GEAR2_THRESHOLD))
-		return SLEEP_GEAR1_THRESHOLD;
-	/* If zero count is greater than 1000, sleep time should be 1000 us */
-	else if (zero_rx_packet_count >= SLEEP_GEAR2_THRESHOLD)
-		return SLEEP_GEAR2_THRESHOLD;
+	/* If zero count is less than SUSPEND_THRESHOLD, sleep 1us */
+	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
+		return MINIMUM_SLEEP_TIME;
+	/* Otherwise return SUSPEND_THRESHOLD: the caller then suspends until
+		an rx interrupt (switching from C3/C6 to C0 takes >= 100 us)
+	*/
+	else
+		return SUSPEND_THRESHOLD;
 
 	return 0;
 }
@@ -761,6 +774,84 @@ power_freq_scaleup_heuristic(unsigned lcore_id,
 	return FREQ_CURRENT;
 }
 
+/**
+ * force polling thread sleep until one-shot rx interrupt triggers
+ * @param num
+ *  Maximum number of epoll events to wait for; the caller passes
+ *  the number of rx queues polled by this lcore
+ *  (qconf->n_rx_queue).
+ * @return
+ *  0 on success
+ */
+static int
+sleep_until_rx_interrupt(int num)
+{
+	struct rte_epoll_event event[num];
+	int n, i;
+	uint8_t port_id, queue_id;
+	void *data;
+
+	RTE_LOG(INFO, L3FWD_POWER,
+		"lcore %u sleeps until interrupt triggers\n",
+		rte_lcore_id());
+
+	n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, num, -1);
+	for (i = 0; i < n; i++) {
+		data = event[i].epdata.data;
+		port_id = ((uintptr_t)data) >> CHAR_BIT;
+		queue_id = ((uintptr_t)data) &
+			RTE_LEN2MASK(CHAR_BIT, uint8_t);
+		RTE_LOG(INFO, L3FWD_POWER,
+			"lcore %u is waked up from rx interrupt on"
+			" port %d queue %d\n",
+			rte_lcore_id(), port_id, queue_id);
+	}
+
+	return 0;
+}
+
+static int turn_on_intr(struct lcore_conf *qconf)
+{
+	int i;
+	struct lcore_rx_queue *rx_queue;
+	uint8_t port_id, queue_id;
+
+	for (i = 0; i < qconf->n_rx_queue; ++i) {
+		rx_queue = &(qconf->rx_queue_list[i]);
+		port_id = rx_queue->port_id;
+		queue_id = rx_queue->queue_id;
+
+		rte_spinlock_lock(&(locks[port_id]));
+		rte_eth_dev_rx_intr_enable(port_id, queue_id);
+		rte_spinlock_unlock(&(locks[port_id]));
+	}
+}
+
+static int event_register(struct lcore_conf *qconf)
+{
+	struct lcore_rx_queue *rx_queue;
+	uint8_t portid, queueid;
+	uint32_t data;
+	int ret;
+	int i;
+
+	for (i = 0; i < qconf->n_rx_queue; ++i) {
+		rx_queue = &(qconf->rx_queue_list[i]);
+		portid = rx_queue->port_id;
+		queueid = rx_queue->queue_id;
+		data = portid << CHAR_BIT | queueid;
+
+		ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid,
+						RTE_EPOLL_PER_THREAD,
+						RTE_INTR_EVENT_ADD,
+						(void *)((uintptr_t)data));
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 /* main processing loop */
 static int
 main_loop(__attribute__((unused)) void *dummy)
@@ -774,9 +865,9 @@ main_loop(__attribute__((unused)) void *dummy)
 	struct lcore_conf *qconf;
 	struct lcore_rx_queue *rx_queue;
 	enum freq_scale_hint_t lcore_scaleup_hint;
-
 	uint32_t lcore_rx_idle_count = 0;
 	uint32_t lcore_idle_hint = 0;
+	int intr_en = 0;
 
 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
 
@@ -793,13 +884,18 @@ main_loop(__attribute__((unused)) void *dummy)
 	RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n", lcore_id);
 
 	for (i = 0; i < qconf->n_rx_queue; i++) {
-
 		portid = qconf->rx_queue_list[i].port_id;
 		queueid = qconf->rx_queue_list[i].queue_id;
 		RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%hhu "
 			"rxqueueid=%hhu\n", lcore_id, portid, queueid);
 	}
 
+	/* add into event wait list */
+	if (port_conf.intr_conf.rxq && event_register(qconf) == 0)
+		intr_en = 1;
+	else
+		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n");
+
 	while (1) {
 		stats[lcore_id].nb_iteration_looped++;
 
@@ -834,6 +930,7 @@ main_loop(__attribute__((unused)) void *dummy)
 			prev_tsc_power = cur_tsc_power;
 		}
 
+start_rx:
 		/*
 		 * Read packet from RX queues
 		 */
@@ -847,6 +944,7 @@ main_loop(__attribute__((unused)) void *dummy)
 
 			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
 								MAX_PKT_BURST);
+
 			stats[lcore_id].nb_rx_processed += nb_rx;
 			if (unlikely(nb_rx == 0)) {
 				/**
@@ -909,10 +1007,13 @@ main_loop(__attribute__((unused)) void *dummy)
 						rx_queue->freq_up_hint;
 			}
 
-			if (lcore_scaleup_hint == FREQ_HIGHEST)
-				rte_power_freq_max(lcore_id);
-			else if (lcore_scaleup_hint == FREQ_HIGHER)
-				rte_power_freq_up(lcore_id);
+			if (lcore_scaleup_hint == FREQ_HIGHEST) {
+				if (rte_power_freq_max)
+					rte_power_freq_max(lcore_id);
+			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
+				if (rte_power_freq_up)
+					rte_power_freq_up(lcore_id);
+			}
 		} else {
 			/**
 			 * All Rx queues empty in recent consecutive polls,
@@ -927,16 +1028,23 @@ main_loop(__attribute__((unused)) void *dummy)
 					lcore_idle_hint = rx_queue->idle_hint;
 			}
 
-			if ( lcore_idle_hint < SLEEP_GEAR1_THRESHOLD)
+			if (lcore_idle_hint < SUSPEND_THRESHOLD)
 				/**
-				 * execute "pause" instruction to avoid context
-				 * switch for short sleep.
- 				 */
+				* execute "pause" instruction to avoid context
+				* switch which generally takes hundreds of
+				* microseconds for short sleep.
+				*/
 				rte_delay_us(lcore_idle_hint);
-			else
-				/* long sleep force runing thread to suspend */
-				usleep(lcore_idle_hint);
-
+			else {
+				/* suspend until rx interrupt triggers */
+				if (intr_en) {
+					turn_on_intr(qconf);
+					sleep_until_rx_interrupt(
+						qconf->n_rx_queue);
+				}
+				/* start receiving packets immediately */
+				goto start_rx;
+			}
 			stats[lcore_id].sleep_time += lcore_idle_hint;
 		}
 	}
@@ -1269,7 +1377,7 @@ setup_hash(int socketid)
 	char s[64];
 
 	/* create ipv4 hash */
-	snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
 	ipv4_l3fwd_hash_params.name = s;
 	ipv4_l3fwd_hash_params.socket_id = socketid;
 	ipv4_l3fwd_lookup_struct[socketid] =
@@ -1279,7 +1387,7 @@ setup_hash(int socketid)
 				"socket %d\n", socketid);
 
 	/* create ipv6 hash */
-	snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
 	ipv6_l3fwd_hash_params.name = s;
 	ipv6_l3fwd_hash_params.socket_id = socketid;
 	ipv6_l3fwd_lookup_struct[socketid] =
@@ -1472,6 +1580,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint64_t hz;
 	uint32_t n_tx_queue, nb_lcores;
+	uint32_t dev_rxq_num, dev_txq_num;
 	uint8_t portid, nb_rx_queue, queue, socketid;
 
 	/* catch SIGINT and restore cpufreq governor to ondemand */
@@ -1521,10 +1630,19 @@ main(int argc, char **argv)
 		printf("Initializing port %d ... ", portid );
 		fflush(stdout);
 
+		rte_eth_dev_info_get(portid, &dev_info);
+		dev_rxq_num = dev_info.max_rx_queues;
+		dev_txq_num = dev_info.max_tx_queues;
+
 		nb_rx_queue = get_port_n_rx_queues(portid);
+		if (nb_rx_queue > dev_rxq_num)
+			rte_exit(EXIT_FAILURE,
+				"Cannot configure not existed rxq: "
+				"port=%d\n", portid);
+
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
+		if (n_tx_queue > dev_txq_num)
+			n_tx_queue = dev_txq_num;
 		printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
 			nb_rx_queue, (unsigned)n_tx_queue );
 		ret = rte_eth_dev_configure(portid, nb_rx_queue,
@@ -1548,6 +1666,9 @@ main(int argc, char **argv)
 			if (rte_lcore_is_enabled(lcore_id) == 0)
 				continue;
 
+			if (queueid >= dev_txq_num)
+				continue;
+
 			if (numa_on)
 				socketid = \
 				(uint8_t)rte_lcore_to_socket_id(lcore_id);
@@ -1582,8 +1703,9 @@ main(int argc, char **argv)
 		/* init power management library */
 		ret = rte_power_init(lcore_id);
 		if (ret)
-			rte_exit(EXIT_FAILURE, "Power management library "
-				"initialization failed on core%u\n", lcore_id);
+			rte_log(RTE_LOG_ERR, RTE_LOGTYPE_POWER,
+				"Power management library initialization "
+				"failed on core%u", lcore_id);
 
 		/* init timer structures for each enabled lcore */
 		rte_timer_init(&power_timers[lcore_id]);
@@ -1631,7 +1753,6 @@ main(int argc, char **argv)
 		if (ret < 0)
 			rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, "
 						"port=%d\n", ret, portid);
-
 		/*
 		 * If enabled, put device in promiscuous mode.
 		 * This allows IO forwarding mode to forward packets
@@ -1640,6 +1761,8 @@ main(int argc, char **argv)
 		 */
 		if (promiscuous_on)
 			rte_eth_promiscuous_enable(portid);
+		/* initialize spinlock for each port */
+		rte_spinlock_init(&(locks[portid]));
 	}
 
 	check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
@ 2015-05-21 10:32         ` Neil Horman
       [not found]           ` <20150521104300.00757b4e@urahara>
  0 siblings, 1 reply; 242+ messages in thread
From: Neil Horman @ 2015-05-21 10:32 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev, liang-min.wang, shemming

On Thu, May 21, 2015 at 04:55:53PM +0800, Cunming Liang wrote:
> The patch adds interrupt vectors support in rte_intr_handle.
> 'vec_en' is set when interrupt vectors are detected and associated event fds are set.
> Those event fds are stored in efds[].
> 'intr_vec' is reserved for device driver to initialize the vector mapping table.
> When the event fds add to a specified epoll instance, 'elist' will hold the rte_epoll_event object pointer.
> 
> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> ---
> v7 changes:
>  - add eptrs[], it's used to store the register rte_epoll_event instances.
>  - add vec_en, to log the vector capability status.
> 
> v6 changes:
>  - add mapping table between irq vector number and queue id.
> 
> v5 changes:
>  - Create this new patch file for changed struct rte_intr_handle that
>    other patches depend on, to avoid breaking git bisect.
> 
>  lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> index 6a159c7..27174df 100644
> --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> @@ -38,6 +38,8 @@
>  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
>  #define _RTE_LINUXAPP_INTERRUPTS_H_
>  
> +#define RTE_MAX_RXTX_INTR_VEC_ID     32
> +
>  enum rte_intr_handle_type {
>  	RTE_INTR_HANDLE_UNKNOWN = 0,
>  	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
> @@ -48,6 +50,8 @@ enum rte_intr_handle_type {
>  	RTE_INTR_HANDLE_MAX
>  };
>  
> +struct rte_epoll_event;
> +
>  /** Handle for interrupts. */
>  struct rte_intr_handle {
>  	union {
> @@ -57,6 +61,12 @@ struct rte_intr_handle {
>  	};
>  	int fd;	 /**< interrupt event file descriptor */
>  	enum rte_intr_handle_type type;  /**< handle type */
> +	uint32_t max_intr;               /**< max interrupt requested */
> +	uint32_t nb_efd;                 /**< number of available efds */
> +	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
> +	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
> +					 /**< intr vector epoll event ptr */
> +	int *intr_vec;                   /**< intr vector number array */
>  };
>  

This is going to be ABI breaking if this from test_interrupts.c:
static struct rte_intr_handle intr_handles[TEST_INTERRUPT_HANDLE_MAX];

is a plausible way of using this structure.  Even putting the data at the end of
the structure won't help, as the array indices are off.
Neil

>  #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
> -- 
> 1.8.1.4
> 
> 

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle
       [not found]           ` <20150521104300.00757b4e@urahara>
@ 2015-05-21 17:58             ` Neil Horman
  2015-05-21 18:21               ` Stephen Hemminger
                                 ` (2 more replies)
  0 siblings, 3 replies; 242+ messages in thread
From: Neil Horman @ 2015-05-21 17:58 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, liang-min.wang

On Thu, May 21, 2015 at 10:43:00AM -0700, Stephen Hemminger wrote:
> On Thu, 21 May 2015 06:32:02 -0400
> Neil Horman <nhorman@tuxdriver.com> wrote:
> 
> > On Thu, May 21, 2015 at 04:55:53PM +0800, Cunming Liang wrote:
> > > The patch adds interrupt vectors support in rte_intr_handle.
> > > 'vec_en' is set when interrupt vectors are detected and associated event fds are set.
> > > Those event fds are stored in efds[].
> > > 'intr_vec' is reserved for device driver to initialize the vector mapping table.
> > > When the event fds add to a specified epoll instance, 'elist' will hold the rte_epoll_event object pointer.
> > > 
> > > Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > ---
> > > v7 changes:
> > >  - add eptrs[], it's used to store the register rte_epoll_event instances.
> > >  - add vec_en, to log the vector capability status.
> > > 
> > > v6 changes:
> > >  - add mapping table between irq vector number and queue id.
> > > 
> > > v5 changes:
> > >  - Create this new patch file for changed struct rte_intr_handle that
> > >    other patches depend on, to avoid breaking git bisect.
> > > 
> > >  lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
> > >  1 file changed, 10 insertions(+)
> > > 
> > > diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > index 6a159c7..27174df 100644
> > > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > @@ -38,6 +38,8 @@
> > >  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
> > >  #define _RTE_LINUXAPP_INTERRUPTS_H_
> > >  
> > > +#define RTE_MAX_RXTX_INTR_VEC_ID     32
> > > +
> > >  enum rte_intr_handle_type {
> > >  	RTE_INTR_HANDLE_UNKNOWN = 0,
> > >  	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
> > > @@ -48,6 +50,8 @@ enum rte_intr_handle_type {
> > >  	RTE_INTR_HANDLE_MAX
> > >  };
> > >  
> > > +struct rte_epoll_event;
> > > +
> > >  /** Handle for interrupts. */
> > >  struct rte_intr_handle {
> > >  	union {
> > > @@ -57,6 +61,12 @@ struct rte_intr_handle {
> > >  	};
> > >  	int fd;	 /**< interrupt event file descriptor */
> > >  	enum rte_intr_handle_type type;  /**< handle type */
> > > +	uint32_t max_intr;               /**< max interrupt requested */
> > > +	uint32_t nb_efd;                 /**< number of available efds */
> > > +	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
> > > +	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
> > > +					 /**< intr vector epoll event ptr */
> > > +	int *intr_vec;                   /**< intr vector number array */
> > >  };
> > >    
> > 
> > This is going to be ABI breaking if this from test_interrupts.c:
> > static struct rte_intr_handle intr_handles[TEST_INTERRUPT_HANDLE_MAX];
> > 
> > is a plausible way of using this structure.  Even putting the data at the end of
> > the structure won't help, as the array indicies are off
> 
> This needs to go in 2.0 and 2.0 has to have new ABI anyway.
> 
We've already released 2.0, I think you mean 2.1, but 2.1 can't have a new ABI
because we didn't announce it in 1.8.  The earliest we can update the ABI
(according to the ABI docs) at this point is 2.2, since we need to announce the
change in 2.1, then make it in 2.2

Neil

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle
  2015-05-21 17:58             ` Neil Horman
@ 2015-05-21 18:21               ` Stephen Hemminger
       [not found]               ` <20150521111400.2a04a196@urahara>
  2015-05-29  8:56               ` Liang, Cunming
  2 siblings, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-21 18:21 UTC (permalink / raw)
  To: Neil Horman; +Cc: dev, liang-min.wang

On Thu, 21 May 2015 13:58:46 -0400
Neil Horman <nhorman@tuxdriver.com> wrote:

> On Thu, May 21, 2015 at 10:43:00AM -0700, Stephen Hemminger wrote:
> > On Thu, 21 May 2015 06:32:02 -0400
> > Neil Horman <nhorman@tuxdriver.com> wrote:
> > 
> > > On Thu, May 21, 2015 at 04:55:53PM +0800, Cunming Liang wrote:
> > > > The patch adds interrupt vectors support in rte_intr_handle.
> > > > 'vec_en' is set when interrupt vectors are detected and associated event fds are set.
> > > > Those event fds are stored in efds[].
> > > > 'intr_vec' is reserved for device driver to initialize the vector mapping table.
> > > > When the event fds add to a specified epoll instance, 'elist' will hold the rte_epoll_event object pointer.
> > > > 
> > > > Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> > > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > > ---
> > > > v7 changes:
> > > >  - add eptrs[], it's used to store the register rte_epoll_event instances.
> > > >  - add vec_en, to log the vector capability status.
> > > > 
> > > > v6 changes:
> > > >  - add mapping table between irq vector number and queue id.
> > > > 
> > > > v5 changes:
> > > >  - Create this new patch file for changed struct rte_intr_handle that
> > > >    other patches depend on, to avoid breaking git bisect.
> > > > 
> > > >  lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
> > > >  1 file changed, 10 insertions(+)
> > > > 
> > > > diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > index 6a159c7..27174df 100644
> > > > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > @@ -38,6 +38,8 @@
> > > >  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
> > > >  #define _RTE_LINUXAPP_INTERRUPTS_H_
> > > >  
> > > > +#define RTE_MAX_RXTX_INTR_VEC_ID     32
> > > > +
> > > >  enum rte_intr_handle_type {
> > > >  	RTE_INTR_HANDLE_UNKNOWN = 0,
> > > >  	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
> > > > @@ -48,6 +50,8 @@ enum rte_intr_handle_type {
> > > >  	RTE_INTR_HANDLE_MAX
> > > >  };
> > > >  
> > > > +struct rte_epoll_event;
> > > > +
> > > >  /** Handle for interrupts. */
> > > >  struct rte_intr_handle {
> > > >  	union {
> > > > @@ -57,6 +61,12 @@ struct rte_intr_handle {
> > > >  	};
> > > >  	int fd;	 /**< interrupt event file descriptor */
> > > >  	enum rte_intr_handle_type type;  /**< handle type */
> > > > +	uint32_t max_intr;               /**< max interrupt requested */
> > > > +	uint32_t nb_efd;                 /**< number of available efds */
> > > > +	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
> > > > +	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
> > > > +					 /**< intr vector epoll event ptr */
> > > > +	int *intr_vec;                   /**< intr vector number array */
> > > >  };
> > > >    
> > > 
> > > This is going to be ABI breaking if this from test_interrupts.c:
> > > static struct rte_intr_handle intr_handles[TEST_INTERRUPT_HANDLE_MAX];
> > > 
> > > is a plausible way of using this structure.  Even putting the data at the end of
> > > the structure won't help, as the array indicies are off
> > 
> > This needs to go in 2.0 and 2.0 has to have new ABI anyway.
> > 
> We've already released 2.0, I think you mean 2.1, but 2.1 can't have a new ABI
> because we didn't announce it in 1.8.  The earliest we can update the ABI
> (according to the ABI docs) at this point is 2.2, since we need to announce the
> change in 2.1, then make it in 2.2
> 
> Neil
> 

Then just skip 2.1 (or make it a trivial doc change only dummy release),
and call it 2.2.

I guess we need to proactively say every .x release will have new ABI.
Sorry, this is a project under development.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 02/11] eal/linux: add rte_epoll_wait/ctl support
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 02/11] eal/linux: add rte_epoll_wait/ctl support Cunming Liang
@ 2015-05-21 18:22         ` Stephen Hemminger
       [not found]         ` <20150521111704.727cf3a1@urahara>
  1 sibling, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-21 18:22 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev, liang-min.wang

On Thu, 21 May 2015 16:55:54 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> +static int
> +eal_epoll_process_event(struct epoll_event *evs, int n,
> +			struct rte_epoll_event *events)
> +{
> +	int i;
> +	int count = 0;
> +	struct rte_epoll_event *rev;
> +	for (i = 0; i < n; i++) {
> +		rev = (struct rte_epoll_event *)evs[i].data.ptr;
> +		if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
> +						 RTE_EPOLL_EXEC))
> +			continue;
> +
> +		events[count].status        = RTE_EPOLL_VALID;
> +		events[count].fd            = rev->fd;
> +		events[count].epfd          = rev->epfd;
> +		events[count].epdata.event  = rev->epdata.event;
> +		events[count].epdata.data   = rev->epdata.data;

This code has several style issues:
 1. Always put blank line after declarations

 2. Use unsigned wherever it makes sense as a matter of habit.
      unsigned int i, count = 0;

 3. Don't add casts where not necessary, it reduces compiler type checking
    and is a bad habit. In this case evs[i].data.ptr is void *
    and therefore no cast is needed.
 

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 08/11] ethdev: add rx intr enable, disable and ctl functions
  2015-05-21  8:56       ` [dpdk-dev] [PATCH v8 08/11] ethdev: add rx intr enable, disable and ctl functions Cunming Liang
@ 2015-05-21 18:22         ` Stephen Hemminger
  2015-05-21 18:22         ` Stephen Hemminger
       [not found]         ` <20150521112030.4d31a0e4@urahara>
  2 siblings, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-21 18:22 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev, liang-min.wang

On Thu, 21 May 2015 16:56:00 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> +	if (!rte_eth_dev_is_valid_port(port_id)) {
> +		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
> +		return -ENODEV;
> +	}
> +
> +	dev = &rte_eth_devices[port_id];
> +	if (dev == NULL) {
> +		PMD_DEBUG_TRACE("Invalid port device\n");
> +		return -ENODEV;
> +	}

This check is not needed, rte_eth_dev_is_valid_port already
checked that.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 08/11] ethdev: add rx intr enable, disable and ctl functions
  2015-05-21  8:56       ` [dpdk-dev] [PATCH v8 08/11] ethdev: add rx intr enable, disable and ctl functions Cunming Liang
  2015-05-21 18:22         ` Stephen Hemminger
@ 2015-05-21 18:22         ` Stephen Hemminger
       [not found]         ` <20150521112030.4d31a0e4@urahara>
  2 siblings, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-21 18:22 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev, liang-min.wang

On Thu, 21 May 2015 16:56:00 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> +int
> +rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
> +			  int epfd, int op, void *data)
> +{
> +	uint32_t vec;
> +	struct rte_eth_dev *dev;
> +	struct rte_intr_handle *intr_handle;
> +	int rc;
> +
> +	if (!rte_eth_dev_is_valid_port(port_id)) {
> +		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);

Use %u when printing unsigned value

> +		return -ENODEV;
> +	}
> +
> +	dev = &rte_eth_devices[port_id];
> +	if (dev == NULL) {
> +		PMD_DEBUG_TRACE("Invalid port device\n");
> +		return -ENODEV;
> +	}
Another unnecessary conditional check, already done in rte_eth_dev_is_valid_port

> +
> +	if (queue_id >= dev->data->nb_rx_queues) {
> +		PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id);
This is wrong; it won't build with debug enabled. You meant to use queue_id
here, and the format should be %u.

> +		return -EINVAL;
> +	}
> +
> +	intr_handle = &dev->pci_dev->intr_handle;
> +	if (!intr_handle->intr_vec) {
> +		PMD_DEBUG_TRACE("RX Intr vector unset\n");
> +		return -EPERM;
> +	}
> +

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle
       [not found]               ` <20150521111400.2a04a196@urahara>
@ 2015-05-22  0:05                 ` Neil Horman
       [not found]                 ` <40594e9e6e0543afa11e4dbd90e59b22@BRMWP-EXMB11.corp.brocade.com>
  1 sibling, 0 replies; 242+ messages in thread
From: Neil Horman @ 2015-05-22  0:05 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, liang-min.wang

On Thu, May 21, 2015 at 11:14:00AM -0700, Stephen Hemminger wrote:
> On Thu, 21 May 2015 13:58:46 -0400
> Neil Horman <nhorman@tuxdriver.com> wrote:
> 
> > On Thu, May 21, 2015 at 10:43:00AM -0700, Stephen Hemminger wrote:
> > > On Thu, 21 May 2015 06:32:02 -0400
> > > Neil Horman <nhorman@tuxdriver.com> wrote:
> > > 
> > > > On Thu, May 21, 2015 at 04:55:53PM +0800, Cunming Liang wrote:
> > > > > The patch adds interrupt vectors support in rte_intr_handle.
> > > > > 'vec_en' is set when interrupt vectors are detected and associated event fds are set.
> > > > > Those event fds are stored in efds[].
> > > > > 'intr_vec' is reserved for device driver to initialize the vector mapping table.
> > > > > When the event fds add to a specified epoll instance, 'elist' will hold the rte_epoll_event object pointer.
> > > > > 
> > > > > Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> > > > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > > > ---
> > > > > v7 changes:
> > > > >  - add eptrs[], it's used to store the register rte_epoll_event instances.
> > > > >  - add vec_en, to log the vector capability status.
> > > > > 
> > > > > v6 changes:
> > > > >  - add mapping table between irq vector number and queue id.
> > > > > 
> > > > > v5 changes:
> > > > >  - Create this new patch file for changed struct rte_intr_handle that
> > > > >    other patches depend on, to avoid breaking git bisect.
> > > > > 
> > > > >  lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
> > > > >  1 file changed, 10 insertions(+)
> > > > > 
> > > > > diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > > index 6a159c7..27174df 100644
> > > > > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > > @@ -38,6 +38,8 @@
> > > > >  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
> > > > >  #define _RTE_LINUXAPP_INTERRUPTS_H_
> > > > >  
> > > > > +#define RTE_MAX_RXTX_INTR_VEC_ID     32
> > > > > +
> > > > >  enum rte_intr_handle_type {
> > > > >  	RTE_INTR_HANDLE_UNKNOWN = 0,
> > > > >  	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
> > > > > @@ -48,6 +50,8 @@ enum rte_intr_handle_type {
> > > > >  	RTE_INTR_HANDLE_MAX
> > > > >  };
> > > > >  
> > > > > +struct rte_epoll_event;
> > > > > +
> > > > >  /** Handle for interrupts. */
> > > > >  struct rte_intr_handle {
> > > > >  	union {
> > > > > @@ -57,6 +61,12 @@ struct rte_intr_handle {
> > > > >  	};
> > > > >  	int fd;	 /**< interrupt event file descriptor */
> > > > >  	enum rte_intr_handle_type type;  /**< handle type */
> > > > > +	uint32_t max_intr;               /**< max interrupt requested */
> > > > > +	uint32_t nb_efd;                 /**< number of available efds */
> > > > > +	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
> > > > > +	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
> > > > > +					 /**< intr vector epoll event ptr */
> > > > > +	int *intr_vec;                   /**< intr vector number array */
> > > > >  };
> > > > >    
> > > > 
> > > > This is going to be ABI breaking if this from test_interrupts.c:
> > > > static struct rte_intr_handle intr_handles[TEST_INTERRUPT_HANDLE_MAX];
> > > > 
> > > > is a plausible way of using this structure.  Even putting the data at the end of
> > > > the structure won't help, as the array indicies are off
> > > 
> > > This needs to go in 2.0 and 2.0 has to have new ABI anyway.
> > > 
> > We've already released 2.0, I think you mean 2.1, but 2.1 can't have a new ABI
> > because we didn't announce it in 1.8.  The earliest we can update the ABI
> > (according to the ABI docs) at this point is 2.2, since we need to announce the
> > change in 2.1, then make it in 2.2
> > 
> > Neil
> > 
> 
> Then just skip 2.1 (or make it a trivial doc change only dummy release),
> and call it 2.2.
> 
> I guess we need to proactively say every .x release will have new ABI.
> Sorry, this is a project under development.
> 
Sorry, NAK.  I didn't go through all the trouble of creating an ABI
infrastructure just to throw it out the window on some rubber stamp.  We decided
on the rules, we need to stick to them.  We have large projects that rely on
DPDK now (OVS primarily), and we owe it to them not to just completely throw
out the ABI every release.  We have a process for doing it; let's follow it.

Neil

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 02/11] eal/linux: add rte_epoll_wait/ctl support
       [not found]         ` <20150521111704.727cf3a1@urahara>
@ 2015-05-22  2:08           ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-05-22  2:08 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, liang-min.wang



On 5/22/2015 2:17 AM, Stephen Hemminger wrote:
> On Thu, 21 May 2015 16:55:54 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
>> +static int
>> +eal_epoll_process_event(struct epoll_event *evs, int n,
>> +			struct rte_epoll_event *events)
>> +{
>> +	int i;
>> +	int count = 0;
>> +	struct rte_epoll_event *rev;
>> +	for (i = 0; i < n; i++) {
>> +		rev = (struct rte_epoll_event *)evs[i].data.ptr;
>> +		if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
>> +						 RTE_EPOLL_EXEC))
>> +			continue;
>> +
>> +		events[count].status        = RTE_EPOLL_VALID;
>> +		events[count].fd            = rev->fd;
>> +		events[count].epfd          = rev->epfd;
>> +		events[count].epdata.event  = rev->epdata.event;
>> +		events[count].epdata.data   = rev->epdata.data;
> This code has several style issues:
>   1. Always put blank line after declarations
>
>   2. Use unsigned where ever it makes sense as a matter of habit.
>        unsigned int i, count = 0;
>
>   3. Don't add casts where not necessary, it reduces compiler type checking
>      and is a bad habit. In this case evs[i].data.ptr is void *
>      and therefore no cast is needed.
Fully agree, thanks for the comment.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 08/11] ethdev: add rx intr enable, disable and ctl functions
       [not found]         ` <20150521112030.4d31a0e4@urahara>
@ 2015-05-22  2:17           ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-05-22  2:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, liang-min.wang



On 5/22/2015 2:20 AM, Stephen Hemminger wrote:
> On Thu, 21 May 2015 16:56:00 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
>> +int
>> +rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
>> +			  int epfd, int op, void *data)
>> +{
>> +	uint32_t vec;
>> +	struct rte_eth_dev *dev;
>> +	struct rte_intr_handle *intr_handle;
>> +	int rc;
>> +
>> +	if (!rte_eth_dev_is_valid_port(port_id)) {
>> +		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
> Use %u when printing unsigned value
Agree.
>
>> +		return -ENODEV;
>> +	}
>> +
>> +	dev = &rte_eth_devices[port_id];
>> +	if (dev == NULL) {
>> +		PMD_DEBUG_TRACE("Invalid port device\n");
>> +		return -ENODEV;
>> +	}
> Another unnecessary conditional check, already done in rte_eth_dev_is_valid_port
Yes, it's not necessary anymore.
>> +
>> +	if (queue_id >= dev->data->nb_rx_queues) {
>> +		PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id);
> This is wrong, won't build with debug enabled, You meant to use queue_id
> here and use %u
I'm sorry, that's a mistake, thanks.
>> +		return -EINVAL;
>> +	}
>> +
>> +	intr_handle = &dev->pci_dev->intr_handle;
>> +	if (!intr_handle->intr_vec) {
>> +		PMD_DEBUG_TRACE("RX Intr vector unset\n");
>> +		return -EPERM;
>> +	}
>> +

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle
       [not found]                 ` <40594e9e6e0543afa11e4dbd90e59b22@BRMWP-EXMB11.corp.brocade.com>
@ 2015-05-22 16:52                   ` Stephen Hemminger
  2015-05-27 10:33                     ` Neil Horman
  0 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-22 16:52 UTC (permalink / raw)
  To: Neil Horman; +Cc: dev, liang-min.wang

On Fri, 22 May 2015 00:05:36 +0000
Neil Horman <nhorman@tuxdriver.com> wrote:

> On Thu, May 21, 2015 at 11:14:00AM -0700, Stephen Hemminger wrote:
> > On Thu, 21 May 2015 13:58:46 -0400
> > Neil Horman <nhorman@tuxdriver.com> wrote:
> > 
> > > On Thu, May 21, 2015 at 10:43:00AM -0700, Stephen Hemminger wrote:
> > > > On Thu, 21 May 2015 06:32:02 -0400
> > > > Neil Horman <nhorman@tuxdriver.com> wrote:
> > > > 
> > > > > On Thu, May 21, 2015 at 04:55:53PM +0800, Cunming Liang wrote:
> > > > > > The patch adds interrupt vectors support in rte_intr_handle.
> > > > > > 'vec_en' is set when interrupt vectors are detected and associated event fds are set.
> > > > > > Those event fds are stored in efds[].
> > > > > > 'intr_vec' is reserved for device driver to initialize the vector mapping table.
> > > > > > When the event fds add to a specified epoll instance, 'elist' will hold the rte_epoll_event object pointer.
> > > > > > 
> > > > > > Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> > > > > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > > > > ---
> > > > > > v7 changes:
> > > > > >  - add eptrs[], it's used to store the register rte_epoll_event instances.
> > > > > >  - add vec_en, to log the vector capability status.
> > > > > > 
> > > > > > v6 changes:
> > > > > >  - add mapping table between irq vector number and queue id.
> > > > > > 
> > > > > > v5 changes:
> > > > > >  - Create this new patch file for changed struct rte_intr_handle that
> > > > > >    other patches depend on, to avoid breaking git bisect.
> > > > > > 
> > > > > >  lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
> > > > > >  1 file changed, 10 insertions(+)
> > > > > > 
> > > > > > diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > > > index 6a159c7..27174df 100644
> > > > > > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > > > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > > > @@ -38,6 +38,8 @@
> > > > > >  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
> > > > > >  #define _RTE_LINUXAPP_INTERRUPTS_H_
> > > > > >  
> > > > > > +#define RTE_MAX_RXTX_INTR_VEC_ID     32
> > > > > > +
> > > > > >  enum rte_intr_handle_type {
> > > > > >  	RTE_INTR_HANDLE_UNKNOWN = 0,
> > > > > >  	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
> > > > > > @@ -48,6 +50,8 @@ enum rte_intr_handle_type {
> > > > > >  	RTE_INTR_HANDLE_MAX
> > > > > >  };
> > > > > >  
> > > > > > +struct rte_epoll_event;
> > > > > > +
> > > > > >  /** Handle for interrupts. */
> > > > > >  struct rte_intr_handle {
> > > > > >  	union {
> > > > > > @@ -57,6 +61,12 @@ struct rte_intr_handle {
> > > > > >  	};
> > > > > >  	int fd;	 /**< interrupt event file descriptor */
> > > > > >  	enum rte_intr_handle_type type;  /**< handle type */
> > > > > > +	uint32_t max_intr;               /**< max interrupt requested */
> > > > > > +	uint32_t nb_efd;                 /**< number of available efds */
> > > > > > +	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
> > > > > > +	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
> > > > > > +					 /**< intr vector epoll event ptr */
> > > > > > +	int *intr_vec;                   /**< intr vector number array */
> > > > > >  };
> > > > > >    
> > > > > 
> > > > > This is going to be ABI breaking if this from test_interrupts.c:
> > > > > static struct rte_intr_handle intr_handles[TEST_INTERRUPT_HANDLE_MAX];
> > > > > 
> > > > > is a plausible way of using this structure.  Even putting the data at the end of
> > > > > the structure won't help, as the array indicies are off
> > > > 
> > > > This needs to go in 2.0 and 2.0 has to have new ABI anyway.
> > > > 
> > > We've already released 2.0, I think you mean 2.1, but 2.1 can't have a new ABI
> > > because we didn't announce it in 1.8.  The earliest we can update the ABI
> > > (according to the ABI docs) at this point is 2.2, since we need to announce the
> > > change in 2.1, then make it in 2.2
> > > 
> > > Neil
> > > 
> > 
> > Then just skip 2.1 (or make it a trivial doc change only dummy release),
> > and call it 2.2.
> > 
> > I guess we need to proactively say every .x release will have new ABI.
> > Sorry, this is a project under development.
> > 
> Sorry, NAK.  I didn't go through all the trouble of creating an ABI
> infrastructure just to throw it out the window on some rubber stamp.  We decided
> on the rules, we need to stick to them.  We have large projects that rely on
> DPDK now (OVS primarily), and we owe it to them to not just go completely throw
> out the ABI every release.  We have a process for doing it, lets follow it.
> 
> Neil
> 

I meant that we should close and ship the existing 2.1 code base early and open
2.2 early to keep things rolling. But in general this project needs x.x.y
releases with ABI stability, and should just admit that x.x releases will not
have a stable ABI. That is the reality now.

A lot of the ABI problem is that the code does not do a good job of hiding.
It also does not separate the driver ABI from the user ABI. There are things
like the structure of PCI and interrupt handles that the user, from the
library's point of view, should not care about, but that drivers will need.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 05/11] eal/linux: add interrupt vectors handling on VFIO
  2015-05-21  8:55       ` [dpdk-dev] [PATCH v8 05/11] eal/linux: add interrupt vectors handling on VFIO Cunming Liang
@ 2015-05-22 20:21         ` Stephen Hemminger
  2015-05-27  9:00           ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-22 20:21 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev, liang-min.wang

On Thu, 21 May 2015 16:55:57 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> This patch does below:
>  - Create VFIO eventfds for each interrupt vector (move to next)
>  - Assign per interrupt vector's eventfd to VFIO by ioctl
> 
> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> Signed-off-by: Cunming Liang <cunming.liang@intel.com>

One non-trivial performance related change here would be to set SMP
affinity of the receive IRQ to the CPU that is handling that receive queue.
Not sure the full API to do this, but ideally you should not have the
receive interrupt occurring on one CPU then having to cause scheduler
to wakeup receive thread on another CPU.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 05/11] eal/linux: add interrupt vectors handling on VFIO
  2015-05-22 20:21         ` Stephen Hemminger
@ 2015-05-27  9:00           ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-05-27  9:00 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, liang-min.wang



On 5/23/2015 4:21 AM, Stephen Hemminger wrote:
> On Thu, 21 May 2015 16:55:57 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
>> This patch does below:
>>   - Create VFIO eventfds for each interrupt vector (move to next)
>>   - Assign per interrupt vector's eventfd to VFIO by ioctl
>>
>> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
>> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> One non-trivial performance related change here would be to set SMP
> affinity of the receive IRQ to the CPU that is handling that receive queue.
> Not sure the full API to do this, but ideally you should not have the
> receive interrupt occurring on one CPU then having to cause scheduler
> to wakeup receive thread on another CPU.
>
That's a good point. The previous thought was to configure IRQ affinity
with a script from outside.
I haven't found an API that does this well; the well-known way is via sysfs.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle
  2015-05-22 16:52                   ` Stephen Hemminger
@ 2015-05-27 10:33                     ` Neil Horman
  0 siblings, 0 replies; 242+ messages in thread
From: Neil Horman @ 2015-05-27 10:33 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, liang-min.wang

On Fri, May 22, 2015 at 09:52:06AM -0700, Stephen Hemminger wrote:
> On Fri, 22 May 2015 00:05:36 +0000
> Neil Horman <nhorman@tuxdriver.com> wrote:
> 
> > On Thu, May 21, 2015 at 11:14:00AM -0700, Stephen Hemminger wrote:
> > > On Thu, 21 May 2015 13:58:46 -0400
> > > Neil Horman <nhorman@tuxdriver.com> wrote:
> > > 
> > > > On Thu, May 21, 2015 at 10:43:00AM -0700, Stephen Hemminger wrote:
> > > > > On Thu, 21 May 2015 06:32:02 -0400
> > > > > Neil Horman <nhorman@tuxdriver.com> wrote:
> > > > > 
> > > > > > On Thu, May 21, 2015 at 04:55:53PM +0800, Cunming Liang wrote:
> > > > > > > The patch adds interrupt vectors support in rte_intr_handle.
> > > > > > > 'vec_en' is set when interrupt vectors are detected and associated event fds are set.
> > > > > > > Those event fds are stored in efds[].
> > > > > > > 'intr_vec' is reserved for device driver to initialize the vector mapping table.
> > > > > > > When the event fds add to a specified epoll instance, 'elist' will hold the rte_epoll_event object pointer.
> > > > > > > 
> > > > > > > Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> > > > > > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > > > > > ---
> > > > > > > v7 changes:
> > > > > > >  - add eptrs[], it's used to store the register rte_epoll_event instances.
> > > > > > >  - add vec_en, to log the vector capability status.
> > > > > > > 
> > > > > > > v6 changes:
> > > > > > >  - add mapping table between irq vector number and queue id.
> > > > > > > 
> > > > > > > v5 changes:
> > > > > > >  - Create this new patch file for changed struct rte_intr_handle that
> > > > > > >    other patches depend on, to avoid breaking git bisect.
> > > > > > > 
> > > > > > >  lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
> > > > > > >  1 file changed, 10 insertions(+)
> > > > > > > 
> > > > > > > diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > > > > index 6a159c7..27174df 100644
> > > > > > > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > > > > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> > > > > > > @@ -38,6 +38,8 @@
> > > > > > >  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
> > > > > > >  #define _RTE_LINUXAPP_INTERRUPTS_H_
> > > > > > >  
> > > > > > > +#define RTE_MAX_RXTX_INTR_VEC_ID     32
> > > > > > > +
> > > > > > >  enum rte_intr_handle_type {
> > > > > > >  	RTE_INTR_HANDLE_UNKNOWN = 0,
> > > > > > >  	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
> > > > > > > @@ -48,6 +50,8 @@ enum rte_intr_handle_type {
> > > > > > >  	RTE_INTR_HANDLE_MAX
> > > > > > >  };
> > > > > > >  
> > > > > > > +struct rte_epoll_event;
> > > > > > > +
> > > > > > >  /** Handle for interrupts. */
> > > > > > >  struct rte_intr_handle {
> > > > > > >  	union {
> > > > > > > @@ -57,6 +61,12 @@ struct rte_intr_handle {
> > > > > > >  	};
> > > > > > >  	int fd;	 /**< interrupt event file descriptor */
> > > > > > >  	enum rte_intr_handle_type type;  /**< handle type */
> > > > > > > +	uint32_t max_intr;               /**< max interrupt requested */
> > > > > > > +	uint32_t nb_efd;                 /**< number of available efds */
> > > > > > > +	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
> > > > > > > +	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
> > > > > > > +					 /**< intr vector epoll event ptr */
> > > > > > > +	int *intr_vec;                   /**< intr vector number array */
> > > > > > >  };
> > > > > > >    
> > > > > > 
> > > > > > This is going to be ABI breaking if this from test_interrupts.c:
> > > > > > static struct rte_intr_handle intr_handles[TEST_INTERRUPT_HANDLE_MAX];
> > > > > > 
> > > > > > is a plausible way of using this structure.  Even putting the data at the end of
> > > > > > the structure won't help, as the array indicies are off
> > > > > 
> > > > > This needs to go in 2.0 and 2.0 has to have new ABI anyway.
> > > > > 
> > > > We've already released 2.0, I think you mean 2.1, but 2.1 can't have a new ABI
> > > > because we didn't announce it in 1.8.  The earliest we can update the ABI
> > > > (according to the ABI docs) at this point is 2.2, since we need to announce the
> > > > change in 2.1, then make it in 2.2
> > > > 
> > > > Neil
> > > > 
> > > 
> > > Then just skip 2.1 (or make it a trivial doc change only dummy release),
> > > and call it 2.2.
> > > 
> > > I guess we need to proactively say every .x release will have new ABI.
> > > Sorry, this is a project under development.
> > > 
> > Sorry, NAK.  I didn't go through all the trouble of creating an ABI
> > infrastructure just to throw it out the window on some rubber stamp.  We decided
> > on the rules, we need to stick to them.  We have large projects that rely on
> > DPDK now (OVS primarily), and we owe it to them to not just go completely throw
> > out the ABI every release.  We have a process for doing it, lets follow it.
> > 
> > Neil
> > 
> 
> I meant, that close and ship existing 2.1 code base early and open 2.2 early
> to keep things rolling. But in general this project needs x.x.y releases
> with ABI stability, and just admit that x.x releases will not have stable ABI.
> That is reality now.
> 
I'm not opposed to doing that, though the purpose of the proposed cadence was
to give sufficient notice to downstream consumers of DPDK that ABI changes were
coming.  As long as the time delta between 2.X and 2.X+1 is sufficient for
consumers to have time to react and update their applications I'm ok with it
(which I know is subjective, but I'm willing to experiment there).

> A lot of the ABI problem is that the code does not do a good job of hiding.
> And also does not sepearte driver ABI from user ABI. There are things like
> structure of PCI and interrupt handles that the user from library point
> of view should not care about, but drivers will need to.
> 
I agree.  I had hoped that implementing an ABI process would help drive
improvements in this area, but it hasn't seemed to yet.

Neil

> 
> 

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v7 09/10] igb: enable rx queue interrupts for PF
  2015-05-05  5:39     ` [dpdk-dev] [PATCH v7 09/10] igb: enable rx queue interrupts for PF Cunming Liang
  2015-05-05 23:16       ` Stephen Hemminger
@ 2015-05-28 21:25       ` Stephen Hemminger
  1 sibling, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-28 21:25 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev

On Tue,  5 May 2015 13:39:45 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> +		pci_dev->intr_handle.intr_vec =
> +			rte_zmalloc("intr_vec",
> +				    dev_info.max_rx_queues * sizeof(int), 0);
> +	

This and other drivers should be using rte_zmalloc_socket to ensure
that the intr_vec table is allocated on the same NUMA node as the hardware.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD
  2015-05-21  8:55     ` [dpdk-dev] [PATCH v8 00/11] Interrupt mode PMD Cunming Liang
                         ` (10 preceding siblings ...)
  2015-05-21  8:56       ` [dpdk-dev] [PATCH v8 11/11] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Cunming Liang
@ 2015-05-29  8:45       ` Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 01/12] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
                           ` (12 more replies)
  11 siblings, 13 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

v9 changes
 - code rework to fix open comment
 - bug fix for igb lsc when both lsc and rxq are enabled in vfio-msix
 - new patch to turn off the feature by default so as to avoid breaking the v2.1 ABI

v8 changes
 - remove condition check for only vfio-msix
 - add multiplex intr support when only one intr vector allowed
 - lsc and rxq interrupt runtime enable decision
 - add safe event delete while the event wakeup execution happens

v7 changes
 - decouple epoll event and intr operation
 - add condition check in the case intr vector is disabled
 - renaming some APIs

v6 changes
 - split rte_intr_wait_rx_pkt into two APIs 'wait' and 'set'.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set.
 - using vector number instead of queue_id as interrupt API params.
 - patch reorder and split.

v5 changes
 - Rebase the patchset onto the HEAD
 - Isolate ethdev from EAL for new-added wait-for-rx interrupt function
 - Export wait-for-rx interrupt function for shared libraries
 - Split-off a new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect
 - Change sample application to accommodate EAL function spec change
   accordingly

v4 changes
 - Export interrupt enable/disable functions for shared libraries
 - Adjust position of new-added structure fields and functions to
   avoid breaking ABI
 
v3 changes
 - Add return value for interrupt enable/disable functions
 - Move spinlock from PMD to L3fwd-power
 - Remove unnecessary variables in e1000_mac_info
 - Fix miscellaneous review comments
 
v2 changes
 - Fix compilation issue in Makefile for missed header file.
 - Consolidate internal and community review comments of v1 patch set.
 
The patch series introduce low-latency one-shot rx interrupt into DPDK with
polling and interrupt mode switch control example.
 
DPDK userspace interrupt notification and handling mechanism is based on UIO
with below limitation:
1) It is designed to handle LSC interrupt only with inefficient suspended
   pthread wakeup procedure (e.g. UIO wakes up LSC interrupt handling thread
   which then wakes up DPDK polling thread). In this way, it introduces
   non-deterministic wakeup latency for DPDK polling thread as well as packet
   latency if it is used to handle Rx interrupt.
2) UIO only supports a single interrupt vector which has to be shared by
   LSC interrupt and interrupts assigned to dedicated rx queues.
 
This patchset includes below features:
1) Enable one-shot rx queue interrupt in ixgbe PMD(PF & VF) and igb PMD(PF only).
2) Build on top of the VFIO mechanism instead of UIO, so it could support
   up to 64 interrupt vectors for rx queue interrupts.
3) Have 1 DPDK polling thread handle per Rx queue interrupt with a dedicated
   VFIO eventfd, which eliminates non-deterministic pthread wakeup latency in
   user space.
4) Demonstrate interrupt control APIs and userspace NAPI-like polling/interrupt
   switch algorithms in L3fwd-power example.

Known limitations:
1) It does not work for UIO because a single interrupt eventfd shared by LSC
   and rx queue interrupt handlers causes a mess. [FIXED]
2) LSC interrupt is not supported by VF driver, so it is by default disabled
   in L3fwd-power now. Feel free to turn it on if you want to support both LSC
   and rx queue interrupts on a PF.

Cunming Liang (12):
  eal/linux: add interrupt vectors support in intr_handle
  eal/linux: add rte_epoll_wait/ctl support
  eal/linux: add API to set rx interrupt event monitor
  eal/linux: fix comments typo on vfio msi
  eal/linux: add interrupt vectors handling on VFIO
  eal/linux: standalone intr event fd create support
  eal/bsd: dummy for new intr definition
  ethdev: add rx intr enable, disable and ctl functions
  ixgbe: enable rx queue interrupts for both PF and VF
  igb: enable rx queue interrupts for PF
  l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode
    switch
  abi: fix v2.1 abi broken issue

 drivers/net/e1000/igb_ethdev.c                     | 311 ++++++++++--
 drivers/net/ixgbe/ixgbe_ethdev.c                   | 519 ++++++++++++++++++++-
 drivers/net/ixgbe/ixgbe_ethdev.h                   |   4 +
 examples/l3fwd-power/main.c                        | 207 ++++++--
 lib/librte_eal/bsdapp/eal/eal_interrupts.c         |  19 +
 .../bsdapp/eal/include/exec-env/rte_interrupts.h   |  81 ++++
 lib/librte_eal/bsdapp/eal/rte_eal_version.map      |   5 +
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 358 ++++++++++++--
 .../linuxapp/eal/include/exec-env/rte_interrupts.h | 219 +++++++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   8 +
 lib/librte_ether/rte_ethdev.c                      | 109 +++++
 lib/librte_ether/rte_ethdev.h                      | 132 ++++++
 lib/librte_ether/rte_ether_version.map             |   4 +
 13 files changed, 1851 insertions(+), 125 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 01/12] eal/linux: add interrupt vectors support in intr_handle
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-06-02  5:27           ` Liu, Yong
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 02/12] eal/linux: add rte_epoll_wait/ctl support Cunming Liang
                           ` (11 subsequent siblings)
  12 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch adds interrupt vectors support in rte_intr_handle.
'vec_en' is set when interrupt vectors are detected and associated event fds are set.
Those event fds are stored in efds[].
'intr_vec' is reserved for device driver to initialize the vector mapping table.
When the event fds are added to a specified epoll instance, 'elist' will hold the rte_epoll_event object pointers.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes:
 - add eptrs[], it's used to store the register rte_epoll_event instances.
 - add vec_en, to log the vector capability status.

v6 changes:
 - add mapping table between irq vector number and queue id.

v5 changes:
 - Create this new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect.

 lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index bdeb3fc..9c86a15 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,8 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_
 
+#define RTE_MAX_RXTX_INTR_VEC_ID     32
+
 enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_UNKNOWN = 0,
 	RTE_INTR_HANDLE_UIO,          /**< uio device handle */
@@ -49,6 +51,8 @@ enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_MAX
 };
 
+struct rte_epoll_event;
+
 /** Handle for interrupts. */
 struct rte_intr_handle {
 	union {
@@ -58,6 +62,12 @@ struct rte_intr_handle {
 	};
 	int fd;	 /**< interrupt event file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+	uint32_t max_intr;               /**< max interrupt requested */
+	uint32_t nb_efd;                 /**< number of available efds */
+	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
+	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
+					 /**< intr vector epoll event ptr */
+	int *intr_vec;                   /**< intr vector number array */
 };
 
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 02/12] eal/linux: add rte_epoll_wait/ctl support
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 01/12] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 03/12] eal/linux: add API to set rx interrupt event monitor Cunming Liang
                           ` (10 subsequent siblings)
  12 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch adds 'rte_epoll_wait' and 'rte_epoll_ctl' for async event wakeup.
It defines 'struct rte_epoll_event' as the event param.
The 'op' uses the same enum as epoll_wait/ctl does.
The epoll event supports carrying raw user data and registering a callback which is executed during wakeup.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v9 changes
 - rework on coding style

v8 changes
 - support delete event in safety during the wakeup execution
 - add EINTR process during epoll_wait

v7 changes
 - split v6[4/8] into two patches, one for epoll event(this one)
   another for rx intr(next patch)
 - introduce rte_epoll_event definition
 - rte_epoll_wait/ctl for more generic RTE epoll API

v6 changes
 - split rte_intr_wait_rx_pkt into two function, wait and set.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set to remove queue visibility on eal.
 - rte_intr_rx_wait to support multiplexing.
 - allow epfd as input to support flexible event fd combination.

 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 137 +++++++++++++++++++++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |  82 +++++++++++-
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   3 +
 3 files changed, 219 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 3a84b3c..2f56000 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -69,6 +69,8 @@
 
 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
 
+static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
+
 /**
  * union for pipe fds.
  */
@@ -895,3 +897,138 @@ rte_eal_intr_init(void)
 	return -ret;
 }
 
+static int
+eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
+			struct rte_epoll_event *events)
+{
+	unsigned int i, count = 0;
+	struct rte_epoll_event *rev;
+
+	for (i = 0; i < n; i++) {
+		rev = evs[i].data.ptr;
+		if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
+						 RTE_EPOLL_EXEC))
+			continue;
+
+		events[count].status        = RTE_EPOLL_VALID;
+		events[count].fd            = rev->fd;
+		events[count].epfd          = rev->epfd;
+		events[count].epdata.event  = rev->epdata.event;
+		events[count].epdata.data   = rev->epdata.data;
+		if (rev->epdata.cb_fun)
+			rev->epdata.cb_fun(rev->fd,
+					   rev->epdata.cb_arg);
+
+		rte_compiler_barrier();
+		rev->status = RTE_EPOLL_VALID;
+		count++;
+	}
+	return count;
+}
+
+static inline int
+eal_init_tls_epfd(void)
+{
+	int pfd = epoll_create(255);
+	if (pfd < 0) {
+		RTE_LOG(ERR, EAL,
+			"Cannot create epoll instance\n");
+		return -1;
+	}
+	return pfd;
+}
+
+int
+rte_intr_tls_epfd(void)
+{
+	if (RTE_PER_LCORE(_epfd) == -1)
+		RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
+
+	return RTE_PER_LCORE(_epfd);
+}
+
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout)
+{
+	struct epoll_event evs[maxevents];
+	int rc;
+
+	if (!events) {
+		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+		return -1;
+	}
+
+	/* using per thread epoll fd */
+	if (epfd == RTE_EPOLL_PER_THREAD)
+		epfd = rte_intr_tls_epfd();
+
+	while (1) {
+		rc = epoll_wait(epfd, evs, maxevents, timeout);
+		if (likely(rc > 0)) {
+			/* epoll_wait has at least one fd ready to read */
+			rc = eal_epoll_process_event(evs, rc, events);
+			break;
+		} else if (rc < 0) {
+			if (errno == EINTR)
+				continue;
+			/* epoll_wait fail */
+			RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
+				strerror(errno));
+			rc = -1;
+			break;
+		}
+	}
+
+	return rc;
+}
+
+static inline void
+eal_epoll_data_safe_free(struct rte_epoll_event *ev)
+{
+	while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
+				    RTE_EPOLL_INVALID))
+		while (ev->status != RTE_EPOLL_VALID)
+			rte_pause();
+	memset(&ev->epdata, 0, sizeof(ev->epdata));
+	ev->fd = -1;
+	ev->epfd = -1;
+}
+
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+	      struct rte_epoll_event *event)
+{
+	struct epoll_event ev;
+
+	if (!event) {
+		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+		return -1;
+	}
+
+	/* using per thread epoll fd */
+	if (epfd == RTE_EPOLL_PER_THREAD)
+		epfd = rte_intr_tls_epfd();
+
+	if (op == EPOLL_CTL_ADD) {
+		event->status = RTE_EPOLL_VALID;
+		event->fd = fd;  /* ignore fd in event */
+		event->epfd = epfd;
+		ev.data.ptr = (void *)event;
+	}
+
+	ev.events = event->epdata.event;
+	if (epoll_ctl(epfd, op, fd, &ev) < 0) {
+		RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
+			op, fd, strerror(errno));
+		if (op == EPOLL_CTL_ADD)
+			/* rollback status when CTL_ADD fail */
+			event->status = RTE_EPOLL_INVALID;
+		return -1;
+	}
+
+	if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
+		eal_epoll_data_safe_free(event);
+
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 9c86a15..7c21060 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -51,7 +51,31 @@ enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_MAX
 };
 
-struct rte_epoll_event;
+#define RTE_INTR_EVENT_ADD            1UL
+#define	RTE_INTR_EVENT_DEL            2UL
+
+typedef void (*rte_intr_event_cb_t)(int fd, void *arg);
+
+struct rte_epoll_data {
+	uint32_t event;               /**< event type */
+	void *data;                   /**< User data */
+	rte_intr_event_cb_t cb_fun;   /**< IN: callback fun */
+	void *cb_arg;	              /**< IN: callback arg */
+};
+
+enum {
+	RTE_EPOLL_INVALID = 0,
+	RTE_EPOLL_VALID,
+	RTE_EPOLL_EXEC,
+};
+
+/** interrupt epoll event obj, taken by epoll_event.ptr */
+struct rte_epoll_event {
+	volatile uint32_t status;  /**< OUT: event status */
+	int fd;                    /**< OUT: event fd */
+	int epfd;       /**< OUT: epoll instance the ev associated with */
+	struct rte_epoll_data epdata;
+};
 
 /** Handle for interrupts. */
 struct rte_intr_handle {
@@ -65,9 +89,61 @@ struct rte_intr_handle {
 	uint32_t max_intr;               /**< max interrupt requested */
 	uint32_t nb_efd;                 /**< number of available efds */
 	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
-	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
-					 /**< intr vector epoll event ptr */
+	struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
+					 /**< intr vector epoll event */
 	int *intr_vec;                   /**< intr vector number array */
 };
 
+#define RTE_EPOLL_PER_THREAD        -1  /**< to hint using per thread epfd */
+
+/**
+ * It waits for events on the epoll instance.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller wait for events.
+ * @param events
+ *   Memory area contains the events that will be available for the caller.
+ * @param maxevents
+ *   Up to maxevents are returned, must greater than zero.
+ * @param timeout
+ *   Specifying a timeout of -1 causes a block indefinitely.
+ *   Specifying a timeout equal to zero cause to return immediately.
+ * @return
+ *   - On success, returns the number of available event.
+ *   - On failure, a negative value.
+ */
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout);
+
+/**
+ * It performs control operations on epoll instance referred by the epfd.
+ * It requests that the operation op be performed for the target fd.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller perform control operations.
+ * @param op
+ *   The operation be performed for the target fd.
+ * @param fd
+ *   The target fd on which the control ops perform.
+ * @param event
+ *   Describes the object linked to the fd.
+ *   Note: The caller must take care the object deletion after CTL_DEL.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+	      struct rte_epoll_event *event);
+
+/**
+ * The function returns the per thread epoll instance.
+ *
+ * @return
+ *   epfd the epoll instance refered to.
+ */
+int
+rte_intr_tls_epfd(void);
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 7e850a9..840002e 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -52,6 +52,8 @@ DPDK_2.0 {
 	rte_eal_vdev_init;
 	rte_eal_vdev_uninit;
 	rte_eal_wait_lcore;
+	rte_epoll_ctl;
+	rte_epoll_wait;
 	rte_exit;
 	rte_get_hpet_cycles;
 	rte_get_hpet_hz;
@@ -61,6 +63,7 @@ DPDK_2.0 {
 	rte_intr_callback_unregister;
 	rte_intr_disable;
 	rte_intr_enable;
+	rte_intr_tls_epfd;
 	rte_log;
 	rte_log_add_in_history;
 	rte_log_cur_msg_loglevel;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 03/12] eal/linux: add API to set rx interrupt event monitor
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 01/12] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 02/12] eal/linux: add rte_epoll_wait/ctl support Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 04/12] eal/linux: fix comments typo on vfio msi Cunming Liang
                           ` (9 subsequent siblings)
  12 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch adds 'rte_intr_rx_ctl' to add or delete interrupt vector event monitoring on a specified epoll instance.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v8 changes
 - fix EWOULDBLOCK and EINTR processing
 - add event status check

v7 changes
 - rename rte_intr_rx_set to rte_intr_rx_ctl.
 - rte_intr_rx_ctl uses rte_epoll_ctl to register epoll event instance.
 - the intr rx event instance includes a intr process callback.

v6 changes
 - split rte_intr_wait_rx_pkt into two function, wait and set.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set to remove queue visibility on eal.
 - rte_intr_rx_wait to support multiplexing.
 - allow epfd as input to support flexible event fd combination.

 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 100 +++++++++++++++++++++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |  20 +++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   1 +
 3 files changed, 121 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 2f56000..e2392cb 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -897,6 +897,49 @@ rte_eal_intr_init(void)
 	return -ret;
 }
 
+static void
+eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
+{
+	union rte_intr_read_buffer buf;
+	int bytes_read = 1;
+
+	switch (intr_handle->type) {
+	case RTE_INTR_HANDLE_UIO:
+		bytes_read = sizeof(buf.uio_intr_count);
+		break;
+#ifdef VFIO_PRESENT
+	case RTE_INTR_HANDLE_VFIO_MSIX:
+	case RTE_INTR_HANDLE_VFIO_MSI:
+	case RTE_INTR_HANDLE_VFIO_LEGACY:
+		bytes_read = sizeof(buf.vfio_intr_count);
+		break;
+#endif
+	default:
+		bytes_read = 1;
+		RTE_LOG(INFO, EAL, "unexpected intr type\n");
+		break;
+	}
+
+	/**
+	 * read out to clear the ready-to-be-read flag
+	 * for epoll_wait.
+	 */
+	do {
+		bytes_read = read(fd, &buf, bytes_read);
+		if (bytes_read < 0) {
+			if (errno == EINTR || errno == EWOULDBLOCK ||
+			    errno == EAGAIN)
+				continue;
+			RTE_LOG(ERR, EAL, "Error reading from file "
+				"descriptor %d: %s\n", fd,
+				strerror(errno));
+		} else if (bytes_read == 0)
+			RTE_LOG(ERR, EAL, "Read nothing from file "
+				"descriptor %d\n", fd);
+		return;
+	} while (1);
+}
+
 static int
 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
 			struct rte_epoll_event *events)
@@ -1032,3 +1075,60 @@ rte_epoll_ctl(int epfd, int op, int fd,
 
 	return 0;
 }
+
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
+		int op, unsigned int vec, void *data)
+{
+	struct rte_epoll_event *rev;
+	struct rte_epoll_data *epdata;
+	int epfd_op;
+	int rc = 0;
+
+	if (!intr_handle || intr_handle->nb_efd == 0 ||
+	    vec >= intr_handle->nb_efd) {
+		RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
+		return -EPERM;
+	}
+
+	switch (op) {
+	case RTE_INTR_EVENT_ADD:
+		epfd_op = EPOLL_CTL_ADD;
+		rev = &intr_handle->elist[vec];
+		if (rev->status != RTE_EPOLL_INVALID) {
+			RTE_LOG(INFO, EAL, "Event already been added.\n");
+			return -EEXIST;
+		}
+
+		/* attach to intr vector fd */
+		epdata = &rev->epdata;
+		epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
+		epdata->data   = data;
+		epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
+		epdata->cb_arg = (void *)intr_handle;
+		rc = rte_epoll_ctl(epfd, epfd_op, intr_handle->efds[vec], rev);
+		if (!rc)
+			RTE_LOG(DEBUG, EAL, "eventfd %d associated with vec %d"
+				" is added on epfd %d\n", rev->fd, vec, epfd);
+		else
+			rc = -EPERM;
+		break;
+	case RTE_INTR_EVENT_DEL:
+		epfd_op = EPOLL_CTL_DEL;
+		rev = &intr_handle->elist[vec];
+		if (rev->status == RTE_EPOLL_INVALID) {
+			RTE_LOG(INFO, EAL, "Event does not exist.\n");
+			return -EPERM;
+		}
+
+		rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
+		if (rc)
+			rc = -EPERM;
+		break;
+	default:
+		RTE_LOG(ERR, EAL, "event op type mismatch\n");
+		rc = -EPERM;
+	}
+
+	return rc;
+}
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 7c21060..8b7793f 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -146,4 +146,24 @@ rte_epoll_ctl(int epfd, int op, int fd,
 int
 rte_intr_tls_epfd(void);
 
+/**
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {ADD, DEL}.
+ * @param vec
+ *   RX intr vector number added to the epoll instance wait list.
+ * @param data
+ *   User raw data.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+		int epfd, int op, unsigned int vec, void *data);
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 840002e..65b5ed2 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -63,6 +63,7 @@ DPDK_2.0 {
 	rte_intr_callback_unregister;
 	rte_intr_disable;
 	rte_intr_enable;
+	rte_intr_rx_ctl;
 	rte_intr_tls_epfd;
 	rte_log;
 	rte_log_add_in_history;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 04/12] eal/linux: fix comments typo on vfio msi
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
                           ` (2 preceding siblings ...)
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 03/12] eal/linux: add API to set rx interrupt event monitor Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 05/12] eal/linux: add interrupt vectors handling on VFIO Cunming Liang
                           ` (8 subsequent siblings)
  12 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang


Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal_interrupts.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index e2392cb..8891bd3 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -219,7 +219,7 @@ vfio_disable_intx(struct rte_intr_handle *intr_handle) {
 	return 0;
 }
 
-/* enable MSI-X interrupts */
+/* enable MSI interrupts */
 static int
 vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 	int len, ret;
@@ -265,7 +265,7 @@ vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 	return 0;
 }
 
-/* disable MSI-X interrupts */
+/* disable MSI interrupts */
 static int
 vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 	struct vfio_irq_set *irq_set;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 05/12] eal/linux: add interrupt vectors handling on VFIO
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
                           ` (3 preceding siblings ...)
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 04/12] eal/linux: fix comments typo on vfio msi Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 06/12] eal/linux: standalone intr event fd create support Cunming Liang
                           ` (7 subsequent siblings)
  12 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

This patch does below:
 - Create VFIO eventfds for each interrupt vector (move to next)
 - Assign per interrupt vector's eventfd to VFIO by ioctl

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v8 changes
 - move eventfd creation out of the setup_interrupts to a standalone function

v7 changes
 - cleanup unnecessary code change
 - split event and intr operation to other patches

 lib/librte_eal/linuxapp/eal/eal_interrupts.c | 50 ++++++++--------------------
 1 file changed, 13 insertions(+), 37 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 8891bd3..c39e206 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -128,6 +128,9 @@ static pthread_t intr_thread;
 #ifdef VFIO_PRESENT
 
 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
+/* irq set buffer length for queue interrupts and LSC interrupt */
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+			      sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
 
 /* enable legacy (INTx) interrupts */
 static int
@@ -245,23 +248,6 @@ vfio_enable_msi(struct rte_intr_handle *intr_handle) {
 						intr_handle->fd);
 		return -1;
 	}
-
-	/* manually trigger interrupt to enable it */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error triggering MSI interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
@@ -294,7 +280,7 @@ vfio_disable_msi(struct rte_intr_handle *intr_handle) {
 static int
 vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 	int len, ret;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	struct vfio_irq_set *irq_set;
 	int *fd_ptr;
 
@@ -302,12 +288,18 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 
 	irq_set = (struct vfio_irq_set *) irq_set_buf;
 	irq_set->argsz = len;
-	irq_set->count = 1;
+	if (!intr_handle->max_intr)
+		intr_handle->max_intr = 1;
+	else if (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID)
+		intr_handle->max_intr = RTE_MAX_RXTX_INTR_VEC_ID + 1;
+
+	irq_set->count = intr_handle->max_intr;
 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 	irq_set->start = 0;
 	fd_ptr = (int *) &irq_set->data;
-	*fd_ptr = intr_handle->fd;
+	memcpy(fd_ptr, intr_handle->efds, sizeof(intr_handle->efds));
+	fd_ptr[intr_handle->max_intr - 1] = intr_handle->fd;
 
 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 
@@ -317,22 +309,6 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 		return -1;
 	}
 
-	/* manually trigger interrupt to enable it */
-	memset(irq_set, 0, len);
-	len = sizeof(struct vfio_irq_set);
-	irq_set->argsz = len;
-	irq_set->count = 1;
-	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-	irq_set->start = 0;
-
-	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "Error triggering MSI-X interrupts for fd %d\n",
-						intr_handle->fd);
-		return -1;
-	}
 	return 0;
 }
 
@@ -340,7 +316,7 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 static int
 vfio_disable_msix(struct rte_intr_handle *intr_handle) {
 	struct vfio_irq_set *irq_set;
-	char irq_set_buf[IRQ_SET_BUF_LEN];
+	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 	int len, ret;
 
 	len = sizeof(struct vfio_irq_set);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 06/12] eal/linux: standalone intr event fd create support
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
                           ` (4 preceding siblings ...)
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 05/12] eal/linux: add interrupt vectors handling on VFIO Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 07/12] eal/bsd: dummy for new intr definition Cunming Liang
                           ` (6 subsequent siblings)
  12 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch exposes intr event fd create and release for PMD.
The device driver can assign the number of events associated with interrupt vectors.
It also provides misc functions to check 1) whether other slowpath intr (e.g. lsc) is allowed;
2) whether intr event on fastpath is enabled or not.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 57 ++++++++++++++++++++++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h | 51 +++++++++++++++++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |  4 ++
 3 files changed, 112 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index c39e206..1b80359 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -44,6 +44,7 @@
 #include <sys/epoll.h>
 #include <sys/signalfd.h>
 #include <sys/ioctl.h>
+#include <sys/eventfd.h>
 
 #include <rte_common.h>
 #include <rte_interrupts.h>
@@ -68,6 +69,7 @@
 #include "eal_vfio.h"
 
 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
+#define NB_OTHER_INTR               1
 
 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
 
@@ -1108,3 +1110,58 @@ rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
 
 	return rc;
 }
+
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
+{
+	uint32_t i;
+	int fd;
+	uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+
+	if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
+		for (i = 0; i < n; i++) {
+			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+			if (fd < 0) {
+				RTE_LOG(ERR, EAL,
+					"cannot setup eventfd,"
+					"error %i (%s)\n",
+					errno, strerror(errno));
+				return -1;
+			}
+			intr_handle->efds[i] = fd;
+		}
+		intr_handle->nb_efd   = n;
+		intr_handle->max_intr = NB_OTHER_INTR + n;
+	} else {
+		intr_handle->efds[0]  = intr_handle->fd;
+		intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
+		intr_handle->max_intr = NB_OTHER_INTR;
+	}
+
+	return 0;
+}
+
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
+{
+	uint32_t i;
+	struct rte_epoll_event *rev;
+
+	for (i = 0; i < intr_handle->nb_efd; i++) {
+		rev = &intr_handle->elist[i];
+		if (rev->status == RTE_EPOLL_INVALID)
+			continue;
+		if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
+			/* force free if the entry valid */
+			eal_epoll_data_safe_free(rev);
+			rev->status = RTE_EPOLL_INVALID;
+		}
+	}
+
+	if (intr_handle->max_intr > intr_handle->nb_efd) {
+		for (i = 0; i < intr_handle->nb_efd; i++)
+			close(intr_handle->efds[i]);
+	}
+	intr_handle->nb_efd = 0;
+	intr_handle->max_intr = 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 8b7793f..7c8a62b 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -166,4 +166,55 @@ int
 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
 		int epfd, int op, unsigned int vec, void *data);
 
+/**
+ * It enables the fastpath event fds if it's necessary.
+ * It creates event fds when multi-vectors allowed,
+ * otherwise it multiplexes the single event fds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param nb_efd
+ *   Number of interrupt vectors trying to enable.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd);
+
+/**
+ * It disable the fastpath event fds.
+ * It deletes registered eventfds and closes the open fds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle);
+
+/**
+ * The fastpath interrupt is enabled or not.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+static inline int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
+{
+	return !(!intr_handle->nb_efd);
+}
+
+/**
+ * The interrupt handle instance allows other cause or not.
+ * Other cause stands for none fastpath interrupt.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+static inline int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle)
+{
+	return !!(intr_handle->max_intr - intr_handle->nb_efd);
+}
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 65b5ed2..d0df6b4 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -61,7 +61,11 @@ DPDK_2.0 {
 	rte_hexdump;
 	rte_intr_callback_register;
 	rte_intr_callback_unregister;
+	rte_intr_allow_others;
 	rte_intr_disable;
+	rte_intr_dp_is_en;
+	rte_intr_efd_enable;
+	rte_intr_efd_disable;
 	rte_intr_enable;
 	rte_intr_rx_ctl;
 	rte_intr_tls_epfd;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 07/12] eal/bsd: dummy for new intr definition
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
                           ` (5 preceding siblings ...)
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 06/12] eal/linux: standalone intr event fd create support Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 08/12] ethdev: add rx intr enable, disable and ctl functions Cunming Liang
                           ` (5 subsequent siblings)
  12 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

Add stubs so that the BSD build compiles with the new intr changes.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v8 changes
 - add stub for new function

v7 changes
 - remove stub 'linux only' function from source file

 lib/librte_eal/bsdapp/eal/eal_interrupts.c         | 19 ++++++
 .../bsdapp/eal/include/exec-env/rte_interrupts.h   | 74 ++++++++++++++++++++++
 lib/librte_eal/bsdapp/eal/rte_eal_version.map      |  5 ++
 3 files changed, 98 insertions(+)

diff --git a/lib/librte_eal/bsdapp/eal/eal_interrupts.c b/lib/librte_eal/bsdapp/eal/eal_interrupts.c
index cb7d4f1..ea85be3 100644
--- a/lib/librte_eal/bsdapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/bsdapp/eal/eal_interrupts.c
@@ -69,3 +69,22 @@ rte_eal_intr_init(void)
 	return 0;
 }
 
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+		int epfd, int op, unsigned int vec, void *data)
+{
+	return -ENOTSUP;
+}
+
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
+{
+	return 0;
+}
+
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
+{
+	return;
+}
+
diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
index 87a9cf6..fc2c46b 100644
--- a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
@@ -49,6 +49,80 @@ enum rte_intr_handle_type {
 struct rte_intr_handle {
 	int fd;                          /**< file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+	int max_intr;                    /**< max interrupt requested */
+	uint32_t nb_efd;                 /**< number of available efds */
+	int *intr_vec;               /**< intr vector number array */
 };
 
+/**
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {ADD, DEL}.
+ * @param vec
+ *   RX intr vector number added to the epoll instance wait list.
+ * @param data
+ *   User raw data.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+		int epfd, int op, unsigned int vec, void *data);
+
+/**
+ * It enables the fastpath event fds if it's necessary.
+ * It creates event fds when multi-vectors allowed,
+ * otherwise it multiplexes the single event fds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param nb_efd
+ *   Number of interrupt vectors trying to enable.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd);
+
+/**
+ * It disable the fastpath event fds.
+ * It deletes registered eventfds and closes the open fds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle);
+
+/**
+ * The fastpath interrupt is enabled or not.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+static inline int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
+{
+	return 0;
+}
+
+/**
+ * The interrupt handle instance allows other cause or not.
+ * Other cause stands for none fastpath interrupt.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+static inline int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle)
+{
+	return 1;
+}
+
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
index 67b6a6c..a74671b 100644
--- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
@@ -53,8 +53,13 @@ DPDK_2.0 {
 	rte_hexdump;
 	rte_intr_callback_register;
 	rte_intr_callback_unregister;
+	rte_intr_allow_others;
 	rte_intr_disable;
+	rte_intr_dp_is_en;
+	rte_intr_efd_enable;
+	rte_intr_efd_disable;
 	rte_intr_enable;
+	rte_intr_rx_ctl;
 	rte_log;
 	rte_log_add_in_history;
 	rte_log_cur_msg_loglevel;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 08/12] ethdev: add rx intr enable, disable and ctl functions
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
                           ` (6 preceding siblings ...)
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 07/12] eal/bsd: dummy for new intr definition Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 09/12] ixgbe: enable rx queue interrupts for both PF and VF Cunming Liang
                           ` (4 subsequent siblings)
  12 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch adds two dev_ops functions to enable and disable rx queue interrupts.
In addition, it adds rte_eth_dev_rx_intr_ctl/rx_intr_ctl_q to support per port or per queue rx intr event set.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v9 changes
 - remove unnecessary check after rte_eth_dev_is_valid_port.
   the same as http://www.dpdk.org/dev/patchwork/patch/4784

v8 changes
 - add additional check for EEXIST

v7 changes
 - remove rx_intr_vec_get
 - add rx_intr_ctl and rx_intr_ctl_q

v6 changes
 - add rx_intr_vec_get to retrieve the vector num of the queue.

v5 changes
 - Rebase the patchset onto the HEAD

v4 changes
 - Export interrupt enable/disable functions for shared libraries
 - Put new functions at the end of eth_dev_ops to avoid breaking ABI

v3 changes
 - Add return value for interrupt enable/disable functions

 lib/librte_ether/rte_ethdev.c          | 107 +++++++++++++++++++++++++++++++++
 lib/librte_ether/rte_ethdev.h          | 104 ++++++++++++++++++++++++++++++++
 lib/librte_ether/rte_ether_version.map |   4 ++
 3 files changed, 215 insertions(+)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 024fe8b..846d7f8 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -3281,6 +3281,113 @@ _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 	}
 	rte_spinlock_unlock(&rte_eth_dev_cb_lock);
 }
+
+int
+rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data)
+{
+	uint32_t vec;
+	struct rte_eth_dev *dev;
+	struct rte_intr_handle *intr_handle;
+	uint16_t qid;
+	int rc;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%u\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	intr_handle = &dev->pci_dev->intr_handle;
+	if (!intr_handle->intr_vec) {
+		PMD_DEBUG_TRACE("RX Intr vector unset\n");
+		return -EPERM;
+	}
+
+	for (qid = 0; qid < dev->data->nb_rx_queues; qid++) {
+		vec = intr_handle->intr_vec[qid];
+		rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec, data);
+		if (rc && rc != -EEXIST) {
+			PMD_DEBUG_TRACE("p %u q %u rx ctl error"
+					" op %d epfd %d vec %u\n",
+					port_id, qid, op, epfd, vec);
+		}
+	}
+
+	return 0;
+}
+
+int
+rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
+			  int epfd, int op, void *data)
+{
+	uint32_t vec;
+	struct rte_eth_dev *dev;
+	struct rte_intr_handle *intr_handle;
+	int rc;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%u\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+	if (queue_id >= dev->data->nb_rx_queues) {
+		PMD_DEBUG_TRACE("Invalid RX queue_id=%u\n", queue_id);
+		return -EINVAL;
+	}
+
+	intr_handle = &dev->pci_dev->intr_handle;
+	if (!intr_handle->intr_vec) {
+		PMD_DEBUG_TRACE("RX Intr vector unset\n");
+		return -EPERM;
+	}
+
+	vec = intr_handle->intr_vec[queue_id];
+	rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec, data);
+	if (rc && rc != -EEXIST) {
+		PMD_DEBUG_TRACE("p %u q %u rx ctl error"
+				" op %d epfd %d vec %u\n",
+				port_id, queue_id, op, epfd, vec);
+		return rc;
+	}
+
+	return 0;
+}
+
+int
+rte_eth_dev_rx_intr_enable(uint8_t port_id,
+			   uint16_t queue_id)
+{
+	struct rte_eth_dev *dev;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+
+	FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_enable, -ENOTSUP);
+	return (*dev->dev_ops->rx_queue_intr_enable)(dev, queue_id);
+}
+
+int
+rte_eth_dev_rx_intr_disable(uint8_t port_id,
+			    uint16_t queue_id)
+{
+	struct rte_eth_dev *dev;
+
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[port_id];
+
+	FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_intr_disable, -ENOTSUP);
+	return (*dev->dev_ops->rx_queue_intr_disable)(dev, queue_id);
+}
+
 #ifdef RTE_NIC_BYPASS
 int rte_eth_dev_bypass_init(uint8_t port_id)
 {
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 16dbe00..c199d32 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -830,6 +830,8 @@ struct rte_eth_fdir {
 struct rte_intr_conf {
 	/** enable/disable lsc interrupt. 0 (default) - disable, 1 enable */
 	uint16_t lsc;
+	/** enable/disable rxq interrupt. 0 (default) - disable, 1 enable */
+	uint16_t rxq;
 };
 
 /**
@@ -1035,6 +1037,14 @@ typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
 				    const struct rte_eth_txconf *tx_conf);
 /**< @internal Setup a transmit queue of an Ethernet device. */
 
+typedef int (*eth_rx_enable_intr_t)(struct rte_eth_dev *dev,
+				    uint16_t rx_queue_id);
+/**< @internal Enable interrupt of a receive queue of an Ethernet device. */
+
+typedef int (*eth_rx_disable_intr_t)(struct rte_eth_dev *dev,
+				    uint16_t rx_queue_id);
+/**< @internal Disable interrupt of a receive queue of an Ethernet device. */
+
 typedef void (*eth_queue_release_t)(void *queue);
 /**< @internal Release memory resources allocated by given RX/TX queue. */
 
@@ -1386,6 +1396,10 @@ struct eth_dev_ops {
 	/** Get current RSS hash configuration. */
 	rss_hash_conf_get_t rss_hash_conf_get;
 	eth_filter_ctrl_t              filter_ctrl;          /**< common filter control*/
+
+	/** Enable/disable Rx queue interrupt. */
+	eth_rx_enable_intr_t       rx_queue_intr_enable; /**< Enable Rx queue interrupt. */
+	eth_rx_disable_intr_t      rx_queue_intr_disable; /**< Disable Rx queue interrupt.*/
 };
 
 /**
@@ -2868,6 +2882,96 @@ void _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 				enum rte_eth_event_type event);
 
 /**
+ * When there is no rx packet coming in Rx Queue for a long time, we can
+ * sleep lcore related to RX Queue for power saving, and enable rx interrupt
+ * to be triggered when rx packet arrives.
+ *
+ * The rte_eth_dev_rx_intr_enable() function enables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue from which to retrieve input packets.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ *     that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_intr_enable(uint8_t port_id,
+			       uint16_t queue_id);
+
+/**
+ * When lcore wakes up from rx interrupt indicating packet coming, disable rx
+ * interrupt and returns to polling mode.
+ *
+ * The rte_eth_dev_rx_intr_disable() function disables rx queue
+ * interrupt on specific rx queue of a port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue from which to retrieve input packets.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @return
+ *   - (0) if successful.
+ *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
+ *     that operation.
+ *   - (-ENODEV) if *port_id* invalid.
+ */
+int rte_eth_dev_rx_intr_disable(uint8_t port_id,
+				uint16_t queue_id);
+
+/**
+ * RX Interrupt control per port.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ *   Using RTE_EPOLL_PER_THREAD allows to use per thread epoll instance.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {RTE_INTR_EVENT_ADD, RTE_INTR_EVENT_DEL}.
+ * @param data
+ *   User raw data.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data);
+
+/**
+ * RX Interrupt control per queue.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the receive queue from which to retrieve input packets.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ *   Using RTE_EPOLL_PER_THREAD allows to use per thread epoll instance.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {RTE_INTR_EVENT_ADD, RTE_INTR_EVENT_DEL}.
+ * @param data
+ *   User raw data.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
+			  int epfd, int op, void *data);
+
+/**
  * Turn on the LED on the Ethernet device.
  * This function turns on the LED on the Ethernet device.
  *
diff --git a/lib/librte_ether/rte_ether_version.map b/lib/librte_ether/rte_ether_version.map
index a2d25a6..2799b99 100644
--- a/lib/librte_ether/rte_ether_version.map
+++ b/lib/librte_ether/rte_ether_version.map
@@ -48,6 +48,10 @@ DPDK_2.0 {
 	rte_eth_dev_rss_hash_update;
 	rte_eth_dev_rss_reta_query;
 	rte_eth_dev_rss_reta_update;
+	rte_eth_dev_rx_intr_ctl;
+	rte_eth_dev_rx_intr_ctl_q;
+	rte_eth_dev_rx_intr_disable;
+	rte_eth_dev_rx_intr_enable;
 	rte_eth_dev_rx_queue_start;
 	rte_eth_dev_rx_queue_stop;
 	rte_eth_dev_set_link_down;
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 09/12] ixgbe: enable rx queue interrupts for both PF and VF
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
                           ` (7 preceding siblings ...)
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 08/12] ethdev: add rx intr enable, disable and ctl functions Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29 15:57           ` Stephen Hemminger
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 10/12] igb: enable rx queue interrupts for PF Cunming Liang
                           ` (3 subsequent siblings)
  12 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch does below things for ixgbe PF and VF:
- Setup NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Yong Liu <yong.liu@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v9 changes
 - move queue-vec mapping init from dev_configure to dev_start

v8 changes
 - add vfio-msi/vfio-legacy and uio-legacy support

v7 changes
 - add condition check when intr vector is not enabled

v6 changes
 - fill queue-vector mapping table

v5 changes
 - Rebase the patchset onto the HEAD

v3 changes
 - Remove spinlock from PMD

v2 changes
 - Consolidate review comments related to coding style

 drivers/net/ixgbe/ixgbe_ethdev.c | 484 ++++++++++++++++++++++++++++++++++++++-
 drivers/net/ixgbe/ixgbe_ethdev.h |   4 +
 2 files changed, 476 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 0d9f9b2..798bb85 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -82,6 +82,9 @@
  */
 #define IXGBE_FC_LO    0x40
 
+/* Default minimum inter-interrupt interval for EITR configuration */
+#define IXGBE_MIN_INTER_INTERRUPT_INTERVAL_DEFAULT    0x79E
+
 /* Timer value included in XOFF frames. */
 #define IXGBE_FC_PAUSE 0x680
 
@@ -171,6 +174,7 @@ static int ixgbe_dev_rss_reta_query(struct rte_eth_dev *dev,
 			uint16_t reta_size);
 static void ixgbe_dev_link_status_print(struct rte_eth_dev *dev);
 static int ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_action(struct rte_eth_dev *dev);
 static void ixgbe_dev_interrupt_handler(struct rte_intr_handle *handle,
@@ -183,11 +187,14 @@ static void ixgbe_dcb_init(struct ixgbe_hw *hw,struct ixgbe_dcb_config *dcb_conf
 
 /* For Virtual Function support */
 static int eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev);
+static int ixgbevf_dev_interrupt_get_status(struct rte_eth_dev *dev);
+static int ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev);
 static int  ixgbevf_dev_configure(struct rte_eth_dev *dev);
 static int  ixgbevf_dev_start(struct rte_eth_dev *dev);
 static void ixgbevf_dev_stop(struct rte_eth_dev *dev);
 static void ixgbevf_dev_close(struct rte_eth_dev *dev);
 static void ixgbevf_intr_disable(struct ixgbe_hw *hw);
+static void ixgbevf_intr_enable(struct ixgbe_hw *hw);
 static void ixgbevf_dev_stats_get(struct rte_eth_dev *dev,
 		struct rte_eth_stats *stats);
 static void ixgbevf_dev_stats_reset(struct rte_eth_dev *dev);
@@ -197,6 +204,15 @@ static void ixgbevf_vlan_strip_queue_set(struct rte_eth_dev *dev,
 		uint16_t queue, int on);
 static void ixgbevf_vlan_offload_set(struct rte_eth_dev *dev, int mask);
 static void ixgbevf_set_vfta_all(struct rte_eth_dev *dev, bool on);
+static void ixgbevf_dev_interrupt_handler(struct rte_intr_handle *handle,
+		void *param);
+static int ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
+		uint16_t queue_id);
+static int ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
+		 uint16_t queue_id);
+static void ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+		 uint8_t queue, uint8_t msix_vector);
+static void ixgbevf_configure_msix(struct rte_eth_dev *dev);
 
 /* For Eth VMDQ APIs support */
 static int ixgbe_uc_hash_table_set(struct rte_eth_dev *dev, struct
@@ -214,6 +230,14 @@ static int ixgbe_mirror_rule_set(struct rte_eth_dev *dev,
 static int ixgbe_mirror_rule_reset(struct rte_eth_dev *dev,
 		uint8_t	rule_id);
 
+static int ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static int ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static void ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void ixgbe_configure_msix(struct rte_eth_dev *dev);
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 		uint16_t queue_idx, uint16_t tx_rate);
 static int ixgbe_set_vf_rate_limit(struct rte_eth_dev *dev, uint16_t vf,
@@ -262,7 +286,7 @@ static int ixgbevf_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu);
  */
 #define UPDATE_VF_STAT(reg, last, cur)	                        \
 {                                                               \
-	u32 latest = IXGBE_READ_REG(hw, reg);                   \
+	uint32_t latest = IXGBE_READ_REG(hw, reg);                   \
 	cur += latest - last;                                   \
 	last = latest;                                          \
 }
@@ -343,6 +367,8 @@ static const struct eth_dev_ops ixgbe_eth_dev_ops = {
 	.tx_queue_start	      = ixgbe_dev_tx_queue_start,
 	.tx_queue_stop        = ixgbe_dev_tx_queue_stop,
 	.rx_queue_setup       = ixgbe_dev_rx_queue_setup,
+	.rx_queue_intr_enable = ixgbe_dev_rx_queue_intr_enable,
+	.rx_queue_intr_disable = ixgbe_dev_rx_queue_intr_disable,
 	.rx_queue_release     = ixgbe_dev_rx_queue_release,
 	.rx_queue_count       = ixgbe_dev_rx_queue_count,
 	.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
@@ -402,8 +428,11 @@ static const struct eth_dev_ops ixgbevf_eth_dev_ops = {
 	.vlan_offload_set     = ixgbevf_vlan_offload_set,
 	.rx_queue_setup       = ixgbe_dev_rx_queue_setup,
 	.rx_queue_release     = ixgbe_dev_rx_queue_release,
+	.rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
 	.tx_queue_setup       = ixgbe_dev_tx_queue_setup,
 	.tx_queue_release     = ixgbe_dev_tx_queue_release,
+	.rx_queue_intr_enable = ixgbevf_dev_rx_queue_intr_enable,
+	.rx_queue_intr_disable = ixgbevf_dev_rx_queue_intr_disable,
 	.mac_addr_add         = ixgbevf_add_mac_addr,
 	.mac_addr_remove      = ixgbevf_remove_mac_addr,
 };
@@ -899,12 +928,6 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev)
 			eth_dev->data->port_id, pci_dev->id.vendor_id,
 			pci_dev->id.device_id);
 
-	rte_intr_callback_register(&(pci_dev->intr_handle),
-		ixgbe_dev_interrupt_handler, (void *)eth_dev);
-
-	/* enable uio intr after callback register */
-	rte_intr_enable(&(pci_dev->intr_handle));
-
 	/* enable support intr */
 	ixgbe_enable_intr(eth_dev);
 
@@ -1457,6 +1480,8 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 	struct ixgbe_vf_info *vfinfo =
 		*IXGBE_DEV_PRIVATE_TO_P_VFDATA(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t intr_vector = 0;
 	int err, link_up = 0, negotiate = 0;
 	uint32_t speed = 0;
 	int mask = 0;
@@ -1489,6 +1514,28 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	ixgbe_pf_host_configure(dev);
 
+	/* check and configure queue intr-vector mapping */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		intr_vector = dev->data->nb_rx_queues;
+
+	if (rte_intr_efd_enable(intr_handle, intr_vector))
+		return -1;
+
+	if (rte_intr_dp_is_en(intr_handle) && !intr_handle->intr_vec) {
+		intr_handle->intr_vec =
+			rte_zmalloc("intr_vec",
+				    dev->data->nb_rx_queues * sizeof(int),
+				    0);
+		if (intr_handle->intr_vec == NULL) {
+			PMD_INIT_LOG(ERR, "Failed to allocate %d rx_queues"
+				     "intr_vec\n", dev->data->nb_rx_queues);
+			return -1;
+		}
+	}
+
+	/* confiugre msix for sleep until rx interrupt */
+	ixgbe_configure_msix(dev);
+
 	/* initialize transmission unit */
 	ixgbe_dev_tx_init(dev);
 
@@ -1561,8 +1608,23 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 skip_link_setup:
 
 	/* check if lsc interrupt is enabled */
-	if (dev->data->dev_conf.intr_conf.lsc != 0)
-		ixgbe_dev_lsc_interrupt_setup(dev);
+	if (dev->data->dev_conf.intr_conf.lsc != 0) {
+		if (rte_intr_allow_others(intr_handle)) {
+			rte_intr_callback_register(intr_handle,
+						   ixgbe_dev_interrupt_handler,
+						   (void *)dev);
+			ixgbe_dev_lsc_interrupt_setup(dev);
+		} else
+			PMD_INIT_LOG(INFO, "lsc won't enable because of"
+				     " no intr multiplex\n");
+	}
+
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		ixgbe_dev_rxq_interrupt_setup(dev);
+
+	/* enable uio/vfio intr/eventfd mapping */
+	rte_intr_enable(intr_handle);
 
 	/* resume enabled intr since hw reset */
 	ixgbe_enable_intr(dev);
@@ -1619,6 +1681,7 @@ ixgbe_dev_stop(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 	struct ixgbe_5tuple_filter *p_5tuple, *p_5tuple_next;
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 	int vf;
 
 	PMD_INIT_FUNC_TRACE();
@@ -1626,6 +1689,9 @@ ixgbe_dev_stop(struct rte_eth_dev *dev)
 	/* disable interrupts */
 	ixgbe_disable_intr(hw);
 
+	/* disable intr eventfd mapping */
+	rte_intr_disable(intr_handle);
+
 	/* reset the NIC */
 	ixgbe_pf_reset_hw(hw);
 	hw->adapter_stopped = FALSE;
@@ -1661,6 +1727,12 @@ ixgbe_dev_stop(struct rte_eth_dev *dev)
 	memset(filter_info->fivetuple_mask, 0,
 		sizeof(uint32_t) * IXGBE_5TUPLE_ARRAY_SIZE);
 
+	/* Clean datapath event and queue/vec mapping */
+	rte_intr_efd_disable(intr_handle);
+	if (intr_handle->intr_vec != NULL) {
+		rte_free(intr_handle->intr_vec);
+		intr_handle->intr_vec = NULL;
+	}
 }
 
 /*
@@ -2252,6 +2324,28 @@ ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev)
 	return 0;
 }
 
+/**
+ * It clears the interrupt causes and enables the interrupt.
+ * It will be called once only during nic initialized.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+static int
+ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	intr->mask |= IXGBE_EICR_RTX_QUEUE;
+
+	return 0;
+}
+
 /*
  * It reads ICR and sets flag (IXGBE_EICR_LSC) for the link_update.
  *
@@ -2278,10 +2372,10 @@ ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev)
 	PMD_DRV_LOG(INFO, "eicr %x", eicr);
 
 	intr->flags = 0;
-	if (eicr & IXGBE_EICR_LSC) {
-		/* set flag for async link update */
+
+	/* set flag for async link update */
+	if (eicr & IXGBE_EICR_LSC)
 		intr->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
-	}
 
 	if (eicr & IXGBE_EICR_MAILBOX)
 		intr->flags |= IXGBE_FLAG_MAILBOX;
@@ -2289,6 +2383,30 @@ ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int
+ixgbevf_dev_interrupt_get_status(struct rte_eth_dev *dev)
+{
+	uint32_t eicr;
+	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	/* clear all cause mask */
+	ixgbevf_intr_disable(hw);
+
+	/* read-on-clear nic registers here */
+	eicr = IXGBE_READ_REG(hw, IXGBE_VTEICR);
+	PMD_DRV_LOG(INFO, "eicr %x", eicr);
+
+	intr->flags = 0;
+
+	/* set flag for async link update */
+	if (eicr & IXGBE_EICR_LSC)
+		intr->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
+
+	return 0;
+}
+
 /**
  * It gets and then prints the link status.
  *
@@ -2384,6 +2502,18 @@ ixgbe_dev_interrupt_action(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int
+ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	PMD_DRV_LOG(DEBUG, "enable intr immediately");
+	ixgbevf_intr_enable(hw);
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
+	return 0;
+}
+
 /**
  * Interrupt handler which shall be registered for alarm callback for delayed
  * handling specific interrupt to wait for the stable nic state. As the
@@ -2445,6 +2575,15 @@ ixgbe_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
 	ixgbe_dev_interrupt_action(dev);
 }
 
+static void
+ixgbevf_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
+							void *param)
+{
+	struct rte_eth_dev *dev = (struct rte_eth_dev *)param;
+	ixgbevf_dev_interrupt_get_status(dev);
+	ixgbevf_dev_interrupt_action(dev);
+}
+
 static int
 ixgbe_dev_led_on(struct rte_eth_dev *dev)
 {
@@ -2943,6 +3082,19 @@ ixgbevf_intr_disable(struct ixgbe_hw *hw)
 	IXGBE_WRITE_FLUSH(hw);
 }
 
+static void
+ixgbevf_intr_enable(struct ixgbe_hw *hw)
+{
+	PMD_INIT_FUNC_TRACE();
+
+	/* VF enable interrupt autoclean */
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIAM, IXGBE_VF_IRQ_ENABLE_MASK);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIAC, IXGBE_VF_IRQ_ENABLE_MASK);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, IXGBE_VF_IRQ_ENABLE_MASK);
+
+	IXGBE_WRITE_FLUSH(hw);
+}
+
 static int
 ixgbevf_dev_configure(struct rte_eth_dev *dev)
 {
@@ -2975,6 +3127,9 @@ ixgbevf_dev_start(struct rte_eth_dev *dev)
 {
 	struct ixgbe_hw *hw =
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t intr_vector = 0;
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+
 	int err, mask = 0;
 
 	PMD_INIT_FUNC_TRACE();
@@ -3005,6 +3160,41 @@ ixgbevf_dev_start(struct rte_eth_dev *dev)
 
 	ixgbevf_dev_rxtx_start(dev);
 
+	/* check and configure queue intr-vector mapping */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		intr_vector = dev->data->nb_rx_queues;
+
+	if (rte_intr_efd_enable(intr_handle, intr_vector))
+		return -1;
+
+	if (rte_intr_dp_is_en(intr_handle) && !intr_handle->intr_vec) {
+		intr_handle->intr_vec =
+			rte_zmalloc("intr_vec",
+				    dev->data->nb_rx_queues * sizeof(int), 0);
+		if (intr_handle->intr_vec == NULL) {
+			PMD_INIT_LOG(ERR, "Failed to allocate %d rx_queues"
+				     " intr_vec\n", dev->data->nb_rx_queues);
+			return -ENOMEM;
+		}
+	}
+
+	ixgbevf_configure_msix(dev);
+
+	if (dev->data->dev_conf.intr_conf.lsc != 0) {
+		if (rte_intr_allow_others(intr_handle))
+			rte_intr_callback_register(intr_handle,
+					ixgbevf_dev_interrupt_handler,
+					(void *)dev);
+		else
+			PMD_INIT_LOG(INFO, "lsc won't enable because of"
+				     " no intr multiplex\n");
+	}
+
+	rte_intr_enable(intr_handle);
+
+	/* Re-enable interrupt for VF */
+	ixgbevf_intr_enable(hw);
+
 	return 0;
 }
 
@@ -3012,6 +3202,7 @@ static void
 ixgbevf_dev_stop(struct rte_eth_dev *dev)
 {
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -3028,12 +3219,23 @@ ixgbevf_dev_stop(struct rte_eth_dev *dev)
 	dev->data->scattered_rx = 0;
 
 	ixgbe_dev_clear_queues(dev);
+
+	/* disable intr eventfd mapping */
+	rte_intr_disable(intr_handle);
+
+	/* Clean datapath event and queue/vec mapping */
+	rte_intr_efd_disable(intr_handle);
+	if (intr_handle->intr_vec != NULL) {
+		rte_free(intr_handle->intr_vec);
+		intr_handle->intr_vec = NULL;
+	}
 }
 
 static void
 ixgbevf_dev_close(struct rte_eth_dev *dev)
 {
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_pci_device *pci_dev;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -3043,6 +3245,12 @@ ixgbevf_dev_close(struct rte_eth_dev *dev)
 
 	/* reprogram the RAR[0] in case user changed it. */
 	ixgbe_set_rar(hw, 0, hw->mac.addr, 0, IXGBE_RAH_AV);
+
+	pci_dev = dev->pci_dev;
+	if (pci_dev->intr_handle.intr_vec) {
+		rte_free(pci_dev->intr_handle.intr_vec);
+		pci_dev->intr_handle.intr_vec = NULL;
+	}
 }
 
 static void ixgbevf_set_vfta_all(struct rte_eth_dev *dev, bool on)
@@ -3542,6 +3750,258 @@ ixgbe_mirror_rule_reset(struct rte_eth_dev *dev, uint8_t rule_id)
 	return 0;
 }
 
+
+static int
+ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	mask = IXGBE_READ_REG(hw, IXGBE_VTEIMS);
+	mask |= (1 << queue_id);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask);
+
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
+
+	return 0;
+}
+
+static int
+ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+
+	mask = IXGBE_READ_REG(hw, IXGBE_VTEIMS);
+	mask &= ~(1 << queue_id);
+	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask);
+
+	return 0;
+}
+
+static int
+ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	if (queue_id < 16) {
+		ixgbe_disable_intr(hw);
+		intr->mask |= (1 << queue_id);
+		ixgbe_enable_intr(dev);
+	} else if (queue_id < 32) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(0));
+		mask &= (1 << queue_id);
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+	} else if (queue_id < 64) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(1));
+		mask &= (1 << (queue_id - 32));
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+	}
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
+
+	return 0;
+}
+
+static int
+ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	uint32_t mask;
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct ixgbe_interrupt *intr =
+		IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+
+	if (queue_id < 16) {
+		ixgbe_disable_intr(hw);
+		intr->mask &= ~(1 << queue_id);
+		ixgbe_enable_intr(dev);
+	} else if (queue_id < 32) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(0));
+		mask &= ~(1 << queue_id);
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+	} else if (queue_id < 64) {
+		mask = IXGBE_READ_REG(hw, IXGBE_EIMS_EX(1));
+		mask &= ~(1 << (queue_id - 32));
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+	}
+
+	return 0;
+}
+
+static void
+ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+			uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp, idx;
+	if (direction == -1) {
+		/* other causes */
+		msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+		tmp = IXGBE_READ_REG(hw, IXGBE_VTIVAR_MISC);
+		tmp &= ~0xFF;
+		tmp |= msix_vector;
+		IXGBE_WRITE_REG(hw, IXGBE_VTIVAR_MISC, tmp);
+	} else {
+		/* rx or tx cause */
+		msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+		idx = ((16 * (queue & 1)) + (8 * direction));
+		tmp = IXGBE_READ_REG(hw, IXGBE_VTIVAR(queue >> 1));
+		tmp &= ~(0xFF << idx);
+		tmp |= (msix_vector << idx);
+		IXGBE_WRITE_REG(hw, IXGBE_VTIVAR(queue >> 1), tmp);
+	}
+}
+
+/**
+ * set the IVAR registers, mapping interrupt causes to vectors
+ * @param hw
+ *  pointer to ixgbe_hw struct
+ * @direction
+ *  0 for Rx, 1 for Tx, -1 for other causes
+ * @queue
+ *  queue to map the corresponding interrupt to
+ * @msix_vector
+ *  the vector to map to the corresponding queue
+ */
+static void
+ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
+			   uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp, idx;
+
+	msix_vector |= IXGBE_IVAR_ALLOC_VAL;
+	if (hw->mac.type == ixgbe_mac_82598EB) {
+		if (direction == -1)
+			direction = 0;
+		idx = (((direction * 64) + queue) >> 2) & 0x1F;
+		tmp = IXGBE_READ_REG(hw, IXGBE_IVAR(idx));
+		tmp &= ~(0xFF << (8 * (queue & 0x3)));
+		tmp |= (msix_vector << (8 * (queue & 0x3)));
+		IXGBE_WRITE_REG(hw, IXGBE_IVAR(idx), tmp);
+	} else if ((hw->mac.type == ixgbe_mac_82599EB) ||
+			(hw->mac.type == ixgbe_mac_X540)) {
+		if (direction == -1) {
+			/* other causes */
+			idx = ((queue & 1) * 8);
+			tmp = IXGBE_READ_REG(hw, IXGBE_IVAR_MISC);
+			tmp &= ~(0xFF << idx);
+			tmp |= (msix_vector << idx);
+			IXGBE_WRITE_REG(hw, IXGBE_IVAR_MISC, tmp);
+		} else {
+			/* rx or tx causes */
+			idx = ((16 * (queue & 1)) + (8 * direction));
+			tmp = IXGBE_READ_REG(hw, IXGBE_IVAR(queue >> 1));
+			tmp &= ~(0xFF << idx);
+			tmp |= (msix_vector << idx);
+			IXGBE_WRITE_REG(hw, IXGBE_IVAR(queue >> 1), tmp);
+		}
+	}
+}
+
+static void
+ixgbevf_configure_msix(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t q_idx;
+	uint32_t vector_idx = 0;
+
+	/* won't configure msix register if no mapping is done
+	 * between intr vector and event fd */
+	if (!rte_intr_dp_is_en(intr_handle))
+		return;
+
+	/* Configure all RX queues of VF */
+	for (q_idx = 0; q_idx < dev->data->nb_rx_queues; q_idx++) {
+		/* Force all queue use vector 0,
+		 * as IXGBE_VF_MAXMSIVECOTR = 1 */
+		ixgbevf_set_ivar_map(hw, 0, q_idx, vector_idx);
+		intr_handle->intr_vec[q_idx] = vector_idx;
+	}
+
+	/* Configure VF Rx queue ivar */
+	ixgbevf_set_ivar_map(hw, -1, 1, vector_idx);
+}
+
+/**
+ * Sets up the hardware to properly generate MSI-X interrupts
+ * @hw
+ *  board private structure
+ */
+static void
+ixgbe_configure_msix(struct rte_eth_dev *dev)
+{
+	struct ixgbe_hw *hw =
+		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t queue_id, vec = 0;
+	uint32_t mask;
+	uint32_t gpie;
+
+	/* won't configure msix register if no mapping is done
+	 * between intr vector and event fd */
+	if (!rte_intr_dp_is_en(intr_handle))
+		return;
+
+	/* setup GPIE for MSI-x mode */
+	gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
+	gpie |= IXGBE_GPIE_MSIX_MODE | IXGBE_GPIE_PBA_SUPPORT |
+		IXGBE_GPIE_OCD | IXGBE_GPIE_EIAME;
+	/*
+	* auto clearing and auto setting corresponding bits in EIMS
+	* when MSI-X interrupt is triggered
+	*/
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE);
+	else {
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(0), 0xFFFFFFFF);
+		IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(1), 0xFFFFFFFF);
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie);
+
+	/*
+	 * Populate the IVAR table and set the ITR values to the
+	 * corresponding register.
+	 */
+	for (queue_id = 0; queue_id < dev->data->nb_rx_queues;
+	     queue_id++) {
+		/* by default, 1:1 mapping */
+		ixgbe_set_ivar_map(hw, 0, queue_id, vec);
+		intr_handle->intr_vec[queue_id] = vec;
+		if (vec < intr_handle->nb_efd - 1)
+			vec++;
+	}
+
+	switch (hw->mac.type) {
+	case ixgbe_mac_82598EB:
+		ixgbe_set_ivar_map(hw, -1, IXGBE_IVAR_OTHER_CAUSES_INDEX,
+				   intr_handle->max_intr - 1);
+		break;
+	case ixgbe_mac_82599EB:
+	case ixgbe_mac_X540:
+		ixgbe_set_ivar_map(hw, -1, 1, intr_handle->max_intr - 1);
+		break;
+	default:
+		break;
+	}
+	IXGBE_WRITE_REG(hw, IXGBE_EITR(queue_id),
+			 IXGBE_MIN_INTER_INTERRUPT_INTERVAL_DEFAULT & 0xFFF);
+
+	/* set up to autoclear timer, and the vectors */
+	mask = IXGBE_EIMS_ENABLE_MASK;
+	mask &= ~(IXGBE_EIMS_OTHER |
+		  IXGBE_EIMS_MAILBOX |
+		  IXGBE_EIMS_LSC);
+
+	IXGBE_WRITE_REG(hw, IXGBE_EIAC, mask);
+}
+
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 	uint16_t queue_idx, uint16_t tx_rate)
 {
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index 19237b8..cccef46 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -117,6 +117,9 @@
 	ETH_RSS_IPV6_TCP_EX | \
 	ETH_RSS_IPV6_UDP_EX)
 
+#define IXGBE_VF_IRQ_ENABLE_MASK        3          /* vf irq enable mask */
+#define IXGBE_VF_MAXMSIVECTOR           1
+
 /*
  * Information about the fdir mode.
  */
@@ -328,6 +331,7 @@ uint32_t ixgbe_dev_rx_queue_count(struct rte_eth_dev *dev,
 		uint16_t rx_queue_id);
 
 int ixgbe_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);
+int ixgbevf_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);
 
 int ixgbe_dev_rx_init(struct rte_eth_dev *dev);
 
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 10/12] igb: enable rx queue interrupts for PF
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
                           ` (8 preceding siblings ...)
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 09/12] ixgbe: enable rx queue interrupts for both PF and VF Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 11/12] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Cunming Liang
                           ` (2 subsequent siblings)
  12 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

This patch does the following for the igb PF:
- Set up the NIC to generate MSI-X interrupts
- Set the IVAR register to map interrupt causes to vectors
- Implement interrupt enable/disable functions

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v9 changes
 - move queue-vec mapping init from dev_configure to dev_start
 - fix link interrupt not working issue in vfio-msix

v8 changes
 - add vfio-msi/vfio-legacy and uio-legacy support

v7 changes
 - add condition check when intr vector is not enabled

v6 changes
 - fill queue-vector mapping table

v5 changes
 - Rebase the patchset onto the HEAD

v3 changes
 - Remove unnecessary variables in e1000_mac_info
 - Remove spinlock from PMD

v2 changes
 - Consolidate review comments related to coding style

 drivers/net/e1000/igb_ethdev.c | 285 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 252 insertions(+), 33 deletions(-)

diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index e4b370d..bbd7b74 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -96,6 +96,7 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
 static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
 				struct rte_eth_fc_conf *fc_conf);
 static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
 static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
@@ -194,6 +195,16 @@ static int eth_igb_filter_ctrl(struct rte_eth_dev *dev,
 		     enum rte_filter_op filter_op,
 		     void *arg);
 
+static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev,
+					uint16_t queue_id);
+static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				uint8_t queue, uint8_t msix_vector);
+static void eth_igb_configure_msix_intr(struct rte_eth_dev *dev);
+static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
+				uint8_t index, uint8_t offset);
+
 /*
  * Define VF Stats MACRO for Non "cleared on read" register
  */
@@ -253,6 +264,8 @@ static const struct eth_dev_ops eth_igb_ops = {
 	.vlan_tpid_set        = eth_igb_vlan_tpid_set,
 	.vlan_offload_set     = eth_igb_vlan_offload_set,
 	.rx_queue_setup       = eth_igb_rx_queue_setup,
+	.rx_queue_intr_enable = eth_igb_rx_queue_intr_enable,
+	.rx_queue_intr_disable = eth_igb_rx_queue_intr_disable,
 	.rx_queue_release     = eth_igb_rx_queue_release,
 	.rx_queue_count       = eth_igb_rx_queue_count,
 	.rx_descriptor_done   = eth_igb_rx_descriptor_done,
@@ -584,12 +597,6 @@ eth_igb_dev_init(struct rte_eth_dev *eth_dev)
 		     eth_dev->data->port_id, pci_dev->id.vendor_id,
 		     pci_dev->id.device_id);
 
-	rte_intr_callback_register(&(pci_dev->intr_handle),
-		eth_igb_interrupt_handler, (void *)eth_dev);
-
-	/* enable uio intr after callback register */
-	rte_intr_enable(&(pci_dev->intr_handle));
-
 	/* enable support intr */
 	igb_intr_enable(eth_dev);
 
@@ -752,7 +759,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 {
 	struct e1000_hw *hw =
 		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	int ret, i, mask;
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t intr_vector = 0;
+	int ret, mask;
 	uint32_t ctrl_ext;
 
 	PMD_INIT_FUNC_TRACE();
@@ -792,6 +801,27 @@ eth_igb_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	igb_pf_host_configure(dev);
 
+	/* check and configure queue intr-vector mapping */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		intr_vector = dev->data->nb_rx_queues;
+
+	if (rte_intr_efd_enable(intr_handle, intr_vector))
+		return -1;
+
+	if (rte_intr_dp_is_en(intr_handle)) {
+		intr_handle->intr_vec =
+			rte_zmalloc("intr_vec",
+				    dev->data->nb_rx_queues * sizeof(int), 0);
+		if (intr_handle->intr_vec == NULL) {
+			PMD_INIT_LOG(ERR, "Failed to allocate %d rx_queues"
+				     " intr_vec\n", dev->data->nb_rx_queues);
+			return -ENOMEM;
+		}
+	}
+
+	/* confiugre msix for rx interrupt */
+	eth_igb_configure_msix_intr(dev);
+
 	/* Configure for OS presence */
 	igb_init_manageability(hw);
 
@@ -819,33 +849,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 		igb_vmdq_vlan_hw_filter_enable(dev);
 	}
 
-	/*
-	 * Configure the Interrupt Moderation register (EITR) with the maximum
-	 * possible value (0xFFFF) to minimize "System Partial Write" issued by
-	 * spurious [DMA] memory updates of RX and TX ring descriptors.
-	 *
-	 * With a EITR granularity of 2 microseconds in the 82576, only 7/8
-	 * spurious memory updates per second should be expected.
-	 * ((65535 * 2) / 1000.1000 ~= 0.131 second).
-	 *
-	 * Because interrupts are not used at all, the MSI-X is not activated
-	 * and interrupt moderation is controlled by EITR[0].
-	 *
-	 * Note that having [almost] disabled memory updates of RX and TX ring
-	 * descriptors through the Interrupt Moderation mechanism, memory
-	 * updates of ring descriptors are now moderated by the configurable
-	 * value of Write-Back Threshold registers.
-	 */
 	if ((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580) ||
 		(hw->mac.type == e1000_i350) || (hw->mac.type == e1000_i210) ||
 		(hw->mac.type == e1000_i211)) {
-		uint32_t ivar;
-
-		/* Enable all RX & TX queues in the IVAR registers */
-		ivar = (uint32_t) ((E1000_IVAR_VALID << 16) | E1000_IVAR_VALID);
-		for (i = 0; i < 8; i++)
-			E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, i, ivar);
-
 		/* Configure EITR with the maximum possible value (0xFFFF) */
 		E1000_WRITE_REG(hw, E1000_EITR(0), 0xFFFF);
 	}
@@ -896,8 +902,23 @@ eth_igb_start(struct rte_eth_dev *dev)
 	e1000_setup_link(hw);
 
 	/* check if lsc interrupt feature is enabled */
-	if (dev->data->dev_conf.intr_conf.lsc != 0)
-		ret = eth_igb_lsc_interrupt_setup(dev);
+	if (dev->data->dev_conf.intr_conf.lsc != 0) {
+		if (rte_intr_allow_others(intr_handle)) {
+			rte_intr_callback_register(intr_handle,
+						   eth_igb_interrupt_handler,
+						   (void *)dev);
+			eth_igb_lsc_interrupt_setup(dev);
+		} else
+			PMD_INIT_LOG(INFO, "lsc won't enable because of"
+				     " no intr multiplex\n");
+	}
+
+	/* check if rxq interrupt is enabled */
+	if (dev->data->dev_conf.intr_conf.rxq != 0)
+		eth_igb_rxq_interrupt_setup(dev);
+
+	/* enable uio/vfio intr/eventfd mapping */
+	rte_intr_enable(intr_handle);
 
 	/* resume enabled intr since hw reset */
 	igb_intr_enable(dev);
@@ -930,8 +951,13 @@ eth_igb_stop(struct rte_eth_dev *dev)
 	struct e1000_flex_filter *p_flex;
 	struct e1000_5tuple_filter *p_5tuple, *p_5tuple_next;
 	struct e1000_2tuple_filter *p_2tuple, *p_2tuple_next;
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 
 	igb_intr_disable(hw);
+
+	/* disable intr eventfd mapping */
+	rte_intr_disable(intr_handle);
+
 	igb_pf_reset_hw(hw);
 	E1000_WRITE_REG(hw, E1000_WUC, 0);
 
@@ -980,6 +1006,13 @@ eth_igb_stop(struct rte_eth_dev *dev)
 		rte_free(p_2tuple);
 	}
 	filter_info->twotuple_mask = 0;
+
+	/* Clean datapath event and queue/vec mapping */
+	rte_intr_efd_disable(intr_handle);
+	if (intr_handle->intr_vec != NULL) {
+		rte_free(intr_handle->intr_vec);
+		intr_handle->intr_vec = NULL;
+	}
 }
 
 static void
@@ -987,6 +1020,7 @@ eth_igb_close(struct rte_eth_dev *dev)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 	struct rte_eth_link link;
+	struct rte_pci_device *pci_dev;
 
 	eth_igb_stop(dev);
 	e1000_phy_hw_reset(hw);
@@ -1004,6 +1038,12 @@ eth_igb_close(struct rte_eth_dev *dev)
 
 	igb_dev_clear_queues(dev);
 
+	pci_dev = dev->pci_dev;
+	if (pci_dev->intr_handle.intr_vec) {
+		rte_free(pci_dev->intr_handle.intr_vec);
+		pci_dev->intr_handle.intr_vec = NULL;
+	}
+
 	memset(&link, 0, sizeof(link));
 	rte_igb_dev_atomic_write_link_status(dev, &link);
 }
@@ -1828,6 +1868,34 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
 }
 
 /*
+ * It clears the interrupt causes and enables the interrupt.
+ * It will be called once only during nic initialized.
+ *
+ * @param dev
+ *  Pointer to struct rte_eth_dev.
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
+{
+	uint32_t mask, regval;
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_eth_dev_info dev_info;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	eth_igb_infos_get(dev, &dev_info);
+
+	mask = 0xFFFFFFFF >> (32 - dev_info.max_rx_queues);
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
+
+	return 0;
+}
+
+/*
  * It reads ICR and gets interrupt causes, check it and set a bit flag
  * to update link status.
  *
@@ -3652,5 +3720,156 @@ static struct rte_driver pmd_igbvf_drv = {
 	.init = rte_igbvf_pmd_init,
 };
 
+static int
+eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t mask = 1 << queue_id;
+
+	E1000_WRITE_REG(hw, E1000_EIMC, mask);
+	E1000_WRITE_FLUSH(hw);
+
+	return 0;
+}
+
+static int
+eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
+{
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	uint32_t mask = 1 << queue_id;
+	uint32_t regval;
+
+	regval = E1000_READ_REG(hw, E1000_EIMS);
+	E1000_WRITE_REG(hw, E1000_EIMS, regval | mask);
+	E1000_WRITE_FLUSH(hw);
+
+	rte_intr_enable(&(dev->pci_dev->intr_handle));
+
+	return 0;
+}
+
+static void
+eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
+			uint8_t index, uint8_t offset)
+{
+	uint32_t val = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index);
+
+	/* clear bits */
+	val &= ~((uint32_t)0xFF << offset);
+
+	/* write vector and valid bit */
+	val |= (msix_vector | E1000_IVAR_VALID) << offset;
+
+	E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, val);
+}
+
+static void
+eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
+				 uint8_t queue, uint8_t msix_vector)
+{
+	uint32_t tmp = 0;
+	if (hw->mac.type == e1000_82575) {
+		if (direction == 0)
+			tmp = E1000_EICR_RX_QUEUE0 << queue;
+		else if (direction == 1)
+			tmp = E1000_EICR_TX_QUEUE0 << queue;
+		E1000_WRITE_REG(hw, E1000_MSIXBM(msix_vector), tmp);
+	} else if (hw->mac.type == e1000_82576) {
+		if ((direction == 0) || (direction == 1))
+			eth_igb_write_ivar(hw, msix_vector, queue & 0x7,
+					((queue & 0x8) << 1) + 8 * direction);
+	} else if ((hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		if ((direction == 0) || (direction == 1))
+			eth_igb_write_ivar(hw, msix_vector,
+					queue >> 1,
+					((queue & 0x1) << 4) + 8 * direction);
+	}
+}
+
+/*
+ * Sets up the hardware to generate MSI-X interrupts properly
+ * @hw
+ *  board private structure
+ */
+static void
+eth_igb_configure_msix_intr(struct rte_eth_dev *dev)
+{
+	int queue_id;
+	uint32_t tmpval, regval, intr_mask;
+	struct e1000_hw *hw =
+		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+	uint32_t vec = 0;
+
+	/* won't configure msix register if no mapping is done
+	 * between intr vector and event fd */
+	if (!rte_intr_dp_is_en(intr_handle))
+		return;
+
+	/* set interrupt vector for other causes */
+	if (hw->mac.type == e1000_82575) {
+		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
+		/* enable MSI-X PBA support */
+		tmpval |= E1000_CTRL_EXT_PBA_CLR;
+
+		/* Auto-Mask interrupts upon ICR read */
+		tmpval |= E1000_CTRL_EXT_EIAME;
+		tmpval |= E1000_CTRL_EXT_IRCA;
+
+		E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmpval);
+
+		/* enable msix_other interrupt */
+		E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), 0, E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | E1000_EIMS_OTHER);
+		regval = E1000_READ_REG(hw, E1000_EIAM);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | E1000_EIMS_OTHER);
+	} else if ((hw->mac.type == e1000_82576) ||
+			(hw->mac.type == e1000_82580) ||
+			(hw->mac.type == e1000_i350) ||
+			(hw->mac.type == e1000_i354) ||
+			(hw->mac.type == e1000_i210) ||
+			(hw->mac.type == e1000_i211)) {
+		/* turn on MSI-X capability first */
+		E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE |
+					E1000_GPIE_PBA | E1000_GPIE_EIAME |
+					E1000_GPIE_NSICR);
+
+		intr_mask = (1 << intr_handle->max_intr) - 1;
+		regval = E1000_READ_REG(hw, E1000_EIAC);
+		E1000_WRITE_REG(hw, E1000_EIAC, regval | intr_mask);
+
+		/* enable msix_other interrupt */
+		regval = E1000_READ_REG(hw, E1000_EIMS);
+		E1000_WRITE_REG(hw, E1000_EIMS, regval | intr_mask);
+		tmpval = (dev->data->nb_rx_queues | E1000_IVAR_VALID) << 8;
+		E1000_WRITE_REG(hw, E1000_IVAR_MISC, tmpval);
+	}
+
+	/*
+	* use EIAM to auto-mask when MSI-X interrupt
+	* is asserted, this saves a register write for every interrupt
+	*/
+	intr_mask = (1 << intr_handle->nb_efd) - 1;
+	regval = E1000_READ_REG(hw, E1000_EIAM);
+	E1000_WRITE_REG(hw, E1000_EIAM, regval | intr_mask);
+
+	for (queue_id = 0; queue_id < dev->data->nb_rx_queues; queue_id++) {
+		eth_igb_assign_msix_vector(hw, 0, queue_id, vec);
+		intr_handle->intr_vec[queue_id] = vec;
+		if (vec < intr_handle->nb_efd - 1)
+			vec++;
+	}
+
+	E1000_WRITE_FLUSH(hw);
+}
+
+
 PMD_REGISTER_DRIVER(pmd_igb_drv);
 PMD_REGISTER_DRIVER(pmd_igbvf_drv);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 11/12] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
                           ` (9 preceding siblings ...)
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 10/12] igb: enable rx queue interrupts for PF Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue Cunming Liang
  2015-06-02  6:53         ` [dpdk-dev] [PATCH v10 00/13] Interrupt mode PMD Cunming Liang
  12 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

Demonstrate how to handle per rx queue interrupts in a NAPI-like
implementation in userspace. The DPDK polling thread mainly works in
polling mode and switches to interrupt mode only if no packet has
been received in recent polls.
Userspace interrupt notification generally takes a lot more cycles
than in the kernel, so a one-shot interrupt is used here to guarantee
minimum overhead, and the DPDK polling thread returns to polling mode
immediately once it receives an interrupt notification for an incoming
packet.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes
 - using new APIs
 - demo multiple port/queue pair wait on the same epoll instance

v6 changes
 - Split event fd add and wait

v5 changes
 - Change invoked function name and parameter to accommodate EAL change

v3 changes
 - Add spinlock to ensure thread safe when accessing interrupt mask
   register

v2 changes
 - Remove unused function which is for debug purpose

 examples/l3fwd-power/main.c | 207 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 165 insertions(+), 42 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index 6ac342b..538bb93 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -74,12 +74,14 @@
 #include <rte_string_fns.h>
 #include <rte_timer.h>
 #include <rte_power.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
 
 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1
 
 #define MAX_PKT_BURST 32
 
-#define MIN_ZERO_POLL_COUNT 5
+#define MIN_ZERO_POLL_COUNT 10
 
 /* around 100ms at 2 Ghz */
 #define TIMER_RESOLUTION_CYCLES           200000000ULL
@@ -153,6 +155,9 @@ static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
 /* ethernet addresses of ports */
 static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
 
+/* per-port spinlocks guarding access to the interrupt mask register */
+static rte_spinlock_t locks[RTE_MAX_ETHPORTS];
+
 /* mask of enabled ports */
 static uint32_t enabled_port_mask = 0;
 /* Ports set in promiscuous mode off by default. */
@@ -185,6 +190,9 @@ struct lcore_rx_queue {
 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
 #define MAX_RX_QUEUE_PER_PORT 128
 
+#define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16
+
+
 #define MAX_LCORE_PARAMS 1024
 struct lcore_params {
 	uint8_t port_id;
@@ -211,7 +219,7 @@ static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) /
 
 static struct rte_eth_conf port_conf = {
 	.rxmode = {
-		.mq_mode	= ETH_MQ_RX_RSS,
+		.mq_mode = ETH_MQ_RX_RSS,
 		.max_rx_pkt_len = ETHER_MAX_LEN,
 		.split_hdr_size = 0,
 		.header_split   = 0, /**< Header Split disabled */
@@ -223,11 +231,15 @@ static struct rte_eth_conf port_conf = {
 	.rx_adv_conf = {
 		.rss_conf = {
 			.rss_key = NULL,
-			.rss_hf = ETH_RSS_IP,
+			.rss_hf = ETH_RSS_UDP,
 		},
 	},
 	.txmode = {
-		.mq_mode = ETH_DCB_NONE,
+		.mq_mode = ETH_MQ_TX_NONE,
+	},
+	.intr_conf = {
+		.lsc = 1,
+		.rxq = 1, /**< rxq interrupt feature enabled */
 	},
 };
 
@@ -399,19 +411,22 @@ power_timer_cb(__attribute__((unused)) struct rte_timer *tim,
 	/* accumulate total execution time in us when callback is invoked */
 	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
 					(float)SCALING_PERIOD;
-
 	/**
 	 * check whether need to scale down frequency a step if it sleep a lot.
 	 */
-	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD)
-		rte_power_freq_down(lcore_id);
+	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 	else if ( (unsigned)(stats[lcore_id].nb_rx_processed /
-		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST)
+		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
 		/**
 		 * scale down a step if average packet per iteration less
 		 * than expectation.
 		 */
-		rte_power_freq_down(lcore_id);
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 
 	/**
 	 * initialize another timer according to current frequency to ensure
@@ -704,22 +719,20 @@ l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid,
 
 }
 
-#define SLEEP_GEAR1_THRESHOLD            100
-#define SLEEP_GEAR2_THRESHOLD            1000
+#define MINIMUM_SLEEP_TIME         1
+#define SUSPEND_THRESHOLD          300
 
 static inline uint32_t
 power_idle_heuristic(uint32_t zero_rx_packet_count)
 {
-	/* If zero count is less than 100, use it as the sleep time in us */
-	if (zero_rx_packet_count < SLEEP_GEAR1_THRESHOLD)
-		return zero_rx_packet_count;
-	/* If zero count is less than 1000, sleep time should be 100 us */
-	else if ((zero_rx_packet_count >= SLEEP_GEAR1_THRESHOLD) &&
-			(zero_rx_packet_count < SLEEP_GEAR2_THRESHOLD))
-		return SLEEP_GEAR1_THRESHOLD;
-	/* If zero count is greater than 1000, sleep time should be 1000 us */
-	else if (zero_rx_packet_count >= SLEEP_GEAR2_THRESHOLD)
-		return SLEEP_GEAR2_THRESHOLD;
+	/* If zero count is less than 100,  sleep 1us */
+	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
+		return MINIMUM_SLEEP_TIME;
+	/* If zero count is less than 1000, sleep 100 us which is the
+		minimum latency switching from C3/C6 to C0
+	*/
+	else
+		return SUSPEND_THRESHOLD;
 
 	return 0;
 }
@@ -759,6 +772,84 @@ power_freq_scaleup_heuristic(unsigned lcore_id,
 	return FREQ_CURRENT;
 }
 
+/**
+ * force polling thread sleep until one-shot rx interrupt triggers
+ * @param port_id
+ *  Port id.
+ * @param queue_id
+ *  Rx queue id.
+ * @return
+ *  0 on success
+ */
+static int
+sleep_until_rx_interrupt(int num)
+{
+	struct rte_epoll_event event[num];
+	int n, i;
+	uint8_t port_id, queue_id;
+	void *data;
+
+	RTE_LOG(INFO, L3FWD_POWER,
+		"lcore %u sleeps until interrupt triggers\n",
+		rte_lcore_id());
+
+	n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, num, -1);
+	for (i = 0; i < n; i++) {
+		data = event[i].epdata.data;
+		port_id = ((uintptr_t)data) >> CHAR_BIT;
+		queue_id = ((uintptr_t)data) &
+			RTE_LEN2MASK(CHAR_BIT, uint8_t);
+		RTE_LOG(INFO, L3FWD_POWER,
+			"lcore %u is waked up from rx interrupt on"
+			" port %d queue %d\n",
+			rte_lcore_id(), port_id, queue_id);
+	}
+
+	return 0;
+}
+
+static int turn_on_intr(struct lcore_conf *qconf)
+{
+	int i;
+	struct lcore_rx_queue *rx_queue;
+	uint8_t port_id, queue_id;
+
+	for (i = 0; i < qconf->n_rx_queue; ++i) {
+		rx_queue = &(qconf->rx_queue_list[i]);
+		port_id = rx_queue->port_id;
+		queue_id = rx_queue->queue_id;
+
+		rte_spinlock_lock(&(locks[port_id]));
+		rte_eth_dev_rx_intr_enable(port_id, queue_id);
+		rte_spinlock_unlock(&(locks[port_id]));
+	}
+}
+
+static int event_register(struct lcore_conf *qconf)
+{
+	struct lcore_rx_queue *rx_queue;
+	uint8_t portid, queueid;
+	uint32_t data;
+	int ret;
+	int i;
+
+	for (i = 0; i < qconf->n_rx_queue; ++i) {
+		rx_queue = &(qconf->rx_queue_list[i]);
+		portid = rx_queue->port_id;
+		queueid = rx_queue->queue_id;
+		data = portid << CHAR_BIT | queueid;
+
+		ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid,
+						RTE_EPOLL_PER_THREAD,
+						RTE_INTR_EVENT_ADD,
+						(void *)((uintptr_t)data));
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 /* main processing loop */
 static int
 main_loop(__attribute__((unused)) void *dummy)
@@ -772,9 +863,9 @@ main_loop(__attribute__((unused)) void *dummy)
 	struct lcore_conf *qconf;
 	struct lcore_rx_queue *rx_queue;
 	enum freq_scale_hint_t lcore_scaleup_hint;
-
 	uint32_t lcore_rx_idle_count = 0;
 	uint32_t lcore_idle_hint = 0;
+	int intr_en = 0;
 
 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
 
@@ -791,13 +882,18 @@ main_loop(__attribute__((unused)) void *dummy)
 	RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n", lcore_id);
 
 	for (i = 0; i < qconf->n_rx_queue; i++) {
-
 		portid = qconf->rx_queue_list[i].port_id;
 		queueid = qconf->rx_queue_list[i].queue_id;
 		RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%hhu "
 			"rxqueueid=%hhu\n", lcore_id, portid, queueid);
 	}
 
+	/* add into event wait list */
+	if (port_conf.intr_conf.rxq && event_register(qconf) == 0)
+		intr_en = 1;
+	else
+		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n");
+
 	while (1) {
 		stats[lcore_id].nb_iteration_looped++;
 
@@ -832,6 +928,7 @@ main_loop(__attribute__((unused)) void *dummy)
 			prev_tsc_power = cur_tsc_power;
 		}
 
+start_rx:
 		/*
 		 * Read packet from RX queues
 		 */
@@ -845,6 +942,7 @@ main_loop(__attribute__((unused)) void *dummy)
 
 			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
 								MAX_PKT_BURST);
+
 			stats[lcore_id].nb_rx_processed += nb_rx;
 			if (unlikely(nb_rx == 0)) {
 				/**
@@ -907,10 +1005,13 @@ main_loop(__attribute__((unused)) void *dummy)
 						rx_queue->freq_up_hint;
 			}
 
-			if (lcore_scaleup_hint == FREQ_HIGHEST)
-				rte_power_freq_max(lcore_id);
-			else if (lcore_scaleup_hint == FREQ_HIGHER)
-				rte_power_freq_up(lcore_id);
+			if (lcore_scaleup_hint == FREQ_HIGHEST) {
+				if (rte_power_freq_max)
+					rte_power_freq_max(lcore_id);
+			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
+				if (rte_power_freq_up)
+					rte_power_freq_up(lcore_id);
+			}
 		} else {
 			/**
 			 * All Rx queues empty in recent consecutive polls,
@@ -925,16 +1026,23 @@ main_loop(__attribute__((unused)) void *dummy)
 					lcore_idle_hint = rx_queue->idle_hint;
 			}
 
-			if ( lcore_idle_hint < SLEEP_GEAR1_THRESHOLD)
+			if (lcore_idle_hint < SUSPEND_THRESHOLD)
 				/**
-				 * execute "pause" instruction to avoid context
-				 * switch for short sleep.
- 				 */
+				* execute "pause" instruction to avoid context
+				* switch which generally take hundres of
+				* microsecond for short sleep.
+				*/
 				rte_delay_us(lcore_idle_hint);
-			else
-				/* long sleep force runing thread to suspend */
-				usleep(lcore_idle_hint);
-
+			else {
+				/* suspend untill rx interrupt trigges */
+				if (intr_en) {
+					turn_on_intr(qconf);
+					sleep_until_rx_interrupt(
+						qconf->n_rx_queue);
+				}
+				/* start receiving packets immediately */
+				goto start_rx;
+			}
 			stats[lcore_id].sleep_time += lcore_idle_hint;
 		}
 	}
@@ -1267,7 +1375,7 @@ setup_hash(int socketid)
 	char s[64];
 
 	/* create ipv4 hash */
-	snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
 	ipv4_l3fwd_hash_params.name = s;
 	ipv4_l3fwd_hash_params.socket_id = socketid;
 	ipv4_l3fwd_lookup_struct[socketid] =
@@ -1277,7 +1385,7 @@ setup_hash(int socketid)
 				"socket %d\n", socketid);
 
 	/* create ipv6 hash */
-	snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
 	ipv6_l3fwd_hash_params.name = s;
 	ipv6_l3fwd_hash_params.socket_id = socketid;
 	ipv6_l3fwd_lookup_struct[socketid] =
@@ -1471,6 +1579,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint64_t hz;
 	uint32_t n_tx_queue, nb_lcores;
+	uint32_t dev_rxq_num, dev_txq_num;
 	uint8_t portid, nb_rx_queue, queue, socketid;
 
 	/* catch SIGINT and restore cpufreq governor to ondemand */
@@ -1520,10 +1629,19 @@ main(int argc, char **argv)
 		printf("Initializing port %d ... ", portid );
 		fflush(stdout);
 
+		rte_eth_dev_info_get(portid, &dev_info);
+		dev_rxq_num = dev_info.max_rx_queues;
+		dev_txq_num = dev_info.max_tx_queues;
+
 		nb_rx_queue = get_port_n_rx_queues(portid);
+		if (nb_rx_queue > dev_rxq_num)
+			rte_exit(EXIT_FAILURE,
+				"Cannot configure not existed rxq: "
+				"port=%d\n", portid);
+
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
+		if (n_tx_queue > dev_txq_num)
+			n_tx_queue = dev_txq_num;
 		printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
 			nb_rx_queue, (unsigned)n_tx_queue );
 		ret = rte_eth_dev_configure(portid, nb_rx_queue,
@@ -1547,6 +1665,9 @@ main(int argc, char **argv)
 			if (rte_lcore_is_enabled(lcore_id) == 0)
 				continue;
 
+			if (queueid >= dev_txq_num)
+				continue;
+
 			if (numa_on)
 				socketid = \
 				(uint8_t)rte_lcore_to_socket_id(lcore_id);
@@ -1581,8 +1702,9 @@ main(int argc, char **argv)
 		/* init power management library */
 		ret = rte_power_init(lcore_id);
 		if (ret)
-			rte_exit(EXIT_FAILURE, "Power management library "
-				"initialization failed on core%u\n", lcore_id);
+			rte_log(RTE_LOG_ERR, RTE_LOGTYPE_POWER,
+				"Power management library initialization "
+				"failed on core%u", lcore_id);
 
 		/* init timer structures for each enabled lcore */
 		rte_timer_init(&power_timers[lcore_id]);
@@ -1630,7 +1752,6 @@ main(int argc, char **argv)
 		if (ret < 0)
 			rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, "
 						"port=%d\n", ret, portid);
-
 		/*
 		 * If enabled, put device in promiscuous mode.
 		 * This allows IO forwarding mode to forward packets
@@ -1639,6 +1760,8 @@ main(int argc, char **argv)
 		 */
 		if (promiscuous_on)
 			rte_eth_promiscuous_enable(portid);
+		/* initialize spinlock for each port */
+		rte_spinlock_init(&(locks[portid]));
 	}
 
 	check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask);
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
                           ` (10 preceding siblings ...)
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 11/12] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch Cunming Liang
@ 2015-05-29  8:45         ` Cunming Liang
  2015-05-29 15:27           ` Stephen Hemminger
                             ` (2 more replies)
  2015-06-02  6:53         ` [dpdk-dev] [PATCH v10 00/13] Interrupt mode PMD Cunming Liang
  12 siblings, 3 replies; 242+ messages in thread
From: Cunming Liang @ 2015-05-29  8:45 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

RTE_EAL_RX_INTR will be removed in v2.2. It is only used to avoid an unannounced ABI break in v2.1.
Users should make sure they understand the impact before turning on the feature.
Two ABI changes are required by this interrupt patch set:
1) struct rte_intr_handle; 2) struct rte_intr_conf.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
 drivers/net/e1000/igb_ethdev.c                     | 28 ++++++++-
 drivers/net/ixgbe/ixgbe_ethdev.c                   | 41 ++++++++++++-
 examples/l3fwd-power/main.c                        |  4 +-
 .../bsdapp/eal/include/exec-env/rte_interrupts.h   |  7 +++
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 12 ++++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h | 68 +++++++++++++++++++++-
 lib/librte_ether/rte_ethdev.c                      |  2 +
 lib/librte_ether/rte_ethdev.h                      | 32 +++++++++-
 8 files changed, 183 insertions(+), 11 deletions(-)

diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index bbd7b74..6f29222 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -96,7 +96,9 @@ static int  eth_igb_flow_ctrl_get(struct rte_eth_dev *dev,
 static int  eth_igb_flow_ctrl_set(struct rte_eth_dev *dev,
 				struct rte_eth_fc_conf *fc_conf);
 static int eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev);
+#ifdef RTE_EAL_RX_INTR
 static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev);
+#endif
 static int eth_igb_interrupt_get_status(struct rte_eth_dev *dev);
 static int eth_igb_interrupt_action(struct rte_eth_dev *dev);
 static void eth_igb_interrupt_handler(struct rte_intr_handle *handle,
@@ -199,11 +201,15 @@ static int eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev,
 					uint16_t queue_id);
 static int eth_igb_rx_queue_intr_disable(struct rte_eth_dev *dev,
 					uint16_t queue_id);
+#ifdef RTE_EAL_RX_INTR
 static void eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
 				uint8_t queue, uint8_t msix_vector);
+#endif
 static void eth_igb_configure_msix_intr(struct rte_eth_dev *dev);
+#ifdef RTE_EAL_RX_INTR
 static void eth_igb_write_ivar(struct e1000_hw *hw, uint8_t msix_vector,
 				uint8_t index, uint8_t offset);
+#endif
 
 /*
  * Define VF Stats MACRO for Non "cleared on read" register
@@ -760,7 +766,9 @@ eth_igb_start(struct rte_eth_dev *dev)
 	struct e1000_hw *hw =
 		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+#ifdef RTE_EAL_RX_INTR
 	uint32_t intr_vector = 0;
+#endif
 	int ret, mask;
 	uint32_t ctrl_ext;
 
@@ -801,6 +809,7 @@ eth_igb_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	igb_pf_host_configure(dev);
 
+#ifdef RTE_EAL_RX_INTR
 	/* check and configure queue intr-vector mapping */
 	if (dev->data->dev_conf.intr_conf.rxq != 0)
 		intr_vector = dev->data->nb_rx_queues;
@@ -818,6 +827,7 @@ eth_igb_start(struct rte_eth_dev *dev)
 			return -ENOMEM;
 		}
 	}
+#endif
 
 	/* confiugre msix for rx interrupt */
 	eth_igb_configure_msix_intr(dev);
@@ -913,9 +923,11 @@ eth_igb_start(struct rte_eth_dev *dev)
 				     " no intr multiplex\n");
 	}
 
+#ifdef RTE_EAL_RX_INTR
 	/* check if rxq interrupt is enabled */
 	if (dev->data->dev_conf.intr_conf.rxq != 0)
 		eth_igb_rxq_interrupt_setup(dev);
+#endif
 
 	/* enable uio/vfio intr/eventfd mapping */
 	rte_intr_enable(intr_handle);
@@ -1007,12 +1019,14 @@ eth_igb_stop(struct rte_eth_dev *dev)
 	}
 	filter_info->twotuple_mask = 0;
 
+#ifdef RTE_EAL_RX_INTR
 	/* Clean datapath event and queue/vec mapping */
 	rte_intr_efd_disable(intr_handle);
 	if (intr_handle->intr_vec != NULL) {
 		rte_free(intr_handle->intr_vec);
 		intr_handle->intr_vec = NULL;
 	}
+#endif
 }
 
 static void
@@ -1020,7 +1034,9 @@ eth_igb_close(struct rte_eth_dev *dev)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 	struct rte_eth_link link;
+#ifdef RTE_EAL_RX_INTR
 	struct rte_pci_device *pci_dev;
+#endif
 
 	eth_igb_stop(dev);
 	e1000_phy_hw_reset(hw);
@@ -1038,11 +1054,13 @@ eth_igb_close(struct rte_eth_dev *dev)
 
 	igb_dev_clear_queues(dev);
 
+#ifdef RTE_EAL_RX_INTR
 	pci_dev = dev->pci_dev;
 	if (pci_dev->intr_handle.intr_vec) {
 		rte_free(pci_dev->intr_handle.intr_vec);
 		pci_dev->intr_handle.intr_vec = NULL;
 	}
+#endif
 
 	memset(&link, 0, sizeof(link));
 	rte_igb_dev_atomic_write_link_status(dev, &link);
@@ -1867,6 +1885,7 @@ eth_igb_lsc_interrupt_setup(struct rte_eth_dev *dev)
 	return 0;
 }
 
+#ifdef RTE_EAL_RX_INTR
 /*
  * It clears the interrupt causes and enables the interrupt.
  * It will be called once only during nic initialized.
@@ -1894,6 +1913,7 @@ static int eth_igb_rxq_interrupt_setup(struct rte_eth_dev *dev)
 
 	return 0;
 }
+#endif
 
 /*
  * It reads ICR and gets interrupt causes, check it and set a bit flag
@@ -3750,6 +3770,7 @@ eth_igb_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
 	return 0;
 }
 
+#ifdef RTE_EAL_RX_INTR
 static void
 eth_igb_write_ivar(struct e1000_hw *hw, uint8_t  msix_vector,
 			uint8_t index, uint8_t offset)
@@ -3791,6 +3812,7 @@ eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
 					((queue & 0x1) << 4) + 8 * direction);
 	}
 }
+#endif
 
 /*
  * Sets up the hardware to generate MSI-X interrupts properly
@@ -3800,18 +3822,21 @@ eth_igb_assign_msix_vector(struct e1000_hw *hw, int8_t direction,
 static void
 eth_igb_configure_msix_intr(struct rte_eth_dev *dev)
 {
+#ifdef RTE_EAL_RX_INTR
 	int queue_id;
 	uint32_t tmpval, regval, intr_mask;
 	struct e1000_hw *hw =
 		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 	uint32_t vec = 0;
+#endif
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 
 	/* won't configure msix register if no mapping is done
 	 * between intr vector and event fd */
 	if (!rte_intr_dp_is_en(intr_handle))
 		return;
 
+#ifdef RTE_EAL_RX_INTR
 	/* set interrupt vector for other causes */
 	if (hw->mac.type == e1000_82575) {
 		tmpval = E1000_READ_REG(hw, E1000_CTRL_EXT);
@@ -3868,6 +3893,7 @@ eth_igb_configure_msix_intr(struct rte_eth_dev *dev)
 	}
 
 	E1000_WRITE_FLUSH(hw);
+#endif
 }
 
 
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 798bb85..8c7bc99 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -174,7 +174,9 @@ static int ixgbe_dev_rss_reta_query(struct rte_eth_dev *dev,
 			uint16_t reta_size);
 static void ixgbe_dev_link_status_print(struct rte_eth_dev *dev);
 static int ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev);
+#ifdef RTE_EAL_RX_INTR
 static int ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev);
+#endif
 static int ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev);
 static int ixgbe_dev_interrupt_action(struct rte_eth_dev *dev);
 static void ixgbe_dev_interrupt_handler(struct rte_intr_handle *handle,
@@ -210,8 +212,10 @@ static int ixgbevf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
 		uint16_t queue_id);
 static int ixgbevf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
 		 uint16_t queue_id);
+#ifdef RTE_EAL_RX_INTR
 static void ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
 		 uint8_t queue, uint8_t msix_vector);
+#endif
 static void ixgbevf_configure_msix(struct rte_eth_dev *dev);
 
 /* For Eth VMDQ APIs support */
@@ -234,8 +238,10 @@ static int ixgbe_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
 					uint16_t queue_id);
 static int ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
 					uint16_t queue_id);
+#ifdef RTE_EAL_RX_INTR
 static void ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
 				uint8_t queue, uint8_t msix_vector);
+#endif
 static void ixgbe_configure_msix(struct rte_eth_dev *dev);
 
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
@@ -1481,7 +1487,9 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 	struct ixgbe_vf_info *vfinfo =
 		*IXGBE_DEV_PRIVATE_TO_P_VFDATA(dev->data->dev_private);
 	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+#ifdef RTE_EAL_RX_INTR
 	uint32_t intr_vector = 0;
+#endif
 	int err, link_up = 0, negotiate = 0;
 	uint32_t speed = 0;
 	int mask = 0;
@@ -1514,6 +1522,7 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 	/* configure PF module if SRIOV enabled */
 	ixgbe_pf_host_configure(dev);
 
+#ifdef RTE_EAL_RX_INTR
 	/* check and configure queue intr-vector mapping */
 	if (dev->data->dev_conf.intr_conf.rxq != 0)
 		intr_vector = dev->data->nb_rx_queues;
@@ -1532,6 +1541,7 @@ ixgbe_dev_start(struct rte_eth_dev *dev)
 			return -1;
 		}
 	}
+#endif
 
 	/* confiugre msix for sleep until rx interrupt */
 	ixgbe_configure_msix(dev);
@@ -1619,9 +1629,11 @@ skip_link_setup:
 				     " no intr multiplex\n");
 	}
 
+#ifdef RTE_EAL_RX_INTR
 	/* check if rxq interrupt is enabled */
 	if (dev->data->dev_conf.intr_conf.rxq != 0)
 		ixgbe_dev_rxq_interrupt_setup(dev);
+#endif
 
 	/* enable uio/vfio intr/eventfd mapping */
 	rte_intr_enable(intr_handle);
@@ -1727,12 +1739,14 @@ ixgbe_dev_stop(struct rte_eth_dev *dev)
 	memset(filter_info->fivetuple_mask, 0,
 		sizeof(uint32_t) * IXGBE_5TUPLE_ARRAY_SIZE);
 
+#ifdef RTE_EAL_RX_INTR
 	/* Clean datapath event and queue/vec mapping */
 	rte_intr_efd_disable(intr_handle);
 	if (intr_handle->intr_vec != NULL) {
 		rte_free(intr_handle->intr_vec);
 		intr_handle->intr_vec = NULL;
 	}
+#endif
 }
 
 /*
@@ -2335,6 +2349,7 @@ ixgbe_dev_lsc_interrupt_setup(struct rte_eth_dev *dev)
  *  - On success, zero.
  *  - On failure, a negative value.
  */
+#ifdef RTE_EAL_RX_INTR
 static int
 ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev)
 {
@@ -2345,6 +2360,7 @@ ixgbe_dev_rxq_interrupt_setup(struct rte_eth_dev *dev)
 
 	return 0;
 }
+#endif
 
 /*
  * It reads ICR and sets flag (IXGBE_EICR_LSC) for the link_update.
@@ -3127,7 +3143,9 @@ ixgbevf_dev_start(struct rte_eth_dev *dev)
 {
 	struct ixgbe_hw *hw =
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+#ifdef RTE_EAL_RX_INTR
 	uint32_t intr_vector = 0;
+#endif
 	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 
 	int err, mask = 0;
@@ -3160,6 +3178,7 @@ ixgbevf_dev_start(struct rte_eth_dev *dev)
 
 	ixgbevf_dev_rxtx_start(dev);
 
+#ifdef RTE_EAL_RX_INTR
 	/* check and configure queue intr-vector mapping */
 	if (dev->data->dev_conf.intr_conf.rxq != 0)
 		intr_vector = dev->data->nb_rx_queues;
@@ -3177,7 +3196,7 @@ ixgbevf_dev_start(struct rte_eth_dev *dev)
 			return -ENOMEM;
 		}
 	}
-
+#endif
 	ixgbevf_configure_msix(dev);
 
 	if (dev->data->dev_conf.intr_conf.lsc != 0) {
@@ -3223,19 +3242,23 @@ ixgbevf_dev_stop(struct rte_eth_dev *dev)
 	/* disable intr eventfd mapping */
 	rte_intr_disable(intr_handle);
 
+#ifdef RTE_EAL_RX_INTR
 	/* Clean datapath event and queue/vec mapping */
 	rte_intr_efd_disable(intr_handle);
 	if (intr_handle->intr_vec != NULL) {
 		rte_free(intr_handle->intr_vec);
 		intr_handle->intr_vec = NULL;
 	}
+#endif
 }
 
 static void
 ixgbevf_dev_close(struct rte_eth_dev *dev)
 {
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+#ifdef RTE_EAL_RX_INTR
 	struct rte_pci_device *pci_dev;
+#endif
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -3246,11 +3269,13 @@ ixgbevf_dev_close(struct rte_eth_dev *dev)
 	/* reprogram the RAR[0] in case user changed it. */
 	ixgbe_set_rar(hw, 0, hw->mac.addr, 0, IXGBE_RAH_AV);
 
+#ifdef RTE_EAL_RX_INTR
 	pci_dev = dev->pci_dev;
 	if (pci_dev->intr_handle.intr_vec) {
 		rte_free(pci_dev->intr_handle.intr_vec);
 		pci_dev->intr_handle.intr_vec = NULL;
 	}
+#endif
 }
 
 static void ixgbevf_set_vfta_all(struct rte_eth_dev *dev, bool on)
@@ -3834,6 +3859,7 @@ ixgbe_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
 	return 0;
 }
 
+#ifdef RTE_EAL_RX_INTR
 static void
 ixgbevf_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
 			uint8_t queue, uint8_t msix_vector)
@@ -3902,21 +3928,25 @@ ixgbe_set_ivar_map(struct ixgbe_hw *hw, int8_t direction,
 		}
 	}
 }
+#endif
 
 static void
 ixgbevf_configure_msix(struct rte_eth_dev *dev)
 {
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+#ifdef RTE_EAL_RX_INTR
 	struct ixgbe_hw *hw =
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 	uint32_t q_idx;
 	uint32_t vector_idx = 0;
+#endif
 
 	/* won't configure msix register if no mapping is done
 	 * between intr vector and event fd */
 	if (!rte_intr_dp_is_en(intr_handle))
 		return;
 
+#ifdef RTE_EAL_RX_INTR
 	/* Configure all RX queues of VF */
 	for (q_idx = 0; q_idx < dev->data->nb_rx_queues; q_idx++) {
 		/* Force all queue use vector 0,
@@ -3927,6 +3957,7 @@ ixgbevf_configure_msix(struct rte_eth_dev *dev)
 
 	/* Configure VF Rx queue ivar */
 	ixgbevf_set_ivar_map(hw, -1, 1, vector_idx);
+#endif
 }
 
 /**
@@ -3937,18 +3968,21 @@ ixgbevf_configure_msix(struct rte_eth_dev *dev)
 static void
 ixgbe_configure_msix(struct rte_eth_dev *dev)
 {
+	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
+#ifdef RTE_EAL_RX_INTR
 	struct ixgbe_hw *hw =
 		IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-	struct rte_intr_handle *intr_handle = &dev->pci_dev->intr_handle;
 	uint32_t queue_id, vec = 0;
 	uint32_t mask;
 	uint32_t gpie;
+#endif
 
 	/* won't configure msix register if no mapping is done
 	 * between intr vector and event fd */
 	if (!rte_intr_dp_is_en(intr_handle))
 		return;
 
+#ifdef RTE_EAL_RX_INTR
 	/* setup GPIE for MSI-x mode */
 	gpie = IXGBE_READ_REG(hw, IXGBE_GPIE);
 	gpie |= IXGBE_GPIE_MSIX_MODE | IXGBE_GPIE_PBA_SUPPORT |
@@ -4000,6 +4034,7 @@ ixgbe_configure_msix(struct rte_eth_dev *dev)
 		  IXGBE_EIMS_LSC);
 
 	IXGBE_WRITE_REG(hw, IXGBE_EIAC, mask);
+#endif
 }
 
 static int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index 538bb93..86ff3e9 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -239,7 +239,7 @@ static struct rte_eth_conf port_conf = {
 	},
 	.intr_conf = {
 		.lsc = 1,
-		.rxq = 1, /**< rxq interrupt feature enabled */
+		.rxq = 1,
 	},
 };
 
@@ -889,7 +889,7 @@ main_loop(__attribute__((unused)) void *dummy)
 	}
 
 	/* add into event wait list */
-	if (port_conf.intr_conf.rxq && event_register(qconf) == 0)
+	if (event_register(qconf) == 0)
 		intr_en = 1;
 	else
 		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n");
diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
index fc2c46b..f0f6a3f 100644
--- a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_interrupts.h
@@ -49,9 +49,16 @@ enum rte_intr_handle_type {
 struct rte_intr_handle {
 	int fd;                          /**< file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+#ifdef RTE_EAL_RX_INTR
+	/**
+	 * RTE_EAL_RX_INTR will be removed from v2.2.
+	 * It's only used to avoid ABI(unannounced) broken in v2.1.
+	 * Make sure being aware of the impact before turning on the feature.
+	 */
 	int max_intr;                    /**< max interrupt requested */
 	uint32_t nb_efd;                 /**< number of available efds */
 	int *intr_vec;               /**< intr vector number array */
+#endif
 };
 
 /**
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 1b80359..abc2062 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -290,18 +290,26 @@ vfio_enable_msix(struct rte_intr_handle *intr_handle) {
 
 	irq_set = (struct vfio_irq_set *) irq_set_buf;
 	irq_set->argsz = len;
+#ifdef RTE_EAL_RX_INTR
 	if (!intr_handle->max_intr)
 		intr_handle->max_intr = 1;
 	else if (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID)
 		intr_handle->max_intr = RTE_MAX_RXTX_INTR_VEC_ID + 1;
 
 	irq_set->count = intr_handle->max_intr;
+#else
+	irq_set->count = 1;
+#endif
 	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 	irq_set->start = 0;
 	fd_ptr = (int *) &irq_set->data;
+#ifdef RTE_EAL_RX_INTR
 	memcpy(fd_ptr, intr_handle->efds, sizeof(intr_handle->efds));
 	fd_ptr[intr_handle->max_intr - 1] = intr_handle->fd;
+#else
+	fd_ptr[0] = intr_handle->fd;
+#endif
 
 	ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 
@@ -875,6 +883,7 @@ rte_eal_intr_init(void)
 	return -ret;
 }
 
+#ifdef RTE_EAL_RX_INTR
 static void
 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
 {
@@ -917,6 +926,7 @@ eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
 		return;
 	} while (1);
 }
+#endif
 
 static int
 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
@@ -1054,6 +1064,7 @@ rte_epoll_ctl(int epfd, int op, int fd,
 	return 0;
 }
 
+#ifdef RTE_EAL_RX_INTR
 int
 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
 		int op, unsigned int vec, void *data)
@@ -1165,3 +1176,4 @@ rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
 	intr_handle->nb_efd = 0;
 	intr_handle->max_intr = 0;
 }
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 7c8a62b..5390b21 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,10 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_
 
+#ifndef RTE_EAL_RX_INTR
+#include <rte_common.h>
+#endif
+
 #define RTE_MAX_RXTX_INTR_VEC_ID     32
 
 enum rte_intr_handle_type {
@@ -86,12 +90,19 @@ struct rte_intr_handle {
 	};
 	int fd;	 /**< interrupt event file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+#ifdef RTE_EAL_RX_INTR
+	/**
+	 * RTE_EAL_RX_INTR will be removed in v2.2.
+	 * It only exists to avoid an unannounced ABI break in v2.1.
+	 * Be aware of the ABI impact before turning this feature on.
+	 */
 	uint32_t max_intr;               /**< max interrupt requested */
 	uint32_t nb_efd;                 /**< number of available efds */
 	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
 	struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
 					 /**< intr vector epoll event */
 	int *intr_vec;                   /**< intr vector number array */
+#endif
 };
 
 #define RTE_EPOLL_PER_THREAD        -1  /**< to hint using per thread epfd */
@@ -162,9 +173,23 @@ rte_intr_tls_epfd(void);
  *   - On success, zero.
  *   - On failure, a negative value.
  */
-int
+#ifdef RTE_EAL_RX_INTR
+extern int
 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
 		int epfd, int op, unsigned int vec, void *data);
+#else
+static inline int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+		int epfd, int op, unsigned int vec, void *data)
+{
+	RTE_SET_USED(intr_handle);
+	RTE_SET_USED(epfd);
+	RTE_SET_USED(op);
+	RTE_SET_USED(vec);
+	RTE_SET_USED(data);
+	return -ENOTSUP;
+}
+#endif
 
 /**
  * It enables the fastpath event fds if it's necessary.
@@ -179,8 +204,18 @@ rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
  *   - On success, zero.
  *   - On failure, a negative value.
  */
-int
+#ifdef RTE_EAL_RX_INTR
+extern int
 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd);
+#else
+static inline int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
+{
+	RTE_SET_USED(intr_handle);
+	RTE_SET_USED(nb_efd);
+	return 0;
+}
+#endif
 
 /**
  * It disable the fastpath event fds.
@@ -189,8 +224,17 @@ rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd);
  * @param intr_handle
  *   Pointer to the interrupt handle.
  */
-void
+#ifdef RTE_EAL_RX_INTR
+extern void
 rte_intr_efd_disable(struct rte_intr_handle *intr_handle);
+#else
+static inline void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
+{
+	RTE_SET_USED(intr_handle);
+	return;
+}
+#endif
 
 /**
  * The fastpath interrupt is enabled or not.
@@ -198,11 +242,20 @@ rte_intr_efd_disable(struct rte_intr_handle *intr_handle);
  * @param intr_handle
  *   Pointer to the interrupt handle.
  */
+#ifdef RTE_EAL_RX_INTR
 static inline int
 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
 {
 	return !(!intr_handle->nb_efd);
 }
+#else
+static inline int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
+{
+	RTE_SET_USED(intr_handle);
+	return 0;
+}
+#endif
 
 /**
  * The interrupt handle instance allows other cause or not.
@@ -211,10 +264,19 @@ rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
  * @param intr_handle
  *   Pointer to the interrupt handle.
  */
+#ifdef RTE_EAL_RX_INTR
 static inline int
 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
 {
 	return !!(intr_handle->max_intr - intr_handle->nb_efd);
 }
+#else
+static inline int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle)
+{
+	RTE_SET_USED(intr_handle);
+	return 1;
+}
+#endif
 
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 846d7f8..823eb46 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -3282,6 +3282,7 @@ _rte_eth_dev_callback_process(struct rte_eth_dev *dev,
 	rte_spinlock_unlock(&rte_eth_dev_cb_lock);
 }
 
+#ifdef RTE_EAL_RX_INTR
 int
 rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data)
 {
@@ -3353,6 +3354,7 @@ rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
 
 	return 0;
 }
+#endif
 
 int
 rte_eth_dev_rx_intr_enable(uint8_t port_id,
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index c199d32..8bea68d 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -830,8 +830,10 @@ struct rte_eth_fdir {
 struct rte_intr_conf {
 	/** enable/disable lsc interrupt. 0 (default) - disable, 1 enable */
 	uint16_t lsc;
+#ifdef RTE_EAL_RX_INTR
 	/** enable/disable rxq interrupt. 0 (default) - disable, 1 enable */
 	uint16_t rxq;
+#endif
 };
 
 /**
@@ -2943,8 +2945,20 @@ int rte_eth_dev_rx_intr_disable(uint8_t port_id,
  *   - On success, zero.
  *   - On failure, a negative value.
  */
-int
+#ifdef RTE_EAL_RX_INTR
+extern int
 rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data);
+#else
+static inline int
+rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data)
+{
+	RTE_SET_USED(port_id);
+	RTE_SET_USED(epfd);
+	RTE_SET_USED(op);
+	RTE_SET_USED(data);
+	return -1;
+}
+#endif
 
 /**
  * RX Interrupt control per queue.
@@ -2967,9 +2981,23 @@ rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data);
  *   - On success, zero.
  *   - On failure, a negative value.
  */
-int
+#ifdef RTE_EAL_RX_INTR
+extern int
 rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
 			  int epfd, int op, void *data);
+#else
+static inline int
+rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
+			  int epfd, int op, void *data)
+{
+	RTE_SET_USED(port_id);
+	RTE_SET_USED(queue_id);
+	RTE_SET_USED(epfd);
+	RTE_SET_USED(op);
+	RTE_SET_USED(data);
+	return -1;
+}
+#endif
 
 /**
  * Turn on the LED on the Ethernet device.
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v8 01/11] eal/linux: add interrupt vectors support in intr_handle
  2015-05-21 17:58             ` Neil Horman
  2015-05-21 18:21               ` Stephen Hemminger
       [not found]               ` <20150521111400.2a04a196@urahara>
@ 2015-05-29  8:56               ` Liang, Cunming
  2 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-05-29  8:56 UTC (permalink / raw)
  To: Neil Horman, Stephen Hemminger; +Cc: dev, liang-min.wang

Hi Neil,

On 5/22/2015 1:58 AM, Neil Horman wrote:
> On Thu, May 21, 2015 at 10:43:00AM -0700, Stephen Hemminger wrote:
>> On Thu, 21 May 2015 06:32:02 -0400
>> Neil Horman <nhorman@tuxdriver.com> wrote:
>>
>>> On Thu, May 21, 2015 at 04:55:53PM +0800, Cunming Liang wrote:
>>>> The patch adds interrupt vectors support in rte_intr_handle.
>>>> 'vec_en' is set when interrupt vectors are detected and associated event fds are set.
>>>> Those event fds are stored in efds[].
>>>> 'intr_vec' is reserved for device driver to initialize the vector mapping table.
>>>> When the event fds add to a specified epoll instance, 'elist' will hold the rte_epoll_event object pointer.
>>>>
>>>> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
>>>> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
>>>> ---
>>>> v7 changes:
>>>>   - add eptrs[], it's used to store the register rte_epoll_event instances.
>>>>   - add vec_en, to log the vector capability status.
>>>>
>>>> v6 changes:
>>>>   - add mapping table between irq vector number and queue id.
>>>>
>>>> v5 changes:
>>>>   - Create this new patch file for changed struct rte_intr_handle that
>>>>     other patches depend on, to avoid breaking git bisect.
>>>>
>>>>   lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
>>>>   1 file changed, 10 insertions(+)
>>>>
>>>> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
>>>> index 6a159c7..27174df 100644
>>>> --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
>>>> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
>>>> @@ -38,6 +38,8 @@
>>>>   #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
>>>>   #define _RTE_LINUXAPP_INTERRUPTS_H_
>>>>   
>>>> +#define RTE_MAX_RXTX_INTR_VEC_ID     32
>>>> +
>>>>   enum rte_intr_handle_type {
>>>>   	RTE_INTR_HANDLE_UNKNOWN = 0,
>>>>   	RTE_INTR_HANDLE_UIO,      /**< uio device handle */
>>>> @@ -48,6 +50,8 @@ enum rte_intr_handle_type {
>>>>   	RTE_INTR_HANDLE_MAX
>>>>   };
>>>>   
>>>> +struct rte_epoll_event;
>>>> +
>>>>   /** Handle for interrupts. */
>>>>   struct rte_intr_handle {
>>>>   	union {
>>>> @@ -57,6 +61,12 @@ struct rte_intr_handle {
>>>>   	};
>>>>   	int fd;	 /**< interrupt event file descriptor */
>>>>   	enum rte_intr_handle_type type;  /**< handle type */
>>>> +	uint32_t max_intr;               /**< max interrupt requested */
>>>> +	uint32_t nb_efd;                 /**< number of available efds */
>>>> +	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
>>>> +	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
>>>> +					 /**< intr vector epoll event ptr */
>>>> +	int *intr_vec;                   /**< intr vector number array */
>>>>   };
>>>>     
>>> This is going to be ABI breaking if this from test_interrupts.c:
>>> static struct rte_intr_handle intr_handles[TEST_INTERRUPT_HANDLE_MAX];
>>>
>>> is a plausible way of using this structure.  Even putting the data at the end of
>>> the structure won't help, as the array indicies are off
>> This needs to go in 2.0 and 2.0 has to have new ABI anyway.
>>
> We've already released 2.0, I think you mean 2.1, but 2.1 can't have a new ABI
> because we didn't announce it in 1.8.  The earliest we can update the ABI
> (according to the ABI docs) at this point is 2.2, since we need to announce the
> change in 2.1, then make it in 2.2
>
> Neil
>
I'll follow your guidance and send a separate patch to announce the ABI
changes in this release.
For this code patch series, I propose turning the whole feature off by
default so as to avoid breaking the ABI in this release.
In the next release, v2.2, I'll send another cleanup patch to remove the
feature macro.
This way, we neither block the feature code review nor break the ABI.
Users who still want to use the feature in v2.1 must be aware of the
impact of the ABI changes and accept the risk of turning the feature on
manually.
The v9 patch will include this part. Does that sound good to you?

Thanks,
Steve

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue Cunming Liang
@ 2015-05-29 15:27           ` Stephen Hemminger
  2015-06-01  8:48             ` Liang, Cunming
  2015-05-29 15:36           ` Vincent JARDIN
  2015-06-01 14:11           ` Stephen Hemminger
  2 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-29 15:27 UTC (permalink / raw)
  To: dev

On Fri, 29 May 2015 16:45:25 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> +#ifdef RTE_EAL_RX_INTR
> +extern int
>  rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data);
> +#else
> +static inline int
> +rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data)
> +{
> +	RTE_SET_USED(port_id);
> +	RTE_SET_USED(epfd);
> +	RTE_SET_USED(op);
> +	RTE_SET_USED(data);
> +	return -1;
> +}
> +#endif

Doing ABI compatibility is good but hard.

I think it would be better not to provide the functions for rx_intr_ctl unless
the feature was configured on. That way anyone using them with incorrect config
would detect failure at build time, rather than run time.

Also, doesn't some doc file have to be updated for the announcement?

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue Cunming Liang
  2015-05-29 15:27           ` Stephen Hemminger
@ 2015-05-29 15:36           ` Vincent JARDIN
  2015-06-01 14:11           ` Stephen Hemminger
  2 siblings, 0 replies; 242+ messages in thread
From: Vincent JARDIN @ 2015-05-29 15:36 UTC (permalink / raw)
  To: Cunming Liang, dev; +Cc: liang-min.wang, shemming

On 29/05/2015 10:45, Cunming Liang wrote:
> RTE_EAL_RX_INTR will be removed from v2.2. It's only used to avoid ABI(unannounced) broken in v2.1.
> The usrs should make sure understand the impact before turning on the feature.
> There are two abi changes required in this interrupt patch set.
> They're 1) struct rte_intr_handle; 2) struct rte_intr_conf.
>
> Signed-off-by: Cunming Liang<cunming.liang@intel.com>

Acked-by: vincent jardin <vincent.jardin@6wind.com>

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v9 09/12] ixgbe: enable rx queue interrupts for both PF and VF
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 09/12] ixgbe: enable rx queue interrupts for both PF and VF Cunming Liang
@ 2015-05-29 15:57           ` Stephen Hemminger
  0 siblings, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-05-29 15:57 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev, liang-min.wang

On Fri, 29 May 2015 16:45:22 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> +		if (intr_handle->intr_vec == NULL) {
> +			PMD_INIT_LOG(ERR, "Failed to allocate %d rx_queues"
> +				     "intr_vec\n", dev->data->nb_rx_queues);
> +			return -1;
> +		}
> +

Please return an actual error code (like -ENOMEM) rather than generic -1.
It allows for easier diagnosis when using API's.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue
  2015-05-29 15:27           ` Stephen Hemminger
@ 2015-06-01  8:48             ` Liang, Cunming
  2015-06-01 13:27               ` Stephen Hemminger
  0 siblings, 1 reply; 242+ messages in thread
From: Liang, Cunming @ 2015-06-01  8:48 UTC (permalink / raw)
  To: Stephen Hemminger, dev

Hi Stephen,

On 5/29/2015 11:27 PM, Stephen Hemminger wrote:
> On Fri, 29 May 2015 16:45:25 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
>> +#ifdef RTE_EAL_RX_INTR
>> +extern int
>>   rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data);
>> +#else
>> +static inline int
>> +rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data)
>> +{
>> +	RTE_SET_USED(port_id);
>> +	RTE_SET_USED(epfd);
>> +	RTE_SET_USED(op);
>> +	RTE_SET_USED(data);
>> +	return -1;
>> +}
>> +#endif
> Doing ABI compatibility is good but hard.
>
> I think it would be better not to provide the functions for rx_intr_ctl unless
> the feature was configured on. That way anyone using them with incorrect config
> would detect failure at build time, rather than run time.
I tend to not agree. For rx_intr_ctl/rx_intr_ctl_q, no matter w/ or w/o 
RTE_EAL_RX_INTR, it's necessary to check the return value.
The failure return shall cause application give up using epoll waiting 
on the specified epfd for the port, and then degraded to pure polling mode.
So I think these failure should be handled by the caller.
>
> Also, doesn't some doc file have to be updated for the announcement?
Yes, the ABI section in the release notes (doc/guides/rel_notes/abi.rst)
will be updated for the announcement.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue
  2015-06-01  8:48             ` Liang, Cunming
@ 2015-06-01 13:27               ` Stephen Hemminger
  2015-06-02  2:14                 ` Liang, Cunming
  0 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-06-01 13:27 UTC (permalink / raw)
  To: Liang, Cunming; +Cc: dev

On Mon, 1 Jun 2015 16:48:01 +0800
"Liang, Cunming" <cunming.liang@intel.com> wrote:

> Hi Stephen,
> 
> On 5/29/2015 11:27 PM, Stephen Hemminger wrote:
> > On Fri, 29 May 2015 16:45:25 +0800
> > Cunming Liang <cunming.liang@intel.com> wrote:
> >  
> >> +#ifdef RTE_EAL_RX_INTR
> >> +extern int
> >>   rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data);
> >> +#else
> >> +static inline int
> >> +rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data)
> >> +{
> >> +	RTE_SET_USED(port_id);
> >> +	RTE_SET_USED(epfd);
> >> +	RTE_SET_USED(op);
> >> +	RTE_SET_USED(data);
> >> +	return -1;
> >> +}
> >> +#endif  
> > Doing ABI compatibility is good but hard.
> >
> > I think it would be better not to provide the functions for rx_intr_ctl unless
> > the feature was configured on. That way anyone using them with incorrect config
> > would detect failure at build time, rather than run time.  
> I tend to not agree. For rx_intr_ctl/rx_intr_ctl_q, no matter w/ or w/o 
> RTE_EAL_RX_INTR, it's necessary to check the return value.
> The failure return shall cause application give up using epoll waiting 
> on the specified epfd for the port, and then degraded to pure polling mode.
> So I think these failure should be handled by the caller.

It is always best to fail as early in the development process as possible.
What possible benefit could there be from allowing an application to be linked
and run with an incorrect configuration?

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue Cunming Liang
  2015-05-29 15:27           ` Stephen Hemminger
  2015-05-29 15:36           ` Vincent JARDIN
@ 2015-06-01 14:11           ` Stephen Hemminger
  2015-06-01 14:18             ` Stephen Hemminger
  2 siblings, 1 reply; 242+ messages in thread
From: Stephen Hemminger @ 2015-06-01 14:11 UTC (permalink / raw)
  To: Cunming Liang; +Cc: dev, liang-min.wang

On Fri, 29 May 2015 16:45:25 +0800
Cunming Liang <cunming.liang@intel.com> wrote:

> RTE_EAL_RX_INTR will be removed from v2.2. It's only used to avoid ABI(unannounced) broken in v2.1.
> The usrs should make sure understand the impact before turning on the feature.
> There are two abi changes required in this interrupt patch set.
> They're 1) struct rte_intr_handle; 2) struct rte_intr_conf.
> 
> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> ---

While merging for testing I discovered another minor issue.

The patch order here is a problem. The intermediate steps won't build  until
this last patch is applied.

In order to allow git bisect to be useful, it is important that every commit
done in the upstream version build and work. This series does not seem to
build until this last patch is applied. Maybe it should just be first?

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue
  2015-06-01 14:11           ` Stephen Hemminger
@ 2015-06-01 14:18             ` Stephen Hemminger
  0 siblings, 0 replies; 242+ messages in thread
From: Stephen Hemminger @ 2015-06-01 14:18 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, liang-min.wang

Never mind, had wrong version of one of the patches.

On Mon, Jun 1, 2015 at 7:11 AM, Stephen Hemminger <shemming@brocade.com>
wrote:

> On Fri, 29 May 2015 16:45:25 +0800
> Cunming Liang <cunming.liang@intel.com> wrote:
>
> > RTE_EAL_RX_INTR will be removed from v2.2. It's only used to avoid
> ABI(unannounced) broken in v2.1.
> > The usrs should make sure understand the impact before turning on the
> feature.
> > There are two abi changes required in this interrupt patch set.
> > They're 1) struct rte_intr_handle; 2) struct rte_intr_conf.
> >
> > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > ---
>
> While merging for testing I discovered another minor issue.
>
> The patch order here is a problem. The intermediate steps won't build
> until
> this last patch is applied.
>
> In order to allow git bisect to be useful, it is important that every
> commit
> done in the upstream version build and work. This series does not seem to
> build until this last patch is applied. Maybe it should just be first?
>

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue
  2015-06-01 13:27               ` Stephen Hemminger
@ 2015-06-02  2:14                 ` Liang, Cunming
  0 siblings, 0 replies; 242+ messages in thread
From: Liang, Cunming @ 2015-06-02  2:14 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev

Hi Stephen,

On 6/1/2015 9:27 PM, Stephen Hemminger wrote:
> On Mon, 1 Jun 2015 16:48:01 +0800
> "Liang, Cunming" <cunming.liang@intel.com> wrote:
>
>> Hi Stephen,
>>
>> On 5/29/2015 11:27 PM, Stephen Hemminger wrote:
>>> On Fri, 29 May 2015 16:45:25 +0800
>>> Cunming Liang <cunming.liang@intel.com> wrote:
>>>   
>>>> +#ifdef RTE_EAL_RX_INTR
>>>> +extern int
>>>>    rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data);
>>>> +#else
>>>> +static inline int
>>>> +rte_eth_dev_rx_intr_ctl(uint8_t port_id, int epfd, int op, void *data)
>>>> +{
>>>> +	RTE_SET_USED(port_id);
>>>> +	RTE_SET_USED(epfd);
>>>> +	RTE_SET_USED(op);
>>>> +	RTE_SET_USED(data);
>>>> +	return -1;
>>>> +}
>>>> +#endif
>>> Doing ABI compatibility is good but hard.
>>>
>>> I think it would be better not to provide the functions for rx_intr_ctl unless
>>> the feature was configured on. That way anyone using them with incorrect config
>>> would detect failure at build time, rather than run time.
>> I tend to not agree. For rx_intr_ctl/rx_intr_ctl_q, no matter w/ or w/o
>> RTE_EAL_RX_INTR, it's necessary to check the return value.
>> The failure return shall cause application give up using epoll waiting
>> on the specified epfd for the port, and then degraded to pure polling mode.
>> So I think these failure should be handled by the caller.
> It is always best to fail as early in the development process as possible.
> What possible benefit could there be from allowing application to be linked
> and run with incorrect configuration.
In fact, you will always detect the failure at build time if your
application insists on calling rx_intr_ctl with port_conf.intr_conf.rxq=1,
because port_conf.intr_conf.rxq is not defined when RTE_EAL_RX_INTR is not
defined.
If you ignore port_conf.intr_conf.rxq, then whether or not RTE_EAL_RX_INTR
is defined, you will always get a failure return when calling rx_intr_ctl.
So I think this behavior doesn't violate 'fail as early in the development
process as possible'.
Also, I'd like to expose all the new APIs in this version. If we don't
provide this API, what about rte_intr_rx_ctl and the others? They are
probably used by other user applications as well.

^ permalink raw reply	[flat|nested] 242+ messages in thread

* Re: [dpdk-dev] [PATCH v9 01/12] eal/linux: add interrupt vectors support in intr_handle
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 01/12] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
@ 2015-06-02  5:27           ` Liu, Yong
  0 siblings, 0 replies; 242+ messages in thread
From: Liu, Yong @ 2015-06-02  5:27 UTC (permalink / raw)
  To: Liang, Cunming, dev; +Cc: shemming, Wang, Liang-min

Tested-by: Yong Liu <yong.liu@intel.com>

- Tested Commit: 7c4c66bf666b8059ed0ad2f2478ef349b3272f51
- OS: Fedora20 3.15.5
- GCC: gcc version 4.8.3 20140911
- CPU: Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
- NIC: Intel Corporation 82599ES 10-Gigabit SFI/SFP+ [8086:10fb]
- NIC: Intel Corporation I350 Gigabit Network Connection [8086:1521]
- Default x86_64-native-linuxapp-gcc configuration
- Prerequisites: vfio-related cases require VT-d to be enabled in the BIOS and IOMMU to be enabled in the kernel
- Total 17 cases, 15 passed, 2 failed
- Failed cases: pf_rxq_on_uiopcigeneric, pf_lsc_on_uiopcigeneric

- Case: pf_lsc_igbuio_legacy
  Description: check when pf bound to igb_uio with legacy mode, link status change interrupt can be normally handled
  Command / instruction:
    Insmod igb_uio driver with legacy interrupt mode
      insmod ./x86_64-native-linuxapp-gcc/kmod/igb_uio.ko intr_mode=legacy
    Change port config to lsc enable and rxq disable in l3fwd-power/main.c
    Build l3fwd-power and start l3fwd-power with 2 ports  
      l3fwd-power -c 0x6 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Change tester port0 link down and verify link down detected on dut port0 
      Port 0: link down
    Change tester port0 link up and verify link up detected on dut port0
      Port 0: link up
    Change tester port1 link down and verify link down detected on dut port1 
      Port 1: link down
    Change tester port1 link up and verify link up detected on dut port1
      Port 1: link up
  
    Change port config to lsc enable and rxq enable in l3fwd-power/main.c
    Build l3fwd-power and start l3fwd-power with 2 ports
      l3fwd-power -c 0x6 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Verify lsc is disabled, because lsc and rxq can't be enabled at the same time when pf is bound to igb_uio
      lsc won't enable because of no intr multiplex
  
- Case: pf_lsc_igbuio_msix
  Description: check when pf bound to igb_uio with msix mode, link status change interrupt can be normally handled
  Command / instruction:
    Insmod igb_uio driver with msix interrupt mode
      insmod ./x86_64-native-linuxapp-gcc/kmod/igb_uio.ko intr_mode=msix
    Verify link status can be normally handled like previous case pf_lsc_igbuio_legacy.

- Case: pf_lsc_vfio_legacy
  Description: check when pf bound to vfio with legacy mode, link status change interrupt can be normally handled
  Command / instruction:
    Do prerequisites for vfio driver then bind device to vfio-driver
      modprobe vfio
      modprobe vfio-pci
      ./tools/dpdk_nic_bind.py --bind=vfio-pci 08:00.0 08:00.1
    Change port config to lsc enable and rxq disable in l3fwd-power/main.c
    Start l3fwd-power with vfio legacy mode
      l3fwd-power -c 0x6 -n 3 --vfio-intr=legacy -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Check link status change interrupt can be normally handled like previous case.
	
    Change port config to lsc enable and rxq enable in l3fwd-power/main.c
    Start l3fwd-power with vfio legacy mode
      l3fwd-power -c 0x6 -n 3 --vfio-intr=legacy -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Verify lsc is disabled, because lsc and rxq can't be enabled at the same time in legacy mode.
	
- Case: pf_lsc_vfio_msi
  Description: check when pf bound to vfio with msi mode, link status change interrupt can be normally handled
  Command / instruction:
    Do prerequisites for vfio driver then bind device to vfio-driver
      modprobe vfio
      modprobe vfio-pci
      ./tools/dpdk_nic_bind.py --bind=vfio-pci 08:00.0 08:00.1
    Change port config to lsc enable and rxq disable in l3fwd-power/main.c
    Start l3fwd-power with vfio msi mode
      l3fwd-power -c 0x6 -n 3 --vfio-intr=msi -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Check link status change interrupt can be normally handled like previous case.
	
    Change port config to lsc enable and rxq enable in l3fwd-power/main.c
    Start l3fwd-power with vfio msi mode
      l3fwd-power -c 0x6 -n 3 --vfio-intr=msi -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Verify lsc is disabled, because lsc and rxq can't be enabled at the same time in msi mode.

- Case: pf_lsc_vfio_msix
  Description: check when pf bound to vfio with msix mode, link status change interrupt can be normally handled
  Command / instruction:
    Do prerequisites for vfio driver then bind device to vfio-driver
      modprobe vfio
      modprobe vfio-pci
      ./tools/dpdk_nic_bind.py --bind=vfio-pci 08:00.0 08:00.1
    Change port config to lsc enable and rxq disable in l3fwd-power/main.c
    Start l3fwd-power with vfio msix mode
      l3fwd-power -c 0x6 -n 3 --vfio-intr=msix -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Check link status change interrupt can be normally handled like previous case.
	
    Change port config to lsc enable and rxq enable in l3fwd-power/main.c
    Start l3fwd-power with vfio msix mode
      l3fwd-power -c 0x6 -n 3 --vfio-intr=msix -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Check link status change interrupt can be normally handled like previous case.
	
- Case: pf_rxq_on_vfio_msix
  Description: check when pf bound to vfio with default msix mode, receive packet interrupt can be normally handled
  Command / instruction:
    Do prerequisites for vfio driver then bind device to vfio-driver
      modprobe vfio
      modprobe vfio-pci
      ./tools/dpdk_nic_bind.py --bind=vfio-pci 08:00.0 08:00.1
    Start l3fwd-power with 2 ports and 2 cores.  
      l3fwd-power -c 0x6 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Send packet from tester port0 and verify dut core1 wakeup and then sleep.
      lcore 1 is waked up from rx interrupt on port 0 queue 0
      lcore 1 sleeps until interrupt triggers
    Send packet from tester port1 and verify dut core2 wakeup and then sleep.
      lcore 2 is waked up from rx interrupt on port 1 queue 0
      lcore 2 sleeps until interrupt triggers
	  
- Case: pf_rxq_on_vfio_msi
  Description: check when pf bound to vfio with msi mode, receive packet interrupt can be normally handled
  Command / instruction:
    Do prerequisites for vfio driver then bind device to vfio-driver
      modprobe vfio
      modprobe vfio-pci
      ./tools/dpdk_nic_bind.py --bind=vfio-pci 08:00.0 08:00.1
    Start l3fwd-power with 2 ports and 2 cores.  
      l3fwd-power -c 0x6 -n 3 --vfio-intr=msi -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Verify packet interrupt can be normally handled like previous case pf_rxq_on_vfio_msix.
  
- Case: pf_rxq_on_vfio_legacy
  Description: check when pf bound to vfio with legacy mode, receive packet interrupt can be normally handled
  Command / instruction:
    Do prerequisites for vfio driver then bind device to vfio-driver
      modprobe vfio
      modprobe vfio-pci
      ./tools/dpdk_nic_bind.py --bind=vfio-pci 08:00.0 08:00.1
    Start l3fwd-power with 2 ports and 2 cores.  
      l3fwd-power -c 0x6 -n 3 --vfio-intr=legacy -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Verify packet interrupt can be normally handled like previous case pf_rxq_on_vfio_msix.

- Case: pf_onecore_on_vfio
  Description: check when all pf devices bound to one core, receive packet interrupt can be normally handled
  Command / instruction:
    Do prerequisites for vfio driver then bind device to vfio-driver
      modprobe vfio
      modprobe vfio-pci
      ./tools/dpdk_nic_bind.py --bind=vfio-pci 08:00.0 08:00.1
    Start l3fwd-power with 2 ports and 1 cores.  
      l3fwd-power -c 0x2 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,1)"
    Verify packet interrupt can be normally handled like previous case pf_rxq_on_vfio_msix.
	
- Case: pf_multiqueue_on_vfio
  Description: check when pf device has multiple queues, receive packet interrupt can be normally handled
  Command / instruction:
    Start l3fwd-power with 2 ports and 4 cores.
      l3fwd-power -c 0x100000e -n 3 -- -p 0x3 -P --config="(0,0,1),(0,1,2),(1,0,3),(1,1,24)"
    Send enough packets with different destination ip address.
	  sendp([Ether()/IP(dst="127.0.0.X")/UDP()/Raw('0'*18)], iface="p786p1")
    Verify all cores wakeup and then sleep as expected.

- Case: pf_maxqueue_on_vfio
  Description: check when pf device has maximum queues, receive packet interrupt can be normally handled
  Command / instruction:
    Start l3fwd-power with 2 ports and 32 cores [only for niantic], different nic has different maximum rx queues
      l3fwd-power -c 0x3fdfe3fdfe -n 3 -- -p 0x3 -P --config="(0,0,1),(0,1,21),(0,2,2),(0,3,22),\
          (0,4,3),(0,5,23),(0,6,24),(0,7,4),(0,8,25),(0,9,5),(0,10,26),(0,11,6),(0,12,27),(0,13,7),\
          (0,14,8),(0,15,28),(1,0,10),(1,1,30),(1,2,11),(1,3,31),(1,4,32),(1,5,12),(1,6,33),(1,7,13),\
          (1,8,34),(1,9,14),(1,10,35),(1,11,15),(1,12,16),(1,13,36),(1,14,17),(1,15,37),"
    Send enough packets with different destination ip address.
	  sendp([Ether()/IP(dst="127.0.0.X")/UDP()/Raw('0'*18)], iface="p786p1")
    Verify all cores wakeup and then sleep as expected.	
	
- Case: pf_rxq_on_igbuio_legacy
  Description: check when pf bound to igb_uio with legacy mode, receive packet interrupt can be normally handled
  Command / instruction:
    Insmod igb_uio driver with legacy interrupt mode
      insmod ./x86_64-native-linuxapp-gcc/kmod/igb_uio.ko intr_mode=legacy
      ./tools/dpdk_nic_bind.py --bind=igb_uio 08:00.0 08:00.1  
    Start l3fwd-power with 2 ports and 2 cores.  
      l3fwd-power -c 0x6 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Verify packet interrupt can be normally handled like previous case pf_rxq_on_vfio_msix.
  
- Case: pf_rxq_on_igbuio_msix
  Description: check when pf bound to igb_uio with msix mode, receive packet interrupt can be normally handled
  Command / instruction:
    Insmod igb_uio driver with msix interrupt mode
      insmod ./x86_64-native-linuxapp-gcc/kmod/igb_uio.ko intr_mode=msix
      ./tools/dpdk_nic_bind.py --bind=igb_uio 08:00.0 08:00.1  
    Start l3fwd-power with 2 ports and 2 cores.  
      l3fwd-power -c 0x6 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Verify packet interrupt can be normally handled like previous case pf_rxq_on_vfio_msix.

- Case: pf_rxq_on_uiopcigeneric
  Description: check when pf bound to uio_pci_generic, receive packet interrupt can be normally handled
  Command / instruction:
    Insmod uio_pci_generic driver and bind pf device on it.
      ./tools/dpdk_nic_bind.py --bind=uio_pci_generic 08:00.0 08:00.1  
    Start l3fwd-power with 2 ports and 2 cores.  
      l3fwd-power -c 0x6 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Verify packet interrupt can be normally handled like previous case pf_rxq_on_vfio_msix.	

- Case: pf_lsc_on_uiopcigeneric
  Description: check when pf bound to uio_pci_generic, link status changed interrupt can be normally handled
  Command / instruction:
    Insmod uio_pci_generic driver and bind pf device on it.
      ./tools/dpdk_nic_bind.py --bind=uio_pci_generic 08:00.0 08:00.1  
    Start l3fwd-power with 2 ports and 2 cores.  
      l3fwd-power -c 0x6 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Change tester port0 link down and verify link down detected on dut port0 
      Port 0: link down
    Change tester port0 link up and verify link up detected on dut port0
      Port 0: link up
    Change tester port1 link down and verify link down detected on dut port1 
      Port 1: link down
    Change tester port1 link up and verify link up detected on dut port1
      Port 1: link up
 
    Change port config to lsc enable and rxq enable in l3fwd-power/main.c
    Build l3fwd-power and start l3fwd-power with 2 ports
      l3fwd-power -c 0x6 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Verify lsc is disabled, because lsc and rxq can't be enabled at the same time when pf is bound to uio_pci_generic
      lsc won't enable because of no intr multiplex	
	
- Case: vf_in_vm_rxq
  Description: check when vf bound to igb_uio in virtual machine, receive packet interrupt can be normally handled
               Only support niantic by now.
  Command / instruction:
    Create vf devices and bind them into the virtual machine
      echo 1 > /sys/bus/pci/devices/0000\:08\:00.0/sriov_numvfs
      echo 1 > /sys/bus/pci/devices/0000\:08\:00.1/sriov_numvfs
      virsh
      virsh # nodedev-dettach pci_0000_08_10_0
      virsh # nodedev-dettach pci_0000_08_10_1
    Start virtual machine and bind vf devices to driver igb_uio.
      ./tools/dpdk_nic_bind.py --bind=igb_uio eth1 eth2
    Change port config to lsc disable and rxq enable in l3fwd-power/main.c	
    Start l3fwd-power with 2 ports and 2 cores.  
      l3fwd-power -c 0x6 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Send packet from tester port0 with promisc mac and verify vm core1 wakeup and then sleep.
      lcore 1 is waked up from rx interrupt on port 0 queue 0
      lcore 1 sleeps until interrupt triggers
    Send packet from tester port1 with promisc mac and verify vm core2 wakeup and then sleep.
      lcore 2 is waked up from rx interrupt on port 1 queue 0
      lcore 2 sleeps until interrupt triggers
	  
- Case: vf_in_host_rxq
  Description: check when vf bound to vfio with msix mode, receive packet interrupt can be normally handled
               Only support niantic by now.
  Command / instruction:
    Create vf devices and bind them to vfio
      echo 1 > /sys/bus/pci/devices/0000\:08\:00.0/sriov_numvfs
      echo 1 > /sys/bus/pci/devices/0000\:08\:00.1/sriov_numvfs
      modprobe vfio
      modprobe vfio-pci
      ./tools/dpdk_nic_bind.py --bind=vfio-pci 08:10.0 08:10.1
    Start l3fwd-power with 2 ports and 2 cores.  
      l3fwd-power -c 0x6 -n 3 -- -p 0x3 -P --config="(0,0,1),(1,0,2)"
    Send packet from tester port0 with promisc mac and verify dut core1 wakeup and then sleep.
      lcore 1 is waked up from rx interrupt on port 0 queue 0
      lcore 1 sleeps until interrupt triggers
    Send packet from tester port1 with promisc mac and verify dut core2 wakeup and then sleep.
      lcore 2 is waked up from rx interrupt on port 1 queue 0
      lcore 2 sleeps until interrupt triggers

> -----Original Message-----
> From: Liang, Cunming
> Sent: Friday, May 29, 2015 4:45 PM
> To: dev@dpdk.org
> Cc: shemming@brocade.com; david.marchand@6wind.com;
> thomas.monjalon@6wind.com; Zhou, Danny; Wang, Liang-min; Richardson, Bruce;
> Liu, Yong; nhorman@tuxdriver.com; Liang, Cunming
> Subject: [PATCH v9 01/12] eal/linux: add interrupt vectors support in
> intr_handle
> 
> The patch adds interrupt vectors support in rte_intr_handle.
> 'vec_en' is set when interrupt vectors are detected and associated event
> fds are set.
> Those event fds are stored in efds[].
> 'intr_vec' is reserved for device driver to initialize the vector mapping
> table.
> When the event fds add to a specified epoll instance, 'eptrs' will hold
> the rte_epoll_event object pointer.
> 
> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> ---
> v7 changes:
>  - add eptrs[], it's used to store the register rte_epoll_event instances.
>  - add vec_en, to log the vector capability status.
> 
> v6 changes:
>  - add mapping table between irq vector number and queue id.
> 
> v5 changes:
>  - Create this new patch file for changed struct rte_intr_handle that
>    other patches depend on, to avoid breaking git bisect.
> 
>  lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10
> ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> index bdeb3fc..9c86a15 100644
> --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> @@ -38,6 +38,8 @@
>  #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
>  #define _RTE_LINUXAPP_INTERRUPTS_H_
> 
> +#define RTE_MAX_RXTX_INTR_VEC_ID     32
> +
>  enum rte_intr_handle_type {
>  	RTE_INTR_HANDLE_UNKNOWN = 0,
>  	RTE_INTR_HANDLE_UIO,          /**< uio device handle */
> @@ -49,6 +51,8 @@ enum rte_intr_handle_type {
>  	RTE_INTR_HANDLE_MAX
>  };
> 
> +struct rte_epoll_event;
> +
>  /** Handle for interrupts. */
>  struct rte_intr_handle {
>  	union {
> @@ -58,6 +62,12 @@ struct rte_intr_handle {
>  	};
>  	int fd;	 /**< interrupt event file descriptor */
>  	enum rte_intr_handle_type type;  /**< handle type */
> +	uint32_t max_intr;               /**< max interrupt requested */
> +	uint32_t nb_efd;                 /**< number of available efds */
> +	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping
> */
> +	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
> +					 /**< intr vector epoll event ptr */
> +	int *intr_vec;                   /**< intr vector number array */
>  };
> 
>  #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
> --
> 1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v10 00/13] Interrupt mode PMD
  2015-05-29  8:45       ` [dpdk-dev] [PATCH v9 00/12] Interrupt mode PMD Cunming Liang
                           ` (11 preceding siblings ...)
  2015-05-29  8:45         ` [dpdk-dev] [PATCH v9 12/12] abi: fix v2.1 abi broken issue Cunming Liang
@ 2015-06-02  6:53         ` Cunming Liang
  2015-06-02  6:53           ` [dpdk-dev] [PATCH v10 01/13] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
                             ` (13 more replies)
  12 siblings, 14 replies; 242+ messages in thread
From: Cunming Liang @ 2015-06-02  6:53 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

v10 changes
 - code rework to return actual error code
 - bug fix for lsc when using uio_pci_generic

v9 changes
 - code rework to fix open comment
 - bug fix for igb lsc when both lsc and rxq are enabled in vfio-msix
 - new patch to turn off the feature by default so as to avoid breaking the v2.1 ABI

v8 changes
 - remove condition check for only vfio-msix
 - add multiplex intr support when only one intr vector allowed
 - lsc and rxq interrupt runtime enable decision
 - add safe event delete while the event wakeup execution happens

v7 changes
 - decouple epoll event and intr operation
 - add condition check in the case intr vector is disabled
 - renaming some APIs

v6 changes
 - split rte_intr_wait_rx_pkt into two APIs 'wait' and 'set'.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set.
 - using vector number instead of queue_id as interrupt API params.
 - patch reorder and split.

v5 changes
 - Rebase the patchset onto the HEAD
 - Isolate ethdev from EAL for new-added wait-for-rx interrupt function
 - Export wait-for-rx interrupt function for shared libraries
 - Split-off a new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect
 - Change sample application to accommodate EAL function spec change
   accordingly

v4 changes
 - Export interrupt enable/disable functions for shared libraries
 - Adjust position of new-added structure fields and functions to
   avoid breaking ABI
 
v3 changes
 - Add return value for interrupt enable/disable functions
 - Move spinlock from PMD to L3fwd-power
 - Remove unnecessary variables in e1000_mac_info
 - Fix miscellaneous review comments
 
v2 changes
 - Fix compilation issue in Makefile for missed header file.
 - Consolidate internal and community review comments of v1 patch set.
 
The patch series introduce low-latency one-shot rx interrupt into DPDK with
polling and interrupt mode switch control example.
 
DPDK userspace interrupt notification and handling mechanism is based on UIO
with below limitation:
1) It is designed to handle LSC interrupt only with inefficient suspended
   pthread wakeup procedure (e.g. UIO wakes up LSC interrupt handling thread
   which then wakes up DPDK polling thread). In this way, it introduces
   non-deterministic wakeup latency for DPDK polling thread as well as packet
   latency if it is used to handle Rx interrupt.
2) UIO only supports a single interrupt vector which has to be shared by
   LSC interrupt and interrupts assigned to dedicated rx queues.
 
This patchset includes below features:
1) Enable one-shot rx queue interrupt in ixgbe PMD(PF & VF) and igb PMD(PF only).
2) Build on top of the VFIO mechanism instead of UIO, so it could support
   up to 64 interrupt vectors for rx queue interrupts.
3) Have 1 DPDK polling thread handle per Rx queue interrupt with a dedicated
   VFIO eventfd, which eliminates non-deterministic pthread wakeup latency in
   user space.
4) Demonstrate interrupt control APIs and userspace NAPI-like polling/interrupt
   switch algorithms in L3fwd-power example.

Known limitations:
1) It does not work for UIO because a single interrupt eventfd shared by LSC
   and rx queue interrupt handlers causes a mess. [FIXED]
2) LSC interrupt is not supported by VF driver, so it is by default disabled
   in L3fwd-power now. Feel free to turn it on if you want to support both LSC
   and rx queue interrupts on a PF.

Cunming Liang (13):
  eal/linux: add interrupt vectors support in intr_handle
  eal/linux: add rte_epoll_wait/ctl support
  eal/linux: add API to set rx interrupt event monitor
  eal/linux: fix comments typo on vfio msi
  eal/linux: add interrupt vectors handling on VFIO
  eal/linux: standalone intr event fd create support
  eal/linux: fix lsc read error in uio_pci_generic
  eal/bsd: dummy for new intr definition
  ethdev: add rx intr enable, disable and ctl functions
  ixgbe: enable rx queue interrupts for both PF and VF
  igb: enable rx queue interrupts for PF
  l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode
    switch
  abi: fix v2.1 abi broken issue

 drivers/net/e1000/igb_ethdev.c                     | 311 ++++++++++--
 drivers/net/ixgbe/ixgbe_ethdev.c                   | 519 ++++++++++++++++++++-
 drivers/net/ixgbe/ixgbe_ethdev.h                   |   4 +
 examples/l3fwd-power/main.c                        | 206 ++++++--
 lib/librte_eal/bsdapp/eal/eal_interrupts.c         |  19 +
 .../bsdapp/eal/include/exec-env/rte_interrupts.h   |  81 ++++
 lib/librte_eal/bsdapp/eal/rte_eal_version.map      |   5 +
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 360 ++++++++++++--
 .../linuxapp/eal/include/exec-env/rte_interrupts.h | 219 +++++++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   8 +
 lib/librte_ether/rte_ethdev.c                      | 109 +++++
 lib/librte_ether/rte_ethdev.h                      | 132 ++++++
 lib/librte_ether/rte_ether_version.map             |   4 +
 13 files changed, 1852 insertions(+), 125 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v10 01/13] eal/linux: add interrupt vectors support in intr_handle
  2015-06-02  6:53         ` [dpdk-dev] [PATCH v10 00/13] Interrupt mode PMD Cunming Liang
@ 2015-06-02  6:53           ` Cunming Liang
  2015-06-02  6:53           ` [dpdk-dev] [PATCH v10 02/13] eal/linux: add rte_epoll_wait/ctl support Cunming Liang
                             ` (12 subsequent siblings)
  13 siblings, 0 replies; 242+ messages in thread
From: Cunming Liang @ 2015-06-02  6:53 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch adds interrupt vectors support in rte_intr_handle.
'vec_en' is set when interrupt vectors are detected and associated event fds are set.
Those event fds are stored in efds[].
'intr_vec' is reserved for device driver to initialize the vector mapping table.
When the event fds are added to a specified epoll instance, 'eptrs' will hold the rte_epoll_event object pointers.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v7 changes:
 - add eptrs[], it's used to store the register rte_epoll_event instances.
 - add vec_en, to log the vector capability status.

v6 changes:
 - add mapping table between irq vector number and queue id.

v5 changes:
 - Create this new patch file for changed struct rte_intr_handle that
   other patches depend on, to avoid breaking git bisect.

 lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index bdeb3fc..9c86a15 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -38,6 +38,8 @@
 #ifndef _RTE_LINUXAPP_INTERRUPTS_H_
 #define _RTE_LINUXAPP_INTERRUPTS_H_
 
+#define RTE_MAX_RXTX_INTR_VEC_ID     32
+
 enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_UNKNOWN = 0,
 	RTE_INTR_HANDLE_UIO,          /**< uio device handle */
@@ -49,6 +51,8 @@ enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_MAX
 };
 
+struct rte_epoll_event;
+
 /** Handle for interrupts. */
 struct rte_intr_handle {
 	union {
@@ -58,6 +62,12 @@ struct rte_intr_handle {
 	};
 	int fd;	 /**< interrupt event file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
+	uint32_t max_intr;               /**< max interrupt requested */
+	uint32_t nb_efd;                 /**< number of available efds */
+	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
+	struct rte_epoll_event *elist[RTE_MAX_RXTX_INTR_VEC_ID];
+					 /**< intr vector epoll event ptr */
+	int *intr_vec;                   /**< intr vector number array */
 };
 
 #endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */
-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 242+ messages in thread

* [dpdk-dev] [PATCH v10 02/13] eal/linux: add rte_epoll_wait/ctl support
  2015-06-02  6:53         ` [dpdk-dev] [PATCH v10 00/13] Interrupt mode PMD Cunming Liang
  2015-06-02  6:53           ` [dpdk-dev] [PATCH v10 01/13] eal/linux: add interrupt vectors support in intr_handle Cunming Liang
@ 2015-06-02  6:53           ` Cunming Liang
  2015-06-02 16:21             ` Stephen Hemminger
  2015-06-02  6:53           ` [dpdk-dev] [PATCH v10 03/13] eal/linux: add API to set rx interrupt event monitor Cunming Liang
                             ` (11 subsequent siblings)
  13 siblings, 1 reply; 242+ messages in thread
From: Cunming Liang @ 2015-06-02  6:53 UTC (permalink / raw)
  To: dev; +Cc: shemming, liang-min.wang

The patch adds 'rte_epoll_wait' and 'rte_epoll_ctl' for async event wakeup.
It defines 'struct rte_epoll_event' as the event param.
The 'op' uses the same enum as epoll_wait/ctl does.
The epoll event supports carrying raw user data and registering a callback which is executed during wakeup.

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
---
v9 changes
 - rework on coding style

v8 changes
 - support delete event in safety during the wakeup execution
 - add EINTR process during epoll_wait

v7 changes
 - split v6[4/8] into two patches, one for epoll event(this one)
   another for rx intr(next patch)
 - introduce rte_epoll_event definition
 - rte_epoll_wait/ctl for more generic RTE epoll API

v6 changes
 - split rte_intr_wait_rx_pkt into two function, wait and set.
 - rewrite rte_intr_rx_wait/rte_intr_rx_set to remove queue visibility on eal.
 - rte_intr_rx_wait to support multiplexing.
 - allow epfd as input to support flexible event fd combination.

 lib/librte_eal/linuxapp/eal/eal_interrupts.c       | 137 +++++++++++++++++++++
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |  82 +++++++++++-
 lib/librte_eal/linuxapp/eal/rte_eal_version.map    |   3 +
 3 files changed, 219 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 3a84b3c..2f56000 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -69,6 +69,8 @@
 
 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
 
+static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
+
 /**
  * union for pipe fds.
  */
@@ -895,3 +897,138 @@ rte_eal_intr_init(void)
 	return -ret;
 }
 
+static int
+eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
+			struct rte_epoll_event *events)
+{
+	unsigned int i, count = 0;
+	struct rte_epoll_event *rev;
+
+	for (i = 0; i < n; i++) {
+		rev = evs[i].data.ptr;
+		if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
+						 RTE_EPOLL_EXEC))
+			continue;
+
+		events[count].status        = RTE_EPOLL_VALID;
+		events[count].fd            = rev->fd;
+		events[count].epfd          = rev->epfd;
+		events[count].epdata.event  = rev->epdata.event;
+		events[count].epdata.data   = rev->epdata.data;
+		if (rev->epdata.cb_fun)
+			rev->epdata.cb_fun(rev->fd,
+					   rev->epdata.cb_arg);
+
+		rte_compiler_barrier();
+		rev->status = RTE_EPOLL_VALID;
+		count++;
+	}
+	return count;
+}
+
+static inline int
+eal_init_tls_epfd(void)
+{
+	int pfd = epoll_create(255);
+	if (pfd < 0) {
+		RTE_LOG(ERR, EAL,
+			"Cannot create epoll instance\n");
+		return -1;
+	}
+	return pfd;
+}
+
+int
+rte_intr_tls_epfd(void)
+{
+	if (RTE_PER_LCORE(_epfd) == -1)
+		RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
+
+	return RTE_PER_LCORE(_epfd);
+}
+
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout)
+{
+	struct epoll_event evs[maxevents];
+	int rc;
+
+	if (!events) {
+		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+		return -1;
+	}
+
+	/* using per thread epoll fd */
+	if (epfd == RTE_EPOLL_PER_THREAD)
+		epfd = rte_intr_tls_epfd();
+
+	while (1) {
+		rc = epoll_wait(epfd, evs, maxevents, timeout);
+		if (likely(rc > 0)) {
+			/* epoll_wait has at least one fd ready to read */
+			rc = eal_epoll_process_event(evs, rc, events);
+			break;
+		} else if (rc < 0) {
+			if (errno == EINTR)
+				continue;
+			/* epoll_wait fail */
+			RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
+				strerror(errno));
+			rc = -1;
+			break;
+		}
+	}
+
+	return rc;
+}
+
+static inline void
+eal_epoll_data_safe_free(struct rte_epoll_event *ev)
+{
+	while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
+				    RTE_EPOLL_INVALID))
+		while (ev->status != RTE_EPOLL_VALID)
+			rte_pause();
+	memset(&ev->epdata, 0, sizeof(ev->epdata));
+	ev->fd = -1;
+	ev->epfd = -1;
+}
+
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+	      struct rte_epoll_event *event)
+{
+	struct epoll_event ev;
+
+	if (!event) {
+		RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+		return -1;
+	}
+
+	/* using per thread epoll fd */
+	if (epfd == RTE_EPOLL_PER_THREAD)
+		epfd = rte_intr_tls_epfd();
+
+	if (op == EPOLL_CTL_ADD) {
+		event->status = RTE_EPOLL_VALID;
+		event->fd = fd;  /* ignore fd in event */
+		event->epfd = epfd;
+		ev.data.ptr = (void *)event;
+	}
+
+	ev.events = event->epdata.event;
+	if (epoll_ctl(epfd, op, fd, &ev) < 0) {
+		RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
+			op, fd, strerror(errno));
+		if (op == EPOLL_CTL_ADD)
+			/* rollback status when CTL_ADD fail */
+			event->status = RTE_EPOLL_INVALID;
+		return -1;
+	}
+
+	if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
+		eal_epoll_data_safe_free(event);
+
+	return 0;
+}
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 9c86a15..7c21060 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -51,7 +51,31 @@ enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_MAX
 };
 
-struct rte_epoll_event;
+#define RTE_INTR_EVENT_ADD            1UL
+#define	RTE_INTR_EVENT_DEL            2UL
+
+typedef void (*rte_intr_event_cb_t)(int fd, void *arg);
+
+struct rte_epoll_data {
+	uint32_t event;               /**< event type */
+	void *data;                   /**< User data */
+	rte_intr_event_cb_t cb_fun;   /**< IN: callback fun */
+	void *cb_arg;	              /**< IN: callback arg */
+};
+
+enum {
+	RTE_EPOLL_INVALID = 0,
+	RTE_EPOLL_VALID,
+	RTE_EPOLL_EXEC,
+};
+
+/** interrupt epoll event obj, taken by epoll_event.ptr */
+struct rte_epoll_event {
+	volatile uint32_t stat