DPDK patches and discussions
 help / color / mirror / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download: 
* [dpdk-dev] [PATCH v5 04/11] ixgbe: replace rte_panic instances in ixgbe driver
    2018-04-23 21:28  3% ` [dpdk-dev] [PATCH v5 02/11] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
  2018-04-23 21:28  3% ` [dpdk-dev] [PATCH v5 03/11] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
@ 2018-04-23 21:28  3% ` Arnon Warshavsky
  2 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-23 21:28 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and return value.
Local function to this file,
changing from void to int is non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/ixgbe/ixgbe_ethdev.c |  6 ++++--
 drivers/net/ixgbe/ixgbe_ethdev.h |  2 +-
 drivers/net/ixgbe/ixgbe_pf.c     | 15 ++++++++++-----
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index a5e2fc0..fb95cc7 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -1061,7 +1061,7 @@ struct rte_ixgbe_xstats_name_off {
 		IXGBE_DEV_PRIVATE_TO_BW_CONF(eth_dev->data->dev_private);
 	uint32_t ctrl_ext;
 	uint16_t csum;
-	int diag, i;
+	int diag, i, error;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -1224,7 +1224,9 @@ struct rte_ixgbe_xstats_name_off {
 	memset(hwstrip, 0, sizeof(*hwstrip));
 
 	/* initialize PF if max_vfs not zero */
-	ixgbe_pf_host_init(eth_dev);
+	error = ixgbe_pf_host_init(eth_dev);
+	if (error != 0)
+		return error;
 
 	ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT);
 	/* let hardware know driver is loaded */
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index 6550777..8bb41ec 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -661,7 +661,7 @@ int ixgbe_fdir_filter_program(struct rte_eth_dev *dev,
 
 void ixgbe_vlan_hw_strip_config(struct rte_eth_dev *dev);
 
-void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
+int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
 
 void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev);
 
diff --git a/drivers/net/ixgbe/ixgbe_pf.c b/drivers/net/ixgbe/ixgbe_pf.c
index 4e61310..1e58750 100644
--- a/drivers/net/ixgbe/ixgbe_pf.c
+++ b/drivers/net/ixgbe/ixgbe_pf.c
@@ -66,7 +66,7 @@ int ixgbe_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
 	return 0;
 }
 
-void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
+int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 {
 	struct ixgbe_vf_info **vfinfo =
 		IXGBE_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
@@ -84,11 +84,14 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
 	vf_num = dev_num_vf(eth_dev);
 	if (vf_num == 0)
-		return;
+		return 0;
 
 	*vfinfo = rte_zmalloc("vf_info", sizeof(struct ixgbe_vf_info) * vf_num, 0);
-	if (*vfinfo == NULL)
-		rte_panic("Cannot allocate memory for private VF data\n");
+	if (*vfinfo == NULL) {
+		RTE_LOG(ERR, PMD, "%s() Cannot allocate memory for private VF data\n",
+				__func__);
+		return -ENOMEM;
+	}
 
 	memset(mirror_info, 0, sizeof(struct ixgbe_mirror_info));
 	memset(uta_info, 0, sizeof(struct ixgbe_uta_info));
@@ -116,6 +119,8 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 
 	/* set mb interrupt mask */
 	ixgbe_mb_intr_setup(eth_dev);
+
+	return 0;
 }
 
 void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev)
@@ -203,7 +208,7 @@ int ixgbe_pf_host_configure(struct rte_eth_dev *eth_dev)
 
 	vf_num = dev_num_vf(eth_dev);
 	if (vf_num == 0)
-		return -1;
+		return -ENOMEM;
 
 	/* enable VMDq and set the default pool for PF */
 	vtctl = IXGBE_READ_REG(hw, IXGBE_VT_CTL);
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v5 03/11] e1000: replace rte_panic instances in e1000 driver
    2018-04-23 21:28  3% ` [dpdk-dev] [PATCH v5 02/11] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
@ 2018-04-23 21:28  3% ` Arnon Warshavsky
  2018-04-23 21:28  3% ` [dpdk-dev] [PATCH v5 04/11] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
  2 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-23 21:28 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and return value.
Local function to this file,
changing from void to int is non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/e1000/e1000_ethdev.h |  2 +-
 drivers/net/e1000/igb_ethdev.c   |  4 +++-
 drivers/net/e1000/igb_pf.c       | 15 +++++++++------
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 6354b89..2e527de 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -411,7 +411,7 @@ int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
 /*
  * misc function prototypes
  */
-void igb_pf_host_init(struct rte_eth_dev *eth_dev);
+int igb_pf_host_init(struct rte_eth_dev *eth_dev);
 
 void igb_pf_mbx_process(struct rte_eth_dev *eth_dev);
 
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 9b808a9..67a32a2 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -833,7 +833,9 @@ static int igb_flex_filter_uninit(struct rte_eth_dev *eth_dev)
 	}
 
 	/* initialize PF if max_vfs not zero */
-	igb_pf_host_init(eth_dev);
+	error = igb_pf_host_init(eth_dev);
+	if (error != 0)
+		goto err_late;
 
 	ctrl_ext = E1000_READ_REG(hw, E1000_CTRL_EXT);
 	/* Set PF Reset Done bit so PF/VF Mail Ops can work */
diff --git a/drivers/net/e1000/igb_pf.c b/drivers/net/e1000/igb_pf.c
index b9f2e53..1a53531 100644
--- a/drivers/net/e1000/igb_pf.c
+++ b/drivers/net/e1000/igb_pf.c
@@ -63,7 +63,7 @@ int igb_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
 	return 0;
 }
 
-void igb_pf_host_init(struct rte_eth_dev *eth_dev)
+int igb_pf_host_init(struct rte_eth_dev *eth_dev)
 {
 	struct e1000_vf_info **vfinfo =
 		E1000_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
@@ -74,7 +74,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 
 	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
 	if (0 == (vf_num = dev_num_vf(eth_dev)))
-		return;
+		return 0;
 
 	if (hw->mac.type == e1000_i350)
 		nb_queue = 1;
@@ -82,11 +82,14 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 		/* per datasheet, it should be 2, but 1 seems correct */
 		nb_queue = 1;
 	else
-		return;
+		return 0;
 
 	*vfinfo = rte_zmalloc("vf_info", sizeof(struct e1000_vf_info) * vf_num, 0);
-	if (*vfinfo == NULL)
-		rte_panic("Cannot allocate memory for private VF data\n");
+	if (*vfinfo == NULL) {
+		RTE_LOG(CRIT, PMD, "%s(): Cannot allocate memory for private VF data\n",
+			__func__);
+		return -ENOMEM;
+	}
 
 	RTE_ETH_DEV_SRIOV(eth_dev).active = ETH_8_POOLS;
 	RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool = nb_queue;
@@ -98,7 +101,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 	/* set mb interrupt mask */
 	igb_mb_intr_setup(eth_dev);
 
-	return;
+	return 0;
 }
 
 void igb_pf_host_uninit(struct rte_eth_dev *dev)
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v5 02/11] bond: replace rte_panic instances in bonding driver
  @ 2018-04-23 21:28  3% ` Arnon Warshavsky
  2018-04-23 21:28  3% ` [dpdk-dev] [PATCH v5 03/11] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
  2018-04-23 21:28  3% ` [dpdk-dev] [PATCH v5 04/11] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
  2 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-23 21:28 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and return value.
Local functions to this file,
changing from void to int are non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/bonding/rte_eth_bond_8023ad.c         | 29 ++++++++++++++---------
 drivers/net/bonding/rte_eth_bond_8023ad_private.h |  2 +-
 drivers/net/bonding/rte_eth_bond_api.c            | 22 ++++++++++++-----
 drivers/net/bonding/rte_eth_bond_pmd.c            |  9 ++++---
 drivers/net/bonding/rte_eth_bond_private.h        |  2 +-
 5 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c b/drivers/net/bonding/rte_eth_bond_8023ad.c
index c452318..2fb6cad 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad.c
+++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
@@ -893,7 +893,7 @@
 			bond_mode_8023ad_periodic_cb, arg);
 }
 
-void
+int
 bond_mode_8023ad_activate_slave(struct rte_eth_dev *bond_dev,
 				uint16_t slave_id)
 {
@@ -939,7 +939,7 @@
 	timer_cancel(&port->warning_timer);
 
 	if (port->mbuf_pool != NULL)
-		return;
+		return 0;
 
 	RTE_ASSERT(port->rx_ring == NULL);
 	RTE_ASSERT(port->tx_ring == NULL);
@@ -968,8 +968,9 @@
 	/* Any memory allocation failure in initialization is critical because
 	 * resources can't be free, so reinitialization is impossible. */
 	if (port->mbuf_pool == NULL) {
-		rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
-			slave_id, mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create memory pool '%s': %s\n",
+			__func__, slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
 
 	snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_rx", slave_id);
@@ -977,8 +978,9 @@
 			rte_align32pow2(BOND_MODE_8023AX_SLAVE_RX_PKTS), socket_id, 0);
 
 	if (port->rx_ring == NULL) {
-		rte_panic("Slave %u: Failed to create rx ring '%s': %s\n", slave_id,
-			mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create rx ring '%s': %s\n",
+			__func__, slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
 
 	/* TX ring is at least one pkt longer to make room for marker packet. */
@@ -987,9 +989,12 @@
 			rte_align32pow2(BOND_MODE_8023AX_SLAVE_TX_PKTS + 1), socket_id, 0);
 
 	if (port->tx_ring == NULL) {
-		rte_panic("Slave %u: Failed to create tx ring '%s': %s\n", slave_id,
-			mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Fail to create tx ring '%s': %s\n",
+			__func__, slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
+
+	return 0;
 }
 
 int
@@ -1143,9 +1148,11 @@
 	struct bond_dev_private *internals = bond_dev->data->dev_private;
 	uint8_t i;
 
-	for (i = 0; i < internals->active_slave_count; i++)
-		bond_mode_8023ad_activate_slave(bond_dev,
-				internals->active_slaves[i]);
+	for (i = 0; i < internals->active_slave_count; i++) {
+		if (!bond_mode_8023ad_activate_slave(bond_dev,
+						internals->active_slaves[i]))
+			return -1;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/bonding/rte_eth_bond_8023ad_private.h b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
index 0f490a5..96a42f2 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad_private.h
+++ b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
@@ -263,7 +263,7 @@ struct mode8023ad_private {
  * @return
  *  0 on success, negative value otherwise.
  */
-void
+int
 bond_mode_8023ad_activate_slave(struct rte_eth_dev *dev, uint16_t port_id);
 
 /**
diff --git a/drivers/net/bonding/rte_eth_bond_api.c b/drivers/net/bonding/rte_eth_bond_api.c
index aa89425..657fd74 100644
--- a/drivers/net/bonding/rte_eth_bond_api.c
+++ b/drivers/net/bonding/rte_eth_bond_api.c
@@ -69,14 +69,15 @@
 	return 0;
 }
 
-void
+int
 activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id)
 {
 	struct bond_dev_private *internals = eth_dev->data->dev_private;
 	uint8_t active_count = internals->active_slave_count;
 
 	if (internals->mode == BONDING_MODE_8023AD)
-		bond_mode_8023ad_activate_slave(eth_dev, port_id);
+		if (bond_mode_8023ad_activate_slave(eth_dev, port_id) != 0)
+			return -1;
 
 	if (internals->mode == BONDING_MODE_TLB
 			|| internals->mode == BONDING_MODE_ALB) {
@@ -94,6 +95,8 @@
 		bond_tlb_activate_slave(internals);
 	if (internals->mode == BONDING_MODE_ALB)
 		bond_mode_alb_client_list_upd(eth_dev);
+
+	return 0;
 }
 
 void
@@ -357,10 +360,17 @@
 				bond_ethdev_primary_set(internals,
 							slave_port_id);
 
-			if (find_slave_by_id(internals->active_slaves,
-					     internals->active_slave_count,
-					     slave_port_id) == internals->active_slave_count)
-				activate_slave(bonded_eth_dev, slave_port_id);
+			int rc =
+				find_slave_by_id(internals->active_slaves,
+					internals->active_slave_count,
+					slave_port_id);
+
+			if (rc == internals->active_slave_count) {
+				int rc = activate_slave(bonded_eth_dev,
+							slave_port_id);
+				if (rc != 0)
+					return -1;
+			}
 		}
 	}
 
diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c b/drivers/net/bonding/rte_eth_bond_pmd.c
index 2805c71..2d9052d 100644
--- a/drivers/net/bonding/rte_eth_bond_pmd.c
+++ b/drivers/net/bonding/rte_eth_bond_pmd.c
@@ -1741,8 +1741,10 @@ struct bwg_slave {
 		/* Any memory allocation failure in initialization is critical because
 		 * resources can't be free, so reinitialization is impossible. */
 		if (port->slow_pool == NULL) {
-			rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
-				slave_id, mem_name, rte_strerror(rte_errno));
+			RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create memory pool '%s': %s\n",
+				__func__, slave_id,
+				mem_name, rte_strerror(rte_errno));
+			return -1;
 		}
 	}
 
@@ -2673,7 +2675,8 @@ struct bwg_slave {
 			mac_address_slaves_update(bonded_eth_dev);
 		}
 
-		activate_slave(bonded_eth_dev, port_id);
+		if (activate_slave(bonded_eth_dev, port_id) != 0)
+			return -1;
 
 		/* If user has defined the primary port then default to using it */
 		if (internals->user_defined_primary_port &&
diff --git a/drivers/net/bonding/rte_eth_bond_private.h b/drivers/net/bonding/rte_eth_bond_private.h
index 94eca88..d99d42c 100644
--- a/drivers/net/bonding/rte_eth_bond_private.h
+++ b/drivers/net/bonding/rte_eth_bond_private.h
@@ -187,7 +187,7 @@ struct bond_dev_private {
 void
 deactivate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
 
-void
+int
 activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
 
 void
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v3 1/2] mbuf: support attaching external buffer to mbuf
  @ 2018-04-23 16:18  4%   ` Olivier Matz
  0 siblings, 0 replies; 200+ results
From: Olivier Matz @ 2018-04-23 16:18 UTC (permalink / raw)
  To: Yongseok Koh
  Cc: wenzhuo.lu, jingjing.wu, dev, konstantin.ananyev,
	adrien.mazarguil, nelio.laranjeiro

Hi Yongseok,

Please see some comments below.

On Wed, Apr 18, 2018 at 06:11:04PM -0700, Yongseok Koh wrote:
> This patch introduces a new way of attaching an external buffer to a mbuf.
> 
> Attaching an external buffer is quite similar to mbuf indirection in
> replacing buffer addresses and length of a mbuf, but a few differences:
>   - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
>     mbuf must be read-only. But external buffer has its own refcnt and it
>     starts from 1. Unless multiple mbufs are attached to a mbuf having an
>     external buffer, the external buffer is writable.

I'm wondering if "As refcnt of a direct mbuf is at least 2" should be
clarified. I guess we are talking about a direct mbuf that has another one
attached too.

I'm also not sure if I understand properly: to me, it is possible to have
an indirect mbuf that references a direct mbuf with a refcount of 1:
  m = rte_pktmbuf_alloc()
  mi = rte_pktmbuf_alloc()
  rte_pktmbuf_attach(mi, m)
  rte_pktmbuf_free(m)

>   - There's no need to allocate buffer from a mempool. Any buffer can be
>     attached with appropriate free callback.
>   - Smaller metadata is required to maintain shared data such as refcnt.
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>

[...]

> +/**
> + * Function typedef of callback to free externally attached buffer.
> + */
> +typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque);
> +
> +/**
> + * Shared data at the end of an external buffer.
> + */
> +struct rte_mbuf_ext_shared_info {
> +	rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */
> +	void *fcb_opaque;                        /**< Free callback argument */
> +	RTE_STD_C11
> +	union {
> +		rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
> +		uint16_t refcnt;          /**< Non-atomically accessed refcnt */

It looks that only refcnt_atomic is used.
I don't know if we really need the non-atomic one yet.


> +	};
> +};
> +
>  /**< Maximum number of nb_segs allowed. */
>  #define RTE_MBUF_MAX_NB_SEGS	UINT16_MAX
>  
> @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md)
>  #define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
>  
>  /**
> + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise.
> + */
> +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
> +
> +/**
>   * Returns TRUE if given mbuf is direct, or FALSE otherwise.
>   */
> -#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
> +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb))

I'm a bit reticent to have RTE_MBUF_DIRECT(m) different of
!RTE_MBUF_INDIRECT(m), I feel it's not very natural.

What about:
- direct = embeds its own data
- clone (or another name) = data is another mbuf
- extbuf = data is in an external buffer


>  /**
>   * Private data in case of pktmbuf pool.
> @@ -821,6 +844,58 @@ rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value)
>  
>  #endif /* RTE_MBUF_REFCNT_ATOMIC */
>  
> +/**
> + * Reads the refcnt of an external buffer.
> + *
> + * @param shinfo
> + *   Shared data of the external buffer.
> + * @return
> + *   Reference count number.
> + */
> +static inline uint16_t
> +rte_extbuf_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo)

What do you think about rte_mbuf_ext_refcnt_read() to keep name consistency?
(same for other functions below)

[...]

> @@ -1195,11 +1270,120 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
>  }
>  
>  /**
> + * Return shared data of external buffer of a mbuf.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + * @return
> + *   The address of the shared data.
> + */
> +static inline struct rte_mbuf_ext_shared_info *
> +rte_mbuf_ext_shinfo(struct rte_mbuf *m)
> +{
> +	return (struct rte_mbuf_ext_shared_info *)
> +		RTE_PTR_ADD(m->buf_addr, m->buf_len);
> +}

This forces to have the shared data at the end of the buffer. Is it
always possible? I think there are use-cases where the user may want to
specify another location for it.

For instance, an application mmaps a big file (locked in memory), and
wants to send mbufs pointing to this data without doing any copy.

Maybe adding a m->shinfo field would be a better choice, what do you
think?

This would certainly break the ABI, but I wonder if that patch does
not already break it. I mean, how would react an application compiled
for 18.02 if an EXTBUF is passed to it, knowing that many functions
are inline?


> +/**
> + * Attach an external buffer to a mbuf.
> + *
> + * User-managed anonymous buffer can be attached to an mbuf. When attaching
> + * it, corresponding free callback function and its argument should be
> + * provided. This callback function will be called once all the mbufs are
> + * detached from the buffer.
> + *
> + * More mbufs can be attached to the same external buffer by
> + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
> + * this API.
> + *
> + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
> + * ``rte_pktmbuf_detach()``.
> + *
> + * A few bytes in the trailer of the provided buffer will be dedicated for
> + * shared data (``struct rte_mbuf_ext_shared_info``) to store refcnt,
> + * callback function and so on. The shared data can be referenced by
> + * ``rte_mbuf_ext_shinfo()``
> + *
> + * Attaching an external buffer is quite similar to mbuf indirection in
> + * replacing buffer addresses and length of a mbuf, but a few differences:
> + * - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
> + *   mbuf must be read-only. But external buffer has its own refcnt and it
> + *   starts from 1. Unless multiple mbufs are attached to a mbuf having an
> + *   external buffer, the external buffer is writable.
> + * - There's no need to allocate buffer from a mempool. Any buffer can be
> + *   attached with appropriate free callback.
> + * - Smaller metadata is required to maintain shared data such as refcnt.
> + *
> + * @warning
> + * @b EXPERIMENTAL: This API may change without prior notice.
> + * Once external buffer is enabled by allowing experimental API,
> + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
> + * exclusive. A mbuf can be consiered direct if it is neither indirect nor

small typo:
consiered -> considered

> + * having external buffer.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + * @param buf_addr
> + *   The pointer to the external buffer we're attaching to.
> + * @param buf_len
> + *   The size of the external buffer we're attaching to. This must be larger
> + *   than the size of ``struct rte_mbuf_ext_shared_info`` and padding for
> + *   alignment. If not enough, this function will return NULL.
> + * @param free_cb
> + *   Free callback function to call when the external buffer needs to be freed.
> + * @param fcb_opaque
> + *   Argument for the free callback function.
> + * @return
> + *   A pointer to the new start of the data on success, return NULL otherwise.
> + */
> +static inline char * __rte_experimental
> +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> +	uint16_t buf_len, rte_mbuf_extbuf_free_callback_t free_cb,
> +	void *fcb_opaque)
> +{
> +	void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
> +	struct rte_mbuf_ext_shared_info *shinfo;
> +
> +	shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)),
> +			sizeof(uintptr_t));
> +
> +	if ((void *)shinfo <= buf_addr)
> +		return NULL;
> +
> +	m->buf_addr = buf_addr;
> +	m->buf_iova = rte_mempool_virt2iova(buf_addr);

Agree with Konstantin's comment. I think buf_iova should be an argument
of the function.


> +	m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);

Related to what I said above: I think m->buf_len should be set to
the buf_len argument, so a user can point to existing read-only
data.

[...]

Few more comments:

I think we still need to find a good way to advertise to the users
if a mbuf is writable or readable. Today, the rules are quite implicit.
There are surely some use cases where the mbuf is indirect but with
only one active user, meaning it could be READ-WRITE. We could target
18.08 for this.

One side question about your implementation in mlx. I guess the
hardware will write the mbuf data in a big contiguous buffer like
this:

+-------+--------------+--------+--------------+--------+- - -
|       |mbuf1 data    |        |mbuf2 data    |        |
|       |              |        |              |        |
+-------+--------------+--------+--------------+--------+- - -

Which will be transformed in:

+--+----+--------------+---+----+--------------+---+---+- - -
|  |head|mbuf1 data    |sh |head|mbuf2 data    |sh |   |
|  |room|              |inf|room|              |inf|   |
+--+----+--------------+---+----+--------------+---+---+- - -

So, there is one shinfo (i.e one refcount) for each mbuf.
How do you know when the big buffer is not used anymore?


To summarize, I like the idea of your patchset, this is close to
what I had in mind... which does not necessarly mean it is the good
way to do ;)

I'm a bit afraid about ABI breakage, we need to check that a
18.02-compiled application still works well with this change.

About testing, I don't know if you checked the mbuf autotests,
but it could also help to check that basic stuff still work.


Thanks,
Olivier

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v5 2/4] ethdev: Add group JUMP action
  @ 2018-04-23 15:56  2%     ` Declan Doherty
  0 siblings, 0 replies; 200+ results
From: Declan Doherty @ 2018-04-23 15:56 UTC (permalink / raw)
  To: dev
  Cc: Alex Rosenbaum, Ferruh Yigit, Thomas Monjalon, Shahaf Shuler,
	Qi Zhang, Alejandro Lucero, Andrew Rybchenko,
	Mohammad Abdul Awal, Remy Horton, John McNamara, Rony Efraim,
	Jingjing Wu, Wenzhuo Lu, Vincent Jardin, Yuanhan Liu,
	Bruce Richardson, Konstantin Ananyev, Declan Doherty

Add jump action type which defines an action which allows a matched
flow to be redirect to the specified group. This allows physical and
logical flow table/group hierarchies to be defined through rte_flow.

This breaks ABI compatibility for the following public functions (as it
modifes the ordering of the rte_flow_action_type enumeration):

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Add support for specification of new JUMP action to testpmd's flow
cli, and update the testpmd documentation to describe this new
action.

Signed-off-by: Declan Doherty <declan.doherty@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 23 +++++++++++
 doc/guides/prog_guide/rte_flow.rst          | 61 ++++++++++++++++++++++++-----
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 ++
 lib/librte_ether/rte_flow.h                 | 41 +++++++++++++++----
 4 files changed, 112 insertions(+), 17 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 32fe6645a..8c570baf4 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -161,6 +161,8 @@ enum index {
 	ACTION_END,
 	ACTION_VOID,
 	ACTION_PASSTHRU,
+	ACTION_JUMP,
+	ACTION_JUMP_GROUP,
 	ACTION_MARK,
 	ACTION_MARK_ID,
 	ACTION_FLAG,
@@ -630,6 +632,7 @@ static const enum index next_action[] = {
 	ACTION_END,
 	ACTION_VOID,
 	ACTION_PASSTHRU,
+	ACTION_JUMP,
 	ACTION_MARK,
 	ACTION_FLAG,
 	ACTION_QUEUE,
@@ -694,6 +697,12 @@ static const enum index action_meter[] = {
 	ZERO,
 };
 
+static const enum index action_jump[] = {
+	ACTION_JUMP_GROUP,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static int parse_init(struct context *, const struct token *,
 		      const char *, unsigned int,
 		      void *, unsigned int);
@@ -1593,6 +1602,20 @@ static const struct token token_list[] = {
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
+	[ACTION_JUMP] = {
+		.name = "jump",
+		.help = "redirect traffic to a given group",
+		.priv = PRIV_ACTION(JUMP, sizeof(struct rte_flow_action_jump)),
+		.next = NEXT(action_jump),
+		.call = parse_vc,
+	},
+	[ACTION_JUMP_GROUP] = {
+		.name = "group",
+		.help = "group to redirect traffic to",
+		.next = NEXT(action_jump, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_jump, group)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_MARK] = {
 		.name = "mark",
 		.help = "attach 32 bit value to packets",
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 20625c43e..837f30667 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -90,8 +90,12 @@ Thus predictable results for a given priority level can only be achieved
 with non-overlapping rules, using perfect matching on all protocol layers.
 
 Flow rules can also be grouped, the flow rule priority is specific to the
-group they belong to. All flow rules in a given group are thus processed
-either before or after another group.
+group they belong to. All flow rules in a given group are thus processed within
+the context of that group. Groups are not linked by default, so the logical
+hierarchy of groups must be explicitly defined by flow rules themselves in each
+group using the JUMP action to define the next group to redirect too. Only flow
+rules defined in the default group 0 are guarantee to be matched against, this
+makes group 0 the origin of any group hierarchy defined by an application. 
 
 Support for multiple actions per rule may be implemented internally on top
 of non-default hardware priorities, as a result both features may not be
@@ -138,29 +142,34 @@ Attributes
 Attribute: Group
 ^^^^^^^^^^^^^^^^
 
-Flow rules can be grouped by assigning them a common group number. Lower
-values have higher priority. Group 0 has the highest priority.
+Flow rules can be grouped by assigning them a common group number. Groups
+allow a logical hierarchy of flow rule groups (tables) to be defined. These
+groups can be supported virtually in the PMD or in the physical device. 
+Group 0 is the default group and this is the only group which flows are
+guarantee to matched against, all subsequent groups can only be reached by
+way of the JUMP action from a matched flow rule.
 
 Although optional, applications are encouraged to group similar rules as
 much as possible to fully take advantage of hardware capabilities
 (e.g. optimized matching) and work around limitations (e.g. a single pattern
-type possibly allowed in a given group).
+type possibly allowed in a given group), while being aware that the groups
+hierarchies must be programmed explicitly.
 
 Note that support for more than a single group is not guaranteed.
 
 Attribute: Priority
 ^^^^^^^^^^^^^^^^^^^
 
-A priority level can be assigned to a flow rule. Like groups, lower values
+A priority level can be assigned to a flow rule, lower values
 denote higher priority, with 0 as the maximum.
 
-A rule with priority 0 in group 8 is always matched after a rule with
-priority 8 in group 0.
-
-Group and priority levels are arbitrary and up to the application, they do
+Priority levels are arbitrary and up to the application, they do
 not need to be contiguous nor start from 0, however the maximum number
 varies between devices and may be affected by existing flow rules.
 
+A flow which matches multiple rules in the same group will always matched by
+the rule with the highest priority in that group.
+
 If a packet is matched by several rules of a given group for a given
 priority level, the outcome is undefined. It can take any path, may be
 duplicated or even cause unrecoverable errors.
@@ -1248,6 +1257,38 @@ flow rules:
    | 2     | END                        |
    +-------+----------------------------+
 
+Action: ``JUMP``
+^^^^^^^^^^^^^^^^
+
+Redirects packets to a group on the current device.
+
+In a hierarchy of groups, which can be used to represent physical or logical
+flow group/tables on the device, this action redirects the matched flow to
+the specified group on that device.
+
+If a matched flow is redirected to a table which doesn't contain a matching
+rule for that flow then the behavior is undefined and the resulting behavior
+is up to the specific device. Best practice when using groups would be define
+a default flow rule for each group which a defines the default actions in that
+group so a consistent behavior is defined.
+
+Defining an action for matched flow in a group to jump to a group which is
+higher in the group hierarchy may not be supported by physical devices, 
+depending on how groups are mapped to the physical devices. In the 
+definitions of jump actions, applications should be aware that it may be
+possible to define flow rules which trigger an undefined behavior causing
+flows to loop between groups.
+
+.. _table_rte_flow_action_jump:
+
+.. table:: JUMP
+
+   +-----------+------------------------------+
+   | Field     | Value                        |
+   +===========+==============================+
+   | ``group`` | Group to redirect packets to |
+   +-----------+------------------------------+
+
 Action: ``MARK``
 ^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index bfb5ad027..eec3cee4e 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3387,6 +3387,10 @@ This section lists supported actions and their attributes, if any.
 
 - ``passthru``: let subsequent rule process matched packets.
 
+- ``jump``: redirect traffic to group on device.
+
+  - ``group {unsigned}``: group to redirect to.
+
 - ``mark``: attach 32 bit value to packets.
 
   - ``id {unsigned}``: 32 bit value to return with packets.
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index a51f76e72..0372bd49f 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -35,18 +35,20 @@ extern "C" {
 /**
  * Flow rule attributes.
  *
- * Priorities are set on two levels: per group and per rule within groups.
+ * Priorities are set on a per rule based within groups.
  *
- * Lower values denote higher priority, the highest priority for both levels
- * is 0, so that a rule with priority 0 in group 8 is always matched after a
- * rule with priority 8 in group 0.
+ * Lower values denote higher priority, the highest priority for a flow rule
+ * is 0, so that a flow that matches for than one rule, the rule with the
+ * lowest priority value will always be matched.
  *
  * Although optional, applications are encouraged to group similar rules as
  * much as possible to fully take advantage of hardware capabilities
  * (e.g. optimized matching) and work around limitations (e.g. a single
- * pattern type possibly allowed in a given group).
+ * pattern type possibly allowed in a given group). Applications should be
+ * aware that groups are not linked by default, and that they must be explictly
+ * linked by the application using the JUMP action.
  *
- * Group and priority levels are arbitrary and up to the application, they
+ * Priority levels are arbitrary and up to the application, they
  * do not need to be contiguous nor start from 0, however the maximum number
  * varies between devices and may be affected by existing flow rules.
  *
@@ -69,7 +71,7 @@ extern "C" {
  */
 struct rte_flow_attr {
 	uint32_t group; /**< Priority group. */
-	uint32_t priority; /**< Priority level within group. */
+	uint32_t priority; /**< Rule priority level within group. */
 	uint32_t ingress:1; /**< Rule applies to ingress traffic. */
 	uint32_t egress:1; /**< Rule applies to egress traffic. */
 	/**
@@ -957,6 +959,15 @@ enum rte_flow_action_type {
 	 */
 	RTE_FLOW_ACTION_TYPE_PASSTHRU,
 
+	/**
+	 * RTE_FLOW_ACTION_TYPE_JUMP
+	 *
+	 * Redirects packets to a group on the current device.
+	 *
+	 * See struct rte_flow_action_jump.
+	 */
+	RTE_FLOW_ACTION_TYPE_JUMP,
+
 	/**
 	 * Attaches an integer value to packets and sets PKT_RX_FDIR and
 	 * PKT_RX_FDIR_ID mbuf flags.
@@ -1104,6 +1115,22 @@ struct rte_flow_action_mark {
 	uint32_t id; /**< Integer value to return with packets. */
 };
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this structure may change without prior notice
+ *
+ * RTE_FLOW_ACTION_TYPE_JUMP
+ *
+ * Redirects packets to a group on the current device.
+ *
+ * In a hierarchy of groups, which can be used to represent physical or logical
+ * flow tables on the device, this action allows the action to be a redirect to
+ * a group on that device.
+ */
+struct rte_flow_action_jump {
+	uint32_t group;
+};
+
 /**
  * RTE_FLOW_ACTION_TYPE_QUEUE
  *
-- 
2.14.3

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v5 07/16] ethdev: flatten RSS configuration in flow API
  2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
@ 2018-04-23 15:05  0%           ` Nélio Laranjeiro
  0 siblings, 0 replies; 200+ results
From: Nélio Laranjeiro @ 2018-04-23 15:05 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Xueming Li, Wenzhuo Lu,
	Jingjing Wu, Beilei Xing, Qi Zhang, Konstantin Ananyev,
	Yongseok Koh, Andrew Rybchenko, Pascal Mazon, Radu Nicolau,
	Akhil Goyal

On Thu, Apr 19, 2018 at 12:16:39PM +0200, Adrien Mazarguil wrote:
> Since its inception, the rte_flow RSS action has been relying in part on
> external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
> This structure lacks parameters such as the hash algorithm to use, and more
> recently, a method to tell which layer RSS should be performed on [1].
> 
> Given struct rte_eth_rss_conf will never be flexible enough to represent a
> complete RSS configuration (e.g. RETA table), this patch supersedes it by
> extending the rte_flow RSS action directly.
> 
> A subsequent patch will add a field to use a non-default RSS hash
> algorithm. To that end, a field named "types" replaces the field formerly
> known as "rss_hf" and standing for "RSS hash functions" as it was
> confusing. Actual RSS hash function types are defined by enum
> rte_eth_hash_function.
> 
> This patch updates all PMDs and example applications accordingly.
> 
> It breaks ABI compatibility for the following public functions:
> 
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
> 
> [1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
>     configuration")
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Xueming Li <xuemingl@mellanox.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
> Cc: Radu Nicolau <radu.nicolau@intel.com>
> Cc: Akhil Goyal <akhil.goyal@nxp.com>
> 
> ---
> 
> v3 changes:
> 
> Documentation update regarding the meaning of a 0 value for RSS types in
> flow rules.
> 
> It used to implicitly mean "no RSS" but is redefined as requesting a kind
> of "best-effort" mode from PMDs, i.e. anything ranging from empty to
> all-inclusive RSS; what matters is it provides safe defaults that will work
> regardless of PMD capabilities.
> ---
>[...]
> diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
> @@ -642,17 +599,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
>  	enum { FATE = 1, MARK = 2, COUNT = 4, };
>  	uint32_t overlap = 0;
>  	struct priv *priv = dev->data->dev_private;
> -	int ret;
>  
> -	/*
> -	 * Add default RSS configuration necessary for Verbs to create QP even
> -	 * if no RSS is necessary.
> -	 */
> -	ret = mlx5_flow_convert_rss_conf(parser,
> -					 (const struct rte_eth_rss_conf *)
> -					 &priv->rss_conf);

This is still needed for QUEUE action, Verbs refuses to create an hash
Rx queue if no RSS key is provided even if the hash field is 0.

This can be fully moved to mlx5_hrxq_new() who can use the default key
when the rss_key is not provided.

Regards,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v7 0/5] introduce new tunnel types
  2018-04-20 11:56  3%   ` [dpdk-dev] [PATCH v6 0/5] " Xueming Li
@ 2018-04-23 12:16  3%     ` Xueming Li
  0 siblings, 0 replies; 200+ results
From: Xueming Li @ 2018-04-23 12:16 UTC (permalink / raw)
  To: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Adrien Mazarguil
  Cc: Xueming Li, Nelio Laranjeiro, Shahaf Shuler, dev, Olivier Matz

v7:
- Fixed display name of MPLS-in-GRE and MPLS-in-UDP
v6:
- Add MPLS-in-GRE and MPLS-in-UDP back
- UPdate comment alignment
v5:
- Fixed VXLAN-GPE comment alignment
- Removed MPLS-in-GRE and MPLS-in-UDP patch
v4:
- Update testpmd doc for flow VXLAN-GPE paramter.
v3:
- Change VXLAN-GPE definition order to avoid ABI compatibility issue.
v2:
- Split patch set into public and mlx5 two series, this one is the first.
v1:
- Support new tunnel type MPLS-in-GRE and MPLS-in-UDP
- Remove deprecation notes of rss level

This patchset introduced new tunnel type and related testpmd code:
- New tunnel type VXLAN-GPE
  https://datatracker.ietf.org/doc/draft-ietf-nvo3-vxlan-gpe/
- New tunnel type MPLS-in-GRE
  https://tools.ietf.org/html/rfc4023
- New tunnel type MPLS-in-UDP
  https://tools.ietf.org/html/rfc7510
- Support GRE extension in testpmd csum forwarding engine


Xueming Li (5):
  doc: remove RSS configuration change announcement
  ethdev: introduce new tunnel VXLAN-GPE
  ethdev: introduce tunnel type MPLS-in-GRE and MPLS-in-UDP
  app/testpmd: introduce new tunnel VXLAN-GPE
  app/testpmd: add more GRE extension support to csum engine

 app/test-pmd/cmdline_flow.c                 |  24 +++++++
 app/test-pmd/config.c                       |   2 +
 app/test-pmd/csumonly.c                     | 103 +++++++++++++++++++++++++---
 app/test-pmd/parameters.c                   |  12 +++-
 app/test-pmd/testpmd.h                      |   2 +
 doc/guides/prog_guide/rte_flow.rst          |  12 ++++
 doc/guides/rel_notes/deprecation.rst        |   4 --
 doc/guides/testpmd_app_ug/run_app.rst       |   5 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   4 ++
 lib/librte_ether/rte_eth_ctrl.h             |   3 +-
 lib/librte_ether/rte_flow.c                 |   1 +
 lib/librte_ether/rte_flow.h                 |  27 ++++++++
 lib/librte_mbuf/rte_mbuf.c                  |   3 +
 lib/librte_mbuf/rte_mbuf.h                  |   1 +
 lib/librte_mbuf/rte_mbuf_ptype.c            |   3 +
 lib/librte_mbuf/rte_mbuf_ptype.h            |  47 +++++++++++++
 lib/librte_net/rte_ether.h                  |  25 +++++++
 17 files changed, 261 insertions(+), 17 deletions(-)

-- 
2.13.3

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence
  2018-04-20 14:53  4%       ` Aaron Conole
@ 2018-04-23  8:07  4%         ` Arnon Warshavsky
  0 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-23  8:07 UTC (permalink / raw)
  To: Aaron Conole
  Cc: Thomas Monjalon, Burakov, Anatoly, Lu, Wenzhuo, Doherty, Declan,
	jerin.jacob, Bruce Richardson, Yigit, Ferruh, dev, Kevin Traynor

>
> Looking at the eal_thread_init_master, I think it's probably a
> recoverable condition.  For instance, perhaps the core mask was wrong,
> and could be corrected by re-attempting the initialization.  Just
> suggesting that it's probably okay to allow a re-attempt here.  I would
> suggest:
>
> -       eal_thread_init_master(rte_config.master_lcore);
> +       if (eal_thread_init_master(rte_config.master_lcore) != 0) {
> +               rte_eal_init_alert("Cannot assign master lcore\n");
> +               rte_errno = EINVAL;
> +               return -1;
> +       }
>
> if you agree.
>
> Yes. This is indeed the way to go.

>
> --
>
> In the above I hope it illustrates what you'll need - a way to signal to
> each side that initialization phase is happening, and that
> initialization was successful / failed, and to clean up in the failure
> case.
>
> Just meant for illustration so feel free to ignore / flame, but that's
> how I would go about removing the rte_panic() calls.
>

Thanks.
Indeed I did not consider recovery and clean up from here

>
> >  This seems to only exist as a way of triggering the run_once check in
> >  the eal_init.  It doesn't add anything except one more state variable to
> >  check against.  What is the purpose?
> >
> >
> > Actually this is not a run-once in purpose, rather an attempt to define
> a state for the device
> > and on the way work around breaking abi on the the void function called
> before that.
>
> I think it's a way to try and track state for initialization and to
> prevent future inits.  Which ABI are you worried about?  rte_panic()?
> I'm not sure how this is an ABI work around, but I'm probably not
> thinking about it hard enough.
>

I now see the mess I did and why wasn't it clear.
I initially changed the api of eal_thread_init_master() from void to int.
Having had the ABI check failed, I only fixed the linuxapp back to void and
added this workaround to prevent ABI break on this patch.

I will align both linuxapp and bsd to remain at panic for this function as
well , and address it together with the thread itself




-- 

*Arnon Warshavsky*
*Qwilt | work: +972-72-2221634 | mobile: +972-50-8583058 | arnon@qwilt.com
<arnon@qwilt.com>*

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH] lib/ethdev: moving IPsec event enum to the end
  2018-04-23  7:27  3% [dpdk-dev] [PATCH] lib/ethdev: moving IPsec event enum to the end Anoob Joseph
@ 2018-04-23  8:02  0% ` De Lara Guarch, Pablo
  0 siblings, 0 replies; 200+ results
From: De Lara Guarch, Pablo @ 2018-04-23  8:02 UTC (permalink / raw)
  To: Anoob Joseph, Thomas Monjalon
  Cc: Akhil Goyal, Nicolau, Radu, Stephen Hemminger, Jerin Jacob,
	Narayana Prasad, dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Anoob Joseph
> Sent: Monday, April 23, 2018 8:28 AM
> To: Thomas Monjalon <thomas@monjalon.net>
> Cc: Anoob Joseph <anoob.joseph@caviumnetworks.com>; Akhil Goyal
> <akhil.goyal@nxp.com>; Nicolau, Radu <radu.nicolau@intel.com>; Stephen
> Hemminger <stephen@networkplumber.org>; Jerin Jacob
> <jerin.jacob@caviumnetworks.com>; Narayana Prasad
> <narayanaprasad.athreya@caviumnetworks.com>; dev@dpdk.org
> Subject: [dpdk-dev] [PATCH] lib/ethdev: moving IPsec event enum to the end
> 
> Adding new entry in the middle could break ABI compatibility. So moving it to
> the end.
> 
> Fixes: 714e05f33171 ("ethdev: support for inline IPsec events")
> 
> Signed-off-by: Anoob Joseph <anoob.joseph@caviumnetworks.com>

Squashed into relevant commit and applied to dpdk-next-crypto tree.

Thanks,
Pablo

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v5 04/11] net/mlx5: support Rx tunnel type identification
  2018-04-23  7:40  4%   ` Nélio Laranjeiro
@ 2018-04-23  7:56  0%     ` Xueming(Steven) Li
  0 siblings, 0 replies; 200+ results
From: Xueming(Steven) Li @ 2018-04-23  7:56 UTC (permalink / raw)
  To: Nélio Laranjeiro; +Cc: Iremonger Bernard, Shahaf Shuler, dev



> -----Original Message-----
> From: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>
> Sent: Monday, April 23, 2018 3:41 PM
> To: Xueming(Steven) Li <xuemingl@mellanox.com>
> Cc: Iremonger Bernard <bernard.iremonger@intel.com>; Shahaf Shuler <shahafs@mellanox.com>;
> dev@dpdk.org
> Subject: Re: [PATCH v5 04/11] net/mlx5: support Rx tunnel type identification
> 
> On Fri, Apr 20, 2018 at 08:23:33PM +0800, Xueming Li wrote:
> > This patch introduced tunnel type identification based on flow rules.
> > If flows of multiple tunnel types built on same queue,
> >RTE_PTYPE_TUNNEL_MASK will be returned, user application could use bits
> >in flow mark as tunnel type identifier.
> >[...]
> 
> There is still the issue about returning this wrong bits in the mbuf.
> 
> Bit in the mbuf ptypes must only reflect what is present in the mbuf, using RTE_PTYPE_TUNNEL_MASK
> means all tunnels are present in the packet which is absolutely wrong.
> 
> This behavior was not announce and breaks API/ABI.  It cannot be accepted yet.
> 
> I'll suggest to add a new RTE_PTYPE_TUNNEL_UNKNOWN which does not break the ABI or don't add such bits
> in the mbuf.
> 

Seems I forgot to update commit message. RTE_PTYPE_TUNNEL_UNKNOWN has been removed according to discussion.


> Regards,
> 
> --
> Nélio Laranjeiro
> 6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v5 04/11] net/mlx5: support Rx tunnel type identification
  @ 2018-04-23  7:40  4%   ` Nélio Laranjeiro
  2018-04-23  7:56  0%     ` Xueming(Steven) Li
  0 siblings, 1 reply; 200+ results
From: Nélio Laranjeiro @ 2018-04-23  7:40 UTC (permalink / raw)
  To: Xueming Li; +Cc: Iremonger Bernard, Shahaf Shuler, dev

On Fri, Apr 20, 2018 at 08:23:33PM +0800, Xueming Li wrote:
> This patch introduced tunnel type identification based on flow rules.
> If flows of multiple tunnel types built on same queue,
> RTE_PTYPE_TUNNEL_MASK will be returned, user application could use bits
> in flow mark as tunnel type identifier.
>[...]

There is still the issue about returning this wrong bits in the mbuf.

Bit in the mbuf ptypes must only reflect what is present in the mbuf,
using RTE_PTYPE_TUNNEL_MASK means all tunnels are present in the packet
which is absolutely wrong.

This behavior was not announce and breaks API/ABI.  It cannot be
accepted yet.

I'll suggest to add a new RTE_PTYPE_TUNNEL_UNKNOWN which does not break
the ABI or don't add such bits in the mbuf.

Regards,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH] lib/ethdev: moving IPsec event enum to the end
@ 2018-04-23  7:27  3% Anoob Joseph
  2018-04-23  8:02  0% ` De Lara Guarch, Pablo
  0 siblings, 1 reply; 200+ results
From: Anoob Joseph @ 2018-04-23  7:27 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: Anoob Joseph, Akhil Goyal, Radu Nicolau, Stephen Hemminger,
	Jerin Jacob, Narayana Prasad, dev

Adding new entry in the middle could break ABI compatibility. So moving
it to the end.

Fixes: 714e05f33171 ("ethdev: support for inline IPsec events")

Signed-off-by: Anoob Joseph <anoob.joseph@caviumnetworks.com>
---
 lib/librte_ether/rte_ethdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index a2de201..ccc476e 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -2534,11 +2534,11 @@ enum rte_eth_event_type {
 	RTE_ETH_EVENT_INTR_RESET,
 			/**< reset interrupt event, sent to VF on PF reset */
 	RTE_ETH_EVENT_VF_MBOX,  /**< message from the VF received by PF */
-	RTE_ETH_EVENT_IPSEC,    /**< IPsec offload related event */
 	RTE_ETH_EVENT_MACSEC,   /**< MACsec offload related event */
 	RTE_ETH_EVENT_INTR_RMV, /**< device removal event */
 	RTE_ETH_EVENT_NEW,      /**< port is probed */
 	RTE_ETH_EVENT_DESTROY,  /**< port is released */
+	RTE_ETH_EVENT_IPSEC,    /**< IPsec offload related event */
 	RTE_ETH_EVENT_MAX       /**< max value of this enum */
 };
 
-- 
2.7.4

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 1/5] lib/ethdev: support for inline IPsec events
  @ 2018-04-20 15:14  3%       ` Stephen Hemminger
  0 siblings, 0 replies; 200+ results
From: Stephen Hemminger @ 2018-04-20 15:14 UTC (permalink / raw)
  To: Anoob Joseph
  Cc: Thomas Monjalon, Akhil Goyal, Declan Doherty, Jingjing Wu,
	Radu Nicolau, Wenzhuo Lu, Jerin Jacob, Narayana Prasad,
	Nelio Laranjeiro, dev

On Thu, 19 Apr 2018 14:45:01 +0530
Anoob Joseph <Anoob.Joseph@caviumnetworks.com> wrote:

> > +/**
> >    * The eth device event type for interrupt, and maybe others in the future.
> >    */
> >   enum rte_eth_event_type {
> > @@ -2446,6 +2486,7 @@ enum rte_eth_event_type {
> >   	RTE_ETH_EVENT_INTR_RESET,
> >   			/**< reset interrupt event, sent to VF on PF reset */
> >   	RTE_ETH_EVENT_VF_MBOX,  /**< message from the VF received by PF */
> > +	RTE_ETH_EVENT_IPSEC,    /**< IPsec offload related event */
> >   	RTE_ETH_EVENT_MACSEC,   /**< MACsec offload related event */
> >   	RTE_ETH_EVENT_INTR_RMV, /**< device removal event */
> >   	RTE_ETH_EVENT_NEW,      /**< port is probed */  


Putting new value in middle of enum risks breaking ABI compatiablity

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence
  2018-04-20 13:55  4%     ` Arnon Warshavsky
@ 2018-04-20 14:53  4%       ` Aaron Conole
  2018-04-23  8:07  4%         ` Arnon Warshavsky
  0 siblings, 1 reply; 200+ results
From: Aaron Conole @ 2018-04-20 14:53 UTC (permalink / raw)
  To: Arnon Warshavsky
  Cc: Thomas Monjalon, Burakov, Anatoly, Lu, Wenzhuo, Doherty, Declan,
	jerin.jacob, Bruce Richardson, Yigit, Ferruh, dev, Kevin Traynor

Arnon Warshavsky <arnon@qwilt.com> writes:

> Thanks Aaron 
>
> Previously, it wasn't possible for mem_cfg_fd to be reused after a
>
>  failure.  Now it is - please reset it to -1. in these close conditions.
>
> Will do. 
>
>  Again, previously this would have aborted on a failure.  So it needs to
>  be reset to a value that allows retry.
>
> Same here
>
>  >       switch (rte_config.process_type){
>  >       case RTE_PROC_PRIMARY:
>  > -             rte_eal_config_create();
>  > +             if (rte_eal_config_create())
>  > +                     return -1;
>  >               break;
>  >       case RTE_PROC_SECONDARY:
>  > -             rte_eal_config_attach();
>  > +             if (rte_eal_config_attach())
>  > +                     return -1;
>  >               rte_eal_mcfg_wait_complete(rte_config.mem_config);
>  >               break;
>  >       case RTE_PROC_AUTO:
>  >       case RTE_PROC_INVALID:
>
>  Not for this patch, but I just noticed that this should probably use a
>  'default' case.
>
> Will add this while Im here
>  
>  
>  Use rte_eal_init_alert to indicate why you are failing the init.
>
> Will do 
>
>  >       if (rte_mp_channel_init() < 0) {
>  >               rte_eal_init_alert("failed to init mp channel\n");
>  > @@ -652,7 +676,8 @@ static void rte_eal_init_alert(const char *msg)
>  >  
>  >       eal_check_mem_on_local_socket();
>  >  
>  > -     eal_thread_init_master(rte_config.master_lcore);
>  > +     if (eal_thread_init_master(rte_config.master_lcore) != 0)
>  > +             return -1;
>
>  Is it ever possible to recover from this?
>
>  
> Definitely not recoverable, but not different than the other cases where panic propagate all the way
> up rather than aborting

Looking at the eal_thread_init_master, I think it's probably a
recoverable condition.  For instance, perhaps the core mask was wrong,
and could be corrected by re-attempting the initialization.  Just
suggesting that it's probably okay to allow a re-attempt here.  I would
suggest:

-	eal_thread_init_master(rte_config.master_lcore);
+	if (eal_thread_init_master(rte_config.master_lcore) != 0) {
+		rte_eal_init_alert("Cannot assign master lcore\n");
+		rte_errno = EINVAL;
+		return -1;
+	}

if you agree.

>  Still needs
>  rte_eal_init_alert() call.
>
> Will do 
>
>  How are you cleaning up the threads that were spawned?  Lets say this
>  loop will execute 5 times, and on the 3rd entry, these errors happen.
>  You now leave DPDK 'half-initialized' - you've spun up threads and
>  allocated memory.
>
> ... 
>
>  I don't see how any of this is better for the user.  In fact, I think
>  this is worse because it will make portions of the application stop
>  working without any way to move forward.  rte_panic() will at least give
>  the process a chance to recover from a potentially ephemeral condition.
>
> As I wrote in a different reply on this patch
> I was probably too eager to get rid of this panic taking some wrong assumptions on the way the
> library will be called.

Okay - guess emails got crossed in flight :)

> Removing the panic from the thread is obviously more complex and also ABI breaking.
> From my own bw, I will not make it with a proper change to this version, so I will revert back to
> panicing on this patchset 
> and aim for the thread in the next build.

I see.  Most likely you'll need a proper initialization protocol both
ways.  As a brief example, you'll need something to guarantee the thread
state (just a general outline):

--
  global_initial_state = UNINIT
  pthread_init_cond = PTHREAD_COND_INIT;
  global_spawned_ctr = atomic_ctr(0);

rte_eal_init()
  ...
  global_initial_state = INITIALIZING
  thread_ctr = 0;

  ...
  for_each_lcore()
    spawn_thread()
    if (spawn_fails)
       global_initial_state = UNINIT;
       pthread_cond_broadcast()
       return error

    thread_ctr++;

  while (thread_ctr != global_spawned_ctr)
     /* wait?  figure out when to declare extreme failure */

  global_initial_state = THREAD_INITIALIZED
  pthread_cond_broadcast(pthread_init_cond)

  while (thread_ctr)
    wait_some_grace_period()
    for_each_lcore_thread()
      thread_state = lcore_state[thr]; /* probably needs a mem barrier*/
      if (thread_state != THREAD_READY && thread_state != THREAD_STARTING)
         /* failure - message all threads to clean up */
      if (thread_state == THREAD_READY)
         thread_ctr--;


in eal_thread_loop():

  /* before even the set_affinity */
  atomic_inc(global_spawned_ctr);
  lcore_state[thr] = THREAD_STARTING;
  pthread_cond_wait(pthread_init_cond)

  if (global_initial_state != THREAD_INITIALIZED)
    lcore_state[thr] = THREAD_FAILED;
    return...;

  /* do all the normal checks... instead of the panic_state, just set
     lcore_state[thr] to THREAD_FAILED, clean up any additional
     allocated resources, and return... which will exit the thread */

  lcore_state[thr] = THREAD_READY;


--

In the above I hope it illustrates what you'll need - a way to signal to
each side that initialization phase is happening, and that
initialization was successful / failed, and to clean up in the failure
case.

Just meant for illustration so feel free to ignore / flame, but that's
how I would go about removing the rte_panic() calls.

>  This seems to only exist as a way of triggering the run_once check in
>  the eal_init.  It doesn't add anything except one more state variable to
>  check against.  What is the purpose?
>
>  
> Actually this is not a run-once in purpose, rather an attempt to define a state for the device 
> and on the way work around breaking abi on the the void function called before that.

I think it's a way to try and track state for initialization and to
prevent future inits.  Which ABI are you worried about?  rte_panic()?
I'm not sure how this is an ABI work around, but I'm probably not
thinking about it hard enough.

>> +     if (rte_get_panic_state())
>> +             return -1;
>> +
>
>  Please just use run_once.  That's a better way of preventing this.
>
>  
>  As stated above - no a run-once
>
>  All of the comments from the bsd side apply here.

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v5 1/2] ethdev: introduce generic IP/UDP tunnel checksum and TSO
  2018-04-20 13:48  3%   ` Ferruh Yigit
  2018-04-20 14:23  0%     ` Ferruh Yigit
@ 2018-04-20 14:23  0%     ` Xueming(Steven) Li
  1 sibling, 0 replies; 200+ results
From: Xueming(Steven) Li @ 2018-04-20 14:23 UTC (permalink / raw)
  To: Ferruh Yigit, Shahaf Shuler, Nelio Laranjeiro, Wenzhuo Lu,
	Jingjing Wu, Thomas Monjalon, Adrien Mazarguil
  Cc: dev

Hi Ferruh,

> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Friday, April 20, 2018 9:48 PM
> To: Xueming(Steven) Li <xuemingl@mellanox.com>; Shahaf Shuler <shahafs@mellanox.com>; Nelio Laranjeiro
> <notifications@github.com>; Wenzhuo Lu <wenzhuo.lu@intel.com>; Jingjing Wu <jingjing.wu@intel.com>;
> Thomas Monjalon <thomas@monjalon.net>; Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v5 1/2] ethdev: introduce generic IP/UDP tunnel checksum and TSO
> 
> On 4/20/2018 2:06 PM, Xueming Li wrote:
> > This patch introduce new TX offload flags for device that supports IP
> > or UDP tunneled packet L3/L4 checksum and TSO offload.
> > It will be used for non-standard tunnels.
> >
> > The support from the device is for inner and outer checksums on
> > IPV4/TCP/UDP and TSO for *any packet with the following format*:
> >
> > <some headers> / [optional IPv4/IPv6] / [optional TCP/UDP] / <some
> > headers> / [optional inner IPv4/IPv6] / [optional TCP/UDP]
> >
> > For example the following packets can use this feature:
> >
> > 1. eth / ipv4 / udp / VXLAN / ip / tcp 2. eth / ipv4 / GRE / MPLS /
> > ipv4 / udp
> >
> > Please note that specific tunnel headers that contain payload length,
> > sequence id or checksum will not be updated.
> >
> > Signed-off-by: Xueming Li <xuemingl@mellanox.com>
> > Acked-by: Thomas Monjalon <thomas@monjalon.net>
> 
> It is getting messier! [1]
> 
> Hi Thomas,
> 
> Any suggestion on how to manage these rte_flow patches, we are late and they aren't settle down yet.
> There are some level of dependency and there are some uncertainty in some of the dependent patches
> because of ABI/API process.
> 
> It would be great to get them incremental or have a plan to how to proceed.
> 
> 
> [1]
> Previous version in this thread is following patches:
> [PATCH v4 1/2] ethdev: add supported hash function check [PATCH v4 2/2] app/testpmd: new parameter for
> port config all rss command
> 
> And this set is:
> [PATCH v5 1/2] ethdev: introduce generic IP/UDP tunnel checksum and TSO [PATCH v5 2/2] app/testpmd:
> testpmd support Tx generic tunnel offloads

My bad, I was using wrong branch to format v5 of this thread, will resend, sorry for confusion.

> 
> But there is already v5 send for this set and in other thread there is v7 of it:
> [PATCH v7 0/2] support Tx generic tunnel checksum and TSO [PATCH v7 1/2] ethdev: introduce generic
> IP/UDP tunnel checksum and TSO [PATCH v7 2/2] app/testpmd: testpmd support Tx generic tunnel offloads
> 
> Most probably you have intended to send another patchset here.

Correct.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v5 1/2] ethdev: introduce generic IP/UDP tunnel checksum and TSO
  2018-04-20 13:48  3%   ` Ferruh Yigit
@ 2018-04-20 14:23  0%     ` Ferruh Yigit
  2018-04-20 14:23  0%     ` Xueming(Steven) Li
  1 sibling, 0 replies; 200+ results
From: Ferruh Yigit @ 2018-04-20 14:23 UTC (permalink / raw)
  To: Xueming Li, Shahaf Shuler, Nelio Laranjeiro, Wenzhuo Lu,
	Jingjing Wu, Thomas Monjalon, Adrien Mazarguil
  Cc: dev, Qi Zhang, Declan Doherty, Awal, Mohammad Abdul,
	Andrew Rybchenko, Gaetan Rivet

On 4/20/2018 2:48 PM, Ferruh Yigit wrote:
> On 4/20/2018 2:06 PM, Xueming Li wrote:
>> This patch introduce new TX offload flags for device that supports
>> IP or UDP tunneled packet L3/L4 checksum and TSO offload.
>> It will be used for non-standard tunnels.
>>
>> The support from the device is for inner and outer checksums on
>> IPV4/TCP/UDP and TSO for *any packet with the following format*:
>>
>> <some headers> / [optional IPv4/IPv6] / [optional TCP/UDP] / <some
>> headers> / [optional inner IPv4/IPv6] / [optional TCP/UDP]
>>
>> For example the following packets can use this feature:
>>
>> 1. eth / ipv4 / udp / VXLAN / ip / tcp
>> 2. eth / ipv4 / GRE / MPLS / ipv4 / udp
>>
>> Please note that specific tunnel headers that contain payload length,
>> sequence id or checksum will not be updated.
>>
>> Signed-off-by: Xueming Li <xuemingl@mellanox.com>
>> Acked-by: Thomas Monjalon <thomas@monjalon.net>
> 
> It is getting messier! [1]
> 
> Hi Thomas,
> 
> Any suggestion on how to manage these rte_flow patches, we are late and they
> aren't settle down yet. There are some level of dependency and there are some
> uncertainty in some of the dependent patches because of ABI/API process.
> 
> It would be great to get them incremental or have a plan to how to proceed.

Involved parties looks like following:

Xueming: Mellanox offloads for tunnel protocols.
Adrien: rte_flow improvements for rss?
Qi: rte_flow more protocol support?
Declan/Awal: TEP, port representor, using rte_flow?
Andrew: sfc PMD updates on top of rte_flow changes.

Gaetan: devargs, previously devargs dependency mentioned for some of above, not
sure that is still the case.

I am not clear of latest status above patches and their dependency with
eachother, are they in sync or not, it can be nice who know clarifies more.

> 
> 
> [1]
> Previous version in this thread is following patches:
> [PATCH v4 1/2] ethdev: add supported hash function check
> [PATCH v4 2/2] app/testpmd: new parameter for port config all rss command
> 
> And this set is:
> [PATCH v5 1/2] ethdev: introduce generic IP/UDP tunnel checksum and TSO
> [PATCH v5 2/2] app/testpmd: testpmd support Tx generic tunnel offloads
> 
> But there is already v5 send for this set and in other thread there is v7 of it:
> [PATCH v7 0/2] support Tx generic tunnel checksum and TSO
> [PATCH v7 1/2] ethdev: introduce generic IP/UDP tunnel checksum and TSO
> [PATCH v7 2/2] app/testpmd: testpmd support Tx generic tunnel offloads
> 
> Most probably you have intended to send another patchset here.
> 

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence
  2018-04-19 17:48  0%   ` Aaron Conole
@ 2018-04-20 13:55  4%     ` Arnon Warshavsky
  2018-04-20 14:53  4%       ` Aaron Conole
  0 siblings, 1 reply; 200+ results
From: Arnon Warshavsky @ 2018-04-20 13:55 UTC (permalink / raw)
  To: Aaron Conole
  Cc: Thomas Monjalon, Burakov, Anatoly, Lu, Wenzhuo, Doherty, Declan,
	jerin.jacob, Bruce Richardson, Yigit, Ferruh, dev, Kevin Traynor

 Thanks Aaron

Previously, it wasn't possible for mem_cfg_fd to be reused after a

> failure.  Now it is - please reset it to -1. in these close conditions.
>
> Will do.

>
>
> Again, previously this would have aborted on a failure.  So it needs to
> be reset to a value that allows retry.
>

Same here

>
> >       switch (rte_config.process_type){
> >       case RTE_PROC_PRIMARY:
> > -             rte_eal_config_create();
> > +             if (rte_eal_config_create())
> > +                     return -1;
> >               break;
> >       case RTE_PROC_SECONDARY:
> > -             rte_eal_config_attach();
> > +             if (rte_eal_config_attach())
> > +                     return -1;
> >               rte_eal_mcfg_wait_complete(rte_config.mem_config);
> >               break;
> >       case RTE_PROC_AUTO:
> >       case RTE_PROC_INVALID:
>
> Not for this patch, but I just noticed that this should probably use a
> 'default' case.
>
> Will add this while Im here


>
> Use rte_eal_init_alert to indicate why you are failing the init.
>
Will do

>
> >       if (rte_mp_channel_init() < 0) {
> >               rte_eal_init_alert("failed to init mp channel\n");
> > @@ -652,7 +676,8 @@ static void rte_eal_init_alert(const char *msg)
> >
> >       eal_check_mem_on_local_socket();
> >
> > -     eal_thread_init_master(rte_config.master_lcore);
> > +     if (eal_thread_init_master(rte_config.master_lcore) != 0)
> > +             return -1;
>
> Is it ever possible to recover from this?


Definitely not recoverable, but not different than the other cases where
panic propagate all the way up rather than aborting

> Still needs
> rte_eal_init_alert() call.
>
Will do

>
>
> How are you cleaning up the threads that were spawned?  Lets say this
> loop will execute 5 times, and on the 3rd entry, these errors happen.
> You now leave DPDK 'half-initialized' - you've spun up threads and
> allocated memory.
>
...

>
> I don't see how any of this is better for the user.  In fact, I think
> this is worse because it will make portions of the application stop
> working without any way to move forward.  rte_panic() will at least give
> the process a chance to recover from a potentially ephemeral condition.
>
> As I wrote in a different reply on this patch
I was probably too eager to get rid of this panic taking some wrong
assumptions on the way the library will be called.
Removing the panic from the thread is obviously more complex and also ABI
breaking.
>From my own bw, I will not make it with a proper change to this version, so
I will revert back to panicing on this patchset
and aim for the thread in the next build.


>
> This seems to only exist as a way of triggering the run_once check in
> the eal_init.  It doesn't add anything except one more state variable to
> check against.  What is the purpose?
>

Actually this is not a run-once in purpose, rather an attempt to define a
state for the device
and on the way work around breaking abi on the the void function called
before that.

> +     if (rte_get_panic_state())
> +             return -1;
> +

Please just use run_once.  That's a better way of preventing this.
>


> As stated above - no a run-once
>
> All of the comments from the bsd side apply here.
>

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v5 1/2] ethdev: introduce generic IP/UDP tunnel checksum and TSO
  @ 2018-04-20 13:48  3%   ` Ferruh Yigit
  2018-04-20 14:23  0%     ` Ferruh Yigit
  2018-04-20 14:23  0%     ` Xueming(Steven) Li
  0 siblings, 2 replies; 200+ results
From: Ferruh Yigit @ 2018-04-20 13:48 UTC (permalink / raw)
  To: Xueming Li, Shahaf Shuler, Nelio Laranjeiro, Wenzhuo Lu,
	Jingjing Wu, Thomas Monjalon, Adrien Mazarguil
  Cc: dev

On 4/20/2018 2:06 PM, Xueming Li wrote:
> This patch introduce new TX offload flags for device that supports
> IP or UDP tunneled packet L3/L4 checksum and TSO offload.
> It will be used for non-standard tunnels.
> 
> The support from the device is for inner and outer checksums on
> IPV4/TCP/UDP and TSO for *any packet with the following format*:
> 
> <some headers> / [optional IPv4/IPv6] / [optional TCP/UDP] / <some
> headers> / [optional inner IPv4/IPv6] / [optional TCP/UDP]
> 
> For example the following packets can use this feature:
> 
> 1. eth / ipv4 / udp / VXLAN / ip / tcp
> 2. eth / ipv4 / GRE / MPLS / ipv4 / udp
> 
> Please note that specific tunnel headers that contain payload length,
> sequence id or checksum will not be updated.
> 
> Signed-off-by: Xueming Li <xuemingl@mellanox.com>
> Acked-by: Thomas Monjalon <thomas@monjalon.net>

It is getting messier! [1]

Hi Thomas,

Any suggestion on how to manage these rte_flow patches, we are late and they
aren't settle down yet. There are some level of dependency and there are some
uncertainty in some of the dependent patches because of ABI/API process.

It would be great to get them incremental or have a plan to how to proceed.


[1]
Previous version in this thread is following patches:
[PATCH v4 1/2] ethdev: add supported hash function check
[PATCH v4 2/2] app/testpmd: new parameter for port config all rss command

And this set is:
[PATCH v5 1/2] ethdev: introduce generic IP/UDP tunnel checksum and TSO
[PATCH v5 2/2] app/testpmd: testpmd support Tx generic tunnel offloads

But there is already v5 send for this set and in other thread there is v7 of it:
[PATCH v7 0/2] support Tx generic tunnel checksum and TSO
[PATCH v7 1/2] ethdev: introduce generic IP/UDP tunnel checksum and TSO
[PATCH v7 2/2] app/testpmd: testpmd support Tx generic tunnel offloads

Most probably you have intended to send another patchset here.

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 03/11] e1000: replace rte_panic instances in e1000 driver
  2018-04-19 17:25  0%   ` Kevin Traynor
@ 2018-04-20 13:14  0%     ` Arnon Warshavsky
  0 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-20 13:14 UTC (permalink / raw)
  To: Kevin Traynor
  Cc: Thomas Monjalon, Burakov, Anatoly, Lu, Wenzhuo, Doherty, Declan,
	jerin.jacob, Bruce Richardson, Yigit, Ferruh, dev

Same as in the other patches. Will do. Thanks

On Thu, Apr 19, 2018 at 8:25 PM, Kevin Traynor <ktraynor@redhat.com> wrote:

> On 04/19/2018 07:01 AM, Arnon Warshavsky wrote:
> > replace panic calls with log and retrun value.
> > Local function to this file,
> > changing from void to int is non-abi-breaking
> > --
> > v4 - keep error message literal string in a singhle line
> >
> > Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
> > ---
> >  drivers/net/e1000/e1000_ethdev.h |  2 +-
> >  drivers/net/e1000/igb_ethdev.c   |  3 ++-
> >  drivers/net/e1000/igb_pf.c       | 15 +++++++++------
> >  3 files changed, 12 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_
> ethdev.h
> > index 6354b89..2e527de 100644
> > --- a/drivers/net/e1000/e1000_ethdev.h
> > +++ b/drivers/net/e1000/e1000_ethdev.h
> > @@ -411,7 +411,7 @@ int eth_igb_rss_hash_conf_get(struct rte_eth_dev
> *dev,
> >  /*
> >   * misc function prototypes
> >   */
> > -void igb_pf_host_init(struct rte_eth_dev *eth_dev);
> > +int igb_pf_host_init(struct rte_eth_dev *eth_dev);
> >
> >  void igb_pf_mbx_process(struct rte_eth_dev *eth_dev);
> >
> > diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_
> ethdev.c
> > index 9b808a9..4479616 100644
> > --- a/drivers/net/e1000/igb_ethdev.c
> > +++ b/drivers/net/e1000/igb_ethdev.c
> > @@ -833,7 +833,8 @@ static int igb_flex_filter_uninit(struct rte_eth_dev
> *eth_dev)
> >       }
> >
> >       /* initialize PF if max_vfs not zero */
> > -     igb_pf_host_init(eth_dev);
> > +     if (igb_pf_host_init(eth_dev) != 0)
>
> You don't need "!= 0"
>
> You need to set "error" here, or else return it from igb_pf_host_init().
> We know -ENOMEM is the only error that can be returned from
> igb_pf_host_init() but not sure if should assume that.
>
> > +             goto err_late;
> >
> >       ctrl_ext = E1000_READ_REG(hw, E1000_CTRL_EXT);
> >       /* Set PF Reset Done bit so PF/VF Mail Ops can work */
> > diff --git a/drivers/net/e1000/igb_pf.c b/drivers/net/e1000/igb_pf.c
> > index b9f2e53..ae4b0a4 100644
> > --- a/drivers/net/e1000/igb_pf.c
> > +++ b/drivers/net/e1000/igb_pf.c
> > @@ -63,7 +63,7 @@ int igb_vf_perm_addr_gen(struct rte_eth_dev *dev,
> uint16_t vf_num)
> >       return 0;
> >  }
> >
> > -void igb_pf_host_init(struct rte_eth_dev *eth_dev)
> > +int igb_pf_host_init(struct rte_eth_dev *eth_dev)
> >  {
> >       struct e1000_vf_info **vfinfo =
> >               E1000_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
> > @@ -74,7 +74,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
> >
> >       RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
> >       if (0 == (vf_num = dev_num_vf(eth_dev)))
> > -             return;
> > +             return 0;
> >
> >       if (hw->mac.type == e1000_i350)
> >               nb_queue = 1;
> > @@ -82,11 +82,14 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
> >               /* per datasheet, it should be 2, but 1 seems correct */
> >               nb_queue = 1;
> >       else
> > -             return;
> > +             return 0;
> >
> >       *vfinfo = rte_zmalloc("vf_info", sizeof(struct e1000_vf_info) *
> vf_num, 0);
> > -     if (*vfinfo == NULL)
> > -             rte_panic("Cannot allocate memory for private VF data\n");
> > +     if (*vfinfo == NULL) {
> > +             RTE_LOG(CRIT, PMD, "%s(): Cannot allocate memory for
> private VF data\n",
> > +                     __func__);
> > +             return -1;
> > +     }
> >
> >       RTE_ETH_DEV_SRIOV(eth_dev).active = ETH_8_POOLS;
> >       RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool = nb_queue;
> > @@ -98,7 +101,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
> >       /* set mb interrupt mask */
> >       igb_mb_intr_setup(eth_dev);
> >
> > -     return;
> > +     return 0;
> >  }
> >
> >  void igb_pf_host_uninit(struct rte_eth_dev *dev)
> >
>
>


-- 

*Arnon Warshavsky*
*Qwilt | work: +972-72-2221634 | mobile: +972-50-8583058 | arnon@qwilt.com
<arnon@qwilt.com>*

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 02/11] bond: replace rte_panic instances in bonding driver
  2018-04-19 17:25  0%   ` Kevin Traynor
@ 2018-04-20 13:13  0%     ` Arnon Warshavsky
  0 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-20 13:13 UTC (permalink / raw)
  To: Kevin Traynor
  Cc: Thomas Monjalon, Burakov, Anatoly, Lu, Wenzhuo, Doherty, Declan,
	jerin.jacob, Bruce Richardson, Yigit, Ferruh, dev

Will do. Thanks

On Thu, Apr 19, 2018 at 8:25 PM, Kevin Traynor <ktraynor@redhat.com> wrote:

> On 04/19/2018 07:01 AM, Arnon Warshavsky wrote:
> > replace panic calls with log and retrun value.
> > Local functions to this file,
> > changing from void to int are non-abi-breaking
> > --
> > v4 - fix split literal strings in log messages
> >
> > Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
> > ---
> >  drivers/net/bonding/rte_eth_bond_8023ad.c         | 28
> +++++++++++++++--------
> >  drivers/net/bonding/rte_eth_bond_8023ad_private.h |  2 +-
> >  drivers/net/bonding/rte_eth_bond_api.c            | 20 +++++++++++-----
> >  drivers/net/bonding/rte_eth_bond_pmd.c            |  9 +++++---
> >  drivers/net/bonding/rte_eth_bond_private.h        |  2 +-
> >  5 files changed, 40 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c
> b/drivers/net/bonding/rte_eth_bond_8023ad.c
> > index c452318..7512901 100644
> > --- a/drivers/net/bonding/rte_eth_bond_8023ad.c
> > +++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
> > @@ -893,7 +893,7 @@
> >                       bond_mode_8023ad_periodic_cb, arg);
> >  }
> >
> > -void
> > +int
> >  bond_mode_8023ad_activate_slave(struct rte_eth_dev *bond_dev,
> >                               uint16_t slave_id)
> >  {
> > @@ -939,7 +939,7 @@
> >       timer_cancel(&port->warning_timer);
> >
> >       if (port->mbuf_pool != NULL)
> > -             return;
> > +             return 0;
> >
> >       RTE_ASSERT(port->rx_ring == NULL);
> >       RTE_ASSERT(port->tx_ring == NULL);
> > @@ -968,8 +968,9 @@
> >       /* Any memory allocation failure in initialization is critical
> because
> >        * resources can't be free, so reinitialization is impossible. */
> >       if (port->mbuf_pool == NULL) {
> > -             rte_panic("Slave %u: Failed to create memory pool '%s':
> %s\n",
> > -                     slave_id, mem_name, rte_strerror(rte_errno));
> > +             RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create memory
> pool '%s': %s\n",
> > +                     __func__, slave_id, mem_name,
> rte_strerror(rte_errno));
> > +             return -1;
> >       }
> >
> >       snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_rx", slave_id);
> > @@ -977,8 +978,9 @@
> >                       rte_align32pow2(BOND_MODE_8023AX_SLAVE_RX_PKTS),
> socket_id, 0);
> >
> >       if (port->rx_ring == NULL) {
> > -             rte_panic("Slave %u: Failed to create rx ring '%s': %s\n",
> slave_id,
> > -                     mem_name, rte_strerror(rte_errno));
> > +             RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create rx ring
> '%s': %s\n",
> > +                     __func__, slave_id, mem_name,
> rte_strerror(rte_errno));
> > +             return -1;
> >       }
> >
> >       /* TX ring is at least one pkt longer to make room for marker
> packet. */
> > @@ -987,9 +989,12 @@
> >                       rte_align32pow2(BOND_MODE_8023AX_SLAVE_TX_PKTS +
> 1), socket_id, 0);
> >
> >       if (port->tx_ring == NULL) {
> > -             rte_panic("Slave %u: Failed to create tx ring '%s': %s\n",
> slave_id,
> > -                     mem_name, rte_strerror(rte_errno));
> > +             RTE_LOG(ERR, PMD, "%s() Slave %u: Fail to create tx ring
> '%s': %s\n",
> > +                     __func__, slave_id, mem_name,
> rte_strerror(rte_errno));
> > +             return -1;
> >       }
> > +
> > +     return 0;
> >  }
> >
> >  int
> > @@ -1143,9 +1148,12 @@
> >       struct bond_dev_private *internals = bond_dev->data->dev_private;
> >       uint8_t i;
> >
> > -     for (i = 0; i < internals->active_slave_count; i++)
> > -             bond_mode_8023ad_activate_slave(bond_dev,
> > +     for (i = 0; i < internals->active_slave_count; i++) {
> > +             int rc = bond_mode_8023ad_activate_slave(bond_dev,
> >                               internals->active_slaves[i]);
> > +             if (rc != 0)
> > +                     return rc;
> > +     }
> >
> >       return 0;
> >  }
> > diff --git a/drivers/net/bonding/rte_eth_bond_8023ad_private.h
> b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
> > index 0f490a5..96a42f2 100644
> > --- a/drivers/net/bonding/rte_eth_bond_8023ad_private.h
> > +++ b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
> > @@ -263,7 +263,7 @@ struct mode8023ad_private {
> >   * @return
> >   *  0 on success, negative value otherwise.
> >   */
> > -void
> > +int
> >  bond_mode_8023ad_activate_slave(struct rte_eth_dev *dev, uint16_t
> port_id);
> >
> >  /**
> > diff --git a/drivers/net/bonding/rte_eth_bond_api.c
> b/drivers/net/bonding/rte_eth_bond_api.c
> > index aa89425..96aa1ff 100644
> > --- a/drivers/net/bonding/rte_eth_bond_api.c
> > +++ b/drivers/net/bonding/rte_eth_bond_api.c
> > @@ -69,14 +69,15 @@
> >       return 0;
> >  }
> >
> > -void
> > +int
> >  activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id)
> >  {
> >       struct bond_dev_private *internals = eth_dev->data->dev_private;
> >       uint8_t active_count = internals->active_slave_count;
> >
> >       if (internals->mode == BONDING_MODE_8023AD)
> > -             bond_mode_8023ad_activate_slave(eth_dev, port_id);
> > +             if (bond_mode_8023ad_activate_slave(eth_dev, port_id) !=
> 0)
> > +                     return -1;
> >
> >       if (internals->mode == BONDING_MODE_TLB
> >                       || internals->mode == BONDING_MODE_ALB) {
> > @@ -357,10 +358,17 @@
> >                               bond_ethdev_primary_set(internals,
> >                                                       slave_port_id);
> >
> > -                     if (find_slave_by_id(internals->active_slaves,
> > -                                          internals->active_slave_count,
> > -                                          slave_port_id) ==
> internals->active_slave_count)
> > -                             activate_slave(bonded_eth_dev,
> slave_port_id);
> > +                     int rc =
>
> There's no need for the rc variables, the existing check would suffice here
>
> > +                             find_slave_by_id(internals->active_slaves,
> > +                                     internals->active_slave_count,
> > +                                     slave_port_id);
> > +
> > +                     if (rc == internals->active_slave_count) {
> > +                             int rc = activate_slave(bonded_eth_dev,
> > +                                                     slave_port_id);
> > +                             if (rc != 0)
> > +                                     return -1;
> and this could be
>
> if (activate_slave(bonded_eth_dev, slave_port_id))
>         return -1;
>
> > +                     }
> >               }
> >       }
> >
> > diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c
> b/drivers/net/bonding/rte_eth_bond_pmd.c
> > index 2805c71..2d9052d 100644
> > --- a/drivers/net/bonding/rte_eth_bond_pmd.c
> > +++ b/drivers/net/bonding/rte_eth_bond_pmd.c
> > @@ -1741,8 +1741,10 @@ struct bwg_slave {
> >               /* Any memory allocation failure in initialization is
> critical because
> >                * resources can't be free, so reinitialization is
> impossible. */
> >               if (port->slow_pool == NULL) {
> > -                     rte_panic("Slave %u: Failed to create memory pool
> '%s': %s\n",
> > -                             slave_id, mem_name,
> rte_strerror(rte_errno));
> > +                     RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create
> memory pool '%s': %s\n",
> > +                             __func__, slave_id,
> > +                             mem_name, rte_strerror(rte_errno));
> > +                     return -1;
> >               }
> >       }
> >
> > @@ -2673,7 +2675,8 @@ struct bwg_slave {
> >                       mac_address_slaves_update(bonded_eth_dev);
> >               }
> >
> > -             activate_slave(bonded_eth_dev, port_id);
> > +             if (activate_slave(bonded_eth_dev, port_id) != 0)
> > +                     return -1;
>
> it's more consistent with the rest of the function to do,
>
> if(activate_slave(bonded_eth_dev, port_id))
>         return rc;
>
> There's other places through the patches where "!= 0" is used but not
> really needed
>
> >
> >               /* If user has defined the primary port then default to
> using it */
> >               if (internals->user_defined_primary_port &&
> > diff --git a/drivers/net/bonding/rte_eth_bond_private.h
> b/drivers/net/bonding/rte_eth_bond_private.h
> > index 94eca88..d99d42c 100644
> > --- a/drivers/net/bonding/rte_eth_bond_private.h
> > +++ b/drivers/net/bonding/rte_eth_bond_private.h
> > @@ -187,7 +187,7 @@ struct bond_dev_private {
> >  void
> >  deactivate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
> >
> > -void
> > +int
> >  activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
> >
> >  void
> >
>
>


-- 

*Arnon Warshavsky*
*Qwilt | work: +972-72-2221634 | mobile: +972-50-8583058 | arnon@qwilt.com
<arnon@qwilt.com>*

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v6 0/5] introduce new tunnel types
  2018-04-13 11:02  3% ` [dpdk-dev] [PATCH v4 " Xueming Li
  2018-04-17 15:04  3%   ` [dpdk-dev] [PATCH v5 0/4] " Xueming Li
@ 2018-04-20 11:56  3%   ` Xueming Li
  2018-04-23 12:16  3%     ` [dpdk-dev] [PATCH v7 " Xueming Li
  1 sibling, 1 reply; 200+ results
From: Xueming Li @ 2018-04-20 11:56 UTC (permalink / raw)
  To: Iremonger Bernard, Wenzhuo Lu, Jingjing Wu, Thomas Monjalon,
	Adrien Mazarguil
  Cc: Xueming Li, Nelio Laranjeiro, Shahaf Shuler, dev, Olivier Matz


v6:
- Add MPLS-in-GRE and MPLS-in-UDP back
- UPdate comment alignment 
v5:
- Fixed VXLAN-GPE comment alignment
- Removed MPLS-in-GRE and MPLS-in-UDP patch
v4:
- Update testpmd doc for flow VXLAN-GPE paramter.
v3:
- Change VXLAN-GPE definition order to avoid ABI compatibility issue.
v2:
- Split patch set into public and mlx5 two series, this one is the first.
v1:
- Support new tunnel type MPLS-in-GRE and MPLS-in-UDP
- Remove deprecation notes of rss level

This patchset introduced new tunnel type and related testpmd code:
- New tunnel type VXLAN-GPE
  https://datatracker.ietf.org/doc/draft-ietf-nvo3-vxlan-gpe/
- New tunnel type MPLS-in-GRE
  https://tools.ietf.org/html/rfc4023
- New tunnel type MPLS-in-UDP
  https://tools.ietf.org/html/rfc7510
- Support GRE extension in testpmd csum forwarding engine

Xueming Li (5):
  doc: remove RSS configuration change announcement
  ethdev: introduce new tunnel VXLAN-GPE
  ethdev: introduce tunnel type MPLS-in-GRE and MPLS-in-UDP
  app/testpmd: introduce new tunnel VXLAN-GPE
  app/testpmd: add more GRE extension support to csum engine

 app/test-pmd/cmdline_flow.c                 |  24 +++++++
 app/test-pmd/config.c                       |   2 +
 app/test-pmd/csumonly.c                     | 103 +++++++++++++++++++++++++---
 app/test-pmd/parameters.c                   |  12 +++-
 app/test-pmd/testpmd.h                      |   2 +
 doc/guides/prog_guide/rte_flow.rst          |  12 ++++
 doc/guides/rel_notes/deprecation.rst        |   4 --
 doc/guides/testpmd_app_ug/run_app.rst       |   5 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   4 ++
 lib/librte_ether/rte_eth_ctrl.h             |   3 +-
 lib/librte_ether/rte_flow.c                 |   1 +
 lib/librte_ether/rte_flow.h                 |  27 ++++++++
 lib/librte_mbuf/rte_mbuf.c                  |   3 +
 lib/librte_mbuf/rte_mbuf.h                  |   1 +
 lib/librte_mbuf/rte_mbuf_ptype.c            |   3 +
 lib/librte_mbuf/rte_mbuf_ptype.h            |  47 +++++++++++++
 lib/librte_net/rte_ether.h                  |  25 +++++++
 17 files changed, 261 insertions(+), 17 deletions(-)

-- 
2.13.3

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v3 2/4] ethdev: add packet field set aciton in flow API
  @ 2018-04-20  8:54  3%         ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-20  8:54 UTC (permalink / raw)
  To: Zhang, Qi Z
  Cc: dev, Doherty, Declan, Chandran, Sugesh, Glynn, Michael J, Liu,
	Yu Y, Ananyev, Konstantin, Richardson, Bruce

On Fri, Apr 20, 2018 at 02:24:20AM +0000, Zhang, Qi Z wrote:
> Hi Adrien:
> 
> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Thursday, April 19, 2018 10:49 PM
> > To: Zhang, Qi Z <qi.z.zhang@intel.com>
> > Cc: dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>; Chandran,
> > Sugesh <sugesh.chandran@intel.com>; Glynn, Michael J
> > <michael.j.glynn@intel.com>; Liu, Yu Y <yu.y.liu@intel.com>; Ananyev,
> > Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>
> > Subject: Re: [PATCH v3 2/4] ethdev: add packet field set aciton in flow API
> > 
> > Typo in commit title: aciton => action
> > 
> > On Mon, Apr 16, 2018 at 02:10:40PM +0800, Qi Zhang wrote:
> > > Add new action RTE_FLOW_ACTION_TYPE_FIELD_SET, it is used to modify
> > > fields of specific protocol layer of the packet, the action only apply
> > > on packets that contain the requireds protocol layer.
> > 
> > requireds => required
> > 
> > >
> > > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > 
> > (more below)
> > 
> > > ---
> > >  doc/guides/prog_guide/rte_flow.rst | 30
> > +++++++++++++++++++++++++++
> > >  lib/librte_ether/rte_flow.h        | 42
> > +++++++++++++++++++++++++++++++++++++-
> > >  2 files changed, 71 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/doc/guides/prog_guide/rte_flow.rst
> > > b/doc/guides/prog_guide/rte_flow.rst
> > > index 99468bf60..68deb9812 100644
> > > --- a/doc/guides/prog_guide/rte_flow.rst
> > > +++ b/doc/guides/prog_guide/rte_flow.rst
> > > @@ -1574,6 +1574,36 @@ fields in the pattern items.
> > >     | 1     | END      |
> > >     +-------+----------+
> > >
> > > +Action: ``FILED_SET``
> > > +^^^^^^^^^^^^^^^^^^^^^
> > 
> > FILED_SET => FIELD_SET
> > 
> > > +
> > > +Modify the value of fields in a protocol layer, only applies to
> > > +packets that contain respective protocol layer.
> > > +
> > > +.. _table_rte_flow_action_field_set:
> > > +
> > > +.. table:: FIELD_SET
> > > +
> > > +   +---------------+-------------------------------------------------------------------+
> > > +   | Field         | Value
> > |
> > > +
> > +===============+==============================================
> > =====================+
> > > +   | ``type``      | Specify the type of a protocol layer. (see
> > RTE_FLOW_ITEM_TYPE_*)  |
> > > +   +---------------+-------------------------------------------------------------------+
> > > +   | ``dir_level`` | Specify the level of matched protocol layer.
> > |
> > > +   |               | direction (1b)
> > |
> > > +   |               | 0: match start from outermost.
> > |
> > > +   |               | 1: match start from innermost.
> > |
> > 
> > Please remove the direction part. What devices can match is always
> > outermost up to the point where they can't recognize an inner header.
> > "innermost" is almost guaranteed to never have the desired effect.
> > 
> > > +   |               | level: (31b)
> > |
> > > +   |               | 0: outermost or innermost protocol layer that
> > matched @type       |
> > > +   |               | 1: next to outmost or innermost protocol layer
> > that matched @type |
> > > +   |               | 2: and so on ...
> > |
> > 
> > Then you can remove any reference to dir_level from here.
> > 
> > > +   +---------------+-------------------------------------------------------------------+
> > > +   |  ``new_val``  | Pointer to specific data structure according to
> > protocol type,    |
> > > +   |               | the content is the new value to updtae.
> > |
> > 
> > updtae => update
> > 
> > > +   +---------------+-------------------------------------------------------------------+
> > > +   |  ``mask``     | Bit-mask applied to new_val
> > |
> > > +
> > > + +---------------+---------------------------------------------------
> > > + ----------------+
> > > +
> > >  Negative types
> > >  ~~~~~~~~~~~~~~
> > >
> > > diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> > > index f84bbfda5..2dc95b6b8 100644
> > > --- a/lib/librte_ether/rte_flow.h
> > > +++ b/lib/librte_ether/rte_flow.h
> > > @@ -1245,7 +1245,15 @@ enum rte_flow_action_type {
> > >  	 *
> > >  	 * See struct rte_flow_action_security.
> > >  	 */
> > > -	RTE_FLOW_ACTION_TYPE_SECURITY
> > > +	RTE_FLOW_ACTION_TYPE_SECURITY,
> > > +
> > > +	/**
> > > +	 * Modify the value of fields in a protocol layer, only applies to
> > > +	 * packets that contain respective protocol layer.
> > > +	 *
> > > +	 * See struct rte_flow_action_field_set.
> > > +	 */
> > > +	RTE_FLOW_ACTION_TYPE_FIELD_SET,
> > >  };
> > >
> > >  /**
> > > @@ -1384,6 +1392,38 @@ struct rte_flow_action_security {  };
> > >
> > >  /**
> > > + * RTE_FLOW_ACTION_TYPE_FIELD_SET
> > > + *
> > > + * Modify the value of fields in a protocol layer, only applies to
> > > + * packets that contain respective protocol layer.
> > > + */
> > > +struct rte_flow_action_field_set {
> > > +	/**
> > > +	 * Specify the type of a protocol layer.
> > > +	 */
> > > +	enum rte_flow_item_type type;
> > > +	/**
> > > +	 * Specify the level of matched protocol layer.
> > > +	 *
> > > +	 * direction (1b)
> > > +	 * 0: match start from outermost.
> > > +	 * 1: match start from innermost.
> > > +	 *
> > > +	 * level (31b)
> > > +	 * 0: outermost|innermost protocol layer that matched @type.
> > > +	 * 1: next to outermost|innermost protocol layer that matched @type.
> > > +	 * 2: and so on ...
> > > +	 */
> > > +	uint32_t dir_level;
> > 
> > See above regarding this field.
> > 
> > > +	/**
> > > +	 * Pointer to specific data structure according to protocol type,
> > > +	 * the content is the new value to update.
> > > +	 */
> > > +	const void *new_val;
> > > +	const void *mask; /**< Bit-mask applied to new_val. */ };
> > > +
> > > +/**
> > >   * Definition of a single action.
> > >   *
> > >   * A list of actions is terminated by a END action.
> > > --
> > > 2.13.6
> > >
> > 
> > Testpmd implementation and documentation update are also missing,
> > however
> > I'm still not convinced by the definition of this new action, it seems too
> > generic to be useful (e.g. compare this with a dedicated "update destination
> > IPv4 address" action for instance).
> > 
> > What existing HW capabilities do you intend to expose through this, what
> > kind of fields can be updated at this point?
> 
> For our device, there will be more than 20 actions if we create an action for each field like "RTE_FLOW_ACTION_TYPE_IPV4_ADDR_SET",
> More detail, that will cover fields in IPV4/IPV6/Ether/ICMP/ND/ARP, so I think a generic field set action would be better.

I see. You know, I think adding 20+ focused actions would be fine actually,
easier to document, report as HW capabilities and to use by applications
(e.g. see issue you raise about adding testpmd support below).

In all the protocols you mention, is the device able to really update *all*
fields or only the "usual" set? IPv4 src/dst probably, now what about ToS,
packet ID, fragment offset and so on? Same for other protocols, should we
care about protocol fields that applications rarely set (if at all),
especially if no device can update them?

> For testpmd support, Seems there is no reference to enable an action with void parameters, so it may take me time to figure out a solution,

Dedicated actions on the other hand should be way easier to add :)

> I'm not sure I can capture this on the 18.05, so is that possible we just deferred testpmd support for this action? since I saw action like rte_flow_action_security also don't have
> testpmd support yet.

I know, there are several problematic actions without testpmd
support. However those are often associated with experimental APIs and may
be modified without prior notice.

Keep in mind there is no need to rush. Adding rte_flow actions doesn't hurt
ABI and can be done for any DPDK release, even part of the same series as
the first PMD implementation which must be validated somehow (no one ever
submits code that can't be tested, right?), this is why testpmd support is
mandatory.

> > If it's still unclear, I suggest to remove this patch from the series or at
> > the very least mark it as experimental. You can even provide a forward
> > declaration without the contents of struct rte_flow_action_field_set to
> > prevent applications from using it before it's finalized.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v2] vhost: add virtio configuration space messages
  2018-04-19 14:39  0%                 ` Maxime Coquelin
@ 2018-04-20  0:32  0%                   ` Liu, Changpeng
  0 siblings, 0 replies; 200+ results
From: Liu, Changpeng @ 2018-04-20  0:32 UTC (permalink / raw)
  To: Maxime Coquelin, Kulasek, TomaszX, yliu
  Cc: Verkamp, Daniel, Harris, James R, Wodkowski, PawelX, dev, Tan, Jianfeng

Hi Maxime,

We'll submit a new v3 version soon after tested with QEMU 2.12 release.

> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Thursday, April 19, 2018 10:40 PM
> To: Liu, Changpeng <changpeng.liu@intel.com>; Kulasek, TomaszX
> <tomaszx.kulasek@intel.com>; yliu@fridaylinux.org
> Cc: Verkamp, Daniel <daniel.verkamp@intel.com>; Harris, James R
> <james.r.harris@intel.com>; Wodkowski, PawelX <pawelx.wodkowski@intel.com>;
> dev@dpdk.org; Tan, Jianfeng <jianfeng.tan@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v2] vhost: add virtio configuration space messages
> 
> Hi Changpeng, Tomasz,
> 
> Any chance that you resubmit the series now that the Qemu changes adding
> a protocol feature flag has been accepted?
> 
> Cheers,
> Maxime
> On 03/28/2018 12:56 PM, Maxime Coquelin wrote:
> >
> >
> > On 03/28/2018 12:23 PM, Liu, Changpeng wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >>> Sent: Wednesday, March 28, 2018 6:11 PM
> >>> To: Liu, Changpeng <changpeng.liu@intel.com>; Kulasek, TomaszX
> >>> <tomaszx.kulasek@intel.com>; yliu@fridaylinux.org
> >>> Cc: Verkamp, Daniel <daniel.verkamp@intel.com>; Harris, James R
> >>> <james.r.harris@intel.com>; Wodkowski, PawelX
> >>> <pawelx.wodkowski@intel.com>; dev@dpdk.org; Tan, Jianfeng
> >>> <jianfeng.tan@intel.com>
> >>> Subject: Re: [dpdk-dev] [PATCH v2] vhost: add virtio configuration space
> >>> messages
> >>>
> >>>
> >>>
> >>> On 03/28/2018 12:03 PM, Liu, Changpeng wrote:
> >>>>
> >>>>
> >>>>> -----Original Message-----
> >>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >>>>> Sent: Wednesday, March 28, 2018 5:58 PM
> >>>>> To: Liu, Changpeng <changpeng.liu@intel.com>; Kulasek, TomaszX
> >>>>> <tomaszx.kulasek@intel.com>; yliu@fridaylinux.org
> >>>>> Cc: Verkamp, Daniel <daniel.verkamp@intel.com>; Harris, James R
> >>>>> <james.r.harris@intel.com>; Wodkowski, PawelX
> >>>>> <pawelx.wodkowski@intel.com>; dev@dpdk.org; Tan, Jianfeng
> >>>>> <jianfeng.tan@intel.com>
> >>>>> Subject: Re: [dpdk-dev] [PATCH v2] vhost: add virtio configuration
> >>>>> space
> >>>>> messages
> >>>>>
> >>>>>
> >>>>>
> >>>>> On 03/28/2018 11:50 AM, Liu, Changpeng wrote:
> >>>>>>
> >>>>>>
> >>>>>>> -----Original Message-----
> >>>>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >>>>>>> Sent: Wednesday, March 28, 2018 5:12 PM
> >>>>>>> To: Kulasek, TomaszX <tomaszx.kulasek@intel.com>;
> >>>>>>> yliu@fridaylinux.org
> >>>>>>> Cc: Verkamp, Daniel <daniel.verkamp@intel.com>; Harris, James R
> >>>>>>> <james.r.harris@intel.com>; Wodkowski, PawelX
> >>>>>>> <pawelx.wodkowski@intel.com>; dev@dpdk.org; Liu, Changpeng
> >>>>>>> <changpeng.liu@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>
> >>>>>>> Subject: Re: [dpdk-dev] [PATCH v2] vhost: add virtio
> >>>>>>> configuration space
> >>>>>>> messages
> >>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> On 03/27/2018 05:35 PM, Tomasz Kulasek wrote:
> >>>>>>>> This patch adds new vhost user messages GET_CONFIG and SET_CONFIG
> >>>>> used
> >>>>>>>> for get/set virtio device's configuration space.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
> >>>>>>>> Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
> >>>>>>>> ---
> >>>>>>>> Changes in v2:
> >>>>>>>>      - code cleanup
> >>>>>>>>
> >>>>>>>>      lib/librte_vhost/rte_vhost.h  |  4 ++++
> >>>>>>>>      lib/librte_vhost/vhost_user.c | 22 ++++++++++++++++++++++
> >>>>>>>>      lib/librte_vhost/vhost_user.h | 16 ++++++++++++++++
> >>>>>>>>      3 files changed, 42 insertions(+)
> >>>>>>>>
> >>>>>>>> diff --git a/lib/librte_vhost/rte_vhost.h
> >>>>>>>> b/lib/librte_vhost/rte_vhost.h
> >>>>>>>> index d332069..fe30518 100644
> >>>>>>>> --- a/lib/librte_vhost/rte_vhost.h
> >>>>>>>> +++ b/lib/librte_vhost/rte_vhost.h
> >>>>>>>> @@ -84,6 +84,10 @@ struct vhost_device_ops {
> >>>>>>>>          int (*new_connection)(int vid);
> >>>>>>>>          void (*destroy_connection)(int vid);
> >>>>>>>>
> >>>>>>>> +    int (*get_config)(int vid, uint8_t *config, uint32_t
> >>>>>>>> config_len);
> >>>>>>>> +    int (*set_config)(int vid, uint8_t *config, uint32_t offset,
> >>>>>>>> +            uint32_t len, uint32_t flags);
> >>>>>>>> +
> >>>>>>>>          void *reserved[2]; /**< Reserved for future extension */
> >>>>>>>
> >>>>>>> You are breaking the ABI, as you grow the size of the ops struct.
> >>>>>>>
> >>>>>>> Also, I'm wondering if we shouldn't have a different ops for
> >>>>>>> external
> >>>>>>> backends. Here these ops are more intended to the application, we
> >>>>>>> could
> >>>>>>> have a specific ops struct for external backends IMHO.
> >>>>>>>
> >>>>>>>>      };
> >>>>>>>>
> >>>>>>>> diff --git a/lib/librte_vhost/vhost_user.c
> >>>>>>>> b/lib/librte_vhost/vhost_user.c
> >>>>>>>> index 90ed211..0ed6a5a 100644
> >>>>>>>> --- a/lib/librte_vhost/vhost_user.c
> >>>>>>>> +++ b/lib/librte_vhost/vhost_user.c
> >>>>>>>> @@ -50,6 +50,8 @@ static const char
> >>>>> *vhost_message_str[VHOST_USER_MAX]
> >>>>>>> = {
> >>>>>>>>          [VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
> >>>>>>>>          [VHOST_USER_SET_SLAVE_REQ_FD]  =
> >>>>>>> "VHOST_USER_SET_SLAVE_REQ_FD",
> >>>>>>>>          [VHOST_USER_IOTLB_MSG]  = "VHOST_USER_IOTLB_MSG",
> >>>>>>>> +    [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
> >>>>>>>> +    [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
> >>>>>>>>      };
> >>>>>>>>
> >>>>>>>>      static uint64_t
> >>>>>>>> @@ -1355,6 +1357,7 @@ vhost_user_msg_handler(int vid, int fd)
> >>>>>>>>           * would cause a dead lock.
> >>>>>>>>           */
> >>>>>>>>          switch (msg.request.master) {
> >>>>>>>> +    case VHOST_USER_SET_CONFIG:
> >>>>>>>
> >>>>>>> It seems VHOST_USER_GET_CONFIG is missing here.
> >>>>>>>
> >>>>>>>>          case VHOST_USER_SET_FEATURES:
> >>>>>>>>          case VHOST_USER_SET_PROTOCOL_FEATURES:
> >>>>>>>>          case VHOST_USER_SET_OWNER:
> >>>>>>>> @@ -1380,6 +1383,25 @@ vhost_user_msg_handler(int vid, int fd)
> >>>>>>>>          }
> >>>>>>>>
> >>>>>>>>          switch (msg.request.master) {
> >>>>>>>> +    case VHOST_USER_GET_CONFIG:
> >>>>>>>> +        if (dev->notify_ops->get_config(dev->vid,
> >>>>>>> Please check ->get_config is set before calling it.
> >>>>>>>
> >>>>>>>> +                msg.payload.config.region,
> >>>>>>>> +                msg.payload.config.size) != 0) {
> >>>>>>>> +            msg.size = sizeof(uint64_t);
> >>>>>>>> +        }
> >>>>>>>> +        send_vhost_reply(fd, &msg);
> >>>>>>>> +        break;
> >>>>>>>> +    case VHOST_USER_SET_CONFIG:
> >>>>>>>> +        if ((dev->notify_ops->set_config(dev->vid,
> >>>>>>> Ditto.
> >>>>>>>
> >>>>>>>> +                msg.payload.config.region,
> >>>>>>>> +                msg.payload.config.offset,
> >>>>>>>> +                msg.payload.config.size,
> >>>>>>>> +                msg.payload.config.flags)) != 0) {
> >>>>>>>> +            ret = 1;
> >>>>>>>> +        } else {
> >>>>>>>> +            ret = 0;
> >>>>>>>> +        }
> >>>>>>>
> >>>>>>> ret = dev->notify_ops->set_config instead?
> >>>>>>>> +        break;
> >>>>>>>>          case VHOST_USER_GET_FEATURES:
> >>>>>>>>              msg.payload.u64 = vhost_user_get_features(dev);
> >>>>>>>>              msg.size = sizeof(msg.payload.u64);
> >>>>>>>> diff --git a/lib/librte_vhost/vhost_user.h
> >>>>>>>> b/lib/librte_vhost/vhost_user.h
> >>>>>>>> index d4bd604..25cc026 100644
> >>>>>>>> --- a/lib/librte_vhost/vhost_user.h
> >>>>>>>> +++ b/lib/librte_vhost/vhost_user.h
> >>>>>>>> @@ -14,6 +14,11 @@
> >>>>>>>>
> >>>>>>>>      #define VHOST_MEMORY_MAX_NREGIONS 8
> >>>>>>>>
> >>>>>>>> +/*
> >>>>>>>> + * Maximum size of virtio device config space
> >>>>>>>> + */
> >>>>>>>> +#define VHOST_USER_MAX_CONFIG_SIZE 256
> >>>>>>>> +
> >>>>>>>>      #define VHOST_USER_PROTOCOL_F_MQ    0
> >>>>>>>>      #define VHOST_USER_PROTOCOL_F_LOG_SHMFD    1
> >>>>>>>>      #define VHOST_USER_PROTOCOL_F_RARP    2
> >>>>>>>
> >>>>>>> Shouldn't there be a protocol feature associated to these new
> >>>>>>> messages?
> >>>>>>> Else how QEMU knows the backend supports it or not?
> >>>>>>>
> >>>>>>> I looked at QEMU code and indeed no protocol feature associated,
> >>>>>>> that's
> >>>>>>> strange...
> >>>>>> Nice to have, for now not all the QEMU host driver need to get this
> >>>>> configuration space from slave backend
> >>>>>> when getting start. This message can be used for migration of
> >>>>>> vhost-user
> >>>>> devices.
> >>>>>
> >>>>> So if QEMU sends this message but the DPDK version does not support it
> >>>>> yet, vhost_user_msg_handler() will return an error ("vhost read
> >>>>> incorrect message") and the socket will be closed.
> >>>>>
> >>>>> How do we overcome this? I think we really need a spec update ASAP,
> >>>>> before QEMU v2.12 is out (-rc1 already).
> >>>>>
> >>>>> Do you have time to take care of this?
> >>>> For now there are no other users except us care about this message,
> >>>> :), it's no
> >>> hurry.
> >>>> I can take this after QEMU 2.12 release adding a new protocol
> >>>> feature bit.
> >>>
> >>> Are you sure?
> >>> If I understand the code correctly, as the guest writes in config regs
> >>> of a virtio-blk device, .set_config callback will be called.
> >> Exactly.
> >>>
> >>> If you have a vhost-user backend, it will receive the SET_CONFIG
> >>> request, no?
> >> For now this only enabled for QEMU vhost-user-blk driver, QEMU
> >> virtio-blk driver didn't have such issue.
> >
> > Right.
> > But it will be really painful to manage for example for cross-version
> > live migration. Or when you'll want to use QEMU v2.13+ with a DPDK
> > v18.05 backend, the protocol feature won't be negotiated.
> >
> > Really, this is important to get it right at the beginning.
> >
> > Thanks,
> > Maxime
> >>>
> >>> Cheers,
> >>> Maxime
> >>>
> >>>>>
> >>>>> Thanks,
> >>>>> Maxime

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence
  2018-04-19  6:01  2% ` [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence Arnon Warshavsky
  2018-04-19 14:39  3%   ` Burakov, Anatoly
@ 2018-04-19 17:48  0%   ` Aaron Conole
  2018-04-20 13:55  4%     ` Arnon Warshavsky
  1 sibling, 1 reply; 200+ results
From: Aaron Conole @ 2018-04-19 17:48 UTC (permalink / raw)
  To: Arnon Warshavsky
  Cc: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit, dev, Kevin Traynor

Arnon Warshavsky <arnon@qwilt.com> writes:

> Local functions to this file,
> changing from void to int are non-abi-breaking.
> For handling the single function that cannot
> change from void to int due to abi,
> where this is the only place it is called in,
> I added a state variable that is being checked
> right after the call to this function.
>
> --
>
> v4 - fix split literal strings in log messages
>
> Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
> ---

Hi Arnon,

Always happy to see panic calls get removed.  I have some comments inline.

>  lib/librte_eal/bsdapp/eal/eal.c           |  86 ++++++++++++++-------
>  lib/librte_eal/bsdapp/eal/eal_thread.c    |  65 +++++++++++-----
>  lib/librte_eal/common/eal_common_launch.c |  21 ++++++
>  lib/librte_eal/common/include/rte_debug.h |  12 +++
>  lib/librte_eal/linuxapp/eal/eal.c         | 120 ++++++++++++++++++++----------
>  lib/librte_eal/linuxapp/eal/eal_thread.c  |  65 +++++++++++-----
>  6 files changed, 270 insertions(+), 99 deletions(-)
>
> diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
> index d996190..9c2f6f1 100644
> --- a/lib/librte_eal/bsdapp/eal/eal.c
> +++ b/lib/librte_eal/bsdapp/eal/eal.c
> @@ -151,7 +151,7 @@ enum rte_iova_mode
>   * We also don't lock the whole file, so that in future we can use read-locks
>   * on other parts, e.g. memzones, to detect if there are running secondary
>   * processes. */
> -static void
> +static int
>  rte_eal_config_create(void)
>  {
>  	void *rte_mem_cfg_addr;
> @@ -160,60 +160,78 @@ enum rte_iova_mode
>  	const char *pathname = eal_runtime_config_path();
>  
>  	if (internal_config.no_shconf)
> -		return;
> +		return 0;
>  
>  	if (mem_cfg_fd < 0){
>  		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
> -		if (mem_cfg_fd < 0)
> -			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
> +		if (mem_cfg_fd < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
> +					__func__, pathname);
> +			return -1;
> +		}
>  	}
>  
>  	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
>  	if (retval < 0){
>  		close(mem_cfg_fd);
> -		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
> +				__func__, pathname);
> +		return -1;

Previously, it wasn't possible for mem_cfg_fd to be reused after a
failure.  Now it is - please reset it to -1. in these close conditions.

>  	}
>  
>  	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
>  	if (retval < 0){
>  		close(mem_cfg_fd);
> -		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
> -				"process running?\n", pathname);
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'. Is another primary process running?\n",
> +				__func__, pathname);
> +		return -1;
>  	}
>  
>  	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
>  				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
>  
>  	if (rte_mem_cfg_addr == MAP_FAILED){
> -		rte_panic("Cannot mmap memory for rte_config\n");
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
> +				__func__);
> +		return -1;
>  	}
>  	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
>  	rte_config.mem_config = rte_mem_cfg_addr;
> +
> +	return 0;
>  }
>  
>  /* attach to an existing shared memory config */
> -static void
> +static int
>  rte_eal_config_attach(void)
>  {
>  	void *rte_mem_cfg_addr;
>  	const char *pathname = eal_runtime_config_path();
>  
>  	if (internal_config.no_shconf)
> -		return;
> +		return 0;
>  
>  	if (mem_cfg_fd < 0){
>  		mem_cfg_fd = open(pathname, O_RDWR);
> -		if (mem_cfg_fd < 0)
> -			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
> +		if (mem_cfg_fd < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
> +					__func__, pathname);
> +			return -1;
> +		}
>  	}
>  
>  	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
>  				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
>  	close(mem_cfg_fd);

Again, previously this would have aborted on a failure.  So it needs to
be reset to a value that allows retry.

> -	if (rte_mem_cfg_addr == MAP_FAILED)
> -		rte_panic("Cannot mmap memory for rte_config\n");
> +	if (rte_mem_cfg_addr == MAP_FAILED) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	rte_config.mem_config = rte_mem_cfg_addr;
> +
> +	return 0;
>  }
>  
>  /* Detect if we are a primary or a secondary process */
> @@ -237,23 +255,28 @@ enum rte_proc_type_t
>  }
>  
>  /* Sets up rte_config structure with the pointer to shared memory config.*/
> -static void
> +static int
>  rte_config_init(void)
>  {
>  	rte_config.process_type = internal_config.process_type;
>  
>  	switch (rte_config.process_type){
>  	case RTE_PROC_PRIMARY:
> -		rte_eal_config_create();
> +		if (rte_eal_config_create())
> +			return -1;
>  		break;
>  	case RTE_PROC_SECONDARY:
> -		rte_eal_config_attach();
> +		if (rte_eal_config_attach())
> +			return -1;
>  		rte_eal_mcfg_wait_complete(rte_config.mem_config);
>  		break;
>  	case RTE_PROC_AUTO:
>  	case RTE_PROC_INVALID:

Not for this patch, but I just noticed that this should probably use a
'default' case.

> -		rte_panic("Invalid process type\n");
> +		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
> +				__func__, rte_config.process_type);
> +		return -1;
>  	}
> +	return 0;
>  }
>  
>  /* display usage */
> @@ -595,7 +618,8 @@ static void rte_eal_init_alert(const char *msg)
>  
>  	rte_srand(rte_rdtsc());
>  
> -	rte_config_init();
> +	if (rte_config_init() != 0)
> +		return -1;

Use rte_eal_init_alert to indicate why you are failing the init.

>  	if (rte_mp_channel_init() < 0) {
>  		rte_eal_init_alert("failed to init mp channel\n");
> @@ -652,7 +676,8 @@ static void rte_eal_init_alert(const char *msg)
>  
>  	eal_check_mem_on_local_socket();
>  
> -	eal_thread_init_master(rte_config.master_lcore);
> +	if (eal_thread_init_master(rte_config.master_lcore) != 0)
> +		return -1;

Is it ever possible to recover from this?  Still needs
rte_eal_init_alert() call.

>  
>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
>  
> @@ -666,18 +691,27 @@ static void rte_eal_init_alert(const char *msg)
>  		 * create communication pipes between master thread
>  		 * and children
>  		 */
> -		if (pipe(lcore_config[i].pipe_master2slave) < 0)
> -			rte_panic("Cannot create pipe\n");
> -		if (pipe(lcore_config[i].pipe_slave2master) < 0)
> -			rte_panic("Cannot create pipe\n");
> +		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
> +					__func__);
> +			return -1;
> +		}
> +		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
> +					__func__);
> +			return -1;
> +		}

How are you cleaning up the threads that were spawned?  Lets say this
loop will execute 5 times, and on the 3rd entry, these errors happen.
You now leave DPDK 'half-initialized' - you've spun up threads and
allocated memory.

Also, again use rte_eal_init_alert().  It was added for a reason :)

>  
>  		lcore_config[i].state = WAIT;
>  
>  		/* create a thread for each lcore */
>  		ret = pthread_create(&lcore_config[i].thread_id, NULL,
>  				     eal_thread_loop, NULL);
> -		if (ret != 0)
> -			rte_panic("Cannot create thread\n");
> +		if (ret != 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
> +					__func__);
> +			return -1;
> +		}

Same question as before.  If pthread_create is failing, there are worse
problems than aborting.

>  		/* Set thread_name for aid in debugging. */
>  		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
> diff --git a/lib/librte_eal/bsdapp/eal/eal_thread.c b/lib/librte_eal/bsdapp/eal/eal_thread.c
> index d602daf..5c3947c 100644
> --- a/lib/librte_eal/bsdapp/eal/eal_thread.c
> +++ b/lib/librte_eal/bsdapp/eal/eal_thread.c
> @@ -51,16 +51,22 @@
>  	n = 0;
>  	while (n == 0 || (n < 0 && errno == EINTR))
>  		n = write(m2s, &c, 1);
> -	if (n < 0)
> -		rte_panic("cannot write on configuration pipe\n");
> +	if (n < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	/* wait ack */
>  	do {
>  		n = read(s2m, &c, 1);
>  	} while (n < 0 && errno == EINTR);
>  
> -	if (n <= 0)
> -		rte_panic("cannot read on configuration pipe\n");
> +	if (n <= 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	return 0;
>  }
> @@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
>  	RTE_PER_LCORE(_lcore_id) = lcore_id;
>  
>  	/* set CPU affinity */
> -	if (eal_thread_set_affinity() < 0)
> -		rte_panic("cannot set affinity\n");
> +	if (eal_thread_set_affinity() < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
> +		rte_move_to_panic_state();
> +	}
> +}
> +
> +/* move to panic state and do not return */
> +static __attribute__((noreturn)) void
> +defunct_and_remain_in_endless_loop(void)
> +{
> +	rte_move_to_panic_state();
> +	while (1)
> +		sleep(1);
>  }

This is worse than a panic.  Users will blame applications for appearing
to freeze.  Please leave the panics in place rather than do this.

>  /* main loop of threads */
> @@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
>  		if (thread_id == lcore_config[lcore_id].thread_id)
>  			break;
>  	}
> -	if (lcore_id == RTE_MAX_LCORE)
> -		rte_panic("cannot retrieve lcore id\n");
> +	if (lcore_id == RTE_MAX_LCORE) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
> +				__func__);
> +		defunct_and_remain_in_endless_loop();
> +	}

I'm not even sure this check has merit, tbh.  Is there ever a chance for
an lcore thread to be spawned like this?  Probably a better patch would
just remove all the code you've inserted, but keep the check you
removed.

>  	m2s = lcore_config[lcore_id].pipe_master2slave[0];
>  	s2m = lcore_config[lcore_id].pipe_slave2master[1];
> @@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
>  	RTE_PER_LCORE(_lcore_id) = lcore_id;
>  
>  	/* set CPU affinity */
> -	if (eal_thread_set_affinity() < 0)
> -		rte_panic("cannot set affinity\n");
> +	if (eal_thread_set_affinity() < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
> +		defunct_and_remain_in_endless_loop();

How does this improve the user experience?

> +	}
>  
>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
>  
> @@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
>  			n = read(m2s, &c, 1);
>  		} while (n < 0 && errno == EINTR);
>  
> -		if (n <= 0)
> -			rte_panic("cannot read on configuration pipe\n");
> +		if (n <= 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();

Same question.  Actually this could happen on shutdown, I think?  If
there's a race where the pipe is torn down before the thread?  Not sure
if there are any ordering guarantees around that.

> +		}
>  
>  		lcore_config[lcore_id].state = RUNNING;
>  
> @@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
>  		n = 0;
>  		while (n == 0 || (n < 0 && errno == EINTR))
>  			n = write(s2m, &c, 1);
> -		if (n < 0)
> -			rte_panic("cannot write on configuration pipe\n");
> -
> -		if (lcore_config[lcore_id].f == NULL)
> -			rte_panic("NULL function pointer\n");
> +		if (n < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();
> +		}
> +
> +		if (lcore_config[lcore_id].f == NULL) {
> +			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();
> +		}

I don't see how any of this is better for the user.  In fact, I think
this is worse because it will make portions of the application stop
working without any way to move forward.  rte_panic() will at least give
the process a chance to recover from a potentially ephemeral condition.

>  		/* call the function and store the return value */
>  		fct_arg = lcore_config[lcore_id].arg;
> diff --git a/lib/librte_eal/common/eal_common_launch.c b/lib/librte_eal/common/eal_common_launch.c
> index fe0ba3f..6f8bd46 100644
> --- a/lib/librte_eal/common/eal_common_launch.c
> +++ b/lib/librte_eal/common/eal_common_launch.c
> @@ -14,6 +14,7 @@
>  #include <rte_pause.h>
>  #include <rte_per_lcore.h>
>  #include <rte_lcore.h>
> +#include <rte_debug.h>
>  
>  /*
>   * Wait until a lcore finished its job.
> @@ -88,3 +89,23 @@ enum rte_lcore_state_t
>  		rte_eal_wait_lcore(lcore_id);
>  	}
>  }
> +
> +/* panic state */
> +static int _panic_state;
> +
> +/**
> + * Check if the system is in panic state
> + * @return int
> + */
> +int rte_get_panic_state(void)
> +{
> +	return _panic_state;
> +}
> +
> +/**
> + * Move the system to be in panic state
> + */
> +void rte_move_to_panic_state(void)
> +{
> +	_panic_state = 1;
> +}
> diff --git a/lib/librte_eal/common/include/rte_debug.h b/lib/librte_eal/common/include/rte_debug.h
> index 272df49..b421d33 100644
> --- a/lib/librte_eal/common/include/rte_debug.h
> +++ b/lib/librte_eal/common/include/rte_debug.h
> @@ -79,4 +79,16 @@ void __rte_panic(const char *funcname , const char *format, ...)
>  }
>  #endif
>  
> +/**
> + * Check if the system is in panic state
> + * @return int
> + */
> +int rte_get_panic_state(void);
> +
> +/**
> + * Move the system to be in panic state
> + */
> +void rte_move_to_panic_state(void);

This seems to only exist as a way of triggering the run_once check in
the eal_init.  It doesn't add anything except one more state variable to
check against.  What is the purpose?

Further, it seems unrelated to removing panics.

> +
>  #endif /* _RTE_DEBUG_H_ */
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index 21afa73..393441a 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -160,7 +160,7 @@ enum rte_iova_mode
>   * We also don't lock the whole file, so that in future we can use read-locks
>   * on other parts, e.g. memzones, to detect if there are running secondary
>   * processes. */
> -static void
> +static int
>  rte_eal_config_create(void)
>  {
>  	void *rte_mem_cfg_addr;
> @@ -169,7 +169,7 @@ enum rte_iova_mode
>  	const char *pathname = eal_runtime_config_path();
>  
>  	if (internal_config.no_shconf)
> -		return;
> +		return 0;
>  
>  	/* map the config before hugepage address so that we don't waste a page */
>  	if (internal_config.base_virtaddr != 0)
> @@ -179,30 +179,39 @@ enum rte_iova_mode
>  	else
>  		rte_mem_cfg_addr = NULL;
>  
> -	if (mem_cfg_fd < 0){
> +	if (mem_cfg_fd < 0) {
>  		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
> -		if (mem_cfg_fd < 0)
> -			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
> +		if (mem_cfg_fd < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
> +				__func__, pathname);
> +			return -1;
> +		}
>  	}
>  
>  	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
> -	if (retval < 0){
> +	if (retval < 0) {
>  		close(mem_cfg_fd);
> -		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
> +				__func__, pathname);
> +		return -1;
>  	}
>  
>  	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
> -	if (retval < 0){
> +	if (retval < 0) {
>  		close(mem_cfg_fd);
> -		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
> -				"process running?\n", pathname);
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'."
> +				" Is another primary process running?\n",
> +				__func__, pathname);
> +		return -1;
>  	}
>  
>  	rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
>  				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
>  
> -	if (rte_mem_cfg_addr == MAP_FAILED){
> -		rte_panic("Cannot mmap memory for rte_config\n");
> +	if (rte_mem_cfg_addr == MAP_FAILED) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
> +			__func__);
> +		return -1;
>  	}
>  	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
>  	rte_config.mem_config = rte_mem_cfg_addr;
> @@ -211,10 +220,11 @@ enum rte_iova_mode
>  	 * processes could later map the config into this exact location */
>  	rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
>  
> +	return 0;
>  }
>  
>  /* attach to an existing shared memory config */
> -static void
> +static int
>  rte_eal_config_attach(void)
>  {
>  	struct rte_mem_config *mem_config;
> @@ -222,33 +232,40 @@ enum rte_iova_mode
>  	const char *pathname = eal_runtime_config_path();
>  
>  	if (internal_config.no_shconf)
> -		return;
> +		return 0;
>  
> -	if (mem_cfg_fd < 0){
> +	if (mem_cfg_fd < 0) {
>  		mem_cfg_fd = open(pathname, O_RDWR);
> -		if (mem_cfg_fd < 0)
> -			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
> +		if (mem_cfg_fd < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
> +						__func__, pathname);
> +			return -1;
> +		}
>  	}
>  
>  	/* map it as read-only first */
>  	mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
>  			PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
> -	if (mem_config == MAP_FAILED)
> -		rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
> -			  errno, strerror(errno));
> +	if (mem_config == MAP_FAILED) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config! error %i (%s)\n",
> +				__func__, errno, strerror(errno));
> +		return -1;
> +	}
>  
>  	rte_config.mem_config = mem_config;
> +
> +	return 0;
>  }
>  
>  /* reattach the shared config at exact memory location primary process has it */
> -static void
> +static int
>  rte_eal_config_reattach(void)
>  {
>  	struct rte_mem_config *mem_config;
>  	void *rte_mem_cfg_addr;
>  
>  	if (internal_config.no_shconf)
> -		return;
> +		return 0;
>  
>  	/* save the address primary process has mapped shared config to */
>  	rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
> @@ -263,16 +280,21 @@ enum rte_iova_mode
>  	if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
>  		if (mem_config != MAP_FAILED)
>  			/* errno is stale, don't use */
> -			rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]"
> -				  " - please use '--base-virtaddr' option\n",
> -				  rte_mem_cfg_addr, mem_config);
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
> +					"rte_config at [%p], got [%p] - please use "
> +					"'--base-virtaddr' option\n",
> +					__func__, rte_mem_cfg_addr, mem_config);
>  		else
> -			rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
> -				  errno, strerror(errno));
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
> +					"rte_config! error %i (%s)\n",
> +					__func__, errno, strerror(errno));
> +		return -1;
>  	}
>  	close(mem_cfg_fd);
>  
>  	rte_config.mem_config = mem_config;
> +
> +	return 0;
>  }
>  
>  /* Detect if we are a primary or a secondary process */
> @@ -296,24 +318,31 @@ enum rte_proc_type_t
>  }
>  
>  /* Sets up rte_config structure with the pointer to shared memory config.*/
> -static void
> +static int
>  rte_config_init(void)
>  {
>  	rte_config.process_type = internal_config.process_type;
>  
>  	switch (rte_config.process_type){
>  	case RTE_PROC_PRIMARY:
> -		rte_eal_config_create();
> +		if (rte_eal_config_create() != 0)
> +			return -1;
>  		break;
>  	case RTE_PROC_SECONDARY:
> -		rte_eal_config_attach();
> +		if (rte_eal_config_attach() != 0)
> +			return -1;
>  		rte_eal_mcfg_wait_complete(rte_config.mem_config);
> -		rte_eal_config_reattach();
> +		if (rte_eal_config_reattach() != 0)
> +			return -1;
>  		break;
>  	case RTE_PROC_AUTO:
>  	case RTE_PROC_INVALID:
> -		rte_panic("Invalid process type\n");
> +		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
> +				__func__, rte_config.process_type);
> +		return -1;
>  	}
> +
> +	return 0;
>  }
>  
>  /* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
> @@ -820,7 +849,8 @@ static void rte_eal_init_alert(const char *msg)
>  
>  	rte_srand(rte_rdtsc());
>  
> -	rte_config_init();
> +	if (rte_config_init() != 0)
> +		return -1;
>  
>  	if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
>  		rte_eal_init_alert("Cannot init logging.");
> @@ -892,6 +922,9 @@ static void rte_eal_init_alert(const char *msg)
>  
>  	eal_thread_init_master(rte_config.master_lcore);
>  
> +	if (rte_get_panic_state())
> +		return -1;
> +

Please just use run_once.  That's a better way of preventing this.

>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
>  
>  	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
> @@ -909,18 +942,27 @@ static void rte_eal_init_alert(const char *msg)
>  		 * create communication pipes between master thread
>  		 * and children
>  		 */
> -		if (pipe(lcore_config[i].pipe_master2slave) < 0)
> -			rte_panic("Cannot create pipe\n");
> -		if (pipe(lcore_config[i].pipe_slave2master) < 0)
> -			rte_panic("Cannot create pipe\n");
> +		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
> +					__func__);
> +			return -1;
> +		}
> +		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
> +					__func__);
> +			return -1;
> +		}
>  
>  		lcore_config[i].state = WAIT;
>  
>  		/* create a thread for each lcore */
>  		ret = pthread_create(&lcore_config[i].thread_id, NULL,
>  				     eal_thread_loop, NULL);
> -		if (ret != 0)
> -			rte_panic("Cannot create thread\n");
> +		if (ret != 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
> +					__func__);
> +			return -1;
> +		}
>  
>  		/* Set thread_name for aid in debugging. */
>  		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
> diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
> index 08e150b..3afcee5 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_thread.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_thread.c

All of the comments from the bsd side apply here.

> @@ -51,16 +51,22 @@
>  	n = 0;
>  	while (n == 0 || (n < 0 && errno == EINTR))
>  		n = write(m2s, &c, 1);
> -	if (n < 0)
> -		rte_panic("cannot write on configuration pipe\n");
> +	if (n < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	/* wait ack */
>  	do {
>  		n = read(s2m, &c, 1);
>  	} while (n < 0 && errno == EINTR);
>  
> -	if (n <= 0)
> -		rte_panic("cannot read on configuration pipe\n");
> +	if (n <= 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	return 0;
>  }
> @@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
>  	RTE_PER_LCORE(_lcore_id) = lcore_id;
>  
>  	/* set CPU affinity */
> -	if (eal_thread_set_affinity() < 0)
> -		rte_panic("cannot set affinity\n");
> +	if (eal_thread_set_affinity() < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
> +		rte_move_to_panic_state();
> +	}
> +}
> +
> +/* move to panic state and do not return */
> +static __attribute__((noreturn)) void
> +defunct_and_remain_in_endless_loop(void)
> +{
> +	rte_move_to_panic_state();
> +	while (1)
> +		sleep(1);
>  }
>  
>  /* main loop of threads */
> @@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
>  		if (thread_id == lcore_config[lcore_id].thread_id)
>  			break;
>  	}
> -	if (lcore_id == RTE_MAX_LCORE)
> -		rte_panic("cannot retrieve lcore id\n");
> +	if (lcore_id == RTE_MAX_LCORE) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
> +				__func__);
> +		defunct_and_remain_in_endless_loop();
> +	}
>  
>  	m2s = lcore_config[lcore_id].pipe_master2slave[0];
>  	s2m = lcore_config[lcore_id].pipe_slave2master[1];
> @@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
>  	RTE_PER_LCORE(_lcore_id) = lcore_id;
>  
>  	/* set CPU affinity */
> -	if (eal_thread_set_affinity() < 0)
> -		rte_panic("cannot set affinity\n");
> +	if (eal_thread_set_affinity() < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
> +		defunct_and_remain_in_endless_loop();
> +	}
>  
>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
>  
> @@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
>  			n = read(m2s, &c, 1);
>  		} while (n < 0 && errno == EINTR);
>  
> -		if (n <= 0)
> -			rte_panic("cannot read on configuration pipe\n");
> +		if (n <= 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();
> +		}
>  
>  		lcore_config[lcore_id].state = RUNNING;
>  
> @@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
>  		n = 0;
>  		while (n == 0 || (n < 0 && errno == EINTR))
>  			n = write(s2m, &c, 1);
> -		if (n < 0)
> -			rte_panic("cannot write on configuration pipe\n");
> -
> -		if (lcore_config[lcore_id].f == NULL)
> -			rte_panic("NULL function pointer\n");
> +		if (n < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();
> +		}
> +
> +		if (lcore_config[lcore_id].f == NULL) {
> +			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();
> +		}
>  
>  		/* call the function and store the return value */
>  		fct_arg = lcore_config[lcore_id].arg;

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 09/11] eal: replace rte_panic instances in ethdev
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 09/11] eal: replace rte_panic instances in ethdev Arnon Warshavsky
@ 2018-04-19 17:27  0%   ` Kevin Traynor
  0 siblings, 0 replies; 200+ results
From: Kevin Traynor @ 2018-04-19 17:27 UTC (permalink / raw)
  To: Arnon Warshavsky, thomas, anatoly.burakov, wenzhuo.lu,
	declan.doherty, jerin.jacob, bruce.richardson, ferruh.yigit
  Cc: dev

On 04/19/2018 07:01 AM, Arnon Warshavsky wrote:
> Local function to this file,
> changing from void to int is non-abi-breaking
> 
> Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
> ---
>  lib/librte_ether/rte_ethdev.c | 36 +++++++++++++++++++++++++-----------
>  1 file changed, 25 insertions(+), 11 deletions(-)
> 
> diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
> index 7821a88..9c13827 100644
> --- a/lib/librte_ether/rte_ethdev.c
> +++ b/lib/librte_ether/rte_ethdev.c
> @@ -194,7 +194,7 @@ enum {
>  	return port_id;
>  }
>  
> -static void
> +static int
>  rte_eth_dev_shared_data_prepare(void)
>  {
>  	const unsigned flags = 0;
> @@ -210,8 +210,12 @@ enum {
>  					rte_socket_id(), flags);
>  		} else
>  			mz = rte_memzone_lookup(MZ_RTE_ETH_DEV_DATA);
> -		if (mz == NULL)
> -			rte_panic("Cannot allocate ethdev shared data\n");
> +		if (mz == NULL) {
> +			rte_spinlock_unlock(&rte_eth_shared_data_lock);
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot allocate ethdev shared data\n",
> +					__func__);
> +			return -1;
> +		}
>  
>  		rte_eth_dev_shared_data = mz->addr;
>  		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> @@ -224,6 +228,8 @@ enum {
>  	}
>  
>  	rte_spinlock_unlock(&rte_eth_shared_data_lock);
> +
> +	return 0;
>  }
>  
>  struct rte_eth_dev *
> @@ -274,7 +280,8 @@ struct rte_eth_dev *
>  	uint16_t port_id;
>  	struct rte_eth_dev *eth_dev = NULL;
>  
> -	rte_eth_dev_shared_data_prepare();
> +	if (rte_eth_dev_shared_data_prepare() != 0)

Lots of "!= 0"'s - you might gather by now that I don't like them :-)

> +		return NULL;
>  
>  	/* Synchronize port creation between primary and secondary threads. */
>  	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
> @@ -317,7 +324,8 @@ struct rte_eth_dev *
>  	uint16_t i;
>  	struct rte_eth_dev *eth_dev = NULL;
>  
> -	rte_eth_dev_shared_data_prepare();
> +	if (rte_eth_dev_shared_data_prepare() != 0)
> +		return NULL;
>  
>  	/* Synchronize port attachment to primary port creation and release. */
>  	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
> @@ -345,7 +353,8 @@ struct rte_eth_dev *
>  	if (eth_dev == NULL)
>  		return -EINVAL;
>  
> -	rte_eth_dev_shared_data_prepare();
> +	if (rte_eth_dev_shared_data_prepare() != 0)
> +		return -1;
>  
>  	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
>  
> @@ -399,7 +408,8 @@ struct rte_eth_dev *
>  int __rte_experimental
>  rte_eth_dev_owner_new(uint64_t *owner_id)
>  {
> -	rte_eth_dev_shared_data_prepare();
> +	if (rte_eth_dev_shared_data_prepare() != 0)
> +		return -1;
>  
>  	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
>  
> @@ -450,7 +460,8 @@ struct rte_eth_dev *
>  {
>  	int ret;
>  
> -	rte_eth_dev_shared_data_prepare();
> +	if (rte_eth_dev_shared_data_prepare() != 0)
> +		return -1;
>  
>  	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
>  
> @@ -467,7 +478,8 @@ struct rte_eth_dev *
>  			{.id = RTE_ETH_DEV_NO_OWNER, .name = ""};
>  	int ret;
>  
> -	rte_eth_dev_shared_data_prepare();
> +	if (rte_eth_dev_shared_data_prepare() != 0)
> +		return -1;
>  
>  	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
>  
> @@ -482,7 +494,8 @@ struct rte_eth_dev *
>  {

hmm, I'm wondering should void __rte_experimental
rte_eth_dev_owner_delete change to return an int, now that there is a
fail case and it is still experimental...?

>  	uint16_t port_id;
>  
> -	rte_eth_dev_shared_data_prepare();
> +	if (rte_eth_dev_shared_data_prepare() != 0)
> +		return;
>  
>  	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
>  
> @@ -502,7 +515,8 @@ struct rte_eth_dev *
>  {
>  	int ret = 0;
>  
> -	rte_eth_dev_shared_data_prepare();
> +	if (rte_eth_dev_shared_data_prepare() != 0)
> +		return -1;
>  
>  	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
>  
> 

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 04/11] ixgbe: replace rte_panic instances in ixgbe driver
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 04/11] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
@ 2018-04-19 17:26  0%   ` Kevin Traynor
  0 siblings, 0 replies; 200+ results
From: Kevin Traynor @ 2018-04-19 17:26 UTC (permalink / raw)
  To: Arnon Warshavsky, thomas, anatoly.burakov, wenzhuo.lu,
	declan.doherty, jerin.jacob, bruce.richardson, ferruh.yigit
  Cc: dev

On 04/19/2018 07:01 AM, Arnon Warshavsky wrote:
> replace panic calls with log and retrun value.

typo return, seems to be in a few commit msgs

> Local function to this file,
> changing from void to int is non-abi-breaking
> 
> Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
> ---
>  drivers/net/ixgbe/ixgbe_ethdev.c |  3 ++-
>  drivers/net/ixgbe/ixgbe_ethdev.h |  2 +-
>  drivers/net/ixgbe/ixgbe_pf.c     | 13 +++++++++----
>  3 files changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
> index a5e2fc0..c7797f1 100644
> --- a/drivers/net/ixgbe/ixgbe_ethdev.c
> +++ b/drivers/net/ixgbe/ixgbe_ethdev.c
> @@ -1224,7 +1224,8 @@ struct rte_ixgbe_xstats_name_off {
>  	memset(hwstrip, 0, sizeof(*hwstrip));
>  
>  	/* initialize PF if max_vfs not zero */
> -	ixgbe_pf_host_init(eth_dev);
> +	if (ixgbe_pf_host_init(eth_dev) != 0)
> +		return -1;

similar comment as previous patch about using an appropriate return value

>  
>  	ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT);
>  	/* let hardware know driver is loaded */
> diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
> index 6550777..8bb41ec 100644
> --- a/drivers/net/ixgbe/ixgbe_ethdev.h
> +++ b/drivers/net/ixgbe/ixgbe_ethdev.h
> @@ -661,7 +661,7 @@ int ixgbe_fdir_filter_program(struct rte_eth_dev *dev,
>  
>  void ixgbe_vlan_hw_strip_config(struct rte_eth_dev *dev);
>  
> -void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
> +int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
>  
>  void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev);
>  
> diff --git a/drivers/net/ixgbe/ixgbe_pf.c b/drivers/net/ixgbe/ixgbe_pf.c
> index 4e61310..4cd3651 100644
> --- a/drivers/net/ixgbe/ixgbe_pf.c
> +++ b/drivers/net/ixgbe/ixgbe_pf.c
> @@ -66,7 +66,7 @@ int ixgbe_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
>  	return 0;
>  }
>  
> -void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
> +int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
>  {
>  	struct ixgbe_vf_info **vfinfo =
>  		IXGBE_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
> @@ -84,11 +84,14 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
>  	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
>  	vf_num = dev_num_vf(eth_dev);
>  	if (vf_num == 0)
> -		return;
> +		return 0;
>  
>  	*vfinfo = rte_zmalloc("vf_info", sizeof(struct ixgbe_vf_info) * vf_num, 0);
> -	if (*vfinfo == NULL)
> -		rte_panic("Cannot allocate memory for private VF data\n");
> +	if (*vfinfo == NULL) {
> +		RTE_LOG(ERR, PMD, "%s() Cannot allocate memory for private VF data\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	memset(mirror_info, 0, sizeof(struct ixgbe_mirror_info));
>  	memset(uta_info, 0, sizeof(struct ixgbe_uta_info));
> @@ -116,6 +119,8 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
>  
>  	/* set mb interrupt mask */
>  	ixgbe_mb_intr_setup(eth_dev);
> +
> +	return 0;
>  }
>  
>  void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev)
> 

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 03/11] e1000: replace rte_panic instances in e1000 driver
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 03/11] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
@ 2018-04-19 17:25  0%   ` Kevin Traynor
  2018-04-20 13:14  0%     ` Arnon Warshavsky
  0 siblings, 1 reply; 200+ results
From: Kevin Traynor @ 2018-04-19 17:25 UTC (permalink / raw)
  To: Arnon Warshavsky, thomas, anatoly.burakov, wenzhuo.lu,
	declan.doherty, jerin.jacob, bruce.richardson, ferruh.yigit
  Cc: dev

On 04/19/2018 07:01 AM, Arnon Warshavsky wrote:
> replace panic calls with log and retrun value.
> Local function to this file,
> changing from void to int is non-abi-breaking
> --
> v4 - keep error message literal string in a singhle line
> 
> Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
> ---
>  drivers/net/e1000/e1000_ethdev.h |  2 +-
>  drivers/net/e1000/igb_ethdev.c   |  3 ++-
>  drivers/net/e1000/igb_pf.c       | 15 +++++++++------
>  3 files changed, 12 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
> index 6354b89..2e527de 100644
> --- a/drivers/net/e1000/e1000_ethdev.h
> +++ b/drivers/net/e1000/e1000_ethdev.h
> @@ -411,7 +411,7 @@ int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
>  /*
>   * misc function prototypes
>   */
> -void igb_pf_host_init(struct rte_eth_dev *eth_dev);
> +int igb_pf_host_init(struct rte_eth_dev *eth_dev);
>  
>  void igb_pf_mbx_process(struct rte_eth_dev *eth_dev);
>  
> diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
> index 9b808a9..4479616 100644
> --- a/drivers/net/e1000/igb_ethdev.c
> +++ b/drivers/net/e1000/igb_ethdev.c
> @@ -833,7 +833,8 @@ static int igb_flex_filter_uninit(struct rte_eth_dev *eth_dev)
>  	}
>  
>  	/* initialize PF if max_vfs not zero */
> -	igb_pf_host_init(eth_dev);
> +	if (igb_pf_host_init(eth_dev) != 0)

You don't need "!= 0"

You need to set "error" here, or else return it from igb_pf_host_init().
We know -ENOMEM is the only error that can be returned from
igb_pf_host_init() but not sure if should assume that.

> +		goto err_late;
>  
>  	ctrl_ext = E1000_READ_REG(hw, E1000_CTRL_EXT);
>  	/* Set PF Reset Done bit so PF/VF Mail Ops can work */
> diff --git a/drivers/net/e1000/igb_pf.c b/drivers/net/e1000/igb_pf.c
> index b9f2e53..ae4b0a4 100644
> --- a/drivers/net/e1000/igb_pf.c
> +++ b/drivers/net/e1000/igb_pf.c
> @@ -63,7 +63,7 @@ int igb_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
>  	return 0;
>  }
>  
> -void igb_pf_host_init(struct rte_eth_dev *eth_dev)
> +int igb_pf_host_init(struct rte_eth_dev *eth_dev)
>  {
>  	struct e1000_vf_info **vfinfo =
>  		E1000_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
> @@ -74,7 +74,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
>  
>  	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
>  	if (0 == (vf_num = dev_num_vf(eth_dev)))
> -		return;
> +		return 0;
>  
>  	if (hw->mac.type == e1000_i350)
>  		nb_queue = 1;
> @@ -82,11 +82,14 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
>  		/* per datasheet, it should be 2, but 1 seems correct */
>  		nb_queue = 1;
>  	else
> -		return;
> +		return 0;
>  
>  	*vfinfo = rte_zmalloc("vf_info", sizeof(struct e1000_vf_info) * vf_num, 0);
> -	if (*vfinfo == NULL)
> -		rte_panic("Cannot allocate memory for private VF data\n");
> +	if (*vfinfo == NULL) {
> +		RTE_LOG(CRIT, PMD, "%s(): Cannot allocate memory for private VF data\n",
> +			__func__);
> +		return -1;
> +	}
>  
>  	RTE_ETH_DEV_SRIOV(eth_dev).active = ETH_8_POOLS;
>  	RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool = nb_queue;
> @@ -98,7 +101,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
>  	/* set mb interrupt mask */
>  	igb_mb_intr_setup(eth_dev);
>  
> -	return;
> +	return 0;
>  }
>  
>  void igb_pf_host_uninit(struct rte_eth_dev *dev)
> 

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 02/11] bond: replace rte_panic instances in bonding driver
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 02/11] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
@ 2018-04-19 17:25  0%   ` Kevin Traynor
  2018-04-20 13:13  0%     ` Arnon Warshavsky
  0 siblings, 1 reply; 200+ results
From: Kevin Traynor @ 2018-04-19 17:25 UTC (permalink / raw)
  To: Arnon Warshavsky, thomas, anatoly.burakov, wenzhuo.lu,
	declan.doherty, jerin.jacob, bruce.richardson, ferruh.yigit
  Cc: dev

On 04/19/2018 07:01 AM, Arnon Warshavsky wrote:
> replace panic calls with log and retrun value.
> Local functions to this file,
> changing from void to int are non-abi-breaking
> --
> v4 - fix split literal strings in log messages
> 
> Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
> ---
>  drivers/net/bonding/rte_eth_bond_8023ad.c         | 28 +++++++++++++++--------
>  drivers/net/bonding/rte_eth_bond_8023ad_private.h |  2 +-
>  drivers/net/bonding/rte_eth_bond_api.c            | 20 +++++++++++-----
>  drivers/net/bonding/rte_eth_bond_pmd.c            |  9 +++++---
>  drivers/net/bonding/rte_eth_bond_private.h        |  2 +-
>  5 files changed, 40 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c b/drivers/net/bonding/rte_eth_bond_8023ad.c
> index c452318..7512901 100644
> --- a/drivers/net/bonding/rte_eth_bond_8023ad.c
> +++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
> @@ -893,7 +893,7 @@
>  			bond_mode_8023ad_periodic_cb, arg);
>  }
>  
> -void
> +int
>  bond_mode_8023ad_activate_slave(struct rte_eth_dev *bond_dev,
>  				uint16_t slave_id)
>  {
> @@ -939,7 +939,7 @@
>  	timer_cancel(&port->warning_timer);
>  
>  	if (port->mbuf_pool != NULL)
> -		return;
> +		return 0;
>  
>  	RTE_ASSERT(port->rx_ring == NULL);
>  	RTE_ASSERT(port->tx_ring == NULL);
> @@ -968,8 +968,9 @@
>  	/* Any memory allocation failure in initialization is critical because
>  	 * resources can't be free, so reinitialization is impossible. */
>  	if (port->mbuf_pool == NULL) {
> -		rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
> -			slave_id, mem_name, rte_strerror(rte_errno));
> +		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create memory pool '%s': %s\n",
> +			__func__, slave_id, mem_name, rte_strerror(rte_errno));
> +		return -1;
>  	}
>  
>  	snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_rx", slave_id);
> @@ -977,8 +978,9 @@
>  			rte_align32pow2(BOND_MODE_8023AX_SLAVE_RX_PKTS), socket_id, 0);
>  
>  	if (port->rx_ring == NULL) {
> -		rte_panic("Slave %u: Failed to create rx ring '%s': %s\n", slave_id,
> -			mem_name, rte_strerror(rte_errno));
> +		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create rx ring '%s': %s\n",
> +			__func__, slave_id, mem_name, rte_strerror(rte_errno));
> +		return -1;
>  	}
>  
>  	/* TX ring is at least one pkt longer to make room for marker packet. */
> @@ -987,9 +989,12 @@
>  			rte_align32pow2(BOND_MODE_8023AX_SLAVE_TX_PKTS + 1), socket_id, 0);
>  
>  	if (port->tx_ring == NULL) {
> -		rte_panic("Slave %u: Failed to create tx ring '%s': %s\n", slave_id,
> -			mem_name, rte_strerror(rte_errno));
> +		RTE_LOG(ERR, PMD, "%s() Slave %u: Fail to create tx ring '%s': %s\n",
> +			__func__, slave_id, mem_name, rte_strerror(rte_errno));
> +		return -1;
>  	}
> +
> +	return 0;
>  }
>  
>  int
> @@ -1143,9 +1148,12 @@
>  	struct bond_dev_private *internals = bond_dev->data->dev_private;
>  	uint8_t i;
>  
> -	for (i = 0; i < internals->active_slave_count; i++)
> -		bond_mode_8023ad_activate_slave(bond_dev,
> +	for (i = 0; i < internals->active_slave_count; i++) {
> +		int rc = bond_mode_8023ad_activate_slave(bond_dev,
>  				internals->active_slaves[i]);
> +		if (rc != 0)
> +			return rc;
> +	}
>  
>  	return 0;
>  }
> diff --git a/drivers/net/bonding/rte_eth_bond_8023ad_private.h b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
> index 0f490a5..96a42f2 100644
> --- a/drivers/net/bonding/rte_eth_bond_8023ad_private.h
> +++ b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
> @@ -263,7 +263,7 @@ struct mode8023ad_private {
>   * @return
>   *  0 on success, negative value otherwise.
>   */
> -void
> +int
>  bond_mode_8023ad_activate_slave(struct rte_eth_dev *dev, uint16_t port_id);
>  
>  /**
> diff --git a/drivers/net/bonding/rte_eth_bond_api.c b/drivers/net/bonding/rte_eth_bond_api.c
> index aa89425..96aa1ff 100644
> --- a/drivers/net/bonding/rte_eth_bond_api.c
> +++ b/drivers/net/bonding/rte_eth_bond_api.c
> @@ -69,14 +69,15 @@
>  	return 0;
>  }
>  
> -void
> +int
>  activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id)
>  {
>  	struct bond_dev_private *internals = eth_dev->data->dev_private;
>  	uint8_t active_count = internals->active_slave_count;
>  
>  	if (internals->mode == BONDING_MODE_8023AD)
> -		bond_mode_8023ad_activate_slave(eth_dev, port_id);
> +		if (bond_mode_8023ad_activate_slave(eth_dev, port_id) != 0)
> +			return -1;
>  
>  	if (internals->mode == BONDING_MODE_TLB
>  			|| internals->mode == BONDING_MODE_ALB) {
> @@ -357,10 +358,17 @@
>  				bond_ethdev_primary_set(internals,
>  							slave_port_id);
>  
> -			if (find_slave_by_id(internals->active_slaves,
> -					     internals->active_slave_count,
> -					     slave_port_id) == internals->active_slave_count)
> -				activate_slave(bonded_eth_dev, slave_port_id);
> +			int rc =

There's no need for the rc variables, the existing check would suffice here

> +				find_slave_by_id(internals->active_slaves,
> +					internals->active_slave_count,
> +					slave_port_id);
> +
> +			if (rc == internals->active_slave_count) {
> +				int rc = activate_slave(bonded_eth_dev,
> +							slave_port_id);
> +				if (rc != 0)
> +					return -1;
and this could be

if (activate_slave(bonded_eth_dev, slave_port_id))
	return -1;

> +			}
>  		}
>  	}
>  
> diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c b/drivers/net/bonding/rte_eth_bond_pmd.c
> index 2805c71..2d9052d 100644
> --- a/drivers/net/bonding/rte_eth_bond_pmd.c
> +++ b/drivers/net/bonding/rte_eth_bond_pmd.c
> @@ -1741,8 +1741,10 @@ struct bwg_slave {
>  		/* Any memory allocation failure in initialization is critical because
>  		 * resources can't be free, so reinitialization is impossible. */
>  		if (port->slow_pool == NULL) {
> -			rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
> -				slave_id, mem_name, rte_strerror(rte_errno));
> +			RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create memory pool '%s': %s\n",
> +				__func__, slave_id,
> +				mem_name, rte_strerror(rte_errno));
> +			return -1;
>  		}
>  	}
>  
> @@ -2673,7 +2675,8 @@ struct bwg_slave {
>  			mac_address_slaves_update(bonded_eth_dev);
>  		}
>  
> -		activate_slave(bonded_eth_dev, port_id);
> +		if (activate_slave(bonded_eth_dev, port_id) != 0)
> +			return -1;

it's more consistent with the rest of the function to do,

if(activate_slave(bonded_eth_dev, port_id))
	return rc;

There's other places through the patches where "!= 0" is used but not
really needed

>  
>  		/* If user has defined the primary port then default to using it */
>  		if (internals->user_defined_primary_port &&
> diff --git a/drivers/net/bonding/rte_eth_bond_private.h b/drivers/net/bonding/rte_eth_bond_private.h
> index 94eca88..d99d42c 100644
> --- a/drivers/net/bonding/rte_eth_bond_private.h
> +++ b/drivers/net/bonding/rte_eth_bond_private.h
> @@ -187,7 +187,7 @@ struct bond_dev_private {
>  void
>  deactivate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
>  
> -void
> +int
>  activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
>  
>  void
> 

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v1 2/6] mempool: implement abstract mempool info API
  @ 2018-04-19 16:42  5%     ` Olivier Matz
  0 siblings, 0 replies; 200+ results
From: Olivier Matz @ 2018-04-19 16:42 UTC (permalink / raw)
  To: Andrew Rybchenko; +Cc: dev, Artem V. Andreev

On Mon, Mar 26, 2018 at 05:12:55PM +0100, Andrew Rybchenko wrote:
> From: "Artem V. Andreev" <Artem.Andreev@oktetlabs.ru>
> 
> Primarily, it is intended as a way for the mempool driver to provide
> additional information on how it lays up objects inside the mempool.
> 
> Signed-off-by: Artem V. Andreev <Artem.Andreev@oktetlabs.ru>
> Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>

I think it's a good idea to have a way to query mempool features
or parameters. The approach chosen in this patch looks similar
to what we have with ethdev devinfo, right?

[...]

>  /**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Additional information about the mempool
> + */
> +struct rte_mempool_info;
> +

[...]

> +/* wrapper to get additional mempool info */
> +int
> +rte_mempool_ops_get_info(const struct rte_mempool *mp,
> +			 struct rte_mempool_info *info)
> +{
> +	struct rte_mempool_ops *ops;
> +
> +	ops = rte_mempool_get_ops(mp->ops_index);
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(ops->get_info, -ENOTSUP);
> +	return ops->get_info(mp, info);
> +}

Thinking in terms of ABI compatibility, it looks that each time we will
add or remove a field, it will break the ABI because the info structure
will change.

Well, it's maybe nitpicking, because most of the time adding a field in
info structure goes with adding a field in the mempool struct, which
will anyway break the ABI.

Another approach is to have a function
rte_mempool_info_contig_block_size(mp) whose ABI can be more easily
wrapped with VERSION_SYMBOL().

On my side I'm fine with your current approach, especially given how few
usages of VERSION_SYMBOL() we can find in DPDK. But in case you feel
it's better to have a function...

^ permalink raw reply	[relevance 5%]

* Re: [dpdk-dev] [PATCH v1 0/6] mempool: add bucket driver
    @ 2018-04-19 16:41  0%   ` Olivier Matz
  1 sibling, 0 replies; 200+ results
From: Olivier Matz @ 2018-04-19 16:41 UTC (permalink / raw)
  To: Andrew Rybchenko; +Cc: dev

Hi Andrew,

Sorry for the late feedback, few comments below.

On Mon, Mar 26, 2018 at 05:12:53PM +0100, Andrew Rybchenko wrote:
> The initial patch series [1] (RFCv1 is [2]) is split into two to simplify
> processing.  It is the second part which relies on the first one [3].
> 
> It should be applied on top of [4] and [3].
> 
> The patch series adds bucket mempool driver which allows to allocate
> (both physically and virtually) contiguous blocks of objects and adds
> mempool API to do it. It is still capable to provide separate objects,
> but it is definitely more heavy-weight than ring/stack drivers.
> The driver will be used by the future Solarflare driver enhancements
> which allow to utilize physical contiguous blocks in the NIC firmware.
> 
> The target usecase is dequeue in blocks and enqueue separate objects
> back (which are collected in buckets to be dequeued). So, the memory
> pool with bucket driver is created by an application and provided to
> networking PMD receive queue. The choice of bucket driver is done using
> rte_eth_dev_pool_ops_supported(). A PMD that relies upon contiguous
> block allocation should report the bucket driver as the only supported
> and preferred one.
> 
> Introduction of the contiguous block dequeue operation is proven by
> performance measurements using autotest with minor enhancements:
>  - in the original test bulks are powers of two, which is unacceptable
>    for us, so they are changed to multiple of contig_block_size;
>  - the test code is duplicated to support plain dequeue and
>    dequeue_contig_blocks;
>  - all the extra test variations (with/without cache etc) are eliminated;
>  - a fake read from the dequeued buffer is added (in both cases) to
>    simulate mbufs access.
> 
> start performance test for bucket (without cache)
> mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Srate_persec=   111935488
> mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Srate_persec=   115290931
> mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Srate_persec=   353055539
> mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Srate_persec=   353330790
> mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Srate_persec=   224657407
> mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Srate_persec=   230411468
> mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Srate_persec=   706700902
> mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Srate_persec=   703673139
> mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Srate_persec=   425236887
> mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Srate_persec=   437295512
> mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Srate_persec=  1343409356
> mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Srate_persec=  1336567397
> start performance test for bucket (without cache + contiguous dequeue)
> mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Crate_persec=   122945536
> mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Crate_persec=   126458265
> mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Crate_persec=   374262988
> mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Crate_persec=   377316966
> mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Crate_persec=   244842496
> mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Crate_persec=   251618917
> mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Crate_persec=   751226060
> mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Crate_persec=   756233010
> mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Crate_persec=   462068120
> mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Crate_persec=   476997221
> mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Crate_persec=  1432171313
> mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Crate_persec=  1438829771
> 
> The number of objects in the contiguous block is a function of bucket
> memory size (.config option) and total element size. In the future
> additional API with possibility to pass parameters on mempool allocation
> may be added.
> 
> It breaks ABI since changes rte_mempool_ops. The ABI version is already
> bumped in [4].
> 
> 
> [1] https://dpdk.org/ml/archives/dev/2018-January/088698.html
> [2] https://dpdk.org/ml/archives/dev/2017-November/082335.html
> [3] https://dpdk.org/ml/archives/dev/2018-March/093807.html
> [4] https://dpdk.org/ml/archives/dev/2018-March/093196.html


As discussed privately, at first glance I was a bit reticent to
introduce a new API in mempool that will only be available in one
mempool driver.

There have been the same debate for several features of ethdev: should
we provide a generic API for a feature available in only one driver? Or
should the driver provide its own API?

Given that the mempool driver API is not that big currently, and that it
can bring a performance enhancement (which is the primary goal of DPDK),
I think we can give a chance to this patchset. Drivers that want to use
this new bucket driver can take advantage of the new API, keeping a
fallback mode to still be working with other mempool drivers.

The bet is:

- drivers and aplication try the bucket driver and its new API, and
  they notice a performance increase
- the get_contig_block API is implemented in some other drivers, if
  possible (not easy for default one at least)
- the bucket driver could become the default driver, if the performance
  increase is significant and wide.

By integrating this patchset, I hope we can also have some feedback
about the performance of this driver in other situations. My worries are
about pipeline+multicore use-cases, where it may add some pressure
(races conditions?) on the bucket header.

Finally, I think (as discussed privately) that the tests should be
updated to be able to reproduce the tests in this cover letter. I just
did a quick test by replacing "stack" by "bucket" in autotest (see patch
below) and it fails in populate(). I did not investigate more, maybe the
parameters are not correct for bucket.

 --- a/test/test/test_mempool.c
 +++ b/test/test/test_mempool.c
 @@ -498,7 +498,7 @@ test_mempool(void)
                 printf("cannot allocate mp_stack mempool\n");
                 goto err;
         }
 -       if (rte_mempool_set_ops_byname(mp_stack, "stack", NULL) < 0) {
 +       if (rte_mempool_set_ops_byname(mp_stack, "bucket", NULL) < 0) {
                 printf("cannot set stack handler\n");
                 goto err;
         }



Thank you for this work, hope we'll be on time for rc1.
Olivier

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v2] cmdline: rework as a wrapper to libedit
  2018-04-17 15:21  1% ` [dpdk-dev] [PATCH v1] " Adrien Mazarguil
  2018-04-17 15:59  0%   ` Burakov, Anatoly
@ 2018-04-19 15:13  1%   ` Adrien Mazarguil
  1 sibling, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 15:13 UTC (permalink / raw)
  To: Olivier Matz
  Cc: dev, Keith Wiles, Jingjing Wu, Thomas Monjalon, Ferruh Yigit,
	Jim Thompson, Anatoly Burakov

Disclaimer: this patch must not be confused with the CLI library [1]
(work in progress) that will eventually supersede librte_cmdline itself
with a different API.

Rather, it modifies librte_cmdline to delegate all the heavy lifting
(terminal and history handling), strips unused features and re-implements
what remains of its public API as a wrapper to the editline library (also
known as libedit) [2], a well-known, BSD-licensed and widely available
library used by many projects which does everything needed and more [3].

This approach was chosen because converting librte_cmdline as a wrapper to
a more capable library was easier and faster than addressing its
shortcomings and results in much less code to maintain in DPDK.

It also provides a drop-in solution for applications that rely on
librte_cmdline. They benefit from greatly improved command line handling
without a meaningful impact on their code base.

The main motivation behind this patch is testpmd's flow (rte_flow) command,
which requires support for dynamic tokens and very long lines that must be
broken down when displayed. This is not supported by librte_cmdline's
limited terminal handling capabilities, resulting in a rather frustrating
user experience.

It had to be addressed given the importance of testpmd as one of the
primary tool used by PMD developers.

This rework results in the following changes:

- Removed circular buffer management interface for command history
  (cmdline_cirbuf.c), command history being handled by libedit.
- Removed raw command-line interpreter (cmdline_rdline.c).
- Removed raw terminal handler (cmdline_vt100.c).
- Removed all test/example code for the above.
- Re-implemented high level interactive and non-interactive command-line
  handlers (cmdline.c and cmdline_socket.c) on top of libedit using its
  native interface, not its readline compatibility layer.
- Made struct cmdline opaque so that applications relying on librte_cmdline
  do not need to include any libedit headers.
- Applications do not need to include cmdline_rdline.h anymore.
- Terminal resizing is now automatically handled.
- New external dependency for applications relying on librte_cmdline.
- Major version bump due to the ABI impact of these changes.

[1] http://dpdk.org/browse/draft/dpdk-draft-cli/
[2] http://thrysoee.dk/editline/
[3] http://netbsd.gw.com/cgi-bin/man-cgi?editline++NetBSD-current

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Olivier Matz <olivier.matz@6wind.com>
Cc: Keith Wiles <keith.wiles@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Jim Thompson <jim@netgate.com>
Cc: Anatoly Burakov <anatoly.burakov@intel.com>

--

v2 changes:

- Replaced an instance of snprintf() with rte_strlcpy() [5].
- Rebased patch.

[5] http://dpdk.org/ml/archives/dev/2018-April/097721.html

v1 changes:

No fundamental change since the original RFC [4], except it's been rebased
several times and Meson build support was added in the meantime. Commit log
was also shortened a bit.

I'm re-sending this because I think it's useful, at least to me (duh). As
the maintainer of rte_flow, I spend most of my time typing flow commands in
testpmd and libedit makes that a pleasant experience.

Try it out! And don't hesitate to send your acked-by line to get this in
time for 18.05 :)

[4] http://dpdk.org/ml/archives/dev/2017-November/081605.html
---
 app/test-pmd/cmdline.c                          |    1 -
 examples/bond/main.c                            |    1 -
 examples/cmdline/commands.c                     |    1 -
 examples/cmdline/main.c                         |    1 -
 examples/multi_process/simple_mp/main.c         |    1 -
 examples/multi_process/simple_mp/mp_commands.c  |    1 -
 examples/qos_sched/cmdline.c                    |    1 -
 examples/quota_watermark/qwctl/commands.c       |    1 -
 examples/quota_watermark/qwctl/qwctl.c          |    1 -
 .../guest_cli/vm_power_cli_guest.c              |    1 -
 examples/vm_power_manager/vm_power_cli.c        |    1 -
 lib/librte_cmdline/Makefile                     |   10 +-
 lib/librte_cmdline/cmdline.c                    |  383 ++++--
 lib/librte_cmdline/cmdline.h                    |   22 +-
 lib/librte_cmdline/cmdline_cirbuf.c             |  412 ------
 lib/librte_cmdline/cmdline_cirbuf.h             |  193 ---
 lib/librte_cmdline/cmdline_parse.c              |    7 +-
 lib/librte_cmdline/cmdline_rdline.c             |  644 ---------
 lib/librte_cmdline/cmdline_rdline.h             |  201 ---
 lib/librte_cmdline/cmdline_socket.c             |   36 +-
 lib/librte_cmdline/cmdline_vt100.c              |  132 --
 lib/librte_cmdline/cmdline_vt100.h              |  100 --
 lib/librte_cmdline/meson.build                  |   18 +-
 lib/librte_cmdline/rte_cmdline_version.map      |   41 +-
 mk/rte.app.mk                                   |    2 +
 test/cmdline_test/cmdline_test.c                |    1 -
 test/cmdline_test/commands.c                    |   69 -
 test/test/Makefile                              |    1 -
 test/test/commands.c                            |    1 -
 test/test/meson.build                           |    1 -
 test/test/test.c                                |    1 -
 test/test/test_cmdline.c                        |    9 -
 test/test/test_cmdline.h                        |    6 -
 test/test/test_cmdline_cirbuf.c                 | 1301 ------------------
 test/test/test_cmdline_lib.c                    |  117 +-
 35 files changed, 303 insertions(+), 3416 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 0b442c3a6..6b4d9dbfd 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -48,7 +48,6 @@
 #include <rte_flow.h>
 #include <rte_gro.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_num.h>
 #include <cmdline_parse_string.h>
diff --git a/examples/bond/main.c b/examples/bond/main.c
index d8edc642b..cd9da146f 100644
--- a/examples/bond/main.c
+++ b/examples/bond/main.c
@@ -42,7 +42,6 @@
 #include <rte_arp.h>
 #include <rte_spinlock.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_num.h>
 #include <cmdline_parse_string.h>
diff --git a/examples/cmdline/commands.c b/examples/cmdline/commands.c
index 06916d783..3e9d84d46 100644
--- a/examples/cmdline/commands.c
+++ b/examples/cmdline/commands.c
@@ -20,7 +20,6 @@
 	#endif
 #endif
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_ipaddr.h>
 #include <cmdline_parse_num.h>
diff --git a/examples/cmdline/main.c b/examples/cmdline/main.c
index f2f2e5a2f..9fe0fdef7 100644
--- a/examples/cmdline/main.c
+++ b/examples/cmdline/main.c
@@ -11,7 +11,6 @@
 #include <termios.h>
 #include <sys/queue.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_socket.h>
 #include <cmdline.h>
diff --git a/examples/multi_process/simple_mp/main.c b/examples/multi_process/simple_mp/main.c
index e6c69d6a3..49d6ed169 100644
--- a/examples/multi_process/simple_mp/main.c
+++ b/examples/multi_process/simple_mp/main.c
@@ -35,7 +35,6 @@
 #include <rte_ring.h>
 #include <rte_log.h>
 #include <rte_mempool.h>
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_string.h>
 #include <cmdline_socket.h>
diff --git a/examples/multi_process/simple_mp/mp_commands.c b/examples/multi_process/simple_mp/mp_commands.c
index e4df6ff01..a9eb8bb44 100644
--- a/examples/multi_process/simple_mp/mp_commands.c
+++ b/examples/multi_process/simple_mp/mp_commands.c
@@ -25,7 +25,6 @@
 #include <rte_mempool.h>
 #include <rte_string_fns.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_string.h>
 #include <cmdline_socket.h>
diff --git a/examples/qos_sched/cmdline.c b/examples/qos_sched/cmdline.c
index 15f51830c..679819a25 100644
--- a/examples/qos_sched/cmdline.c
+++ b/examples/qos_sched/cmdline.c
@@ -7,7 +7,6 @@
 #include <inttypes.h>
 #include <string.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_num.h>
 #include <cmdline_parse_string.h>
diff --git a/examples/quota_watermark/qwctl/commands.c b/examples/quota_watermark/qwctl/commands.c
index a1c646b9f..33fce2063 100644
--- a/examples/quota_watermark/qwctl/commands.c
+++ b/examples/quota_watermark/qwctl/commands.c
@@ -7,7 +7,6 @@
 #include <string.h>
 #include <termios.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_num.h>
 #include <cmdline_parse_string.h>
diff --git a/examples/quota_watermark/qwctl/qwctl.c b/examples/quota_watermark/qwctl/qwctl.c
index 2f7914c80..9f41a684a 100644
--- a/examples/quota_watermark/qwctl/qwctl.c
+++ b/examples/quota_watermark/qwctl/qwctl.c
@@ -13,7 +13,6 @@
 #include <rte_log.h>
 #include <rte_memzone.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_socket.h>
 #include <cmdline.h>
diff --git a/examples/vm_power_manager/guest_cli/vm_power_cli_guest.c b/examples/vm_power_manager/guest_cli/vm_power_cli_guest.c
index 43bdeacef..218ed192e 100644
--- a/examples/vm_power_manager/guest_cli/vm_power_cli_guest.c
+++ b/examples/vm_power_manager/guest_cli/vm_power_cli_guest.c
@@ -8,7 +8,6 @@
 #include <stdio.h>
 #include <termios.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_string.h>
 #include <cmdline_parse_num.h>
diff --git a/examples/vm_power_manager/vm_power_cli.c b/examples/vm_power_manager/vm_power_cli.c
index d588d38aa..99757420a 100644
--- a/examples/vm_power_manager/vm_power_cli.c
+++ b/examples/vm_power_manager/vm_power_cli.c
@@ -10,7 +10,6 @@
 #include <termios.h>
 #include <errno.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_string.h>
 #include <cmdline_parse_num.h>
diff --git a/lib/librte_cmdline/Makefile b/lib/librte_cmdline/Makefile
index ddae1cfde..feb1f1bca 100644
--- a/lib/librte_cmdline/Makefile
+++ b/lib/librte_cmdline/Makefile
@@ -10,28 +10,24 @@ CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
 
 EXPORT_MAP := rte_cmdline_version.map
 
-LIBABIVER := 2
+LIBABIVER := 3
 
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) := cmdline.c
-SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_cirbuf.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_etheraddr.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_ipaddr.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_num.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_string.c
-SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_rdline.c
-SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_vt100.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_socket.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_portlist.c
 
-CFLAGS += -D_GNU_SOURCE
 LDLIBS += -lrte_eal
 
 # install includes
 INCS := cmdline.h cmdline_parse.h cmdline_parse_num.h cmdline_parse_ipaddr.h
-INCS += cmdline_parse_etheraddr.h cmdline_parse_string.h cmdline_rdline.h
-INCS += cmdline_vt100.h cmdline_socket.h cmdline_cirbuf.h cmdline_parse_portlist.h
+INCS += cmdline_parse_etheraddr.h cmdline_parse_string.h
+INCS += cmdline_socket.h cmdline_parse_portlist.h
 SYMLINK-$(CONFIG_RTE_LIBRTE_CMDLINE)-include := $(INCS)
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_cmdline/cmdline.c b/lib/librte_cmdline/cmdline.c
index 591b78b0f..1c45cd9ff 100644
--- a/lib/librte_cmdline/cmdline.c
+++ b/lib/librte_cmdline/cmdline.c
@@ -4,79 +4,183 @@
  * All rights reserved.
  */
 
+#include <ctype.h>
+#include <histedit.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdarg.h>
-#include <inttypes.h>
 #include <fcntl.h>
 #include <poll.h>
 #include <errno.h>
-#include <termios.h>
-#include <netinet/in.h>
 
 #include <rte_string_fns.h>
 
 #include "cmdline_parse.h"
-#include "cmdline_rdline.h"
 #include "cmdline.h"
 
-static void
-cmdline_valid_buffer(struct rdline *rdl, const char *buf,
-		     __attribute__((unused)) unsigned int size)
+struct cmdline {
+	char *line;
+	FILE *f_in;
+	FILE *f_out;
+	cmdline_parse_ctx_t *ctx;
+	EditLine *el;
+	History *hist;
+	HistEvent histev;
+	uint32_t eof:1;
+	uint32_t error:1;
+	char prompt[RDLINE_PROMPT_SIZE];
+};
+
+void
+cmdline_set_prompt(struct cmdline *cl, const char *prompt)
 {
-	struct cmdline *cl = rdl->opaque;
-	int ret;
-	ret = cmdline_parse(cl, buf);
-	if (ret == CMDLINE_PARSE_AMBIGUOUS)
-		cmdline_printf(cl, "Ambiguous command\n");
-	else if (ret == CMDLINE_PARSE_NOMATCH)
-		cmdline_printf(cl, "Command not found\n");
-	else if (ret == CMDLINE_PARSE_BAD_ARGS)
-		cmdline_printf(cl, "Bad arguments\n");
+	if (!cl || !prompt)
+		return;
+	rte_strlcpy(cl->prompt, prompt, sizeof(cl->prompt));
 }
 
-static int
-cmdline_complete_buffer(struct rdline *rdl, const char *buf,
-			char *dstbuf, unsigned int dstsize,
-			int *state)
+void *
+cmdline_ctx_get(struct cmdline *cl)
 {
-	struct cmdline *cl = rdl->opaque;
-	return cmdline_complete(cl, buf, state, dstbuf, dstsize);
+	if (!cl)
+		return NULL;
+	return cl->ctx;
 }
 
-int
-cmdline_write_char(struct rdline *rdl, char c)
+static char *
+cmdline_el_prompt(EditLine *el)
 {
-	int ret = -1;
 	struct cmdline *cl;
 
-	if (!rdl)
-		return -1;
-
-	cl = rdl->opaque;
+	if (el_get(el, EL_CLIENTDATA, &cl))
+		return NULL;
+	return cl->prompt;
+}
 
-	if (cl->s_out >= 0)
-		ret = write(cl->s_out, &c, 1);
+static unsigned char
+cmdline_el_execute(EditLine *el, int c)
+{
+	const LineInfo *li = el_line(el);
+	size_t len = li->lastchar - li->buffer;
+	char *line;
+	struct cmdline *cl;
+	int ret;
 
-	return ret;
+	(void)c;
+	if (el_get(el, EL_CLIENTDATA, &cl))
+		return CC_FATAL;
+	line = realloc(cl->line, len + 2);
+	if (!line) {
+		cl->error = 1;
+		return CC_FATAL;
+	}
+	cl->line = line;
+	memcpy(line, li->buffer, len);
+	line[len] = '\n';
+	line[len + 1] = '\0';
+	fputs("\r\n", cl->f_out);
+	ret = cmdline_parse(cl, line);
+	if (ret == CMDLINE_PARSE_AMBIGUOUS)
+		fprintf(cl->f_out, "Ambiguous command\r\n");
+	else if (ret == CMDLINE_PARSE_NOMATCH)
+		fprintf(cl->f_out, "Command not found\r\n");
+	else if (ret == CMDLINE_PARSE_BAD_ARGS)
+		fprintf(cl->f_out, "Bad arguments\r\n");
+	if (cl->error)
+		return CC_FATAL;
+	if (cl->eof)
+		return CC_EOF;
+	if (len) {
+		line[len] = '\0';
+		history(cl->hist, &cl->histev, H_ENTER, line);
+	}
+	return CC_NEWLINE;
 }
 
+static unsigned char
+cmdline_el_complete(EditLine *el, int c)
+{
+	const LineInfo *li = el_line(el);
+	size_t pos = li->cursor - li->buffer;
+	char *line;
+	struct cmdline *cl;
+	char complete_buf[RDLINE_COMPLETE_SIZE];
+	int complete_state;
+	int ret;
 
-void
-cmdline_set_prompt(struct cmdline *cl, const char *prompt)
+	if (el_get(el, EL_CLIENTDATA, &cl))
+		return CC_FATAL;
+	line = realloc(cl->line, pos + 1);
+	if (!line) {
+		cl->error = 1;
+		return CC_FATAL;
+	}
+	cl->line = line;
+	memcpy(line, li->buffer, pos);
+	line[pos] = '\0';
+	if (c == '\t')
+		complete_state = 0;
+	else
+		complete_state = -1;
+	/* see in parse.h for help on complete() */
+	ret = cmdline_complete(cl, line, &complete_state,
+			       complete_buf, sizeof(complete_buf));
+	/* no completion or error */
+	if (ret <= 0)
+		return CC_ARGHACK;
+	/* string must be NUL-terminated */
+	if (strnlen(complete_buf, sizeof(complete_buf)) == sizeof(complete_buf))
+		return CC_ERROR;
+	/* add chars */
+	if (ret == CMDLINE_PARSE_COMPLETED_BUFFER) {
+		/* if in the middle of a token, remove its suffix first */
+		for (pos = 0; li->cursor + pos != li->lastchar; pos++)
+			if (isblank(li->cursor[pos]))
+				break;
+		el_cursor(el, pos);
+		el_deletestr(el, pos);
+		if (el_insertstr(el, complete_buf))
+			return CC_ERROR;
+		return CC_REFRESH;
+	}
+	/* choice */
+	fputs("\r\n", cl->f_out);
+	while (ret) {
+		fputc(' ', cl->f_out);
+		fputs(complete_buf, cl->f_out);
+		fputs("\r\n", cl->f_out);
+		ret = cmdline_complete(cl, line, &complete_state,
+				       complete_buf, sizeof(complete_buf));
+	}
+	el_set(el, EL_REFRESH);
+	return CC_REDISPLAY;
+}
+
+static unsigned char
+cmdline_el_delete_next_char_or_eof(EditLine *el, int c)
 {
-	if (!cl || !prompt)
-		return;
-	snprintf(cl->prompt, sizeof(cl->prompt), "%s", prompt);
+	const LineInfo *li = el_line(el);
+	struct cmdline *cl;
+
+	(void)c;
+	if (el_get(el, EL_CLIENTDATA, &cl))
+		return CC_FATAL;
+	if (li->buffer == li->lastchar) {
+		cl->eof = 1;
+		return CC_EOF;
+	}
+	el_cursor(el, 1);
+	el_deletestr(el, 1);
+	return CC_REFRESH;
 }
 
 struct cmdline *
 cmdline_new(cmdline_parse_ctx_t *ctx, const char *prompt, int s_in, int s_out)
 {
 	struct cmdline *cl;
-	int ret;
 
 	if (!ctx || !prompt)
 		return NULL;
@@ -85,36 +189,89 @@ cmdline_new(cmdline_parse_ctx_t *ctx, const char *prompt, int s_in, int s_out)
 	if (cl == NULL)
 		return NULL;
 	memset(cl, 0, sizeof(struct cmdline));
-	cl->s_in = s_in;
-	cl->s_out = s_out;
+	cl->line = NULL;
+	s_in = dup(s_in);
+	s_out = s_out != -1 ? dup(s_out) : open("/dev/null", O_WRONLY);
+	if (s_in == -1 || s_out == -1)
+		goto error;
+	cl->f_in = fdopen(s_in, "rb");
+	cl->f_out = fdopen(s_out, "wb");
+	if (!cl->f_in || !cl->f_out)
+		goto error;
 	cl->ctx = ctx;
-
-	ret = rdline_init(&cl->rdl, cmdline_write_char, cmdline_valid_buffer,
-			cmdline_complete_buffer);
-	if (ret != 0) {
-		free(cl);
-		return NULL;
-	}
-
-	cl->rdl.opaque = cl;
+	cl->el = el_init("dpdk", cl->f_in, cl->f_out, stderr);
+	if (!cl->el)
+		goto error;
+	if (el_set(cl->el, EL_CLIENTDATA, cl))
+		goto error;
 	cmdline_set_prompt(cl, prompt);
-	rdline_newline(&cl->rdl, cl->prompt);
-
+	if (el_set(cl->el, EL_PROMPT, cmdline_el_prompt))
+		goto error;
+	if (el_set(cl->el, EL_EDITOR, "emacs"))
+		goto error;
+	if (el_set(cl->el, EL_SIGNAL, 1))
+		goto error;
+	cl->hist = history_init();
+	if (!cl->hist)
+		goto error;
+	if (history(cl->hist, &cl->histev, H_SETSIZE,
+		    RDLINE_HISTORY_MAX_LINE) < 0)
+		goto error;
+	if (history(cl->hist, &cl->histev, H_SETUNIQUE, 1))
+		goto error;
+	if (el_set(cl->el, EL_HIST, history, cl->hist))
+		goto error;
+	if (el_set(cl->el, EL_ADDFN, "ed-execute", "Execute command",
+		   cmdline_el_execute))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "^J", "ed-execute", NULL))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "^M", "ed-execute", NULL))
+		goto error;
+	if (el_set(cl->el, EL_ADDFN, "ed-complete", "Complete argument",
+		   cmdline_el_complete))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "^I", "ed-complete", NULL))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "?", "ed-complete", NULL))
+		goto error;
+	if (el_set(cl->el, EL_ADDFN, "ed-delete-next-char-or-eof",
+		   "Delete next character or assume EOF",
+		   cmdline_el_delete_next_char_or_eof))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "^D",
+		   "ed-delete-next-char-or-eof", NULL))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "^W", "ed-delete-prev-word", NULL))
+		goto error;
 	return cl;
+error:
+	if (cl->hist)
+		history_end(cl->hist);
+	if (cl->el)
+		el_end(cl->el);
+	if (cl->f_out)
+		fclose(cl->f_out);
+	else if (s_out != -1)
+		close(s_out);
+	if (cl->f_in)
+		fclose(cl->f_in);
+	else if (s_in != -1)
+		close(s_in);
+	free(cl);
+	return NULL;
 }
 
 void
 cmdline_free(struct cmdline *cl)
 {
-	dprintf("called\n");
-
 	if (!cl)
 		return;
-
-	if (cl->s_in > 2)
-		close(cl->s_in);
-	if (cl->s_out != cl->s_in && cl->s_out > 2)
-		close(cl->s_out);
+	history_end(cl->hist);
+	el_end(cl->el);
+	fclose(cl->f_out);
+	fclose(cl->f_in);
+	free(cl->line);
 	free(cl);
 }
 
@@ -126,70 +283,23 @@ cmdline_printf(const struct cmdline *cl, const char *fmt, ...)
 	if (!cl || !fmt)
 		return;
 
-#ifdef _GNU_SOURCE
-	if (cl->s_out < 0)
-		return;
-	va_start(ap, fmt);
-	vdprintf(cl->s_out, fmt, ap);
-	va_end(ap);
-#else
-	int ret;
-	char *buf;
-
-	if (cl->s_out < 0)
-		return;
-
-	buf = malloc(BUFSIZ);
-	if (buf == NULL)
-		return;
 	va_start(ap, fmt);
-	ret = vsnprintf(buf, BUFSIZ, fmt, ap);
+	vfprintf(cl->f_out, fmt, ap);
 	va_end(ap);
-	if (ret < 0) {
-		free(buf);
-		return;
-	}
-	if (ret >= BUFSIZ)
-		ret = BUFSIZ - 1;
-	ret = write(cl->s_out, buf, ret);
-	(void)ret;
-	free(buf);
-#endif
 }
 
 int
 cmdline_in(struct cmdline *cl, const char *buf, int size)
 {
-	const char *history, *buffer;
-	size_t histlen, buflen;
-	int ret = 0;
-	int i, same;
+	int i;
 
 	if (!cl || !buf)
 		return -1;
 
 	for (i=0; i<size; i++) {
-		ret = rdline_char_in(&cl->rdl, buf[i]);
-
-		if (ret == RDLINE_RES_VALIDATED) {
-			buffer = rdline_get_buffer(&cl->rdl);
-			history = rdline_get_history_item(&cl->rdl, 0);
-			if (history) {
-				histlen = strnlen(history, RDLINE_BUF_SIZE);
-				same = !memcmp(buffer, history, histlen) &&
-					buffer[histlen] == '\n';
-			}
-			else
-				same = 0;
-			buflen = strnlen(buffer, RDLINE_BUF_SIZE);
-			if (buflen > 1 && !same)
-				rdline_add_history(&cl->rdl, buffer);
-			rdline_newline(&cl->rdl, cl->prompt);
-		}
-		else if (ret == RDLINE_RES_EOF)
-			return -1;
-		else if (ret == RDLINE_RES_EXITED)
-			return -1;
+		char tmp[2] = { buf[i], '\0' };
+
+		el_push(cl->el, tmp);
 	}
 	return i;
 }
@@ -199,7 +309,7 @@ cmdline_quit(struct cmdline *cl)
 {
 	if (!cl)
 		return;
-	rdline_quit(&cl->rdl);
+	cl->eof = 1;
 }
 
 int
@@ -207,48 +317,49 @@ cmdline_poll(struct cmdline *cl)
 {
 	struct pollfd pfd;
 	int status;
-	ssize_t read_status;
-	char c;
+	int read_status;
+	int flags;
 
 	if (!cl)
 		return -EINVAL;
-	else if (cl->rdl.status == RDLINE_EXITED)
+	else if (cl->error)
+		return RDLINE_ERROR;
+	else if (cl->eof)
 		return RDLINE_EXITED;
 
-	pfd.fd = cl->s_in;
+	pfd.fd = fileno(cl->f_in);
 	pfd.events = POLLIN;
 	pfd.revents = 0;
 
 	status = poll(&pfd, 1, 0);
 	if (status < 0)
-		return status;
-	else if (status > 0) {
-		c = -1;
-		read_status = read(cl->s_in, &c, 1);
-		if (read_status < 0)
-			return read_status;
-
-		status = cmdline_in(cl, &c, 1);
-		if (status < 0 && cl->rdl.status != RDLINE_EXITED)
-			return status;
-	}
-
-	return cl->rdl.status;
+		return RDLINE_ERROR;
+	if (!status)
+		return RDLINE_RUNNING;
+	flags = fcntl(pfd.fd, F_GETFL);
+	if (!(flags & O_NONBLOCK))
+		fcntl(pfd.fd, F_SETFL, flags | O_NONBLOCK);
+	if (!el_gets(cl->el, &read_status) && read_status == -1)
+		cl->error = 1;
+	if (!(flags & O_NONBLOCK))
+		fcntl(pfd.fd, F_SETFL, flags);
+	return cl->error ? RDLINE_ERROR :
+		cl->eof ? RDLINE_EXITED :
+		RDLINE_RUNNING;
 }
 
 void
 cmdline_interact(struct cmdline *cl)
 {
-	char c;
-
 	if (!cl)
 		return;
 
-	c = -1;
-	while (1) {
-		if (read(cl->s_in, &c, 1) <= 0)
-			break;
-		if (cmdline_in(cl, &c, 1) < 0)
-			break;
+	while (!cl->error && !cl->eof) {
+		int read_status;
+
+		if (el_gets(cl->el, &read_status))
+			continue;
+		if (read_status == -1)
+			cl->error = 1;
 	}
 }
diff --git a/lib/librte_cmdline/cmdline.h b/lib/librte_cmdline/cmdline.h
index 27d2effdf..1f443be60 100644
--- a/lib/librte_cmdline/cmdline.h
+++ b/lib/librte_cmdline/cmdline.h
@@ -7,8 +7,6 @@
 #ifndef _CMDLINE_H_
 #define _CMDLINE_H_
 
-#include <termios.h>
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 
 /**
@@ -21,22 +19,26 @@
 extern "C" {
 #endif
 
-struct cmdline {
-	int s_in;
-	int s_out;
-	cmdline_parse_ctx_t *ctx;
-	struct rdline rdl;
-	char prompt[RDLINE_PROMPT_SIZE];
-	struct termios oldterm;
+#define RDLINE_PROMPT_SIZE 32
+#define RDLINE_HISTORY_MAX_LINE 64
+#define RDLINE_COMPLETE_SIZE 128
+
+enum rdline_status {
+	RDLINE_ERROR = -1,
+	RDLINE_INIT,
+	RDLINE_RUNNING,
+	RDLINE_EXITED,
 };
 
+struct cmdline;
+
+void *cmdline_ctx_get(struct cmdline *cl);
 struct cmdline *cmdline_new(cmdline_parse_ctx_t *ctx, const char *prompt, int s_in, int s_out);
 void cmdline_set_prompt(struct cmdline *cl, const char *prompt);
 void cmdline_free(struct cmdline *cl);
 void cmdline_printf(const struct cmdline *cl, const char *fmt, ...)
 	__attribute__((format(printf,2,3)));
 int cmdline_in(struct cmdline *cl, const char *buf, int size);
-int cmdline_write_char(struct rdline *rdl, char c);
 
 /**
  * This function is nonblocking equivalent of ``cmdline_interact()``. It polls
diff --git a/lib/librte_cmdline/cmdline_cirbuf.c b/lib/librte_cmdline/cmdline_cirbuf.c
deleted file mode 100644
index 829a8af56..000000000
--- a/lib/librte_cmdline/cmdline_cirbuf.c
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#include <string.h>
-#include <errno.h>
-#include <stdio.h>
-
-#include "cmdline_cirbuf.h"
-
-
-int
-cirbuf_init(struct cirbuf *cbuf, char *buf, unsigned int start, unsigned int maxlen)
-{
-	if (!cbuf || !buf)
-		return -EINVAL;
-	cbuf->maxlen = maxlen;
-	cbuf->len = 0;
-	cbuf->start = start;
-	cbuf->end = start;
-	cbuf->buf = buf;
-	return 0;
-}
-
-/* multiple add */
-
-int
-cirbuf_add_buf_head(struct cirbuf *cbuf, const char *c, unsigned int n)
-{
-	unsigned int e;
-
-	if (!cbuf || !c || !n || n > CIRBUF_GET_FREELEN(cbuf))
-		return -EINVAL;
-
-	e = CIRBUF_IS_EMPTY(cbuf) ? 1 : 0;
-
-	if (n < cbuf->start + e) {
-		dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->start - n + e, n);
-		memcpy(cbuf->buf + cbuf->start - n + e, c, n);
-	}
-	else {
-		dprintf("s[%d] -> d[%d] (%d)\n", + n - (cbuf->start + e), 0,
-			cbuf->start + e);
-		dprintf("s[%d] -> d[%d] (%d)\n", cbuf->maxlen - n +
-			(cbuf->start + e), 0, n - (cbuf->start + e));
-		memcpy(cbuf->buf, c  + n - (cbuf->start + e) , cbuf->start + e);
-		memcpy(cbuf->buf + cbuf->maxlen - n + (cbuf->start + e), c,
-		       n - (cbuf->start + e));
-	}
-	cbuf->len += n;
-	cbuf->start += (cbuf->maxlen - n + e);
-	cbuf->start %= cbuf->maxlen;
-	return n;
-}
-
-/* multiple add */
-
-int
-cirbuf_add_buf_tail(struct cirbuf *cbuf, const char *c, unsigned int n)
-{
-	unsigned int e;
-
-	if (!cbuf || !c || !n || n > CIRBUF_GET_FREELEN(cbuf))
-		return -EINVAL;
-
-	e = CIRBUF_IS_EMPTY(cbuf) ? 1 : 0;
-
-	if (n < cbuf->maxlen - cbuf->end - 1 + e) {
-		dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->end + !e, n);
-		memcpy(cbuf->buf + cbuf->end + !e, c, n);
-	}
-	else {
-		dprintf("s[%d] -> d[%d] (%d)\n", cbuf->end + !e, 0,
-			cbuf->maxlen - cbuf->end - 1 + e);
-		dprintf("s[%d] -> d[%d] (%d)\n", cbuf->maxlen - cbuf->end - 1 +
-			e, 0, n - cbuf->maxlen + cbuf->end + 1 - e);
-		memcpy(cbuf->buf + cbuf->end + !e, c, cbuf->maxlen -
-		       cbuf->end - 1 + e);
-		memcpy(cbuf->buf, c + cbuf->maxlen - cbuf->end - 1 + e,
-		       n - cbuf->maxlen + cbuf->end + 1 - e);
-	}
-	cbuf->len += n;
-	cbuf->end += n - e;
-	cbuf->end %= cbuf->maxlen;
-	return n;
-}
-
-/* add at head */
-
-static inline void
-__cirbuf_add_head(struct cirbuf * cbuf, char c)
-{
-	if (!CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->start += (cbuf->maxlen - 1);
-		cbuf->start %= cbuf->maxlen;
-	}
-	cbuf->buf[cbuf->start] = c;
-	cbuf->len ++;
-}
-
-int
-cirbuf_add_head_safe(struct cirbuf * cbuf, char c)
-{
-	if (cbuf && !CIRBUF_IS_FULL(cbuf)) {
-		__cirbuf_add_head(cbuf, c);
-		return 0;
-	}
-	return -EINVAL;
-}
-
-void
-cirbuf_add_head(struct cirbuf * cbuf, char c)
-{
-	__cirbuf_add_head(cbuf, c);
-}
-
-/* add at tail */
-
-static inline void
-__cirbuf_add_tail(struct cirbuf * cbuf, char c)
-{
-	if (!CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->end ++;
-		cbuf->end %= cbuf->maxlen;
-	}
-	cbuf->buf[cbuf->end] = c;
-	cbuf->len ++;
-}
-
-int
-cirbuf_add_tail_safe(struct cirbuf * cbuf, char c)
-{
-	if (cbuf && !CIRBUF_IS_FULL(cbuf)) {
-		__cirbuf_add_tail(cbuf, c);
-		return 0;
-	}
-	return -EINVAL;
-}
-
-void
-cirbuf_add_tail(struct cirbuf * cbuf, char c)
-{
-	__cirbuf_add_tail(cbuf, c);
-}
-
-
-static inline void
-__cirbuf_shift_left(struct cirbuf *cbuf)
-{
-	unsigned int i;
-	char tmp = cbuf->buf[cbuf->start];
-
-	for (i=0 ; i<cbuf->len ; i++) {
-		cbuf->buf[(cbuf->start+i)%cbuf->maxlen] =
-			cbuf->buf[(cbuf->start+i+1)%cbuf->maxlen];
-	}
-	cbuf->buf[(cbuf->start-1+cbuf->maxlen)%cbuf->maxlen] = tmp;
-	cbuf->start += (cbuf->maxlen - 1);
-	cbuf->start %= cbuf->maxlen;
-	cbuf->end += (cbuf->maxlen - 1);
-	cbuf->end %= cbuf->maxlen;
-}
-
-static inline void
-__cirbuf_shift_right(struct cirbuf *cbuf)
-{
-	unsigned int i;
-	char tmp = cbuf->buf[cbuf->end];
-
-	for (i=0 ; i<cbuf->len ; i++) {
-		cbuf->buf[(cbuf->end+cbuf->maxlen-i)%cbuf->maxlen] =
-			cbuf->buf[(cbuf->end+cbuf->maxlen-i-1)%cbuf->maxlen];
-	}
-	cbuf->buf[(cbuf->end+1)%cbuf->maxlen] = tmp;
-	cbuf->start += 1;
-	cbuf->start %= cbuf->maxlen;
-	cbuf->end += 1;
-	cbuf->end %= cbuf->maxlen;
-}
-
-/* XXX we could do a better algorithm here... */
-int
-cirbuf_align_left(struct cirbuf * cbuf)
-{
-	if (!cbuf)
-		return -EINVAL;
-
-	if (cbuf->start < cbuf->maxlen/2) {
-		while (cbuf->start != 0) {
-			__cirbuf_shift_left(cbuf);
-		}
-	}
-	else {
-		while (cbuf->start != 0) {
-			__cirbuf_shift_right(cbuf);
-		}
-	}
-
-	return 0;
-}
-
-/* XXX we could do a better algorithm here... */
-int
-cirbuf_align_right(struct cirbuf * cbuf)
-{
-	if (!cbuf)
-		return -EINVAL;
-
-	if (cbuf->start >= cbuf->maxlen/2) {
-		while (cbuf->end != cbuf->maxlen-1) {
-			__cirbuf_shift_left(cbuf);
-		}
-	}
-	else {
-		while (cbuf->start != cbuf->maxlen-1) {
-			__cirbuf_shift_right(cbuf);
-		}
-	}
-
-	return 0;
-}
-
-/* buffer del */
-
-int
-cirbuf_del_buf_head(struct cirbuf *cbuf, unsigned int size)
-{
-	if (!cbuf || !size || size > CIRBUF_GET_LEN(cbuf))
-		return -EINVAL;
-
-	cbuf->len -= size;
-	if (CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->start += size - 1;
-		cbuf->start %= cbuf->maxlen;
-	}
-	else {
-		cbuf->start += size;
-		cbuf->start %= cbuf->maxlen;
-	}
-	return 0;
-}
-
-/* buffer del */
-
-int
-cirbuf_del_buf_tail(struct cirbuf *cbuf, unsigned int size)
-{
-	if (!cbuf || !size || size > CIRBUF_GET_LEN(cbuf))
-		return -EINVAL;
-
-	cbuf->len -= size;
-	if (CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->end  += (cbuf->maxlen - size + 1);
-		cbuf->end %= cbuf->maxlen;
-	}
-	else {
-		cbuf->end  += (cbuf->maxlen - size);
-		cbuf->end %= cbuf->maxlen;
-	}
-	return 0;
-}
-
-/* del at head */
-
-static inline void
-__cirbuf_del_head(struct cirbuf * cbuf)
-{
-	cbuf->len --;
-	if (!CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->start ++;
-		cbuf->start %= cbuf->maxlen;
-	}
-}
-
-int
-cirbuf_del_head_safe(struct cirbuf * cbuf)
-{
-	if (cbuf && !CIRBUF_IS_EMPTY(cbuf)) {
-		__cirbuf_del_head(cbuf);
-		return 0;
-	}
-	return -EINVAL;
-}
-
-void
-cirbuf_del_head(struct cirbuf * cbuf)
-{
-	__cirbuf_del_head(cbuf);
-}
-
-/* del at tail */
-
-static inline void
-__cirbuf_del_tail(struct cirbuf * cbuf)
-{
-	cbuf->len --;
-	if (!CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->end  += (cbuf->maxlen - 1);
-		cbuf->end %= cbuf->maxlen;
-	}
-}
-
-int
-cirbuf_del_tail_safe(struct cirbuf * cbuf)
-{
-	if (cbuf && !CIRBUF_IS_EMPTY(cbuf)) {
-		__cirbuf_del_tail(cbuf);
-		return 0;
-	}
-	return -EINVAL;
-}
-
-void
-cirbuf_del_tail(struct cirbuf * cbuf)
-{
-	__cirbuf_del_tail(cbuf);
-}
-
-/* convert to buffer */
-
-int
-cirbuf_get_buf_head(struct cirbuf *cbuf, char *c, unsigned int size)
-{
-	unsigned int n;
-
-	if (!cbuf || !c)
-		return -EINVAL;
-
-	n = (size < CIRBUF_GET_LEN(cbuf)) ? size : CIRBUF_GET_LEN(cbuf);
-
-	if (!n)
-		return 0;
-
-	if (cbuf->start <= cbuf->end) {
-		dprintf("s[%d] -> d[%d] (%d)\n", cbuf->start, 0, n);
-		memcpy(c, cbuf->buf + cbuf->start , n);
-	}
-	else {
-		/* check if we need to go from end to the beginning */
-		if (n <= cbuf->maxlen - cbuf->start) {
-			dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->start, n);
-			memcpy(c, cbuf->buf + cbuf->start , n);
-		}
-		else {
-			dprintf("s[%d] -> d[%d] (%d)\n", cbuf->start, 0,
-				cbuf->maxlen - cbuf->start);
-			dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->maxlen - cbuf->start,
-				n - cbuf->maxlen + cbuf->start);
-			memcpy(c, cbuf->buf + cbuf->start , cbuf->maxlen - cbuf->start);
-			memcpy(c + cbuf->maxlen - cbuf->start, cbuf->buf,
-				   n - cbuf->maxlen + cbuf->start);
-		}
-	}
-	return n;
-}
-
-/* convert to buffer */
-
-int
-cirbuf_get_buf_tail(struct cirbuf *cbuf, char *c, unsigned int size)
-{
-	unsigned int n;
-
-	if (!cbuf || !c)
-		return -EINVAL;
-
-	n = (size < CIRBUF_GET_LEN(cbuf)) ? size : CIRBUF_GET_LEN(cbuf);
-
-	if (!n)
-		return 0;
-
-	if (cbuf->start <= cbuf->end) {
-		dprintf("s[%d] -> d[%d] (%d)\n", cbuf->end - n + 1, 0, n);
-		memcpy(c, cbuf->buf + cbuf->end - n + 1, n);
-	}
-	else {
-		/* check if we need to go from end to the beginning */
-		if (n <= cbuf->end + 1) {
-			dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->end - n + 1, n);
-			memcpy(c, cbuf->buf + cbuf->end - n + 1, n);
-		}
-		else {
-			dprintf("s[%d] -> d[%d] (%d)\n", 0,
-				cbuf->maxlen - cbuf->start, cbuf->end + 1);
-			dprintf("s[%d] -> d[%d] (%d)\n",
-				cbuf->maxlen - n + cbuf->end + 1, 0, n - cbuf->end - 1);
-			memcpy(c + cbuf->maxlen - cbuf->start,
-					       cbuf->buf, cbuf->end + 1);
-			memcpy(c, cbuf->buf + cbuf->maxlen - n + cbuf->end +1,
-				   n - cbuf->end - 1);
-		}
-	}
-	return n;
-}
-
-/* get head or get tail */
-
-char
-cirbuf_get_head(struct cirbuf * cbuf)
-{
-	return cbuf->buf[cbuf->start];
-}
-
-/* get head or get tail */
-
-char
-cirbuf_get_tail(struct cirbuf * cbuf)
-{
-	return cbuf->buf[cbuf->end];
-}
diff --git a/lib/librte_cmdline/cmdline_cirbuf.h b/lib/librte_cmdline/cmdline_cirbuf.h
deleted file mode 100644
index c23b211ad..000000000
--- a/lib/librte_cmdline/cmdline_cirbuf.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#ifndef _CIRBUF_H_
-#define _CIRBUF_H_
-
-#include <rte_config.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * This structure is the header of a cirbuf type.
- */
-struct cirbuf {
-	unsigned int maxlen;    /**< total len of the fifo (number of elements) */
-	unsigned int start;     /**< indice of the first elt */
-	unsigned int end;       /**< indice of the last elt */
-	unsigned int len;       /**< current len of fifo */
-	char *buf;
-};
-
-#ifdef RTE_LIBRTE_CMDLINE_DEBUG
-#define dprintf_(fmt, ...) printf("line %3.3d - " fmt "%.0s", __LINE__, __VA_ARGS__)
-#define dprintf(...) dprintf_(__VA_ARGS__, "dummy")
-#else
-#define dprintf(...) (void)0
-#endif
-
-
-/**
- * Init the circular buffer
- */
-int cirbuf_init(struct cirbuf *cbuf, char *buf, unsigned int start, unsigned int maxlen);
-
-
-/**
- * Return 1 if the circular buffer is full
- */
-#define CIRBUF_IS_FULL(cirbuf) ((cirbuf)->maxlen == (cirbuf)->len)
-
-/**
- * Return 1 if the circular buffer is empty
- */
-#define CIRBUF_IS_EMPTY(cirbuf) ((cirbuf)->len == 0)
-
-/**
- * return current size of the circular buffer (number of used elements)
- */
-#define CIRBUF_GET_LEN(cirbuf) ((cirbuf)->len)
-
-/**
- * return size of the circular buffer (used + free elements)
- */
-#define CIRBUF_GET_MAXLEN(cirbuf) ((cirbuf)->maxlen)
-
-/**
- * return the number of free elts
- */
-#define CIRBUF_GET_FREELEN(cirbuf) ((cirbuf)->maxlen - (cirbuf)->len)
-
-/**
- * Iterator for a circular buffer
- *   c: struct cirbuf pointer
- *   i: an integer type internally used in the macro
- *   e: char that takes the value for each iteration
- */
-#define CIRBUF_FOREACH(c, i, e)                                 \
-	for ( i=0, e=(c)->buf[(c)->start] ;                     \
-		i<((c)->len) ;                                  \
-		i ++,  e=(c)->buf[((c)->start+i)%((c)->maxlen)])
-
-
-/**
- * Add a character at head of the circular buffer. Return 0 on success, or
- * a negative value on error.
- */
-int cirbuf_add_head_safe(struct cirbuf *cbuf, char c);
-
-/**
- * Add a character at head of the circular buffer. You _must_ check that you
- * have enough free space in the buffer before calling this func.
- */
-void cirbuf_add_head(struct cirbuf *cbuf, char c);
-
-/**
- * Add a character at tail of the circular buffer. Return 0 on success, or
- * a negative value on error.
- */
-int cirbuf_add_tail_safe(struct cirbuf *cbuf, char c);
-
-/**
- * Add a character at tail of the circular buffer. You _must_ check that you
- * have enough free space in the buffer before calling this func.
- */
-void cirbuf_add_tail(struct cirbuf *cbuf, char c);
-
-/**
- * Remove a char at the head of the circular buffer. Return 0 on
- * success, or a negative value on error.
- */
-int cirbuf_del_head_safe(struct cirbuf *cbuf);
-
-/**
- * Remove a char at the head of the circular buffer. You _must_ check
- * that buffer is not empty before calling the function.
- */
-void cirbuf_del_head(struct cirbuf *cbuf);
-
-/**
- * Remove a char at the tail of the circular buffer. Return 0 on
- * success, or a negative value on error.
- */
-int cirbuf_del_tail_safe(struct cirbuf *cbuf);
-
-/**
- * Remove a char at the tail of the circular buffer. You _must_ check
- * that buffer is not empty before calling the function.
- */
-void cirbuf_del_tail(struct cirbuf *cbuf);
-
-/**
- * Return the head of the circular buffer. You _must_ check that
- * buffer is not empty before calling the function.
- */
-char cirbuf_get_head(struct cirbuf *cbuf);
-
-/**
- * Return the tail of the circular buffer. You _must_ check that
- * buffer is not empty before calling the function.
- */
-char cirbuf_get_tail(struct cirbuf *cbuf);
-
-/**
- * Add a buffer at head of the circular buffer. 'c' is a pointer to a
- * buffer, and n is the number of char to add. Return the number of
- * copied bytes on success, or a negative value on error.
- */
-int cirbuf_add_buf_head(struct cirbuf *cbuf, const char *c, unsigned int n);
-
-/**
- * Add a buffer at tail of the circular buffer. 'c' is a pointer to a
- * buffer, and n is the number of char to add. Return the number of
- * copied bytes on success, or a negative value on error.
- */
-int cirbuf_add_buf_tail(struct cirbuf *cbuf, const char *c, unsigned int n);
-
-/**
- * Remove chars at the head of the circular buffer. Return 0 on
- * success, or a negative value on error.
- */
-int cirbuf_del_buf_head(struct cirbuf *cbuf, unsigned int size);
-
-/**
- * Remove chars at the tail of the circular buffer. Return 0 on
- * success, or a negative value on error.
- */
-int cirbuf_del_buf_tail(struct cirbuf *cbuf, unsigned int size);
-
-/**
- * Copy a maximum of 'size' characters from the head of the circular
- * buffer to a flat one pointed by 'c'. Return the number of copied
- * chars.
- */
-int cirbuf_get_buf_head(struct cirbuf *cbuf, char *c, unsigned int size);
-
-/**
- * Copy a maximum of 'size' characters from the tail of the circular
- * buffer to a flat one pointed by 'c'. Return the number of copied
- * chars.
- */
-int cirbuf_get_buf_tail(struct cirbuf *cbuf, char *c, unsigned int size);
-
-
-/**
- * Set the start of the data to the index 0 of the internal buffer.
- */
-int cirbuf_align_left(struct cirbuf *cbuf);
-
-/**
- * Set the end of the data to the last index of the internal buffer.
- */
-int cirbuf_align_right(struct cirbuf *cbuf);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _CIRBUF_H_ */
diff --git a/lib/librte_cmdline/cmdline_parse.c b/lib/librte_cmdline/cmdline_parse.c
index 961f9befd..f75870b5b 100644
--- a/lib/librte_cmdline/cmdline_parse.c
+++ b/lib/librte_cmdline/cmdline_parse.c
@@ -16,7 +16,6 @@
 
 #include <rte_string_fns.h>
 
-#include "cmdline_rdline.h"
 #include "cmdline_parse.h"
 #include "cmdline.h"
 
@@ -216,7 +215,7 @@ cmdline_parse(struct cmdline *cl, const char * buf)
 	if (!cl || !buf)
 		return CMDLINE_PARSE_BAD_ARGS;
 
-	ctx = cl->ctx;
+	ctx = cmdline_ctx_get(cl);
 
 	/*
 	 * - look if the buffer contains at least one line
@@ -334,7 +333,7 @@ cmdline_complete(struct cmdline *cl, const char *buf, int *state,
 	if (!cl || !buf || !state || !dst)
 		return -1;
 
-	ctx = cl->ctx;
+	ctx = cmdline_ctx_get(cl);
 
 	debug_printf("%s called\n", __func__);
 	memset(&token_hdr, 0, sizeof(token_hdr));
@@ -346,7 +345,7 @@ cmdline_complete(struct cmdline *cl, const char *buf, int *state,
 		if (isblank2(buf[i]) && !isblank2(buf[i+1]))
 			partial_tok = buf+i+1;
 	}
-	partial_tok_len = strnlen(partial_tok, RDLINE_BUF_SIZE);
+	partial_tok_len = strlen(partial_tok);
 
 	/* first call -> do a first pass */
 	if (*state <= 0) {
diff --git a/lib/librte_cmdline/cmdline_rdline.c b/lib/librte_cmdline/cmdline_rdline.c
deleted file mode 100644
index 2cb53e38f..000000000
--- a/lib/librte_cmdline/cmdline_rdline.c
+++ /dev/null
@@ -1,644 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdarg.h>
-#include <errno.h>
-#include <ctype.h>
-
-#include "cmdline_cirbuf.h"
-#include "cmdline_rdline.h"
-
-static void rdline_puts(struct rdline *rdl, const char *buf);
-static void rdline_miniprintf(struct rdline *rdl,
-			      const char *buf, unsigned int val);
-
-static void rdline_remove_old_history_item(struct rdline *rdl);
-static void rdline_remove_first_history_item(struct rdline *rdl);
-static unsigned int rdline_get_history_size(struct rdline *rdl);
-
-
-/* isblank() needs _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE, so use our
- * own. */
-static int
-isblank2(char c)
-{
-	if (c == ' ' ||
-	    c == '\t' )
-		return 1;
-	return 0;
-}
-
-int
-rdline_init(struct rdline *rdl,
-		 rdline_write_char_t *write_char,
-		 rdline_validate_t *validate,
-		 rdline_complete_t *complete)
-{
-	if (!rdl || !write_char || !validate || !complete)
-		return -EINVAL;
-	memset(rdl, 0, sizeof(*rdl));
-	rdl->validate = validate;
-	rdl->complete = complete;
-	rdl->write_char = write_char;
-	rdl->status = RDLINE_INIT;
-	return cirbuf_init(&rdl->history, rdl->history_buf, 0, RDLINE_HISTORY_BUF_SIZE);
-}
-
-void
-rdline_newline(struct rdline *rdl, const char *prompt)
-{
-	unsigned int i;
-
-	if (!rdl || !prompt)
-		return;
-
-	vt100_init(&rdl->vt100);
-	cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE);
-	cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE);
-
-	rdl->prompt_size = strnlen(prompt, RDLINE_PROMPT_SIZE-1);
-	if (prompt != rdl->prompt)
-		memcpy(rdl->prompt, prompt, rdl->prompt_size);
-	rdl->prompt[RDLINE_PROMPT_SIZE-1] = '\0';
-
-	for (i=0 ; i<rdl->prompt_size ; i++)
-		rdl->write_char(rdl, rdl->prompt[i]);
-	rdl->status = RDLINE_RUNNING;
-
-	rdl->history_cur_line = -1;
-}
-
-void
-rdline_stop(struct rdline *rdl)
-{
-	if (!rdl)
-		return;
-	rdl->status = RDLINE_INIT;
-}
-
-void
-rdline_quit(struct rdline *rdl)
-{
-	if (!rdl)
-		return;
-	rdl->status = RDLINE_EXITED;
-}
-
-void
-rdline_restart(struct rdline *rdl)
-{
-	if (!rdl)
-		return;
-	rdl->status = RDLINE_RUNNING;
-}
-
-void
-rdline_reset(struct rdline *rdl)
-{
-	if (!rdl)
-		return;
-	vt100_init(&rdl->vt100);
-	cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE);
-	cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE);
-
-	rdl->status = RDLINE_RUNNING;
-
-	rdl->history_cur_line = -1;
-}
-
-const char *
-rdline_get_buffer(struct rdline *rdl)
-{
-	if (!rdl)
-		return NULL;
-	unsigned int len_l, len_r;
-	cirbuf_align_left(&rdl->left);
-	cirbuf_align_left(&rdl->right);
-
-	len_l = CIRBUF_GET_LEN(&rdl->left);
-	len_r = CIRBUF_GET_LEN(&rdl->right);
-	memcpy(rdl->left_buf+len_l, rdl->right_buf, len_r);
-
-	rdl->left_buf[len_l + len_r] = '\n';
-	rdl->left_buf[len_l + len_r + 1] = '\0';
-	return rdl->left_buf;
-}
-
-static void
-display_right_buffer(struct rdline *rdl, int force)
-{
-	unsigned int i;
-	char tmp;
-
-	if (!force && CIRBUF_IS_EMPTY(&rdl->right))
-		return;
-
-	rdline_puts(rdl, vt100_clear_right);
-	CIRBUF_FOREACH(&rdl->right, i, tmp) {
-		rdl->write_char(rdl, tmp);
-	}
-	if (!CIRBUF_IS_EMPTY(&rdl->right))
-		rdline_miniprintf(rdl, vt100_multi_left,
-				  CIRBUF_GET_LEN(&rdl->right));
-}
-
-void
-rdline_redisplay(struct rdline *rdl)
-{
-	unsigned int i;
-	char tmp;
-
-	if (!rdl)
-		return;
-
-	rdline_puts(rdl, vt100_home);
-	for (i=0 ; i<rdl->prompt_size ; i++)
-		rdl->write_char(rdl, rdl->prompt[i]);
-	CIRBUF_FOREACH(&rdl->left, i, tmp) {
-		rdl->write_char(rdl, tmp);
-	}
-	display_right_buffer(rdl, 1);
-}
-
-int
-rdline_char_in(struct rdline *rdl, char c)
-{
-	unsigned int i;
-	int cmd;
-	char tmp;
-	char *buf;
-
-	if (!rdl)
-		return -EINVAL;
-
-	if (rdl->status == RDLINE_EXITED)
-		return RDLINE_RES_EXITED;
-	if (rdl->status != RDLINE_RUNNING)
-		return RDLINE_RES_NOT_RUNNING;
-
-	cmd = vt100_parser(&rdl->vt100, c);
-	if (cmd == -2)
-		return RDLINE_RES_SUCCESS;
-
-	if (cmd >= 0) {
-		switch (cmd) {
-		/* move caret 1 char to the left */
-		case CMDLINE_KEY_CTRL_B:
-		case CMDLINE_KEY_LEFT_ARR:
-			if (CIRBUF_IS_EMPTY(&rdl->left))
-				break;
-			tmp = cirbuf_get_tail(&rdl->left);
-			cirbuf_del_tail(&rdl->left);
-			cirbuf_add_head(&rdl->right, tmp);
-			rdline_puts(rdl, vt100_left_arr);
-			break;
-
-		/* move caret 1 char to the right */
-		case CMDLINE_KEY_CTRL_F:
-		case CMDLINE_KEY_RIGHT_ARR:
-			if (CIRBUF_IS_EMPTY(&rdl->right))
-				break;
-			tmp = cirbuf_get_head(&rdl->right);
-			cirbuf_del_head(&rdl->right);
-			cirbuf_add_tail(&rdl->left, tmp);
-			rdline_puts(rdl, vt100_right_arr);
-			break;
-
-		/* move caret 1 word to the left */
-		/* keyboard equivalent: Alt+B */
-		case CMDLINE_KEY_WLEFT:
-			while (! CIRBUF_IS_EMPTY(&rdl->left) &&
-			       (tmp = cirbuf_get_tail(&rdl->left)) &&
-			       isblank2(tmp)) {
-				rdline_puts(rdl, vt100_left_arr);
-				cirbuf_del_tail(&rdl->left);
-				cirbuf_add_head(&rdl->right, tmp);
-			}
-			while (! CIRBUF_IS_EMPTY(&rdl->left) &&
-			       (tmp = cirbuf_get_tail(&rdl->left)) &&
-			       !isblank2(tmp)) {
-				rdline_puts(rdl, vt100_left_arr);
-				cirbuf_del_tail(&rdl->left);
-				cirbuf_add_head(&rdl->right, tmp);
-			}
-			break;
-
-		/* move caret 1 word to the right */
-		/* keyboard equivalent: Alt+F */
-		case CMDLINE_KEY_WRIGHT:
-			while (! CIRBUF_IS_EMPTY(&rdl->right) &&
-			       (tmp = cirbuf_get_head(&rdl->right)) &&
-			       isblank2(tmp)) {
-				rdline_puts(rdl, vt100_right_arr);
-				cirbuf_del_head(&rdl->right);
-				cirbuf_add_tail(&rdl->left, tmp);
-			}
-			while (! CIRBUF_IS_EMPTY(&rdl->right) &&
-			       (tmp = cirbuf_get_head(&rdl->right)) &&
-			       !isblank2(tmp)) {
-				rdline_puts(rdl, vt100_right_arr);
-				cirbuf_del_head(&rdl->right);
-				cirbuf_add_tail(&rdl->left, tmp);
-			}
-			break;
-
-		/* move caret to the left */
-		case CMDLINE_KEY_CTRL_A:
-			if (CIRBUF_IS_EMPTY(&rdl->left))
-				break;
-			rdline_miniprintf(rdl, vt100_multi_left,
-						CIRBUF_GET_LEN(&rdl->left));
-			while (! CIRBUF_IS_EMPTY(&rdl->left)) {
-				tmp = cirbuf_get_tail(&rdl->left);
-				cirbuf_del_tail(&rdl->left);
-				cirbuf_add_head(&rdl->right, tmp);
-			}
-			break;
-
-		/* move caret to the right */
-		case CMDLINE_KEY_CTRL_E:
-			if (CIRBUF_IS_EMPTY(&rdl->right))
-				break;
-			rdline_miniprintf(rdl, vt100_multi_right,
-						CIRBUF_GET_LEN(&rdl->right));
-			while (! CIRBUF_IS_EMPTY(&rdl->right)) {
-				tmp = cirbuf_get_head(&rdl->right);
-				cirbuf_del_head(&rdl->right);
-				cirbuf_add_tail(&rdl->left, tmp);
-			}
-			break;
-
-		/* delete 1 char from the left */
-		case CMDLINE_KEY_BKSPACE:
-		case CMDLINE_KEY_BKSPACE2:
-			if(!cirbuf_del_tail_safe(&rdl->left)) {
-				rdline_puts(rdl, vt100_bs);
-				display_right_buffer(rdl, 1);
-			}
-			break;
-
-		/* delete 1 char from the right */
-		case CMDLINE_KEY_SUPPR:
-		case CMDLINE_KEY_CTRL_D:
-			if (cmd == CMDLINE_KEY_CTRL_D &&
-			    CIRBUF_IS_EMPTY(&rdl->left) &&
-			    CIRBUF_IS_EMPTY(&rdl->right)) {
-				return RDLINE_RES_EOF;
-			}
-			if (!cirbuf_del_head_safe(&rdl->right)) {
-				display_right_buffer(rdl, 1);
-			}
-			break;
-
-		/* delete 1 word from the left */
-		case CMDLINE_KEY_META_BKSPACE:
-		case CMDLINE_KEY_CTRL_W:
-			while (! CIRBUF_IS_EMPTY(&rdl->left) && isblank2(cirbuf_get_tail(&rdl->left))) {
-				rdline_puts(rdl, vt100_bs);
-				cirbuf_del_tail(&rdl->left);
-			}
-			while (! CIRBUF_IS_EMPTY(&rdl->left) && !isblank2(cirbuf_get_tail(&rdl->left))) {
-				rdline_puts(rdl, vt100_bs);
-				cirbuf_del_tail(&rdl->left);
-			}
-			display_right_buffer(rdl, 1);
-			break;
-
-		/* delete 1 word from the right */
-		case CMDLINE_KEY_META_D:
-			while (! CIRBUF_IS_EMPTY(&rdl->right) && isblank2(cirbuf_get_head(&rdl->right)))
-				cirbuf_del_head(&rdl->right);
-			while (! CIRBUF_IS_EMPTY(&rdl->right) && !isblank2(cirbuf_get_head(&rdl->right)))
-				cirbuf_del_head(&rdl->right);
-			display_right_buffer(rdl, 1);
-			break;
-
-		/* set kill buffer to contents on the right side of caret */
-		case CMDLINE_KEY_CTRL_K:
-			cirbuf_get_buf_head(&rdl->right, rdl->kill_buf, RDLINE_BUF_SIZE);
-			rdl->kill_size = CIRBUF_GET_LEN(&rdl->right);
-			cirbuf_del_buf_head(&rdl->right, rdl->kill_size);
-			rdline_puts(rdl, vt100_clear_right);
-			break;
-
-		/* paste contents of kill buffer to the left side of caret */
-		case CMDLINE_KEY_CTRL_Y:
-			i=0;
-			while(CIRBUF_GET_LEN(&rdl->right) + CIRBUF_GET_LEN(&rdl->left) <
-			      RDLINE_BUF_SIZE &&
-			      i < rdl->kill_size) {
-				cirbuf_add_tail(&rdl->left, rdl->kill_buf[i]);
-				rdl->write_char(rdl, rdl->kill_buf[i]);
-				i++;
-			}
-			display_right_buffer(rdl, 0);
-			break;
-
-		/* clear and newline */
-		case CMDLINE_KEY_CTRL_C:
-			rdline_puts(rdl, "\r\n");
-			rdline_newline(rdl, rdl->prompt);
-			break;
-
-		/* redisplay (helps when prompt is lost in other output) */
-		case CMDLINE_KEY_CTRL_L:
-			rdline_redisplay(rdl);
-			break;
-
-		/* autocomplete */
-		case CMDLINE_KEY_TAB:
-		case CMDLINE_KEY_HELP:
-			cirbuf_align_left(&rdl->left);
-			rdl->left_buf[CIRBUF_GET_LEN(&rdl->left)] = '\0';
-			if (rdl->complete) {
-				char tmp_buf[BUFSIZ];
-				int complete_state;
-				int ret;
-				unsigned int tmp_size;
-
-				if (cmd == CMDLINE_KEY_TAB)
-					complete_state = 0;
-				else
-					complete_state = -1;
-
-				/* see in parse.h for help on complete() */
-				ret = rdl->complete(rdl, rdl->left_buf,
-						    tmp_buf, sizeof(tmp_buf),
-						    &complete_state);
-				/* no completion or error */
-				if (ret <= 0) {
-					return RDLINE_RES_COMPLETE;
-				}
-
-				tmp_size = strnlen(tmp_buf, sizeof(tmp_buf));
-				/* add chars */
-				if (ret == RDLINE_RES_COMPLETE) {
-					i=0;
-					while(CIRBUF_GET_LEN(&rdl->right) + CIRBUF_GET_LEN(&rdl->left) <
-					      RDLINE_BUF_SIZE &&
-					      i < tmp_size) {
-						cirbuf_add_tail(&rdl->left, tmp_buf[i]);
-						rdl->write_char(rdl, tmp_buf[i]);
-						i++;
-					}
-					display_right_buffer(rdl, 1);
-					return RDLINE_RES_COMPLETE; /* ?? */
-				}
-
-				/* choice */
-				rdline_puts(rdl, "\r\n");
-				while (ret) {
-					rdl->write_char(rdl, ' ');
-					for (i=0 ; tmp_buf[i] ; i++)
-						rdl->write_char(rdl, tmp_buf[i]);
-					rdline_puts(rdl, "\r\n");
-					ret = rdl->complete(rdl, rdl->left_buf,
-							    tmp_buf, sizeof(tmp_buf),
-							    &complete_state);
-				}
-
-				rdline_redisplay(rdl);
-			}
-			return RDLINE_RES_COMPLETE;
-
-		/* complete buffer */
-		case CMDLINE_KEY_RETURN:
-		case CMDLINE_KEY_RETURN2:
-			rdline_get_buffer(rdl);
-			rdl->status = RDLINE_INIT;
-			rdline_puts(rdl, "\r\n");
-			if (rdl->history_cur_line != -1)
-				rdline_remove_first_history_item(rdl);
-
-			if (rdl->validate)
-				rdl->validate(rdl, rdl->left_buf, CIRBUF_GET_LEN(&rdl->left)+2);
-			/* user may have stopped rdline */
-			if (rdl->status == RDLINE_EXITED)
-				return RDLINE_RES_EXITED;
-			return RDLINE_RES_VALIDATED;
-
-		/* previous element in history */
-		case CMDLINE_KEY_UP_ARR:
-		case CMDLINE_KEY_CTRL_P:
-			if (rdl->history_cur_line == 0) {
-				rdline_remove_first_history_item(rdl);
-			}
-			if (rdl->history_cur_line <= 0) {
-				rdline_add_history(rdl, rdline_get_buffer(rdl));
-				rdl->history_cur_line = 0;
-			}
-
-			buf = rdline_get_history_item(rdl, rdl->history_cur_line + 1);
-			if (!buf)
-				break;
-
-			rdl->history_cur_line ++;
-			vt100_init(&rdl->vt100);
-			cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE);
-			cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE);
-			cirbuf_add_buf_tail(&rdl->left, buf, strnlen(buf, RDLINE_BUF_SIZE));
-			rdline_redisplay(rdl);
-			break;
-
-		/* next element in history */
-		case CMDLINE_KEY_DOWN_ARR:
-		case CMDLINE_KEY_CTRL_N:
-			if (rdl->history_cur_line - 1 < 0)
-				break;
-
-			rdl->history_cur_line --;
-			buf = rdline_get_history_item(rdl, rdl->history_cur_line);
-			if (!buf)
-				break;
-			vt100_init(&rdl->vt100);
-			cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE);
-			cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE);
-			cirbuf_add_buf_tail(&rdl->left, buf, strnlen(buf, RDLINE_BUF_SIZE));
-			rdline_redisplay(rdl);
-
-			break;
-
-
-		default:
-			break;
-		}
-
-		return RDLINE_RES_SUCCESS;
-	}
-
-	if (!isprint((int)c))
-		return RDLINE_RES_SUCCESS;
-
-	/* standard chars */
-	if (CIRBUF_GET_LEN(&rdl->left) + CIRBUF_GET_LEN(&rdl->right) >= RDLINE_BUF_SIZE)
-		return RDLINE_RES_SUCCESS;
-
-	if (cirbuf_add_tail_safe(&rdl->left, c))
-		return RDLINE_RES_SUCCESS;
-
-	rdl->write_char(rdl, c);
-	display_right_buffer(rdl, 0);
-
-	return RDLINE_RES_SUCCESS;
-}
-
-
-/* HISTORY */
-
-static void
-rdline_remove_old_history_item(struct rdline * rdl)
-{
-	char tmp;
-
-	while (! CIRBUF_IS_EMPTY(&rdl->history) ) {
-		tmp = cirbuf_get_head(&rdl->history);
-		cirbuf_del_head(&rdl->history);
-		if (!tmp)
-			break;
-	}
-}
-
-static void
-rdline_remove_first_history_item(struct rdline * rdl)
-{
-	char tmp;
-
-	if ( CIRBUF_IS_EMPTY(&rdl->history) ) {
-		return;
-	}
-	else {
-		cirbuf_del_tail(&rdl->history);
-	}
-
-	while (! CIRBUF_IS_EMPTY(&rdl->history) ) {
-		tmp = cirbuf_get_tail(&rdl->history);
-		if (!tmp)
-			break;
-		cirbuf_del_tail(&rdl->history);
-	}
-}
-
-static unsigned int
-rdline_get_history_size(struct rdline * rdl)
-{
-	unsigned int i, tmp, ret=0;
-
-	CIRBUF_FOREACH(&rdl->history, i, tmp) {
-		if (tmp == 0)
-			ret ++;
-	}
-
-	return ret;
-}
-
-char *
-rdline_get_history_item(struct rdline * rdl, unsigned int idx)
-{
-	unsigned int len, i, tmp;
-
-	if (!rdl)
-		return NULL;
-
-	len = rdline_get_history_size(rdl);
-	if ( idx >= len ) {
-		return NULL;
-	}
-
-	cirbuf_align_left(&rdl->history);
-
-	CIRBUF_FOREACH(&rdl->history, i, tmp) {
-		if ( idx == len - 1) {
-			return rdl->history_buf + i;
-		}
-		if (tmp == 0)
-			len --;
-	}
-
-	return NULL;
-}
-
-int
-rdline_add_history(struct rdline * rdl, const char * buf)
-{
-	unsigned int len, i;
-
-	if (!rdl || !buf)
-		return -EINVAL;
-
-	len = strnlen(buf, RDLINE_BUF_SIZE);
-	for (i=0; i<len ; i++) {
-		if (buf[i] == '\n') {
-			len = i;
-			break;
-		}
-	}
-
-	if ( len >= RDLINE_HISTORY_BUF_SIZE )
-		return -1;
-
-	while ( len >= CIRBUF_GET_FREELEN(&rdl->history) ) {
-		rdline_remove_old_history_item(rdl);
-	}
-
-	cirbuf_add_buf_tail(&rdl->history, buf, len);
-	cirbuf_add_tail(&rdl->history, 0);
-
-	return 0;
-}
-
-void
-rdline_clear_history(struct rdline * rdl)
-{
-	if (!rdl)
-		return;
-	cirbuf_init(&rdl->history, rdl->history_buf, 0, RDLINE_HISTORY_BUF_SIZE);
-}
-
-
-/* STATIC USEFUL FUNCS */
-
-static void
-rdline_puts(struct rdline * rdl, const char * buf)
-{
-	char c;
-	while ( (c = *(buf++)) != '\0' ) {
-		rdl->write_char(rdl, c);
-	}
-}
-
-/* a very very basic printf with one arg and one format 'u' */
-static void
-rdline_miniprintf(struct rdline *rdl, const char * buf, unsigned int val)
-{
-	char c, started=0, div=100;
-
-	while ( (c=*(buf++)) ) {
-		if (c != '%') {
-			rdl->write_char(rdl, c);
-			continue;
-		}
-		c = *(buf++);
-		if (c != 'u') {
-			rdl->write_char(rdl, '%');
-			rdl->write_char(rdl, c);
-			continue;
-		}
-		/* val is never more than 255 */
-		while (div) {
-			c = (char)(val / div);
-			if (c || started) {
-				rdl->write_char(rdl, (char)(c+'0'));
-				started = 1;
-			}
-			val %= div;
-			div /= 10;
-		}
-	}
-}
diff --git a/lib/librte_cmdline/cmdline_rdline.h b/lib/librte_cmdline/cmdline_rdline.h
deleted file mode 100644
index d2170293d..000000000
--- a/lib/librte_cmdline/cmdline_rdline.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#ifndef _RDLINE_H_
-#define _RDLINE_H_
-
-/**
- * This file is a small equivalent to the GNU readline library, but it
- * was originally designed for small systems, like Atmel AVR
- * microcontrollers (8 bits). Indeed, we don't use any malloc that is
- * sometimes not implemented (or just not recommended) on such
- * systems.
- *
- * Obviously, it does not support as many things as the GNU readline,
- * but at least it supports some interesting features like a kill
- * buffer and a command history.
- *
- * It also have a feature that does not have the GNU readline (as far
- * as I know): we can have several instances of it running at the same
- * time, even on a monothread program, since it works with callbacks.
- *
- * The lib is designed for a client-side or a server-side use:
- * - server-side: the server receives all data from a socket, including
- *   control chars, like arrows, tabulations, ... The client is
- *   very simple, it can be a telnet or a minicom through a serial line.
- * - client-side: the client receives its data through its stdin for
- *   instance.
- */
-
-#include <stdio.h>
-#include <cmdline_cirbuf.h>
-#include <cmdline_vt100.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* configuration */
-#define RDLINE_BUF_SIZE 512
-#define RDLINE_PROMPT_SIZE  32
-#define RDLINE_VT100_BUF_SIZE  8
-#define RDLINE_HISTORY_BUF_SIZE BUFSIZ
-#define RDLINE_HISTORY_MAX_LINE 64
-
-enum rdline_status {
-	RDLINE_INIT,
-	RDLINE_RUNNING,
-	RDLINE_EXITED
-};
-
-struct rdline;
-
-typedef int (rdline_write_char_t)(struct rdline *rdl, char);
-typedef void (rdline_validate_t)(struct rdline *rdl,
-				 const char *buf, unsigned int size);
-typedef int (rdline_complete_t)(struct rdline *rdl, const char *buf,
-				char *dstbuf, unsigned int dstsize,
-				int *state);
-
-struct rdline {
-	enum rdline_status status;
-	/* rdline bufs */
-	struct cirbuf left;
-	struct cirbuf right;
-	char left_buf[RDLINE_BUF_SIZE+2]; /* reserve 2 chars for the \n\0 */
-	char right_buf[RDLINE_BUF_SIZE];
-
-	char prompt[RDLINE_PROMPT_SIZE];
-	unsigned int prompt_size;
-
-	char kill_buf[RDLINE_BUF_SIZE];
-	unsigned int kill_size;
-
-	/* history */
-	struct cirbuf history;
-	char history_buf[RDLINE_HISTORY_BUF_SIZE];
-	int history_cur_line;
-
-	/* callbacks and func pointers */
-	rdline_write_char_t *write_char;
-	rdline_validate_t *validate;
-	rdline_complete_t *complete;
-
-	/* vt100 parser */
-	struct cmdline_vt100 vt100;
-
-	/* opaque pointer */
-	void *opaque;
-};
-
-/**
- * Init fields for a struct rdline. Call this only once at the beginning
- * of your program.
- * \param rdl A pointer to an uninitialized struct rdline
- * \param write_char The function used by the function to write a character
- * \param validate A pointer to the function to execute when the
- *                 user validates the buffer.
- * \param complete A pointer to the function to execute when the
- *                 user completes the buffer.
- */
-int rdline_init(struct rdline *rdl,
-		 rdline_write_char_t *write_char,
-		 rdline_validate_t *validate,
-		 rdline_complete_t *complete);
-
-
-/**
- * Init the current buffer, and display a prompt.
- * \param rdl A pointer to a struct rdline
- * \param prompt A string containing the prompt
- */
-void rdline_newline(struct rdline *rdl, const char *prompt);
-
-/**
- * Call it and all received chars will be ignored.
- * \param rdl A pointer to a struct rdline
- */
-void rdline_stop(struct rdline *rdl);
-
-/**
- * Same than rdline_stop() except that next calls to rdline_char_in()
- * will return RDLINE_RES_EXITED.
- * \param rdl A pointer to a struct rdline
- */
-void rdline_quit(struct rdline *rdl);
-
-/**
- * Restart after a call to rdline_stop() or rdline_quit()
- * \param rdl A pointer to a struct rdline
- */
-void rdline_restart(struct rdline *rdl);
-
-/**
- * Redisplay the current buffer
- * \param rdl A pointer to a struct rdline
- */
-void rdline_redisplay(struct rdline *rdl);
-
-/**
- * Reset the current buffer and setup for a new line.
- *  \param rdl A pointer to a struct rdline
- */
-void rdline_reset(struct rdline *rdl);
-
-
-/* return status for rdline_char_in() */
-#define RDLINE_RES_SUCCESS       0
-#define RDLINE_RES_VALIDATED     1
-#define RDLINE_RES_COMPLETE      2
-#define RDLINE_RES_NOT_RUNNING  -1
-#define RDLINE_RES_EOF          -2
-#define RDLINE_RES_EXITED       -3
-
-/**
- * append a char to the readline buffer.
- * Return RDLINE_RES_VALIDATE when the line has been validated.
- * Return RDLINE_RES_COMPLETE when the user asked to complete the buffer.
- * Return RDLINE_RES_NOT_RUNNING if it is not running.
- * Return RDLINE_RES_EOF if EOF (ctrl-d on an empty line).
- * Else return RDLINE_RES_SUCCESS.
- * XXX error case when the buffer is full ?
- *
- * \param rdl A pointer to a struct rdline
- * \param c The character to append
- */
-int rdline_char_in(struct rdline *rdl, char c);
-
-/**
- * Return the current buffer, terminated by '\0'.
- * \param rdl A pointer to a struct rdline
- */
-const char *rdline_get_buffer(struct rdline *rdl);
-
-
-/**
- * Add the buffer to history.
- * return < 0 on error.
- * \param rdl A pointer to a struct rdline
- * \param buf A buffer that is terminated by '\0'
- */
-int rdline_add_history(struct rdline *rdl, const char *buf);
-
-/**
- * Clear current history
- * \param rdl A pointer to a struct rdline
- */
-void rdline_clear_history(struct rdline *rdl);
-
-/**
- * Get the i-th history item
- */
-char *rdline_get_history_item(struct rdline *rdl, unsigned int i);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _RDLINE_H_ */
diff --git a/lib/librte_cmdline/cmdline_socket.c b/lib/librte_cmdline/cmdline_socket.c
index ecb3d82b6..f639c61cb 100644
--- a/lib/librte_cmdline/cmdline_socket.c
+++ b/lib/librte_cmdline/cmdline_socket.c
@@ -4,23 +4,18 @@
  * All rights reserved.
  */
 
-#include <stdio.h>
-#include <string.h>
+#include <stddef.h>
 #include <unistd.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <inttypes.h>
 #include <fcntl.h>
-#include <termios.h>
 
 #include "cmdline_parse.h"
-#include "cmdline_rdline.h"
 #include "cmdline_socket.h"
 #include "cmdline.h"
 
 struct cmdline *
 cmdline_file_new(cmdline_parse_ctx_t *ctx, const char *prompt, const char *path)
 {
+	struct cmdline *cl;
 	int fd;
 
 	/* everything else is checked in cmdline_new() */
@@ -29,37 +24,22 @@ cmdline_file_new(cmdline_parse_ctx_t *ctx, const char *prompt, const char *path)
 
 	fd = open(path, O_RDONLY, 0);
 	if (fd < 0) {
-		dprintf("open() failed\n");
 		return NULL;
 	}
-	return cmdline_new(ctx, prompt, fd, -1);
+	cl = cmdline_new(ctx, prompt, fd, -1);
+	/* cmdline_new() duplicates fd */
+	close(fd);
+	return cl;
 }
 
 struct cmdline *
 cmdline_stdin_new(cmdline_parse_ctx_t *ctx, const char *prompt)
 {
-	struct cmdline *cl;
-	struct termios oldterm, term;
-
-	tcgetattr(0, &oldterm);
-	memcpy(&term, &oldterm, sizeof(term));
-	term.c_lflag &= ~(ICANON | ECHO | ISIG);
-	tcsetattr(0, TCSANOW, &term);
-	setbuf(stdin, NULL);
-
-	cl = cmdline_new(ctx, prompt, 0, 1);
-
-	if (cl)
-		memcpy(&cl->oldterm, &oldterm, sizeof(term));
-
-	return cl;
+	return cmdline_new(ctx, prompt, 0, 1);
 }
 
 void
 cmdline_stdin_exit(struct cmdline *cl)
 {
-	if (!cl)
-		return;
-
-	tcsetattr(fileno(stdin), TCSANOW, &cl->oldterm);
+	cmdline_free(cl);
 }
diff --git a/lib/librte_cmdline/cmdline_vt100.c b/lib/librte_cmdline/cmdline_vt100.c
deleted file mode 100644
index 662fc7345..000000000
--- a/lib/librte_cmdline/cmdline_vt100.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdarg.h>
-#include <ctype.h>
-#include <termios.h>
-
-#include "cmdline_vt100.h"
-
-const char *cmdline_vt100_commands[] = {
-	vt100_up_arr,
-	vt100_down_arr,
-	vt100_right_arr,
-	vt100_left_arr,
-	"\177",
-	"\n",
-	"\001",
-	"\005",
-	"\013",
-	"\031",
-	"\003",
-	"\006",
-	"\002",
-	vt100_suppr,
-	vt100_tab,
-	"\004",
-	"\014",
-	"\r",
-	"\033\177",
-	vt100_word_left,
-	vt100_word_right,
-	"?",
-	"\027",
-	"\020",
-	"\016",
-	"\033\144",
-	vt100_bs,
-};
-
-void
-vt100_init(struct cmdline_vt100 *vt)
-{
-	if (!vt)
-		return;
-	vt->state = CMDLINE_VT100_INIT;
-}
-
-
-static int
-match_command(char *buf, unsigned int size)
-{
-	const char *cmd;
-	size_t cmdlen;
-	unsigned int i = 0;
-
-	for (i=0 ; i<sizeof(cmdline_vt100_commands)/sizeof(const char *) ; i++) {
-		cmd = *(cmdline_vt100_commands + i);
-
-		cmdlen = strnlen(cmd, CMDLINE_VT100_BUF_SIZE);
-		if (size == cmdlen &&
-		    !strncmp(buf, cmd, cmdlen)) {
-			return i;
-		}
-	}
-
-	return -1;
-}
-
-int
-vt100_parser(struct cmdline_vt100 *vt, char ch)
-{
-	unsigned int size;
-	uint8_t c = (uint8_t) ch;
-
-	if (!vt)
-		return -1;
-
-	if (vt->bufpos >= CMDLINE_VT100_BUF_SIZE) {
-		vt->state = CMDLINE_VT100_INIT;
-		vt->bufpos = 0;
-	}
-
-	vt->buf[vt->bufpos++] = c;
-	size = vt->bufpos;
-
-	switch (vt->state) {
-	case CMDLINE_VT100_INIT:
-		if (c == 033) {
-			vt->state = CMDLINE_VT100_ESCAPE;
-		}
-		else {
-			vt->bufpos = 0;
-			goto match_command;
-		}
-		break;
-
-	case CMDLINE_VT100_ESCAPE:
-		if (c == 0133) {
-			vt->state = CMDLINE_VT100_ESCAPE_CSI;
-		}
-		else if (c >= 060 && c <= 0177) { /* XXX 0177 ? */
-			vt->bufpos = 0;
-			vt->state = CMDLINE_VT100_INIT;
-			goto match_command;
-		}
-		break;
-
-	case CMDLINE_VT100_ESCAPE_CSI:
-		if (c >= 0100 && c <= 0176) {
-			vt->bufpos = 0;
-			vt->state = CMDLINE_VT100_INIT;
-			goto match_command;
-		}
-		break;
-
-	default:
-		vt->bufpos = 0;
-		break;
-	}
-
-	return -2;
-
- match_command:
-	return match_command(vt->buf, size);
-}
diff --git a/lib/librte_cmdline/cmdline_vt100.h b/lib/librte_cmdline/cmdline_vt100.h
deleted file mode 100644
index e33e67ed8..000000000
--- a/lib/librte_cmdline/cmdline_vt100.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#ifndef _CMDLINE_VT100_H_
-#define _CMDLINE_VT100_H_
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define vt100_bell         "\007"
-#define vt100_bs           "\010"
-#define vt100_bs_clear     "\010 \010"
-#define vt100_tab          "\011"
-#define vt100_crnl         "\012\015"
-#define vt100_clear_right  "\033[0K"
-#define vt100_clear_left   "\033[1K"
-#define vt100_clear_down   "\033[0J"
-#define vt100_clear_up     "\033[1J"
-#define vt100_clear_line   "\033[2K"
-#define vt100_clear_screen "\033[2J"
-#define vt100_up_arr       "\033\133\101"
-#define vt100_down_arr     "\033\133\102"
-#define vt100_right_arr    "\033\133\103"
-#define vt100_left_arr     "\033\133\104"
-#define vt100_multi_right  "\033\133%uC"
-#define vt100_multi_left   "\033\133%uD"
-#define vt100_suppr        "\033\133\063\176"
-#define vt100_home         "\033M\033E"
-#define vt100_word_left    "\033\142"
-#define vt100_word_right   "\033\146"
-
-/* Result of parsing : it must be synchronized with
- * cmdline_vt100_commands[] in vt100.c */
-#define CMDLINE_KEY_UP_ARR 0
-#define CMDLINE_KEY_DOWN_ARR 1
-#define CMDLINE_KEY_RIGHT_ARR 2
-#define CMDLINE_KEY_LEFT_ARR 3
-#define CMDLINE_KEY_BKSPACE 4
-#define CMDLINE_KEY_RETURN 5
-#define CMDLINE_KEY_CTRL_A 6
-#define CMDLINE_KEY_CTRL_E 7
-#define CMDLINE_KEY_CTRL_K 8
-#define CMDLINE_KEY_CTRL_Y 9
-#define CMDLINE_KEY_CTRL_C 10
-#define CMDLINE_KEY_CTRL_F 11
-#define CMDLINE_KEY_CTRL_B 12
-#define CMDLINE_KEY_SUPPR 13
-#define CMDLINE_KEY_TAB 14
-#define CMDLINE_KEY_CTRL_D 15
-#define CMDLINE_KEY_CTRL_L 16
-#define CMDLINE_KEY_RETURN2 17
-#define CMDLINE_KEY_META_BKSPACE 18
-#define CMDLINE_KEY_WLEFT 19
-#define CMDLINE_KEY_WRIGHT 20
-#define CMDLINE_KEY_HELP 21
-#define CMDLINE_KEY_CTRL_W 22
-#define CMDLINE_KEY_CTRL_P 23
-#define CMDLINE_KEY_CTRL_N 24
-#define CMDLINE_KEY_META_D 25
-#define CMDLINE_KEY_BKSPACE2 26
-
-extern const char *cmdline_vt100_commands[];
-
-enum cmdline_vt100_parser_state {
-	CMDLINE_VT100_INIT,
-	CMDLINE_VT100_ESCAPE,
-	CMDLINE_VT100_ESCAPE_CSI
-};
-
-#define CMDLINE_VT100_BUF_SIZE 8
-struct cmdline_vt100 {
-	uint8_t bufpos;
-	char buf[CMDLINE_VT100_BUF_SIZE];
-	enum cmdline_vt100_parser_state state;
-};
-
-/**
- * Init
- */
-void vt100_init(struct cmdline_vt100 *vt);
-
-/**
- * Input a new character.
- * Return -1 if the character is not part of a control sequence
- * Return -2 if c is not the last char of a control sequence
- * Else return the index in vt100_commands[]
- */
-int vt100_parser(struct cmdline_vt100 *vt, char c);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/lib/librte_cmdline/meson.build b/lib/librte_cmdline/meson.build
index 5741817ac..1735737c7 100644
--- a/lib/librte_cmdline/meson.build
+++ b/lib/librte_cmdline/meson.build
@@ -1,18 +1,15 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
-version = 2
+version = 3
 sources = files('cmdline.c',
-	'cmdline_cirbuf.c',
 	'cmdline_parse.c',
 	'cmdline_parse_etheraddr.c',
 	'cmdline_parse_ipaddr.c',
 	'cmdline_parse_num.c',
 	'cmdline_parse_portlist.c',
 	'cmdline_parse_string.c',
-	'cmdline_rdline.c',
-	'cmdline_socket.c',
-	'cmdline_vt100.c')
+	'cmdline_socket.c')
 
 headers = files('cmdline.h',
 	'cmdline_parse.h',
@@ -20,8 +17,13 @@ headers = files('cmdline.h',
 	'cmdline_parse_ipaddr.h',
 	'cmdline_parse_etheraddr.h',
 	'cmdline_parse_string.h',
-	'cmdline_rdline.h',
-	'cmdline_vt100.h',
 	'cmdline_socket.h',
-	'cmdline_cirbuf.h',
 	'cmdline_parse_portlist.h')
+
+cmdline_dep = dependency('libedit', required: false)
+if cmdline_dep.found()
+	ext_deps += cmdline_dep
+	dpdk_extra_ldflags += '-ledit'
+else
+	build = false
+endif
diff --git a/lib/librte_cmdline/rte_cmdline_version.map b/lib/librte_cmdline/rte_cmdline_version.map
index 04bcb387f..31331995b 100644
--- a/lib/librte_cmdline/rte_cmdline_version.map
+++ b/lib/librte_cmdline/rte_cmdline_version.map
@@ -1,25 +1,6 @@
 DPDK_2.0 {
 	global:
 
-	cirbuf_add_buf_head;
-	cirbuf_add_buf_tail;
-	cirbuf_add_head;
-	cirbuf_add_head_safe;
-	cirbuf_add_tail;
-	cirbuf_add_tail_safe;
-	cirbuf_align_left;
-	cirbuf_align_right;
-	cirbuf_del_buf_head;
-	cirbuf_del_buf_tail;
-	cirbuf_del_head;
-	cirbuf_del_head_safe;
-	cirbuf_del_tail;
-	cirbuf_del_tail_safe;
-	cirbuf_get_buf_head;
-	cirbuf_get_buf_tail;
-	cirbuf_get_head;
-	cirbuf_get_tail;
-	cirbuf_init;
 	cmdline_complete;
 	cmdline_complete_get_elt_string;
 	cmdline_complete_get_nb_string;
@@ -50,21 +31,6 @@ DPDK_2.0 {
 	cmdline_token_num_ops;
 	cmdline_token_portlist_ops;
 	cmdline_token_string_ops;
-	cmdline_write_char;
-	rdline_add_history;
-	rdline_char_in;
-	rdline_clear_history;
-	rdline_get_buffer;
-	rdline_get_history_item;
-	rdline_init;
-	rdline_newline;
-	rdline_quit;
-	rdline_redisplay;
-	rdline_reset;
-	rdline_restart;
-	rdline_stop;
-	vt100_init;
-	vt100_parser;
 
 	local: *;
 };
@@ -75,3 +41,10 @@ DPDK_2.1 {
 	cmdline_poll;
 
 } DPDK_2.0;
+
+DPDK_18.02 {
+	global:
+
+	cmdline_ctx_get;
+
+} DPDK_2.1;
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 8bab901fc..f66411eba 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -104,6 +104,8 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_RING)           += -lrte_ring
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PCI)            += -lrte_pci
 _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lrte_eal
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CMDLINE)        += -lrte_cmdline
+
+_LDLIBS-$(CONFIG_RTE_LIBRTE_CMDLINE)        += $(shell pkg-config --libs libedit)
 _LDLIBS-$(CONFIG_RTE_LIBRTE_REORDER)        += -lrte_reorder
 _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lrte_sched
 
diff --git a/test/cmdline_test/cmdline_test.c b/test/cmdline_test/cmdline_test.c
index 3e406331a..e46b66d0c 100644
--- a/test/cmdline_test/cmdline_test.c
+++ b/test/cmdline_test/cmdline_test.c
@@ -12,7 +12,6 @@
 #include <ctype.h>
 #include <sys/queue.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_socket.h>
 #include <cmdline.h>
diff --git a/test/cmdline_test/commands.c b/test/cmdline_test/commands.c
index d81da9665..2bf0262f6 100644
--- a/test/cmdline_test/commands.c
+++ b/test/cmdline_test/commands.c
@@ -7,7 +7,6 @@
 #include <termios.h>
 #include <inttypes.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_string.h>
 #include <cmdline_parse_num.h>
@@ -277,72 +276,6 @@ cmdline_parse_inst_t cmd_ambig_2 = {
 
 
 
-/*** get_history_bufsize ***/
-/* command that displays total space in history buffer
- * this will be useful for testing history (to fill it up just enough to
- * remove the last entry, we need to know how big it is).
- */
-
-struct cmd_get_history_bufsize_result {
-	cmdline_fixed_string_t str;
-};
-
-static void
-cmd_get_history_bufsize_parsed(__attribute__((unused)) void *parsed_result,
-		struct cmdline *cl,
-		__attribute__((unused)) void *data)
-{
-	cmdline_printf(cl, "History buffer size: %zu\n",
-			sizeof(cl->rdl.history_buf));
-}
-
-cmdline_parse_token_string_t cmd_get_history_bufsize_tok =
-	TOKEN_STRING_INITIALIZER(struct cmd_get_history_bufsize_result, str,
-				 "get_history_bufsize");
-
-cmdline_parse_inst_t cmd_get_history_bufsize = {
-	.f = cmd_get_history_bufsize_parsed,  /* function to call */
-	.data = NULL,      /* 2nd arg of func */
-	.help_str = "command that displays total space in history buffer",
-	.tokens = {        /* token list, NULL terminated */
-		(void *)&cmd_get_history_bufsize_tok,
-		NULL,
-	},
-};
-
-
-
-/*** clear_history ***/
-/* clears history buffer */
-
-struct cmd_clear_history_result {
-	cmdline_fixed_string_t str;
-};
-
-static void
-cmd_clear_history_parsed(__attribute__((unused)) void *parsed_result,
-		struct cmdline *cl,
-		__attribute__((unused)) void *data)
-{
-	rdline_clear_history(&cl->rdl);
-}
-
-cmdline_parse_token_string_t cmd_clear_history_tok =
-	TOKEN_STRING_INITIALIZER(struct cmd_clear_history_result, str,
-				 "clear_history");
-
-cmdline_parse_inst_t cmd_clear_history = {
-	.f = cmd_clear_history_parsed,  /* function to call */
-	.data = NULL,      /* 2nd arg of func */
-	.help_str = "clear command history",
-	.tokens = {        /* token list, NULL terminated */
-		(void *)&cmd_clear_history_tok,
-		NULL,
-	},
-};
-
-
-
 /****************/
 
 cmdline_parse_ctx_t main_ctx[] = {
@@ -352,8 +285,6 @@ cmdline_parse_ctx_t main_ctx[] = {
 		(cmdline_parse_inst_t *)&cmd_single,
 		(cmdline_parse_inst_t *)&cmd_single_long,
 		(cmdline_parse_inst_t *)&cmd_num,
-		(cmdline_parse_inst_t *)&cmd_get_history_bufsize,
-		(cmdline_parse_inst_t *)&cmd_clear_history,
 		(cmdline_parse_inst_t *)&cmd_autocomplete_1,
 		(cmdline_parse_inst_t *)&cmd_autocomplete_2,
 	NULL,
diff --git a/test/test/Makefile b/test/test/Makefile
index c9c007c9b..08cc04277 100644
--- a/test/test/Makefile
+++ b/test/test/Makefile
@@ -139,7 +139,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_num.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_etheraddr.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_portlist.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_ipaddr.c
-SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_cirbuf.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_string.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_lib.c
 
diff --git a/test/test/commands.c b/test/test/commands.c
index 6bfdc0272..7e5357b93 100644
--- a/test/test/commands.c
+++ b/test/test/commands.c
@@ -38,7 +38,6 @@
 #include <rte_mbuf.h>
 #include <rte_devargs.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_ipaddr.h>
 #include <cmdline_parse_num.h>
diff --git a/test/test/meson.build b/test/test/meson.build
index eb3d87a4d..d5bfdccf3 100644
--- a/test/test/meson.build
+++ b/test/test/meson.build
@@ -10,7 +10,6 @@ test_sources = files('commands.c',
 	'test_barrier.c',
 	'test_byteorder.c',
 	'test_cmdline.c',
-	'test_cmdline_cirbuf.c',
 	'test_cmdline_etheraddr.c',
 	'test_cmdline_ipaddr.c',
 	'test_cmdline_lib.c',
diff --git a/test/test/test.c b/test/test/test.c
index 44dfe20ef..802e8079b 100644
--- a/test/test/test.c
+++ b/test/test/test.c
@@ -13,7 +13,6 @@
 #include <sys/queue.h>
 
 #ifdef RTE_LIBRTE_CMDLINE
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_socket.h>
 #include <cmdline.h>
diff --git a/test/test/test_cmdline.c b/test/test/test_cmdline.c
index 115bee966..eef219a93 100644
--- a/test/test/test_cmdline.c
+++ b/test/test/test_cmdline.c
@@ -45,15 +45,6 @@ test_cmdline(void)
 		return -1;
 	if (test_parse_string_invalid_param() < 0)
 		return -1;
-	printf("Testing circular buffer...\n");
-	if (test_cirbuf_char() < 0)
-		return -1;
-	if (test_cirbuf_string() < 0)
-		return -1;
-	if (test_cirbuf_align() < 0)
-		return -1;
-	if (test_cirbuf_invalid_param() < 0)
-		return -1;
 	printf("Testing library functions...\n");
 	if (test_cmdline_lib() < 0)
 		return -1;
diff --git a/test/test/test_cmdline.h b/test/test/test_cmdline.h
index 1854caf8f..2fb45b3d4 100644
--- a/test/test/test_cmdline.h
+++ b/test/test/test_cmdline.h
@@ -32,12 +32,6 @@ int test_parse_string_valid(void);
 int test_parse_string_invalid_data(void);
 int test_parse_string_invalid_param(void);
 
-/* cmdline_cirbuf tests */
-int test_cirbuf_invalid_param(void);
-int test_cirbuf_char(void);
-int test_cirbuf_string(void);
-int test_cirbuf_align(void);
-
 /* test the rest of the library */
 int test_cmdline_lib(void);
 
diff --git a/test/test/test_cmdline_cirbuf.c b/test/test/test_cmdline_cirbuf.c
deleted file mode 100644
index 8ac326cb0..000000000
--- a/test/test/test_cmdline_cirbuf.c
+++ /dev/null
@@ -1,1301 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <rte_string_fns.h>
-
-#include <cmdline_cirbuf.h>
-
-#include "test_cmdline.h"
-
-/* different length strings */
-#define CIRBUF_STR_HEAD " HEAD"
-#define CIRBUF_STR_TAIL "TAIL"
-
-/* miscellaneous tests - they make bullseye happy */
-static int
-test_cirbuf_string_misc(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/*
-	 * add strings to head and tail, but read only tail
-	 * this results in read operation that does not transcend
-	 * from buffer end to buffer beginning (in other words,
-	 * strlen <= cb->maxlen - cb->end)
-	 */
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* add string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* read string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to get string from tail!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: tail strings do not match!\n");
-		return -1;
-	}
-	/* clear buffers */
-	memset(tmp, 0, sizeof(tmp));
-	memset(buf, 0, sizeof(buf));
-
-
-
-	/*
-	 * add a string to buffer when start/end is at end of buffer
-	 */
-
-	/*
-	 * reinitialize circular buffer with start at the end of cirbuf
-	 */
-	if (cirbuf_init(&cb, buf, CMDLINE_TEST_BUFSIZE - 2, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-
-	/* add string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to add string to tail!\n");
-		return -1;
-	}
-	/* read string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to get string from tail!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: tail strings do not match!\n");
-		return -1;
-	}
-	/* clear tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* read string from tail */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to get string from head!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD)) != 0) {
-		printf("Error: headstrings do not match!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* test adding and deleting strings */
-static int
-test_cirbuf_string_add_del(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* read string from head */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to get string from head!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD)) != 0) {
-		printf("Error: head strings do not match!\n");
-		return -1;
-	}
-	/* clear tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-	/* read string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to get string from head!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD)) != 0) {
-		printf("Error: head strings do not match!\n");
-		return -1;
-	}
-	/* delete string from head*/
-	if (cirbuf_del_buf_head(&cb, sizeof(CIRBUF_STR_HEAD)) < 0) {
-		printf("Error: failed to delete string from head!\n");
-		return -1;
-	}
-	/* verify string was deleted */
-	if (cirbuf_del_head_safe(&cb) == 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-	/* clear tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to add string to tail!\n");
-		return -1;
-	}
-	/* get string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to get string from tail!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: tail strings do not match!\n");
-		return -1;
-	}
-	/* clear tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-	/* get string from head */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to get string from tail!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: tail strings do not match!\n");
-		return -1;
-	}
-	/* delete string from tail */
-	if (cirbuf_del_buf_tail(&cb, sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to delete string from tail!\n");
-		return -1;
-	}
-	/* verify string was deleted */
-	if (cirbuf_del_tail_safe(&cb) == 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* test adding from head and deleting from tail, and vice versa */
-static int
-test_cirbuf_string_add_del_reverse(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* delete string from tail */
-	if (cirbuf_del_buf_tail(&cb, sizeof(CIRBUF_STR_HEAD)) < 0) {
-		printf("Error: failed to delete string from tail!\n");
-		return -1;
-	}
-	/* verify string was deleted */
-	if (cirbuf_del_tail_safe(&cb) == 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-	/* clear tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to add string to tail!\n");
-		return -1;
-	}
-	/* delete string from head */
-	if (cirbuf_del_buf_head(&cb, sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to delete string from head!\n");
-		return -1;
-	}
-	/* verify string was deleted */
-	if (cirbuf_del_head_safe(&cb) == 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* try to write more than available */
-static int
-test_cirbuf_string_add_boundaries(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	unsigned i;
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* fill the buffer from tail */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE - sizeof(CIRBUF_STR_TAIL) + 1; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* try adding a string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			> 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* try adding a string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			> 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* fill the buffer from head */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE - sizeof(CIRBUF_STR_HEAD) + 1; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* try adding a string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			> 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* try adding a string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			> 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* try to read/delete more than written */
-static int
-test_cirbuf_string_get_del_boundaries(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-				!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* read more than written (head) */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_HEAD) + 1)
-			!= sizeof(CIRBUF_STR_HEAD)) {
-		printf("Error: unexpected result when reading too much data!\n");
-		return -1;
-	}
-	/* read more than written (tail) */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_HEAD) + 1)
-			!= sizeof(CIRBUF_STR_HEAD)) {
-		printf("Error: unexpected result when reading too much data!\n");
-		return -1;
-	}
-	/* delete more than written (head) */
-	if (cirbuf_del_buf_head(&cb, sizeof(CIRBUF_STR_HEAD) + 1) == 0) {
-		printf("Error: unexpected result when deleting too much data!\n");
-		return -1;
-	}
-	/* delete more than written (tail) */
-	if (cirbuf_del_buf_tail(&cb, sizeof(CIRBUF_STR_HEAD) + 1) == 0) {
-		printf("Error: unexpected result when deleting too much data!\n");
-		return -1;
-	}
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-				!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to add string to tail!\n");
-		return -1;
-	}
-	/* read more than written (tail) */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_TAIL) + 1)
-			!= sizeof(CIRBUF_STR_TAIL)) {
-		printf("Error: unexpected result when reading too much data!\n");
-		return -1;
-	}
-	/* read more than written (head) */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_TAIL) + 1)
-			!= sizeof(CIRBUF_STR_TAIL)) {
-		printf("Error: unexpected result when reading too much data!\n");
-		return -1;
-	}
-	/* delete more than written (tail) */
-	if (cirbuf_del_buf_tail(&cb, sizeof(CIRBUF_STR_TAIL) + 1) == 0) {
-		printf("Error: unexpected result when deleting too much data!\n");
-		return -1;
-	}
-	/* delete more than written (head) */
-	if (cirbuf_del_buf_tail(&cb, sizeof(CIRBUF_STR_TAIL) + 1) == 0) {
-		printf("Error: unexpected result when deleting too much data!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* try to read/delete less than written */
-static int
-test_cirbuf_string_get_del_partial(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-	char tmp2[CMDLINE_TEST_BUFSIZE];
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-	memset(tmp2, 0, sizeof(tmp));
-
-	strlcpy(tmp2, CIRBUF_STR_HEAD, sizeof(tmp2));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-				!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* read less than written (head) */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_HEAD) - 1)
-			!= sizeof(CIRBUF_STR_HEAD) - 1) {
-		printf("Error: unexpected result when reading from head!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, tmp2, sizeof(CIRBUF_STR_HEAD) - 1) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-	memset(tmp, 0, sizeof(tmp));
-	/* read less than written (tail) */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_HEAD) - 1)
-			!= sizeof(CIRBUF_STR_HEAD) - 1) {
-		printf("Error: unexpected result when reading from tail!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, &tmp2[1], sizeof(CIRBUF_STR_HEAD) - 1) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	/*
-	 * verify correct deletion
-	 */
-
-	/* clear buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-	/* delete less than written (head) */
-	if (cirbuf_del_buf_head(&cb, 1) != 0) {
-		printf("Error: delete from head failed!\n");
-		return -1;
-	}
-	/* read from head */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_HEAD) - 1)
-			!= sizeof(CIRBUF_STR_HEAD) - 1) {
-		printf("Error: unexpected result when reading from head!\n");
-		return -1;
-	}
-	/* since we deleted from head, first char should be deleted */
-	if (strncmp(tmp, &tmp2[1], sizeof(CIRBUF_STR_HEAD) - 1) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-	/* clear buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-	/* delete less than written (tail) */
-	if (cirbuf_del_buf_tail(&cb, 1) != 0) {
-		printf("Error: delete from tail failed!\n");
-		return -1;
-	}
-	/* read from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_HEAD) - 2)
-			!= sizeof(CIRBUF_STR_HEAD) - 2) {
-		printf("Error: unexpected result when reading from head!\n");
-		return -1;
-	}
-	/* since we deleted from tail, last char should be deleted */
-	if (strncmp(tmp, &tmp2[1], sizeof(CIRBUF_STR_HEAD) - 2) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* test cmdline_cirbuf char add/del functions */
-static int
-test_cirbuf_char_add_del(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-
-	/* clear buffer */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/*
-	 * try to delete something from cirbuf. since it's empty,
-	 * these should fail.
-	 */
-	if (cirbuf_del_head_safe(&cb) == 0) {
-		printf("Error: deleting from empty cirbuf head succeeded!\n");
-		return -1;
-	}
-	if (cirbuf_del_tail_safe(&cb) == 0) {
-		printf("Error: deleting from empty cirbuf tail succeeded!\n");
-		return -1;
-	}
-
-	/*
-	 * add, verify and delete. these should pass.
-	 */
-	if (cirbuf_add_head_safe(&cb,'h') < 0) {
-		printf("Error: adding to cirbuf head failed!\n");
-		return -1;
-	}
-	if (cirbuf_get_head(&cb) != 'h') {
-		printf("Error: wrong head content!\n");
-		return -1;
-	}
-	if (cirbuf_del_head_safe(&cb) < 0) {
-		printf("Error: deleting from cirbuf head failed!\n");
-		return -1;
-	}
-	if (cirbuf_add_tail_safe(&cb,'t') < 0) {
-		printf("Error: adding to cirbuf tail failed!\n");
-		return -1;
-	}
-	if (cirbuf_get_tail(&cb) != 't') {
-		printf("Error: wrong tail content!\n");
-		return -1;
-	}
-	if (cirbuf_del_tail_safe(&cb) < 0) {
-		printf("Error: deleting from cirbuf tail failed!\n");
-		return -1;
-	}
-	/* do the same for unsafe versions. those are void. */
-	cirbuf_add_head(&cb,'h');
-	if (cirbuf_get_head(&cb) != 'h') {
-		printf("Error: wrong head content!\n");
-		return -1;
-	}
-	cirbuf_del_head(&cb);
-
-	/* test if char has been deleted. we can't call cirbuf_get_head
-	 * because it's unsafe, but we can call cirbuf_get_buf_head.
-	 */
-	if (cirbuf_get_buf_head(&cb, tmp, 1) > 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	cirbuf_add_tail(&cb,'t');
-	if (cirbuf_get_tail(&cb) != 't') {
-		printf("Error: wrong tail content!\n");
-		return -1;
-	}
-	cirbuf_del_tail(&cb);
-
-	/* test if char has been deleted. we can't call cirbuf_get_tail
-	 * because it's unsafe, but we can call cirbuf_get_buf_tail.
-	 */
-	if (cirbuf_get_buf_tail(&cb, tmp, 1) > 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* test filling up buffer with chars */
-static int
-test_cirbuf_char_fill(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	unsigned i;
-
-	/* clear buffer */
-	memset(buf, 0, sizeof(buf));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/*
-	 * fill the buffer from head or tail, verify contents, test boundaries
-	 * and clear the buffer
-	 */
-
-	/* fill the buffer from tail */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-	/* verify that contents of the buffer are what they are supposed to be */
-	for (i = 0; i < sizeof(buf); i++) {
-		if (buf[i] != 't') {
-			printf("Error: wrong content in buffer!\n");
-			return -1;
-		}
-	}
-	/* try to add to a full buffer from tail */
-	if (cirbuf_add_tail_safe(&cb, 't') == 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* try to add to a full buffer from head */
-	if (cirbuf_add_head_safe(&cb, 'h') == 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* delete buffer from tail */
-	for(i = 0; i < CMDLINE_TEST_BUFSIZE; i++)
-		cirbuf_del_tail_safe(&cb);
-	/* try to delete from an empty buffer */
-	if (cirbuf_del_tail_safe(&cb) >= 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	/* fill the buffer from head */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-	/* verify that contents of the buffer are what they are supposed to be */
-	for (i = 0; i < sizeof(buf); i++) {
-		if (buf[i] != 'h') {
-			printf("Error: wrong content in buffer!\n");
-			return -1;
-		}
-	}
-	/* try to add to a full buffer from head */
-	if (cirbuf_add_head_safe(&cb,'h') >= 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* try to add to a full buffer from tail */
-	if (cirbuf_add_tail_safe(&cb, 't') == 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* delete buffer from head */
-	for(i = 0; i < CMDLINE_TEST_BUFSIZE; i++)
-		cirbuf_del_head_safe(&cb);
-	/* try to delete from an empty buffer */
-	if (cirbuf_del_head_safe(&cb) >= 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	/*
-	 * fill the buffer from both head and tail, with alternating characters,
-	 * verify contents and clear the buffer
-	 */
-
-	/* fill half of buffer from tail */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE / 2; i++)
-		cirbuf_add_tail_safe(&cb, (char) (i % 2 ? 't' : 'T'));
-	/* fill other half of the buffer from head */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE / 2; i++)
-		cirbuf_add_head_safe(&cb, (char) (i % 2 ? 'H' : 'h')); /* added in reverse */
-
-	/* verify that contents of the buffer are what they are supposed to be */
-	for (i = 0; i < sizeof(buf) / 2; i++) {
-		if (buf[i] != (char) (i % 2 ? 't' : 'T')) {
-			printf("Error: wrong content in buffer at %u!\n", i);
-			return -1;
-		}
-	}
-	for (i = sizeof(buf) / 2; i < sizeof(buf); i++) {
-		if (buf[i] != (char) (i % 2 ? 'h' : 'H')) {
-			printf("Error: wrong content in buffer %u!\n", i);
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-/* test left alignment */
-static int
-test_cirbuf_align_left(void)
-{
-#define HALF_OFFSET CMDLINE_TEST_BUFSIZE / 2
-#define SMALL_OFFSET HALF_OFFSET / 2
-/* resulting buffer lengths for each of the test cases */
-#define LEN1 HALF_OFFSET - SMALL_OFFSET - 1
-#define LEN2 HALF_OFFSET + SMALL_OFFSET + 2
-#define LEN3 HALF_OFFSET - SMALL_OFFSET
-#define LEN4 HALF_OFFSET + SMALL_OFFSET - 1
-
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-	unsigned i;
-
-	/*
-	 * align left when start < end and start in left half
-	 */
-
-	/*
-	 * initialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push end into left half */
-	for (i = 0; i < HALF_OFFSET - 1; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* push start into left half < end */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_del_head_safe(&cb);
-
-	/* align */
-	if (cirbuf_align_left(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* verify result */
-	if (cb.start != 0 || cb.len != LEN1 || cb.end != cb.len - 1) {
-		printf("Error: buffer alignment is wrong!\n");
-		return -1;
-	}
-
-	/*
-	 * align left when start > end and start in left half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into left half */
-	for (i = 0; i < HALF_OFFSET + 2; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half > start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* align */
-	if (cirbuf_align_left(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* verify result */
-	if (cb.start != 0 || cb.len != LEN2 || cb.end != cb.len - 1) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * align left when start < end and start in right half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into the right half */
-	for (i = 0; i < HALF_OFFSET; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half > start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_del_tail_safe(&cb);
-
-	/* align */
-	if (cirbuf_align_left(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* verify result */
-	if (cb.start != 0 || cb.len != LEN3 || cb.end != cb.len - 1) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * align left when start > end and start in right half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into the right half */
-	for (i = 0; i < HALF_OFFSET - 1; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half < start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* align */
-	if (cirbuf_align_left(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* verify result */
-	if (cb.start != 0 || cb.len != LEN4 ||
-			cb.end != cb.len - 1) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * Verify that alignment doesn't corrupt data
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to tail and head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD,
-			sizeof(CIRBUF_STR_HEAD)) < 0 || cirbuf_add_buf_tail(&cb,
-					CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to add strings!\n");
-		return -1;
-	}
-
-	/* align */
-	if (cirbuf_align_left(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* get string from head */
-	if (cirbuf_get_buf_head(&cb, tmp,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to read string from head!\n");
-		return -1;
-	}
-
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD "\0" CIRBUF_STR_TAIL,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	/* reset tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-	/* get string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to read string from head!\n");
-		return -1;
-	}
-
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD "\0" CIRBUF_STR_TAIL,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* test right alignment */
-static int
-test_cirbuf_align_right(void)
-{
-#define END_OFFSET CMDLINE_TEST_BUFSIZE - 1
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-	unsigned i;
-
-
-	/*
-	 * align right when start < end and start in left half
-	 */
-
-	/*
-	 * initialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push end into left half */
-	for (i = 0; i < HALF_OFFSET - 1; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* push start into left half < end */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_del_head_safe(&cb);
-
-	/* align */
-	cirbuf_align_right(&cb);
-
-	/* verify result */
-	if (cb.start != END_OFFSET || cb.len != LEN1 || cb.end != cb.len - 2) {
-		printf("Error: buffer alignment is wrong!\n");
-		return -1;
-	}
-
-	/*
-	 * align right when start > end and start in left half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into left half */
-	for (i = 0; i < HALF_OFFSET + 2; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half > start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* align */
-	cirbuf_align_right(&cb);
-
-	/* verify result */
-	if (cb.start != END_OFFSET || cb.len != LEN2 || cb.end != cb.len - 2) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * align right when start < end and start in right half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into the right half */
-	for (i = 0; i < HALF_OFFSET; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half > start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_del_tail_safe(&cb);
-
-	/* align */
-	cirbuf_align_right(&cb);
-
-	/* verify result */
-	if (cb.end != END_OFFSET || cb.len != LEN3 || cb.start != cb.end - cb.len + 1) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * align right when start > end and start in right half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into the right half */
-	for (i = 0; i < HALF_OFFSET - 1; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half < start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* align */
-	cirbuf_align_right(&cb);
-
-	/* verify result */
-	if (cb.end != END_OFFSET || cb.len != LEN4 || cb.start != cb.end - cb.len + 1) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * Verify that alignment doesn't corrupt data
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to tail and head */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL,
-			sizeof(CIRBUF_STR_TAIL)) < 0 || cirbuf_add_buf_head(&cb,
-					CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD)) < 0) {
-		printf("Error: failed to add strings!\n");
-		return -1;
-	}
-
-	/* align */
-	if (cirbuf_align_right(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* get string from head */
-	if (cirbuf_get_buf_head(&cb, tmp,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to read string from head!\n");
-		return -1;
-	}
-
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD "\0" CIRBUF_STR_TAIL,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	/* reset tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-	/* get string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to read string from head!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD "\0" CIRBUF_STR_TAIL,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* call functions with invalid parameters */
-int
-test_cirbuf_invalid_param(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-
-	/* null cirbuf */
-	if (cirbuf_init(0, buf, 0, sizeof(buf)) == 0)
-		return -1;
-	/* null buffer */
-	if (cirbuf_init(&cb, 0, 0, sizeof(buf)) == 0)
-		return -1;
-	/* null cirbuf */
-	if (cirbuf_add_head_safe(0, 'h') == 0)
-		return -1;
-	if (cirbuf_add_tail_safe(0, 't') == 0)
-		return -1;
-	if (cirbuf_del_head_safe(0) == 0)
-		return -1;
-	if (cirbuf_del_tail_safe(0) == 0)
-		return -1;
-	/* null buffer */
-	if (cirbuf_add_buf_head(&cb, 0, 0) == 0)
-		return -1;
-	if (cirbuf_add_buf_tail(&cb, 0, 0) == 0)
-		return -1;
-	/* null cirbuf */
-	if (cirbuf_add_buf_head(0, buf, 0) == 0)
-		return -1;
-	if (cirbuf_add_buf_tail(0, buf, 0) == 0)
-		return -1;
-	/* null size */
-	if (cirbuf_add_buf_head(&cb, buf, 0) == 0)
-		return -1;
-	if (cirbuf_add_buf_tail(&cb, buf, 0) == 0)
-		return -1;
-	/* null cirbuf */
-	if (cirbuf_del_buf_head(0, 0) == 0)
-		return -1;
-	if (cirbuf_del_buf_tail(0, 0) == 0)
-		return -1;
-	/* null size */
-	if (cirbuf_del_buf_head(&cb, 0) == 0)
-		return -1;
-	if (cirbuf_del_buf_tail(&cb, 0) == 0)
-		return -1;
-	/* null cirbuf */
-	if (cirbuf_get_buf_head(0, 0, 0) == 0)
-		return -1;
-	if (cirbuf_get_buf_tail(0, 0, 0) == 0)
-		return -1;
-	/* null buffer */
-	if (cirbuf_get_buf_head(&cb, 0, 0) == 0)
-		return -1;
-	if (cirbuf_get_buf_tail(&cb, 0, 0) == 0)
-		return -1;
-	/* null size, this is valid but should return 0 */
-	if (cirbuf_get_buf_head(&cb, buf, 0) != 0)
-		return -1;
-	if (cirbuf_get_buf_tail(&cb, buf, 0) != 0)
-		return -1;
-	/* null cirbuf */
-	if (cirbuf_align_left(0) == 0)
-		return -1;
-	if (cirbuf_align_right(0) == 0)
-		return -1;
-
-	return 0;
-}
-
-/* test cmdline_cirbuf char functions */
-int
-test_cirbuf_char(void)
-{
-	int ret;
-
-	ret = test_cirbuf_char_add_del();
-	if (ret < 0)
-		return -1;
-
-	ret = test_cirbuf_char_fill();
-	if (ret < 0)
-		return -1;
-
-	return 0;
-}
-
-/* test cmdline_cirbuf string functions */
-int
-test_cirbuf_string(void)
-{
-	if (test_cirbuf_string_add_del() < 0)
-		return -1;
-
-	if (test_cirbuf_string_add_del_reverse() < 0)
-		return -1;
-
-	if (test_cirbuf_string_add_boundaries() < 0)
-		return -1;
-
-	if (test_cirbuf_string_get_del_boundaries() < 0)
-		return -1;
-
-	if (test_cirbuf_string_get_del_partial() < 0)
-		return -1;
-
-	if (test_cirbuf_string_misc() < 0)
-		return -1;
-
-	return 0;
-}
-
-/* test cmdline_cirbuf align functions */
-int
-test_cirbuf_align(void)
-{
-	if (test_cirbuf_align_left() < 0)
-		return -1;
-	if (test_cirbuf_align_right() < 0)
-		return -1;
-	return 0;
-}
diff --git a/test/test/test_cmdline_lib.c b/test/test/test_cmdline_lib.c
index a856a9713..2821d4bbf 100644
--- a/test/test/test_cmdline_lib.c
+++ b/test/test/test_cmdline_lib.c
@@ -12,8 +12,6 @@
 #include <ctype.h>
 #include <sys/queue.h>
 
-#include <cmdline_vt100.h>
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_socket.h>
 #include <cmdline.h>
@@ -21,113 +19,41 @@
 #include "test_cmdline.h"
 
 /****************************************************************/
-/* static functions required for some tests */
-static void
-valid_buffer(__attribute__((unused))struct rdline *rdl,
-			__attribute__((unused))const char *buf,
-			__attribute__((unused)) unsigned int size)
-{
-}
-
-static int
-complete_buffer(__attribute__((unused)) struct rdline *rdl,
-			__attribute__((unused)) const char *buf,
-			__attribute__((unused)) char *dstbuf,
-			__attribute__((unused)) unsigned int dstsize,
-			__attribute__((unused)) int *state)
-{
-	return 0;
-}
-
-/****************************************************************/
 
 static int
 test_cmdline_parse_fns(void)
 {
-	struct cmdline cl;
+	struct cmdline *cl;
 	int i = 0;
 	char dst[CMDLINE_TEST_BUFSIZE];
 
+	cl = cmdline_new(NULL, "prompt", 0, 1);
+	if (!cl)
+		goto error;
 	if (cmdline_parse(NULL, "buffer") >= 0)
 		goto error;
-	if (cmdline_parse(&cl, NULL) >= 0)
+	if (cmdline_parse(cl, NULL) >= 0)
 		goto error;
 
 	if (cmdline_complete(NULL, "buffer", &i, dst, sizeof(dst)) >= 0)
 		goto error;
-	if (cmdline_complete(&cl, NULL, &i, dst, sizeof(dst)) >= 0)
+	if (cmdline_complete(cl, NULL, &i, dst, sizeof(dst)) >= 0)
 		goto error;
-	if (cmdline_complete(&cl, "buffer", NULL, dst, sizeof(dst)) >= 0)
+	if (cmdline_complete(cl, "buffer", NULL, dst, sizeof(dst)) >= 0)
 		goto error;
-	if (cmdline_complete(&cl, "buffer", &i, NULL, sizeof(dst)) >= 0)
+	if (cmdline_complete(cl, "buffer", &i, NULL, sizeof(dst)) >= 0)
 		goto error;
 
 	return 0;
 
 error:
+	if (cl)
+		cmdline_free(cl);
 	printf("Error: function accepted null parameter!\n");
 	return -1;
 }
 
 static int
-test_cmdline_rdline_fns(void)
-{
-	struct rdline rdl;
-	rdline_write_char_t *wc = &cmdline_write_char;
-	rdline_validate_t *v = &valid_buffer;
-	rdline_complete_t *c = &complete_buffer;
-
-	if (rdline_init(NULL, wc, v, c) >= 0)
-		goto error;
-	if (rdline_init(&rdl, NULL, v, c) >= 0)
-		goto error;
-	if (rdline_init(&rdl, wc, NULL, c) >= 0)
-		goto error;
-	if (rdline_init(&rdl, wc, v, NULL) >= 0)
-		goto error;
-	if (rdline_char_in(NULL, 0) >= 0)
-		goto error;
-	if (rdline_get_buffer(NULL) != NULL)
-		goto error;
-	if (rdline_add_history(NULL, "history") >= 0)
-		goto error;
-	if (rdline_add_history(&rdl, NULL) >= 0)
-		goto error;
-	if (rdline_get_history_item(NULL, 0) != NULL)
-		goto error;
-
-	/* void functions */
-	rdline_newline(NULL, "prompt");
-	rdline_newline(&rdl, NULL);
-	rdline_stop(NULL);
-	rdline_quit(NULL);
-	rdline_restart(NULL);
-	rdline_redisplay(NULL);
-	rdline_reset(NULL);
-	rdline_clear_history(NULL);
-
-	return 0;
-
-error:
-	printf("Error: function accepted null parameter!\n");
-	return -1;
-}
-
-static int
-test_cmdline_vt100_fns(void)
-{
-	if (vt100_parser(NULL, 0) >= 0) {
-		printf("Error: function accepted null parameter!\n");
-		return -1;
-	}
-
-	/* void functions */
-	vt100_init(NULL);
-
-	return 0;
-}
-
-static int
 test_cmdline_socket_fns(void)
 {
 	cmdline_parse_ctx_t ctx;
@@ -164,7 +90,7 @@ static int
 test_cmdline_fns(void)
 {
 	cmdline_parse_ctx_t ctx;
-	struct cmdline cl, *tmp;
+	struct cmdline *tmp;
 
 	memset(&ctx, 0, sizeof(ctx));
 	tmp = cmdline_new(&ctx, "test", -1, -1);
@@ -177,10 +103,6 @@ test_cmdline_fns(void)
 		goto error;
 	if (cmdline_in(NULL, "buffer", CMDLINE_TEST_BUFSIZE) >= 0)
 		goto error;
-	if (cmdline_in(&cl, NULL, CMDLINE_TEST_BUFSIZE) >= 0)
-		goto error;
-	if (cmdline_write_char(NULL, 0) >= 0)
-		goto error;
 
 	/* void functions */
 	cmdline_set_prompt(NULL, "prompt");
@@ -191,16 +113,6 @@ test_cmdline_fns(void)
 	cmdline_interact(NULL);
 	cmdline_quit(NULL);
 
-	/* check if void calls change anything when they should fail */
-	cl = *tmp;
-
-	cmdline_printf(&cl, NULL);
-	if (memcmp(&cl, tmp, sizeof(cl))) goto mismatch;
-	cmdline_set_prompt(&cl, NULL);
-	if (memcmp(&cl, tmp, sizeof(cl))) goto mismatch;
-	cmdline_in(&cl, NULL, CMDLINE_TEST_BUFSIZE);
-	if (memcmp(&cl, tmp, sizeof(cl))) goto mismatch;
-
 	cmdline_free(tmp);
 
 	return 0;
@@ -208,9 +120,6 @@ test_cmdline_fns(void)
 error:
 	printf("Error: function accepted null parameter!\n");
 	return -1;
-mismatch:
-	printf("Error: data changed!\n");
-	return -1;
 }
 
 /* test library functions. the point of these tests is not so much to test
@@ -222,10 +131,6 @@ test_cmdline_lib(void)
 {
 	if (test_cmdline_parse_fns() < 0)
 		return -1;
-	if (test_cmdline_rdline_fns() < 0)
-		return -1;
-	if (test_cmdline_vt100_fns() < 0)
-		return -1;
 	if (test_cmdline_socket_fns() < 0)
 		return -1;
 	if (test_cmdline_fns() < 0)
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* Re: [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence
  2018-04-19 14:39  3%   ` Burakov, Anatoly
@ 2018-04-19 14:48  0%     ` Arnon Warshavsky
  0 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-19 14:48 UTC (permalink / raw)
  To: Burakov, Anatoly
  Cc: Thomas Monjalon, Lu, Wenzhuo, Doherty, Declan, jerin.jacob,
	Bruce Richardson, Yigit, Ferruh, dev

Copy on the commit message  and volatile.

Regarding the new function defunct_and_remain_in_endless_loop ()
I don't think I can put that in a separate patch without breaking the
current patch independence.


On Thu, Apr 19, 2018 at 5:39 PM, Burakov, Anatoly <anatoly.burakov@intel.com
> wrote:

> On 19-Apr-18 7:01 AM, Arnon Warshavsky wrote:
>
>> Local functions to this file,
>> changing from void to int are non-abi-breaking.
>> For handling the single function that cannot
>> change from void to int due to abi,
>> where this is the only place it is called in,
>> I added a state variable that is being checked
>> right after the call to this function.
>>
>
> A rewrite of commit message is in order, i think :) Something like this:
>
> Change some functions' return type from void to int. This will not break
> ABI because they are internal only.
>
> (see below for comments on lcore changes)
>
>
>> --
>>
>> v4 - fix split literal strings in log messages
>>
>> Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
>>
>
> Again, please do not add patch/version notes to the commit message, put
> them after "---". Version history is not for commit messages, it's for
> people reviewing it before merge.
>
> ---
>>   lib/librte_eal/bsdapp/eal/eal.c           |  86 ++++++++++++++-------
>>   lib/librte_eal/bsdapp/eal/eal_thread.c    |  65 +++++++++++-----
>>   lib/librte_eal/common/eal_common_launch.c |  21 ++++++
>>   lib/librte_eal/common/include/rte_debug.h |  12 +++
>>   lib/librte_eal/linuxapp/eal/eal.c         | 120
>> ++++++++++++++++++++----------
>>   lib/librte_eal/linuxapp/eal/eal_thread.c  |  65 +++++++++++-----
>>   6 files changed, 270 insertions(+), 99 deletions(-)
>>
>> diff --git a/lib/librte_eal/bsdapp/eal/eal.c
>> b/lib/librte_eal/bsdapp/eal/eal.c
>> index d996190..9c2f6f1 100644
>> --- a/lib/librte_eal/bsdapp/eal/eal.c
>> +++ b/lib/librte_eal/bsdapp/eal/eal.c
>> @@ -151,7 +151,7 @@ enum rte_iova_mode
>>    * We also don't lock the whole file, so that in future we can use
>> read-locks
>>    * on other parts, e.g. memzones, to detect if there are running
>> secondary
>>    * processes. */
>> -static void
>> +static int
>>
>
> <...>
>
> +
>> +/* move to panic state and do not return */
>> +static __attribute__((noreturn)) void
>> +defunct_and_remain_in_endless_loop(void)
>> +{
>> +       rte_move_to_panic_state();
>> +       while (1)
>> +               sleep(1);
>>   }
>>
>
> It seems like you're mixing two different patchsets here. Maybe it would
> be beneficial to put lcore changes in a separate patch? Technically,
> rte_panic's in lcore are not part of init sequence.
>
> (also, should panic state be volatile?)
>
>
>     /* main loop of threads */
>> @@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
>>                 if (thread_id == lcore_config[lcore_id].thread_id)
>>                         break;
>>         }
>> -       if (lcore_id == RTE_MAX_LCORE)
>> -               rte_panic("cannot retrieve lcore id\n");
>> +       if (lcore_id == RTE_MAX_LCORE) {
>> +               RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
>>
>
>
> --
> Thanks,
> Anatoly
>



-- 

*Arnon Warshavsky*
*Qwilt | work: +972-72-2221634 | mobile: +972-50-8583058 | arnon@qwilt.com
<arnon@qwilt.com>*

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2] vhost: add virtio configuration space messages
  @ 2018-04-19 14:39  0%                 ` Maxime Coquelin
  2018-04-20  0:32  0%                   ` Liu, Changpeng
  0 siblings, 1 reply; 200+ results
From: Maxime Coquelin @ 2018-04-19 14:39 UTC (permalink / raw)
  To: Liu, Changpeng, Kulasek, TomaszX, yliu
  Cc: Verkamp, Daniel, Harris, James R, Wodkowski, PawelX, dev, Tan, Jianfeng

Hi Changpeng, Tomasz,

Any chance that you resubmit the series now that the Qemu changes adding
a protocol feature flag has been accepted?

Cheers,
Maxime
On 03/28/2018 12:56 PM, Maxime Coquelin wrote:
> 
> 
> On 03/28/2018 12:23 PM, Liu, Changpeng wrote:
>>
>>
>>> -----Original Message-----
>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>> Sent: Wednesday, March 28, 2018 6:11 PM
>>> To: Liu, Changpeng <changpeng.liu@intel.com>; Kulasek, TomaszX
>>> <tomaszx.kulasek@intel.com>; yliu@fridaylinux.org
>>> Cc: Verkamp, Daniel <daniel.verkamp@intel.com>; Harris, James R
>>> <james.r.harris@intel.com>; Wodkowski, PawelX
>>> <pawelx.wodkowski@intel.com>; dev@dpdk.org; Tan, Jianfeng
>>> <jianfeng.tan@intel.com>
>>> Subject: Re: [dpdk-dev] [PATCH v2] vhost: add virtio configuration space
>>> messages
>>>
>>>
>>>
>>> On 03/28/2018 12:03 PM, Liu, Changpeng wrote:
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>>>> Sent: Wednesday, March 28, 2018 5:58 PM
>>>>> To: Liu, Changpeng <changpeng.liu@intel.com>; Kulasek, TomaszX
>>>>> <tomaszx.kulasek@intel.com>; yliu@fridaylinux.org
>>>>> Cc: Verkamp, Daniel <daniel.verkamp@intel.com>; Harris, James R
>>>>> <james.r.harris@intel.com>; Wodkowski, PawelX
>>>>> <pawelx.wodkowski@intel.com>; dev@dpdk.org; Tan, Jianfeng
>>>>> <jianfeng.tan@intel.com>
>>>>> Subject: Re: [dpdk-dev] [PATCH v2] vhost: add virtio configuration 
>>>>> space
>>>>> messages
>>>>>
>>>>>
>>>>>
>>>>> On 03/28/2018 11:50 AM, Liu, Changpeng wrote:
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>>>>>> Sent: Wednesday, March 28, 2018 5:12 PM
>>>>>>> To: Kulasek, TomaszX <tomaszx.kulasek@intel.com>; 
>>>>>>> yliu@fridaylinux.org
>>>>>>> Cc: Verkamp, Daniel <daniel.verkamp@intel.com>; Harris, James R
>>>>>>> <james.r.harris@intel.com>; Wodkowski, PawelX
>>>>>>> <pawelx.wodkowski@intel.com>; dev@dpdk.org; Liu, Changpeng
>>>>>>> <changpeng.liu@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>
>>>>>>> Subject: Re: [dpdk-dev] [PATCH v2] vhost: add virtio 
>>>>>>> configuration space
>>>>>>> messages
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 03/27/2018 05:35 PM, Tomasz Kulasek wrote:
>>>>>>>> This patch adds new vhost user messages GET_CONFIG and SET_CONFIG
>>>>> used
>>>>>>>> for get/set virtio device's configuration space.
>>>>>>>>
>>>>>>>> Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
>>>>>>>> Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
>>>>>>>> ---
>>>>>>>> Changes in v2:
>>>>>>>>      - code cleanup
>>>>>>>>
>>>>>>>>      lib/librte_vhost/rte_vhost.h  |  4 ++++
>>>>>>>>      lib/librte_vhost/vhost_user.c | 22 ++++++++++++++++++++++
>>>>>>>>      lib/librte_vhost/vhost_user.h | 16 ++++++++++++++++
>>>>>>>>      3 files changed, 42 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/lib/librte_vhost/rte_vhost.h 
>>>>>>>> b/lib/librte_vhost/rte_vhost.h
>>>>>>>> index d332069..fe30518 100644
>>>>>>>> --- a/lib/librte_vhost/rte_vhost.h
>>>>>>>> +++ b/lib/librte_vhost/rte_vhost.h
>>>>>>>> @@ -84,6 +84,10 @@ struct vhost_device_ops {
>>>>>>>>          int (*new_connection)(int vid);
>>>>>>>>          void (*destroy_connection)(int vid);
>>>>>>>>
>>>>>>>> +    int (*get_config)(int vid, uint8_t *config, uint32_t 
>>>>>>>> config_len);
>>>>>>>> +    int (*set_config)(int vid, uint8_t *config, uint32_t offset,
>>>>>>>> +            uint32_t len, uint32_t flags);
>>>>>>>> +
>>>>>>>>          void *reserved[2]; /**< Reserved for future extension */
>>>>>>>
>>>>>>> You are breaking the ABI, as you grow the size of the ops struct.
>>>>>>>
>>>>>>> Also, I'm wondering if we shouldn't have a different ops for 
>>>>>>> external
>>>>>>> backends. Here these ops are more intended to the application, we 
>>>>>>> could
>>>>>>> have a specific ops struct for external backends IMHO.
>>>>>>>
>>>>>>>>      };
>>>>>>>>
>>>>>>>> diff --git a/lib/librte_vhost/vhost_user.c 
>>>>>>>> b/lib/librte_vhost/vhost_user.c
>>>>>>>> index 90ed211..0ed6a5a 100644
>>>>>>>> --- a/lib/librte_vhost/vhost_user.c
>>>>>>>> +++ b/lib/librte_vhost/vhost_user.c
>>>>>>>> @@ -50,6 +50,8 @@ static const char
>>>>> *vhost_message_str[VHOST_USER_MAX]
>>>>>>> = {
>>>>>>>>          [VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
>>>>>>>>          [VHOST_USER_SET_SLAVE_REQ_FD]  =
>>>>>>> "VHOST_USER_SET_SLAVE_REQ_FD",
>>>>>>>>          [VHOST_USER_IOTLB_MSG]  = "VHOST_USER_IOTLB_MSG",
>>>>>>>> +    [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
>>>>>>>> +    [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
>>>>>>>>      };
>>>>>>>>
>>>>>>>>      static uint64_t
>>>>>>>> @@ -1355,6 +1357,7 @@ vhost_user_msg_handler(int vid, int fd)
>>>>>>>>           * would cause a dead lock.
>>>>>>>>           */
>>>>>>>>          switch (msg.request.master) {
>>>>>>>> +    case VHOST_USER_SET_CONFIG:
>>>>>>>
>>>>>>> It seems VHOST_USER_GET_CONFIG is missing here.
>>>>>>>
>>>>>>>>          case VHOST_USER_SET_FEATURES:
>>>>>>>>          case VHOST_USER_SET_PROTOCOL_FEATURES:
>>>>>>>>          case VHOST_USER_SET_OWNER:
>>>>>>>> @@ -1380,6 +1383,25 @@ vhost_user_msg_handler(int vid, int fd)
>>>>>>>>          }
>>>>>>>>
>>>>>>>>          switch (msg.request.master) {
>>>>>>>> +    case VHOST_USER_GET_CONFIG:
>>>>>>>> +        if (dev->notify_ops->get_config(dev->vid,
>>>>>>> Please check ->get_config is set before calling it.
>>>>>>>
>>>>>>>> +                msg.payload.config.region,
>>>>>>>> +                msg.payload.config.size) != 0) {
>>>>>>>> +            msg.size = sizeof(uint64_t);
>>>>>>>> +        }
>>>>>>>> +        send_vhost_reply(fd, &msg);
>>>>>>>> +        break;
>>>>>>>> +    case VHOST_USER_SET_CONFIG:
>>>>>>>> +        if ((dev->notify_ops->set_config(dev->vid,
>>>>>>> Ditto.
>>>>>>>
>>>>>>>> +                msg.payload.config.region,
>>>>>>>> +                msg.payload.config.offset,
>>>>>>>> +                msg.payload.config.size,
>>>>>>>> +                msg.payload.config.flags)) != 0) {
>>>>>>>> +            ret = 1;
>>>>>>>> +        } else {
>>>>>>>> +            ret = 0;
>>>>>>>> +        }
>>>>>>>
>>>>>>> ret = dev->notify_ops->set_config instead?
>>>>>>>> +        break;
>>>>>>>>          case VHOST_USER_GET_FEATURES:
>>>>>>>>              msg.payload.u64 = vhost_user_get_features(dev);
>>>>>>>>              msg.size = sizeof(msg.payload.u64);
>>>>>>>> diff --git a/lib/librte_vhost/vhost_user.h 
>>>>>>>> b/lib/librte_vhost/vhost_user.h
>>>>>>>> index d4bd604..25cc026 100644
>>>>>>>> --- a/lib/librte_vhost/vhost_user.h
>>>>>>>> +++ b/lib/librte_vhost/vhost_user.h
>>>>>>>> @@ -14,6 +14,11 @@
>>>>>>>>
>>>>>>>>      #define VHOST_MEMORY_MAX_NREGIONS 8
>>>>>>>>
>>>>>>>> +/*
>>>>>>>> + * Maximum size of virtio device config space
>>>>>>>> + */
>>>>>>>> +#define VHOST_USER_MAX_CONFIG_SIZE 256
>>>>>>>> +
>>>>>>>>      #define VHOST_USER_PROTOCOL_F_MQ    0
>>>>>>>>      #define VHOST_USER_PROTOCOL_F_LOG_SHMFD    1
>>>>>>>>      #define VHOST_USER_PROTOCOL_F_RARP    2
>>>>>>>
>>>>>>> Shouldn't there be a protocol feature associated to these new 
>>>>>>> messages?
>>>>>>> Else how QEMU knows the backend supports it or not?
>>>>>>>
>>>>>>> I looked at QEMU code and indeed no protocol feature associated, 
>>>>>>> that's
>>>>>>> strange...
>>>>>> Nice to have, for now not all the QEMU host driver need to get this
>>>>> configuration space from slave backend
>>>>>> when getting start. This message can be used for migration of 
>>>>>> vhost-user
>>>>> devices.
>>>>>
>>>>> So if QEMU sends this message but the DPDK version does not support it
>>>>> yet, vhost_user_msg_handler() will return an error ("vhost read
>>>>> incorrect message") and the socket will be closed.
>>>>>
>>>>> How do we overcome this? I think we really need a spec update ASAP,
>>>>> before QEMU v2.12 is out (-rc1 already).
>>>>>
>>>>> Do you have time to take care of this?
>>>> For now there are no other users except us care about this message, 
>>>> :), it's no
>>> hurry.
>>>> I can take this after QEMU 2.12 release adding a new protocol 
>>>> feature bit.
>>>
>>> Are you sure?
>>> If I understand the code correctly, as the guest writes in config regs
>>> of a virtio-blk device, .set_config callback will be called.
>> Exactly.
>>>
>>> If you have a vhost-user backend, it will receive the SET_CONFIG
>>> request, no?
>> For now this only enabled for QEMU vhost-user-blk driver, QEMU 
>> virtio-blk driver didn't have such issue.
> 
> Right.
> But it will be really painful to manage for example for cross-version
> live migration. Or when you'll want to use QEMU v2.13+ with a DPDK
> v18.05 backend, the protocol feature won't be negotiated.
> 
> Really, this is important to get it right at the beginning.
> 
> Thanks,
> Maxime
>>>
>>> Cheers,
>>> Maxime
>>>
>>>>>
>>>>> Thanks,
>>>>> Maxime

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence
  2018-04-19  6:01  2% ` [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence Arnon Warshavsky
@ 2018-04-19 14:39  3%   ` Burakov, Anatoly
  2018-04-19 14:48  0%     ` Arnon Warshavsky
  2018-04-19 17:48  0%   ` Aaron Conole
  1 sibling, 1 reply; 200+ results
From: Burakov, Anatoly @ 2018-04-19 14:39 UTC (permalink / raw)
  To: Arnon Warshavsky, thomas, wenzhuo.lu, declan.doherty,
	jerin.jacob, bruce.richardson, ferruh.yigit
  Cc: dev

On 19-Apr-18 7:01 AM, Arnon Warshavsky wrote:
> Local functions to this file,
> changing from void to int are non-abi-breaking.
> For handling the single function that cannot
> change from void to int due to abi,
> where this is the only place it is called in,
> I added a state variable that is being checked
> right after the call to this function.

A rewrite of commit message is in order, i think :) Something like this:

Change some functions' return type from void to int. This will not break 
ABI because they are internal only.

(see below for comments on lcore changes)

> 
> --
> 
> v4 - fix split literal strings in log messages
> 
> Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>

Again, please do not add patch/version notes to the commit message, put 
them after "---". Version history is not for commit messages, it's for 
people reviewing it before merge.

> ---
>   lib/librte_eal/bsdapp/eal/eal.c           |  86 ++++++++++++++-------
>   lib/librte_eal/bsdapp/eal/eal_thread.c    |  65 +++++++++++-----
>   lib/librte_eal/common/eal_common_launch.c |  21 ++++++
>   lib/librte_eal/common/include/rte_debug.h |  12 +++
>   lib/librte_eal/linuxapp/eal/eal.c         | 120 ++++++++++++++++++++----------
>   lib/librte_eal/linuxapp/eal/eal_thread.c  |  65 +++++++++++-----
>   6 files changed, 270 insertions(+), 99 deletions(-)
> 
> diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
> index d996190..9c2f6f1 100644
> --- a/lib/librte_eal/bsdapp/eal/eal.c
> +++ b/lib/librte_eal/bsdapp/eal/eal.c
> @@ -151,7 +151,7 @@ enum rte_iova_mode
>    * We also don't lock the whole file, so that in future we can use read-locks
>    * on other parts, e.g. memzones, to detect if there are running secondary
>    * processes. */
> -static void
> +static int

<...>

> +
> +/* move to panic state and do not return */
> +static __attribute__((noreturn)) void
> +defunct_and_remain_in_endless_loop(void)
> +{
> +	rte_move_to_panic_state();
> +	while (1)
> +		sleep(1);
>   }

It seems like you're mixing two different patchsets here. Maybe it would 
be beneficial to put lcore changes in a separate patch? Technically, 
rte_panic's in lcore are not part of init sequence.

(also, should panic state be volatile?)

>   
>   /* main loop of threads */
> @@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
>   		if (thread_id == lcore_config[lcore_id].thread_id)
>   			break;
>   	}
> -	if (lcore_id == RTE_MAX_LCORE)
> -		rte_panic("cannot retrieve lcore id\n");
> +	if (lcore_id == RTE_MAX_LCORE) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",


-- 
Thanks,
Anatoly

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
  2018-04-18 15:11  0%         ` Iremonger, Bernard
@ 2018-04-19 14:24  0%           ` Xueming(Steven) Li
  0 siblings, 0 replies; 200+ results
From: Xueming(Steven) Li @ 2018-04-19 14:24 UTC (permalink / raw)
  To: Iremonger, Bernard, Lu, Wenzhuo, Wu, Jingjing, Thomas Monjalon,
	Adrien Mazarguil
  Cc: Nélio Laranjeiro, Shahaf Shuler, dev, Olivier Matz



> -----Original Message-----
> From: Iremonger, Bernard <bernard.iremonger@intel.com>
> Sent: Wednesday, April 18, 2018 11:11 PM
> To: Xueming(Steven) Li <xuemingl@mellanox.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing
> <jingjing.wu@intel.com>; Thomas Monjalon <thomas@monjalon.net>; Adrien Mazarguil
> <adrien.mazarguil@6wind.com>
> Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org;
> Olivier Matz <olivier.matz@6wind.com>
> Subject: RE: [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
> 
> Hi Li
> 
> <snip>
> 
> > > > -----Original Message-----
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xueming Li
> > > > Sent: Tuesday, April 17, 2018 4:04 PM
> > > > To: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing
> > > > <jingjing.wu@intel.com>; Thomas Monjalon <thomas@monjalon.net>;
> > > > Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > > > Cc: Xueming Li <xuemingl@mellanox.com>; Nelio Laranjeiro
> > > > <nelio.laranjeiro@6wind.com>; Shahaf Shuler
> > > > <shahafs@mellanox.com>; dev@dpdk.org; Olivier Matz
> > > > <olivier.matz@6wind.com>
> > > > Subject: [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
> > > >
> > > > v5:
> > > > - Fixed VXLAN-GPE comment alignment
> > > > v4:
> > > > - Update testpmd doc for flow VXLAN-GPE paramter.
> > > > v3:
> > > > - Change VXLAN-GPE definition order to avoid ABI compatibility issue.
> > > > v2:
> > > > - Split patch set into public and mlx5 two series, this one is the first.
> > > > v1:
> > > > - Support new tunnel type MPLS-in-GRE and MPLS-in-UDP
> > > > - Remove deprecation notes of rss level
> > > >
> > > > This patchset introduced new tunnel type and related testpmd code:
> > > > - New tunnel type VXLAN-GPE
> > > >
> > > > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2
> > > > Fd
> > > > at
> > > > atracker.ietf.org%2Fdoc%2Fdraft-ietf-nvo3-vxlan-gpe%2F&data=02%7C0
> > > > 1%
> > > > 7C
> > > >
> > xuemingl%40mellanox.com%7C2dffef40890b4cf8ff9d08d5a47d0420%7Ca65297
> > 1
> > > > c7
> > > >
> > d2e4d9ba6a4d149256f461b%7C0%7C0%7C636595779231620631&sdata=%2Bv
> > x%2Fg
> > > > VB
> > > > 3e3BHI%2BYxPxOIpqK6CuKvQQ8qej4B1Faxihc%3D&reserved=0
> > > > - New tunnel type MPLS-in-GRE
> > > >
> > > > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2
> > > > Ft
> > > > oo
> > > >
> > ls.ietf.org%2Fhtml%2Frfc4023&data=02%7C01%7Cxuemingl%40mellanox.com%
> > > > 7C
> > > >
> > 2dffef40890b4cf8ff9d08d5a47d0420%7Ca652971c7d2e4d9ba6a4d149256f461b
> > %
> > > > 7C
> > > >
> > 0%7C0%7C636595779231620631&sdata=wk2wvoB9LSbI2LfHZVvWzmtgS0XQbG
> > NMFL4
> > > > G1
> > > > kyr77E%3D&reserved=0
> > > > - New tunnel type MPLS-in-UDP
> > > >
> > > > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2
> > > > Ft
> > > > oo
> > > >
> > ls.ietf.org%2Fhtml%2Frfc7510&data=02%7C01%7Cxuemingl%40mellanox.com%
> > > > 7C
> > > >
> > 2dffef40890b4cf8ff9d08d5a47d0420%7Ca652971c7d2e4d9ba6a4d149256f461b
> > %
> > > > 7C
> > > >
> > 0%7C0%7C636595779231620631&sdata=4RXbPD9tV7ArcnKsK8IJy%2B9XbKlzKc
> > NBS
> > > > v1
> > > > LXVwwuPs%3D&reserved=0
> > > > - Support GRE extension in testpmd csum forwarding engine
> > > >
> > > > Xueming Li (4):
> > > >   doc: remove RSS configuration change announcement
> > > >   ethdev: introduce new tunnel VXLAN-GPE
> > > >   app/testpmd: introduce new tunnel VXLAN-GPE
> > > >   app/testpmd: add more GRE extension support to csum engine
> > > >
> > > >  app/test-pmd/cmdline_flow.c                 |  24 +++++++
> > > >  app/test-pmd/config.c                       |   2 +
> > > >  app/test-pmd/csumonly.c                     | 103 +++++++++++++++++++++++++--
> > -
> > > >  app/test-pmd/parameters.c                   |  12 +++-
> > > >  app/test-pmd/testpmd.h                      |   2 +
> > > >  doc/guides/prog_guide/rte_flow.rst          |  12 ++++
> > > >  doc/guides/rel_notes/deprecation.rst        |   4 --
> > > >  doc/guides/testpmd_app_ug/run_app.rst       |   5 ++
> > > >  doc/guides/testpmd_app_ug/testpmd_funcs.rst |   4 ++
> > > >  lib/librte_ether/rte_eth_ctrl.h             |   3 +-
> > > >  lib/librte_ether/rte_flow.c                 |   1 +
> > > >  lib/librte_ether/rte_flow.h                 |  27 ++++++++
> > > >  lib/librte_mbuf/rte_mbuf.c                  |   3 +
> > > >  lib/librte_mbuf/rte_mbuf.h                  |   1 +
> > > >  lib/librte_mbuf/rte_mbuf_ptype.c            |   1 +
> > > >  lib/librte_mbuf/rte_mbuf_ptype.h            |  13 ++++
> > > >  lib/librte_net/rte_ether.h                  |  25 +++++++
> > > >  17 files changed, 225 insertions(+), 17 deletions(-)
> > > >
> > > > --
> > > > 2.13.3
> > >
> > > Patch 3 of this patch set fails to apply to the latest master, the
> > > other three
> > patches apply ok.
> > >
> > > Regards,
> > >
> > > Bernard.
> > >
> > I tried it with orgin/master branch and it worked for me.
> > Could you please share more information?
> >
> > Best Regards,
> > Xueming Li
> 
> I have just cloned the current dpdk master, patch 3 still fails to apply, dpdk-dev-v5-3-4-app-testpmd-
> introduce-new-tunnel-VXLAN-GPE.patch
> 
> git am ./dpdk-dev-v5-3-4-app-testpmd-introduce-new-tunnel-VXLAN-GPE.patch
> Applying: app/testpmd: introduce new tunnel VXLAN-GPE
> error: patch failed: app/test-pmd/config.c:997
> error: app/test-pmd/config.c: patch does not apply Patch failed at 0001 app/testpmd: introduce new
> tunnel VXLAN-GPE The copy of the patch that failed is found in:
>    /root/dpdk_temp/.git/rebase-apply/patch
> When you have resolved this problem, run "git am --continue".
> If you prefer to skip this patch, run "git am --skip" instead.
> To restore the original branch and stop patching, run "git am --abort
> 

You are correct, I can reproduce it now. Will send out a new v6 series soon, thanks.

> Regards,
> 
> Bernard.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v6 00/11] Bunch of flow API-related fixes
  2018-04-19 14:03  0%       ` Ferruh Yigit
@ 2018-04-19 14:07  0%         ` Ferruh Yigit
  0 siblings, 0 replies; 200+ results
From: Ferruh Yigit @ 2018-04-19 14:07 UTC (permalink / raw)
  To: Adrien Mazarguil, dev

On 4/19/2018 3:03 PM, Ferruh Yigit wrote:
> On 4/19/2018 11:07 AM, Adrien Mazarguil wrote:
>> This series contains several fixes for rte_flow and its implementation in
>> PMDs and testpmd. Upcoming work on the flow API depends on it.
>>
>> v6 changes:
>>
>> - No change, rebased series and updated/fixed commit messages.
>>
>> v5 changes:
>>
>> - No change, rebased series to address conflicts.
>>
>> v4 changes:
>>
>> - Rebased again.
>> - The reliance on rte_eth_dev_rss_hash_conf_get() was removed from patch #7,
>>   see updated patch for details.
>>
>> v3 changes:
>>
>> - Rebased series.
>> - Dropped unnecessary "net/sfc: fix endian conversions in flow API".
>> - Dropped "ethdev: fix ABI version in meson build", handled by prior commit
>>   d9736a248785 ("ethdev: fix library version in meson build").
>>
>> v2 changes:
>>
>> - mlx5 fix (patch #3).
>> - bnxt fix (patch #4).
>> - sfc fix (patch #6).
>> - Missing include (patch #13).
>>
>> Adrien Mazarguil (11):
>>   net/mlx4: fix RSS resource leak in case of error
>>   net/mlx4: fix ignored RSS hash types
>>   net/mlx5: fix RSS flow action bounds check
>>   net/bnxt: fix matching of flow API item masks
>>   app/testpmd: fix flow completion for RSS queues
>>   app/testpmd: fix lack of flow action configuration
>>   app/testpmd: fix RSS flow action configuration
>>   app/testpmd: fix missing RSS fields in flow action
>>   app/testpmd: fix missing boolean values in flow command
>>   ethdev: fix shallow copy of flow API RSS action
>>   ethdev: fix missing include in flow API
> 
> For series,
> Acked-by: Ferruh Yigit <ferruh.yigit@intel.com>

Series applied to dpdk-next-net/master, thanks.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v6 00/11] Bunch of flow API-related fixes
  2018-04-19 10:07  3%     ` [dpdk-dev] [PATCH v6 " Adrien Mazarguil
@ 2018-04-19 14:03  0%       ` Ferruh Yigit
  2018-04-19 14:07  0%         ` Ferruh Yigit
  0 siblings, 1 reply; 200+ results
From: Ferruh Yigit @ 2018-04-19 14:03 UTC (permalink / raw)
  To: Adrien Mazarguil, dev

On 4/19/2018 11:07 AM, Adrien Mazarguil wrote:
> This series contains several fixes for rte_flow and its implementation in
> PMDs and testpmd. Upcoming work on the flow API depends on it.
> 
> v6 changes:
> 
> - No change, rebased series and updated/fixed commit messages.
> 
> v5 changes:
> 
> - No change, rebased series to address conflicts.
> 
> v4 changes:
> 
> - Rebased again.
> - The reliance on rte_eth_dev_rss_hash_conf_get() was removed from patch #7,
>   see updated patch for details.
> 
> v3 changes:
> 
> - Rebased series.
> - Dropped unnecessary "net/sfc: fix endian conversions in flow API".
> - Dropped "ethdev: fix ABI version in meson build", handled by prior commit
>   d9736a248785 ("ethdev: fix library version in meson build").
> 
> v2 changes:
> 
> - mlx5 fix (patch #3).
> - bnxt fix (patch #4).
> - sfc fix (patch #6).
> - Missing include (patch #13).
> 
> Adrien Mazarguil (11):
>   net/mlx4: fix RSS resource leak in case of error
>   net/mlx4: fix ignored RSS hash types
>   net/mlx5: fix RSS flow action bounds check
>   net/bnxt: fix matching of flow API item masks
>   app/testpmd: fix flow completion for RSS queues
>   app/testpmd: fix lack of flow action configuration
>   app/testpmd: fix RSS flow action configuration
>   app/testpmd: fix missing RSS fields in flow action
>   app/testpmd: fix missing boolean values in flow command
>   ethdev: fix shallow copy of flow API RSS action
>   ethdev: fix missing include in flow API

For series,
Acked-by: Ferruh Yigit <ferruh.yigit@intel.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 2/6] ethdev: Add jump action type to rte_flow
  @ 2018-04-19 13:03  3%     ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 13:03 UTC (permalink / raw)
  To: Declan Doherty
  Cc: dev, Alex Rosenbaum, Ferruh Yigit, Thomas Monjalon,
	Shahaf Shuler, Qi Zhang, Alejandro Lucero, Andrew Rybchenko,
	Mohammad Abdul Awal, Remy Horton, John McNamara, Rony Efraim,
	Jingjing Wu, Wenzhuo Lu, Vincent Jardin, Yuanhan Liu,
	Bruce Richardson, Konstantin Ananyev, Zhihong Wang

On Wed, Apr 18, 2018 at 10:04:19PM +0100, Declan Doherty wrote:
> Add jump action type which defines an action which allows a matched
> flow to be redirect to the specified group. This allows physical and
> logical flow table/group hierarchies to be managed through rte_flow.
> 
> Signed-off-by: Declan Doherty <declan.doherty@intel.com>

You should have rebased this series including this patch on top of mine
([1] followed by [2]) to avoid some mistakes such as documenting
"terminating actions".

This patch could also update documentation for flow rules [3] groups [4] and
priorities [5] to make clear that groups aren't linked by default, their
order doesn't matter and explicit JUMP actions are needed to reach them
(actually look for every instance of the word "group" in this
documentation).

Also the addition of an action in the middle of the enum has an ABI impact,
please put a reminder in the commit log as in [6].

More below.

[1] "Bunch of flow API-related fixes"
    http://dpdk.org/ml/archives/dev/2018-April/098035.html
[2] "Flow API overhaul for switch offloads"
    http://dpdk.org/ml/archives/dev/2018-April/098047.html
[3] "9.2.1. Description"
     https://dpdk.org/doc/guides/prog_guide/rte_flow.html#description
[3] "9.2.2.1. Attribute: Group"
    https://dpdk.org/doc/guides/prog_guide/rte_flow.html#attribute-group
[5] "9.2.2.2. Attribute: Priority"
    https://dpdk.org/doc/guides/prog_guide/rte_flow.html#attribute-priority
[6] "ethdev: add physical port action to flow API"
    http://dpdk.org/ml/archives/dev/2018-April/098062.html

> ---
>  doc/guides/prog_guide/rte_flow.rst | 26 ++++++++++++++++++++++++--
>  lib/librte_ether/rte_flow.h        | 27 +++++++++++++++++++++++++++
>  2 files changed, 51 insertions(+), 2 deletions(-)
> 
> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> index 672473d33..325010544 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -1188,6 +1188,28 @@ flow rules:
>     | 2     | END                        |
>     +-------+----------------------------+
>  
> +

Extraneous empty line.

> +Action: ``JUMP``
> +^^^^^^^^^^^^^^^^
> +
> +Redirects packets to a group on the current device.
> +
> +In a hierarchy of groups, which can be used to represent physical or logical
> +flow group/tables on the device, this action allows the terminating action to
> +be a group on that device.
> +
> +- Terminating by default.

Since there are no more "terminating" actions, you should document that it
will cause traffic to be processed by flow rules found in the target group.

Also I think you should put emphasis on undefined behavior:

- Targeting a group that doesn't include any flow rule.
- Triggering loops.

> +
> +.. _table_rte_flow_action_jump:
> +
> +.. table:: JUMP
> +
> +   +-----------+---------------------------------+
> +   | Field     | Value                           |
> +   +===========+=================================+
> +   | ``group`` | Group ID to redirect packets to |
> +   +-----------+---------------------------------+

Nit: "Group ID" => "group ID" to keep the style.

> +
>  Action: ``MARK``
>  ^^^^^^^^^^^^^^^^
>  
> @@ -1512,7 +1534,7 @@ the RTE_FLOW_ITEM_TYPE_END item type.
>     | ``definition`` | Tunnel end-point overlay definition |
>     +----------------+-------------------------------------+
>  
> -.. _table_rte_flow_action_tunnel_encap_example:
> +.. _table_rte_flow_action_tunnel_encap_vxlan_example:
>  
>  .. table:: IPv4 VxLAN flow pattern example.
>  
> @@ -1551,7 +1573,7 @@ NVGRE network overlay which conforms with RFC 7637 (NVGRE: Network Virtualizatio
>  Using Generic Routing Encapsulation). The pattern must be terminated with
>  the RTE_FLOW_ITEM_TYPE_END item type.
>  
> -.. _table_rte_flow_action_tunnel_encap_example:
> +.. _table_rte_flow_action_tunnel_encap_nvgre_example:

The above two hunks do not seem relevant.

>
>  .. table:: IPv4 NVGRE flow pattern example.
>  
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 537e1bfba..91544f62b 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -913,6 +913,20 @@ enum rte_flow_action_type {
>  	 */
>  	RTE_FLOW_ACTION_TYPE_PASSTHRU,
>  
> +	/**
> +	 * RTE_FLOW_ACTION_TYPE_JUMP
> +	 *
> +	 * Redirects packets to a group on the current device.
> +	 *
> +	 * In a hierarchy of groups, which can be used to represent
> +	 * physical or logical flow groups/tables on a device, this
> +	 * action allows the terminating action to be a group on
> +	 * that device.

Since struct rte_flow_action_jump provides complete documentation, no need
to repeat this paragraph here. A single line is enough (see other actions).

> +	 *
> +	 * See struct rte_flow_action_jump

Nit: missing "."

> +	 */
> +	RTE_FLOW_ACTION_TYPE_JUMP,
> +
>  	/**
>  	 * [META]
>  	 *
> @@ -1213,6 +1227,19 @@ struct rte_flow_action_tunnel_encap {
>  	 */
>  };
>  
> +/**
> + * RTE_FLOW_ACTION_TYPE_JUMP
> + *
> + * Redirects packets to a group on the current device.
> + *
> + * In a hierarchy of groups, which can be used to represent physical or logical
> + * flow tables on the device, this action allows the action to be a redirect to
> + * a group on that device.

Remember to synchronize this paragraph after making changes to rte_flow.rst.

Here you can also provide more info regarding undefined behavior (useful in
Doxygen format for application writers).

> + */
> +struct rte_flow_action_jump {
> +	uint32_t group;
> +};
> +

You should move this definition before "struct rte_flow_action_mark" to keep
the same definition order as in the enum.

>  /**
>   * Definition of a single action.
>   *
> -- 
> 2.14.3
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v5 16/16] ethdev: add port ID item and action to flow API
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                           ` (11 preceding siblings ...)
  2018-04-19 10:16  3%         ` [dpdk-dev] [PATCH v5 15/16] ethdev: add physical port action to " Adrien Mazarguil
@ 2018-04-19 10:16  2%         ` Adrien Mazarguil
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z, Declan Doherty

RTE_FLOW_ACTION_TYPE_PORT_ID brings the ability to inject matching traffic
into a different device, as identified by its DPDK port ID.

This is normally only supported when the target port ID has some kind of
relationship with the port ID the flow rule is created against, such as
being exposed by a common physical device (e.g. a different port of an
Ethernet switch).

The converse pattern item, RTE_FLOW_ITEM_TYPE_PORT_ID, makes the resulting
flow rule match traffic whose origin is the specified port ID. Note that
specifying a port ID that differs from the one the flow rule is created
against is normally meaningless (if even accepted), but can make sense if
combined with the transfer attribute.

These must not be confused with their PHY_PORT counterparts, which refer to
physical ports using device-specific indices, but unlike PORT_ID are not
necessarily tied to DPDK port IDs.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Qi Zhang <qi.z.zhang@intel.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
Cc: Declan Doherty <declan.doherty@intel.com>

---

This patch provides the same functionality and supersedes Qi Zhang's
"ether: add flow action to redirect packet to a port" [1].

The main differences are:

- Action is named PORT_ID instead of PORT.
- Addition of a PORT_ID pattern item.
- More extensive documentation.
- Testpmd support.
- rte_flow_copy() support.

[1] http://dpdk.org/ml/archives/dev/2018-April/094648.html
---
 app/test-pmd/cmdline_flow.c                 | 57 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  2 +
 doc/guides/prog_guide/rte_flow.rst          | 48 ++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  9 ++++
 lib/librte_ether/rte_flow.c                 |  2 +
 lib/librte_ether/rte_flow.h                 | 56 +++++++++++++++++++++++
 6 files changed, 174 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 356714801..32fe6645a 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -89,6 +89,8 @@ enum index {
 	ITEM_VF_ID,
 	ITEM_PHY_PORT,
 	ITEM_PHY_PORT_INDEX,
+	ITEM_PORT_ID,
+	ITEM_PORT_ID_ID,
 	ITEM_RAW,
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -185,6 +187,9 @@ enum index {
 	ACTION_PHY_PORT,
 	ACTION_PHY_PORT_ORIGINAL,
 	ACTION_PHY_PORT_INDEX,
+	ACTION_PORT_ID,
+	ACTION_PORT_ID_ORIGINAL,
+	ACTION_PORT_ID_ID,
 	ACTION_METER,
 	ACTION_METER_ID,
 };
@@ -445,6 +450,7 @@ static const enum index next_item[] = {
 	ITEM_PF,
 	ITEM_VF,
 	ITEM_PHY_PORT,
+	ITEM_PORT_ID,
 	ITEM_RAW,
 	ITEM_ETH,
 	ITEM_VLAN,
@@ -491,6 +497,12 @@ static const enum index item_phy_port[] = {
 	ZERO,
 };
 
+static const enum index item_port_id[] = {
+	ITEM_PORT_ID_ID,
+	ITEM_NEXT,
+	ZERO,
+};
+
 static const enum index item_raw[] = {
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -627,6 +639,7 @@ static const enum index next_action[] = {
 	ACTION_PF,
 	ACTION_VF,
 	ACTION_PHY_PORT,
+	ACTION_PORT_ID,
 	ACTION_METER,
 	ZERO,
 };
@@ -668,6 +681,13 @@ static const enum index action_phy_port[] = {
 	ZERO,
 };
 
+static const enum index action_port_id[] = {
+	ACTION_PORT_ID_ORIGINAL,
+	ACTION_PORT_ID_ID,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static const enum index action_meter[] = {
 	ACTION_METER_ID,
 	ACTION_NEXT,
@@ -1084,6 +1104,20 @@ static const struct token token_list[] = {
 		.next = NEXT(item_phy_port, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_phy_port, index)),
 	},
+	[ITEM_PORT_ID] = {
+		.name = "port_id",
+		.help = "match traffic from/to a given DPDK port ID",
+		.priv = PRIV_ITEM(PORT_ID,
+				  sizeof(struct rte_flow_item_port_id)),
+		.next = NEXT(item_port_id),
+		.call = parse_vc,
+	},
+	[ITEM_PORT_ID_ID] = {
+		.name = "id",
+		.help = "DPDK port ID",
+		.next = NEXT(item_port_id, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_port_id, id)),
+	},
 	[ITEM_RAW] = {
 		.name = "raw",
 		.help = "match an arbitrary byte string",
@@ -1749,6 +1783,29 @@ static const struct token token_list[] = {
 					index)),
 		.call = parse_vc_conf,
 	},
+	[ACTION_PORT_ID] = {
+		.name = "port_id",
+		.help = "direct matching traffic to a given DPDK port ID",
+		.priv = PRIV_ACTION(PORT_ID,
+				    sizeof(struct rte_flow_action_port_id)),
+		.next = NEXT(action_port_id),
+		.call = parse_vc,
+	},
+	[ACTION_PORT_ID_ORIGINAL] = {
+		.name = "original",
+		.help = "use original DPDK port ID if possible",
+		.next = NEXT(action_port_id, NEXT_ENTRY(BOOLEAN)),
+		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_port_id,
+					   original, 1)),
+		.call = parse_vc_conf,
+	},
+	[ACTION_PORT_ID_ID] = {
+		.name = "id",
+		.help = "DPDK port ID",
+		.next = NEXT(action_port_id, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_port_id, id)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_METER] = {
 		.name = "meter",
 		.help = "meter the directed packets at given id",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 2d68f1fb0..e7026011b 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -977,6 +977,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
+	MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
@@ -1075,6 +1076,7 @@ static const struct {
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
+	MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
 };
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a39c1e1b0..2fb8e9c3f 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -617,6 +617,36 @@ associated with a port_id should be retrieved by other means.
    | ``mask`` | ``index`` | zeroed to match any port index |
    +----------+-----------+--------------------------------+
 
+Item: ``PORT_ID``
+^^^^^^^^^^^^^^^^^
+
+Matches traffic originating from (ingress) or going to (egress) a given DPDK
+port ID.
+
+Normally only supported if the port ID in question is known by the
+underlying PMD and related to the device the flow rule is created against.
+
+This must not be confused with `Item: PHY_PORT`_ which refers to the
+physical port of a device, whereas `Item: PORT_ID`_ refers to a ``struct
+rte_eth_dev`` object on the application side (also known as "port
+representor" depending on the kind of underlying device).
+
+- Default ``mask`` matches the specified DPDK port ID.
+
+.. _table_rte_flow_item_port_id:
+
+.. table:: PORT_ID
+
+   +----------+----------+-----------------------------+
+   | Field    | Subfield | Value                       |
+   +==========+==========+=============================+
+   | ``spec`` | ``id``   | DPDK port ID                |
+   +----------+----------+-----------------------------+
+   | ``last`` | ``id``   | upper range value           |
+   +----------+----------+-----------------------------+
+   | ``mask`` | ``id``   | zeroed to match any port ID |
+   +----------+----------+-----------------------------+
+
 Data matching item types
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1453,6 +1483,24 @@ See `Item: PHY_PORT`_.
    | ``index``    | physical port index                 |
    +--------------+-------------------------------------+
 
+Action: ``PORT_ID``
+^^^^^^^^^^^^^^^^^^^
+Directs matching traffic to a given DPDK port ID.
+
+See `Item: PORT_ID`_.
+
+.. _table_rte_flow_action_port_id:
+
+.. table:: PORT_ID
+
+   +--------------+---------------------------------------+
+   | Field        | Value                                 |
+   +==============+=======================================+
+   | ``original`` | use original DPDK port ID if possible |
+   +--------------+---------------------------------------+
+   | ``id``       | DPDK port ID                          |
+   +--------------+---------------------------------------+
+
 Action: ``METER``
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 64d8dfddb..bfb5ad027 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3212,6 +3212,10 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``index {unsigned}``: physical port index.
 
+- ``port_id``: match traffic from/to a given DPDK port ID.
+
+  - ``id {unsigned}``: DPDK port ID.
+
 - ``raw``: match an arbitrary byte string.
 
   - ``relative {boolean}``: look for pattern after the previous item.
@@ -3428,6 +3432,11 @@ This section lists supported actions and their attributes, if any.
   - ``original {boolean}``: use original port index if possible.
   - ``index {unsigned}``: physical port index.
 
+- ``port_id``: direct matching traffic to a given DPDK port ID.
+
+  - ``original {boolean}``: use original DPDK port ID if possible.
+  - ``id {unsigned}``: DPDK port ID.
+
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 00989c73b..cecab59f6 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,6 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
+	MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
@@ -77,6 +78,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
+	MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
 };
 
 static int
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 58b75e934..09a21e531 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -180,6 +180,16 @@ enum rte_flow_item_type {
 	RTE_FLOW_ITEM_TYPE_PHY_PORT,
 
 	/**
+	 * [META]
+	 *
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * given DPDK port ID.
+	 *
+	 * See struct rte_flow_item_port_id.
+	 */
+	RTE_FLOW_ITEM_TYPE_PORT_ID,
+
+	/**
 	 * Matches a byte string of a given length at a given offset.
 	 *
 	 * See struct rte_flow_item_raw.
@@ -414,6 +424,32 @@ static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = {
 #endif
 
 /**
+ * RTE_FLOW_ITEM_TYPE_PORT_ID
+ *
+ * Matches traffic originating from (ingress) or going to (egress) a given
+ * DPDK port ID.
+ *
+ * Normally only supported if the port ID in question is known by the
+ * underlying PMD and related to the device the flow rule is created
+ * against.
+ *
+ * This must not be confused with @p PHY_PORT which refers to the physical
+ * port of a device, whereas @p PORT_ID refers to a struct rte_eth_dev
+ * object on the application side (also known as "port representor"
+ * depending on the kind of underlying device).
+ */
+struct rte_flow_item_port_id {
+	uint32_t id; /**< DPDK port ID. */
+};
+
+/** Default mask for RTE_FLOW_ITEM_TYPE_PORT_ID. */
+#ifndef __cplusplus
+static const struct rte_flow_item_port_id rte_flow_item_port_id_mask = {
+	.id = 0xffffffff,
+};
+#endif
+
+/**
  * RTE_FLOW_ITEM_TYPE_RAW
  *
  * Matches a byte string of a given length at a given offset.
@@ -997,6 +1033,13 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_PHY_PORT,
 
 	/**
+	 * Directs matching traffic to a given DPDK port ID.
+	 *
+	 * See struct rte_flow_action_port_id.
+	 */
+	RTE_FLOW_ACTION_TYPE_PORT_ID,
+
+	/**
 	 * Traffic metering and policing (MTR).
 	 *
 	 * See struct rte_flow_action_meter.
@@ -1134,6 +1177,19 @@ struct rte_flow_action_phy_port {
 };
 
 /**
+ * RTE_FLOW_ACTION_TYPE_PORT_ID
+ *
+ * Directs matching traffic to a given DPDK port ID.
+ *
+ * @see RTE_FLOW_ITEM_TYPE_PORT_ID
+ */
+struct rte_flow_action_port_id {
+	uint32_t original:1; /**< Use original DPDK port ID if possible. */
+	uint32_t reserved:31; /**< Reserved, must be zero. */
+	uint32_t id; /**< DPDK port ID. */
+};
+
+/**
  * RTE_FLOW_ACTION_TYPE_METER
  *
  * Traffic metering and policing (MTR).
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v5 15/16] ethdev: add physical port action to flow API
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                           ` (10 preceding siblings ...)
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 14/16] ethdev: rename physical port item " Adrien Mazarguil
@ 2018-04-19 10:16  3%         ` Adrien Mazarguil
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 16/16] ethdev: add port ID item and " Adrien Mazarguil
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z

This patch adds the missing action counterpart to the PHY_PORT pattern
item, that is, the ability to directly inject matching traffic into a
physical port of the underlying device.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Mohammad Abdul Awal <mohammad.abdul.awal@intel.com>
Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
 6 files changed, 84 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index f9f937277..356714801 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -182,6 +182,9 @@ enum index {
 	ACTION_VF,
 	ACTION_VF_ORIGINAL,
 	ACTION_VF_ID,
+	ACTION_PHY_PORT,
+	ACTION_PHY_PORT_ORIGINAL,
+	ACTION_PHY_PORT_INDEX,
 	ACTION_METER,
 	ACTION_METER_ID,
 };
@@ -623,6 +626,7 @@ static const enum index next_action[] = {
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
+	ACTION_PHY_PORT,
 	ACTION_METER,
 	ZERO,
 };
@@ -657,6 +661,13 @@ static const enum index action_vf[] = {
 	ZERO,
 };
 
+static const enum index action_phy_port[] = {
+	ACTION_PHY_PORT_ORIGINAL,
+	ACTION_PHY_PORT_INDEX,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static const enum index action_meter[] = {
 	ACTION_METER_ID,
 	ACTION_NEXT,
@@ -1714,6 +1725,30 @@ static const struct token token_list[] = {
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
 		.call = parse_vc_conf,
 	},
+	[ACTION_PHY_PORT] = {
+		.name = "phy_port",
+		.help = "direct packets to physical port index",
+		.priv = PRIV_ACTION(PHY_PORT,
+				    sizeof(struct rte_flow_action_phy_port)),
+		.next = NEXT(action_phy_port),
+		.call = parse_vc,
+	},
+	[ACTION_PHY_PORT_ORIGINAL] = {
+		.name = "original",
+		.help = "use original port index if possible",
+		.next = NEXT(action_phy_port, NEXT_ENTRY(BOOLEAN)),
+		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_phy_port,
+					   original, 1)),
+		.call = parse_vc_conf,
+	},
+	[ACTION_PHY_PORT_INDEX] = {
+		.name = "index",
+		.help = "physical port index",
+		.next = NEXT(action_phy_port, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_phy_port,
+					index)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_METER] = {
 		.name = "meter",
 		.help = "meter the directed packets at given id",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 840320108..2d68f1fb0 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1074,6 +1074,7 @@ static const struct {
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
+	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
 };
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 4e053c24b..a39c1e1b0 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1433,6 +1433,26 @@ See `Item: VF`_.
    | ``id``       | VF ID                          |
    +--------------+--------------------------------+
 
+Action: ``PHY_PORT``
+^^^^^^^^^^^^^^^^^^^^
+
+Directs matching traffic to a given physical port index of the underlying
+device.
+
+See `Item: PHY_PORT`_.
+
+.. _table_rte_flow_action_phy_port:
+
+.. table:: PHY_PORT
+
+   +--------------+-------------------------------------+
+   | Field        | Value                               |
+   +==============+=====================================+
+   | ``original`` | use original port index if possible |
+   +--------------+-------------------------------------+
+   | ``index``    | physical port index                 |
+   +--------------+-------------------------------------+
+
 Action: ``METER``
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index a2bbd1930..64d8dfddb 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3423,6 +3423,11 @@ This section lists supported actions and their attributes, if any.
   - ``original {boolean}``: use original VF ID if possible.
   - ``id {unsigned}``: VF ID.
 
+- ``phy_port``: direct packets to physical port index.
+
+  - ``original {boolean}``: use original port index if possible.
+  - ``index {unsigned}``: physical port index.
+
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 36e277a4f..00989c73b 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -76,6 +76,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
+	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
 };
 
 static int
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 2c7c4d009..58b75e934 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -989,6 +989,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VF,
 
 	/**
+	 * Directs packets to a given physical port index of the underlying
+	 * device.
+	 *
+	 * See struct rte_flow_action_phy_port.
+	 */
+	RTE_FLOW_ACTION_TYPE_PHY_PORT,
+
+	/**
 	 * Traffic metering and policing (MTR).
 	 *
 	 * See struct rte_flow_action_meter.
@@ -1112,6 +1120,20 @@ struct rte_flow_action_vf {
 };
 
 /**
+ * RTE_FLOW_ACTION_TYPE_PHY_PORT
+ *
+ * Directs packets to a given physical port index of the underlying
+ * device.
+ *
+ * @see RTE_FLOW_ITEM_TYPE_PHY_PORT
+ */
+struct rte_flow_action_phy_port {
+	uint32_t original:1; /**< Use original port index if possible. */
+	uint32_t reserved:31; /**< Reserved, must be zero. */
+	uint32_t index; /**< Physical port index. */
+};
+
+/**
  * RTE_FLOW_ACTION_TYPE_METER
  *
  * Traffic metering and policing (MTR).
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v5 14/16] ethdev: rename physical port item in flow API
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                           ` (9 preceding siblings ...)
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
@ 2018-04-19 10:16  2%         ` Adrien Mazarguil
  2018-04-19 10:16  3%         ` [dpdk-dev] [PATCH v5 15/16] ethdev: add physical port action to " Adrien Mazarguil
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 16/16] ethdev: add port ID item and " Adrien Mazarguil
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

While RTE_FLOW_ITEM_TYPE_PORT refers to physical ports of the underlying
device using specific identifiers, these are often confused with DPDK port
IDs exposed to applications in the global name space.

Since this pattern item is seldom used, rename it RTE_FLOW_ITEM_PHY_PORT
for better clarity.

No ABI impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/cmdline_flow.c                 | 27 +++++++++++----------
 app/test-pmd/config.c                       |  2 +-
 doc/guides/prog_guide/rte_flow.rst          | 22 ++++++++---------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 +-
 lib/librte_ether/rte_flow.c                 |  2 +-
 lib/librte_ether/rte_flow.h                 | 31 ++++++++++--------------
 6 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 41103de67..f9f937277 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -87,8 +87,8 @@ enum index {
 	ITEM_PF,
 	ITEM_VF,
 	ITEM_VF_ID,
-	ITEM_PORT,
-	ITEM_PORT_INDEX,
+	ITEM_PHY_PORT,
+	ITEM_PHY_PORT_INDEX,
 	ITEM_RAW,
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -441,7 +441,7 @@ static const enum index next_item[] = {
 	ITEM_ANY,
 	ITEM_PF,
 	ITEM_VF,
-	ITEM_PORT,
+	ITEM_PHY_PORT,
 	ITEM_RAW,
 	ITEM_ETH,
 	ITEM_VLAN,
@@ -482,8 +482,8 @@ static const enum index item_vf[] = {
 	ZERO,
 };
 
-static const enum index item_port[] = {
-	ITEM_PORT_INDEX,
+static const enum index item_phy_port[] = {
+	ITEM_PHY_PORT_INDEX,
 	ITEM_NEXT,
 	ZERO,
 };
@@ -1059,18 +1059,19 @@ static const struct token token_list[] = {
 		.next = NEXT(item_vf, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_vf, id)),
 	},
-	[ITEM_PORT] = {
-		.name = "port",
-		.help = "device-specific physical port index to use",
-		.priv = PRIV_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-		.next = NEXT(item_port),
+	[ITEM_PHY_PORT] = {
+		.name = "phy_port",
+		.help = "match traffic from/to a specific physical port",
+		.priv = PRIV_ITEM(PHY_PORT,
+				  sizeof(struct rte_flow_item_phy_port)),
+		.next = NEXT(item_phy_port),
 		.call = parse_vc,
 	},
-	[ITEM_PORT_INDEX] = {
+	[ITEM_PHY_PORT_INDEX] = {
 		.name = "index",
 		.help = "physical port index",
-		.next = NEXT(item_port, NEXT_ENTRY(UNSIGNED), item_param),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_port, index)),
+		.next = NEXT(item_phy_port, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_phy_port, index)),
 	},
 	[ITEM_RAW] = {
 		.name = "raw",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index a50a5c544..840320108 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -976,7 +976,7 @@ static const struct {
 	MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)),
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
-	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
+	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a0a124aa2..4e053c24b 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -195,8 +195,8 @@ When supported, this effectively enables an application to reroute traffic
 not necessarily intended for it (e.g. coming from or addressed to different
 physical ports, VFs or applications) at the device level.
 
-It complements the behavior of some pattern items such as `Item: PORT`_ and
-is meaningless without them.
+It complements the behavior of some pattern items such as `Item: PHY_PORT`_
+and is meaningless without them.
 
 When transferring flow rules, **ingress** and **egress** attributes
 (`Attribute: Traffic direction`_) keep their original meaning, as if
@@ -583,15 +583,15 @@ separate entities, should be addressed through their own DPDK port IDs.
    | ``mask`` | ``id``   | zeroed to match any VF ID |
    +----------+----------+---------------------------+
 
-Item: ``PORT``
-^^^^^^^^^^^^^^
+Item: ``PHY_PORT``
+^^^^^^^^^^^^^^^^^^
 
-Matches packets coming from the specified physical port of the underlying
-device.
+Matches traffic originating from (ingress) or going to (egress) a physical
+port of the underlying device.
 
-The first PORT item overrides the physical port normally associated with the
-specified DPDK input port (port_id). This item can be provided several times
-to match additional physical ports.
+The first PHY_PORT item overrides the physical port normally associated with
+the specified DPDK input port (port_id). This item can be provided several
+times to match additional physical ports.
 
 Note that physical ports are not necessarily tied to DPDK input ports
 (port_id) when those are not under DPDK control. Possible values are
@@ -603,9 +603,9 @@ associated with a port_id should be retrieved by other means.
 
 - Default ``mask`` matches any port index.
 
-.. _table_rte_flow_item_port:
+.. _table_rte_flow_item_phy_port:
 
-.. table:: PORT
+.. table:: PHY_PORT
 
    +----------+-----------+--------------------------------+
    | Field    | Subfield  | Value                          |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index af37c3d82..a2bbd1930 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3208,7 +3208,7 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``id {unsigned}``: VF ID.
 
-- ``port``: device-specific physical port index to use.
+- ``phy_port``: match traffic from/to a specific physical port.
 
   - ``index {unsigned}``: physical port index.
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 83b733ff0..36e277a4f 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -38,7 +38,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)),
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
-	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
+	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index f1c7a664e..2c7c4d009 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -84,7 +84,7 @@ struct rte_flow_attr {
 	 * applications) at the device level.
 	 *
 	 * It complements the behavior of some pattern items such as
-	 * RTE_FLOW_ITEM_TYPE_PORT and is meaningless without them.
+	 * RTE_FLOW_ITEM_TYPE_PHY_PORT and is meaningless without them.
 	 *
 	 * When transferring flow rules, ingress and egress attributes keep
 	 * their original meaning, as if processing traffic emitted or
@@ -172,17 +172,12 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets coming from the specified physical port of the
-	 * underlying device.
-	 *
-	 * The first PORT item overrides the physical port normally
-	 * associated with the specified DPDK input port (port_id). This
-	 * item can be provided several times to match additional physical
-	 * ports.
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * physical port of the underlying device.
 	 *
-	 * See struct rte_flow_item_port.
+	 * See struct rte_flow_item_phy_port.
 	 */
-	RTE_FLOW_ITEM_TYPE_PORT,
+	RTE_FLOW_ITEM_TYPE_PHY_PORT,
 
 	/**
 	 * Matches a byte string of a given length at a given offset.
@@ -388,13 +383,13 @@ static const struct rte_flow_item_vf rte_flow_item_vf_mask = {
 #endif
 
 /**
- * RTE_FLOW_ITEM_TYPE_PORT
+ * RTE_FLOW_ITEM_TYPE_PHY_PORT
  *
- * Matches packets coming from the specified physical port of the underlying
- * device.
+ * Matches traffic originating from (ingress) or going to (egress) a
+ * physical port of the underlying device.
  *
- * The first PORT item overrides the physical port normally associated with
- * the specified DPDK input port (port_id). This item can be provided
+ * The first PHY_PORT item overrides the physical port normally associated
+ * with the specified DPDK input port (port_id). This item can be provided
  * several times to match additional physical ports.
  *
  * Note that physical ports are not necessarily tied to DPDK input ports
@@ -407,13 +402,13 @@ static const struct rte_flow_item_vf rte_flow_item_vf_mask = {
  *
  * A zeroed mask can be used to match any port index.
  */
-struct rte_flow_item_port {
+struct rte_flow_item_phy_port {
 	uint32_t index; /**< Physical port index. */
 };
 
-/** Default mask for RTE_FLOW_ITEM_TYPE_PORT. */
+/** Default mask for RTE_FLOW_ITEM_TYPE_PHY_PORT. */
 #ifndef __cplusplus
-static const struct rte_flow_item_port rte_flow_item_port_mask = {
+static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = {
 	.index = 0x00000000,
 };
 #endif
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v5 13/16] ethdev: update behavior of VF/PF in flow API
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                           ` (8 preceding siblings ...)
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
@ 2018-04-19 10:16  2%         ` Adrien Mazarguil
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 14/16] ethdev: rename physical port item " Adrien Mazarguil
                           ` (2 subsequent siblings)
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Somnath Kotur, Beilei Xing, Qi Zhang

Contrary to all other pattern items, these are inconsistently documented as
affecting traffic instead of simply matching its origin, without provision
for the latter.

This commit clarifies documentation and updates PMDs since the original
behavior now has to be explicitly requested using the new transfer
attribute.

It breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

Impacted PMDs are bnxt and i40e, for which the VF pattern item is now only
supported when a transfer attribute is also present.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Somnath Kotur <somnath.kotur@broadcom.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 12 +++---
 doc/guides/prog_guide/rte_flow.rst          | 36 +++++++++---------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 12 +++---
 drivers/net/bnxt/bnxt_filter.c              | 22 ++++++-----
 drivers/net/i40e/i40e_flow.c                | 23 +++++++-----
 lib/librte_ether/rte_flow.h                 | 47 ++++++++++--------------
 6 files changed, 77 insertions(+), 75 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 1c6b5a112..41103de67 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -1041,21 +1041,21 @@ static const struct token token_list[] = {
 	},
 	[ITEM_PF] = {
 		.name = "pf",
-		.help = "match packets addressed to the physical function",
+		.help = "match traffic from/to the physical function",
 		.priv = PRIV_ITEM(PF, 0),
 		.next = NEXT(NEXT_ENTRY(ITEM_NEXT)),
 		.call = parse_vc,
 	},
 	[ITEM_VF] = {
 		.name = "vf",
-		.help = "match packets addressed to a virtual function ID",
+		.help = "match traffic from/to a virtual function ID",
 		.priv = PRIV_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 		.next = NEXT(item_vf),
 		.call = parse_vc,
 	},
 	[ITEM_VF_ID] = {
 		.name = "id",
-		.help = "destination VF ID",
+		.help = "VF ID",
 		.next = NEXT(item_vf, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_vf, id)),
 	},
@@ -1686,14 +1686,14 @@ static const struct token token_list[] = {
 	},
 	[ACTION_PF] = {
 		.name = "pf",
-		.help = "redirect packets to physical device function",
+		.help = "direct traffic to physical function",
 		.priv = PRIV_ACTION(PF, 0),
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
 	[ACTION_VF] = {
 		.name = "vf",
-		.help = "redirect packets to virtual device function",
+		.help = "direct traffic to a virtual function ID",
 		.priv = PRIV_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 		.next = NEXT(action_vf),
 		.call = parse_vc,
@@ -1708,7 +1708,7 @@ static const struct token token_list[] = {
 	},
 	[ACTION_VF_ID] = {
 		.name = "id",
-		.help = "VF ID to redirect packets to",
+		.help = "VF ID",
 		.next = NEXT(action_vf, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
 		.call = parse_vc_conf,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 550a4c95b..a0a124aa2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -528,15 +528,12 @@ Usage example, matching non-TCPv4 packets only:
 Item: ``PF``
 ^^^^^^^^^^^^
 
-Matches packets addressed to the physical function of the device.
+Matches traffic originating from (ingress) or going to (egress) the physical
+function of the current device.
 
-If the underlying device function differs from the one that would normally
-receive the matched traffic, specifying this item prevents it from reaching
-that device unless the flow rule contains a `Action: PF`_. Packets are not
-duplicated between device instances by default.
+If supported, should work even if the physical function is not managed by
+the application and thus not associated with a DPDK port ID.
 
-- Likely to return an error or never match any traffic if applied to a VF
-  device.
 - Can be combined with any number of `Item: VF`_ to match both PF and VF
   traffic.
 - ``spec``, ``last`` and ``mask`` must not be set.
@@ -558,15 +555,15 @@ duplicated between device instances by default.
 Item: ``VF``
 ^^^^^^^^^^^^
 
-Matches packets addressed to a virtual function ID of the device.
+Matches traffic originating from (ingress) or going to (egress) a given
+virtual function of the current device.
 
-If the underlying device function differs from the one that would normally
-receive the matched traffic, specifying this item prevents it from reaching
-that device unless the flow rule contains a `Action: VF`_. Packets are not
-duplicated between device instances by default.
+If supported, should work even if the virtual function is not managed by the
+application and thus not associated with a DPDK port ID.
+
+Note this pattern item does not match VF representors traffic which, as
+separate entities, should be addressed through their own DPDK port IDs.
 
-- Likely to return an error or never match any traffic if this causes a VF
-  device to match traffic addressed to a different VF.
 - Can be specified multiple times to match traffic addressed to several VF
   IDs.
 - Can be combined with a PF item to match both PF and VF traffic.
@@ -1395,7 +1392,10 @@ only matching traffic goes through.
 Action: ``PF``
 ^^^^^^^^^^^^^^
 
-Redirects packets to the physical function (PF) of the current device.
+Directs matching traffic to the physical function (PF) of the current
+device.
+
+See `Item: PF`_.
 
 - No configurable properties.
 
@@ -1412,13 +1412,15 @@ Redirects packets to the physical function (PF) of the current device.
 Action: ``VF``
 ^^^^^^^^^^^^^^
 
-Redirects packets to a virtual function (VF) of the current device.
+Directs matching traffic to a given virtual function of the current device.
 
 Packets matched by a VF pattern item can be redirected to their original VF
 ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
+See `Item: VF`_.
+
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1428,7 +1430,7 @@ rule or if packets are not addressed to a VF in the first place.
    +==============+================================+
    | ``original`` | use original VF ID if possible |
    +--------------+--------------------------------+
-   | ``vf``       | VF ID to redirect packets to   |
+   | ``id``       | VF ID                          |
    +--------------+--------------------------------+
 
 Action: ``METER``
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 0bf6c33c9..af37c3d82 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3202,11 +3202,11 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``num {unsigned}``: number of layers covered.
 
-- ``pf``: match packets addressed to the physical function.
+- ``pf``: match traffic from/to the physical function.
 
-- ``vf``: match packets addressed to a virtual function ID.
+- ``vf``: match traffic from/to a virtual function ID.
 
-  - ``id {unsigned}``: destination VF ID.
+  - ``id {unsigned}``: VF ID.
 
 - ``port``: device-specific physical port index to use.
 
@@ -3416,12 +3416,12 @@ This section lists supported actions and their attributes, if any.
 
   - ``queues [{unsigned} [...]] end``: queue indices to use.
 
-- ``pf``: redirect packets to physical device function.
+- ``pf``: direct traffic to physical function.
 
-- ``vf``: redirect packets to virtual device function.
+- ``vf``: direct traffic to a virtual function ID.
 
   - ``original {boolean}``: use original VF ID if possible.
-  - ``id {unsigned}``: VF ID to redirect packets to.
+  - ``id {unsigned}``: VF ID.
 
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 68deb3445..dadd1e32f 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -283,6 +283,7 @@ bnxt_filter_type_check(const struct rte_flow_item pattern[],
 
 static int
 bnxt_validate_and_parse_flow_type(struct bnxt *bp,
+				  const struct rte_flow_attr *attr,
 				  const struct rte_flow_item pattern[],
 				  struct rte_flow_error *error,
 				  struct bnxt_filter_info *filter)
@@ -707,6 +708,16 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 				return -rte_errno;
 			}
 
+			if (!attr->transfer) {
+				rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ITEM,
+					   item,
+					   "Matching VF traffic without"
+					   " affecting it (transfer attribute)"
+					   " is unsupported");
+				return -rte_errno;
+			}
+
 			filter->mirror_vnic_id =
 			dflt_vnic = bnxt_hwrm_func_qcfg_vf_dflt_vnic_id(bp, vf);
 			if (dflt_vnic < 0) {
@@ -754,14 +765,6 @@ bnxt_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
-	if (attr->transfer) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
-				   attr, "No support for transfer.");
-		return -rte_errno;
-	}
-
-	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -841,7 +844,8 @@ bnxt_validate_and_parse_flow(struct rte_eth_dev *dev,
 		goto ret;
 	}
 
-	rc = bnxt_validate_and_parse_flow_type(bp, pattern, error, filter);
+	rc = bnxt_validate_and_parse_flow_type(bp, attr, pattern, error,
+					       filter);
 	if (rc != 0)
 		goto ret;
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index f416b6a00..057e4f96d 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -54,6 +54,7 @@ static int i40e_flow_parse_ethertype_action(struct rte_eth_dev *dev,
 				    struct rte_flow_error *error,
 				    struct rte_eth_ethertype_filter *filter);
 static int i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
+					const struct rte_flow_attr *attr,
 					const struct rte_flow_item *pattern,
 					struct rte_flow_error *error,
 					struct i40e_fdir_filter_conf *filter);
@@ -1918,14 +1919,6 @@ i40e_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
-	if (attr->transfer) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
-				   attr, "No support for transfer.");
-		return -rte_errno;
-	}
-
-	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -2429,6 +2422,7 @@ i40e_flow_fdir_get_pctype_value(struct i40e_pf *pf,
  */
 static int
 i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
+			     const struct rte_flow_attr *attr,
 			     const struct rte_flow_item *pattern,
 			     struct rte_flow_error *error,
 			     struct i40e_fdir_filter_conf *filter)
@@ -2966,6 +2960,16 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 			break;
 		case RTE_FLOW_ITEM_TYPE_VF:
 			vf_spec = item->spec;
+			if (!attr->transfer) {
+				rte_flow_error_set(error, ENOTSUP,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "Matching VF traffic"
+						   " without affecting it"
+						   " (transfer attribute)"
+						   " is unsupported");
+				return -rte_errno;
+			}
 			filter->input.flow_ext.is_vf = 1;
 			filter->input.flow_ext.dst_id = vf_spec->id;
 			if (filter->input.flow_ext.is_vf &&
@@ -3128,7 +3132,8 @@ i40e_flow_parse_fdir_filter(struct rte_eth_dev *dev,
 		&filter->fdir_filter;
 	int ret;
 
-	ret = i40e_flow_parse_fdir_pattern(dev, pattern, error, fdir_filter);
+	ret = i40e_flow_parse_fdir_pattern(dev, attr, pattern, error,
+					   fdir_filter);
 	if (ret)
 		return ret;
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index ab2bf2dce..f1c7a664e 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -152,13 +152,8 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets addressed to the physical function of the device.
-	 *
-	 * If the underlying device function differs from the one that would
-	 * normally receive the matched traffic, specifying this item
-	 * prevents it from reaching that device unless the flow rule
-	 * contains a PF action. Packets are not duplicated between device
-	 * instances by default.
+	 * Matches traffic originating from (ingress) or going to (egress)
+	 * the physical function of the current device.
 	 *
 	 * No associated specification structure.
 	 */
@@ -167,13 +162,8 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets addressed to a virtual function ID of the device.
-	 *
-	 * If the underlying device function differs from the one that would
-	 * normally receive the matched traffic, specifying this item
-	 * prevents it from reaching that device unless the flow rule
-	 * contains a VF action. Packets are not duplicated between device
-	 * instances by default.
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * given virtual function of the current device.
 	 *
 	 * See struct rte_flow_item_vf.
 	 */
@@ -371,15 +361,15 @@ static const struct rte_flow_item_any rte_flow_item_any_mask = {
 /**
  * RTE_FLOW_ITEM_TYPE_VF
  *
- * Matches packets addressed to a virtual function ID of the device.
+ * Matches traffic originating from (ingress) or going to (egress) a given
+ * virtual function of the current device.
  *
- * If the underlying device function differs from the one that would
- * normally receive the matched traffic, specifying this item prevents it
- * from reaching that device unless the flow rule contains a VF
- * action. Packets are not duplicated between device instances by default.
+ * If supported, should work even if the virtual function is not managed by
+ * the application and thus not associated with a DPDK port ID.
+ *
+ * Note this pattern item does not match VF representors traffic which, as
+ * separate entities, should be addressed through their own DPDK port IDs.
  *
- * - Likely to return an error or never match any traffic if this causes a
- *   VF device to match traffic addressed to a different VF.
  * - Can be specified multiple times to match traffic addressed to several
  *   VF IDs.
  * - Can be combined with a PF item to match both PF and VF traffic.
@@ -387,7 +377,7 @@ static const struct rte_flow_item_any rte_flow_item_any_mask = {
  * A zeroed mask can be used to match any VF ID.
  */
 struct rte_flow_item_vf {
-	uint32_t id; /**< Destination VF ID. */
+	uint32_t id; /**< VF ID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_VF. */
@@ -988,16 +978,16 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_RSS,
 
 	/**
-	 * Redirects packets to the physical function (PF) of the current
-	 * device.
+	 * Directs matching traffic to the physical function (PF) of the
+	 * current device.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PF,
 
 	/**
-	 * Redirects packets to the virtual function (VF) of the current
-	 * device with the specified ID.
+	 * Directs matching traffic to a given virtual function of the
+	 * current device.
 	 *
 	 * See struct rte_flow_action_vf.
 	 */
@@ -1111,7 +1101,8 @@ struct rte_flow_action_rss {
 /**
  * RTE_FLOW_ACTION_TYPE_VF
  *
- * Redirects packets to a virtual function (VF) of the current device.
+ * Directs matching traffic to a given virtual function of the current
+ * device.
  *
  * Packets matched by a VF pattern item can be redirected to their original
  * VF ID instead of the specified one. This parameter may not be available
@@ -1122,7 +1113,7 @@ struct rte_flow_action_rss {
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
 	uint32_t reserved:31; /**< Reserved, must be zero. */
-	uint32_t id; /**< VF ID to redirect packets to. */
+	uint32_t id; /**< VF ID. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v5 12/16] ethdev: add transfer attribute to flow API
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                           ` (7 preceding siblings ...)
  2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
@ 2018-04-19 10:16  2%         ` Adrien Mazarguil
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
                           ` (3 subsequent siblings)
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Andrew Rybchenko

This new attribute enables applications to create flow rules that do not
simply match traffic whose origin is specified in the pattern (e.g. some
non-default physical port or VF), but actively affect it by applying the
flow rule at the lowest possible level in the underlying device.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>

---

v3 changes:

Clarified definition for ingress and egress following Andrew's comment on
subsequent patch.

[1] http://dpdk.org/ml/archives/dev/2018-April/095961.html
---
 app/test-pmd/cmdline_flow.c                 | 11 +++++
 app/test-pmd/config.c                       |  6 ++-
 doc/guides/prog_guide/rte_flow.rst          | 26 +++++++++++-
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 11 ++---
 drivers/net/bnxt/bnxt_filter.c              |  8 ++++
 drivers/net/e1000/igb_flow.c                | 44 ++++++++++++++++++++
 drivers/net/enic/enic_flow.c                |  6 +++
 drivers/net/i40e/i40e_flow.c                |  8 ++++
 drivers/net/ixgbe/ixgbe_flow.c              | 53 ++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_flow.c                |  4 ++
 drivers/net/mlx5/mlx5_flow.c                |  7 ++++
 drivers/net/mvpp2/mrvl_flow.c               |  6 +++
 drivers/net/sfc/sfc_flow.c                  |  6 +++
 drivers/net/tap/tap_flow.c                  |  6 +++
 lib/librte_ether/rte_flow.h                 | 22 +++++++++-
 15 files changed, 215 insertions(+), 9 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index f8f2a559e..1c6b5a112 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -69,6 +69,7 @@ enum index {
 	PRIORITY,
 	INGRESS,
 	EGRESS,
+	TRANSFER,
 
 	/* Validate/create pattern. */
 	PATTERN,
@@ -407,6 +408,7 @@ static const enum index next_vc_attr[] = {
 	PRIORITY,
 	INGRESS,
 	EGRESS,
+	TRANSFER,
 	PATTERN,
 	ZERO,
 };
@@ -960,6 +962,12 @@ static const struct token token_list[] = {
 		.next = NEXT(next_vc_attr),
 		.call = parse_vc,
 	},
+	[TRANSFER] = {
+		.name = "transfer",
+		.help = "apply rule directly to endpoints found in pattern",
+		.next = NEXT(next_vc_attr),
+		.call = parse_vc,
+	},
 	/* Validate/create pattern. */
 	[PATTERN] = {
 		.name = "pattern",
@@ -1945,6 +1953,9 @@ parse_vc(struct context *ctx, const struct token *token,
 	case EGRESS:
 		out->args.vc.attr.egress = 1;
 		return len;
+	case TRANSFER:
+		out->args.vc.attr.transfer = 1;
+		return len;
 	case PATTERN:
 		out->args.vc.pattern =
 			(void *)RTE_ALIGN_CEIL((uintptr_t)(out + 1),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 562fb2f8d..a50a5c544 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1239,6 +1239,7 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
 		[RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
+		[RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
 		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
@@ -1504,12 +1505,13 @@ port_flow_list(portid_t port_id, uint32_t n, const uint32_t group[n])
 		const struct rte_flow_item *item = pf->pattern;
 		const struct rte_flow_action *action = pf->actions;
 
-		printf("%" PRIu32 "\t%" PRIu32 "\t%" PRIu32 "\t%c%c\t",
+		printf("%" PRIu32 "\t%" PRIu32 "\t%" PRIu32 "\t%c%c%c\t",
 		       pf->id,
 		       pf->attr.group,
 		       pf->attr.priority,
 		       pf->attr.ingress ? 'i' : '-',
-		       pf->attr.egress ? 'e' : '-');
+		       pf->attr.egress ? 'e' : '-',
+		       pf->attr.transfer ? 't' : '-');
 		while (item->type != RTE_FLOW_ITEM_TYPE_END) {
 			if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
 				printf("%s ", flow_item[item->type].name);
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index c62a80566..550a4c95b 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -170,7 +170,13 @@ Note that support for more than a single priority level is not guaranteed.
 Attribute: Traffic direction
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Flow rules can apply to inbound and/or outbound traffic (ingress/egress).
+Flow rule patterns apply to inbound and/or outbound traffic.
+
+In the context of this API, **ingress** and **egress** respectively stand
+for **inbound** and **outbound** based on the standpoint of the application
+creating a flow rule.
+
+There are no exceptions to this definition.
 
 Several pattern items and actions are valid and can be used in both
 directions. At least one direction must be specified.
@@ -178,6 +184,24 @@ directions. At least one direction must be specified.
 Specifying both directions at once for a given rule is not recommended but
 may be valid in a few cases (e.g. shared counters).
 
+Attribute: Transfer
+^^^^^^^^^^^^^^^^^^^
+
+Instead of simply matching the properties of traffic as it would appear on a
+given DPDK port ID, enabling this attribute transfers a flow rule to the
+lowest possible level of any device endpoints found in the pattern.
+
+When supported, this effectively enables an application to reroute traffic
+not necessarily intended for it (e.g. coming from or addressed to different
+physical ports, VFs or applications) at the device level.
+
+It complements the behavior of some pattern items such as `Item: PORT`_ and
+is meaningless without them.
+
+When transferring flow rules, **ingress** and **egress** attributes
+(`Attribute: Traffic direction`_) keep their original meaning, as if
+processing traffic emitted or received by the application.
+
 Pattern item
 ~~~~~~~~~~~~
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 923664f7d..0bf6c33c9 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -2970,14 +2970,14 @@ following sections.
 - Check whether a flow rule can be created::
 
    flow validate {port_id}
-       [group {group_id}] [priority {level}] [ingress] [egress]
+       [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
        pattern {item} [/ {item} [...]] / end
        actions {action} [/ {action} [...]] / end
 
 - Create a flow rule::
 
    flow create {port_id}
-       [group {group_id}] [priority {level}] [ingress] [egress]
+       [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
        pattern {item} [/ {item} [...]] / end
        actions {action} [/ {action} [...]] / end
 
@@ -3010,7 +3010,7 @@ underlying device in its current state but stops short of creating it. It is
 bound to ``rte_flow_validate()``::
 
    flow validate {port_id}
-      [group {group_id}] [priority {level}] [ingress] [egress]
+      [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
       pattern {item} [/ {item} [...]] / end
       actions {action} [/ {action} [...]] / end
 
@@ -3047,7 +3047,7 @@ Creating flow rules
 to ``rte_flow_create()``::
 
    flow create {port_id}
-      [group {group_id}] [priority {level}] [ingress] [egress]
+      [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
       pattern {item} [/ {item} [...]] / end
       actions {action} [/ {action} [...]] / end
 
@@ -3061,7 +3061,7 @@ Otherwise it will show an error message of the form::
 
 Parameters describe in the following order:
 
-- Attributes (*group*, *priority*, *ingress*, *egress* tokens).
+- Attributes (*group*, *priority*, *ingress*, *egress*, *transfer* tokens).
 - A matching pattern, starting with the *pattern* token and terminated by an
   *end* pattern item.
 - Actions, starting with the *actions* token and terminated by an *end*
@@ -3089,6 +3089,7 @@ specified before the ``pattern`` token.
 - ``priority {level}``: priority level within group.
 - ``ingress``: rule applies to ingress traffic.
 - ``egress``: rule applies to egress traffic.
+- ``transfer``: apply rule directly to endpoints found in pattern.
 
 Each instance of an attribute specified several times overrides the previous
 value as shown below (group 4 is used)::
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 25806bdc0..68deb3445 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -754,6 +754,14 @@ bnxt_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index d1c0b4b8d..073852913 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -379,6 +379,15 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -624,6 +633,14 @@ cons_parse_ethertype_filter(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -923,6 +940,15 @@ cons_parse_syn_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_syn_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	/* Support 2 priorities, the lowest or highest. */
 	if (!attr->priority) {
 		filter->hig_pri = 0;
@@ -1211,6 +1237,15 @@ cons_parse_flex_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_flex_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_flex_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -1361,6 +1396,15 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(rss_conf, 0, sizeof(struct igb_rte_flow_rss_conf));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(rss_conf, 0, sizeof(struct igb_rte_flow_rss_conf));
 		rte_flow_error_set(error, EINVAL,
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index eea14ee73..525f3dd7c 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -1318,6 +1318,12 @@ enic_flow_parse(struct rte_eth_dev *dev,
 					   NULL,
 					   "egress is not supported");
 			return -rte_errno;
+		} else if (attrs->transfer) {
+			rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+					   NULL,
+					   "transfer is not supported");
+			return -rte_errno;
 		} else if (!attrs->ingress) {
 			rte_flow_error_set(error, ENOTSUP,
 					   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 470ab93d6..f416b6a00 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -1918,6 +1918,14 @@ i40e_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 438bfcdfb..eb0644c82 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -557,6 +557,15 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -787,6 +796,14 @@ cons_parse_ethertype_filter(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -1078,6 +1095,15 @@ cons_parse_syn_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_syn_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	/* Support 2 priorities, the lowest or highest. */
 	if (!attr->priority) {
 		filter->hig_pri = 0;
@@ -1250,6 +1276,15 @@ cons_parse_l2_tn_filter(struct rte_eth_dev *dev,
 	}
 
 	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_l2_tunnel_conf));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* not supported */
 	if (attr->priority) {
 		memset(filter, 0, sizeof(struct rte_eth_l2_tunnel_conf));
 		rte_flow_error_set(error, EINVAL,
@@ -1354,6 +1389,15 @@ ixgbe_parse_fdir_act_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* not supported */
+	if (attr->transfer) {
+		memset(rule, 0, sizeof(struct ixgbe_fdir_rule));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* not supported */
 	if (attr->priority) {
 		memset(rule, 0, sizeof(struct ixgbe_fdir_rule));
 		rte_flow_error_set(error, EINVAL,
@@ -2829,6 +2873,15 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(rss_conf, 0, sizeof(struct ixgbe_rte_flow_rss_conf));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(rss_conf, 0, sizeof(struct ixgbe_rte_flow_rss_conf));
 		rte_flow_error_set(error, EINVAL,
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index ce36ac715..e3d7aa8ef 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -652,6 +652,10 @@ mlx4_flow_prepare(struct priv *priv,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
 			 NULL, "egress is not supported");
+	if (attr->transfer)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			 NULL, "transfer is not supported");
 	if (!attr->ingress)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index bcf764b4d..e6c8b3df8 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -568,6 +568,13 @@ mlx5_flow_convert_attributes(const struct rte_flow_attr *attr,
 				   "egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   NULL,
+				   "transfer is not supported");
+		return -rte_errno;
+	}
 	if (!attr->ingress) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/mvpp2/mrvl_flow.c b/drivers/net/mvpp2/mrvl_flow.c
index 6478eb2fe..a2e2129cc 100644
--- a/drivers/net/mvpp2/mrvl_flow.c
+++ b/drivers/net/mvpp2/mrvl_flow.c
@@ -2187,6 +2187,12 @@ mrvl_flow_parse_attr(struct mrvl_priv *priv __rte_unused,
 				   "Egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, NULL,
+				   "Transfer is not supported");
+		return -rte_errno;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index cd6a61b39..bcde2c2f7 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1116,6 +1116,12 @@ sfc_flow_parse_attr(const struct rte_flow_attr *attr,
 				   "Egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer != 0) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, attr,
+				   "Transfer is not supported");
+		return -rte_errno;
+	}
 	if (attr->ingress == 0) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, attr,
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index dff09313a..ad2ba9f4e 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1039,6 +1039,12 @@ priv_flow_process(struct pmd_internals *pmd,
 	};
 	int action = 0; /* Only one action authorized for now */
 
+	if (attr->transfer) {
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			NULL, "transfer is not supported");
+		return -rte_errno;
+	}
 	if (attr->group > MAX_GROUP) {
 		rte_flow_error_set(
 			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 513734dce..ab2bf2dce 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -72,7 +72,26 @@ struct rte_flow_attr {
 	uint32_t priority; /**< Priority level within group. */
 	uint32_t ingress:1; /**< Rule applies to ingress traffic. */
 	uint32_t egress:1; /**< Rule applies to egress traffic. */
-	uint32_t reserved:30; /**< Reserved, must be zero. */
+	/**
+	 * Instead of simply matching the properties of traffic as it would
+	 * appear on a given DPDK port ID, enabling this attribute transfers
+	 * a flow rule to the lowest possible level of any device endpoints
+	 * found in the pattern.
+	 *
+	 * When supported, this effectively enables an application to
+	 * re-route traffic not necessarily intended for it (e.g. coming
+	 * from or addressed to different physical ports, VFs or
+	 * applications) at the device level.
+	 *
+	 * It complements the behavior of some pattern items such as
+	 * RTE_FLOW_ITEM_TYPE_PORT and is meaningless without them.
+	 *
+	 * When transferring flow rules, ingress and egress attributes keep
+	 * their original meaning, as if processing traffic emitted or
+	 * received by the application.
+	 */
+	uint32_t transfer:1;
+	uint32_t reserved:29; /**< Reserved, must be zero. */
 };
 
 /**
@@ -1181,6 +1200,7 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, /**< Priority field. */
 	RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, /**< Ingress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
+	RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, /**< Transfer field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
 	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v5 10/16] ethdev: refine TPID handling in flow API
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                           ` (6 preceding siblings ...)
  2018-04-19 10:16  3%         ` [dpdk-dev] [PATCH v5 09/16] ethdev: add encap level " Adrien Mazarguil
@ 2018-04-19 10:16  1%         ` Adrien Mazarguil
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
                           ` (4 subsequent siblings)
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Ajit Khaparde, Somnath Kotur,
	John Daley, Hyong Youb Kim, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Tomasz Duszynski, Dmitri Epshtein, Natalie Samsonov, Jianbo Liu,
	Andrew Rybchenko, Pascal Mazon

TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
consistent with the normal stacking order of pattern items, which is
confusing to applications.

Problem is that when followed by one of these layers, the EtherType field
of the preceding layer keeps its "inner" definition, and the "outer" TPID
is provided by the subsequent layer, the reverse of how a packet looks like
on the wire:

 Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
 rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]

Worse, when QinQ is involved, the stacking order of VLAN layers is
unspecified. It is unclear whether it should be reversed (innermost to
outermost) as well given TPID applies to the previous layer:

 Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
 rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
 rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]

While specifying EtherType/TPID is hopefully rarely necessary, the stacking
order in case of QinQ and the lack of documentation remain an issue.

This patch replaces TPID in the VLAN pattern item with an inner
EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
clarifies documentation and updates all relevant code.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
items:

- bnxt: EtherType matching is supported with and without VLAN, but TPID
  matching is not and triggers an error.

- e1000: EtherType matching is only supported with the ETHERTYPE filter,
  which does not support VLAN matching, therefore no impact.

- enic: same as bnxt.

- i40e: same as bnxt with existing FDIR limitations on allowed EtherType
  values. The remaining filter types (VXLAN, NVGRE, QINQ) do not support
  EtherType matching.

- ixgbe: same as e1000, with additional minor change to rely on the new
  E-Tag macro definition.

- mlx4: EtherType/TPID matching is not supported, no impact.

- mlx5: same as bnxt.

- mvpp2: same as bnxt.

- sfc: same as bnxt.

- tap: same as bnxt.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Somnath Kotur <somnath.kotur@broadcom.com>
Cc: John Daley <johndale@cisco.com>
Cc: Hyong Youb Kim <hyonkim@cisco.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Tomasz Duszynski <tdu@semihalf.com>
Cc: Dmitri Epshtein <dima@marvell.com>
Cc: Natalie Samsonov <nsamsono@marvell.com>
Cc: Jianbo Liu <jianbo.liu@arm.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>

---

v3 changes:

Updated mrvl to mvpp2.

Moved unrelated default TCI mask update to separate patch.

Fixed sfc according to Andrew's comments [1], which made so much sense that
I standardized on the same behavior for all other PMDs: matching outer TPID
is never supported when a VLAN pattern item is present.

This is done because many devices accept several TPIDs but do not provide
means to match a given one explicitly, it's all or nothing, and that makes
the resulting flow rule inaccurate.

[1] http://dpdk.org/ml/archives/dev/2018-April/095870.html
---
 app/test-pmd/cmdline_flow.c                 | 17 +++----
 doc/guides/nics/tap.rst                     |  2 +-
 doc/guides/prog_guide/rte_flow.rst          | 19 ++++++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
 drivers/net/bnxt/bnxt_filter.c              | 35 +++++++++++---
 drivers/net/enic/enic_flow.c                | 19 +++++---
 drivers/net/i40e/i40e_flow.c                | 60 ++++++++++++++++++++----
 drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
 drivers/net/mlx5/mlx5_flow.c                | 13 ++++-
 drivers/net/mvpp2/mrvl_flow.c               | 26 +++++++---
 drivers/net/sfc/sfc_flow.c                  | 18 +++++++
 drivers/net/tap/tap_flow.c                  | 14 ++++--
 lib/librte_ether/rte_flow.h                 | 22 ++++++---
 lib/librte_net/rte_ether.h                  |  1 +
 14 files changed, 198 insertions(+), 55 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 976fde7cd..f8f2a559e 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -99,11 +99,11 @@ enum index {
 	ITEM_ETH_SRC,
 	ITEM_ETH_TYPE,
 	ITEM_VLAN,
-	ITEM_VLAN_TPID,
 	ITEM_VLAN_TCI,
 	ITEM_VLAN_PCP,
 	ITEM_VLAN_DEI,
 	ITEM_VLAN_VID,
+	ITEM_VLAN_INNER_TYPE,
 	ITEM_IPV4,
 	ITEM_IPV4_TOS,
 	ITEM_IPV4_TTL,
@@ -505,11 +505,11 @@ static const enum index item_eth[] = {
 };
 
 static const enum index item_vlan[] = {
-	ITEM_VLAN_TPID,
 	ITEM_VLAN_TCI,
 	ITEM_VLAN_PCP,
 	ITEM_VLAN_DEI,
 	ITEM_VLAN_VID,
+	ITEM_VLAN_INNER_TYPE,
 	ITEM_NEXT,
 	ZERO,
 };
@@ -1142,12 +1142,6 @@ static const struct token token_list[] = {
 		.next = NEXT(item_vlan),
 		.call = parse_vc,
 	},
-	[ITEM_VLAN_TPID] = {
-		.name = "tpid",
-		.help = "tag protocol identifier",
-		.next = NEXT(item_vlan, NEXT_ENTRY(UNSIGNED), item_param),
-		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vlan, tpid)),
-	},
 	[ITEM_VLAN_TCI] = {
 		.name = "tci",
 		.help = "tag control information",
@@ -1175,6 +1169,13 @@ static const struct token token_list[] = {
 		.args = ARGS(ARGS_ENTRY_MASK_HTON(struct rte_flow_item_vlan,
 						  tci, "\x0f\xff")),
 	},
+	[ITEM_VLAN_INNER_TYPE] = {
+		.name = "inner_type",
+		.help = "inner EtherType",
+		.next = NEXT(item_vlan, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vlan,
+					     inner_type)),
+	},
 	[ITEM_IPV4] = {
 		.name = "ipv4",
 		.help = "match IPv4 header",
diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index c97786aca..3f7a15147 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -108,7 +108,7 @@ The kernel support can be checked with this command::
 Supported items:
 
 - eth: src and dst (with variable masks), and eth_type (0xffff mask).
-- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9)
+- vlan: vid, pcp, but not eid. (requires kernel 4.9)
 - ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
 - udp/tcp: src and dst port (0xffff) mask.
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 1a09e8a0f..fd317b48c 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -784,9 +784,15 @@ Item: ``ETH``
 
 Matches an Ethernet header.
 
+The ``type`` field either stands for "EtherType" or "TPID" when followed by
+so-called layer 2.5 pattern items such as ``RTE_FLOW_ITEM_TYPE_VLAN``. In
+the latter case, ``type`` refers to that of the outer header, with the inner
+EtherType/TPID provided by the subsequent pattern item. This is the same
+order as on the wire.
+
 - ``dst``: destination MAC.
 - ``src``: source MAC.
-- ``type``: EtherType.
+- ``type``: EtherType or TPID.
 - Default ``mask`` matches destination and source addresses only.
 
 Item: ``VLAN``
@@ -794,8 +800,12 @@ Item: ``VLAN``
 
 Matches an 802.1Q/ad VLAN tag.
 
-- ``tpid``: tag protocol identifier.
+The corresponding standard outer EtherType (TPID) values are
+``ETHER_TYPE_VLAN`` or ``ETHER_TYPE_QINQ``. It can be overridden by the
+preceding pattern item.
+
 - ``tci``: tag control information.
+- ``inner_type``: inner EtherType or TPID.
 - Default ``mask`` matches TCI only.
 
 Item: ``IPV4``
@@ -866,12 +876,15 @@ Item: ``E_TAG``
 
 Matches an IEEE 802.1BR E-Tag header.
 
-- ``tpid``: tag protocol identifier (0x893F)
+The corresponding standard outer EtherType (TPID) value is
+``ETHER_TYPE_ETAG``. It can be overridden by the preceding pattern item.
+
 - ``epcp_edei_in_ecid_b``: E-Tag control information (E-TCI), E-PCP (3b),
   E-DEI (1b), ingress E-CID base (12b).
 - ``rsvd_grp_ecid_b``: reserved (2b), GRP (2b), E-CID base (12b).
 - ``in_ecid_e``: ingress E-CID ext.
 - ``ecid_e``: E-CID ext.
+- ``inner_type``: inner EtherType or TPID.
 - Default ``mask`` simultaneously matches GRP and E-CID base.
 
 Item: ``NVGRE``
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 3b1073bfc..923664f7d 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3223,15 +3223,15 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``dst {MAC-48}``: destination MAC.
   - ``src {MAC-48}``: source MAC.
-  - ``type {unsigned}``: EtherType.
+  - ``type {unsigned}``: EtherType or TPID.
 
 - ``vlan``: match 802.1Q/ad VLAN tag.
 
-  - ``tpid {unsigned}``: tag protocol identifier.
   - ``tci {unsigned}``: tag control information.
   - ``pcp {unsigned}``: priority code point.
   - ``dei {unsigned}``: drop eligible indicator.
   - ``vid {unsigned}``: VLAN identifier.
+  - ``inner_type {unsigned}``: inner EtherType or TPID.
 
 - ``ipv4``: match IPv4 header.
 
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index fdd94bf02..25806bdc0 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -307,6 +307,7 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 	uint32_t vf = 0;
 	int use_ntuple;
 	uint32_t en = 0;
+	uint32_t en_ethertype;
 	int dflt_vnic;
 
 	use_ntuple = bnxt_filter_type_check(pattern, error);
@@ -316,6 +317,9 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 
 	filter->filter_type = use_ntuple ?
 		HWRM_CFA_NTUPLE_FILTER : HWRM_CFA_EM_FILTER;
+	en_ethertype = use_ntuple ?
+		NTUPLE_FLTR_ALLOC_INPUT_EN_ETHERTYPE :
+		EM_FLOW_ALLOC_INPUT_EN_ETHERTYPE;
 
 	while (item->type != RTE_FLOW_ITEM_TYPE_END) {
 		if (item->last) {
@@ -385,30 +389,49 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 			if (eth_mask->type) {
 				filter->ethertype =
 					rte_be_to_cpu_16(eth_spec->type);
-				en |= use_ntuple ?
-					NTUPLE_FLTR_ALLOC_INPUT_EN_ETHERTYPE :
-					EM_FLOW_ALLOC_INPUT_EN_ETHERTYPE;
+				en |= en_ethertype;
 			}
 
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
+			if (en & en_ethertype) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "VLAN TPID matching is not"
+						   " supported");
+				return -rte_errno;
+			}
 			if (vlan_mask->tci &&
-			    vlan_mask->tci == RTE_BE16(0x0fff) &&
-			    !vlan_mask->tpid) {
+			    vlan_mask->tci == RTE_BE16(0x0fff)) {
 				/* Only the VLAN ID can be matched. */
 				filter->l2_ovlan =
 					rte_be_to_cpu_16(vlan_spec->tci &
 							 RTE_BE16(0x0fff));
 				en |= EM_FLOW_ALLOC_INPUT_EN_OVLAN_VID;
-			} else if (vlan_mask->tci || vlan_mask->tpid) {
+			} else if (vlan_mask->tci) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
 						   "VLAN mask is invalid");
 				return -rte_errno;
 			}
+			if (vlan_mask->inner_type &&
+			    vlan_mask->inner_type != RTE_BE16(0xffff)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "inner ethertype mask not"
+						   " valid");
+				return -rte_errno;
+			}
+			if (vlan_mask->inner_type) {
+				filter->ethertype =
+					rte_be_to_cpu_16(vlan_spec->inner_type);
+				en |= en_ethertype;
+			}
 
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV4:
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index c34ae84d1..eea14ee73 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -557,16 +557,21 @@ enic_copy_item_vlan_v2(const struct rte_flow_item *item,
 	if (!spec)
 		return 0;
 
-	/* Don't support filtering in tpid */
-	if (mask) {
-		if (mask->tpid != 0)
-			return ENOTSUP;
-	} else {
+	if (!mask)
 		mask = &rte_flow_item_vlan_mask;
-		RTE_ASSERT(mask->tpid == 0);
-	}
 
 	if (*inner_ofst == 0) {
+		struct ether_hdr *eth_mask =
+			(void *)gp->layer[FILTER_GENERIC_1_L2].mask;
+		struct ether_hdr *eth_val =
+			(void *)gp->layer[FILTER_GENERIC_1_L2].val;
+
+		/* Outer TPID cannot be matched */
+		if (eth_mask->ether_type)
+			return ENOTSUP;
+		eth_mask->ether_type = mask->inner_type;
+		eth_val->ether_type = spec->inner_type;
+
 		/* Outer header. Use the vlan mask/val fields */
 		gp->mask_vlan = mask->tci;
 		gp->val_vlan = spec->tci;
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index db668835d..470ab93d6 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -10,6 +10,7 @@
 #include <unistd.h>
 #include <stdarg.h>
 
+#include <rte_debug.h>
 #include <rte_ether.h>
 #include <rte_ethdev_driver.h>
 #include <rte_log.h>
@@ -2491,16 +2492,22 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						      "Invalid MAC_addr mask.");
 					return -rte_errno;
 				}
+			}
+			if (eth_spec && eth_mask && eth_mask->type) {
+				enum rte_flow_item_type next = (item + 1)->type;
 
-				if ((eth_mask->type & UINT16_MAX) ==
-				    UINT16_MAX) {
-					input_set |= I40E_INSET_LAST_ETHER_TYPE;
-					filter->input.flow.l2_flow.ether_type =
-						eth_spec->type;
+				if (eth_mask->type != RTE_BE16(0xffff)) {
+					rte_flow_error_set(error, EINVAL,
+						      RTE_FLOW_ERROR_TYPE_ITEM,
+						      item,
+						      "Invalid type mask.");
+					return -rte_errno;
 				}
 
 				ether_type = rte_be_to_cpu_16(eth_spec->type);
-				if (ether_type == ETHER_TYPE_IPv4 ||
+
+				if (next == RTE_FLOW_ITEM_TYPE_VLAN ||
+				    ether_type == ETHER_TYPE_IPv4 ||
 				    ether_type == ETHER_TYPE_IPv6 ||
 				    ether_type == ETHER_TYPE_ARP ||
 				    ether_type == outer_tpid) {
@@ -2510,6 +2517,9 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						     "Unsupported ether_type.");
 					return -rte_errno;
 				}
+				input_set |= I40E_INSET_LAST_ETHER_TYPE;
+				filter->input.flow.l2_flow.ether_type =
+					eth_spec->type;
 			}
 
 			pctype = I40E_FILTER_PCTYPE_L2_PAYLOAD;
@@ -2519,6 +2529,8 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
+
+			RTE_ASSERT(!(input_set & I40E_INSET_LAST_ETHER_TYPE));
 			if (vlan_spec && vlan_mask) {
 				if (vlan_mask->tci ==
 				    rte_cpu_to_be_16(I40E_TCI_MASK)) {
@@ -2527,6 +2539,33 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						vlan_spec->tci;
 				}
 			}
+			if (vlan_spec && vlan_mask && vlan_mask->inner_type) {
+				if (vlan_mask->inner_type != RTE_BE16(0xffff)) {
+					rte_flow_error_set(error, EINVAL,
+						      RTE_FLOW_ERROR_TYPE_ITEM,
+						      item,
+						      "Invalid inner_type"
+						      " mask.");
+					return -rte_errno;
+				}
+
+				ether_type =
+					rte_be_to_cpu_16(vlan_spec->inner_type);
+
+				if (ether_type == ETHER_TYPE_IPv4 ||
+				    ether_type == ETHER_TYPE_IPv6 ||
+				    ether_type == ETHER_TYPE_ARP ||
+				    ether_type == outer_tpid) {
+					rte_flow_error_set(error, EINVAL,
+						     RTE_FLOW_ERROR_TYPE_ITEM,
+						     item,
+						     "Unsupported inner_type.");
+					return -rte_errno;
+				}
+				input_set |= I40E_INSET_LAST_ETHER_TYPE;
+				filter->input.flow.l2_flow.ether_type =
+					vlan_spec->inner_type;
+			}
 
 			pctype = I40E_FILTER_PCTYPE_L2_PAYLOAD;
 			layer_idx = I40E_FLXPLD_L2_IDX;
@@ -3285,7 +3324,8 @@ i40e_flow_parse_vxlan_pattern(__rte_unused struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
@@ -3515,7 +3555,8 @@ i40e_flow_parse_nvgre_pattern(__rte_unused struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
@@ -4023,7 +4064,8 @@ i40e_flow_parse_qinq_pattern(__rte_unused struct rte_eth_dev *dev,
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
 
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 					   RTE_FLOW_ERROR_TYPE_ITEM,
 					   item,
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index eaf1aadef..67fbbdc24 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -115,7 +115,6 @@
 
 #define IXGBE_VT_CTL_POOLING_MODE_MASK         0x00030000
 #define IXGBE_VT_CTL_POOLING_MODE_ETAG         0x00010000
-#define DEFAULT_ETAG_ETYPE                     0x893f
 #define IXGBE_ETAG_ETYPE                       0x00005084
 #define IXGBE_ETAG_ETYPE_MASK                  0x0000ffff
 #define IXGBE_ETAG_ETYPE_VALID                 0x80000000
@@ -1481,7 +1480,7 @@ static int ixgbe_l2_tn_filter_init(struct rte_eth_dev *eth_dev)
 	}
 	l2_tn_info->e_tag_en = FALSE;
 	l2_tn_info->e_tag_fwd_en = FALSE;
-	l2_tn_info->e_tag_ether_type = DEFAULT_ETAG_ETYPE;
+	l2_tn_info->e_tag_ether_type = ETHER_TYPE_ETAG;
 
 	return 0;
 }
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 0026e938a..bcf764b4d 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@
 #endif
 
 #include <rte_common.h>
+#include <rte_ether.h>
 #include <rte_eth_ctrl.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow.h>
@@ -306,6 +307,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.actions = valid_actions,
 		.mask = &(const struct rte_flow_item_vlan){
 			.tci = -1,
+			.inner_type = -1,
 		},
 		.default_mask = &rte_flow_item_vlan_mask,
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
@@ -1285,6 +1287,7 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 	struct mlx5_flow_parse *parser = data->parser;
 	struct ibv_flow_spec_eth *eth;
 	const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
+	const char *msg = "VLAN cannot be empty";
 
 	if (spec) {
 		unsigned int i;
@@ -1306,12 +1309,20 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 			 */
 			if (!eth->mask.vlan_tag)
 				goto error;
+			/* Outer TPID cannot be matched. */
+			if (eth->mask.ether_type) {
+				msg = "VLAN TPID matching is not supported";
+				goto error;
+			}
+			eth->val.ether_type = spec->inner_type;
+			eth->mask.ether_type = mask->inner_type;
+			eth->val.ether_type &= eth->mask.ether_type;
 		}
 		return 0;
 	}
 error:
 	return rte_flow_error_set(data->error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM,
-				  item, "VLAN cannot be empty");
+				  item, msg);
 }
 
 /**
diff --git a/drivers/net/mvpp2/mrvl_flow.c b/drivers/net/mvpp2/mrvl_flow.c
index 8fd4dbfb1..6478eb2fe 100644
--- a/drivers/net/mvpp2/mrvl_flow.c
+++ b/drivers/net/mvpp2/mrvl_flow.c
@@ -1091,12 +1091,6 @@ mrvl_parse_vlan(const struct rte_flow_item *item,
 	if (ret)
 		return ret;
 
-	if (mask->tpid) {
-		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM,
-				   NULL, "Not supported by classifier\n");
-		return -rte_errno;
-	}
-
 	m = rte_be_to_cpu_16(mask->tci);
 	if (m & MRVL_VLAN_ID_MASK) {
 		RTE_LOG(WARNING, PMD, "vlan id mask is ignored\n");
@@ -1112,6 +1106,26 @@ mrvl_parse_vlan(const struct rte_flow_item *item,
 			goto out;
 	}
 
+	if (flow->pattern & F_TYPE) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "VLAN TPID matching is not supported\n");
+		return -rte_errno;
+	}
+	if (mask->inner_type) {
+		struct rte_flow_item_eth spec_eth = {
+			.type = spec->inner_type,
+		};
+		struct rte_flow_item_eth mask_eth = {
+			.type = mask->inner_type,
+		};
+
+		RTE_LOG(WARNING, PMD, "inner eth type mask is ignored\n");
+		ret = mrvl_parse_type(spec_eth, mask_eth, flow);
+		if (ret)
+			goto out;
+	}
+
 	return 0;
 out:
 	rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 3028efbf9..cd6a61b39 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -7,6 +7,7 @@
  * for Solarflare) and Solarflare Communications, Inc.
  */
 
+#include <rte_byteorder.h>
 #include <rte_tailq.h>
 #include <rte_common.h>
 #include <rte_ethdev_driver.h>
@@ -351,6 +352,7 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
 	const struct rte_flow_item_vlan *mask = NULL;
 	const struct rte_flow_item_vlan supp_mask = {
 		.tci = rte_cpu_to_be_16(ETH_VLAN_ID_MAX),
+		.inner_type = RTE_BE16(0xffff),
 	};
 
 	rc = sfc_flow_parse_init(item,
@@ -393,6 +395,22 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
 		return -rte_errno;
 	}
 
+	if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "VLAN TPID matching is not supported");
+		return -rte_errno;
+	}
+	if (mask->inner_type == supp_mask.inner_type) {
+		efx_spec->efs_match_flags |= EFX_FILTER_MATCH_ETHER_TYPE;
+		efx_spec->efs_ether_type = rte_bswap16(spec->inner_type);
+	} else if (mask->inner_type) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "Bad mask for VLAN inner_type");
+		return -rte_errno;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 7dfaf9ac5..dff09313a 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -270,13 +270,13 @@ static const struct tap_flow_items tap_flow_items[] = {
 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
 			       RTE_FLOW_ITEM_TYPE_IPV6),
 		.mask = &(const struct rte_flow_item_vlan){
-			.tpid = -1,
 			/* DEI matching is not supported */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 			.tci = 0xffef,
 #else
 			.tci = 0xefff,
 #endif
+			.inner_type = -1,
 		},
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
 		.default_mask = &rte_flow_item_vlan_mask,
@@ -578,13 +578,19 @@ tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
 	/* use default mask if none provided */
 	if (!mask)
 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
-	/* TC does not support tpid masking. Only accept if exact match. */
-	if (mask->tpid && mask->tpid != 0xffff)
+	/* Outer TPID cannot be matched. */
+	if (info->eth_type)
 		return -1;
 	/* Double-tagging not supported. */
-	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+	if (info->vlan)
 		return -1;
 	info->vlan = 1;
+	if (mask->inner_type) {
+		/* TC does not support partial eth_type masking */
+		if (mask->inner_type != RTE_BE16(0xffff))
+			return -1;
+		info->eth_type = spec->inner_type;
+	}
 	if (!flow)
 		return 0;
 	msg = &flow->msg;
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index d0ff26aa3..8e50384d0 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -454,11 +454,17 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
  * RTE_FLOW_ITEM_TYPE_ETH
  *
  * Matches an Ethernet header.
+ *
+ * The @p type field either stands for "EtherType" or "TPID" when followed
+ * by so-called layer 2.5 pattern items such as RTE_FLOW_ITEM_TYPE_VLAN. In
+ * the latter case, @p type refers to that of the outer header, with the
+ * inner EtherType/TPID provided by the subsequent pattern item. This is the
+ * same order as on the wire.
  */
 struct rte_flow_item_eth {
 	struct ether_addr dst; /**< Destination MAC. */
 	struct ether_addr src; /**< Source MAC. */
-	rte_be16_t type; /**< EtherType. */
+	rte_be16_t type; /**< EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_ETH. */
@@ -475,19 +481,20 @@ static const struct rte_flow_item_eth rte_flow_item_eth_mask = {
  *
  * Matches an 802.1Q/ad VLAN tag.
  *
- * This type normally follows either RTE_FLOW_ITEM_TYPE_ETH or
- * RTE_FLOW_ITEM_TYPE_VLAN.
+ * The corresponding standard outer EtherType (TPID) values are
+ * ETHER_TYPE_VLAN or ETHER_TYPE_QINQ. It can be overridden by the preceding
+ * pattern item.
  */
 struct rte_flow_item_vlan {
-	rte_be16_t tpid; /**< Tag protocol identifier. */
 	rte_be16_t tci; /**< Tag control information. */
+	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_VLAN. */
 #ifndef __cplusplus
 static const struct rte_flow_item_vlan rte_flow_item_vlan_mask = {
-	.tpid = RTE_BE16(0x0000),
 	.tci = RTE_BE16(0xffff),
+	.inner_type = RTE_BE16(0x0000),
 };
 #endif
 
@@ -636,9 +643,11 @@ static const struct rte_flow_item_vxlan rte_flow_item_vxlan_mask = {
  * RTE_FLOW_ITEM_TYPE_E_TAG.
  *
  * Matches a E-tag header.
+ *
+ * The corresponding standard outer EtherType (TPID) value is
+ * ETHER_TYPE_ETAG. It can be overridden by the preceding pattern item.
  */
 struct rte_flow_item_e_tag {
-	rte_be16_t tpid; /**< Tag protocol identifier (0x893F). */
 	/**
 	 * E-Tag control information (E-TCI).
 	 * E-PCP (3b), E-DEI (1b), ingress E-CID base (12b).
@@ -648,6 +657,7 @@ struct rte_flow_item_e_tag {
 	rte_be16_t rsvd_grp_ecid_b;
 	uint8_t in_ecid_e; /**< Ingress E-CID ext. */
 	uint8_t ecid_e; /**< E-CID ext. */
+	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_E_TAG. */
diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h
index 45daa911a..a271d1c86 100644
--- a/lib/librte_net/rte_ether.h
+++ b/lib/librte_net/rte_ether.h
@@ -301,6 +301,7 @@ struct vxlan_hdr {
 #define ETHER_TYPE_RARP 0x8035 /**< Reverse Arp Protocol. */
 #define ETHER_TYPE_VLAN 0x8100 /**< IEEE 802.1Q VLAN tagging. */
 #define ETHER_TYPE_QINQ 0x88A8 /**< IEEE 802.1ad QinQ tagging. */
+#define ETHER_TYPE_ETAG 0x893F /**< IEEE 802.1BR E-Tag. */
 #define ETHER_TYPE_1588 0x88F7 /**< IEEE 802.1AS 1588 Precise Time Protocol. */
 #define ETHER_TYPE_SLOW 0x8809 /**< Slow protocols (LACP and Marker). */
 #define ETHER_TYPE_TEB  0x6558 /**< Transparent Ethernet Bridging. */
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v5 09/16] ethdev: add encap level to RSS flow API action
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                           ` (5 preceding siblings ...)
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
@ 2018-04-19 10:16  3%         ` Adrien Mazarguil
  2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
                           ` (5 subsequent siblings)
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

RSS hash types (ETH_RSS_* macros defined in rte_ethdev.h) describe the
protocol header fields of a packet that must be taken into account while
computing RSS.

When facing encapsulated (e.g. tunneled) packets, there is an ambiguity as
to whether these should apply to inner or outer packets. Applications need
the ability to tell exactly "where" RSS must be performed.

This is addressed by adding encapsulation level information to the RSS flow
action. Its default value is 0 and stands for the usual unspecified
behavior. Other values provide a specific encapsulation level.

Contrary to the change announced by commit 676b605182a5 ("doc: announce
ethdev API change for RSS configuration"), this patch does not affect
struct rte_eth_rss_conf but struct rte_flow_action_rss as the former is not
used anymore by the RSS flow action. ABI impact is therefore limited to
rte_flow.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 13 ++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 24 ++++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 ++
 drivers/net/e1000/igb_flow.c                |  4 ++++
 drivers/net/e1000/igb_rxtx.c                |  2 ++
 drivers/net/i40e/i40e_ethdev.c              |  2 ++
 drivers/net/i40e/i40e_flow.c                |  4 ++++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  2 ++
 drivers/net/mlx4/mlx4_flow.c                |  6 ++++++
 drivers/net/mlx5/mlx5_flow.c                | 11 ++++++++++
 drivers/net/sfc/sfc_flow.c                  |  3 +++
 drivers/net/tap/tap_flow.c                  |  6 +++++-
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 26 ++++++++++++++++++++++++
 16 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 7436e0356..976fde7cd 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -167,6 +167,7 @@ enum index {
 	ACTION_COUNT,
 	ACTION_RSS,
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_FUNC_DEFAULT,
 	ACTION_RSS_FUNC_TOEPLITZ,
 	ACTION_RSS_FUNC_SIMPLE_XOR,
@@ -638,6 +639,7 @@ static const enum index action_queue[] = {
 
 static const enum index action_rss[] = {
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -1616,6 +1618,16 @@ static const struct token token_list[] = {
 		.help = "simple XOR hash function",
 		.call = parse_vc_action_rss_func,
 	},
+	[ACTION_RSS_LEVEL] = {
+		.name = "level",
+		.help = "encapsulation level for \"types\"",
+		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, level),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     level))),
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "specific RSS hash types",
@@ -2107,6 +2119,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
 			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+			.level = 0,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 19e27a6ca..562fb2f8d 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1101,6 +1101,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index e0c68495c..1a09e8a0f 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1311,6 +1311,28 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
+Also, regarding packet encapsulation ``level``:
+
+- ``0`` requests the default behavior. Depending on the packet type, it can
+  mean outermost, innermost, anything in between or even no RSS.
+
+  It basically stands for the innermost encapsulation level RSS can be
+  performed on according to PMD and device capabilities.
+
+- ``1`` requests RSS to be performed on the outermost packet encapsulation
+  level.
+
+- ``2`` and subsequent values request RSS to be performed on the specified
+   inner packet encapsulation level, from outermost to innermost (lower to
+   higher values).
+
+Values other than ``0`` are not necessarily supported.
+
+Requesting a specific RSS level on unrecognized traffic results in undefined
+behavior. For predictable results, it is recommended to make the flow rule
+pattern match packet headers up to the requested encapsulation level so that
+only matching traffic goes through.
+
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1320,6 +1342,8 @@ field only, both can be requested simultaneously.
    +===============+=============================================+
    | ``func``      | RSS hash function to apply                  |
    +---------------+---------------------------------------------+
+   | ``level``     | encapsulation level for ``types``           |
+   +---------------+---------------------------------------------+
    | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
    +---------------+---------------------------------------------+
    | ``key_len``   | hash key length in bytes                    |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 546ef3ab7..3b1073bfc 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3401,6 +3401,8 @@ This section lists supported actions and their attributes, if any.
   - ``func {hash function}``: RSS hash function to apply, allowed tokens are
     the same as `set_hash_global_config`_.
 
+  - ``level {unsigned}``: encapsulation level for ``types``.
+
   - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
     tokens are the same as `set_hash_input_set`_, except that an empty list
     does not disable RSS but instead requests unspecified "best-effort"
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 82307ec5d..d1c0b4b8d 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1314,6 +1314,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index d5c1cd3d3..a3776a0d7 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2906,6 +2906,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2921,6 +2922,7 @@ igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 5cb852f2c..42002422b 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -12040,6 +12040,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -12055,6 +12056,7 @@ i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 897989bbd..db668835d 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4380,6 +4380,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 00d975b93..438bfcdfb 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2783,6 +2783,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index e91e7f746..2892436e9 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5684,6 +5684,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5699,6 +5700,7 @@ ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 002003235..ce36ac715 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -796,6 +796,11 @@ mlx4_flow_prepare(struct priv *priv,
 					" is Toeplitz";
 				goto exit_action_not_supported;
 			}
+			if (rss->level) {
+				msg = "a nonzero RSS encapsulation level is"
+					" not supported";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1290,6 +1295,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index f9e6779b4..0026e938a 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -644,6 +644,14 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " function is Toeplitz");
 				return -rte_errno;
 			}
+			if (rss->level) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "a nonzero RSS encapsulation"
+						   " level is not supported");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,6 +702,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
 				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+				.level = 0,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1927,6 +1936,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2442,6 +2452,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 779edad0c..3028efbf9 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1269,6 +1269,9 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 		return -EINVAL;
 	}
 
+	if (rss->level)
+		return -EINVAL;
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 845031a31..7dfaf9ac5 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,11 +2055,15 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
-	/* Check supported hash functions */
+	/* Check supported RSS features */
 	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "a nonzero RSS encapsulation level is not supported");
 
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index a2b51f1e0..83b733ff0 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -331,6 +331,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 97d7d3594..d0ff26aa3 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1046,6 +1046,32 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
+	/**
+	 * Packet encapsulation level RSS hash @p types apply to.
+	 *
+	 * - @p 0 requests the default behavior. Depending on the packet
+	 *   type, it can mean outermost, innermost, anything in between or
+	 *   even no RSS.
+	 *
+	 *   It basically stands for the innermost encapsulation level RSS
+	 *   can be performed on according to PMD and device capabilities.
+	 *
+	 * - @p 1 requests RSS to be performed on the outermost packet
+	 *   encapsulation level.
+	 *
+	 * - @p 2 and subsequent values request RSS to be performed on the
+	 *   specified inner packet encapsulation level, from outermost to
+	 *   innermost (lower to higher values).
+	 *
+	 * Values other than @p 0 are not necessarily supported.
+	 *
+	 * Requesting a specific RSS level on unrecognized traffic results
+	 * in undefined behavior. For predictable results, it is recommended
+	 * to make the flow rule pattern match packet headers up to the
+	 * requested encapsulation level so that only matching traffic goes
+	 * through.
+	 */
+	uint32_t level;
 	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v5 08/16] ethdev: add hash function to RSS flow API action
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                           ` (4 preceding siblings ...)
  2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
@ 2018-04-19 10:16  2%         ` Adrien Mazarguil
  2018-04-19 10:16  3%         ` [dpdk-dev] [PATCH v5 09/16] ethdev: add encap level " Adrien Mazarguil
                           ` (6 subsequent siblings)
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

By definition, RSS involves some kind of hash algorithm, usually Toeplitz.

Until now it could not be modified on a flow rule basis and PMDs had to
always assume RTE_ETH_HASH_FUNCTION_DEFAULT, which remains the default
behavior when unspecified (0).

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>

---

v3 changes:

- Although RTE_ETH_HASH_FUNCTION_DEFAULT is defined as 0, made comparisons
  more explicit where doing so would clarify the code.

- Updated sfc to include Toeplitz as the other allowed value.

Both according to Andrew's suggestions [1].

[1] http://dpdk.org/ml/archives/dev/2018-April/095840.html
---
 app/test-pmd/cmdline_flow.c                 | 72 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          |  2 +
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  3 +
 drivers/net/e1000/igb_flow.c                |  4 ++
 drivers/net/e1000/igb_rxtx.c                |  4 +-
 drivers/net/i40e/i40e_ethdev.c              |  4 +-
 drivers/net/i40e/i40e_flow.c                |  4 ++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  4 +-
 drivers/net/mlx4/mlx4_flow.c                |  7 +++
 drivers/net/mlx5/mlx5_flow.c                | 13 +++++
 drivers/net/sfc/sfc_flow.c                  |  8 +++
 drivers/net/tap/tap_flow.c                  |  6 ++
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 |  2 +
 16 files changed, 136 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index c9c2c3ad9..7436e0356 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -14,6 +14,7 @@
 #include <sys/socket.h>
 
 #include <rte_common.h>
+#include <rte_eth_ctrl.h>
 #include <rte_ethdev.h>
 #include <rte_byteorder.h>
 #include <cmdline_parse.h>
@@ -165,6 +166,10 @@ enum index {
 	ACTION_DROP,
 	ACTION_COUNT,
 	ACTION_RSS,
+	ACTION_RSS_FUNC,
+	ACTION_RSS_FUNC_DEFAULT,
+	ACTION_RSS_FUNC_TOEPLITZ,
+	ACTION_RSS_FUNC_SIMPLE_XOR,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
 	ACTION_RSS_KEY,
@@ -632,6 +637,7 @@ static const enum index action_queue[] = {
 };
 
 static const enum index action_rss[] = {
+	ACTION_RSS_FUNC,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -666,6 +672,9 @@ static int parse_vc_conf(struct context *, const struct token *,
 static int parse_vc_action_rss(struct context *, const struct token *,
 			       const char *, unsigned int, void *,
 			       unsigned int);
+static int parse_vc_action_rss_func(struct context *, const struct token *,
+				    const char *, unsigned int, void *,
+				    unsigned int);
 static int parse_vc_action_rss_type(struct context *, const struct token *,
 				    const char *, unsigned int, void *,
 				    unsigned int);
@@ -1584,6 +1593,29 @@ static const struct token token_list[] = {
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
+	[ACTION_RSS_FUNC] = {
+		.name = "func",
+		.help = "RSS hash function to apply",
+		.next = NEXT(action_rss,
+			     NEXT_ENTRY(ACTION_RSS_FUNC_DEFAULT,
+					ACTION_RSS_FUNC_TOEPLITZ,
+					ACTION_RSS_FUNC_SIMPLE_XOR)),
+	},
+	[ACTION_RSS_FUNC_DEFAULT] = {
+		.name = "default",
+		.help = "default hash function",
+		.call = parse_vc_action_rss_func,
+	},
+	[ACTION_RSS_FUNC_TOEPLITZ] = {
+		.name = "toeplitz",
+		.help = "Toeplitz hash function",
+		.call = parse_vc_action_rss_func,
+	},
+	[ACTION_RSS_FUNC_SIMPLE_XOR] = {
+		.name = "simple_xor",
+		.help = "simple XOR hash function",
+		.call = parse_vc_action_rss_func,
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "specific RSS hash types",
@@ -2074,6 +2106,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	action_rss_data = ctx->object;
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
+			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
@@ -2099,6 +2132,45 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 }
 
 /**
+ * Parse func field for RSS action.
+ *
+ * The RTE_ETH_HASH_FUNCTION_* value to assign is derived from the
+ * ACTION_RSS_FUNC_* index that called this function.
+ */
+static int
+parse_vc_action_rss_func(struct context *ctx, const struct token *token,
+			 const char *str, unsigned int len,
+			 void *buf, unsigned int size)
+{
+	struct action_rss_data *action_rss_data;
+	enum rte_eth_hash_function func;
+
+	(void)buf;
+	(void)size;
+	/* Token name must match. */
+	if (parse_default(ctx, token, str, len, NULL, 0) < 0)
+		return -1;
+	switch (ctx->curr) {
+	case ACTION_RSS_FUNC_DEFAULT:
+		func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+		break;
+	case ACTION_RSS_FUNC_TOEPLITZ:
+		func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+		break;
+	case ACTION_RSS_FUNC_SIMPLE_XOR:
+		func = RTE_ETH_HASH_FUNCTION_SIMPLE_XOR;
+		break;
+	default:
+		return -1;
+	}
+	if (!ctx->object)
+		return len;
+	action_rss_data = ctx->object;
+	action_rss_data->conf.func = func;
+	return len;
+}
+
+/**
  * Parse type field for RSS action.
  *
  * Valid tokens are type field names and the "end" token.
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 3da09536a..19e27a6ca 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1100,6 +1100,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
+				.func = src.rss->func,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index cf252eeba..e0c68495c 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1318,6 +1318,8 @@ field only, both can be requested simultaneously.
    +---------------+---------------------------------------------+
    | Field         | Value                                       |
    +===============+=============================================+
+   | ``func``      | RSS hash function to apply                  |
+   +---------------+---------------------------------------------+
    | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
    +---------------+---------------------------------------------+
    | ``key_len``   | hash key length in bytes                    |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 17336d163..546ef3ab7 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3398,6 +3398,9 @@ This section lists supported actions and their attributes, if any.
 
 - ``rss``: spread packets among several queues.
 
+  - ``func {hash function}``: RSS hash function to apply, allowed tokens are
+    the same as `set_hash_global_config`_.
+
   - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
     tokens are the same as `set_hash_input_set`_, except that an empty list
     does not disable RSS but instead requests unspecified "best-effort"
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 8dc5f75f2..82307ec5d 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1310,6 +1310,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 45bb3455c..d5c1cd3d3 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2905,6 +2905,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2919,7 +2920,8 @@ int
 igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index e65235fc3..5cb852f2c 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -12039,6 +12039,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -12053,7 +12054,8 @@ int
 i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index ec6231003..897989bbd 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4376,6 +4376,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	}
 
 	/* Parse RSS related parameters from configuration */
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 4e31c7c56..00d975b93 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2779,6 +2779,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 9fbd7dbd7..e91e7f746 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5683,6 +5683,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5697,7 +5698,8 @@ int
 ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index dd86e4ce7..002003235 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -790,6 +790,12 @@ mlx4_flow_prepare(struct priv *priv,
 					" of the context size";
 				goto exit_action_not_supported;
 			}
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				msg = "the only supported RSS hash function"
+					" is Toeplitz";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1283,6 +1289,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index a289dff73..f9e6779b4 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@
 #endif
 
 #include <rte_common.h>
+#include <rte_eth_ctrl.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
@@ -634,6 +635,15 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			if (overlap & FATE)
 				goto exit_action_overlap;
 			overlap |= FATE;
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "the only supported RSS hash"
+						   " function is Toeplitz");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -683,6 +693,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				}
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
+				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1915,6 +1926,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	/* Copy configuration. */
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2429,6 +2441,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	};
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 1a2c0299c..779edad0c 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1261,6 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 			rxq_hw_index_max = rxq->hw_index;
 	}
 
+	switch (rss->func) {
+	case RTE_ETH_HASH_FUNCTION_DEFAULT:
+	case RTE_ETH_HASH_FUNCTION_TOEPLITZ:
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 67146aaba..845031a31 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,6 +2055,12 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
+	/* Check supported hash functions */
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "non-default RSS hash functions are not supported");
+
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
 	if (err < 0) {
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index cc7819b6a..a2b51f1e0 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -330,6 +330,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
+				.func = src.rss->func,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index bbc408fa6..97d7d3594 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -19,6 +19,7 @@
 
 #include <rte_arp.h>
 #include <rte_ether.h>
+#include <rte_eth_ctrl.h>
 #include <rte_icmp.h>
 #include <rte_ip.h>
 #include <rte_sctp.h>
@@ -1044,6 +1045,7 @@ struct rte_flow_query_count {
  * both can be requested simultaneously.
  */
 struct rte_flow_action_rss {
+	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
 	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v5 07/16] ethdev: flatten RSS configuration in flow API
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                           ` (3 preceding siblings ...)
  2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
@ 2018-04-19 10:16  1%         ` Adrien Mazarguil
  2018-04-23 15:05  0%           ` Nélio Laranjeiro
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
                           ` (7 subsequent siblings)
  12 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon, Radu Nicolau, Akhil Goyal

Since its inception, the rte_flow RSS action has been relying in part on
external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
This structure lacks parameters such as the hash algorithm to use, and more
recently, a method to tell which layer RSS should be performed on [1].

Given struct rte_eth_rss_conf will never be flexible enough to represent a
complete RSS configuration (e.g. RETA table), this patch supersedes it by
extending the rte_flow RSS action directly.

A subsequent patch will add a field to use a non-default RSS hash
algorithm. To that end, a field named "types" replaces the field formerly
known as "rss_hf" and standing for "RSS hash functions" as it was
confusing. Actual RSS hash function types are defined by enum
rte_eth_hash_function.

This patch updates all PMDs and example applications accordingly.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

[1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
Cc: Radu Nicolau <radu.nicolau@intel.com>
Cc: Akhil Goyal <akhil.goyal@nxp.com>

---

v3 changes:

Documentation update regarding the meaning of a 0 value for RSS types in
flow rules.

It used to implicitly mean "no RSS" but is redefined as requesting a kind
of "best-effort" mode from PMDs, i.e. anything ranging from empty to
all-inclusive RSS; what matters is it provides safe defaults that will work
regardless of PMD capabilities.
---
 app/test-pmd/cmdline_flow.c                 |  48 +++---
 app/test-pmd/config.c                       |  39 ++---
 doc/guides/prog_guide/rte_flow.rst          |  28 ++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   6 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  31 ++--
 drivers/net/e1000/igb_rxtx.c                |  51 +++++-
 drivers/net/i40e/i40e_ethdev.c              |  53 +++++--
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                |  47 +++---
 drivers/net/ixgbe/ixgbe_ethdev.c            |   4 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  30 ++--
 drivers/net/ixgbe/ixgbe_rxtx.c              |  51 +++++-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                |  61 +++----
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 193 +++++++++++------------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +--
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +--
 drivers/net/sfc/sfc_flow.c                  |  21 ++-
 drivers/net/tap/tap_flow.c                  |   8 +-
 examples/ipsec-secgw/ipsec.c                |  10 +-
 lib/librte_ether/rte_flow.c                 |  39 ++---
 lib/librte_ether/rte_flow.h                 |  12 +-
 28 files changed, 478 insertions(+), 355 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 798b7948d..c9c2c3ad9 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -192,9 +192,8 @@ enum index {
 /** Storage for struct rte_flow_action_rss including external data. */
 struct action_rss_data {
 	struct rte_flow_action_rss conf;
+	uint8_t key[RSS_HASH_KEY_LENGTH];
 	uint16_t queue[ACTION_RSS_QUEUE_NUM];
-	struct rte_eth_rss_conf rss_conf;
-	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -1587,7 +1586,7 @@ static const struct token token_list[] = {
 	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
-		.help = "RSS hash types",
+		.help = "specific RSS hash types",
 		.next = NEXT(action_rss, NEXT_ENTRY(ACTION_RSS_TYPE)),
 	},
 	[ACTION_RSS_TYPE] = {
@@ -1602,21 +1601,21 @@ static const struct token token_list[] = {
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
 		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
 			     ARGS_ENTRY_ARB
-			     (offsetof(struct action_rss_data, rss_conf) +
-			      offsetof(struct rte_eth_rss_conf, rss_key_len),
-			      sizeof(((struct rte_eth_rss_conf *)0)->
-				     rss_key_len)),
-			     ARGS_ENTRY(struct action_rss_data, rss_key)),
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, key_len),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     key_len)),
+			     ARGS_ENTRY(struct action_rss_data, key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (offsetof(struct action_rss_data, rss_conf) +
-			      offsetof(struct rte_eth_rss_conf, rss_key_len),
-			      sizeof(((struct rte_eth_rss_conf *)0)->
-				     rss_key_len),
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, key_len),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     key_len),
 			      0,
 			      RSS_HASH_KEY_LENGTH)),
 	},
@@ -2075,27 +2074,24 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	action_rss_data = ctx->object;
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->rss_conf,
-			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.types = rss_hf,
+			.key_len = sizeof(action_rss_data->key),
+			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.key = action_rss_data->key,
 			.queue = action_rss_data->queue,
 		},
+		.key = "testpmd's default RSS hash key",
 		.queue = { 0 },
-		.rss_conf = (struct rte_eth_rss_conf){
-			.rss_key = action_rss_data->rss_key,
-			.rss_key_len = sizeof(action_rss_data->rss_key),
-			.rss_hf = rss_hf,
-		},
-		.rss_key = "testpmd's default RSS hash key",
 	};
-	for (i = 0; i < action_rss_data->conf.num; ++i)
+	for (i = 0; i < action_rss_data->conf.queue_num; ++i)
 		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
 		struct rte_eth_dev_info info;
 
 		rte_eth_dev_info_get(ctx->port, &info);
-		action_rss_data->rss_conf.rss_key_len =
-			RTE_MIN(sizeof(action_rss_data->rss_key),
+		action_rss_data->conf.key_len =
+			RTE_MIN(sizeof(action_rss_data->key),
 				info.hash_key_size);
 	}
 	action->conf = &action_rss_data->conf;
@@ -2123,7 +2119,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->rss_conf.rss_hf = 0;
+		action_rss_data->conf.types = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2142,7 +2138,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->conf.types |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2192,7 +2188,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue_num = i;
 	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 95618e4eb..3da09536a 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1100,40 +1100,27 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
-				.num = src.rss->num,
+				.types = src.rss->types,
+				.key_len = src.rss->key_len,
+				.queue_num = src.rss->queue_num,
 			};
 		off += sizeof(*src.rss);
-		if (src.rss->num) {
+		if (src.rss->key_len) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->queue) * src.rss->num;
+			size = sizeof(*src.rss->key) * src.rss->key_len;
 			if (dst.rss)
-				dst.rss->queue = memcpy
+				dst.rss->key = memcpy
 					((void *)((uintptr_t)dst.rss + off),
-					 src.rss->queue, size);
+					 src.rss->key, size);
 			off += size;
 		}
-		off = RTE_ALIGN_CEIL(off, sizeof(double));
-		if (dst.rss) {
-			dst.rss->rss_conf = (void *)((uintptr_t)dst.rss + off);
-			*(struct rte_eth_rss_conf *)(uintptr_t)
-				dst.rss->rss_conf = (struct rte_eth_rss_conf){
-				.rss_key_len = src.rss->rss_conf->rss_key_len,
-				.rss_hf = src.rss->rss_conf->rss_hf,
-			};
-		}
-		off += sizeof(*src.rss->rss_conf);
-		if (src.rss->rss_conf->rss_key_len) {
+		if (src.rss->queue_num) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->rss_conf->rss_key) *
-				src.rss->rss_conf->rss_key_len;
-			if (dst.rss) {
-				((struct rte_eth_rss_conf *)(uintptr_t)
-				 dst.rss->rss_conf)->rss_key =
-					(void *)((uintptr_t)dst.rss + off);
-				memcpy(dst.rss->rss_conf->rss_key,
-				       src.rss->rss_conf->rss_key,
-				       size);
-			}
+			size = sizeof(*src.rss->queue) * src.rss->queue_num;
+			if (dst.rss)
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		size = off;
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index acbeaacbd..cf252eeba 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1301,6 +1301,12 @@ Action: ``RSS``
 Similar to QUEUE, except RSS is additionally performed on packets to spread
 them among several queues according to the provided parameters.
 
+Unlike global RSS settings used by other DPDK APIs, unsetting the ``types``
+field does not disable RSS in a flow rule. Doing so instead requests safe
+unspecified "best-effort" settings from the underlying PMD, which depending
+on the flow rule, may result in anything ranging from empty (single queue)
+to all-inclusive RSS.
+
 Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
@@ -1309,15 +1315,19 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+--------------------------------+
-   | Field        | Value                          |
-   +==============+================================+
-   | ``rss_conf`` | RSS parameters                 |
-   +--------------+--------------------------------+
-   | ``num``      | number of entries in ``queue`` |
-   +--------------+--------------------------------+
-   | ``queue``    | queue indices to use           |
-   +--------------+--------------------------------+
+   +---------------+---------------------------------------------+
+   | Field         | Value                                       |
+   +===============+=============================================+
+   | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
+   +---------------+---------------------------------------------+
+   | ``key_len``   | hash key length in bytes                    |
+   +---------------+---------------------------------------------+
+   | ``queue_num`` | number of entries in ``queue``              |
+   +---------------+---------------------------------------------+
+   | ``key``       | hash key                                    |
+   +---------------+---------------------------------------------+
+   | ``queue``     | queue indices to use                        |
+   +---------------+---------------------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index a015d02a4..17336d163 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3398,8 +3398,10 @@ This section lists supported actions and their attributes, if any.
 
 - ``rss``: spread packets among several queues.
 
-  - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
-    are the same as `set_hash_input_set`_, an empty list means none (0).
+  - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
+    tokens are the same as `set_hash_input_set`_, except that an empty list
+    does not disable RSS but instead requests unspecified "best-effort"
+    settings.
 
   - ``key {string}``: RSS hash key, overrides ``key_len``.
 
diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 6354b894a..902001f36 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -4,6 +4,10 @@
 
 #ifndef _E1000_ETHDEV_H_
 #define _E1000_ETHDEV_H_
+
+#include <stdint.h>
+
+#include <rte_flow.h>
 #include <rte_time.h>
 #include <rte_pci.h>
 
@@ -27,6 +31,7 @@
 #define E1000_CTRL_EXT_EXTEND_VLAN  (1<<26)    /* EXTENDED VLAN */
 #define IGB_VFTA_SIZE 128
 
+#define IGB_HKEY_MAX_INDEX             10
 #define IGB_MAX_RX_QUEUE_NUM           8
 #define IGB_MAX_RX_QUEUE_NUM_82576     16
 
@@ -229,8 +234,8 @@ struct igb_ethertype_filter {
 };
 
 struct igb_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
+	uint8_t key[IGB_HKEY_MAX_INDEX * sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[IGB_MAX_RX_QUEUE_NUM]; /**< Queues indices to use. */
 };
 
@@ -501,6 +506,10 @@ int eth_igb_syn_filter_set(struct rte_eth_dev *dev,
 int eth_igb_add_del_flex_filter(struct rte_eth_dev *dev,
 			struct rte_eth_flex_filter *filter,
 			bool add);
+int igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
+		      const struct rte_flow_action_rss *in);
+int igb_action_rss_same(const struct rte_flow_action_rss *comp,
+			const struct rte_flow_action_rss *with);
 int igb_config_rss_filter(struct rte_eth_dev *dev,
 			struct igb_rte_flow_rss_conf *conf,
 			bool add);
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 9b808a982..7e9935b7e 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -41,8 +41,6 @@
 #define IGB_DEFAULT_TX_HTHRESH      1
 #define IGB_DEFAULT_TX_WTHRESH      ((hw->mac.type == e1000_82576) ? 1 : 16)
 
-#define IGB_HKEY_MAX_INDEX 10
-
 /* Bit shift and mask */
 #define IGB_4_BIT_WIDTH  (CHAR_BIT / 2)
 #define IGB_4_BIT_MASK   RTE_LEN2MASK(IGB_4_BIT_WIDTH, uint8_t)
@@ -5576,7 +5574,7 @@ igb_rss_filter_restore(struct rte_eth_dev *dev)
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		igb_config_rss_filter(dev, &filter_info->rss_info, TRUE);
 }
 
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index c0f5b5190..8dc5f75f2 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1292,7 +1292,7 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 
 	rss = (const struct rte_flow_action_rss *)act->conf;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -1300,7 +1300,7 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -1310,14 +1310,18 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
-	if (rss->rss_conf)
-		rss_conf->rss_conf = *rss->rss_conf;
-	else
-		rss_conf->rss_conf.rss_hf = IGB_RSS_OFFLOAD_ALL;
-
-	for (n = 0; n < rss->num; ++n)
-		rss_conf->queue[n] = rss->queue[n];
-	rss_conf->num = rss->num;
+	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key must be exactly 40 bytes");
+	if (rss->queue_num > RTE_DIM(rss_conf->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (igb_rss_conf_init(rss_conf, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
 	/* check if the next not void item is END */
 	index++;
@@ -1518,9 +1522,8 @@ igb_flow_create(struct rte_eth_dev *dev,
 				PMD_DRV_LOG(ERR, "failed to allocate memory");
 				goto out;
 			}
-			rte_memcpy(&rss_filter_ptr->filter_info,
-				&rss_conf,
-				sizeof(struct igb_rte_flow_rss_conf));
+			igb_rss_conf_init(&rss_filter_ptr->filter_info,
+					  &rss_conf.conf);
 			TAILQ_INSERT_TAIL(&igb_filter_rss_list,
 				rss_filter_ptr, entries);
 			flow->rule = rss_filter_ptr;
@@ -1757,7 +1760,7 @@ igb_clear_rss_filter(struct rte_eth_dev *dev)
 	struct e1000_filter_info *filter =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter->rss_info.num)
+	if (filter->rss_info.conf.queue_num)
 		igb_config_rss_filter(dev, &filter->rss_info, FALSE);
 }
 
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 323913f0d..45bb3455c 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2898,12 +2898,47 @@ igb_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 }
 
 int
+igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
+		  const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+igb_action_rss_same(const struct rte_flow_action_rss *comp,
+		    const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 igb_config_rss_filter(struct rte_eth_dev *dev,
 		struct igb_rte_flow_rss_conf *conf, bool add)
 {
 	uint32_t shift;
 	uint16_t i, j;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
@@ -2911,8 +2946,8 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
 	if (!add) {
-		if (memcmp(conf, &filter_info->rss_info,
-			sizeof(struct igb_rte_flow_rss_conf)) == 0) {
+		if (igb_action_rss_same(&filter_info->rss_info.conf,
+					&conf->conf)) {
 			igb_rss_disable(dev);
 			memset(&filter_info->rss_info, 0,
 				sizeof(struct igb_rte_flow_rss_conf));
@@ -2921,7 +2956,7 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		return -EINVAL;
 
 	/* Fill in redirection table. */
@@ -2933,9 +2968,9 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		} reta;
 		uint8_t q_idx;
 
-		if (j == conf->num)
+		if (j == conf->conf.queue_num)
 			j = 0;
-		q_idx = conf->queue[j];
+		q_idx = conf->conf.queue[j];
 		reta.bytes[i & 3] = (uint8_t)(q_idx << shift);
 		if ((i & 3) == 3)
 			E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
@@ -2952,8 +2987,8 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		rss_conf.rss_key = rss_intel_key; /* Default hash key */
 	igb_hw_rss_hash_set(hw, &rss_conf);
 
-	rte_memcpy(&filter_info->rss_info,
-		conf, sizeof(struct igb_rte_flow_rss_conf));
+	if (igb_rss_conf_init(&filter_info->rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 180ac7449..e65235fc3 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11,6 +11,7 @@
 #include <inttypes.h>
 #include <assert.h>
 
+#include <rte_common.h>
 #include <rte_eal.h>
 #include <rte_string_fns.h>
 #include <rte_pci.h>
@@ -11499,7 +11500,7 @@ i40e_rss_filter_restore(struct i40e_pf *pf)
 {
 	struct i40e_rte_flow_rss_conf *conf =
 					&pf->rss_info;
-	if (conf->num)
+	if (conf->conf.queue_num)
 		i40e_config_rss_filter(pf, conf, TRUE);
 }
 
@@ -12031,18 +12032,52 @@ i40e_cloud_filter_qinq_create(struct i40e_pf *pf)
 }
 
 int
+i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
+		   const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+i40e_action_rss_same(const struct rte_flow_action_rss *comp,
+		     const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 i40e_config_rss_filter(struct i40e_pf *pf,
 		struct i40e_rte_flow_rss_conf *conf, bool add)
 {
 	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
 	uint32_t i, lut = 0;
 	uint16_t j, num;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct i40e_rte_flow_rss_conf *rss_info = &pf->rss_info;
 
 	if (!add) {
-		if (memcmp(conf, rss_info,
-			sizeof(struct i40e_rte_flow_rss_conf)) == 0) {
+		if (i40e_action_rss_same(&rss_info->conf, &conf->conf)) {
 			i40e_pf_disable_rss(pf);
 			memset(rss_info, 0,
 				sizeof(struct i40e_rte_flow_rss_conf));
@@ -12051,7 +12086,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 		return -EINVAL;
 	}
 
-	if (rss_info->num)
+	if (rss_info->conf.queue_num)
 		return -EINVAL;
 
 	/* If both VMDQ and RSS enabled, not all of PF queues are configured.
@@ -12062,7 +12097,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 	else
 		num = pf->dev_data->nb_rx_queues;
 
-	num = RTE_MIN(num, conf->num);
+	num = RTE_MIN(num, conf->conf.queue_num);
 	PMD_DRV_LOG(INFO, "Max of contiguous %u PF queues are configured",
 			num);
 
@@ -12075,7 +12110,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 	for (i = 0, j = 0; i < hw->func_caps.rss_table_size; i++, j++) {
 		if (j == num)
 			j = 0;
-		lut = (lut << 8) | (conf->queue[j] & ((0x1 <<
+		lut = (lut << 8) | (conf->conf.queue[j] & ((0x1 <<
 			hw->func_caps.rss_table_entry_width) - 1));
 		if ((i & 3) == 3)
 			I40E_WRITE_REG(hw, I40E_PFQF_HLUT(i >> 2), lut);
@@ -12100,8 +12135,8 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 
 	i40e_hw_rss_hash_set(pf, &rss_conf);
 
-	rte_memcpy(rss_info,
-		conf, sizeof(struct i40e_rte_flow_rss_conf));
+	if (i40e_rss_conf_init(rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index d33b255e7..a0569d4ae 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -5,14 +5,19 @@
 #ifndef _I40E_ETHDEV_H_
 #define _I40E_ETHDEV_H_
 
+#include <stdint.h>
+
 #include <rte_eth_ctrl.h>
 #include <rte_time.h>
 #include <rte_kvargs.h>
 #include <rte_hash.h>
+#include <rte_flow.h>
 #include <rte_flow_driver.h>
 #include <rte_tm_driver.h>
 #include "rte_pmd_i40e.h"
 
+#include "base/i40e_register.h"
+
 #define I40E_VLAN_TAG_SIZE        4
 
 #define I40E_AQ_LEN               32
@@ -878,9 +883,11 @@ struct i40e_customized_pctype {
 };
 
 struct i40e_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
 	uint16_t queue_region_conf; /**< Queue region config flag */
-	uint16_t num; /**< Number of entries in queue[]. */
+	uint8_t key[(I40E_VFQF_HKEY_MAX_INDEX > I40E_PFQF_HKEY_MAX_INDEX ?
+		     I40E_VFQF_HKEY_MAX_INDEX : I40E_PFQF_HKEY_MAX_INDEX) + 1 *
+		    sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[I40E_MAX_Q_PER_TC]; /**< Queues indices to use. */
 };
 
@@ -1219,6 +1226,10 @@ void i40e_init_queue_region_conf(struct rte_eth_dev *dev);
 void i40e_flex_payload_reg_set_default(struct i40e_hw *hw);
 int i40e_set_rss_key(struct i40e_vsi *vsi, uint8_t *key, uint8_t key_len);
 int i40e_set_rss_lut(struct i40e_vsi *vsi, uint8_t *lut, uint16_t lut_size);
+int i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
+		       const struct rte_flow_action_rss *in);
+int i40e_action_rss_same(const struct rte_flow_action_rss *comp,
+			 const struct rte_flow_action_rss *with);
 int i40e_config_rss_filter(struct i40e_pf *pf,
 		struct i40e_rte_flow_rss_conf *conf, bool add);
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index d6f5e9923..ec6231003 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4220,7 +4220,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 
 	if (action_flag) {
 		for (n = 0; n < 64; n++) {
-			if (rss->rss_conf->rss_hf & (hf_bit << n)) {
+			if (rss->types & (hf_bit << n)) {
 				conf_info->region[0].hw_flowtype[0] = n;
 				conf_info->region[0].flowtype_num = 1;
 				conf_info->queue_region_number = 1;
@@ -4236,12 +4236,12 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	 * queue index for this port.
 	 */
 	if (conf_info->queue_region_number) {
-		for (i = 0; i < rss->num; i++) {
-			for (j = 0; j < rss_info->num; j++) {
-				if (rss->queue[i] == rss_info->queue[j])
+		for (i = 0; i < rss->queue_num; i++) {
+			for (j = 0; j < rss_info->conf.queue_num; j++) {
+				if (rss->queue[i] == rss_info->conf.queue[j])
 					break;
 			}
-			if (j == rss_info->num) {
+			if (j == rss_info->conf.queue_num) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
 					act,
@@ -4250,7 +4250,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			}
 		}
 
-		for (i = 0; i < rss->num - 1; i++) {
+		for (i = 0; i < rss->queue_num - 1; i++) {
 			if (rss->queue[i + 1] != rss->queue[i] + 1) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
@@ -4265,8 +4265,8 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	for (n = 0; n < conf_info->queue_region_number; n++) {
 		if (conf_info->region[n].user_priority_num ||
 				conf_info->region[n].flowtype_num) {
-			if (!((rte_is_power_of_2(rss->num)) &&
-					rss->num <= 64)) {
+			if (!((rte_is_power_of_2(rss->queue_num)) &&
+					rss->queue_num <= 64)) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
 					act,
@@ -4294,7 +4294,8 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			}
 
 			for (i = 0; i < info->queue_region_number; i++) {
-				if (info->region[i].queue_num == rss->num &&
+				if (info->region[i].queue_num ==
+				    rss->queue_num &&
 					info->region[i].queue_start_index ==
 						rss->queue[0])
 					break;
@@ -4310,7 +4311,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 				}
 
 				info->region[i].queue_num =
-					rss->num;
+					rss->queue_num;
 				info->region[i].queue_start_index =
 					rss->queue[0];
 				info->region[i].region_id =
@@ -4356,7 +4357,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	if (rss_config->queue_region_conf)
 		return 0;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -4364,7 +4365,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -4375,15 +4376,19 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	}
 
 	/* Parse RSS related parameters from configuration */
-	if (rss->rss_conf)
-		rss_config->rss_conf = *rss->rss_conf;
-	else
-		rss_config->rss_conf.rss_hf =
-			pf->adapter->flow_types_mask;
+	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key too large");
+	if (rss->queue_num > RTE_DIM(rss_config->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (i40e_rss_conf_init(rss_config, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
-	for (n = 0; n < rss->num; ++n)
-		rss_config->queue[n] = rss->queue[n];
-	rss_config->num = rss->num;
 	index++;
 
 	/* check if the next not void action is END */
@@ -4903,7 +4908,7 @@ i40e_flow_flush_rss_filter(struct rte_eth_dev *dev)
 
 	ret = i40e_flush_queue_region_all_conf(dev, hw, pf, 0);
 
-	if (rss_info->num)
+	if (rss_info->conf.queue_num)
 		ret = i40e_config_rss_filter(pf, rss_info, FALSE);
 	return ret;
 }
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 33ee52e45..eaf1aadef 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -100,8 +100,6 @@
 
 #define IXGBE_QUEUE_STAT_COUNTERS (sizeof(hw_stats->qprc) / sizeof(hw_stats->qprc[0]))
 
-#define IXGBE_HKEY_MAX_INDEX 10
-
 /* Additional timesync values. */
 #define NSEC_PER_SEC             1000000000L
 #define IXGBE_INCVAL_10GB        0x66666666
@@ -8292,7 +8290,7 @@ ixgbe_rss_filter_restore(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		ixgbe_config_rss_filter(dev,
 			&filter_info->rss_info, TRUE);
 }
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index 655077700..9491b03f4 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -4,6 +4,9 @@
 
 #ifndef _IXGBE_ETHDEV_H_
 #define _IXGBE_ETHDEV_H_
+
+#include <stdint.h>
+
 #include "base/ixgbe_type.h"
 #include "base/ixgbe_dcb.h"
 #include "base/ixgbe_dcb_82599.h"
@@ -12,6 +15,7 @@
 #ifdef RTE_LIBRTE_SECURITY
 #include "ixgbe_ipsec.h"
 #endif
+#include <rte_flow.h>
 #include <rte_time.h>
 #include <rte_hash.h>
 #include <rte_pci.h>
@@ -39,6 +43,7 @@
 #define IXGBE_EXTENDED_VLAN	  (uint32_t)(1 << 26) /* EXTENDED VLAN ENABLE */
 #define IXGBE_VFTA_SIZE 128
 #define IXGBE_VLAN_TAG_SIZE 4
+#define IXGBE_HKEY_MAX_INDEX 10
 #define IXGBE_MAX_RX_QUEUE_NUM	128
 #define IXGBE_MAX_INTR_QUEUE_NUM	15
 #define IXGBE_VMDQ_DCB_NB_QUEUES     IXGBE_MAX_RX_QUEUE_NUM
@@ -196,8 +201,8 @@ struct ixgbe_hw_fdir_info {
 };
 
 struct ixgbe_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
+	uint8_t key[IXGBE_HKEY_MAX_INDEX * sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[IXGBE_MAX_RX_QUEUE_NUM]; /**< Queues indices to use. */
 };
 
@@ -696,6 +701,10 @@ void ixgbe_tm_conf_init(struct rte_eth_dev *dev);
 void ixgbe_tm_conf_uninit(struct rte_eth_dev *dev);
 int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev, uint16_t queue_idx,
 			       uint16_t tx_rate);
+int ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
+			const struct rte_flow_action_rss *in);
+int ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
+			  const struct rte_flow_action_rss *with);
 int ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		struct ixgbe_rte_flow_rss_conf *conf, bool add);
 
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index abdeac28b..4e31c7c56 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2761,7 +2761,7 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 
 	rss = (const struct rte_flow_action_rss *)act->conf;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -2769,7 +2769,7 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -2778,14 +2778,19 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 			return -rte_errno;
 		}
 	}
-	if (rss->rss_conf)
-		rss_conf->rss_conf = *rss->rss_conf;
-	else
-		rss_conf->rss_conf.rss_hf = IXGBE_RSS_OFFLOAD_ALL;
 
-	for (n = 0; n < rss->num; ++n)
-		rss_conf->queue[n] = rss->queue[n];
-	rss_conf->num = rss->num;
+	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key must be exactly 40 bytes");
+	if (rss->queue_num > RTE_DIM(rss_conf->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (ixgbe_rss_conf_init(rss_conf, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
 	/* check if the next not void item is END */
 	act = next_no_void_action(actions, act);
@@ -2834,7 +2839,7 @@ ixgbe_clear_rss_filter(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		ixgbe_config_rss_filter(dev, &filter_info->rss_info, FALSE);
 }
 
@@ -3153,9 +3158,8 @@ ixgbe_flow_create(struct rte_eth_dev *dev,
 				PMD_DRV_LOG(ERR, "failed to allocate memory");
 				goto out;
 			}
-			rte_memcpy(&rss_filter_ptr->filter_info,
-				&rss_conf,
-				sizeof(struct ixgbe_rte_flow_rss_conf));
+			ixgbe_rss_conf_init(&rss_filter_ptr->filter_info,
+					    &rss_conf.conf);
 			TAILQ_INSERT_TAIL(&filter_rss_list,
 				rss_filter_ptr, entries);
 			flow->rule = rss_filter_ptr;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index aed3f5a9a..9fbd7dbd7 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5676,6 +5676,36 @@ ixgbevf_dev_rxtx_start(struct rte_eth_dev *dev)
 }
 
 int
+ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
+		    const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
+		      const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		struct ixgbe_rte_flow_rss_conf *conf, bool add)
 {
@@ -5685,7 +5715,12 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	uint16_t j;
 	uint16_t sp_reta_size;
 	uint32_t reta_reg;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
@@ -5695,8 +5730,8 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	sp_reta_size = ixgbe_reta_size_get(hw->mac.type);
 
 	if (!add) {
-		if (memcmp(conf, &filter_info->rss_info,
-			sizeof(struct ixgbe_rte_flow_rss_conf)) == 0) {
+		if (ixgbe_action_rss_same(&filter_info->rss_info.conf,
+					  &conf->conf)) {
 			ixgbe_rss_disable(dev);
 			memset(&filter_info->rss_info, 0,
 				sizeof(struct ixgbe_rte_flow_rss_conf));
@@ -5705,7 +5740,7 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		return -EINVAL;
 	/* Fill in redirection table
 	 * The byte-swap is needed because NIC registers are in
@@ -5715,9 +5750,9 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	for (i = 0, j = 0; i < sp_reta_size; i++, j++) {
 		reta_reg = ixgbe_reta_reg_get(hw->mac.type, i);
 
-		if (j == conf->num)
+		if (j == conf->conf.queue_num)
 			j = 0;
-		reta = (reta << 8) | conf->queue[j];
+		reta = (reta << 8) | conf->conf.queue[j];
 		if ((i & 3) == 3)
 			IXGBE_WRITE_REG(hw, reta_reg,
 					rte_bswap32(reta));
@@ -5734,8 +5769,8 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		rss_conf.rss_key = rss_intel_key; /* Default hash key */
 	ixgbe_hw_rss_hash_set(hw, &rss_conf);
 
-	rte_memcpy(&filter_info->rss_info,
-		conf, sizeof(struct ixgbe_rte_flow_rss_conf));
+	if (ixgbe_rss_conf_init(&filter_info->rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 06f17703b..970d20dd1 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -569,7 +569,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			     " for UDP RSS and inner VXLAN RSS");
 			/* Fake support for all possible RSS hash fields. */
 			priv->hw_rss_sup = ~UINT64_C(0);
-			priv->hw_rss_sup = mlx4_conv_rss_hf(priv, -1);
+			priv->hw_rss_sup = mlx4_conv_rss_types(priv, -1);
 			/* Filter out known unsupported fields. */
 			priv->hw_rss_sup &=
 				~(uint64_t)(IBV_RX_HASH_SRC_PORT_UDP |
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 8feb6ae31..dd86e4ce7 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -76,22 +76,22 @@ struct mlx4_drop {
 };
 
 /**
- * Convert DPDK RSS hash fields to their Verbs equivalent.
+ * Convert DPDK RSS hash types to their Verbs equivalent.
  *
- * This function returns the supported (default) set when @p rss_hf has
+ * This function returns the supported (default) set when @p types has
  * special value (uint64_t)-1.
  *
  * @param priv
  *   Pointer to private structure.
- * @param rss_hf
- *   Hash fields in DPDK format (see struct rte_eth_rss_conf).
+ * @param types
+ *   Hash types in DPDK format (see struct rte_eth_rss_conf).
  *
  * @return
  *   A valid Verbs RSS hash fields mask for mlx4 on success, (uint64_t)-1
  *   otherwise and rte_errno is set.
  */
 uint64_t
-mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
+mlx4_conv_rss_types(struct priv *priv, uint64_t types)
 {
 	enum { IPV4, IPV6, TCP, UDP, };
 	const uint64_t in[] = {
@@ -126,17 +126,17 @@ mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
 	unsigned int i;
 
 	for (i = 0; i != RTE_DIM(in); ++i)
-		if (rss_hf & in[i]) {
-			seen |= rss_hf & in[i];
+		if (types & in[i]) {
+			seen |= types & in[i];
 			conv |= out[i];
 		}
 	if ((conv & priv->hw_rss_sup) == conv) {
-		if (rss_hf == (uint64_t)-1) {
+		if (types == (uint64_t)-1) {
 			/* Include inner RSS by default if supported. */
 			conv |= priv->hw_rss_sup & IBV_RX_HASH_INNER;
 			return conv;
 		}
-		if (!(rss_hf & ~seen))
+		if (!(types & ~seen))
 			return conv;
 	}
 	rte_errno = ENOTSUP;
@@ -717,7 +717,8 @@ mlx4_flow_prepare(struct priv *priv,
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
-			const struct rte_eth_rss_conf *rss_conf;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
 			uint64_t fields;
 			unsigned int i;
 
@@ -747,58 +748,56 @@ mlx4_flow_prepare(struct priv *priv,
 				break;
 			rss = action->conf;
 			/* Default RSS configuration if none is provided. */
-			rss_conf =
-				rss->rss_conf ?
-				rss->rss_conf :
-				&(struct rte_eth_rss_conf){
-					.rss_key = mlx4_rss_hash_key_default,
-					.rss_key_len = MLX4_RSS_HASH_KEY_SIZE,
-					.rss_hf = -1,
-				};
+			if (rss->key_len) {
+				rss_key = rss->key;
+				rss_key_len = rss->key_len;
+			} else {
+				rss_key = mlx4_rss_hash_key_default;
+				rss_key_len = MLX4_RSS_HASH_KEY_SIZE;
+			}
 			/* Sanity checks. */
-			for (i = 0; i < rss->num; ++i)
+			for (i = 0; i < rss->queue_num; ++i)
 				if (rss->queue[i] >=
 				    priv->dev->data->nb_rx_queues)
 					break;
-			if (i != rss->num) {
+			if (i != rss->queue_num) {
 				msg = "queue index target beyond number of"
 					" configured Rx queues";
 				goto exit_action_not_supported;
 			}
-			if (!rte_is_power_of_2(rss->num)) {
+			if (!rte_is_power_of_2(rss->queue_num)) {
 				msg = "for RSS, mlx4 requires the number of"
 					" queues to be a power of two";
 				goto exit_action_not_supported;
 			}
-			if (rss_conf->rss_key_len !=
-			    sizeof(flow->rss->key)) {
+			if (rss_key_len != sizeof(flow->rss->key)) {
 				msg = "mlx4 supports exactly one RSS hash key"
 					" length: "
 					MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE);
 				goto exit_action_not_supported;
 			}
-			for (i = 1; i < rss->num; ++i)
+			for (i = 1; i < rss->queue_num; ++i)
 				if (rss->queue[i] - rss->queue[i - 1] != 1)
 					break;
-			if (i != rss->num) {
+			if (i != rss->queue_num) {
 				msg = "mlx4 requires RSS contexts to use"
 					" consecutive queue indices only";
 				goto exit_action_not_supported;
 			}
-			if (rss->queue[0] % rss->num) {
+			if (rss->queue[0] % rss->queue_num) {
 				msg = "mlx4 requires the first queue of a RSS"
 					" context to be aligned on a multiple"
 					" of the context size";
 				goto exit_action_not_supported;
 			}
 			rte_errno = 0;
-			fields = mlx4_conv_rss_hf(priv, rss_conf->rss_hf);
+			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
 				msg = "unsupported RSS hash type requested";
 				goto exit_action_not_supported;
 			}
 			flow->rss = mlx4_rss_get
-				(priv, fields, rss_conf->rss_key, rss->num,
+				(priv, fields, rss_key, rss->queue_num,
 				 rss->queue);
 			if (!flow->rss) {
 				msg = "either invalid parameters or not enough"
@@ -1284,8 +1283,10 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
+		.types = -1,
+		.key_len = MLX4_RSS_HASH_KEY_SIZE,
+		.queue_num = queues,
+		.key = mlx4_rss_hash_key_default,
 		.queue = queue,
 	};
 	struct rte_flow_action actions[] = {
diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h
index 4e3889e67..7b83d74b0 100644
--- a/drivers/net/mlx4/mlx4_flow.h
+++ b/drivers/net/mlx4/mlx4_flow.h
@@ -47,7 +47,7 @@ struct rte_flow {
 
 /* mlx4_flow.c */
 
-uint64_t mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf);
+uint64_t mlx4_conv_rss_types(struct priv *priv, uint64_t rss_hf);
 int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error);
 void mlx4_flow_clean(struct priv *priv);
 int mlx4_filter_ctrl(struct rte_eth_dev *dev,
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 6be6a0b9a..b430678c7 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -88,7 +88,7 @@ mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = {
  */
 struct mlx4_rss *
 mlx4_rss_get(struct priv *priv, uint64_t fields,
-	     uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+	     const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 	     uint16_t queues, const uint16_t queue_id[])
 {
 	struct mlx4_rss *rss;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index b1af86110..2dfee957f 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -127,7 +127,7 @@ uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
 int mlx4_rss_init(struct priv *priv);
 void mlx4_rss_deinit(struct priv *priv);
 struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
-			      uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+			      const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 			      uint16_t queues, const uint16_t queue_id[]);
 void mlx4_rss_put(struct mlx4_rss *rss);
 int mlx4_rss_attach(struct mlx4_rss *rss);
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 679fdf318..a289dff73 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -214,9 +214,8 @@ struct rte_flow {
 	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
 	uint32_t mark:1; /**< Set if the flow is marked. */
 	uint32_t drop:1; /**< Drop queue. */
-	uint16_t queues_n; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss rss_conf; /**< RSS configuration */
 	uint16_t (*queues)[]; /**< Queues indexes to use. */
-	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	struct ibv_counter_set *cs; /**< Holds the counters for the rule. */
 	struct mlx5_flow_counter_stats counter_stats;/**<The counter stats. */
@@ -406,9 +405,8 @@ struct mlx5_flow_parse {
 	uint32_t mark:1; /**< Mark is present in the flow. */
 	uint32_t count:1; /**< Count is present in the flow. */
 	uint32_t mark_id; /**< Mark identifier. */
+	struct rte_flow_action_rss rss_conf; /**< RSS configuration */
 	uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queues indexes to use. */
-	uint16_t queues_n; /**< Number of entries in queue[]. */
-	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	enum hash_rxq_type layer; /**< Last pattern layer detected. */
 	struct ibv_counter_set *cs; /**< Holds the counter set for the rule */
@@ -532,47 +530,6 @@ mlx5_flow_item_validate(const struct rte_flow_item *item,
 }
 
 /**
- * Copy the RSS configuration from the user ones, of the rss_conf is null,
- * uses the driver one.
- *
- * @param parser
- *   Internal parser structure.
- * @param rss_conf
- *   User RSS configuration to save.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_flow_convert_rss_conf(struct mlx5_flow_parse *parser,
-			   const struct rte_eth_rss_conf *rss_conf)
-{
-	/*
-	 * This function is also called at the beginning of
-	 * mlx5_flow_convert_actions() to initialize the parser with the
-	 * device default RSS configuration.
-	 */
-	if (rss_conf) {
-		if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) {
-			rte_errno = EINVAL;
-			return -rte_errno;
-		}
-		if (rss_conf->rss_key_len != 40) {
-			rte_errno = EINVAL;
-			return -rte_errno;
-		}
-		if (rss_conf->rss_key_len && rss_conf->rss_key) {
-			parser->rss_conf.rss_key_len = rss_conf->rss_key_len;
-			memcpy(parser->rss_key, rss_conf->rss_key,
-			       rss_conf->rss_key_len);
-			parser->rss_conf.rss_key = parser->rss_key;
-		}
-		parser->rss_conf.rss_hf = rss_conf->rss_hf;
-	}
-	return 0;
-}
-
-/**
  * Extract attribute to the parser.
  *
  * @param[in] attr
@@ -642,17 +599,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	enum { FATE = 1, MARK = 2, COUNT = 4, };
 	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
-	int ret;
 
-	/*
-	 * Add default RSS configuration necessary for Verbs to create QP even
-	 * if no RSS is necessary.
-	 */
-	ret = mlx5_flow_convert_rss_conf(parser,
-					 (const struct rte_eth_rss_conf *)
-					 &priv->rss_conf);
-	if (ret)
-		return ret;
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
@@ -671,25 +618,53 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			parser->queues_n = 1;
 			parser->queues[0] = queue->index;
+			parser->rss_conf = (struct rte_flow_action_rss){
+				.queue_num = 1,
+				.queue = parser->queues,
+			};
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
 			uint16_t n;
 
 			if (overlap & FATE)
 				goto exit_action_overlap;
 			overlap |= FATE;
-			if (!rss || !rss->num) {
+			if (rss->types & MLX5_RSS_HF_MASK) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "unsupported RSS type"
+						   " requested");
+				return -rte_errno;
+			}
+			if (rss->key_len) {
+				rss_key_len = rss->key_len;
+				rss_key = rss->key;
+			} else {
+				rss_key_len = rss_hash_default_key_len;
+				rss_key = rss_hash_default_key;
+			}
+			if (rss_key_len != RTE_DIM(parser->rss_key)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "RSS hash key must be"
+						   " exactly 40 bytes long");
+				return -rte_errno;
+			}
+			if (!rss->queue_num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (rss->num > RTE_DIM(parser->queues)) {
+			if (rss->queue_num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
@@ -697,7 +672,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " context");
 				return -rte_errno;
 			}
-			for (n = 0; n < rss->num; ++n) {
+			for (n = 0; n < rss->queue_num; ++n) {
 				if (rss->queue[n] >= priv->rxqs_n) {
 					rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -707,16 +682,16 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 					return -rte_errno;
 				}
 			}
-			for (n = 0; n < rss->num; ++n)
-				parser->queues[n] = rss->queue[n];
-			parser->queues_n = rss->num;
-			if (mlx5_flow_convert_rss_conf(parser, rss->rss_conf)) {
-				rte_flow_error_set(error, EINVAL,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "wrong RSS configuration");
-				return -rte_errno;
-			}
+			parser->rss_conf = (struct rte_flow_action_rss){
+				.types = rss->types,
+				.key_len = rss_key_len,
+				.queue_num = rss->queue_num,
+				.key = memcpy(parser->rss_key, rss_key,
+					      sizeof(*rss_key) * rss_key_len),
+				.queue = memcpy(parser->queues, rss->queue,
+						sizeof(*rss->queue) *
+						rss->queue_num),
+			};
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) {
 			const struct rte_flow_action_mark *mark =
 				(const struct rte_flow_action_mark *)
@@ -761,7 +736,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
-	if (!parser->queues_n && !parser->drop) {
+	if (!parser->rss_conf.queue_num && !parser->drop) {
 		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "no valid action");
 		return -rte_errno;
@@ -941,7 +916,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 	unsigned int i;
 
 	/* Remove any other flow not matching the pattern. */
-	if (parser->queues_n == 1 && !parser->rss_conf.rss_hf) {
+	if (parser->rss_conf.queue_num == 1 && !parser->rss_conf.types) {
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			if (i == HASH_RXQ_ETH)
 				continue;
@@ -969,7 +944,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 	}
 	/* Remove impossible flow according to the RSS configuration. */
 	if (hash_rxq_init[parser->layer].dpdk_rss_hf &
-	    parser->rss_conf.rss_hf) {
+	    parser->rss_conf.types) {
 		/* Remove any other flow. */
 		for (i = hmin; i != (hmax + 1); ++i) {
 			if ((i == parser->layer) ||
@@ -980,7 +955,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 		}
 	} else  if (!parser->queue[ip].ibv_attr) {
 		/* no RSS possible with the current configuration. */
-		parser->queues_n = 1;
+		parser->rss_conf.queue_num = 1;
 		return;
 	}
 fill:
@@ -1109,7 +1084,7 @@ mlx5_flow_convert(struct rte_eth_dev *dev,
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			unsigned int offset;
 
-			if (!(parser->rss_conf.rss_hf &
+			if (!(parser->rss_conf.types &
 			      hash_rxq_init[i].dpdk_rss_hf) &&
 			    (i != HASH_RXQ_ETH))
 				continue;
@@ -1777,20 +1752,20 @@ mlx5_flow_create_action_queue_rss(struct rte_eth_dev *dev,
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_get(dev,
-				      parser->rss_conf.rss_key,
-				      parser->rss_conf.rss_key_len,
+				      parser->rss_conf.key,
+				      parser->rss_conf.key_len,
 				      hash_fields,
-				      parser->queues,
-				      parser->queues_n);
+				      parser->rss_conf.queue,
+				      parser->rss_conf.queue_num);
 		if (flow->frxq[i].hrxq)
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_new(dev,
-				      parser->rss_conf.rss_key,
-				      parser->rss_conf.rss_key_len,
+				      parser->rss_conf.key,
+				      parser->rss_conf.key_len,
 				      hash_fields,
-				      parser->queues,
-				      parser->queues_n);
+				      parser->rss_conf.queue,
+				      parser->rss_conf.queue_num);
 		if (!flow->frxq[i].hrxq) {
 			return rte_flow_error_set(error, ENOMEM,
 						  RTE_FLOW_ERROR_TYPE_HANDLE,
@@ -1861,9 +1836,9 @@ mlx5_flow_create_action_queue(struct rte_eth_dev *dev,
 				   NULL, "internal error in flow creation");
 		goto error;
 	}
-	for (i = 0; i != parser->queues_n; ++i) {
+	for (i = 0; i != parser->rss_conf.queue_num; ++i) {
 		struct mlx5_rxq_data *q =
-			(*priv->rxqs)[parser->queues[i]];
+			(*priv->rxqs)[parser->rss_conf.queue[i]];
 
 		q->mark |= parser->mark;
 	}
@@ -1927,7 +1902,8 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	if (ret)
 		goto exit;
 	flow = rte_calloc(__func__, 1,
-			  sizeof(*flow) + parser.queues_n * sizeof(uint16_t),
+			  sizeof(*flow) +
+			  parser.rss_conf.queue_num * sizeof(uint16_t),
 			  0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM,
@@ -1936,15 +1912,20 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 				   "cannot allocate flow memory");
 		return NULL;
 	}
-	/* Copy queues configuration. */
+	/* Copy configuration. */
 	flow->queues = (uint16_t (*)[])(flow + 1);
-	memcpy(flow->queues, parser.queues, parser.queues_n * sizeof(uint16_t));
-	flow->queues_n = parser.queues_n;
+	flow->rss_conf = (struct rte_flow_action_rss){
+		.types = parser.rss_conf.types,
+		.key_len = parser.rss_conf.key_len,
+		.queue_num = parser.rss_conf.queue_num,
+		.key = memcpy(flow->rss_key, parser.rss_conf.key,
+			      sizeof(*parser.rss_conf.key) *
+			      parser.rss_conf.key_len),
+		.queue = memcpy(flow->queues, parser.rss_conf.queue,
+				sizeof(*parser.rss_conf.queue) *
+				parser.rss_conf.queue_num),
+	};
 	flow->mark = parser.mark;
-	/* Copy RSS configuration. */
-	flow->rss_conf = parser.rss_conf;
-	flow->rss_conf.rss_key = flow->rss_key;
-	memcpy(flow->rss_key, parser.rss_key, parser.rss_conf.rss_key_len);
 	/* finalise the flow. */
 	if (parser.drop)
 		ret = mlx5_flow_create_action_queue_drop(dev, &parser, flow,
@@ -2024,7 +2005,7 @@ mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
 
 	if (flow->drop || !flow->mark)
 		goto free;
-	for (i = 0; i != flow->queues_n; ++i) {
+	for (i = 0; i != flow->rss_conf.queue_num; ++i) {
 		struct rte_flow *tmp;
 		int mark = 0;
 
@@ -2334,19 +2315,19 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 			if (!flow->frxq[i].ibv_attr)
 				continue;
 			flow->frxq[i].hrxq =
-				mlx5_hrxq_get(dev, flow->rss_conf.rss_key,
-					      flow->rss_conf.rss_key_len,
+				mlx5_hrxq_get(dev, flow->rss_conf.key,
+					      flow->rss_conf.key_len,
 					      hash_rxq_init[i].hash_fields,
-					      (*flow->queues),
-					      flow->queues_n);
+					      flow->rss_conf.queue,
+					      flow->rss_conf.queue_num);
 			if (flow->frxq[i].hrxq)
 				goto flow_create;
 			flow->frxq[i].hrxq =
-				mlx5_hrxq_new(dev, flow->rss_conf.rss_key,
-					      flow->rss_conf.rss_key_len,
+				mlx5_hrxq_new(dev, flow->rss_conf.key,
+					      flow->rss_conf.key_len,
 					      hash_rxq_init[i].hash_fields,
-					      (*flow->queues),
-					      flow->queues_n);
+					      flow->rss_conf.queue,
+					      flow->rss_conf.queue_num);
 			if (!flow->frxq[i].hrxq) {
 				DRV_LOG(DEBUG,
 					"port %u flow %p cannot be applied",
@@ -2370,8 +2351,8 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 		}
 		if (!flow->mark)
 			continue;
-		for (i = 0; i != flow->queues_n; ++i)
-			(*priv->rxqs)[(*flow->queues)[i]]->mark = 1;
+		for (i = 0; i != flow->rss_conf.queue_num; ++i)
+			(*priv->rxqs)[flow->rss_conf.queue[i]]->mark = 1;
 	}
 	return 0;
 }
@@ -2448,8 +2429,10 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	};
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
-		.rss_conf = &priv->rss_conf,
-		.num = priv->reta_idx_n,
+		.types = priv->rss_conf.rss_hf,
+		.key_len = priv->rss_conf.rss_key_len,
+		.queue_num = priv->reta_idx_n,
+		.key = priv->rss_conf.rss_key,
 		.queue = queue,
 	};
 	struct rte_flow_action actions[] = {
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index eda3ba3d5..18ad40813 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1218,8 +1218,8 @@ mlx5_rxq_verify(struct rte_eth_dev *dev)
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
  */
 struct mlx5_ind_table_ibv *
-mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, uint16_t queues[],
-		       uint16_t queues_n)
+mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, const uint16_t *queues,
+		       uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_ind_table_ibv *ind_tbl;
@@ -1286,8 +1286,8 @@ mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, uint16_t queues[],
  *   An indirection table if found.
  */
 struct mlx5_ind_table_ibv *
-mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, uint16_t queues[],
-		       uint16_t queues_n)
+mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, const uint16_t *queues,
+		       uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_ind_table_ibv *ind_tbl;
@@ -1391,8 +1391,10 @@ mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev)
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
  */
 struct mlx5_hrxq *
-mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
-	      uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+mlx5_hrxq_new(struct rte_eth_dev *dev,
+	      const uint8_t *rss_key, uint32_t rss_key_len,
+	      uint64_t hash_fields,
+	      const uint16_t *queues, uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
@@ -1419,7 +1421,7 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
 			.rx_hash_conf = (struct ibv_rx_hash_conf){
 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
 				.rx_hash_key_len = rss_key_len,
-				.rx_hash_key = rss_key,
+				.rx_hash_key = (void *)(uintptr_t)rss_key,
 				.rx_hash_fields_mask = hash_fields,
 			},
 			.rwq_ind_tbl = ind_tbl->ind_table,
@@ -1469,8 +1471,10 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
  *   An hash Rx queue on success.
  */
 struct mlx5_hrxq *
-mlx5_hrxq_get(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
-	      uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+mlx5_hrxq_get(struct rte_eth_dev *dev,
+	      const uint8_t *rss_key, uint32_t rss_key_len,
+	      uint64_t hash_fields,
+	      const uint16_t *queues, uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 2309aa4f3..ee534c340 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -134,7 +134,7 @@ struct mlx5_ind_table_ibv {
 	LIST_ENTRY(mlx5_ind_table_ibv) next; /* Pointer to the next element. */
 	rte_atomic32_t refcnt; /* Reference counter. */
 	struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */
-	uint16_t queues_n; /**< Number of queues in the list. */
+	uint32_t queues_n; /**< Number of queues in the list. */
 	uint16_t queues[]; /**< Queue list. */
 };
 
@@ -145,7 +145,7 @@ struct mlx5_hrxq {
 	struct mlx5_ind_table_ibv *ind_table; /* Indirection table. */
 	struct ibv_qp *qp; /* Verbs queue pair. */
 	uint64_t hash_fields; /* Verbs Hash fields. */
-	uint8_t rss_key_len; /* Hash key length in bytes. */
+	uint32_t rss_key_len; /* Hash key length in bytes. */
 	uint8_t rss_key[]; /* Hash key. */
 };
 
@@ -237,20 +237,22 @@ int mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx);
 int mlx5_rxq_verify(struct rte_eth_dev *dev);
 int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl);
 struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_new(struct rte_eth_dev *dev,
-						  uint16_t queues[],
-						  uint16_t queues_n);
+						  const uint16_t *queues,
+						  uint32_t queues_n);
 struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_get(struct rte_eth_dev *dev,
-						  uint16_t queues[],
-						  uint16_t queues_n);
+						  const uint16_t *queues,
+						  uint32_t queues_n);
 int mlx5_ind_table_ibv_release(struct rte_eth_dev *dev,
 			       struct mlx5_ind_table_ibv *ind_tbl);
 int mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev);
-struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key,
-				uint8_t rss_key_len, uint64_t hash_fields,
-				uint16_t queues[], uint16_t queues_n);
-struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev, uint8_t *rss_key,
-				uint8_t rss_key_len, uint64_t hash_fields,
-				uint16_t queues[], uint16_t queues_n);
+struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev,
+				const uint8_t *rss_key, uint32_t rss_key_len,
+				uint64_t hash_fields,
+				const uint16_t *queues, uint32_t queues_n);
+struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev,
+				const uint8_t *rss_key, uint32_t rss_key_len,
+				uint64_t hash_fields,
+				const uint16_t *queues, uint32_t queues_n);
 int mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hxrq);
 int mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev);
 uint64_t mlx5_get_rx_port_offloads(void);
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 056405515..1a2c0299c 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	struct sfc_rxq *rxq;
 	unsigned int rxq_hw_index_min;
 	unsigned int rxq_hw_index_max;
-	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
-	uint64_t rss_hf;
-	uint8_t *rss_key = NULL;
+	const uint8_t *rss_key;
 	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
 	unsigned int i;
 
-	if (rss->num == 0)
+	if (rss->queue_num == 0)
 		return -EINVAL;
 
 	rxq_sw_index = sa->rxq_count - 1;
@@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	rxq_hw_index_min = rxq->hw_index;
 	rxq_hw_index_max = 0;
 
-	for (i = 0; i < rss->num; ++i) {
+	for (i = 0; i < rss->queue_num; ++i) {
 		rxq_sw_index = rss->queue[i];
 
 		if (rxq_sw_index >= sa->rxq_count)
@@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 			rxq_hw_index_max = rxq->hw_index;
 	}
 
-	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;
-	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
+	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
-	if (rss_conf != NULL) {
-		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
+	if (rss->key_len) {
+		if (rss->key_len != sizeof(sa->rss_key))
 			return -EINVAL;
 
-		rss_key = rss_conf->rss_key;
+		rss_key = rss->key;
 	} else {
 		rss_key = sa->rss_key;
 	}
@@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 
 	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
 	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
-	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
+	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
 	rte_memcpy(sfc_rss_conf->rss_key, rss_key, sizeof(sa->rss_key));
 
 	for (i = 0; i < RTE_DIM(sfc_rss_conf->rss_tbl); ++i) {
-		unsigned int rxq_sw_index = rss->queue[i % rss->num];
+		unsigned int rxq_sw_index = rss->queue[i % rss->queue_num];
 		struct sfc_rxq *rxq = sa->rxq_info[rxq_sw_index].rxq;
 
 		sfc_rss_conf->rss_tbl[i] = rxq->hw_index - rxq_hw_index_min;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index fe2f94010..67146aaba 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1215,7 +1215,7 @@ priv_flow_process(struct pmd_internals *pmd,
 				if (err)
 					goto exit_action_not_supported;
 			}
-			if (flow && rss)
+			if (flow)
 				err = rss_add_actions(flow, pmd, rss, error);
 		} else {
 			goto exit_action_not_supported;
@@ -2050,7 +2050,7 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 			   struct rte_flow_error *error)
 {
 	/* 4096 is the maximum number of instructions for a BPF program */
-	int i;
+	unsigned int i;
 	int err;
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
@@ -2066,8 +2066,8 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	}
 
 	/* Update RSS map entry with queues */
-	rss_entry.nb_queues = rss->num;
-	for (i = 0; i < rss->num; i++)
+	rss_entry.nb_queues = rss->queue_num;
+	for (i = 0; i < rss->queue_num; i++)
 		rss_entry.queues[i] = rss->queue[i];
 	rss_entry.hash_fields =
 		(1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 8b2047adb..3ce76c413 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -202,9 +202,13 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
 						queue[j++] = i;
-				action_rss.rss_conf = &rss_conf;
-				action_rss.num = j;
-				action_rss.queue = queue;
+				action_rss = (struct rte_flow_action_rss){
+					.types = rss_conf.rss_hf,
+					.key_len = rss_conf.rss_key_len,
+					.queue_num = j,
+					.key = rss_key,
+					.queue = queue,
+				};
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index bb19e28c6..cc7819b6a 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -330,40 +330,27 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
-				.num = src.rss->num,
+				.types = src.rss->types,
+				.key_len = src.rss->key_len,
+				.queue_num = src.rss->queue_num,
 			};
 		off += sizeof(*src.rss);
-		if (src.rss->num) {
+		if (src.rss->key_len) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->queue) * src.rss->num;
+			size = sizeof(*src.rss->key) * src.rss->key_len;
 			if (dst.rss)
-				dst.rss->queue = memcpy
+				dst.rss->key = memcpy
 					((void *)((uintptr_t)dst.rss + off),
-					 src.rss->queue, size);
+					 src.rss->key, size);
 			off += size;
 		}
-		off = RTE_ALIGN_CEIL(off, sizeof(double));
-		if (dst.rss) {
-			dst.rss->rss_conf = (void *)((uintptr_t)dst.rss + off);
-			*(struct rte_eth_rss_conf *)(uintptr_t)
-				dst.rss->rss_conf = (struct rte_eth_rss_conf){
-				.rss_key_len = src.rss->rss_conf->rss_key_len,
-				.rss_hf = src.rss->rss_conf->rss_hf,
-			};
-		}
-		off += sizeof(*src.rss->rss_conf);
-		if (src.rss->rss_conf->rss_key_len) {
+		if (src.rss->queue_num) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->rss_conf->rss_key) *
-				src.rss->rss_conf->rss_key_len;
-			if (dst.rss) {
-				((struct rte_eth_rss_conf *)(uintptr_t)
-				 dst.rss->rss_conf)->rss_key =
-					(void *)((uintptr_t)dst.rss + off);
-				memcpy(dst.rss->rss_conf->rss_key,
-				       src.rss->rss_conf->rss_key,
-				       size);
-			}
+			size = sizeof(*src.rss->queue) * src.rss->queue_num;
+			if (dst.rss)
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		size = off;
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index ad2e55b8e..bbc408fa6 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1033,13 +1033,21 @@ struct rte_flow_query_count {
  * Similar to QUEUE, except RSS is additionally performed on packets to
  * spread them among several queues according to the provided parameters.
  *
+ * Unlike global RSS settings used by other DPDK APIs, unsetting the
+ * @p types field does not disable RSS in a flow rule. Doing so instead
+ * requests safe unspecified "best-effort" settings from the underlying PMD,
+ * which depending on the flow rule, may result in anything ranging from
+ * empty (single queue) to all-inclusive RSS.
+ *
  * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps
  * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only,
  * both can be requested simultaneously.
  */
 struct rte_flow_action_rss {
-	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in @p queue. */
+	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
+	uint32_t key_len; /**< Hash key length in bytes. */
+	uint32_t queue_num; /**< Number of entries in @p queue. */
+	const uint8_t *key; /**< Hash key. */
 	const uint16_t *queue; /**< Queue indices to use. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v5 06/16] ethdev: remove C99 flexible arrays from flow API
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                           ` (2 preceding siblings ...)
  2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
@ 2018-04-19 10:16  1%         ` Adrien Mazarguil
  2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
                           ` (8 subsequent siblings)
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

This patch replaces C99-style flexible arrays in struct rte_flow_action_rss
and struct rte_flow_item_raw with standard pointers to the same data.

They proved difficult to use in the field (e.g. no possibility of static
initialization) and unsuitable for C++ applications.

Affected PMDs and examples are updated accordingly.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Thomas Monjalon <thomas@monjalon.net>
Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 app/test-pmd/cmdline_flow.c        | 117 +++++++++++++++++---------------
 app/test-pmd/config.c              |  25 ++++---
 doc/guides/prog_guide/rte_flow.rst |  18 ++---
 drivers/net/mlx4/mlx4_flow.c       |  22 +++---
 drivers/net/mlx5/mlx5_flow.c       |  20 +++---
 examples/ipsec-secgw/ipsec.c       |  17 ++---
 lib/librte_ether/rte_flow.c        |  25 ++++---
 lib/librte_ether/rte_flow.h        |   8 ++-
 8 files changed, 135 insertions(+), 117 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 2ddb08feb..798b7948d 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -179,25 +179,22 @@ enum index {
 	ACTION_METER_ID,
 };
 
-/** Size of pattern[] field in struct rte_flow_item_raw. */
-#define ITEM_RAW_PATTERN_SIZE 36
+/** Maximum size for pattern in struct rte_flow_item_raw. */
+#define ITEM_RAW_PATTERN_SIZE 40
 
 /** Storage size for struct rte_flow_item_raw including pattern. */
 #define ITEM_RAW_SIZE \
-	(offsetof(struct rte_flow_item_raw, pattern) + ITEM_RAW_PATTERN_SIZE)
+	(sizeof(struct rte_flow_item_raw) + ITEM_RAW_PATTERN_SIZE)
 
 /** Maximum number of queue indices in struct rte_flow_action_rss. */
 #define ACTION_RSS_QUEUE_NUM 32
 
 /** Storage for struct rte_flow_action_rss including external data. */
-union action_rss_data {
+struct action_rss_data {
 	struct rte_flow_action_rss conf;
-	struct {
-		uint8_t conf_data[offsetof(struct rte_flow_action_rss, queue)];
-		uint16_t queue[ACTION_RSS_QUEUE_NUM];
-		struct rte_eth_rss_conf rss_conf;
-		uint8_t rss_key[RSS_HASH_KEY_LENGTH];
-	} s;
+	uint16_t queue[ACTION_RSS_QUEUE_NUM];
+	struct rte_eth_rss_conf rss_conf;
+	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -320,13 +317,6 @@ struct token {
 		.size = sizeof(*((s *)0)->f), \
 	})
 
-/** Static initializer for ARGS() with arbitrary size. */
-#define ARGS_ENTRY_USZ(s, f, sz) \
-	(&(const struct arg){ \
-		.offset = offsetof(s, f), \
-		.size = (sz), \
-	})
-
 /** Static initializer for ARGS() with arbitrary offset and size. */
 #define ARGS_ENTRY_ARB(o, s) \
 	(&(const struct arg){ \
@@ -1105,9 +1095,9 @@ static const struct token token_list[] = {
 			     NEXT_ENTRY(ITEM_PARAM_IS,
 					ITEM_PARAM_SPEC,
 					ITEM_PARAM_MASK)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, length),
-			     ARGS_ENTRY_USZ(struct rte_flow_item_raw,
-					    pattern,
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, pattern),
+			     ARGS_ENTRY(struct rte_flow_item_raw, length),
+			     ARGS_ENTRY_ARB(sizeof(struct rte_flow_item_raw),
 					    ITEM_RAW_PATTERN_SIZE)),
 	},
 	[ITEM_ETH] = {
@@ -1591,7 +1581,7 @@ static const struct token token_list[] = {
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
-		.priv = PRIV_ACTION(RSS, sizeof(union action_rss_data)),
+		.priv = PRIV_ACTION(RSS, sizeof(struct action_rss_data)),
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
@@ -1610,23 +1600,21 @@ static const struct token token_list[] = {
 		.name = "key",
 		.help = "RSS hash key",
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
-		.args = ARGS(ARGS_ENTRY_ARB
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
+			     ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len)),
-			     ARGS_ENTRY_ARB
-			     (((uintptr_t)((union action_rss_data *)0)->
-			       s.rss_key),
-			      RSS_HASH_KEY_LENGTH)),
+			     ARGS_ENTRY(struct action_rss_data, rss_key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len),
 			      0,
@@ -2067,7 +2055,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 {
 	struct buffer *out = buf;
 	struct rte_flow_action *action;
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 	int ret;
 
@@ -2085,29 +2073,29 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	ctx->objmask = NULL;
 	/* Set up default configuration. */
 	action_rss_data = ctx->object;
-	*action_rss_data = (union action_rss_data){
+	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->s.rss_conf,
+			.rss_conf = &action_rss_data->rss_conf,
 			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.queue = action_rss_data->queue,
 		},
+		.queue = { 0 },
+		.rss_conf = (struct rte_eth_rss_conf){
+			.rss_key = action_rss_data->rss_key,
+			.rss_key_len = sizeof(action_rss_data->rss_key),
+			.rss_hf = rss_hf,
+		},
+		.rss_key = "testpmd's default RSS hash key",
 	};
-	action_rss_data->s.rss_conf = (struct rte_eth_rss_conf){
-		.rss_key = action_rss_data->s.rss_key,
-		.rss_key_len = sizeof(action_rss_data->s.rss_key),
-		.rss_hf = rss_hf,
-	};
-	strncpy((void *)action_rss_data->s.rss_key,
-		"testpmd's default RSS hash key",
-		sizeof(action_rss_data->s.rss_key));
 	for (i = 0; i < action_rss_data->conf.num; ++i)
-		action_rss_data->conf.queue[i] = i;
+		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
 		struct rte_eth_dev_info info;
 
 		rte_eth_dev_info_get(ctx->port, &info);
-		action_rss_data->s.rss_conf.rss_key_len =
-			RTE_MIN(sizeof(action_rss_data->s.rss_key),
+		action_rss_data->rss_conf.rss_key_len =
+			RTE_MIN(sizeof(action_rss_data->rss_key),
 				info.hash_key_size);
 	}
 	action->conf = &action_rss_data->conf;
@@ -2125,7 +2113,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_TYPE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 
 	(void)token;
@@ -2135,7 +2123,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->s.rss_conf.rss_hf = 0;
+		action_rss_data->rss_conf.rss_hf = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2154,7 +2142,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->s.rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2169,7 +2157,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_QUEUE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	int ret;
 	int i;
 
@@ -2186,10 +2174,9 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (i >= ACTION_RSS_QUEUE_NUM)
 		return -1;
 	if (push_args(ctx,
-		      ARGS_ENTRY_ARB(offsetof(struct rte_flow_action_rss,
-					      queue) +
-				     i * sizeof(action_rss_data->s.queue[i]),
-				     sizeof(action_rss_data->s.queue[i]))))
+		      ARGS_ENTRY_ARB(offsetof(struct action_rss_data, queue) +
+				     i * sizeof(action_rss_data->queue[i]),
+				     sizeof(action_rss_data->queue[i]))))
 		return -1;
 	ret = parse_int(ctx, token, str, len, NULL, 0);
 	if (ret < 0) {
@@ -2206,6 +2193,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 		return len;
 	action_rss_data = ctx->object;
 	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
 
@@ -2483,8 +2471,8 @@ parse_int(struct context *ctx, const struct token *token,
 /**
  * Parse a string.
  *
- * Two arguments (ctx->args) are retrieved from the stack to store data and
- * its length (in that order).
+ * Three arguments (ctx->args) are retrieved from the stack to store data,
+ * its actual length and address (in that order).
  */
 static int
 parse_string(struct context *ctx, const struct token *token,
@@ -2493,6 +2481,7 @@ parse_string(struct context *ctx, const struct token *token,
 {
 	const struct arg *arg_data = pop_args(ctx);
 	const struct arg *arg_len = pop_args(ctx);
+	const struct arg *arg_addr = pop_args(ctx);
 	char tmp[16]; /* Ought to be enough. */
 	int ret;
 
@@ -2503,6 +2492,11 @@ parse_string(struct context *ctx, const struct token *token,
 		push_args(ctx, arg_data);
 		return -1;
 	}
+	if (!arg_addr) {
+		push_args(ctx, arg_len);
+		push_args(ctx, arg_data);
+		return -1;
+	}
 	size = arg_data->size;
 	/* Bit-mask fill is not supported. */
 	if (arg_data->mask || size < len)
@@ -2525,8 +2519,23 @@ parse_string(struct context *ctx, const struct token *token,
 	memset((uint8_t *)buf + len, 0x00, size - len);
 	if (ctx->objmask)
 		memset((uint8_t *)ctx->objmask + arg_data->offset, 0xff, len);
+	/* Save address if requested. */
+	if (arg_addr->size) {
+		memcpy((uint8_t *)ctx->object + arg_addr->offset,
+		       (void *[]){
+			(uint8_t *)ctx->object + arg_data->offset
+		       },
+		       arg_addr->size);
+		if (ctx->objmask)
+			memcpy((uint8_t *)ctx->objmask + arg_addr->offset,
+			       (void *[]){
+				(uint8_t *)ctx->objmask + arg_data->offset
+			       },
+			       arg_addr->size);
+	}
 	return len;
 error:
+	push_args(ctx, arg_addr);
 	push_args(ctx, arg_len);
 	push_args(ctx, arg_data);
 	return -1;
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index d0d372797..95618e4eb 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -977,7 +977,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -1026,14 +1026,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = flow_item[item->type].size;
@@ -1065,7 +1071,7 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
@@ -1096,11 +1102,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 80360d068..acbeaacbd 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1309,15 +1309,15 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+------------------------------+
-   | Field        | Value                        |
-   +==============+==============================+
-   | ``rss_conf`` | RSS parameters               |
-   +--------------+------------------------------+
-   | ``num``      | number of entries in queue[] |
-   +--------------+------------------------------+
-   | ``queue[]``  | queue indices to use         |
-   +--------------+------------------------------+
+   +--------------+--------------------------------+
+   | Field        | Value                          |
+   +==============+================================+
+   | ``rss_conf`` | RSS parameters                 |
+   +--------------+--------------------------------+
+   | ``num``      | number of entries in ``queue`` |
+   +--------------+--------------------------------+
+   | ``queue``    | queue indices to use           |
+   +--------------+--------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 15cdf07b7..8feb6ae31 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -1282,14 +1282,16 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	 */
 	uint32_t queues =
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
-	alignas(struct rte_flow_action_rss) uint8_t rss_conf_data
-		[offsetof(struct rte_flow_action_rss, queue) +
-		 sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues];
-	struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data;
+	uint16_t queue[queues];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = NULL, /* Rely on default fallback settings. */
+		.num = queues,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
-			.conf = rss_conf,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -1311,12 +1313,8 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	if (!queues)
 		goto error;
 	/* Prepare default RSS configuration. */
-	*rss_conf = (struct rte_flow_action_rss){
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
-	};
 	for (i = 0; i != queues; ++i)
-		rss_conf->queue[i] = i;
+		queue[i] = i;
 	/*
 	 * Set up VLAN item if filtering is enabled and at least one VLAN
 	 * filter is configured.
@@ -1375,7 +1373,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 			if (j != sizeof(mac->addr_bytes))
 				continue;
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				continue;
 			break;
@@ -1415,7 +1413,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		if (flow && flow->internal) {
 			assert(flow->rss);
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				flow = NULL;
 		}
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 8b156667c..679fdf318 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -2446,9 +2446,16 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 			.type = RTE_FLOW_ITEM_TYPE_END,
 		},
 	};
+	uint16_t queue[priv->reta_idx_n];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = &priv->rss_conf,
+		.num = priv->reta_idx_n,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -2457,24 +2464,13 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	struct rte_flow *flow;
 	struct rte_flow_error error;
 	unsigned int i;
-	union {
-		struct rte_flow_action_rss rss;
-		struct {
-			const struct rte_eth_rss_conf *rss_conf;
-			uint16_t num;
-			uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-		} local;
-	} action_rss;
 
 	if (!priv->reta_idx_n) {
 		rte_errno = EINVAL;
 		return -rte_errno;
 	}
 	for (i = 0; i != priv->reta_idx_n; ++i)
-		action_rss.local.queue[i] = (*priv->reta_idx)[i];
-	action_rss.local.rss_conf = &priv->rss_conf;
-	action_rss.local.num = priv->reta_idx_n;
-	actions[0].conf = (const void *)&action_rss.rss;
+		queue[i] = (*priv->reta_idx)[i];
 	flow = mlx5_flow_list_create(dev, &priv->ctrl_flows, &attr, items,
 				     actions, &error);
 	if (!flow)
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 5fb5bc16e..8b2047adb 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -186,14 +186,8 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 					.rss_key_len = 40,
 				};
 				struct rte_eth_dev *eth_dev;
-				union {
-					struct rte_flow_action_rss rss;
-					struct {
-					const struct rte_eth_rss_conf *rss_conf;
-					uint16_t num;
-					uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-					} local;
-				} action_rss;
+				uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
+				struct rte_flow_action_rss action_rss;
 				unsigned int i;
 				unsigned int j;
 
@@ -207,9 +201,10 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				for (i = 0, j = 0;
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
-						action_rss.local.queue[j++] = i;
-				action_rss.local.num = j;
-				action_rss.local.rss_conf = &rss_conf;
+						queue[j++] = i;
+				action_rss.rss_conf = &rss_conf;
+				action_rss.num = j;
+				action_rss.queue = queue;
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 80f9cb6cb..bb19e28c6 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,7 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -73,7 +73,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 };
@@ -282,14 +282,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = rte_flow_desc_item[item->type].size;
@@ -326,11 +332,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 96184f030..ad2e55b8e 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -14,6 +14,7 @@
  * associated actions in hardware through flow rules.
  */
 
+#include <stddef.h>
 #include <stdint.h>
 
 #include <rte_arp.h>
@@ -432,7 +433,7 @@ struct rte_flow_item_raw {
 	int32_t offset; /**< Absolute or relative offset for pattern. */
 	uint16_t limit; /**< Search area limit for start of pattern. */
 	uint16_t length; /**< Pattern length. */
-	uint8_t pattern[]; /**< Byte string to look for. */
+	const uint8_t *pattern; /**< Byte string to look for. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_RAW. */
@@ -444,6 +445,7 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
 	.offset = 0xffffffff,
 	.limit = 0xffff,
 	.length = 0xffff,
+	.pattern = NULL,
 };
 #endif
 
@@ -1037,8 +1039,8 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
-	uint16_t queue[]; /**< Queues indices to use. */
+	uint16_t num; /**< Number of entries in @p queue. */
+	const uint16_t *queue; /**< Queue indices to use. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v5 05/16] ethdev: alter behavior of flow API actions
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-19 10:16  3%         ` [dpdk-dev] [PATCH v5 01/16] ethdev: add error types to flow API Adrien Mazarguil
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 04/16] ethdev: remove DUP action from " Adrien Mazarguil
@ 2018-04-19 10:16  1%         ` Adrien Mazarguil
  2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
                           ` (9 subsequent siblings)
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Wenzhuo Lu, John Daley, Gaetan Rivet, Beilei Xing,
	Konstantin Ananyev, Nelio Laranjeiro, Andrew Rybchenko,
	Pascal Mazon

This patch makes the following changes to flow rule actions:

- List order now matters, they are redefined as performed first to last
  instead of "all simultaneously".

- Repeated actions are now supported (e.g. specifying QUEUE multiple times
  now duplicates traffic among them). Previously only the last action of
  any given kind was taken into account.

- No more distinction between terminating/non-terminating/meta actions.
  Flow rules themselves are now defined as always terminating unless a
  PASSTHRU action is specified.

These changes alter the behavior of flow rules in corner cases in order to
prepare the flow API for actions that modify traffic contents or properties
(e.g. encapsulation, compression) and for which order matter when combined.

Previously one would have to do so through multiple flow rules by combining
PASSTRHU with priority levels, however this proved overly complex to
implement at the PMD level, hence this simpler approach.

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

PMDs with rte_flow support are modified accordingly:

- bnxt: no change, implementation already forbids multiple actions and does
  not support PASSTHRU.

- e1000: no change, same as bnxt.

- enic: modified to forbid redundant actions, no support for default drop.

- failsafe: no change needed.

- i40e: no change, implementation already forbids multiple actions.

- ixgbe: same as i40e.

- mlx4: modified to forbid multiple fate-deciding actions and drop when
  unspecified.

- mlx5: same as mlx4, with other redundant actions also forbidden.

- sfc: same as mlx4.

- tap: implementation already complies with the new behavior except for
  the default pass-through modified as a default drop.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@oktetlabs.ru>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: John Daley <johndale@cisco.com>
Cc: Gaetan Rivet <gaetan.rivet@6wind.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>

--

v5 changes:

Fixed issues raised by GCC and Clang with overlap checks in both enic and
mlx5, as reported by Andrew [1].

[1] http://dpdk.org/ml/archives/dev/2018-April/097864.html
---
 doc/guides/prog_guide/rte_flow.rst | 67 +++++++++++++-------------------
 drivers/net/enic/enic_flow.c       | 25 ++++++++++++
 drivers/net/mlx4/mlx4_flow.c       | 21 +++++++---
 drivers/net/mlx5/mlx5_flow.c       | 69 ++++++++++++++-------------------
 drivers/net/sfc/sfc_flow.c         | 22 +++++++----
 drivers/net/tap/tap_flow.c         | 11 ++++++
 lib/librte_ether/rte_flow.h        | 54 +++++++-------------------
 7 files changed, 138 insertions(+), 131 deletions(-)

diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a237e4fd2..80360d068 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -995,28 +995,27 @@ Actions
 
 Each possible action is represented by a type. Some have associated
 configuration structures. Several actions combined in a list can be assigned
-to a flow rule. That list is not ordered.
+to a flow rule and are performed in order.
 
 They fall in three categories:
 
-- Terminating actions that prevent processing matched packets by subsequent
-  flow rules, unless overridden with PASSTHRU.
+- Actions that modify the fate of matching traffic, for instance by dropping
+  or assigning it a specific destination.
 
-- Non-terminating actions that leave matched packets up for additional
-  processing by subsequent flow rules.
+- Actions that modify matching traffic contents or its properties. This
+  includes adding/removing encapsulation, encryption, compression and marks.
 
-- Other non-terminating meta actions that do not affect the fate of packets.
+- Actions related to the flow rule itself, such as updating counters or
+  making it non-terminating.
 
-When several actions are combined in a flow rule, they should all have
-different types (e.g. dropping a packet twice is not possible).
+Flow rules being terminating by default, not specifying any action of the
+fate kind results in undefined behavior. This applies to both ingress and
+egress.
 
-Only the last action of a given type is taken into account. PMDs still
-perform error checking on the entire list.
+PASSTHRU, when supported, makes a flow rule non-terminating.
 
 Like matching patterns, action lists are terminated by END items.
 
-*Note that PASSTHRU is the only action able to override a terminating rule.*
-
 Example of action that redirects packets to queue index 10:
 
 .. _table_rte_flow_action_example:
@@ -1029,12 +1028,11 @@ Example of action that redirects packets to queue index 10:
    | ``index`` | 10    |
    +-----------+-------+
 
-Action lists examples, their order is not significant, applications must
-consider all actions to be performed simultaneously:
+Actions are performed in list order:
 
-.. _table_rte_flow_count_and_drop:
+.. _table_rte_flow_count_then_drop:
 
-.. table:: Count and drop
+.. table:: Count then drop
 
    +-------+--------+
    | Index | Action |
@@ -1050,7 +1048,7 @@ consider all actions to be performed simultaneously:
 
 .. _table_rte_flow_mark_count_redirect:
 
-.. table:: Mark, count and redirect
+.. table:: Mark, count then redirect
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1080,12 +1078,15 @@ consider all actions to be performed simultaneously:
    | 2     | END                        |
    +-------+----------------------------+
 
-In the above example, considering both actions are performed simultaneously,
-the end result is that only QUEUE has any effect.
+In the above example, while DROP and QUEUE must be performed in order, both
+have to happen before reaching END. Only QUEUE has a visible effect.
+
+Note that such a list may be thought as ambiguous and rejected on that
+basis.
 
-.. _table_rte_flow_redirect_queue_3:
+.. _table_rte_flow_redirect_queue_5_3:
 
-.. table:: Redirect to queue 3
+.. table:: Redirect to queues 5 and 3
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1099,9 +1100,9 @@ the end result is that only QUEUE has any effect.
    | 3     | END                        |
    +-------+----------------------------+
 
-As previously described, only the last action of a given type found in the
-list is taken into account. The above example also shows that VOID is
-ignored.
+As previously described, all actions must be taken into account. This
+effectively duplicates traffic to both queues. The above example also shows
+that VOID is ignored.
 
 Action types
 ~~~~~~~~~~~~
@@ -1151,9 +1152,8 @@ PMDs.
 Action: ``PASSTHRU``
 ^^^^^^^^^^^^^^^^^^^^
 
-Leaves packets up for additional processing by subsequent flow rules. This
-is the default when a rule does not contain a terminating action, but can be
-specified to force a rule to become non-terminating.
+Leaves traffic up for additional processing by subsequent flow rules; makes
+a flow rule non-terminating.
 
 - No configurable properties.
 
@@ -1227,8 +1227,6 @@ Action: ``QUEUE``
 
 Assigns packets to a given queue index.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_queue:
 
 .. table:: QUEUE
@@ -1245,8 +1243,6 @@ Action: ``DROP``
 Drop packets.
 
 - No configurable properties.
-- Terminating by default.
-- PASSTHRU overrides this action if both are specified.
 
 .. _table_rte_flow_action_drop:
 
@@ -1309,8 +1305,6 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1331,7 +1325,6 @@ Action: ``PF``
 Redirects packets to the physical function (PF) of the current device.
 
 - No configurable properties.
-- Terminating by default.
 
 .. _table_rte_flow_action_pf:
 
@@ -1353,8 +1346,6 @@ ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1378,8 +1369,6 @@ action parameter. More than one flow can use the same MTR object through
 the meter action. The MTR object can be further updated or queried using
 the rte_mtr* API.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_meter:
 
 .. table:: METER
@@ -1415,8 +1404,6 @@ direction.
 
 Multiple flows can be configured to use the same security session.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_security:
 
 .. table:: SECURITY
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index b9f36587c..c34ae84d1 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -3,6 +3,7 @@
  */
 
 #include <errno.h>
+#include <stdint.h>
 #include <rte_log.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow_driver.h>
@@ -964,6 +965,9 @@ static int
 enic_copy_action_v1(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -975,6 +979,10 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			break;
@@ -984,6 +992,8 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!(overlap & FATE))
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_RQ_STEERING;
 	return 0;
 }
@@ -1001,6 +1011,9 @@ static int
 enic_copy_action_v2(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, MARK = 2, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -1009,6 +1022,10 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			enic_action->flags |= FILTER_ACTION_RQ_STEERING_FLAG;
@@ -1019,6 +1036,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			/* ENIC_MAGIC_FILTER_ID is reserved and is the highest
 			 * in the range of allows mark ids.
 			 */
@@ -1029,6 +1049,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 		case RTE_FLOW_ACTION_TYPE_FLAG: {
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			enic_action->filter_id = ENIC_MAGIC_FILTER_ID;
 			enic_action->flags |= FILTER_ACTION_FILTER_ID_FLAG;
 			break;
@@ -1044,6 +1067,8 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!(overlap & FATE))
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_V2;
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 67fd568bc..15cdf07b7 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -637,6 +637,7 @@ mlx4_flow_prepare(struct priv *priv,
 	struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
 	struct rte_flow *flow = &temp;
 	const char *msg = NULL;
+	int overlap;
 
 	if (attr->group)
 		return rte_flow_error_set
@@ -656,6 +657,7 @@ mlx4_flow_prepare(struct priv *priv,
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
 			 NULL, "only ingress is supported");
 fill:
+	overlap = 0;
 	proc = mlx4_flow_proc_item_list;
 	/* Go over pattern. */
 	for (item = pattern; item->type; ++item) {
@@ -702,6 +704,16 @@ mlx4_flow_prepare(struct priv *priv,
 	}
 	/* Go over actions list. */
 	for (action = actions; action->type; ++action) {
+		/* This one may appear anywhere multiple times. */
+		if (action->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (overlap) {
+			msg = "cannot combine several fate-deciding actions,"
+				" choose between DROP, QUEUE or RSS";
+			goto exit_action_not_supported;
+		}
+		overlap = 1;
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
@@ -709,8 +721,6 @@ mlx4_flow_prepare(struct priv *priv,
 			uint64_t fields;
 			unsigned int i;
 
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			continue;
 		case RTE_FLOW_ACTION_TYPE_DROP:
 			flow->drop = 1;
 			break;
@@ -801,10 +811,9 @@ mlx4_flow_prepare(struct priv *priv,
 			goto exit_action_not_supported;
 		}
 	}
-	if (!flow->rss && !flow->drop)
-		return rte_flow_error_set
-			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-			 NULL, "no valid action");
+	/* When fate is unknown, drop traffic. */
+	if (!overlap)
+		flow->drop = 1;
 	/* Validation ends here. */
 	if (!addr) {
 		if (flow->rss)
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 1ca413e32..8b156667c 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -4,6 +4,7 @@
  */
 
 #include <sys/queue.h>
+#include <stdint.h>
 #include <string.h>
 
 /* Verbs header. */
@@ -638,6 +639,8 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			  struct rte_flow_error *error,
 			  struct mlx5_flow_parse *parser)
 {
+	enum { FATE = 1, MARK = 2, COUNT = 4, };
+	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
 	int ret;
 
@@ -654,39 +657,31 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			parser->drop = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
-			uint16_t n;
-			uint16_t found = 0;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			for (n = 0; n < parser->queues_n; ++n) {
-				if (parser->queues[n] == queue->index) {
-					found = 1;
-					break;
-				}
-			}
-			if (parser->queues_n > 1 && !found) {
-				rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions,
-					   "queue action not in RSS queues");
-				return -rte_errno;
-			}
-			if (!found) {
-				parser->queues_n = 1;
-				parser->queues[0] = queue->index;
-			}
+			parser->queues_n = 1;
+			parser->queues[0] = queue->index;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
 			uint16_t n;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!rss || !rss->num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,26 +689,6 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (parser->queues_n == 1) {
-				uint16_t found = 0;
-
-				assert(parser->queues_n);
-				for (n = 0; n < rss->num; ++n) {
-					if (parser->queues[0] ==
-					    rss->queue[n]) {
-						found = 1;
-						break;
-					}
-				}
-				if (!found) {
-					rte_flow_error_set(error, ENOTSUP,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "queue action not in RSS"
-						   " queues");
-					return -rte_errno;
-				}
-			}
 			if (rss->num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -747,6 +722,9 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			if (!mark) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -764,14 +742,23 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			parser->mark = 1;
 			parser->mark_id = mark->id;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_FLAG) {
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			parser->mark = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT &&
 			   priv->config.flow_counter_en) {
+			if (overlap & COUNT)
+				goto exit_action_overlap;
+			overlap |= COUNT;
 			parser->count = 1;
 		} else {
 			goto exit_action_not_supported;
 		}
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!(overlap & FATE))
+		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
 	if (!parser->queues_n && !parser->drop) {
@@ -784,6 +771,10 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
 			   actions, "action not supported");
 	return -rte_errno;
+exit_action_overlap:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "overlapping actions are not supported");
+	return -rte_errno;
 }
 
 /**
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index fe4c0b0c5..056405515 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1467,10 +1467,19 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 	}
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		/* This one may appear anywhere multiple times. */
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (is_specified) {
+			rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				 actions,
+				 "Cannot combine several fate-deciding actions,"
+				 "choose between QUEUE, RSS or DROP");
+			return -rte_errno;
+		}
 		switch (actions->type) {
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			break;
-
 		case RTE_FLOW_ACTION_TYPE_QUEUE:
 			rc = sfc_flow_parse_queue(sa, actions->conf, flow);
 			if (rc != 0) {
@@ -1512,11 +1521,10 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 		}
 	}
 
+	/* When fate is unknown, drop traffic. */
 	if (!is_specified) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ACTION_NUM, actions,
-				   "Action is unspecified");
-		return -rte_errno;
+		flow->spec.template.efs_dmaq_id =
+			EFX_FILTER_SPEC_RX_DMAQ_ID_DROP;
 	}
 
 	return 0;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 3b7a960b0..fe2f94010 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1140,6 +1140,7 @@ priv_flow_process(struct pmd_internals *pmd,
 		else
 			goto end;
 	}
+actions:
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		int err = 0;
 
@@ -1222,6 +1223,16 @@ priv_flow_process(struct pmd_internals *pmd,
 		if (err)
 			goto exit_action_not_supported;
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!action) {
+		static const struct rte_flow_action drop[] = {
+			{ .type = RTE_FLOW_ACTION_TYPE_DROP, },
+			{ .type = RTE_FLOW_ACTION_TYPE_END, },
+		};
+
+		actions = drop;
+		goto actions;
+	}
 end:
 	if (flow)
 		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 6ace24ff4..96184f030 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -859,32 +859,28 @@ struct rte_flow_item {
  *
  * Each possible action is represented by a type. Some have associated
  * configuration structures. Several actions combined in a list can be
- * affected to a flow rule. That list is not ordered.
+ * assigned to a flow rule and are performed in order.
  *
  * They fall in three categories:
  *
- * - Terminating actions that prevent processing matched packets by
- *   subsequent flow rules, unless overridden with PASSTHRU.
+ * - Actions that modify the fate of matching traffic, for instance by
+ *   dropping or assigning it a specific destination.
  *
- * - Non terminating actions that leave matched packets up for additional
- *   processing by subsequent flow rules.
+ * - Actions that modify matching traffic contents or its properties. This
+ *   includes adding/removing encapsulation, encryption, compression and
+ *   marks.
  *
- * - Other non terminating meta actions that do not affect the fate of
- *   packets.
+ * - Actions related to the flow rule itself, such as updating counters or
+ *   making it non-terminating.
  *
- * When several actions are combined in a flow rule, they should all have
- * different types (e.g. dropping a packet twice is not possible).
+ * Flow rules being terminating by default, not specifying any action of the
+ * fate kind results in undefined behavior. This applies to both ingress and
+ * egress.
  *
- * Only the last action of a given type is taken into account. PMDs still
- * perform error checking on the entire list.
- *
- * Note that PASSTHRU is the only action able to override a terminating
- * rule.
+ * PASSTHRU, when supported, makes a flow rule non-terminating.
  */
 enum rte_flow_action_type {
 	/**
-	 * [META]
-	 *
 	 * End marker for action lists. Prevents further processing of
 	 * actions, thereby ending the list.
 	 *
@@ -893,8 +889,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_END,
 
 	/**
-	 * [META]
-	 *
 	 * Used as a placeholder for convenience. It is ignored and simply
 	 * discarded by PMDs.
 	 *
@@ -903,18 +897,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VOID,
 
 	/**
-	 * Leaves packets up for additional processing by subsequent flow
-	 * rules. This is the default when a rule does not contain a
-	 * terminating action, but can be specified to force a rule to
-	 * become non-terminating.
+	 * Leaves traffic up for additional processing by subsequent flow
+	 * rules; makes a flow rule non-terminating.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PASSTHRU,
 
 	/**
-	 * [META]
-	 *
 	 * Attaches an integer value to packets and sets PKT_RX_FDIR and
 	 * PKT_RX_FDIR_ID mbuf flags.
 	 *
@@ -923,8 +913,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_MARK,
 
 	/**
-	 * [META]
-	 *
 	 * Flags packets. Similar to MARK without a specific value; only
 	 * sets the PKT_RX_FDIR mbuf flag.
 	 *
@@ -949,9 +937,7 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_DROP,
 
 	/**
-	 * [META]
-	 *
-	 * Enables counters for this rule.
+	 * Enables counters for this flow rule.
 	 *
 	 * These counters can be retrieved and reset through rte_flow_query(),
 	 * see struct rte_flow_query_count.
@@ -1020,8 +1006,6 @@ struct rte_flow_action_mark {
  * RTE_FLOW_ACTION_TYPE_QUEUE
  *
  * Assign packets to a given queue index.
- *
- * Terminating by default.
  */
 struct rte_flow_action_queue {
 	uint16_t index; /**< Queue index to use. */
@@ -1050,8 +1034,6 @@ struct rte_flow_query_count {
  * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps
  * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only,
  * both can be requested simultaneously.
- *
- * Terminating by default.
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
@@ -1069,8 +1051,6 @@ struct rte_flow_action_rss {
  * and is not guaranteed to work properly if the VF part is matched by a
  * prior flow rule or if packets are not addressed to a VF in the first
  * place.
- *
- * Terminating by default.
  */
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
@@ -1085,8 +1065,6 @@ struct rte_flow_action_vf {
  *
  * Packets matched by items of this type can be either dropped or passed to the
  * next item with their color set by the MTR object.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_meter {
 	uint32_t mtr_id; /**< MTR object ID created with rte_mtr_create(). */
@@ -1116,8 +1094,6 @@ struct rte_flow_action_meter {
  * direction.
  *
  * Multiple flows can be configured to use the same security session.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_security {
 	void *security_session; /**< Pointer to security session structure. */
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v5 04/16] ethdev: remove DUP action from flow API
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-19 10:16  3%         ` [dpdk-dev] [PATCH v5 01/16] ethdev: add error types to flow API Adrien Mazarguil
@ 2018-04-19 10:16  2%         ` Adrien Mazarguil
  2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
                           ` (10 subsequent siblings)
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

Upcoming changes in relation to the handling of actions list will make the
DUP action redundant as specifying several QUEUE actions will achieve the
same behavior. Besides, no PMD implements this action.

By removing an entry from enum rte_flow_action_type, this patch breaks ABI
compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/cmdline_flow.c                 | 23 -----------------------
 app/test-pmd/config.c                       |  1 -
 doc/guides/prog_guide/rte_flow.rst          | 23 -----------------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  8 --------
 lib/librte_ether/rte_ethdev_version.map     |  2 +-
 lib/librte_ether/rte_flow.c                 |  1 -
 lib/librte_ether/rte_flow.h                 | 24 ------------------------
 7 files changed, 1 insertion(+), 81 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index f0b4b7bc4..2ddb08feb 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -164,8 +164,6 @@ enum index {
 	ACTION_QUEUE_INDEX,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
-	ACTION_DUP_INDEX,
 	ACTION_RSS,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
@@ -625,7 +623,6 @@ static const enum index next_action[] = {
 	ACTION_QUEUE,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
@@ -645,12 +642,6 @@ static const enum index action_queue[] = {
 	ZERO,
 };
 
-static const enum index action_dup[] = {
-	ACTION_DUP_INDEX,
-	ACTION_NEXT,
-	ZERO,
-};
-
 static const enum index action_rss[] = {
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
@@ -1597,20 +1588,6 @@ static const struct token token_list[] = {
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
-	[ACTION_DUP] = {
-		.name = "dup",
-		.help = "duplicate packets to a given queue index",
-		.priv = PRIV_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
-		.next = NEXT(action_dup),
-		.call = parse_vc,
-	},
-	[ACTION_DUP_INDEX] = {
-		.name = "index",
-		.help = "queue index to duplicate packets to",
-		.next = NEXT(action_dup, NEXT_ENTRY(UNSIGNED)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_dup, index)),
-		.call = parse_vc_conf,
-	},
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index a7645adb8..d0d372797 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1065,7 +1065,6 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 51826d04c..a237e4fd2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1299,26 +1299,6 @@ Query structure to retrieve and reset flow rule counters:
    | ``bytes``     | out | number of bytes through this rule |
    +---------------+-----+-----------------------------------+
 
-Action: ``DUP``
-^^^^^^^^^^^^^^^
-
-Duplicates packets to a given queue index.
-
-This is normally combined with QUEUE, however when used alone, it is
-actually similar to QUEUE + PASSTHRU.
-
-- Non-terminating by default.
-
-.. _table_rte_flow_action_dup:
-
-.. table:: DUP
-
-   +-----------+------------------------------------+
-   | Field     | Value                              |
-   +===========+====================================+
-   | ``index`` | queue index to duplicate packet to |
-   +-----------+------------------------------------+
-
 Action: ``RSS``
 ^^^^^^^^^^^^^^^
 
@@ -2010,9 +1990,6 @@ Unsupported actions
   and tagging (`Action: MARK`_ or `Action: FLAG`_) may be implemented in
   software as long as the target queue is used by a single rule.
 
-- A rule specifying both `Action: DUP`_ + `Action: QUEUE`_ may be translated
-  to two hidden rules combining `Action: QUEUE`_ and `Action: PASSTHRU`_.
-
 - When a single target queue is provided, `Action: RSS`_ can also be
   implemented through `Action: QUEUE`_.
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index cb6f201e1..a015d02a4 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3363,10 +3363,6 @@ actions can sometimes be combined when the end result is unambiguous::
 
 ::
 
-   drop / dup index 6 / end # same as above
-
-::
-
    queue index 6 / rss queues 6 7 8 / end # queue has no effect
 
 ::
@@ -3400,10 +3396,6 @@ This section lists supported actions and their attributes, if any.
 
 - ``count``: enable counters for this rule.
 
-- ``dup``: duplicate packets to a given queue index.
-
-  - ``index {unsigned}``: queue index to duplicate packets to.
-
 - ``rss``: spread packets among several queues.
 
   - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index e915e7929..8f1ae5ed2 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -147,7 +147,6 @@ DPDK_17.08 {
 
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
-	rte_flow_copy;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -199,6 +198,7 @@ DPDK_18.02 {
 DPDK_18.05 {
 	global:
 
+	rte_flow_copy;
 	rte_flow_create;
 	rte_flow_destroy;
 	rte_flow_error_set;
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index ada280810..80f9cb6cb 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -73,7 +73,6 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index d28a2a473..6ace24ff4 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -961,16 +961,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_COUNT,
 
 	/**
-	 * Duplicates packets to a given queue index.
-	 *
-	 * This is normally combined with QUEUE, however when used alone, it
-	 * is actually similar to QUEUE + PASSTHRU.
-	 *
-	 * See struct rte_flow_action_dup.
-	 */
-	RTE_FLOW_ACTION_TYPE_DUP,
-
-	/**
 	 * Similar to QUEUE, except RSS is additionally performed on packets
 	 * to spread them among several queues according to the provided
 	 * parameters.
@@ -1052,20 +1042,6 @@ struct rte_flow_query_count {
 };
 
 /**
- * RTE_FLOW_ACTION_TYPE_DUP
- *
- * Duplicates packets to a given queue index.
- *
- * This is normally combined with QUEUE, however when used alone, it is
- * actually similar to QUEUE + PASSTHRU.
- *
- * Non-terminating by default.
- */
-struct rte_flow_action_dup {
-	uint16_t index; /**< Queue index to duplicate packets to. */
-};
-
-/**
  * RTE_FLOW_ACTION_TYPE_RSS
  *
  * Similar to QUEUE, except RSS is additionally performed on packets to
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v5 01/16] ethdev: add error types to flow API
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
@ 2018-04-19 10:16  3%         ` Adrien Mazarguil
  2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 04/16] ethdev: remove DUP action from " Adrien Mazarguil
                           ` (11 subsequent siblings)
  12 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

These enable more precise reporting of objects responsible for errors.

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_destroy()
- rte_flow_error_set()
- rte_flow_flush()
- rte_flow_isolate()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/config.c                   |  4 ++++
 lib/librte_ether/rte_ethdev_version.map | 20 +++++++++++++-------
 lib/librte_ether/rte_flow.h             |  4 ++++
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 5daa93bb3..a7645adb8 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1244,8 +1244,12 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
+		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
+		[RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
+		[RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
 		[RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
 		[RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
+		[RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
 		[RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
 	};
 	const char *errstr;
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index 34df6c8b5..e915e7929 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -127,11 +127,6 @@ DPDK_17.02 {
 
 	_rte_eth_dev_reset;
 	rte_eth_dev_fw_version_get;
-	rte_flow_create;
-	rte_flow_destroy;
-	rte_flow_flush;
-	rte_flow_query;
-	rte_flow_validate;
 
 } DPDK_16.07;
 
@@ -153,7 +148,6 @@ DPDK_17.08 {
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
 	rte_flow_copy;
-	rte_flow_isolate;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -192,7 +186,6 @@ DPDK_17.11 {
 	rte_eth_dev_get_sec_ctx;
 	rte_eth_dev_pool_ops_supported;
 	rte_eth_dev_reset;
-	rte_flow_error_set;
 
 } DPDK_17.08;
 
@@ -203,6 +196,19 @@ DPDK_18.02 {
 
 } DPDK_17.11;
 
+DPDK_18.05 {
+	global:
+
+	rte_flow_create;
+	rte_flow_destroy;
+	rte_flow_error_set;
+	rte_flow_flush;
+	rte_flow_isolate;
+	rte_flow_query;
+	rte_flow_validate;
+
+} DPDK_18.02;
+
 EXPERIMENTAL {
 	global:
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 44ae19d3b..26b95c772 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1186,8 +1186,12 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
+	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
+	RTE_FLOW_ERROR_TYPE_ITEM_LAST, /**< Item specification range. */
+	RTE_FLOW_ERROR_TYPE_ITEM_MASK, /**< Item specification mask. */
 	RTE_FLOW_ERROR_TYPE_ITEM, /**< Specific pattern item. */
 	RTE_FLOW_ERROR_TYPE_ACTION_NUM, /**< Number of actions. */
+	RTE_FLOW_ERROR_TYPE_ACTION_CONF, /**< Action configuration. */
 	RTE_FLOW_ERROR_TYPE_ACTION, /**< Specific action. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (12 preceding siblings ...)
  2018-04-16 16:23  2%       ` [dpdk-dev] [PATCH v4 16/16] ethdev: add port ID item and " Adrien Mazarguil
@ 2018-04-19 10:16  4%       ` Adrien Mazarguil
  2018-04-19 10:16  3%         ` [dpdk-dev] [PATCH v5 01/16] ethdev: add error types to flow API Adrien Mazarguil
                           ` (12 more replies)
  13 siblings, 13 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:16 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

As summarized in a prior RFC [1], the flow API (rte_flow) was chosen as a
means to manage switch offloads supported by many devices (usually going by
names such as E-Switch or vSwitch) through user-specified flow rules.

Combined with the need to support encap/decap actions, this requires a
change in the way flow actions are processed (in order and possibly
repeated) which modifies the behavior of some of the existing actions, thus
warranting a major ABI breakage.

Given this ABI breakage is also required by other work submitted for the
current release [2][3], this series addresses various longstanding issues
with the flow API and makes minor improvements in preparation for upcoming
features.

Changes summary:

- Additional error types.
- Clearer documentation.
- Improved C++ compatibility.
- Exhaustive RSS action.
- Consistent behavior of VLAN pattern item.
- New "transfer" attribute bringing consistency to VF/PF pattern items.
- Confusing "PORT" pattern item renamed "PHY_PORT", with new action
  counterpart.
- New "PORT_ID" pattern item and action to be used with port representors.

This series piggybacks on the major ABI update introduced by a prior
commit [4] for DPDK 18.05 and depends on several fixes [5] which must be
applied first.

[1] "[RFC] Switch device offload with DPDK"
    http://dpdk.org/ml/archives/dev/2018-March/092513.html

[2] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

[3] "[PATCH v1 00/21] MLX5 tunnel Rx offloading"
    http://dpdk.org/ml/archives/dev/2018-March/092264.html

[4] commit 653e038efc9b ("ethdev: remove versioning of filter control
    function")

[5] "[PATCH v6 00/11] Bunch of flow API-related fixes"
    http://dpdk.org/ml/archives/dev/2018-April/098035.html

v5 changes:

- Fixed errors reported by GCC and Clang in patch 05/16 ("ethdev: alter
  behavior of flow API actions").
- Rebased series once again.

v4 changes:

- No change besides new acked-by lines, rebased series to address conflicts.

v3 changes:

- Rebased series, fixed latest conflicts.
- Addressed Andrew's comments, see affected patches for details:
  - Empty RSS types in flow rule means PMD-specific RSS instead of no RSS.
  - RSS hash function now explicitly compared against
    RTE_ETH_HASH_FUNCTION_DEFAULT instead of 0 in all PMDs.
  - sfc PMD updated to also accept Toeplitz.
  - Implicit VLAN TPID matching now removed from all PMDs.
  - Default mask upate for VLAN TCI now split as separate patch #11.
  - Ingress/egress definition clarified in patch #12.

v2 changes:

- Squashed "ethdev: update ABI for flow API functions" in subsequent
  patches.
- Emphasized ABI impact in relevant commit logs.
- Modified documentation in "ethdev: alter behavior of flow API actions" to
  describe how terminating flow rules without any action of the fate kind
  result in undefined behavior instead of dropping traffic.
- Fixed other minor documentation formatting issues.
- Modified "ethdev: refine TPID handling in flow API" as follows:
  - Using standard macro definitions for VLAN, QinQ and E-Tag EtherTypes.
  - Fixed endian conversion in sfc.
  - Replaced a condition in VLAN pattern item processing with an assertion
    check for i40e.

Adrien Mazarguil (16):
  ethdev: add error types to flow API
  ethdev: clarify flow API pattern items and actions
  doc: remove flow API migration section
  ethdev: remove DUP action from flow API
  ethdev: alter behavior of flow API actions
  ethdev: remove C99 flexible arrays from flow API
  ethdev: flatten RSS configuration in flow API
  ethdev: add hash function to RSS flow API action
  ethdev: add encap level to RSS flow API action
  ethdev: refine TPID handling in flow API
  ethdev: limit default VLAN TCI mask in flow API
  ethdev: add transfer attribute to flow API
  ethdev: update behavior of VF/PF in flow API
  ethdev: rename physical port item in flow API
  ethdev: add physical port action to flow API
  ethdev: add port ID item and action to flow API

 app/test-pmd/cmdline_flow.c                 | 394 +++++++++++----
 app/test-pmd/config.c                       |  78 +--
 doc/guides/nics/tap.rst                     |   2 +-
 doc/guides/prog_guide/rte_flow.rst          | 618 ++++++++---------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  60 ++-
 drivers/net/bnxt/bnxt_filter.c              |  49 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  83 ++-
 drivers/net/e1000/igb_rxtx.c                |  55 +-
 drivers/net/enic/enic_flow.c                |  50 +-
 drivers/net/i40e/i40e_ethdev.c              |  57 ++-
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                | 130 +++--
 drivers/net/ixgbe/ixgbe_ethdev.c            |   7 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  91 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  55 +-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                | 117 +++--
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 316 ++++++------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +-
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +-
 drivers/net/mvpp2/mrvl_flow.c               |  32 +-
 drivers/net/sfc/sfc_flow.c                  |  78 ++-
 drivers/net/tap/tap_flow.c                  |  49 +-
 examples/ipsec-secgw/ipsec.c                |  21 +-
 lib/librte_ether/rte_ethdev_version.map     |  22 +-
 lib/librte_ether/rte_flow.c                 |  68 +--
 lib/librte_ether/rte_flow.h                 | 339 ++++++++-----
 lib/librte_net/rte_ether.h                  |   1 +
 34 files changed, 1750 insertions(+), 1123 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v6 00/11] Bunch of flow API-related fixes
  2018-04-16 16:21  3%   ` [dpdk-dev] [PATCH v5 " Adrien Mazarguil
  2018-04-17  9:17  0%     ` Ferruh Yigit
@ 2018-04-19 10:07  3%     ` Adrien Mazarguil
  2018-04-19 14:03  0%       ` Ferruh Yigit
  1 sibling, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-19 10:07 UTC (permalink / raw)
  To: dev

This series contains several fixes for rte_flow and its implementation in
PMDs and testpmd. Upcoming work on the flow API depends on it.

v6 changes:

- No change, rebased series and updated/fixed commit messages.

v5 changes:

- No change, rebased series to address conflicts.

v4 changes:

- Rebased again.
- The reliance on rte_eth_dev_rss_hash_conf_get() was removed from patch #7,
  see updated patch for details.

v3 changes:

- Rebased series.
- Dropped unnecessary "net/sfc: fix endian conversions in flow API".
- Dropped "ethdev: fix ABI version in meson build", handled by prior commit
  d9736a248785 ("ethdev: fix library version in meson build").

v2 changes:

- mlx5 fix (patch #3).
- bnxt fix (patch #4).
- sfc fix (patch #6).
- Missing include (patch #13).

Adrien Mazarguil (11):
  net/mlx4: fix RSS resource leak in case of error
  net/mlx4: fix ignored RSS hash types
  net/mlx5: fix RSS flow action bounds check
  net/bnxt: fix matching of flow API item masks
  app/testpmd: fix flow completion for RSS queues
  app/testpmd: fix lack of flow action configuration
  app/testpmd: fix RSS flow action configuration
  app/testpmd: fix missing RSS fields in flow action
  app/testpmd: fix missing boolean values in flow command
  ethdev: fix shallow copy of flow API RSS action
  ethdev: fix missing include in flow API

 app/test-pmd/cmdline.c                      |   2 +
 app/test-pmd/cmdline_flow.c                 | 252 ++++++++++++++++++++---
 app/test-pmd/config.c                       | 160 +++++++++-----
 app/test-pmd/testpmd.h                      |  13 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   8 +
 drivers/net/bnxt/bnxt_filter.c              |  14 +-
 drivers/net/mlx4/mlx4_flow.c                |  17 +-
 drivers/net/mlx5/mlx5_flow.c                |   9 +
 lib/librte_ether/rte_flow.c                 | 145 +++++++++----
 lib/librte_ether/rte_flow.h                 |   2 +
 10 files changed, 494 insertions(+), 128 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API
  2018-04-18  9:24  3%             ` Ferruh Yigit
@ 2018-04-19  9:48  5%               ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-19  9:48 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: Thomas Monjalon, dev

On Wed, Apr 18, 2018 at 10:24:10AM +0100, Ferruh Yigit wrote:
> On 4/18/2018 9:41 AM, Adrien Mazarguil wrote:
> > On Tue, Apr 17, 2018 at 08:37:31PM +0100, Ferruh Yigit wrote:
> >> On 4/16/2018 5:22 PM, Adrien Mazarguil wrote:
> >>> These enable more precise reporting of objects responsible for errors.
> >>>
> >>> This breaks ABI compatibility for the following public functions:
> >>>
> >>> - rte_flow_create()
> >>> - rte_flow_destroy()
> >>> - rte_flow_error_set()
> >>> - rte_flow_flush()
> >>> - rte_flow_isolate()
> >>> - rte_flow_query()
> >>> - rte_flow_validate()
> >>
> >> Is there a deprecation notice for this API break?
> > 
> > A notice covering the main changes in this series (most patches have an ABI
> > impact) was sent but not included [1]. This particular patch rides on the
> > announced ABI breakage in order to add a relatively minor feature while
> > there.
> 
> My take from "announced ABI breakage" is the deprecation notice get three acks
> and merged into release, so it seems there is no deprecation notice and
> according process first deprecation notice should go in this release.

True, so just describe how ABI impact is no worse than what was announced
(and included) [3] and most of them are actually opportune changes to
improve consistency and documentation since ABI would be broken for this
release regardless. Regarding individual patches:

- 01/16 "ethdev: add error types to flow API"
  => New error types are added in the middle of an existing enum.

- 02/16 "ethdev: clarify flow API pattern items and actions"
  => No impact.

- 03/16 "doc: remove flow API migration section"
  => No impact.

- 04/16 "ethdev: remove DUP action from flow API"
  => An action that no PMD supports is removed from an existing enum.

- 05/16 "ethdev: alter behavior of flow API actions"
  => A documentation change in how actions are processed logically
     breaks ABI in the case of repeated actions (currently a corner
     case). Most PMDs do not implement the original behavior correctly
     anyway (see commit log).

- 06/16 "ethdev: remove C99 flexible arrays from flow API"
  => ABI impact is primarily triggered by the RSS action change (already
     covered [3]). The RAW pattern item is also modified for consistency.
     On the API side, except when allocating these structures, there is no
     difference in usage (i.e. array[index] => pointer[index]).

- 07/16 "ethdev: flatten RSS configuration in flow API"
  => Already covered [3].

- 08/16 "ethdev: add hash function to RSS flow API action"
  => Already covered [3].

- 09/16 "ethdev: add encap level to RSS flow API action"
  => Already covered [3].

- 10/16 "ethdev: refine TPID handling in flow API"
  => No PMD supports the poorly defined TPID matching, applications couldn't
     possibly rely on it.

- 11/16 "ethdev: limit default VLAN TCI mask in flow API"
  => No ABI breakage, but a different behavior for applications that rely on
     the default mask. It doesn't look like any PMD supports PCP/DEI
     matching so again applications could not rely on it (they still can do
     it by providing a specific mask).

- 12/16 "ethdev: add transfer attribute to flow API"
  => Minor ABI impact (read: logical) due to the addition of a bit in an
     existing bit-field. No practical impact on applications.

- 13/16 "ethdev: update behavior of VF/PF in flow API"
  => Documentation (API) change. The "transfer" bit must now be set in order to
     use these actions with PMDs that support them.

- 14/16 "ethdev: rename physical port item in flow API"
  => API change for a pattern item supported by no PMD.

- 15/16 "ethdev: add physical port action to flow API"
  => New action added in the middle of an existing enum.

- 16/16 "ethdev: add port ID item and action to flow API"
  => New item/action added in the middle of existing enums.

> Hi Thomas,
> 
> Any comment on issue?
> 
> > 
> > This ABI change was implicitly needed by upcoming work for 18.05 (Xueming's
> > RSS stuff [2][3], Declan's TEP [4], the rest is summarized by a RFC [5]) due
> > to the necessary changes of behavior in flow rules.
> > 
> > Note that Xueming's deprecation notice [3] alone would have triggered such
> > an ABI change because struct rte_flow_action_rss wouldn't have been binary 
> > compatible if struct rte_eth_rss_conf was updated. This change would have
> > propagated back to rte_flow functions manipulating them.
> 
> To be honest I lost track of Xueming's patches, because of split/merge of
> patchset, multiple set with multiple versions out.
> 
> Is it possible to document the dependency graph including your set?

I hopefully didn't miss any:

 Bunch of flow API-related fixes (v5) [6]
 |
 `-- Flow API overhaul for switch offloads (v4) [7]
     |
     +-- additions to support tunnel encap/decap (v4) [8]
     |
     +-- introduce new tunnel types (v5) [9]
     |   |
     |   `-- mlx5 Rx tunnel offloading (v4) [10]
     |
     +-- rte_flow extension for vSwitch acceleration (v3.2) [11]
     |
     `-- net/sfc: RSS improvements [12]

[6] http://dpdk.org/ml/archives/dev/2018-April/097411.html
[7] http://dpdk.org/ml/archives/dev/2018-April/097423.html
[8] http://dpdk.org/ml/archives/dev/2018-April/097956.html
[9] http://dpdk.org/ml/archives/dev/2018-April/097669.html
[10] http://dpdk.org/ml/archives/dev/2018-April/097673.html
[11] http://dpdk.org/ml/archives/dev/2018-April/097266.html
[12] http://dpdk.org/ml/archives/dev/2018-April/095872.html

I gave the current version for these series but some of them are still under
review and may change. I plan to send updates for [6] and [7] shortly.

> > [1] "doc: announce API changes for flow rules"
> >      http://dpdk.org/ml/archives/dev/2018-February/090988.html
> > [2] "MLX5 tunnel Rx offloading"
> >     http://dpdk.org/ml/archives/dev/2018-February/091461.html
> > [3] "doc: annouce ABI change for RSS configuraiton structure"
> >     http://dpdk.org/ml/archives/dev/2018-February/090127.html
> > [4] "tunnel endpoint hw acceleration enablement"
> >     http://dpdk.org/ml/archives/dev/2017-December/084676.html
> > [5] "Switch device offload with DPDK"
> >     http://dpdk.org/ml/archives/dev/2018-March/092513.html
> > 
> >>> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> >>> Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
> >>
> >> <...>
> >>
> > 
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 5%]

* [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence
  2018-04-19  6:00  3% [dpdk-dev] [PATCH v4 00/11] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (4 preceding siblings ...)
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 09/11] eal: replace rte_panic instances in ethdev Arnon Warshavsky
@ 2018-04-19  6:01  2% ` Arnon Warshavsky
  2018-04-19 14:39  3%   ` Burakov, Anatoly
  2018-04-19 17:48  0%   ` Aaron Conole
  5 siblings, 2 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-19  6:01 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Local functions to this file,
changing from void to int are non-abi-breaking.
For handling the single function that cannot
change from void to int due to abi,
where this is the only place it is called in,
I added a state variable that is being checked
right after the call to this function.

--

v4 - fix split literal strings in log messages

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_eal/bsdapp/eal/eal.c           |  86 ++++++++++++++-------
 lib/librte_eal/bsdapp/eal/eal_thread.c    |  65 +++++++++++-----
 lib/librte_eal/common/eal_common_launch.c |  21 ++++++
 lib/librte_eal/common/include/rte_debug.h |  12 +++
 lib/librte_eal/linuxapp/eal/eal.c         | 120 ++++++++++++++++++++----------
 lib/librte_eal/linuxapp/eal/eal_thread.c  |  65 +++++++++++-----
 6 files changed, 270 insertions(+), 99 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index d996190..9c2f6f1 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -151,7 +151,7 @@ enum rte_iova_mode
  * We also don't lock the whole file, so that in future we can use read-locks
  * on other parts, e.g. memzones, to detect if there are running secondary
  * processes. */
-static void
+static int
 rte_eal_config_create(void)
 {
 	void *rte_mem_cfg_addr;
@@ -160,60 +160,78 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	if (mem_cfg_fd < 0){
 		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+					__func__, pathname);
+			return -1;
+		}
 	}
 
 	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
 	if (retval < 0){
 		close(mem_cfg_fd);
-		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
 	if (retval < 0){
 		close(mem_cfg_fd);
-		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
-				"process running?\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'. Is another primary process running?\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 
 	if (rte_mem_cfg_addr == MAP_FAILED){
-		rte_panic("Cannot mmap memory for rte_config\n");
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
+				__func__);
+		return -1;
 	}
 	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
 	rte_config.mem_config = rte_mem_cfg_addr;
+
+	return 0;
 }
 
 /* attach to an existing shared memory config */
-static void
+static int
 rte_eal_config_attach(void)
 {
 	void *rte_mem_cfg_addr;
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	if (mem_cfg_fd < 0){
 		mem_cfg_fd = open(pathname, O_RDWR);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+					__func__, pathname);
+			return -1;
+		}
 	}
 
 	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 	close(mem_cfg_fd);
-	if (rte_mem_cfg_addr == MAP_FAILED)
-		rte_panic("Cannot mmap memory for rte_config\n");
+	if (rte_mem_cfg_addr == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
+				__func__);
+		return -1;
+	}
 
 	rte_config.mem_config = rte_mem_cfg_addr;
+
+	return 0;
 }
 
 /* Detect if we are a primary or a secondary process */
@@ -237,23 +255,28 @@ enum rte_proc_type_t
 }
 
 /* Sets up rte_config structure with the pointer to shared memory config.*/
-static void
+static int
 rte_config_init(void)
 {
 	rte_config.process_type = internal_config.process_type;
 
 	switch (rte_config.process_type){
 	case RTE_PROC_PRIMARY:
-		rte_eal_config_create();
+		if (rte_eal_config_create())
+			return -1;
 		break;
 	case RTE_PROC_SECONDARY:
-		rte_eal_config_attach();
+		if (rte_eal_config_attach())
+			return -1;
 		rte_eal_mcfg_wait_complete(rte_config.mem_config);
 		break;
 	case RTE_PROC_AUTO:
 	case RTE_PROC_INVALID:
-		rte_panic("Invalid process type\n");
+		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
+				__func__, rte_config.process_type);
+		return -1;
 	}
+	return 0;
 }
 
 /* display usage */
@@ -595,7 +618,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	rte_srand(rte_rdtsc());
 
-	rte_config_init();
+	if (rte_config_init() != 0)
+		return -1;
 
 	if (rte_mp_channel_init() < 0) {
 		rte_eal_init_alert("failed to init mp channel\n");
@@ -652,7 +676,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	eal_check_mem_on_local_socket();
 
-	eal_thread_init_master(rte_config.master_lcore);
+	if (eal_thread_init_master(rte_config.master_lcore) != 0)
+		return -1;
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -666,18 +691,27 @@ static void rte_eal_init_alert(const char *msg)
 		 * create communication pipes between master thread
 		 * and children
 		 */
-		if (pipe(lcore_config[i].pipe_master2slave) < 0)
-			rte_panic("Cannot create pipe\n");
-		if (pipe(lcore_config[i].pipe_slave2master) < 0)
-			rte_panic("Cannot create pipe\n");
+		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
+		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
 
 		lcore_config[i].state = WAIT;
 
 		/* create a thread for each lcore */
 		ret = pthread_create(&lcore_config[i].thread_id, NULL,
 				     eal_thread_loop, NULL);
-		if (ret != 0)
-			rte_panic("Cannot create thread\n");
+		if (ret != 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
+					__func__);
+			return -1;
+		}
 
 		/* Set thread_name for aid in debugging. */
 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
diff --git a/lib/librte_eal/bsdapp/eal/eal_thread.c b/lib/librte_eal/bsdapp/eal/eal_thread.c
index d602daf..5c3947c 100644
--- a/lib/librte_eal/bsdapp/eal/eal_thread.c
+++ b/lib/librte_eal/bsdapp/eal/eal_thread.c
@@ -51,16 +51,22 @@
 	n = 0;
 	while (n == 0 || (n < 0 && errno == EINTR))
 		n = write(m2s, &c, 1);
-	if (n < 0)
-		rte_panic("cannot write on configuration pipe\n");
+	if (n < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	/* wait ack */
 	do {
 		n = read(s2m, &c, 1);
 	} while (n < 0 && errno == EINTR);
 
-	if (n <= 0)
-		rte_panic("cannot read on configuration pipe\n");
+	if (n <= 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	return 0;
 }
@@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		rte_move_to_panic_state();
+	}
+}
+
+/* move to panic state and do not return */
+static __attribute__((noreturn)) void
+defunct_and_remain_in_endless_loop(void)
+{
+	rte_move_to_panic_state();
+	while (1)
+		sleep(1);
 }
 
 /* main loop of threads */
@@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
 		if (thread_id == lcore_config[lcore_id].thread_id)
 			break;
 	}
-	if (lcore_id == RTE_MAX_LCORE)
-		rte_panic("cannot retrieve lcore id\n");
+	if (lcore_id == RTE_MAX_LCORE) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
+				__func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	m2s = lcore_config[lcore_id].pipe_master2slave[0];
 	s2m = lcore_config[lcore_id].pipe_slave2master[1];
@@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
 			n = read(m2s, &c, 1);
 		} while (n < 0 && errno == EINTR);
 
-		if (n <= 0)
-			rte_panic("cannot read on configuration pipe\n");
+		if (n <= 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		lcore_config[lcore_id].state = RUNNING;
 
@@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
 		n = 0;
 		while (n == 0 || (n < 0 && errno == EINTR))
 			n = write(s2m, &c, 1);
-		if (n < 0)
-			rte_panic("cannot write on configuration pipe\n");
-
-		if (lcore_config[lcore_id].f == NULL)
-			rte_panic("NULL function pointer\n");
+		if (n < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
+
+		if (lcore_config[lcore_id].f == NULL) {
+			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		/* call the function and store the return value */
 		fct_arg = lcore_config[lcore_id].arg;
diff --git a/lib/librte_eal/common/eal_common_launch.c b/lib/librte_eal/common/eal_common_launch.c
index fe0ba3f..6f8bd46 100644
--- a/lib/librte_eal/common/eal_common_launch.c
+++ b/lib/librte_eal/common/eal_common_launch.c
@@ -14,6 +14,7 @@
 #include <rte_pause.h>
 #include <rte_per_lcore.h>
 #include <rte_lcore.h>
+#include <rte_debug.h>
 
 /*
  * Wait until a lcore finished its job.
@@ -88,3 +89,23 @@ enum rte_lcore_state_t
 		rte_eal_wait_lcore(lcore_id);
 	}
 }
+
+/* panic state */
+static int _panic_state;
+
+/**
+ * Check if the system is in panic state
+ * @return int
+ */
+int rte_get_panic_state(void)
+{
+	return _panic_state;
+}
+
+/**
+ * Move the system to be in panic state
+ */
+void rte_move_to_panic_state(void)
+{
+	_panic_state = 1;
+}
diff --git a/lib/librte_eal/common/include/rte_debug.h b/lib/librte_eal/common/include/rte_debug.h
index 272df49..b421d33 100644
--- a/lib/librte_eal/common/include/rte_debug.h
+++ b/lib/librte_eal/common/include/rte_debug.h
@@ -79,4 +79,16 @@ void __rte_panic(const char *funcname , const char *format, ...)
 }
 #endif
 
+/**
+ * Check if the system is in panic state
+ * @return int
+ */
+int rte_get_panic_state(void);
+
+/**
+ * Move the system to be in panic state
+ */
+void rte_move_to_panic_state(void);
+
+
 #endif /* _RTE_DEBUG_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 21afa73..393441a 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -160,7 +160,7 @@ enum rte_iova_mode
  * We also don't lock the whole file, so that in future we can use read-locks
  * on other parts, e.g. memzones, to detect if there are running secondary
  * processes. */
-static void
+static int
 rte_eal_config_create(void)
 {
 	void *rte_mem_cfg_addr;
@@ -169,7 +169,7 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	/* map the config before hugepage address so that we don't waste a page */
 	if (internal_config.base_virtaddr != 0)
@@ -179,30 +179,39 @@ enum rte_iova_mode
 	else
 		rte_mem_cfg_addr = NULL;
 
-	if (mem_cfg_fd < 0){
+	if (mem_cfg_fd < 0) {
 		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+				__func__, pathname);
+			return -1;
+		}
 	}
 
 	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
-	if (retval < 0){
+	if (retval < 0) {
 		close(mem_cfg_fd);
-		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
-	if (retval < 0){
+	if (retval < 0) {
 		close(mem_cfg_fd);
-		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
-				"process running?\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'."
+				" Is another primary process running?\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 
-	if (rte_mem_cfg_addr == MAP_FAILED){
-		rte_panic("Cannot mmap memory for rte_config\n");
+	if (rte_mem_cfg_addr == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
+			__func__);
+		return -1;
 	}
 	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
 	rte_config.mem_config = rte_mem_cfg_addr;
@@ -211,10 +220,11 @@ enum rte_iova_mode
 	 * processes could later map the config into this exact location */
 	rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
 
+	return 0;
 }
 
 /* attach to an existing shared memory config */
-static void
+static int
 rte_eal_config_attach(void)
 {
 	struct rte_mem_config *mem_config;
@@ -222,33 +232,40 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
-	if (mem_cfg_fd < 0){
+	if (mem_cfg_fd < 0) {
 		mem_cfg_fd = open(pathname, O_RDWR);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+						__func__, pathname);
+			return -1;
+		}
 	}
 
 	/* map it as read-only first */
 	mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
 			PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
-	if (mem_config == MAP_FAILED)
-		rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
-			  errno, strerror(errno));
+	if (mem_config == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config! error %i (%s)\n",
+				__func__, errno, strerror(errno));
+		return -1;
+	}
 
 	rte_config.mem_config = mem_config;
+
+	return 0;
 }
 
 /* reattach the shared config at exact memory location primary process has it */
-static void
+static int
 rte_eal_config_reattach(void)
 {
 	struct rte_mem_config *mem_config;
 	void *rte_mem_cfg_addr;
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	/* save the address primary process has mapped shared config to */
 	rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
@@ -263,16 +280,21 @@ enum rte_iova_mode
 	if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
 		if (mem_config != MAP_FAILED)
 			/* errno is stale, don't use */
-			rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]"
-				  " - please use '--base-virtaddr' option\n",
-				  rte_mem_cfg_addr, mem_config);
+			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+					"rte_config at [%p], got [%p] - please use "
+					"'--base-virtaddr' option\n",
+					__func__, rte_mem_cfg_addr, mem_config);
 		else
-			rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
-				  errno, strerror(errno));
+			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+					"rte_config! error %i (%s)\n",
+					__func__, errno, strerror(errno));
+		return -1;
 	}
 	close(mem_cfg_fd);
 
 	rte_config.mem_config = mem_config;
+
+	return 0;
 }
 
 /* Detect if we are a primary or a secondary process */
@@ -296,24 +318,31 @@ enum rte_proc_type_t
 }
 
 /* Sets up rte_config structure with the pointer to shared memory config.*/
-static void
+static int
 rte_config_init(void)
 {
 	rte_config.process_type = internal_config.process_type;
 
 	switch (rte_config.process_type){
 	case RTE_PROC_PRIMARY:
-		rte_eal_config_create();
+		if (rte_eal_config_create() != 0)
+			return -1;
 		break;
 	case RTE_PROC_SECONDARY:
-		rte_eal_config_attach();
+		if (rte_eal_config_attach() != 0)
+			return -1;
 		rte_eal_mcfg_wait_complete(rte_config.mem_config);
-		rte_eal_config_reattach();
+		if (rte_eal_config_reattach() != 0)
+			return -1;
 		break;
 	case RTE_PROC_AUTO:
 	case RTE_PROC_INVALID:
-		rte_panic("Invalid process type\n");
+		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
+				__func__, rte_config.process_type);
+		return -1;
 	}
+
+	return 0;
 }
 
 /* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
@@ -820,7 +849,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	rte_srand(rte_rdtsc());
 
-	rte_config_init();
+	if (rte_config_init() != 0)
+		return -1;
 
 	if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
 		rte_eal_init_alert("Cannot init logging.");
@@ -892,6 +922,9 @@ static void rte_eal_init_alert(const char *msg)
 
 	eal_thread_init_master(rte_config.master_lcore);
 
+	if (rte_get_panic_state())
+		return -1;
+
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
 	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
@@ -909,18 +942,27 @@ static void rte_eal_init_alert(const char *msg)
 		 * create communication pipes between master thread
 		 * and children
 		 */
-		if (pipe(lcore_config[i].pipe_master2slave) < 0)
-			rte_panic("Cannot create pipe\n");
-		if (pipe(lcore_config[i].pipe_slave2master) < 0)
-			rte_panic("Cannot create pipe\n");
+		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
+		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
 
 		lcore_config[i].state = WAIT;
 
 		/* create a thread for each lcore */
 		ret = pthread_create(&lcore_config[i].thread_id, NULL,
 				     eal_thread_loop, NULL);
-		if (ret != 0)
-			rte_panic("Cannot create thread\n");
+		if (ret != 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
+					__func__);
+			return -1;
+		}
 
 		/* Set thread_name for aid in debugging. */
 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
index 08e150b..3afcee5 100644
--- a/lib/librte_eal/linuxapp/eal/eal_thread.c
+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c
@@ -51,16 +51,22 @@
 	n = 0;
 	while (n == 0 || (n < 0 && errno == EINTR))
 		n = write(m2s, &c, 1);
-	if (n < 0)
-		rte_panic("cannot write on configuration pipe\n");
+	if (n < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	/* wait ack */
 	do {
 		n = read(s2m, &c, 1);
 	} while (n < 0 && errno == EINTR);
 
-	if (n <= 0)
-		rte_panic("cannot read on configuration pipe\n");
+	if (n <= 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	return 0;
 }
@@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		rte_move_to_panic_state();
+	}
+}
+
+/* move to panic state and do not return */
+static __attribute__((noreturn)) void
+defunct_and_remain_in_endless_loop(void)
+{
+	rte_move_to_panic_state();
+	while (1)
+		sleep(1);
 }
 
 /* main loop of threads */
@@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
 		if (thread_id == lcore_config[lcore_id].thread_id)
 			break;
 	}
-	if (lcore_id == RTE_MAX_LCORE)
-		rte_panic("cannot retrieve lcore id\n");
+	if (lcore_id == RTE_MAX_LCORE) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
+				__func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	m2s = lcore_config[lcore_id].pipe_master2slave[0];
 	s2m = lcore_config[lcore_id].pipe_slave2master[1];
@@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
 			n = read(m2s, &c, 1);
 		} while (n < 0 && errno == EINTR);
 
-		if (n <= 0)
-			rte_panic("cannot read on configuration pipe\n");
+		if (n <= 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		lcore_config[lcore_id].state = RUNNING;
 
@@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
 		n = 0;
 		while (n == 0 || (n < 0 && errno == EINTR))
 			n = write(s2m, &c, 1);
-		if (n < 0)
-			rte_panic("cannot write on configuration pipe\n");
-
-		if (lcore_config[lcore_id].f == NULL)
-			rte_panic("NULL function pointer\n");
+		if (n < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
+
+		if (lcore_config[lcore_id].f == NULL) {
+			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		/* call the function and store the return value */
 		fct_arg = lcore_config[lcore_id].arg;
-- 
1.8.3.1

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 09/11] eal: replace rte_panic instances in ethdev
  2018-04-19  6:00  3% [dpdk-dev] [PATCH v4 00/11] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (3 preceding siblings ...)
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 06/11] kni: replace rte_panic instances in kni Arnon Warshavsky
@ 2018-04-19  6:01  3% ` Arnon Warshavsky
  2018-04-19 17:27  0%   ` Kevin Traynor
  2018-04-19  6:01  2% ` [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence Arnon Warshavsky
  5 siblings, 1 reply; 200+ results
From: Arnon Warshavsky @ 2018-04-19  6:01 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Local function to this file,
changing from void to int is non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_ether/rte_ethdev.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 7821a88..9c13827 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -194,7 +194,7 @@ enum {
 	return port_id;
 }
 
-static void
+static int
 rte_eth_dev_shared_data_prepare(void)
 {
 	const unsigned flags = 0;
@@ -210,8 +210,12 @@ enum {
 					rte_socket_id(), flags);
 		} else
 			mz = rte_memzone_lookup(MZ_RTE_ETH_DEV_DATA);
-		if (mz == NULL)
-			rte_panic("Cannot allocate ethdev shared data\n");
+		if (mz == NULL) {
+			rte_spinlock_unlock(&rte_eth_shared_data_lock);
+			RTE_LOG(CRIT, EAL, "%s(): Cannot allocate ethdev shared data\n",
+					__func__);
+			return -1;
+		}
 
 		rte_eth_dev_shared_data = mz->addr;
 		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
@@ -224,6 +228,8 @@ enum {
 	}
 
 	rte_spinlock_unlock(&rte_eth_shared_data_lock);
+
+	return 0;
 }
 
 struct rte_eth_dev *
@@ -274,7 +280,8 @@ struct rte_eth_dev *
 	uint16_t port_id;
 	struct rte_eth_dev *eth_dev = NULL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return NULL;
 
 	/* Synchronize port creation between primary and secondary threads. */
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
@@ -317,7 +324,8 @@ struct rte_eth_dev *
 	uint16_t i;
 	struct rte_eth_dev *eth_dev = NULL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return NULL;
 
 	/* Synchronize port attachment to primary port creation and release. */
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
@@ -345,7 +353,8 @@ struct rte_eth_dev *
 	if (eth_dev == NULL)
 		return -EINVAL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -399,7 +408,8 @@ struct rte_eth_dev *
 int __rte_experimental
 rte_eth_dev_owner_new(uint64_t *owner_id)
 {
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -450,7 +460,8 @@ struct rte_eth_dev *
 {
 	int ret;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -467,7 +478,8 @@ struct rte_eth_dev *
 			{.id = RTE_ETH_DEV_NO_OWNER, .name = ""};
 	int ret;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -482,7 +494,8 @@ struct rte_eth_dev *
 {
 	uint16_t port_id;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -502,7 +515,8 @@ struct rte_eth_dev *
 {
 	int ret = 0;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 06/11] kni: replace rte_panic instances in kni
  2018-04-19  6:00  3% [dpdk-dev] [PATCH v4 00/11] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (2 preceding siblings ...)
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 04/11] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
@ 2018-04-19  6:01  3% ` Arnon Warshavsky
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 09/11] eal: replace rte_panic instances in ethdev Arnon Warshavsky
  2018-04-19  6:01  2% ` [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence Arnon Warshavsky
  5 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-19  6:01 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and retrun value.
Local function to this file,
changing from void to int is non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_kni/rte_kni.c      | 18 ++++++++++++------
 lib/librte_kni/rte_kni_fifo.h | 11 ++++++++---
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 8a8f6c1..4dac407 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -353,37 +353,43 @@ struct rte_kni *
 	/* TX RING */
 	mz = slot->m_tx_q;
 	ctx->tx_q = mz->addr;
-	kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.tx_phys = mz->phys_addr;
 
 	/* RX RING */
 	mz = slot->m_rx_q;
 	ctx->rx_q = mz->addr;
-	kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.rx_phys = mz->phys_addr;
 
 	/* ALLOC RING */
 	mz = slot->m_alloc_q;
 	ctx->alloc_q = mz->addr;
-	kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.alloc_phys = mz->phys_addr;
 
 	/* FREE RING */
 	mz = slot->m_free_q;
 	ctx->free_q = mz->addr;
-	kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.free_phys = mz->phys_addr;
 
 	/* Request RING */
 	mz = slot->m_req_q;
 	ctx->req_q = mz->addr;
-	kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.req_phys = mz->phys_addr;
 
 	/* Response RING */
 	mz = slot->m_resp_q;
 	ctx->resp_q = mz->addr;
-	kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.resp_phys = mz->phys_addr;
 
 	/* Req/Resp sync mem area */
diff --git a/lib/librte_kni/rte_kni_fifo.h b/lib/librte_kni/rte_kni_fifo.h
index ac26a8c..5052015 100644
--- a/lib/librte_kni/rte_kni_fifo.h
+++ b/lib/librte_kni/rte_kni_fifo.h
@@ -7,17 +7,22 @@
 /**
  * Initializes the kni fifo structure
  */
-static void
+static int
 kni_fifo_init(struct rte_kni_fifo *fifo, unsigned size)
 {
 	/* Ensure size is power of 2 */
-	if (size & (size - 1))
-		rte_panic("KNI fifo size must be power of 2\n");
+	if (size & (size - 1)) {
+		RTE_LOG(CRIT, EAL, "%s(): KNI fifo size must be power of 2\n",
+				__func__);
+		return -1;
+	}
 
 	fifo->write = 0;
 	fifo->read = 0;
 	fifo->len = size;
 	fifo->elem_size = sizeof(void *);
+
+	return 0;
 }
 
 /**
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 04/11] ixgbe: replace rte_panic instances in ixgbe driver
  2018-04-19  6:00  3% [dpdk-dev] [PATCH v4 00/11] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 02/11] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 03/11] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
@ 2018-04-19  6:01  3% ` Arnon Warshavsky
  2018-04-19 17:26  0%   ` Kevin Traynor
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 06/11] kni: replace rte_panic instances in kni Arnon Warshavsky
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 200+ results
From: Arnon Warshavsky @ 2018-04-19  6:01 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and retrun value.
Local function to this file,
changing from void to int is non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/ixgbe/ixgbe_ethdev.c |  3 ++-
 drivers/net/ixgbe/ixgbe_ethdev.h |  2 +-
 drivers/net/ixgbe/ixgbe_pf.c     | 13 +++++++++----
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index a5e2fc0..c7797f1 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -1224,7 +1224,8 @@ struct rte_ixgbe_xstats_name_off {
 	memset(hwstrip, 0, sizeof(*hwstrip));
 
 	/* initialize PF if max_vfs not zero */
-	ixgbe_pf_host_init(eth_dev);
+	if (ixgbe_pf_host_init(eth_dev) != 0)
+		return -1;
 
 	ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT);
 	/* let hardware know driver is loaded */
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index 6550777..8bb41ec 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -661,7 +661,7 @@ int ixgbe_fdir_filter_program(struct rte_eth_dev *dev,
 
 void ixgbe_vlan_hw_strip_config(struct rte_eth_dev *dev);
 
-void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
+int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
 
 void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev);
 
diff --git a/drivers/net/ixgbe/ixgbe_pf.c b/drivers/net/ixgbe/ixgbe_pf.c
index 4e61310..4cd3651 100644
--- a/drivers/net/ixgbe/ixgbe_pf.c
+++ b/drivers/net/ixgbe/ixgbe_pf.c
@@ -66,7 +66,7 @@ int ixgbe_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
 	return 0;
 }
 
-void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
+int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 {
 	struct ixgbe_vf_info **vfinfo =
 		IXGBE_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
@@ -84,11 +84,14 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
 	vf_num = dev_num_vf(eth_dev);
 	if (vf_num == 0)
-		return;
+		return 0;
 
 	*vfinfo = rte_zmalloc("vf_info", sizeof(struct ixgbe_vf_info) * vf_num, 0);
-	if (*vfinfo == NULL)
-		rte_panic("Cannot allocate memory for private VF data\n");
+	if (*vfinfo == NULL) {
+		RTE_LOG(ERR, PMD, "%s() Cannot allocate memory for private VF data\n",
+				__func__);
+		return -1;
+	}
 
 	memset(mirror_info, 0, sizeof(struct ixgbe_mirror_info));
 	memset(uta_info, 0, sizeof(struct ixgbe_uta_info));
@@ -116,6 +119,8 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 
 	/* set mb interrupt mask */
 	ixgbe_mb_intr_setup(eth_dev);
+
+	return 0;
 }
 
 void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev)
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 03/11] e1000: replace rte_panic instances in e1000 driver
  2018-04-19  6:00  3% [dpdk-dev] [PATCH v4 00/11] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 02/11] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
@ 2018-04-19  6:01  3% ` Arnon Warshavsky
  2018-04-19 17:25  0%   ` Kevin Traynor
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 04/11] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 200+ results
From: Arnon Warshavsky @ 2018-04-19  6:01 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and retrun value.
Local function to this file,
changing from void to int is non-abi-breaking
--
v4 - keep error message literal string in a singhle line

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/e1000/e1000_ethdev.h |  2 +-
 drivers/net/e1000/igb_ethdev.c   |  3 ++-
 drivers/net/e1000/igb_pf.c       | 15 +++++++++------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 6354b89..2e527de 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -411,7 +411,7 @@ int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
 /*
  * misc function prototypes
  */
-void igb_pf_host_init(struct rte_eth_dev *eth_dev);
+int igb_pf_host_init(struct rte_eth_dev *eth_dev);
 
 void igb_pf_mbx_process(struct rte_eth_dev *eth_dev);
 
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 9b808a9..4479616 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -833,7 +833,8 @@ static int igb_flex_filter_uninit(struct rte_eth_dev *eth_dev)
 	}
 
 	/* initialize PF if max_vfs not zero */
-	igb_pf_host_init(eth_dev);
+	if (igb_pf_host_init(eth_dev) != 0)
+		goto err_late;
 
 	ctrl_ext = E1000_READ_REG(hw, E1000_CTRL_EXT);
 	/* Set PF Reset Done bit so PF/VF Mail Ops can work */
diff --git a/drivers/net/e1000/igb_pf.c b/drivers/net/e1000/igb_pf.c
index b9f2e53..ae4b0a4 100644
--- a/drivers/net/e1000/igb_pf.c
+++ b/drivers/net/e1000/igb_pf.c
@@ -63,7 +63,7 @@ int igb_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
 	return 0;
 }
 
-void igb_pf_host_init(struct rte_eth_dev *eth_dev)
+int igb_pf_host_init(struct rte_eth_dev *eth_dev)
 {
 	struct e1000_vf_info **vfinfo =
 		E1000_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
@@ -74,7 +74,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 
 	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
 	if (0 == (vf_num = dev_num_vf(eth_dev)))
-		return;
+		return 0;
 
 	if (hw->mac.type == e1000_i350)
 		nb_queue = 1;
@@ -82,11 +82,14 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 		/* per datasheet, it should be 2, but 1 seems correct */
 		nb_queue = 1;
 	else
-		return;
+		return 0;
 
 	*vfinfo = rte_zmalloc("vf_info", sizeof(struct e1000_vf_info) * vf_num, 0);
-	if (*vfinfo == NULL)
-		rte_panic("Cannot allocate memory for private VF data\n");
+	if (*vfinfo == NULL) {
+		RTE_LOG(CRIT, PMD, "%s(): Cannot allocate memory for private VF data\n",
+			__func__);
+		return -1;
+	}
 
 	RTE_ETH_DEV_SRIOV(eth_dev).active = ETH_8_POOLS;
 	RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool = nb_queue;
@@ -98,7 +101,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 	/* set mb interrupt mask */
 	igb_mb_intr_setup(eth_dev);
 
-	return;
+	return 0;
 }
 
 void igb_pf_host_uninit(struct rte_eth_dev *dev)
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 02/11] bond: replace rte_panic instances in bonding driver
  2018-04-19  6:00  3% [dpdk-dev] [PATCH v4 00/11] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
@ 2018-04-19  6:01  3% ` Arnon Warshavsky
  2018-04-19 17:25  0%   ` Kevin Traynor
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 03/11] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 200+ results
From: Arnon Warshavsky @ 2018-04-19  6:01 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and retrun value.
Local functions to this file,
changing from void to int are non-abi-breaking
--
v4 - fix split literal strings in log messages

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/bonding/rte_eth_bond_8023ad.c         | 28 +++++++++++++++--------
 drivers/net/bonding/rte_eth_bond_8023ad_private.h |  2 +-
 drivers/net/bonding/rte_eth_bond_api.c            | 20 +++++++++++-----
 drivers/net/bonding/rte_eth_bond_pmd.c            |  9 +++++---
 drivers/net/bonding/rte_eth_bond_private.h        |  2 +-
 5 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c b/drivers/net/bonding/rte_eth_bond_8023ad.c
index c452318..7512901 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad.c
+++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
@@ -893,7 +893,7 @@
 			bond_mode_8023ad_periodic_cb, arg);
 }
 
-void
+int
 bond_mode_8023ad_activate_slave(struct rte_eth_dev *bond_dev,
 				uint16_t slave_id)
 {
@@ -939,7 +939,7 @@
 	timer_cancel(&port->warning_timer);
 
 	if (port->mbuf_pool != NULL)
-		return;
+		return 0;
 
 	RTE_ASSERT(port->rx_ring == NULL);
 	RTE_ASSERT(port->tx_ring == NULL);
@@ -968,8 +968,9 @@
 	/* Any memory allocation failure in initialization is critical because
 	 * resources can't be free, so reinitialization is impossible. */
 	if (port->mbuf_pool == NULL) {
-		rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
-			slave_id, mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create memory pool '%s': %s\n",
+			__func__, slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
 
 	snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_rx", slave_id);
@@ -977,8 +978,9 @@
 			rte_align32pow2(BOND_MODE_8023AX_SLAVE_RX_PKTS), socket_id, 0);
 
 	if (port->rx_ring == NULL) {
-		rte_panic("Slave %u: Failed to create rx ring '%s': %s\n", slave_id,
-			mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create rx ring '%s': %s\n",
+			__func__, slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
 
 	/* TX ring is at least one pkt longer to make room for marker packet. */
@@ -987,9 +989,12 @@
 			rte_align32pow2(BOND_MODE_8023AX_SLAVE_TX_PKTS + 1), socket_id, 0);
 
 	if (port->tx_ring == NULL) {
-		rte_panic("Slave %u: Failed to create tx ring '%s': %s\n", slave_id,
-			mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Fail to create tx ring '%s': %s\n",
+			__func__, slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
+
+	return 0;
 }
 
 int
@@ -1143,9 +1148,12 @@
 	struct bond_dev_private *internals = bond_dev->data->dev_private;
 	uint8_t i;
 
-	for (i = 0; i < internals->active_slave_count; i++)
-		bond_mode_8023ad_activate_slave(bond_dev,
+	for (i = 0; i < internals->active_slave_count; i++) {
+		int rc = bond_mode_8023ad_activate_slave(bond_dev,
 				internals->active_slaves[i]);
+		if (rc != 0)
+			return rc;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/bonding/rte_eth_bond_8023ad_private.h b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
index 0f490a5..96a42f2 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad_private.h
+++ b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
@@ -263,7 +263,7 @@ struct mode8023ad_private {
  * @return
  *  0 on success, negative value otherwise.
  */
-void
+int
 bond_mode_8023ad_activate_slave(struct rte_eth_dev *dev, uint16_t port_id);
 
 /**
diff --git a/drivers/net/bonding/rte_eth_bond_api.c b/drivers/net/bonding/rte_eth_bond_api.c
index aa89425..96aa1ff 100644
--- a/drivers/net/bonding/rte_eth_bond_api.c
+++ b/drivers/net/bonding/rte_eth_bond_api.c
@@ -69,14 +69,15 @@
 	return 0;
 }
 
-void
+int
 activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id)
 {
 	struct bond_dev_private *internals = eth_dev->data->dev_private;
 	uint8_t active_count = internals->active_slave_count;
 
 	if (internals->mode == BONDING_MODE_8023AD)
-		bond_mode_8023ad_activate_slave(eth_dev, port_id);
+		if (bond_mode_8023ad_activate_slave(eth_dev, port_id) != 0)
+			return -1;
 
 	if (internals->mode == BONDING_MODE_TLB
 			|| internals->mode == BONDING_MODE_ALB) {
@@ -357,10 +358,17 @@
 				bond_ethdev_primary_set(internals,
 							slave_port_id);
 
-			if (find_slave_by_id(internals->active_slaves,
-					     internals->active_slave_count,
-					     slave_port_id) == internals->active_slave_count)
-				activate_slave(bonded_eth_dev, slave_port_id);
+			int rc =
+				find_slave_by_id(internals->active_slaves,
+					internals->active_slave_count,
+					slave_port_id);
+
+			if (rc == internals->active_slave_count) {
+				int rc = activate_slave(bonded_eth_dev,
+							slave_port_id);
+				if (rc != 0)
+					return -1;
+			}
 		}
 	}
 
diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c b/drivers/net/bonding/rte_eth_bond_pmd.c
index 2805c71..2d9052d 100644
--- a/drivers/net/bonding/rte_eth_bond_pmd.c
+++ b/drivers/net/bonding/rte_eth_bond_pmd.c
@@ -1741,8 +1741,10 @@ struct bwg_slave {
 		/* Any memory allocation failure in initialization is critical because
 		 * resources can't be free, so reinitialization is impossible. */
 		if (port->slow_pool == NULL) {
-			rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
-				slave_id, mem_name, rte_strerror(rte_errno));
+			RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create memory pool '%s': %s\n",
+				__func__, slave_id,
+				mem_name, rte_strerror(rte_errno));
+			return -1;
 		}
 	}
 
@@ -2673,7 +2675,8 @@ struct bwg_slave {
 			mac_address_slaves_update(bonded_eth_dev);
 		}
 
-		activate_slave(bonded_eth_dev, port_id);
+		if (activate_slave(bonded_eth_dev, port_id) != 0)
+			return -1;
 
 		/* If user has defined the primary port then default to using it */
 		if (internals->user_defined_primary_port &&
diff --git a/drivers/net/bonding/rte_eth_bond_private.h b/drivers/net/bonding/rte_eth_bond_private.h
index 94eca88..d99d42c 100644
--- a/drivers/net/bonding/rte_eth_bond_private.h
+++ b/drivers/net/bonding/rte_eth_bond_private.h
@@ -187,7 +187,7 @@ struct bond_dev_private {
 void
 deactivate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
 
-void
+int
 activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
 
 void
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 00/11] eal: replace calls to rte_panic and refrain from new instances
@ 2018-04-19  6:00  3% Arnon Warshavsky
  2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 02/11] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
                   ` (5 more replies)
  0 siblings, 6 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-19  6:00 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

The purpose of this patch series is to cleanup the library code
from paths that end up aborting the process,
and move to checking error values, in order to allow the running process
perform an orderly teardown or other mitigation of the event.

This patch modifies the majority of rte_panic calls
under lib and drivers, and replaces them with a log message
and an error return code according to context,
that can be propagated up the call stack.

- Focus was given to the dpdk initialization path
- Some of the panic calls within drivers were left in place where
  the call is from within an interrupt or calls that are
  on the data path,where there is no simple applicative
  route to propagate the error to temination.
  These should be handled by the driver maintainers.
- In order to avoid breaking ABI where panic was called from public
  void functions, a panic state variable was introduced so that
  it can be queried after calling these void functions.
  This tool place for a single function call.
- local void functions with no api were changed to retrun a value
  where needed
- No change took place in example and test files
- No change took place for debug assertions calling panic
- A new function was added to devtools/checkpatches.sh
  in order to prevent new additions of calls to rte_panic
  under lib and drivers.

Keep calm and don't panic

---

v2:
- reformat error messages so that literal string are in the same line
- fix typo in commit message
- add new return code to doxigen of rte_memzone_free()

v3:
- submit  all 13 patches changed and unchanged in the same patchset

v4:
- remove 2 patches that are no more relevant
- fix split literal string in error message
- change return value -1 to enum
- split value and success code in a static function

Arnon Warshavsky (11):
  crypto: replace rte_panic instances in crypto driver
  bond: replace rte_panic instances in bonding driver
  e1000: replace rte_panic instances in e1000 driver
  ixgbe: replace rte_panic instances in ixgbe driver
  eal: replace rte_panic instances in eventdev
  kni: replace rte_panic instances in kni
  eal: replace rte_panic instances in hugepage_info
  eal: replace rte_panic instances in interrupts thread
  eal: replace rte_panic instances in ethdev
  eal: replace rte_panic instances in init sequence
  devtools: prevent new instances of rte_panic and rte_exit

 devtools/checkpatches.sh                          |  94 ++++++++++++++++-
 drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c       |   8 +-
 drivers/crypto/dpaa_sec/dpaa_sec.c                |   8 +-
 drivers/net/bonding/rte_eth_bond_8023ad.c         |  28 +++--
 drivers/net/bonding/rte_eth_bond_8023ad_private.h |   2 +-
 drivers/net/bonding/rte_eth_bond_api.c            |  20 ++--
 drivers/net/bonding/rte_eth_bond_pmd.c            |   9 +-
 drivers/net/bonding/rte_eth_bond_private.h        |   2 +-
 drivers/net/e1000/e1000_ethdev.h                  |   2 +-
 drivers/net/e1000/igb_ethdev.c                    |   3 +-
 drivers/net/e1000/igb_pf.c                        |  15 +--
 drivers/net/ixgbe/ixgbe_ethdev.c                  |   3 +-
 drivers/net/ixgbe/ixgbe_ethdev.h                  |   2 +-
 drivers/net/ixgbe/ixgbe_pf.c                      |  13 ++-
 lib/librte_eal/bsdapp/eal/eal.c                   |  86 +++++++++++-----
 lib/librte_eal/bsdapp/eal/eal_thread.c            |  65 +++++++++---
 lib/librte_eal/common/eal_common_launch.c         |  21 ++++
 lib/librte_eal/common/include/rte_debug.h         |  12 +++
 lib/librte_eal/linuxapp/eal/eal.c                 | 120 +++++++++++++++-------
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c   |  32 ++++--
 lib/librte_eal/linuxapp/eal/eal_interrupts.c      |  27 +++--
 lib/librte_eal/linuxapp/eal/eal_thread.c          |  65 +++++++++---
 lib/librte_ether/rte_ethdev.c                     |  36 +++++--
 lib/librte_eventdev/rte_eventdev_pmd_pci.h        |   8 +-
 lib/librte_eventdev/rte_eventdev_pmd_vdev.h       |   8 +-
 lib/librte_kni/rte_kni.c                          |  18 ++--
 lib/librte_kni/rte_kni_fifo.h                     |  11 +-
 27 files changed, 533 insertions(+), 185 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] Retire x86 32 bit?
  2018-04-17 21:18  3%       ` Roger B Melton
@ 2018-04-18 17:40  0%         ` Jim Murphy
  0 siblings, 0 replies; 200+ results
From: Jim Murphy @ 2018-04-18 17:40 UTC (permalink / raw)
  To: Roger B Melton; +Cc: Stephen Hemminger, David Harton (dharton), dev

Same for our case.

On Tue, Apr 17, 2018 at 2:18 PM, Roger B Melton <rmelton@cisco.com> wrote:

> On 4/17/18 4:46 PM, Stephen Hemminger wrote:
>
>> On Tue, 17 Apr 2018 13:01:14 -0700
>> Jim Murphy <jmurphy@arista.com> wrote:
>>
>> Still used in certain memory constrained environments.
>>>
>>>
>>> On Tue, Apr 17, 2018 at 11:39 AM, David Harton (dharton) <
>>> dharton@cisco.com>
>>> wrote:
>>>
>>> It is used and tested in production and non-production environments.
>>>>
>>>> Regards,
>>>> Dave
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: dev <dev-bounces@dpdk.org> On Behalf Of Stephen Hemminger
>>>>> Sent: Tuesday, April 17, 2018 2:31 PM
>>>>> To: dev@dpdk.org
>>>>> Subject: [dpdk-dev] Retire x86 32 bit?
>>>>>
>>>>> I wonder if x86 32 bit is still useful?
>>>>> Many distributions no longer support it, and not sure if it is tested
>>>>> througly by anyone.
>>>>>
>>>>> Maybe time to deprecate it (gradually)?
>>>>>
>>>>
>>>>
>>> Pure 32 bit, or x86-64 instructions and registers used in 32 bit mode
>> (which can be faster).
>> .
>>
>>
> Pure 32bit in our case.  We do not use x32 ABI.
>
> -Roger
>
>
>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
  2018-04-18 11:55  0%       ` Xueming(Steven) Li
@ 2018-04-18 15:11  0%         ` Iremonger, Bernard
  2018-04-19 14:24  0%           ` Xueming(Steven) Li
  0 siblings, 1 reply; 200+ results
From: Iremonger, Bernard @ 2018-04-18 15:11 UTC (permalink / raw)
  To: Xueming(Steven) Li, Lu, Wenzhuo, Wu, Jingjing, Thomas Monjalon,
	Adrien Mazarguil
  Cc: Nélio Laranjeiro, Shahaf Shuler, dev, Olivier Matz

Hi Li

<snip>

> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xueming Li
> > > Sent: Tuesday, April 17, 2018 4:04 PM
> > > To: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing
> > > <jingjing.wu@intel.com>; Thomas Monjalon <thomas@monjalon.net>;
> > > Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > > Cc: Xueming Li <xuemingl@mellanox.com>; Nelio Laranjeiro
> > > <nelio.laranjeiro@6wind.com>; Shahaf Shuler <shahafs@mellanox.com>;
> > > dev@dpdk.org; Olivier Matz <olivier.matz@6wind.com>
> > > Subject: [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
> > >
> > > v5:
> > > - Fixed VXLAN-GPE comment alignment
> > > v4:
> > > - Update testpmd doc for flow VXLAN-GPE paramter.
> > > v3:
> > > - Change VXLAN-GPE definition order to avoid ABI compatibility issue.
> > > v2:
> > > - Split patch set into public and mlx5 two series, this one is the first.
> > > v1:
> > > - Support new tunnel type MPLS-in-GRE and MPLS-in-UDP
> > > - Remove deprecation notes of rss level
> > >
> > > This patchset introduced new tunnel type and related testpmd code:
> > > - New tunnel type VXLAN-GPE
> > >
> > > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fd
> > > at
> > > atracker.ietf.org%2Fdoc%2Fdraft-ietf-nvo3-vxlan-gpe%2F&data=02%7C01%
> > > 7C
> > >
> xuemingl%40mellanox.com%7C2dffef40890b4cf8ff9d08d5a47d0420%7Ca65297
> 1
> > > c7
> > >
> d2e4d9ba6a4d149256f461b%7C0%7C0%7C636595779231620631&sdata=%2Bv
> x%2Fg
> > > VB
> > > 3e3BHI%2BYxPxOIpqK6CuKvQQ8qej4B1Faxihc%3D&reserved=0
> > > - New tunnel type MPLS-in-GRE
> > >
> > > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Ft
> > > oo
> > >
> ls.ietf.org%2Fhtml%2Frfc4023&data=02%7C01%7Cxuemingl%40mellanox.com%
> > > 7C
> > >
> 2dffef40890b4cf8ff9d08d5a47d0420%7Ca652971c7d2e4d9ba6a4d149256f461b
> %
> > > 7C
> > >
> 0%7C0%7C636595779231620631&sdata=wk2wvoB9LSbI2LfHZVvWzmtgS0XQbG
> NMFL4
> > > G1
> > > kyr77E%3D&reserved=0
> > > - New tunnel type MPLS-in-UDP
> > >
> > > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Ft
> > > oo
> > >
> ls.ietf.org%2Fhtml%2Frfc7510&data=02%7C01%7Cxuemingl%40mellanox.com%
> > > 7C
> > >
> 2dffef40890b4cf8ff9d08d5a47d0420%7Ca652971c7d2e4d9ba6a4d149256f461b
> %
> > > 7C
> > >
> 0%7C0%7C636595779231620631&sdata=4RXbPD9tV7ArcnKsK8IJy%2B9XbKlzKc
> NBS
> > > v1
> > > LXVwwuPs%3D&reserved=0
> > > - Support GRE extension in testpmd csum forwarding engine
> > >
> > > Xueming Li (4):
> > >   doc: remove RSS configuration change announcement
> > >   ethdev: introduce new tunnel VXLAN-GPE
> > >   app/testpmd: introduce new tunnel VXLAN-GPE
> > >   app/testpmd: add more GRE extension support to csum engine
> > >
> > >  app/test-pmd/cmdline_flow.c                 |  24 +++++++
> > >  app/test-pmd/config.c                       |   2 +
> > >  app/test-pmd/csumonly.c                     | 103 +++++++++++++++++++++++++--
> -
> > >  app/test-pmd/parameters.c                   |  12 +++-
> > >  app/test-pmd/testpmd.h                      |   2 +
> > >  doc/guides/prog_guide/rte_flow.rst          |  12 ++++
> > >  doc/guides/rel_notes/deprecation.rst        |   4 --
> > >  doc/guides/testpmd_app_ug/run_app.rst       |   5 ++
> > >  doc/guides/testpmd_app_ug/testpmd_funcs.rst |   4 ++
> > >  lib/librte_ether/rte_eth_ctrl.h             |   3 +-
> > >  lib/librte_ether/rte_flow.c                 |   1 +
> > >  lib/librte_ether/rte_flow.h                 |  27 ++++++++
> > >  lib/librte_mbuf/rte_mbuf.c                  |   3 +
> > >  lib/librte_mbuf/rte_mbuf.h                  |   1 +
> > >  lib/librte_mbuf/rte_mbuf_ptype.c            |   1 +
> > >  lib/librte_mbuf/rte_mbuf_ptype.h            |  13 ++++
> > >  lib/librte_net/rte_ether.h                  |  25 +++++++
> > >  17 files changed, 225 insertions(+), 17 deletions(-)
> > >
> > > --
> > > 2.13.3
> >
> > Patch 3 of this patch set fails to apply to the latest master, the other three
> patches apply ok.
> >
> > Regards,
> >
> > Bernard.
> >
> I tried it with orgin/master branch and it worked for me.
> Could you please share more information?
> 
> Best Regards,
> Xueming Li

I have just cloned the current dpdk master, patch 3 still fails to apply, dpdk-dev-v5-3-4-app-testpmd-introduce-new-tunnel-VXLAN-GPE.patch

git am ./dpdk-dev-v5-3-4-app-testpmd-introduce-new-tunnel-VXLAN-GPE.patch 
Applying: app/testpmd: introduce new tunnel VXLAN-GPE
error: patch failed: app/test-pmd/config.c:997
error: app/test-pmd/config.c: patch does not apply
Patch failed at 0001 app/testpmd: introduce new tunnel VXLAN-GPE
The copy of the patch that failed is found in:
   /root/dpdk_temp/.git/rebase-apply/patch
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort

Regards,

Bernard.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions
  2018-04-18 12:26  0%         ` Andrew Rybchenko
@ 2018-04-18 14:58  0%           ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-18 14:58 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Ajit Khaparde, Wenzhuo Lu,
	John Daley, Gaetan Rivet, Beilei Xing, Konstantin Ananyev,
	Nelio Laranjeiro, Pascal Mazon

On Wed, Apr 18, 2018 at 03:26:00PM +0300, Andrew Rybchenko wrote:
> On 04/16/2018 07:22 PM, Adrien Mazarguil wrote:
> > This patch makes the following changes to flow rule actions:
> > 
> > - List order now matters, they are redefined as performed first to last
> >    instead of "all simultaneously".
> > 
> > - Repeated actions are now supported (e.g. specifying QUEUE multiple times
> >    now duplicates traffic among them). Previously only the last action of
> >    any given kind was taken into account.
> > 
> > - No more distinction between terminating/non-terminating/meta actions.
> >    Flow rules themselves are now defined as always terminating unless a
> >    PASSTHRU action is specified.
> > 
> > These changes alter the behavior of flow rules in corner cases in order to
> > prepare the flow API for actions that modify traffic contents or properties
> > (e.g. encapsulation, compression) and for which order matter when combined.
> > 
> > Previously one would have to do so through multiple flow rules by combining
> > PASSTRHU with priority levels, however this proved overly complex to
> > implement at the PMD level, hence this simpler approach.
> > 
> > This breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_create()
> > - rte_flow_validate()
> > 
> > PMDs with rte_flow support are modified accordingly:
> > 
> > - bnxt: no change, implementation already forbids multiple actions and does
> >    not support PASSTHRU.
> > 
> > - e1000: no change, same as bnxt.
> > 
> > - enic: modified to forbid redundant actions, no support for default drop.
> > 
> > - failsafe: no change needed.
> > 
> > - i40e: no change, implementation already forbids multiple actions.
> > 
> > - ixgbe: same as i40e.
> > 
> > - mlx4: modified to forbid multiple fate-deciding actions and drop when
> >    unspecified.
> > 
> > - mlx5: same as mlx4, with other redundant actions also forbidden.
> > 
> > - sfc: same as mlx4.
> > 
> > - tap: implementation already complies with the new behavior except for
> >    the default pass-through modified as a default drop.
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Reviewed-by: Andrew Rybchenko <arybchenko@oktetlabs.ru>
> > Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> > Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > Cc: John Daley <johndale@cisco.com>
> > Cc: Gaetan Rivet <gaetan.rivet@6wind.com>
> > Cc: Beilei Xing <beilei.xing@intel.com>
> > Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> > Cc: Pascal Mazon <pascal.mazon@6wind.com>
> > ---
> >   doc/guides/prog_guide/rte_flow.rst | 67 +++++++++++++-------------------
> >   drivers/net/enic/enic_flow.c       | 25 ++++++++++++
> >   drivers/net/mlx4/mlx4_flow.c       | 21 +++++++---
> >   drivers/net/mlx5/mlx5_flow.c       | 69 ++++++++++++++-------------------
> >   drivers/net/sfc/sfc_flow.c         | 22 +++++++----
> >   drivers/net/tap/tap_flow.c         | 11 ++++++
> >   lib/librte_ether/rte_flow.h        | 54 +++++++-------------------
> >   7 files changed, 138 insertions(+), 131 deletions(-)
> 
> [...]
> 
> > diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
> > index b9f36587c..a5c6a1670 100644
> > --- a/drivers/net/enic/enic_flow.c
> > +++ b/drivers/net/enic/enic_flow.c
> > @@ -975,6 +979,10 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
> >   			const struct rte_flow_action_queue *queue =
> >   				(const struct rte_flow_action_queue *)
> >   				actions->conf;
> > +
> > +			if (overlap & FATE)
> > +				return ENOTSUP;
> > +			overlap |= FATE;
> >   			enic_action->rq_idx =
> >   				enic_rte_rq_idx_to_sop_idx(queue->index);
> >   			break;
> > @@ -984,6 +992,8 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
> >   			break;
> >   		}
> >   	}
> > +	if (!overlap & FATE)
> 
> Build using clang on Ubuntu 17.10 fails:
> 
> /var/tmp/dpdk-next-net/drivers/net/enic/enic_flow.c:1000:6: fatal error:
> logical not is only applied to
>       the left hand side of this bitwise operator
> [-Wlogical-not-parentheses]
>         if (!overlap & FATE)
>             ^        ~
> /var/tmp/dpdk-next-net/drivers/net/enic/enic_flow.c:1000:6: note: add
> parentheses after the '!' to
>       evaluate the bitwise operator first
>         if (!overlap & FATE)
>             ^
>              (             )
> /var/tmp/dpdk-next-net/drivers/net/enic/enic_flow.c:1000:6: note: add
> parentheses around left hand side
>       expression to silence this warning
>         if (!overlap & FATE)
>             ^
>             (       )
> 1 error generated.
> /var/tmp/dpdk-next-net/mk/internal/rte.compile-pre.mk:114: recipe for target
> 'enic_flow.o' failed
> make[4]: *** [enic_flow.o] Error 1
> /var/tmp/dpdk-next-net/mk/rte.subdir.mk:35: recipe for target 'enic' failed
> make[3]: *** [enic] Error 2
>   CC nfp_cpp_pcie_ops.o
> make[3]: *** Waiting for unfinished jobs....
> 
> $ clang --version
> clang version 4.0.1-6 (tags/RELEASE_401/final)
> Target: x86_64-pc-linux-gnu
> Thread model: posix
> InstalledDir: /usr/bin
> 
> 
> > +		return ENOTSUP;
> >   	enic_action->type = FILTER_ACTION_RQ_STEERING;
> >   	return 0;
> >   }
> > @@ -1001,6 +1011,9 @@ static int
> >   enic_copy_action_v2(const struct rte_flow_action actions[],
> >   		    struct filter_action_v2 *enic_action)
> >   {
> > +	enum { FATE = 1, MARK = 2, };
> > +	uint32_t overlap = 0;
> > +
> >   	FLOW_TRACE();
> >   	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
> > @@ -1009,6 +1022,10 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
> >   			const struct rte_flow_action_queue *queue =
> >   				(const struct rte_flow_action_queue *)
> >   				actions->conf;
> > +
> > +			if (overlap & FATE)
> > +				return ENOTSUP;
> > +			overlap |= FATE;
> >   			enic_action->rq_idx =
> >   				enic_rte_rq_idx_to_sop_idx(queue->index);
> >   			enic_action->flags |= FILTER_ACTION_RQ_STEERING_FLAG;
> > @@ -1019,6 +1036,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
> >   				(const struct rte_flow_action_mark *)
> >   				actions->conf;
> > +			if (overlap & MARK)
> 
> Same
> 
> > +				return ENOTSUP;
> > +			overlap |= MARK;
> >   			/* ENIC_MAGIC_FILTER_ID is reserved and is the highest
> >   			 * in the range of allows mark ids.
> >   			 */
> > @@ -1029,6 +1049,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
> >   			break;
> >   		}
> >   		case RTE_FLOW_ACTION_TYPE_FLAG: {
> > +			if (overlap & MARK)
> > +				return ENOTSUP;
> > +			overlap |= MARK;
> >   			enic_action->filter_id = ENIC_MAGIC_FILTER_ID;
> >   			enic_action->flags |= FILTER_ACTION_FILTER_ID_FLAG;
> >   			break;
> > @@ -1044,6 +1067,8 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
> >   			break;
> >   		}
> >   	}
> > +	if (!overlap & FATE)
> 
> Same
> 
> > +		return ENOTSUP;
> >   	enic_action->type = FILTER_ACTION_V2;
> >   	return 0;
> >   }
> 
> [...]

Thanks for reporting. These "!overlap" checks are indeed messy, oddly my own
compilation tests with GCC and clang did not report them.

I'll submit an updated version.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
@ 2018-04-18 12:26  0%         ` Andrew Rybchenko
  2018-04-18 14:58  0%           ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-18 12:26 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Wenzhuo Lu, John Daley, Gaetan Rivet, Beilei Xing,
	Konstantin Ananyev, Nelio Laranjeiro, Andrew Rybchenko,
	Pascal Mazon

On 04/16/2018 07:22 PM, Adrien Mazarguil wrote:
> This patch makes the following changes to flow rule actions:
>
> - List order now matters, they are redefined as performed first to last
>    instead of "all simultaneously".
>
> - Repeated actions are now supported (e.g. specifying QUEUE multiple times
>    now duplicates traffic among them). Previously only the last action of
>    any given kind was taken into account.
>
> - No more distinction between terminating/non-terminating/meta actions.
>    Flow rules themselves are now defined as always terminating unless a
>    PASSTHRU action is specified.
>
> These changes alter the behavior of flow rules in corner cases in order to
> prepare the flow API for actions that modify traffic contents or properties
> (e.g. encapsulation, compression) and for which order matter when combined.
>
> Previously one would have to do so through multiple flow rules by combining
> PASSTRHU with priority levels, however this proved overly complex to
> implement at the PMD level, hence this simpler approach.
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_create()
> - rte_flow_validate()
>
> PMDs with rte_flow support are modified accordingly:
>
> - bnxt: no change, implementation already forbids multiple actions and does
>    not support PASSTHRU.
>
> - e1000: no change, same as bnxt.
>
> - enic: modified to forbid redundant actions, no support for default drop.
>
> - failsafe: no change needed.
>
> - i40e: no change, implementation already forbids multiple actions.
>
> - ixgbe: same as i40e.
>
> - mlx4: modified to forbid multiple fate-deciding actions and drop when
>    unspecified.
>
> - mlx5: same as mlx4, with other redundant actions also forbidden.
>
> - sfc: same as mlx4.
>
> - tap: implementation already complies with the new behavior except for
>    the default pass-through modified as a default drop.
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Reviewed-by: Andrew Rybchenko <arybchenko@oktetlabs.ru>
> Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: John Daley <johndale@cisco.com>
> Cc: Gaetan Rivet <gaetan.rivet@6wind.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
> ---
>   doc/guides/prog_guide/rte_flow.rst | 67 +++++++++++++-------------------
>   drivers/net/enic/enic_flow.c       | 25 ++++++++++++
>   drivers/net/mlx4/mlx4_flow.c       | 21 +++++++---
>   drivers/net/mlx5/mlx5_flow.c       | 69 ++++++++++++++-------------------
>   drivers/net/sfc/sfc_flow.c         | 22 +++++++----
>   drivers/net/tap/tap_flow.c         | 11 ++++++
>   lib/librte_ether/rte_flow.h        | 54 +++++++-------------------
>   7 files changed, 138 insertions(+), 131 deletions(-)

[...]

> diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
> index b9f36587c..a5c6a1670 100644
> --- a/drivers/net/enic/enic_flow.c
> +++ b/drivers/net/enic/enic_flow.c
> @@ -975,6 +979,10 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
>   			const struct rte_flow_action_queue *queue =
>   				(const struct rte_flow_action_queue *)
>   				actions->conf;
> +
> +			if (overlap & FATE)
> +				return ENOTSUP;
> +			overlap |= FATE;
>   			enic_action->rq_idx =
>   				enic_rte_rq_idx_to_sop_idx(queue->index);
>   			break;
> @@ -984,6 +992,8 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
>   			break;
>   		}
>   	}
> +	if (!overlap & FATE)

Build using clang on Ubuntu 17.10 fails:

/var/tmp/dpdk-next-net/drivers/net/enic/enic_flow.c:1000:6: fatal error: 
logical not is only applied to
       the left hand side of this bitwise operator 
[-Wlogical-not-parentheses]
         if (!overlap & FATE)
             ^        ~
/var/tmp/dpdk-next-net/drivers/net/enic/enic_flow.c:1000:6: note: add 
parentheses after the '!' to
       evaluate the bitwise operator first
         if (!overlap & FATE)
             ^
              (             )
/var/tmp/dpdk-next-net/drivers/net/enic/enic_flow.c:1000:6: note: add 
parentheses around left hand side
       expression to silence this warning
         if (!overlap & FATE)
             ^
             (       )
1 error generated.
/var/tmp/dpdk-next-net/mk/internal/rte.compile-pre.mk:114: recipe for 
target 'enic_flow.o' failed
make[4]: *** [enic_flow.o] Error 1
/var/tmp/dpdk-next-net/mk/rte.subdir.mk:35: recipe for target 'enic' failed
make[3]: *** [enic] Error 2
   CC nfp_cpp_pcie_ops.o
make[3]: *** Waiting for unfinished jobs....

$ clang --version
clang version 4.0.1-6 (tags/RELEASE_401/final)
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin


> +		return ENOTSUP;
>   	enic_action->type = FILTER_ACTION_RQ_STEERING;
>   	return 0;
>   }
> @@ -1001,6 +1011,9 @@ static int
>   enic_copy_action_v2(const struct rte_flow_action actions[],
>   		    struct filter_action_v2 *enic_action)
>   {
> +	enum { FATE = 1, MARK = 2, };
> +	uint32_t overlap = 0;
> +
>   	FLOW_TRACE();
>   
>   	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
> @@ -1009,6 +1022,10 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
>   			const struct rte_flow_action_queue *queue =
>   				(const struct rte_flow_action_queue *)
>   				actions->conf;
> +
> +			if (overlap & FATE)
> +				return ENOTSUP;
> +			overlap |= FATE;
>   			enic_action->rq_idx =
>   				enic_rte_rq_idx_to_sop_idx(queue->index);
>   			enic_action->flags |= FILTER_ACTION_RQ_STEERING_FLAG;
> @@ -1019,6 +1036,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
>   				(const struct rte_flow_action_mark *)
>   				actions->conf;
>   
> +			if (overlap & MARK)

Same

> +				return ENOTSUP;
> +			overlap |= MARK;
>   			/* ENIC_MAGIC_FILTER_ID is reserved and is the highest
>   			 * in the range of allows mark ids.
>   			 */
> @@ -1029,6 +1049,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
>   			break;
>   		}
>   		case RTE_FLOW_ACTION_TYPE_FLAG: {
> +			if (overlap & MARK)
> +				return ENOTSUP;
> +			overlap |= MARK;
>   			enic_action->filter_id = ENIC_MAGIC_FILTER_ID;
>   			enic_action->flags |= FILTER_ACTION_FILTER_ID_FLAG;
>   			break;
> @@ -1044,6 +1067,8 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
>   			break;
>   		}
>   	}
> +	if (!overlap & FATE)

Same

> +		return ENOTSUP;
>   	enic_action->type = FILTER_ACTION_V2;
>   	return 0;
>   }

[...]

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
  2018-04-17 16:05  0%     ` Iremonger, Bernard
@ 2018-04-18 11:55  0%       ` Xueming(Steven) Li
  2018-04-18 15:11  0%         ` Iremonger, Bernard
  0 siblings, 1 reply; 200+ results
From: Xueming(Steven) Li @ 2018-04-18 11:55 UTC (permalink / raw)
  To: Iremonger, Bernard, Lu, Wenzhuo, Wu, Jingjing, Thomas Monjalon,
	Adrien Mazarguil
  Cc: Nélio Laranjeiro, Shahaf Shuler, dev, Olivier Matz



> -----Original Message-----
> From: Iremonger, Bernard <bernard.iremonger@intel.com>
> Sent: Wednesday, April 18, 2018 12:05 AM
> To: Xueming(Steven) Li <xuemingl@mellanox.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing
> <jingjing.wu@intel.com>; Thomas Monjalon <thomas@monjalon.net>; Adrien Mazarguil
> <adrien.mazarguil@6wind.com>
> Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org;
> Olivier Matz <olivier.matz@6wind.com>
> Subject: RE: [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
> 
> Hi Li,
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xueming Li
> > Sent: Tuesday, April 17, 2018 4:04 PM
> > To: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing
> > <jingjing.wu@intel.com>; Thomas Monjalon <thomas@monjalon.net>; Adrien
> > Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: Xueming Li <xuemingl@mellanox.com>; Nelio Laranjeiro
> > <nelio.laranjeiro@6wind.com>; Shahaf Shuler <shahafs@mellanox.com>;
> > dev@dpdk.org; Olivier Matz <olivier.matz@6wind.com>
> > Subject: [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
> >
> > v5:
> > - Fixed VXLAN-GPE comment alignment
> > v4:
> > - Update testpmd doc for flow VXLAN-GPE paramter.
> > v3:
> > - Change VXLAN-GPE definition order to avoid ABI compatibility issue.
> > v2:
> > - Split patch set into public and mlx5 two series, this one is the first.
> > v1:
> > - Support new tunnel type MPLS-in-GRE and MPLS-in-UDP
> > - Remove deprecation notes of rss level
> >
> > This patchset introduced new tunnel type and related testpmd code:
> > - New tunnel type VXLAN-GPE
> >
> > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdat
> > atracker.ietf.org%2Fdoc%2Fdraft-ietf-nvo3-vxlan-gpe%2F&data=02%7C01%7C
> > xuemingl%40mellanox.com%7C2dffef40890b4cf8ff9d08d5a47d0420%7Ca652971c7
> > d2e4d9ba6a4d149256f461b%7C0%7C0%7C636595779231620631&sdata=%2Bvx%2FgVB
> > 3e3BHI%2BYxPxOIpqK6CuKvQQ8qej4B1Faxihc%3D&reserved=0
> > - New tunnel type MPLS-in-GRE
> >
> > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Ftoo
> > ls.ietf.org%2Fhtml%2Frfc4023&data=02%7C01%7Cxuemingl%40mellanox.com%7C
> > 2dffef40890b4cf8ff9d08d5a47d0420%7Ca652971c7d2e4d9ba6a4d149256f461b%7C
> > 0%7C0%7C636595779231620631&sdata=wk2wvoB9LSbI2LfHZVvWzmtgS0XQbGNMFL4G1
> > kyr77E%3D&reserved=0
> > - New tunnel type MPLS-in-UDP
> >
> > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Ftoo
> > ls.ietf.org%2Fhtml%2Frfc7510&data=02%7C01%7Cxuemingl%40mellanox.com%7C
> > 2dffef40890b4cf8ff9d08d5a47d0420%7Ca652971c7d2e4d9ba6a4d149256f461b%7C
> > 0%7C0%7C636595779231620631&sdata=4RXbPD9tV7ArcnKsK8IJy%2B9XbKlzKcNBSv1
> > LXVwwuPs%3D&reserved=0
> > - Support GRE extension in testpmd csum forwarding engine
> >
> > Xueming Li (4):
> >   doc: remove RSS configuration change announcement
> >   ethdev: introduce new tunnel VXLAN-GPE
> >   app/testpmd: introduce new tunnel VXLAN-GPE
> >   app/testpmd: add more GRE extension support to csum engine
> >
> >  app/test-pmd/cmdline_flow.c                 |  24 +++++++
> >  app/test-pmd/config.c                       |   2 +
> >  app/test-pmd/csumonly.c                     | 103 +++++++++++++++++++++++++---
> >  app/test-pmd/parameters.c                   |  12 +++-
> >  app/test-pmd/testpmd.h                      |   2 +
> >  doc/guides/prog_guide/rte_flow.rst          |  12 ++++
> >  doc/guides/rel_notes/deprecation.rst        |   4 --
> >  doc/guides/testpmd_app_ug/run_app.rst       |   5 ++
> >  doc/guides/testpmd_app_ug/testpmd_funcs.rst |   4 ++
> >  lib/librte_ether/rte_eth_ctrl.h             |   3 +-
> >  lib/librte_ether/rte_flow.c                 |   1 +
> >  lib/librte_ether/rte_flow.h                 |  27 ++++++++
> >  lib/librte_mbuf/rte_mbuf.c                  |   3 +
> >  lib/librte_mbuf/rte_mbuf.h                  |   1 +
> >  lib/librte_mbuf/rte_mbuf_ptype.c            |   1 +
> >  lib/librte_mbuf/rte_mbuf_ptype.h            |  13 ++++
> >  lib/librte_net/rte_ether.h                  |  25 +++++++
> >  17 files changed, 225 insertions(+), 17 deletions(-)
> >
> > --
> > 2.13.3
> 
> Patch 3 of this patch set fails to apply to the latest master, the other three patches apply ok.
> 
> Regards,
> 
> Bernard.
> 
I tried it with orgin/master branch and it worked for me.
Could you please share more information?

Best Regards,
Xueming Li

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API
  2018-04-18  8:41  5%           ` Adrien Mazarguil
@ 2018-04-18  9:24  3%             ` Ferruh Yigit
  2018-04-19  9:48  5%               ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Ferruh Yigit @ 2018-04-18  9:24 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: Thomas Monjalon, dev

On 4/18/2018 9:41 AM, Adrien Mazarguil wrote:
> On Tue, Apr 17, 2018 at 08:37:31PM +0100, Ferruh Yigit wrote:
>> On 4/16/2018 5:22 PM, Adrien Mazarguil wrote:
>>> These enable more precise reporting of objects responsible for errors.
>>>
>>> This breaks ABI compatibility for the following public functions:
>>>
>>> - rte_flow_create()
>>> - rte_flow_destroy()
>>> - rte_flow_error_set()
>>> - rte_flow_flush()
>>> - rte_flow_isolate()
>>> - rte_flow_query()
>>> - rte_flow_validate()
>>
>> Is there a deprecation notice for this API break?
> 
> A notice covering the main changes in this series (most patches have an ABI
> impact) was sent but not included [1]. This particular patch rides on the
> announced ABI breakage in order to add a relatively minor feature while
> there.

My take from "announced ABI breakage" is the deprecation notice get three acks
and merged into release, so it seems there is no deprecation notice and
according process first deprecation notice should go in this release.

Hi Thomas,

Any comment on issue?

> 
> This ABI change was implicitly needed by upcoming work for 18.05 (Xueming's
> RSS stuff [2][3], Declan's TEP [4], the rest is summarized by a RFC [5]) due
> to the necessary changes of behavior in flow rules.
> 
> Note that Xueming's deprecation notice [3] alone would have triggered such
> an ABI change because struct rte_flow_action_rss wouldn't have been binary 
> compatible if struct rte_eth_rss_conf was updated. This change would have
> propagated back to rte_flow functions manipulating them.

To be honest I lost track of Xueming's patches, because of split/merge of
patchset, multiple set with multiple versions out.

Is it possible to document the dependency graph including your set?

> 
> [1] "doc: announce API changes for flow rules"
>      http://dpdk.org/ml/archives/dev/2018-February/090988.html
> [2] "MLX5 tunnel Rx offloading"
>     http://dpdk.org/ml/archives/dev/2018-February/091461.html
> [3] "doc: annouce ABI change for RSS configuraiton structure"
>     http://dpdk.org/ml/archives/dev/2018-February/090127.html
> [4] "tunnel endpoint hw acceleration enablement"
>     http://dpdk.org/ml/archives/dev/2017-December/084676.html
> [5] "Switch device offload with DPDK"
>     http://dpdk.org/ml/archives/dev/2018-March/092513.html
> 
>>> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
>>> Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
>>
>> <...>
>>
> 

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API
  2018-04-17 19:37  0%         ` Ferruh Yigit
@ 2018-04-18  8:41  5%           ` Adrien Mazarguil
  2018-04-18  9:24  3%             ` Ferruh Yigit
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-18  8:41 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: Thomas Monjalon, dev

On Tue, Apr 17, 2018 at 08:37:31PM +0100, Ferruh Yigit wrote:
> On 4/16/2018 5:22 PM, Adrien Mazarguil wrote:
> > These enable more precise reporting of objects responsible for errors.
> > 
> > This breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_create()
> > - rte_flow_destroy()
> > - rte_flow_error_set()
> > - rte_flow_flush()
> > - rte_flow_isolate()
> > - rte_flow_query()
> > - rte_flow_validate()
> 
> Is there a deprecation notice for this API break?

A notice covering the main changes in this series (most patches have an ABI
impact) was sent but not included [1]. This particular patch rides on the
announced ABI breakage in order to add a relatively minor feature while
there.

This ABI change was implicitly needed by upcoming work for 18.05 (Xueming's
RSS stuff [2][3], Declan's TEP [4], the rest is summarized by a RFC [5]) due
to the necessary changes of behavior in flow rules.

Note that Xueming's deprecation notice [3] alone would have triggered such
an ABI change because struct rte_flow_action_rss wouldn't have been binary 
compatible if struct rte_eth_rss_conf was updated. This change would have
propagated back to rte_flow functions manipulating them.

[1] "doc: announce API changes for flow rules"
     http://dpdk.org/ml/archives/dev/2018-February/090988.html
[2] "MLX5 tunnel Rx offloading"
    http://dpdk.org/ml/archives/dev/2018-February/091461.html
[3] "doc: annouce ABI change for RSS configuraiton structure"
    http://dpdk.org/ml/archives/dev/2018-February/090127.html
[4] "tunnel endpoint hw acceleration enablement"
    http://dpdk.org/ml/archives/dev/2017-December/084676.html
[5] "Switch device offload with DPDK"
    http://dpdk.org/ml/archives/dev/2018-March/092513.html

> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
> 
> <...>
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 5%]

* Re: [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API
  2018-04-17 20:18  0%         ` Thomas Monjalon
@ 2018-04-18  6:45  0%           ` Nélio Laranjeiro
  0 siblings, 0 replies; 200+ results
From: Nélio Laranjeiro @ 2018-04-18  6:45 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Adrien Mazarguil, dev, Ferruh Yigit

On Tue, Apr 17, 2018 at 10:18:22PM +0200, Thomas Monjalon wrote:
> 16/04/2018 18:22, Adrien Mazarguil:
> > This patch replaces C99-style flexible arrays in struct rte_flow_action_rss
> > and struct rte_flow_item_raw with standard pointers to the same data.
> > 
> > They proved difficult to use in the field (e.g. no possibility of static
> > initialization) and unsuitable for C++ applications.
> > 
> > Affected PMDs and examples are updated accordingly.
> > 
> > This breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_copy()
> > - rte_flow_create()
> > - rte_flow_query()
> > - rte_flow_validate()
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > ---
> >  app/test-pmd/cmdline_flow.c        | 117 +++++++++++++++++---------------
> >  app/test-pmd/config.c              |  25 ++++---
> >  doc/guides/prog_guide/rte_flow.rst |  18 ++---
> >  drivers/net/mlx4/mlx4_flow.c       |  22 +++---
> >  drivers/net/mlx5/mlx5_flow.c       |  20 +++---
> >  examples/ipsec-secgw/ipsec.c       |  17 ++---
> >  lib/librte_ether/rte_flow.c        |  25 ++++---
> >  lib/librte_ether/rte_flow.h        |   8 ++-
> >  8 files changed, 135 insertions(+), 117 deletions(-)
> 
> There are almost as much insertions as deletions.
> So it's probably not a bad move.
> 
> Acked-by: Thomas Monjalon <thomas@monjalon.net>

For mlx5: Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] Retire x86 32 bit?
  @ 2018-04-17 21:18  3%       ` Roger B Melton
  2018-04-18 17:40  0%         ` Jim Murphy
  0 siblings, 1 reply; 200+ results
From: Roger B Melton @ 2018-04-17 21:18 UTC (permalink / raw)
  To: Stephen Hemminger, Jim Murphy; +Cc: David Harton (dharton), dev

On 4/17/18 4:46 PM, Stephen Hemminger wrote:
> On Tue, 17 Apr 2018 13:01:14 -0700
> Jim Murphy <jmurphy@arista.com> wrote:
>
>> Still used in certain memory constrained environments.
>>
>>
>> On Tue, Apr 17, 2018 at 11:39 AM, David Harton (dharton) <dharton@cisco.com>
>> wrote:
>>
>>> It is used and tested in production and non-production environments.
>>>
>>> Regards,
>>> Dave
>>>   
>>>> -----Original Message-----
>>>> From: dev <dev-bounces@dpdk.org> On Behalf Of Stephen Hemminger
>>>> Sent: Tuesday, April 17, 2018 2:31 PM
>>>> To: dev@dpdk.org
>>>> Subject: [dpdk-dev] Retire x86 32 bit?
>>>>
>>>> I wonder if x86 32 bit is still useful?
>>>> Many distributions no longer support it, and not sure if it is tested
>>>> througly by anyone.
>>>>
>>>> Maybe time to deprecate it (gradually)?
>>>   
> Pure 32 bit, or x86-64 instructions and registers used in 32 bit mode (which can be faster).
> .
>

Pure 32bit in our case.  We do not use x32 ABI.

-Roger

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
@ 2018-04-17 20:18  0%         ` Thomas Monjalon
  2018-04-18  6:45  0%           ` Nélio Laranjeiro
  0 siblings, 1 reply; 200+ results
From: Thomas Monjalon @ 2018-04-17 20:18 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ferruh Yigit

16/04/2018 18:22, Adrien Mazarguil:
> This patch replaces C99-style flexible arrays in struct rte_flow_action_rss
> and struct rte_flow_item_raw with standard pointers to the same data.
> 
> They proved difficult to use in the field (e.g. no possibility of static
> initialization) and unsuitable for C++ applications.
> 
> Affected PMDs and examples are updated accordingly.
> 
> This breaks ABI compatibility for the following public functions:
> 
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>  app/test-pmd/cmdline_flow.c        | 117 +++++++++++++++++---------------
>  app/test-pmd/config.c              |  25 ++++---
>  doc/guides/prog_guide/rte_flow.rst |  18 ++---
>  drivers/net/mlx4/mlx4_flow.c       |  22 +++---
>  drivers/net/mlx5/mlx5_flow.c       |  20 +++---
>  examples/ipsec-secgw/ipsec.c       |  17 ++---
>  lib/librte_ether/rte_flow.c        |  25 ++++---
>  lib/librte_ether/rte_flow.h        |   8 ++-
>  8 files changed, 135 insertions(+), 117 deletions(-)

There are almost as much insertions as deletions.
So it's probably not a bad move.

Acked-by: Thomas Monjalon <thomas@monjalon.net>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API Adrien Mazarguil
@ 2018-04-17 19:37  0%         ` Ferruh Yigit
  2018-04-18  8:41  5%           ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Ferruh Yigit @ 2018-04-17 19:37 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, dev

On 4/16/2018 5:22 PM, Adrien Mazarguil wrote:
> These enable more precise reporting of objects responsible for errors.
> 
> This breaks ABI compatibility for the following public functions:
> 
> - rte_flow_create()
> - rte_flow_destroy()
> - rte_flow_error_set()
> - rte_flow_flush()
> - rte_flow_isolate()
> - rte_flow_query()
> - rte_flow_validate()

Is there a deprecation notice for this API break?

> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

<...>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
  2018-04-17 15:04  3%   ` [dpdk-dev] [PATCH v5 0/4] " Xueming Li
@ 2018-04-17 16:05  0%     ` Iremonger, Bernard
  2018-04-18 11:55  0%       ` Xueming(Steven) Li
  0 siblings, 1 reply; 200+ results
From: Iremonger, Bernard @ 2018-04-17 16:05 UTC (permalink / raw)
  To: Xueming Li, Lu, Wenzhuo, Wu, Jingjing, Thomas Monjalon, Adrien Mazarguil
  Cc: Nelio Laranjeiro, Shahaf Shuler, dev, Olivier Matz

Hi Li,

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Xueming Li
> Sent: Tuesday, April 17, 2018 4:04 PM
> To: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing
> <jingjing.wu@intel.com>; Thomas Monjalon <thomas@monjalon.net>; Adrien
> Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Xueming Li <xuemingl@mellanox.com>; Nelio Laranjeiro
> <nelio.laranjeiro@6wind.com>; Shahaf Shuler <shahafs@mellanox.com>;
> dev@dpdk.org; Olivier Matz <olivier.matz@6wind.com>
> Subject: [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
> 
> v5:
> - Fixed VXLAN-GPE comment alignment
> v4:
> - Update testpmd doc for flow VXLAN-GPE paramter.
> v3:
> - Change VXLAN-GPE definition order to avoid ABI compatibility issue.
> v2:
> - Split patch set into public and mlx5 two series, this one is the first.
> v1:
> - Support new tunnel type MPLS-in-GRE and MPLS-in-UDP
> - Remove deprecation notes of rss level
> 
> This patchset introduced new tunnel type and related testpmd code:
> - New tunnel type VXLAN-GPE
>   https://datatracker.ietf.org/doc/draft-ietf-nvo3-vxlan-gpe/
> - New tunnel type MPLS-in-GRE
>   https://tools.ietf.org/html/rfc4023
> - New tunnel type MPLS-in-UDP
>   https://tools.ietf.org/html/rfc7510
> - Support GRE extension in testpmd csum forwarding engine
> 
> Xueming Li (4):
>   doc: remove RSS configuration change announcement
>   ethdev: introduce new tunnel VXLAN-GPE
>   app/testpmd: introduce new tunnel VXLAN-GPE
>   app/testpmd: add more GRE extension support to csum engine
> 
>  app/test-pmd/cmdline_flow.c                 |  24 +++++++
>  app/test-pmd/config.c                       |   2 +
>  app/test-pmd/csumonly.c                     | 103 +++++++++++++++++++++++++---
>  app/test-pmd/parameters.c                   |  12 +++-
>  app/test-pmd/testpmd.h                      |   2 +
>  doc/guides/prog_guide/rte_flow.rst          |  12 ++++
>  doc/guides/rel_notes/deprecation.rst        |   4 --
>  doc/guides/testpmd_app_ug/run_app.rst       |   5 ++
>  doc/guides/testpmd_app_ug/testpmd_funcs.rst |   4 ++
>  lib/librte_ether/rte_eth_ctrl.h             |   3 +-
>  lib/librte_ether/rte_flow.c                 |   1 +
>  lib/librte_ether/rte_flow.h                 |  27 ++++++++
>  lib/librte_mbuf/rte_mbuf.c                  |   3 +
>  lib/librte_mbuf/rte_mbuf.h                  |   1 +
>  lib/librte_mbuf/rte_mbuf_ptype.c            |   1 +
>  lib/librte_mbuf/rte_mbuf_ptype.h            |  13 ++++
>  lib/librte_net/rte_ether.h                  |  25 +++++++
>  17 files changed, 225 insertions(+), 17 deletions(-)
> 
> --
> 2.13.3

Patch 3 of this patch set fails to apply to the latest master, the other three patches apply ok.

Regards,

Bernard.
  

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v1] cmdline: rework as a wrapper to libedit
  2018-04-17 15:21  1% ` [dpdk-dev] [PATCH v1] " Adrien Mazarguil
@ 2018-04-17 15:59  0%   ` Burakov, Anatoly
  2018-04-19 15:13  1%   ` [dpdk-dev] [PATCH v2] " Adrien Mazarguil
  1 sibling, 0 replies; 200+ results
From: Burakov, Anatoly @ 2018-04-17 15:59 UTC (permalink / raw)
  To: Adrien Mazarguil, Olivier Matz
  Cc: dev, Keith Wiles, Jingjing Wu, Thomas Monjalon, Ferruh Yigit,
	Jim Thompson

On 17-Apr-18 4:21 PM, Adrien Mazarguil wrote:
> Disclaimer: this patch must not be confused with the CLI library [1]
> (work in progress) that will eventually supersede librte_cmdline itself
> with a different API.
> 
> Rather, it modifies librte_cmdline to delegate all the heavy lifting
> (terminal and history handling), strips unused features and re-implements
> what remains of its public API as a wrapper to the editline library (also
> known as libedit) [2], a well-known, BSD-licensed and widely available
> library used by many projects which does everything needed and more [3].
> 
> This approach was chosen because converting librte_cmdline as a wrapper to
> a more capable library was easier and faster than addressing its
> shortcomings and results in much less code to maintain in DPDK.
> 
> It also provides a drop-in solution for applications that rely on
> librte_cmdline. They benefit from greatly improved command line handling
> without a meaningful impact on their code base.
> 
> The main motivation behind this patch is testpmd's flow (rte_flow) command,
> which requires support for dynamic tokens and very long lines that must be
> broken down when displayed. This is not supported by librte_cmdline's
> limited terminal handling capabilities, resulting in a rather frustrating
> user experience.
> 
> It had to be addressed given the importance of testpmd as one of the
> primary tool used by PMD developers.
> 
> This rework results in the following changes:
> 
> - Removed circular buffer management interface for command history
>    (cmdline_cirbuf.c), command history being handled by libedit.
> - Removed raw command-line interpreter (cmdline_rdline.c).
> - Removed raw terminal handler (cmdline_vt100.c).
> - Removed all test/example code for the above.
> - Re-implemented high level interactive and non-interactive command-line
>    handlers (cmdline.c and cmdline_socket.c) on top of libedit using its
>    native interface, not its readline compatibility layer.
> - Made struct cmdline opaque so that applications relying on librte_cmdline
>    do not need to include any libedit headers.
> - Applications do not need to include cmdline_rdline.h anymore.
> - Terminal resizing is now automatically handled.
> - New external dependency for applications relying on librte_cmdline.
> - Major version bump due to the ABI impact of these changes.
> 
> [1] http://dpdk.org/browse/draft/dpdk-draft-cli/
> [2] http://thrysoee.dk/editline/
> [3] http://netbsd.gw.com/cgi-bin/man-cgi?editline++NetBSD-current
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Olivier Matz <olivier.matz@6wind.com>
> Cc: Keith Wiles <keith.wiles@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Jim Thompson <jim@netgate.com>
> 
> --
> 
> No fundamental change since the original RFC [4], except it's been rebased
> several times and Meson build support was added in the meantime. Commit log
> was also shortened a bit.
> 
> I'm re-sending this because I think it's useful, at least to me (duh). As
> the maintainer of rte_flow, I spend most of my time typing flow commands in
> testpmd and libedit makes that a pleasant experience.
> 
> Try it out! And don't hesitate to send your acked-by line to get this in
> time for 18.05 :)
> 
> [4] http://dpdk.org/ml/archives/dev/2017-November/081605.html
> ---

<...>

> +	uint32_t error:1;
> +	char prompt[RDLINE_PROMPT_SIZE];
> +};
> +
> +void
> +cmdline_set_prompt(struct cmdline *cl, const char *prompt)
>   {
> -	struct cmdline *cl = rdl->opaque;
> -	int ret;
> -	ret = cmdline_parse(cl, buf);
> -	if (ret == CMDLINE_PARSE_AMBIGUOUS)
> -		cmdline_printf(cl, "Ambiguous command\n");
> -	else if (ret == CMDLINE_PARSE_NOMATCH)
> -		cmdline_printf(cl, "Command not found\n");
> -	else if (ret == CMDLINE_PARSE_BAD_ARGS)
> -		cmdline_printf(cl, "Bad arguments\n");
> +	if (!cl || !prompt)
> +		return;
> +	snprintf(cl->prompt, sizeof(cl->prompt), "%s", prompt);

Didn't look through the entire patch yet, but this stood out - please 
use strlcpy() :)

-- 
Thanks,
Anatoly

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v5 0/4] introduce new tunnel types
  2018-04-13 11:02  3% ` [dpdk-dev] [PATCH v4 " Xueming Li
@ 2018-04-17 15:04  3%   ` Xueming Li
  2018-04-17 16:05  0%     ` Iremonger, Bernard
  2018-04-20 11:56  3%   ` [dpdk-dev] [PATCH v6 0/5] " Xueming Li
  1 sibling, 1 reply; 200+ results
From: Xueming Li @ 2018-04-17 15:04 UTC (permalink / raw)
  To: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Adrien Mazarguil
  Cc: Xueming Li, Nelio Laranjeiro, Shahaf Shuler, dev, Olivier Matz

v5:
- Fixed VXLAN-GPE comment alignment
v4:
- Update testpmd doc for flow VXLAN-GPE paramter.
v3:
- Change VXLAN-GPE definition order to avoid ABI compatibility issue.
v2:
- Split patch set into public and mlx5 two series, this one is the first.
v1:
- Support new tunnel type MPLS-in-GRE and MPLS-in-UDP
- Remove deprecation notes of rss level

This patchset introduced new tunnel type and related testpmd code:
- New tunnel type VXLAN-GPE
  https://datatracker.ietf.org/doc/draft-ietf-nvo3-vxlan-gpe/
- New tunnel type MPLS-in-GRE
  https://tools.ietf.org/html/rfc4023
- New tunnel type MPLS-in-UDP
  https://tools.ietf.org/html/rfc7510
- Support GRE extension in testpmd csum forwarding engine

Xueming Li (4):
  doc: remove RSS configuration change announcement
  ethdev: introduce new tunnel VXLAN-GPE
  app/testpmd: introduce new tunnel VXLAN-GPE
  app/testpmd: add more GRE extension support to csum engine

 app/test-pmd/cmdline_flow.c                 |  24 +++++++
 app/test-pmd/config.c                       |   2 +
 app/test-pmd/csumonly.c                     | 103 +++++++++++++++++++++++++---
 app/test-pmd/parameters.c                   |  12 +++-
 app/test-pmd/testpmd.h                      |   2 +
 doc/guides/prog_guide/rte_flow.rst          |  12 ++++
 doc/guides/rel_notes/deprecation.rst        |   4 --
 doc/guides/testpmd_app_ug/run_app.rst       |   5 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   4 ++
 lib/librte_ether/rte_eth_ctrl.h             |   3 +-
 lib/librte_ether/rte_flow.c                 |   1 +
 lib/librte_ether/rte_flow.h                 |  27 ++++++++
 lib/librte_mbuf/rte_mbuf.c                  |   3 +
 lib/librte_mbuf/rte_mbuf.h                  |   1 +
 lib/librte_mbuf/rte_mbuf_ptype.c            |   1 +
 lib/librte_mbuf/rte_mbuf_ptype.h            |  13 ++++
 lib/librte_net/rte_ether.h                  |  25 +++++++
 17 files changed, 225 insertions(+), 17 deletions(-)

-- 
2.13.3

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v1] cmdline: rework as a wrapper to libedit
  @ 2018-04-17 15:21  1% ` Adrien Mazarguil
  2018-04-17 15:59  0%   ` Burakov, Anatoly
  2018-04-19 15:13  1%   ` [dpdk-dev] [PATCH v2] " Adrien Mazarguil
  0 siblings, 2 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-17 15:21 UTC (permalink / raw)
  To: Olivier Matz
  Cc: dev, Keith Wiles, Jingjing Wu, Thomas Monjalon, Ferruh Yigit,
	Jim Thompson

Disclaimer: this patch must not be confused with the CLI library [1]
(work in progress) that will eventually supersede librte_cmdline itself
with a different API.

Rather, it modifies librte_cmdline to delegate all the heavy lifting
(terminal and history handling), strips unused features and re-implements
what remains of its public API as a wrapper to the editline library (also
known as libedit) [2], a well-known, BSD-licensed and widely available
library used by many projects which does everything needed and more [3].

This approach was chosen because converting librte_cmdline as a wrapper to
a more capable library was easier and faster than addressing its
shortcomings and results in much less code to maintain in DPDK.

It also provides a drop-in solution for applications that rely on
librte_cmdline. They benefit from greatly improved command line handling
without a meaningful impact on their code base.

The main motivation behind this patch is testpmd's flow (rte_flow) command,
which requires support for dynamic tokens and very long lines that must be
broken down when displayed. This is not supported by librte_cmdline's
limited terminal handling capabilities, resulting in a rather frustrating
user experience.

It had to be addressed given the importance of testpmd as one of the
primary tool used by PMD developers.

This rework results in the following changes:

- Removed circular buffer management interface for command history
  (cmdline_cirbuf.c), command history being handled by libedit.
- Removed raw command-line interpreter (cmdline_rdline.c).
- Removed raw terminal handler (cmdline_vt100.c).
- Removed all test/example code for the above.
- Re-implemented high level interactive and non-interactive command-line
  handlers (cmdline.c and cmdline_socket.c) on top of libedit using its
  native interface, not its readline compatibility layer.
- Made struct cmdline opaque so that applications relying on librte_cmdline
  do not need to include any libedit headers.
- Applications do not need to include cmdline_rdline.h anymore.
- Terminal resizing is now automatically handled.
- New external dependency for applications relying on librte_cmdline.
- Major version bump due to the ABI impact of these changes.

[1] http://dpdk.org/browse/draft/dpdk-draft-cli/
[2] http://thrysoee.dk/editline/
[3] http://netbsd.gw.com/cgi-bin/man-cgi?editline++NetBSD-current

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Olivier Matz <olivier.matz@6wind.com>
Cc: Keith Wiles <keith.wiles@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Jim Thompson <jim@netgate.com>

--

No fundamental change since the original RFC [4], except it's been rebased
several times and Meson build support was added in the meantime. Commit log
was also shortened a bit.

I'm re-sending this because I think it's useful, at least to me (duh). As
the maintainer of rte_flow, I spend most of my time typing flow commands in
testpmd and libedit makes that a pleasant experience.

Try it out! And don't hesitate to send your acked-by line to get this in
time for 18.05 :)

[4] http://dpdk.org/ml/archives/dev/2017-November/081605.html
---
 app/test-pmd/cmdline.c                          |    1 -
 examples/bond/main.c                            |    1 -
 examples/cmdline/commands.c                     |    1 -
 examples/cmdline/main.c                         |    1 -
 examples/multi_process/simple_mp/main.c         |    1 -
 examples/multi_process/simple_mp/mp_commands.c  |    1 -
 examples/qos_sched/cmdline.c                    |    1 -
 examples/quota_watermark/qwctl/commands.c       |    1 -
 examples/quota_watermark/qwctl/qwctl.c          |    1 -
 .../guest_cli/vm_power_cli_guest.c              |    1 -
 examples/vm_power_manager/vm_power_cli.c        |    1 -
 lib/librte_cmdline/Makefile                     |   10 +-
 lib/librte_cmdline/cmdline.c                    |  385 ++++--
 lib/librte_cmdline/cmdline.h                    |   22 +-
 lib/librte_cmdline/cmdline_cirbuf.c             |  412 ------
 lib/librte_cmdline/cmdline_cirbuf.h             |  193 ---
 lib/librte_cmdline/cmdline_parse.c              |    7 +-
 lib/librte_cmdline/cmdline_rdline.c             |  644 ---------
 lib/librte_cmdline/cmdline_rdline.h             |  201 ---
 lib/librte_cmdline/cmdline_socket.c             |   36 +-
 lib/librte_cmdline/cmdline_vt100.c              |  132 --
 lib/librte_cmdline/cmdline_vt100.h              |  100 --
 lib/librte_cmdline/meson.build                  |   18 +-
 lib/librte_cmdline/rte_cmdline_version.map      |   41 +-
 mk/rte.app.mk                                   |    2 +
 test/cmdline_test/cmdline_test.c                |    1 -
 test/cmdline_test/commands.c                    |   69 -
 test/test/Makefile                              |    1 -
 test/test/commands.c                            |    1 -
 test/test/meson.build                           |    1 -
 test/test/test.c                                |    1 -
 test/test/test_cmdline.c                        |    9 -
 test/test/test_cmdline.h                        |    6 -
 test/test/test_cmdline_cirbuf.c                 | 1301 ------------------
 test/test/test_cmdline_lib.c                    |  117 +-
 35 files changed, 303 insertions(+), 3418 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 512e3b55e..140e6480c 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -48,7 +48,6 @@
 #include <rte_flow.h>
 #include <rte_gro.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_num.h>
 #include <cmdline_parse_string.h>
diff --git a/examples/bond/main.c b/examples/bond/main.c
index 455f108ee..3a832079e 100644
--- a/examples/bond/main.c
+++ b/examples/bond/main.c
@@ -42,7 +42,6 @@
 #include <rte_arp.h>
 #include <rte_spinlock.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_num.h>
 #include <cmdline_parse_string.h>
diff --git a/examples/cmdline/commands.c b/examples/cmdline/commands.c
index 06916d783..3e9d84d46 100644
--- a/examples/cmdline/commands.c
+++ b/examples/cmdline/commands.c
@@ -20,7 +20,6 @@
 	#endif
 #endif
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_ipaddr.h>
 #include <cmdline_parse_num.h>
diff --git a/examples/cmdline/main.c b/examples/cmdline/main.c
index f2f2e5a2f..9fe0fdef7 100644
--- a/examples/cmdline/main.c
+++ b/examples/cmdline/main.c
@@ -11,7 +11,6 @@
 #include <termios.h>
 #include <sys/queue.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_socket.h>
 #include <cmdline.h>
diff --git a/examples/multi_process/simple_mp/main.c b/examples/multi_process/simple_mp/main.c
index e6c69d6a3..49d6ed169 100644
--- a/examples/multi_process/simple_mp/main.c
+++ b/examples/multi_process/simple_mp/main.c
@@ -35,7 +35,6 @@
 #include <rte_ring.h>
 #include <rte_log.h>
 #include <rte_mempool.h>
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_string.h>
 #include <cmdline_socket.h>
diff --git a/examples/multi_process/simple_mp/mp_commands.c b/examples/multi_process/simple_mp/mp_commands.c
index e4df6ff01..a9eb8bb44 100644
--- a/examples/multi_process/simple_mp/mp_commands.c
+++ b/examples/multi_process/simple_mp/mp_commands.c
@@ -25,7 +25,6 @@
 #include <rte_mempool.h>
 #include <rte_string_fns.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_string.h>
 #include <cmdline_socket.h>
diff --git a/examples/qos_sched/cmdline.c b/examples/qos_sched/cmdline.c
index 15f51830c..679819a25 100644
--- a/examples/qos_sched/cmdline.c
+++ b/examples/qos_sched/cmdline.c
@@ -7,7 +7,6 @@
 #include <inttypes.h>
 #include <string.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_num.h>
 #include <cmdline_parse_string.h>
diff --git a/examples/quota_watermark/qwctl/commands.c b/examples/quota_watermark/qwctl/commands.c
index a1c646b9f..33fce2063 100644
--- a/examples/quota_watermark/qwctl/commands.c
+++ b/examples/quota_watermark/qwctl/commands.c
@@ -7,7 +7,6 @@
 #include <string.h>
 #include <termios.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_num.h>
 #include <cmdline_parse_string.h>
diff --git a/examples/quota_watermark/qwctl/qwctl.c b/examples/quota_watermark/qwctl/qwctl.c
index 2f7914c80..9f41a684a 100644
--- a/examples/quota_watermark/qwctl/qwctl.c
+++ b/examples/quota_watermark/qwctl/qwctl.c
@@ -13,7 +13,6 @@
 #include <rte_log.h>
 #include <rte_memzone.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_socket.h>
 #include <cmdline.h>
diff --git a/examples/vm_power_manager/guest_cli/vm_power_cli_guest.c b/examples/vm_power_manager/guest_cli/vm_power_cli_guest.c
index 43bdeacef..218ed192e 100644
--- a/examples/vm_power_manager/guest_cli/vm_power_cli_guest.c
+++ b/examples/vm_power_manager/guest_cli/vm_power_cli_guest.c
@@ -8,7 +8,6 @@
 #include <stdio.h>
 #include <termios.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_string.h>
 #include <cmdline_parse_num.h>
diff --git a/examples/vm_power_manager/vm_power_cli.c b/examples/vm_power_manager/vm_power_cli.c
index d588d38aa..99757420a 100644
--- a/examples/vm_power_manager/vm_power_cli.c
+++ b/examples/vm_power_manager/vm_power_cli.c
@@ -10,7 +10,6 @@
 #include <termios.h>
 #include <errno.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_string.h>
 #include <cmdline_parse_num.h>
diff --git a/lib/librte_cmdline/Makefile b/lib/librte_cmdline/Makefile
index ddae1cfde..feb1f1bca 100644
--- a/lib/librte_cmdline/Makefile
+++ b/lib/librte_cmdline/Makefile
@@ -10,28 +10,24 @@ CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
 
 EXPORT_MAP := rte_cmdline_version.map
 
-LIBABIVER := 2
+LIBABIVER := 3
 
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) := cmdline.c
-SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_cirbuf.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_etheraddr.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_ipaddr.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_num.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_string.c
-SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_rdline.c
-SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_vt100.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_socket.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += cmdline_parse_portlist.c
 
-CFLAGS += -D_GNU_SOURCE
 LDLIBS += -lrte_eal
 
 # install includes
 INCS := cmdline.h cmdline_parse.h cmdline_parse_num.h cmdline_parse_ipaddr.h
-INCS += cmdline_parse_etheraddr.h cmdline_parse_string.h cmdline_rdline.h
-INCS += cmdline_vt100.h cmdline_socket.h cmdline_cirbuf.h cmdline_parse_portlist.h
+INCS += cmdline_parse_etheraddr.h cmdline_parse_string.h
+INCS += cmdline_socket.h cmdline_parse_portlist.h
 SYMLINK-$(CONFIG_RTE_LIBRTE_CMDLINE)-include := $(INCS)
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_cmdline/cmdline.c b/lib/librte_cmdline/cmdline.c
index 591b78b0f..d160854e6 100644
--- a/lib/librte_cmdline/cmdline.c
+++ b/lib/librte_cmdline/cmdline.c
@@ -4,79 +4,181 @@
  * All rights reserved.
  */
 
+#include <ctype.h>
+#include <histedit.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdarg.h>
-#include <inttypes.h>
 #include <fcntl.h>
 #include <poll.h>
 #include <errno.h>
-#include <termios.h>
-#include <netinet/in.h>
-
-#include <rte_string_fns.h>
 
 #include "cmdline_parse.h"
-#include "cmdline_rdline.h"
 #include "cmdline.h"
 
-static void
-cmdline_valid_buffer(struct rdline *rdl, const char *buf,
-		     __attribute__((unused)) unsigned int size)
+struct cmdline {
+	char *line;
+	FILE *f_in;
+	FILE *f_out;
+	cmdline_parse_ctx_t *ctx;
+	EditLine *el;
+	History *hist;
+	HistEvent histev;
+	uint32_t eof:1;
+	uint32_t error:1;
+	char prompt[RDLINE_PROMPT_SIZE];
+};
+
+void
+cmdline_set_prompt(struct cmdline *cl, const char *prompt)
 {
-	struct cmdline *cl = rdl->opaque;
-	int ret;
-	ret = cmdline_parse(cl, buf);
-	if (ret == CMDLINE_PARSE_AMBIGUOUS)
-		cmdline_printf(cl, "Ambiguous command\n");
-	else if (ret == CMDLINE_PARSE_NOMATCH)
-		cmdline_printf(cl, "Command not found\n");
-	else if (ret == CMDLINE_PARSE_BAD_ARGS)
-		cmdline_printf(cl, "Bad arguments\n");
+	if (!cl || !prompt)
+		return;
+	snprintf(cl->prompt, sizeof(cl->prompt), "%s", prompt);
 }
 
-static int
-cmdline_complete_buffer(struct rdline *rdl, const char *buf,
-			char *dstbuf, unsigned int dstsize,
-			int *state)
+void *
+cmdline_ctx_get(struct cmdline *cl)
 {
-	struct cmdline *cl = rdl->opaque;
-	return cmdline_complete(cl, buf, state, dstbuf, dstsize);
+	if (!cl)
+		return NULL;
+	return cl->ctx;
 }
 
-int
-cmdline_write_char(struct rdline *rdl, char c)
+static char *
+cmdline_el_prompt(EditLine *el)
 {
-	int ret = -1;
 	struct cmdline *cl;
 
-	if (!rdl)
-		return -1;
-
-	cl = rdl->opaque;
+	if (el_get(el, EL_CLIENTDATA, &cl))
+		return NULL;
+	return cl->prompt;
+}
 
-	if (cl->s_out >= 0)
-		ret = write(cl->s_out, &c, 1);
+static unsigned char
+cmdline_el_execute(EditLine *el, int c)
+{
+	const LineInfo *li = el_line(el);
+	size_t len = li->lastchar - li->buffer;
+	char *line;
+	struct cmdline *cl;
+	int ret;
 
-	return ret;
+	(void)c;
+	if (el_get(el, EL_CLIENTDATA, &cl))
+		return CC_FATAL;
+	line = realloc(cl->line, len + 2);
+	if (!line) {
+		cl->error = 1;
+		return CC_FATAL;
+	}
+	cl->line = line;
+	memcpy(line, li->buffer, len);
+	line[len] = '\n';
+	line[len + 1] = '\0';
+	fputs("\r\n", cl->f_out);
+	ret = cmdline_parse(cl, line);
+	if (ret == CMDLINE_PARSE_AMBIGUOUS)
+		fprintf(cl->f_out, "Ambiguous command\r\n");
+	else if (ret == CMDLINE_PARSE_NOMATCH)
+		fprintf(cl->f_out, "Command not found\r\n");
+	else if (ret == CMDLINE_PARSE_BAD_ARGS)
+		fprintf(cl->f_out, "Bad arguments\r\n");
+	if (cl->error)
+		return CC_FATAL;
+	if (cl->eof)
+		return CC_EOF;
+	if (len) {
+		line[len] = '\0';
+		history(cl->hist, &cl->histev, H_ENTER, line);
+	}
+	return CC_NEWLINE;
 }
 
+static unsigned char
+cmdline_el_complete(EditLine *el, int c)
+{
+	const LineInfo *li = el_line(el);
+	size_t pos = li->cursor - li->buffer;
+	char *line;
+	struct cmdline *cl;
+	char complete_buf[RDLINE_COMPLETE_SIZE];
+	int complete_state;
+	int ret;
 
-void
-cmdline_set_prompt(struct cmdline *cl, const char *prompt)
+	if (el_get(el, EL_CLIENTDATA, &cl))
+		return CC_FATAL;
+	line = realloc(cl->line, pos + 1);
+	if (!line) {
+		cl->error = 1;
+		return CC_FATAL;
+	}
+	cl->line = line;
+	memcpy(line, li->buffer, pos);
+	line[pos] = '\0';
+	if (c == '\t')
+		complete_state = 0;
+	else
+		complete_state = -1;
+	/* see in parse.h for help on complete() */
+	ret = cmdline_complete(cl, line, &complete_state,
+			       complete_buf, sizeof(complete_buf));
+	/* no completion or error */
+	if (ret <= 0)
+		return CC_ARGHACK;
+	/* string must be NUL-terminated */
+	if (strnlen(complete_buf, sizeof(complete_buf)) == sizeof(complete_buf))
+		return CC_ERROR;
+	/* add chars */
+	if (ret == CMDLINE_PARSE_COMPLETED_BUFFER) {
+		/* if in the middle of a token, remove its suffix first */
+		for (pos = 0; li->cursor + pos != li->lastchar; pos++)
+			if (isblank(li->cursor[pos]))
+				break;
+		el_cursor(el, pos);
+		el_deletestr(el, pos);
+		if (el_insertstr(el, complete_buf))
+			return CC_ERROR;
+		return CC_REFRESH;
+	}
+	/* choice */
+	fputs("\r\n", cl->f_out);
+	while (ret) {
+		fputc(' ', cl->f_out);
+		fputs(complete_buf, cl->f_out);
+		fputs("\r\n", cl->f_out);
+		ret = cmdline_complete(cl, line, &complete_state,
+				       complete_buf, sizeof(complete_buf));
+	}
+	el_set(el, EL_REFRESH);
+	return CC_REDISPLAY;
+}
+
+static unsigned char
+cmdline_el_delete_next_char_or_eof(EditLine *el, int c)
 {
-	if (!cl || !prompt)
-		return;
-	snprintf(cl->prompt, sizeof(cl->prompt), "%s", prompt);
+	const LineInfo *li = el_line(el);
+	struct cmdline *cl;
+
+	(void)c;
+	if (el_get(el, EL_CLIENTDATA, &cl))
+		return CC_FATAL;
+	if (li->buffer == li->lastchar) {
+		cl->eof = 1;
+		return CC_EOF;
+	}
+	el_cursor(el, 1);
+	el_deletestr(el, 1);
+	return CC_REFRESH;
 }
 
 struct cmdline *
 cmdline_new(cmdline_parse_ctx_t *ctx, const char *prompt, int s_in, int s_out)
 {
 	struct cmdline *cl;
-	int ret;
 
 	if (!ctx || !prompt)
 		return NULL;
@@ -85,36 +187,89 @@ cmdline_new(cmdline_parse_ctx_t *ctx, const char *prompt, int s_in, int s_out)
 	if (cl == NULL)
 		return NULL;
 	memset(cl, 0, sizeof(struct cmdline));
-	cl->s_in = s_in;
-	cl->s_out = s_out;
+	cl->line = NULL;
+	s_in = dup(s_in);
+	s_out = s_out != -1 ? dup(s_out) : open("/dev/null", O_WRONLY);
+	if (s_in == -1 || s_out == -1)
+		goto error;
+	cl->f_in = fdopen(s_in, "rb");
+	cl->f_out = fdopen(s_out, "wb");
+	if (!cl->f_in || !cl->f_out)
+		goto error;
 	cl->ctx = ctx;
-
-	ret = rdline_init(&cl->rdl, cmdline_write_char, cmdline_valid_buffer,
-			cmdline_complete_buffer);
-	if (ret != 0) {
-		free(cl);
-		return NULL;
-	}
-
-	cl->rdl.opaque = cl;
+	cl->el = el_init("dpdk", cl->f_in, cl->f_out, stderr);
+	if (!cl->el)
+		goto error;
+	if (el_set(cl->el, EL_CLIENTDATA, cl))
+		goto error;
 	cmdline_set_prompt(cl, prompt);
-	rdline_newline(&cl->rdl, cl->prompt);
-
+	if (el_set(cl->el, EL_PROMPT, cmdline_el_prompt))
+		goto error;
+	if (el_set(cl->el, EL_EDITOR, "emacs"))
+		goto error;
+	if (el_set(cl->el, EL_SIGNAL, 1))
+		goto error;
+	cl->hist = history_init();
+	if (!cl->hist)
+		goto error;
+	if (history(cl->hist, &cl->histev, H_SETSIZE,
+		    RDLINE_HISTORY_MAX_LINE) < 0)
+		goto error;
+	if (history(cl->hist, &cl->histev, H_SETUNIQUE, 1))
+		goto error;
+	if (el_set(cl->el, EL_HIST, history, cl->hist))
+		goto error;
+	if (el_set(cl->el, EL_ADDFN, "ed-execute", "Execute command",
+		   cmdline_el_execute))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "^J", "ed-execute", NULL))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "^M", "ed-execute", NULL))
+		goto error;
+	if (el_set(cl->el, EL_ADDFN, "ed-complete", "Complete argument",
+		   cmdline_el_complete))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "^I", "ed-complete", NULL))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "?", "ed-complete", NULL))
+		goto error;
+	if (el_set(cl->el, EL_ADDFN, "ed-delete-next-char-or-eof",
+		   "Delete next character or assume EOF",
+		   cmdline_el_delete_next_char_or_eof))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "^D",
+		   "ed-delete-next-char-or-eof", NULL))
+		goto error;
+	if (el_set(cl->el, EL_BIND, "^W", "ed-delete-prev-word", NULL))
+		goto error;
 	return cl;
+error:
+	if (cl->hist)
+		history_end(cl->hist);
+	if (cl->el)
+		el_end(cl->el);
+	if (cl->f_out)
+		fclose(cl->f_out);
+	else if (s_out != -1)
+		close(s_out);
+	if (cl->f_in)
+		fclose(cl->f_in);
+	else if (s_in != -1)
+		close(s_in);
+	free(cl);
+	return NULL;
 }
 
 void
 cmdline_free(struct cmdline *cl)
 {
-	dprintf("called\n");
-
 	if (!cl)
 		return;
-
-	if (cl->s_in > 2)
-		close(cl->s_in);
-	if (cl->s_out != cl->s_in && cl->s_out > 2)
-		close(cl->s_out);
+	history_end(cl->hist);
+	el_end(cl->el);
+	fclose(cl->f_out);
+	fclose(cl->f_in);
+	free(cl->line);
 	free(cl);
 }
 
@@ -126,70 +281,23 @@ cmdline_printf(const struct cmdline *cl, const char *fmt, ...)
 	if (!cl || !fmt)
 		return;
 
-#ifdef _GNU_SOURCE
-	if (cl->s_out < 0)
-		return;
-	va_start(ap, fmt);
-	vdprintf(cl->s_out, fmt, ap);
-	va_end(ap);
-#else
-	int ret;
-	char *buf;
-
-	if (cl->s_out < 0)
-		return;
-
-	buf = malloc(BUFSIZ);
-	if (buf == NULL)
-		return;
 	va_start(ap, fmt);
-	ret = vsnprintf(buf, BUFSIZ, fmt, ap);
+	vfprintf(cl->f_out, fmt, ap);
 	va_end(ap);
-	if (ret < 0) {
-		free(buf);
-		return;
-	}
-	if (ret >= BUFSIZ)
-		ret = BUFSIZ - 1;
-	ret = write(cl->s_out, buf, ret);
-	(void)ret;
-	free(buf);
-#endif
 }
 
 int
 cmdline_in(struct cmdline *cl, const char *buf, int size)
 {
-	const char *history, *buffer;
-	size_t histlen, buflen;
-	int ret = 0;
-	int i, same;
+	int i;
 
 	if (!cl || !buf)
 		return -1;
 
 	for (i=0; i<size; i++) {
-		ret = rdline_char_in(&cl->rdl, buf[i]);
-
-		if (ret == RDLINE_RES_VALIDATED) {
-			buffer = rdline_get_buffer(&cl->rdl);
-			history = rdline_get_history_item(&cl->rdl, 0);
-			if (history) {
-				histlen = strnlen(history, RDLINE_BUF_SIZE);
-				same = !memcmp(buffer, history, histlen) &&
-					buffer[histlen] == '\n';
-			}
-			else
-				same = 0;
-			buflen = strnlen(buffer, RDLINE_BUF_SIZE);
-			if (buflen > 1 && !same)
-				rdline_add_history(&cl->rdl, buffer);
-			rdline_newline(&cl->rdl, cl->prompt);
-		}
-		else if (ret == RDLINE_RES_EOF)
-			return -1;
-		else if (ret == RDLINE_RES_EXITED)
-			return -1;
+		char tmp[2] = { buf[i], '\0' };
+
+		el_push(cl->el, tmp);
 	}
 	return i;
 }
@@ -199,7 +307,7 @@ cmdline_quit(struct cmdline *cl)
 {
 	if (!cl)
 		return;
-	rdline_quit(&cl->rdl);
+	cl->eof = 1;
 }
 
 int
@@ -207,48 +315,49 @@ cmdline_poll(struct cmdline *cl)
 {
 	struct pollfd pfd;
 	int status;
-	ssize_t read_status;
-	char c;
+	int read_status;
+	int flags;
 
 	if (!cl)
 		return -EINVAL;
-	else if (cl->rdl.status == RDLINE_EXITED)
+	else if (cl->error)
+		return RDLINE_ERROR;
+	else if (cl->eof)
 		return RDLINE_EXITED;
 
-	pfd.fd = cl->s_in;
+	pfd.fd = fileno(cl->f_in);
 	pfd.events = POLLIN;
 	pfd.revents = 0;
 
 	status = poll(&pfd, 1, 0);
 	if (status < 0)
-		return status;
-	else if (status > 0) {
-		c = -1;
-		read_status = read(cl->s_in, &c, 1);
-		if (read_status < 0)
-			return read_status;
-
-		status = cmdline_in(cl, &c, 1);
-		if (status < 0 && cl->rdl.status != RDLINE_EXITED)
-			return status;
-	}
-
-	return cl->rdl.status;
+		return RDLINE_ERROR;
+	if (!status)
+		return RDLINE_RUNNING;
+	flags = fcntl(pfd.fd, F_GETFL);
+	if (!(flags & O_NONBLOCK))
+		fcntl(pfd.fd, F_SETFL, flags | O_NONBLOCK);
+	if (!el_gets(cl->el, &read_status) && read_status == -1)
+		cl->error = 1;
+	if (!(flags & O_NONBLOCK))
+		fcntl(pfd.fd, F_SETFL, flags);
+	return cl->error ? RDLINE_ERROR :
+		cl->eof ? RDLINE_EXITED :
+		RDLINE_RUNNING;
 }
 
 void
 cmdline_interact(struct cmdline *cl)
 {
-	char c;
-
 	if (!cl)
 		return;
 
-	c = -1;
-	while (1) {
-		if (read(cl->s_in, &c, 1) <= 0)
-			break;
-		if (cmdline_in(cl, &c, 1) < 0)
-			break;
+	while (!cl->error && !cl->eof) {
+		int read_status;
+
+		if (el_gets(cl->el, &read_status))
+			continue;
+		if (read_status == -1)
+			cl->error = 1;
 	}
 }
diff --git a/lib/librte_cmdline/cmdline.h b/lib/librte_cmdline/cmdline.h
index 27d2effdf..1f443be60 100644
--- a/lib/librte_cmdline/cmdline.h
+++ b/lib/librte_cmdline/cmdline.h
@@ -7,8 +7,6 @@
 #ifndef _CMDLINE_H_
 #define _CMDLINE_H_
 
-#include <termios.h>
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 
 /**
@@ -21,22 +19,26 @@
 extern "C" {
 #endif
 
-struct cmdline {
-	int s_in;
-	int s_out;
-	cmdline_parse_ctx_t *ctx;
-	struct rdline rdl;
-	char prompt[RDLINE_PROMPT_SIZE];
-	struct termios oldterm;
+#define RDLINE_PROMPT_SIZE 32
+#define RDLINE_HISTORY_MAX_LINE 64
+#define RDLINE_COMPLETE_SIZE 128
+
+enum rdline_status {
+	RDLINE_ERROR = -1,
+	RDLINE_INIT,
+	RDLINE_RUNNING,
+	RDLINE_EXITED,
 };
 
+struct cmdline;
+
+void *cmdline_ctx_get(struct cmdline *cl);
 struct cmdline *cmdline_new(cmdline_parse_ctx_t *ctx, const char *prompt, int s_in, int s_out);
 void cmdline_set_prompt(struct cmdline *cl, const char *prompt);
 void cmdline_free(struct cmdline *cl);
 void cmdline_printf(const struct cmdline *cl, const char *fmt, ...)
 	__attribute__((format(printf,2,3)));
 int cmdline_in(struct cmdline *cl, const char *buf, int size);
-int cmdline_write_char(struct rdline *rdl, char c);
 
 /**
  * This function is nonblocking equivalent of ``cmdline_interact()``. It polls
diff --git a/lib/librte_cmdline/cmdline_cirbuf.c b/lib/librte_cmdline/cmdline_cirbuf.c
deleted file mode 100644
index 829a8af56..000000000
--- a/lib/librte_cmdline/cmdline_cirbuf.c
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#include <string.h>
-#include <errno.h>
-#include <stdio.h>
-
-#include "cmdline_cirbuf.h"
-
-
-int
-cirbuf_init(struct cirbuf *cbuf, char *buf, unsigned int start, unsigned int maxlen)
-{
-	if (!cbuf || !buf)
-		return -EINVAL;
-	cbuf->maxlen = maxlen;
-	cbuf->len = 0;
-	cbuf->start = start;
-	cbuf->end = start;
-	cbuf->buf = buf;
-	return 0;
-}
-
-/* multiple add */
-
-int
-cirbuf_add_buf_head(struct cirbuf *cbuf, const char *c, unsigned int n)
-{
-	unsigned int e;
-
-	if (!cbuf || !c || !n || n > CIRBUF_GET_FREELEN(cbuf))
-		return -EINVAL;
-
-	e = CIRBUF_IS_EMPTY(cbuf) ? 1 : 0;
-
-	if (n < cbuf->start + e) {
-		dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->start - n + e, n);
-		memcpy(cbuf->buf + cbuf->start - n + e, c, n);
-	}
-	else {
-		dprintf("s[%d] -> d[%d] (%d)\n", + n - (cbuf->start + e), 0,
-			cbuf->start + e);
-		dprintf("s[%d] -> d[%d] (%d)\n", cbuf->maxlen - n +
-			(cbuf->start + e), 0, n - (cbuf->start + e));
-		memcpy(cbuf->buf, c  + n - (cbuf->start + e) , cbuf->start + e);
-		memcpy(cbuf->buf + cbuf->maxlen - n + (cbuf->start + e), c,
-		       n - (cbuf->start + e));
-	}
-	cbuf->len += n;
-	cbuf->start += (cbuf->maxlen - n + e);
-	cbuf->start %= cbuf->maxlen;
-	return n;
-}
-
-/* multiple add */
-
-int
-cirbuf_add_buf_tail(struct cirbuf *cbuf, const char *c, unsigned int n)
-{
-	unsigned int e;
-
-	if (!cbuf || !c || !n || n > CIRBUF_GET_FREELEN(cbuf))
-		return -EINVAL;
-
-	e = CIRBUF_IS_EMPTY(cbuf) ? 1 : 0;
-
-	if (n < cbuf->maxlen - cbuf->end - 1 + e) {
-		dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->end + !e, n);
-		memcpy(cbuf->buf + cbuf->end + !e, c, n);
-	}
-	else {
-		dprintf("s[%d] -> d[%d] (%d)\n", cbuf->end + !e, 0,
-			cbuf->maxlen - cbuf->end - 1 + e);
-		dprintf("s[%d] -> d[%d] (%d)\n", cbuf->maxlen - cbuf->end - 1 +
-			e, 0, n - cbuf->maxlen + cbuf->end + 1 - e);
-		memcpy(cbuf->buf + cbuf->end + !e, c, cbuf->maxlen -
-		       cbuf->end - 1 + e);
-		memcpy(cbuf->buf, c + cbuf->maxlen - cbuf->end - 1 + e,
-		       n - cbuf->maxlen + cbuf->end + 1 - e);
-	}
-	cbuf->len += n;
-	cbuf->end += n - e;
-	cbuf->end %= cbuf->maxlen;
-	return n;
-}
-
-/* add at head */
-
-static inline void
-__cirbuf_add_head(struct cirbuf * cbuf, char c)
-{
-	if (!CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->start += (cbuf->maxlen - 1);
-		cbuf->start %= cbuf->maxlen;
-	}
-	cbuf->buf[cbuf->start] = c;
-	cbuf->len ++;
-}
-
-int
-cirbuf_add_head_safe(struct cirbuf * cbuf, char c)
-{
-	if (cbuf && !CIRBUF_IS_FULL(cbuf)) {
-		__cirbuf_add_head(cbuf, c);
-		return 0;
-	}
-	return -EINVAL;
-}
-
-void
-cirbuf_add_head(struct cirbuf * cbuf, char c)
-{
-	__cirbuf_add_head(cbuf, c);
-}
-
-/* add at tail */
-
-static inline void
-__cirbuf_add_tail(struct cirbuf * cbuf, char c)
-{
-	if (!CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->end ++;
-		cbuf->end %= cbuf->maxlen;
-	}
-	cbuf->buf[cbuf->end] = c;
-	cbuf->len ++;
-}
-
-int
-cirbuf_add_tail_safe(struct cirbuf * cbuf, char c)
-{
-	if (cbuf && !CIRBUF_IS_FULL(cbuf)) {
-		__cirbuf_add_tail(cbuf, c);
-		return 0;
-	}
-	return -EINVAL;
-}
-
-void
-cirbuf_add_tail(struct cirbuf * cbuf, char c)
-{
-	__cirbuf_add_tail(cbuf, c);
-}
-
-
-static inline void
-__cirbuf_shift_left(struct cirbuf *cbuf)
-{
-	unsigned int i;
-	char tmp = cbuf->buf[cbuf->start];
-
-	for (i=0 ; i<cbuf->len ; i++) {
-		cbuf->buf[(cbuf->start+i)%cbuf->maxlen] =
-			cbuf->buf[(cbuf->start+i+1)%cbuf->maxlen];
-	}
-	cbuf->buf[(cbuf->start-1+cbuf->maxlen)%cbuf->maxlen] = tmp;
-	cbuf->start += (cbuf->maxlen - 1);
-	cbuf->start %= cbuf->maxlen;
-	cbuf->end += (cbuf->maxlen - 1);
-	cbuf->end %= cbuf->maxlen;
-}
-
-static inline void
-__cirbuf_shift_right(struct cirbuf *cbuf)
-{
-	unsigned int i;
-	char tmp = cbuf->buf[cbuf->end];
-
-	for (i=0 ; i<cbuf->len ; i++) {
-		cbuf->buf[(cbuf->end+cbuf->maxlen-i)%cbuf->maxlen] =
-			cbuf->buf[(cbuf->end+cbuf->maxlen-i-1)%cbuf->maxlen];
-	}
-	cbuf->buf[(cbuf->end+1)%cbuf->maxlen] = tmp;
-	cbuf->start += 1;
-	cbuf->start %= cbuf->maxlen;
-	cbuf->end += 1;
-	cbuf->end %= cbuf->maxlen;
-}
-
-/* XXX we could do a better algorithm here... */
-int
-cirbuf_align_left(struct cirbuf * cbuf)
-{
-	if (!cbuf)
-		return -EINVAL;
-
-	if (cbuf->start < cbuf->maxlen/2) {
-		while (cbuf->start != 0) {
-			__cirbuf_shift_left(cbuf);
-		}
-	}
-	else {
-		while (cbuf->start != 0) {
-			__cirbuf_shift_right(cbuf);
-		}
-	}
-
-	return 0;
-}
-
-/* XXX we could do a better algorithm here... */
-int
-cirbuf_align_right(struct cirbuf * cbuf)
-{
-	if (!cbuf)
-		return -EINVAL;
-
-	if (cbuf->start >= cbuf->maxlen/2) {
-		while (cbuf->end != cbuf->maxlen-1) {
-			__cirbuf_shift_left(cbuf);
-		}
-	}
-	else {
-		while (cbuf->start != cbuf->maxlen-1) {
-			__cirbuf_shift_right(cbuf);
-		}
-	}
-
-	return 0;
-}
-
-/* buffer del */
-
-int
-cirbuf_del_buf_head(struct cirbuf *cbuf, unsigned int size)
-{
-	if (!cbuf || !size || size > CIRBUF_GET_LEN(cbuf))
-		return -EINVAL;
-
-	cbuf->len -= size;
-	if (CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->start += size - 1;
-		cbuf->start %= cbuf->maxlen;
-	}
-	else {
-		cbuf->start += size;
-		cbuf->start %= cbuf->maxlen;
-	}
-	return 0;
-}
-
-/* buffer del */
-
-int
-cirbuf_del_buf_tail(struct cirbuf *cbuf, unsigned int size)
-{
-	if (!cbuf || !size || size > CIRBUF_GET_LEN(cbuf))
-		return -EINVAL;
-
-	cbuf->len -= size;
-	if (CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->end  += (cbuf->maxlen - size + 1);
-		cbuf->end %= cbuf->maxlen;
-	}
-	else {
-		cbuf->end  += (cbuf->maxlen - size);
-		cbuf->end %= cbuf->maxlen;
-	}
-	return 0;
-}
-
-/* del at head */
-
-static inline void
-__cirbuf_del_head(struct cirbuf * cbuf)
-{
-	cbuf->len --;
-	if (!CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->start ++;
-		cbuf->start %= cbuf->maxlen;
-	}
-}
-
-int
-cirbuf_del_head_safe(struct cirbuf * cbuf)
-{
-	if (cbuf && !CIRBUF_IS_EMPTY(cbuf)) {
-		__cirbuf_del_head(cbuf);
-		return 0;
-	}
-	return -EINVAL;
-}
-
-void
-cirbuf_del_head(struct cirbuf * cbuf)
-{
-	__cirbuf_del_head(cbuf);
-}
-
-/* del at tail */
-
-static inline void
-__cirbuf_del_tail(struct cirbuf * cbuf)
-{
-	cbuf->len --;
-	if (!CIRBUF_IS_EMPTY(cbuf)) {
-		cbuf->end  += (cbuf->maxlen - 1);
-		cbuf->end %= cbuf->maxlen;
-	}
-}
-
-int
-cirbuf_del_tail_safe(struct cirbuf * cbuf)
-{
-	if (cbuf && !CIRBUF_IS_EMPTY(cbuf)) {
-		__cirbuf_del_tail(cbuf);
-		return 0;
-	}
-	return -EINVAL;
-}
-
-void
-cirbuf_del_tail(struct cirbuf * cbuf)
-{
-	__cirbuf_del_tail(cbuf);
-}
-
-/* convert to buffer */
-
-int
-cirbuf_get_buf_head(struct cirbuf *cbuf, char *c, unsigned int size)
-{
-	unsigned int n;
-
-	if (!cbuf || !c)
-		return -EINVAL;
-
-	n = (size < CIRBUF_GET_LEN(cbuf)) ? size : CIRBUF_GET_LEN(cbuf);
-
-	if (!n)
-		return 0;
-
-	if (cbuf->start <= cbuf->end) {
-		dprintf("s[%d] -> d[%d] (%d)\n", cbuf->start, 0, n);
-		memcpy(c, cbuf->buf + cbuf->start , n);
-	}
-	else {
-		/* check if we need to go from end to the beginning */
-		if (n <= cbuf->maxlen - cbuf->start) {
-			dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->start, n);
-			memcpy(c, cbuf->buf + cbuf->start , n);
-		}
-		else {
-			dprintf("s[%d] -> d[%d] (%d)\n", cbuf->start, 0,
-				cbuf->maxlen - cbuf->start);
-			dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->maxlen - cbuf->start,
-				n - cbuf->maxlen + cbuf->start);
-			memcpy(c, cbuf->buf + cbuf->start , cbuf->maxlen - cbuf->start);
-			memcpy(c + cbuf->maxlen - cbuf->start, cbuf->buf,
-				   n - cbuf->maxlen + cbuf->start);
-		}
-	}
-	return n;
-}
-
-/* convert to buffer */
-
-int
-cirbuf_get_buf_tail(struct cirbuf *cbuf, char *c, unsigned int size)
-{
-	unsigned int n;
-
-	if (!cbuf || !c)
-		return -EINVAL;
-
-	n = (size < CIRBUF_GET_LEN(cbuf)) ? size : CIRBUF_GET_LEN(cbuf);
-
-	if (!n)
-		return 0;
-
-	if (cbuf->start <= cbuf->end) {
-		dprintf("s[%d] -> d[%d] (%d)\n", cbuf->end - n + 1, 0, n);
-		memcpy(c, cbuf->buf + cbuf->end - n + 1, n);
-	}
-	else {
-		/* check if we need to go from end to the beginning */
-		if (n <= cbuf->end + 1) {
-			dprintf("s[%d] -> d[%d] (%d)\n", 0, cbuf->end - n + 1, n);
-			memcpy(c, cbuf->buf + cbuf->end - n + 1, n);
-		}
-		else {
-			dprintf("s[%d] -> d[%d] (%d)\n", 0,
-				cbuf->maxlen - cbuf->start, cbuf->end + 1);
-			dprintf("s[%d] -> d[%d] (%d)\n",
-				cbuf->maxlen - n + cbuf->end + 1, 0, n - cbuf->end - 1);
-			memcpy(c + cbuf->maxlen - cbuf->start,
-					       cbuf->buf, cbuf->end + 1);
-			memcpy(c, cbuf->buf + cbuf->maxlen - n + cbuf->end +1,
-				   n - cbuf->end - 1);
-		}
-	}
-	return n;
-}
-
-/* get head or get tail */
-
-char
-cirbuf_get_head(struct cirbuf * cbuf)
-{
-	return cbuf->buf[cbuf->start];
-}
-
-/* get head or get tail */
-
-char
-cirbuf_get_tail(struct cirbuf * cbuf)
-{
-	return cbuf->buf[cbuf->end];
-}
diff --git a/lib/librte_cmdline/cmdline_cirbuf.h b/lib/librte_cmdline/cmdline_cirbuf.h
deleted file mode 100644
index c23b211ad..000000000
--- a/lib/librte_cmdline/cmdline_cirbuf.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#ifndef _CIRBUF_H_
-#define _CIRBUF_H_
-
-#include <rte_config.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * This structure is the header of a cirbuf type.
- */
-struct cirbuf {
-	unsigned int maxlen;    /**< total len of the fifo (number of elements) */
-	unsigned int start;     /**< indice of the first elt */
-	unsigned int end;       /**< indice of the last elt */
-	unsigned int len;       /**< current len of fifo */
-	char *buf;
-};
-
-#ifdef RTE_LIBRTE_CMDLINE_DEBUG
-#define dprintf_(fmt, ...) printf("line %3.3d - " fmt "%.0s", __LINE__, __VA_ARGS__)
-#define dprintf(...) dprintf_(__VA_ARGS__, "dummy")
-#else
-#define dprintf(...) (void)0
-#endif
-
-
-/**
- * Init the circular buffer
- */
-int cirbuf_init(struct cirbuf *cbuf, char *buf, unsigned int start, unsigned int maxlen);
-
-
-/**
- * Return 1 if the circular buffer is full
- */
-#define CIRBUF_IS_FULL(cirbuf) ((cirbuf)->maxlen == (cirbuf)->len)
-
-/**
- * Return 1 if the circular buffer is empty
- */
-#define CIRBUF_IS_EMPTY(cirbuf) ((cirbuf)->len == 0)
-
-/**
- * return current size of the circular buffer (number of used elements)
- */
-#define CIRBUF_GET_LEN(cirbuf) ((cirbuf)->len)
-
-/**
- * return size of the circular buffer (used + free elements)
- */
-#define CIRBUF_GET_MAXLEN(cirbuf) ((cirbuf)->maxlen)
-
-/**
- * return the number of free elts
- */
-#define CIRBUF_GET_FREELEN(cirbuf) ((cirbuf)->maxlen - (cirbuf)->len)
-
-/**
- * Iterator for a circular buffer
- *   c: struct cirbuf pointer
- *   i: an integer type internally used in the macro
- *   e: char that takes the value for each iteration
- */
-#define CIRBUF_FOREACH(c, i, e)                                 \
-	for ( i=0, e=(c)->buf[(c)->start] ;                     \
-		i<((c)->len) ;                                  \
-		i ++,  e=(c)->buf[((c)->start+i)%((c)->maxlen)])
-
-
-/**
- * Add a character at head of the circular buffer. Return 0 on success, or
- * a negative value on error.
- */
-int cirbuf_add_head_safe(struct cirbuf *cbuf, char c);
-
-/**
- * Add a character at head of the circular buffer. You _must_ check that you
- * have enough free space in the buffer before calling this func.
- */
-void cirbuf_add_head(struct cirbuf *cbuf, char c);
-
-/**
- * Add a character at tail of the circular buffer. Return 0 on success, or
- * a negative value on error.
- */
-int cirbuf_add_tail_safe(struct cirbuf *cbuf, char c);
-
-/**
- * Add a character at tail of the circular buffer. You _must_ check that you
- * have enough free space in the buffer before calling this func.
- */
-void cirbuf_add_tail(struct cirbuf *cbuf, char c);
-
-/**
- * Remove a char at the head of the circular buffer. Return 0 on
- * success, or a negative value on error.
- */
-int cirbuf_del_head_safe(struct cirbuf *cbuf);
-
-/**
- * Remove a char at the head of the circular buffer. You _must_ check
- * that buffer is not empty before calling the function.
- */
-void cirbuf_del_head(struct cirbuf *cbuf);
-
-/**
- * Remove a char at the tail of the circular buffer. Return 0 on
- * success, or a negative value on error.
- */
-int cirbuf_del_tail_safe(struct cirbuf *cbuf);
-
-/**
- * Remove a char at the tail of the circular buffer. You _must_ check
- * that buffer is not empty before calling the function.
- */
-void cirbuf_del_tail(struct cirbuf *cbuf);
-
-/**
- * Return the head of the circular buffer. You _must_ check that
- * buffer is not empty before calling the function.
- */
-char cirbuf_get_head(struct cirbuf *cbuf);
-
-/**
- * Return the tail of the circular buffer. You _must_ check that
- * buffer is not empty before calling the function.
- */
-char cirbuf_get_tail(struct cirbuf *cbuf);
-
-/**
- * Add a buffer at head of the circular buffer. 'c' is a pointer to a
- * buffer, and n is the number of char to add. Return the number of
- * copied bytes on success, or a negative value on error.
- */
-int cirbuf_add_buf_head(struct cirbuf *cbuf, const char *c, unsigned int n);
-
-/**
- * Add a buffer at tail of the circular buffer. 'c' is a pointer to a
- * buffer, and n is the number of char to add. Return the number of
- * copied bytes on success, or a negative value on error.
- */
-int cirbuf_add_buf_tail(struct cirbuf *cbuf, const char *c, unsigned int n);
-
-/**
- * Remove chars at the head of the circular buffer. Return 0 on
- * success, or a negative value on error.
- */
-int cirbuf_del_buf_head(struct cirbuf *cbuf, unsigned int size);
-
-/**
- * Remove chars at the tail of the circular buffer. Return 0 on
- * success, or a negative value on error.
- */
-int cirbuf_del_buf_tail(struct cirbuf *cbuf, unsigned int size);
-
-/**
- * Copy a maximum of 'size' characters from the head of the circular
- * buffer to a flat one pointed by 'c'. Return the number of copied
- * chars.
- */
-int cirbuf_get_buf_head(struct cirbuf *cbuf, char *c, unsigned int size);
-
-/**
- * Copy a maximum of 'size' characters from the tail of the circular
- * buffer to a flat one pointed by 'c'. Return the number of copied
- * chars.
- */
-int cirbuf_get_buf_tail(struct cirbuf *cbuf, char *c, unsigned int size);
-
-
-/**
- * Set the start of the data to the index 0 of the internal buffer.
- */
-int cirbuf_align_left(struct cirbuf *cbuf);
-
-/**
- * Set the end of the data to the last index of the internal buffer.
- */
-int cirbuf_align_right(struct cirbuf *cbuf);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _CIRBUF_H_ */
diff --git a/lib/librte_cmdline/cmdline_parse.c b/lib/librte_cmdline/cmdline_parse.c
index 961f9befd..f75870b5b 100644
--- a/lib/librte_cmdline/cmdline_parse.c
+++ b/lib/librte_cmdline/cmdline_parse.c
@@ -16,7 +16,6 @@
 
 #include <rte_string_fns.h>
 
-#include "cmdline_rdline.h"
 #include "cmdline_parse.h"
 #include "cmdline.h"
 
@@ -216,7 +215,7 @@ cmdline_parse(struct cmdline *cl, const char * buf)
 	if (!cl || !buf)
 		return CMDLINE_PARSE_BAD_ARGS;
 
-	ctx = cl->ctx;
+	ctx = cmdline_ctx_get(cl);
 
 	/*
 	 * - look if the buffer contains at least one line
@@ -334,7 +333,7 @@ cmdline_complete(struct cmdline *cl, const char *buf, int *state,
 	if (!cl || !buf || !state || !dst)
 		return -1;
 
-	ctx = cl->ctx;
+	ctx = cmdline_ctx_get(cl);
 
 	debug_printf("%s called\n", __func__);
 	memset(&token_hdr, 0, sizeof(token_hdr));
@@ -346,7 +345,7 @@ cmdline_complete(struct cmdline *cl, const char *buf, int *state,
 		if (isblank2(buf[i]) && !isblank2(buf[i+1]))
 			partial_tok = buf+i+1;
 	}
-	partial_tok_len = strnlen(partial_tok, RDLINE_BUF_SIZE);
+	partial_tok_len = strlen(partial_tok);
 
 	/* first call -> do a first pass */
 	if (*state <= 0) {
diff --git a/lib/librte_cmdline/cmdline_rdline.c b/lib/librte_cmdline/cmdline_rdline.c
deleted file mode 100644
index 2cb53e38f..000000000
--- a/lib/librte_cmdline/cmdline_rdline.c
+++ /dev/null
@@ -1,644 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdarg.h>
-#include <errno.h>
-#include <ctype.h>
-
-#include "cmdline_cirbuf.h"
-#include "cmdline_rdline.h"
-
-static void rdline_puts(struct rdline *rdl, const char *buf);
-static void rdline_miniprintf(struct rdline *rdl,
-			      const char *buf, unsigned int val);
-
-static void rdline_remove_old_history_item(struct rdline *rdl);
-static void rdline_remove_first_history_item(struct rdline *rdl);
-static unsigned int rdline_get_history_size(struct rdline *rdl);
-
-
-/* isblank() needs _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE, so use our
- * own. */
-static int
-isblank2(char c)
-{
-	if (c == ' ' ||
-	    c == '\t' )
-		return 1;
-	return 0;
-}
-
-int
-rdline_init(struct rdline *rdl,
-		 rdline_write_char_t *write_char,
-		 rdline_validate_t *validate,
-		 rdline_complete_t *complete)
-{
-	if (!rdl || !write_char || !validate || !complete)
-		return -EINVAL;
-	memset(rdl, 0, sizeof(*rdl));
-	rdl->validate = validate;
-	rdl->complete = complete;
-	rdl->write_char = write_char;
-	rdl->status = RDLINE_INIT;
-	return cirbuf_init(&rdl->history, rdl->history_buf, 0, RDLINE_HISTORY_BUF_SIZE);
-}
-
-void
-rdline_newline(struct rdline *rdl, const char *prompt)
-{
-	unsigned int i;
-
-	if (!rdl || !prompt)
-		return;
-
-	vt100_init(&rdl->vt100);
-	cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE);
-	cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE);
-
-	rdl->prompt_size = strnlen(prompt, RDLINE_PROMPT_SIZE-1);
-	if (prompt != rdl->prompt)
-		memcpy(rdl->prompt, prompt, rdl->prompt_size);
-	rdl->prompt[RDLINE_PROMPT_SIZE-1] = '\0';
-
-	for (i=0 ; i<rdl->prompt_size ; i++)
-		rdl->write_char(rdl, rdl->prompt[i]);
-	rdl->status = RDLINE_RUNNING;
-
-	rdl->history_cur_line = -1;
-}
-
-void
-rdline_stop(struct rdline *rdl)
-{
-	if (!rdl)
-		return;
-	rdl->status = RDLINE_INIT;
-}
-
-void
-rdline_quit(struct rdline *rdl)
-{
-	if (!rdl)
-		return;
-	rdl->status = RDLINE_EXITED;
-}
-
-void
-rdline_restart(struct rdline *rdl)
-{
-	if (!rdl)
-		return;
-	rdl->status = RDLINE_RUNNING;
-}
-
-void
-rdline_reset(struct rdline *rdl)
-{
-	if (!rdl)
-		return;
-	vt100_init(&rdl->vt100);
-	cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE);
-	cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE);
-
-	rdl->status = RDLINE_RUNNING;
-
-	rdl->history_cur_line = -1;
-}
-
-const char *
-rdline_get_buffer(struct rdline *rdl)
-{
-	if (!rdl)
-		return NULL;
-	unsigned int len_l, len_r;
-	cirbuf_align_left(&rdl->left);
-	cirbuf_align_left(&rdl->right);
-
-	len_l = CIRBUF_GET_LEN(&rdl->left);
-	len_r = CIRBUF_GET_LEN(&rdl->right);
-	memcpy(rdl->left_buf+len_l, rdl->right_buf, len_r);
-
-	rdl->left_buf[len_l + len_r] = '\n';
-	rdl->left_buf[len_l + len_r + 1] = '\0';
-	return rdl->left_buf;
-}
-
-static void
-display_right_buffer(struct rdline *rdl, int force)
-{
-	unsigned int i;
-	char tmp;
-
-	if (!force && CIRBUF_IS_EMPTY(&rdl->right))
-		return;
-
-	rdline_puts(rdl, vt100_clear_right);
-	CIRBUF_FOREACH(&rdl->right, i, tmp) {
-		rdl->write_char(rdl, tmp);
-	}
-	if (!CIRBUF_IS_EMPTY(&rdl->right))
-		rdline_miniprintf(rdl, vt100_multi_left,
-				  CIRBUF_GET_LEN(&rdl->right));
-}
-
-void
-rdline_redisplay(struct rdline *rdl)
-{
-	unsigned int i;
-	char tmp;
-
-	if (!rdl)
-		return;
-
-	rdline_puts(rdl, vt100_home);
-	for (i=0 ; i<rdl->prompt_size ; i++)
-		rdl->write_char(rdl, rdl->prompt[i]);
-	CIRBUF_FOREACH(&rdl->left, i, tmp) {
-		rdl->write_char(rdl, tmp);
-	}
-	display_right_buffer(rdl, 1);
-}
-
-int
-rdline_char_in(struct rdline *rdl, char c)
-{
-	unsigned int i;
-	int cmd;
-	char tmp;
-	char *buf;
-
-	if (!rdl)
-		return -EINVAL;
-
-	if (rdl->status == RDLINE_EXITED)
-		return RDLINE_RES_EXITED;
-	if (rdl->status != RDLINE_RUNNING)
-		return RDLINE_RES_NOT_RUNNING;
-
-	cmd = vt100_parser(&rdl->vt100, c);
-	if (cmd == -2)
-		return RDLINE_RES_SUCCESS;
-
-	if (cmd >= 0) {
-		switch (cmd) {
-		/* move caret 1 char to the left */
-		case CMDLINE_KEY_CTRL_B:
-		case CMDLINE_KEY_LEFT_ARR:
-			if (CIRBUF_IS_EMPTY(&rdl->left))
-				break;
-			tmp = cirbuf_get_tail(&rdl->left);
-			cirbuf_del_tail(&rdl->left);
-			cirbuf_add_head(&rdl->right, tmp);
-			rdline_puts(rdl, vt100_left_arr);
-			break;
-
-		/* move caret 1 char to the right */
-		case CMDLINE_KEY_CTRL_F:
-		case CMDLINE_KEY_RIGHT_ARR:
-			if (CIRBUF_IS_EMPTY(&rdl->right))
-				break;
-			tmp = cirbuf_get_head(&rdl->right);
-			cirbuf_del_head(&rdl->right);
-			cirbuf_add_tail(&rdl->left, tmp);
-			rdline_puts(rdl, vt100_right_arr);
-			break;
-
-		/* move caret 1 word to the left */
-		/* keyboard equivalent: Alt+B */
-		case CMDLINE_KEY_WLEFT:
-			while (! CIRBUF_IS_EMPTY(&rdl->left) &&
-			       (tmp = cirbuf_get_tail(&rdl->left)) &&
-			       isblank2(tmp)) {
-				rdline_puts(rdl, vt100_left_arr);
-				cirbuf_del_tail(&rdl->left);
-				cirbuf_add_head(&rdl->right, tmp);
-			}
-			while (! CIRBUF_IS_EMPTY(&rdl->left) &&
-			       (tmp = cirbuf_get_tail(&rdl->left)) &&
-			       !isblank2(tmp)) {
-				rdline_puts(rdl, vt100_left_arr);
-				cirbuf_del_tail(&rdl->left);
-				cirbuf_add_head(&rdl->right, tmp);
-			}
-			break;
-
-		/* move caret 1 word to the right */
-		/* keyboard equivalent: Alt+F */
-		case CMDLINE_KEY_WRIGHT:
-			while (! CIRBUF_IS_EMPTY(&rdl->right) &&
-			       (tmp = cirbuf_get_head(&rdl->right)) &&
-			       isblank2(tmp)) {
-				rdline_puts(rdl, vt100_right_arr);
-				cirbuf_del_head(&rdl->right);
-				cirbuf_add_tail(&rdl->left, tmp);
-			}
-			while (! CIRBUF_IS_EMPTY(&rdl->right) &&
-			       (tmp = cirbuf_get_head(&rdl->right)) &&
-			       !isblank2(tmp)) {
-				rdline_puts(rdl, vt100_right_arr);
-				cirbuf_del_head(&rdl->right);
-				cirbuf_add_tail(&rdl->left, tmp);
-			}
-			break;
-
-		/* move caret to the left */
-		case CMDLINE_KEY_CTRL_A:
-			if (CIRBUF_IS_EMPTY(&rdl->left))
-				break;
-			rdline_miniprintf(rdl, vt100_multi_left,
-						CIRBUF_GET_LEN(&rdl->left));
-			while (! CIRBUF_IS_EMPTY(&rdl->left)) {
-				tmp = cirbuf_get_tail(&rdl->left);
-				cirbuf_del_tail(&rdl->left);
-				cirbuf_add_head(&rdl->right, tmp);
-			}
-			break;
-
-		/* move caret to the right */
-		case CMDLINE_KEY_CTRL_E:
-			if (CIRBUF_IS_EMPTY(&rdl->right))
-				break;
-			rdline_miniprintf(rdl, vt100_multi_right,
-						CIRBUF_GET_LEN(&rdl->right));
-			while (! CIRBUF_IS_EMPTY(&rdl->right)) {
-				tmp = cirbuf_get_head(&rdl->right);
-				cirbuf_del_head(&rdl->right);
-				cirbuf_add_tail(&rdl->left, tmp);
-			}
-			break;
-
-		/* delete 1 char from the left */
-		case CMDLINE_KEY_BKSPACE:
-		case CMDLINE_KEY_BKSPACE2:
-			if(!cirbuf_del_tail_safe(&rdl->left)) {
-				rdline_puts(rdl, vt100_bs);
-				display_right_buffer(rdl, 1);
-			}
-			break;
-
-		/* delete 1 char from the right */
-		case CMDLINE_KEY_SUPPR:
-		case CMDLINE_KEY_CTRL_D:
-			if (cmd == CMDLINE_KEY_CTRL_D &&
-			    CIRBUF_IS_EMPTY(&rdl->left) &&
-			    CIRBUF_IS_EMPTY(&rdl->right)) {
-				return RDLINE_RES_EOF;
-			}
-			if (!cirbuf_del_head_safe(&rdl->right)) {
-				display_right_buffer(rdl, 1);
-			}
-			break;
-
-		/* delete 1 word from the left */
-		case CMDLINE_KEY_META_BKSPACE:
-		case CMDLINE_KEY_CTRL_W:
-			while (! CIRBUF_IS_EMPTY(&rdl->left) && isblank2(cirbuf_get_tail(&rdl->left))) {
-				rdline_puts(rdl, vt100_bs);
-				cirbuf_del_tail(&rdl->left);
-			}
-			while (! CIRBUF_IS_EMPTY(&rdl->left) && !isblank2(cirbuf_get_tail(&rdl->left))) {
-				rdline_puts(rdl, vt100_bs);
-				cirbuf_del_tail(&rdl->left);
-			}
-			display_right_buffer(rdl, 1);
-			break;
-
-		/* delete 1 word from the right */
-		case CMDLINE_KEY_META_D:
-			while (! CIRBUF_IS_EMPTY(&rdl->right) && isblank2(cirbuf_get_head(&rdl->right)))
-				cirbuf_del_head(&rdl->right);
-			while (! CIRBUF_IS_EMPTY(&rdl->right) && !isblank2(cirbuf_get_head(&rdl->right)))
-				cirbuf_del_head(&rdl->right);
-			display_right_buffer(rdl, 1);
-			break;
-
-		/* set kill buffer to contents on the right side of caret */
-		case CMDLINE_KEY_CTRL_K:
-			cirbuf_get_buf_head(&rdl->right, rdl->kill_buf, RDLINE_BUF_SIZE);
-			rdl->kill_size = CIRBUF_GET_LEN(&rdl->right);
-			cirbuf_del_buf_head(&rdl->right, rdl->kill_size);
-			rdline_puts(rdl, vt100_clear_right);
-			break;
-
-		/* paste contents of kill buffer to the left side of caret */
-		case CMDLINE_KEY_CTRL_Y:
-			i=0;
-			while(CIRBUF_GET_LEN(&rdl->right) + CIRBUF_GET_LEN(&rdl->left) <
-			      RDLINE_BUF_SIZE &&
-			      i < rdl->kill_size) {
-				cirbuf_add_tail(&rdl->left, rdl->kill_buf[i]);
-				rdl->write_char(rdl, rdl->kill_buf[i]);
-				i++;
-			}
-			display_right_buffer(rdl, 0);
-			break;
-
-		/* clear and newline */
-		case CMDLINE_KEY_CTRL_C:
-			rdline_puts(rdl, "\r\n");
-			rdline_newline(rdl, rdl->prompt);
-			break;
-
-		/* redisplay (helps when prompt is lost in other output) */
-		case CMDLINE_KEY_CTRL_L:
-			rdline_redisplay(rdl);
-			break;
-
-		/* autocomplete */
-		case CMDLINE_KEY_TAB:
-		case CMDLINE_KEY_HELP:
-			cirbuf_align_left(&rdl->left);
-			rdl->left_buf[CIRBUF_GET_LEN(&rdl->left)] = '\0';
-			if (rdl->complete) {
-				char tmp_buf[BUFSIZ];
-				int complete_state;
-				int ret;
-				unsigned int tmp_size;
-
-				if (cmd == CMDLINE_KEY_TAB)
-					complete_state = 0;
-				else
-					complete_state = -1;
-
-				/* see in parse.h for help on complete() */
-				ret = rdl->complete(rdl, rdl->left_buf,
-						    tmp_buf, sizeof(tmp_buf),
-						    &complete_state);
-				/* no completion or error */
-				if (ret <= 0) {
-					return RDLINE_RES_COMPLETE;
-				}
-
-				tmp_size = strnlen(tmp_buf, sizeof(tmp_buf));
-				/* add chars */
-				if (ret == RDLINE_RES_COMPLETE) {
-					i=0;
-					while(CIRBUF_GET_LEN(&rdl->right) + CIRBUF_GET_LEN(&rdl->left) <
-					      RDLINE_BUF_SIZE &&
-					      i < tmp_size) {
-						cirbuf_add_tail(&rdl->left, tmp_buf[i]);
-						rdl->write_char(rdl, tmp_buf[i]);
-						i++;
-					}
-					display_right_buffer(rdl, 1);
-					return RDLINE_RES_COMPLETE; /* ?? */
-				}
-
-				/* choice */
-				rdline_puts(rdl, "\r\n");
-				while (ret) {
-					rdl->write_char(rdl, ' ');
-					for (i=0 ; tmp_buf[i] ; i++)
-						rdl->write_char(rdl, tmp_buf[i]);
-					rdline_puts(rdl, "\r\n");
-					ret = rdl->complete(rdl, rdl->left_buf,
-							    tmp_buf, sizeof(tmp_buf),
-							    &complete_state);
-				}
-
-				rdline_redisplay(rdl);
-			}
-			return RDLINE_RES_COMPLETE;
-
-		/* complete buffer */
-		case CMDLINE_KEY_RETURN:
-		case CMDLINE_KEY_RETURN2:
-			rdline_get_buffer(rdl);
-			rdl->status = RDLINE_INIT;
-			rdline_puts(rdl, "\r\n");
-			if (rdl->history_cur_line != -1)
-				rdline_remove_first_history_item(rdl);
-
-			if (rdl->validate)
-				rdl->validate(rdl, rdl->left_buf, CIRBUF_GET_LEN(&rdl->left)+2);
-			/* user may have stopped rdline */
-			if (rdl->status == RDLINE_EXITED)
-				return RDLINE_RES_EXITED;
-			return RDLINE_RES_VALIDATED;
-
-		/* previous element in history */
-		case CMDLINE_KEY_UP_ARR:
-		case CMDLINE_KEY_CTRL_P:
-			if (rdl->history_cur_line == 0) {
-				rdline_remove_first_history_item(rdl);
-			}
-			if (rdl->history_cur_line <= 0) {
-				rdline_add_history(rdl, rdline_get_buffer(rdl));
-				rdl->history_cur_line = 0;
-			}
-
-			buf = rdline_get_history_item(rdl, rdl->history_cur_line + 1);
-			if (!buf)
-				break;
-
-			rdl->history_cur_line ++;
-			vt100_init(&rdl->vt100);
-			cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE);
-			cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE);
-			cirbuf_add_buf_tail(&rdl->left, buf, strnlen(buf, RDLINE_BUF_SIZE));
-			rdline_redisplay(rdl);
-			break;
-
-		/* next element in history */
-		case CMDLINE_KEY_DOWN_ARR:
-		case CMDLINE_KEY_CTRL_N:
-			if (rdl->history_cur_line - 1 < 0)
-				break;
-
-			rdl->history_cur_line --;
-			buf = rdline_get_history_item(rdl, rdl->history_cur_line);
-			if (!buf)
-				break;
-			vt100_init(&rdl->vt100);
-			cirbuf_init(&rdl->left, rdl->left_buf, 0, RDLINE_BUF_SIZE);
-			cirbuf_init(&rdl->right, rdl->right_buf, 0, RDLINE_BUF_SIZE);
-			cirbuf_add_buf_tail(&rdl->left, buf, strnlen(buf, RDLINE_BUF_SIZE));
-			rdline_redisplay(rdl);
-
-			break;
-
-
-		default:
-			break;
-		}
-
-		return RDLINE_RES_SUCCESS;
-	}
-
-	if (!isprint((int)c))
-		return RDLINE_RES_SUCCESS;
-
-	/* standard chars */
-	if (CIRBUF_GET_LEN(&rdl->left) + CIRBUF_GET_LEN(&rdl->right) >= RDLINE_BUF_SIZE)
-		return RDLINE_RES_SUCCESS;
-
-	if (cirbuf_add_tail_safe(&rdl->left, c))
-		return RDLINE_RES_SUCCESS;
-
-	rdl->write_char(rdl, c);
-	display_right_buffer(rdl, 0);
-
-	return RDLINE_RES_SUCCESS;
-}
-
-
-/* HISTORY */
-
-static void
-rdline_remove_old_history_item(struct rdline * rdl)
-{
-	char tmp;
-
-	while (! CIRBUF_IS_EMPTY(&rdl->history) ) {
-		tmp = cirbuf_get_head(&rdl->history);
-		cirbuf_del_head(&rdl->history);
-		if (!tmp)
-			break;
-	}
-}
-
-static void
-rdline_remove_first_history_item(struct rdline * rdl)
-{
-	char tmp;
-
-	if ( CIRBUF_IS_EMPTY(&rdl->history) ) {
-		return;
-	}
-	else {
-		cirbuf_del_tail(&rdl->history);
-	}
-
-	while (! CIRBUF_IS_EMPTY(&rdl->history) ) {
-		tmp = cirbuf_get_tail(&rdl->history);
-		if (!tmp)
-			break;
-		cirbuf_del_tail(&rdl->history);
-	}
-}
-
-static unsigned int
-rdline_get_history_size(struct rdline * rdl)
-{
-	unsigned int i, tmp, ret=0;
-
-	CIRBUF_FOREACH(&rdl->history, i, tmp) {
-		if (tmp == 0)
-			ret ++;
-	}
-
-	return ret;
-}
-
-char *
-rdline_get_history_item(struct rdline * rdl, unsigned int idx)
-{
-	unsigned int len, i, tmp;
-
-	if (!rdl)
-		return NULL;
-
-	len = rdline_get_history_size(rdl);
-	if ( idx >= len ) {
-		return NULL;
-	}
-
-	cirbuf_align_left(&rdl->history);
-
-	CIRBUF_FOREACH(&rdl->history, i, tmp) {
-		if ( idx == len - 1) {
-			return rdl->history_buf + i;
-		}
-		if (tmp == 0)
-			len --;
-	}
-
-	return NULL;
-}
-
-int
-rdline_add_history(struct rdline * rdl, const char * buf)
-{
-	unsigned int len, i;
-
-	if (!rdl || !buf)
-		return -EINVAL;
-
-	len = strnlen(buf, RDLINE_BUF_SIZE);
-	for (i=0; i<len ; i++) {
-		if (buf[i] == '\n') {
-			len = i;
-			break;
-		}
-	}
-
-	if ( len >= RDLINE_HISTORY_BUF_SIZE )
-		return -1;
-
-	while ( len >= CIRBUF_GET_FREELEN(&rdl->history) ) {
-		rdline_remove_old_history_item(rdl);
-	}
-
-	cirbuf_add_buf_tail(&rdl->history, buf, len);
-	cirbuf_add_tail(&rdl->history, 0);
-
-	return 0;
-}
-
-void
-rdline_clear_history(struct rdline * rdl)
-{
-	if (!rdl)
-		return;
-	cirbuf_init(&rdl->history, rdl->history_buf, 0, RDLINE_HISTORY_BUF_SIZE);
-}
-
-
-/* STATIC USEFUL FUNCS */
-
-static void
-rdline_puts(struct rdline * rdl, const char * buf)
-{
-	char c;
-	while ( (c = *(buf++)) != '\0' ) {
-		rdl->write_char(rdl, c);
-	}
-}
-
-/* a very very basic printf with one arg and one format 'u' */
-static void
-rdline_miniprintf(struct rdline *rdl, const char * buf, unsigned int val)
-{
-	char c, started=0, div=100;
-
-	while ( (c=*(buf++)) ) {
-		if (c != '%') {
-			rdl->write_char(rdl, c);
-			continue;
-		}
-		c = *(buf++);
-		if (c != 'u') {
-			rdl->write_char(rdl, '%');
-			rdl->write_char(rdl, c);
-			continue;
-		}
-		/* val is never more than 255 */
-		while (div) {
-			c = (char)(val / div);
-			if (c || started) {
-				rdl->write_char(rdl, (char)(c+'0'));
-				started = 1;
-			}
-			val %= div;
-			div /= 10;
-		}
-	}
-}
diff --git a/lib/librte_cmdline/cmdline_rdline.h b/lib/librte_cmdline/cmdline_rdline.h
deleted file mode 100644
index d2170293d..000000000
--- a/lib/librte_cmdline/cmdline_rdline.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#ifndef _RDLINE_H_
-#define _RDLINE_H_
-
-/**
- * This file is a small equivalent to the GNU readline library, but it
- * was originally designed for small systems, like Atmel AVR
- * microcontrollers (8 bits). Indeed, we don't use any malloc that is
- * sometimes not implemented (or just not recommended) on such
- * systems.
- *
- * Obviously, it does not support as many things as the GNU readline,
- * but at least it supports some interesting features like a kill
- * buffer and a command history.
- *
- * It also have a feature that does not have the GNU readline (as far
- * as I know): we can have several instances of it running at the same
- * time, even on a monothread program, since it works with callbacks.
- *
- * The lib is designed for a client-side or a server-side use:
- * - server-side: the server receives all data from a socket, including
- *   control chars, like arrows, tabulations, ... The client is
- *   very simple, it can be a telnet or a minicom through a serial line.
- * - client-side: the client receives its data through its stdin for
- *   instance.
- */
-
-#include <stdio.h>
-#include <cmdline_cirbuf.h>
-#include <cmdline_vt100.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* configuration */
-#define RDLINE_BUF_SIZE 512
-#define RDLINE_PROMPT_SIZE  32
-#define RDLINE_VT100_BUF_SIZE  8
-#define RDLINE_HISTORY_BUF_SIZE BUFSIZ
-#define RDLINE_HISTORY_MAX_LINE 64
-
-enum rdline_status {
-	RDLINE_INIT,
-	RDLINE_RUNNING,
-	RDLINE_EXITED
-};
-
-struct rdline;
-
-typedef int (rdline_write_char_t)(struct rdline *rdl, char);
-typedef void (rdline_validate_t)(struct rdline *rdl,
-				 const char *buf, unsigned int size);
-typedef int (rdline_complete_t)(struct rdline *rdl, const char *buf,
-				char *dstbuf, unsigned int dstsize,
-				int *state);
-
-struct rdline {
-	enum rdline_status status;
-	/* rdline bufs */
-	struct cirbuf left;
-	struct cirbuf right;
-	char left_buf[RDLINE_BUF_SIZE+2]; /* reserve 2 chars for the \n\0 */
-	char right_buf[RDLINE_BUF_SIZE];
-
-	char prompt[RDLINE_PROMPT_SIZE];
-	unsigned int prompt_size;
-
-	char kill_buf[RDLINE_BUF_SIZE];
-	unsigned int kill_size;
-
-	/* history */
-	struct cirbuf history;
-	char history_buf[RDLINE_HISTORY_BUF_SIZE];
-	int history_cur_line;
-
-	/* callbacks and func pointers */
-	rdline_write_char_t *write_char;
-	rdline_validate_t *validate;
-	rdline_complete_t *complete;
-
-	/* vt100 parser */
-	struct cmdline_vt100 vt100;
-
-	/* opaque pointer */
-	void *opaque;
-};
-
-/**
- * Init fields for a struct rdline. Call this only once at the beginning
- * of your program.
- * \param rdl A pointer to an uninitialized struct rdline
- * \param write_char The function used by the function to write a character
- * \param validate A pointer to the function to execute when the
- *                 user validates the buffer.
- * \param complete A pointer to the function to execute when the
- *                 user completes the buffer.
- */
-int rdline_init(struct rdline *rdl,
-		 rdline_write_char_t *write_char,
-		 rdline_validate_t *validate,
-		 rdline_complete_t *complete);
-
-
-/**
- * Init the current buffer, and display a prompt.
- * \param rdl A pointer to a struct rdline
- * \param prompt A string containing the prompt
- */
-void rdline_newline(struct rdline *rdl, const char *prompt);
-
-/**
- * Call it and all received chars will be ignored.
- * \param rdl A pointer to a struct rdline
- */
-void rdline_stop(struct rdline *rdl);
-
-/**
- * Same than rdline_stop() except that next calls to rdline_char_in()
- * will return RDLINE_RES_EXITED.
- * \param rdl A pointer to a struct rdline
- */
-void rdline_quit(struct rdline *rdl);
-
-/**
- * Restart after a call to rdline_stop() or rdline_quit()
- * \param rdl A pointer to a struct rdline
- */
-void rdline_restart(struct rdline *rdl);
-
-/**
- * Redisplay the current buffer
- * \param rdl A pointer to a struct rdline
- */
-void rdline_redisplay(struct rdline *rdl);
-
-/**
- * Reset the current buffer and setup for a new line.
- *  \param rdl A pointer to a struct rdline
- */
-void rdline_reset(struct rdline *rdl);
-
-
-/* return status for rdline_char_in() */
-#define RDLINE_RES_SUCCESS       0
-#define RDLINE_RES_VALIDATED     1
-#define RDLINE_RES_COMPLETE      2
-#define RDLINE_RES_NOT_RUNNING  -1
-#define RDLINE_RES_EOF          -2
-#define RDLINE_RES_EXITED       -3
-
-/**
- * append a char to the readline buffer.
- * Return RDLINE_RES_VALIDATE when the line has been validated.
- * Return RDLINE_RES_COMPLETE when the user asked to complete the buffer.
- * Return RDLINE_RES_NOT_RUNNING if it is not running.
- * Return RDLINE_RES_EOF if EOF (ctrl-d on an empty line).
- * Else return RDLINE_RES_SUCCESS.
- * XXX error case when the buffer is full ?
- *
- * \param rdl A pointer to a struct rdline
- * \param c The character to append
- */
-int rdline_char_in(struct rdline *rdl, char c);
-
-/**
- * Return the current buffer, terminated by '\0'.
- * \param rdl A pointer to a struct rdline
- */
-const char *rdline_get_buffer(struct rdline *rdl);
-
-
-/**
- * Add the buffer to history.
- * return < 0 on error.
- * \param rdl A pointer to a struct rdline
- * \param buf A buffer that is terminated by '\0'
- */
-int rdline_add_history(struct rdline *rdl, const char *buf);
-
-/**
- * Clear current history
- * \param rdl A pointer to a struct rdline
- */
-void rdline_clear_history(struct rdline *rdl);
-
-/**
- * Get the i-th history item
- */
-char *rdline_get_history_item(struct rdline *rdl, unsigned int i);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _RDLINE_H_ */
diff --git a/lib/librte_cmdline/cmdline_socket.c b/lib/librte_cmdline/cmdline_socket.c
index ecb3d82b6..f639c61cb 100644
--- a/lib/librte_cmdline/cmdline_socket.c
+++ b/lib/librte_cmdline/cmdline_socket.c
@@ -4,23 +4,18 @@
  * All rights reserved.
  */
 
-#include <stdio.h>
-#include <string.h>
+#include <stddef.h>
 #include <unistd.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <inttypes.h>
 #include <fcntl.h>
-#include <termios.h>
 
 #include "cmdline_parse.h"
-#include "cmdline_rdline.h"
 #include "cmdline_socket.h"
 #include "cmdline.h"
 
 struct cmdline *
 cmdline_file_new(cmdline_parse_ctx_t *ctx, const char *prompt, const char *path)
 {
+	struct cmdline *cl;
 	int fd;
 
 	/* everything else is checked in cmdline_new() */
@@ -29,37 +24,22 @@ cmdline_file_new(cmdline_parse_ctx_t *ctx, const char *prompt, const char *path)
 
 	fd = open(path, O_RDONLY, 0);
 	if (fd < 0) {
-		dprintf("open() failed\n");
 		return NULL;
 	}
-	return cmdline_new(ctx, prompt, fd, -1);
+	cl = cmdline_new(ctx, prompt, fd, -1);
+	/* cmdline_new() duplicates fd */
+	close(fd);
+	return cl;
 }
 
 struct cmdline *
 cmdline_stdin_new(cmdline_parse_ctx_t *ctx, const char *prompt)
 {
-	struct cmdline *cl;
-	struct termios oldterm, term;
-
-	tcgetattr(0, &oldterm);
-	memcpy(&term, &oldterm, sizeof(term));
-	term.c_lflag &= ~(ICANON | ECHO | ISIG);
-	tcsetattr(0, TCSANOW, &term);
-	setbuf(stdin, NULL);
-
-	cl = cmdline_new(ctx, prompt, 0, 1);
-
-	if (cl)
-		memcpy(&cl->oldterm, &oldterm, sizeof(term));
-
-	return cl;
+	return cmdline_new(ctx, prompt, 0, 1);
 }
 
 void
 cmdline_stdin_exit(struct cmdline *cl)
 {
-	if (!cl)
-		return;
-
-	tcsetattr(fileno(stdin), TCSANOW, &cl->oldterm);
+	cmdline_free(cl);
 }
diff --git a/lib/librte_cmdline/cmdline_vt100.c b/lib/librte_cmdline/cmdline_vt100.c
deleted file mode 100644
index 662fc7345..000000000
--- a/lib/librte_cmdline/cmdline_vt100.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdarg.h>
-#include <ctype.h>
-#include <termios.h>
-
-#include "cmdline_vt100.h"
-
-const char *cmdline_vt100_commands[] = {
-	vt100_up_arr,
-	vt100_down_arr,
-	vt100_right_arr,
-	vt100_left_arr,
-	"\177",
-	"\n",
-	"\001",
-	"\005",
-	"\013",
-	"\031",
-	"\003",
-	"\006",
-	"\002",
-	vt100_suppr,
-	vt100_tab,
-	"\004",
-	"\014",
-	"\r",
-	"\033\177",
-	vt100_word_left,
-	vt100_word_right,
-	"?",
-	"\027",
-	"\020",
-	"\016",
-	"\033\144",
-	vt100_bs,
-};
-
-void
-vt100_init(struct cmdline_vt100 *vt)
-{
-	if (!vt)
-		return;
-	vt->state = CMDLINE_VT100_INIT;
-}
-
-
-static int
-match_command(char *buf, unsigned int size)
-{
-	const char *cmd;
-	size_t cmdlen;
-	unsigned int i = 0;
-
-	for (i=0 ; i<sizeof(cmdline_vt100_commands)/sizeof(const char *) ; i++) {
-		cmd = *(cmdline_vt100_commands + i);
-
-		cmdlen = strnlen(cmd, CMDLINE_VT100_BUF_SIZE);
-		if (size == cmdlen &&
-		    !strncmp(buf, cmd, cmdlen)) {
-			return i;
-		}
-	}
-
-	return -1;
-}
-
-int
-vt100_parser(struct cmdline_vt100 *vt, char ch)
-{
-	unsigned int size;
-	uint8_t c = (uint8_t) ch;
-
-	if (!vt)
-		return -1;
-
-	if (vt->bufpos >= CMDLINE_VT100_BUF_SIZE) {
-		vt->state = CMDLINE_VT100_INIT;
-		vt->bufpos = 0;
-	}
-
-	vt->buf[vt->bufpos++] = c;
-	size = vt->bufpos;
-
-	switch (vt->state) {
-	case CMDLINE_VT100_INIT:
-		if (c == 033) {
-			vt->state = CMDLINE_VT100_ESCAPE;
-		}
-		else {
-			vt->bufpos = 0;
-			goto match_command;
-		}
-		break;
-
-	case CMDLINE_VT100_ESCAPE:
-		if (c == 0133) {
-			vt->state = CMDLINE_VT100_ESCAPE_CSI;
-		}
-		else if (c >= 060 && c <= 0177) { /* XXX 0177 ? */
-			vt->bufpos = 0;
-			vt->state = CMDLINE_VT100_INIT;
-			goto match_command;
-		}
-		break;
-
-	case CMDLINE_VT100_ESCAPE_CSI:
-		if (c >= 0100 && c <= 0176) {
-			vt->bufpos = 0;
-			vt->state = CMDLINE_VT100_INIT;
-			goto match_command;
-		}
-		break;
-
-	default:
-		vt->bufpos = 0;
-		break;
-	}
-
-	return -2;
-
- match_command:
-	return match_command(vt->buf, size);
-}
diff --git a/lib/librte_cmdline/cmdline_vt100.h b/lib/librte_cmdline/cmdline_vt100.h
deleted file mode 100644
index e33e67ed8..000000000
--- a/lib/librte_cmdline/cmdline_vt100.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright (c) 2009, Olivier MATZ <zer0@droids-corp.org>
- * All rights reserved.
- */
-
-#ifndef _CMDLINE_VT100_H_
-#define _CMDLINE_VT100_H_
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define vt100_bell         "\007"
-#define vt100_bs           "\010"
-#define vt100_bs_clear     "\010 \010"
-#define vt100_tab          "\011"
-#define vt100_crnl         "\012\015"
-#define vt100_clear_right  "\033[0K"
-#define vt100_clear_left   "\033[1K"
-#define vt100_clear_down   "\033[0J"
-#define vt100_clear_up     "\033[1J"
-#define vt100_clear_line   "\033[2K"
-#define vt100_clear_screen "\033[2J"
-#define vt100_up_arr       "\033\133\101"
-#define vt100_down_arr     "\033\133\102"
-#define vt100_right_arr    "\033\133\103"
-#define vt100_left_arr     "\033\133\104"
-#define vt100_multi_right  "\033\133%uC"
-#define vt100_multi_left   "\033\133%uD"
-#define vt100_suppr        "\033\133\063\176"
-#define vt100_home         "\033M\033E"
-#define vt100_word_left    "\033\142"
-#define vt100_word_right   "\033\146"
-
-/* Result of parsing : it must be synchronized with
- * cmdline_vt100_commands[] in vt100.c */
-#define CMDLINE_KEY_UP_ARR 0
-#define CMDLINE_KEY_DOWN_ARR 1
-#define CMDLINE_KEY_RIGHT_ARR 2
-#define CMDLINE_KEY_LEFT_ARR 3
-#define CMDLINE_KEY_BKSPACE 4
-#define CMDLINE_KEY_RETURN 5
-#define CMDLINE_KEY_CTRL_A 6
-#define CMDLINE_KEY_CTRL_E 7
-#define CMDLINE_KEY_CTRL_K 8
-#define CMDLINE_KEY_CTRL_Y 9
-#define CMDLINE_KEY_CTRL_C 10
-#define CMDLINE_KEY_CTRL_F 11
-#define CMDLINE_KEY_CTRL_B 12
-#define CMDLINE_KEY_SUPPR 13
-#define CMDLINE_KEY_TAB 14
-#define CMDLINE_KEY_CTRL_D 15
-#define CMDLINE_KEY_CTRL_L 16
-#define CMDLINE_KEY_RETURN2 17
-#define CMDLINE_KEY_META_BKSPACE 18
-#define CMDLINE_KEY_WLEFT 19
-#define CMDLINE_KEY_WRIGHT 20
-#define CMDLINE_KEY_HELP 21
-#define CMDLINE_KEY_CTRL_W 22
-#define CMDLINE_KEY_CTRL_P 23
-#define CMDLINE_KEY_CTRL_N 24
-#define CMDLINE_KEY_META_D 25
-#define CMDLINE_KEY_BKSPACE2 26
-
-extern const char *cmdline_vt100_commands[];
-
-enum cmdline_vt100_parser_state {
-	CMDLINE_VT100_INIT,
-	CMDLINE_VT100_ESCAPE,
-	CMDLINE_VT100_ESCAPE_CSI
-};
-
-#define CMDLINE_VT100_BUF_SIZE 8
-struct cmdline_vt100 {
-	uint8_t bufpos;
-	char buf[CMDLINE_VT100_BUF_SIZE];
-	enum cmdline_vt100_parser_state state;
-};
-
-/**
- * Init
- */
-void vt100_init(struct cmdline_vt100 *vt);
-
-/**
- * Input a new character.
- * Return -1 if the character is not part of a control sequence
- * Return -2 if c is not the last char of a control sequence
- * Else return the index in vt100_commands[]
- */
-int vt100_parser(struct cmdline_vt100 *vt, char c);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/lib/librte_cmdline/meson.build b/lib/librte_cmdline/meson.build
index 5741817ac..1735737c7 100644
--- a/lib/librte_cmdline/meson.build
+++ b/lib/librte_cmdline/meson.build
@@ -1,18 +1,15 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
-version = 2
+version = 3
 sources = files('cmdline.c',
-	'cmdline_cirbuf.c',
 	'cmdline_parse.c',
 	'cmdline_parse_etheraddr.c',
 	'cmdline_parse_ipaddr.c',
 	'cmdline_parse_num.c',
 	'cmdline_parse_portlist.c',
 	'cmdline_parse_string.c',
-	'cmdline_rdline.c',
-	'cmdline_socket.c',
-	'cmdline_vt100.c')
+	'cmdline_socket.c')
 
 headers = files('cmdline.h',
 	'cmdline_parse.h',
@@ -20,8 +17,13 @@ headers = files('cmdline.h',
 	'cmdline_parse_ipaddr.h',
 	'cmdline_parse_etheraddr.h',
 	'cmdline_parse_string.h',
-	'cmdline_rdline.h',
-	'cmdline_vt100.h',
 	'cmdline_socket.h',
-	'cmdline_cirbuf.h',
 	'cmdline_parse_portlist.h')
+
+cmdline_dep = dependency('libedit', required: false)
+if cmdline_dep.found()
+	ext_deps += cmdline_dep
+	dpdk_extra_ldflags += '-ledit'
+else
+	build = false
+endif
diff --git a/lib/librte_cmdline/rte_cmdline_version.map b/lib/librte_cmdline/rte_cmdline_version.map
index 04bcb387f..31331995b 100644
--- a/lib/librte_cmdline/rte_cmdline_version.map
+++ b/lib/librte_cmdline/rte_cmdline_version.map
@@ -1,25 +1,6 @@
 DPDK_2.0 {
 	global:
 
-	cirbuf_add_buf_head;
-	cirbuf_add_buf_tail;
-	cirbuf_add_head;
-	cirbuf_add_head_safe;
-	cirbuf_add_tail;
-	cirbuf_add_tail_safe;
-	cirbuf_align_left;
-	cirbuf_align_right;
-	cirbuf_del_buf_head;
-	cirbuf_del_buf_tail;
-	cirbuf_del_head;
-	cirbuf_del_head_safe;
-	cirbuf_del_tail;
-	cirbuf_del_tail_safe;
-	cirbuf_get_buf_head;
-	cirbuf_get_buf_tail;
-	cirbuf_get_head;
-	cirbuf_get_tail;
-	cirbuf_init;
 	cmdline_complete;
 	cmdline_complete_get_elt_string;
 	cmdline_complete_get_nb_string;
@@ -50,21 +31,6 @@ DPDK_2.0 {
 	cmdline_token_num_ops;
 	cmdline_token_portlist_ops;
 	cmdline_token_string_ops;
-	cmdline_write_char;
-	rdline_add_history;
-	rdline_char_in;
-	rdline_clear_history;
-	rdline_get_buffer;
-	rdline_get_history_item;
-	rdline_init;
-	rdline_newline;
-	rdline_quit;
-	rdline_redisplay;
-	rdline_reset;
-	rdline_restart;
-	rdline_stop;
-	vt100_init;
-	vt100_parser;
 
 	local: *;
 };
@@ -75,3 +41,10 @@ DPDK_2.1 {
 	cmdline_poll;
 
 } DPDK_2.0;
+
+DPDK_18.02 {
+	global:
+
+	cmdline_ctx_get;
+
+} DPDK_2.1;
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 8bab901fc..f66411eba 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -104,6 +104,8 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_RING)           += -lrte_ring
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PCI)            += -lrte_pci
 _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lrte_eal
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CMDLINE)        += -lrte_cmdline
+
+_LDLIBS-$(CONFIG_RTE_LIBRTE_CMDLINE)        += $(shell pkg-config --libs libedit)
 _LDLIBS-$(CONFIG_RTE_LIBRTE_REORDER)        += -lrte_reorder
 _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lrte_sched
 
diff --git a/test/cmdline_test/cmdline_test.c b/test/cmdline_test/cmdline_test.c
index 3e406331a..e46b66d0c 100644
--- a/test/cmdline_test/cmdline_test.c
+++ b/test/cmdline_test/cmdline_test.c
@@ -12,7 +12,6 @@
 #include <ctype.h>
 #include <sys/queue.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_socket.h>
 #include <cmdline.h>
diff --git a/test/cmdline_test/commands.c b/test/cmdline_test/commands.c
index d81da9665..2bf0262f6 100644
--- a/test/cmdline_test/commands.c
+++ b/test/cmdline_test/commands.c
@@ -7,7 +7,6 @@
 #include <termios.h>
 #include <inttypes.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_string.h>
 #include <cmdline_parse_num.h>
@@ -277,72 +276,6 @@ cmdline_parse_inst_t cmd_ambig_2 = {
 
 
 
-/*** get_history_bufsize ***/
-/* command that displays total space in history buffer
- * this will be useful for testing history (to fill it up just enough to
- * remove the last entry, we need to know how big it is).
- */
-
-struct cmd_get_history_bufsize_result {
-	cmdline_fixed_string_t str;
-};
-
-static void
-cmd_get_history_bufsize_parsed(__attribute__((unused)) void *parsed_result,
-		struct cmdline *cl,
-		__attribute__((unused)) void *data)
-{
-	cmdline_printf(cl, "History buffer size: %zu\n",
-			sizeof(cl->rdl.history_buf));
-}
-
-cmdline_parse_token_string_t cmd_get_history_bufsize_tok =
-	TOKEN_STRING_INITIALIZER(struct cmd_get_history_bufsize_result, str,
-				 "get_history_bufsize");
-
-cmdline_parse_inst_t cmd_get_history_bufsize = {
-	.f = cmd_get_history_bufsize_parsed,  /* function to call */
-	.data = NULL,      /* 2nd arg of func */
-	.help_str = "command that displays total space in history buffer",
-	.tokens = {        /* token list, NULL terminated */
-		(void *)&cmd_get_history_bufsize_tok,
-		NULL,
-	},
-};
-
-
-
-/*** clear_history ***/
-/* clears history buffer */
-
-struct cmd_clear_history_result {
-	cmdline_fixed_string_t str;
-};
-
-static void
-cmd_clear_history_parsed(__attribute__((unused)) void *parsed_result,
-		struct cmdline *cl,
-		__attribute__((unused)) void *data)
-{
-	rdline_clear_history(&cl->rdl);
-}
-
-cmdline_parse_token_string_t cmd_clear_history_tok =
-	TOKEN_STRING_INITIALIZER(struct cmd_clear_history_result, str,
-				 "clear_history");
-
-cmdline_parse_inst_t cmd_clear_history = {
-	.f = cmd_clear_history_parsed,  /* function to call */
-	.data = NULL,      /* 2nd arg of func */
-	.help_str = "clear command history",
-	.tokens = {        /* token list, NULL terminated */
-		(void *)&cmd_clear_history_tok,
-		NULL,
-	},
-};
-
-
-
 /****************/
 
 cmdline_parse_ctx_t main_ctx[] = {
@@ -352,8 +285,6 @@ cmdline_parse_ctx_t main_ctx[] = {
 		(cmdline_parse_inst_t *)&cmd_single,
 		(cmdline_parse_inst_t *)&cmd_single_long,
 		(cmdline_parse_inst_t *)&cmd_num,
-		(cmdline_parse_inst_t *)&cmd_get_history_bufsize,
-		(cmdline_parse_inst_t *)&cmd_clear_history,
 		(cmdline_parse_inst_t *)&cmd_autocomplete_1,
 		(cmdline_parse_inst_t *)&cmd_autocomplete_2,
 	NULL,
diff --git a/test/test/Makefile b/test/test/Makefile
index c9c007c9b..08cc04277 100644
--- a/test/test/Makefile
+++ b/test/test/Makefile
@@ -139,7 +139,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_num.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_etheraddr.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_portlist.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_ipaddr.c
-SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_cirbuf.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_string.c
 SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_lib.c
 
diff --git a/test/test/commands.c b/test/test/commands.c
index 6bfdc0272..7e5357b93 100644
--- a/test/test/commands.c
+++ b/test/test/commands.c
@@ -38,7 +38,6 @@
 #include <rte_mbuf.h>
 #include <rte_devargs.h>
 
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_parse_ipaddr.h>
 #include <cmdline_parse_num.h>
diff --git a/test/test/meson.build b/test/test/meson.build
index eb3d87a4d..d5bfdccf3 100644
--- a/test/test/meson.build
+++ b/test/test/meson.build
@@ -10,7 +10,6 @@ test_sources = files('commands.c',
 	'test_barrier.c',
 	'test_byteorder.c',
 	'test_cmdline.c',
-	'test_cmdline_cirbuf.c',
 	'test_cmdline_etheraddr.c',
 	'test_cmdline_ipaddr.c',
 	'test_cmdline_lib.c',
diff --git a/test/test/test.c b/test/test/test.c
index 44dfe20ef..802e8079b 100644
--- a/test/test/test.c
+++ b/test/test/test.c
@@ -13,7 +13,6 @@
 #include <sys/queue.h>
 
 #ifdef RTE_LIBRTE_CMDLINE
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_socket.h>
 #include <cmdline.h>
diff --git a/test/test/test_cmdline.c b/test/test/test_cmdline.c
index 115bee966..eef219a93 100644
--- a/test/test/test_cmdline.c
+++ b/test/test/test_cmdline.c
@@ -45,15 +45,6 @@ test_cmdline(void)
 		return -1;
 	if (test_parse_string_invalid_param() < 0)
 		return -1;
-	printf("Testing circular buffer...\n");
-	if (test_cirbuf_char() < 0)
-		return -1;
-	if (test_cirbuf_string() < 0)
-		return -1;
-	if (test_cirbuf_align() < 0)
-		return -1;
-	if (test_cirbuf_invalid_param() < 0)
-		return -1;
 	printf("Testing library functions...\n");
 	if (test_cmdline_lib() < 0)
 		return -1;
diff --git a/test/test/test_cmdline.h b/test/test/test_cmdline.h
index 1854caf8f..2fb45b3d4 100644
--- a/test/test/test_cmdline.h
+++ b/test/test/test_cmdline.h
@@ -32,12 +32,6 @@ int test_parse_string_valid(void);
 int test_parse_string_invalid_data(void);
 int test_parse_string_invalid_param(void);
 
-/* cmdline_cirbuf tests */
-int test_cirbuf_invalid_param(void);
-int test_cirbuf_char(void);
-int test_cirbuf_string(void);
-int test_cirbuf_align(void);
-
 /* test the rest of the library */
 int test_cmdline_lib(void);
 
diff --git a/test/test/test_cmdline_cirbuf.c b/test/test/test_cmdline_cirbuf.c
deleted file mode 100644
index 8ac326cb0..000000000
--- a/test/test/test_cmdline_cirbuf.c
+++ /dev/null
@@ -1,1301 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <rte_string_fns.h>
-
-#include <cmdline_cirbuf.h>
-
-#include "test_cmdline.h"
-
-/* different length strings */
-#define CIRBUF_STR_HEAD " HEAD"
-#define CIRBUF_STR_TAIL "TAIL"
-
-/* miscellaneous tests - they make bullseye happy */
-static int
-test_cirbuf_string_misc(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/*
-	 * add strings to head and tail, but read only tail
-	 * this results in read operation that does not transcend
-	 * from buffer end to buffer beginning (in other words,
-	 * strlen <= cb->maxlen - cb->end)
-	 */
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* add string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* read string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to get string from tail!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: tail strings do not match!\n");
-		return -1;
-	}
-	/* clear buffers */
-	memset(tmp, 0, sizeof(tmp));
-	memset(buf, 0, sizeof(buf));
-
-
-
-	/*
-	 * add a string to buffer when start/end is at end of buffer
-	 */
-
-	/*
-	 * reinitialize circular buffer with start at the end of cirbuf
-	 */
-	if (cirbuf_init(&cb, buf, CMDLINE_TEST_BUFSIZE - 2, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-
-	/* add string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to add string to tail!\n");
-		return -1;
-	}
-	/* read string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to get string from tail!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: tail strings do not match!\n");
-		return -1;
-	}
-	/* clear tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* read string from tail */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to get string from head!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD)) != 0) {
-		printf("Error: headstrings do not match!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* test adding and deleting strings */
-static int
-test_cirbuf_string_add_del(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* read string from head */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to get string from head!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD)) != 0) {
-		printf("Error: head strings do not match!\n");
-		return -1;
-	}
-	/* clear tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-	/* read string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to get string from head!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD)) != 0) {
-		printf("Error: head strings do not match!\n");
-		return -1;
-	}
-	/* delete string from head*/
-	if (cirbuf_del_buf_head(&cb, sizeof(CIRBUF_STR_HEAD)) < 0) {
-		printf("Error: failed to delete string from head!\n");
-		return -1;
-	}
-	/* verify string was deleted */
-	if (cirbuf_del_head_safe(&cb) == 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-	/* clear tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to add string to tail!\n");
-		return -1;
-	}
-	/* get string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to get string from tail!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: tail strings do not match!\n");
-		return -1;
-	}
-	/* clear tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-	/* get string from head */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to get string from tail!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: tail strings do not match!\n");
-		return -1;
-	}
-	/* delete string from tail */
-	if (cirbuf_del_buf_tail(&cb, sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to delete string from tail!\n");
-		return -1;
-	}
-	/* verify string was deleted */
-	if (cirbuf_del_tail_safe(&cb) == 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* test adding from head and deleting from tail, and vice versa */
-static int
-test_cirbuf_string_add_del_reverse(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* delete string from tail */
-	if (cirbuf_del_buf_tail(&cb, sizeof(CIRBUF_STR_HEAD)) < 0) {
-		printf("Error: failed to delete string from tail!\n");
-		return -1;
-	}
-	/* verify string was deleted */
-	if (cirbuf_del_tail_safe(&cb) == 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-	/* clear tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to add string to tail!\n");
-		return -1;
-	}
-	/* delete string from head */
-	if (cirbuf_del_buf_head(&cb, sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to delete string from head!\n");
-		return -1;
-	}
-	/* verify string was deleted */
-	if (cirbuf_del_head_safe(&cb) == 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* try to write more than available */
-static int
-test_cirbuf_string_add_boundaries(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	unsigned i;
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* fill the buffer from tail */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE - sizeof(CIRBUF_STR_TAIL) + 1; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* try adding a string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			> 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* try adding a string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-			> 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* fill the buffer from head */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE - sizeof(CIRBUF_STR_HEAD) + 1; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* try adding a string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			> 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* try adding a string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-			> 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* try to read/delete more than written */
-static int
-test_cirbuf_string_get_del_boundaries(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-				!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* read more than written (head) */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_HEAD) + 1)
-			!= sizeof(CIRBUF_STR_HEAD)) {
-		printf("Error: unexpected result when reading too much data!\n");
-		return -1;
-	}
-	/* read more than written (tail) */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_HEAD) + 1)
-			!= sizeof(CIRBUF_STR_HEAD)) {
-		printf("Error: unexpected result when reading too much data!\n");
-		return -1;
-	}
-	/* delete more than written (head) */
-	if (cirbuf_del_buf_head(&cb, sizeof(CIRBUF_STR_HEAD) + 1) == 0) {
-		printf("Error: unexpected result when deleting too much data!\n");
-		return -1;
-	}
-	/* delete more than written (tail) */
-	if (cirbuf_del_buf_tail(&cb, sizeof(CIRBUF_STR_HEAD) + 1) == 0) {
-		printf("Error: unexpected result when deleting too much data!\n");
-		return -1;
-	}
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to tail */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL))
-				!= (sizeof(CIRBUF_STR_TAIL))) {
-		printf("Error: failed to add string to tail!\n");
-		return -1;
-	}
-	/* read more than written (tail) */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_TAIL) + 1)
-			!= sizeof(CIRBUF_STR_TAIL)) {
-		printf("Error: unexpected result when reading too much data!\n");
-		return -1;
-	}
-	/* read more than written (head) */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_TAIL) + 1)
-			!= sizeof(CIRBUF_STR_TAIL)) {
-		printf("Error: unexpected result when reading too much data!\n");
-		return -1;
-	}
-	/* delete more than written (tail) */
-	if (cirbuf_del_buf_tail(&cb, sizeof(CIRBUF_STR_TAIL) + 1) == 0) {
-		printf("Error: unexpected result when deleting too much data!\n");
-		return -1;
-	}
-	/* delete more than written (head) */
-	if (cirbuf_del_buf_tail(&cb, sizeof(CIRBUF_STR_TAIL) + 1) == 0) {
-		printf("Error: unexpected result when deleting too much data!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* try to read/delete less than written */
-static int
-test_cirbuf_string_get_del_partial(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-	char tmp2[CMDLINE_TEST_BUFSIZE];
-
-	/* initialize buffers */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-	memset(tmp2, 0, sizeof(tmp));
-
-	strlcpy(tmp2, CIRBUF_STR_HEAD, sizeof(tmp2));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD))
-				!= (sizeof(CIRBUF_STR_HEAD))) {
-		printf("Error: failed to add string to head!\n");
-		return -1;
-	}
-	/* read less than written (head) */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_HEAD) - 1)
-			!= sizeof(CIRBUF_STR_HEAD) - 1) {
-		printf("Error: unexpected result when reading from head!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, tmp2, sizeof(CIRBUF_STR_HEAD) - 1) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-	memset(tmp, 0, sizeof(tmp));
-	/* read less than written (tail) */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_HEAD) - 1)
-			!= sizeof(CIRBUF_STR_HEAD) - 1) {
-		printf("Error: unexpected result when reading from tail!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, &tmp2[1], sizeof(CIRBUF_STR_HEAD) - 1) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	/*
-	 * verify correct deletion
-	 */
-
-	/* clear buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-	/* delete less than written (head) */
-	if (cirbuf_del_buf_head(&cb, 1) != 0) {
-		printf("Error: delete from head failed!\n");
-		return -1;
-	}
-	/* read from head */
-	if (cirbuf_get_buf_head(&cb, tmp, sizeof(CIRBUF_STR_HEAD) - 1)
-			!= sizeof(CIRBUF_STR_HEAD) - 1) {
-		printf("Error: unexpected result when reading from head!\n");
-		return -1;
-	}
-	/* since we deleted from head, first char should be deleted */
-	if (strncmp(tmp, &tmp2[1], sizeof(CIRBUF_STR_HEAD) - 1) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-	/* clear buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-	/* delete less than written (tail) */
-	if (cirbuf_del_buf_tail(&cb, 1) != 0) {
-		printf("Error: delete from tail failed!\n");
-		return -1;
-	}
-	/* read from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp, sizeof(CIRBUF_STR_HEAD) - 2)
-			!= sizeof(CIRBUF_STR_HEAD) - 2) {
-		printf("Error: unexpected result when reading from head!\n");
-		return -1;
-	}
-	/* since we deleted from tail, last char should be deleted */
-	if (strncmp(tmp, &tmp2[1], sizeof(CIRBUF_STR_HEAD) - 2) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* test cmdline_cirbuf char add/del functions */
-static int
-test_cirbuf_char_add_del(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-
-	/* clear buffer */
-	memset(buf, 0, sizeof(buf));
-	memset(tmp, 0, sizeof(tmp));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/*
-	 * try to delete something from cirbuf. since it's empty,
-	 * these should fail.
-	 */
-	if (cirbuf_del_head_safe(&cb) == 0) {
-		printf("Error: deleting from empty cirbuf head succeeded!\n");
-		return -1;
-	}
-	if (cirbuf_del_tail_safe(&cb) == 0) {
-		printf("Error: deleting from empty cirbuf tail succeeded!\n");
-		return -1;
-	}
-
-	/*
-	 * add, verify and delete. these should pass.
-	 */
-	if (cirbuf_add_head_safe(&cb,'h') < 0) {
-		printf("Error: adding to cirbuf head failed!\n");
-		return -1;
-	}
-	if (cirbuf_get_head(&cb) != 'h') {
-		printf("Error: wrong head content!\n");
-		return -1;
-	}
-	if (cirbuf_del_head_safe(&cb) < 0) {
-		printf("Error: deleting from cirbuf head failed!\n");
-		return -1;
-	}
-	if (cirbuf_add_tail_safe(&cb,'t') < 0) {
-		printf("Error: adding to cirbuf tail failed!\n");
-		return -1;
-	}
-	if (cirbuf_get_tail(&cb) != 't') {
-		printf("Error: wrong tail content!\n");
-		return -1;
-	}
-	if (cirbuf_del_tail_safe(&cb) < 0) {
-		printf("Error: deleting from cirbuf tail failed!\n");
-		return -1;
-	}
-	/* do the same for unsafe versions. those are void. */
-	cirbuf_add_head(&cb,'h');
-	if (cirbuf_get_head(&cb) != 'h') {
-		printf("Error: wrong head content!\n");
-		return -1;
-	}
-	cirbuf_del_head(&cb);
-
-	/* test if char has been deleted. we can't call cirbuf_get_head
-	 * because it's unsafe, but we can call cirbuf_get_buf_head.
-	 */
-	if (cirbuf_get_buf_head(&cb, tmp, 1) > 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	cirbuf_add_tail(&cb,'t');
-	if (cirbuf_get_tail(&cb) != 't') {
-		printf("Error: wrong tail content!\n");
-		return -1;
-	}
-	cirbuf_del_tail(&cb);
-
-	/* test if char has been deleted. we can't call cirbuf_get_tail
-	 * because it's unsafe, but we can call cirbuf_get_buf_tail.
-	 */
-	if (cirbuf_get_buf_tail(&cb, tmp, 1) > 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* test filling up buffer with chars */
-static int
-test_cirbuf_char_fill(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	unsigned i;
-
-	/* clear buffer */
-	memset(buf, 0, sizeof(buf));
-
-	/*
-	 * initialize circular buffer
-	 */
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/*
-	 * fill the buffer from head or tail, verify contents, test boundaries
-	 * and clear the buffer
-	 */
-
-	/* fill the buffer from tail */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-	/* verify that contents of the buffer are what they are supposed to be */
-	for (i = 0; i < sizeof(buf); i++) {
-		if (buf[i] != 't') {
-			printf("Error: wrong content in buffer!\n");
-			return -1;
-		}
-	}
-	/* try to add to a full buffer from tail */
-	if (cirbuf_add_tail_safe(&cb, 't') == 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* try to add to a full buffer from head */
-	if (cirbuf_add_head_safe(&cb, 'h') == 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* delete buffer from tail */
-	for(i = 0; i < CMDLINE_TEST_BUFSIZE; i++)
-		cirbuf_del_tail_safe(&cb);
-	/* try to delete from an empty buffer */
-	if (cirbuf_del_tail_safe(&cb) >= 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	/* fill the buffer from head */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-	/* verify that contents of the buffer are what they are supposed to be */
-	for (i = 0; i < sizeof(buf); i++) {
-		if (buf[i] != 'h') {
-			printf("Error: wrong content in buffer!\n");
-			return -1;
-		}
-	}
-	/* try to add to a full buffer from head */
-	if (cirbuf_add_head_safe(&cb,'h') >= 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* try to add to a full buffer from tail */
-	if (cirbuf_add_tail_safe(&cb, 't') == 0) {
-		printf("Error: buffer should have been full!\n");
-		return -1;
-	}
-	/* delete buffer from head */
-	for(i = 0; i < CMDLINE_TEST_BUFSIZE; i++)
-		cirbuf_del_head_safe(&cb);
-	/* try to delete from an empty buffer */
-	if (cirbuf_del_head_safe(&cb) >= 0) {
-		printf("Error: buffer should have been empty!\n");
-		return -1;
-	}
-
-	/*
-	 * fill the buffer from both head and tail, with alternating characters,
-	 * verify contents and clear the buffer
-	 */
-
-	/* fill half of buffer from tail */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE / 2; i++)
-		cirbuf_add_tail_safe(&cb, (char) (i % 2 ? 't' : 'T'));
-	/* fill other half of the buffer from head */
-	for (i = 0; i < CMDLINE_TEST_BUFSIZE / 2; i++)
-		cirbuf_add_head_safe(&cb, (char) (i % 2 ? 'H' : 'h')); /* added in reverse */
-
-	/* verify that contents of the buffer are what they are supposed to be */
-	for (i = 0; i < sizeof(buf) / 2; i++) {
-		if (buf[i] != (char) (i % 2 ? 't' : 'T')) {
-			printf("Error: wrong content in buffer at %u!\n", i);
-			return -1;
-		}
-	}
-	for (i = sizeof(buf) / 2; i < sizeof(buf); i++) {
-		if (buf[i] != (char) (i % 2 ? 'h' : 'H')) {
-			printf("Error: wrong content in buffer %u!\n", i);
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-/* test left alignment */
-static int
-test_cirbuf_align_left(void)
-{
-#define HALF_OFFSET CMDLINE_TEST_BUFSIZE / 2
-#define SMALL_OFFSET HALF_OFFSET / 2
-/* resulting buffer lengths for each of the test cases */
-#define LEN1 HALF_OFFSET - SMALL_OFFSET - 1
-#define LEN2 HALF_OFFSET + SMALL_OFFSET + 2
-#define LEN3 HALF_OFFSET - SMALL_OFFSET
-#define LEN4 HALF_OFFSET + SMALL_OFFSET - 1
-
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-	unsigned i;
-
-	/*
-	 * align left when start < end and start in left half
-	 */
-
-	/*
-	 * initialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push end into left half */
-	for (i = 0; i < HALF_OFFSET - 1; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* push start into left half < end */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_del_head_safe(&cb);
-
-	/* align */
-	if (cirbuf_align_left(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* verify result */
-	if (cb.start != 0 || cb.len != LEN1 || cb.end != cb.len - 1) {
-		printf("Error: buffer alignment is wrong!\n");
-		return -1;
-	}
-
-	/*
-	 * align left when start > end and start in left half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into left half */
-	for (i = 0; i < HALF_OFFSET + 2; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half > start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* align */
-	if (cirbuf_align_left(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* verify result */
-	if (cb.start != 0 || cb.len != LEN2 || cb.end != cb.len - 1) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * align left when start < end and start in right half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into the right half */
-	for (i = 0; i < HALF_OFFSET; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half > start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_del_tail_safe(&cb);
-
-	/* align */
-	if (cirbuf_align_left(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* verify result */
-	if (cb.start != 0 || cb.len != LEN3 || cb.end != cb.len - 1) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * align left when start > end and start in right half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into the right half */
-	for (i = 0; i < HALF_OFFSET - 1; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half < start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* align */
-	if (cirbuf_align_left(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* verify result */
-	if (cb.start != 0 || cb.len != LEN4 ||
-			cb.end != cb.len - 1) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * Verify that alignment doesn't corrupt data
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to tail and head */
-	if (cirbuf_add_buf_head(&cb, CIRBUF_STR_HEAD,
-			sizeof(CIRBUF_STR_HEAD)) < 0 || cirbuf_add_buf_tail(&cb,
-					CIRBUF_STR_TAIL, sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to add strings!\n");
-		return -1;
-	}
-
-	/* align */
-	if (cirbuf_align_left(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* get string from head */
-	if (cirbuf_get_buf_head(&cb, tmp,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to read string from head!\n");
-		return -1;
-	}
-
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD "\0" CIRBUF_STR_TAIL,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	/* reset tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-	/* get string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to read string from head!\n");
-		return -1;
-	}
-
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD "\0" CIRBUF_STR_TAIL,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* test right alignment */
-static int
-test_cirbuf_align_right(void)
-{
-#define END_OFFSET CMDLINE_TEST_BUFSIZE - 1
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-	char tmp[CMDLINE_TEST_BUFSIZE];
-	unsigned i;
-
-
-	/*
-	 * align right when start < end and start in left half
-	 */
-
-	/*
-	 * initialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to initialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push end into left half */
-	for (i = 0; i < HALF_OFFSET - 1; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* push start into left half < end */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_del_head_safe(&cb);
-
-	/* align */
-	cirbuf_align_right(&cb);
-
-	/* verify result */
-	if (cb.start != END_OFFSET || cb.len != LEN1 || cb.end != cb.len - 2) {
-		printf("Error: buffer alignment is wrong!\n");
-		return -1;
-	}
-
-	/*
-	 * align right when start > end and start in left half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into left half */
-	for (i = 0; i < HALF_OFFSET + 2; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half > start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* align */
-	cirbuf_align_right(&cb);
-
-	/* verify result */
-	if (cb.start != END_OFFSET || cb.len != LEN2 || cb.end != cb.len - 2) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * align right when start < end and start in right half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into the right half */
-	for (i = 0; i < HALF_OFFSET; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half > start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_del_tail_safe(&cb);
-
-	/* align */
-	cirbuf_align_right(&cb);
-
-	/* verify result */
-	if (cb.end != END_OFFSET || cb.len != LEN3 || cb.start != cb.end - cb.len + 1) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * align right when start > end and start in right half
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* push start into the right half */
-	for (i = 0; i < HALF_OFFSET - 1; i++)
-		cirbuf_add_head_safe(&cb, 'h');
-
-	/* push end into left half < start */
-	for (i = 0; i < SMALL_OFFSET; i++)
-		cirbuf_add_tail_safe(&cb, 't');
-
-	/* align */
-	cirbuf_align_right(&cb);
-
-	/* verify result */
-	if (cb.end != END_OFFSET || cb.len != LEN4 || cb.start != cb.end - cb.len + 1) {
-		printf("Error: buffer alignment is wrong!");
-		return -1;
-	}
-
-	/*
-	 * Verify that alignment doesn't corrupt data
-	 */
-
-	/*
-	 * reinitialize circular buffer
-	 */
-	memset(buf, 0, sizeof(buf));
-	if (cirbuf_init(&cb, buf, 0, sizeof(buf)) < 0) {
-		printf("Error: failed to reinitialize circular buffer!\n");
-		return -1;
-	}
-
-	/* add string to tail and head */
-	if (cirbuf_add_buf_tail(&cb, CIRBUF_STR_TAIL,
-			sizeof(CIRBUF_STR_TAIL)) < 0 || cirbuf_add_buf_head(&cb,
-					CIRBUF_STR_HEAD, sizeof(CIRBUF_STR_HEAD)) < 0) {
-		printf("Error: failed to add strings!\n");
-		return -1;
-	}
-
-	/* align */
-	if (cirbuf_align_right(&cb) < 0) {
-		printf("Error: alignment failed!\n");
-		return -1;
-	}
-
-	/* get string from head */
-	if (cirbuf_get_buf_head(&cb, tmp,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to read string from head!\n");
-		return -1;
-	}
-
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD "\0" CIRBUF_STR_TAIL,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	/* reset tmp buffer */
-	memset(tmp, 0, sizeof(tmp));
-
-	/* get string from tail */
-	if (cirbuf_get_buf_tail(&cb, tmp,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) < 0) {
-		printf("Error: failed to read string from head!\n");
-		return -1;
-	}
-	/* verify string */
-	if (strncmp(tmp, CIRBUF_STR_HEAD "\0" CIRBUF_STR_TAIL,
-			sizeof(CIRBUF_STR_HEAD) + sizeof(CIRBUF_STR_TAIL)) != 0) {
-		printf("Error: strings mismatch!\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-/* call functions with invalid parameters */
-int
-test_cirbuf_invalid_param(void)
-{
-	struct cirbuf cb;
-	char buf[CMDLINE_TEST_BUFSIZE];
-
-	/* null cirbuf */
-	if (cirbuf_init(0, buf, 0, sizeof(buf)) == 0)
-		return -1;
-	/* null buffer */
-	if (cirbuf_init(&cb, 0, 0, sizeof(buf)) == 0)
-		return -1;
-	/* null cirbuf */
-	if (cirbuf_add_head_safe(0, 'h') == 0)
-		return -1;
-	if (cirbuf_add_tail_safe(0, 't') == 0)
-		return -1;
-	if (cirbuf_del_head_safe(0) == 0)
-		return -1;
-	if (cirbuf_del_tail_safe(0) == 0)
-		return -1;
-	/* null buffer */
-	if (cirbuf_add_buf_head(&cb, 0, 0) == 0)
-		return -1;
-	if (cirbuf_add_buf_tail(&cb, 0, 0) == 0)
-		return -1;
-	/* null cirbuf */
-	if (cirbuf_add_buf_head(0, buf, 0) == 0)
-		return -1;
-	if (cirbuf_add_buf_tail(0, buf, 0) == 0)
-		return -1;
-	/* null size */
-	if (cirbuf_add_buf_head(&cb, buf, 0) == 0)
-		return -1;
-	if (cirbuf_add_buf_tail(&cb, buf, 0) == 0)
-		return -1;
-	/* null cirbuf */
-	if (cirbuf_del_buf_head(0, 0) == 0)
-		return -1;
-	if (cirbuf_del_buf_tail(0, 0) == 0)
-		return -1;
-	/* null size */
-	if (cirbuf_del_buf_head(&cb, 0) == 0)
-		return -1;
-	if (cirbuf_del_buf_tail(&cb, 0) == 0)
-		return -1;
-	/* null cirbuf */
-	if (cirbuf_get_buf_head(0, 0, 0) == 0)
-		return -1;
-	if (cirbuf_get_buf_tail(0, 0, 0) == 0)
-		return -1;
-	/* null buffer */
-	if (cirbuf_get_buf_head(&cb, 0, 0) == 0)
-		return -1;
-	if (cirbuf_get_buf_tail(&cb, 0, 0) == 0)
-		return -1;
-	/* null size, this is valid but should return 0 */
-	if (cirbuf_get_buf_head(&cb, buf, 0) != 0)
-		return -1;
-	if (cirbuf_get_buf_tail(&cb, buf, 0) != 0)
-		return -1;
-	/* null cirbuf */
-	if (cirbuf_align_left(0) == 0)
-		return -1;
-	if (cirbuf_align_right(0) == 0)
-		return -1;
-
-	return 0;
-}
-
-/* test cmdline_cirbuf char functions */
-int
-test_cirbuf_char(void)
-{
-	int ret;
-
-	ret = test_cirbuf_char_add_del();
-	if (ret < 0)
-		return -1;
-
-	ret = test_cirbuf_char_fill();
-	if (ret < 0)
-		return -1;
-
-	return 0;
-}
-
-/* test cmdline_cirbuf string functions */
-int
-test_cirbuf_string(void)
-{
-	if (test_cirbuf_string_add_del() < 0)
-		return -1;
-
-	if (test_cirbuf_string_add_del_reverse() < 0)
-		return -1;
-
-	if (test_cirbuf_string_add_boundaries() < 0)
-		return -1;
-
-	if (test_cirbuf_string_get_del_boundaries() < 0)
-		return -1;
-
-	if (test_cirbuf_string_get_del_partial() < 0)
-		return -1;
-
-	if (test_cirbuf_string_misc() < 0)
-		return -1;
-
-	return 0;
-}
-
-/* test cmdline_cirbuf align functions */
-int
-test_cirbuf_align(void)
-{
-	if (test_cirbuf_align_left() < 0)
-		return -1;
-	if (test_cirbuf_align_right() < 0)
-		return -1;
-	return 0;
-}
diff --git a/test/test/test_cmdline_lib.c b/test/test/test_cmdline_lib.c
index a856a9713..2821d4bbf 100644
--- a/test/test/test_cmdline_lib.c
+++ b/test/test/test_cmdline_lib.c
@@ -12,8 +12,6 @@
 #include <ctype.h>
 #include <sys/queue.h>
 
-#include <cmdline_vt100.h>
-#include <cmdline_rdline.h>
 #include <cmdline_parse.h>
 #include <cmdline_socket.h>
 #include <cmdline.h>
@@ -21,113 +19,41 @@
 #include "test_cmdline.h"
 
 /****************************************************************/
-/* static functions required for some tests */
-static void
-valid_buffer(__attribute__((unused))struct rdline *rdl,
-			__attribute__((unused))const char *buf,
-			__attribute__((unused)) unsigned int size)
-{
-}
-
-static int
-complete_buffer(__attribute__((unused)) struct rdline *rdl,
-			__attribute__((unused)) const char *buf,
-			__attribute__((unused)) char *dstbuf,
-			__attribute__((unused)) unsigned int dstsize,
-			__attribute__((unused)) int *state)
-{
-	return 0;
-}
-
-/****************************************************************/
 
 static int
 test_cmdline_parse_fns(void)
 {
-	struct cmdline cl;
+	struct cmdline *cl;
 	int i = 0;
 	char dst[CMDLINE_TEST_BUFSIZE];
 
+	cl = cmdline_new(NULL, "prompt", 0, 1);
+	if (!cl)
+		goto error;
 	if (cmdline_parse(NULL, "buffer") >= 0)
 		goto error;
-	if (cmdline_parse(&cl, NULL) >= 0)
+	if (cmdline_parse(cl, NULL) >= 0)
 		goto error;
 
 	if (cmdline_complete(NULL, "buffer", &i, dst, sizeof(dst)) >= 0)
 		goto error;
-	if (cmdline_complete(&cl, NULL, &i, dst, sizeof(dst)) >= 0)
+	if (cmdline_complete(cl, NULL, &i, dst, sizeof(dst)) >= 0)
 		goto error;
-	if (cmdline_complete(&cl, "buffer", NULL, dst, sizeof(dst)) >= 0)
+	if (cmdline_complete(cl, "buffer", NULL, dst, sizeof(dst)) >= 0)
 		goto error;
-	if (cmdline_complete(&cl, "buffer", &i, NULL, sizeof(dst)) >= 0)
+	if (cmdline_complete(cl, "buffer", &i, NULL, sizeof(dst)) >= 0)
 		goto error;
 
 	return 0;
 
 error:
+	if (cl)
+		cmdline_free(cl);
 	printf("Error: function accepted null parameter!\n");
 	return -1;
 }
 
 static int
-test_cmdline_rdline_fns(void)
-{
-	struct rdline rdl;
-	rdline_write_char_t *wc = &cmdline_write_char;
-	rdline_validate_t *v = &valid_buffer;
-	rdline_complete_t *c = &complete_buffer;
-
-	if (rdline_init(NULL, wc, v, c) >= 0)
-		goto error;
-	if (rdline_init(&rdl, NULL, v, c) >= 0)
-		goto error;
-	if (rdline_init(&rdl, wc, NULL, c) >= 0)
-		goto error;
-	if (rdline_init(&rdl, wc, v, NULL) >= 0)
-		goto error;
-	if (rdline_char_in(NULL, 0) >= 0)
-		goto error;
-	if (rdline_get_buffer(NULL) != NULL)
-		goto error;
-	if (rdline_add_history(NULL, "history") >= 0)
-		goto error;
-	if (rdline_add_history(&rdl, NULL) >= 0)
-		goto error;
-	if (rdline_get_history_item(NULL, 0) != NULL)
-		goto error;
-
-	/* void functions */
-	rdline_newline(NULL, "prompt");
-	rdline_newline(&rdl, NULL);
-	rdline_stop(NULL);
-	rdline_quit(NULL);
-	rdline_restart(NULL);
-	rdline_redisplay(NULL);
-	rdline_reset(NULL);
-	rdline_clear_history(NULL);
-
-	return 0;
-
-error:
-	printf("Error: function accepted null parameter!\n");
-	return -1;
-}
-
-static int
-test_cmdline_vt100_fns(void)
-{
-	if (vt100_parser(NULL, 0) >= 0) {
-		printf("Error: function accepted null parameter!\n");
-		return -1;
-	}
-
-	/* void functions */
-	vt100_init(NULL);
-
-	return 0;
-}
-
-static int
 test_cmdline_socket_fns(void)
 {
 	cmdline_parse_ctx_t ctx;
@@ -164,7 +90,7 @@ static int
 test_cmdline_fns(void)
 {
 	cmdline_parse_ctx_t ctx;
-	struct cmdline cl, *tmp;
+	struct cmdline *tmp;
 
 	memset(&ctx, 0, sizeof(ctx));
 	tmp = cmdline_new(&ctx, "test", -1, -1);
@@ -177,10 +103,6 @@ test_cmdline_fns(void)
 		goto error;
 	if (cmdline_in(NULL, "buffer", CMDLINE_TEST_BUFSIZE) >= 0)
 		goto error;
-	if (cmdline_in(&cl, NULL, CMDLINE_TEST_BUFSIZE) >= 0)
-		goto error;
-	if (cmdline_write_char(NULL, 0) >= 0)
-		goto error;
 
 	/* void functions */
 	cmdline_set_prompt(NULL, "prompt");
@@ -191,16 +113,6 @@ test_cmdline_fns(void)
 	cmdline_interact(NULL);
 	cmdline_quit(NULL);
 
-	/* check if void calls change anything when they should fail */
-	cl = *tmp;
-
-	cmdline_printf(&cl, NULL);
-	if (memcmp(&cl, tmp, sizeof(cl))) goto mismatch;
-	cmdline_set_prompt(&cl, NULL);
-	if (memcmp(&cl, tmp, sizeof(cl))) goto mismatch;
-	cmdline_in(&cl, NULL, CMDLINE_TEST_BUFSIZE);
-	if (memcmp(&cl, tmp, sizeof(cl))) goto mismatch;
-
 	cmdline_free(tmp);
 
 	return 0;
@@ -208,9 +120,6 @@ test_cmdline_fns(void)
 error:
 	printf("Error: function accepted null parameter!\n");
 	return -1;
-mismatch:
-	printf("Error: data changed!\n");
-	return -1;
 }
 
 /* test library functions. the point of these tests is not so much to test
@@ -222,10 +131,6 @@ test_cmdline_lib(void)
 {
 	if (test_cmdline_parse_fns() < 0)
 		return -1;
-	if (test_cmdline_rdline_fns() < 0)
-		return -1;
-	if (test_cmdline_vt100_fns() < 0)
-		return -1;
 	if (test_cmdline_socket_fns() < 0)
 		return -1;
 	if (test_cmdline_fns() < 0)
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* Re: [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions
  2018-04-06 20:26  2%   ` Adrien Mazarguil
  2018-04-09 16:10  0%     ` Mohammad Abdul Awal
@ 2018-04-17 14:58  3%     ` Doherty, Declan
  1 sibling, 0 replies; 200+ results
From: Doherty, Declan @ 2018-04-17 14:58 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev

On 06/04/2018 9:26 PM, Adrien Mazarguil wrote:
> On Fri, Apr 06, 2018 at 01:24:01PM +0100, Declan Doherty wrote:
>> Add new flow action types and associated action data structures to
>> support the encapsulation and decapsulation of the virtual tunnel
>> endpoints.
>>
>> The RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP action will cause the matching
>> flow to be encapsulated in the virtual tunnel endpoint overlay
>> defined in the tunnel_encap action data.
>>
>> The RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP action will cause all virtual
>> tunnel endpoint overlays up to and including the first instance of
>> the flow item type defined in the tunnel_decap action data for the
>> matching flows.
>>
>> Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> 
> This generic approach looks flexible enough to cover the use cases that
> immediately come to mind (VLAN, VXLAN), its design is sound.
> 
> However, while I'm aware it's not a concern at this point, it won't be able
> to deal with stateful tunnel or encapsulation types (e.g. IPsec or TCP)
> which will require additional meta data or some run-time assistance from the
> application.
> 
> Eventually for more complex use cases, dedicated encap/decap actions will
> have to appear, so the issue I wanted to raise before going further is this:
> 
> Going generic inevitably trades some of the usability; flat structures
> dedicated to VXLAN encap/decap with only the needed info to get the job done
> would likely be easier to implement in PMDs and use in applications. Any
> number of such actions can be added to rte_flow without ABI impact.
> 
> If VXLAN is the only use case at this point, my suggestion would be to go
> with simpler RTE_FLOW_ACTION_TYPE_VXLAN_(ENCAP|DECAP) actions, with fixed
> L2/L3/L4/L5 header definitions to prepend according to RFC 7348.
> 
> Now we can start with the generic approach, see how it fares and add
> dedicated encap/decap later as needed.
> 
> More comments below.
> 

I understand where your coming from here now, I think the best approach 
to take is as you say to have the explicit 
RTE_FLOW_ACTION_TYPE_(PROTOCOL)_(ENCAP|DECAP) actions and make it 
explicit in the action description that it operates within the 
constraints of the corresponding RFC. I'll cover VXLAN and NVGRE as 
that's what we have been testing with today, as you point it will be 
easy to add new protocol encap/decaps without breaking ABI. Also I think 
having a explicit decap action for each protocol makes the overall usage 
model simpler as it negates the need for action data.

>> ---
>>   doc/guides/prog_guide/rte_flow.rst | 77 ++++++++++++++++++++++++++++++++--
>>   lib/librte_ether/rte_flow.h        | 84 ++++++++++++++++++++++++++++++++++++--
>>   2 files changed, 155 insertions(+), 6 deletions(-)
>>
>> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
>> index fd33d19..106fb93 100644
>> --- a/doc/guides/prog_guide/rte_flow.rst
>> +++ b/doc/guides/prog_guide/rte_flow.rst
>> @@ -997,9 +997,11 @@ Actions
>>   
>>   Each possible action is represented by a type. Some have associated
>>   configuration structures. Several actions combined in a list can be assigned
>> -to a flow rule. That list is not ordered.
>> +to a flow rule. That list is not ordered, with the exception of  actions which
>> +modify the packet itself, these packet modification actions must be specified
>> +in the explicit order in which they are to be executed.
>>   
>> -They fall in three categories:
>> +They fall in four categories:
>>   
>>   - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
>>     processing matched packets by subsequent flow rules, unless overridden
>> @@ -1008,8 +1010,11 @@ They fall in three categories:
>>   - Non-terminating actions (PASSTHRU, DUP) that leave matched packets up for
>>     additional processing by subsequent flow rules.
>>   
>> +- Non-terminating meta actions that do not affect the fate of packets but result
>> +  in modification of the packet itself (SECURITY, TUNNEL_ENCAP, TUNNEL_DECAP).
>> +
>>   - Other non-terminating meta actions that do not affect the fate of packets
>> -  (END, VOID, MARK, FLAG, COUNT, SECURITY).
>> +  (END, VOID, MARK, FLAG, COUNT).
> 
> The above changes are not necessary anymore [1][2].
> 
> [1] "ethdev: clarify flow API pattern items and actions"
>      https://dpdk.org/ml/archives/dev/2018-April/095776.html
> [2] "ethdev: alter behavior of flow API actions"
>      https://dpdk.org/ml/archives/dev/2018-April/095779.html
> 

ok, I'll remove theses in the next version

>>   When several actions are combined in a flow rule, they should all have
>>   different types (e.g. dropping a packet twice is not possible).
>> @@ -1486,6 +1491,72 @@ fields in the pattern items.
>>      | 1     | END      |
>>      +-------+----------+
>>   
>> +
> 
> Nit: titles in this file are separated by a single empty line.
> 
>> +Action: ``TUNNEL_ENCAP``
>> +^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +Performs an encapsulation action by encapsulating the flows matched by the
>> +pattern items according to the network overlay defined in the
>> +``rte_flow_action_tunnel_encap`` pattern items.
>> +
>> +This action modifies the payload of matched flows. The pattern items specified
>> +in the ``rte_flow_action_tunnel_encap`` action structure must defined a valid
>> +set of overlay headers, from the Ethernet header up to the overlay header. The
>> +pattern must be terminated with the RTE_FLOW_ITEM_TYPE_END item type.
> 
> Regarding the use of a pattern list, if you consider PMDs are already
> iterating on a list of actions when encountering
> RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, it adds yet another inner loop.
> 
> How about making each encountered RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP provide
> exactly one item instead (in encap, i.e. reverse order)?
> 
> In which case perhaps "GENERIC" would be a better fit than "TUNNEL".
> 

Personally after trying all of these approach I prefer the the protocol 
specific full encap/decap model you suggested above.

>> +
>> +- Non-terminating by default.
> 
> There's no such property anymore [2].
> 
>> +
>> +.. _table_rte_flow_action_tunnel_encap:
>> +
>> +.. table:: TUNNEL_ENCAP
>> +
>> +   +-------------+---------------------------------------------+
>> +   | Field       | Value                                       |
>> +   +=============+=============================================+
>> +   | ``pattern`` | Virtual tunnel end-point pattern definition |
>> +   +-------------+---------------------------------------------+
>> +
>> +
>> +.. _table_rte_flow_action_tunnel_encap_example:
>> +
>> +.. table:: IPv4 VxLAN flow pattern example.
> 
> VxLAN => VXLAN
> 
>> +
>> +   +-------+--------------------------+------------+
>> +   | Index | Flow Item Type           | Flow Item  |
>> +   +=======+==========================+============+
>> +   | 0     | RTE_FLOW_ITEM_TYPE_ETH   | eth item   |
>> +   +-------+--------------------------+------------+
>> +   | 1     | RTE_FLOW_ITEM_TYPE_IPV4  | ipv4 item  |
>> +   +-------+--------------------------+------------+
>> +   | 2     | RTE_FLOW_ITEM_TYPE_UDP   | udp item   |
>> +   +-------+--------------------------+------------+
>> +   | 3     | RTE_FLOW_ITEM_TYPE_VXLAN | vxlan item |
>> +   +-------+--------------------------+------------+
>> +   | 4     | RTE_FLOW_ITEM_TYPE_END   | NULL       |
>> +   +-------+--------------------------+------------+
> 
> One possible issue is that it relies on objects normally found on the
> pattern side of flow rules. Those are supposed to match something, they are
> not intended for packet header generation. While their "spec" and "mask"
> fields might make sense in this context, the "last" field is odd.
> 
> You must define them without leaving anything open for interpretation by
> PMDs and users alike. Defining things as "undefined" is fine as long as it's
> covered.
> 
>> +
>> +
> 
> Nit: only one empty line necessary here.
> 
>> +Action: ``TUNNEL_DECAP``
>> +^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +Performs a decapsulation action by stripping all headers of the virtual tunnel
>> +end-point overlay up to the header defined by the flow item type of flows
>> +matched by the pattern items.
> 
> Not necessarily, for instance if one guarantees that flowing traffic only
> consists of decap'able packets. You must avoid mandatory dependencies
> between patterns and actions since they are normally unrelated.
> 
> What you can document on the other hand is that the behavior is undefined
> when processing traffic on which the action can't be applied. This is
> how RSS level is documented [3].
> 
> [3] https://dpdk.org/ml/archives/dev/2018-April/095783.html
> 
>> +
>> +This action modifies the payload of matched flows. The flow item type specified
>> +in the ``rte_flow_action_tunnel_decap`` action structure must defined a valid
>> +set of overlay header type.
>> +
>> +- Non-terminating by default.
> 
> See [2].
> 
>> +
>> +.. _table_rte_flow_action_tunnel_decap:
>> +
>> +   +---------------+----------------------------------------------+
>> +   | Field         | Value                                        |
>> +   +===============+==============================================+
>> +   | ``item type`` | Item type of tunnel end-point to decapsulate |
>> +   +---------------+----------------------------------------------+
> 
> "item type" should be the exact name used in the structure.
> 
>> +
>>   Negative types
>>   ~~~~~~~~~~~~~~
>>   
>> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
>> index 7d1f89d..6d94423 100644
>> --- a/lib/librte_ether/rte_flow.h
>> +++ b/lib/librte_ether/rte_flow.h
>> @@ -854,14 +854,17 @@ struct rte_flow_item {
>>   	const void *mask; /**< Bit-mask applied to spec and last. */
>>   };
>>   
>> +
> 
> Unnecessary empty line.
> 
>>   /**
>>    * Action types.
>>    *
>>    * Each possible action is represented by a type. Some have associated
>>    * configuration structures. Several actions combined in a list can be
>> - * affected to a flow rule. That list is not ordered.
>> + * affected to a flow rule. That list is not ordered, with the exception of
>> + * actions which modify the packet itself, these packet modification actions
>> + * must be specified in the explicit order in which they are to be executed.
>>    *
>> - * They fall in three categories:
>> + * They fall in four categories:
>>    *
>>    * - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
>>    *   processing matched packets by subsequent flow rules, unless overridden
>> @@ -870,6 +873,10 @@ struct rte_flow_item {
>>    * - Non terminating actions (PASSTHRU, DUP) that leave matched packets up
>>    *   for additional processing by subsequent flow rules.
>>    *
>> + * - Non terminating meta actions that do not affect the fate of
>> + *   packets but result in modification of the packet itself (SECURITY,
>> + *   TUNNEL_ENCAP, TUNNEL_DECAP).
>> + *
> 
> Same comment as above [1][2].
> 
>>    * - Other non terminating meta actions that do not affect the fate of
>>    *   packets (END, VOID, MARK, FLAG, COUNT).
>>    *
>> @@ -1022,7 +1029,42 @@ enum rte_flow_action_type {
>>   	 *
>>   	 * See struct rte_flow_action_group_count.
>>   	 */
>> -	RTE_FLOW_ACTION_TYPE_GROUP_COUNT
>> +	RTE_FLOW_ACTION_TYPE_GROUP_COUNT,
> 
> An empty line would have been needed here (if we agree about no more
> GROUP_COUNT.)
> 
>> +	/**
>> +	 * Encapsulate flow with tunnel defined in
>> +	 * rte_flow_action_tunnel_encap structure.
>> +	 *
>> +	 * See struct rte_flow_action_tunnel_encap.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP,
>> +
>> +	/**
>> +	 * Decapsulate all the headers of the tunnel
>> +	 *
>> +	 * See struct rte_flow_action_tunnel_decap.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP,
>> +
>> +	/**
>> +	 * Redirects packets to the logical group of the current device.
>> +	 *
>> +	 * In a logical hierarchy of groups, which can be used to represent a
>> +	 * physical of logical chaining of flow tables, this action allows the
>> +	 * terminating action to be a logical group of the same device.
>> +	 *
>> +	 * See struct rte_flow_action_group.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_GROUP,
>> +
>> +	/**
>> +	 * [META]
>> +	 *
>> +	 * Set specific metadata field associated with packet which is then
>> +	 * available to further pipeline stages.
>> +	 *
>> +	 * See struct rte_flow_action_metadata.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_METADATA
> 
> These two actions should be part of the next patch, I won't comment them
> here.
> 

sorry that was a rebase issue.

>>   };
>>   
>>   /**
>> @@ -1173,6 +1215,42 @@ struct rte_flow_action_group_count {
>>   };
>>   
>>   /**
>> + * RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP
>> + *
>> + * Virtual tunnel end-point encapsulation action data.
>> + *
>> + * Non-terminating action by default.
> 
> See [2].
> 
>> + */
>> +struct rte_flow_action_tunnel_encap {
>> +	struct rte_flow_action_item {
>> +		enum rte_flow_item_type type;
>> +		/**< Flow item type. */
>> +		const void *item;
>> +		/**< Flow item definition which points to the data of
>> +		 * corresponding rte_flow_item_type.
>> +		 */
> 
> I see it's a new action type, albeit a bit confusing (there is no
> RTE_FLOW_ACTION_TYPE_ITEM).
> 
> I suggest the standard pattern item type since you're going with enum
> rte_flow_item_type anyway. Keep in mind you need some kind of mask to tell
> what fields are relevant. An application might otherwise want to encap with
> unsupported properties (e.g. specific IPv4 ToS field and whatnot).
> sure, that makes sense.

> How about a single "struct rte_flow_pattern_item item", neither const and
> neither a pointer. It's generic enough, enclosed spec/last/mask pointers
> take care of the specifics. You just need to define what's supposed to
> happen when "last" is set.

I think that a note to make it explicit that the last item is 
unused/ignored in the encap case should be sufficient.

> 
>> +	} *pattern;
>> +	/**<
>> +	 * Tunnel pattern specification (list terminated by the END pattern
>> +	 * item).
>> +	 */
> 
> As previously suggested, how about a single item per encap?
> 

For the encap case I still prefer a single definition as an array of 
flow items. I think it will make it easier to verify a valid pattern has 
all the required elements in terms of the protocol encapsulation being 
specified. I see how it could work the other way too, but the validity 
of the pattern definition comes from the action, not from any specific 
item in the pattern. I think that the error reporting is verbose enough 
to allow PMDs to specify exactly why a pattern is invalid for an action. 
For instance if you missed the UDP item in a VXLAN encap action 
definition, in the single item per encap model you would get the error 
on the IP item, but it may actually be perfectly valid, the error is 
that you missed the UDP item.


>> +};
>> +
>> +/**
>> + * RTE_FLOW_ACTION_TYP_TUNNEL_DECAP
>> + *
>> + * Virtual tunnel end-point decapsulation action data.
>> + *
>> + * Non-terminating action by default.
>> + */
>> +struct rte_flow_action_tunnel_decap {
>> +	enum rte_flow_item_type type;
>> +	/**<
>> +	 * Flow item type of virtual tunnel end-point to be decapsulated
>> +	 */
>> +};
> 
> Note that contrary to ENCAP, DECAP wouldn't necessarily need repeated
> actions to peel each layer off. The current definition is fine.
> 
>> +
>> +/**
>>    * Definition of a single action.
>>    *
>>    * A list of actions is terminated by a END action.
>> -- 
>> 2.7.4
>>
> 

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v3 3/4] ethdev: Add group action type to rte_flow
  2018-04-06 20:26  3%   ` Adrien Mazarguil
@ 2018-04-17 14:40  0%     ` Doherty, Declan
  0 siblings, 0 replies; 200+ results
From: Doherty, Declan @ 2018-04-17 14:40 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev

On 06/04/2018 9:26 PM, Adrien Mazarguil wrote:
> On Fri, Apr 06, 2018 at 01:24:02PM +0100, Declan Doherty wrote:
>> Add group action type which defines a terminating action which
>> allows a matched flow to be redirect to a group. This allows logical
>> flow table hierarchies to be managed through rte_flow.
>>
>> Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> 
> OK, I'm wondering if perhaps with the addition of this action, we should
> redefine groups as unlinked by default?

Sure that makes sense to me, as in a device with multiple groups I would 
image that you are more likely to have a tree hierarchy of groups, based 
on something like packet types, rather than a straight priority ordering.
> 
> Currently traffic enters through the flow rule with the lowest priority of
> the group with the lowest ID and iterates through subsequent flow rules and
> groups until matched by a flow rule without PASSTHRU (according to latest
> definition [1]).
> 
> This would make jumps between groups always explicit, not necessarily a bad
> idea given no PMD implements groups as of yet. Thoughts?

I think this is a good idea, as in the case with multiple groups which 
support the same packet time, there may be some flows which could 
potentially hit on many groups, but should be directed to a specific 
group. If group matching was always resolved in a strict priority it 
would result on a hit on an incorrect group. I think making it explicit 
from the start makes sense.

One issue is that groups will need to be populated with a specific rule 
to allow flow to fall through to the next priority group if no match is 
found on the default group.

This just requires changes to the API comments, correct? Do you want to 
capture those in [1] or would you like me to make them with this patch?

> 
> Also as a rather fundamental API addition, I suggest to add it after
> RTE_FLOW_ACTION_TYPE_PASSTHRU. It's OK because ABI is already broken. You
> just need to mention it in the commit log [1].

no problem, will do.

> 
> Another suggestion would be to rename it "JUMP" (reasons below).

I have no issue with changing the action name to JUMP
> 
> [1] "ethdev: alter behavior of flow API actions"
>      http://dpdk.org/ml/archives/dev/2018-April/095779.html
> 
>> ---
>>   doc/guides/prog_guide/rte_flow.rst | 23 +++++++++++++++++++++++
>>   lib/librte_ether/rte_flow.h        | 15 +++++++++++++++
>>   2 files changed, 38 insertions(+)
>>
>> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
>> index 106fb93..2f0a47a 100644
>> --- a/doc/guides/prog_guide/rte_flow.rst
>> +++ b/doc/guides/prog_guide/rte_flow.rst
>> @@ -1557,6 +1557,29 @@ set of overlay header type.
>>      | ``item type`` | Item type of tunnel end-point to decapsulate |
>>      +---------------+----------------------------------------------+
>>   
>> +
> 
> Unnecessary empty line.
> 
>> +Action: ``GROUP``
>> +^^^^^^^^^^^^^^^^^
>> +
>> +Redirects packets to a group on the current device.
>> +
>> +In a hierarchy of groups, which can be used to represent physical or logical
>> +flow tables on the device, this action allows the terminating action to be a
>> +group on that device.
>> +
>> +- Terminating by default.
> 
> Keep in mind there's no such thing as a terminating action anymore [1].
> 
>> +
>> +.. _table_rte_flow_action_group:
>> +
>> +.. table:: GROUP
>> +
>> +   +--------------+---------------------------------+
>> +   | Field        | Value                           |
>> +   +==============+=================================+
>> +   | ``id``       | Group ID to redirect packets to |
>> +   +--------------+---------------------------------+
> 
> "Field" column can be shrunk somewhat.
> 
>> +
>> +
>>   Negative types
>>   ~~~~~~~~~~~~~~
>>   
>> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
>> index 6d94423..968a23b 100644
>> --- a/lib/librte_ether/rte_flow.h
>> +++ b/lib/librte_ether/rte_flow.h
>> @@ -1251,6 +1251,21 @@ struct rte_flow_action_tunnel_decap {
>>   };
>>   
>>   /**
>> + * RTE_FLOW_ACTION_TYPE_GROUP
> 
> Its addition to enum rte_flow_action_type should be part of this commit.
> 
that was a rebasing issue, I'll fix in next revision

>> + *
>> + * Redirects packets to a group on the current device.
>> + *
>> + * In a hierarchy of groups, which can be used to represent physical or logical
>> + * flow tables on the device, this action allows the terminating action to be a
>> + * group on that device.
>> + *
>> + * Terminating by default.
> 
> See [1].
> 
>> + */
>> +struct rte_flow_action_group {
>> +	uint32_t id;
> 
> Assuming this structure is named rte_flow_action_jump, naming this field
> "group" would match the attribute of the same name.
> 
>> +};
>> +
>> +/**
>>    * Definition of a single action.
>>    *
>>    * A list of actions is terminated by a END action.
>> -- 
>> 2.7.4
>>
> 
> Don't forget testpmd code and documentation update.
> 

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
  2018-04-16 15:33  0%     ` Olivier Matz
@ 2018-04-17 10:23  0%     ` Burakov, Anatoly
  1 sibling, 0 replies; 200+ results
From: Burakov, Anatoly @ 2018-04-17 10:23 UTC (permalink / raw)
  To: Andrew Rybchenko, dev; +Cc: Olivier MATZ

On 16-Apr-18 2:24 PM, Andrew Rybchenko wrote:
> Size of memory chunk required to populate mempool objects depends
> on how objects are stored in the memory. Different mempool drivers
> may have different requirements and a new operation allows to
> calculate memory size in accordance with driver requirements and
> advertise requirements on minimum memory chunk size and alignment
> in a generic way.
> 
> Bump ABI version since the patch breaks it.
> 
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
> ---
> v3 -> v4:
>   - rebased on top of memory rework
>   - dropped previous Ack's since rebase is not trivial
>   - check size calculation failure in rte_mempool_populate_anon() and
>     rte_mempool_memchunk_anon_free()
> 
> v2 -> v3:
>   - none
> 
> v1 -> v2:
>   - clarify min_chunk_size meaning
>   - rebase on top of patch series which fixes library version in meson
>     build
> 
> RFCv2 -> v1:
>   - move default calc_mem_size callback to rte_mempool_ops_default.c
>   - add ABI changes to release notes
>   - name default callback consistently: rte_mempool_op_<callback>_default()
>   - bump ABI version since it is the first patch which breaks ABI
>   - describe default callback behaviour in details
>   - avoid introduction of internal function to cope with deprecation
>     (keep it to deprecation patch)
>   - move cache-line or page boundary chunk alignment to default callback
>   - highlight that min_chunk_size and align parameters are output only
> 
>   doc/guides/rel_notes/deprecation.rst         |   3 +-
>   doc/guides/rel_notes/release_18_05.rst       |   8 +-
>   lib/librte_mempool/Makefile                  |   3 +-
>   lib/librte_mempool/meson.build               |   5 +-
>   lib/librte_mempool/rte_mempool.c             | 114 +++++++++++++++------------
>   lib/librte_mempool/rte_mempool.h             |  86 +++++++++++++++++++-
>   lib/librte_mempool/rte_mempool_ops.c         |  18 +++++
>   lib/librte_mempool/rte_mempool_ops_default.c |  38 +++++++++
>   lib/librte_mempool/rte_mempool_version.map   |   7 ++
>   9 files changed, 225 insertions(+), 57 deletions(-)
>   create mode 100644 lib/librte_mempool/rte_mempool_ops_default.c

<...>

> -	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>   	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> +		size_t min_chunk_size;
>   		unsigned int flags;
> +
>   		if (try_contig || no_pageshift)
> -			size = rte_mempool_xmem_size(n, total_elt_sz, 0,
> -				mp->flags);
> +			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> +					0, &min_chunk_size, &align);
>   		else
> -			size = rte_mempool_xmem_size(n, total_elt_sz, pg_shift,
> -				mp->flags);
> +			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> +					pg_shift, &min_chunk_size, &align);
> +
> +		if (mem_size < 0) {
> +			ret = mem_size;
> +			goto fail;
> +		}
>   
>   		ret = snprintf(mz_name, sizeof(mz_name),
>   			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
> @@ -692,27 +678,31 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   		if (try_contig)
>   			flags |= RTE_MEMZONE_IOVA_CONTIG;
>   
> -		mz = rte_memzone_reserve_aligned(mz_name, size, mp->socket_id,
> -				flags, align);
> +		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> +				mp->socket_id, flags, align);
>   
> -		/* if we were trying to allocate contiguous memory, adjust
> -		 * memzone size and page size to fit smaller page sizes, and
> -		 * try again.
> +		/* if we were trying to allocate contiguous memory, failed and
> +		 * minimum required contiguous chunk fits minimum page, adjust
> +		 * memzone size to the page size, and try again.
>   		 */
> -		if (mz == NULL && try_contig) {
> +		if (mz == NULL && try_contig && min_chunk_size <= pg_sz) {

This is a bit pessimistic. There may not have been enough 
IOVA-contiguous memory to reserve `mem_size`, but there may be enough 
contiguous memory to try and reserve `min_chunk_size` contiguous memory 
if it's bigger than minimum page size. This is *minimum* page size, so 
there may be bigger pages, and ideally if (min_chunk_size >= pg_sz) && 
(min_chunk_size < mem_size), we might've tried to allocate some 
IOVA-contiguous memory, and succeed.

However, that's not a huge issue, so...

Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>

>   			try_contig = false;
>   			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
> -			align = pg_sz;
> -			size = rte_mempool_xmem_size(n, total_elt_sz,
> -				pg_shift, mp->flags);
>   
> -			mz = rte_memzone_reserve_aligned(mz_name, size,
> +			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> +					pg_shift, &min_chunk_size, &align);
> +			if (mem_size < 0) {
> +				ret = mem_size;
> +				goto fail;
> +			}
> +
> +			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
>   				mp->socket_id, flags, align);
>   		}

<...>
-- 
Thanks,
Anatoly

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v11 02/10] crypto/virtio: support virtio device init
  @ 2018-04-17  9:23  1%   ` Jay Zhou
  0 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-04-17  9:23 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

This patch implements the initialization of the virtio crypto device.
The virtio crypto device conforms to virtio-1.0, so this patch only
supports modern mode operation.
The cryptodev is created at the virtio crypto pci device probing stage.
The function of virtio_crypto_pkt_tx_burst() is used to burst transfer
packets and virtio_crypto_pkt_rx_burst() is used to burst receive packets.

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Fan Zhang <roy.fan.zhang@intel.com>
---
 drivers/crypto/virtio/Makefile           |   7 +
 drivers/crypto/virtio/meson.build        |   4 +-
 drivers/crypto/virtio/virtio_cryptodev.c | 245 +++++++++++++++-
 drivers/crypto/virtio/virtio_cryptodev.h |  13 +
 drivers/crypto/virtio/virtio_logs.h      |  91 ++++++
 drivers/crypto/virtio/virtio_pci.c       | 462 +++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h       | 253 +++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h      | 137 +++++++++
 drivers/crypto/virtio/virtio_rxtx.c      |  26 ++
 drivers/crypto/virtio/virtqueue.c        |  43 +++
 drivers/crypto/virtio/virtqueue.h        | 171 ++++++++++++
 11 files changed, 1449 insertions(+), 3 deletions(-)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtio_rxtx.c
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/drivers/crypto/virtio/Makefile b/drivers/crypto/virtio/Makefile
index 58f8cfb..be7b828 100644
--- a/drivers/crypto/virtio/Makefile
+++ b/drivers/crypto/virtio/Makefile
@@ -8,6 +8,10 @@ include $(RTE_SDK)/mk/rte.vars.mk
 #
 LIB = librte_pmd_virtio_crypto.a
 
+#
+# include virtio_crypto.h
+#
+CFLAGS += -I$(RTE_SDK)/lib/librte_vhost
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
 
@@ -18,6 +22,9 @@ LIBABIVER := 1
 #
 # all source are stored in SRCS-y
 #
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtqueue.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_pci.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_cryptodev.c
 
 # this lib depends upon:
diff --git a/drivers/crypto/virtio/meson.build b/drivers/crypto/virtio/meson.build
index 51f5b08..b15b3f9 100644
--- a/drivers/crypto/virtio/meson.build
+++ b/drivers/crypto/virtio/meson.build
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
 
+includes += include_directories('../../../lib/librte_vhost')
 deps += 'bus_pci'
 name = 'virtio_crypto'
-sources = files('virtio_cryptodev.c')
+sources = files('virtio_cryptodev.c', 'virtio_pci.c',
+		'virtio_rxtx.c', 'virtqueue.c')
diff --git a/drivers/crypto/virtio/virtio_cryptodev.c b/drivers/crypto/virtio/virtio_cryptodev.c
index 3e54942..3fe2c80 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.c
+++ b/drivers/crypto/virtio/virtio_cryptodev.c
@@ -3,27 +3,240 @@
  */
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
 #include <rte_cryptodev_pmd.h>
+#include <rte_eal.h>
 #include "virtio_cryptodev.h"
+#include "virtqueue.h"
+
+int virtio_crypto_logtype_init;
+int virtio_crypto_logtype_session;
+int virtio_crypto_logtype_rx;
+int virtio_crypto_logtype_tx;
+int virtio_crypto_logtype_driver;
+
+/*
+ * The set of PCI devices this driver supports
+ */
+static const struct rte_pci_id pci_id_virtio_crypto_map[] = {
+	{ RTE_PCI_DEVICE(VIRTIO_CRYPTO_PCI_VENDORID,
+				VIRTIO_CRYPTO_PCI_DEVICEID) },
+	{ .vendor_id = 0, /* sentinel */ },
+};
 
 uint8_t cryptodev_virtio_driver_id;
 
+/*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+static struct rte_cryptodev_ops virtio_crypto_dev_ops = {
+	/* Device related operations */
+	.dev_configure			 = NULL,
+	.dev_start			 = NULL,
+	.dev_stop			 = NULL,
+	.dev_close			 = NULL,
+	.dev_infos_get			 = NULL,
+
+	.stats_get			 = NULL,
+	.stats_reset			 = NULL,
+
+	.queue_pair_setup                = NULL,
+	.queue_pair_release              = NULL,
+	.queue_pair_start                = NULL,
+	.queue_pair_stop                 = NULL,
+	.queue_pair_count                = NULL,
+
+	/* Crypto related operations */
+	.session_get_size	= NULL,
+	.session_configure	= NULL,
+	.session_clear		= NULL,
+	.qp_attach_session = NULL,
+	.qp_detach_session = NULL
+};
+
+static int
+virtio_negotiate_features(struct virtio_crypto_hw *hw, uint64_t req_features)
+{
+	uint64_t host_features;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Prepare guest_features: feature that driver wants to support */
+	VIRTIO_CRYPTO_INIT_LOG_DBG("guest_features before negotiate = %" PRIx64,
+		req_features);
+
+	/* Read device(host) feature bits */
+	host_features = VTPCI_OPS(hw)->get_features(hw);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("host_features before negotiate = %" PRIx64,
+		host_features);
+
+	/*
+	 * Negotiate features: Subset of device feature bits are written back
+	 * guest feature bits.
+	 */
+	hw->guest_features = req_features;
+	hw->guest_features = vtpci_cryptodev_negotiate_features(hw,
+							host_features);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("features after negotiate = %" PRIx64,
+		hw->guest_features);
+
+	if (hw->modern) {
+		if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"VIRTIO_F_VERSION_1 features is not enabled.");
+			return -1;
+		}
+		vtpci_cryptodev_set_status(hw,
+			VIRTIO_CONFIG_STATUS_FEATURES_OK);
+		if (!(vtpci_cryptodev_get_status(hw) &
+			VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR("failed to set FEATURES_OK "
+						"status!");
+			return -1;
+		}
+	}
+
+	hw->req_guest_features = req_features;
+
+	return 0;
+}
+
+/* reset device and renegotiate features if needed */
+static int
+virtio_crypto_init_device(struct rte_cryptodev *cryptodev,
+	uint64_t req_features)
+{
+	struct virtio_crypto_hw *hw = cryptodev->data->dev_private;
+	struct virtio_crypto_config local_config;
+	struct virtio_crypto_config *config = &local_config;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Reset the device although not necessary at startup */
+	vtpci_cryptodev_reset(hw);
+
+	/* Tell the host we've noticed this device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
+
+	/* Tell the host we've known how to drive the device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
+	if (virtio_negotiate_features(hw, req_features) < 0)
+		return -1;
+
+	/* Get status of the device */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, status),
+		&config->status, sizeof(config->status));
+	if (config->status != VIRTIO_CRYPTO_S_HW_READY) {
+		VIRTIO_CRYPTO_DRV_LOG_ERR("accelerator hardware is "
+				"not ready");
+		return -1;
+	}
+
+	/* Get number of data queues */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, max_dataqueues),
+		&config->max_dataqueues,
+		sizeof(config->max_dataqueues));
+	hw->max_dataqueues = config->max_dataqueues;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("hw->max_dataqueues=%d",
+		hw->max_dataqueues);
+
+	return 0;
+}
+
+/*
+ * This function is based on probe() function
+ * It returns 0 on success.
+ */
+static int
+crypto_virtio_create(const char *name, struct rte_pci_device *pci_dev,
+		struct rte_cryptodev_pmd_init_params *init_params)
+{
+	struct rte_cryptodev *cryptodev;
+	struct virtio_crypto_hw *hw;
+
+	PMD_INIT_FUNC_TRACE();
+
+	cryptodev = rte_cryptodev_pmd_create(name, &pci_dev->device,
+					init_params);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
+	cryptodev->driver_id = cryptodev_virtio_driver_id;
+	cryptodev->dev_ops = &virtio_crypto_dev_ops;
+
+	cryptodev->enqueue_burst = virtio_crypto_pkt_tx_burst;
+	cryptodev->dequeue_burst = virtio_crypto_pkt_rx_burst;
+
+	cryptodev->feature_flags = RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO |
+		RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING;
+
+	hw = cryptodev->data->dev_private;
+	hw->dev_id = cryptodev->data->dev_id;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("dev %d vendorID=0x%x deviceID=0x%x",
+		cryptodev->data->dev_id, pci_dev->id.vendor_id,
+		pci_dev->id.device_id);
+
+	/* pci device init */
+	if (vtpci_cryptodev_init(pci_dev, hw))
+		return -1;
+
+	if (virtio_crypto_init_device(cryptodev,
+			VIRTIO_CRYPTO_PMD_GUEST_FEATURES) < 0)
+		return -1;
+
+	return 0;
+}
+
 static int
 crypto_virtio_pci_probe(
 	struct rte_pci_driver *pci_drv __rte_unused,
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
-	return 0;
+	struct rte_cryptodev_pmd_init_params init_params = {
+		.name = "",
+		.socket_id = rte_socket_id(),
+		.private_data_size = sizeof(struct virtio_crypto_hw),
+		.max_nb_sessions = RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS
+	};
+	char name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	VIRTIO_CRYPTO_DRV_LOG_DBG("Found Crypto device at %02x:%02x.%x",
+			pci_dev->addr.bus,
+			pci_dev->addr.devid,
+			pci_dev->addr.function);
+
+	rte_pci_device_name(&pci_dev->addr, name, sizeof(name));
+
+	return crypto_virtio_create(name, pci_dev, &init_params);
 }
 
 static int
 crypto_virtio_pci_remove(
 	struct rte_pci_device *pci_dev __rte_unused)
 {
+	struct rte_cryptodev *cryptodev;
+	char cryptodev_name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	rte_pci_device_name(&pci_dev->addr, cryptodev_name,
+			sizeof(cryptodev_name));
+
+	cryptodev = rte_cryptodev_pmd_get_named_dev(cryptodev_name);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
 	return 0;
 }
 
 static struct rte_pci_driver rte_virtio_crypto_driver = {
+	.id_table = pci_id_virtio_crypto_map,
+	.drv_flags = 0,
 	.probe = crypto_virtio_pci_probe,
 	.remove = crypto_virtio_pci_remove
 };
@@ -34,3 +247,31 @@
 RTE_PMD_REGISTER_CRYPTO_DRIVER(virtio_crypto_drv,
 	rte_virtio_crypto_driver.driver,
 	cryptodev_virtio_driver_id);
+
+RTE_INIT(virtio_crypto_init_log);
+static void
+virtio_crypto_init_log(void)
+{
+	virtio_crypto_logtype_init = rte_log_register("pmd.crypto.virtio.init");
+	if (virtio_crypto_logtype_init >= 0)
+		rte_log_set_level(virtio_crypto_logtype_init, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_session =
+		rte_log_register("pmd.crypto.virtio.session");
+	if (virtio_crypto_logtype_session >= 0)
+		rte_log_set_level(virtio_crypto_logtype_session,
+				RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_rx = rte_log_register("pmd.crypto.virtio.rx");
+	if (virtio_crypto_logtype_rx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_rx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_tx = rte_log_register("pmd.crypto.virtio.tx");
+	if (virtio_crypto_logtype_tx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_tx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_driver =
+		rte_log_register("pmd.crypto.virtio.driver");
+	if (virtio_crypto_logtype_driver >= 0)
+		rte_log_set_level(virtio_crypto_logtype_driver, RTE_LOG_NOTICE);
+}
diff --git a/drivers/crypto/virtio/virtio_cryptodev.h b/drivers/crypto/virtio/virtio_cryptodev.h
index 44517b8..392db4a 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.h
+++ b/drivers/crypto/virtio/virtio_cryptodev.h
@@ -5,6 +5,19 @@
 #ifndef _VIRTIO_CRYPTODEV_H_
 #define _VIRTIO_CRYPTODEV_H_
 
+#include <rte_cryptodev.h>
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_CRYPTO_PMD_GUEST_FEATURES (1ULL << VIRTIO_F_VERSION_1)
+
 #define CRYPTODEV_NAME_VIRTIO_PMD crypto_virtio
 
+uint16_t virtio_crypto_pkt_tx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
+uint16_t virtio_crypto_pkt_rx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* _VIRTIO_CRYPTODEV_H_ */
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..832c465
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * Following macros are derived from linux/pci_regs.h, however,
+ * we can't simply include that header here, as there is no such
+ * file for non-Linux platform.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t aval_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability,
+			 * that's why we also check if msix is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is error mapping with VFIO/UIO.
+ *   if port map error when driver type is KDRV_NONE.
+ *   if whitelisted but driver type is KDRV_UNKNOWN.
+ * Return 1 if kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Try if we can succeed reading virtio pci caps, which exists
+	 * only on modern pci device. If failed, we fallback to legacy
+	 * virtio handling.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..604ec36
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <stdint.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+#include "virtio_crypto.h"
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure stores
+ * some infos that may vary in the multiple process model locally.
+ * For example, the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other size, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
diff --git a/drivers/crypto/virtio/virtio_rxtx.c b/drivers/crypto/virtio/virtio_rxtx.c
new file mode 100644
index 0000000..51f6e09
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_rxtx.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+#include "virtio_cryptodev.h"
+
+uint16_t
+virtio_crypto_pkt_rx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **rx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_rx = 0;
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_crypto_pkt_tx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **tx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_tx = 0;
+
+	return nb_tx;
+}
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint host
+	 * not to interrupt when it consumes packets
+	 * Note: this is only considered a hint to the host
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..bf10c65
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <stdint.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+#include "virtio_crypto.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring*/
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notificaiton is through io port operation
+	 * which is a serialization instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
-- 
1.8.3.1

^ permalink raw reply	[relevance 1%]

* Re: [dpdk-dev] [PATCH v5 00/11] Bunch of flow API-related fixes
  2018-04-16 16:21  3%   ` [dpdk-dev] [PATCH v5 " Adrien Mazarguil
@ 2018-04-17  9:17  0%     ` Ferruh Yigit
  2018-04-19 10:07  3%     ` [dpdk-dev] [PATCH v6 " Adrien Mazarguil
  1 sibling, 0 replies; 200+ results
From: Ferruh Yigit @ 2018-04-17  9:17 UTC (permalink / raw)
  To: Adrien Mazarguil, dev

On 4/16/2018 5:21 PM, Adrien Mazarguil wrote:
> This series contains several fixes for rte_flow and its implementation in
> PMDs and testpmd. Upcoming work on the flow API depends on it.
> 
> v5 changes:
> 
> - No change, rebased series to address conflicts.
> 
> v4 changes:
> 
> - Rebased again.
> - The reliance on rte_eth_dev_rss_hash_conf_get() was removed from patch #7,
>   see updated patch for details.
> 
> v3 changes:
> 
> - Rebased series.
> - Dropped unnecessary "net/sfc: fix endian conversions in flow API".
> - Dropped "ethdev: fix ABI version in meson build", handled by prior commit
>   d9736a248785 ("ethdev: fix library version in meson build").
> 
> v2 changes:
> 
> - mlx5 fix (patch #3).
> - bnxt fix (patch #4).
> - sfc fix (patch #6).
> - Missing include (patch #13).
> 
> Adrien Mazarguil (11):
>   net/mlx4: fix RSS resource leak in case of error
>   net/mlx4: fix ignored RSS hash types
>   net/mlx5: fix RSS flow action bounds check
>   net/bnxt: fix matching of flow API item masks
>   app/testpmd: fix flow completion for RSS queues
>   app/testpmd: fix lack of flow action configuration
>   app/testpmd: fix RSS flow action configuration
>   app/testpmd: fix missing RSS fields in flow action
>   ethdev: fix shallow copy of flow API RSS action
>   ethdev: fix missing boolean values in flow command
>   ethdev: fix missing include in flow API

Is there any more comments / objections to this patchset?
If there is no objection it will be merged soon.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 15/16] ethdev: add physical port action to flow API
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 15/16] ethdev: add physical port action to " Adrien Mazarguil
@ 2018-04-17  9:08  0%         ` Mohammad Abdul Awal
  0 siblings, 0 replies; 200+ results
From: Mohammad Abdul Awal @ 2018-04-17  9:08 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z



On 16/04/2018 17:22, Adrien Mazarguil wrote:
> This patch adds the missing action counterpart to the PHY_PORT pattern
> item, that is, the ability to directly inject matching traffic into a
> physical port of the underlying device.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>

Acked-by: Mohammad Abdul Awal <mohammad.abdul.awal@intel.com>

> ---
>   app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
>   app/test-pmd/config.c                       |  1 +
>   doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
>   lib/librte_ether/rte_flow.c                 |  1 +
>   lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
>   6 files changed, 84 insertions(+)
>
> diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
> index f9f937277..356714801 100644
> --- a/app/test-pmd/cmdline_flow.c
> +++ b/app/test-pmd/cmdline_flow.c
> @@ -182,6 +182,9 @@ enum index {
>   	ACTION_VF,
>   	ACTION_VF_ORIGINAL,
>   	ACTION_VF_ID,
> +	ACTION_PHY_PORT,
> +	ACTION_PHY_PORT_ORIGINAL,
> +	ACTION_PHY_PORT_INDEX,
>   	ACTION_METER,
>   	ACTION_METER_ID,
>   };
> @@ -623,6 +626,7 @@ static const enum index next_action[] = {
>   	ACTION_RSS,
>   	ACTION_PF,
>   	ACTION_VF,
> +	ACTION_PHY_PORT,
>   	ACTION_METER,
>   	ZERO,
>   };
> @@ -657,6 +661,13 @@ static const enum index action_vf[] = {
>   	ZERO,
>   };
>   
> +static const enum index action_phy_port[] = {
> +	ACTION_PHY_PORT_ORIGINAL,
> +	ACTION_PHY_PORT_INDEX,
> +	ACTION_NEXT,
> +	ZERO,
> +};
> +
>   static const enum index action_meter[] = {
>   	ACTION_METER_ID,
>   	ACTION_NEXT,
> @@ -1714,6 +1725,30 @@ static const struct token token_list[] = {
>   		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
>   		.call = parse_vc_conf,
>   	},
> +	[ACTION_PHY_PORT] = {
> +		.name = "phy_port",
> +		.help = "direct packets to physical port index",
> +		.priv = PRIV_ACTION(PHY_PORT,
> +				    sizeof(struct rte_flow_action_phy_port)),
> +		.next = NEXT(action_phy_port),
> +		.call = parse_vc,
> +	},
> +	[ACTION_PHY_PORT_ORIGINAL] = {
> +		.name = "original",
> +		.help = "use original port index if possible",
> +		.next = NEXT(action_phy_port, NEXT_ENTRY(BOOLEAN)),
> +		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_phy_port,
> +					   original, 1)),
> +		.call = parse_vc_conf,
> +	},
> +	[ACTION_PHY_PORT_INDEX] = {
> +		.name = "index",
> +		.help = "physical port index",
> +		.next = NEXT(action_phy_port, NEXT_ENTRY(UNSIGNED)),
> +		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_phy_port,
> +					index)),
> +		.call = parse_vc_conf,
> +	},
>   	[ACTION_METER] = {
>   		.name = "meter",
>   		.help = "meter the directed packets at given id",
> diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
> index 840320108..2d68f1fb0 100644
> --- a/app/test-pmd/config.c
> +++ b/app/test-pmd/config.c
> @@ -1074,6 +1074,7 @@ static const struct {
>   	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
>   	MK_FLOW_ACTION(PF, 0),
>   	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
> +	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
>   	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
>   };
>   
> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> index 4e053c24b..a39c1e1b0 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -1433,6 +1433,26 @@ See `Item: VF`_.
>      | ``id``       | VF ID                          |
>      +--------------+--------------------------------+
>   
> +Action: ``PHY_PORT``
> +^^^^^^^^^^^^^^^^^^^^
> +
> +Directs matching traffic to a given physical port index of the underlying
> +device.
> +
> +See `Item: PHY_PORT`_.
> +
> +.. _table_rte_flow_action_phy_port:
> +
> +.. table:: PHY_PORT
> +
> +   +--------------+-------------------------------------+
> +   | Field        | Value                               |
> +   +==============+=====================================+
> +   | ``original`` | use original port index if possible |
> +   +--------------+-------------------------------------+
> +   | ``index``    | physical port index                 |
> +   +--------------+-------------------------------------+
> +
>   Action: ``METER``
>   ^^^^^^^^^^^^^^^^^
>   
> diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
> index a2bbd1930..64d8dfddb 100644
> --- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
> +++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
> @@ -3423,6 +3423,11 @@ This section lists supported actions and their attributes, if any.
>     - ``original {boolean}``: use original VF ID if possible.
>     - ``id {unsigned}``: VF ID.
>   
> +- ``phy_port``: direct packets to physical port index.
> +
> +  - ``original {boolean}``: use original port index if possible.
> +  - ``index {unsigned}``: physical port index.
> +
>   Destroying flow rules
>   ~~~~~~~~~~~~~~~~~~~~~
>   
> diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
> index 36e277a4f..00989c73b 100644
> --- a/lib/librte_ether/rte_flow.c
> +++ b/lib/librte_ether/rte_flow.c
> @@ -76,6 +76,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
>   	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
>   	MK_FLOW_ACTION(PF, 0),
>   	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
> +	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
>   };
>   
>   static int
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 2c7c4d009..58b75e934 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -989,6 +989,14 @@ enum rte_flow_action_type {
>   	RTE_FLOW_ACTION_TYPE_VF,
>   
>   	/**
> +	 * Directs packets to a given physical port index of the underlying
> +	 * device.
> +	 *
> +	 * See struct rte_flow_action_phy_port.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_PHY_PORT,
> +
> +	/**
>   	 * Traffic metering and policing (MTR).
>   	 *
>   	 * See struct rte_flow_action_meter.
> @@ -1112,6 +1120,20 @@ struct rte_flow_action_vf {
>   };
>   
>   /**
> + * RTE_FLOW_ACTION_TYPE_PHY_PORT
> + *
> + * Directs packets to a given physical port index of the underlying
> + * device.
> + *
> + * @see RTE_FLOW_ITEM_TYPE_PHY_PORT
> + */
> +struct rte_flow_action_phy_port {
> +	uint32_t original:1; /**< Use original port index if possible. */
> +	uint32_t reserved:31; /**< Reserved, must be zero. */
> +	uint32_t index; /**< Physical port index. */
> +};
> +
> +/**
>    * RTE_FLOW_ACTION_TYPE_METER
>    *
>    * Traffic metering and policing (MTR).

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v4 16/16] ethdev: add port ID item and action to flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (11 preceding siblings ...)
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 15/16] ethdev: add physical port action to " Adrien Mazarguil
@ 2018-04-16 16:23  2%       ` Adrien Mazarguil
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:23 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z, Declan Doherty

RTE_FLOW_ACTION_TYPE_PORT_ID brings the ability to inject matching traffic
into a different device, as identified by its DPDK port ID.

This is normally only supported when the target port ID has some kind of
relationship with the port ID the flow rule is created against, such as
being exposed by a common physical device (e.g. a different port of an
Ethernet switch).

The converse pattern item, RTE_FLOW_ITEM_TYPE_PORT_ID, makes the resulting
flow rule match traffic whose origin is the specified port ID. Note that
specifying a port ID that differs from the one the flow rule is created
against is normally meaningless (if even accepted), but can make sense if
combined with the transfer attribute.

These must not be confused with their PHY_PORT counterparts, which refer to
physical ports using device-specific indices, but unlike PORT_ID are not
necessarily tied to DPDK port IDs.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Qi Zhang <qi.z.zhang@intel.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
Cc: Declan Doherty <declan.doherty@intel.com>

---

This patch provides the same functionality and supersedes Qi Zhang's
"ether: add flow action to redirect packet to a port" [1].

The main differences are:

- Action is named PORT_ID instead of PORT.
- Addition of a PORT_ID pattern item.
- More extensive documentation.
- Testpmd support.
- rte_flow_copy() support.

[1] http://dpdk.org/ml/archives/dev/2018-April/094648.html
---
 app/test-pmd/cmdline_flow.c                 | 57 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  2 +
 doc/guides/prog_guide/rte_flow.rst          | 48 ++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  9 ++++
 lib/librte_ether/rte_flow.c                 |  2 +
 lib/librte_ether/rte_flow.h                 | 56 +++++++++++++++++++++++
 6 files changed, 174 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 356714801..32fe6645a 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -89,6 +89,8 @@ enum index {
 	ITEM_VF_ID,
 	ITEM_PHY_PORT,
 	ITEM_PHY_PORT_INDEX,
+	ITEM_PORT_ID,
+	ITEM_PORT_ID_ID,
 	ITEM_RAW,
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -185,6 +187,9 @@ enum index {
 	ACTION_PHY_PORT,
 	ACTION_PHY_PORT_ORIGINAL,
 	ACTION_PHY_PORT_INDEX,
+	ACTION_PORT_ID,
+	ACTION_PORT_ID_ORIGINAL,
+	ACTION_PORT_ID_ID,
 	ACTION_METER,
 	ACTION_METER_ID,
 };
@@ -445,6 +450,7 @@ static const enum index next_item[] = {
 	ITEM_PF,
 	ITEM_VF,
 	ITEM_PHY_PORT,
+	ITEM_PORT_ID,
 	ITEM_RAW,
 	ITEM_ETH,
 	ITEM_VLAN,
@@ -491,6 +497,12 @@ static const enum index item_phy_port[] = {
 	ZERO,
 };
 
+static const enum index item_port_id[] = {
+	ITEM_PORT_ID_ID,
+	ITEM_NEXT,
+	ZERO,
+};
+
 static const enum index item_raw[] = {
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -627,6 +639,7 @@ static const enum index next_action[] = {
 	ACTION_PF,
 	ACTION_VF,
 	ACTION_PHY_PORT,
+	ACTION_PORT_ID,
 	ACTION_METER,
 	ZERO,
 };
@@ -668,6 +681,13 @@ static const enum index action_phy_port[] = {
 	ZERO,
 };
 
+static const enum index action_port_id[] = {
+	ACTION_PORT_ID_ORIGINAL,
+	ACTION_PORT_ID_ID,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static const enum index action_meter[] = {
 	ACTION_METER_ID,
 	ACTION_NEXT,
@@ -1084,6 +1104,20 @@ static const struct token token_list[] = {
 		.next = NEXT(item_phy_port, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_phy_port, index)),
 	},
+	[ITEM_PORT_ID] = {
+		.name = "port_id",
+		.help = "match traffic from/to a given DPDK port ID",
+		.priv = PRIV_ITEM(PORT_ID,
+				  sizeof(struct rte_flow_item_port_id)),
+		.next = NEXT(item_port_id),
+		.call = parse_vc,
+	},
+	[ITEM_PORT_ID_ID] = {
+		.name = "id",
+		.help = "DPDK port ID",
+		.next = NEXT(item_port_id, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_port_id, id)),
+	},
 	[ITEM_RAW] = {
 		.name = "raw",
 		.help = "match an arbitrary byte string",
@@ -1749,6 +1783,29 @@ static const struct token token_list[] = {
 					index)),
 		.call = parse_vc_conf,
 	},
+	[ACTION_PORT_ID] = {
+		.name = "port_id",
+		.help = "direct matching traffic to a given DPDK port ID",
+		.priv = PRIV_ACTION(PORT_ID,
+				    sizeof(struct rte_flow_action_port_id)),
+		.next = NEXT(action_port_id),
+		.call = parse_vc,
+	},
+	[ACTION_PORT_ID_ORIGINAL] = {
+		.name = "original",
+		.help = "use original DPDK port ID if possible",
+		.next = NEXT(action_port_id, NEXT_ENTRY(BOOLEAN)),
+		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_port_id,
+					   original, 1)),
+		.call = parse_vc_conf,
+	},
+	[ACTION_PORT_ID_ID] = {
+		.name = "id",
+		.help = "DPDK port ID",
+		.next = NEXT(action_port_id, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_port_id, id)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_METER] = {
 		.name = "meter",
 		.help = "meter the directed packets at given id",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 2d68f1fb0..e7026011b 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -977,6 +977,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
+	MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
@@ -1075,6 +1076,7 @@ static const struct {
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
+	MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
 };
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a39c1e1b0..2fb8e9c3f 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -617,6 +617,36 @@ associated with a port_id should be retrieved by other means.
    | ``mask`` | ``index`` | zeroed to match any port index |
    +----------+-----------+--------------------------------+
 
+Item: ``PORT_ID``
+^^^^^^^^^^^^^^^^^
+
+Matches traffic originating from (ingress) or going to (egress) a given DPDK
+port ID.
+
+Normally only supported if the port ID in question is known by the
+underlying PMD and related to the device the flow rule is created against.
+
+This must not be confused with `Item: PHY_PORT`_ which refers to the
+physical port of a device, whereas `Item: PORT_ID`_ refers to a ``struct
+rte_eth_dev`` object on the application side (also known as "port
+representor" depending on the kind of underlying device).
+
+- Default ``mask`` matches the specified DPDK port ID.
+
+.. _table_rte_flow_item_port_id:
+
+.. table:: PORT_ID
+
+   +----------+----------+-----------------------------+
+   | Field    | Subfield | Value                       |
+   +==========+==========+=============================+
+   | ``spec`` | ``id``   | DPDK port ID                |
+   +----------+----------+-----------------------------+
+   | ``last`` | ``id``   | upper range value           |
+   +----------+----------+-----------------------------+
+   | ``mask`` | ``id``   | zeroed to match any port ID |
+   +----------+----------+-----------------------------+
+
 Data matching item types
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1453,6 +1483,24 @@ See `Item: PHY_PORT`_.
    | ``index``    | physical port index                 |
    +--------------+-------------------------------------+
 
+Action: ``PORT_ID``
+^^^^^^^^^^^^^^^^^^^
+Directs matching traffic to a given DPDK port ID.
+
+See `Item: PORT_ID`_.
+
+.. _table_rte_flow_action_port_id:
+
+.. table:: PORT_ID
+
+   +--------------+---------------------------------------+
+   | Field        | Value                                 |
+   +==============+=======================================+
+   | ``original`` | use original DPDK port ID if possible |
+   +--------------+---------------------------------------+
+   | ``id``       | DPDK port ID                          |
+   +--------------+---------------------------------------+
+
 Action: ``METER``
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 64d8dfddb..bfb5ad027 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3212,6 +3212,10 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``index {unsigned}``: physical port index.
 
+- ``port_id``: match traffic from/to a given DPDK port ID.
+
+  - ``id {unsigned}``: DPDK port ID.
+
 - ``raw``: match an arbitrary byte string.
 
   - ``relative {boolean}``: look for pattern after the previous item.
@@ -3428,6 +3432,11 @@ This section lists supported actions and their attributes, if any.
   - ``original {boolean}``: use original port index if possible.
   - ``index {unsigned}``: physical port index.
 
+- ``port_id``: direct matching traffic to a given DPDK port ID.
+
+  - ``original {boolean}``: use original DPDK port ID if possible.
+  - ``id {unsigned}``: DPDK port ID.
+
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 00989c73b..cecab59f6 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,6 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
+	MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
@@ -77,6 +78,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
+	MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
 };
 
 static int
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 58b75e934..09a21e531 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -180,6 +180,16 @@ enum rte_flow_item_type {
 	RTE_FLOW_ITEM_TYPE_PHY_PORT,
 
 	/**
+	 * [META]
+	 *
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * given DPDK port ID.
+	 *
+	 * See struct rte_flow_item_port_id.
+	 */
+	RTE_FLOW_ITEM_TYPE_PORT_ID,
+
+	/**
 	 * Matches a byte string of a given length at a given offset.
 	 *
 	 * See struct rte_flow_item_raw.
@@ -414,6 +424,32 @@ static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = {
 #endif
 
 /**
+ * RTE_FLOW_ITEM_TYPE_PORT_ID
+ *
+ * Matches traffic originating from (ingress) or going to (egress) a given
+ * DPDK port ID.
+ *
+ * Normally only supported if the port ID in question is known by the
+ * underlying PMD and related to the device the flow rule is created
+ * against.
+ *
+ * This must not be confused with @p PHY_PORT which refers to the physical
+ * port of a device, whereas @p PORT_ID refers to a struct rte_eth_dev
+ * object on the application side (also known as "port representor"
+ * depending on the kind of underlying device).
+ */
+struct rte_flow_item_port_id {
+	uint32_t id; /**< DPDK port ID. */
+};
+
+/** Default mask for RTE_FLOW_ITEM_TYPE_PORT_ID. */
+#ifndef __cplusplus
+static const struct rte_flow_item_port_id rte_flow_item_port_id_mask = {
+	.id = 0xffffffff,
+};
+#endif
+
+/**
  * RTE_FLOW_ITEM_TYPE_RAW
  *
  * Matches a byte string of a given length at a given offset.
@@ -997,6 +1033,13 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_PHY_PORT,
 
 	/**
+	 * Directs matching traffic to a given DPDK port ID.
+	 *
+	 * See struct rte_flow_action_port_id.
+	 */
+	RTE_FLOW_ACTION_TYPE_PORT_ID,
+
+	/**
 	 * Traffic metering and policing (MTR).
 	 *
 	 * See struct rte_flow_action_meter.
@@ -1134,6 +1177,19 @@ struct rte_flow_action_phy_port {
 };
 
 /**
+ * RTE_FLOW_ACTION_TYPE_PORT_ID
+ *
+ * Directs matching traffic to a given DPDK port ID.
+ *
+ * @see RTE_FLOW_ITEM_TYPE_PORT_ID
+ */
+struct rte_flow_action_port_id {
+	uint32_t original:1; /**< Use original DPDK port ID if possible. */
+	uint32_t reserved:31; /**< Reserved, must be zero. */
+	uint32_t id; /**< DPDK port ID. */
+};
+
+/**
  * RTE_FLOW_ACTION_TYPE_METER
  *
  * Traffic metering and policing (MTR).
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 15/16] ethdev: add physical port action to flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (10 preceding siblings ...)
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 14/16] ethdev: rename physical port item " Adrien Mazarguil
@ 2018-04-16 16:22  3%       ` Adrien Mazarguil
  2018-04-17  9:08  0%         ` Mohammad Abdul Awal
  2018-04-16 16:23  2%       ` [dpdk-dev] [PATCH v4 16/16] ethdev: add port ID item and " Adrien Mazarguil
  2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z

This patch adds the missing action counterpart to the PHY_PORT pattern
item, that is, the ability to directly inject matching traffic into a
physical port of the underlying device.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
 6 files changed, 84 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index f9f937277..356714801 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -182,6 +182,9 @@ enum index {
 	ACTION_VF,
 	ACTION_VF_ORIGINAL,
 	ACTION_VF_ID,
+	ACTION_PHY_PORT,
+	ACTION_PHY_PORT_ORIGINAL,
+	ACTION_PHY_PORT_INDEX,
 	ACTION_METER,
 	ACTION_METER_ID,
 };
@@ -623,6 +626,7 @@ static const enum index next_action[] = {
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
+	ACTION_PHY_PORT,
 	ACTION_METER,
 	ZERO,
 };
@@ -657,6 +661,13 @@ static const enum index action_vf[] = {
 	ZERO,
 };
 
+static const enum index action_phy_port[] = {
+	ACTION_PHY_PORT_ORIGINAL,
+	ACTION_PHY_PORT_INDEX,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static const enum index action_meter[] = {
 	ACTION_METER_ID,
 	ACTION_NEXT,
@@ -1714,6 +1725,30 @@ static const struct token token_list[] = {
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
 		.call = parse_vc_conf,
 	},
+	[ACTION_PHY_PORT] = {
+		.name = "phy_port",
+		.help = "direct packets to physical port index",
+		.priv = PRIV_ACTION(PHY_PORT,
+				    sizeof(struct rte_flow_action_phy_port)),
+		.next = NEXT(action_phy_port),
+		.call = parse_vc,
+	},
+	[ACTION_PHY_PORT_ORIGINAL] = {
+		.name = "original",
+		.help = "use original port index if possible",
+		.next = NEXT(action_phy_port, NEXT_ENTRY(BOOLEAN)),
+		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_phy_port,
+					   original, 1)),
+		.call = parse_vc_conf,
+	},
+	[ACTION_PHY_PORT_INDEX] = {
+		.name = "index",
+		.help = "physical port index",
+		.next = NEXT(action_phy_port, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_phy_port,
+					index)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_METER] = {
 		.name = "meter",
 		.help = "meter the directed packets at given id",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 840320108..2d68f1fb0 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1074,6 +1074,7 @@ static const struct {
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
+	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
 };
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 4e053c24b..a39c1e1b0 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1433,6 +1433,26 @@ See `Item: VF`_.
    | ``id``       | VF ID                          |
    +--------------+--------------------------------+
 
+Action: ``PHY_PORT``
+^^^^^^^^^^^^^^^^^^^^
+
+Directs matching traffic to a given physical port index of the underlying
+device.
+
+See `Item: PHY_PORT`_.
+
+.. _table_rte_flow_action_phy_port:
+
+.. table:: PHY_PORT
+
+   +--------------+-------------------------------------+
+   | Field        | Value                               |
+   +==============+=====================================+
+   | ``original`` | use original port index if possible |
+   +--------------+-------------------------------------+
+   | ``index``    | physical port index                 |
+   +--------------+-------------------------------------+
+
 Action: ``METER``
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index a2bbd1930..64d8dfddb 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3423,6 +3423,11 @@ This section lists supported actions and their attributes, if any.
   - ``original {boolean}``: use original VF ID if possible.
   - ``id {unsigned}``: VF ID.
 
+- ``phy_port``: direct packets to physical port index.
+
+  - ``original {boolean}``: use original port index if possible.
+  - ``index {unsigned}``: physical port index.
+
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 36e277a4f..00989c73b 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -76,6 +76,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
+	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
 };
 
 static int
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 2c7c4d009..58b75e934 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -989,6 +989,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VF,
 
 	/**
+	 * Directs packets to a given physical port index of the underlying
+	 * device.
+	 *
+	 * See struct rte_flow_action_phy_port.
+	 */
+	RTE_FLOW_ACTION_TYPE_PHY_PORT,
+
+	/**
 	 * Traffic metering and policing (MTR).
 	 *
 	 * See struct rte_flow_action_meter.
@@ -1112,6 +1120,20 @@ struct rte_flow_action_vf {
 };
 
 /**
+ * RTE_FLOW_ACTION_TYPE_PHY_PORT
+ *
+ * Directs packets to a given physical port index of the underlying
+ * device.
+ *
+ * @see RTE_FLOW_ITEM_TYPE_PHY_PORT
+ */
+struct rte_flow_action_phy_port {
+	uint32_t original:1; /**< Use original port index if possible. */
+	uint32_t reserved:31; /**< Reserved, must be zero. */
+	uint32_t index; /**< Physical port index. */
+};
+
+/**
  * RTE_FLOW_ACTION_TYPE_METER
  *
  * Traffic metering and policing (MTR).
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 14/16] ethdev: rename physical port item in flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (9 preceding siblings ...)
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
@ 2018-04-16 16:22  2%       ` Adrien Mazarguil
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 15/16] ethdev: add physical port action to " Adrien Mazarguil
                         ` (2 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

While RTE_FLOW_ITEM_TYPE_PORT refers to physical ports of the underlying
device using specific identifiers, these are often confused with DPDK port
IDs exposed to applications in the global name space.

Since this pattern item is seldom used, rename it RTE_FLOW_ITEM_PHY_PORT
for better clarity.

No ABI impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/cmdline_flow.c                 | 27 +++++++++++----------
 app/test-pmd/config.c                       |  2 +-
 doc/guides/prog_guide/rte_flow.rst          | 22 ++++++++---------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 +-
 lib/librte_ether/rte_flow.c                 |  2 +-
 lib/librte_ether/rte_flow.h                 | 31 ++++++++++--------------
 6 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 41103de67..f9f937277 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -87,8 +87,8 @@ enum index {
 	ITEM_PF,
 	ITEM_VF,
 	ITEM_VF_ID,
-	ITEM_PORT,
-	ITEM_PORT_INDEX,
+	ITEM_PHY_PORT,
+	ITEM_PHY_PORT_INDEX,
 	ITEM_RAW,
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -441,7 +441,7 @@ static const enum index next_item[] = {
 	ITEM_ANY,
 	ITEM_PF,
 	ITEM_VF,
-	ITEM_PORT,
+	ITEM_PHY_PORT,
 	ITEM_RAW,
 	ITEM_ETH,
 	ITEM_VLAN,
@@ -482,8 +482,8 @@ static const enum index item_vf[] = {
 	ZERO,
 };
 
-static const enum index item_port[] = {
-	ITEM_PORT_INDEX,
+static const enum index item_phy_port[] = {
+	ITEM_PHY_PORT_INDEX,
 	ITEM_NEXT,
 	ZERO,
 };
@@ -1059,18 +1059,19 @@ static const struct token token_list[] = {
 		.next = NEXT(item_vf, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_vf, id)),
 	},
-	[ITEM_PORT] = {
-		.name = "port",
-		.help = "device-specific physical port index to use",
-		.priv = PRIV_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-		.next = NEXT(item_port),
+	[ITEM_PHY_PORT] = {
+		.name = "phy_port",
+		.help = "match traffic from/to a specific physical port",
+		.priv = PRIV_ITEM(PHY_PORT,
+				  sizeof(struct rte_flow_item_phy_port)),
+		.next = NEXT(item_phy_port),
 		.call = parse_vc,
 	},
-	[ITEM_PORT_INDEX] = {
+	[ITEM_PHY_PORT_INDEX] = {
 		.name = "index",
 		.help = "physical port index",
-		.next = NEXT(item_port, NEXT_ENTRY(UNSIGNED), item_param),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_port, index)),
+		.next = NEXT(item_phy_port, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_phy_port, index)),
 	},
 	[ITEM_RAW] = {
 		.name = "raw",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index a50a5c544..840320108 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -976,7 +976,7 @@ static const struct {
 	MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)),
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
-	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
+	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a0a124aa2..4e053c24b 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -195,8 +195,8 @@ When supported, this effectively enables an application to reroute traffic
 not necessarily intended for it (e.g. coming from or addressed to different
 physical ports, VFs or applications) at the device level.
 
-It complements the behavior of some pattern items such as `Item: PORT`_ and
-is meaningless without them.
+It complements the behavior of some pattern items such as `Item: PHY_PORT`_
+and is meaningless without them.
 
 When transferring flow rules, **ingress** and **egress** attributes
 (`Attribute: Traffic direction`_) keep their original meaning, as if
@@ -583,15 +583,15 @@ separate entities, should be addressed through their own DPDK port IDs.
    | ``mask`` | ``id``   | zeroed to match any VF ID |
    +----------+----------+---------------------------+
 
-Item: ``PORT``
-^^^^^^^^^^^^^^
+Item: ``PHY_PORT``
+^^^^^^^^^^^^^^^^^^
 
-Matches packets coming from the specified physical port of the underlying
-device.
+Matches traffic originating from (ingress) or going to (egress) a physical
+port of the underlying device.
 
-The first PORT item overrides the physical port normally associated with the
-specified DPDK input port (port_id). This item can be provided several times
-to match additional physical ports.
+The first PHY_PORT item overrides the physical port normally associated with
+the specified DPDK input port (port_id). This item can be provided several
+times to match additional physical ports.
 
 Note that physical ports are not necessarily tied to DPDK input ports
 (port_id) when those are not under DPDK control. Possible values are
@@ -603,9 +603,9 @@ associated with a port_id should be retrieved by other means.
 
 - Default ``mask`` matches any port index.
 
-.. _table_rte_flow_item_port:
+.. _table_rte_flow_item_phy_port:
 
-.. table:: PORT
+.. table:: PHY_PORT
 
    +----------+-----------+--------------------------------+
    | Field    | Subfield  | Value                          |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index af37c3d82..a2bbd1930 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3208,7 +3208,7 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``id {unsigned}``: VF ID.
 
-- ``port``: device-specific physical port index to use.
+- ``phy_port``: match traffic from/to a specific physical port.
 
   - ``index {unsigned}``: physical port index.
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 83b733ff0..36e277a4f 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -38,7 +38,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)),
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
-	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
+	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index f1c7a664e..2c7c4d009 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -84,7 +84,7 @@ struct rte_flow_attr {
 	 * applications) at the device level.
 	 *
 	 * It complements the behavior of some pattern items such as
-	 * RTE_FLOW_ITEM_TYPE_PORT and is meaningless without them.
+	 * RTE_FLOW_ITEM_TYPE_PHY_PORT and is meaningless without them.
 	 *
 	 * When transferring flow rules, ingress and egress attributes keep
 	 * their original meaning, as if processing traffic emitted or
@@ -172,17 +172,12 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets coming from the specified physical port of the
-	 * underlying device.
-	 *
-	 * The first PORT item overrides the physical port normally
-	 * associated with the specified DPDK input port (port_id). This
-	 * item can be provided several times to match additional physical
-	 * ports.
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * physical port of the underlying device.
 	 *
-	 * See struct rte_flow_item_port.
+	 * See struct rte_flow_item_phy_port.
 	 */
-	RTE_FLOW_ITEM_TYPE_PORT,
+	RTE_FLOW_ITEM_TYPE_PHY_PORT,
 
 	/**
 	 * Matches a byte string of a given length at a given offset.
@@ -388,13 +383,13 @@ static const struct rte_flow_item_vf rte_flow_item_vf_mask = {
 #endif
 
 /**
- * RTE_FLOW_ITEM_TYPE_PORT
+ * RTE_FLOW_ITEM_TYPE_PHY_PORT
  *
- * Matches packets coming from the specified physical port of the underlying
- * device.
+ * Matches traffic originating from (ingress) or going to (egress) a
+ * physical port of the underlying device.
  *
- * The first PORT item overrides the physical port normally associated with
- * the specified DPDK input port (port_id). This item can be provided
+ * The first PHY_PORT item overrides the physical port normally associated
+ * with the specified DPDK input port (port_id). This item can be provided
  * several times to match additional physical ports.
  *
  * Note that physical ports are not necessarily tied to DPDK input ports
@@ -407,13 +402,13 @@ static const struct rte_flow_item_vf rte_flow_item_vf_mask = {
  *
  * A zeroed mask can be used to match any port index.
  */
-struct rte_flow_item_port {
+struct rte_flow_item_phy_port {
 	uint32_t index; /**< Physical port index. */
 };
 
-/** Default mask for RTE_FLOW_ITEM_TYPE_PORT. */
+/** Default mask for RTE_FLOW_ITEM_TYPE_PHY_PORT. */
 #ifndef __cplusplus
-static const struct rte_flow_item_port rte_flow_item_port_mask = {
+static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = {
 	.index = 0x00000000,
 };
 #endif
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 13/16] ethdev: update behavior of VF/PF in flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (8 preceding siblings ...)
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
@ 2018-04-16 16:22  2%       ` Adrien Mazarguil
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 14/16] ethdev: rename physical port item " Adrien Mazarguil
                         ` (3 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Somnath Kotur, Beilei Xing, Qi Zhang

Contrary to all other pattern items, these are inconsistently documented as
affecting traffic instead of simply matching its origin, without provision
for the latter.

This commit clarifies documentation and updates PMDs since the original
behavior now has to be explicitly requested using the new transfer
attribute.

It breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

Impacted PMDs are bnxt and i40e, for which the VF pattern item is now only
supported when a transfer attribute is also present.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Somnath Kotur <somnath.kotur@broadcom.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 12 +++---
 doc/guides/prog_guide/rte_flow.rst          | 36 +++++++++---------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 12 +++---
 drivers/net/bnxt/bnxt_filter.c              | 22 ++++++-----
 drivers/net/i40e/i40e_flow.c                | 23 +++++++-----
 lib/librte_ether/rte_flow.h                 | 47 ++++++++++--------------
 6 files changed, 77 insertions(+), 75 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 1c6b5a112..41103de67 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -1041,21 +1041,21 @@ static const struct token token_list[] = {
 	},
 	[ITEM_PF] = {
 		.name = "pf",
-		.help = "match packets addressed to the physical function",
+		.help = "match traffic from/to the physical function",
 		.priv = PRIV_ITEM(PF, 0),
 		.next = NEXT(NEXT_ENTRY(ITEM_NEXT)),
 		.call = parse_vc,
 	},
 	[ITEM_VF] = {
 		.name = "vf",
-		.help = "match packets addressed to a virtual function ID",
+		.help = "match traffic from/to a virtual function ID",
 		.priv = PRIV_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 		.next = NEXT(item_vf),
 		.call = parse_vc,
 	},
 	[ITEM_VF_ID] = {
 		.name = "id",
-		.help = "destination VF ID",
+		.help = "VF ID",
 		.next = NEXT(item_vf, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_vf, id)),
 	},
@@ -1686,14 +1686,14 @@ static const struct token token_list[] = {
 	},
 	[ACTION_PF] = {
 		.name = "pf",
-		.help = "redirect packets to physical device function",
+		.help = "direct traffic to physical function",
 		.priv = PRIV_ACTION(PF, 0),
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
 	[ACTION_VF] = {
 		.name = "vf",
-		.help = "redirect packets to virtual device function",
+		.help = "direct traffic to a virtual function ID",
 		.priv = PRIV_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 		.next = NEXT(action_vf),
 		.call = parse_vc,
@@ -1708,7 +1708,7 @@ static const struct token token_list[] = {
 	},
 	[ACTION_VF_ID] = {
 		.name = "id",
-		.help = "VF ID to redirect packets to",
+		.help = "VF ID",
 		.next = NEXT(action_vf, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
 		.call = parse_vc_conf,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 550a4c95b..a0a124aa2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -528,15 +528,12 @@ Usage example, matching non-TCPv4 packets only:
 Item: ``PF``
 ^^^^^^^^^^^^
 
-Matches packets addressed to the physical function of the device.
+Matches traffic originating from (ingress) or going to (egress) the physical
+function of the current device.
 
-If the underlying device function differs from the one that would normally
-receive the matched traffic, specifying this item prevents it from reaching
-that device unless the flow rule contains a `Action: PF`_. Packets are not
-duplicated between device instances by default.
+If supported, should work even if the physical function is not managed by
+the application and thus not associated with a DPDK port ID.
 
-- Likely to return an error or never match any traffic if applied to a VF
-  device.
 - Can be combined with any number of `Item: VF`_ to match both PF and VF
   traffic.
 - ``spec``, ``last`` and ``mask`` must not be set.
@@ -558,15 +555,15 @@ duplicated between device instances by default.
 Item: ``VF``
 ^^^^^^^^^^^^
 
-Matches packets addressed to a virtual function ID of the device.
+Matches traffic originating from (ingress) or going to (egress) a given
+virtual function of the current device.
 
-If the underlying device function differs from the one that would normally
-receive the matched traffic, specifying this item prevents it from reaching
-that device unless the flow rule contains a `Action: VF`_. Packets are not
-duplicated between device instances by default.
+If supported, should work even if the virtual function is not managed by the
+application and thus not associated with a DPDK port ID.
+
+Note this pattern item does not match VF representors traffic which, as
+separate entities, should be addressed through their own DPDK port IDs.
 
-- Likely to return an error or never match any traffic if this causes a VF
-  device to match traffic addressed to a different VF.
 - Can be specified multiple times to match traffic addressed to several VF
   IDs.
 - Can be combined with a PF item to match both PF and VF traffic.
@@ -1395,7 +1392,10 @@ only matching traffic goes through.
 Action: ``PF``
 ^^^^^^^^^^^^^^
 
-Redirects packets to the physical function (PF) of the current device.
+Directs matching traffic to the physical function (PF) of the current
+device.
+
+See `Item: PF`_.
 
 - No configurable properties.
 
@@ -1412,13 +1412,15 @@ Redirects packets to the physical function (PF) of the current device.
 Action: ``VF``
 ^^^^^^^^^^^^^^
 
-Redirects packets to a virtual function (VF) of the current device.
+Directs matching traffic to a given virtual function of the current device.
 
 Packets matched by a VF pattern item can be redirected to their original VF
 ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
+See `Item: VF`_.
+
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1428,7 +1430,7 @@ rule or if packets are not addressed to a VF in the first place.
    +==============+================================+
    | ``original`` | use original VF ID if possible |
    +--------------+--------------------------------+
-   | ``vf``       | VF ID to redirect packets to   |
+   | ``id``       | VF ID                          |
    +--------------+--------------------------------+
 
 Action: ``METER``
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 0bf6c33c9..af37c3d82 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3202,11 +3202,11 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``num {unsigned}``: number of layers covered.
 
-- ``pf``: match packets addressed to the physical function.
+- ``pf``: match traffic from/to the physical function.
 
-- ``vf``: match packets addressed to a virtual function ID.
+- ``vf``: match traffic from/to a virtual function ID.
 
-  - ``id {unsigned}``: destination VF ID.
+  - ``id {unsigned}``: VF ID.
 
 - ``port``: device-specific physical port index to use.
 
@@ -3416,12 +3416,12 @@ This section lists supported actions and their attributes, if any.
 
   - ``queues [{unsigned} [...]] end``: queue indices to use.
 
-- ``pf``: redirect packets to physical device function.
+- ``pf``: direct traffic to physical function.
 
-- ``vf``: redirect packets to virtual device function.
+- ``vf``: direct traffic to a virtual function ID.
 
   - ``original {boolean}``: use original VF ID if possible.
-  - ``id {unsigned}``: VF ID to redirect packets to.
+  - ``id {unsigned}``: VF ID.
 
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index bd166370a..f964b5ea4 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -275,6 +275,7 @@ bnxt_filter_type_check(const struct rte_flow_item pattern[],
 
 static int
 bnxt_validate_and_parse_flow_type(struct bnxt *bp,
+				  const struct rte_flow_attr *attr,
 				  const struct rte_flow_item pattern[],
 				  struct rte_flow_error *error,
 				  struct bnxt_filter_info *filter)
@@ -699,6 +700,16 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 				return -rte_errno;
 			}
 
+			if (!attr->transfer) {
+				rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ITEM,
+					   item,
+					   "Matching VF traffic without"
+					   " affecting it (transfer attribute)"
+					   " is unsupported");
+				return -rte_errno;
+			}
+
 			filter->mirror_vnic_id =
 			dflt_vnic = bnxt_hwrm_func_qcfg_vf_dflt_vnic_id(bp, vf);
 			if (dflt_vnic < 0) {
@@ -746,14 +757,6 @@ bnxt_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
-	if (attr->transfer) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
-				   attr, "No support for transfer.");
-		return -rte_errno;
-	}
-
-	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -833,7 +836,8 @@ bnxt_validate_and_parse_flow(struct rte_eth_dev *dev,
 		goto ret;
 	}
 
-	rc = bnxt_validate_and_parse_flow_type(bp, pattern, error, filter);
+	rc = bnxt_validate_and_parse_flow_type(bp, attr, pattern, error,
+					       filter);
 	if (rc != 0)
 		goto ret;
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index f416b6a00..057e4f96d 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -54,6 +54,7 @@ static int i40e_flow_parse_ethertype_action(struct rte_eth_dev *dev,
 				    struct rte_flow_error *error,
 				    struct rte_eth_ethertype_filter *filter);
 static int i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
+					const struct rte_flow_attr *attr,
 					const struct rte_flow_item *pattern,
 					struct rte_flow_error *error,
 					struct i40e_fdir_filter_conf *filter);
@@ -1918,14 +1919,6 @@ i40e_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
-	if (attr->transfer) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
-				   attr, "No support for transfer.");
-		return -rte_errno;
-	}
-
-	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -2429,6 +2422,7 @@ i40e_flow_fdir_get_pctype_value(struct i40e_pf *pf,
  */
 static int
 i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
+			     const struct rte_flow_attr *attr,
 			     const struct rte_flow_item *pattern,
 			     struct rte_flow_error *error,
 			     struct i40e_fdir_filter_conf *filter)
@@ -2966,6 +2960,16 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 			break;
 		case RTE_FLOW_ITEM_TYPE_VF:
 			vf_spec = item->spec;
+			if (!attr->transfer) {
+				rte_flow_error_set(error, ENOTSUP,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "Matching VF traffic"
+						   " without affecting it"
+						   " (transfer attribute)"
+						   " is unsupported");
+				return -rte_errno;
+			}
 			filter->input.flow_ext.is_vf = 1;
 			filter->input.flow_ext.dst_id = vf_spec->id;
 			if (filter->input.flow_ext.is_vf &&
@@ -3128,7 +3132,8 @@ i40e_flow_parse_fdir_filter(struct rte_eth_dev *dev,
 		&filter->fdir_filter;
 	int ret;
 
-	ret = i40e_flow_parse_fdir_pattern(dev, pattern, error, fdir_filter);
+	ret = i40e_flow_parse_fdir_pattern(dev, attr, pattern, error,
+					   fdir_filter);
 	if (ret)
 		return ret;
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index ab2bf2dce..f1c7a664e 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -152,13 +152,8 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets addressed to the physical function of the device.
-	 *
-	 * If the underlying device function differs from the one that would
-	 * normally receive the matched traffic, specifying this item
-	 * prevents it from reaching that device unless the flow rule
-	 * contains a PF action. Packets are not duplicated between device
-	 * instances by default.
+	 * Matches traffic originating from (ingress) or going to (egress)
+	 * the physical function of the current device.
 	 *
 	 * No associated specification structure.
 	 */
@@ -167,13 +162,8 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets addressed to a virtual function ID of the device.
-	 *
-	 * If the underlying device function differs from the one that would
-	 * normally receive the matched traffic, specifying this item
-	 * prevents it from reaching that device unless the flow rule
-	 * contains a VF action. Packets are not duplicated between device
-	 * instances by default.
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * given virtual function of the current device.
 	 *
 	 * See struct rte_flow_item_vf.
 	 */
@@ -371,15 +361,15 @@ static const struct rte_flow_item_any rte_flow_item_any_mask = {
 /**
  * RTE_FLOW_ITEM_TYPE_VF
  *
- * Matches packets addressed to a virtual function ID of the device.
+ * Matches traffic originating from (ingress) or going to (egress) a given
+ * virtual function of the current device.
  *
- * If the underlying device function differs from the one that would
- * normally receive the matched traffic, specifying this item prevents it
- * from reaching that device unless the flow rule contains a VF
- * action. Packets are not duplicated between device instances by default.
+ * If supported, should work even if the virtual function is not managed by
+ * the application and thus not associated with a DPDK port ID.
+ *
+ * Note this pattern item does not match VF representors traffic which, as
+ * separate entities, should be addressed through their own DPDK port IDs.
  *
- * - Likely to return an error or never match any traffic if this causes a
- *   VF device to match traffic addressed to a different VF.
  * - Can be specified multiple times to match traffic addressed to several
  *   VF IDs.
  * - Can be combined with a PF item to match both PF and VF traffic.
@@ -387,7 +377,7 @@ static const struct rte_flow_item_any rte_flow_item_any_mask = {
  * A zeroed mask can be used to match any VF ID.
  */
 struct rte_flow_item_vf {
-	uint32_t id; /**< Destination VF ID. */
+	uint32_t id; /**< VF ID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_VF. */
@@ -988,16 +978,16 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_RSS,
 
 	/**
-	 * Redirects packets to the physical function (PF) of the current
-	 * device.
+	 * Directs matching traffic to the physical function (PF) of the
+	 * current device.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PF,
 
 	/**
-	 * Redirects packets to the virtual function (VF) of the current
-	 * device with the specified ID.
+	 * Directs matching traffic to a given virtual function of the
+	 * current device.
 	 *
 	 * See struct rte_flow_action_vf.
 	 */
@@ -1111,7 +1101,8 @@ struct rte_flow_action_rss {
 /**
  * RTE_FLOW_ACTION_TYPE_VF
  *
- * Redirects packets to a virtual function (VF) of the current device.
+ * Directs matching traffic to a given virtual function of the current
+ * device.
  *
  * Packets matched by a VF pattern item can be redirected to their original
  * VF ID instead of the specified one. This parameter may not be available
@@ -1122,7 +1113,7 @@ struct rte_flow_action_rss {
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
 	uint32_t reserved:31; /**< Reserved, must be zero. */
-	uint32_t id; /**< VF ID to redirect packets to. */
+	uint32_t id; /**< VF ID. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 12/16] ethdev: add transfer attribute to flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (7 preceding siblings ...)
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
@ 2018-04-16 16:22  2%       ` Adrien Mazarguil
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
                         ` (4 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Andrew Rybchenko

This new attribute enables applications to create flow rules that do not
simply match traffic whose origin is specified in the pattern (e.g. some
non-default physical port or VF), but actively affect it by applying the
flow rule at the lowest possible level in the underlying device.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>

---

v3 changes:

Clarified definition for ingress and egress following Andrew's comment on
subsequent patch.

[1] http://dpdk.org/ml/archives/dev/2018-April/095961.html
---
 app/test-pmd/cmdline_flow.c                 | 11 +++++
 app/test-pmd/config.c                       |  6 ++-
 doc/guides/prog_guide/rte_flow.rst          | 26 +++++++++++-
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 11 ++---
 drivers/net/bnxt/bnxt_filter.c              |  8 ++++
 drivers/net/e1000/igb_flow.c                | 44 ++++++++++++++++++++
 drivers/net/enic/enic_flow.c                |  6 +++
 drivers/net/i40e/i40e_flow.c                |  8 ++++
 drivers/net/ixgbe/ixgbe_flow.c              | 53 ++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_flow.c                |  4 ++
 drivers/net/mlx5/mlx5_flow.c                |  7 ++++
 drivers/net/mvpp2/mrvl_flow.c               |  6 +++
 drivers/net/sfc/sfc_flow.c                  |  6 +++
 drivers/net/tap/tap_flow.c                  |  6 +++
 lib/librte_ether/rte_flow.h                 | 22 +++++++++-
 15 files changed, 215 insertions(+), 9 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index f8f2a559e..1c6b5a112 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -69,6 +69,7 @@ enum index {
 	PRIORITY,
 	INGRESS,
 	EGRESS,
+	TRANSFER,
 
 	/* Validate/create pattern. */
 	PATTERN,
@@ -407,6 +408,7 @@ static const enum index next_vc_attr[] = {
 	PRIORITY,
 	INGRESS,
 	EGRESS,
+	TRANSFER,
 	PATTERN,
 	ZERO,
 };
@@ -960,6 +962,12 @@ static const struct token token_list[] = {
 		.next = NEXT(next_vc_attr),
 		.call = parse_vc,
 	},
+	[TRANSFER] = {
+		.name = "transfer",
+		.help = "apply rule directly to endpoints found in pattern",
+		.next = NEXT(next_vc_attr),
+		.call = parse_vc,
+	},
 	/* Validate/create pattern. */
 	[PATTERN] = {
 		.name = "pattern",
@@ -1945,6 +1953,9 @@ parse_vc(struct context *ctx, const struct token *token,
 	case EGRESS:
 		out->args.vc.attr.egress = 1;
 		return len;
+	case TRANSFER:
+		out->args.vc.attr.transfer = 1;
+		return len;
 	case PATTERN:
 		out->args.vc.pattern =
 			(void *)RTE_ALIGN_CEIL((uintptr_t)(out + 1),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 562fb2f8d..a50a5c544 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1239,6 +1239,7 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
 		[RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
+		[RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
 		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
@@ -1504,12 +1505,13 @@ port_flow_list(portid_t port_id, uint32_t n, const uint32_t group[n])
 		const struct rte_flow_item *item = pf->pattern;
 		const struct rte_flow_action *action = pf->actions;
 
-		printf("%" PRIu32 "\t%" PRIu32 "\t%" PRIu32 "\t%c%c\t",
+		printf("%" PRIu32 "\t%" PRIu32 "\t%" PRIu32 "\t%c%c%c\t",
 		       pf->id,
 		       pf->attr.group,
 		       pf->attr.priority,
 		       pf->attr.ingress ? 'i' : '-',
-		       pf->attr.egress ? 'e' : '-');
+		       pf->attr.egress ? 'e' : '-',
+		       pf->attr.transfer ? 't' : '-');
 		while (item->type != RTE_FLOW_ITEM_TYPE_END) {
 			if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
 				printf("%s ", flow_item[item->type].name);
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index c62a80566..550a4c95b 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -170,7 +170,13 @@ Note that support for more than a single priority level is not guaranteed.
 Attribute: Traffic direction
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Flow rules can apply to inbound and/or outbound traffic (ingress/egress).
+Flow rule patterns apply to inbound and/or outbound traffic.
+
+In the context of this API, **ingress** and **egress** respectively stand
+for **inbound** and **outbound** based on the standpoint of the application
+creating a flow rule.
+
+There are no exceptions to this definition.
 
 Several pattern items and actions are valid and can be used in both
 directions. At least one direction must be specified.
@@ -178,6 +184,24 @@ directions. At least one direction must be specified.
 Specifying both directions at once for a given rule is not recommended but
 may be valid in a few cases (e.g. shared counters).
 
+Attribute: Transfer
+^^^^^^^^^^^^^^^^^^^
+
+Instead of simply matching the properties of traffic as it would appear on a
+given DPDK port ID, enabling this attribute transfers a flow rule to the
+lowest possible level of any device endpoints found in the pattern.
+
+When supported, this effectively enables an application to reroute traffic
+not necessarily intended for it (e.g. coming from or addressed to different
+physical ports, VFs or applications) at the device level.
+
+It complements the behavior of some pattern items such as `Item: PORT`_ and
+is meaningless without them.
+
+When transferring flow rules, **ingress** and **egress** attributes
+(`Attribute: Traffic direction`_) keep their original meaning, as if
+processing traffic emitted or received by the application.
+
 Pattern item
 ~~~~~~~~~~~~
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 923664f7d..0bf6c33c9 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -2970,14 +2970,14 @@ following sections.
 - Check whether a flow rule can be created::
 
    flow validate {port_id}
-       [group {group_id}] [priority {level}] [ingress] [egress]
+       [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
        pattern {item} [/ {item} [...]] / end
        actions {action} [/ {action} [...]] / end
 
 - Create a flow rule::
 
    flow create {port_id}
-       [group {group_id}] [priority {level}] [ingress] [egress]
+       [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
        pattern {item} [/ {item} [...]] / end
        actions {action} [/ {action} [...]] / end
 
@@ -3010,7 +3010,7 @@ underlying device in its current state but stops short of creating it. It is
 bound to ``rte_flow_validate()``::
 
    flow validate {port_id}
-      [group {group_id}] [priority {level}] [ingress] [egress]
+      [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
       pattern {item} [/ {item} [...]] / end
       actions {action} [/ {action} [...]] / end
 
@@ -3047,7 +3047,7 @@ Creating flow rules
 to ``rte_flow_create()``::
 
    flow create {port_id}
-      [group {group_id}] [priority {level}] [ingress] [egress]
+      [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
       pattern {item} [/ {item} [...]] / end
       actions {action} [/ {action} [...]] / end
 
@@ -3061,7 +3061,7 @@ Otherwise it will show an error message of the form::
 
 Parameters describe in the following order:
 
-- Attributes (*group*, *priority*, *ingress*, *egress* tokens).
+- Attributes (*group*, *priority*, *ingress*, *egress*, *transfer* tokens).
 - A matching pattern, starting with the *pattern* token and terminated by an
   *end* pattern item.
 - Actions, starting with the *actions* token and terminated by an *end*
@@ -3089,6 +3089,7 @@ specified before the ``pattern`` token.
 - ``priority {level}``: priority level within group.
 - ``ingress``: rule applies to ingress traffic.
 - ``egress``: rule applies to egress traffic.
+- ``transfer``: apply rule directly to endpoints found in pattern.
 
 Each instance of an attribute specified several times overrides the previous
 value as shown below (group 4 is used)::
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 9bb1575cb..bd166370a 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -746,6 +746,14 @@ bnxt_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index d1c0b4b8d..073852913 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -379,6 +379,15 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -624,6 +633,14 @@ cons_parse_ethertype_filter(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -923,6 +940,15 @@ cons_parse_syn_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_syn_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	/* Support 2 priorities, the lowest or highest. */
 	if (!attr->priority) {
 		filter->hig_pri = 0;
@@ -1211,6 +1237,15 @@ cons_parse_flex_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_flex_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_flex_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -1361,6 +1396,15 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(rss_conf, 0, sizeof(struct igb_rte_flow_rss_conf));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(rss_conf, 0, sizeof(struct igb_rte_flow_rss_conf));
 		rte_flow_error_set(error, EINVAL,
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index 20d6b9d59..3a0086399 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -1318,6 +1318,12 @@ enic_flow_parse(struct rte_eth_dev *dev,
 					   NULL,
 					   "egress is not supported");
 			return -rte_errno;
+		} else if (attrs->transfer) {
+			rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+					   NULL,
+					   "transfer is not supported");
+			return -rte_errno;
 		} else if (!attrs->ingress) {
 			rte_flow_error_set(error, ENOTSUP,
 					   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 470ab93d6..f416b6a00 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -1918,6 +1918,14 @@ i40e_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 438bfcdfb..eb0644c82 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -557,6 +557,15 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -787,6 +796,14 @@ cons_parse_ethertype_filter(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -1078,6 +1095,15 @@ cons_parse_syn_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_syn_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	/* Support 2 priorities, the lowest or highest. */
 	if (!attr->priority) {
 		filter->hig_pri = 0;
@@ -1250,6 +1276,15 @@ cons_parse_l2_tn_filter(struct rte_eth_dev *dev,
 	}
 
 	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_l2_tunnel_conf));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* not supported */
 	if (attr->priority) {
 		memset(filter, 0, sizeof(struct rte_eth_l2_tunnel_conf));
 		rte_flow_error_set(error, EINVAL,
@@ -1354,6 +1389,15 @@ ixgbe_parse_fdir_act_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* not supported */
+	if (attr->transfer) {
+		memset(rule, 0, sizeof(struct ixgbe_fdir_rule));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* not supported */
 	if (attr->priority) {
 		memset(rule, 0, sizeof(struct ixgbe_fdir_rule));
 		rte_flow_error_set(error, EINVAL,
@@ -2829,6 +2873,15 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(rss_conf, 0, sizeof(struct ixgbe_rte_flow_rss_conf));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(rss_conf, 0, sizeof(struct ixgbe_rte_flow_rss_conf));
 		rte_flow_error_set(error, EINVAL,
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index ce36ac715..e3d7aa8ef 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -652,6 +652,10 @@ mlx4_flow_prepare(struct priv *priv,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
 			 NULL, "egress is not supported");
+	if (attr->transfer)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			 NULL, "transfer is not supported");
 	if (!attr->ingress)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 4a2411010..968bef746 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -568,6 +568,13 @@ mlx5_flow_convert_attributes(const struct rte_flow_attr *attr,
 				   "egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   NULL,
+				   "transfer is not supported");
+		return -rte_errno;
+	}
 	if (!attr->ingress) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/mvpp2/mrvl_flow.c b/drivers/net/mvpp2/mrvl_flow.c
index 6478eb2fe..a2e2129cc 100644
--- a/drivers/net/mvpp2/mrvl_flow.c
+++ b/drivers/net/mvpp2/mrvl_flow.c
@@ -2187,6 +2187,12 @@ mrvl_flow_parse_attr(struct mrvl_priv *priv __rte_unused,
 				   "Egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, NULL,
+				   "Transfer is not supported");
+		return -rte_errno;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index cd6a61b39..bcde2c2f7 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1116,6 +1116,12 @@ sfc_flow_parse_attr(const struct rte_flow_attr *attr,
 				   "Egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer != 0) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, attr,
+				   "Transfer is not supported");
+		return -rte_errno;
+	}
 	if (attr->ingress == 0) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, attr,
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index dff09313a..ad2ba9f4e 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1039,6 +1039,12 @@ priv_flow_process(struct pmd_internals *pmd,
 	};
 	int action = 0; /* Only one action authorized for now */
 
+	if (attr->transfer) {
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			NULL, "transfer is not supported");
+		return -rte_errno;
+	}
 	if (attr->group > MAX_GROUP) {
 		rte_flow_error_set(
 			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 513734dce..ab2bf2dce 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -72,7 +72,26 @@ struct rte_flow_attr {
 	uint32_t priority; /**< Priority level within group. */
 	uint32_t ingress:1; /**< Rule applies to ingress traffic. */
 	uint32_t egress:1; /**< Rule applies to egress traffic. */
-	uint32_t reserved:30; /**< Reserved, must be zero. */
+	/**
+	 * Instead of simply matching the properties of traffic as it would
+	 * appear on a given DPDK port ID, enabling this attribute transfers
+	 * a flow rule to the lowest possible level of any device endpoints
+	 * found in the pattern.
+	 *
+	 * When supported, this effectively enables an application to
+	 * re-route traffic not necessarily intended for it (e.g. coming
+	 * from or addressed to different physical ports, VFs or
+	 * applications) at the device level.
+	 *
+	 * It complements the behavior of some pattern items such as
+	 * RTE_FLOW_ITEM_TYPE_PORT and is meaningless without them.
+	 *
+	 * When transferring flow rules, ingress and egress attributes keep
+	 * their original meaning, as if processing traffic emitted or
+	 * received by the application.
+	 */
+	uint32_t transfer:1;
+	uint32_t reserved:29; /**< Reserved, must be zero. */
 };
 
 /**
@@ -1181,6 +1200,7 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, /**< Priority field. */
 	RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, /**< Ingress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
+	RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, /**< Transfer field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
 	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 10/16] ethdev: refine TPID handling in flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (6 preceding siblings ...)
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 09/16] ethdev: add encap level " Adrien Mazarguil
@ 2018-04-16 16:22  1%       ` Adrien Mazarguil
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
                         ` (5 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Ajit Khaparde, Somnath Kotur,
	John Daley, Hyong Youb Kim, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Tomasz Duszynski, Dmitri Epshtein, Natalie Samsonov, Jianbo Liu,
	Andrew Rybchenko, Pascal Mazon

TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
consistent with the normal stacking order of pattern items, which is
confusing to applications.

Problem is that when followed by one of these layers, the EtherType field
of the preceding layer keeps its "inner" definition, and the "outer" TPID
is provided by the subsequent layer, the reverse of how a packet looks like
on the wire:

 Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
 rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]

Worse, when QinQ is involved, the stacking order of VLAN layers is
unspecified. It is unclear whether it should be reversed (innermost to
outermost) as well given TPID applies to the previous layer:

 Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
 rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
 rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]

While specifying EtherType/TPID is hopefully rarely necessary, the stacking
order in case of QinQ and the lack of documentation remain an issue.

This patch replaces TPID in the VLAN pattern item with an inner
EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
clarifies documentation and updates all relevant code.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
items:

- bnxt: EtherType matching is supported with and without VLAN, but TPID
  matching is not and triggers an error.

- e1000: EtherType matching is only supported with the ETHERTYPE filter,
  which does not support VLAN matching, therefore no impact.

- enic: same as bnxt.

- i40e: same as bnxt with existing FDIR limitations on allowed EtherType
  values. The remaining filter types (VXLAN, NVGRE, QINQ) do not support
  EtherType matching.

- ixgbe: same as e1000, with additional minor change to rely on the new
  E-Tag macro definition.

- mlx4: EtherType/TPID matching is not supported, no impact.

- mlx5: same as bnxt.

- mvpp2: same as bnxt.

- sfc: same as bnxt.

- tap: same as bnxt.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Somnath Kotur <somnath.kotur@broadcom.com>
Cc: John Daley <johndale@cisco.com>
Cc: Hyong Youb Kim <hyonkim@cisco.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Tomasz Duszynski <tdu@semihalf.com>
Cc: Dmitri Epshtein <dima@marvell.com>
Cc: Natalie Samsonov <nsamsono@marvell.com>
Cc: Jianbo Liu <jianbo.liu@arm.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>

---

v3 changes:

Updated mrvl to mvpp2.

Moved unrelated default TCI mask update to separate patch.

Fixed sfc according to Andrew's comments [1], which made so much sense that
I standardized on the same behavior for all other PMDs: matching outer TPID
is never supported when a VLAN pattern item is present.

This is done because many devices accept several TPIDs but do not provide
means to match a given one explicitly, it's all or nothing, and that makes
the resulting flow rule inaccurate.

[1] http://dpdk.org/ml/archives/dev/2018-April/095870.html
---
 app/test-pmd/cmdline_flow.c                 | 17 +++----
 doc/guides/nics/tap.rst                     |  2 +-
 doc/guides/prog_guide/rte_flow.rst          | 19 ++++++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
 drivers/net/bnxt/bnxt_filter.c              | 35 +++++++++++---
 drivers/net/enic/enic_flow.c                | 19 +++++---
 drivers/net/i40e/i40e_flow.c                | 60 ++++++++++++++++++++----
 drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
 drivers/net/mlx5/mlx5_flow.c                | 13 ++++-
 drivers/net/mvpp2/mrvl_flow.c               | 26 +++++++---
 drivers/net/sfc/sfc_flow.c                  | 18 +++++++
 drivers/net/tap/tap_flow.c                  | 14 ++++--
 lib/librte_ether/rte_flow.h                 | 22 ++++++---
 lib/librte_net/rte_ether.h                  |  1 +
 14 files changed, 198 insertions(+), 55 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 976fde7cd..f8f2a559e 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -99,11 +99,11 @@ enum index {
 	ITEM_ETH_SRC,
 	ITEM_ETH_TYPE,
 	ITEM_VLAN,
-	ITEM_VLAN_TPID,
 	ITEM_VLAN_TCI,
 	ITEM_VLAN_PCP,
 	ITEM_VLAN_DEI,
 	ITEM_VLAN_VID,
+	ITEM_VLAN_INNER_TYPE,
 	ITEM_IPV4,
 	ITEM_IPV4_TOS,
 	ITEM_IPV4_TTL,
@@ -505,11 +505,11 @@ static const enum index item_eth[] = {
 };
 
 static const enum index item_vlan[] = {
-	ITEM_VLAN_TPID,
 	ITEM_VLAN_TCI,
 	ITEM_VLAN_PCP,
 	ITEM_VLAN_DEI,
 	ITEM_VLAN_VID,
+	ITEM_VLAN_INNER_TYPE,
 	ITEM_NEXT,
 	ZERO,
 };
@@ -1142,12 +1142,6 @@ static const struct token token_list[] = {
 		.next = NEXT(item_vlan),
 		.call = parse_vc,
 	},
-	[ITEM_VLAN_TPID] = {
-		.name = "tpid",
-		.help = "tag protocol identifier",
-		.next = NEXT(item_vlan, NEXT_ENTRY(UNSIGNED), item_param),
-		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vlan, tpid)),
-	},
 	[ITEM_VLAN_TCI] = {
 		.name = "tci",
 		.help = "tag control information",
@@ -1175,6 +1169,13 @@ static const struct token token_list[] = {
 		.args = ARGS(ARGS_ENTRY_MASK_HTON(struct rte_flow_item_vlan,
 						  tci, "\x0f\xff")),
 	},
+	[ITEM_VLAN_INNER_TYPE] = {
+		.name = "inner_type",
+		.help = "inner EtherType",
+		.next = NEXT(item_vlan, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vlan,
+					     inner_type)),
+	},
 	[ITEM_IPV4] = {
 		.name = "ipv4",
 		.help = "match IPv4 header",
diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index c97786aca..3f7a15147 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -108,7 +108,7 @@ The kernel support can be checked with this command::
 Supported items:
 
 - eth: src and dst (with variable masks), and eth_type (0xffff mask).
-- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9)
+- vlan: vid, pcp, but not eid. (requires kernel 4.9)
 - ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
 - udp/tcp: src and dst port (0xffff) mask.
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 1a09e8a0f..fd317b48c 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -784,9 +784,15 @@ Item: ``ETH``
 
 Matches an Ethernet header.
 
+The ``type`` field either stands for "EtherType" or "TPID" when followed by
+so-called layer 2.5 pattern items such as ``RTE_FLOW_ITEM_TYPE_VLAN``. In
+the latter case, ``type`` refers to that of the outer header, with the inner
+EtherType/TPID provided by the subsequent pattern item. This is the same
+order as on the wire.
+
 - ``dst``: destination MAC.
 - ``src``: source MAC.
-- ``type``: EtherType.
+- ``type``: EtherType or TPID.
 - Default ``mask`` matches destination and source addresses only.
 
 Item: ``VLAN``
@@ -794,8 +800,12 @@ Item: ``VLAN``
 
 Matches an 802.1Q/ad VLAN tag.
 
-- ``tpid``: tag protocol identifier.
+The corresponding standard outer EtherType (TPID) values are
+``ETHER_TYPE_VLAN`` or ``ETHER_TYPE_QINQ``. It can be overridden by the
+preceding pattern item.
+
 - ``tci``: tag control information.
+- ``inner_type``: inner EtherType or TPID.
 - Default ``mask`` matches TCI only.
 
 Item: ``IPV4``
@@ -866,12 +876,15 @@ Item: ``E_TAG``
 
 Matches an IEEE 802.1BR E-Tag header.
 
-- ``tpid``: tag protocol identifier (0x893F)
+The corresponding standard outer EtherType (TPID) value is
+``ETHER_TYPE_ETAG``. It can be overridden by the preceding pattern item.
+
 - ``epcp_edei_in_ecid_b``: E-Tag control information (E-TCI), E-PCP (3b),
   E-DEI (1b), ingress E-CID base (12b).
 - ``rsvd_grp_ecid_b``: reserved (2b), GRP (2b), E-CID base (12b).
 - ``in_ecid_e``: ingress E-CID ext.
 - ``ecid_e``: E-CID ext.
+- ``inner_type``: inner EtherType or TPID.
 - Default ``mask`` simultaneously matches GRP and E-CID base.
 
 Item: ``NVGRE``
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 3b1073bfc..923664f7d 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3223,15 +3223,15 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``dst {MAC-48}``: destination MAC.
   - ``src {MAC-48}``: source MAC.
-  - ``type {unsigned}``: EtherType.
+  - ``type {unsigned}``: EtherType or TPID.
 
 - ``vlan``: match 802.1Q/ad VLAN tag.
 
-  - ``tpid {unsigned}``: tag protocol identifier.
   - ``tci {unsigned}``: tag control information.
   - ``pcp {unsigned}``: priority code point.
   - ``dei {unsigned}``: drop eligible indicator.
   - ``vid {unsigned}``: VLAN identifier.
+  - ``inner_type {unsigned}``: inner EtherType or TPID.
 
 - ``ipv4``: match IPv4 header.
 
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 0f9c1c9ae..9bb1575cb 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -299,6 +299,7 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 	uint32_t vf = 0;
 	int use_ntuple;
 	uint32_t en = 0;
+	uint32_t en_ethertype;
 	int dflt_vnic;
 
 	use_ntuple = bnxt_filter_type_check(pattern, error);
@@ -308,6 +309,9 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 
 	filter->filter_type = use_ntuple ?
 		HWRM_CFA_NTUPLE_FILTER : HWRM_CFA_EM_FILTER;
+	en_ethertype = use_ntuple ?
+		NTUPLE_FLTR_ALLOC_INPUT_EN_ETHERTYPE :
+		EM_FLOW_ALLOC_INPUT_EN_ETHERTYPE;
 
 	while (item->type != RTE_FLOW_ITEM_TYPE_END) {
 		if (item->last) {
@@ -377,30 +381,49 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 			if (eth_mask->type) {
 				filter->ethertype =
 					rte_be_to_cpu_16(eth_spec->type);
-				en |= use_ntuple ?
-					NTUPLE_FLTR_ALLOC_INPUT_EN_ETHERTYPE :
-					EM_FLOW_ALLOC_INPUT_EN_ETHERTYPE;
+				en |= en_ethertype;
 			}
 
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
+			if (en & en_ethertype) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "VLAN TPID matching is not"
+						   " supported");
+				return -rte_errno;
+			}
 			if (vlan_mask->tci &&
-			    vlan_mask->tci == RTE_BE16(0x0fff) &&
-			    !vlan_mask->tpid) {
+			    vlan_mask->tci == RTE_BE16(0x0fff)) {
 				/* Only the VLAN ID can be matched. */
 				filter->l2_ovlan =
 					rte_be_to_cpu_16(vlan_spec->tci &
 							 RTE_BE16(0x0fff));
 				en |= EM_FLOW_ALLOC_INPUT_EN_OVLAN_VID;
-			} else if (vlan_mask->tci || vlan_mask->tpid) {
+			} else if (vlan_mask->tci) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
 						   "VLAN mask is invalid");
 				return -rte_errno;
 			}
+			if (vlan_mask->inner_type &&
+			    vlan_mask->inner_type != RTE_BE16(0xffff)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "inner ethertype mask not"
+						   " valid");
+				return -rte_errno;
+			}
+			if (vlan_mask->inner_type) {
+				filter->ethertype =
+					rte_be_to_cpu_16(vlan_spec->inner_type);
+				en |= en_ethertype;
+			}
 
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV4:
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index a5c6a1670..20d6b9d59 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -557,16 +557,21 @@ enic_copy_item_vlan_v2(const struct rte_flow_item *item,
 	if (!spec)
 		return 0;
 
-	/* Don't support filtering in tpid */
-	if (mask) {
-		if (mask->tpid != 0)
-			return ENOTSUP;
-	} else {
+	if (!mask)
 		mask = &rte_flow_item_vlan_mask;
-		RTE_ASSERT(mask->tpid == 0);
-	}
 
 	if (*inner_ofst == 0) {
+		struct ether_hdr *eth_mask =
+			(void *)gp->layer[FILTER_GENERIC_1_L2].mask;
+		struct ether_hdr *eth_val =
+			(void *)gp->layer[FILTER_GENERIC_1_L2].val;
+
+		/* Outer TPID cannot be matched */
+		if (eth_mask->ether_type)
+			return ENOTSUP;
+		eth_mask->ether_type = mask->inner_type;
+		eth_val->ether_type = spec->inner_type;
+
 		/* Outer header. Use the vlan mask/val fields */
 		gp->mask_vlan = mask->tci;
 		gp->val_vlan = spec->tci;
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index db668835d..470ab93d6 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -10,6 +10,7 @@
 #include <unistd.h>
 #include <stdarg.h>
 
+#include <rte_debug.h>
 #include <rte_ether.h>
 #include <rte_ethdev_driver.h>
 #include <rte_log.h>
@@ -2491,16 +2492,22 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						      "Invalid MAC_addr mask.");
 					return -rte_errno;
 				}
+			}
+			if (eth_spec && eth_mask && eth_mask->type) {
+				enum rte_flow_item_type next = (item + 1)->type;
 
-				if ((eth_mask->type & UINT16_MAX) ==
-				    UINT16_MAX) {
-					input_set |= I40E_INSET_LAST_ETHER_TYPE;
-					filter->input.flow.l2_flow.ether_type =
-						eth_spec->type;
+				if (eth_mask->type != RTE_BE16(0xffff)) {
+					rte_flow_error_set(error, EINVAL,
+						      RTE_FLOW_ERROR_TYPE_ITEM,
+						      item,
+						      "Invalid type mask.");
+					return -rte_errno;
 				}
 
 				ether_type = rte_be_to_cpu_16(eth_spec->type);
-				if (ether_type == ETHER_TYPE_IPv4 ||
+
+				if (next == RTE_FLOW_ITEM_TYPE_VLAN ||
+				    ether_type == ETHER_TYPE_IPv4 ||
 				    ether_type == ETHER_TYPE_IPv6 ||
 				    ether_type == ETHER_TYPE_ARP ||
 				    ether_type == outer_tpid) {
@@ -2510,6 +2517,9 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						     "Unsupported ether_type.");
 					return -rte_errno;
 				}
+				input_set |= I40E_INSET_LAST_ETHER_TYPE;
+				filter->input.flow.l2_flow.ether_type =
+					eth_spec->type;
 			}
 
 			pctype = I40E_FILTER_PCTYPE_L2_PAYLOAD;
@@ -2519,6 +2529,8 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
+
+			RTE_ASSERT(!(input_set & I40E_INSET_LAST_ETHER_TYPE));
 			if (vlan_spec && vlan_mask) {
 				if (vlan_mask->tci ==
 				    rte_cpu_to_be_16(I40E_TCI_MASK)) {
@@ -2527,6 +2539,33 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						vlan_spec->tci;
 				}
 			}
+			if (vlan_spec && vlan_mask && vlan_mask->inner_type) {
+				if (vlan_mask->inner_type != RTE_BE16(0xffff)) {
+					rte_flow_error_set(error, EINVAL,
+						      RTE_FLOW_ERROR_TYPE_ITEM,
+						      item,
+						      "Invalid inner_type"
+						      " mask.");
+					return -rte_errno;
+				}
+
+				ether_type =
+					rte_be_to_cpu_16(vlan_spec->inner_type);
+
+				if (ether_type == ETHER_TYPE_IPv4 ||
+				    ether_type == ETHER_TYPE_IPv6 ||
+				    ether_type == ETHER_TYPE_ARP ||
+				    ether_type == outer_tpid) {
+					rte_flow_error_set(error, EINVAL,
+						     RTE_FLOW_ERROR_TYPE_ITEM,
+						     item,
+						     "Unsupported inner_type.");
+					return -rte_errno;
+				}
+				input_set |= I40E_INSET_LAST_ETHER_TYPE;
+				filter->input.flow.l2_flow.ether_type =
+					vlan_spec->inner_type;
+			}
 
 			pctype = I40E_FILTER_PCTYPE_L2_PAYLOAD;
 			layer_idx = I40E_FLXPLD_L2_IDX;
@@ -3285,7 +3324,8 @@ i40e_flow_parse_vxlan_pattern(__rte_unused struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
@@ -3515,7 +3555,8 @@ i40e_flow_parse_nvgre_pattern(__rte_unused struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
@@ -4023,7 +4064,8 @@ i40e_flow_parse_qinq_pattern(__rte_unused struct rte_eth_dev *dev,
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
 
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 					   RTE_FLOW_ERROR_TYPE_ITEM,
 					   item,
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 25a8d041d..ac2204971 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -115,7 +115,6 @@
 
 #define IXGBE_VT_CTL_POOLING_MODE_MASK         0x00030000
 #define IXGBE_VT_CTL_POOLING_MODE_ETAG         0x00010000
-#define DEFAULT_ETAG_ETYPE                     0x893f
 #define IXGBE_ETAG_ETYPE                       0x00005084
 #define IXGBE_ETAG_ETYPE_MASK                  0x0000ffff
 #define IXGBE_ETAG_ETYPE_VALID                 0x80000000
@@ -1481,7 +1480,7 @@ static int ixgbe_l2_tn_filter_init(struct rte_eth_dev *eth_dev)
 	}
 	l2_tn_info->e_tag_en = FALSE;
 	l2_tn_info->e_tag_fwd_en = FALSE;
-	l2_tn_info->e_tag_ether_type = DEFAULT_ETAG_ETYPE;
+	l2_tn_info->e_tag_ether_type = ETHER_TYPE_ETAG;
 
 	return 0;
 }
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 76a1053ec..4a2411010 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@
 #endif
 
 #include <rte_common.h>
+#include <rte_ether.h>
 #include <rte_eth_ctrl.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow.h>
@@ -306,6 +307,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.actions = valid_actions,
 		.mask = &(const struct rte_flow_item_vlan){
 			.tci = -1,
+			.inner_type = -1,
 		},
 		.default_mask = &rte_flow_item_vlan_mask,
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
@@ -1285,6 +1287,7 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 	struct mlx5_flow_parse *parser = data->parser;
 	struct ibv_flow_spec_eth *eth;
 	const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
+	const char *msg = "VLAN cannot be empty";
 
 	if (spec) {
 		unsigned int i;
@@ -1306,12 +1309,20 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 			 */
 			if (!eth->mask.vlan_tag)
 				goto error;
+			/* Outer TPID cannot be matched. */
+			if (eth->mask.ether_type) {
+				msg = "VLAN TPID matching is not supported";
+				goto error;
+			}
+			eth->val.ether_type = spec->inner_type;
+			eth->mask.ether_type = mask->inner_type;
+			eth->val.ether_type &= eth->mask.ether_type;
 		}
 		return 0;
 	}
 error:
 	return rte_flow_error_set(data->error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM,
-				  item, "VLAN cannot be empty");
+				  item, msg);
 }
 
 /**
diff --git a/drivers/net/mvpp2/mrvl_flow.c b/drivers/net/mvpp2/mrvl_flow.c
index 8fd4dbfb1..6478eb2fe 100644
--- a/drivers/net/mvpp2/mrvl_flow.c
+++ b/drivers/net/mvpp2/mrvl_flow.c
@@ -1091,12 +1091,6 @@ mrvl_parse_vlan(const struct rte_flow_item *item,
 	if (ret)
 		return ret;
 
-	if (mask->tpid) {
-		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM,
-				   NULL, "Not supported by classifier\n");
-		return -rte_errno;
-	}
-
 	m = rte_be_to_cpu_16(mask->tci);
 	if (m & MRVL_VLAN_ID_MASK) {
 		RTE_LOG(WARNING, PMD, "vlan id mask is ignored\n");
@@ -1112,6 +1106,26 @@ mrvl_parse_vlan(const struct rte_flow_item *item,
 			goto out;
 	}
 
+	if (flow->pattern & F_TYPE) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "VLAN TPID matching is not supported\n");
+		return -rte_errno;
+	}
+	if (mask->inner_type) {
+		struct rte_flow_item_eth spec_eth = {
+			.type = spec->inner_type,
+		};
+		struct rte_flow_item_eth mask_eth = {
+			.type = mask->inner_type,
+		};
+
+		RTE_LOG(WARNING, PMD, "inner eth type mask is ignored\n");
+		ret = mrvl_parse_type(spec_eth, mask_eth, flow);
+		if (ret)
+			goto out;
+	}
+
 	return 0;
 out:
 	rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 3028efbf9..cd6a61b39 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -7,6 +7,7 @@
  * for Solarflare) and Solarflare Communications, Inc.
  */
 
+#include <rte_byteorder.h>
 #include <rte_tailq.h>
 #include <rte_common.h>
 #include <rte_ethdev_driver.h>
@@ -351,6 +352,7 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
 	const struct rte_flow_item_vlan *mask = NULL;
 	const struct rte_flow_item_vlan supp_mask = {
 		.tci = rte_cpu_to_be_16(ETH_VLAN_ID_MAX),
+		.inner_type = RTE_BE16(0xffff),
 	};
 
 	rc = sfc_flow_parse_init(item,
@@ -393,6 +395,22 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
 		return -rte_errno;
 	}
 
+	if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "VLAN TPID matching is not supported");
+		return -rte_errno;
+	}
+	if (mask->inner_type == supp_mask.inner_type) {
+		efx_spec->efs_match_flags |= EFX_FILTER_MATCH_ETHER_TYPE;
+		efx_spec->efs_ether_type = rte_bswap16(spec->inner_type);
+	} else if (mask->inner_type) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "Bad mask for VLAN inner_type");
+		return -rte_errno;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 7dfaf9ac5..dff09313a 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -270,13 +270,13 @@ static const struct tap_flow_items tap_flow_items[] = {
 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
 			       RTE_FLOW_ITEM_TYPE_IPV6),
 		.mask = &(const struct rte_flow_item_vlan){
-			.tpid = -1,
 			/* DEI matching is not supported */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 			.tci = 0xffef,
 #else
 			.tci = 0xefff,
 #endif
+			.inner_type = -1,
 		},
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
 		.default_mask = &rte_flow_item_vlan_mask,
@@ -578,13 +578,19 @@ tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
 	/* use default mask if none provided */
 	if (!mask)
 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
-	/* TC does not support tpid masking. Only accept if exact match. */
-	if (mask->tpid && mask->tpid != 0xffff)
+	/* Outer TPID cannot be matched. */
+	if (info->eth_type)
 		return -1;
 	/* Double-tagging not supported. */
-	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+	if (info->vlan)
 		return -1;
 	info->vlan = 1;
+	if (mask->inner_type) {
+		/* TC does not support partial eth_type masking */
+		if (mask->inner_type != RTE_BE16(0xffff))
+			return -1;
+		info->eth_type = spec->inner_type;
+	}
 	if (!flow)
 		return 0;
 	msg = &flow->msg;
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index d0ff26aa3..8e50384d0 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -454,11 +454,17 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
  * RTE_FLOW_ITEM_TYPE_ETH
  *
  * Matches an Ethernet header.
+ *
+ * The @p type field either stands for "EtherType" or "TPID" when followed
+ * by so-called layer 2.5 pattern items such as RTE_FLOW_ITEM_TYPE_VLAN. In
+ * the latter case, @p type refers to that of the outer header, with the
+ * inner EtherType/TPID provided by the subsequent pattern item. This is the
+ * same order as on the wire.
  */
 struct rte_flow_item_eth {
 	struct ether_addr dst; /**< Destination MAC. */
 	struct ether_addr src; /**< Source MAC. */
-	rte_be16_t type; /**< EtherType. */
+	rte_be16_t type; /**< EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_ETH. */
@@ -475,19 +481,20 @@ static const struct rte_flow_item_eth rte_flow_item_eth_mask = {
  *
  * Matches an 802.1Q/ad VLAN tag.
  *
- * This type normally follows either RTE_FLOW_ITEM_TYPE_ETH or
- * RTE_FLOW_ITEM_TYPE_VLAN.
+ * The corresponding standard outer EtherType (TPID) values are
+ * ETHER_TYPE_VLAN or ETHER_TYPE_QINQ. It can be overridden by the preceding
+ * pattern item.
  */
 struct rte_flow_item_vlan {
-	rte_be16_t tpid; /**< Tag protocol identifier. */
 	rte_be16_t tci; /**< Tag control information. */
+	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_VLAN. */
 #ifndef __cplusplus
 static const struct rte_flow_item_vlan rte_flow_item_vlan_mask = {
-	.tpid = RTE_BE16(0x0000),
 	.tci = RTE_BE16(0xffff),
+	.inner_type = RTE_BE16(0x0000),
 };
 #endif
 
@@ -636,9 +643,11 @@ static const struct rte_flow_item_vxlan rte_flow_item_vxlan_mask = {
  * RTE_FLOW_ITEM_TYPE_E_TAG.
  *
  * Matches a E-tag header.
+ *
+ * The corresponding standard outer EtherType (TPID) value is
+ * ETHER_TYPE_ETAG. It can be overridden by the preceding pattern item.
  */
 struct rte_flow_item_e_tag {
-	rte_be16_t tpid; /**< Tag protocol identifier (0x893F). */
 	/**
 	 * E-Tag control information (E-TCI).
 	 * E-PCP (3b), E-DEI (1b), ingress E-CID base (12b).
@@ -648,6 +657,7 @@ struct rte_flow_item_e_tag {
 	rte_be16_t rsvd_grp_ecid_b;
 	uint8_t in_ecid_e; /**< Ingress E-CID ext. */
 	uint8_t ecid_e; /**< E-CID ext. */
+	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_E_TAG. */
diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h
index 45daa911a..a271d1c86 100644
--- a/lib/librte_net/rte_ether.h
+++ b/lib/librte_net/rte_ether.h
@@ -301,6 +301,7 @@ struct vxlan_hdr {
 #define ETHER_TYPE_RARP 0x8035 /**< Reverse Arp Protocol. */
 #define ETHER_TYPE_VLAN 0x8100 /**< IEEE 802.1Q VLAN tagging. */
 #define ETHER_TYPE_QINQ 0x88A8 /**< IEEE 802.1ad QinQ tagging. */
+#define ETHER_TYPE_ETAG 0x893F /**< IEEE 802.1BR E-Tag. */
 #define ETHER_TYPE_1588 0x88F7 /**< IEEE 802.1AS 1588 Precise Time Protocol. */
 #define ETHER_TYPE_SLOW 0x8809 /**< Slow protocols (LACP and Marker). */
 #define ETHER_TYPE_TEB  0x6558 /**< Transparent Ethernet Bridging. */
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v4 09/16] ethdev: add encap level to RSS flow API action
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (5 preceding siblings ...)
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
@ 2018-04-16 16:22  3%       ` Adrien Mazarguil
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
                         ` (6 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

RSS hash types (ETH_RSS_* macros defined in rte_ethdev.h) describe the
protocol header fields of a packet that must be taken into account while
computing RSS.

When facing encapsulated (e.g. tunneled) packets, there is an ambiguity as
to whether these should apply to inner or outer packets. Applications need
the ability to tell exactly "where" RSS must be performed.

This is addressed by adding encapsulation level information to the RSS flow
action. Its default value is 0 and stands for the usual unspecified
behavior. Other values provide a specific encapsulation level.

Contrary to the change announced by commit 676b605182a5 ("doc: announce
ethdev API change for RSS configuration"), this patch does not affect
struct rte_eth_rss_conf but struct rte_flow_action_rss as the former is not
used anymore by the RSS flow action. ABI impact is therefore limited to
rte_flow.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 13 ++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 24 ++++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 ++
 drivers/net/e1000/igb_flow.c                |  4 ++++
 drivers/net/e1000/igb_rxtx.c                |  2 ++
 drivers/net/i40e/i40e_ethdev.c              |  2 ++
 drivers/net/i40e/i40e_flow.c                |  4 ++++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  2 ++
 drivers/net/mlx4/mlx4_flow.c                |  6 ++++++
 drivers/net/mlx5/mlx5_flow.c                | 11 ++++++++++
 drivers/net/sfc/sfc_flow.c                  |  3 +++
 drivers/net/tap/tap_flow.c                  |  6 +++++-
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 26 ++++++++++++++++++++++++
 16 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 7436e0356..976fde7cd 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -167,6 +167,7 @@ enum index {
 	ACTION_COUNT,
 	ACTION_RSS,
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_FUNC_DEFAULT,
 	ACTION_RSS_FUNC_TOEPLITZ,
 	ACTION_RSS_FUNC_SIMPLE_XOR,
@@ -638,6 +639,7 @@ static const enum index action_queue[] = {
 
 static const enum index action_rss[] = {
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -1616,6 +1618,16 @@ static const struct token token_list[] = {
 		.help = "simple XOR hash function",
 		.call = parse_vc_action_rss_func,
 	},
+	[ACTION_RSS_LEVEL] = {
+		.name = "level",
+		.help = "encapsulation level for \"types\"",
+		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, level),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     level))),
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "specific RSS hash types",
@@ -2107,6 +2119,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
 			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+			.level = 0,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 19e27a6ca..562fb2f8d 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1101,6 +1101,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index e0c68495c..1a09e8a0f 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1311,6 +1311,28 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
+Also, regarding packet encapsulation ``level``:
+
+- ``0`` requests the default behavior. Depending on the packet type, it can
+  mean outermost, innermost, anything in between or even no RSS.
+
+  It basically stands for the innermost encapsulation level RSS can be
+  performed on according to PMD and device capabilities.
+
+- ``1`` requests RSS to be performed on the outermost packet encapsulation
+  level.
+
+- ``2`` and subsequent values request RSS to be performed on the specified
+   inner packet encapsulation level, from outermost to innermost (lower to
+   higher values).
+
+Values other than ``0`` are not necessarily supported.
+
+Requesting a specific RSS level on unrecognized traffic results in undefined
+behavior. For predictable results, it is recommended to make the flow rule
+pattern match packet headers up to the requested encapsulation level so that
+only matching traffic goes through.
+
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1320,6 +1342,8 @@ field only, both can be requested simultaneously.
    +===============+=============================================+
    | ``func``      | RSS hash function to apply                  |
    +---------------+---------------------------------------------+
+   | ``level``     | encapsulation level for ``types``           |
+   +---------------+---------------------------------------------+
    | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
    +---------------+---------------------------------------------+
    | ``key_len``   | hash key length in bytes                    |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 546ef3ab7..3b1073bfc 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3401,6 +3401,8 @@ This section lists supported actions and their attributes, if any.
   - ``func {hash function}``: RSS hash function to apply, allowed tokens are
     the same as `set_hash_global_config`_.
 
+  - ``level {unsigned}``: encapsulation level for ``types``.
+
   - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
     tokens are the same as `set_hash_input_set`_, except that an empty list
     does not disable RSS but instead requests unspecified "best-effort"
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 82307ec5d..d1c0b4b8d 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1314,6 +1314,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index d5c1cd3d3..a3776a0d7 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2906,6 +2906,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2921,6 +2922,7 @@ igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 5cb852f2c..42002422b 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -12040,6 +12040,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -12055,6 +12056,7 @@ i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 897989bbd..db668835d 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4380,6 +4380,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 00d975b93..438bfcdfb 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2783,6 +2783,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index e17f5a433..23af21712 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5683,6 +5683,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5698,6 +5699,7 @@ ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 002003235..ce36ac715 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -796,6 +796,11 @@ mlx4_flow_prepare(struct priv *priv,
 					" is Toeplitz";
 				goto exit_action_not_supported;
 			}
+			if (rss->level) {
+				msg = "a nonzero RSS encapsulation level is"
+					" not supported";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1290,6 +1295,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index c86703f4c..76a1053ec 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -644,6 +644,14 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " function is Toeplitz");
 				return -rte_errno;
 			}
+			if (rss->level) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "a nonzero RSS encapsulation"
+						   " level is not supported");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,6 +702,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
 				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+				.level = 0,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1927,6 +1936,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2442,6 +2452,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 779edad0c..3028efbf9 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1269,6 +1269,9 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 		return -EINVAL;
 	}
 
+	if (rss->level)
+		return -EINVAL;
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 845031a31..7dfaf9ac5 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,11 +2055,15 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
-	/* Check supported hash functions */
+	/* Check supported RSS features */
 	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "a nonzero RSS encapsulation level is not supported");
 
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index a2b51f1e0..83b733ff0 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -331,6 +331,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 97d7d3594..d0ff26aa3 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1046,6 +1046,32 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
+	/**
+	 * Packet encapsulation level RSS hash @p types apply to.
+	 *
+	 * - @p 0 requests the default behavior. Depending on the packet
+	 *   type, it can mean outermost, innermost, anything in between or
+	 *   even no RSS.
+	 *
+	 *   It basically stands for the innermost encapsulation level RSS
+	 *   can be performed on according to PMD and device capabilities.
+	 *
+	 * - @p 1 requests RSS to be performed on the outermost packet
+	 *   encapsulation level.
+	 *
+	 * - @p 2 and subsequent values request RSS to be performed on the
+	 *   specified inner packet encapsulation level, from outermost to
+	 *   innermost (lower to higher values).
+	 *
+	 * Values other than @p 0 are not necessarily supported.
+	 *
+	 * Requesting a specific RSS level on unrecognized traffic results
+	 * in undefined behavior. For predictable results, it is recommended
+	 * to make the flow rule pattern match packet headers up to the
+	 * requested encapsulation level so that only matching traffic goes
+	 * through.
+	 */
+	uint32_t level;
 	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 08/16] ethdev: add hash function to RSS flow API action
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (4 preceding siblings ...)
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
@ 2018-04-16 16:22  2%       ` Adrien Mazarguil
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 09/16] ethdev: add encap level " Adrien Mazarguil
                         ` (7 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

By definition, RSS involves some kind of hash algorithm, usually Toeplitz.

Until now it could not be modified on a flow rule basis and PMDs had to
always assume RTE_ETH_HASH_FUNCTION_DEFAULT, which remains the default
behavior when unspecified (0).

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>

---

v3 changes:

- Although RTE_ETH_HASH_FUNCTION_DEFAULT is defined as 0, made comparisons
  more explicit where doing so would clarify the code.

- Updated sfc to include Toeplitz as the other allowed value.

Both according to Andrew's suggestions [1].

[1] http://dpdk.org/ml/archives/dev/2018-April/095840.html
---
 app/test-pmd/cmdline_flow.c                 | 72 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          |  2 +
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  3 +
 drivers/net/e1000/igb_flow.c                |  4 ++
 drivers/net/e1000/igb_rxtx.c                |  4 +-
 drivers/net/i40e/i40e_ethdev.c              |  4 +-
 drivers/net/i40e/i40e_flow.c                |  4 ++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  4 +-
 drivers/net/mlx4/mlx4_flow.c                |  7 +++
 drivers/net/mlx5/mlx5_flow.c                | 13 +++++
 drivers/net/sfc/sfc_flow.c                  |  8 +++
 drivers/net/tap/tap_flow.c                  |  6 ++
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 |  2 +
 16 files changed, 136 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index c9c2c3ad9..7436e0356 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -14,6 +14,7 @@
 #include <sys/socket.h>
 
 #include <rte_common.h>
+#include <rte_eth_ctrl.h>
 #include <rte_ethdev.h>
 #include <rte_byteorder.h>
 #include <cmdline_parse.h>
@@ -165,6 +166,10 @@ enum index {
 	ACTION_DROP,
 	ACTION_COUNT,
 	ACTION_RSS,
+	ACTION_RSS_FUNC,
+	ACTION_RSS_FUNC_DEFAULT,
+	ACTION_RSS_FUNC_TOEPLITZ,
+	ACTION_RSS_FUNC_SIMPLE_XOR,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
 	ACTION_RSS_KEY,
@@ -632,6 +637,7 @@ static const enum index action_queue[] = {
 };
 
 static const enum index action_rss[] = {
+	ACTION_RSS_FUNC,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -666,6 +672,9 @@ static int parse_vc_conf(struct context *, const struct token *,
 static int parse_vc_action_rss(struct context *, const struct token *,
 			       const char *, unsigned int, void *,
 			       unsigned int);
+static int parse_vc_action_rss_func(struct context *, const struct token *,
+				    const char *, unsigned int, void *,
+				    unsigned int);
 static int parse_vc_action_rss_type(struct context *, const struct token *,
 				    const char *, unsigned int, void *,
 				    unsigned int);
@@ -1584,6 +1593,29 @@ static const struct token token_list[] = {
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
+	[ACTION_RSS_FUNC] = {
+		.name = "func",
+		.help = "RSS hash function to apply",
+		.next = NEXT(action_rss,
+			     NEXT_ENTRY(ACTION_RSS_FUNC_DEFAULT,
+					ACTION_RSS_FUNC_TOEPLITZ,
+					ACTION_RSS_FUNC_SIMPLE_XOR)),
+	},
+	[ACTION_RSS_FUNC_DEFAULT] = {
+		.name = "default",
+		.help = "default hash function",
+		.call = parse_vc_action_rss_func,
+	},
+	[ACTION_RSS_FUNC_TOEPLITZ] = {
+		.name = "toeplitz",
+		.help = "Toeplitz hash function",
+		.call = parse_vc_action_rss_func,
+	},
+	[ACTION_RSS_FUNC_SIMPLE_XOR] = {
+		.name = "simple_xor",
+		.help = "simple XOR hash function",
+		.call = parse_vc_action_rss_func,
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "specific RSS hash types",
@@ -2074,6 +2106,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	action_rss_data = ctx->object;
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
+			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
@@ -2099,6 +2132,45 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 }
 
 /**
+ * Parse func field for RSS action.
+ *
+ * The RTE_ETH_HASH_FUNCTION_* value to assign is derived from the
+ * ACTION_RSS_FUNC_* index that called this function.
+ */
+static int
+parse_vc_action_rss_func(struct context *ctx, const struct token *token,
+			 const char *str, unsigned int len,
+			 void *buf, unsigned int size)
+{
+	struct action_rss_data *action_rss_data;
+	enum rte_eth_hash_function func;
+
+	(void)buf;
+	(void)size;
+	/* Token name must match. */
+	if (parse_default(ctx, token, str, len, NULL, 0) < 0)
+		return -1;
+	switch (ctx->curr) {
+	case ACTION_RSS_FUNC_DEFAULT:
+		func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+		break;
+	case ACTION_RSS_FUNC_TOEPLITZ:
+		func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+		break;
+	case ACTION_RSS_FUNC_SIMPLE_XOR:
+		func = RTE_ETH_HASH_FUNCTION_SIMPLE_XOR;
+		break;
+	default:
+		return -1;
+	}
+	if (!ctx->object)
+		return len;
+	action_rss_data = ctx->object;
+	action_rss_data->conf.func = func;
+	return len;
+}
+
+/**
  * Parse type field for RSS action.
  *
  * Valid tokens are type field names and the "end" token.
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 3da09536a..19e27a6ca 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1100,6 +1100,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
+				.func = src.rss->func,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index cf252eeba..e0c68495c 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1318,6 +1318,8 @@ field only, both can be requested simultaneously.
    +---------------+---------------------------------------------+
    | Field         | Value                                       |
    +===============+=============================================+
+   | ``func``      | RSS hash function to apply                  |
+   +---------------+---------------------------------------------+
    | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
    +---------------+---------------------------------------------+
    | ``key_len``   | hash key length in bytes                    |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 17336d163..546ef3ab7 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3398,6 +3398,9 @@ This section lists supported actions and their attributes, if any.
 
 - ``rss``: spread packets among several queues.
 
+  - ``func {hash function}``: RSS hash function to apply, allowed tokens are
+    the same as `set_hash_global_config`_.
+
   - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
     tokens are the same as `set_hash_input_set`_, except that an empty list
     does not disable RSS but instead requests unspecified "best-effort"
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 8dc5f75f2..82307ec5d 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1310,6 +1310,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 45bb3455c..d5c1cd3d3 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2905,6 +2905,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2919,7 +2920,8 @@ int
 igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index e65235fc3..5cb852f2c 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -12039,6 +12039,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -12053,7 +12054,8 @@ int
 i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index ec6231003..897989bbd 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4376,6 +4376,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	}
 
 	/* Parse RSS related parameters from configuration */
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 4e31c7c56..00d975b93 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2779,6 +2779,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 94ea7444d..e17f5a433 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5682,6 +5682,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5696,7 +5697,8 @@ int
 ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index dd86e4ce7..002003235 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -790,6 +790,12 @@ mlx4_flow_prepare(struct priv *priv,
 					" of the context size";
 				goto exit_action_not_supported;
 			}
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				msg = "the only supported RSS hash function"
+					" is Toeplitz";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1283,6 +1289,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 86870b0cb..c86703f4c 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@
 #endif
 
 #include <rte_common.h>
+#include <rte_eth_ctrl.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
@@ -634,6 +635,15 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			if (overlap & FATE)
 				goto exit_action_overlap;
 			overlap |= FATE;
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "the only supported RSS hash"
+						   " function is Toeplitz");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -683,6 +693,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				}
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
+				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1915,6 +1926,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	/* Copy configuration. */
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2429,6 +2441,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	};
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 1a2c0299c..779edad0c 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1261,6 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 			rxq_hw_index_max = rxq->hw_index;
 	}
 
+	switch (rss->func) {
+	case RTE_ETH_HASH_FUNCTION_DEFAULT:
+	case RTE_ETH_HASH_FUNCTION_TOEPLITZ:
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 67146aaba..845031a31 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,6 +2055,12 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
+	/* Check supported hash functions */
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "non-default RSS hash functions are not supported");
+
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
 	if (err < 0) {
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index cc7819b6a..a2b51f1e0 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -330,6 +330,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
+				.func = src.rss->func,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index bbc408fa6..97d7d3594 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -19,6 +19,7 @@
 
 #include <rte_arp.h>
 #include <rte_ether.h>
+#include <rte_eth_ctrl.h>
 #include <rte_icmp.h>
 #include <rte_ip.h>
 #include <rte_sctp.h>
@@ -1044,6 +1045,7 @@ struct rte_flow_query_count {
  * both can be requested simultaneously.
  */
 struct rte_flow_action_rss {
+	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
 	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 07/16] ethdev: flatten RSS configuration in flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (3 preceding siblings ...)
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
@ 2018-04-16 16:22  1%       ` Adrien Mazarguil
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
                         ` (8 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon, Radu Nicolau, Akhil Goyal

Since its inception, the rte_flow RSS action has been relying in part on
external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
This structure lacks parameters such as the hash algorithm to use, and more
recently, a method to tell which layer RSS should be performed on [1].

Given struct rte_eth_rss_conf will never be flexible enough to represent a
complete RSS configuration (e.g. RETA table), this patch supersedes it by
extending the rte_flow RSS action directly.

A subsequent patch will add a field to use a non-default RSS hash
algorithm. To that end, a field named "types" replaces the field formerly
known as "rss_hf" and standing for "RSS hash functions" as it was
confusing. Actual RSS hash function types are defined by enum
rte_eth_hash_function.

This patch updates all PMDs and example applications accordingly.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

[1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
Cc: Radu Nicolau <radu.nicolau@intel.com>
Cc: Akhil Goyal <akhil.goyal@nxp.com>

---

v3 changes:

Documentation update regarding the meaning of a 0 value for RSS types in
flow rules.

It used to implicitly mean "no RSS" but is redefined as requesting a kind
of "best-effort" mode from PMDs, i.e. anything ranging from empty to
all-inclusive RSS; what matters is it provides safe defaults that will work
regardless of PMD capabilities.
---
 app/test-pmd/cmdline_flow.c                 |  48 +++---
 app/test-pmd/config.c                       |  39 ++---
 doc/guides/prog_guide/rte_flow.rst          |  28 ++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   6 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  31 ++--
 drivers/net/e1000/igb_rxtx.c                |  51 +++++-
 drivers/net/i40e/i40e_ethdev.c              |  53 +++++--
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                |  47 +++---
 drivers/net/ixgbe/ixgbe_ethdev.c            |   4 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  30 ++--
 drivers/net/ixgbe/ixgbe_rxtx.c              |  51 +++++-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                |  61 +++----
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 193 +++++++++++------------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +--
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +--
 drivers/net/sfc/sfc_flow.c                  |  21 ++-
 drivers/net/tap/tap_flow.c                  |   8 +-
 examples/ipsec-secgw/ipsec.c                |  10 +-
 lib/librte_ether/rte_flow.c                 |  39 ++---
 lib/librte_ether/rte_flow.h                 |  12 +-
 28 files changed, 478 insertions(+), 355 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 798b7948d..c9c2c3ad9 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -192,9 +192,8 @@ enum index {
 /** Storage for struct rte_flow_action_rss including external data. */
 struct action_rss_data {
 	struct rte_flow_action_rss conf;
+	uint8_t key[RSS_HASH_KEY_LENGTH];
 	uint16_t queue[ACTION_RSS_QUEUE_NUM];
-	struct rte_eth_rss_conf rss_conf;
-	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -1587,7 +1586,7 @@ static const struct token token_list[] = {
 	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
-		.help = "RSS hash types",
+		.help = "specific RSS hash types",
 		.next = NEXT(action_rss, NEXT_ENTRY(ACTION_RSS_TYPE)),
 	},
 	[ACTION_RSS_TYPE] = {
@@ -1602,21 +1601,21 @@ static const struct token token_list[] = {
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
 		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
 			     ARGS_ENTRY_ARB
-			     (offsetof(struct action_rss_data, rss_conf) +
-			      offsetof(struct rte_eth_rss_conf, rss_key_len),
-			      sizeof(((struct rte_eth_rss_conf *)0)->
-				     rss_key_len)),
-			     ARGS_ENTRY(struct action_rss_data, rss_key)),
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, key_len),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     key_len)),
+			     ARGS_ENTRY(struct action_rss_data, key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (offsetof(struct action_rss_data, rss_conf) +
-			      offsetof(struct rte_eth_rss_conf, rss_key_len),
-			      sizeof(((struct rte_eth_rss_conf *)0)->
-				     rss_key_len),
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, key_len),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     key_len),
 			      0,
 			      RSS_HASH_KEY_LENGTH)),
 	},
@@ -2075,27 +2074,24 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	action_rss_data = ctx->object;
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->rss_conf,
-			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.types = rss_hf,
+			.key_len = sizeof(action_rss_data->key),
+			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.key = action_rss_data->key,
 			.queue = action_rss_data->queue,
 		},
+		.key = "testpmd's default RSS hash key",
 		.queue = { 0 },
-		.rss_conf = (struct rte_eth_rss_conf){
-			.rss_key = action_rss_data->rss_key,
-			.rss_key_len = sizeof(action_rss_data->rss_key),
-			.rss_hf = rss_hf,
-		},
-		.rss_key = "testpmd's default RSS hash key",
 	};
-	for (i = 0; i < action_rss_data->conf.num; ++i)
+	for (i = 0; i < action_rss_data->conf.queue_num; ++i)
 		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
 		struct rte_eth_dev_info info;
 
 		rte_eth_dev_info_get(ctx->port, &info);
-		action_rss_data->rss_conf.rss_key_len =
-			RTE_MIN(sizeof(action_rss_data->rss_key),
+		action_rss_data->conf.key_len =
+			RTE_MIN(sizeof(action_rss_data->key),
 				info.hash_key_size);
 	}
 	action->conf = &action_rss_data->conf;
@@ -2123,7 +2119,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->rss_conf.rss_hf = 0;
+		action_rss_data->conf.types = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2142,7 +2138,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->conf.types |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2192,7 +2188,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue_num = i;
 	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 95618e4eb..3da09536a 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1100,40 +1100,27 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
-				.num = src.rss->num,
+				.types = src.rss->types,
+				.key_len = src.rss->key_len,
+				.queue_num = src.rss->queue_num,
 			};
 		off += sizeof(*src.rss);
-		if (src.rss->num) {
+		if (src.rss->key_len) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->queue) * src.rss->num;
+			size = sizeof(*src.rss->key) * src.rss->key_len;
 			if (dst.rss)
-				dst.rss->queue = memcpy
+				dst.rss->key = memcpy
 					((void *)((uintptr_t)dst.rss + off),
-					 src.rss->queue, size);
+					 src.rss->key, size);
 			off += size;
 		}
-		off = RTE_ALIGN_CEIL(off, sizeof(double));
-		if (dst.rss) {
-			dst.rss->rss_conf = (void *)((uintptr_t)dst.rss + off);
-			*(struct rte_eth_rss_conf *)(uintptr_t)
-				dst.rss->rss_conf = (struct rte_eth_rss_conf){
-				.rss_key_len = src.rss->rss_conf->rss_key_len,
-				.rss_hf = src.rss->rss_conf->rss_hf,
-			};
-		}
-		off += sizeof(*src.rss->rss_conf);
-		if (src.rss->rss_conf->rss_key_len) {
+		if (src.rss->queue_num) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->rss_conf->rss_key) *
-				src.rss->rss_conf->rss_key_len;
-			if (dst.rss) {
-				((struct rte_eth_rss_conf *)(uintptr_t)
-				 dst.rss->rss_conf)->rss_key =
-					(void *)((uintptr_t)dst.rss + off);
-				memcpy(dst.rss->rss_conf->rss_key,
-				       src.rss->rss_conf->rss_key,
-				       size);
-			}
+			size = sizeof(*src.rss->queue) * src.rss->queue_num;
+			if (dst.rss)
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		size = off;
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index acbeaacbd..cf252eeba 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1301,6 +1301,12 @@ Action: ``RSS``
 Similar to QUEUE, except RSS is additionally performed on packets to spread
 them among several queues according to the provided parameters.
 
+Unlike global RSS settings used by other DPDK APIs, unsetting the ``types``
+field does not disable RSS in a flow rule. Doing so instead requests safe
+unspecified "best-effort" settings from the underlying PMD, which depending
+on the flow rule, may result in anything ranging from empty (single queue)
+to all-inclusive RSS.
+
 Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
@@ -1309,15 +1315,19 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+--------------------------------+
-   | Field        | Value                          |
-   +==============+================================+
-   | ``rss_conf`` | RSS parameters                 |
-   +--------------+--------------------------------+
-   | ``num``      | number of entries in ``queue`` |
-   +--------------+--------------------------------+
-   | ``queue``    | queue indices to use           |
-   +--------------+--------------------------------+
+   +---------------+---------------------------------------------+
+   | Field         | Value                                       |
+   +===============+=============================================+
+   | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
+   +---------------+---------------------------------------------+
+   | ``key_len``   | hash key length in bytes                    |
+   +---------------+---------------------------------------------+
+   | ``queue_num`` | number of entries in ``queue``              |
+   +---------------+---------------------------------------------+
+   | ``key``       | hash key                                    |
+   +---------------+---------------------------------------------+
+   | ``queue``     | queue indices to use                        |
+   +---------------+---------------------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index a015d02a4..17336d163 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3398,8 +3398,10 @@ This section lists supported actions and their attributes, if any.
 
 - ``rss``: spread packets among several queues.
 
-  - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
-    are the same as `set_hash_input_set`_, an empty list means none (0).
+  - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
+    tokens are the same as `set_hash_input_set`_, except that an empty list
+    does not disable RSS but instead requests unspecified "best-effort"
+    settings.
 
   - ``key {string}``: RSS hash key, overrides ``key_len``.
 
diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 6354b894a..902001f36 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -4,6 +4,10 @@
 
 #ifndef _E1000_ETHDEV_H_
 #define _E1000_ETHDEV_H_
+
+#include <stdint.h>
+
+#include <rte_flow.h>
 #include <rte_time.h>
 #include <rte_pci.h>
 
@@ -27,6 +31,7 @@
 #define E1000_CTRL_EXT_EXTEND_VLAN  (1<<26)    /* EXTENDED VLAN */
 #define IGB_VFTA_SIZE 128
 
+#define IGB_HKEY_MAX_INDEX             10
 #define IGB_MAX_RX_QUEUE_NUM           8
 #define IGB_MAX_RX_QUEUE_NUM_82576     16
 
@@ -229,8 +234,8 @@ struct igb_ethertype_filter {
 };
 
 struct igb_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
+	uint8_t key[IGB_HKEY_MAX_INDEX * sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[IGB_MAX_RX_QUEUE_NUM]; /**< Queues indices to use. */
 };
 
@@ -501,6 +506,10 @@ int eth_igb_syn_filter_set(struct rte_eth_dev *dev,
 int eth_igb_add_del_flex_filter(struct rte_eth_dev *dev,
 			struct rte_eth_flex_filter *filter,
 			bool add);
+int igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
+		      const struct rte_flow_action_rss *in);
+int igb_action_rss_same(const struct rte_flow_action_rss *comp,
+			const struct rte_flow_action_rss *with);
 int igb_config_rss_filter(struct rte_eth_dev *dev,
 			struct igb_rte_flow_rss_conf *conf,
 			bool add);
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 9b808a982..7e9935b7e 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -41,8 +41,6 @@
 #define IGB_DEFAULT_TX_HTHRESH      1
 #define IGB_DEFAULT_TX_WTHRESH      ((hw->mac.type == e1000_82576) ? 1 : 16)
 
-#define IGB_HKEY_MAX_INDEX 10
-
 /* Bit shift and mask */
 #define IGB_4_BIT_WIDTH  (CHAR_BIT / 2)
 #define IGB_4_BIT_MASK   RTE_LEN2MASK(IGB_4_BIT_WIDTH, uint8_t)
@@ -5576,7 +5574,7 @@ igb_rss_filter_restore(struct rte_eth_dev *dev)
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		igb_config_rss_filter(dev, &filter_info->rss_info, TRUE);
 }
 
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index c0f5b5190..8dc5f75f2 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1292,7 +1292,7 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 
 	rss = (const struct rte_flow_action_rss *)act->conf;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -1300,7 +1300,7 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -1310,14 +1310,18 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
-	if (rss->rss_conf)
-		rss_conf->rss_conf = *rss->rss_conf;
-	else
-		rss_conf->rss_conf.rss_hf = IGB_RSS_OFFLOAD_ALL;
-
-	for (n = 0; n < rss->num; ++n)
-		rss_conf->queue[n] = rss->queue[n];
-	rss_conf->num = rss->num;
+	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key must be exactly 40 bytes");
+	if (rss->queue_num > RTE_DIM(rss_conf->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (igb_rss_conf_init(rss_conf, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
 	/* check if the next not void item is END */
 	index++;
@@ -1518,9 +1522,8 @@ igb_flow_create(struct rte_eth_dev *dev,
 				PMD_DRV_LOG(ERR, "failed to allocate memory");
 				goto out;
 			}
-			rte_memcpy(&rss_filter_ptr->filter_info,
-				&rss_conf,
-				sizeof(struct igb_rte_flow_rss_conf));
+			igb_rss_conf_init(&rss_filter_ptr->filter_info,
+					  &rss_conf.conf);
 			TAILQ_INSERT_TAIL(&igb_filter_rss_list,
 				rss_filter_ptr, entries);
 			flow->rule = rss_filter_ptr;
@@ -1757,7 +1760,7 @@ igb_clear_rss_filter(struct rte_eth_dev *dev)
 	struct e1000_filter_info *filter =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter->rss_info.num)
+	if (filter->rss_info.conf.queue_num)
 		igb_config_rss_filter(dev, &filter->rss_info, FALSE);
 }
 
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 323913f0d..45bb3455c 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2898,12 +2898,47 @@ igb_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 }
 
 int
+igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
+		  const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+igb_action_rss_same(const struct rte_flow_action_rss *comp,
+		    const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 igb_config_rss_filter(struct rte_eth_dev *dev,
 		struct igb_rte_flow_rss_conf *conf, bool add)
 {
 	uint32_t shift;
 	uint16_t i, j;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
@@ -2911,8 +2946,8 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
 	if (!add) {
-		if (memcmp(conf, &filter_info->rss_info,
-			sizeof(struct igb_rte_flow_rss_conf)) == 0) {
+		if (igb_action_rss_same(&filter_info->rss_info.conf,
+					&conf->conf)) {
 			igb_rss_disable(dev);
 			memset(&filter_info->rss_info, 0,
 				sizeof(struct igb_rte_flow_rss_conf));
@@ -2921,7 +2956,7 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		return -EINVAL;
 
 	/* Fill in redirection table. */
@@ -2933,9 +2968,9 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		} reta;
 		uint8_t q_idx;
 
-		if (j == conf->num)
+		if (j == conf->conf.queue_num)
 			j = 0;
-		q_idx = conf->queue[j];
+		q_idx = conf->conf.queue[j];
 		reta.bytes[i & 3] = (uint8_t)(q_idx << shift);
 		if ((i & 3) == 3)
 			E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
@@ -2952,8 +2987,8 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		rss_conf.rss_key = rss_intel_key; /* Default hash key */
 	igb_hw_rss_hash_set(hw, &rss_conf);
 
-	rte_memcpy(&filter_info->rss_info,
-		conf, sizeof(struct igb_rte_flow_rss_conf));
+	if (igb_rss_conf_init(&filter_info->rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 180ac7449..e65235fc3 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11,6 +11,7 @@
 #include <inttypes.h>
 #include <assert.h>
 
+#include <rte_common.h>
 #include <rte_eal.h>
 #include <rte_string_fns.h>
 #include <rte_pci.h>
@@ -11499,7 +11500,7 @@ i40e_rss_filter_restore(struct i40e_pf *pf)
 {
 	struct i40e_rte_flow_rss_conf *conf =
 					&pf->rss_info;
-	if (conf->num)
+	if (conf->conf.queue_num)
 		i40e_config_rss_filter(pf, conf, TRUE);
 }
 
@@ -12031,18 +12032,52 @@ i40e_cloud_filter_qinq_create(struct i40e_pf *pf)
 }
 
 int
+i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
+		   const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+i40e_action_rss_same(const struct rte_flow_action_rss *comp,
+		     const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 i40e_config_rss_filter(struct i40e_pf *pf,
 		struct i40e_rte_flow_rss_conf *conf, bool add)
 {
 	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
 	uint32_t i, lut = 0;
 	uint16_t j, num;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct i40e_rte_flow_rss_conf *rss_info = &pf->rss_info;
 
 	if (!add) {
-		if (memcmp(conf, rss_info,
-			sizeof(struct i40e_rte_flow_rss_conf)) == 0) {
+		if (i40e_action_rss_same(&rss_info->conf, &conf->conf)) {
 			i40e_pf_disable_rss(pf);
 			memset(rss_info, 0,
 				sizeof(struct i40e_rte_flow_rss_conf));
@@ -12051,7 +12086,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 		return -EINVAL;
 	}
 
-	if (rss_info->num)
+	if (rss_info->conf.queue_num)
 		return -EINVAL;
 
 	/* If both VMDQ and RSS enabled, not all of PF queues are configured.
@@ -12062,7 +12097,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 	else
 		num = pf->dev_data->nb_rx_queues;
 
-	num = RTE_MIN(num, conf->num);
+	num = RTE_MIN(num, conf->conf.queue_num);
 	PMD_DRV_LOG(INFO, "Max of contiguous %u PF queues are configured",
 			num);
 
@@ -12075,7 +12110,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 	for (i = 0, j = 0; i < hw->func_caps.rss_table_size; i++, j++) {
 		if (j == num)
 			j = 0;
-		lut = (lut << 8) | (conf->queue[j] & ((0x1 <<
+		lut = (lut << 8) | (conf->conf.queue[j] & ((0x1 <<
 			hw->func_caps.rss_table_entry_width) - 1));
 		if ((i & 3) == 3)
 			I40E_WRITE_REG(hw, I40E_PFQF_HLUT(i >> 2), lut);
@@ -12100,8 +12135,8 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 
 	i40e_hw_rss_hash_set(pf, &rss_conf);
 
-	rte_memcpy(rss_info,
-		conf, sizeof(struct i40e_rte_flow_rss_conf));
+	if (i40e_rss_conf_init(rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index d33b255e7..a0569d4ae 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -5,14 +5,19 @@
 #ifndef _I40E_ETHDEV_H_
 #define _I40E_ETHDEV_H_
 
+#include <stdint.h>
+
 #include <rte_eth_ctrl.h>
 #include <rte_time.h>
 #include <rte_kvargs.h>
 #include <rte_hash.h>
+#include <rte_flow.h>
 #include <rte_flow_driver.h>
 #include <rte_tm_driver.h>
 #include "rte_pmd_i40e.h"
 
+#include "base/i40e_register.h"
+
 #define I40E_VLAN_TAG_SIZE        4
 
 #define I40E_AQ_LEN               32
@@ -878,9 +883,11 @@ struct i40e_customized_pctype {
 };
 
 struct i40e_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
 	uint16_t queue_region_conf; /**< Queue region config flag */
-	uint16_t num; /**< Number of entries in queue[]. */
+	uint8_t key[(I40E_VFQF_HKEY_MAX_INDEX > I40E_PFQF_HKEY_MAX_INDEX ?
+		     I40E_VFQF_HKEY_MAX_INDEX : I40E_PFQF_HKEY_MAX_INDEX) + 1 *
+		    sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[I40E_MAX_Q_PER_TC]; /**< Queues indices to use. */
 };
 
@@ -1219,6 +1226,10 @@ void i40e_init_queue_region_conf(struct rte_eth_dev *dev);
 void i40e_flex_payload_reg_set_default(struct i40e_hw *hw);
 int i40e_set_rss_key(struct i40e_vsi *vsi, uint8_t *key, uint8_t key_len);
 int i40e_set_rss_lut(struct i40e_vsi *vsi, uint8_t *lut, uint16_t lut_size);
+int i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
+		       const struct rte_flow_action_rss *in);
+int i40e_action_rss_same(const struct rte_flow_action_rss *comp,
+			 const struct rte_flow_action_rss *with);
 int i40e_config_rss_filter(struct i40e_pf *pf,
 		struct i40e_rte_flow_rss_conf *conf, bool add);
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index d6f5e9923..ec6231003 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4220,7 +4220,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 
 	if (action_flag) {
 		for (n = 0; n < 64; n++) {
-			if (rss->rss_conf->rss_hf & (hf_bit << n)) {
+			if (rss->types & (hf_bit << n)) {
 				conf_info->region[0].hw_flowtype[0] = n;
 				conf_info->region[0].flowtype_num = 1;
 				conf_info->queue_region_number = 1;
@@ -4236,12 +4236,12 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	 * queue index for this port.
 	 */
 	if (conf_info->queue_region_number) {
-		for (i = 0; i < rss->num; i++) {
-			for (j = 0; j < rss_info->num; j++) {
-				if (rss->queue[i] == rss_info->queue[j])
+		for (i = 0; i < rss->queue_num; i++) {
+			for (j = 0; j < rss_info->conf.queue_num; j++) {
+				if (rss->queue[i] == rss_info->conf.queue[j])
 					break;
 			}
-			if (j == rss_info->num) {
+			if (j == rss_info->conf.queue_num) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
 					act,
@@ -4250,7 +4250,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			}
 		}
 
-		for (i = 0; i < rss->num - 1; i++) {
+		for (i = 0; i < rss->queue_num - 1; i++) {
 			if (rss->queue[i + 1] != rss->queue[i] + 1) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
@@ -4265,8 +4265,8 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	for (n = 0; n < conf_info->queue_region_number; n++) {
 		if (conf_info->region[n].user_priority_num ||
 				conf_info->region[n].flowtype_num) {
-			if (!((rte_is_power_of_2(rss->num)) &&
-					rss->num <= 64)) {
+			if (!((rte_is_power_of_2(rss->queue_num)) &&
+					rss->queue_num <= 64)) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
 					act,
@@ -4294,7 +4294,8 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			}
 
 			for (i = 0; i < info->queue_region_number; i++) {
-				if (info->region[i].queue_num == rss->num &&
+				if (info->region[i].queue_num ==
+				    rss->queue_num &&
 					info->region[i].queue_start_index ==
 						rss->queue[0])
 					break;
@@ -4310,7 +4311,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 				}
 
 				info->region[i].queue_num =
-					rss->num;
+					rss->queue_num;
 				info->region[i].queue_start_index =
 					rss->queue[0];
 				info->region[i].region_id =
@@ -4356,7 +4357,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	if (rss_config->queue_region_conf)
 		return 0;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -4364,7 +4365,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -4375,15 +4376,19 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	}
 
 	/* Parse RSS related parameters from configuration */
-	if (rss->rss_conf)
-		rss_config->rss_conf = *rss->rss_conf;
-	else
-		rss_config->rss_conf.rss_hf =
-			pf->adapter->flow_types_mask;
+	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key too large");
+	if (rss->queue_num > RTE_DIM(rss_config->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (i40e_rss_conf_init(rss_config, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
-	for (n = 0; n < rss->num; ++n)
-		rss_config->queue[n] = rss->queue[n];
-	rss_config->num = rss->num;
 	index++;
 
 	/* check if the next not void action is END */
@@ -4903,7 +4908,7 @@ i40e_flow_flush_rss_filter(struct rte_eth_dev *dev)
 
 	ret = i40e_flush_queue_region_all_conf(dev, hw, pf, 0);
 
-	if (rss_info->num)
+	if (rss_info->conf.queue_num)
 		ret = i40e_config_rss_filter(pf, rss_info, FALSE);
 	return ret;
 }
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index a5e2fc0ca..25a8d041d 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -100,8 +100,6 @@
 
 #define IXGBE_QUEUE_STAT_COUNTERS (sizeof(hw_stats->qprc) / sizeof(hw_stats->qprc[0]))
 
-#define IXGBE_HKEY_MAX_INDEX 10
-
 /* Additional timesync values. */
 #define NSEC_PER_SEC             1000000000L
 #define IXGBE_INCVAL_10GB        0x66666666
@@ -8294,7 +8292,7 @@ ixgbe_rss_filter_restore(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		ixgbe_config_rss_filter(dev,
 			&filter_info->rss_info, TRUE);
 }
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index 655077700..9491b03f4 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -4,6 +4,9 @@
 
 #ifndef _IXGBE_ETHDEV_H_
 #define _IXGBE_ETHDEV_H_
+
+#include <stdint.h>
+
 #include "base/ixgbe_type.h"
 #include "base/ixgbe_dcb.h"
 #include "base/ixgbe_dcb_82599.h"
@@ -12,6 +15,7 @@
 #ifdef RTE_LIBRTE_SECURITY
 #include "ixgbe_ipsec.h"
 #endif
+#include <rte_flow.h>
 #include <rte_time.h>
 #include <rte_hash.h>
 #include <rte_pci.h>
@@ -39,6 +43,7 @@
 #define IXGBE_EXTENDED_VLAN	  (uint32_t)(1 << 26) /* EXTENDED VLAN ENABLE */
 #define IXGBE_VFTA_SIZE 128
 #define IXGBE_VLAN_TAG_SIZE 4
+#define IXGBE_HKEY_MAX_INDEX 10
 #define IXGBE_MAX_RX_QUEUE_NUM	128
 #define IXGBE_MAX_INTR_QUEUE_NUM	15
 #define IXGBE_VMDQ_DCB_NB_QUEUES     IXGBE_MAX_RX_QUEUE_NUM
@@ -196,8 +201,8 @@ struct ixgbe_hw_fdir_info {
 };
 
 struct ixgbe_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
+	uint8_t key[IXGBE_HKEY_MAX_INDEX * sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[IXGBE_MAX_RX_QUEUE_NUM]; /**< Queues indices to use. */
 };
 
@@ -696,6 +701,10 @@ void ixgbe_tm_conf_init(struct rte_eth_dev *dev);
 void ixgbe_tm_conf_uninit(struct rte_eth_dev *dev);
 int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev, uint16_t queue_idx,
 			       uint16_t tx_rate);
+int ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
+			const struct rte_flow_action_rss *in);
+int ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
+			  const struct rte_flow_action_rss *with);
 int ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		struct ixgbe_rte_flow_rss_conf *conf, bool add);
 
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index abdeac28b..4e31c7c56 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2761,7 +2761,7 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 
 	rss = (const struct rte_flow_action_rss *)act->conf;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -2769,7 +2769,7 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -2778,14 +2778,19 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 			return -rte_errno;
 		}
 	}
-	if (rss->rss_conf)
-		rss_conf->rss_conf = *rss->rss_conf;
-	else
-		rss_conf->rss_conf.rss_hf = IXGBE_RSS_OFFLOAD_ALL;
 
-	for (n = 0; n < rss->num; ++n)
-		rss_conf->queue[n] = rss->queue[n];
-	rss_conf->num = rss->num;
+	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key must be exactly 40 bytes");
+	if (rss->queue_num > RTE_DIM(rss_conf->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (ixgbe_rss_conf_init(rss_conf, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
 	/* check if the next not void item is END */
 	act = next_no_void_action(actions, act);
@@ -2834,7 +2839,7 @@ ixgbe_clear_rss_filter(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		ixgbe_config_rss_filter(dev, &filter_info->rss_info, FALSE);
 }
 
@@ -3153,9 +3158,8 @@ ixgbe_flow_create(struct rte_eth_dev *dev,
 				PMD_DRV_LOG(ERR, "failed to allocate memory");
 				goto out;
 			}
-			rte_memcpy(&rss_filter_ptr->filter_info,
-				&rss_conf,
-				sizeof(struct ixgbe_rte_flow_rss_conf));
+			ixgbe_rss_conf_init(&rss_filter_ptr->filter_info,
+					    &rss_conf.conf);
 			TAILQ_INSERT_TAIL(&filter_rss_list,
 				rss_filter_ptr, entries);
 			flow->rule = rss_filter_ptr;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 7511e183f..94ea7444d 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5675,6 +5675,36 @@ ixgbevf_dev_rxtx_start(struct rte_eth_dev *dev)
 }
 
 int
+ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
+		    const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
+		      const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		struct ixgbe_rte_flow_rss_conf *conf, bool add)
 {
@@ -5684,7 +5714,12 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	uint16_t j;
 	uint16_t sp_reta_size;
 	uint32_t reta_reg;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
@@ -5694,8 +5729,8 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	sp_reta_size = ixgbe_reta_size_get(hw->mac.type);
 
 	if (!add) {
-		if (memcmp(conf, &filter_info->rss_info,
-			sizeof(struct ixgbe_rte_flow_rss_conf)) == 0) {
+		if (ixgbe_action_rss_same(&filter_info->rss_info.conf,
+					  &conf->conf)) {
 			ixgbe_rss_disable(dev);
 			memset(&filter_info->rss_info, 0,
 				sizeof(struct ixgbe_rte_flow_rss_conf));
@@ -5704,7 +5739,7 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		return -EINVAL;
 	/* Fill in redirection table
 	 * The byte-swap is needed because NIC registers are in
@@ -5714,9 +5749,9 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	for (i = 0, j = 0; i < sp_reta_size; i++, j++) {
 		reta_reg = ixgbe_reta_reg_get(hw->mac.type, i);
 
-		if (j == conf->num)
+		if (j == conf->conf.queue_num)
 			j = 0;
-		reta = (reta << 8) | conf->queue[j];
+		reta = (reta << 8) | conf->conf.queue[j];
 		if ((i & 3) == 3)
 			IXGBE_WRITE_REG(hw, reta_reg,
 					rte_bswap32(reta));
@@ -5733,8 +5768,8 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		rss_conf.rss_key = rss_intel_key; /* Default hash key */
 	ixgbe_hw_rss_hash_set(hw, &rss_conf);
 
-	rte_memcpy(&filter_info->rss_info,
-		conf, sizeof(struct ixgbe_rte_flow_rss_conf));
+	if (ixgbe_rss_conf_init(&filter_info->rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 06f17703b..970d20dd1 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -569,7 +569,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			     " for UDP RSS and inner VXLAN RSS");
 			/* Fake support for all possible RSS hash fields. */
 			priv->hw_rss_sup = ~UINT64_C(0);
-			priv->hw_rss_sup = mlx4_conv_rss_hf(priv, -1);
+			priv->hw_rss_sup = mlx4_conv_rss_types(priv, -1);
 			/* Filter out known unsupported fields. */
 			priv->hw_rss_sup &=
 				~(uint64_t)(IBV_RX_HASH_SRC_PORT_UDP |
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 8feb6ae31..dd86e4ce7 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -76,22 +76,22 @@ struct mlx4_drop {
 };
 
 /**
- * Convert DPDK RSS hash fields to their Verbs equivalent.
+ * Convert DPDK RSS hash types to their Verbs equivalent.
  *
- * This function returns the supported (default) set when @p rss_hf has
+ * This function returns the supported (default) set when @p types has
  * special value (uint64_t)-1.
  *
  * @param priv
  *   Pointer to private structure.
- * @param rss_hf
- *   Hash fields in DPDK format (see struct rte_eth_rss_conf).
+ * @param types
+ *   Hash types in DPDK format (see struct rte_eth_rss_conf).
  *
  * @return
  *   A valid Verbs RSS hash fields mask for mlx4 on success, (uint64_t)-1
  *   otherwise and rte_errno is set.
  */
 uint64_t
-mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
+mlx4_conv_rss_types(struct priv *priv, uint64_t types)
 {
 	enum { IPV4, IPV6, TCP, UDP, };
 	const uint64_t in[] = {
@@ -126,17 +126,17 @@ mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
 	unsigned int i;
 
 	for (i = 0; i != RTE_DIM(in); ++i)
-		if (rss_hf & in[i]) {
-			seen |= rss_hf & in[i];
+		if (types & in[i]) {
+			seen |= types & in[i];
 			conv |= out[i];
 		}
 	if ((conv & priv->hw_rss_sup) == conv) {
-		if (rss_hf == (uint64_t)-1) {
+		if (types == (uint64_t)-1) {
 			/* Include inner RSS by default if supported. */
 			conv |= priv->hw_rss_sup & IBV_RX_HASH_INNER;
 			return conv;
 		}
-		if (!(rss_hf & ~seen))
+		if (!(types & ~seen))
 			return conv;
 	}
 	rte_errno = ENOTSUP;
@@ -717,7 +717,8 @@ mlx4_flow_prepare(struct priv *priv,
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
-			const struct rte_eth_rss_conf *rss_conf;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
 			uint64_t fields;
 			unsigned int i;
 
@@ -747,58 +748,56 @@ mlx4_flow_prepare(struct priv *priv,
 				break;
 			rss = action->conf;
 			/* Default RSS configuration if none is provided. */
-			rss_conf =
-				rss->rss_conf ?
-				rss->rss_conf :
-				&(struct rte_eth_rss_conf){
-					.rss_key = mlx4_rss_hash_key_default,
-					.rss_key_len = MLX4_RSS_HASH_KEY_SIZE,
-					.rss_hf = -1,
-				};
+			if (rss->key_len) {
+				rss_key = rss->key;
+				rss_key_len = rss->key_len;
+			} else {
+				rss_key = mlx4_rss_hash_key_default;
+				rss_key_len = MLX4_RSS_HASH_KEY_SIZE;
+			}
 			/* Sanity checks. */
-			for (i = 0; i < rss->num; ++i)
+			for (i = 0; i < rss->queue_num; ++i)
 				if (rss->queue[i] >=
 				    priv->dev->data->nb_rx_queues)
 					break;
-			if (i != rss->num) {
+			if (i != rss->queue_num) {
 				msg = "queue index target beyond number of"
 					" configured Rx queues";
 				goto exit_action_not_supported;
 			}
-			if (!rte_is_power_of_2(rss->num)) {
+			if (!rte_is_power_of_2(rss->queue_num)) {
 				msg = "for RSS, mlx4 requires the number of"
 					" queues to be a power of two";
 				goto exit_action_not_supported;
 			}
-			if (rss_conf->rss_key_len !=
-			    sizeof(flow->rss->key)) {
+			if (rss_key_len != sizeof(flow->rss->key)) {
 				msg = "mlx4 supports exactly one RSS hash key"
 					" length: "
 					MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE);
 				goto exit_action_not_supported;
 			}
-			for (i = 1; i < rss->num; ++i)
+			for (i = 1; i < rss->queue_num; ++i)
 				if (rss->queue[i] - rss->queue[i - 1] != 1)
 					break;
-			if (i != rss->num) {
+			if (i != rss->queue_num) {
 				msg = "mlx4 requires RSS contexts to use"
 					" consecutive queue indices only";
 				goto exit_action_not_supported;
 			}
-			if (rss->queue[0] % rss->num) {
+			if (rss->queue[0] % rss->queue_num) {
 				msg = "mlx4 requires the first queue of a RSS"
 					" context to be aligned on a multiple"
 					" of the context size";
 				goto exit_action_not_supported;
 			}
 			rte_errno = 0;
-			fields = mlx4_conv_rss_hf(priv, rss_conf->rss_hf);
+			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
 				msg = "unsupported RSS hash type requested";
 				goto exit_action_not_supported;
 			}
 			flow->rss = mlx4_rss_get
-				(priv, fields, rss_conf->rss_key, rss->num,
+				(priv, fields, rss_key, rss->queue_num,
 				 rss->queue);
 			if (!flow->rss) {
 				msg = "either invalid parameters or not enough"
@@ -1284,8 +1283,10 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
+		.types = -1,
+		.key_len = MLX4_RSS_HASH_KEY_SIZE,
+		.queue_num = queues,
+		.key = mlx4_rss_hash_key_default,
 		.queue = queue,
 	};
 	struct rte_flow_action actions[] = {
diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h
index 4e3889e67..7b83d74b0 100644
--- a/drivers/net/mlx4/mlx4_flow.h
+++ b/drivers/net/mlx4/mlx4_flow.h
@@ -47,7 +47,7 @@ struct rte_flow {
 
 /* mlx4_flow.c */
 
-uint64_t mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf);
+uint64_t mlx4_conv_rss_types(struct priv *priv, uint64_t rss_hf);
 int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error);
 void mlx4_flow_clean(struct priv *priv);
 int mlx4_filter_ctrl(struct rte_eth_dev *dev,
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 6be6a0b9a..b430678c7 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -88,7 +88,7 @@ mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = {
  */
 struct mlx4_rss *
 mlx4_rss_get(struct priv *priv, uint64_t fields,
-	     uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+	     const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 	     uint16_t queues, const uint16_t queue_id[])
 {
 	struct mlx4_rss *rss;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index b1af86110..2dfee957f 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -127,7 +127,7 @@ uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
 int mlx4_rss_init(struct priv *priv);
 void mlx4_rss_deinit(struct priv *priv);
 struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
-			      uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+			      const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 			      uint16_t queues, const uint16_t queue_id[]);
 void mlx4_rss_put(struct mlx4_rss *rss);
 int mlx4_rss_attach(struct mlx4_rss *rss);
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 75ea0cbcb..86870b0cb 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -214,9 +214,8 @@ struct rte_flow {
 	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
 	uint32_t mark:1; /**< Set if the flow is marked. */
 	uint32_t drop:1; /**< Drop queue. */
-	uint16_t queues_n; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss rss_conf; /**< RSS configuration */
 	uint16_t (*queues)[]; /**< Queues indexes to use. */
-	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	struct ibv_counter_set *cs; /**< Holds the counters for the rule. */
 	struct mlx5_flow_counter_stats counter_stats;/**<The counter stats. */
@@ -406,9 +405,8 @@ struct mlx5_flow_parse {
 	uint32_t mark:1; /**< Mark is present in the flow. */
 	uint32_t count:1; /**< Count is present in the flow. */
 	uint32_t mark_id; /**< Mark identifier. */
+	struct rte_flow_action_rss rss_conf; /**< RSS configuration */
 	uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queues indexes to use. */
-	uint16_t queues_n; /**< Number of entries in queue[]. */
-	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	enum hash_rxq_type layer; /**< Last pattern layer detected. */
 	struct ibv_counter_set *cs; /**< Holds the counter set for the rule */
@@ -532,47 +530,6 @@ mlx5_flow_item_validate(const struct rte_flow_item *item,
 }
 
 /**
- * Copy the RSS configuration from the user ones, of the rss_conf is null,
- * uses the driver one.
- *
- * @param parser
- *   Internal parser structure.
- * @param rss_conf
- *   User RSS configuration to save.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_flow_convert_rss_conf(struct mlx5_flow_parse *parser,
-			   const struct rte_eth_rss_conf *rss_conf)
-{
-	/*
-	 * This function is also called at the beginning of
-	 * mlx5_flow_convert_actions() to initialize the parser with the
-	 * device default RSS configuration.
-	 */
-	if (rss_conf) {
-		if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) {
-			rte_errno = EINVAL;
-			return -rte_errno;
-		}
-		if (rss_conf->rss_key_len != 40) {
-			rte_errno = EINVAL;
-			return -rte_errno;
-		}
-		if (rss_conf->rss_key_len && rss_conf->rss_key) {
-			parser->rss_conf.rss_key_len = rss_conf->rss_key_len;
-			memcpy(parser->rss_key, rss_conf->rss_key,
-			       rss_conf->rss_key_len);
-			parser->rss_conf.rss_key = parser->rss_key;
-		}
-		parser->rss_conf.rss_hf = rss_conf->rss_hf;
-	}
-	return 0;
-}
-
-/**
  * Extract attribute to the parser.
  *
  * @param[in] attr
@@ -642,17 +599,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	enum { FATE = 1, MARK = 2, COUNT = 4, };
 	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
-	int ret;
 
-	/*
-	 * Add default RSS configuration necessary for Verbs to create QP even
-	 * if no RSS is necessary.
-	 */
-	ret = mlx5_flow_convert_rss_conf(parser,
-					 (const struct rte_eth_rss_conf *)
-					 &priv->rss_conf);
-	if (ret)
-		return ret;
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
@@ -671,25 +618,53 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			parser->queues_n = 1;
 			parser->queues[0] = queue->index;
+			parser->rss_conf = (struct rte_flow_action_rss){
+				.queue_num = 1,
+				.queue = parser->queues,
+			};
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
 			uint16_t n;
 
 			if (overlap & FATE)
 				goto exit_action_overlap;
 			overlap |= FATE;
-			if (!rss || !rss->num) {
+			if (rss->types & MLX5_RSS_HF_MASK) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "unsupported RSS type"
+						   " requested");
+				return -rte_errno;
+			}
+			if (rss->key_len) {
+				rss_key_len = rss->key_len;
+				rss_key = rss->key;
+			} else {
+				rss_key_len = rss_hash_default_key_len;
+				rss_key = rss_hash_default_key;
+			}
+			if (rss_key_len != RTE_DIM(parser->rss_key)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "RSS hash key must be"
+						   " exactly 40 bytes long");
+				return -rte_errno;
+			}
+			if (!rss->queue_num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (rss->num > RTE_DIM(parser->queues)) {
+			if (rss->queue_num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
@@ -697,7 +672,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " context");
 				return -rte_errno;
 			}
-			for (n = 0; n < rss->num; ++n) {
+			for (n = 0; n < rss->queue_num; ++n) {
 				if (rss->queue[n] >= priv->rxqs_n) {
 					rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -707,16 +682,16 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 					return -rte_errno;
 				}
 			}
-			for (n = 0; n < rss->num; ++n)
-				parser->queues[n] = rss->queue[n];
-			parser->queues_n = rss->num;
-			if (mlx5_flow_convert_rss_conf(parser, rss->rss_conf)) {
-				rte_flow_error_set(error, EINVAL,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "wrong RSS configuration");
-				return -rte_errno;
-			}
+			parser->rss_conf = (struct rte_flow_action_rss){
+				.types = rss->types,
+				.key_len = rss_key_len,
+				.queue_num = rss->queue_num,
+				.key = memcpy(parser->rss_key, rss_key,
+					      sizeof(*rss_key) * rss_key_len),
+				.queue = memcpy(parser->queues, rss->queue,
+						sizeof(*rss->queue) *
+						rss->queue_num),
+			};
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) {
 			const struct rte_flow_action_mark *mark =
 				(const struct rte_flow_action_mark *)
@@ -761,7 +736,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
-	if (!parser->queues_n && !parser->drop) {
+	if (!parser->rss_conf.queue_num && !parser->drop) {
 		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "no valid action");
 		return -rte_errno;
@@ -941,7 +916,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 	unsigned int i;
 
 	/* Remove any other flow not matching the pattern. */
-	if (parser->queues_n == 1 && !parser->rss_conf.rss_hf) {
+	if (parser->rss_conf.queue_num == 1 && !parser->rss_conf.types) {
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			if (i == HASH_RXQ_ETH)
 				continue;
@@ -969,7 +944,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 	}
 	/* Remove impossible flow according to the RSS configuration. */
 	if (hash_rxq_init[parser->layer].dpdk_rss_hf &
-	    parser->rss_conf.rss_hf) {
+	    parser->rss_conf.types) {
 		/* Remove any other flow. */
 		for (i = hmin; i != (hmax + 1); ++i) {
 			if ((i == parser->layer) ||
@@ -980,7 +955,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 		}
 	} else  if (!parser->queue[ip].ibv_attr) {
 		/* no RSS possible with the current configuration. */
-		parser->queues_n = 1;
+		parser->rss_conf.queue_num = 1;
 		return;
 	}
 fill:
@@ -1109,7 +1084,7 @@ mlx5_flow_convert(struct rte_eth_dev *dev,
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			unsigned int offset;
 
-			if (!(parser->rss_conf.rss_hf &
+			if (!(parser->rss_conf.types &
 			      hash_rxq_init[i].dpdk_rss_hf) &&
 			    (i != HASH_RXQ_ETH))
 				continue;
@@ -1777,20 +1752,20 @@ mlx5_flow_create_action_queue_rss(struct rte_eth_dev *dev,
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_get(dev,
-				      parser->rss_conf.rss_key,
-				      parser->rss_conf.rss_key_len,
+				      parser->rss_conf.key,
+				      parser->rss_conf.key_len,
 				      hash_fields,
-				      parser->queues,
-				      parser->queues_n);
+				      parser->rss_conf.queue,
+				      parser->rss_conf.queue_num);
 		if (flow->frxq[i].hrxq)
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_new(dev,
-				      parser->rss_conf.rss_key,
-				      parser->rss_conf.rss_key_len,
+				      parser->rss_conf.key,
+				      parser->rss_conf.key_len,
 				      hash_fields,
-				      parser->queues,
-				      parser->queues_n);
+				      parser->rss_conf.queue,
+				      parser->rss_conf.queue_num);
 		if (!flow->frxq[i].hrxq) {
 			return rte_flow_error_set(error, ENOMEM,
 						  RTE_FLOW_ERROR_TYPE_HANDLE,
@@ -1861,9 +1836,9 @@ mlx5_flow_create_action_queue(struct rte_eth_dev *dev,
 				   NULL, "internal error in flow creation");
 		goto error;
 	}
-	for (i = 0; i != parser->queues_n; ++i) {
+	for (i = 0; i != parser->rss_conf.queue_num; ++i) {
 		struct mlx5_rxq_data *q =
-			(*priv->rxqs)[parser->queues[i]];
+			(*priv->rxqs)[parser->rss_conf.queue[i]];
 
 		q->mark |= parser->mark;
 	}
@@ -1927,7 +1902,8 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	if (ret)
 		goto exit;
 	flow = rte_calloc(__func__, 1,
-			  sizeof(*flow) + parser.queues_n * sizeof(uint16_t),
+			  sizeof(*flow) +
+			  parser.rss_conf.queue_num * sizeof(uint16_t),
 			  0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM,
@@ -1936,15 +1912,20 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 				   "cannot allocate flow memory");
 		return NULL;
 	}
-	/* Copy queues configuration. */
+	/* Copy configuration. */
 	flow->queues = (uint16_t (*)[])(flow + 1);
-	memcpy(flow->queues, parser.queues, parser.queues_n * sizeof(uint16_t));
-	flow->queues_n = parser.queues_n;
+	flow->rss_conf = (struct rte_flow_action_rss){
+		.types = parser.rss_conf.types,
+		.key_len = parser.rss_conf.key_len,
+		.queue_num = parser.rss_conf.queue_num,
+		.key = memcpy(flow->rss_key, parser.rss_conf.key,
+			      sizeof(*parser.rss_conf.key) *
+			      parser.rss_conf.key_len),
+		.queue = memcpy(flow->queues, parser.rss_conf.queue,
+				sizeof(*parser.rss_conf.queue) *
+				parser.rss_conf.queue_num),
+	};
 	flow->mark = parser.mark;
-	/* Copy RSS configuration. */
-	flow->rss_conf = parser.rss_conf;
-	flow->rss_conf.rss_key = flow->rss_key;
-	memcpy(flow->rss_key, parser.rss_key, parser.rss_conf.rss_key_len);
 	/* finalise the flow. */
 	if (parser.drop)
 		ret = mlx5_flow_create_action_queue_drop(dev, &parser, flow,
@@ -2024,7 +2005,7 @@ mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
 
 	if (flow->drop || !flow->mark)
 		goto free;
-	for (i = 0; i != flow->queues_n; ++i) {
+	for (i = 0; i != flow->rss_conf.queue_num; ++i) {
 		struct rte_flow *tmp;
 		int mark = 0;
 
@@ -2334,19 +2315,19 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 			if (!flow->frxq[i].ibv_attr)
 				continue;
 			flow->frxq[i].hrxq =
-				mlx5_hrxq_get(dev, flow->rss_conf.rss_key,
-					      flow->rss_conf.rss_key_len,
+				mlx5_hrxq_get(dev, flow->rss_conf.key,
+					      flow->rss_conf.key_len,
 					      hash_rxq_init[i].hash_fields,
-					      (*flow->queues),
-					      flow->queues_n);
+					      flow->rss_conf.queue,
+					      flow->rss_conf.queue_num);
 			if (flow->frxq[i].hrxq)
 				goto flow_create;
 			flow->frxq[i].hrxq =
-				mlx5_hrxq_new(dev, flow->rss_conf.rss_key,
-					      flow->rss_conf.rss_key_len,
+				mlx5_hrxq_new(dev, flow->rss_conf.key,
+					      flow->rss_conf.key_len,
 					      hash_rxq_init[i].hash_fields,
-					      (*flow->queues),
-					      flow->queues_n);
+					      flow->rss_conf.queue,
+					      flow->rss_conf.queue_num);
 			if (!flow->frxq[i].hrxq) {
 				DRV_LOG(DEBUG,
 					"port %u flow %p cannot be applied",
@@ -2370,8 +2351,8 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 		}
 		if (!flow->mark)
 			continue;
-		for (i = 0; i != flow->queues_n; ++i)
-			(*priv->rxqs)[(*flow->queues)[i]]->mark = 1;
+		for (i = 0; i != flow->rss_conf.queue_num; ++i)
+			(*priv->rxqs)[flow->rss_conf.queue[i]]->mark = 1;
 	}
 	return 0;
 }
@@ -2448,8 +2429,10 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	};
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
-		.rss_conf = &priv->rss_conf,
-		.num = priv->reta_idx_n,
+		.types = priv->rss_conf.rss_hf,
+		.key_len = priv->rss_conf.rss_key_len,
+		.queue_num = priv->reta_idx_n,
+		.key = priv->rss_conf.rss_key,
 		.queue = queue,
 	};
 	struct rte_flow_action actions[] = {
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index eda3ba3d5..18ad40813 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1218,8 +1218,8 @@ mlx5_rxq_verify(struct rte_eth_dev *dev)
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
  */
 struct mlx5_ind_table_ibv *
-mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, uint16_t queues[],
-		       uint16_t queues_n)
+mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, const uint16_t *queues,
+		       uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_ind_table_ibv *ind_tbl;
@@ -1286,8 +1286,8 @@ mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, uint16_t queues[],
  *   An indirection table if found.
  */
 struct mlx5_ind_table_ibv *
-mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, uint16_t queues[],
-		       uint16_t queues_n)
+mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, const uint16_t *queues,
+		       uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_ind_table_ibv *ind_tbl;
@@ -1391,8 +1391,10 @@ mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev)
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
  */
 struct mlx5_hrxq *
-mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
-	      uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+mlx5_hrxq_new(struct rte_eth_dev *dev,
+	      const uint8_t *rss_key, uint32_t rss_key_len,
+	      uint64_t hash_fields,
+	      const uint16_t *queues, uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
@@ -1419,7 +1421,7 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
 			.rx_hash_conf = (struct ibv_rx_hash_conf){
 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
 				.rx_hash_key_len = rss_key_len,
-				.rx_hash_key = rss_key,
+				.rx_hash_key = (void *)(uintptr_t)rss_key,
 				.rx_hash_fields_mask = hash_fields,
 			},
 			.rwq_ind_tbl = ind_tbl->ind_table,
@@ -1469,8 +1471,10 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
  *   An hash Rx queue on success.
  */
 struct mlx5_hrxq *
-mlx5_hrxq_get(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
-	      uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+mlx5_hrxq_get(struct rte_eth_dev *dev,
+	      const uint8_t *rss_key, uint32_t rss_key_len,
+	      uint64_t hash_fields,
+	      const uint16_t *queues, uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 2309aa4f3..ee534c340 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -134,7 +134,7 @@ struct mlx5_ind_table_ibv {
 	LIST_ENTRY(mlx5_ind_table_ibv) next; /* Pointer to the next element. */
 	rte_atomic32_t refcnt; /* Reference counter. */
 	struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */
-	uint16_t queues_n; /**< Number of queues in the list. */
+	uint32_t queues_n; /**< Number of queues in the list. */
 	uint16_t queues[]; /**< Queue list. */
 };
 
@@ -145,7 +145,7 @@ struct mlx5_hrxq {
 	struct mlx5_ind_table_ibv *ind_table; /* Indirection table. */
 	struct ibv_qp *qp; /* Verbs queue pair. */
 	uint64_t hash_fields; /* Verbs Hash fields. */
-	uint8_t rss_key_len; /* Hash key length in bytes. */
+	uint32_t rss_key_len; /* Hash key length in bytes. */
 	uint8_t rss_key[]; /* Hash key. */
 };
 
@@ -237,20 +237,22 @@ int mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx);
 int mlx5_rxq_verify(struct rte_eth_dev *dev);
 int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl);
 struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_new(struct rte_eth_dev *dev,
-						  uint16_t queues[],
-						  uint16_t queues_n);
+						  const uint16_t *queues,
+						  uint32_t queues_n);
 struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_get(struct rte_eth_dev *dev,
-						  uint16_t queues[],
-						  uint16_t queues_n);
+						  const uint16_t *queues,
+						  uint32_t queues_n);
 int mlx5_ind_table_ibv_release(struct rte_eth_dev *dev,
 			       struct mlx5_ind_table_ibv *ind_tbl);
 int mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev);
-struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key,
-				uint8_t rss_key_len, uint64_t hash_fields,
-				uint16_t queues[], uint16_t queues_n);
-struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev, uint8_t *rss_key,
-				uint8_t rss_key_len, uint64_t hash_fields,
-				uint16_t queues[], uint16_t queues_n);
+struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev,
+				const uint8_t *rss_key, uint32_t rss_key_len,
+				uint64_t hash_fields,
+				const uint16_t *queues, uint32_t queues_n);
+struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev,
+				const uint8_t *rss_key, uint32_t rss_key_len,
+				uint64_t hash_fields,
+				const uint16_t *queues, uint32_t queues_n);
 int mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hxrq);
 int mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev);
 uint64_t mlx5_get_rx_port_offloads(void);
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 056405515..1a2c0299c 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	struct sfc_rxq *rxq;
 	unsigned int rxq_hw_index_min;
 	unsigned int rxq_hw_index_max;
-	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
-	uint64_t rss_hf;
-	uint8_t *rss_key = NULL;
+	const uint8_t *rss_key;
 	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
 	unsigned int i;
 
-	if (rss->num == 0)
+	if (rss->queue_num == 0)
 		return -EINVAL;
 
 	rxq_sw_index = sa->rxq_count - 1;
@@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	rxq_hw_index_min = rxq->hw_index;
 	rxq_hw_index_max = 0;
 
-	for (i = 0; i < rss->num; ++i) {
+	for (i = 0; i < rss->queue_num; ++i) {
 		rxq_sw_index = rss->queue[i];
 
 		if (rxq_sw_index >= sa->rxq_count)
@@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 			rxq_hw_index_max = rxq->hw_index;
 	}
 
-	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;
-	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
+	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
-	if (rss_conf != NULL) {
-		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
+	if (rss->key_len) {
+		if (rss->key_len != sizeof(sa->rss_key))
 			return -EINVAL;
 
-		rss_key = rss_conf->rss_key;
+		rss_key = rss->key;
 	} else {
 		rss_key = sa->rss_key;
 	}
@@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 
 	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
 	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
-	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
+	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
 	rte_memcpy(sfc_rss_conf->rss_key, rss_key, sizeof(sa->rss_key));
 
 	for (i = 0; i < RTE_DIM(sfc_rss_conf->rss_tbl); ++i) {
-		unsigned int rxq_sw_index = rss->queue[i % rss->num];
+		unsigned int rxq_sw_index = rss->queue[i % rss->queue_num];
 		struct sfc_rxq *rxq = sa->rxq_info[rxq_sw_index].rxq;
 
 		sfc_rss_conf->rss_tbl[i] = rxq->hw_index - rxq_hw_index_min;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index fe2f94010..67146aaba 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1215,7 +1215,7 @@ priv_flow_process(struct pmd_internals *pmd,
 				if (err)
 					goto exit_action_not_supported;
 			}
-			if (flow && rss)
+			if (flow)
 				err = rss_add_actions(flow, pmd, rss, error);
 		} else {
 			goto exit_action_not_supported;
@@ -2050,7 +2050,7 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 			   struct rte_flow_error *error)
 {
 	/* 4096 is the maximum number of instructions for a BPF program */
-	int i;
+	unsigned int i;
 	int err;
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
@@ -2066,8 +2066,8 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	}
 
 	/* Update RSS map entry with queues */
-	rss_entry.nb_queues = rss->num;
-	for (i = 0; i < rss->num; i++)
+	rss_entry.nb_queues = rss->queue_num;
+	for (i = 0; i < rss->queue_num; i++)
 		rss_entry.queues[i] = rss->queue[i];
 	rss_entry.hash_fields =
 		(1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 8b2047adb..3ce76c413 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -202,9 +202,13 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
 						queue[j++] = i;
-				action_rss.rss_conf = &rss_conf;
-				action_rss.num = j;
-				action_rss.queue = queue;
+				action_rss = (struct rte_flow_action_rss){
+					.types = rss_conf.rss_hf,
+					.key_len = rss_conf.rss_key_len,
+					.queue_num = j,
+					.key = rss_key,
+					.queue = queue,
+				};
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index bb19e28c6..cc7819b6a 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -330,40 +330,27 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
-				.num = src.rss->num,
+				.types = src.rss->types,
+				.key_len = src.rss->key_len,
+				.queue_num = src.rss->queue_num,
 			};
 		off += sizeof(*src.rss);
-		if (src.rss->num) {
+		if (src.rss->key_len) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->queue) * src.rss->num;
+			size = sizeof(*src.rss->key) * src.rss->key_len;
 			if (dst.rss)
-				dst.rss->queue = memcpy
+				dst.rss->key = memcpy
 					((void *)((uintptr_t)dst.rss + off),
-					 src.rss->queue, size);
+					 src.rss->key, size);
 			off += size;
 		}
-		off = RTE_ALIGN_CEIL(off, sizeof(double));
-		if (dst.rss) {
-			dst.rss->rss_conf = (void *)((uintptr_t)dst.rss + off);
-			*(struct rte_eth_rss_conf *)(uintptr_t)
-				dst.rss->rss_conf = (struct rte_eth_rss_conf){
-				.rss_key_len = src.rss->rss_conf->rss_key_len,
-				.rss_hf = src.rss->rss_conf->rss_hf,
-			};
-		}
-		off += sizeof(*src.rss->rss_conf);
-		if (src.rss->rss_conf->rss_key_len) {
+		if (src.rss->queue_num) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->rss_conf->rss_key) *
-				src.rss->rss_conf->rss_key_len;
-			if (dst.rss) {
-				((struct rte_eth_rss_conf *)(uintptr_t)
-				 dst.rss->rss_conf)->rss_key =
-					(void *)((uintptr_t)dst.rss + off);
-				memcpy(dst.rss->rss_conf->rss_key,
-				       src.rss->rss_conf->rss_key,
-				       size);
-			}
+			size = sizeof(*src.rss->queue) * src.rss->queue_num;
+			if (dst.rss)
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		size = off;
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index ad2e55b8e..bbc408fa6 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1033,13 +1033,21 @@ struct rte_flow_query_count {
  * Similar to QUEUE, except RSS is additionally performed on packets to
  * spread them among several queues according to the provided parameters.
  *
+ * Unlike global RSS settings used by other DPDK APIs, unsetting the
+ * @p types field does not disable RSS in a flow rule. Doing so instead
+ * requests safe unspecified "best-effort" settings from the underlying PMD,
+ * which depending on the flow rule, may result in anything ranging from
+ * empty (single queue) to all-inclusive RSS.
+ *
  * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps
  * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only,
  * both can be requested simultaneously.
  */
 struct rte_flow_action_rss {
-	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in @p queue. */
+	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
+	uint32_t key_len; /**< Hash key length in bytes. */
+	uint32_t queue_num; /**< Number of entries in @p queue. */
+	const uint8_t *key; /**< Hash key. */
 	const uint16_t *queue; /**< Queue indices to use. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (2 preceding siblings ...)
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
@ 2018-04-16 16:22  1%       ` Adrien Mazarguil
  2018-04-17 20:18  0%         ` Thomas Monjalon
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
                         ` (9 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

This patch replaces C99-style flexible arrays in struct rte_flow_action_rss
and struct rte_flow_item_raw with standard pointers to the same data.

They proved difficult to use in the field (e.g. no possibility of static
initialization) and unsuitable for C++ applications.

Affected PMDs and examples are updated accordingly.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c        | 117 +++++++++++++++++---------------
 app/test-pmd/config.c              |  25 ++++---
 doc/guides/prog_guide/rte_flow.rst |  18 ++---
 drivers/net/mlx4/mlx4_flow.c       |  22 +++---
 drivers/net/mlx5/mlx5_flow.c       |  20 +++---
 examples/ipsec-secgw/ipsec.c       |  17 ++---
 lib/librte_ether/rte_flow.c        |  25 ++++---
 lib/librte_ether/rte_flow.h        |   8 ++-
 8 files changed, 135 insertions(+), 117 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 2ddb08feb..798b7948d 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -179,25 +179,22 @@ enum index {
 	ACTION_METER_ID,
 };
 
-/** Size of pattern[] field in struct rte_flow_item_raw. */
-#define ITEM_RAW_PATTERN_SIZE 36
+/** Maximum size for pattern in struct rte_flow_item_raw. */
+#define ITEM_RAW_PATTERN_SIZE 40
 
 /** Storage size for struct rte_flow_item_raw including pattern. */
 #define ITEM_RAW_SIZE \
-	(offsetof(struct rte_flow_item_raw, pattern) + ITEM_RAW_PATTERN_SIZE)
+	(sizeof(struct rte_flow_item_raw) + ITEM_RAW_PATTERN_SIZE)
 
 /** Maximum number of queue indices in struct rte_flow_action_rss. */
 #define ACTION_RSS_QUEUE_NUM 32
 
 /** Storage for struct rte_flow_action_rss including external data. */
-union action_rss_data {
+struct action_rss_data {
 	struct rte_flow_action_rss conf;
-	struct {
-		uint8_t conf_data[offsetof(struct rte_flow_action_rss, queue)];
-		uint16_t queue[ACTION_RSS_QUEUE_NUM];
-		struct rte_eth_rss_conf rss_conf;
-		uint8_t rss_key[RSS_HASH_KEY_LENGTH];
-	} s;
+	uint16_t queue[ACTION_RSS_QUEUE_NUM];
+	struct rte_eth_rss_conf rss_conf;
+	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -320,13 +317,6 @@ struct token {
 		.size = sizeof(*((s *)0)->f), \
 	})
 
-/** Static initializer for ARGS() with arbitrary size. */
-#define ARGS_ENTRY_USZ(s, f, sz) \
-	(&(const struct arg){ \
-		.offset = offsetof(s, f), \
-		.size = (sz), \
-	})
-
 /** Static initializer for ARGS() with arbitrary offset and size. */
 #define ARGS_ENTRY_ARB(o, s) \
 	(&(const struct arg){ \
@@ -1105,9 +1095,9 @@ static const struct token token_list[] = {
 			     NEXT_ENTRY(ITEM_PARAM_IS,
 					ITEM_PARAM_SPEC,
 					ITEM_PARAM_MASK)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, length),
-			     ARGS_ENTRY_USZ(struct rte_flow_item_raw,
-					    pattern,
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, pattern),
+			     ARGS_ENTRY(struct rte_flow_item_raw, length),
+			     ARGS_ENTRY_ARB(sizeof(struct rte_flow_item_raw),
 					    ITEM_RAW_PATTERN_SIZE)),
 	},
 	[ITEM_ETH] = {
@@ -1591,7 +1581,7 @@ static const struct token token_list[] = {
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
-		.priv = PRIV_ACTION(RSS, sizeof(union action_rss_data)),
+		.priv = PRIV_ACTION(RSS, sizeof(struct action_rss_data)),
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
@@ -1610,23 +1600,21 @@ static const struct token token_list[] = {
 		.name = "key",
 		.help = "RSS hash key",
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
-		.args = ARGS(ARGS_ENTRY_ARB
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
+			     ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len)),
-			     ARGS_ENTRY_ARB
-			     (((uintptr_t)((union action_rss_data *)0)->
-			       s.rss_key),
-			      RSS_HASH_KEY_LENGTH)),
+			     ARGS_ENTRY(struct action_rss_data, rss_key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len),
 			      0,
@@ -2067,7 +2055,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 {
 	struct buffer *out = buf;
 	struct rte_flow_action *action;
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 	int ret;
 
@@ -2085,29 +2073,29 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	ctx->objmask = NULL;
 	/* Set up default configuration. */
 	action_rss_data = ctx->object;
-	*action_rss_data = (union action_rss_data){
+	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->s.rss_conf,
+			.rss_conf = &action_rss_data->rss_conf,
 			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.queue = action_rss_data->queue,
 		},
+		.queue = { 0 },
+		.rss_conf = (struct rte_eth_rss_conf){
+			.rss_key = action_rss_data->rss_key,
+			.rss_key_len = sizeof(action_rss_data->rss_key),
+			.rss_hf = rss_hf,
+		},
+		.rss_key = "testpmd's default RSS hash key",
 	};
-	action_rss_data->s.rss_conf = (struct rte_eth_rss_conf){
-		.rss_key = action_rss_data->s.rss_key,
-		.rss_key_len = sizeof(action_rss_data->s.rss_key),
-		.rss_hf = rss_hf,
-	};
-	strncpy((void *)action_rss_data->s.rss_key,
-		"testpmd's default RSS hash key",
-		sizeof(action_rss_data->s.rss_key));
 	for (i = 0; i < action_rss_data->conf.num; ++i)
-		action_rss_data->conf.queue[i] = i;
+		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
 		struct rte_eth_dev_info info;
 
 		rte_eth_dev_info_get(ctx->port, &info);
-		action_rss_data->s.rss_conf.rss_key_len =
-			RTE_MIN(sizeof(action_rss_data->s.rss_key),
+		action_rss_data->rss_conf.rss_key_len =
+			RTE_MIN(sizeof(action_rss_data->rss_key),
 				info.hash_key_size);
 	}
 	action->conf = &action_rss_data->conf;
@@ -2125,7 +2113,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_TYPE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 
 	(void)token;
@@ -2135,7 +2123,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->s.rss_conf.rss_hf = 0;
+		action_rss_data->rss_conf.rss_hf = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2154,7 +2142,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->s.rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2169,7 +2157,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_QUEUE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	int ret;
 	int i;
 
@@ -2186,10 +2174,9 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (i >= ACTION_RSS_QUEUE_NUM)
 		return -1;
 	if (push_args(ctx,
-		      ARGS_ENTRY_ARB(offsetof(struct rte_flow_action_rss,
-					      queue) +
-				     i * sizeof(action_rss_data->s.queue[i]),
-				     sizeof(action_rss_data->s.queue[i]))))
+		      ARGS_ENTRY_ARB(offsetof(struct action_rss_data, queue) +
+				     i * sizeof(action_rss_data->queue[i]),
+				     sizeof(action_rss_data->queue[i]))))
 		return -1;
 	ret = parse_int(ctx, token, str, len, NULL, 0);
 	if (ret < 0) {
@@ -2206,6 +2193,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 		return len;
 	action_rss_data = ctx->object;
 	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
 
@@ -2483,8 +2471,8 @@ parse_int(struct context *ctx, const struct token *token,
 /**
  * Parse a string.
  *
- * Two arguments (ctx->args) are retrieved from the stack to store data and
- * its length (in that order).
+ * Three arguments (ctx->args) are retrieved from the stack to store data,
+ * its actual length and address (in that order).
  */
 static int
 parse_string(struct context *ctx, const struct token *token,
@@ -2493,6 +2481,7 @@ parse_string(struct context *ctx, const struct token *token,
 {
 	const struct arg *arg_data = pop_args(ctx);
 	const struct arg *arg_len = pop_args(ctx);
+	const struct arg *arg_addr = pop_args(ctx);
 	char tmp[16]; /* Ought to be enough. */
 	int ret;
 
@@ -2503,6 +2492,11 @@ parse_string(struct context *ctx, const struct token *token,
 		push_args(ctx, arg_data);
 		return -1;
 	}
+	if (!arg_addr) {
+		push_args(ctx, arg_len);
+		push_args(ctx, arg_data);
+		return -1;
+	}
 	size = arg_data->size;
 	/* Bit-mask fill is not supported. */
 	if (arg_data->mask || size < len)
@@ -2525,8 +2519,23 @@ parse_string(struct context *ctx, const struct token *token,
 	memset((uint8_t *)buf + len, 0x00, size - len);
 	if (ctx->objmask)
 		memset((uint8_t *)ctx->objmask + arg_data->offset, 0xff, len);
+	/* Save address if requested. */
+	if (arg_addr->size) {
+		memcpy((uint8_t *)ctx->object + arg_addr->offset,
+		       (void *[]){
+			(uint8_t *)ctx->object + arg_data->offset
+		       },
+		       arg_addr->size);
+		if (ctx->objmask)
+			memcpy((uint8_t *)ctx->objmask + arg_addr->offset,
+			       (void *[]){
+				(uint8_t *)ctx->objmask + arg_data->offset
+			       },
+			       arg_addr->size);
+	}
 	return len;
 error:
+	push_args(ctx, arg_addr);
 	push_args(ctx, arg_len);
 	push_args(ctx, arg_data);
 	return -1;
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index d0d372797..95618e4eb 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -977,7 +977,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -1026,14 +1026,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = flow_item[item->type].size;
@@ -1065,7 +1071,7 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
@@ -1096,11 +1102,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 80360d068..acbeaacbd 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1309,15 +1309,15 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+------------------------------+
-   | Field        | Value                        |
-   +==============+==============================+
-   | ``rss_conf`` | RSS parameters               |
-   +--------------+------------------------------+
-   | ``num``      | number of entries in queue[] |
-   +--------------+------------------------------+
-   | ``queue[]``  | queue indices to use         |
-   +--------------+------------------------------+
+   +--------------+--------------------------------+
+   | Field        | Value                          |
+   +==============+================================+
+   | ``rss_conf`` | RSS parameters                 |
+   +--------------+--------------------------------+
+   | ``num``      | number of entries in ``queue`` |
+   +--------------+--------------------------------+
+   | ``queue``    | queue indices to use           |
+   +--------------+--------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 15cdf07b7..8feb6ae31 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -1282,14 +1282,16 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	 */
 	uint32_t queues =
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
-	alignas(struct rte_flow_action_rss) uint8_t rss_conf_data
-		[offsetof(struct rte_flow_action_rss, queue) +
-		 sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues];
-	struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data;
+	uint16_t queue[queues];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = NULL, /* Rely on default fallback settings. */
+		.num = queues,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
-			.conf = rss_conf,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -1311,12 +1313,8 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	if (!queues)
 		goto error;
 	/* Prepare default RSS configuration. */
-	*rss_conf = (struct rte_flow_action_rss){
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
-	};
 	for (i = 0; i != queues; ++i)
-		rss_conf->queue[i] = i;
+		queue[i] = i;
 	/*
 	 * Set up VLAN item if filtering is enabled and at least one VLAN
 	 * filter is configured.
@@ -1375,7 +1373,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 			if (j != sizeof(mac->addr_bytes))
 				continue;
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				continue;
 			break;
@@ -1415,7 +1413,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		if (flow && flow->internal) {
 			assert(flow->rss);
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				flow = NULL;
 		}
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 9923bfa59..75ea0cbcb 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -2446,9 +2446,16 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 			.type = RTE_FLOW_ITEM_TYPE_END,
 		},
 	};
+	uint16_t queue[priv->reta_idx_n];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = &priv->rss_conf,
+		.num = priv->reta_idx_n,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -2457,24 +2464,13 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	struct rte_flow *flow;
 	struct rte_flow_error error;
 	unsigned int i;
-	union {
-		struct rte_flow_action_rss rss;
-		struct {
-			const struct rte_eth_rss_conf *rss_conf;
-			uint16_t num;
-			uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-		} local;
-	} action_rss;
 
 	if (!priv->reta_idx_n) {
 		rte_errno = EINVAL;
 		return -rte_errno;
 	}
 	for (i = 0; i != priv->reta_idx_n; ++i)
-		action_rss.local.queue[i] = (*priv->reta_idx)[i];
-	action_rss.local.rss_conf = &priv->rss_conf;
-	action_rss.local.num = priv->reta_idx_n;
-	actions[0].conf = (const void *)&action_rss.rss;
+		queue[i] = (*priv->reta_idx)[i];
 	flow = mlx5_flow_list_create(dev, &priv->ctrl_flows, &attr, items,
 				     actions, &error);
 	if (!flow)
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 5fb5bc16e..8b2047adb 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -186,14 +186,8 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 					.rss_key_len = 40,
 				};
 				struct rte_eth_dev *eth_dev;
-				union {
-					struct rte_flow_action_rss rss;
-					struct {
-					const struct rte_eth_rss_conf *rss_conf;
-					uint16_t num;
-					uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-					} local;
-				} action_rss;
+				uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
+				struct rte_flow_action_rss action_rss;
 				unsigned int i;
 				unsigned int j;
 
@@ -207,9 +201,10 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				for (i = 0, j = 0;
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
-						action_rss.local.queue[j++] = i;
-				action_rss.local.num = j;
-				action_rss.local.rss_conf = &rss_conf;
+						queue[j++] = i;
+				action_rss.rss_conf = &rss_conf;
+				action_rss.num = j;
+				action_rss.queue = queue;
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 80f9cb6cb..bb19e28c6 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,7 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -73,7 +73,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 };
@@ -282,14 +282,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = rte_flow_desc_item[item->type].size;
@@ -326,11 +332,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 96184f030..ad2e55b8e 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -14,6 +14,7 @@
  * associated actions in hardware through flow rules.
  */
 
+#include <stddef.h>
 #include <stdint.h>
 
 #include <rte_arp.h>
@@ -432,7 +433,7 @@ struct rte_flow_item_raw {
 	int32_t offset; /**< Absolute or relative offset for pattern. */
 	uint16_t limit; /**< Search area limit for start of pattern. */
 	uint16_t length; /**< Pattern length. */
-	uint8_t pattern[]; /**< Byte string to look for. */
+	const uint8_t *pattern; /**< Byte string to look for. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_RAW. */
@@ -444,6 +445,7 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
 	.offset = 0xffffffff,
 	.limit = 0xffff,
 	.length = 0xffff,
+	.pattern = NULL,
 };
 #endif
 
@@ -1037,8 +1039,8 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
-	uint16_t queue[]; /**< Queues indices to use. */
+	uint16_t num; /**< Number of entries in @p queue. */
+	const uint16_t *queue; /**< Queue indices to use. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API Adrien Mazarguil
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 04/16] ethdev: remove DUP action from " Adrien Mazarguil
@ 2018-04-16 16:22  1%       ` Adrien Mazarguil
  2018-04-18 12:26  0%         ` Andrew Rybchenko
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
                         ` (10 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Wenzhuo Lu, John Daley, Gaetan Rivet, Beilei Xing,
	Konstantin Ananyev, Nelio Laranjeiro, Andrew Rybchenko,
	Pascal Mazon

This patch makes the following changes to flow rule actions:

- List order now matters, they are redefined as performed first to last
  instead of "all simultaneously".

- Repeated actions are now supported (e.g. specifying QUEUE multiple times
  now duplicates traffic among them). Previously only the last action of
  any given kind was taken into account.

- No more distinction between terminating/non-terminating/meta actions.
  Flow rules themselves are now defined as always terminating unless a
  PASSTHRU action is specified.

These changes alter the behavior of flow rules in corner cases in order to
prepare the flow API for actions that modify traffic contents or properties
(e.g. encapsulation, compression) and for which order matter when combined.

Previously one would have to do so through multiple flow rules by combining
PASSTRHU with priority levels, however this proved overly complex to
implement at the PMD level, hence this simpler approach.

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

PMDs with rte_flow support are modified accordingly:

- bnxt: no change, implementation already forbids multiple actions and does
  not support PASSTHRU.

- e1000: no change, same as bnxt.

- enic: modified to forbid redundant actions, no support for default drop.

- failsafe: no change needed.

- i40e: no change, implementation already forbids multiple actions.

- ixgbe: same as i40e.

- mlx4: modified to forbid multiple fate-deciding actions and drop when
  unspecified.

- mlx5: same as mlx4, with other redundant actions also forbidden.

- sfc: same as mlx4.

- tap: implementation already complies with the new behavior except for
  the default pass-through modified as a default drop.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@oktetlabs.ru>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: John Daley <johndale@cisco.com>
Cc: Gaetan Rivet <gaetan.rivet@6wind.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 doc/guides/prog_guide/rte_flow.rst | 67 +++++++++++++-------------------
 drivers/net/enic/enic_flow.c       | 25 ++++++++++++
 drivers/net/mlx4/mlx4_flow.c       | 21 +++++++---
 drivers/net/mlx5/mlx5_flow.c       | 69 ++++++++++++++-------------------
 drivers/net/sfc/sfc_flow.c         | 22 +++++++----
 drivers/net/tap/tap_flow.c         | 11 ++++++
 lib/librte_ether/rte_flow.h        | 54 +++++++-------------------
 7 files changed, 138 insertions(+), 131 deletions(-)

diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a237e4fd2..80360d068 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -995,28 +995,27 @@ Actions
 
 Each possible action is represented by a type. Some have associated
 configuration structures. Several actions combined in a list can be assigned
-to a flow rule. That list is not ordered.
+to a flow rule and are performed in order.
 
 They fall in three categories:
 
-- Terminating actions that prevent processing matched packets by subsequent
-  flow rules, unless overridden with PASSTHRU.
+- Actions that modify the fate of matching traffic, for instance by dropping
+  or assigning it a specific destination.
 
-- Non-terminating actions that leave matched packets up for additional
-  processing by subsequent flow rules.
+- Actions that modify matching traffic contents or its properties. This
+  includes adding/removing encapsulation, encryption, compression and marks.
 
-- Other non-terminating meta actions that do not affect the fate of packets.
+- Actions related to the flow rule itself, such as updating counters or
+  making it non-terminating.
 
-When several actions are combined in a flow rule, they should all have
-different types (e.g. dropping a packet twice is not possible).
+Flow rules being terminating by default, not specifying any action of the
+fate kind results in undefined behavior. This applies to both ingress and
+egress.
 
-Only the last action of a given type is taken into account. PMDs still
-perform error checking on the entire list.
+PASSTHRU, when supported, makes a flow rule non-terminating.
 
 Like matching patterns, action lists are terminated by END items.
 
-*Note that PASSTHRU is the only action able to override a terminating rule.*
-
 Example of action that redirects packets to queue index 10:
 
 .. _table_rte_flow_action_example:
@@ -1029,12 +1028,11 @@ Example of action that redirects packets to queue index 10:
    | ``index`` | 10    |
    +-----------+-------+
 
-Action lists examples, their order is not significant, applications must
-consider all actions to be performed simultaneously:
+Actions are performed in list order:
 
-.. _table_rte_flow_count_and_drop:
+.. _table_rte_flow_count_then_drop:
 
-.. table:: Count and drop
+.. table:: Count then drop
 
    +-------+--------+
    | Index | Action |
@@ -1050,7 +1048,7 @@ consider all actions to be performed simultaneously:
 
 .. _table_rte_flow_mark_count_redirect:
 
-.. table:: Mark, count and redirect
+.. table:: Mark, count then redirect
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1080,12 +1078,15 @@ consider all actions to be performed simultaneously:
    | 2     | END                        |
    +-------+----------------------------+
 
-In the above example, considering both actions are performed simultaneously,
-the end result is that only QUEUE has any effect.
+In the above example, while DROP and QUEUE must be performed in order, both
+have to happen before reaching END. Only QUEUE has a visible effect.
+
+Note that such a list may be thought as ambiguous and rejected on that
+basis.
 
-.. _table_rte_flow_redirect_queue_3:
+.. _table_rte_flow_redirect_queue_5_3:
 
-.. table:: Redirect to queue 3
+.. table:: Redirect to queues 5 and 3
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1099,9 +1100,9 @@ the end result is that only QUEUE has any effect.
    | 3     | END                        |
    +-------+----------------------------+
 
-As previously described, only the last action of a given type found in the
-list is taken into account. The above example also shows that VOID is
-ignored.
+As previously described, all actions must be taken into account. This
+effectively duplicates traffic to both queues. The above example also shows
+that VOID is ignored.
 
 Action types
 ~~~~~~~~~~~~
@@ -1151,9 +1152,8 @@ PMDs.
 Action: ``PASSTHRU``
 ^^^^^^^^^^^^^^^^^^^^
 
-Leaves packets up for additional processing by subsequent flow rules. This
-is the default when a rule does not contain a terminating action, but can be
-specified to force a rule to become non-terminating.
+Leaves traffic up for additional processing by subsequent flow rules; makes
+a flow rule non-terminating.
 
 - No configurable properties.
 
@@ -1227,8 +1227,6 @@ Action: ``QUEUE``
 
 Assigns packets to a given queue index.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_queue:
 
 .. table:: QUEUE
@@ -1245,8 +1243,6 @@ Action: ``DROP``
 Drop packets.
 
 - No configurable properties.
-- Terminating by default.
-- PASSTHRU overrides this action if both are specified.
 
 .. _table_rte_flow_action_drop:
 
@@ -1309,8 +1305,6 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1331,7 +1325,6 @@ Action: ``PF``
 Redirects packets to the physical function (PF) of the current device.
 
 - No configurable properties.
-- Terminating by default.
 
 .. _table_rte_flow_action_pf:
 
@@ -1353,8 +1346,6 @@ ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1378,8 +1369,6 @@ action parameter. More than one flow can use the same MTR object through
 the meter action. The MTR object can be further updated or queried using
 the rte_mtr* API.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_meter:
 
 .. table:: METER
@@ -1415,8 +1404,6 @@ direction.
 
 Multiple flows can be configured to use the same security session.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_security:
 
 .. table:: SECURITY
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index b9f36587c..a5c6a1670 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -3,6 +3,7 @@
  */
 
 #include <errno.h>
+#include <stdint.h>
 #include <rte_log.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow_driver.h>
@@ -964,6 +965,9 @@ static int
 enic_copy_action_v1(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -975,6 +979,10 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			break;
@@ -984,6 +992,8 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!overlap & FATE)
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_RQ_STEERING;
 	return 0;
 }
@@ -1001,6 +1011,9 @@ static int
 enic_copy_action_v2(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, MARK = 2, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -1009,6 +1022,10 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			enic_action->flags |= FILTER_ACTION_RQ_STEERING_FLAG;
@@ -1019,6 +1036,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			/* ENIC_MAGIC_FILTER_ID is reserved and is the highest
 			 * in the range of allows mark ids.
 			 */
@@ -1029,6 +1049,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 		case RTE_FLOW_ACTION_TYPE_FLAG: {
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			enic_action->filter_id = ENIC_MAGIC_FILTER_ID;
 			enic_action->flags |= FILTER_ACTION_FILTER_ID_FLAG;
 			break;
@@ -1044,6 +1067,8 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!overlap & FATE)
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_V2;
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 67fd568bc..15cdf07b7 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -637,6 +637,7 @@ mlx4_flow_prepare(struct priv *priv,
 	struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
 	struct rte_flow *flow = &temp;
 	const char *msg = NULL;
+	int overlap;
 
 	if (attr->group)
 		return rte_flow_error_set
@@ -656,6 +657,7 @@ mlx4_flow_prepare(struct priv *priv,
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
 			 NULL, "only ingress is supported");
 fill:
+	overlap = 0;
 	proc = mlx4_flow_proc_item_list;
 	/* Go over pattern. */
 	for (item = pattern; item->type; ++item) {
@@ -702,6 +704,16 @@ mlx4_flow_prepare(struct priv *priv,
 	}
 	/* Go over actions list. */
 	for (action = actions; action->type; ++action) {
+		/* This one may appear anywhere multiple times. */
+		if (action->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (overlap) {
+			msg = "cannot combine several fate-deciding actions,"
+				" choose between DROP, QUEUE or RSS";
+			goto exit_action_not_supported;
+		}
+		overlap = 1;
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
@@ -709,8 +721,6 @@ mlx4_flow_prepare(struct priv *priv,
 			uint64_t fields;
 			unsigned int i;
 
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			continue;
 		case RTE_FLOW_ACTION_TYPE_DROP:
 			flow->drop = 1;
 			break;
@@ -801,10 +811,9 @@ mlx4_flow_prepare(struct priv *priv,
 			goto exit_action_not_supported;
 		}
 	}
-	if (!flow->rss && !flow->drop)
-		return rte_flow_error_set
-			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-			 NULL, "no valid action");
+	/* When fate is unknown, drop traffic. */
+	if (!overlap)
+		flow->drop = 1;
 	/* Validation ends here. */
 	if (!addr) {
 		if (flow->rss)
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 1ca413e32..9923bfa59 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -4,6 +4,7 @@
  */
 
 #include <sys/queue.h>
+#include <stdint.h>
 #include <string.h>
 
 /* Verbs header. */
@@ -638,6 +639,8 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			  struct rte_flow_error *error,
 			  struct mlx5_flow_parse *parser)
 {
+	enum { FATE = 1, MARK = 2, COUNT = 4, };
+	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
 	int ret;
 
@@ -654,39 +657,31 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			parser->drop = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
-			uint16_t n;
-			uint16_t found = 0;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			for (n = 0; n < parser->queues_n; ++n) {
-				if (parser->queues[n] == queue->index) {
-					found = 1;
-					break;
-				}
-			}
-			if (parser->queues_n > 1 && !found) {
-				rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions,
-					   "queue action not in RSS queues");
-				return -rte_errno;
-			}
-			if (!found) {
-				parser->queues_n = 1;
-				parser->queues[0] = queue->index;
-			}
+			parser->queues_n = 1;
+			parser->queues[0] = queue->index;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
 			uint16_t n;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!rss || !rss->num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,26 +689,6 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (parser->queues_n == 1) {
-				uint16_t found = 0;
-
-				assert(parser->queues_n);
-				for (n = 0; n < rss->num; ++n) {
-					if (parser->queues[0] ==
-					    rss->queue[n]) {
-						found = 1;
-						break;
-					}
-				}
-				if (!found) {
-					rte_flow_error_set(error, ENOTSUP,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "queue action not in RSS"
-						   " queues");
-					return -rte_errno;
-				}
-			}
 			if (rss->num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -747,6 +722,9 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			if (!mark) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -764,14 +742,23 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			parser->mark = 1;
 			parser->mark_id = mark->id;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_FLAG) {
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			parser->mark = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT &&
 			   priv->config.flow_counter_en) {
+			if (overlap & COUNT)
+				goto exit_action_overlap;
+			overlap |= COUNT;
 			parser->count = 1;
 		} else {
 			goto exit_action_not_supported;
 		}
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!overlap & FATE)
+		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
 	if (!parser->queues_n && !parser->drop) {
@@ -784,6 +771,10 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
 			   actions, "action not supported");
 	return -rte_errno;
+exit_action_overlap:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "overlapping actions are not supported");
+	return -rte_errno;
 }
 
 /**
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index fe4c0b0c5..056405515 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1467,10 +1467,19 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 	}
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		/* This one may appear anywhere multiple times. */
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (is_specified) {
+			rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				 actions,
+				 "Cannot combine several fate-deciding actions,"
+				 "choose between QUEUE, RSS or DROP");
+			return -rte_errno;
+		}
 		switch (actions->type) {
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			break;
-
 		case RTE_FLOW_ACTION_TYPE_QUEUE:
 			rc = sfc_flow_parse_queue(sa, actions->conf, flow);
 			if (rc != 0) {
@@ -1512,11 +1521,10 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 		}
 	}
 
+	/* When fate is unknown, drop traffic. */
 	if (!is_specified) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ACTION_NUM, actions,
-				   "Action is unspecified");
-		return -rte_errno;
+		flow->spec.template.efs_dmaq_id =
+			EFX_FILTER_SPEC_RX_DMAQ_ID_DROP;
 	}
 
 	return 0;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 3b7a960b0..fe2f94010 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1140,6 +1140,7 @@ priv_flow_process(struct pmd_internals *pmd,
 		else
 			goto end;
 	}
+actions:
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		int err = 0;
 
@@ -1222,6 +1223,16 @@ priv_flow_process(struct pmd_internals *pmd,
 		if (err)
 			goto exit_action_not_supported;
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!action) {
+		static const struct rte_flow_action drop[] = {
+			{ .type = RTE_FLOW_ACTION_TYPE_DROP, },
+			{ .type = RTE_FLOW_ACTION_TYPE_END, },
+		};
+
+		actions = drop;
+		goto actions;
+	}
 end:
 	if (flow)
 		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 6ace24ff4..96184f030 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -859,32 +859,28 @@ struct rte_flow_item {
  *
  * Each possible action is represented by a type. Some have associated
  * configuration structures. Several actions combined in a list can be
- * affected to a flow rule. That list is not ordered.
+ * assigned to a flow rule and are performed in order.
  *
  * They fall in three categories:
  *
- * - Terminating actions that prevent processing matched packets by
- *   subsequent flow rules, unless overridden with PASSTHRU.
+ * - Actions that modify the fate of matching traffic, for instance by
+ *   dropping or assigning it a specific destination.
  *
- * - Non terminating actions that leave matched packets up for additional
- *   processing by subsequent flow rules.
+ * - Actions that modify matching traffic contents or its properties. This
+ *   includes adding/removing encapsulation, encryption, compression and
+ *   marks.
  *
- * - Other non terminating meta actions that do not affect the fate of
- *   packets.
+ * - Actions related to the flow rule itself, such as updating counters or
+ *   making it non-terminating.
  *
- * When several actions are combined in a flow rule, they should all have
- * different types (e.g. dropping a packet twice is not possible).
+ * Flow rules being terminating by default, not specifying any action of the
+ * fate kind results in undefined behavior. This applies to both ingress and
+ * egress.
  *
- * Only the last action of a given type is taken into account. PMDs still
- * perform error checking on the entire list.
- *
- * Note that PASSTHRU is the only action able to override a terminating
- * rule.
+ * PASSTHRU, when supported, makes a flow rule non-terminating.
  */
 enum rte_flow_action_type {
 	/**
-	 * [META]
-	 *
 	 * End marker for action lists. Prevents further processing of
 	 * actions, thereby ending the list.
 	 *
@@ -893,8 +889,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_END,
 
 	/**
-	 * [META]
-	 *
 	 * Used as a placeholder for convenience. It is ignored and simply
 	 * discarded by PMDs.
 	 *
@@ -903,18 +897,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VOID,
 
 	/**
-	 * Leaves packets up for additional processing by subsequent flow
-	 * rules. This is the default when a rule does not contain a
-	 * terminating action, but can be specified to force a rule to
-	 * become non-terminating.
+	 * Leaves traffic up for additional processing by subsequent flow
+	 * rules; makes a flow rule non-terminating.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PASSTHRU,
 
 	/**
-	 * [META]
-	 *
 	 * Attaches an integer value to packets and sets PKT_RX_FDIR and
 	 * PKT_RX_FDIR_ID mbuf flags.
 	 *
@@ -923,8 +913,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_MARK,
 
 	/**
-	 * [META]
-	 *
 	 * Flags packets. Similar to MARK without a specific value; only
 	 * sets the PKT_RX_FDIR mbuf flag.
 	 *
@@ -949,9 +937,7 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_DROP,
 
 	/**
-	 * [META]
-	 *
-	 * Enables counters for this rule.
+	 * Enables counters for this flow rule.
 	 *
 	 * These counters can be retrieved and reset through rte_flow_query(),
 	 * see struct rte_flow_query_count.
@@ -1020,8 +1006,6 @@ struct rte_flow_action_mark {
  * RTE_FLOW_ACTION_TYPE_QUEUE
  *
  * Assign packets to a given queue index.
- *
- * Terminating by default.
  */
 struct rte_flow_action_queue {
 	uint16_t index; /**< Queue index to use. */
@@ -1050,8 +1034,6 @@ struct rte_flow_query_count {
  * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps
  * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only,
  * both can be requested simultaneously.
- *
- * Terminating by default.
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
@@ -1069,8 +1051,6 @@ struct rte_flow_action_rss {
  * and is not guaranteed to work properly if the VF part is matched by a
  * prior flow rule or if packets are not addressed to a VF in the first
  * place.
- *
- * Terminating by default.
  */
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
@@ -1085,8 +1065,6 @@ struct rte_flow_action_vf {
  *
  * Packets matched by items of this type can be either dropped or passed to the
  * next item with their color set by the MTR object.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_meter {
 	uint32_t mtr_id; /**< MTR object ID created with rte_mtr_create(). */
@@ -1116,8 +1094,6 @@ struct rte_flow_action_meter {
  * direction.
  *
  * Multiple flows can be configured to use the same security session.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_security {
 	void *security_session; /**< Pointer to security session structure. */
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v4 04/16] ethdev: remove DUP action from flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API Adrien Mazarguil
@ 2018-04-16 16:22  2%       ` Adrien Mazarguil
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
                         ` (11 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

Upcoming changes in relation to the handling of actions list will make the
DUP action redundant as specifying several QUEUE actions will achieve the
same behavior. Besides, no PMD implements this action.

By removing an entry from enum rte_flow_action_type, this patch breaks ABI
compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/cmdline_flow.c                 | 23 -----------------------
 app/test-pmd/config.c                       |  1 -
 doc/guides/prog_guide/rte_flow.rst          | 23 -----------------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  8 --------
 lib/librte_ether/rte_ethdev_version.map     |  2 +-
 lib/librte_ether/rte_flow.c                 |  1 -
 lib/librte_ether/rte_flow.h                 | 24 ------------------------
 7 files changed, 1 insertion(+), 81 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index f0b4b7bc4..2ddb08feb 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -164,8 +164,6 @@ enum index {
 	ACTION_QUEUE_INDEX,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
-	ACTION_DUP_INDEX,
 	ACTION_RSS,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
@@ -625,7 +623,6 @@ static const enum index next_action[] = {
 	ACTION_QUEUE,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
@@ -645,12 +642,6 @@ static const enum index action_queue[] = {
 	ZERO,
 };
 
-static const enum index action_dup[] = {
-	ACTION_DUP_INDEX,
-	ACTION_NEXT,
-	ZERO,
-};
-
 static const enum index action_rss[] = {
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
@@ -1597,20 +1588,6 @@ static const struct token token_list[] = {
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
-	[ACTION_DUP] = {
-		.name = "dup",
-		.help = "duplicate packets to a given queue index",
-		.priv = PRIV_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
-		.next = NEXT(action_dup),
-		.call = parse_vc,
-	},
-	[ACTION_DUP_INDEX] = {
-		.name = "index",
-		.help = "queue index to duplicate packets to",
-		.next = NEXT(action_dup, NEXT_ENTRY(UNSIGNED)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_dup, index)),
-		.call = parse_vc_conf,
-	},
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index a7645adb8..d0d372797 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1065,7 +1065,6 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 51826d04c..a237e4fd2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1299,26 +1299,6 @@ Query structure to retrieve and reset flow rule counters:
    | ``bytes``     | out | number of bytes through this rule |
    +---------------+-----+-----------------------------------+
 
-Action: ``DUP``
-^^^^^^^^^^^^^^^
-
-Duplicates packets to a given queue index.
-
-This is normally combined with QUEUE, however when used alone, it is
-actually similar to QUEUE + PASSTHRU.
-
-- Non-terminating by default.
-
-.. _table_rte_flow_action_dup:
-
-.. table:: DUP
-
-   +-----------+------------------------------------+
-   | Field     | Value                              |
-   +===========+====================================+
-   | ``index`` | queue index to duplicate packet to |
-   +-----------+------------------------------------+
-
 Action: ``RSS``
 ^^^^^^^^^^^^^^^
 
@@ -2010,9 +1990,6 @@ Unsupported actions
   and tagging (`Action: MARK`_ or `Action: FLAG`_) may be implemented in
   software as long as the target queue is used by a single rule.
 
-- A rule specifying both `Action: DUP`_ + `Action: QUEUE`_ may be translated
-  to two hidden rules combining `Action: QUEUE`_ and `Action: PASSTHRU`_.
-
 - When a single target queue is provided, `Action: RSS`_ can also be
   implemented through `Action: QUEUE`_.
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index cb6f201e1..a015d02a4 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3363,10 +3363,6 @@ actions can sometimes be combined when the end result is unambiguous::
 
 ::
 
-   drop / dup index 6 / end # same as above
-
-::
-
    queue index 6 / rss queues 6 7 8 / end # queue has no effect
 
 ::
@@ -3400,10 +3396,6 @@ This section lists supported actions and their attributes, if any.
 
 - ``count``: enable counters for this rule.
 
-- ``dup``: duplicate packets to a given queue index.
-
-  - ``index {unsigned}``: queue index to duplicate packets to.
-
 - ``rss``: spread packets among several queues.
 
   - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index e915e7929..8f1ae5ed2 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -147,7 +147,6 @@ DPDK_17.08 {
 
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
-	rte_flow_copy;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -199,6 +198,7 @@ DPDK_18.02 {
 DPDK_18.05 {
 	global:
 
+	rte_flow_copy;
 	rte_flow_create;
 	rte_flow_destroy;
 	rte_flow_error_set;
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index ada280810..80f9cb6cb 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -73,7 +73,6 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index d28a2a473..6ace24ff4 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -961,16 +961,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_COUNT,
 
 	/**
-	 * Duplicates packets to a given queue index.
-	 *
-	 * This is normally combined with QUEUE, however when used alone, it
-	 * is actually similar to QUEUE + PASSTHRU.
-	 *
-	 * See struct rte_flow_action_dup.
-	 */
-	RTE_FLOW_ACTION_TYPE_DUP,
-
-	/**
 	 * Similar to QUEUE, except RSS is additionally performed on packets
 	 * to spread them among several queues according to the provided
 	 * parameters.
@@ -1052,20 +1042,6 @@ struct rte_flow_query_count {
 };
 
 /**
- * RTE_FLOW_ACTION_TYPE_DUP
- *
- * Duplicates packets to a given queue index.
- *
- * This is normally combined with QUEUE, however when used alone, it is
- * actually similar to QUEUE + PASSTHRU.
- *
- * Non-terminating by default.
- */
-struct rte_flow_action_dup {
-	uint16_t index; /**< Queue index to duplicate packets to. */
-};
-
-/**
  * RTE_FLOW_ACTION_TYPE_RSS
  *
  * Similar to QUEUE, except RSS is additionally performed on packets to
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
@ 2018-04-16 16:22  3%       ` Adrien Mazarguil
  2018-04-17 19:37  0%         ` Ferruh Yigit
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 04/16] ethdev: remove DUP action from " Adrien Mazarguil
                         ` (12 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

These enable more precise reporting of objects responsible for errors.

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_destroy()
- rte_flow_error_set()
- rte_flow_flush()
- rte_flow_isolate()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/config.c                   |  4 ++++
 lib/librte_ether/rte_ethdev_version.map | 20 +++++++++++++-------
 lib/librte_ether/rte_flow.h             |  4 ++++
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 5daa93bb3..a7645adb8 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1244,8 +1244,12 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
+		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
+		[RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
+		[RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
 		[RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
 		[RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
+		[RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
 		[RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
 	};
 	const char *errstr;
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index 34df6c8b5..e915e7929 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -127,11 +127,6 @@ DPDK_17.02 {
 
 	_rte_eth_dev_reset;
 	rte_eth_dev_fw_version_get;
-	rte_flow_create;
-	rte_flow_destroy;
-	rte_flow_flush;
-	rte_flow_query;
-	rte_flow_validate;
 
 } DPDK_16.07;
 
@@ -153,7 +148,6 @@ DPDK_17.08 {
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
 	rte_flow_copy;
-	rte_flow_isolate;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -192,7 +186,6 @@ DPDK_17.11 {
 	rte_eth_dev_get_sec_ctx;
 	rte_eth_dev_pool_ops_supported;
 	rte_eth_dev_reset;
-	rte_flow_error_set;
 
 } DPDK_17.08;
 
@@ -203,6 +196,19 @@ DPDK_18.02 {
 
 } DPDK_17.11;
 
+DPDK_18.05 {
+	global:
+
+	rte_flow_create;
+	rte_flow_destroy;
+	rte_flow_error_set;
+	rte_flow_flush;
+	rte_flow_isolate;
+	rte_flow_query;
+	rte_flow_validate;
+
+} DPDK_18.02;
+
 EXPERIMENTAL {
 	global:
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 44ae19d3b..26b95c772 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1186,8 +1186,12 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
+	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
+	RTE_FLOW_ERROR_TYPE_ITEM_LAST, /**< Item specification range. */
+	RTE_FLOW_ERROR_TYPE_ITEM_MASK, /**< Item specification mask. */
 	RTE_FLOW_ERROR_TYPE_ITEM, /**< Specific pattern item. */
 	RTE_FLOW_ERROR_TYPE_ACTION_NUM, /**< Number of actions. */
+	RTE_FLOW_ERROR_TYPE_ACTION_CONF, /**< Action configuration. */
 	RTE_FLOW_ERROR_TYPE_ACTION, /**< Specific action. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (12 preceding siblings ...)
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and " Adrien Mazarguil
@ 2018-04-16 16:22  4%     ` Adrien Mazarguil
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API Adrien Mazarguil
                         ` (13 more replies)
  13 siblings, 14 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

As summarized in a prior RFC [1], the flow API (rte_flow) was chosen as a
means to manage switch offloads supported by many devices (usually going by
names such as E-Switch or vSwitch) through user-specified flow rules.

Combined with the need to support encap/decap actions, this requires a
change in the way flow actions are processed (in order and possibly
repeated) which modifies the behavior of some of the existing actions, thus
warranting a major ABI breakage.

Given this ABI breakage is also required by other work submitted for the
current release [2][3], this series addresses various longstanding issues
with the flow API and makes minor improvements in preparation for upcoming
features.

Changes summary:

- Additional error types.
- Clearer documentation.
- Improved C++ compatibility.
- Exhaustive RSS action.
- Consistent behavior of VLAN pattern item.
- New "transfer" attribute bringing consistency to VF/PF pattern items.
- Confusing "PORT" pattern item renamed "PHY_PORT", with new action
  counterpart.
- New "PORT_ID" pattern item and action to be used with port representors.

This series piggybacks on the major ABI update introduced by a prior
commit [4] for DPDK 18.05 and depends on several fixes [5] which must be
applied first.

[1] "[RFC] Switch device offload with DPDK"
    http://dpdk.org/ml/archives/dev/2018-March/092513.html

[2] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

[3] "[PATCH v1 00/21] MLX5 tunnel Rx offloading"
    http://dpdk.org/ml/archives/dev/2018-March/092264.html

[4] commit 653e038efc9b ("ethdev: remove versioning of filter control
    function")

[5] "[PATCH v4 00/11] Bunch of flow API-related fixes"
    http://dpdk.org/ml/archives/dev/2018-April/096509.html

v5 changes:

- No change besides new acked-by lines, rebased series to address conflicts.

v3 changes:

- Rebased series, fixed latest conflicts.
- Addressed Andrew's comments, see affected patches for details:
  - Empty RSS types in flow rule means PMD-specific RSS instead of no RSS.
  - RSS hash function now explicitly compared against
    RTE_ETH_HASH_FUNCTION_DEFAULT instead of 0 in all PMDs.
  - sfc PMD updated to also accept Toeplitz.
  - Implicit VLAN TPID matching now removed from all PMDs.
  - Default mask upate for VLAN TCI now split as separate patch #11.
  - Ingress/egress definition clarified in patch #12.

v2 changes:

- Squashed "ethdev: update ABI for flow API functions" in subsequent
  patches.
- Emphasized ABI impact in relevant commit logs.
- Modified documentation in "ethdev: alter behavior of flow API actions" to
  describe how terminating flow rules without any action of the fate kind
  result in undefined behavior instead of dropping traffic.
- Fixed other minor documentation formatting issues.
- Modified "ethdev: refine TPID handling in flow API" as follows:
  - Using standard macro definitions for VLAN, QinQ and E-Tag EtherTypes.
  - Fixed endian conversion in sfc.
  - Replaced a condition in VLAN pattern item processing with an assertion
    check for i40e.

Adrien Mazarguil (16):
  ethdev: add error types to flow API
  ethdev: clarify flow API pattern items and actions
  doc: remove flow API migration section
  ethdev: remove DUP action from flow API
  ethdev: alter behavior of flow API actions
  ethdev: remove C99 flexible arrays from flow API
  ethdev: flatten RSS configuration in flow API
  ethdev: add hash function to RSS flow API action
  ethdev: add encap level to RSS flow API action
  ethdev: refine TPID handling in flow API
  ethdev: limit default VLAN TCI mask in flow API
  ethdev: add transfer attribute to flow API
  ethdev: update behavior of VF/PF in flow API
  ethdev: rename physical port item in flow API
  ethdev: add physical port action to flow API
  ethdev: add port ID item and action to flow API

 app/test-pmd/cmdline_flow.c                 | 394 +++++++++++----
 app/test-pmd/config.c                       |  78 +--
 doc/guides/nics/tap.rst                     |   2 +-
 doc/guides/prog_guide/rte_flow.rst          | 618 ++++++++---------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  60 ++-
 drivers/net/bnxt/bnxt_filter.c              |  49 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  83 ++-
 drivers/net/e1000/igb_rxtx.c                |  55 +-
 drivers/net/enic/enic_flow.c                |  50 +-
 drivers/net/i40e/i40e_ethdev.c              |  57 ++-
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                | 130 +++--
 drivers/net/ixgbe/ixgbe_ethdev.c            |   7 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  91 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  55 +-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                | 117 +++--
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 316 ++++++------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +-
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +-
 drivers/net/mvpp2/mrvl_flow.c               |  32 +-
 drivers/net/sfc/sfc_flow.c                  |  78 ++-
 drivers/net/tap/tap_flow.c                  |  49 +-
 examples/ipsec-secgw/ipsec.c                |  21 +-
 lib/librte_ether/rte_ethdev_version.map     |  22 +-
 lib/librte_ether/rte_flow.c                 |  68 +--
 lib/librte_ether/rte_flow.h                 | 339 ++++++++-----
 lib/librte_net/rte_ether.h                  |   1 +
 34 files changed, 1750 insertions(+), 1123 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v5 00/11] Bunch of flow API-related fixes
  2018-04-10 16:34  3% ` [dpdk-dev] [PATCH v4 " Adrien Mazarguil
@ 2018-04-16 16:21  3%   ` Adrien Mazarguil
  2018-04-17  9:17  0%     ` Ferruh Yigit
  2018-04-19 10:07  3%     ` [dpdk-dev] [PATCH v6 " Adrien Mazarguil
  0 siblings, 2 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:21 UTC (permalink / raw)
  To: dev

This series contains several fixes for rte_flow and its implementation in
PMDs and testpmd. Upcoming work on the flow API depends on it.

v5 changes:

- No change, rebased series to address conflicts.

v4 changes:

- Rebased again.
- The reliance on rte_eth_dev_rss_hash_conf_get() was removed from patch #7,
  see updated patch for details.

v3 changes:

- Rebased series.
- Dropped unnecessary "net/sfc: fix endian conversions in flow API".
- Dropped "ethdev: fix ABI version in meson build", handled by prior commit
  d9736a248785 ("ethdev: fix library version in meson build").

v2 changes:

- mlx5 fix (patch #3).
- bnxt fix (patch #4).
- sfc fix (patch #6).
- Missing include (patch #13).

Adrien Mazarguil (11):
  net/mlx4: fix RSS resource leak in case of error
  net/mlx4: fix ignored RSS hash types
  net/mlx5: fix RSS flow action bounds check
  net/bnxt: fix matching of flow API item masks
  app/testpmd: fix flow completion for RSS queues
  app/testpmd: fix lack of flow action configuration
  app/testpmd: fix RSS flow action configuration
  app/testpmd: fix missing RSS fields in flow action
  ethdev: fix shallow copy of flow API RSS action
  ethdev: fix missing boolean values in flow command
  ethdev: fix missing include in flow API

 app/test-pmd/cmdline.c                      |   2 +
 app/test-pmd/cmdline_flow.c                 | 252 ++++++++++++++++++++---
 app/test-pmd/config.c                       | 160 +++++++++-----
 app/test-pmd/testpmd.h                      |  13 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   8 +
 drivers/net/bnxt/bnxt_filter.c              |  14 +-
 drivers/net/mlx4/mlx4_flow.c                |  17 +-
 drivers/net/mlx5/mlx5_flow.c                |   9 +
 lib/librte_ether/rte_flow.c                 | 145 +++++++++----
 lib/librte_ether/rte_flow.h                 |   2 +
 10 files changed, 494 insertions(+), 128 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated
  2018-04-16 15:33  0%     ` Olivier Matz
@ 2018-04-16 15:41  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 15:41 UTC (permalink / raw)
  To: Olivier Matz; +Cc: dev, Anatoly Burakov

On 04/16/2018 06:33 PM, Olivier Matz wrote:
> On Mon, Apr 16, 2018 at 02:24:33PM +0100, Andrew Rybchenko wrote:
>> Size of memory chunk required to populate mempool objects depends
>> on how objects are stored in the memory. Different mempool drivers
>> may have different requirements and a new operation allows to
>> calculate memory size in accordance with driver requirements and
>> advertise requirements on minimum memory chunk size and alignment
>> in a generic way.
>>
>> Bump ABI version since the patch breaks it.
>>
>> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
>> Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
> [...]
>
>> @@ -643,39 +633,35 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>>   	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
>>   	 * memory, then we'll go and reserve space page-by-page.
>>   	 */
>> -	no_pageshift = no_contig || force_contig ||
>> -			rte_eal_iova_mode() == RTE_IOVA_VA;
>> +	no_pageshift = no_contig || rte_eal_iova_mode() == RTE_IOVA_VA;
>>   	try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages();
> In case there is a v5 for another reason, I think the last line is
> equivalent to:
>
>    try_contig = !no_pageshift && rte_eal_has_hugepages();

Agree. As I understand it is true before my patch as well.

> Otherwise:
> Acked-by: Olivier Matz <olivier.matz@6wind.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
@ 2018-04-16 15:33  0%     ` Olivier Matz
  2018-04-16 15:41  0%       ` Andrew Rybchenko
  2018-04-17 10:23  0%     ` Burakov, Anatoly
  1 sibling, 1 reply; 200+ results
From: Olivier Matz @ 2018-04-16 15:33 UTC (permalink / raw)
  To: Andrew Rybchenko; +Cc: dev, Anatoly Burakov

On Mon, Apr 16, 2018 at 02:24:33PM +0100, Andrew Rybchenko wrote:
> Size of memory chunk required to populate mempool objects depends
> on how objects are stored in the memory. Different mempool drivers
> may have different requirements and a new operation allows to
> calculate memory size in accordance with driver requirements and
> advertise requirements on minimum memory chunk size and alignment
> in a generic way.
> 
> Bump ABI version since the patch breaks it.
> 
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>

[...]

> @@ -643,39 +633,35 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>  	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
>  	 * memory, then we'll go and reserve space page-by-page.
>  	 */
> -	no_pageshift = no_contig || force_contig ||
> -			rte_eal_iova_mode() == RTE_IOVA_VA;
> +	no_pageshift = no_contig || rte_eal_iova_mode() == RTE_IOVA_VA;
>  	try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages();

In case there is a v5 for another reason, I think the last line is
equivalent to:

  try_contig = !no_pageshift && rte_eal_has_hugepages();


Otherwise:
Acked-by: Olivier Matz <olivier.matz@6wind.com>

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v2 3/6] mempool: support block dequeue operation
  2018-04-16 13:33  3% ` [dpdk-dev] [PATCH v2 0/6] mempool: add bucket driver Andrew Rybchenko
@ 2018-04-16 13:33  4%   ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:33 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ, Artem V. Andreev

From: "Artem V. Andreev" <Artem.Andreev@oktetlabs.ru>

If mempool manager supports object blocks (physically and virtual
contiguous set of objects), it is sufficient to get the first
object only and the function allows to avoid filling in of
information about each block member.

Signed-off-by: Artem V. Andreev <Artem.Andreev@oktetlabs.ru>
Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 doc/guides/rel_notes/deprecation.rst       |   7 --
 lib/librte_mempool/Makefile                |   1 +
 lib/librte_mempool/meson.build             |   2 +
 lib/librte_mempool/rte_mempool.c           |  39 ++++++++
 lib/librte_mempool/rte_mempool.h           | 151 ++++++++++++++++++++++++++++-
 lib/librte_mempool/rte_mempool_ops.c       |   1 +
 lib/librte_mempool/rte_mempool_version.map |   1 +
 7 files changed, 194 insertions(+), 8 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 6d9a0c8..f3284c5 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -47,13 +47,6 @@ Deprecation Notices
 
   - ``rte_eal_mbuf_default_mempool_ops``
 
-* mempool: several API and ABI changes are planned in v18.05.
-
-  The following changes are planned:
-
-  - addition of new op to allocate contiguous
-    block of objects if underlying driver supports it.
-
 * mbuf: The opaque ``mbuf->hash.sched`` field will be updated to support generic
   definition in line with the ethdev TM and MTR APIs. Currently, this field
   is defined in librte_sched in a non-generic way. The new generic format
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 7f19f00..e3c32b1 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -10,6 +10,7 @@ CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
 # Allow deprecated symbol to use deprecated rte_mempool_populate_iova_tab()
 # from earlier deprecated rte_mempool_populate_phys_tab()
 CFLAGS += -Wno-deprecated-declarations
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 LDLIBS += -lrte_eal -lrte_ring
 
 EXPORT_MAP := rte_mempool_version.map
diff --git a/lib/librte_mempool/meson.build b/lib/librte_mempool/meson.build
index baf2d24..d507e55 100644
--- a/lib/librte_mempool/meson.build
+++ b/lib/librte_mempool/meson.build
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
+allow_experimental_apis = true
+
 extra_flags = []
 
 # Allow deprecated symbol to use deprecated rte_mempool_populate_iova_tab()
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 84b3d64..cf5d124 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -1255,6 +1255,36 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
 #endif
 }
 
+void
+rte_mempool_contig_blocks_check_cookies(const struct rte_mempool *mp,
+	void * const *first_obj_table_const, unsigned int n, int free)
+{
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	struct rte_mempool_info info;
+	const size_t total_elt_sz =
+		mp->header_size + mp->elt_size + mp->trailer_size;
+	unsigned int i, j;
+
+	rte_mempool_ops_get_info(mp, &info);
+
+	for (i = 0; i < n; ++i) {
+		void *first_obj = first_obj_table_const[i];
+
+		for (j = 0; j < info.contig_block_size; ++j) {
+			void *obj;
+
+			obj = (void *)((uintptr_t)first_obj + j * total_elt_sz);
+			rte_mempool_check_cookies(mp, &obj, 1, free);
+		}
+	}
+#else
+	RTE_SET_USED(mp);
+	RTE_SET_USED(first_obj_table_const);
+	RTE_SET_USED(n);
+	RTE_SET_USED(free);
+#endif
+}
+
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
 static void
 mempool_obj_audit(struct rte_mempool *mp, __rte_unused void *opaque,
@@ -1320,6 +1350,7 @@ void
 rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 {
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	struct rte_mempool_info info;
 	struct rte_mempool_debug_stats sum;
 	unsigned lcore_id;
 #endif
@@ -1361,6 +1392,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 
 	/* sum and dump statistics */
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
 		sum.put_bulk += mp->stats[lcore_id].put_bulk;
@@ -1369,6 +1401,8 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_success_objs += mp->stats[lcore_id].get_success_objs;
 		sum.get_fail_bulk += mp->stats[lcore_id].get_fail_bulk;
 		sum.get_fail_objs += mp->stats[lcore_id].get_fail_objs;
+		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
+		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
 	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
@@ -1377,6 +1411,11 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 	fprintf(f, "    get_success_objs=%"PRIu64"\n", sum.get_success_objs);
 	fprintf(f, "    get_fail_bulk=%"PRIu64"\n", sum.get_fail_bulk);
 	fprintf(f, "    get_fail_objs=%"PRIu64"\n", sum.get_fail_objs);
+	if (info.contig_block_size > 0) {
+		fprintf(f, "    get_success_blks=%"PRIu64"\n",
+			sum.get_success_blks);
+		fprintf(f, "    get_fail_blks=%"PRIu64"\n", sum.get_fail_blks);
+	}
 #else
 	fprintf(f, "  no statistics available\n");
 #endif
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 1ac2f57..3cab3a0 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -70,6 +70,10 @@ struct rte_mempool_debug_stats {
 	uint64_t get_success_objs; /**< Objects successfully allocated. */
 	uint64_t get_fail_bulk;    /**< Failed allocation number. */
 	uint64_t get_fail_objs;    /**< Objects that failed to be allocated. */
+	/** Successful allocation number of contiguous blocks. */
+	uint64_t get_success_blks;
+	/** Failed allocation number of contiguous blocks. */
+	uint64_t get_fail_blks;
 } __rte_cache_aligned;
 #endif
 
@@ -195,7 +199,10 @@ struct rte_mempool_memhdr {
  *
  * Additional information about the mempool
  */
-struct rte_mempool_info;
+struct rte_mempool_info {
+	/** Number of objects in the contiguous block */
+	unsigned int contig_block_size;
+};
 
 /**
  * The RTE mempool structure.
@@ -273,8 +280,16 @@ struct rte_mempool {
 			mp->stats[__lcore_id].name##_bulk += 1;	\
 		}                                               \
 	} while(0)
+#define __MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, name, n) do {                    \
+		unsigned int __lcore_id = rte_lcore_id();       \
+		if (__lcore_id < RTE_MAX_LCORE) {               \
+			mp->stats[__lcore_id].name##_blks += n;	\
+			mp->stats[__lcore_id].name##_bulk += 1;	\
+		}                                               \
+	} while (0)
 #else
 #define __MEMPOOL_STAT_ADD(mp, name, n) do {} while(0)
+#define __MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, name, n) do {} while (0)
 #endif
 
 /**
@@ -342,6 +357,38 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
 #define __mempool_check_cookies(mp, obj_table_const, n, free) do {} while(0)
 #endif /* RTE_LIBRTE_MEMPOOL_DEBUG */
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @internal Check contiguous object blocks and update cookies or panic.
+ *
+ * @param mp
+ *   Pointer to the memory pool.
+ * @param first_obj_table_const
+ *   Pointer to a table of void * pointers (first object of the contiguous
+ *   object blocks).
+ * @param n
+ *   Number of contiguous object blocks.
+ * @param free
+ *   - 0: object is supposed to be allocated, mark it as free
+ *   - 1: object is supposed to be free, mark it as allocated
+ *   - 2: just check that cookie is valid (free or allocated)
+ */
+void rte_mempool_contig_blocks_check_cookies(const struct rte_mempool *mp,
+	void * const *first_obj_table_const, unsigned int n, int free);
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#define __mempool_contig_blocks_check_cookies(mp, first_obj_table_const, n, \
+					      free) \
+	rte_mempool_contig_blocks_check_cookies(mp, first_obj_table_const, n, \
+						free)
+#else
+#define __mempool_contig_blocks_check_cookies(mp, first_obj_table_const, n, \
+					      free) \
+	do {} while (0)
+#endif /* RTE_LIBRTE_MEMPOOL_DEBUG */
+
 #define RTE_MEMPOOL_OPS_NAMESIZE 32 /**< Max length of ops struct name. */
 
 /**
@@ -374,6 +421,15 @@ typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
 		void **obj_table, unsigned int n);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Dequeue a number of contiquous object blocks from the external pool.
+ */
+typedef int (*rte_mempool_dequeue_contig_blocks_t)(struct rte_mempool *mp,
+		 void **first_obj_table, unsigned int n);
+
+/**
  * Return the number of available objects in the external pool.
  */
 typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
@@ -539,6 +595,10 @@ struct rte_mempool_ops {
 	 * Get mempool info
 	 */
 	rte_mempool_get_info_t get_info;
+	/**
+	 * Dequeue a number of contiguous object blocks.
+	 */
+	rte_mempool_dequeue_contig_blocks_t dequeue_contig_blocks;
 } __rte_cache_aligned;
 
 #define RTE_MEMPOOL_MAX_OPS_IDX 16  /**< Max registered ops structs */
@@ -617,6 +677,30 @@ rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp,
 }
 
 /**
+ * @internal Wrapper for mempool_ops dequeue_contig_blocks callback.
+ *
+ * @param[in] mp
+ *   Pointer to the memory pool.
+ * @param[out] first_obj_table
+ *   Pointer to a table of void * pointers (first objects).
+ * @param[in] n
+ *   Number of blocks to get.
+ * @return
+ *   - 0: Success; got n objects.
+ *   - <0: Error; code of dequeue function.
+ */
+static inline int
+rte_mempool_ops_dequeue_contig_blocks(struct rte_mempool *mp,
+		void **first_obj_table, unsigned int n)
+{
+	struct rte_mempool_ops *ops;
+
+	ops = rte_mempool_get_ops(mp->ops_index);
+	RTE_ASSERT(ops->dequeue_contig_blocks != NULL);
+	return ops->dequeue_contig_blocks(mp, first_obj_table, n);
+}
+
+/**
  * @internal wrapper for mempool_ops enqueue callback.
  *
  * @param mp
@@ -1531,6 +1615,71 @@ rte_mempool_get(struct rte_mempool *mp, void **obj_p)
 }
 
 /**
+ * @internal Get contiguous blocks of objects from the pool. Used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param first_obj_table
+ *   A pointer to a pointer to the first object in each block.
+ * @param n
+ *   A number of blocks to get.
+ * @return
+ *   - >0: Success
+ *   - <0: Error
+ */
+static __rte_always_inline int
+__mempool_generic_get_contig_blocks(struct rte_mempool *mp,
+				    void **first_obj_table, unsigned int n)
+{
+	int ret;
+
+	ret = rte_mempool_ops_dequeue_contig_blocks(mp, first_obj_table, n);
+	if (ret < 0)
+		__MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, get_fail, n);
+	else
+		__MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, get_success, n);
+
+	return ret;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get a contiguous blocks of objects from the mempool.
+ *
+ * If cache is enabled, consider to flush it first, to reuse objects
+ * as soon as possible.
+ *
+ * The application should check that the driver supports the operation
+ * by calling rte_mempool_ops_get_info() and checking that `contig_block_size`
+ * is not zero.
+ *
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param first_obj_table
+ *   A pointer to a pointer to the first object in each block.
+ * @param n
+ *   The number of blocks to get from mempool.
+ * @return
+ *   - 0: Success; blocks taken.
+ *   - -ENOBUFS: Not enough entries in the mempool; no object is retrieved.
+ *   - -EOPNOTSUPP: The mempool driver does not support block dequeue
+ */
+static __rte_always_inline int
+__rte_experimental
+rte_mempool_get_contig_blocks(struct rte_mempool *mp,
+			      void **first_obj_table, unsigned int n)
+{
+	int ret;
+
+	ret = __mempool_generic_get_contig_blocks(mp, first_obj_table, n);
+	if (ret == 0)
+		__mempool_contig_blocks_check_cookies(mp, first_obj_table, n,
+						      1);
+	return ret;
+}
+
+/**
  * Return the number of entries in the mempool.
  *
  * When cache is enabled, this function has to browse the length of
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index efc1c08..a27e1fa 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -60,6 +60,7 @@ rte_mempool_register_ops(const struct rte_mempool_ops *h)
 	ops->calc_mem_size = h->calc_mem_size;
 	ops->populate = h->populate;
 	ops->get_info = h->get_info;
+	ops->dequeue_contig_blocks = h->dequeue_contig_blocks;
 
 	rte_spinlock_unlock(&rte_mempool_ops_table.sl);
 
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index c9d16ec..1c406b5 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -53,6 +53,7 @@ DPDK_17.11 {
 DPDK_18.05 {
 	global:
 
+	rte_mempool_contig_blocks_check_cookies;
 	rte_mempool_op_calc_mem_size_default;
 	rte_mempool_op_populate_default;
 
-- 
2.7.4

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v2 0/6] mempool: add bucket driver
    2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
@ 2018-04-16 13:33  3% ` Andrew Rybchenko
  2018-04-16 13:33  4%   ` [dpdk-dev] [PATCH v2 3/6] mempool: support block dequeue operation Andrew Rybchenko
  1 sibling, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:33 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ

The initial patch series [1] (RFCv1 is [2]) is split into two to simplify
processing.  It is the second part which relies on the first one [3].

It should be applied on top of [3].

The patch series adds bucket mempool driver which allows to allocate
(both physically and virtually) contiguous blocks of objects and adds
mempool API to do it. It is still capable to provide separate objects,
but it is definitely more heavy-weight than ring/stack drivers.
The driver will be used by the future Solarflare driver enhancements
which allow to utilize physical contiguous blocks in the NIC firmware.

The target usecase is dequeue in blocks and enqueue separate objects
back (which are collected in buckets to be dequeued). So, the memory
pool with bucket driver is created by an application and provided to
networking PMD receive queue. The choice of bucket driver is done using
rte_eth_dev_pool_ops_supported(). A PMD that relies upon contiguous
block allocation should report the bucket driver as the only supported
and preferred one.

Introduction of the contiguous block dequeue operation is proven by
performance measurements using autotest with minor enhancements:
 - in the original test bulks are powers of two, which is unacceptable
   for us, so they are changed to multiple of contig_block_size;
 - the test code is duplicated to support plain dequeue and
   dequeue_contig_blocks;
 - all the extra test variations (with/without cache etc) are eliminated;
 - a fake read from the dequeued buffer is added (in both cases) to
   simulate mbufs access.

start performance test for bucket (without cache)
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Srate_persec=   111935488
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Srate_persec=   115290931
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Srate_persec=   353055539
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Srate_persec=   353330790
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Srate_persec=   224657407
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Srate_persec=   230411468
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Srate_persec=   706700902
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Srate_persec=   703673139
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Srate_persec=   425236887
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Srate_persec=   437295512
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Srate_persec=  1343409356
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Srate_persec=  1336567397
start performance test for bucket (without cache + contiguous dequeue)
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Crate_persec=   122945536
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Crate_persec=   126458265
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Crate_persec=   374262988
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Crate_persec=   377316966
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Crate_persec=   244842496
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Crate_persec=   251618917
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Crate_persec=   751226060
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Crate_persec=   756233010
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Crate_persec=   462068120
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Crate_persec=   476997221
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Crate_persec=  1432171313
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Crate_persec=  1438829771

The number of objects in the contiguous block is a function of bucket
memory size (.config option) and total element size. In the future
additional API with possibility to pass parameters on mempool allocation
may be added.

It breaks ABI since changes rte_mempool_ops. The ABI version is already
bumped in [4].


[1] https://dpdk.org/ml/archives/dev/2018-January/088698.html
[2] https://dpdk.org/ml/archives/dev/2017-November/082335.html
[3] https://dpdk.org/ml/archives/dev/2018-April/097354.html
[4] https://dpdk.org/ml/archives/dev/2018-April/097352.html

v1 -> v2:
  - just rebase

RFCv2 -> v1:
  - rebased on top of [3]
  - cleanup deprecation notice when it is done
  - mark a new API experimental
  - move contig blocks dequeue debug checks/processing to the library function
  - add contig blocks get stats
  - add release notes

RFCv1 -> RFCv2:
  - change info API to get information from driver required to
    API user to know contiguous block size
  - use SPDX tags
  - avoid all objects affinity to single lcore
  - fix bucket get_count
  - fix NO_CACHE_ALIGN case in bucket mempool



Andrew Rybchenko (1):
  doc: advertise bucket mempool driver

Artem V. Andreev (5):
  mempool/bucket: implement bucket mempool manager
  mempool: implement abstract mempool info API
  mempool: support block dequeue operation
  mempool/bucket: implement block dequeue operation
  mempool/bucket: do not allow one lcore to grab all buckets

 MAINTAINERS                                        |   9 +
 config/common_base                                 |   2 +
 doc/guides/rel_notes/deprecation.rst               |   7 -
 doc/guides/rel_notes/release_18_05.rst             |   9 +
 drivers/mempool/Makefile                           |   1 +
 drivers/mempool/bucket/Makefile                    |  27 +
 drivers/mempool/bucket/meson.build                 |   9 +
 drivers/mempool/bucket/rte_mempool_bucket.c        | 627 +++++++++++++++++++++
 .../mempool/bucket/rte_mempool_bucket_version.map  |   4 +
 lib/librte_mempool/Makefile                        |   1 +
 lib/librte_mempool/meson.build                     |   2 +
 lib/librte_mempool/rte_mempool.c                   |  39 ++
 lib/librte_mempool/rte_mempool.h                   | 190 +++++++
 lib/librte_mempool/rte_mempool_ops.c               |  16 +
 lib/librte_mempool/rte_mempool_version.map         |   8 +
 mk/rte.app.mk                                      |   1 +
 16 files changed, 945 insertions(+), 7 deletions(-)
 create mode 100644 drivers/mempool/bucket/Makefile
 create mode 100644 drivers/mempool/bucket/meson.build
 create mode 100644 drivers/mempool/bucket/rte_mempool_bucket.c
 create mode 100644 drivers/mempool/bucket/rte_mempool_bucket_version.map

-- 
2.7.4

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 07/11] mempool: deprecate xmem functions
  2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
                     ` (2 preceding siblings ...)
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 06/11] mempool: remove callback to get capabilities Andrew Rybchenko
@ 2018-04-16 13:24  4%   ` Andrew Rybchenko
  2018-04-16 13:24  8%   ` [dpdk-dev] [PATCH v4 10/11] mempool: remove callback to register memory area Andrew Rybchenko
  4 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ, Thomas Monjalon

Move rte_mempool_xmem_size() code to internal helper function
since it is required in two places: deprecated rte_mempool_xmem_size()
and non-deprecated rte_mempool_op_calc_mem_size_default().

Suggested-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
v2 -> v3:
 - none

v1 -> v2:
 - deprecate rte_mempool_populate_iova_tab()
 - add -Wno-deprecated-declarations to fix build errors because of
   rte_mempool_populate_iova_tab() deprecation
 - add @deprecated to deprecated functions description

RFCv2 -> v1:
 - advertise deprecation in release notes
 - factor out default memory size calculation into non-deprecated
   internal function to avoid usage of deprecated function internally
 - remove test for deprecated functions to address build issue because
   of usage of deprecated functions (it is easy to allow usage of
   deprecated function in Makefile, but very complicated in meson)

 doc/guides/rel_notes/deprecation.rst         |  7 -------
 doc/guides/rel_notes/release_18_05.rst       | 11 ++++++++++
 lib/librte_mempool/Makefile                  |  3 +++
 lib/librte_mempool/meson.build               | 12 +++++++++++
 lib/librte_mempool/rte_mempool.c             | 19 ++++++++++++++---
 lib/librte_mempool/rte_mempool.h             | 30 +++++++++++++++++++++++++++
 lib/librte_mempool/rte_mempool_ops_default.c |  4 ++--
 test/test/test_mempool.c                     | 31 ----------------------------
 8 files changed, 74 insertions(+), 43 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 99a0b01..8d1b362 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -48,13 +48,6 @@ Deprecation Notices
   - ``rte_eal_mbuf_default_mempool_ops``
 
 * mempool: several API and ABI changes are planned in v18.05.
-  The following functions, introduced for Xen, which is not supported
-  anymore since v17.11, are hard to use, not used anywhere else in DPDK.
-  Therefore they will be deprecated in v18.05 and removed in v18.08:
-
-  - ``rte_mempool_xmem_create``
-  - ``rte_mempool_xmem_size``
-  - ``rte_mempool_xmem_usage``
 
   The following changes are planned:
 
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index f481eea..3869d04 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -181,6 +181,17 @@ API Changes
   Now the new driver callbacks ``calc_mem_size`` and ``populate`` may be
   used to achieve it without specific knowledge in the generic code.
 
+* **Deprecated mempool xmem functions.**
+
+  The following functions, introduced for Xen, which is not supported
+  anymore since v17.11, are hard to use, not used anywhere else in DPDK.
+  Therefore they were deprecated in v18.05 and will be removed in v18.08:
+
+  - ``rte_mempool_xmem_create``
+  - ``rte_mempool_xmem_size``
+  - ``rte_mempool_xmem_usage``
+  - ``rte_mempool_populate_iova_tab``
+
 
 ABI Changes
 -----------
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 421e2a7..7f19f00 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -7,6 +7,9 @@ include $(RTE_SDK)/mk/rte.vars.mk
 LIB = librte_mempool.a
 
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
+# Allow deprecated symbol to use deprecated rte_mempool_populate_iova_tab()
+# from earlier deprecated rte_mempool_populate_phys_tab()
+CFLAGS += -Wno-deprecated-declarations
 LDLIBS += -lrte_eal -lrte_ring
 
 EXPORT_MAP := rte_mempool_version.map
diff --git a/lib/librte_mempool/meson.build b/lib/librte_mempool/meson.build
index 6181ad8..baf2d24 100644
--- a/lib/librte_mempool/meson.build
+++ b/lib/librte_mempool/meson.build
@@ -1,6 +1,18 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
+extra_flags = []
+
+# Allow deprecated symbol to use deprecated rte_mempool_populate_iova_tab()
+# from earlier deprecated rte_mempool_populate_phys_tab()
+extra_flags += '-Wno-deprecated-declarations'
+
+foreach flag: extra_flags
+	if cc.has_argument(flag)
+		cflags += flag
+	endif
+endforeach
+
 version = 4
 sources = files('rte_mempool.c', 'rte_mempool_ops.c',
 		'rte_mempool_ops_default.c')
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 5c75c16..c63c363 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -227,11 +227,13 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 
 
 /*
- * Calculate maximum amount of memory required to store given number of objects.
+ * Internal function to calculate required memory chunk size shared
+ * by default implementation of the corresponding callback and
+ * deprecated external function.
  */
 size_t
-rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift,
-		      __rte_unused unsigned int flags)
+rte_mempool_calc_mem_size_helper(uint32_t elt_num, size_t total_elt_sz,
+				 uint32_t pg_shift)
 {
 	size_t obj_per_page, pg_num, pg_sz;
 
@@ -251,6 +253,17 @@ rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift,
 }
 
 /*
+ * Calculate maximum amount of memory required to store given number of objects.
+ */
+size_t
+rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift,
+		      __rte_unused unsigned int flags)
+{
+	return rte_mempool_calc_mem_size_helper(elt_num, total_elt_sz,
+						pg_shift);
+}
+
+/*
  * Calculate how much memory would be actually required with the
  * given memory footprint to store required number of elements.
  */
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 0b83d5e..9107f5a 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -427,6 +427,28 @@ ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 		size_t *min_chunk_size, size_t *align);
 
 /**
+ * @internal Helper function to calculate memory size required to store
+ * specified number of objects in assumption that the memory buffer will
+ * be aligned at page boundary.
+ *
+ * Note that if object size is bigger than page size, then it assumes
+ * that pages are grouped in subsets of physically continuous pages big
+ * enough to store at least one object.
+ *
+ * @param elt_num
+ *   Number of elements.
+ * @param total_elt_sz
+ *   The size of each element, including header and trailer, as returned
+ *   by rte_mempool_calc_obj_size().
+ * @param pg_shift
+ *   LOG2 of the physical pages size. If set to 0, ignore page boundaries.
+ * @return
+ *   Required memory size aligned at page boundary.
+ */
+size_t rte_mempool_calc_mem_size_helper(uint32_t elt_num, size_t total_elt_sz,
+		uint32_t pg_shift);
+
+/**
  * Function to be called for each populated object.
  *
  * @param[in] mp
@@ -855,6 +877,7 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
 		   int socket_id, unsigned flags);
 
 /**
+ * @deprecated
  * Create a new mempool named *name* in memory.
  *
  * The pool contains n elements of elt_size. Its size is set to n.
@@ -912,6 +935,7 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
  *   The pointer to the new allocated mempool, on success. NULL on error
  *   with rte_errno set appropriately. See rte_mempool_create() for details.
  */
+__rte_deprecated
 struct rte_mempool *
 rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
 		unsigned cache_size, unsigned private_data_size,
@@ -1008,6 +1032,7 @@ int rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr,
 	void *opaque);
 
 /**
+ * @deprecated
  * Add physical memory for objects in the pool at init
  *
  * Add a virtually contiguous memory chunk in the pool where objects can
@@ -1033,6 +1058,7 @@ int rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr,
  *   On error, the chunks are not added in the memory list of the
  *   mempool and a negative errno is returned.
  */
+__rte_deprecated
 int rte_mempool_populate_iova_tab(struct rte_mempool *mp, char *vaddr,
 	const rte_iova_t iova[], uint32_t pg_num, uint32_t pg_shift,
 	rte_mempool_memchunk_free_cb_t *free_cb, void *opaque);
@@ -1652,6 +1678,7 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 	struct rte_mempool_objsz *sz);
 
 /**
+ * @deprecated
  * Get the size of memory required to store mempool elements.
  *
  * Calculate the maximum amount of memory required to store given number
@@ -1674,10 +1701,12 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
  * @return
  *   Required memory size aligned at page boundary.
  */
+__rte_deprecated
 size_t rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz,
 	uint32_t pg_shift, unsigned int flags);
 
 /**
+ * @deprecated
  * Get the size of memory required to store mempool elements.
  *
  * Calculate how much memory would be actually required with the given
@@ -1705,6 +1734,7 @@ size_t rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz,
  *   buffer is too small, return a negative value whose absolute value
  *   is the actual number of elements that can be stored in that buffer.
  */
+__rte_deprecated
 ssize_t rte_mempool_xmem_usage(void *vaddr, uint32_t elt_num,
 	size_t total_elt_sz, const rte_iova_t iova[], uint32_t pg_num,
 	uint32_t pg_shift, unsigned int flags);
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 3defc15..fd63ca1 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -16,8 +16,8 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
-	mem_size = rte_mempool_xmem_size(obj_num, total_elt_sz, pg_shift,
-					 mp->flags);
+	mem_size = rte_mempool_calc_mem_size_helper(obj_num, total_elt_sz,
+						    pg_shift);
 
 	*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
 
diff --git a/test/test/test_mempool.c b/test/test/test_mempool.c
index 63f921e..8d29af2 100644
--- a/test/test/test_mempool.c
+++ b/test/test/test_mempool.c
@@ -444,34 +444,6 @@ test_mempool_same_name_twice_creation(void)
 	return 0;
 }
 
-/*
- * Basic test for mempool_xmem functions.
- */
-static int
-test_mempool_xmem_misc(void)
-{
-	uint32_t elt_num, total_size;
-	size_t sz;
-	ssize_t usz;
-
-	elt_num = MAX_KEEP;
-	total_size = rte_mempool_calc_obj_size(MEMPOOL_ELT_SIZE, 0, NULL);
-	sz = rte_mempool_xmem_size(elt_num, total_size, MEMPOOL_PG_SHIFT_MAX,
-					0);
-
-	usz = rte_mempool_xmem_usage(NULL, elt_num, total_size, 0, 1,
-		MEMPOOL_PG_SHIFT_MAX, 0);
-
-	if (sz != (size_t)usz)  {
-		printf("failure @ %s: rte_mempool_xmem_usage(%u, %u) "
-			"returns: %#zx, while expected: %#zx;\n",
-			__func__, elt_num, total_size, sz, (size_t)usz);
-		return -1;
-	}
-
-	return 0;
-}
-
 static void
 walk_cb(struct rte_mempool *mp, void *userdata __rte_unused)
 {
@@ -596,9 +568,6 @@ test_mempool(void)
 	if (test_mempool_same_name_twice_creation() < 0)
 		goto err;
 
-	if (test_mempool_xmem_misc() < 0)
-		goto err;
-
 	/* test the stack handler */
 	if (test_mempool_basic(mp_stack, 1) < 0)
 		goto err;
-- 
2.7.4

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v4 10/11] mempool: remove callback to register memory area
  2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
                     ` (3 preceding siblings ...)
  2018-04-16 13:24  4%   ` [dpdk-dev] [PATCH v4 07/11] mempool: deprecate xmem functions Andrew Rybchenko
@ 2018-04-16 13:24  8%   ` Andrew Rybchenko
  4 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ

The callback is not required any more since there is a new callback
to populate objects using provided memory area which provides
the same information.

Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Santosh Shukla <Santosh.Shukla@caviumnetworks.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
v3 -> v4:
 - none

v2 -> v3:
 - none

v1 -> v2:
 - none

RFCv2 -> v1:
 - advertise ABI changes in release notes

 doc/guides/rel_notes/deprecation.rst       |  1 -
 doc/guides/rel_notes/release_18_05.rst     |  2 ++
 lib/librte_mempool/rte_mempool.c           |  5 -----
 lib/librte_mempool/rte_mempool.h           | 31 ------------------------------
 lib/librte_mempool/rte_mempool_ops.c       | 14 --------------
 lib/librte_mempool/rte_mempool_version.map |  1 -
 6 files changed, 2 insertions(+), 52 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 8d1b362..02ffcd4 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -51,7 +51,6 @@ Deprecation Notices
 
   The following changes are planned:
 
-  - substitute ``register_memory_area`` with ``populate`` ops.
   - addition of new op to allocate contiguous
     block of objects if underlying driver supports it.
 
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 3869d04..3ed4aae 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -223,6 +223,8 @@ ABI Changes
   Callback ``get_capabilities`` has been removed from ``rte_mempool_ops``
   since its features are covered by ``calc_mem_size`` and ``populate``
   callbacks.
+  Callback ``register_memory_area`` has been removed from ``rte_mempool_ops``
+  since the new callback ``populate`` may be used instead of it.
 
 
 Removed Items
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index c63c363..84b3d64 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -378,11 +378,6 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	if (ret != 0)
 		return ret;
 
-	/* Notify memory area to mempool */
-	ret = rte_mempool_ops_register_memory_area(mp, vaddr, iova, len);
-	if (ret != -ENOTSUP && ret < 0)
-		return ret;
-
 	/* mempool is already populated */
 	if (mp->populated_size >= mp->size)
 		return -ENOSPC;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 9107f5a..314f909 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -371,12 +371,6 @@ typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
 typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
 
 /**
- * Notify new memory area to mempool.
- */
-typedef int (*rte_mempool_ops_register_memory_area_t)
-(const struct rte_mempool *mp, char *vaddr, rte_iova_t iova, size_t len);
-
-/**
  * Calculate memory size required to store given number of objects.
  *
  * If mempool objects are not required to be IOVA-contiguous
@@ -514,10 +508,6 @@ struct rte_mempool_ops {
 	rte_mempool_dequeue_t dequeue;   /**< Dequeue an object. */
 	rte_mempool_get_count get_count; /**< Get qty of available objs. */
 	/**
-	 * Notify new memory area to mempool
-	 */
-	rte_mempool_ops_register_memory_area_t register_memory_area;
-	/**
 	 * Optional callback to calculate memory size required to
 	 * store specified number of objects.
 	 */
@@ -639,27 +629,6 @@ unsigned
 rte_mempool_ops_get_count(const struct rte_mempool *mp);
 
 /**
- * @internal wrapper for mempool_ops register_memory_area callback.
- * API to notify the mempool handler when a new memory area is added to pool.
- *
- * @param mp
- *   Pointer to the memory pool.
- * @param vaddr
- *   Pointer to the buffer virtual address.
- * @param iova
- *   Pointer to the buffer IO address.
- * @param len
- *   Pool size.
- * @return
- *   - 0: Success;
- *   - -ENOTSUP - doesn't support register_memory_area ops (valid error case).
- *   - Otherwise, rte_mempool_populate_phys fails thus pool create fails.
- */
-int
-rte_mempool_ops_register_memory_area(const struct rte_mempool *mp,
-				char *vaddr, rte_iova_t iova, size_t len);
-
-/**
  * @internal wrapper for mempool_ops calc_mem_size callback.
  * API to calculate size of memory required to store specified number of
  * object.
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index 6ac669a..ea9be1e 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -57,7 +57,6 @@ rte_mempool_register_ops(const struct rte_mempool_ops *h)
 	ops->enqueue = h->enqueue;
 	ops->dequeue = h->dequeue;
 	ops->get_count = h->get_count;
-	ops->register_memory_area = h->register_memory_area;
 	ops->calc_mem_size = h->calc_mem_size;
 	ops->populate = h->populate;
 
@@ -99,19 +98,6 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
 }
 
 /* wrapper to notify new memory area to external mempool */
-int
-rte_mempool_ops_register_memory_area(const struct rte_mempool *mp, char *vaddr,
-					rte_iova_t iova, size_t len)
-{
-	struct rte_mempool_ops *ops;
-
-	ops = rte_mempool_get_ops(mp->ops_index);
-
-	RTE_FUNC_PTR_OR_ERR_RET(ops->register_memory_area, -ENOTSUP);
-	return ops->register_memory_area(mp, vaddr, iova, len);
-}
-
-/* wrapper to notify new memory area to external mempool */
 ssize_t
 rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 				uint32_t obj_num, uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 637f73f..cf375db 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -45,7 +45,6 @@ DPDK_16.07 {
 DPDK_17.11 {
 	global:
 
-	rte_mempool_ops_register_memory_area;
 	rte_mempool_populate_iova;
 	rte_mempool_populate_iova_tab;
 
-- 
2.7.4

^ permalink raw reply	[relevance 8%]

* [dpdk-dev] [PATCH v4 06/11] mempool: remove callback to get capabilities
  2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 05/11] mempool: add op to populate objects using provided memory Andrew Rybchenko
@ 2018-04-16 13:24  6%   ` Andrew Rybchenko
  2018-04-16 13:24  4%   ` [dpdk-dev] [PATCH v4 07/11] mempool: deprecate xmem functions Andrew Rybchenko
  2018-04-16 13:24  8%   ` [dpdk-dev] [PATCH v4 10/11] mempool: remove callback to register memory area Andrew Rybchenko
  4 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ, Santosh Shukla, Jerin Jacob

The callback was introduced to let generic code to know octeontx
mempool driver requirements to use single physically contiguous
memory chunk to store all objects and align object address to
total object size. Now these requirements are met using a new
callbacks to calculate required memory chunk size and to populate
objects using provided memory chunk.

These capability flags are not used anywhere else.

Restricting capabilities to flags is not generic and likely to
be insufficient to describe mempool driver features. If required
in the future, API which returns structured information may be
added.

Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Santosh Shukla <Santosh.Shukla@caviumnetworks.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
v3 -> v4:
 - rebase

v2 -> v3:
 - none

v1 -> v2:
 - fix typo
 - rebase on top of patch which renames MEMPOOL_F_NO_PHYS_CONTIG

RFCv2 -> v1:
 - squash mempool/octeontx patches to add calc_mem_size and populate
   callbacks to this one in order to avoid breakages in the middle of
   patchset
 - advertise API changes in release notes

 doc/guides/rel_notes/deprecation.rst            |  1 -
 doc/guides/rel_notes/release_18_05.rst          | 11 +++++
 drivers/mempool/octeontx/rte_mempool_octeontx.c | 59 +++++++++++++++++++++----
 lib/librte_mempool/rte_mempool.c                | 34 ++------------
 lib/librte_mempool/rte_mempool.h                | 52 +---------------------
 lib/librte_mempool/rte_mempool_ops.c            | 14 ------
 lib/librte_mempool/rte_mempool_ops_default.c    | 15 +------
 lib/librte_mempool/rte_mempool_version.map      |  1 -
 8 files changed, 68 insertions(+), 119 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 575da18..99a0b01 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -58,7 +58,6 @@ Deprecation Notices
 
   The following changes are planned:
 
-  - removal of ``get_capabilities`` mempool ops and related flags.
   - substitute ``register_memory_area`` with ``populate`` ops.
   - addition of new op to allocate contiguous
     block of objects if underlying driver supports it.
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 5c6588e..f481eea 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -173,6 +173,14 @@ API Changes
    fall-back value. Previously, setting ``nb_tx_desc`` to zero would have
    resulted in an error.
 
+* **Removed mempool capability flags and related functions.**
+
+  Flags ``MEMPOOL_F_CAPA_PHYS_CONTIG`` and
+  ``MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS`` were used by octeontx mempool
+  driver to customize generic mempool library behaviour.
+  Now the new driver callbacks ``calc_mem_size`` and ``populate`` may be
+  used to achieve it without specific knowledge in the generic code.
+
 
 ABI Changes
 -----------
@@ -201,6 +209,9 @@ ABI Changes
   to allow to customize required memory size calculation.
   A new callback ``populate`` has been added to ``rte_mempool_ops``
   to allow to customize objects population.
+  Callback ``get_capabilities`` has been removed from ``rte_mempool_ops``
+  since its features are covered by ``calc_mem_size`` and ``populate``
+  callbacks.
 
 
 Removed Items
diff --git a/drivers/mempool/octeontx/rte_mempool_octeontx.c b/drivers/mempool/octeontx/rte_mempool_octeontx.c
index d143d05..64ed528 100644
--- a/drivers/mempool/octeontx/rte_mempool_octeontx.c
+++ b/drivers/mempool/octeontx/rte_mempool_octeontx.c
@@ -126,14 +126,29 @@ octeontx_fpavf_get_count(const struct rte_mempool *mp)
 	return octeontx_fpa_bufpool_free_count(pool);
 }
 
-static int
-octeontx_fpavf_get_capabilities(const struct rte_mempool *mp,
-				unsigned int *flags)
+static ssize_t
+octeontx_fpavf_calc_mem_size(const struct rte_mempool *mp,
+			     uint32_t obj_num, uint32_t pg_shift,
+			     size_t *min_chunk_size, size_t *align)
 {
-	RTE_SET_USED(mp);
-	*flags |= (MEMPOOL_F_CAPA_PHYS_CONTIG |
-			MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS);
-	return 0;
+	ssize_t mem_size;
+
+	/*
+	 * Simply need space for one more object to be able to
+	 * fulfil alignment requirements.
+	 */
+	mem_size = rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
+							pg_shift,
+							min_chunk_size, align);
+	if (mem_size >= 0) {
+		/*
+		 * Memory area which contains objects must be physically
+		 * contiguous.
+		 */
+		*min_chunk_size = mem_size;
+	}
+
+	return mem_size;
 }
 
 static int
@@ -150,6 +165,33 @@ octeontx_fpavf_register_memory_area(const struct rte_mempool *mp,
 	return octeontx_fpavf_pool_set_range(pool_bar, len, vaddr, gpool);
 }
 
+static int
+octeontx_fpavf_populate(struct rte_mempool *mp, unsigned int max_objs,
+			void *vaddr, rte_iova_t iova, size_t len,
+			rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
+{
+	size_t total_elt_sz;
+	size_t off;
+
+	if (iova == RTE_BAD_IOVA)
+		return -EINVAL;
+
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	/* align object start address to a multiple of total_elt_sz */
+	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
+
+	if (len < off)
+		return -EINVAL;
+
+	vaddr = (char *)vaddr + off;
+	iova += off;
+	len -= off;
+
+	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
+					       obj_cb, obj_cb_arg);
+}
+
 static struct rte_mempool_ops octeontx_fpavf_ops = {
 	.name = "octeontx_fpavf",
 	.alloc = octeontx_fpavf_alloc,
@@ -157,8 +199,9 @@ static struct rte_mempool_ops octeontx_fpavf_ops = {
 	.enqueue = octeontx_fpavf_enqueue,
 	.dequeue = octeontx_fpavf_dequeue,
 	.get_count = octeontx_fpavf_get_count,
-	.get_capabilities = octeontx_fpavf_get_capabilities,
 	.register_memory_area = octeontx_fpavf_register_memory_area,
+	.calc_mem_size = octeontx_fpavf_calc_mem_size,
+	.populate = octeontx_fpavf_populate,
 };
 
 MEMPOOL_REGISTER_OPS(octeontx_fpavf_ops);
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 68ae12f..5c75c16 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -231,15 +231,9 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
  */
 size_t
 rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift,
-		      unsigned int flags)
+		      __rte_unused unsigned int flags)
 {
 	size_t obj_per_page, pg_num, pg_sz;
-	unsigned int mask;
-
-	mask = MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS | MEMPOOL_F_CAPA_PHYS_CONTIG;
-	if ((flags & mask) == mask)
-		/* alignment need one additional object */
-		elt_num += 1;
 
 	if (total_elt_sz == 0)
 		return 0;
@@ -263,18 +257,12 @@ rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift,
 ssize_t
 rte_mempool_xmem_usage(__rte_unused void *vaddr, uint32_t elt_num,
 	size_t total_elt_sz, const rte_iova_t iova[], uint32_t pg_num,
-	uint32_t pg_shift, unsigned int flags)
+	uint32_t pg_shift, __rte_unused unsigned int flags)
 {
 	uint32_t elt_cnt = 0;
 	rte_iova_t start, end;
 	uint32_t iova_idx;
 	size_t pg_sz = (size_t)1 << pg_shift;
-	unsigned int mask;
-
-	mask = MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS | MEMPOOL_F_CAPA_PHYS_CONTIG;
-	if ((flags & mask) == mask)
-		/* alignment need one additional object */
-		elt_num += 1;
 
 	/* if iova is NULL, assume contiguous memory */
 	if (iova == NULL) {
@@ -368,8 +356,6 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	rte_iova_t iova, size_t len, rte_mempool_memchunk_free_cb_t *free_cb,
 	void *opaque)
 {
-	unsigned total_elt_sz;
-	unsigned int mp_capa_flags;
 	unsigned i = 0;
 	size_t off;
 	struct rte_mempool_memhdr *memhdr;
@@ -388,17 +374,6 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	if (mp->populated_size >= mp->size)
 		return -ENOSPC;
 
-	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
-
-	/* Get mempool capabilities */
-	mp_capa_flags = 0;
-	ret = rte_mempool_ops_get_capabilities(mp, &mp_capa_flags);
-	if ((ret < 0) && (ret != -ENOTSUP))
-		return ret;
-
-	/* update mempool capabilities */
-	mp->flags |= mp_capa_flags;
-
 	memhdr = rte_zmalloc("MEMPOOL_MEMHDR", sizeof(*memhdr), 0);
 	if (memhdr == NULL)
 		return -ENOMEM;
@@ -410,10 +385,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	memhdr->free_cb = free_cb;
 	memhdr->opaque = opaque;
 
-	if (mp_capa_flags & MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS)
-		/* align object start address to a multiple of total_elt_sz */
-		off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
-	else if (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN)
+	if (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN)
 		off = RTE_PTR_ALIGN_CEIL(vaddr, 8) - vaddr;
 	else
 		off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_CACHE_LINE_SIZE) - vaddr;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 754261e..0b83d5e 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -246,24 +246,6 @@ struct rte_mempool {
 #define MEMPOOL_F_POOL_CREATED   0x0010 /**< Internal: pool is created. */
 #define MEMPOOL_F_NO_IOVA_CONTIG 0x0020 /**< Don't need IOVA contiguous objs. */
 #define MEMPOOL_F_NO_PHYS_CONTIG MEMPOOL_F_NO_IOVA_CONTIG /* deprecated */
-/**
- * This capability flag is advertised by a mempool handler, if the whole
- * memory area containing the objects must be physically contiguous.
- * Note: This flag should not be passed by application.
- */
-#define MEMPOOL_F_CAPA_PHYS_CONTIG 0x0040
-/**
- * This capability flag is advertised by a mempool handler. Used for a case
- * where mempool driver wants object start address(vaddr) aligned to block
- * size(/ total element size).
- *
- * Note:
- * - This flag should not be passed by application.
- *   Flag used for mempool driver only.
- * - Mempool driver must also set MEMPOOL_F_CAPA_PHYS_CONTIG flag along with
- *   MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS.
- */
-#define MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS 0x0080
 
 /**
  * @internal When debug is enabled, store some statistics.
@@ -389,12 +371,6 @@ typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
 typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
 
 /**
- * Get the mempool capabilities.
- */
-typedef int (*rte_mempool_get_capabilities_t)(const struct rte_mempool *mp,
-		unsigned int *flags);
-
-/**
  * Notify new memory area to mempool.
  */
 typedef int (*rte_mempool_ops_register_memory_area_t)
@@ -440,13 +416,7 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
  * that pages are grouped in subsets of physically continuous pages big
  * enough to store at least one object.
  *
- * If mempool driver requires object addresses to be block size aligned
- * (MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS), space for one extra element is
- * reserved to be able to meet the requirement.
- *
- * Minimum size of memory chunk is either all required space, if
- * capabilities say that whole memory area must be physically contiguous
- * (MEMPOOL_F_CAPA_PHYS_CONTIG), or a maximum of the page size and total
+ * Minimum size of memory chunk is a maximum of the page size and total
  * element size.
  *
  * Required memory chunk alignment is a maximum of page size and cache
@@ -522,10 +492,6 @@ struct rte_mempool_ops {
 	rte_mempool_dequeue_t dequeue;   /**< Dequeue an object. */
 	rte_mempool_get_count get_count; /**< Get qty of available objs. */
 	/**
-	 * Get the mempool capabilities
-	 */
-	rte_mempool_get_capabilities_t get_capabilities;
-	/**
 	 * Notify new memory area to mempool
 	 */
 	rte_mempool_ops_register_memory_area_t register_memory_area;
@@ -651,22 +617,6 @@ unsigned
 rte_mempool_ops_get_count(const struct rte_mempool *mp);
 
 /**
- * @internal wrapper for mempool_ops get_capabilities callback.
- *
- * @param mp [in]
- *   Pointer to the memory pool.
- * @param flags [out]
- *   Pointer to the mempool flags.
- * @return
- *   - 0: Success; The mempool driver has advertised his pool capabilities in
- *   flags param.
- *   - -ENOTSUP - doesn't support get_capabilities ops (valid case).
- *   - Otherwise, pool create fails.
- */
-int
-rte_mempool_ops_get_capabilities(const struct rte_mempool *mp,
-					unsigned int *flags);
-/**
  * @internal wrapper for mempool_ops register_memory_area callback.
  * API to notify the mempool handler when a new memory area is added to pool.
  *
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index 1a7f39f..6ac669a 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -57,7 +57,6 @@ rte_mempool_register_ops(const struct rte_mempool_ops *h)
 	ops->enqueue = h->enqueue;
 	ops->dequeue = h->dequeue;
 	ops->get_count = h->get_count;
-	ops->get_capabilities = h->get_capabilities;
 	ops->register_memory_area = h->register_memory_area;
 	ops->calc_mem_size = h->calc_mem_size;
 	ops->populate = h->populate;
@@ -99,19 +98,6 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
 	return ops->get_count(mp);
 }
 
-/* wrapper to get external mempool capabilities. */
-int
-rte_mempool_ops_get_capabilities(const struct rte_mempool *mp,
-					unsigned int *flags)
-{
-	struct rte_mempool_ops *ops;
-
-	ops = rte_mempool_get_ops(mp->ops_index);
-
-	RTE_FUNC_PTR_OR_ERR_RET(ops->get_capabilities, -ENOTSUP);
-	return ops->get_capabilities(mp, flags);
-}
-
 /* wrapper to notify new memory area to external mempool */
 int
 rte_mempool_ops_register_memory_area(const struct rte_mempool *mp, char *vaddr,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 57295f7..3defc15 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -11,26 +11,15 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 				     uint32_t obj_num, uint32_t pg_shift,
 				     size_t *min_chunk_size, size_t *align)
 {
-	unsigned int mp_flags;
-	int ret;
 	size_t total_elt_sz;
 	size_t mem_size;
 
-	/* Get mempool capabilities */
-	mp_flags = 0;
-	ret = rte_mempool_ops_get_capabilities(mp, &mp_flags);
-	if ((ret < 0) && (ret != -ENOTSUP))
-		return ret;
-
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
 	mem_size = rte_mempool_xmem_size(obj_num, total_elt_sz, pg_shift,
-					 mp->flags | mp_flags);
+					 mp->flags);
 
-	if (mp_flags & MEMPOOL_F_CAPA_PHYS_CONTIG)
-		*min_chunk_size = mem_size;
-	else
-		*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
+	*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
 
 	*align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift);
 
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 41a0b09..637f73f 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -45,7 +45,6 @@ DPDK_16.07 {
 DPDK_17.11 {
 	global:
 
-	rte_mempool_ops_get_capabilities;
 	rte_mempool_ops_register_memory_area;
 	rte_mempool_populate_iova;
 	rte_mempool_populate_iova_tab;
-- 
2.7.4

^ permalink raw reply	[relevance 6%]

* [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver
  @ 2018-04-16 13:24  2% ` Andrew Rybchenko
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
                     ` (4 more replies)
  2018-04-16 13:33  3% ` [dpdk-dev] [PATCH v2 0/6] mempool: add bucket driver Andrew Rybchenko
  1 sibling, 5 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev
  Cc: Olivier MATZ, Thomas Monjalon, Anatoly Burakov, Santosh Shukla,
	Jerin Jacob, Hemant Agrawal, Shreyansh Jain

The initial patch series [1] is split into two to simplify processing.
The second series relies on this one and will add bucket mempool driver
and related ops.

The patch series has generic enhancements suggested by Olivier.
Basically it adds driver callbacks to calculate required memory size and
to populate objects using provided memory area. It allows to remove
so-called capability flags used before to tell generic code how to
allocate and slice allocated memory into mempool objects.
Clean up which removes get_capabilities and register_memory_area is
not strictly required, but I think right thing to do.
Existing mempool drivers are updated.

rte_mempool_populate_iova_tab() is also deprecated in v2 as agreed in [2].
Unfortunately it requires addition of -Wno-deprecated-declarations flag
to librte_mempool since the function is used by deprecated earlier
rte_mempool_populate_phys_tab(). If the later may be removed in the
release, we can avoid addition of the flag to allow usage of deprecated
functions.

A new patch is added to the series in v3 to rename MEMPOOL_F_NO_PHYS_CONTIG
as MEMPOOL_F_NO_IOVA_CONTIG as agreed in [3].
MEMPOOL_F_CAPA_PHYS_CONTIG is not renamed since it removed in this
patchset.

It breaks ABI since changes rte_mempool_ops. Also it removes
rte_mempool_ops_register_memory_area() and
rte_mempool_ops_get_capabilities() since corresponding callbacks are
removed.

Internal global functions are not listed in map file since it is not
a part of external API.

[1] https://dpdk.org/ml/archives/dev/2018-January/088698.html
[2] https://dpdk.org/ml/archives/dev/2018-March/093186.html
[3] https://dpdk.org/ml/archives/dev/2018-March/093345.html

v3 -> v4:
  - rebase on memory rework

v2 -> v3:
  - fix build error in mempool/dpaa: prepare to remove register memory area op

v1 -> v2:
  - deprecate rte_mempool_populate_iova_tab()
  - add patch to fix memory leak if no objects are populated
  - add patch to rename MEMPOOL_F_NO_PHYS_CONTIG
  - minor fixes (typos, blank line at the end of file)
  - highlight meaning of min_chunk_size (when it is virtual or
    physical contiguous)
  - make sure that mempool is initialized in rte_mempool_populate_anon()
  - move patch to ensure that mempool is initialized earlier in the series

RFCv2 -> v1:
  - split the series in two
  - squash octeontx patches which implement calc_mem_size and populate
    callbacks into the patch which removes get_capabilities since it is
    the easiest way to untangle the tangle of tightly related library
    functions and flags advertised by the driver
  - consistently name default callbacks
  - move default callbacks to dedicated file
  - see detailed description in patches

RFCv1 -> RFCv2:
  - add driver ops to calculate required memory size and populate
    mempool objects, remove extra flags which were required before
    to control it
  - transition of octeontx and dpaa drivers to the new callbacks
  - change info API to get information from driver required to
    API user to know contiguous block size
  - remove get_capabilities (not required any more and may be
    substituted with more in info get API)
  - remove register_memory_area since it is substituted with
    populate callback which can do more
  - use SPDX tags
  - avoid all objects affinity to single lcore
  - fix bucket get_count
  - deprecate XMEM API
  - avoid introduction of a new function to flush cache
  - fix NO_CACHE_ALIGN case in bucket mempool


Andrew Rybchenko (9):
  mempool: fix memhdr leak when no objects are populated
  mempool: rename flag to control IOVA-contiguous objects
  mempool: add op to calculate memory size to be allocated
  mempool: add op to populate objects using provided memory
  mempool: remove callback to get capabilities
  mempool: deprecate xmem functions
  mempool/octeontx: prepare to remove register memory area op
  mempool/dpaa: prepare to remove register memory area op
  mempool: remove callback to register memory area

Artem V. Andreev (2):
  mempool: ensure the mempool is initialized before populating
  mempool: support flushing the default cache of the mempool

 doc/guides/rel_notes/deprecation.rst            |  12 +-
 doc/guides/rel_notes/release_18_05.rst          |  34 ++-
 drivers/mempool/dpaa/dpaa_mempool.c             |  13 +-
 drivers/mempool/octeontx/rte_mempool_octeontx.c |  64 ++++--
 drivers/net/thunderx/nicvf_ethdev.c             |   2 +-
 lib/librte_mempool/Makefile                     |   6 +-
 lib/librte_mempool/meson.build                  |  17 +-
 lib/librte_mempool/rte_mempool.c                | 240 ++++++++++----------
 lib/librte_mempool/rte_mempool.h                | 280 +++++++++++++++++-------
 lib/librte_mempool/rte_mempool_ops.c            |  37 ++--
 lib/librte_mempool/rte_mempool_ops_default.c    |  51 +++++
 lib/librte_mempool/rte_mempool_version.map      |  10 +-
 test/test/test_mempool.c                        |  31 ---
 13 files changed, 528 insertions(+), 269 deletions(-)
 create mode 100644 lib/librte_mempool/rte_mempool_ops_default.c

-- 
2.7.4

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated
  2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
@ 2018-04-16 13:24  6%   ` Andrew Rybchenko
  2018-04-16 15:33  0%     ` Olivier Matz
  2018-04-17 10:23  0%     ` Burakov, Anatoly
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 05/11] mempool: add op to populate objects using provided memory Andrew Rybchenko
                     ` (3 subsequent siblings)
  4 siblings, 2 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ, Anatoly Burakov

Size of memory chunk required to populate mempool objects depends
on how objects are stored in the memory. Different mempool drivers
may have different requirements and a new operation allows to
calculate memory size in accordance with driver requirements and
advertise requirements on minimum memory chunk size and alignment
in a generic way.

Bump ABI version since the patch breaks it.

Suggested-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
v3 -> v4:
 - rebased on top of memory rework
 - dropped previous Ack's since rebase is not trivial
 - check size calculation failure in rte_mempool_populate_anon() and
   rte_mempool_memchunk_anon_free()

v2 -> v3:
 - none

v1 -> v2:
 - clarify min_chunk_size meaning
 - rebase on top of patch series which fixes library version in meson
   build

RFCv2 -> v1:
 - move default calc_mem_size callback to rte_mempool_ops_default.c
 - add ABI changes to release notes
 - name default callback consistently: rte_mempool_op_<callback>_default()
 - bump ABI version since it is the first patch which breaks ABI
 - describe default callback behaviour in details
 - avoid introduction of internal function to cope with deprecation
   (keep it to deprecation patch)
 - move cache-line or page boundary chunk alignment to default callback
 - highlight that min_chunk_size and align parameters are output only

 doc/guides/rel_notes/deprecation.rst         |   3 +-
 doc/guides/rel_notes/release_18_05.rst       |   8 +-
 lib/librte_mempool/Makefile                  |   3 +-
 lib/librte_mempool/meson.build               |   5 +-
 lib/librte_mempool/rte_mempool.c             | 114 +++++++++++++++------------
 lib/librte_mempool/rte_mempool.h             |  86 +++++++++++++++++++-
 lib/librte_mempool/rte_mempool_ops.c         |  18 +++++
 lib/librte_mempool/rte_mempool_ops_default.c |  38 +++++++++
 lib/librte_mempool/rte_mempool_version.map   |   7 ++
 9 files changed, 225 insertions(+), 57 deletions(-)
 create mode 100644 lib/librte_mempool/rte_mempool_ops_default.c

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index c929dcc..2aa5ef3 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -60,8 +60,7 @@ Deprecation Notices
 
   - removal of ``get_capabilities`` mempool ops and related flags.
   - substitute ``register_memory_area`` with ``populate`` ops.
-  - addition of new ops to customize required memory chunk calculation,
-    customize objects population and allocate contiguous
+  - addition of new ops to customize objects population and allocate contiguous
     block of objects if underlying driver supports it.
 
 * mbuf: The opaque ``mbuf->hash.sched`` field will be updated to support generic
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 84295e4..7dbe7ac 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -195,6 +195,12 @@ ABI Changes
   type ``uint16_t``: ``burst_size``, ``ring_size``, and ``nb_queues``. These
   are parameter values recommended for use by the PMD.
 
+* **Changed rte_mempool_ops structure.**
+
+  A new callback ``calc_mem_size`` has been added to ``rte_mempool_ops``
+  to allow to customize required memory size calculation.
+
+
 Removed Items
 -------------
 
@@ -267,7 +273,7 @@ The libraries prepended with a plus sign were incremented in this version.
      librte_latencystats.so.1
      librte_lpm.so.2
    + librte_mbuf.so.4
-     librte_mempool.so.3
+   + librte_mempool.so.4
    + librte_meter.so.2
      librte_metrics.so.1
      librte_net.so.1
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 1f85d34..421e2a7 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -11,7 +11,7 @@ LDLIBS += -lrte_eal -lrte_ring
 
 EXPORT_MAP := rte_mempool_version.map
 
-LIBABIVER := 3
+LIBABIVER := 4
 
 # memseg walk is not yet part of stable API
 CFLAGS += -DALLOW_EXPERIMENTAL_API
@@ -19,6 +19,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) +=  rte_mempool.c
 SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) +=  rte_mempool_ops.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) +=  rte_mempool_ops_default.c
 # install includes
 SYMLINK-$(CONFIG_RTE_LIBRTE_MEMPOOL)-include := rte_mempool.h
 
diff --git a/lib/librte_mempool/meson.build b/lib/librte_mempool/meson.build
index 89506c5..6181ad8 100644
--- a/lib/librte_mempool/meson.build
+++ b/lib/librte_mempool/meson.build
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
-version = 3
-sources = files('rte_mempool.c', 'rte_mempool_ops.c')
+version = 4
+sources = files('rte_mempool.c', 'rte_mempool_ops.c',
+		'rte_mempool_ops_default.c')
 headers = files('rte_mempool.h')
 deps += ['ring']
 
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index b15b79b..fdcee05 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -574,12 +574,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	unsigned int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
 	char mz_name[RTE_MEMZONE_NAMESIZE];
 	const struct rte_memzone *mz;
-	size_t size, total_elt_sz, align, pg_sz, pg_shift;
+	ssize_t mem_size;
+	size_t align, pg_sz, pg_shift;
 	rte_iova_t iova;
 	unsigned mz_id, n;
-	unsigned int mp_flags;
 	int ret;
-	bool force_contig, no_contig, try_contig, no_pageshift;
+	bool no_contig, try_contig, no_pageshift;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -589,22 +589,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	if (mp->nb_mem_chunks != 0)
 		return -EEXIST;
 
-	/* Get mempool capabilities */
-	mp_flags = 0;
-	ret = rte_mempool_ops_get_capabilities(mp, &mp_flags);
-	if ((ret < 0) && (ret != -ENOTSUP))
-		return ret;
-
-	/* update mempool capabilities */
-	mp->flags |= mp_flags;
-
 	no_contig = mp->flags & MEMPOOL_F_NO_IOVA_CONTIG;
-	force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
 
 	/*
 	 * the following section calculates page shift and page size values.
 	 *
-	 * these values impact the result of rte_mempool_xmem_size(), which
+	 * these values impact the result of calc_mem_size operation, which
 	 * returns the amount of memory that should be allocated to store the
 	 * desired number of objects. when not zero, it allocates more memory
 	 * for the padding between objects, to ensure that an object does not
@@ -625,7 +615,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 *
 	 * if our IO addresses are virtual, not actual physical (IOVA as VA
 	 * case), then no page shift needed - our memory allocation will give us
-	 * contiguous physical memory as far as the hardware is concerned, so
+	 * contiguous IO memory as far as the hardware is concerned, so
 	 * act as if we're getting contiguous memory.
 	 *
 	 * if our IO addresses are physical, we may get memory from bigger
@@ -643,39 +633,35 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
 	 * memory, then we'll go and reserve space page-by-page.
 	 */
-	no_pageshift = no_contig || force_contig ||
-			rte_eal_iova_mode() == RTE_IOVA_VA;
+	no_pageshift = no_contig || rte_eal_iova_mode() == RTE_IOVA_VA;
 	try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages();
-	if (force_contig)
-		mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
 
 	if (no_pageshift) {
 		pg_sz = 0;
 		pg_shift = 0;
-		align = RTE_CACHE_LINE_SIZE;
 	} else if (try_contig) {
 		pg_sz = get_min_page_size();
 		pg_shift = rte_bsf32(pg_sz);
-		/* we're trying to reserve contiguous memzone first, so try
-		 * align to cache line; if we fail to reserve a contiguous
-		 * memzone, we'll adjust alignment to equal pagesize later.
-		 */
-		align = RTE_CACHE_LINE_SIZE;
 	} else {
 		pg_sz = getpagesize();
 		pg_shift = rte_bsf32(pg_sz);
-		align = pg_sz;
 	}
 
-	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
+		size_t min_chunk_size;
 		unsigned int flags;
+
 		if (try_contig || no_pageshift)
-			size = rte_mempool_xmem_size(n, total_elt_sz, 0,
-				mp->flags);
+			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
+					0, &min_chunk_size, &align);
 		else
-			size = rte_mempool_xmem_size(n, total_elt_sz, pg_shift,
-				mp->flags);
+			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
+					pg_shift, &min_chunk_size, &align);
+
+		if (mem_size < 0) {
+			ret = mem_size;
+			goto fail;
+		}
 
 		ret = snprintf(mz_name, sizeof(mz_name),
 			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
@@ -692,27 +678,31 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		if (try_contig)
 			flags |= RTE_MEMZONE_IOVA_CONTIG;
 
-		mz = rte_memzone_reserve_aligned(mz_name, size, mp->socket_id,
-				flags, align);
+		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
+				mp->socket_id, flags, align);
 
-		/* if we were trying to allocate contiguous memory, adjust
-		 * memzone size and page size to fit smaller page sizes, and
-		 * try again.
+		/* if we were trying to allocate contiguous memory, failed and
+		 * minimum required contiguous chunk fits minimum page, adjust
+		 * memzone size to the page size, and try again.
 		 */
-		if (mz == NULL && try_contig) {
+		if (mz == NULL && try_contig && min_chunk_size <= pg_sz) {
 			try_contig = false;
 			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
-			align = pg_sz;
-			size = rte_mempool_xmem_size(n, total_elt_sz,
-				pg_shift, mp->flags);
 
-			mz = rte_memzone_reserve_aligned(mz_name, size,
+			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
+					pg_shift, &min_chunk_size, &align);
+			if (mem_size < 0) {
+				ret = mem_size;
+				goto fail;
+			}
+
+			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
 				mp->socket_id, flags, align);
 		}
 		/* don't try reserving with 0 size if we were asked to reserve
 		 * IOVA-contiguous memory.
 		 */
-		if (!force_contig && mz == NULL) {
+		if (min_chunk_size < (size_t)mem_size && mz == NULL) {
 			/* not enough memory, retry with the biggest zone we
 			 * have
 			 */
@@ -724,6 +714,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			goto fail;
 		}
 
+		if (mz->len < min_chunk_size) {
+			rte_memzone_free(mz);
+			ret = -ENOMEM;
+			goto fail;
+		}
+
 		if (no_contig)
 			iova = RTE_BAD_IOVA;
 		else
@@ -753,16 +749,18 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 }
 
 /* return the memory size required for mempool objects in anonymous mem */
-static size_t
+static ssize_t
 get_anon_size(const struct rte_mempool *mp)
 {
-	size_t size, total_elt_sz, pg_sz, pg_shift;
+	ssize_t size;
+	size_t pg_sz, pg_shift;
+	size_t min_chunk_size;
+	size_t align;
 
 	pg_sz = getpagesize();
 	pg_shift = rte_bsf32(pg_sz);
-	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
-	size = rte_mempool_xmem_size(mp->size, total_elt_sz, pg_shift,
-					mp->flags);
+	size = rte_mempool_ops_calc_mem_size(mp, mp->size, pg_shift,
+					     &min_chunk_size, &align);
 
 	return size;
 }
@@ -772,14 +770,25 @@ static void
 rte_mempool_memchunk_anon_free(struct rte_mempool_memhdr *memhdr,
 	void *opaque)
 {
-	munmap(opaque, get_anon_size(memhdr->mp));
+	ssize_t size;
+
+	/*
+	 * Calculate size since memhdr->len has contiguous chunk length
+	 * which may be smaller if anon map is split into many contiguous
+	 * chunks. Result must be the same as we calculated on populate.
+	 */
+	size = get_anon_size(memhdr->mp);
+	if (size < 0)
+		return;
+
+	munmap(opaque, size);
 }
 
 /* populate the mempool with an anonymous mapping */
 int
 rte_mempool_populate_anon(struct rte_mempool *mp)
 {
-	size_t size;
+	ssize_t size;
 	int ret;
 	char *addr;
 
@@ -793,8 +802,13 @@ rte_mempool_populate_anon(struct rte_mempool *mp)
 	if (ret != 0)
 		return ret;
 
-	/* get chunk of virtually continuous memory */
 	size = get_anon_size(mp);
+	if (size < 0) {
+		rte_errno = -size;
+		return 0;
+	}
+
+	/* get chunk of virtually continuous memory */
 	addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
 		MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 	if (addr == MAP_FAILED) {
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index e531a15..191255d 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -400,6 +400,62 @@ typedef int (*rte_mempool_get_capabilities_t)(const struct rte_mempool *mp,
 typedef int (*rte_mempool_ops_register_memory_area_t)
 (const struct rte_mempool *mp, char *vaddr, rte_iova_t iova, size_t len);
 
+/**
+ * Calculate memory size required to store given number of objects.
+ *
+ * If mempool objects are not required to be IOVA-contiguous
+ * (the flag MEMPOOL_F_NO_IOVA_CONTIG is set), min_chunk_size defines
+ * virtually contiguous chunk size. Otherwise, if mempool objects must
+ * be IOVA-contiguous (the flag MEMPOOL_F_NO_IOVA_CONTIG is clear),
+ * min_chunk_size defines IOVA-contiguous chunk size.
+ *
+ * @param[in] mp
+ *   Pointer to the memory pool.
+ * @param[in] obj_num
+ *   Number of objects.
+ * @param[in] pg_shift
+ *   LOG2 of the physical pages size. If set to 0, ignore page boundaries.
+ * @param[out] min_chunk_size
+ *   Location for minimum size of the memory chunk which may be used to
+ *   store memory pool objects.
+ * @param[out] align
+ *   Location for required memory chunk alignment.
+ * @return
+ *   Required memory size aligned at page boundary.
+ */
+typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
+		uint32_t obj_num,  uint32_t pg_shift,
+		size_t *min_chunk_size, size_t *align);
+
+/**
+ * Default way to calculate memory size required to store given number of
+ * objects.
+ *
+ * If page boundaries may be ignored, it is just a product of total
+ * object size including header and trailer and number of objects.
+ * Otherwise, it is a number of pages required to store given number of
+ * objects without crossing page boundary.
+ *
+ * Note that if object size is bigger than page size, then it assumes
+ * that pages are grouped in subsets of physically continuous pages big
+ * enough to store at least one object.
+ *
+ * If mempool driver requires object addresses to be block size aligned
+ * (MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS), space for one extra element is
+ * reserved to be able to meet the requirement.
+ *
+ * Minimum size of memory chunk is either all required space, if
+ * capabilities say that whole memory area must be physically contiguous
+ * (MEMPOOL_F_CAPA_PHYS_CONTIG), or a maximum of the page size and total
+ * element size.
+ *
+ * Required memory chunk alignment is a maximum of page size and cache
+ * line size.
+ */
+ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
+		uint32_t obj_num, uint32_t pg_shift,
+		size_t *min_chunk_size, size_t *align);
+
 /** Structure defining mempool operations structure */
 struct rte_mempool_ops {
 	char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
@@ -416,6 +472,11 @@ struct rte_mempool_ops {
 	 * Notify new memory area to mempool
 	 */
 	rte_mempool_ops_register_memory_area_t register_memory_area;
+	/**
+	 * Optional callback to calculate memory size required to
+	 * store specified number of objects.
+	 */
+	rte_mempool_calc_mem_size_t calc_mem_size;
 } __rte_cache_aligned;
 
 #define RTE_MEMPOOL_MAX_OPS_IDX 16  /**< Max registered ops structs */
@@ -565,6 +626,29 @@ rte_mempool_ops_register_memory_area(const struct rte_mempool *mp,
 				char *vaddr, rte_iova_t iova, size_t len);
 
 /**
+ * @internal wrapper for mempool_ops calc_mem_size callback.
+ * API to calculate size of memory required to store specified number of
+ * object.
+ *
+ * @param[in] mp
+ *   Pointer to the memory pool.
+ * @param[in] obj_num
+ *   Number of objects.
+ * @param[in] pg_shift
+ *   LOG2 of the physical pages size. If set to 0, ignore page boundaries.
+ * @param[out] min_chunk_size
+ *   Location for minimum size of the memory chunk which may be used to
+ *   store memory pool objects.
+ * @param[out] align
+ *   Location for required memory chunk alignment.
+ * @return
+ *   Required memory size aligned at page boundary.
+ */
+ssize_t rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
+				      uint32_t obj_num, uint32_t pg_shift,
+				      size_t *min_chunk_size, size_t *align);
+
+/**
  * @internal wrapper for mempool_ops free callback.
  *
  * @param mp
@@ -1534,7 +1618,7 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
  * of objects. Assume that the memory buffer will be aligned at page
  * boundary.
  *
- * Note that if object size is bigger then page size, then it assumes
+ * Note that if object size is bigger than page size, then it assumes
  * that pages are grouped in subsets of physically continuous pages big
  * enough to store at least one object.
  *
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index 0732255..26908cc 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -59,6 +59,7 @@ rte_mempool_register_ops(const struct rte_mempool_ops *h)
 	ops->get_count = h->get_count;
 	ops->get_capabilities = h->get_capabilities;
 	ops->register_memory_area = h->register_memory_area;
+	ops->calc_mem_size = h->calc_mem_size;
 
 	rte_spinlock_unlock(&rte_mempool_ops_table.sl);
 
@@ -123,6 +124,23 @@ rte_mempool_ops_register_memory_area(const struct rte_mempool *mp, char *vaddr,
 	return ops->register_memory_area(mp, vaddr, iova, len);
 }
 
+/* wrapper to notify new memory area to external mempool */
+ssize_t
+rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
+				uint32_t obj_num, uint32_t pg_shift,
+				size_t *min_chunk_size, size_t *align)
+{
+	struct rte_mempool_ops *ops;
+
+	ops = rte_mempool_get_ops(mp->ops_index);
+
+	if (ops->calc_mem_size == NULL)
+		return rte_mempool_op_calc_mem_size_default(mp, obj_num,
+				pg_shift, min_chunk_size, align);
+
+	return ops->calc_mem_size(mp, obj_num, pg_shift, min_chunk_size, align);
+}
+
 /* sets mempool ops previously registered by rte_mempool_register_ops. */
 int
 rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
new file mode 100644
index 0000000..57fe79b
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016 Intel Corporation.
+ * Copyright(c) 2016 6WIND S.A.
+ * Copyright(c) 2018 Solarflare Communications Inc.
+ */
+
+#include <rte_mempool.h>
+
+ssize_t
+rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
+				     uint32_t obj_num, uint32_t pg_shift,
+				     size_t *min_chunk_size, size_t *align)
+{
+	unsigned int mp_flags;
+	int ret;
+	size_t total_elt_sz;
+	size_t mem_size;
+
+	/* Get mempool capabilities */
+	mp_flags = 0;
+	ret = rte_mempool_ops_get_capabilities(mp, &mp_flags);
+	if ((ret < 0) && (ret != -ENOTSUP))
+		return ret;
+
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	mem_size = rte_mempool_xmem_size(obj_num, total_elt_sz, pg_shift,
+					 mp->flags | mp_flags);
+
+	if (mp_flags & MEMPOOL_F_CAPA_PHYS_CONTIG)
+		*min_chunk_size = mem_size;
+	else
+		*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
+
+	*align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift);
+
+	return mem_size;
+}
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 62b76f9..cb38189 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -51,3 +51,10 @@ DPDK_17.11 {
 	rte_mempool_populate_iova_tab;
 
 } DPDK_16.07;
+
+DPDK_18.05 {
+	global:
+
+	rte_mempool_op_calc_mem_size_default;
+
+} DPDK_17.11;
-- 
2.7.4

^ permalink raw reply	[relevance 6%]

* [dpdk-dev] [PATCH v4 05/11] mempool: add op to populate objects using provided memory
  2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
@ 2018-04-16 13:24  6%   ` Andrew Rybchenko
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 06/11] mempool: remove callback to get capabilities Andrew Rybchenko
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ

The callback allows to customize how objects are stored in the
memory chunk. Default implementation of the callback which simply
puts objects one by one is available.

Suggested-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Santosh Shukla <Santosh.Shukla@caviumnetworks.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
v3 -> v4:
 - none

v2 -> v3:
 - none

v1 -> v2:
 - fix memory leak if off is bigger than len

RFCv2 -> v1:
 - advertise ABI changes in release notes
 - use consistent name for default callback:
   rte_mempool_op_<callback>_default()
 - add opaque data pointer to populated object callback
 - move default callback to dedicated file

 doc/guides/rel_notes/deprecation.rst         |  2 +-
 doc/guides/rel_notes/release_18_05.rst       |  2 +
 lib/librte_mempool/rte_mempool.c             | 23 ++++---
 lib/librte_mempool/rte_mempool.h             | 90 ++++++++++++++++++++++++++++
 lib/librte_mempool/rte_mempool_ops.c         | 21 +++++++
 lib/librte_mempool/rte_mempool_ops_default.c | 24 ++++++++
 lib/librte_mempool/rte_mempool_version.map   |  1 +
 7 files changed, 149 insertions(+), 14 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 2aa5ef3..575da18 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -60,7 +60,7 @@ Deprecation Notices
 
   - removal of ``get_capabilities`` mempool ops and related flags.
   - substitute ``register_memory_area`` with ``populate`` ops.
-  - addition of new ops to customize objects population and allocate contiguous
+  - addition of new op to allocate contiguous
     block of objects if underlying driver supports it.
 
 * mbuf: The opaque ``mbuf->hash.sched`` field will be updated to support generic
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 7dbe7ac..5c6588e 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -199,6 +199,8 @@ ABI Changes
 
   A new callback ``calc_mem_size`` has been added to ``rte_mempool_ops``
   to allow to customize required memory size calculation.
+  A new callback ``populate`` has been added to ``rte_mempool_ops``
+  to allow to customize objects population.
 
 
 Removed Items
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index fdcee05..68ae12f 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -122,7 +122,8 @@ get_min_page_size(void)
 
 
 static void
-mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
+mempool_add_elem(struct rte_mempool *mp, __rte_unused void *opaque,
+		 void *obj, rte_iova_t iova)
 {
 	struct rte_mempool_objhdr *hdr;
 	struct rte_mempool_objtlr *tlr __rte_unused;
@@ -139,9 +140,6 @@ mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
 	tlr = __mempool_get_trailer(obj);
 	tlr->cookie = RTE_MEMPOOL_TRAILER_COOKIE;
 #endif
-
-	/* enqueue in ring */
-	rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
 }
 
 /* call obj_cb() for each mempool element */
@@ -420,17 +418,16 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	else
 		off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_CACHE_LINE_SIZE) - vaddr;
 
-	while (off + total_elt_sz <= len && mp->populated_size < mp->size) {
-		off += mp->header_size;
-		if (iova == RTE_BAD_IOVA)
-			mempool_add_elem(mp, (char *)vaddr + off,
-				RTE_BAD_IOVA);
-		else
-			mempool_add_elem(mp, (char *)vaddr + off, iova + off);
-		off += mp->elt_size + mp->trailer_size;
-		i++;
+	if (off > len) {
+		ret = -EINVAL;
+		goto fail;
 	}
 
+	i = rte_mempool_ops_populate(mp, mp->size - mp->populated_size,
+		(char *)vaddr + off,
+		(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off),
+		len - off, mempool_add_elem, NULL);
+
 	/* not enough room to store one object */
 	if (i == 0) {
 		ret = -EINVAL;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 191255d..754261e 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -456,6 +456,63 @@ ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 		uint32_t obj_num, uint32_t pg_shift,
 		size_t *min_chunk_size, size_t *align);
 
+/**
+ * Function to be called for each populated object.
+ *
+ * @param[in] mp
+ *   A pointer to the mempool structure.
+ * @param[in] opaque
+ *   An opaque pointer passed to iterator.
+ * @param[in] vaddr
+ *   Object virtual address.
+ * @param[in] iova
+ *   Input/output virtual address of the object or RTE_BAD_IOVA.
+ */
+typedef void (rte_mempool_populate_obj_cb_t)(struct rte_mempool *mp,
+		void *opaque, void *vaddr, rte_iova_t iova);
+
+/**
+ * Populate memory pool objects using provided memory chunk.
+ *
+ * Populated objects should be enqueued to the pool, e.g. using
+ * rte_mempool_ops_enqueue_bulk().
+ *
+ * If the given IO address is unknown (iova = RTE_BAD_IOVA),
+ * the chunk doesn't need to be physically contiguous (only virtually),
+ * and allocated objects may span two pages.
+ *
+ * @param[in] mp
+ *   A pointer to the mempool structure.
+ * @param[in] max_objs
+ *   Maximum number of objects to be populated.
+ * @param[in] vaddr
+ *   The virtual address of memory that should be used to store objects.
+ * @param[in] iova
+ *   The IO address
+ * @param[in] len
+ *   The length of memory in bytes.
+ * @param[in] obj_cb
+ *   Callback function to be executed for each populated object.
+ * @param[in] obj_cb_arg
+ *   An opaque pointer passed to the callback function.
+ * @return
+ *   The number of objects added on success.
+ *   On error, no objects are populated and a negative errno is returned.
+ */
+typedef int (*rte_mempool_populate_t)(struct rte_mempool *mp,
+		unsigned int max_objs,
+		void *vaddr, rte_iova_t iova, size_t len,
+		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
+
+/**
+ * Default way to populate memory pool object using provided memory
+ * chunk: just slice objects one by one.
+ */
+int rte_mempool_op_populate_default(struct rte_mempool *mp,
+		unsigned int max_objs,
+		void *vaddr, rte_iova_t iova, size_t len,
+		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
+
 /** Structure defining mempool operations structure */
 struct rte_mempool_ops {
 	char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
@@ -477,6 +534,11 @@ struct rte_mempool_ops {
 	 * store specified number of objects.
 	 */
 	rte_mempool_calc_mem_size_t calc_mem_size;
+	/**
+	 * Optional callback to populate mempool objects using
+	 * provided memory chunk.
+	 */
+	rte_mempool_populate_t populate;
 } __rte_cache_aligned;
 
 #define RTE_MEMPOOL_MAX_OPS_IDX 16  /**< Max registered ops structs */
@@ -649,6 +711,34 @@ ssize_t rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 				      size_t *min_chunk_size, size_t *align);
 
 /**
+ * @internal wrapper for mempool_ops populate callback.
+ *
+ * Populate memory pool objects using provided memory chunk.
+ *
+ * @param[in] mp
+ *   A pointer to the mempool structure.
+ * @param[in] max_objs
+ *   Maximum number of objects to be populated.
+ * @param[in] vaddr
+ *   The virtual address of memory that should be used to store objects.
+ * @param[in] iova
+ *   The IO address
+ * @param[in] len
+ *   The length of memory in bytes.
+ * @param[in] obj_cb
+ *   Callback function to be executed for each populated object.
+ * @param[in] obj_cb_arg
+ *   An opaque pointer passed to the callback function.
+ * @return
+ *   The number of objects added on success.
+ *   On error, no objects are populated and a negative errno is returned.
+ */
+int rte_mempool_ops_populate(struct rte_mempool *mp, unsigned int max_objs,
+			     void *vaddr, rte_iova_t iova, size_t len,
+			     rte_mempool_populate_obj_cb_t *obj_cb,
+			     void *obj_cb_arg);
+
+/**
  * @internal wrapper for mempool_ops free callback.
  *
  * @param mp
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index 26908cc..1a7f39f 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -60,6 +60,7 @@ rte_mempool_register_ops(const struct rte_mempool_ops *h)
 	ops->get_capabilities = h->get_capabilities;
 	ops->register_memory_area = h->register_memory_area;
 	ops->calc_mem_size = h->calc_mem_size;
+	ops->populate = h->populate;
 
 	rte_spinlock_unlock(&rte_mempool_ops_table.sl);
 
@@ -141,6 +142,26 @@ rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 	return ops->calc_mem_size(mp, obj_num, pg_shift, min_chunk_size, align);
 }
 
+/* wrapper to populate memory pool objects using provided memory chunk */
+int
+rte_mempool_ops_populate(struct rte_mempool *mp, unsigned int max_objs,
+				void *vaddr, rte_iova_t iova, size_t len,
+				rte_mempool_populate_obj_cb_t *obj_cb,
+				void *obj_cb_arg)
+{
+	struct rte_mempool_ops *ops;
+
+	ops = rte_mempool_get_ops(mp->ops_index);
+
+	if (ops->populate == NULL)
+		return rte_mempool_op_populate_default(mp, max_objs, vaddr,
+						       iova, len, obj_cb,
+						       obj_cb_arg);
+
+	return ops->populate(mp, max_objs, vaddr, iova, len, obj_cb,
+			     obj_cb_arg);
+}
+
 /* sets mempool ops previously registered by rte_mempool_register_ops. */
 int
 rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 57fe79b..57295f7 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -36,3 +36,27 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 
 	return mem_size;
 }
+
+int
+rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
+		void *vaddr, rte_iova_t iova, size_t len,
+		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
+{
+	size_t total_elt_sz;
+	size_t off;
+	unsigned int i;
+	void *obj;
+
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+		off += mp->header_size;
+		obj = (char *)vaddr + off;
+		obj_cb(mp, obj_cb_arg, obj,
+		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
+		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
+		off += mp->elt_size + mp->trailer_size;
+	}
+
+	return i;
+}
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index cb38189..41a0b09 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -56,5 +56,6 @@ DPDK_18.05 {
 	global:
 
 	rte_mempool_op_calc_mem_size_default;
+	rte_mempool_op_populate_default;
 
 } DPDK_17.11;
-- 
2.7.4

^ permalink raw reply	[relevance 6%]

* Re: [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (5 preceding siblings ...)
  2018-04-13 18:30  2% ` [dpdk-dev] [PATCH v3 12/13] eal: replace rte_panic instances in init sequence Arnon Warshavsky
@ 2018-04-16 11:22  0% ` Burakov, Anatoly
  6 siblings, 0 replies; 200+ results
From: Burakov, Anatoly @ 2018-04-16 11:22 UTC (permalink / raw)
  To: Arnon Warshavsky, thomas, wenzhuo.lu, declan.doherty,
	jerin.jacob, bruce.richardson, ferruh.yigit
  Cc: dev

On 13-Apr-18 7:30 PM, Arnon Warshavsky wrote:
> The purpose of this patch series is to cleanup the library code
> from paths that end up aborting the process,
> and move to checking error values, in order to allow the running process
> perform an orderly teardown or other mitigation of the event.
> 
> This patch modifies the majority of rte_panic calls
> under lib and drivers, and replaces them with a log message
> and an error return code according to context,
> that can be propagated up the call stack.
> 
> - Focus was given to the dpdk initialization path
> - Some of the panic calls within drivers were left in place where
>    the call is from within an interrupt or calls that are
>    on the data path,where there is no simple applicative
>    route to propagate the error to temination.
>    These should be handled by the driver maintainers.
> - In order to avoid breaking ABI where panic was called from public
>    void functions, a panic state variable was introduced so that
>    it can be queried after calling these void functions.
>    This tool place for a single function call.
> - local void functions with no api were changed to retrun a value
>    where needed
> - No change took place in example and test files
> - No change took place for debug assertions calling panic
> - A new function was added to devtools/checkpatches.sh
>    in order to prevent new additions of calls to rte_panic
>    under lib and drivers.
> 
> Keep calm and don't panic
> 
> ---
> 
> v2:
> - reformat error messages so that literal string are in the same line
> - fix typo in commit message
> - add new return code to doxigen of rte_memzone_free()
> 
> v3:
> - submit  all 13 patches changed and unchanged in the same patchset
> 

This patchset needs to be rebased. There were a few changes that make 
some of the patches unnecessary.

Changes in patch 7 and 9 were addressed in earlier memory hotplug 
patchset, and are no longer applicable. Some things may have changed for 
patch 12 as well.

-- 
Thanks,
Anatoly

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v10 02/10] crypto/virtio: support virtio device init
  @ 2018-04-16  2:21  1%   ` Jay Zhou
  0 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-04-16  2:21 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

This patch implements the initialization of the virtio crypto device.
The virtio crypto device conforms to virtio-1.0, so this patch only
supports modern mode operation.
The cryptodev is created at the virtio crypto pci device probing stage.
The function of virtio_crypto_pkt_tx_burst() is used to burst transfer
packets and virtio_crypto_pkt_rx_burst() is used to burst receive packets.

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Fan Zhang <roy.fan.zhang@intel.com>
---
 drivers/crypto/virtio/Makefile           |   3 +
 drivers/crypto/virtio/meson.build        |   3 +-
 drivers/crypto/virtio/virtio_cryptodev.c | 245 +++++++++++++++-
 drivers/crypto/virtio/virtio_cryptodev.h |  13 +
 drivers/crypto/virtio/virtio_logs.h      |  91 ++++++
 drivers/crypto/virtio/virtio_pci.c       | 462 +++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h       | 252 +++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h      | 137 +++++++++
 drivers/crypto/virtio/virtio_rxtx.c      |  26 ++
 drivers/crypto/virtio/virtqueue.c        |  43 +++
 drivers/crypto/virtio/virtqueue.h        | 171 ++++++++++++
 11 files changed, 1443 insertions(+), 3 deletions(-)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtio_rxtx.c
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/drivers/crypto/virtio/Makefile b/drivers/crypto/virtio/Makefile
index 2f04f0c..786afb8 100644
--- a/drivers/crypto/virtio/Makefile
+++ b/drivers/crypto/virtio/Makefile
@@ -18,6 +18,9 @@ LIBABIVER := 1
 #
 # all source are stored in SRCS-y
 #
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtqueue.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_pci.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_cryptodev.c
 
 # this lib depends upon:
diff --git a/drivers/crypto/virtio/meson.build b/drivers/crypto/virtio/meson.build
index 57d84c4..cee77cc 100644
--- a/drivers/crypto/virtio/meson.build
+++ b/drivers/crypto/virtio/meson.build
@@ -6,6 +6,7 @@ if not dep.found()
 	build = false
 endif
 deps += ['bus_pci']
-sources = files('virtio_cryptodev.c')
+sources = files('virtio_cryptodev.c', 'virtio_pci.c',
+		'virtio_rxtx.c', 'virtqueue.c')
 ext_deps += dep
 pkgconfig_extra_libs += '-lcrypto'
diff --git a/drivers/crypto/virtio/virtio_cryptodev.c b/drivers/crypto/virtio/virtio_cryptodev.c
index 3e54942..3fe2c80 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.c
+++ b/drivers/crypto/virtio/virtio_cryptodev.c
@@ -3,27 +3,240 @@
  */
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
 #include <rte_cryptodev_pmd.h>
+#include <rte_eal.h>
 #include "virtio_cryptodev.h"
+#include "virtqueue.h"
+
+int virtio_crypto_logtype_init;
+int virtio_crypto_logtype_session;
+int virtio_crypto_logtype_rx;
+int virtio_crypto_logtype_tx;
+int virtio_crypto_logtype_driver;
+
+/*
+ * The set of PCI devices this driver supports
+ */
+static const struct rte_pci_id pci_id_virtio_crypto_map[] = {
+	{ RTE_PCI_DEVICE(VIRTIO_CRYPTO_PCI_VENDORID,
+				VIRTIO_CRYPTO_PCI_DEVICEID) },
+	{ .vendor_id = 0, /* sentinel */ },
+};
 
 uint8_t cryptodev_virtio_driver_id;
 
+/*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+static struct rte_cryptodev_ops virtio_crypto_dev_ops = {
+	/* Device related operations */
+	.dev_configure			 = NULL,
+	.dev_start			 = NULL,
+	.dev_stop			 = NULL,
+	.dev_close			 = NULL,
+	.dev_infos_get			 = NULL,
+
+	.stats_get			 = NULL,
+	.stats_reset			 = NULL,
+
+	.queue_pair_setup                = NULL,
+	.queue_pair_release              = NULL,
+	.queue_pair_start                = NULL,
+	.queue_pair_stop                 = NULL,
+	.queue_pair_count                = NULL,
+
+	/* Crypto related operations */
+	.session_get_size	= NULL,
+	.session_configure	= NULL,
+	.session_clear		= NULL,
+	.qp_attach_session = NULL,
+	.qp_detach_session = NULL
+};
+
+static int
+virtio_negotiate_features(struct virtio_crypto_hw *hw, uint64_t req_features)
+{
+	uint64_t host_features;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Prepare guest_features: feature that driver wants to support */
+	VIRTIO_CRYPTO_INIT_LOG_DBG("guest_features before negotiate = %" PRIx64,
+		req_features);
+
+	/* Read device(host) feature bits */
+	host_features = VTPCI_OPS(hw)->get_features(hw);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("host_features before negotiate = %" PRIx64,
+		host_features);
+
+	/*
+	 * Negotiate features: Subset of device feature bits are written back
+	 * guest feature bits.
+	 */
+	hw->guest_features = req_features;
+	hw->guest_features = vtpci_cryptodev_negotiate_features(hw,
+							host_features);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("features after negotiate = %" PRIx64,
+		hw->guest_features);
+
+	if (hw->modern) {
+		if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"VIRTIO_F_VERSION_1 features is not enabled.");
+			return -1;
+		}
+		vtpci_cryptodev_set_status(hw,
+			VIRTIO_CONFIG_STATUS_FEATURES_OK);
+		if (!(vtpci_cryptodev_get_status(hw) &
+			VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR("failed to set FEATURES_OK "
+						"status!");
+			return -1;
+		}
+	}
+
+	hw->req_guest_features = req_features;
+
+	return 0;
+}
+
+/* reset device and renegotiate features if needed */
+static int
+virtio_crypto_init_device(struct rte_cryptodev *cryptodev,
+	uint64_t req_features)
+{
+	struct virtio_crypto_hw *hw = cryptodev->data->dev_private;
+	struct virtio_crypto_config local_config;
+	struct virtio_crypto_config *config = &local_config;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Reset the device although not necessary at startup */
+	vtpci_cryptodev_reset(hw);
+
+	/* Tell the host we've noticed this device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
+
+	/* Tell the host we've known how to drive the device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
+	if (virtio_negotiate_features(hw, req_features) < 0)
+		return -1;
+
+	/* Get status of the device */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, status),
+		&config->status, sizeof(config->status));
+	if (config->status != VIRTIO_CRYPTO_S_HW_READY) {
+		VIRTIO_CRYPTO_DRV_LOG_ERR("accelerator hardware is "
+				"not ready");
+		return -1;
+	}
+
+	/* Get number of data queues */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, max_dataqueues),
+		&config->max_dataqueues,
+		sizeof(config->max_dataqueues));
+	hw->max_dataqueues = config->max_dataqueues;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("hw->max_dataqueues=%d",
+		hw->max_dataqueues);
+
+	return 0;
+}
+
+/*
+ * This function is based on probe() function
+ * It returns 0 on success.
+ */
+static int
+crypto_virtio_create(const char *name, struct rte_pci_device *pci_dev,
+		struct rte_cryptodev_pmd_init_params *init_params)
+{
+	struct rte_cryptodev *cryptodev;
+	struct virtio_crypto_hw *hw;
+
+	PMD_INIT_FUNC_TRACE();
+
+	cryptodev = rte_cryptodev_pmd_create(name, &pci_dev->device,
+					init_params);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
+	cryptodev->driver_id = cryptodev_virtio_driver_id;
+	cryptodev->dev_ops = &virtio_crypto_dev_ops;
+
+	cryptodev->enqueue_burst = virtio_crypto_pkt_tx_burst;
+	cryptodev->dequeue_burst = virtio_crypto_pkt_rx_burst;
+
+	cryptodev->feature_flags = RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO |
+		RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING;
+
+	hw = cryptodev->data->dev_private;
+	hw->dev_id = cryptodev->data->dev_id;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("dev %d vendorID=0x%x deviceID=0x%x",
+		cryptodev->data->dev_id, pci_dev->id.vendor_id,
+		pci_dev->id.device_id);
+
+	/* pci device init */
+	if (vtpci_cryptodev_init(pci_dev, hw))
+		return -1;
+
+	if (virtio_crypto_init_device(cryptodev,
+			VIRTIO_CRYPTO_PMD_GUEST_FEATURES) < 0)
+		return -1;
+
+	return 0;
+}
+
 static int
 crypto_virtio_pci_probe(
 	struct rte_pci_driver *pci_drv __rte_unused,
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
-	return 0;
+	struct rte_cryptodev_pmd_init_params init_params = {
+		.name = "",
+		.socket_id = rte_socket_id(),
+		.private_data_size = sizeof(struct virtio_crypto_hw),
+		.max_nb_sessions = RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS
+	};
+	char name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	VIRTIO_CRYPTO_DRV_LOG_DBG("Found Crypto device at %02x:%02x.%x",
+			pci_dev->addr.bus,
+			pci_dev->addr.devid,
+			pci_dev->addr.function);
+
+	rte_pci_device_name(&pci_dev->addr, name, sizeof(name));
+
+	return crypto_virtio_create(name, pci_dev, &init_params);
 }
 
 static int
 crypto_virtio_pci_remove(
 	struct rte_pci_device *pci_dev __rte_unused)
 {
+	struct rte_cryptodev *cryptodev;
+	char cryptodev_name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	rte_pci_device_name(&pci_dev->addr, cryptodev_name,
+			sizeof(cryptodev_name));
+
+	cryptodev = rte_cryptodev_pmd_get_named_dev(cryptodev_name);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
 	return 0;
 }
 
 static struct rte_pci_driver rte_virtio_crypto_driver = {
+	.id_table = pci_id_virtio_crypto_map,
+	.drv_flags = 0,
 	.probe = crypto_virtio_pci_probe,
 	.remove = crypto_virtio_pci_remove
 };
@@ -34,3 +247,31 @@
 RTE_PMD_REGISTER_CRYPTO_DRIVER(virtio_crypto_drv,
 	rte_virtio_crypto_driver.driver,
 	cryptodev_virtio_driver_id);
+
+RTE_INIT(virtio_crypto_init_log);
+static void
+virtio_crypto_init_log(void)
+{
+	virtio_crypto_logtype_init = rte_log_register("pmd.crypto.virtio.init");
+	if (virtio_crypto_logtype_init >= 0)
+		rte_log_set_level(virtio_crypto_logtype_init, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_session =
+		rte_log_register("pmd.crypto.virtio.session");
+	if (virtio_crypto_logtype_session >= 0)
+		rte_log_set_level(virtio_crypto_logtype_session,
+				RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_rx = rte_log_register("pmd.crypto.virtio.rx");
+	if (virtio_crypto_logtype_rx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_rx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_tx = rte_log_register("pmd.crypto.virtio.tx");
+	if (virtio_crypto_logtype_tx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_tx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_driver =
+		rte_log_register("pmd.crypto.virtio.driver");
+	if (virtio_crypto_logtype_driver >= 0)
+		rte_log_set_level(virtio_crypto_logtype_driver, RTE_LOG_NOTICE);
+}
diff --git a/drivers/crypto/virtio/virtio_cryptodev.h b/drivers/crypto/virtio/virtio_cryptodev.h
index 44517b8..392db4a 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.h
+++ b/drivers/crypto/virtio/virtio_cryptodev.h
@@ -5,6 +5,19 @@
 #ifndef _VIRTIO_CRYPTODEV_H_
 #define _VIRTIO_CRYPTODEV_H_
 
+#include <rte_cryptodev.h>
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_CRYPTO_PMD_GUEST_FEATURES (1ULL << VIRTIO_F_VERSION_1)
+
 #define CRYPTODEV_NAME_VIRTIO_PMD crypto_virtio
 
+uint16_t virtio_crypto_pkt_tx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
+uint16_t virtio_crypto_pkt_rx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* _VIRTIO_CRYPTODEV_H_ */
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..832c465
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * Following macros are derived from linux/pci_regs.h, however,
+ * we can't simply include that header here, as there is no such
+ * file for non-Linux platform.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t aval_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability,
+			 * that's why we also check if msix is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is error mapping with VFIO/UIO.
+ *   if port map error when driver type is KDRV_NONE.
+ *   if whitelisted but driver type is KDRV_UNKNOWN.
+ * Return 1 if kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Try if we can succeed reading virtio pci caps, which exists
+	 * only on modern pci device. If failed, we fallback to legacy
+	 * virtio handling.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..d4cefb2
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure stores
+ * some infos that may vary in the multiple process model locally.
+ * For example, the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other size, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
diff --git a/drivers/crypto/virtio/virtio_rxtx.c b/drivers/crypto/virtio/virtio_rxtx.c
new file mode 100644
index 0000000..51f6e09
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_rxtx.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+#include "virtio_cryptodev.h"
+
+uint16_t
+virtio_crypto_pkt_rx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **rx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_rx = 0;
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_crypto_pkt_tx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **tx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_tx = 0;
+
+	return nb_tx;
+}
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint host
+	 * not to interrupt when it consumes packets
+	 * Note: this is only considered a hint to the host
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..9c905d5
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring*/
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notificaiton is through io port operation
+	 * which is a serialization instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
-- 
1.8.3.1

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v9 02/11] crypto/virtio: support virtio device init
  @ 2018-04-15  8:51  1%   ` Jay Zhou
  0 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-04-15  8:51 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

This patch implements the initialization of the virtio crypto device.
The virtio crypto device conforms to virtio-1.0, so this patch only
supports modern mode operation.
The cryptodev is created at the virtio crypto pci device probing stage.
The function of virtio_crypto_pkt_tx_burst() is used to burst transfer
packets and virtio_crypto_pkt_rx_burst() is used to burst receive packets.

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Fan Zhang <roy.fan.zhang@intel.com>
---
 drivers/crypto/virtio/Makefile           |   3 +
 drivers/crypto/virtio/virtio_cryptodev.c | 245 +++++++++++++++-
 drivers/crypto/virtio/virtio_cryptodev.h |  13 +
 drivers/crypto/virtio/virtio_logs.h      |  91 ++++++
 drivers/crypto/virtio/virtio_pci.c       | 462 +++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h       | 252 +++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h      | 137 +++++++++
 drivers/crypto/virtio/virtio_rxtx.c      |  26 ++
 drivers/crypto/virtio/virtqueue.c        |  43 +++
 drivers/crypto/virtio/virtqueue.h        | 171 ++++++++++++
 10 files changed, 1441 insertions(+), 2 deletions(-)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtio_rxtx.c
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/drivers/crypto/virtio/Makefile b/drivers/crypto/virtio/Makefile
index 2f04f0c..786afb8 100644
--- a/drivers/crypto/virtio/Makefile
+++ b/drivers/crypto/virtio/Makefile
@@ -18,6 +18,9 @@ LIBABIVER := 1
 #
 # all source are stored in SRCS-y
 #
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtqueue.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_pci.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_cryptodev.c
 
 # this lib depends upon:
diff --git a/drivers/crypto/virtio/virtio_cryptodev.c b/drivers/crypto/virtio/virtio_cryptodev.c
index 3e54942..3fe2c80 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.c
+++ b/drivers/crypto/virtio/virtio_cryptodev.c
@@ -3,27 +3,240 @@
  */
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
 #include <rte_cryptodev_pmd.h>
+#include <rte_eal.h>
 #include "virtio_cryptodev.h"
+#include "virtqueue.h"
+
+int virtio_crypto_logtype_init;
+int virtio_crypto_logtype_session;
+int virtio_crypto_logtype_rx;
+int virtio_crypto_logtype_tx;
+int virtio_crypto_logtype_driver;
+
+/*
+ * The set of PCI devices this driver supports
+ */
+static const struct rte_pci_id pci_id_virtio_crypto_map[] = {
+	{ RTE_PCI_DEVICE(VIRTIO_CRYPTO_PCI_VENDORID,
+				VIRTIO_CRYPTO_PCI_DEVICEID) },
+	{ .vendor_id = 0, /* sentinel */ },
+};
 
 uint8_t cryptodev_virtio_driver_id;
 
+/*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+static struct rte_cryptodev_ops virtio_crypto_dev_ops = {
+	/* Device related operations */
+	.dev_configure			 = NULL,
+	.dev_start			 = NULL,
+	.dev_stop			 = NULL,
+	.dev_close			 = NULL,
+	.dev_infos_get			 = NULL,
+
+	.stats_get			 = NULL,
+	.stats_reset			 = NULL,
+
+	.queue_pair_setup                = NULL,
+	.queue_pair_release              = NULL,
+	.queue_pair_start                = NULL,
+	.queue_pair_stop                 = NULL,
+	.queue_pair_count                = NULL,
+
+	/* Crypto related operations */
+	.session_get_size	= NULL,
+	.session_configure	= NULL,
+	.session_clear		= NULL,
+	.qp_attach_session = NULL,
+	.qp_detach_session = NULL
+};
+
+static int
+virtio_negotiate_features(struct virtio_crypto_hw *hw, uint64_t req_features)
+{
+	uint64_t host_features;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Prepare guest_features: feature that driver wants to support */
+	VIRTIO_CRYPTO_INIT_LOG_DBG("guest_features before negotiate = %" PRIx64,
+		req_features);
+
+	/* Read device(host) feature bits */
+	host_features = VTPCI_OPS(hw)->get_features(hw);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("host_features before negotiate = %" PRIx64,
+		host_features);
+
+	/*
+	 * Negotiate features: Subset of device feature bits are written back
+	 * guest feature bits.
+	 */
+	hw->guest_features = req_features;
+	hw->guest_features = vtpci_cryptodev_negotiate_features(hw,
+							host_features);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("features after negotiate = %" PRIx64,
+		hw->guest_features);
+
+	if (hw->modern) {
+		if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"VIRTIO_F_VERSION_1 features is not enabled.");
+			return -1;
+		}
+		vtpci_cryptodev_set_status(hw,
+			VIRTIO_CONFIG_STATUS_FEATURES_OK);
+		if (!(vtpci_cryptodev_get_status(hw) &
+			VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR("failed to set FEATURES_OK "
+						"status!");
+			return -1;
+		}
+	}
+
+	hw->req_guest_features = req_features;
+
+	return 0;
+}
+
+/* reset device and renegotiate features if needed */
+static int
+virtio_crypto_init_device(struct rte_cryptodev *cryptodev,
+	uint64_t req_features)
+{
+	struct virtio_crypto_hw *hw = cryptodev->data->dev_private;
+	struct virtio_crypto_config local_config;
+	struct virtio_crypto_config *config = &local_config;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Reset the device although not necessary at startup */
+	vtpci_cryptodev_reset(hw);
+
+	/* Tell the host we've noticed this device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
+
+	/* Tell the host we've known how to drive the device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
+	if (virtio_negotiate_features(hw, req_features) < 0)
+		return -1;
+
+	/* Get status of the device */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, status),
+		&config->status, sizeof(config->status));
+	if (config->status != VIRTIO_CRYPTO_S_HW_READY) {
+		VIRTIO_CRYPTO_DRV_LOG_ERR("accelerator hardware is "
+				"not ready");
+		return -1;
+	}
+
+	/* Get number of data queues */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, max_dataqueues),
+		&config->max_dataqueues,
+		sizeof(config->max_dataqueues));
+	hw->max_dataqueues = config->max_dataqueues;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("hw->max_dataqueues=%d",
+		hw->max_dataqueues);
+
+	return 0;
+}
+
+/*
+ * This function is based on probe() function
+ * It returns 0 on success.
+ */
+static int
+crypto_virtio_create(const char *name, struct rte_pci_device *pci_dev,
+		struct rte_cryptodev_pmd_init_params *init_params)
+{
+	struct rte_cryptodev *cryptodev;
+	struct virtio_crypto_hw *hw;
+
+	PMD_INIT_FUNC_TRACE();
+
+	cryptodev = rte_cryptodev_pmd_create(name, &pci_dev->device,
+					init_params);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
+	cryptodev->driver_id = cryptodev_virtio_driver_id;
+	cryptodev->dev_ops = &virtio_crypto_dev_ops;
+
+	cryptodev->enqueue_burst = virtio_crypto_pkt_tx_burst;
+	cryptodev->dequeue_burst = virtio_crypto_pkt_rx_burst;
+
+	cryptodev->feature_flags = RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO |
+		RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING;
+
+	hw = cryptodev->data->dev_private;
+	hw->dev_id = cryptodev->data->dev_id;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("dev %d vendorID=0x%x deviceID=0x%x",
+		cryptodev->data->dev_id, pci_dev->id.vendor_id,
+		pci_dev->id.device_id);
+
+	/* pci device init */
+	if (vtpci_cryptodev_init(pci_dev, hw))
+		return -1;
+
+	if (virtio_crypto_init_device(cryptodev,
+			VIRTIO_CRYPTO_PMD_GUEST_FEATURES) < 0)
+		return -1;
+
+	return 0;
+}
+
 static int
 crypto_virtio_pci_probe(
 	struct rte_pci_driver *pci_drv __rte_unused,
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
-	return 0;
+	struct rte_cryptodev_pmd_init_params init_params = {
+		.name = "",
+		.socket_id = rte_socket_id(),
+		.private_data_size = sizeof(struct virtio_crypto_hw),
+		.max_nb_sessions = RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS
+	};
+	char name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	VIRTIO_CRYPTO_DRV_LOG_DBG("Found Crypto device at %02x:%02x.%x",
+			pci_dev->addr.bus,
+			pci_dev->addr.devid,
+			pci_dev->addr.function);
+
+	rte_pci_device_name(&pci_dev->addr, name, sizeof(name));
+
+	return crypto_virtio_create(name, pci_dev, &init_params);
 }
 
 static int
 crypto_virtio_pci_remove(
 	struct rte_pci_device *pci_dev __rte_unused)
 {
+	struct rte_cryptodev *cryptodev;
+	char cryptodev_name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	rte_pci_device_name(&pci_dev->addr, cryptodev_name,
+			sizeof(cryptodev_name));
+
+	cryptodev = rte_cryptodev_pmd_get_named_dev(cryptodev_name);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
 	return 0;
 }
 
 static struct rte_pci_driver rte_virtio_crypto_driver = {
+	.id_table = pci_id_virtio_crypto_map,
+	.drv_flags = 0,
 	.probe = crypto_virtio_pci_probe,
 	.remove = crypto_virtio_pci_remove
 };
@@ -34,3 +247,31 @@
 RTE_PMD_REGISTER_CRYPTO_DRIVER(virtio_crypto_drv,
 	rte_virtio_crypto_driver.driver,
 	cryptodev_virtio_driver_id);
+
+RTE_INIT(virtio_crypto_init_log);
+static void
+virtio_crypto_init_log(void)
+{
+	virtio_crypto_logtype_init = rte_log_register("pmd.crypto.virtio.init");
+	if (virtio_crypto_logtype_init >= 0)
+		rte_log_set_level(virtio_crypto_logtype_init, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_session =
+		rte_log_register("pmd.crypto.virtio.session");
+	if (virtio_crypto_logtype_session >= 0)
+		rte_log_set_level(virtio_crypto_logtype_session,
+				RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_rx = rte_log_register("pmd.crypto.virtio.rx");
+	if (virtio_crypto_logtype_rx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_rx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_tx = rte_log_register("pmd.crypto.virtio.tx");
+	if (virtio_crypto_logtype_tx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_tx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_driver =
+		rte_log_register("pmd.crypto.virtio.driver");
+	if (virtio_crypto_logtype_driver >= 0)
+		rte_log_set_level(virtio_crypto_logtype_driver, RTE_LOG_NOTICE);
+}
diff --git a/drivers/crypto/virtio/virtio_cryptodev.h b/drivers/crypto/virtio/virtio_cryptodev.h
index 44517b8..392db4a 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.h
+++ b/drivers/crypto/virtio/virtio_cryptodev.h
@@ -5,6 +5,19 @@
 #ifndef _VIRTIO_CRYPTODEV_H_
 #define _VIRTIO_CRYPTODEV_H_
 
+#include <rte_cryptodev.h>
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_CRYPTO_PMD_GUEST_FEATURES (1ULL << VIRTIO_F_VERSION_1)
+
 #define CRYPTODEV_NAME_VIRTIO_PMD crypto_virtio
 
+uint16_t virtio_crypto_pkt_tx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
+uint16_t virtio_crypto_pkt_rx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* _VIRTIO_CRYPTODEV_H_ */
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..832c465
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * Following macros are derived from linux/pci_regs.h, however,
+ * we can't simply include that header here, as there is no such
+ * file for non-Linux platform.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t aval_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability,
+			 * that's why we also check if msix is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is error mapping with VFIO/UIO.
+ *   if port map error when driver type is KDRV_NONE.
+ *   if whitelisted but driver type is KDRV_UNKNOWN.
+ * Return 1 if kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Try if we can succeed reading virtio pci caps, which exists
+	 * only on modern pci device. If failed, we fallback to legacy
+	 * virtio handling.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..d4cefb2
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure stores
+ * some infos that may vary in the multiple process model locally.
+ * For example, the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other size, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
diff --git a/drivers/crypto/virtio/virtio_rxtx.c b/drivers/crypto/virtio/virtio_rxtx.c
new file mode 100644
index 0000000..51f6e09
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_rxtx.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+#include "virtio_cryptodev.h"
+
+uint16_t
+virtio_crypto_pkt_rx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **rx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_rx = 0;
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_crypto_pkt_tx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **tx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_tx = 0;
+
+	return nb_tx;
+}
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint host
+	 * not to interrupt when it consumes packets
+	 * Note: this is only considered a hint to the host
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..9c905d5
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring*/
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notificaiton is through io port operation
+	 * which is a serialization instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
-- 
1.8.3.1

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v8 02/11] crypto/virtio: support virtio device init
  @ 2018-04-14  9:34  1%   ` Jay Zhou
  0 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-04-14  9:34 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

This patch implements the initialization of the virtio crypto device.
The virtio crypto device conforms to virtio-1.0, so this patch only
supports modern mode operation.
The cryptodev is created at the virtio crypto pci device probing stage.
The function of virtio_crypto_pkt_tx_burst() is used to burst transfer
packets and virtio_crypto_pkt_rx_burst() is used to burst receive packets.

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Fan Zhang <roy.fan.zhang@intel.com>
---
 drivers/crypto/virtio/Makefile           |   3 +
 drivers/crypto/virtio/virtio_cryptodev.c | 245 +++++++++++++++-
 drivers/crypto/virtio/virtio_cryptodev.h |  13 +
 drivers/crypto/virtio/virtio_logs.h      |  91 ++++++
 drivers/crypto/virtio/virtio_pci.c       | 460 +++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h       | 252 +++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h      | 137 +++++++++
 drivers/crypto/virtio/virtio_rxtx.c      |  26 ++
 drivers/crypto/virtio/virtqueue.c        |  43 +++
 drivers/crypto/virtio/virtqueue.h        | 171 ++++++++++++
 10 files changed, 1439 insertions(+), 2 deletions(-)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtio_rxtx.c
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/drivers/crypto/virtio/Makefile b/drivers/crypto/virtio/Makefile
index a3b44e9..c4727ea 100644
--- a/drivers/crypto/virtio/Makefile
+++ b/drivers/crypto/virtio/Makefile
@@ -18,6 +18,9 @@ LIBABIVER := 1
 #
 # all source are stored in SRCS-y
 #
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtqueue.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_pci.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_cryptodev.c
 
 # this lib depends upon:
diff --git a/drivers/crypto/virtio/virtio_cryptodev.c b/drivers/crypto/virtio/virtio_cryptodev.c
index 3e54942..3fe2c80 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.c
+++ b/drivers/crypto/virtio/virtio_cryptodev.c
@@ -3,27 +3,240 @@
  */
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
 #include <rte_cryptodev_pmd.h>
+#include <rte_eal.h>
 #include "virtio_cryptodev.h"
+#include "virtqueue.h"
+
+int virtio_crypto_logtype_init;
+int virtio_crypto_logtype_session;
+int virtio_crypto_logtype_rx;
+int virtio_crypto_logtype_tx;
+int virtio_crypto_logtype_driver;
+
+/*
+ * The set of PCI devices this driver supports
+ */
+static const struct rte_pci_id pci_id_virtio_crypto_map[] = {
+	{ RTE_PCI_DEVICE(VIRTIO_CRYPTO_PCI_VENDORID,
+				VIRTIO_CRYPTO_PCI_DEVICEID) },
+	{ .vendor_id = 0, /* sentinel */ },
+};
 
 uint8_t cryptodev_virtio_driver_id;
 
+/*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+static struct rte_cryptodev_ops virtio_crypto_dev_ops = {
+	/* Device related operations */
+	.dev_configure			 = NULL,
+	.dev_start			 = NULL,
+	.dev_stop			 = NULL,
+	.dev_close			 = NULL,
+	.dev_infos_get			 = NULL,
+
+	.stats_get			 = NULL,
+	.stats_reset			 = NULL,
+
+	.queue_pair_setup                = NULL,
+	.queue_pair_release              = NULL,
+	.queue_pair_start                = NULL,
+	.queue_pair_stop                 = NULL,
+	.queue_pair_count                = NULL,
+
+	/* Crypto related operations */
+	.session_get_size	= NULL,
+	.session_configure	= NULL,
+	.session_clear		= NULL,
+	.qp_attach_session = NULL,
+	.qp_detach_session = NULL
+};
+
+static int
+virtio_negotiate_features(struct virtio_crypto_hw *hw, uint64_t req_features)
+{
+	uint64_t host_features;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Prepare guest_features: feature that driver wants to support */
+	VIRTIO_CRYPTO_INIT_LOG_DBG("guest_features before negotiate = %" PRIx64,
+		req_features);
+
+	/* Read device(host) feature bits */
+	host_features = VTPCI_OPS(hw)->get_features(hw);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("host_features before negotiate = %" PRIx64,
+		host_features);
+
+	/*
+	 * Negotiate features: Subset of device feature bits are written back
+	 * guest feature bits.
+	 */
+	hw->guest_features = req_features;
+	hw->guest_features = vtpci_cryptodev_negotiate_features(hw,
+							host_features);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("features after negotiate = %" PRIx64,
+		hw->guest_features);
+
+	if (hw->modern) {
+		if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"VIRTIO_F_VERSION_1 features is not enabled.");
+			return -1;
+		}
+		vtpci_cryptodev_set_status(hw,
+			VIRTIO_CONFIG_STATUS_FEATURES_OK);
+		if (!(vtpci_cryptodev_get_status(hw) &
+			VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR("failed to set FEATURES_OK "
+						"status!");
+			return -1;
+		}
+	}
+
+	hw->req_guest_features = req_features;
+
+	return 0;
+}
+
+/* reset device and renegotiate features if needed */
+static int
+virtio_crypto_init_device(struct rte_cryptodev *cryptodev,
+	uint64_t req_features)
+{
+	struct virtio_crypto_hw *hw = cryptodev->data->dev_private;
+	struct virtio_crypto_config local_config;
+	struct virtio_crypto_config *config = &local_config;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Reset the device although not necessary at startup */
+	vtpci_cryptodev_reset(hw);
+
+	/* Tell the host we've noticed this device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
+
+	/* Tell the host we've known how to drive the device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
+	if (virtio_negotiate_features(hw, req_features) < 0)
+		return -1;
+
+	/* Get status of the device */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, status),
+		&config->status, sizeof(config->status));
+	if (config->status != VIRTIO_CRYPTO_S_HW_READY) {
+		VIRTIO_CRYPTO_DRV_LOG_ERR("accelerator hardware is "
+				"not ready");
+		return -1;
+	}
+
+	/* Get number of data queues */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, max_dataqueues),
+		&config->max_dataqueues,
+		sizeof(config->max_dataqueues));
+	hw->max_dataqueues = config->max_dataqueues;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("hw->max_dataqueues=%d",
+		hw->max_dataqueues);
+
+	return 0;
+}
+
+/*
+ * This function is based on probe() function
+ * It returns 0 on success.
+ */
+static int
+crypto_virtio_create(const char *name, struct rte_pci_device *pci_dev,
+		struct rte_cryptodev_pmd_init_params *init_params)
+{
+	struct rte_cryptodev *cryptodev;
+	struct virtio_crypto_hw *hw;
+
+	PMD_INIT_FUNC_TRACE();
+
+	cryptodev = rte_cryptodev_pmd_create(name, &pci_dev->device,
+					init_params);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
+	cryptodev->driver_id = cryptodev_virtio_driver_id;
+	cryptodev->dev_ops = &virtio_crypto_dev_ops;
+
+	cryptodev->enqueue_burst = virtio_crypto_pkt_tx_burst;
+	cryptodev->dequeue_burst = virtio_crypto_pkt_rx_burst;
+
+	cryptodev->feature_flags = RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO |
+		RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING;
+
+	hw = cryptodev->data->dev_private;
+	hw->dev_id = cryptodev->data->dev_id;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("dev %d vendorID=0x%x deviceID=0x%x",
+		cryptodev->data->dev_id, pci_dev->id.vendor_id,
+		pci_dev->id.device_id);
+
+	/* pci device init */
+	if (vtpci_cryptodev_init(pci_dev, hw))
+		return -1;
+
+	if (virtio_crypto_init_device(cryptodev,
+			VIRTIO_CRYPTO_PMD_GUEST_FEATURES) < 0)
+		return -1;
+
+	return 0;
+}
+
 static int
 crypto_virtio_pci_probe(
 	struct rte_pci_driver *pci_drv __rte_unused,
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
-	return 0;
+	struct rte_cryptodev_pmd_init_params init_params = {
+		.name = "",
+		.socket_id = rte_socket_id(),
+		.private_data_size = sizeof(struct virtio_crypto_hw),
+		.max_nb_sessions = RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS
+	};
+	char name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	VIRTIO_CRYPTO_DRV_LOG_DBG("Found Crypto device at %02x:%02x.%x",
+			pci_dev->addr.bus,
+			pci_dev->addr.devid,
+			pci_dev->addr.function);
+
+	rte_pci_device_name(&pci_dev->addr, name, sizeof(name));
+
+	return crypto_virtio_create(name, pci_dev, &init_params);
 }
 
 static int
 crypto_virtio_pci_remove(
 	struct rte_pci_device *pci_dev __rte_unused)
 {
+	struct rte_cryptodev *cryptodev;
+	char cryptodev_name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	rte_pci_device_name(&pci_dev->addr, cryptodev_name,
+			sizeof(cryptodev_name));
+
+	cryptodev = rte_cryptodev_pmd_get_named_dev(cryptodev_name);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
 	return 0;
 }
 
 static struct rte_pci_driver rte_virtio_crypto_driver = {
+	.id_table = pci_id_virtio_crypto_map,
+	.drv_flags = 0,
 	.probe = crypto_virtio_pci_probe,
 	.remove = crypto_virtio_pci_remove
 };
@@ -34,3 +247,31 @@
 RTE_PMD_REGISTER_CRYPTO_DRIVER(virtio_crypto_drv,
 	rte_virtio_crypto_driver.driver,
 	cryptodev_virtio_driver_id);
+
+RTE_INIT(virtio_crypto_init_log);
+static void
+virtio_crypto_init_log(void)
+{
+	virtio_crypto_logtype_init = rte_log_register("pmd.crypto.virtio.init");
+	if (virtio_crypto_logtype_init >= 0)
+		rte_log_set_level(virtio_crypto_logtype_init, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_session =
+		rte_log_register("pmd.crypto.virtio.session");
+	if (virtio_crypto_logtype_session >= 0)
+		rte_log_set_level(virtio_crypto_logtype_session,
+				RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_rx = rte_log_register("pmd.crypto.virtio.rx");
+	if (virtio_crypto_logtype_rx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_rx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_tx = rte_log_register("pmd.crypto.virtio.tx");
+	if (virtio_crypto_logtype_tx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_tx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_driver =
+		rte_log_register("pmd.crypto.virtio.driver");
+	if (virtio_crypto_logtype_driver >= 0)
+		rte_log_set_level(virtio_crypto_logtype_driver, RTE_LOG_NOTICE);
+}
diff --git a/drivers/crypto/virtio/virtio_cryptodev.h b/drivers/crypto/virtio/virtio_cryptodev.h
index 44517b8..392db4a 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.h
+++ b/drivers/crypto/virtio/virtio_cryptodev.h
@@ -5,6 +5,19 @@
 #ifndef _VIRTIO_CRYPTODEV_H_
 #define _VIRTIO_CRYPTODEV_H_
 
+#include <rte_cryptodev.h>
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_CRYPTO_PMD_GUEST_FEATURES (1ULL << VIRTIO_F_VERSION_1)
+
 #define CRYPTODEV_NAME_VIRTIO_PMD crypto_virtio
 
+uint16_t virtio_crypto_pkt_tx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
+uint16_t virtio_crypto_pkt_rx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* _VIRTIO_CRYPTODEV_H_ */
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..43ec1a4
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,460 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * Following macros are derived from linux/pci_regs.h, however,
+ * we can't simply include that header here, as there is no such
+ * file for non-Linux platform.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t aval_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability,
+			 * that's why we also check if msix is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is error mapping with VFIO/UIO.
+ *   if port map error when driver type is KDRV_NONE.
+ *   if whitelisted but driver type is KDRV_UNKNOWN.
+ * Return 1 if kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Try if we can succeed reading virtio pci caps, which exists
+	 * only on modern pci device. If failed, we fallback to legacy
+	 * virtio handling.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..d4cefb2
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure stores
+ * some infos that may vary in the multiple process model locally.
+ * For example, the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other size, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
diff --git a/drivers/crypto/virtio/virtio_rxtx.c b/drivers/crypto/virtio/virtio_rxtx.c
new file mode 100644
index 0000000..51f6e09
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_rxtx.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+#include "virtio_cryptodev.h"
+
+uint16_t
+virtio_crypto_pkt_rx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **rx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_rx = 0;
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_crypto_pkt_tx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **tx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_tx = 0;
+
+	return nb_tx;
+}
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint host
+	 * not to interrupt when it consumes packets
+	 * Note: this is only considered a hint to the host
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..9c905d5
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring*/
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notificaiton is through io port operation
+	 * which is a serialization instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
-- 
1.8.3.1

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v3 12/13] eal: replace rte_panic instances in init sequence
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (4 preceding siblings ...)
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 11/13] eal: replace rte_panic instances in ethdev Arnon Warshavsky
@ 2018-04-13 18:30  2% ` Arnon Warshavsky
  2018-04-16 11:22  0% ` [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Burakov, Anatoly
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Local functions to this file,
changing from void to int are non-abi-breaking.
For handling the single function that cannot
change from void to int due to abi,
where this is the only place it is called in,
I added a state variable that is being checked
right after the call to this function.

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_eal/bsdapp/eal/eal.c           |  87 ++++++++++++++-------
 lib/librte_eal/bsdapp/eal/eal_thread.c    |  65 +++++++++++-----
 lib/librte_eal/common/eal_common_launch.c |  21 ++++++
 lib/librte_eal/common/include/rte_debug.h |  12 +++
 lib/librte_eal/linuxapp/eal/eal.c         | 121 ++++++++++++++++++++----------
 lib/librte_eal/linuxapp/eal/eal_thread.c  |  65 +++++++++++-----
 6 files changed, 272 insertions(+), 99 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 4eafcb5..f6aa3b2 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -150,7 +150,7 @@ enum rte_iova_mode
  * We also don't lock the whole file, so that in future we can use read-locks
  * on other parts, e.g. memzones, to detect if there are running secondary
  * processes. */
-static void
+static int
 rte_eal_config_create(void)
 {
 	void *rte_mem_cfg_addr;
@@ -159,60 +159,79 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	if (mem_cfg_fd < 0){
 		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+					__func__, pathname);
+			return -1;
+		}
 	}
 
 	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
 	if (retval < 0){
 		close(mem_cfg_fd);
-		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
 	if (retval < 0){
 		close(mem_cfg_fd);
-		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
-				"process running?\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'."
+				" Is another primary process running?\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 
 	if (rte_mem_cfg_addr == MAP_FAILED){
-		rte_panic("Cannot mmap memory for rte_config\n");
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
+				__func__);
+		return -1;
 	}
 	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
 	rte_config.mem_config = rte_mem_cfg_addr;
+
+	return 0;
 }
 
 /* attach to an existing shared memory config */
-static void
+static int
 rte_eal_config_attach(void)
 {
 	void *rte_mem_cfg_addr;
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	if (mem_cfg_fd < 0){
 		mem_cfg_fd = open(pathname, O_RDWR);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+					__func__, pathname);
+			return -1;
+		}
 	}
 
 	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 	close(mem_cfg_fd);
-	if (rte_mem_cfg_addr == MAP_FAILED)
-		rte_panic("Cannot mmap memory for rte_config\n");
+	if (rte_mem_cfg_addr == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
+				__func__);
+		return -1;
+	}
 
 	rte_config.mem_config = rte_mem_cfg_addr;
+
+	return 0;
 }
 
 /* Detect if we are a primary or a secondary process */
@@ -236,23 +255,28 @@ enum rte_proc_type_t
 }
 
 /* Sets up rte_config structure with the pointer to shared memory config.*/
-static void
+static int
 rte_config_init(void)
 {
 	rte_config.process_type = internal_config.process_type;
 
 	switch (rte_config.process_type){
 	case RTE_PROC_PRIMARY:
-		rte_eal_config_create();
+		if (rte_eal_config_create())
+			return -1;
 		break;
 	case RTE_PROC_SECONDARY:
-		rte_eal_config_attach();
+		if (rte_eal_config_attach())
+			return -1;
 		rte_eal_mcfg_wait_complete(rte_config.mem_config);
 		break;
 	case RTE_PROC_AUTO:
 	case RTE_PROC_INVALID:
-		rte_panic("Invalid process type\n");
+		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
+				__func__, rte_config.process_type);
+		return -1;
 	}
+	return 0;
 }
 
 /* display usage */
@@ -583,7 +607,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	rte_srand(rte_rdtsc());
 
-	rte_config_init();
+	if (rte_config_init() != 0)
+		return -1;
 
 	if (rte_mp_channel_init() < 0) {
 		rte_eal_init_alert("failed to init mp channel\n");
@@ -630,7 +655,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	eal_check_mem_on_local_socket();
 
-	eal_thread_init_master(rte_config.master_lcore);
+	if (eal_thread_init_master(rte_config.master_lcore) != 0)
+		return -1;
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -644,18 +670,27 @@ static void rte_eal_init_alert(const char *msg)
 		 * create communication pipes between master thread
 		 * and children
 		 */
-		if (pipe(lcore_config[i].pipe_master2slave) < 0)
-			rte_panic("Cannot create pipe\n");
-		if (pipe(lcore_config[i].pipe_slave2master) < 0)
-			rte_panic("Cannot create pipe\n");
+		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
+		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
 
 		lcore_config[i].state = WAIT;
 
 		/* create a thread for each lcore */
 		ret = pthread_create(&lcore_config[i].thread_id, NULL,
 				     eal_thread_loop, NULL);
-		if (ret != 0)
-			rte_panic("Cannot create thread\n");
+		if (ret != 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
+					__func__);
+			return -1;
+		}
 
 		/* Set thread_name for aid in debugging. */
 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
diff --git a/lib/librte_eal/bsdapp/eal/eal_thread.c b/lib/librte_eal/bsdapp/eal/eal_thread.c
index d602daf..5c3947c 100644
--- a/lib/librte_eal/bsdapp/eal/eal_thread.c
+++ b/lib/librte_eal/bsdapp/eal/eal_thread.c
@@ -51,16 +51,22 @@
 	n = 0;
 	while (n == 0 || (n < 0 && errno == EINTR))
 		n = write(m2s, &c, 1);
-	if (n < 0)
-		rte_panic("cannot write on configuration pipe\n");
+	if (n < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	/* wait ack */
 	do {
 		n = read(s2m, &c, 1);
 	} while (n < 0 && errno == EINTR);
 
-	if (n <= 0)
-		rte_panic("cannot read on configuration pipe\n");
+	if (n <= 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	return 0;
 }
@@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		rte_move_to_panic_state();
+	}
+}
+
+/* move to panic state and do not return */
+static __attribute__((noreturn)) void
+defunct_and_remain_in_endless_loop(void)
+{
+	rte_move_to_panic_state();
+	while (1)
+		sleep(1);
 }
 
 /* main loop of threads */
@@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
 		if (thread_id == lcore_config[lcore_id].thread_id)
 			break;
 	}
-	if (lcore_id == RTE_MAX_LCORE)
-		rte_panic("cannot retrieve lcore id\n");
+	if (lcore_id == RTE_MAX_LCORE) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
+				__func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	m2s = lcore_config[lcore_id].pipe_master2slave[0];
 	s2m = lcore_config[lcore_id].pipe_slave2master[1];
@@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
 			n = read(m2s, &c, 1);
 		} while (n < 0 && errno == EINTR);
 
-		if (n <= 0)
-			rte_panic("cannot read on configuration pipe\n");
+		if (n <= 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		lcore_config[lcore_id].state = RUNNING;
 
@@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
 		n = 0;
 		while (n == 0 || (n < 0 && errno == EINTR))
 			n = write(s2m, &c, 1);
-		if (n < 0)
-			rte_panic("cannot write on configuration pipe\n");
-
-		if (lcore_config[lcore_id].f == NULL)
-			rte_panic("NULL function pointer\n");
+		if (n < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
+
+		if (lcore_config[lcore_id].f == NULL) {
+			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		/* call the function and store the return value */
 		fct_arg = lcore_config[lcore_id].arg;
diff --git a/lib/librte_eal/common/eal_common_launch.c b/lib/librte_eal/common/eal_common_launch.c
index fe0ba3f..6f8bd46 100644
--- a/lib/librte_eal/common/eal_common_launch.c
+++ b/lib/librte_eal/common/eal_common_launch.c
@@ -14,6 +14,7 @@
 #include <rte_pause.h>
 #include <rte_per_lcore.h>
 #include <rte_lcore.h>
+#include <rte_debug.h>
 
 /*
  * Wait until a lcore finished its job.
@@ -88,3 +89,23 @@ enum rte_lcore_state_t
 		rte_eal_wait_lcore(lcore_id);
 	}
 }
+
+/* panic state */
+static int _panic_state;
+
+/**
+ * Check if the system is in panic state
+ * @return int
+ */
+int rte_get_panic_state(void)
+{
+	return _panic_state;
+}
+
+/**
+ * Move the system to be in panic state
+ */
+void rte_move_to_panic_state(void)
+{
+	_panic_state = 1;
+}
diff --git a/lib/librte_eal/common/include/rte_debug.h b/lib/librte_eal/common/include/rte_debug.h
index 272df49..b421d33 100644
--- a/lib/librte_eal/common/include/rte_debug.h
+++ b/lib/librte_eal/common/include/rte_debug.h
@@ -79,4 +79,16 @@ void __rte_panic(const char *funcname , const char *format, ...)
 }
 #endif
 
+/**
+ * Check if the system is in panic state
+ * @return int
+ */
+int rte_get_panic_state(void);
+
+/**
+ * Move the system to be in panic state
+ */
+void rte_move_to_panic_state(void);
+
+
 #endif /* _RTE_DEBUG_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 2ecd07b..b7b950a 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -160,7 +160,7 @@ enum rte_iova_mode
  * We also don't lock the whole file, so that in future we can use read-locks
  * on other parts, e.g. memzones, to detect if there are running secondary
  * processes. */
-static void
+static int
 rte_eal_config_create(void)
 {
 	void *rte_mem_cfg_addr;
@@ -169,7 +169,7 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	/* map the config before hugepage address so that we don't waste a page */
 	if (internal_config.base_virtaddr != 0)
@@ -179,30 +179,39 @@ enum rte_iova_mode
 	else
 		rte_mem_cfg_addr = NULL;
 
-	if (mem_cfg_fd < 0){
+	if (mem_cfg_fd < 0) {
 		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for "
+					"rte_mem_config\n", __func__, pathname);
+			return -1;
+		}
 	}
 
 	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
-	if (retval < 0){
+	if (retval < 0) {
 		close(mem_cfg_fd);
-		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
-	if (retval < 0){
+	if (retval < 0) {
 		close(mem_cfg_fd);
-		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
-				"process running?\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'."
+				" Is another primary process running?\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 
-	if (rte_mem_cfg_addr == MAP_FAILED){
-		rte_panic("Cannot mmap memory for rte_config\n");
+	if (rte_mem_cfg_addr == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+				"rte_config\n", __func__);
+		return -1;
 	}
 	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
 	rte_config.mem_config = rte_mem_cfg_addr;
@@ -211,10 +220,11 @@ enum rte_iova_mode
 	 * processes could later map the config into this exact location */
 	rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
 
+	return 0;
 }
 
 /* attach to an existing shared memory config */
-static void
+static int
 rte_eal_config_attach(void)
 {
 	struct rte_mem_config *mem_config;
@@ -222,33 +232,41 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
-	if (mem_cfg_fd < 0){
+	if (mem_cfg_fd < 0) {
 		mem_cfg_fd = open(pathname, O_RDWR);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+						__func__, pathname);
+			return -1;
+		}
 	}
 
 	/* map it as read-only first */
 	mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
 			PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
-	if (mem_config == MAP_FAILED)
-		rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
-			  errno, strerror(errno));
+	if (mem_config == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+				"rte_config! error %i (%s)\n",
+				__func__, errno, strerror(errno));
+		return -1;
+	}
 
 	rte_config.mem_config = mem_config;
+
+	return 0;
 }
 
 /* reattach the shared config at exact memory location primary process has it */
-static void
+static int
 rte_eal_config_reattach(void)
 {
 	struct rte_mem_config *mem_config;
 	void *rte_mem_cfg_addr;
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	/* save the address primary process has mapped shared config to */
 	rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
@@ -263,16 +281,21 @@ enum rte_iova_mode
 	if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
 		if (mem_config != MAP_FAILED)
 			/* errno is stale, don't use */
-			rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]"
-				  " - please use '--base-virtaddr' option\n",
-				  rte_mem_cfg_addr, mem_config);
+			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+					"rte_config at [%p], got [%p] - please use "
+					"'--base-virtaddr' option\n",
+					__func__, rte_mem_cfg_addr, mem_config);
 		else
-			rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
-				  errno, strerror(errno));
+			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+					"rte_config! error %i (%s)\n",
+					__func__, errno, strerror(errno));
+		return -1;
 	}
 	close(mem_cfg_fd);
 
 	rte_config.mem_config = mem_config;
+
+	return 0;
 }
 
 /* Detect if we are a primary or a secondary process */
@@ -296,24 +319,31 @@ enum rte_proc_type_t
 }
 
 /* Sets up rte_config structure with the pointer to shared memory config.*/
-static void
+static int
 rte_config_init(void)
 {
 	rte_config.process_type = internal_config.process_type;
 
 	switch (rte_config.process_type){
 	case RTE_PROC_PRIMARY:
-		rte_eal_config_create();
+		if (rte_eal_config_create() != 0)
+			return -1;
 		break;
 	case RTE_PROC_SECONDARY:
-		rte_eal_config_attach();
+		if (rte_eal_config_attach() != 0)
+			return -1;
 		rte_eal_mcfg_wait_complete(rte_config.mem_config);
-		rte_eal_config_reattach();
+		if (rte_eal_config_reattach() != 0)
+			return -1;
 		break;
 	case RTE_PROC_AUTO:
 	case RTE_PROC_INVALID:
-		rte_panic("Invalid process type\n");
+		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
+				__func__, rte_config.process_type);
+		return -1;
 	}
+
+	return 0;
 }
 
 /* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
@@ -827,7 +857,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	rte_srand(rte_rdtsc());
 
-	rte_config_init();
+	if (rte_config_init() != 0)
+		return -1;
 
 	if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
 		rte_eal_init_alert("Cannot init logging.");
@@ -890,6 +921,9 @@ static void rte_eal_init_alert(const char *msg)
 
 	eal_thread_init_master(rte_config.master_lcore);
 
+	if (rte_get_panic_state())
+		return -1;
+
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
 	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
@@ -907,18 +941,27 @@ static void rte_eal_init_alert(const char *msg)
 		 * create communication pipes between master thread
 		 * and children
 		 */
-		if (pipe(lcore_config[i].pipe_master2slave) < 0)
-			rte_panic("Cannot create pipe\n");
-		if (pipe(lcore_config[i].pipe_slave2master) < 0)
-			rte_panic("Cannot create pipe\n");
+		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
+		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
 
 		lcore_config[i].state = WAIT;
 
 		/* create a thread for each lcore */
 		ret = pthread_create(&lcore_config[i].thread_id, NULL,
 				     eal_thread_loop, NULL);
-		if (ret != 0)
-			rte_panic("Cannot create thread\n");
+		if (ret != 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
+					__func__);
+			return -1;
+		}
 
 		/* Set thread_name for aid in debugging. */
 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
index 08e150b..3afcee5 100644
--- a/lib/librte_eal/linuxapp/eal/eal_thread.c
+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c
@@ -51,16 +51,22 @@
 	n = 0;
 	while (n == 0 || (n < 0 && errno == EINTR))
 		n = write(m2s, &c, 1);
-	if (n < 0)
-		rte_panic("cannot write on configuration pipe\n");
+	if (n < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	/* wait ack */
 	do {
 		n = read(s2m, &c, 1);
 	} while (n < 0 && errno == EINTR);
 
-	if (n <= 0)
-		rte_panic("cannot read on configuration pipe\n");
+	if (n <= 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	return 0;
 }
@@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		rte_move_to_panic_state();
+	}
+}
+
+/* move to panic state and do not return */
+static __attribute__((noreturn)) void
+defunct_and_remain_in_endless_loop(void)
+{
+	rte_move_to_panic_state();
+	while (1)
+		sleep(1);
 }
 
 /* main loop of threads */
@@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
 		if (thread_id == lcore_config[lcore_id].thread_id)
 			break;
 	}
-	if (lcore_id == RTE_MAX_LCORE)
-		rte_panic("cannot retrieve lcore id\n");
+	if (lcore_id == RTE_MAX_LCORE) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
+				__func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	m2s = lcore_config[lcore_id].pipe_master2slave[0];
 	s2m = lcore_config[lcore_id].pipe_slave2master[1];
@@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
 			n = read(m2s, &c, 1);
 		} while (n < 0 && errno == EINTR);
 
-		if (n <= 0)
-			rte_panic("cannot read on configuration pipe\n");
+		if (n <= 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		lcore_config[lcore_id].state = RUNNING;
 
@@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
 		n = 0;
 		while (n == 0 || (n < 0 && errno == EINTR))
 			n = write(s2m, &c, 1);
-		if (n < 0)
-			rte_panic("cannot write on configuration pipe\n");
-
-		if (lcore_config[lcore_id].f == NULL)
-			rte_panic("NULL function pointer\n");
+		if (n < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
+
+		if (lcore_config[lcore_id].f == NULL) {
+			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		/* call the function and store the return value */
 		fct_arg = lcore_config[lcore_id].arg;
-- 
1.8.3.1

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 11/13] eal: replace rte_panic instances in ethdev
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (3 preceding siblings ...)
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 06/13] kni: replace rte_panic instances in kni Arnon Warshavsky
@ 2018-04-13 18:30  3% ` Arnon Warshavsky
  2018-04-13 18:30  2% ` [dpdk-dev] [PATCH v3 12/13] eal: replace rte_panic instances in init sequence Arnon Warshavsky
  2018-04-16 11:22  0% ` [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Burakov, Anatoly
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Local function to this file,
changing from void to int is non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_ether/rte_ethdev.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e..57e1e6b 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -194,7 +194,7 @@ enum {
 	return port_id;
 }
 
-static void
+static int
 rte_eth_dev_shared_data_prepare(void)
 {
 	const unsigned flags = 0;
@@ -210,8 +210,12 @@ enum {
 					rte_socket_id(), flags);
 		} else
 			mz = rte_memzone_lookup(MZ_RTE_ETH_DEV_DATA);
-		if (mz == NULL)
-			rte_panic("Cannot allocate ethdev shared data\n");
+		if (mz == NULL) {
+			rte_spinlock_unlock(&rte_eth_shared_data_lock);
+			RTE_LOG(CRIT, EAL, "%s(): Cannot allocate ethdev shared data\n",
+					__func__);
+			return -1;
+		}
 
 		rte_eth_dev_shared_data = mz->addr;
 		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
@@ -224,6 +228,8 @@ enum {
 	}
 
 	rte_spinlock_unlock(&rte_eth_shared_data_lock);
+
+	return 0;
 }
 
 struct rte_eth_dev *
@@ -274,7 +280,8 @@ struct rte_eth_dev *
 	uint16_t port_id;
 	struct rte_eth_dev *eth_dev = NULL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return NULL;
 
 	/* Synchronize port creation between primary and secondary threads. */
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
@@ -317,7 +324,8 @@ struct rte_eth_dev *
 	uint16_t i;
 	struct rte_eth_dev *eth_dev = NULL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return NULL;
 
 	/* Synchronize port attachment to primary port creation and release. */
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
@@ -345,7 +353,8 @@ struct rte_eth_dev *
 	if (eth_dev == NULL)
 		return -EINVAL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -399,7 +408,8 @@ struct rte_eth_dev *
 int __rte_experimental
 rte_eth_dev_owner_new(uint64_t *owner_id)
 {
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -450,7 +460,8 @@ struct rte_eth_dev *
 {
 	int ret;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -467,7 +478,8 @@ struct rte_eth_dev *
 			{.id = RTE_ETH_DEV_NO_OWNER, .name = ""};
 	int ret;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -482,7 +494,8 @@ struct rte_eth_dev *
 {
 	uint16_t port_id;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -502,7 +515,8 @@ struct rte_eth_dev *
 {
 	int ret = 0;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 06/13] kni: replace rte_panic instances in kni
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (2 preceding siblings ...)
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 04/13] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
@ 2018-04-13 18:30  3% ` Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 11/13] eal: replace rte_panic instances in ethdev Arnon Warshavsky
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and retrun value.
Local function to this file,
changing from void to int is non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_kni/rte_kni.c      | 18 ++++++++++++------
 lib/librte_kni/rte_kni_fifo.h | 11 ++++++++---
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 2867411..54050c8 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -353,37 +353,43 @@ struct rte_kni *
 	/* TX RING */
 	mz = slot->m_tx_q;
 	ctx->tx_q = mz->addr;
-	kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.tx_phys = mz->phys_addr;
 
 	/* RX RING */
 	mz = slot->m_rx_q;
 	ctx->rx_q = mz->addr;
-	kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.rx_phys = mz->phys_addr;
 
 	/* ALLOC RING */
 	mz = slot->m_alloc_q;
 	ctx->alloc_q = mz->addr;
-	kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.alloc_phys = mz->phys_addr;
 
 	/* FREE RING */
 	mz = slot->m_free_q;
 	ctx->free_q = mz->addr;
-	kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.free_phys = mz->phys_addr;
 
 	/* Request RING */
 	mz = slot->m_req_q;
 	ctx->req_q = mz->addr;
-	kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.req_phys = mz->phys_addr;
 
 	/* Response RING */
 	mz = slot->m_resp_q;
 	ctx->resp_q = mz->addr;
-	kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.resp_phys = mz->phys_addr;
 
 	/* Req/Resp sync mem area */
diff --git a/lib/librte_kni/rte_kni_fifo.h b/lib/librte_kni/rte_kni_fifo.h
index ac26a8c..5052015 100644
--- a/lib/librte_kni/rte_kni_fifo.h
+++ b/lib/librte_kni/rte_kni_fifo.h
@@ -7,17 +7,22 @@
 /**
  * Initializes the kni fifo structure
  */
-static void
+static int
 kni_fifo_init(struct rte_kni_fifo *fifo, unsigned size)
 {
 	/* Ensure size is power of 2 */
-	if (size & (size - 1))
-		rte_panic("KNI fifo size must be power of 2\n");
+	if (size & (size - 1)) {
+		RTE_LOG(CRIT, EAL, "%s(): KNI fifo size must be power of 2\n",
+				__func__);
+		return -1;
+	}
 
 	fifo->write = 0;
 	fifo->read = 0;
 	fifo->len = size;
 	fifo->elem_size = sizeof(void *);
+
+	return 0;
 }
 
 /**
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 04/13] ixgbe: replace rte_panic instances in ixgbe driver
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 03/13] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
@ 2018-04-13 18:30  3% ` Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 06/13] kni: replace rte_panic instances in kni Arnon Warshavsky
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and retrun value.
Local function to this file,
changing from void to int is non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/ixgbe/ixgbe_ethdev.c |  3 ++-
 drivers/net/ixgbe/ixgbe_ethdev.h |  2 +-
 drivers/net/ixgbe/ixgbe_pf.c     | 13 +++++++++----
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 4df5c75..96188dc 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -1224,7 +1224,8 @@ struct rte_ixgbe_xstats_name_off {
 	memset(hwstrip, 0, sizeof(*hwstrip));
 
 	/* initialize PF if max_vfs not zero */
-	ixgbe_pf_host_init(eth_dev);
+	if (ixgbe_pf_host_init(eth_dev) != 0)
+		return -1;
 
 	ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT);
 	/* let hardware know driver is loaded */
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index c56d652..82d7fd2 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -663,7 +663,7 @@ int ixgbe_fdir_filter_program(struct rte_eth_dev *dev,
 
 void ixgbe_vlan_hw_strip_disable_all(struct rte_eth_dev *dev);
 
-void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
+int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
 
 void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev);
 
diff --git a/drivers/net/ixgbe/ixgbe_pf.c b/drivers/net/ixgbe/ixgbe_pf.c
index ea99737..5c25de0 100644
--- a/drivers/net/ixgbe/ixgbe_pf.c
+++ b/drivers/net/ixgbe/ixgbe_pf.c
@@ -66,7 +66,7 @@ int ixgbe_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
 	return 0;
 }
 
-void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
+int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 {
 	struct ixgbe_vf_info **vfinfo =
 		IXGBE_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
@@ -84,11 +84,14 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
 	vf_num = dev_num_vf(eth_dev);
 	if (vf_num == 0)
-		return;
+		return 0;
 
 	*vfinfo = rte_zmalloc("vf_info", sizeof(struct ixgbe_vf_info) * vf_num, 0);
-	if (*vfinfo == NULL)
-		rte_panic("Cannot allocate memory for private VF data\n");
+	if (*vfinfo == NULL) {
+		RTE_LOG(ERR, PMD, "%s() Cannot allocate memory for private VF data\n",
+				__func__);
+		return -1;
+	}
 
 	memset(mirror_info, 0, sizeof(struct ixgbe_mirror_info));
 	memset(uta_info, 0, sizeof(struct ixgbe_uta_info));
@@ -116,6 +119,8 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 
 	/* set mb interrupt mask */
 	ixgbe_mb_intr_setup(eth_dev);
+
+	return 0;
 }
 
 void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev)
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 03/13] e1000: replace rte_panic instances in e1000 driver
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
@ 2018-04-13 18:30  3% ` Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 04/13] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and retrun value.
Local function to this file,
changing from void to int is non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/e1000/e1000_ethdev.h |  2 +-
 drivers/net/e1000/igb_ethdev.c   |  3 ++-
 drivers/net/e1000/igb_pf.c       | 15 +++++++++------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 23b089c..a66ff42 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -405,7 +405,7 @@ int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
 /*
  * misc function prototypes
  */
-void igb_pf_host_init(struct rte_eth_dev *eth_dev);
+int igb_pf_host_init(struct rte_eth_dev *eth_dev);
 
 void igb_pf_mbx_process(struct rte_eth_dev *eth_dev);
 
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index d7eef9a..994bb5a 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -833,7 +833,8 @@ static int igb_flex_filter_uninit(struct rte_eth_dev *eth_dev)
 	}
 
 	/* initialize PF if max_vfs not zero */
-	igb_pf_host_init(eth_dev);
+	if (igb_pf_host_init(eth_dev) != 0)
+		goto err_late;
 
 	ctrl_ext = E1000_READ_REG(hw, E1000_CTRL_EXT);
 	/* Set PF Reset Done bit so PF/VF Mail Ops can work */
diff --git a/drivers/net/e1000/igb_pf.c b/drivers/net/e1000/igb_pf.c
index b9f2e53..dfa63c9 100644
--- a/drivers/net/e1000/igb_pf.c
+++ b/drivers/net/e1000/igb_pf.c
@@ -63,7 +63,7 @@ int igb_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
 	return 0;
 }
 
-void igb_pf_host_init(struct rte_eth_dev *eth_dev)
+int igb_pf_host_init(struct rte_eth_dev *eth_dev)
 {
 	struct e1000_vf_info **vfinfo =
 		E1000_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
@@ -74,7 +74,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 
 	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
 	if (0 == (vf_num = dev_num_vf(eth_dev)))
-		return;
+		return 0;
 
 	if (hw->mac.type == e1000_i350)
 		nb_queue = 1;
@@ -82,11 +82,14 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 		/* per datasheet, it should be 2, but 1 seems correct */
 		nb_queue = 1;
 	else
-		return;
+		return 0;
 
 	*vfinfo = rte_zmalloc("vf_info", sizeof(struct e1000_vf_info) * vf_num, 0);
-	if (*vfinfo == NULL)
-		rte_panic("Cannot allocate memory for private VF data\n");
+	if (*vfinfo == NULL) {
+		RTE_LOG(CRIT, PMD, "%s(): Cannot allocate memory for private "
+				"VF data\n", __func__);
+		return -1;
+	}
 
 	RTE_ETH_DEV_SRIOV(eth_dev).active = ETH_8_POOLS;
 	RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool = nb_queue;
@@ -98,7 +101,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 	/* set mb interrupt mask */
 	igb_mb_intr_setup(eth_dev);
 
-	return;
+	return 0;
 }
 
 void igb_pf_host_uninit(struct rte_eth_dev *dev)
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 02/13] bond: replace rte_panic instances in bonding driver
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
@ 2018-04-13 18:30  3% ` Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 03/13] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

replace panic calls with log and retrun value.
Local functions to this file,
changing from void to int are non-abi-breaking

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/bonding/rte_eth_bond_8023ad.c         | 30 +++++++++++++++--------
 drivers/net/bonding/rte_eth_bond_8023ad_private.h |  2 +-
 drivers/net/bonding/rte_eth_bond_api.c            | 20 ++++++++++-----
 drivers/net/bonding/rte_eth_bond_pmd.c            | 10 +++++---
 drivers/net/bonding/rte_eth_bond_private.h        |  2 +-
 5 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c b/drivers/net/bonding/rte_eth_bond_8023ad.c
index c452318..310118c 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad.c
+++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
@@ -893,7 +893,7 @@
 			bond_mode_8023ad_periodic_cb, arg);
 }
 
-void
+int
 bond_mode_8023ad_activate_slave(struct rte_eth_dev *bond_dev,
 				uint16_t slave_id)
 {
@@ -939,7 +939,7 @@
 	timer_cancel(&port->warning_timer);
 
 	if (port->mbuf_pool != NULL)
-		return;
+		return 0;
 
 	RTE_ASSERT(port->rx_ring == NULL);
 	RTE_ASSERT(port->tx_ring == NULL);
@@ -968,8 +968,10 @@
 	/* Any memory allocation failure in initialization is critical because
 	 * resources can't be free, so reinitialization is impossible. */
 	if (port->mbuf_pool == NULL) {
-		rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
-			slave_id, mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create memory"
+				" pool '%s': %s\n", __func__,
+				slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
 
 	snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_rx", slave_id);
@@ -977,8 +979,9 @@
 			rte_align32pow2(BOND_MODE_8023AX_SLAVE_RX_PKTS), socket_id, 0);
 
 	if (port->rx_ring == NULL) {
-		rte_panic("Slave %u: Failed to create rx ring '%s': %s\n", slave_id,
-			mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create rx ring '%s': %s\n",
+			__func__, slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
 
 	/* TX ring is at least one pkt longer to make room for marker packet. */
@@ -987,9 +990,13 @@
 			rte_align32pow2(BOND_MODE_8023AX_SLAVE_TX_PKTS + 1), socket_id, 0);
 
 	if (port->tx_ring == NULL) {
-		rte_panic("Slave %u: Failed to create tx ring '%s': %s\n", slave_id,
-			mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Fail to create tx ring "
+				"'%s': %s\n", __func__,
+				slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
+
+	return 0;
 }
 
 int
@@ -1143,9 +1150,12 @@
 	struct bond_dev_private *internals = bond_dev->data->dev_private;
 	uint8_t i;
 
-	for (i = 0; i < internals->active_slave_count; i++)
-		bond_mode_8023ad_activate_slave(bond_dev,
+	for (i = 0; i < internals->active_slave_count; i++) {
+		int rc = bond_mode_8023ad_activate_slave(bond_dev,
 				internals->active_slaves[i]);
+		if (rc != 0)
+			return rc;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/bonding/rte_eth_bond_8023ad_private.h b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
index 0f490a5..96a42f2 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad_private.h
+++ b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
@@ -263,7 +263,7 @@ struct mode8023ad_private {
  * @return
  *  0 on success, negative value otherwise.
  */
-void
+int
 bond_mode_8023ad_activate_slave(struct rte_eth_dev *dev, uint16_t port_id);
 
 /**
diff --git a/drivers/net/bonding/rte_eth_bond_api.c b/drivers/net/bonding/rte_eth_bond_api.c
index f854b73..6bc5887 100644
--- a/drivers/net/bonding/rte_eth_bond_api.c
+++ b/drivers/net/bonding/rte_eth_bond_api.c
@@ -69,14 +69,15 @@
 	return 0;
 }
 
-void
+int
 activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id)
 {
 	struct bond_dev_private *internals = eth_dev->data->dev_private;
 	uint8_t active_count = internals->active_slave_count;
 
 	if (internals->mode == BONDING_MODE_8023AD)
-		bond_mode_8023ad_activate_slave(eth_dev, port_id);
+		if (bond_mode_8023ad_activate_slave(eth_dev, port_id) != 0)
+			return -1;
 
 	if (internals->mode == BONDING_MODE_TLB
 			|| internals->mode == BONDING_MODE_ALB) {
@@ -349,10 +350,17 @@
 				bond_ethdev_primary_set(internals,
 							slave_port_id);
 
-			if (find_slave_by_id(internals->active_slaves,
-					     internals->active_slave_count,
-					     slave_port_id) == internals->active_slave_count)
-				activate_slave(bonded_eth_dev, slave_port_id);
+			int rc =
+				find_slave_by_id(internals->active_slaves,
+					internals->active_slave_count,
+					slave_port_id);
+
+			if (rc == internals->active_slave_count) {
+				int rc = activate_slave(bonded_eth_dev,
+							slave_port_id);
+				if (rc != 0)
+					return -1;
+			}
 		}
 	}
 
diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c b/drivers/net/bonding/rte_eth_bond_pmd.c
index b59ba9f..96f8b1a 100644
--- a/drivers/net/bonding/rte_eth_bond_pmd.c
+++ b/drivers/net/bonding/rte_eth_bond_pmd.c
@@ -1740,8 +1740,11 @@ struct bwg_slave {
 		/* Any memory allocation failure in initialization is critical because
 		 * resources can't be free, so reinitialization is impossible. */
 		if (port->slow_pool == NULL) {
-			rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
-				slave_id, mem_name, rte_strerror(rte_errno));
+			RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create"
+					" memory pool '%s': %s\n",
+					__func__, slave_id, mem_name,
+					rte_strerror(rte_errno));
+			return -1;
 		}
 	}
 
@@ -2660,7 +2663,8 @@ struct bwg_slave {
 			mac_address_slaves_update(bonded_eth_dev);
 		}
 
-		activate_slave(bonded_eth_dev, port_id);
+		if (activate_slave(bonded_eth_dev, port_id) != 0)
+			return -1;
 
 		/* If user has defined the primary port then default to using it */
 		if (internals->user_defined_primary_port &&
diff --git a/drivers/net/bonding/rte_eth_bond_private.h b/drivers/net/bonding/rte_eth_bond_private.h
index 92e15f8..65453aa 100644
--- a/drivers/net/bonding/rte_eth_bond_private.h
+++ b/drivers/net/bonding/rte_eth_bond_private.h
@@ -185,7 +185,7 @@ struct bond_dev_private {
 void
 deactivate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
 
-void
+int
 activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
 
 void
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances
@ 2018-04-13 18:30  3% Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
                   ` (6 more replies)
  0 siblings, 7 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

The purpose of this patch series is to cleanup the library code
from paths that end up aborting the process,
and move to checking error values, in order to allow the running process
perform an orderly teardown or other mitigation of the event.

This patch modifies the majority of rte_panic calls
under lib and drivers, and replaces them with a log message
and an error return code according to context,
that can be propagated up the call stack.

- Focus was given to the dpdk initialization path
- Some of the panic calls within drivers were left in place where
  the call is from within an interrupt or calls that are
  on the data path,where there is no simple applicative
  route to propagate the error to temination.
  These should be handled by the driver maintainers.
- In order to avoid breaking ABI where panic was called from public
  void functions, a panic state variable was introduced so that
  it can be queried after calling these void functions.
  This tool place for a single function call.
- local void functions with no api were changed to retrun a value
  where needed
- No change took place in example and test files
- No change took place for debug assertions calling panic
- A new function was added to devtools/checkpatches.sh
  in order to prevent new additions of calls to rte_panic
  under lib and drivers.

Keep calm and don't panic

---

v2:
- reformat error messages so that literal string are in the same line
- fix typo in commit message
- add new return code to doxigen of rte_memzone_free()

v3:
- submit  all 13 patches changed and unchanged in the same patchset

Arnon Warshavsky (13):
  crypto: replace rte_panic instances in crypto driver
  bond: replace rte_panic instances in bonding driver
  e1000: replace rte_panic instances in e1000 driver
  ixgbe: replace rte_panic instances in ixgbe driver
  eal: replace rte_panic instances in eventdev
  kni: replace rte_panic instances in kni
  e1000: replace rte_panic instances in e1000 driver
  eal: replace rte_panic instances in hugepage_info
  eal: replace rte_panic instances in common_memzone
  eal: replace rte_panic instances in interrupts thread
  eal: replace rte_panic instances in ethdev
  eal: replace rte_panic instances in init sequence
  devtools: prevent new instances of rte_panic and rte_exit

 devtools/checkpatches.sh                          |  94 ++++++++++++++++-
 drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c       |   8 +-
 drivers/crypto/dpaa_sec/dpaa_sec.c                |   8 +-
 drivers/net/bonding/rte_eth_bond_8023ad.c         |  30 ++++--
 drivers/net/bonding/rte_eth_bond_8023ad_private.h |   2 +-
 drivers/net/bonding/rte_eth_bond_api.c            |  20 ++--
 drivers/net/bonding/rte_eth_bond_pmd.c            |  10 +-
 drivers/net/bonding/rte_eth_bond_private.h        |   2 +-
 drivers/net/e1000/e1000_ethdev.h                  |   2 +-
 drivers/net/e1000/igb_ethdev.c                    |   3 +-
 drivers/net/e1000/igb_pf.c                        |  15 +--
 drivers/net/ixgbe/ixgbe_ethdev.c                  |   3 +-
 drivers/net/ixgbe/ixgbe_ethdev.h                  |   2 +-
 drivers/net/ixgbe/ixgbe_pf.c                      |  13 ++-
 lib/librte_eal/bsdapp/eal/eal.c                   |  87 +++++++++++-----
 lib/librte_eal/bsdapp/eal/eal_thread.c            |  65 +++++++++---
 lib/librte_eal/common/eal_common_launch.c         |  21 ++++
 lib/librte_eal/common/eal_common_memzone.c        |   3 +-
 lib/librte_eal/common/include/rte_debug.h         |  12 +++
 lib/librte_eal/common/include/rte_memzone.h       |   1 +
 lib/librte_eal/common/rte_malloc.c                |   7 +-
 lib/librte_eal/linuxapp/eal/eal.c                 | 121 +++++++++++++++-------
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c   |  21 ++--
 lib/librte_eal/linuxapp/eal/eal_interrupts.c      |  27 +++--
 lib/librte_eal/linuxapp/eal/eal_thread.c          |  65 +++++++++---
 lib/librte_ether/rte_ethdev.c                     |  36 +++++--
 lib/librte_eventdev/rte_eventdev_pmd_pci.h        |   8 +-
 lib/librte_eventdev/rte_eventdev_pmd_vdev.h       |   8 +-
 lib/librte_kni/rte_kni.c                          |  18 ++--
 lib/librte_kni/rte_kni_fifo.h                     |  11 +-
 30 files changed, 540 insertions(+), 183 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 02/10] bpf: add BPF loading and execution framework
  @ 2018-04-13 14:43  2% ` Konstantin Ananyev
  0 siblings, 0 replies; 200+ results
From: Konstantin Ananyev @ 2018-04-13 14:43 UTC (permalink / raw)
  To: dev; +Cc: Konstantin Ananyev

librte_bpf provides a framework to load and execute eBPF bytecode
inside user-space dpdk based applications.
It supports basic set of features from eBPF spec
(https://www.kernel.org/doc/Documentation/networking/filter.txt).

Not currently supported features:
 - JIT
 - cBPF
 - tail-pointer call
 - eBPF MAP
 - skb
 - function calls for 32-bit apps

It also adds dependency on libelf.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 config/common_base                 |   5 +
 lib/Makefile                       |   2 +
 lib/librte_bpf/Makefile            |  30 +++
 lib/librte_bpf/bpf.c               |  59 +++++
 lib/librte_bpf/bpf_exec.c          | 452 +++++++++++++++++++++++++++++++++++++
 lib/librte_bpf/bpf_impl.h          |  41 ++++
 lib/librte_bpf/bpf_load.c          | 386 +++++++++++++++++++++++++++++++
 lib/librte_bpf/bpf_validate.c      |  55 +++++
 lib/librte_bpf/meson.build         |  18 ++
 lib/librte_bpf/rte_bpf.h           | 170 ++++++++++++++
 lib/librte_bpf/rte_bpf_version.map |  12 +
 lib/meson.build                    |   2 +-
 mk/rte.app.mk                      |   2 +
 13 files changed, 1233 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_bpf/Makefile
 create mode 100644 lib/librte_bpf/bpf.c
 create mode 100644 lib/librte_bpf/bpf_exec.c
 create mode 100644 lib/librte_bpf/bpf_impl.h
 create mode 100644 lib/librte_bpf/bpf_load.c
 create mode 100644 lib/librte_bpf/bpf_validate.c
 create mode 100644 lib/librte_bpf/meson.build
 create mode 100644 lib/librte_bpf/rte_bpf.h
 create mode 100644 lib/librte_bpf/rte_bpf_version.map

diff --git a/config/common_base b/config/common_base
index c09c7cf88..d68c2e211 100644
--- a/config/common_base
+++ b/config/common_base
@@ -821,3 +821,8 @@ CONFIG_RTE_APP_CRYPTO_PERF=y
 # Compile the eventdev application
 #
 CONFIG_RTE_APP_EVENTDEV=y
+
+#
+# Compile librte_bpf
+#
+CONFIG_RTE_LIBRTE_BPF=y
diff --git a/lib/Makefile b/lib/Makefile
index ec965a606..a4a2329f9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -97,6 +97,8 @@ DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ether
 DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
 DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ether librte_net
 DEPDIRS-librte_gso += librte_mempool
+DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf
+DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf librte_ether
 
 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
 DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_bpf/Makefile b/lib/librte_bpf/Makefile
new file mode 100644
index 000000000..e0f434e77
--- /dev/null
+++ b/lib/librte_bpf/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_bpf.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+LDLIBS += -lrte_net -lrte_eal
+LDLIBS += -lrte_mempool -lrte_ring
+LDLIBS += -lrte_mbuf -lrte_ethdev
+LDLIBS += -lelf
+
+EXPORT_MAP := rte_bpf_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_exec.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_load.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_validate.c
+
+# install header files
+SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += rte_bpf.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_bpf/bpf.c b/lib/librte_bpf/bpf.c
new file mode 100644
index 000000000..d7f68c017
--- /dev/null
+++ b/lib/librte_bpf/bpf.c
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#include "bpf_impl.h"
+
+int rte_bpf_logtype;
+
+__rte_experimental void
+rte_bpf_destroy(struct rte_bpf *bpf)
+{
+	if (bpf != NULL) {
+		if (bpf->jit.func != NULL)
+			munmap(bpf->jit.func, bpf->jit.sz);
+		munmap(bpf, bpf->sz);
+	}
+}
+
+__rte_experimental int
+rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit)
+{
+	if (bpf == NULL || jit == NULL)
+		return -EINVAL;
+
+	jit[0] = bpf->jit;
+	return 0;
+}
+
+int
+bpf_jit(struct rte_bpf *bpf)
+{
+	int32_t rc;
+
+	rc = -ENOTSUP;
+	if (rc != 0)
+		RTE_BPF_LOG(WARNING, "%s(%p) failed, error code: %d;\n",
+			__func__, bpf, rc);
+	return rc;
+}
+
+RTE_INIT(rte_bpf_init_log);
+
+static void
+rte_bpf_init_log(void)
+{
+	rte_bpf_logtype = rte_log_register("lib.bpf");
+	if (rte_bpf_logtype >= 0)
+		rte_log_set_level(rte_bpf_logtype, RTE_LOG_INFO);
+}
diff --git a/lib/librte_bpf/bpf_exec.c b/lib/librte_bpf/bpf_exec.c
new file mode 100644
index 000000000..0382ade98
--- /dev/null
+++ b/lib/librte_bpf/bpf_exec.c
@@ -0,0 +1,452 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_byteorder.h>
+
+#include "bpf_impl.h"
+
+#define BPF_JMP_UNC(ins)	((ins) += (ins)->off)
+
+#define BPF_JMP_CND_REG(reg, ins, op, type)	\
+	((ins) += \
+		((type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg]) ? \
+		(ins)->off : 0)
+
+#define BPF_JMP_CND_IMM(reg, ins, op, type)	\
+	((ins) += \
+		((type)(reg)[(ins)->dst_reg] op (type)(ins)->imm) ? \
+		(ins)->off : 0)
+
+#define BPF_NEG_ALU(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(-(reg)[(ins)->dst_reg]))
+
+#define BPF_MOV_ALU_REG(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(reg)[(ins)->src_reg])
+
+#define BPF_OP_ALU_REG(reg, ins, op, type)	\
+	((reg)[(ins)->dst_reg] = \
+		(type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg])
+
+#define BPF_MOV_ALU_IMM(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(ins)->imm)
+
+#define BPF_OP_ALU_IMM(reg, ins, op, type)	\
+	((reg)[(ins)->dst_reg] = \
+		(type)(reg)[(ins)->dst_reg] op (type)(ins)->imm)
+
+#define BPF_DIV_ZERO_CHECK(bpf, reg, ins, type) do { \
+	if ((type)(reg)[(ins)->src_reg] == 0) { \
+		RTE_BPF_LOG(ERR, \
+			"%s(%p): division by 0 at pc: %#zx;\n", \
+			__func__, bpf, \
+			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.ins); \
+		return 0; \
+	} \
+} while (0)
+
+#define BPF_LD_REG(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = \
+		*(type *)(uintptr_t)((reg)[(ins)->src_reg] + (ins)->off))
+
+#define BPF_ST_IMM(reg, ins, type)	\
+	(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \
+		(type)(ins)->imm)
+
+#define BPF_ST_REG(reg, ins, type)	\
+	(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \
+		(type)(reg)[(ins)->src_reg])
+
+#define BPF_ST_XADD_REG(reg, ins, tp)	\
+	(rte_atomic##tp##_add((rte_atomic##tp##_t *) \
+		(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off), \
+		reg[ins->src_reg]))
+
+static inline void
+bpf_alu_be(uint64_t reg[MAX_BPF_REG], const struct bpf_insn *ins)
+{
+	uint64_t *v;
+
+	v = reg + ins->dst_reg;
+	switch (ins->imm) {
+	case 16:
+		*v = rte_cpu_to_be_16(*v);
+		break;
+	case 32:
+		*v = rte_cpu_to_be_32(*v);
+		break;
+	case 64:
+		*v = rte_cpu_to_be_64(*v);
+		break;
+	}
+}
+
+static inline void
+bpf_alu_le(uint64_t reg[MAX_BPF_REG], const struct bpf_insn *ins)
+{
+	uint64_t *v;
+
+	v = reg + ins->dst_reg;
+	switch (ins->imm) {
+	case 16:
+		*v = rte_cpu_to_le_16(*v);
+		break;
+	case 32:
+		*v = rte_cpu_to_le_32(*v);
+		break;
+	case 64:
+		*v = rte_cpu_to_le_64(*v);
+		break;
+	}
+}
+
+static inline uint64_t
+bpf_exec(const struct rte_bpf *bpf, uint64_t reg[MAX_BPF_REG])
+{
+	const struct bpf_insn *ins;
+
+	for (ins = bpf->prm.ins; ; ins++) {
+		switch (ins->code) {
+		/* 32 bit ALU IMM operations */
+		case (BPF_ALU | BPF_ADD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, +, uint32_t);
+			break;
+		case (BPF_ALU | BPF_SUB | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, -, uint32_t);
+			break;
+		case (BPF_ALU | BPF_AND | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, &, uint32_t);
+			break;
+		case (BPF_ALU | BPF_OR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, |, uint32_t);
+			break;
+		case (BPF_ALU | BPF_LSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, <<, uint32_t);
+			break;
+		case (BPF_ALU | BPF_RSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, uint32_t);
+			break;
+		case (BPF_ALU | BPF_XOR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, ^, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MUL | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, *, uint32_t);
+			break;
+		case (BPF_ALU | BPF_DIV | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, /, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, %, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOV | BPF_K):
+			BPF_MOV_ALU_IMM(reg, ins, uint32_t);
+			break;
+		/* 32 bit ALU REG operations */
+		case (BPF_ALU | BPF_ADD | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, +, uint32_t);
+			break;
+		case (BPF_ALU | BPF_SUB | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, -, uint32_t);
+			break;
+		case (BPF_ALU | BPF_AND | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, &, uint32_t);
+			break;
+		case (BPF_ALU | BPF_OR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, |, uint32_t);
+			break;
+		case (BPF_ALU | BPF_LSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, <<, uint32_t);
+			break;
+		case (BPF_ALU | BPF_RSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, uint32_t);
+			break;
+		case (BPF_ALU | BPF_XOR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, ^, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MUL | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, *, uint32_t);
+			break;
+		case (BPF_ALU | BPF_DIV | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t);
+			BPF_OP_ALU_REG(reg, ins, /, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOD | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t);
+			BPF_OP_ALU_REG(reg, ins, %, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOV | BPF_X):
+			BPF_MOV_ALU_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_ALU | BPF_NEG):
+			BPF_NEG_ALU(reg, ins, uint32_t);
+			break;
+		case (BPF_ALU | BPF_END | BPF_TO_BE):
+			bpf_alu_be(reg, ins);
+			break;
+		case (BPF_ALU | BPF_END | BPF_TO_LE):
+			bpf_alu_le(reg, ins);
+			break;
+		/* 64 bit ALU IMM operations */
+		case (BPF_ALU64 | BPF_ADD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, +, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_SUB | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, -, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_AND | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, &, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_OR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, |, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_LSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, <<, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_RSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_ARSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, int64_t);
+			break;
+		case (BPF_ALU64 | BPF_XOR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, ^, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MUL | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, *, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_DIV | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, /, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, %, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOV | BPF_K):
+			BPF_MOV_ALU_IMM(reg, ins, uint64_t);
+			break;
+		/* 64 bit ALU REG operations */
+		case (BPF_ALU64 | BPF_ADD | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, +, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_SUB | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, -, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_AND | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, &, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_OR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, |, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_LSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, <<, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_RSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_ARSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, int64_t);
+			break;
+		case (BPF_ALU64 | BPF_XOR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, ^, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MUL | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, *, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_DIV | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t);
+			BPF_OP_ALU_REG(reg, ins, /, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOD | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t);
+			BPF_OP_ALU_REG(reg, ins, %, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOV | BPF_X):
+			BPF_MOV_ALU_REG(reg, ins, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_NEG):
+			BPF_NEG_ALU(reg, ins, uint64_t);
+			break;
+		/* load instructions */
+		case (BPF_LDX | BPF_MEM | BPF_B):
+			BPF_LD_REG(reg, ins, uint8_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_H):
+			BPF_LD_REG(reg, ins, uint16_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_W):
+			BPF_LD_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_DW):
+			BPF_LD_REG(reg, ins, uint64_t);
+			break;
+		/* load 64 bit immediate value */
+		case (BPF_LD | BPF_IMM | BPF_DW):
+			reg[ins->dst_reg] = (uint32_t)ins[0].imm |
+				(uint64_t)(uint32_t)ins[1].imm << 32;
+			ins++;
+			break;
+		/* store instructions */
+		case (BPF_STX | BPF_MEM | BPF_B):
+			BPF_ST_REG(reg, ins, uint8_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_H):
+			BPF_ST_REG(reg, ins, uint16_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_W):
+			BPF_ST_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_DW):
+			BPF_ST_REG(reg, ins, uint64_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_B):
+			BPF_ST_IMM(reg, ins, uint8_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_H):
+			BPF_ST_IMM(reg, ins, uint16_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_W):
+			BPF_ST_IMM(reg, ins, uint32_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_DW):
+			BPF_ST_IMM(reg, ins, uint64_t);
+			break;
+		/* atomic add instructions */
+		case (BPF_STX | BPF_XADD | BPF_W):
+			BPF_ST_XADD_REG(reg, ins, 32);
+			break;
+		case (BPF_STX | BPF_XADD | BPF_DW):
+			BPF_ST_XADD_REG(reg, ins, 64);
+			break;
+		/* jump instructions */
+		case (BPF_JMP | BPF_JA):
+			BPF_JMP_UNC(ins);
+			break;
+		/* jump IMM instructions */
+		case (BPF_JMP | BPF_JEQ | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, ==, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JNE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, !=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JSGT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSGE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSET | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, &, uint64_t);
+			break;
+		/* jump REG instructions */
+		case (BPF_JMP | BPF_JEQ | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, ==, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JNE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, !=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JSGT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSGE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSET | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, &, uint64_t);
+			break;
+		/* call instructions */
+		case (BPF_JMP | BPF_CALL):
+			reg[BPF_REG_0] = bpf->prm.xsym[ins->imm].func(
+				reg[BPF_REG_1], reg[BPF_REG_2], reg[BPF_REG_3],
+				reg[BPF_REG_4], reg[BPF_REG_5]);
+			break;
+		/* return instruction */
+		case (BPF_JMP | BPF_EXIT):
+			return reg[BPF_REG_0];
+		default:
+			RTE_BPF_LOG(ERR,
+				"%s(%p): invalid opcode %#x at pc: %#zx;\n",
+				__func__, bpf, ins->code,
+				(uintptr_t)ins - (uintptr_t)bpf->prm.ins);
+			return 0;
+		}
+	}
+
+	/* should never be reached */
+	RTE_VERIFY(0);
+	return 0;
+}
+
+__rte_experimental uint32_t
+rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
+	uint32_t num)
+{
+	uint32_t i;
+	uint64_t reg[MAX_BPF_REG];
+	uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)];
+
+	for (i = 0; i != num; i++) {
+
+		reg[BPF_REG_1] = (uintptr_t)ctx[i];
+		reg[BPF_REG_10] = (uintptr_t)(stack + RTE_DIM(stack));
+
+		rc[i] = bpf_exec(bpf, reg);
+	}
+
+	return i;
+}
+
+__rte_experimental uint64_t
+rte_bpf_exec(const struct rte_bpf *bpf, void *ctx)
+{
+	uint64_t rc;
+
+	rte_bpf_exec_burst(bpf, &ctx, &rc, 1);
+	return rc;
+}
diff --git a/lib/librte_bpf/bpf_impl.h b/lib/librte_bpf/bpf_impl.h
new file mode 100644
index 000000000..5d7e65c31
--- /dev/null
+++ b/lib/librte_bpf/bpf_impl.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _BPF_H_
+#define _BPF_H_
+
+#include <rte_bpf.h>
+#include <sys/mman.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_BPF_STACK_SIZE	0x200
+
+struct rte_bpf {
+	struct rte_bpf_prm prm;
+	struct rte_bpf_jit jit;
+	size_t sz;
+	uint32_t stack_sz;
+};
+
+extern int bpf_validate(struct rte_bpf *bpf);
+
+extern int bpf_jit(struct rte_bpf *bpf);
+
+#ifdef RTE_ARCH_X86_64
+extern int bpf_jit_x86(struct rte_bpf *);
+#endif
+
+extern int rte_bpf_logtype;
+
+#define	RTE_BPF_LOG(lvl, fmt, args...) \
+	rte_log(RTE_LOG_## lvl, rte_bpf_logtype, fmt, ##args)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BPF_H_ */
diff --git a/lib/librte_bpf/bpf_load.c b/lib/librte_bpf/bpf_load.c
new file mode 100644
index 000000000..3c7279a6c
--- /dev/null
+++ b/lib/librte_bpf/bpf_load.c
@@ -0,0 +1,386 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <fcntl.h>
+
+#include <libelf.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_byteorder.h>
+#include <rte_errno.h>
+
+#include "bpf_impl.h"
+
+/* To overcome compatibility issue */
+#ifndef EM_BPF
+#define	EM_BPF	247
+#endif
+
+static uint32_t
+bpf_find_xsym(const char *sn, enum rte_bpf_xtype type,
+	const struct rte_bpf_xsym fp[], uint32_t fn)
+{
+	uint32_t i;
+
+	if (sn == NULL || fp == NULL)
+		return UINT32_MAX;
+
+	for (i = 0; i != fn; i++) {
+		if (fp[i].type == type && strcmp(sn, fp[i].name) == 0)
+			break;
+	}
+
+	return (i != fn) ? i : UINT32_MAX;
+}
+
+/*
+ * update BPF code at offset *ofs* with a proper address(index) for external
+ * symbol *sn*
+ */
+static int
+resolve_xsym(const char *sn, size_t ofs, struct bpf_insn *ins, size_t ins_sz,
+	const struct rte_bpf_prm *prm)
+{
+	uint32_t idx, fidx;
+	enum rte_bpf_xtype type;
+
+	if (ofs % sizeof(ins[0]) != 0 || ofs >= ins_sz)
+		return -EINVAL;
+
+	idx = ofs / sizeof(ins[0]);
+	if (ins[idx].code == (BPF_JMP | BPF_CALL))
+		type = RTE_BPF_XTYPE_FUNC;
+	else if (ins[idx].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+			ofs < ins_sz - sizeof(ins[idx]))
+		type = RTE_BPF_XTYPE_VAR;
+	else
+		return -EINVAL;
+
+	fidx = bpf_find_xsym(sn, type, prm->xsym, prm->nb_xsym);
+	if (fidx == UINT32_MAX)
+		return -ENOENT;
+
+	/* for function we just need an index in our xsym table */
+	if (type == RTE_BPF_XTYPE_FUNC)
+		ins[idx].imm = fidx;
+	/* for variable we need to store its absolute address */
+	else {
+		ins[idx].imm = (uintptr_t)prm->xsym[fidx].var;
+		ins[idx + 1].imm =
+			(uint64_t)(uintptr_t)prm->xsym[fidx].var >> 32;
+	}
+
+	return 0;
+}
+
+static int
+check_elf_header(const Elf64_Ehdr * eh)
+{
+	const char *err;
+
+	err = NULL;
+
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	if (eh->e_ident[EI_DATA] != ELFDATA2LSB)
+#else
+	if (eh->e_ident[EI_DATA] != ELFDATA2MSB)
+#endif
+		err = "not native byte order";
+	else if (eh->e_ident[EI_OSABI] != ELFOSABI_NONE)
+		err = "unexpected OS ABI";
+	else if (eh->e_type != ET_REL)
+		err = "unexpected ELF type";
+	else if (eh->e_machine != EM_NONE && eh->e_machine != EM_BPF)
+		err = "unexpected machine type";
+
+	if (err != NULL) {
+		RTE_BPF_LOG(ERR, "%s(): %s\n", __func__, err);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * helper function, find executable section by name.
+ */
+static int
+find_elf_code(Elf *elf, const char *section, Elf_Data **psd, size_t *pidx)
+{
+	Elf_Scn *sc;
+	const Elf64_Ehdr *eh;
+	const Elf64_Shdr *sh;
+	Elf_Data *sd;
+	const char *sn;
+	int32_t rc;
+
+	eh = elf64_getehdr(elf);
+	if (eh == NULL) {
+		rc = elf_errno();
+		RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n",
+			__func__, elf, section, rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	if (check_elf_header(eh) != 0)
+		return -EINVAL;
+
+	/* find given section by name */
+	for (sc = elf_nextscn(elf, NULL); sc != NULL;
+			sc = elf_nextscn(elf, sc)) {
+		sh = elf64_getshdr(sc);
+		sn = elf_strptr(elf, eh->e_shstrndx, sh->sh_name);
+		if (sn != NULL && strcmp(section, sn) == 0 &&
+				sh->sh_type == SHT_PROGBITS &&
+				sh->sh_flags == (SHF_ALLOC | SHF_EXECINSTR))
+			break;
+	}
+
+	sd = elf_getdata(sc, NULL);
+	if (sd == NULL || sd->d_size == 0 ||
+			sd->d_size % sizeof(struct bpf_insn) != 0) {
+		rc = elf_errno();
+		RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n",
+			__func__, elf, section, rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	*psd = sd;
+	*pidx = elf_ndxscn(sc);
+	return 0;
+}
+
+/*
+ * helper function to process data from relocation table.
+ */
+static int
+process_reloc(Elf *elf, size_t sym_idx, Elf64_Rel *re, size_t re_sz,
+	struct bpf_insn *ins, size_t ins_sz, const struct rte_bpf_prm *prm)
+{
+	int32_t rc;
+	uint32_t i, n;
+	size_t ofs, sym;
+	const char *sn;
+	const Elf64_Ehdr *eh;
+	Elf_Scn *sc;
+	const Elf_Data *sd;
+	Elf64_Sym *sm;
+
+	eh = elf64_getehdr(elf);
+
+	/* get symtable by section index */
+	sc = elf_getscn(elf, sym_idx);
+	sd = elf_getdata(sc, NULL);
+	if (sd == NULL)
+		return -EINVAL;
+	sm = sd->d_buf;
+
+	n = re_sz / sizeof(re[0]);
+	for (i = 0; i != n; i++) {
+
+		ofs = re[i].r_offset;
+
+		/* retrieve index in the symtable */
+		sym = ELF64_R_SYM(re[i].r_info);
+		if (sym * sizeof(sm[0]) >= sd->d_size)
+			return -EINVAL;
+
+		sn = elf_strptr(elf, eh->e_shstrndx, sm[sym].st_name);
+
+		rc = resolve_xsym(sn, ofs, ins, ins_sz, prm);
+		if (rc != 0) {
+			RTE_BPF_LOG(ERR,
+				"resolve_xsym(%s, %zu) error code: %d\n",
+				sn, ofs, rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * helper function, find relocation information (if any)
+ * and update bpf code.
+ */
+static int
+elf_reloc_code(Elf *elf, Elf_Data *ed, size_t sidx,
+	const struct rte_bpf_prm *prm)
+{
+	Elf64_Rel *re;
+	Elf_Scn *sc;
+	const Elf64_Shdr *sh;
+	const Elf_Data *sd;
+	int32_t rc;
+
+	rc = 0;
+
+	/* walk through all sections */
+	for (sc = elf_nextscn(elf, NULL); sc != NULL && rc == 0;
+			sc = elf_nextscn(elf, sc)) {
+
+		sh = elf64_getshdr(sc);
+
+		/* relocation data for our code section */
+		if (sh->sh_type == SHT_REL && sh->sh_info == sidx) {
+			sd = elf_getdata(sc, NULL);
+			if (sd == NULL || sd->d_size == 0 ||
+					sd->d_size % sizeof(re[0]) != 0)
+				return -EINVAL;
+			rc = process_reloc(elf, sh->sh_link,
+				sd->d_buf, sd->d_size, ed->d_buf, ed->d_size,
+				prm);
+		}
+	}
+
+	return rc;
+}
+
+static struct rte_bpf *
+bpf_load(const struct rte_bpf_prm *prm)
+{
+	uint8_t *buf;
+	struct rte_bpf *bpf;
+	size_t sz, bsz, insz, xsz;
+
+	xsz =  prm->nb_xsym * sizeof(prm->xsym[0]);
+	insz = prm->nb_ins * sizeof(prm->ins[0]);
+	bsz = sizeof(bpf[0]);
+	sz = insz + xsz + bsz;
+
+	buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (buf == MAP_FAILED)
+		return NULL;
+
+	bpf = (void *)buf;
+	bpf->sz = sz;
+
+	memcpy(&bpf->prm, prm, sizeof(bpf->prm));
+
+	memcpy(buf + bsz, prm->xsym, xsz);
+	memcpy(buf + bsz + xsz, prm->ins, insz);
+
+	bpf->prm.xsym = (void *)(buf + bsz);
+	bpf->prm.ins = (void *)(buf + bsz + xsz);
+
+	return bpf;
+}
+
+__rte_experimental struct rte_bpf *
+rte_bpf_load(const struct rte_bpf_prm *prm)
+{
+	struct rte_bpf *bpf;
+	int32_t rc;
+
+	if (prm == NULL || prm->ins == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	bpf = bpf_load(prm);
+	if (bpf == NULL) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+
+	rc = bpf_validate(bpf);
+	if (rc == 0) {
+		bpf_jit(bpf);
+		if (mprotect(bpf, bpf->sz, PROT_READ) != 0)
+			rc = -ENOMEM;
+	}
+
+	if (rc != 0) {
+		rte_bpf_destroy(bpf);
+		rte_errno = -rc;
+		return NULL;
+	}
+
+	return bpf;
+}
+
+static struct rte_bpf *
+bpf_load_elf(const struct rte_bpf_prm *prm, int32_t fd, const char *section)
+{
+	Elf *elf;
+	Elf_Data *sd;
+	size_t sidx;
+	int32_t rc;
+	struct rte_bpf *bpf;
+	struct rte_bpf_prm np;
+
+	elf_version(EV_CURRENT);
+	elf = elf_begin(fd, ELF_C_READ, NULL);
+
+	rc = find_elf_code(elf, section, &sd, &sidx);
+	if (rc == 0)
+		rc = elf_reloc_code(elf, sd, sidx, prm);
+
+	if (rc == 0) {
+		np = prm[0];
+		np.ins = sd->d_buf;
+		np.nb_ins = sd->d_size / sizeof(struct bpf_insn);
+		bpf = rte_bpf_load(&np);
+	} else {
+		bpf = NULL;
+		rte_errno = -rc;
+	}
+
+	elf_end(elf);
+	return bpf;
+}
+
+__rte_experimental struct rte_bpf *
+rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
+	const char *sname)
+{
+	int32_t fd, rc;
+	struct rte_bpf *bpf;
+
+	if (prm == NULL || fname == NULL || sname == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	fd = open(fname, O_RDONLY);
+	if (fd < 0) {
+		rc = errno;
+		RTE_BPF_LOG(ERR, "%s(%s) error code: %d(%s)\n",
+			__func__, fname, rc, strerror(rc));
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	bpf = bpf_load_elf(prm, fd, sname);
+	close(fd);
+
+	if (bpf == NULL) {
+		RTE_BPF_LOG(ERR,
+			"%s(fname=\"%s\", sname=\"%s\") failed, "
+			"error code: %d\n",
+			__func__, fname, sname, rte_errno);
+		return NULL;
+	}
+
+	RTE_BPF_LOG(INFO, "%s(fname=\"%s\", sname=\"%s\") "
+		"successfully creates %p(jit={.func=%p,.sz=%zu});\n",
+		__func__, fname, sname, bpf, bpf->jit.func, bpf->jit.sz);
+	return bpf;
+}
diff --git a/lib/librte_bpf/bpf_validate.c b/lib/librte_bpf/bpf_validate.c
new file mode 100644
index 000000000..1911e1381
--- /dev/null
+++ b/lib/librte_bpf/bpf_validate.c
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#include "bpf_impl.h"
+
+/*
+ * dummy one for now, need more work.
+ */
+int
+bpf_validate(struct rte_bpf *bpf)
+{
+	int32_t rc, ofs, stack_sz;
+	uint32_t i, op, dr;
+	const struct bpf_insn *ins;
+
+	rc = 0;
+	stack_sz = 0;
+	for (i = 0; i != bpf->prm.nb_ins; i++) {
+
+		ins = bpf->prm.ins + i;
+		op = ins->code;
+		dr = ins->dst_reg;
+		ofs = ins->off;
+
+		if ((BPF_CLASS(op) == BPF_STX || BPF_CLASS(op) == BPF_ST) &&
+				dr == BPF_REG_10) {
+			ofs -= sizeof(uint64_t);
+			stack_sz = RTE_MIN(ofs, stack_sz);
+		}
+	}
+
+	if (stack_sz != 0) {
+		stack_sz = -stack_sz;
+		if (stack_sz > MAX_BPF_STACK_SIZE)
+			rc = -ERANGE;
+		else
+			bpf->stack_sz = stack_sz;
+	}
+
+	if (rc != 0)
+		RTE_BPF_LOG(ERR, "%s(%p) failed, error code: %d;\n",
+			__func__, bpf, rc);
+	return rc;
+}
diff --git a/lib/librte_bpf/meson.build b/lib/librte_bpf/meson.build
new file mode 100644
index 000000000..05c48c7ff
--- /dev/null
+++ b/lib/librte_bpf/meson.build
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+allow_experimental_apis = true
+sources = files('bpf.c',
+		'bpf_exec.c',
+		'bpf_load.c',
+		'bpf_validate.c')
+
+install_headers = files('rte_bpf.h')
+
+deps += ['mbuf', 'net']
+
+dep = dependency('libelf', required: false)
+if dep.found() == false
+	build = false
+endif
+ext_deps += dep
diff --git a/lib/librte_bpf/rte_bpf.h b/lib/librte_bpf/rte_bpf.h
new file mode 100644
index 000000000..825621404
--- /dev/null
+++ b/lib/librte_bpf/rte_bpf.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_BPF_H_
+#define _RTE_BPF_H_
+
+/**
+ * @file
+ *
+ * RTE BPF support.
+ * librte_bpf provides a framework to load and execute eBPF bytecode
+ * inside user-space dpdk based applications.
+ * It supports basic set of features from eBPF spec
+ * (https://www.kernel.org/doc/Documentation/networking/filter.txt).
+ */
+
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <bpf_def.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Possible types for external symbols.
+ */
+enum rte_bpf_xtype {
+	RTE_BPF_XTYPE_FUNC, /**< function */
+	RTE_BPF_XTYPE_VAR, /**< variable */
+	RTE_BPF_XTYPE_NUM
+};
+
+/**
+ * Definition for external symbols available in the BPF program.
+ */
+struct rte_bpf_xsym {
+	const char *name;        /**< name */
+	enum rte_bpf_xtype type; /**< type */
+	union {
+		uint64_t (*func)(uint64_t, uint64_t, uint64_t,
+				uint64_t, uint64_t);
+		void *var;
+	}; /**< value */
+};
+
+/**
+ * Possible BPF program types.
+ * Use negative values for DPDK specific prog-types, to make sure they will
+ * not interfere with Linux related ones.
+ */
+enum rte_bpf_prog_type {
+	RTE_BPF_PROG_TYPE_UNSPEC = BPF_PROG_TYPE_UNSPEC,
+	/**< input is a pointer to raw data */
+	RTE_BPF_PROG_TYPE_MBUF = INT32_MIN,
+	/**< input is a pointer to rte_mbuf */
+};
+
+/**
+ * Input parameters for loading eBPF code.
+ */
+struct rte_bpf_prm {
+	const struct bpf_insn *ins; /**< array of eBPF instructions */
+	uint32_t nb_ins;            /**< number of instructions in ins */
+	const struct rte_bpf_xsym *xsym;
+	/**< array of external symbols that eBPF code is allowed to reference */
+	uint32_t nb_xsym; /**< number of elements in xsym */
+	enum rte_bpf_prog_type prog_type; /**< eBPF program type */
+};
+
+/**
+ * Information about compiled into native ISA eBPF code.
+ */
+struct rte_bpf_jit {
+	uint64_t (*func)(void *); /**< JIT-ed native code */
+	size_t sz;                /**< size of JIT-ed code */
+};
+
+struct rte_bpf;
+
+/**
+ * De-allocate all memory used by this eBPF execution context.
+ *
+ * @param bpf
+ *   BPF handle to destroy.
+ */
+void rte_bpf_destroy(struct rte_bpf *bpf);
+
+/**
+ * Create a new eBPF execution context and load given BPF code into it.
+ *
+ * @param prm
+ *  Parameters used to create and initialise the BPF exeution context.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENOMEM - can't reserve enough memory
+ */
+struct rte_bpf *rte_bpf_load(const struct rte_bpf_prm *prm);
+
+/**
+ * Create a new eBPF execution context and load BPF code from given ELF
+ * file into it.
+ *
+ * @param prm
+ *  Parameters used to create and initialise the BPF exeution context.
+ * @param fname
+ *  Pathname for a ELF file.
+ * @param sname
+ *  Name of the executable section within the file to load.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENOMEM - can't reserve enough memory
+ */
+struct rte_bpf *rte_bpf_elf_load(const struct rte_bpf_prm *prm,
+	const char *fname, const char *sname);
+
+/**
+ * Execute given BPF bytecode.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   pointer to input context.
+ * @return
+ *   BPF execution return value.
+ */
+uint64_t rte_bpf_exec(const struct rte_bpf *bpf, void *ctx);
+
+/**
+ * Execute given BPF bytecode over a set of input contexts.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   array of pointers to the input contexts.
+ * @param rc
+ *   array of return values (one per input).
+ * @param num
+ *   number of elements in ctx[] (and rc[]).
+ * @return
+ *   number of successfully processed inputs.
+ */
+uint32_t rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[],
+	uint64_t rc[], uint32_t num);
+
+/**
+ * Provide information about natively compield code for given BPF handle.
+ *
+ * @param bpf
+ *   handle for the BPF code.
+ * @param jit
+ *   pointer to the rte_bpf_jit structure to be filled with related data.
+ * @return
+ *   - -EINVAL if the parameters are invalid.
+ *   - Zero if operation completed successfully.
+ */
+int rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_BPF_H_ */
diff --git a/lib/librte_bpf/rte_bpf_version.map b/lib/librte_bpf/rte_bpf_version.map
new file mode 100644
index 000000000..ff65144df
--- /dev/null
+++ b/lib/librte_bpf/rte_bpf_version.map
@@ -0,0 +1,12 @@
+EXPERIMENTAL {
+	global:
+
+	rte_bpf_destroy;
+	rte_bpf_elf_load;
+	rte_bpf_exec;
+	rte_bpf_exec_burst;
+	rte_bpf_get_jit;
+	rte_bpf_load;
+
+	local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index ef6159170..7ff7aaaa5 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -23,7 +23,7 @@ libraries = [ 'compat', # just a header, used for versioning
 	# add pkt framework libs which use other libs from above
 	'port', 'table', 'pipeline',
 	# flow_classify lib depends on pkt framework table lib
-	'flow_classify']
+	'flow_classify', 'bpf']
 
 foreach l:libraries
 	build = true
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 258590819..405a13147 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -83,6 +83,8 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_POWER)          += -lrte_power
 _LDLIBS-$(CONFIG_RTE_LIBRTE_TIMER)          += -lrte_timer
 _LDLIBS-$(CONFIG_RTE_LIBRTE_EFD)            += -lrte_efd
 
+_LDLIBS-$(CONFIG_RTE_LIBRTE_BPF)            += -lrte_bpf -lelf
+
 _LDLIBS-y += --whole-archive
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
-- 
2.13.6

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 0/5] introduce new tunnel types
                     ` (2 preceding siblings ...)
  2018-04-12  7:33  3% ` [dpdk-dev] [PATCH v3 0/5] introduce new tunnel types Xueming Li
@ 2018-04-13 11:02  3% ` Xueming Li
  2018-04-17 15:04  3%   ` [dpdk-dev] [PATCH v5 0/4] " Xueming Li
  2018-04-20 11:56  3%   ` [dpdk-dev] [PATCH v6 0/5] " Xueming Li
  3 siblings, 2 replies; 200+ results
From: Xueming Li @ 2018-04-13 11:02 UTC (permalink / raw)
  To: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Adrien Mazarguil
  Cc: Xueming Li, Nelio Laranjeiro, Shahaf Shuler, dev, Olivier Matz

v4:
- Update testpmd doc for flow VXLAN-GPE paramter.
v3:
- Change VXLAN-GPE definition order to avoid ABI compatibility issue.
v2:
- Split patch set into public and mlx5 two series, this one is the first.
v1:
- Support new tunnel type MPLS-in-GRE and MPLS-in-UDP
- Remove deprecation notes of rss level

This patchset introduced new tunnel type and related testpmd code:
- New tunnel type VXLAN-GPE
  https://datatracker.ietf.org/doc/draft-ietf-nvo3-vxlan-gpe/
- New tunnel type MPLS-in-GRE
  https://tools.ietf.org/html/rfc4023
- New tunnel type MPLS-in-UDP
  https://tools.ietf.org/html/rfc7510
- Support GRE extension in testpmd csum forwarding engine

Xueming Li (5):
  doc: remove RSS configuration change announcement
  ethdev: introduce new tunnel VXLAN-GPE
  ethdev: introduce tunnel type MPLS-in-GRE and MPLS-in-UDP
  app/testpmd: introduce new tunnel VXLAN-GPE
  app/testpmd: add more GRE extension support to csum engine

 app/test-pmd/cmdline_flow.c                 |  24 +++++++
 app/test-pmd/config.c                       |   2 +
 app/test-pmd/csumonly.c                     | 103 +++++++++++++++++++++++++---
 app/test-pmd/parameters.c                   |  12 +++-
 app/test-pmd/testpmd.h                      |   2 +
 doc/guides/prog_guide/rte_flow.rst          |  12 ++++
 doc/guides/rel_notes/deprecation.rst        |   4 --
 doc/guides/testpmd_app_ug/run_app.rst       |   5 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   4 ++
 lib/librte_ether/rte_eth_ctrl.h             |   3 +-
 lib/librte_ether/rte_flow.c                 |   1 +
 lib/librte_ether/rte_flow.h                 |  27 ++++++++
 lib/librte_mbuf/rte_mbuf.c                  |   3 +
 lib/librte_mbuf/rte_mbuf.h                  |   1 +
 lib/librte_mbuf/rte_mbuf_ptype.c            |   3 +
 lib/librte_mbuf/rte_mbuf_ptype.h            |  47 +++++++++++++
 lib/librte_net/rte_ether.h                  |  25 +++++++
 17 files changed, 261 insertions(+), 17 deletions(-)

-- 
2.13.3

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v2 00/13] eal: replace calls to rte_panic and refrain from new instances
  @ 2018-04-13  9:16  0% ` Burakov, Anatoly
  0 siblings, 0 replies; 200+ results
From: Burakov, Anatoly @ 2018-04-13  9:16 UTC (permalink / raw)
  To: Arnon Warshavsky, thomas, wenzhuo.lu, declan.doherty,
	jerin.jacob, bruce.richardson, ferruh.yigit
  Cc: dev

On 04-Apr-18 11:01 PM, Arnon Warshavsky wrote:
> 
> The purpose of this patch series is to cleanup the library code
> from paths that end up aborting the process,
> and move to checking error values, in order to allow the running process
> perform an orderly teardown or other mitigation of the event.
> 
> This patch modifies the majority of rte_panic calls
> under lib and drivers, and replaces them with a log message
> and an error return code according to context,
> that can be propagated up the call stack.
> 
> - Focus was given to the dpdk initialization path
> - Some of the panic calls within drivers were left in place where
>    the call is from within an interrupt or calls that are
>    on the data path,where there is no simple applicative
>    route to propagate the error to temination.
>    These should be handled by the driver maintainers.
> - In order to avoid breaking ABI where panic was called from public
>    void functions, a panic state variable was introduced so that
>    it can be queried after calling these void functions.
>    This tool place for a single function call.
> - local void functions with no api were changed to retrun a value
>    where needed
> - No change took place in example and test files
> - No change took place for debug assertions calling panic
> - A new function was added to devtools/checkpatches.sh
>    in order to prevent new additions of calls to rte_panic
>    under lib and drivers.
> 
> Keep calm and don't panic
> 
> ---
> 
> v2:
> - reformat error messages so that literal string are in the same line
> - fix typo in commit message
> - add new return code to doxigen of rte_memzone_free()

Hi Arnon,

When sending new versions, the entire patchset must be sent. It makes it 
easier for maintainers to apply patches this way.

-- 
Thanks,
Anatoly

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-12 16:34  0%                     ` Ananyev, Konstantin
@ 2018-04-12 18:58  0%                       ` Yongseok Koh
  0 siblings, 0 replies; 200+ results
From: Yongseok Koh @ 2018-04-12 18:58 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Olivier Matz, Lu, Wenzhuo, Wu, Jingjing, Adrien Mazarguil,
	Nélio Laranjeiro, dev

On Thu, Apr 12, 2018 at 04:34:56PM +0000, Ananyev, Konstantin wrote:
> > >
> > > > > >
> > > > > > On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > > > > > > Hi Yongseok,
> > > > > > >
> > > > > > > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > > > > > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > > > > > > Hi,
> > > > > > > > >
> > > > > > > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > > > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > > > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > > > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > > > > > > rte_pktmbuf_attach_at().
> > > > > > > > > >
> > > > > > > > > > Possible use-cases could be:
> > > > > > > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > > > > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > > > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > > > > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > > > > > > >
> > > > > > > > > I think the current API is already able to do what you want.
> > > > > > > > >
> > > > > > > > > 1/ Here is a mbuf m with its data
> > > > > > > > >
> > > > > > > > >                off
> > > > > > > > >                <-->
> > > > > > > > >                       len
> > > > > > > > >           +----+   <---------->
> > > > > > > > >           |    |
> > > > > > > > >         +-|----v----------------------+
> > > > > > > > >         | |    -----------------------|
> > > > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > > > >         |      -----------------------|
> > > > > > > > >         +-----------------------------+
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > 2/ clone m:
> > > > > > > > >
> > > > > > > > >   c = rte_pktmbuf_alloc(pool);
> > > > > > > > >   rte_pktmbuf_attach(c, m);
> > > > > > > > >
> > > > > > > > >   Note that c has its own offset and length fields.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >                off
> > > > > > > > >                <-->
> > > > > > > > >                       len
> > > > > > > > >           +----+   <---------->
> > > > > > > > >           |    |
> > > > > > > > >         +-|----v----------------------+
> > > > > > > > >         | |    -----------------------|
> > > > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > > > >         |      -----------------------|
> > > > > > > > >         +------^----------------------+
> > > > > > > > >                |
> > > > > > > > >           +----+
> > > > > > > > > indirect  |
> > > > > > > > >         +-|---------------------------+
> > > > > > > > >         | |    -----------------------|
> > > > > > > > > c       | buf  |                     ||
> > > > > > > > >         |      -----------------------|
> > > > > > > > >         +-----------------------------+
> > > > > > > > >
> > > > > > > > >                 off    len
> > > > > > > > >                 <--><---------->
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > 3/ remove some data from c without changing m
> > > > > > > > >
> > > > > > > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > > > > > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Please let me know if it fits your needs.
> > > > > > > >
> > > > > > > > No, it doesn't.
> > > > > > > >
> > > > > > > > Trimming head and tail with the current APIs removes data and make the space
> > > > > > > > available. Adjusting packet head means giving more headroom, not shifting the
> > > > > > > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > > > > > > difference offsets in m,
> > > > > > > >
> > > > > > > > rte_pktmbuf_adj(c1, 10);
> > > > > > > > rte_pktmbuf_adj(c2, 20);
> > > > > > > >
> > > > > > > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > > > > > > wants to attach outer header, it will overwrite the headroom even though the
> > > > > > > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > > > > > > header should be linked by h1->next = c2.
> > > > > > >
> > > > > > > Yes, after these operations c1, c2 and m should become read-only. So, to
> > > > > > > prepend headers, another mbuf has to be inserted before as you suggest. It
> > > > > > > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > > > > > > length) that will:
> > > > > > >   - alloc and attach indirect mbuf for each segment of m that is
> > > > > > >     in the range [offset : length+offset].
> > > > > > >   - prepend an empty and writable mbuf for the headers
> > > > > > >
> > > > > > > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > > > > > > which actually shrink the headroom, this case can be properly handled.
> > > > > > >
> > > > > > > What do you mean by properly handled?
> > > > > > >
> > > > > > > Yes, prepending data or adding data in the indirect mbuf won't override
> > > > > > > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > > > > > > won't be protected.
> > > > > > >
> > > > > > > From an application point of view, indirect mbufs, or direct mbufs that
> > > > > > > have refcnt != 1, should be both considered as read-only because they
> > > > > > > may share their data. How an application can know if the data is shared
> > > > > > > or not?
> > > > > > >
> > > > > > > Maybe we need a flag to differentiate mbufs that are read-only
> > > > > > > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > > > > > > understanding is correct, you want to have indirect mbufs with RW data.
> > > > > >
> > > > > > Agree that indirect mbuf must be treated as read-only, Then the current code is
> > > > > > enough to handle that use-case.
> > > > > >
> > > > > > > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > > > > > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > > > > > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > > > > > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > > > > > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > > > > > > indirect referencing.
> > > > >
> > > > > But just to make HW to RX multiple packets into one mbuf,
> > > > > data_off inside indirect mbuf should be enough, correct?
> > > > Right. Current max buffer len of mbuf is 64kB (16bits) but it is enough for mlx5
> > > > to reach to 100Gbps with 64B traffic (149Mpps). I made mlx5 HW put 16 packets in
> > > > a buffer. So, it needs ~32kB buffer. Having more bits in length fields would be
> > > > better but 16-bit is good enough to overcome the PCIe Gen3 bottleneck in order
> > > > to saturate the network link.
> > >
> > > There were few complains that 64KB max is a limitation for some use-cases.
> > > I am not against increasing it, but I don't think we have free space on first cache-line for that
> > > without another big rework of mbuf layout.
> > > Considering that we need to increase size for buf_len, data_off, data_len, and probably priv_size too.
> > >
> > > >
> > > > > As I understand, what you'd like to achieve with this new field -
> > > > > ability to manipulate packet boundaries after RX, probably at upper layer.
> > > > > As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
> > > > > indirect mbufs trying to modify same direct buffer.
> > > >
> > > > I agree that there's an implication that indirect mbuf or mbuf having refcnt > 1
> > > > is read-only. What that means, all the entities which own such mbufs have to be
> > > > aware of that and keep the principle as DPDK can't enforce the rule and there
> > > > can't be such sanity check. In this sense, HW doesn't violate it because the
> > > > direct mbuf is injected to HW before indirection. When packets are written by
> > > > HW, PMD attaches indirect mbufs to the direct mbuf and deliver those to
> > > > application layer with freeing the original direct mbuf (decrement refcnt by 1).
> > > > So, HW doesn't touch the direct buffer once it reaches to upper layer.
> > >
> > > Yes, I understand that. But as I can see you introduced functions to adjust head and tail,
> > > which implies that it should be possible by some entity (upper layer?) to manipulate these
> > > indirect mbufs.
> > > And we don't know how exactly it will be done.
> > 
> > That's a valid concern. I can make it private by merging into the _attach_to()
> > func, or I just can add a comment in the API doc. However, if users are aware
> > that a mbuf is read-only and we expect them to keep it intact by their own
> > judgement, they would/should not use those APIs. We can't stop them modifying
> > content or the buffer itself anyway. Will add more comments of this discussion
> > regarding read-only mode.
> 
> Ok, so these functions are intended to be used only by PMD level?
> But in that case do you need them at all?
> Isn't it possible implement same thing with just data_off?
> I mean your PMD knows in advance what is the buf_len of mbuf and at startup
> time it can decide it going to slice it how to slice it into multiple packets.
> So each offset is known in advance and you don't need to worry that you'll overwrite
> neighbor packet's data. 

Since Olivier's last comment, I've been thinking about the approach all over
again. It looks like I'm trapped in self-contradiction. The reason why I didn't
want to use data_off was to provide valid headroom for each Rx packet and let
users freely write the headroom. But, given that indirect mbuf should be
considered read-only, this isn't a right approach. Instead of slicing a buffer
with mbuf indirection and manipulating boundaries, the idea of external data (as
Olivier suggested) would fit better. Even though it is more complex, it is
doable. I summarized ideas yesterday and will come up with a new patch soon.

Briefly, I think reserved bit 61 of ol_flags can be used to indicate externally
attached mbuf. The following is my initial thought.

#define EXT_ATTACHED_MBUF    (1ULL << 61)

struct rte_pktmbuf_ext_shared_info {
	refcnt;
	*free_cb();
	*opaque /* arg for free_cb() */
}

rte_pktmbuf_get_ext_shinfo() {
	/* Put shared info at the end of external buffer */
	return (struct rte_pktmbuf_ext_shared_info *)(m->buf_addr + m->buf_len);
}

rte_pktmbuf_attach_ext_buf(m, buf_addr, buf_len, free_cb, opaque) {
	struct rte_pktmbuf_ext_shared_info *shinfo;

	m->buf_addr = buf_addr;
	m->buf_iova = rte_mempool_virt2iova(buf_addr);
	/* Have to add some calculation for alignment */
	m->buf_len = buf_len - sizeof (*shinfo);
	shinfo = m->buf_addr + m->buf_len;
	...
	m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m->buf_len);
	m->ol_flags |= EXT_ATTACHED_MBUF;
	atomic set shinfo->refcnt = 1;

	shinfo->free_cb = free_cb;
	shinfo->opaque = opaque;

	...
}
rte_pktmbuf_detach_ext_buf(m)

#define RTE_MBUF_EXT(mb)   ((mb)->ol_flags & EXT_ATTACHED_MBUF)

In rte_pktmbuf_prefree_seg(),

		if (RTE_MBUF_INDIRECT(m))
			rte_pktmbuf_detach(m);
		else if (RTE_MBUF_EXT(m))
			rte_pktmbuf_detach_ext_buf(m);

And in rte_pktmbuf_attach(), if the mbuf attaching to is externally attached,
then just increase refcnt in shinfo so that multiple mbufs can refer to the same
external buffer.

Please feel free to share any concern/idea.

> > > > The direct buffer will be freed and get available for reuse when all the attached
> > > > indirect mbufs are freed.
> > > >
> > > > > Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
> > > > > Fields for indirect mbufs, straight after attach()?
> > > >
> > > > Good point.
> > > > Actually that was my draft (Mellanox internal) version of this patch :-) But I
> > > > had to consider a case where priv_size is really given by user. Even though it
> > > > is less likely, but if original priv_size is quite big, it can't cover entire
> > > > buf_len. For this, I had to increase priv_size to 32-bit but adding another
> > > > 16bit field (buf_off) looked more plausible.
> > >
> > > As I remember, we can't have mbufs bigger then 64K,
> > > so priv_size + buf_len should be always less than 64K, correct?
> > 
> > Can you let me know where I can find the constraint? I checked
> > rte_pktmbuf_pool_create() and rte_pktmbuf_init() again to not make any mistake
> > but there's no such limitation.
> > 
> > 	elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size +
> > 		(unsigned)data_room_size;
> 
> 
> Ok I scanned through librte_mbuf and didn't find any limitations.
> Seems like a false impression from my side.
> Anyway that seems like a corner case to have priv_szie + buf_len >64KB.
> Do you really need to support it?

If a user must have 64kB buffer (it's valid, no violation) and the priv_size is
just a few bytes. Then, does library have to force the user to sacrifice a few
bytes for priv_size? Do you think it's a corner case? Still using priv_size
doesn't seem to be a good idea.

Yongseok

> > The max of data_room_size is 64kB, so is priv_size. m->buf_addr starts from 'm +
> > sizeof(*m) + priv_size' and m->buf_len can't be larger than UINT16_MAX. So,
> > priv_size couldn't be used for this purpose.
> > 
> > Yongseok
> > 
> > > > > > > >
> > > > > > > > Does this make sense?
> > > > > > >
> > > > > > > I understand the need.
> > > > > > >
> > > > > > > Another option would be to make the mbuf->buffer point to an external
> > > > > > > buffer (not inside the direct mbuf). This would require to add a
> > > > > > > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > > > > > > a quick overview.
> > > > > > >
> > > > > > > [1]
> > > > > >
> > > >
> > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdpdksummit.com%2FArchive%2Fpdf%2F2016Userspace%2FDay01
> > > > > > -Session05-OlivierMatz-
> > > > > >
> > > >
> > Userspace2016.pdf&data=02%7C01%7Cyskoh%40mellanox.com%7Ca5405edb36e445e6540808d59e339a38%7Ca652971c7d2e4d9ba6a4d
> > > > > > 149256f461b%7C0%7C0%7C636588866861082855&sdata=llw%2BwiY5cC56naOUhBbIg8TKtfFN6VZcIRY5PV7VqZs%3D&reserved=0
> > > > > > >
> > > > > > > The advantage is that it does not require the large data to be inside a
> > > > > > > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > > > > > > allocated from a mempool). On the other hand, it is maybe more complex
> > > > > > > to implement compared to your solution.
> > > > > >
> > > > > > I knew that you presented the slides and frankly, I had considered that option
> > > > > > at first. But even with that option, metadata to store refcnt should also be
> > > > > > allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> > > > > > end of the data segment. Even though it could have smaller metadata structure,
> > > > > > I just wanted to make full use of the existing framework because it is less
> > > > > > complex as you mentioned. Given that you presented the idea of external data
> > > > > > buffer in 2016 and there hasn't been many follow-up discussions/activities so
> > > > > > far, I thought the demand isn't so big yet thus I wanted to make this patch
> > > > > > simpler.  I personally think that we can take the idea of external data seg when
> > > > > > more demands come from users in the future as it would be a huge change and may
> > > > > > break current ABI/API. When the day comes, I'll gladly participate in the
> > > > > > discussions and write codes for it if I can be helpful.
> > > > > >
> > > > > > Do you think this patch is okay for now?
> > > > > >
> > > > > >
> > > > > > Thanks for your comments,
> > > > > > Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-11 17:08  0%                   ` Yongseok Koh
@ 2018-04-12 16:34  0%                     ` Ananyev, Konstantin
  2018-04-12 18:58  0%                       ` Yongseok Koh
  0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2018-04-12 16:34 UTC (permalink / raw)
  To: Yongseok Koh
  Cc: Olivier Matz, Lu, Wenzhuo, Wu, Jingjing, Adrien Mazarguil,
	Nélio Laranjeiro, dev

> >
> > > > >
> > > > > On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > > > > > Hi Yongseok,
> > > > > >
> > > > > > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > > > > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > > > > > Hi,
> > > > > > > >
> > > > > > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > > > > > rte_pktmbuf_attach_at().
> > > > > > > > >
> > > > > > > > > Possible use-cases could be:
> > > > > > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > > > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > > > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > > > > > >
> > > > > > > > I think the current API is already able to do what you want.
> > > > > > > >
> > > > > > > > 1/ Here is a mbuf m with its data
> > > > > > > >
> > > > > > > >                off
> > > > > > > >                <-->
> > > > > > > >                       len
> > > > > > > >           +----+   <---------->
> > > > > > > >           |    |
> > > > > > > >         +-|----v----------------------+
> > > > > > > >         | |    -----------------------|
> > > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > > >         |      -----------------------|
> > > > > > > >         +-----------------------------+
> > > > > > > >
> > > > > > > >
> > > > > > > > 2/ clone m:
> > > > > > > >
> > > > > > > >   c = rte_pktmbuf_alloc(pool);
> > > > > > > >   rte_pktmbuf_attach(c, m);
> > > > > > > >
> > > > > > > >   Note that c has its own offset and length fields.
> > > > > > > >
> > > > > > > >
> > > > > > > >                off
> > > > > > > >                <-->
> > > > > > > >                       len
> > > > > > > >           +----+   <---------->
> > > > > > > >           |    |
> > > > > > > >         +-|----v----------------------+
> > > > > > > >         | |    -----------------------|
> > > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > > >         |      -----------------------|
> > > > > > > >         +------^----------------------+
> > > > > > > >                |
> > > > > > > >           +----+
> > > > > > > > indirect  |
> > > > > > > >         +-|---------------------------+
> > > > > > > >         | |    -----------------------|
> > > > > > > > c       | buf  |                     ||
> > > > > > > >         |      -----------------------|
> > > > > > > >         +-----------------------------+
> > > > > > > >
> > > > > > > >                 off    len
> > > > > > > >                 <--><---------->
> > > > > > > >
> > > > > > > >
> > > > > > > > 3/ remove some data from c without changing m
> > > > > > > >
> > > > > > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > > > > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > > > > > >
> > > > > > > >
> > > > > > > > Please let me know if it fits your needs.
> > > > > > >
> > > > > > > No, it doesn't.
> > > > > > >
> > > > > > > Trimming head and tail with the current APIs removes data and make the space
> > > > > > > available. Adjusting packet head means giving more headroom, not shifting the
> > > > > > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > > > > > difference offsets in m,
> > > > > > >
> > > > > > > rte_pktmbuf_adj(c1, 10);
> > > > > > > rte_pktmbuf_adj(c2, 20);
> > > > > > >
> > > > > > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > > > > > wants to attach outer header, it will overwrite the headroom even though the
> > > > > > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > > > > > header should be linked by h1->next = c2.
> > > > > >
> > > > > > Yes, after these operations c1, c2 and m should become read-only. So, to
> > > > > > prepend headers, another mbuf has to be inserted before as you suggest. It
> > > > > > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > > > > > length) that will:
> > > > > >   - alloc and attach indirect mbuf for each segment of m that is
> > > > > >     in the range [offset : length+offset].
> > > > > >   - prepend an empty and writable mbuf for the headers
> > > > > >
> > > > > > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > > > > > which actually shrink the headroom, this case can be properly handled.
> > > > > >
> > > > > > What do you mean by properly handled?
> > > > > >
> > > > > > Yes, prepending data or adding data in the indirect mbuf won't override
> > > > > > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > > > > > won't be protected.
> > > > > >
> > > > > > From an application point of view, indirect mbufs, or direct mbufs that
> > > > > > have refcnt != 1, should be both considered as read-only because they
> > > > > > may share their data. How an application can know if the data is shared
> > > > > > or not?
> > > > > >
> > > > > > Maybe we need a flag to differentiate mbufs that are read-only
> > > > > > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > > > > > understanding is correct, you want to have indirect mbufs with RW data.
> > > > >
> > > > > Agree that indirect mbuf must be treated as read-only, Then the current code is
> > > > > enough to handle that use-case.
> > > > >
> > > > > > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > > > > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > > > > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > > > > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > > > > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > > > > > indirect referencing.
> > > >
> > > > But just to make HW to RX multiple packets into one mbuf,
> > > > data_off inside indirect mbuf should be enough, correct?
> > > Right. Current max buffer len of mbuf is 64kB (16bits) but it is enough for mlx5
> > > to reach to 100Gbps with 64B traffic (149Mpps). I made mlx5 HW put 16 packets in
> > > a buffer. So, it needs ~32kB buffer. Having more bits in length fields would be
> > > better but 16-bit is good enough to overcome the PCIe Gen3 bottleneck in order
> > > to saturate the network link.
> >
> > There were few complains that 64KB max is a limitation for some use-cases.
> > I am not against increasing it, but I don't think we have free space on first cache-line for that
> > without another big rework of mbuf layout.
> > Considering that we need to increase size for buf_len, data_off, data_len, and probably priv_size too.
> >
> > >
> > > > As I understand, what you'd like to achieve with this new field -
> > > > ability to manipulate packet boundaries after RX, probably at upper layer.
> > > > As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
> > > > indirect mbufs trying to modify same direct buffer.
> > >
> > > I agree that there's an implication that indirect mbuf or mbuf having refcnt > 1
> > > is read-only. What that means, all the entities which own such mbufs have to be
> > > aware of that and keep the principle as DPDK can't enforce the rule and there
> > > can't be such sanity check. In this sense, HW doesn't violate it because the
> > > direct mbuf is injected to HW before indirection. When packets are written by
> > > HW, PMD attaches indirect mbufs to the direct mbuf and deliver those to
> > > application layer with freeing the original direct mbuf (decrement refcnt by 1).
> > > So, HW doesn't touch the direct buffer once it reaches to upper layer.
> >
> > Yes, I understand that. But as I can see you introduced functions to adjust head and tail,
> > which implies that it should be possible by some entity (upper layer?) to manipulate these
> > indirect mbufs.
> > And we don't know how exactly it will be done.
> 
> That's a valid concern. I can make it private by merging into the _attach_to()
> func, or I just can add a comment in the API doc. However, if users are aware
> that a mbuf is read-only and we expect them to keep it intact by their own
> judgement, they would/should not use those APIs. We can't stop them modifying
> content or the buffer itself anyway. Will add more comments of this discussion
> regarding read-only mode.

Ok, so these functions are intended to be used only by PMD level?
But in that case do you need them at all?
Isn't it possible implement same thing with just data_off?
I mean your PMD knows in advance what is the buf_len of mbuf and at startup
time it can decide it going to slice it how to slice it into multiple packets.
So each offset is known in advance and you don't need to worry that you'll overwrite
neighbor packet's data. 

> 
> > > The direct buffer will be freed and get available for reuse when all the attached
> > > indirect mbufs are freed.
> > >
> > > > Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
> > > > Fields for indirect mbufs, straight after attach()?
> > >
> > > Good point.
> > > Actually that was my draft (Mellanox internal) version of this patch :-) But I
> > > had to consider a case where priv_size is really given by user. Even though it
> > > is less likely, but if original priv_size is quite big, it can't cover entire
> > > buf_len. For this, I had to increase priv_size to 32-bit but adding another
> > > 16bit field (buf_off) looked more plausible.
> >
> > As I remember, we can't have mbufs bigger then 64K,
> > so priv_size + buf_len should be always less than 64K, correct?
> 
> Can you let me know where I can find the constraint? I checked
> rte_pktmbuf_pool_create() and rte_pktmbuf_init() again to not make any mistake
> but there's no such limitation.
> 
> 	elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size +
> 		(unsigned)data_room_size;


Ok I scanned through librte_mbuf and didn't find any limitations.
Seems like a false impression from my side.
Anyway that seems like a corner case to have priv_szie + buf_len >64KB.
Do you really need to support it?

Konstantin

> 
> The max of data_room_size is 64kB, so is priv_size. m->buf_addr starts from 'm +
> sizeof(*m) + priv_size' and m->buf_len can't be larger than UINT16_MAX. So,
> priv_size couldn't be used for this purpose.
> 
> Yongseok
> 
> > > > > > >
> > > > > > > Does this make sense?
> > > > > >
> > > > > > I understand the need.
> > > > > >
> > > > > > Another option would be to make the mbuf->buffer point to an external
> > > > > > buffer (not inside the direct mbuf). This would require to add a
> > > > > > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > > > > > a quick overview.
> > > > > >
> > > > > > [1]
> > > > >
> > >
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdpdksummit.com%2FArchive%2Fpdf%2F2016Userspace%2FDay01
> > > > > -Session05-OlivierMatz-
> > > > >
> > >
> Userspace2016.pdf&data=02%7C01%7Cyskoh%40mellanox.com%7Ca5405edb36e445e6540808d59e339a38%7Ca652971c7d2e4d9ba6a4d
> > > > > 149256f461b%7C0%7C0%7C636588866861082855&sdata=llw%2BwiY5cC56naOUhBbIg8TKtfFN6VZcIRY5PV7VqZs%3D&reserved=0
> > > > > >
> > > > > > The advantage is that it does not require the large data to be inside a
> > > > > > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > > > > > allocated from a mempool). On the other hand, it is maybe more complex
> > > > > > to implement compared to your solution.
> > > > >
> > > > > I knew that you presented the slides and frankly, I had considered that option
> > > > > at first. But even with that option, metadata to store refcnt should also be
> > > > > allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> > > > > end of the data segment. Even though it could have smaller metadata structure,
> > > > > I just wanted to make full use of the existing framework because it is less
> > > > > complex as you mentioned. Given that you presented the idea of external data
> > > > > buffer in 2016 and there hasn't been many follow-up discussions/activities so
> > > > > far, I thought the demand isn't so big yet thus I wanted to make this patch
> > > > > simpler.  I personally think that we can take the idea of external data seg when
> > > > > more demands come from users in the future as it would be a huge change and may
> > > > > break current ABI/API. When the day comes, I'll gladly participate in the
> > > > > discussions and write codes for it if I can be helpful.
> > > > >
> > > > > Do you think this patch is okay for now?
> > > > >
> > > > >
> > > > > Thanks for your comments,
> > > > > Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 04/11] mempool: add op to calculate memory size to be allocated
  @ 2018-04-12 15:22  0%     ` Burakov, Anatoly
  0 siblings, 0 replies; 200+ results
From: Burakov, Anatoly @ 2018-04-12 15:22 UTC (permalink / raw)
  To: Andrew Rybchenko, dev; +Cc: Olivier MATZ

On 26-Mar-18 5:09 PM, Andrew Rybchenko wrote:
> Size of memory chunk required to populate mempool objects depends
> on how objects are stored in the memory. Different mempool drivers
> may have different requirements and a new operation allows to
> calculate memory size in accordance with driver requirements and
> advertise requirements on minimum memory chunk size and alignment
> in a generic way.
> 
> Bump ABI version since the patch breaks it.
> 
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
> ---

Hi Andrew,

<...>

> -	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>   	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> -		size = rte_mempool_xmem_size(n, total_elt_sz, pg_shift,
> -						mp->flags);
> +		size_t min_chunk_size;
> +
> +		mem_size = rte_mempool_ops_calc_mem_size(mp, n, pg_shift,
> +				&min_chunk_size, &align);
> +		if (mem_size < 0) {
> +			ret = mem_size;
> +			goto fail;
> +		}
>   
>   		ret = snprintf(mz_name, sizeof(mz_name),
>   			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
> @@ -606,7 +600,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   			goto fail;
>   		}
>   
> -		mz = rte_memzone_reserve_aligned(mz_name, size,
> +		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
>   			mp->socket_id, mz_flags, align);
>   		/* not enough memory, retry with the biggest zone we have */
>   		if (mz == NULL)
> @@ -617,6 +611,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   			goto fail;
>   		}
>   
> +		if (mz->len < min_chunk_size) {
> +			rte_memzone_free(mz);
> +			ret = -ENOMEM;
> +			goto fail;
> +		}
> +
>   		if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
>   			iova = RTE_BAD_IOVA;

OK by me, but needs to be rebased.

>   		else
> @@ -649,13 +649,14 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   static size_t
>   get_anon_size(const struct rte_mempool *mp)
>   {
> -	size_t size, total_elt_sz, pg_sz, pg_shift;
> +	size_t size, pg_sz, pg_shift;
> +	size_t min_chunk_size;
> +	size_t align;
>   
>   	pg_sz = getpagesize();

<...>

>   
> +/**
> + * Calculate memory size required to store given number of objects.
> + *
> + * If mempool objects are not required to be IOVA-contiguous
> + * (the flag MEMPOOL_F_NO_IOVA_CONTIG is set), min_chunk_size defines
> + * virtually contiguous chunk size. Otherwise, if mempool objects must
> + * be IOVA-contiguous (the flag MEMPOOL_F_NO_IOVA_CONTIG is clear),
> + * min_chunk_size defines IOVA-contiguous chunk size.
> + *
> + * @param[in] mp
> + *   Pointer to the memory pool.
> + * @param[in] obj_num
> + *   Number of objects.
> + * @param[in] pg_shift
> + *   LOG2 of the physical pages size. If set to 0, ignore page boundaries.
> + * @param[out] min_chunk_size
> + *   Location for minimum size of the memory chunk which may be used to
> + *   store memory pool objects.
> + * @param[out] align
> + *   Location for required memory chunk alignment.
> + * @return
> + *   Required memory size aligned at page boundary.
> + */
> +typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
> +		uint32_t obj_num,  uint32_t pg_shift,
> +		size_t *min_chunk_size, size_t *align);
> +
> +/**
> + * Default way to calculate memory size required to store given number of
> + * objects.
> + *
> + * If page boundaries may be ignored, it is just a product of total
> + * object size including header and trailer and number of objects.
> + * Otherwise, it is a number of pages required to store given number of
> + * objects without crossing page boundary.
> + *
> + * Note that if object size is bigger than page size, then it assumes
> + * that pages are grouped in subsets of physically continuous pages big
> + * enough to store at least one object.
> + *
> + * If mempool driver requires object addresses to be block size aligned
> + * (MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS), space for one extra element is
> + * reserved to be able to meet the requirement.
> + *
> + * Minimum size of memory chunk is either all required space, if
> + * capabilities say that whole memory area must be physically contiguous
> + * (MEMPOOL_F_CAPA_PHYS_CONTIG), or a maximum of the page size and total
> + * element size.
> + *
> + * Required memory chunk alignment is a maximum of page size and cache
> + * line size.
> + */
> +ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> +		uint32_t obj_num, uint32_t pg_shift,
> +		size_t *min_chunk_size, size_t *align);

For API docs and wording,

Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>

Should be pretty straightforward to rebase, so you probably should keep 
my ack for v4.

-- 
Thanks,
Anatoly

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 3/4] ether: add more protocol support in flow API
  2018-04-12  9:19  0%         ` Adrien Mazarguil
@ 2018-04-12 10:00  0%           ` Zhang, Qi Z
  0 siblings, 0 replies; 200+ results
From: Zhang, Qi Z @ 2018-04-12 10:00 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Doherty, Declan, Chandran, Sugesh, Glynn, Michael J, Liu,
	Yu Y, Ananyev, Konstantin, Richardson, Bruce



> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Thursday, April 12, 2018 5:20 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>
> Cc: dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>; Chandran,
> Sugesh <sugesh.chandran@intel.com>; Glynn, Michael J
> <michael.j.glynn@intel.com>; Liu, Yu Y <yu.y.liu@intel.com>; Ananyev,
> Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Subject: Re: [PATCH v2 3/4] ether: add more protocol support in flow API
> 
> On Thu, Apr 12, 2018 at 05:12:08AM +0000, Zhang, Qi Z wrote:
> > Hi Adrien:
> >
> > 	Thank you so much for your careful review and helpful suggestions!
> > 	I agree with most of your comments, except couple question about
> RTE_FLOW_ITEM_TYPE_TGT_ADDR and RTE_FLOW_ITEM_IPV6_EXT_HDR
> > 	Please see my comment inline.
> >
> > Thanks!
> > Qi
> 
> Thanks, replying inline also.
> 
> > > -----Original Message-----
> > > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > > Sent: Thursday, April 12, 2018 12:32 AM
> > > To: Zhang, Qi Z <qi.z.zhang@intel.com>
> > > Cc: dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>;
> > > Chandran, Sugesh <sugesh.chandran@intel.com>; Glynn, Michael J
> > > <michael.j.glynn@intel.com>; Liu, Yu Y <yu.y.liu@intel.com>;
> > > Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson,
> > > Bruce <bruce.richardson@intel.com>
> > > Subject: Re: [PATCH v2 3/4] ether: add more protocol support in flow
> > > API
> > >
> > > On Sun, Apr 01, 2018 at 05:19:21PM -0400, Qi Zhang wrote:
> > > > Add new protocol header match support as below
> > > >
> > > > RTE_FLOW_ITEM_TYPE_ARP
> > > > 	- match IPv4 ARP header
> > > > RTE_FLOW_ITEM_TYPE_EXT_HDR_ANY
> > > > 	- match any IPv6 extension header
> > >
> > > While properly defined in the patch, "IPV6" is missing here.
> > >
> > > > RTE_FLOW_ITEM_TYPE_ICMPV6
> > > > 	- match IPv6 ICMP header
> > > > RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > > > 	- match IPv6 ICMP Target address
> > > > RTE_FLOW_ITEM_TYPE_ICMPV6_SSL
> > > > 	- match IPv6 ICMP Source Link-layer address
> > > > RTE_FLOW_ITEM_TYPE_ICMPV6_TTL
> > > > 	- match IPv6 ICMP Target Link-layer address
> > > >
> > > > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > >
> > > First, since they are added at the end of enum rte_flow_item_type,
> > > no ABI breakage notice is necessary.
> > >
> > > However testpmd implementation [1][2] and documentation update
> > > [3][4] are mandatory for all new pattern items and actions.
> >
> > OK, will add this into v2.
> >
> > >
> > > More comments below regarding these definitions.
> > >
> > > [1] flow_item[] in app/test-pmd/config.c [2] using ITEM_ICMP as an
> > > example in app/test-pmd/cmdline_flow.c [3] "Pattern items" section
> > > in doc/guides/testpmd_app_ug/testpmd_funcs.rst
> > > [4] using "Item: ``ICMP``" section as an example in
> > >     doc/guides/prog_guide/rte_flow.rst
> > >
> > > > ---
> > > >  lib/librte_ether/rte_flow.h | 160
> > > > ++++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 160 insertions(+)
> > > >
> > > > diff --git a/lib/librte_ether/rte_flow.h
> > > > b/lib/librte_ether/rte_flow.h index 8f75db0..a8ec780 100644
> > > > --- a/lib/librte_ether/rte_flow.h
> > > > +++ b/lib/librte_ether/rte_flow.h
> > > > @@ -323,6 +323,49 @@ enum rte_flow_item_type {
> > > >  	 * See struct rte_flow_item_geneve.
> > > >  	 */
> > > >  	RTE_FLOW_ITEM_TYPE_GENEVE,
> > > > +
> > > > +	/**
> > > > +	 * Matches ARP IPv4 header.
> > >
> > > => Matches an IPv4 ARP header.
> > >
> > > > +	 *
> > > > +	 * See struct rte_flow_item_arp.
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_ARP,
> > >
> > > While you're right to make "IPv4" clear since ARP is also used for
> > > other protocols DPDK doesn't support (and likely never will), the
> > > ARP header has both a fixed and a variably-sized part.
> > >
> > > Ideally an ARP pattern item should match the fixed part only and a
> > > separate
> > > ARP_IPV4 match its payload, somewhat like you did for ICMPv6/NDP
> below.
> > >
> > > Problem is that in DPDK, struct arp_hdr includes struct arp_ipv4, so
> > > one suggestion would be to rename this pattern item ARP_IPV4 directly:
> > >
> > > => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> > >
> > > > +
> > > > +	/**
> > > > +	 * Matches any IPv6 Extension header.
> > >
> > > => Matches an IPv6 extension header.
> > >
> > > > +	 *
> > > > +	 * See struct rte_flow_item_ipv6_ext_any.
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY,
> > >
> > > I'm not sure this definition is necessary, more below about that.
> > >
> > > Also I don't see a benefit in having "ANY" part of the name, if you
> > > want to keep it, I suggest the simpler:
> > >
> > > => RTE_FLOW_ITEM_TYPE_IPV6_EXT
> > >
> > > > +
> > > > +	/**
> > > > +	 * Matches ICMPv6 header.
> > >
> > > => Matches an ICMPv6 header.
> > >
> > > > +	 *
> > > > +	 * See struct rte_flow_item_icmpv6
> > >
> > > Missing "."
> > >
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_ICMPV6,
> > > > +
> > >
> > > Before entering NDP territory below, I understand those should be
> > > stacked on top of RTE_FLOW_ITEM_TYPE_ICMPV6. It's fine but for
> > > clarity they should be named after the NDP types they represent, not inner
> data fields.
> > >
> > > Also I think we should consider NDP as a protocol sitting on top of
> > > ICMPv6. We could therefore drop "ICMP" from these definitions.
> > >
> > > Since "ND" is a common shorthand for this protocol and "6" another
> > > when doing something related to IPv6, I suggest to use "ND6" to name
> > > he related pattern items.
> >
> > I agree.
> >
> > >
> > > These are the reasons behind my next suggestions:
> > >
> > > > +	/**
> > > > +	 * Match ICMPv6 target address.
> > > > +	 *
> > > > +	 * See struct rte_flow_item_icmpv6_tgt_addr.
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR,
> > >
> > > => Matches an IPv6 network discovery router solicitation.
> > > => See struct rte_flow_item_nd6_rs.
> > > => RTE_FLOW_ITEM_TYPE_ND6_RS,
> 
> By the way, I wrote "router solicitation" (RS) here but it should have been
> "neighbor solicitation" (NS) obviously.
> 
> > >
> > > You should add another item for neighbor advertisement messages
> > > using the same template:
> > >
> > > => Match an IPv6 network discovery neighbor advertisement.
> > > => See struct rte_flow_item_nd6_na.
> > > => RTE_FLOW_ITEM_TYPE_ND6_NA,
> >
> > The purpose of RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR is to match a
> "target address"
> > according to IPv6 ND spec https://tools.ietf.org/html/rfc4861#page-22,
> > when type = 135/136
> >
> > so do you mean we should have RTE_FLOW_ITEM_TYPE_ND6_NS (Neighbor
> > Solicitation)  and RTE_FLOW_ITEM_TYPE_ND6_NA (Neighbor
> Advertisement)
> > here, and with the same template (an IPV6 addr) for
> rte_flow_item_icmpv6_tgt_addr?
> 
> The rationale is that while they share a similar format, they are in fact different
> messages that applications could want to match more conveniently than
> providing ICMP type/code values. It would be done for consistency given the
> same RFC also defines router solicitation/advertisement messages.
> 
> However a problem remains since these messages are part of the ICMP format
> whose "reserved" field sometimes contains message flags, particularly with RA.
> These structures would lack that data.
> 
> Honestly your approach makes sense, but it shouldn't be possible to mix target
> addresses with RA/RS and it should be convenient to match these messages
> without specifically matching their contents.
> 
> So another suggestion would be to define new types at the ICMPv6 level to use
> directly on top of ETH for each possible message and define separate
> structures for options.
> 
> First let's drop one character here and in all other definitions in this
> patch:
> 
>  ICMPV6 => ICMP6
> 
> Then the new items would respectively be:
> 
>  RTE_FLOW_ITEM_TYPE_ICMP6
>  RTE_FLOW_ITEM_TYPE_ICMP6_ND_NA
>  RTE_FLOW_ITEM_TYPE_ICMP6_ND_NS
>  RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_SLA
>  RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_TLA
> 
> All the related structure definitions would include the ICMPv6 header part
> defined according to the RFC and except for RTE_FLOW_ITEM_TYPE_ICMP6, a
> default mask that excludes type/code since they are implicit:
> 
>  struct rte_flow_item_icmp6_nd_na {
>       uint8_t type; /**< ICMPv6 type, normally 136. */
>       uint8_t code; /**< ICMPv6 code, normally 0. */
>       rte_be16_t checksum; /**< ICMPv6 checksum. */
>       /**
>        * Router flag (1b), solicited flag (1b), override flag (1b),
>        * reserved (29b).
>        */
>       rte_be32_t rso_reserved;
>       uint8_t target[16]; /**< Target address. */  };
> 
>  static const struct rte_flow_item_icmp6_nd_na
> rte_flow_item_icmp6_nd_na_mask = {
>      .target =
>           "\xff\xff\xff\xff\xff\xff\xff\xff"
>           "\xff\xff\xff\xff\xff\xff\xff\xff",
>  };
> 
> Also notice how uint(16|32)_t were modified as rte_be(16|32)_t while there.
> 
> What's your opinion?

OK, I will take this method, it looks good, thanks 

> 
> >
> > >
> > > The following are possible options for these headers, if specified
> > > they must be found afterward. Also since IPv6 may run on top of
> > > protocols other than Ethernet, you need to clarify these link-layer
> > > addresses use the Ethernet
> > > format:
> > >
> > > > +
> > > > +	/**
> > > > +	 * Match ICMPv6 Source Link-Layer Address.
> > > > +	 *
> > > > +	 * See struct rte_flow_item_icmpv6_sll.
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_SLL,
> > >
> > > => Matches an IPv6 network discovery source Ethernet link-layer
> > > address option.
> > > => See struct rte_flow_item_nd6_opt_sla_eth.
> > > => RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH,
> > >
> > > > +
> > > > +	/**
> > > > +	 * Match ICMPv6 Target Link-Layer Address.
> > > > +	 *
> > > > +	 * See struct rte_flow_item_icmpv6_tll.
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TLL,
> > >
> > > => Matches an IPv6 network discovery target Ethernet link-layer
> > > address option.
> > > => See struct rte_flow_item_nd6_opt_tla_eth.
> > > => RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH,
> > >
> >
> > Agree to rename.
> >
> > > > +
> > >
> > > Unnecessary empty line.
> > >
> > > >  };
> > > >
> > > >  /**
> > > > @@ -815,6 +858,123 @@ static const struct rte_flow_item_geneve
> > > > rte_flow_item_geneve_mask = {  #endif
> > > >
> > > >  /**
> > > > + * RTE_FLOW_ITEM_TYPE_ARP
> > > > + *
> > > > + * Matches IPv4 ARP packet header
> > >
> > > As above:
> > >
> > > => Matches an IPv4 ARP header.
> > > => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> > >
> > > > + */
> > > > +struct rte_flow_item_arp {
> > > > +	struct arp_hdr hdr;
> > > > +};
> > >
> > > Needs #include <rte_arp.h> and a Doxygen comment next to hdr for
> > > consistency, see ICMP and other definitions.
> > >
> > > > +
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ARP. */ #ifndef
> > > > +__cplusplus static const struct rte_flow_item_arp
> rte_flow_item_arp_mask = {
> > > > +	.hdr = {
> > > > +		.arp_data = {
> > > > +			.arp_sha = {
> > > > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > > +			},
> > > > +			.arp_sip = RTE_BE32(0xffffffff),
> > > > +			.arp_tha = {
> > > > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > > +			},
> > > > +			.arp_tip = RTE_BE32(0xffffffff),
> > > > +		},
> > > > +	},
> > > > +};
> > > > +#endif
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY
> > > > + *
> > > > + * Matches any IPv6 extension header.
> > > > + */
> > > > +struct rte_flow_item_ipv6_ext_hdr_any {
> > > > +	uint8_t next_hdr;
> > > > +};
> > >
> > > So what's the point? next_hdr is already part of either struct
> > > ipv6_hdr
> > > ("proto") and individual extension headers. Moreover it's implicit
> > > if an extension header is provided in a pattern.
> > >
> > > How about removing it?
> >
> > We need this to match a packet that have extend header For example:
> > IPV6(proto = 43, <Routing EH >) / EXT_HDR (next_head = 60 <Destination EH>)
> / EXT_HDR (next_head = 44, <Fragment EH)/ EXT_HDR (next_head = 6 <tcp>) /
> TCP ...
> >
> > I use "ANY" to match any extend header regardless their content.
> > There is no conflict if we can add multiple RTE_FLOW_ITEM_EXT_HDR_XXX
> > in futures
> 
> I see, makes sense. How about doing like ICMPv6 above? Generic item uses the
> base name and can only match the generic part specifically (next_hdr), while
> specific items don't match the generic part but whatever additions their
> dedicated structures define, i.e.:
> 
>  RTE_FLOW_ITEM_TYPE_IPV6_EXT
>  RTE_FLOW_ITEM_TYPE_IPV6_EXT_HBH
>  RTE_FLOW_ITEM_TYPE_IPV6_EXT_DEST
>  RTE_FLOW_ITEM_TYPE_IPV6_EXT_RTHDR
>  RTE_FLOW_ITEM_TYPE_IPV6_EXT_FRAG
>  ...

Yes, agree.

> 
> No need to define them all if you only need EXT, this is just to describe the idea
> (it's also OK if you want to define them while you're at it).
> 
> >
> > >
> > > > +
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY. */
> > > #ifndef
> > > > +__cplusplus static const struct rte_flow_item_ipv6_ext_hdr_any
> > > > +rte_flow_item_ipv6_ext_any_mask = {
> > > > +	.next_hdr = 0xff,
> > > > +};
> > > > +#endif
> > >
> > > Ditto.
> > >
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ITEM_TYPE_ICMPV6
> > > > + *
> > > > + * Matches ICMPv6 header.
> > >
> > > => Matches an ICMPv6 header.
> > >
> > > > + */
> > > > +struct rte_flow_item_icmpv6 {
> > > > +	uint8_t type;
> > > > +	uint8_t code;
> > > > +	uint16_t checksum;
> > >
> > > The last 32-bit "reserved" data field is missing.
> > >
> > > > +};
> > >
> > > Too bad there is no struct icmp6_hdr definition in rte_icmp.h. You could
> add it.
> > > In any case Doxygen comments are missing, please add them (see other
> > > structure definitions for examples).
> 
> No need to rely on an external definition due to the above suggestions by the
> way.
> 
> > >
> > > > +
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6 */
> > >
> > > Missing "."
> > >
> > > > +#ifndef __cplusplus
> > > > +static const struct rte_flow_item_icmpv6 rte_flow_item_icmpv6_mask =
> {
> > > > +	.type = 0xff,
> > > > +	.code = 0xff,
> > > > +	.checksum = RTE_BE16(0xffff),
> > > > +};
> > > > +#endif
> > >
> > > You must remove checksum matching from the default mask. That's the
> > > last thing an application might want to match exactly :)
> > >
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > > > + *
> > > > + * Matches ICMPv6's Target Address.
> > > > + */
> > > > +struct rte_flow_item_icmpv6_tgt_addr {
> > > > +	uint8_t addr[16];
> > > > +};
> > >
> > > You need to expand this as two items, see prior comments regarding
> > > RTE_FLOW_ITEM_TYPE_ND6_RS, RTE_FLOW_ITEM_TYPE_ND6_NA and
> their
> > > respective structs rte_flow_item_nd6_rs and rte_flow_item_nd6_na.
> > >
> > > Also Doxygen documentation is missing for the addr field and you
> > > need to describe that these are only valid when used after
> > > RTE_FLOW_ITEM_TYPE_ICMPV6.
> > >
> > > > +
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR */
> > >
> > > Missing "."
> > >
> > > > +#ifndef __cplusplus
> > > > +static const
> > > > +struct rte_flow_item_icmpv6_tgt_addr
> > > rte_flow_item_icmpv6_tgt_addr_mask = {
> > > > +	.addr =
> > > > +		"\xff\xff\xff\xff\xff\xff\xff\xff"
> > > > +		"\xff\xff\xff\xff\xff\xff\xff\xff",
> > > > +};
> > > > +#endif
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ITEM_TYPE_ICPMV6_SLL.
> > > > + *
> > > > + * Matches ICMPv6 Source Link-Layer address.
> > > > + */
> > > > +struct rte_flow_item_icmpv6_sll {
> > > > +	struct ether_addr addr;
> > > > +};
> > >
> > > See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH
> and
> > > struct rte_flow_item_type_nd6_opt_sla_eth.
> > >
> > > Also Doxygen documentation is missing for the addr field and you
> > > need to describe that it is only valid when found after either
> > > RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> > >
> > > Also missing empty line here.
> > >
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_SLL */
> > >
> > > Missing "."
> > >
> > > > +#ifndef __cplusplus
> > > > +static const struct rte_flow_item_icmpv6_sll
> > > rte_flow_item_icmpv6_sll_mask = {
> > > > +	.addr = {
> > > > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > > +	}
> > > > +};
> > > > +#endif
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TLL.
> > > > + *
> > > > + * Matches ICMPv6 Target Link-Layer address.
> > > > + */
> > > > +struct rte_flow_item_icmpv6_tll {
> > > > +	struct ether_addr addr;
> > > > +};
> > >
> > > See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH
> > > and struct rte_flow_item_type_nd6_opt_tla_eth.
> > >
> > > Also Doxygen documentation is missing for the addr field and you
> > > need to describe that it is only valid when found after either
> > > RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> > >
> > > Also missing empty line here.
> > >
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TLL */
> > >
> > > Missing "."
> > >
> > > > +#ifndef __cplusplus
> > > > +static const struct rte_flow_item_icmpv6_tll
> > > rte_flow_item_icmpv6_tll_mask = {
> > > > +	.addr = {
> > > > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > > +	}
> > > > +};
> > > > +#endif
> > > > +
> > > > +/**
> > > >   * Matching pattern item definition.
> > > >   *
> > > >   * A pattern is formed by stacking items starting from the lowest
> > > > protocol
> > > > --
> > > > 2.7.4
> > > >
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 3/4] ether: add more protocol support in flow API
  2018-04-12  5:12  0%       ` Zhang, Qi Z
@ 2018-04-12  9:19  0%         ` Adrien Mazarguil
  2018-04-12 10:00  0%           ` Zhang, Qi Z
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-12  9:19 UTC (permalink / raw)
  To: Zhang, Qi Z
  Cc: dev, Doherty, Declan, Chandran, Sugesh, Glynn, Michael J, Liu,
	Yu Y, Ananyev, Konstantin, Richardson, Bruce

On Thu, Apr 12, 2018 at 05:12:08AM +0000, Zhang, Qi Z wrote:
> Hi Adrien:
> 
> 	Thank you so much for your careful review and helpful suggestions!
> 	I agree with most of your comments, except couple question about RTE_FLOW_ITEM_TYPE_TGT_ADDR and RTE_FLOW_ITEM_IPV6_EXT_HDR
> 	Please see my comment inline.
> 
> Thanks!
> Qi

Thanks, replying inline also.

> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Thursday, April 12, 2018 12:32 AM
> > To: Zhang, Qi Z <qi.z.zhang@intel.com>
> > Cc: dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>; Chandran,
> > Sugesh <sugesh.chandran@intel.com>; Glynn, Michael J
> > <michael.j.glynn@intel.com>; Liu, Yu Y <yu.y.liu@intel.com>; Ananyev,
> > Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>
> > Subject: Re: [PATCH v2 3/4] ether: add more protocol support in flow API
> > 
> > On Sun, Apr 01, 2018 at 05:19:21PM -0400, Qi Zhang wrote:
> > > Add new protocol header match support as below
> > >
> > > RTE_FLOW_ITEM_TYPE_ARP
> > > 	- match IPv4 ARP header
> > > RTE_FLOW_ITEM_TYPE_EXT_HDR_ANY
> > > 	- match any IPv6 extension header
> > 
> > While properly defined in the patch, "IPV6" is missing here.
> > 
> > > RTE_FLOW_ITEM_TYPE_ICMPV6
> > > 	- match IPv6 ICMP header
> > > RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > > 	- match IPv6 ICMP Target address
> > > RTE_FLOW_ITEM_TYPE_ICMPV6_SSL
> > > 	- match IPv6 ICMP Source Link-layer address
> > > RTE_FLOW_ITEM_TYPE_ICMPV6_TTL
> > > 	- match IPv6 ICMP Target Link-layer address
> > >
> > > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > 
> > First, since they are added at the end of enum rte_flow_item_type, no ABI
> > breakage notice is necessary.
> > 
> > However testpmd implementation [1][2] and documentation update [3][4] are
> > mandatory for all new pattern items and actions.
> 
> OK, will add this into v2.
> 
> > 
> > More comments below regarding these definitions.
> > 
> > [1] flow_item[] in app/test-pmd/config.c [2] using ITEM_ICMP as an example
> > in app/test-pmd/cmdline_flow.c [3] "Pattern items" section in
> > doc/guides/testpmd_app_ug/testpmd_funcs.rst
> > [4] using "Item: ``ICMP``" section as an example in
> >     doc/guides/prog_guide/rte_flow.rst
> > 
> > > ---
> > >  lib/librte_ether/rte_flow.h | 160
> > > ++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 160 insertions(+)
> > >
> > > diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> > > index 8f75db0..a8ec780 100644
> > > --- a/lib/librte_ether/rte_flow.h
> > > +++ b/lib/librte_ether/rte_flow.h
> > > @@ -323,6 +323,49 @@ enum rte_flow_item_type {
> > >  	 * See struct rte_flow_item_geneve.
> > >  	 */
> > >  	RTE_FLOW_ITEM_TYPE_GENEVE,
> > > +
> > > +	/**
> > > +	 * Matches ARP IPv4 header.
> > 
> > => Matches an IPv4 ARP header.
> > 
> > > +	 *
> > > +	 * See struct rte_flow_item_arp.
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_ARP,
> > 
> > While you're right to make "IPv4" clear since ARP is also used for other
> > protocols DPDK doesn't support (and likely never will), the ARP header has
> > both a fixed and a variably-sized part.
> > 
> > Ideally an ARP pattern item should match the fixed part only and a separate
> > ARP_IPV4 match its payload, somewhat like you did for ICMPv6/NDP below.
> > 
> > Problem is that in DPDK, struct arp_hdr includes struct arp_ipv4, so one
> > suggestion would be to rename this pattern item ARP_IPV4 directly:
> > 
> > => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> > 
> > > +
> > > +	/**
> > > +	 * Matches any IPv6 Extension header.
> > 
> > => Matches an IPv6 extension header.
> > 
> > > +	 *
> > > +	 * See struct rte_flow_item_ipv6_ext_any.
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY,
> > 
> > I'm not sure this definition is necessary, more below about that.
> > 
> > Also I don't see a benefit in having "ANY" part of the name, if you want to keep
> > it, I suggest the simpler:
> > 
> > => RTE_FLOW_ITEM_TYPE_IPV6_EXT
> > 
> > > +
> > > +	/**
> > > +	 * Matches ICMPv6 header.
> > 
> > => Matches an ICMPv6 header.
> > 
> > > +	 *
> > > +	 * See struct rte_flow_item_icmpv6
> > 
> > Missing "."
> > 
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_ICMPV6,
> > > +
> > 
> > Before entering NDP territory below, I understand those should be stacked on
> > top of RTE_FLOW_ITEM_TYPE_ICMPV6. It's fine but for clarity they should be
> > named after the NDP types they represent, not inner data fields.
> > 
> > Also I think we should consider NDP as a protocol sitting on top of ICMPv6. We
> > could therefore drop "ICMP" from these definitions.
> > 
> > Since "ND" is a common shorthand for this protocol and "6" another when
> > doing something related to IPv6, I suggest to use "ND6" to name he related
> > pattern items.
> 
> I agree.
> 
> > 
> > These are the reasons behind my next suggestions:
> > 
> > > +	/**
> > > +	 * Match ICMPv6 target address.
> > > +	 *
> > > +	 * See struct rte_flow_item_icmpv6_tgt_addr.
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR,
> > 
> > => Matches an IPv6 network discovery router solicitation.
> > => See struct rte_flow_item_nd6_rs.
> > => RTE_FLOW_ITEM_TYPE_ND6_RS,

By the way, I wrote "router solicitation" (RS) here but it should have been
"neighbor solicitation" (NS) obviously.

> > 
> > You should add another item for neighbor advertisement messages using the
> > same template:
> > 
> > => Match an IPv6 network discovery neighbor advertisement.
> > => See struct rte_flow_item_nd6_na.
> > => RTE_FLOW_ITEM_TYPE_ND6_NA,
> 
> The purpose of RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR is to match a "target address"
> according to IPv6 ND spec https://tools.ietf.org/html/rfc4861#page-22, when type = 135/136
> 
> so do you mean we should have RTE_FLOW_ITEM_TYPE_ND6_NS (Neighbor Solicitation)
>  and RTE_FLOW_ITEM_TYPE_ND6_NA (Neighbor Advertisement) here,
> and with the same template (an IPV6 addr) for rte_flow_item_icmpv6_tgt_addr?

The rationale is that while they share a similar format, they are in fact
different messages that applications could want to match more conveniently
than providing ICMP type/code values. It would be done for consistency given
the same RFC also defines router solicitation/advertisement messages.

However a problem remains since these messages are part of the ICMP format
whose "reserved" field sometimes contains message flags, particularly with
RA. These structures would lack that data.

Honestly your approach makes sense, but it shouldn't be possible to mix
target addresses with RA/RS and it should be convenient to match these
messages without specifically matching their contents.

So another suggestion would be to define new types at the ICMPv6 level to
use directly on top of ETH for each possible message and define separate
structures for options.

First let's drop one character here and in all other definitions in this
patch:

 ICMPV6 => ICMP6 

Then the new items would respectively be:

 RTE_FLOW_ITEM_TYPE_ICMP6
 RTE_FLOW_ITEM_TYPE_ICMP6_ND_NA
 RTE_FLOW_ITEM_TYPE_ICMP6_ND_NS
 RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_SLA
 RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_TLA

All the related structure definitions would include the ICMPv6 header part
defined according to the RFC and except for RTE_FLOW_ITEM_TYPE_ICMP6, a
default mask that excludes type/code since they are implicit:

 struct rte_flow_item_icmp6_nd_na {
      uint8_t type; /**< ICMPv6 type, normally 136. */
      uint8_t code; /**< ICMPv6 code, normally 0. */
      rte_be16_t checksum; /**< ICMPv6 checksum. */
      /**
       * Router flag (1b), solicited flag (1b), override flag (1b),
       * reserved (29b).
       */
      rte_be32_t rso_reserved;
      uint8_t target[16]; /**< Target address. */
 };

 static const struct rte_flow_item_icmp6_nd_na rte_flow_item_icmp6_nd_na_mask = {
     .target =
          "\xff\xff\xff\xff\xff\xff\xff\xff"
          "\xff\xff\xff\xff\xff\xff\xff\xff",
 };

Also notice how uint(16|32)_t were modified as rte_be(16|32)_t while there.

What's your opinion?

> 
> > 
> > The following are possible options for these headers, if specified they must be
> > found afterward. Also since IPv6 may run on top of protocols other than
> > Ethernet, you need to clarify these link-layer addresses use the Ethernet
> > format:
> > 
> > > +
> > > +	/**
> > > +	 * Match ICMPv6 Source Link-Layer Address.
> > > +	 *
> > > +	 * See struct rte_flow_item_icmpv6_sll.
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_SLL,
> > 
> > => Matches an IPv6 network discovery source Ethernet link-layer address
> > option.
> > => See struct rte_flow_item_nd6_opt_sla_eth.
> > => RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH,
> > 
> > > +
> > > +	/**
> > > +	 * Match ICMPv6 Target Link-Layer Address.
> > > +	 *
> > > +	 * See struct rte_flow_item_icmpv6_tll.
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TLL,
> > 
> > => Matches an IPv6 network discovery target Ethernet link-layer address
> > option.
> > => See struct rte_flow_item_nd6_opt_tla_eth.
> > => RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH,
> > 
> 
> Agree to rename.
> 
> > > +
> > 
> > Unnecessary empty line.
> > 
> > >  };
> > >
> > >  /**
> > > @@ -815,6 +858,123 @@ static const struct rte_flow_item_geneve
> > > rte_flow_item_geneve_mask = {  #endif
> > >
> > >  /**
> > > + * RTE_FLOW_ITEM_TYPE_ARP
> > > + *
> > > + * Matches IPv4 ARP packet header
> > 
> > As above:
> > 
> > => Matches an IPv4 ARP header.
> > => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> > 
> > > + */
> > > +struct rte_flow_item_arp {
> > > +	struct arp_hdr hdr;
> > > +};
> > 
> > Needs #include <rte_arp.h> and a Doxygen comment next to hdr for
> > consistency, see ICMP and other definitions.
> > 
> > > +
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ARP. */ #ifndef __cplusplus
> > > +static const struct rte_flow_item_arp rte_flow_item_arp_mask = {
> > > +	.hdr = {
> > > +		.arp_data = {
> > > +			.arp_sha = {
> > > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > +			},
> > > +			.arp_sip = RTE_BE32(0xffffffff),
> > > +			.arp_tha = {
> > > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > +			},
> > > +			.arp_tip = RTE_BE32(0xffffffff),
> > > +		},
> > > +	},
> > > +};
> > > +#endif
> > > +
> > > +/**
> > > + * RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY
> > > + *
> > > + * Matches any IPv6 extension header.
> > > + */
> > > +struct rte_flow_item_ipv6_ext_hdr_any {
> > > +	uint8_t next_hdr;
> > > +};
> > 
> > So what's the point? next_hdr is already part of either struct ipv6_hdr
> > ("proto") and individual extension headers. Moreover it's implicit if an
> > extension header is provided in a pattern.
> > 
> > How about removing it?
> 
> We need this to match a packet that have extend header
> For example:
> IPV6(proto = 43, <Routing EH >) / EXT_HDR (next_head = 60 <Destination EH>) / EXT_HDR (next_head = 44, <Fragment EH)/ EXT_HDR (next_head = 6 <tcp>) / TCP ...
> 
> I use "ANY" to match any extend header regardless their content.
> There is no conflict if we can add multiple RTE_FLOW_ITEM_EXT_HDR_XXX in futures

I see, makes sense. How about doing like ICMPv6 above? Generic item uses
the base name and can only match the generic part specifically (next_hdr),
while specific items don't match the generic part but whatever additions
their dedicated structures define, i.e.:

 RTE_FLOW_ITEM_TYPE_IPV6_EXT
 RTE_FLOW_ITEM_TYPE_IPV6_EXT_HBH
 RTE_FLOW_ITEM_TYPE_IPV6_EXT_DEST
 RTE_FLOW_ITEM_TYPE_IPV6_EXT_RTHDR
 RTE_FLOW_ITEM_TYPE_IPV6_EXT_FRAG
 ...

No need to define them all if you only need EXT, this is just to describe
the idea (it's also OK if you want to define them while you're at it).

> 
> > 
> > > +
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY. */
> > #ifndef
> > > +__cplusplus static const struct rte_flow_item_ipv6_ext_hdr_any
> > > +rte_flow_item_ipv6_ext_any_mask = {
> > > +	.next_hdr = 0xff,
> > > +};
> > > +#endif
> > 
> > Ditto.
> > 
> > > +
> > > +/**
> > > + * RTE_FLOW_ITEM_TYPE_ICMPV6
> > > + *
> > > + * Matches ICMPv6 header.
> > 
> > => Matches an ICMPv6 header.
> > 
> > > + */
> > > +struct rte_flow_item_icmpv6 {
> > > +	uint8_t type;
> > > +	uint8_t code;
> > > +	uint16_t checksum;
> > 
> > The last 32-bit "reserved" data field is missing.
> > 
> > > +};
> > 
> > Too bad there is no struct icmp6_hdr definition in rte_icmp.h. You could add it.
> > In any case Doxygen comments are missing, please add them (see other
> > structure definitions for examples).

No need to rely on an external definition due to the above suggestions by
the way.

> > 
> > > +
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6 */
> > 
> > Missing "."
> > 
> > > +#ifndef __cplusplus
> > > +static const struct rte_flow_item_icmpv6 rte_flow_item_icmpv6_mask = {
> > > +	.type = 0xff,
> > > +	.code = 0xff,
> > > +	.checksum = RTE_BE16(0xffff),
> > > +};
> > > +#endif
> > 
> > You must remove checksum matching from the default mask. That's the last
> > thing an application might want to match exactly :)
> > 
> > > +
> > > +/**
> > > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > > + *
> > > + * Matches ICMPv6's Target Address.
> > > + */
> > > +struct rte_flow_item_icmpv6_tgt_addr {
> > > +	uint8_t addr[16];
> > > +};
> > 
> > You need to expand this as two items, see prior comments regarding
> > RTE_FLOW_ITEM_TYPE_ND6_RS, RTE_FLOW_ITEM_TYPE_ND6_NA and their
> > respective structs rte_flow_item_nd6_rs and rte_flow_item_nd6_na.
> > 
> > Also Doxygen documentation is missing for the addr field and you need to
> > describe that these are only valid when used after
> > RTE_FLOW_ITEM_TYPE_ICMPV6.
> > 
> > > +
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR */
> > 
> > Missing "."
> > 
> > > +#ifndef __cplusplus
> > > +static const
> > > +struct rte_flow_item_icmpv6_tgt_addr
> > rte_flow_item_icmpv6_tgt_addr_mask = {
> > > +	.addr =
> > > +		"\xff\xff\xff\xff\xff\xff\xff\xff"
> > > +		"\xff\xff\xff\xff\xff\xff\xff\xff",
> > > +};
> > > +#endif
> > > +
> > > +/**
> > > + * RTE_FLOW_ITEM_TYPE_ICPMV6_SLL.
> > > + *
> > > + * Matches ICMPv6 Source Link-Layer address.
> > > + */
> > > +struct rte_flow_item_icmpv6_sll {
> > > +	struct ether_addr addr;
> > > +};
> > 
> > See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH and
> > struct rte_flow_item_type_nd6_opt_sla_eth.
> > 
> > Also Doxygen documentation is missing for the addr field and you need to
> > describe that it is only valid when found after either
> > RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> > 
> > Also missing empty line here.
> > 
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_SLL */
> > 
> > Missing "."
> > 
> > > +#ifndef __cplusplus
> > > +static const struct rte_flow_item_icmpv6_sll
> > rte_flow_item_icmpv6_sll_mask = {
> > > +	.addr = {
> > > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > +	}
> > > +};
> > > +#endif
> > > +
> > > +/**
> > > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TLL.
> > > + *
> > > + * Matches ICMPv6 Target Link-Layer address.
> > > + */
> > > +struct rte_flow_item_icmpv6_tll {
> > > +	struct ether_addr addr;
> > > +};
> > 
> > See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH
> > and struct rte_flow_item_type_nd6_opt_tla_eth.
> > 
> > Also Doxygen documentation is missing for the addr field and you need to
> > describe that it is only valid when found after either
> > RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> > 
> > Also missing empty line here.
> > 
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TLL */
> > 
> > Missing "."
> > 
> > > +#ifndef __cplusplus
> > > +static const struct rte_flow_item_icmpv6_tll
> > rte_flow_item_icmpv6_tll_mask = {
> > > +	.addr = {
> > > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > +	}
> > > +};
> > > +#endif
> > > +
> > > +/**
> > >   * Matching pattern item definition.
> > >   *
> > >   * A pattern is formed by stacking items starting from the lowest
> > > protocol
> > > --
> > > 2.7.4
> > >

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v3 0/5] introduce new tunnel types
      @ 2018-04-12  7:33  3% ` Xueming Li
  2018-04-13 11:02  3% ` [dpdk-dev] [PATCH v4 " Xueming Li
  3 siblings, 0 replies; 200+ results
From: Xueming Li @ 2018-04-12  7:33 UTC (permalink / raw)
  To: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Adrien Mazarguil
  Cc: Xueming Li, Nelio Laranjeiro, Shahaf Shuler, dev, Olivier Matz

v3:
- Change VXLAN-GPE definition order to avoid ABI compatibility issue.
v2:
- Split patch set into public and mlx5 two series, this one is the first.
v1:
- Support new tunnel type MPLS-in-GRE and MPLS-in-UDP
- Remove deprecation notes of rss level

This patchset introduced new tunnel type and related testpmd code:
- New tunnel type VXLAN-GPE
  https://datatracker.ietf.org/doc/draft-ietf-nvo3-vxlan-gpe/
- New tunnel type MPLS-in-GRE
  https://tools.ietf.org/html/rfc4023
- New tunnel type MPLS-in-UDP
  https://tools.ietf.org/html/rfc7510
- Support GRE extension in testpmd csum forwarding engine


Xueming Li (5):
  doc: remove RSS configuration change announcement
  ethdev: introduce new tunnel VXLAN-GPE
  ethdev: introduce tunnel type MPLS-in-GRE and MPLS-in-UDP
  app/testpmd: introduce new tunnel VXLAN-GPE
  app/testpmd: add more GRE extension support to csum engine

 app/test-pmd/cmdline_flow.c           |  24 ++++++++
 app/test-pmd/config.c                 |   2 +
 app/test-pmd/csumonly.c               | 103 ++++++++++++++++++++++++++++++----
 app/test-pmd/parameters.c             |  12 +++-
 app/test-pmd/testpmd.h                |   2 +
 doc/guides/prog_guide/rte_flow.rst    |  12 ++++
 doc/guides/rel_notes/deprecation.rst  |   4 --
 doc/guides/testpmd_app_ug/run_app.rst |   5 ++
 lib/librte_ether/rte_eth_ctrl.h       |   3 +-
 lib/librte_ether/rte_flow.c           |   1 +
 lib/librte_ether/rte_flow.h           |  27 +++++++++
 lib/librte_mbuf/rte_mbuf.c            |   3 +
 lib/librte_mbuf/rte_mbuf.h            |   1 +
 lib/librte_mbuf/rte_mbuf_ptype.c      |   3 +
 lib/librte_mbuf/rte_mbuf_ptype.h      |  47 ++++++++++++++++
 lib/librte_net/rte_ether.h            |  25 +++++++++
 16 files changed, 257 insertions(+), 17 deletions(-)

-- 
2.13.3

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v6 4/4] doc: add ifcvf driver document and release note
  @ 2018-04-12  7:19  3%   ` Xiao Wang
  0 siblings, 0 replies; 200+ results
From: Xiao Wang @ 2018-04-12  7:19 UTC (permalink / raw)
  To: ferruh.yigit
  Cc: dev, maxime.coquelin, zhihong.wang, tiwei.bie, jianfeng.tan,
	cunming.liang, dan.daly, thomas, gaetan.rivet, anatoly.burakov,
	hemant.agrawal, Xiao Wang

Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
---
 doc/guides/nics/features/ifcvf.ini     |  8 +++
 doc/guides/nics/ifcvf.rst              | 98 ++++++++++++++++++++++++++++++++++
 doc/guides/nics/index.rst              |  1 +
 doc/guides/rel_notes/release_18_05.rst |  9 ++++
 4 files changed, 116 insertions(+)
 create mode 100644 doc/guides/nics/features/ifcvf.ini
 create mode 100644 doc/guides/nics/ifcvf.rst

diff --git a/doc/guides/nics/features/ifcvf.ini b/doc/guides/nics/features/ifcvf.ini
new file mode 100644
index 000000000..ef1fc4711
--- /dev/null
+++ b/doc/guides/nics/features/ifcvf.ini
@@ -0,0 +1,8 @@
+;
+; Supported features of the 'ifcvf' vDPA driver.
+;
+; Refer to default.ini for the full list of available PMD features.
+;
+[Features]
+x86-32               = Y
+x86-64               = Y
diff --git a/doc/guides/nics/ifcvf.rst b/doc/guides/nics/ifcvf.rst
new file mode 100644
index 000000000..d7e76353c
--- /dev/null
+++ b/doc/guides/nics/ifcvf.rst
@@ -0,0 +1,98 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright(c) 2018 Intel Corporation.
+
+IFCVF vDPA driver
+=================
+
+The IFCVF vDPA (vhost data path acceleration) driver provides support for the
+Intel FPGA 100G VF (IFCVF). IFCVF's datapath is virtio ring compatible, it
+works as a HW vhost backend which can send/receive packets to/from virtio
+directly by DMA. Besides, it supports dirty page logging and device state
+report/restore. This driver enables its vDPA functionality with live migration
+feature.
+
+
+Pre-Installation Configuration
+------------------------------
+
+Config File Options
+~~~~~~~~~~~~~~~~~~~
+
+The following option can be modified in the ``config`` file.
+
+- ``CONFIG_RTE_LIBRTE_IFCVF_VDPA_PMD`` (default ``y`` for linux)
+
+  Toggle compilation of the ``librte_ifcvf_vdpa`` driver.
+
+
+IFCVF vDPA Implementation
+-------------------------
+
+IFCVF's vendor ID and device ID are same as that of virtio net pci device,
+with its specific subsystem vendor ID and device ID. To let the device be
+probed by IFCVF driver, adding "vdpa=1" parameter helps to specify that this
+device is to be used in vDPA mode, rather than polling mode, virtio pmd will
+skip when it detects this message.
+
+Different VF devices serve different virtio frontends which are in different
+VMs, so each VF needs to have its own DMA address translation service. During
+the driver probe a new container is created for this device, with this
+container vDPA driver can program DMA remapping table with the VM's memory
+region information.
+
+Key IFCVF vDPA driver ops
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- ifcvf_dev_config:
+  Enable VF data path with virtio information provided by vhost lib, including
+  IOMMU programming to enable VF DMA to VM's memory, VFIO interrupt setup to
+  route HW interrupt to virtio driver, create notify relay thread to translate
+  virtio driver's kick to a MMIO write onto HW, HW queues configuration.
+
+  This function gets called to set up HW data path backend when virtio driver
+  in VM gets ready.
+
+- ifcvf_dev_close:
+  Revoke all the setup in ifcvf_dev_config.
+
+  This function gets called when virtio driver stops device in VM.
+
+To create a vhost port with IFC VF
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Create a vhost socket and assign a VF's device ID to this socket via
+  vhost API. When QEMU vhost connection gets ready, the assigned VF will
+  get configured automatically.
+
+
+Features
+--------
+
+Features of the IFCVF driver are:
+
+- Compatibility with virtio 0.95 and 1.0.
+- Live migration.
+
+
+Prerequisites
+-------------
+
+- Platform with IOMMU feature. IFC VF needs address translation service to
+  Rx/Tx directly with virtio driver in VM.
+
+
+Limitations
+-----------
+
+Dependency on vfio-pci
+~~~~~~~~~~~~~~~~~~~~~~
+
+vDPA driver needs to setup VF MSIX interrupts, each queue's interrupt vector
+is mapped to a callfd associated with a virtio ring. Currently only vfio-pci
+allows multiple interrupts, so the IFCVF driver is dependent on vfio-pci.
+
+Live Migration with VIRTIO_NET_F_GUEST_ANNOUNCE
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+IFC VF doesn't support RARP packet generation, virtio frontend supporting
+VIRTIO_NET_F_GUEST_ANNOUNCE feature can help to do that.
diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst
index 51c453d9c..a294ab389 100644
--- a/doc/guides/nics/index.rst
+++ b/doc/guides/nics/index.rst
@@ -44,6 +44,7 @@ Network Interface Controller Drivers
     vmxnet3
     pcap_ring
     fail_safe
+    ifcvf
 
 **Figures**
 
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 3e1ae0cfd..1bf609f6b 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -84,6 +84,15 @@ API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Added IFCVF vDPA driver.**
+
+  Added IFCVF vDPA driver to support Intel FPGA 100G VF device. IFCVF works
+  as a HW vhost data path accelerator, it supports live migration and is
+  compatible with virtio 0.95 and 1.0. This driver registers ifcvf vDPA driver
+  to vhost lib, when virtio connected, with the help of the registered vDPA
+  driver the assigned VF gets configured to Rx/Tx directly to VM's virtio
+  vrings.
+
 
 ABI Changes
 -----------
-- 
2.15.1

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v2 3/4] ether: add more protocol support in flow API
  2018-04-11 16:32  2%     ` Adrien Mazarguil
@ 2018-04-12  5:12  0%       ` Zhang, Qi Z
  2018-04-12  9:19  0%         ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Zhang, Qi Z @ 2018-04-12  5:12 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Doherty, Declan, Chandran, Sugesh, Glynn, Michael J, Liu,
	Yu Y, Ananyev, Konstantin, Richardson, Bruce

Hi Adrien:

	Thank you so much for your careful review and helpful suggestions!
	I agree with most of your comments, except couple question about RTE_FLOW_ITEM_TYPE_TGT_ADDR and RTE_FLOW_ITEM_IPV6_EXT_HDR
	Please see my comment inline.

Thanks!
Qi

> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Thursday, April 12, 2018 12:32 AM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>
> Cc: dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>; Chandran,
> Sugesh <sugesh.chandran@intel.com>; Glynn, Michael J
> <michael.j.glynn@intel.com>; Liu, Yu Y <yu.y.liu@intel.com>; Ananyev,
> Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Subject: Re: [PATCH v2 3/4] ether: add more protocol support in flow API
> 
> On Sun, Apr 01, 2018 at 05:19:21PM -0400, Qi Zhang wrote:
> > Add new protocol header match support as below
> >
> > RTE_FLOW_ITEM_TYPE_ARP
> > 	- match IPv4 ARP header
> > RTE_FLOW_ITEM_TYPE_EXT_HDR_ANY
> > 	- match any IPv6 extension header
> 
> While properly defined in the patch, "IPV6" is missing here.
> 
> > RTE_FLOW_ITEM_TYPE_ICMPV6
> > 	- match IPv6 ICMP header
> > RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > 	- match IPv6 ICMP Target address
> > RTE_FLOW_ITEM_TYPE_ICMPV6_SSL
> > 	- match IPv6 ICMP Source Link-layer address
> > RTE_FLOW_ITEM_TYPE_ICMPV6_TTL
> > 	- match IPv6 ICMP Target Link-layer address
> >
> > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> 
> First, since they are added at the end of enum rte_flow_item_type, no ABI
> breakage notice is necessary.
> 
> However testpmd implementation [1][2] and documentation update [3][4] are
> mandatory for all new pattern items and actions.

OK, will add this into v2.

> 
> More comments below regarding these definitions.
> 
> [1] flow_item[] in app/test-pmd/config.c [2] using ITEM_ICMP as an example
> in app/test-pmd/cmdline_flow.c [3] "Pattern items" section in
> doc/guides/testpmd_app_ug/testpmd_funcs.rst
> [4] using "Item: ``ICMP``" section as an example in
>     doc/guides/prog_guide/rte_flow.rst
> 
> > ---
> >  lib/librte_ether/rte_flow.h | 160
> > ++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 160 insertions(+)
> >
> > diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> > index 8f75db0..a8ec780 100644
> > --- a/lib/librte_ether/rte_flow.h
> > +++ b/lib/librte_ether/rte_flow.h
> > @@ -323,6 +323,49 @@ enum rte_flow_item_type {
> >  	 * See struct rte_flow_item_geneve.
> >  	 */
> >  	RTE_FLOW_ITEM_TYPE_GENEVE,
> > +
> > +	/**
> > +	 * Matches ARP IPv4 header.
> 
> => Matches an IPv4 ARP header.
> 
> > +	 *
> > +	 * See struct rte_flow_item_arp.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_ARP,
> 
> While you're right to make "IPv4" clear since ARP is also used for other
> protocols DPDK doesn't support (and likely never will), the ARP header has
> both a fixed and a variably-sized part.
> 
> Ideally an ARP pattern item should match the fixed part only and a separate
> ARP_IPV4 match its payload, somewhat like you did for ICMPv6/NDP below.
> 
> Problem is that in DPDK, struct arp_hdr includes struct arp_ipv4, so one
> suggestion would be to rename this pattern item ARP_IPV4 directly:
> 
> => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> 
> > +
> > +	/**
> > +	 * Matches any IPv6 Extension header.
> 
> => Matches an IPv6 extension header.
> 
> > +	 *
> > +	 * See struct rte_flow_item_ipv6_ext_any.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY,
> 
> I'm not sure this definition is necessary, more below about that.
> 
> Also I don't see a benefit in having "ANY" part of the name, if you want to keep
> it, I suggest the simpler:
> 
> => RTE_FLOW_ITEM_TYPE_IPV6_EXT
> 
> > +
> > +	/**
> > +	 * Matches ICMPv6 header.
> 
> => Matches an ICMPv6 header.
> 
> > +	 *
> > +	 * See struct rte_flow_item_icmpv6
> 
> Missing "."
> 
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_ICMPV6,
> > +
> 
> Before entering NDP territory below, I understand those should be stacked on
> top of RTE_FLOW_ITEM_TYPE_ICMPV6. It's fine but for clarity they should be
> named after the NDP types they represent, not inner data fields.
> 
> Also I think we should consider NDP as a protocol sitting on top of ICMPv6. We
> could therefore drop "ICMP" from these definitions.
> 
> Since "ND" is a common shorthand for this protocol and "6" another when
> doing something related to IPv6, I suggest to use "ND6" to name he related
> pattern items.

I agree.

> 
> These are the reasons behind my next suggestions:
> 
> > +	/**
> > +	 * Match ICMPv6 target address.
> > +	 *
> > +	 * See struct rte_flow_item_icmpv6_tgt_addr.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR,
> 
> => Matches an IPv6 network discovery router solicitation.
> => See struct rte_flow_item_nd6_rs.
> => RTE_FLOW_ITEM_TYPE_ND6_RS,
> 
> You should add another item for neighbor advertisement messages using the
> same template:
> 
> => Match an IPv6 network discovery neighbor advertisement.
> => See struct rte_flow_item_nd6_na.
> => RTE_FLOW_ITEM_TYPE_ND6_NA,

The purpose of RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR is to match a "target address"
according to IPv6 ND spec https://tools.ietf.org/html/rfc4861#page-22, when type = 135/136

so do you mean we should have RTE_FLOW_ITEM_TYPE_ND6_NS (Neighbor Solicitation)
 and RTE_FLOW_ITEM_TYPE_ND6_NA (Neighbor Advertisement) here,
and with the same template (an IPV6 addr) for rte_flow_item_icmpv6_tgt_addr?

> 
> The following are possible options for these headers, if specified they must be
> found afterward. Also since IPv6 may run on top of protocols other than
> Ethernet, you need to clarify these link-layer addresses use the Ethernet
> format:
> 
> > +
> > +	/**
> > +	 * Match ICMPv6 Source Link-Layer Address.
> > +	 *
> > +	 * See struct rte_flow_item_icmpv6_sll.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_ICMPV6_SLL,
> 
> => Matches an IPv6 network discovery source Ethernet link-layer address
> option.
> => See struct rte_flow_item_nd6_opt_sla_eth.
> => RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH,
> 
> > +
> > +	/**
> > +	 * Match ICMPv6 Target Link-Layer Address.
> > +	 *
> > +	 * See struct rte_flow_item_icmpv6_tll.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TLL,
> 
> => Matches an IPv6 network discovery target Ethernet link-layer address
> option.
> => See struct rte_flow_item_nd6_opt_tla_eth.
> => RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH,
> 

Agree to rename.

> > +
> 
> Unnecessary empty line.
> 
> >  };
> >
> >  /**
> > @@ -815,6 +858,123 @@ static const struct rte_flow_item_geneve
> > rte_flow_item_geneve_mask = {  #endif
> >
> >  /**
> > + * RTE_FLOW_ITEM_TYPE_ARP
> > + *
> > + * Matches IPv4 ARP packet header
> 
> As above:
> 
> => Matches an IPv4 ARP header.
> => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> 
> > + */
> > +struct rte_flow_item_arp {
> > +	struct arp_hdr hdr;
> > +};
> 
> Needs #include <rte_arp.h> and a Doxygen comment next to hdr for
> consistency, see ICMP and other definitions.
> 
> > +
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_ARP. */ #ifndef __cplusplus
> > +static const struct rte_flow_item_arp rte_flow_item_arp_mask = {
> > +	.hdr = {
> > +		.arp_data = {
> > +			.arp_sha = {
> > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > +			},
> > +			.arp_sip = RTE_BE32(0xffffffff),
> > +			.arp_tha = {
> > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > +			},
> > +			.arp_tip = RTE_BE32(0xffffffff),
> > +		},
> > +	},
> > +};
> > +#endif
> > +
> > +/**
> > + * RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY
> > + *
> > + * Matches any IPv6 extension header.
> > + */
> > +struct rte_flow_item_ipv6_ext_hdr_any {
> > +	uint8_t next_hdr;
> > +};
> 
> So what's the point? next_hdr is already part of either struct ipv6_hdr
> ("proto") and individual extension headers. Moreover it's implicit if an
> extension header is provided in a pattern.
> 
> How about removing it?

We need this to match a packet that have extend header
For example:
IPV6(proto = 43, <Routing EH >) / EXT_HDR (next_head = 60 <Destination EH>) / EXT_HDR (next_head = 44, <Fragment EH)/ EXT_HDR (next_head = 6 <tcp>) / TCP ...

I use "ANY" to match any extend header regardless their content.
There is no conflict if we can add multiple RTE_FLOW_ITEM_EXT_HDR_XXX in futures

> 
> > +
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY. */
> #ifndef
> > +__cplusplus static const struct rte_flow_item_ipv6_ext_hdr_any
> > +rte_flow_item_ipv6_ext_any_mask = {
> > +	.next_hdr = 0xff,
> > +};
> > +#endif
> 
> Ditto.
> 
> > +
> > +/**
> > + * RTE_FLOW_ITEM_TYPE_ICMPV6
> > + *
> > + * Matches ICMPv6 header.
> 
> => Matches an ICMPv6 header.
> 
> > + */
> > +struct rte_flow_item_icmpv6 {
> > +	uint8_t type;
> > +	uint8_t code;
> > +	uint16_t checksum;
> 
> The last 32-bit "reserved" data field is missing.
> 
> > +};
> 
> Too bad there is no struct icmp6_hdr definition in rte_icmp.h. You could add it.
> In any case Doxygen comments are missing, please add them (see other
> structure definitions for examples).
> 
> > +
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6 */
> 
> Missing "."
> 
> > +#ifndef __cplusplus
> > +static const struct rte_flow_item_icmpv6 rte_flow_item_icmpv6_mask = {
> > +	.type = 0xff,
> > +	.code = 0xff,
> > +	.checksum = RTE_BE16(0xffff),
> > +};
> > +#endif
> 
> You must remove checksum matching from the default mask. That's the last
> thing an application might want to match exactly :)
> 
> > +
> > +/**
> > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > + *
> > + * Matches ICMPv6's Target Address.
> > + */
> > +struct rte_flow_item_icmpv6_tgt_addr {
> > +	uint8_t addr[16];
> > +};
> 
> You need to expand this as two items, see prior comments regarding
> RTE_FLOW_ITEM_TYPE_ND6_RS, RTE_FLOW_ITEM_TYPE_ND6_NA and their
> respective structs rte_flow_item_nd6_rs and rte_flow_item_nd6_na.
> 
> Also Doxygen documentation is missing for the addr field and you need to
> describe that these are only valid when used after
> RTE_FLOW_ITEM_TYPE_ICMPV6.
> 
> > +
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR */
> 
> Missing "."
> 
> > +#ifndef __cplusplus
> > +static const
> > +struct rte_flow_item_icmpv6_tgt_addr
> rte_flow_item_icmpv6_tgt_addr_mask = {
> > +	.addr =
> > +		"\xff\xff\xff\xff\xff\xff\xff\xff"
> > +		"\xff\xff\xff\xff\xff\xff\xff\xff",
> > +};
> > +#endif
> > +
> > +/**
> > + * RTE_FLOW_ITEM_TYPE_ICPMV6_SLL.
> > + *
> > + * Matches ICMPv6 Source Link-Layer address.
> > + */
> > +struct rte_flow_item_icmpv6_sll {
> > +	struct ether_addr addr;
> > +};
> 
> See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH and
> struct rte_flow_item_type_nd6_opt_sla_eth.
> 
> Also Doxygen documentation is missing for the addr field and you need to
> describe that it is only valid when found after either
> RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> 
> Also missing empty line here.
> 
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_SLL */
> 
> Missing "."
> 
> > +#ifndef __cplusplus
> > +static const struct rte_flow_item_icmpv6_sll
> rte_flow_item_icmpv6_sll_mask = {
> > +	.addr = {
> > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > +	}
> > +};
> > +#endif
> > +
> > +/**
> > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TLL.
> > + *
> > + * Matches ICMPv6 Target Link-Layer address.
> > + */
> > +struct rte_flow_item_icmpv6_tll {
> > +	struct ether_addr addr;
> > +};
> 
> See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH
> and struct rte_flow_item_type_nd6_opt_tla_eth.
> 
> Also Doxygen documentation is missing for the addr field and you need to
> describe that it is only valid when found after either
> RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> 
> Also missing empty line here.
> 
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TLL */
> 
> Missing "."
> 
> > +#ifndef __cplusplus
> > +static const struct rte_flow_item_icmpv6_tll
> rte_flow_item_icmpv6_tll_mask = {
> > +	.addr = {
> > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > +	}
> > +};
> > +#endif
> > +
> > +/**
> >   * Matching pattern item definition.
> >   *
> >   * A pattern is formed by stacking items starting from the lowest
> > protocol
> > --
> > 2.7.4
> >
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-11 11:39  0%                 ` Ananyev, Konstantin
@ 2018-04-11 17:08  0%                   ` Yongseok Koh
  2018-04-12 16:34  0%                     ` Ananyev, Konstantin
  0 siblings, 1 reply; 200+ results
From: Yongseok Koh @ 2018-04-11 17:08 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Olivier Matz, Lu, Wenzhuo, Wu, Jingjing, Adrien Mazarguil,
	Nélio Laranjeiro, dev

On Wed, Apr 11, 2018 at 11:39:47AM +0000, Ananyev, Konstantin wrote:
> 
> Hi Yongseok,
> 
> > > >
> > > > On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > > > > Hi Yongseok,
> > > > >
> > > > > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > > > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > > > > Hi,
> > > > > > >
> > > > > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > > > > rte_pktmbuf_attach_at().
> > > > > > > >
> > > > > > > > Possible use-cases could be:
> > > > > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > > > > >
> > > > > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > > > > >
> > > > > > > I think the current API is already able to do what you want.
> > > > > > >
> > > > > > > 1/ Here is a mbuf m with its data
> > > > > > >
> > > > > > >                off
> > > > > > >                <-->
> > > > > > >                       len
> > > > > > >           +----+   <---------->
> > > > > > >           |    |
> > > > > > >         +-|----v----------------------+
> > > > > > >         | |    -----------------------|
> > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > >         |      -----------------------|
> > > > > > >         +-----------------------------+
> > > > > > >
> > > > > > >
> > > > > > > 2/ clone m:
> > > > > > >
> > > > > > >   c = rte_pktmbuf_alloc(pool);
> > > > > > >   rte_pktmbuf_attach(c, m);
> > > > > > >
> > > > > > >   Note that c has its own offset and length fields.
> > > > > > >
> > > > > > >
> > > > > > >                off
> > > > > > >                <-->
> > > > > > >                       len
> > > > > > >           +----+   <---------->
> > > > > > >           |    |
> > > > > > >         +-|----v----------------------+
> > > > > > >         | |    -----------------------|
> > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > >         |      -----------------------|
> > > > > > >         +------^----------------------+
> > > > > > >                |
> > > > > > >           +----+
> > > > > > > indirect  |
> > > > > > >         +-|---------------------------+
> > > > > > >         | |    -----------------------|
> > > > > > > c       | buf  |                     ||
> > > > > > >         |      -----------------------|
> > > > > > >         +-----------------------------+
> > > > > > >
> > > > > > >                 off    len
> > > > > > >                 <--><---------->
> > > > > > >
> > > > > > >
> > > > > > > 3/ remove some data from c without changing m
> > > > > > >
> > > > > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > > > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > > > > >
> > > > > > >
> > > > > > > Please let me know if it fits your needs.
> > > > > >
> > > > > > No, it doesn't.
> > > > > >
> > > > > > Trimming head and tail with the current APIs removes data and make the space
> > > > > > available. Adjusting packet head means giving more headroom, not shifting the
> > > > > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > > > > difference offsets in m,
> > > > > >
> > > > > > rte_pktmbuf_adj(c1, 10);
> > > > > > rte_pktmbuf_adj(c2, 20);
> > > > > >
> > > > > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > > > > wants to attach outer header, it will overwrite the headroom even though the
> > > > > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > > > > header should be linked by h1->next = c2.
> > > > >
> > > > > Yes, after these operations c1, c2 and m should become read-only. So, to
> > > > > prepend headers, another mbuf has to be inserted before as you suggest. It
> > > > > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > > > > length) that will:
> > > > >   - alloc and attach indirect mbuf for each segment of m that is
> > > > >     in the range [offset : length+offset].
> > > > >   - prepend an empty and writable mbuf for the headers
> > > > >
> > > > > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > > > > which actually shrink the headroom, this case can be properly handled.
> > > > >
> > > > > What do you mean by properly handled?
> > > > >
> > > > > Yes, prepending data or adding data in the indirect mbuf won't override
> > > > > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > > > > won't be protected.
> > > > >
> > > > > From an application point of view, indirect mbufs, or direct mbufs that
> > > > > have refcnt != 1, should be both considered as read-only because they
> > > > > may share their data. How an application can know if the data is shared
> > > > > or not?
> > > > >
> > > > > Maybe we need a flag to differentiate mbufs that are read-only
> > > > > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > > > > understanding is correct, you want to have indirect mbufs with RW data.
> > > >
> > > > Agree that indirect mbuf must be treated as read-only, Then the current code is
> > > > enough to handle that use-case.
> > > >
> > > > > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > > > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > > > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > > > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > > > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > > > > indirect referencing.
> > >
> > > But just to make HW to RX multiple packets into one mbuf,
> > > data_off inside indirect mbuf should be enough, correct?
> > Right. Current max buffer len of mbuf is 64kB (16bits) but it is enough for mlx5
> > to reach to 100Gbps with 64B traffic (149Mpps). I made mlx5 HW put 16 packets in
> > a buffer. So, it needs ~32kB buffer. Having more bits in length fields would be
> > better but 16-bit is good enough to overcome the PCIe Gen3 bottleneck in order
> > to saturate the network link.
> 
> There were few complains that 64KB max is a limitation for some use-cases.
> I am not against increasing it, but I don't think we have free space on first cache-line for that
> without another big rework of mbuf layout. 
> Considering that we need to increase size for buf_len, data_off, data_len, and probably priv_size too. 
> 
> > 
> > > As I understand, what you'd like to achieve with this new field -
> > > ability to manipulate packet boundaries after RX, probably at upper layer.
> > > As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
> > > indirect mbufs trying to modify same direct buffer.
> > 
> > I agree that there's an implication that indirect mbuf or mbuf having refcnt > 1
> > is read-only. What that means, all the entities which own such mbufs have to be
> > aware of that and keep the principle as DPDK can't enforce the rule and there
> > can't be such sanity check. In this sense, HW doesn't violate it because the
> > direct mbuf is injected to HW before indirection. When packets are written by
> > HW, PMD attaches indirect mbufs to the direct mbuf and deliver those to
> > application layer with freeing the original direct mbuf (decrement refcnt by 1).
> > So, HW doesn't touch the direct buffer once it reaches to upper layer.
> 
> Yes, I understand that. But as I can see you introduced functions to adjust head and tail,
> which implies that it should be possible by some entity (upper layer?) to manipulate these
> indirect mbufs.
> And we don't know how exactly it will be done.

That's a valid concern. I can make it private by merging into the _attach_to()
func, or I just can add a comment in the API doc. However, if users are aware
that a mbuf is read-only and we expect them to keep it intact by their own
judgement, they would/should not use those APIs. We can't stop them modifying
content or the buffer itself anyway. Will add more comments of this discussion
regarding read-only mode.

> > The direct buffer will be freed and get available for reuse when all the attached
> > indirect mbufs are freed.
> > 
> > > Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
> > > Fields for indirect mbufs, straight after attach()?
> > 
> > Good point.
> > Actually that was my draft (Mellanox internal) version of this patch :-) But I
> > had to consider a case where priv_size is really given by user. Even though it
> > is less likely, but if original priv_size is quite big, it can't cover entire
> > buf_len. For this, I had to increase priv_size to 32-bit but adding another
> > 16bit field (buf_off) looked more plausible.
> 
> As I remember, we can't have mbufs bigger then 64K,
> so priv_size + buf_len should be always less than 64K, correct?

Can you let me know where I can find the constraint? I checked
rte_pktmbuf_pool_create() and rte_pktmbuf_init() again to not make any mistake
but there's no such limitation.

	elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size +
		(unsigned)data_room_size;

The max of data_room_size is 64kB, so is priv_size. m->buf_addr starts from 'm +
sizeof(*m) + priv_size' and m->buf_len can't be larger than UINT16_MAX. So,
priv_size couldn't be used for this purpose.

Yongseok

> > > > > >
> > > > > > Does this make sense?
> > > > >
> > > > > I understand the need.
> > > > >
> > > > > Another option would be to make the mbuf->buffer point to an external
> > > > > buffer (not inside the direct mbuf). This would require to add a
> > > > > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > > > > a quick overview.
> > > > >
> > > > > [1]
> > > >
> > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdpdksummit.com%2FArchive%2Fpdf%2F2016Userspace%2FDay01
> > > > -Session05-OlivierMatz-
> > > >
> > Userspace2016.pdf&data=02%7C01%7Cyskoh%40mellanox.com%7Ca5405edb36e445e6540808d59e339a38%7Ca652971c7d2e4d9ba6a4d
> > > > 149256f461b%7C0%7C0%7C636588866861082855&sdata=llw%2BwiY5cC56naOUhBbIg8TKtfFN6VZcIRY5PV7VqZs%3D&reserved=0
> > > > >
> > > > > The advantage is that it does not require the large data to be inside a
> > > > > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > > > > allocated from a mempool). On the other hand, it is maybe more complex
> > > > > to implement compared to your solution.
> > > >
> > > > I knew that you presented the slides and frankly, I had considered that option
> > > > at first. But even with that option, metadata to store refcnt should also be
> > > > allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> > > > end of the data segment. Even though it could have smaller metadata structure,
> > > > I just wanted to make full use of the existing framework because it is less
> > > > complex as you mentioned. Given that you presented the idea of external data
> > > > buffer in 2016 and there hasn't been many follow-up discussions/activities so
> > > > far, I thought the demand isn't so big yet thus I wanted to make this patch
> > > > simpler.  I personally think that we can take the idea of external data seg when
> > > > more demands come from users in the future as it would be a huge change and may
> > > > break current ABI/API. When the day comes, I'll gladly participate in the
> > > > discussions and write codes for it if I can be helpful.
> > > >
> > > > Do you think this patch is okay for now?
> > > >
> > > >
> > > > Thanks for your comments,
> > > > Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 3/4] ether: add more protocol support in flow API
  @ 2018-04-11 16:32  2%     ` Adrien Mazarguil
  2018-04-12  5:12  0%       ` Zhang, Qi Z
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-11 16:32 UTC (permalink / raw)
  To: Qi Zhang
  Cc: dev, declan.doherty, sugesh.chandran, michael.j.glynn, yu.y.liu,
	konstantin.ananyev, bruce.richardson

On Sun, Apr 01, 2018 at 05:19:21PM -0400, Qi Zhang wrote:
> Add new protocol header match support as below
> 
> RTE_FLOW_ITEM_TYPE_ARP
> 	- match IPv4 ARP header
> RTE_FLOW_ITEM_TYPE_EXT_HDR_ANY
> 	- match any IPv6 extension header

While properly defined in the patch, "IPV6" is missing here.

> RTE_FLOW_ITEM_TYPE_ICMPV6
> 	- match IPv6 ICMP header
> RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> 	- match IPv6 ICMP Target address
> RTE_FLOW_ITEM_TYPE_ICMPV6_SSL
> 	- match IPv6 ICMP Source Link-layer address
> RTE_FLOW_ITEM_TYPE_ICMPV6_TTL
> 	- match IPv6 ICMP Target Link-layer address
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>

First, since they are added at the end of enum rte_flow_item_type, no ABI
breakage notice is necessary.

However testpmd implementation [1][2] and documentation update [3][4] are
mandatory for all new pattern items and actions.

More comments below regarding these definitions.

[1] flow_item[] in app/test-pmd/config.c
[2] using ITEM_ICMP as an example in app/test-pmd/cmdline_flow.c
[3] "Pattern items" section in doc/guides/testpmd_app_ug/testpmd_funcs.rst
[4] using "Item: ``ICMP``" section as an example in
    doc/guides/prog_guide/rte_flow.rst

> ---
>  lib/librte_ether/rte_flow.h | 160 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 160 insertions(+)
> 
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 8f75db0..a8ec780 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -323,6 +323,49 @@ enum rte_flow_item_type {
>  	 * See struct rte_flow_item_geneve.
>  	 */
>  	RTE_FLOW_ITEM_TYPE_GENEVE,
> +
> +	/**
> +	 * Matches ARP IPv4 header.

=> Matches an IPv4 ARP header.

> +	 *
> +	 * See struct rte_flow_item_arp.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_ARP,

While you're right to make "IPv4" clear since ARP is also used for other
protocols DPDK doesn't support (and likely never will), the ARP header has
both a fixed and a variably-sized part.

Ideally an ARP pattern item should match the fixed part only and a separate
ARP_IPV4 match its payload, somewhat like you did for ICMPv6/NDP below.

Problem is that in DPDK, struct arp_hdr includes struct arp_ipv4, so one
suggestion would be to rename this pattern item ARP_IPV4 directly:

=> RTE_FLOW_ITEM_TYPE_ARP_IPV4

> +
> +	/**
> +	 * Matches any IPv6 Extension header.

=> Matches an IPv6 extension header.

> +	 *
> +	 * See struct rte_flow_item_ipv6_ext_any.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY,

I'm not sure this definition is necessary, more below about that.

Also I don't see a benefit in having "ANY" part of the name, if you want to
keep it, I suggest the simpler:

=> RTE_FLOW_ITEM_TYPE_IPV6_EXT

> +
> +	/**
> +	 * Matches ICMPv6 header.

=> Matches an ICMPv6 header.

> +	 *
> +	 * See struct rte_flow_item_icmpv6

Missing "."

> +	 */
> +	RTE_FLOW_ITEM_TYPE_ICMPV6,
> +

Before entering NDP territory below, I understand those should be stacked on
top of RTE_FLOW_ITEM_TYPE_ICMPV6. It's fine but for clarity they should be
named after the NDP types they represent, not inner data fields.

Also I think we should consider NDP as a protocol sitting on top of
ICMPv6. We could therefore drop "ICMP" from these definitions.

Since "ND" is a common shorthand for this protocol and "6" another when
doing something related to IPv6, I suggest to use "ND6" to name he related
pattern items.

These are the reasons behind my next suggestions:

> +	/**
> +	 * Match ICMPv6 target address.
> +	 *
> +	 * See struct rte_flow_item_icmpv6_tgt_addr.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR,

=> Matches an IPv6 network discovery router solicitation.
=> See struct rte_flow_item_nd6_rs.
=> RTE_FLOW_ITEM_TYPE_ND6_RS,

You should add another item for neighbor advertisement messages using the
same template:

=> Match an IPv6 network discovery neighbor advertisement.
=> See struct rte_flow_item_nd6_na.
=> RTE_FLOW_ITEM_TYPE_ND6_NA,

The following are possible options for these headers, if specified they must
be found afterward. Also since IPv6 may run on top of protocols other than
Ethernet, you need to clarify these link-layer addresses use the Ethernet
format:

> +
> +	/**
> +	 * Match ICMPv6 Source Link-Layer Address.
> +	 *
> +	 * See struct rte_flow_item_icmpv6_sll.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_ICMPV6_SLL,

=> Matches an IPv6 network discovery source Ethernet link-layer address option.
=> See struct rte_flow_item_nd6_opt_sla_eth.
=> RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH,

> +
> +	/**
> +	 * Match ICMPv6 Target Link-Layer Address.
> +	 *
> +	 * See struct rte_flow_item_icmpv6_tll.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_ICMPV6_TLL,

=> Matches an IPv6 network discovery target Ethernet link-layer address option.
=> See struct rte_flow_item_nd6_opt_tla_eth.
=> RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH,

> +

Unnecessary empty line.

>  };
>  
>  /**
> @@ -815,6 +858,123 @@ static const struct rte_flow_item_geneve rte_flow_item_geneve_mask = {
>  #endif
>  
>  /**
> + * RTE_FLOW_ITEM_TYPE_ARP
> + *
> + * Matches IPv4 ARP packet header

As above:

=> Matches an IPv4 ARP header.
=> RTE_FLOW_ITEM_TYPE_ARP_IPV4

> + */
> +struct rte_flow_item_arp {
> +	struct arp_hdr hdr;
> +};

Needs #include <rte_arp.h> and a Doxygen comment next to hdr for
consistency, see ICMP and other definitions.

> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_ARP. */
> +#ifndef __cplusplus
> +static const struct rte_flow_item_arp rte_flow_item_arp_mask = {
> +	.hdr = {
> +		.arp_data = {
> +			.arp_sha = {
> +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> +			},
> +			.arp_sip = RTE_BE32(0xffffffff),
> +			.arp_tha = {
> +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> +			},
> +			.arp_tip = RTE_BE32(0xffffffff),
> +		},
> +	},
> +};
> +#endif
> +
> +/**
> + * RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY
> + *
> + * Matches any IPv6 extension header.
> + */
> +struct rte_flow_item_ipv6_ext_hdr_any {
> +	uint8_t next_hdr;
> +};

So what's the point? next_hdr is already part of either struct ipv6_hdr
("proto") and individual extension headers. Moreover it's implicit if an
extension header is provided in a pattern.

How about removing it?

> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY. */
> +#ifndef __cplusplus
> +static const
> +struct rte_flow_item_ipv6_ext_hdr_any rte_flow_item_ipv6_ext_any_mask = {
> +	.next_hdr = 0xff,
> +};
> +#endif

Ditto.

> +
> +/**
> + * RTE_FLOW_ITEM_TYPE_ICMPV6
> + *
> + * Matches ICMPv6 header.

=> Matches an ICMPv6 header.

> + */
> +struct rte_flow_item_icmpv6 {
> +	uint8_t type;
> +	uint8_t code;
> +	uint16_t checksum;

The last 32-bit "reserved" data field is missing.

> +};

Too bad there is no struct icmp6_hdr definition in rte_icmp.h. You could add
it. In any case Doxygen comments are missing, please add them (see other
structure definitions for examples).

> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6 */

Missing "."

> +#ifndef __cplusplus
> +static const struct rte_flow_item_icmpv6 rte_flow_item_icmpv6_mask = {
> +	.type = 0xff,
> +	.code = 0xff,
> +	.checksum = RTE_BE16(0xffff),
> +};
> +#endif

You must remove checksum matching from the default mask. That's the last
thing an application might want to match exactly :)

> +
> +/**
> + * RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> + *
> + * Matches ICMPv6's Target Address.
> + */
> +struct rte_flow_item_icmpv6_tgt_addr {
> +	uint8_t addr[16];
> +};

You need to expand this as two items, see prior comments regarding
RTE_FLOW_ITEM_TYPE_ND6_RS, RTE_FLOW_ITEM_TYPE_ND6_NA and their respective
structs rte_flow_item_nd6_rs and rte_flow_item_nd6_na.

Also Doxygen documentation is missing for the addr field and you need to
describe that these are only valid when used after
RTE_FLOW_ITEM_TYPE_ICMPV6.

> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR */

Missing "."

> +#ifndef __cplusplus
> +static const
> +struct rte_flow_item_icmpv6_tgt_addr rte_flow_item_icmpv6_tgt_addr_mask = {
> +	.addr =
> +		"\xff\xff\xff\xff\xff\xff\xff\xff"
> +		"\xff\xff\xff\xff\xff\xff\xff\xff",
> +};
> +#endif
> +
> +/**
> + * RTE_FLOW_ITEM_TYPE_ICPMV6_SLL.
> + *
> + * Matches ICMPv6 Source Link-Layer address.
> + */
> +struct rte_flow_item_icmpv6_sll {
> +	struct ether_addr addr;
> +};

See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH and struct
rte_flow_item_type_nd6_opt_sla_eth.

Also Doxygen documentation is missing for the addr field and you need to
describe that it is only valid when found after either
RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.

Also missing empty line here.

> +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_SLL */

Missing "."

> +#ifndef __cplusplus
> +static const struct rte_flow_item_icmpv6_sll rte_flow_item_icmpv6_sll_mask = {
> +	.addr = {
> +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> +	}
> +};
> +#endif
> +
> +/**
> + * RTE_FLOW_ITEM_TYPE_ICMPV6_TLL.
> + *
> + * Matches ICMPv6 Target Link-Layer address.
> + */
> +struct rte_flow_item_icmpv6_tll {
> +	struct ether_addr addr;
> +};

See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH and struct
rte_flow_item_type_nd6_opt_tla_eth.

Also Doxygen documentation is missing for the addr field and you need to
describe that it is only valid when found after either
RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.

Also missing empty line here.

> +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TLL */

Missing "."

> +#ifndef __cplusplus
> +static const struct rte_flow_item_icmpv6_tll rte_flow_item_icmpv6_tll_mask = {
> +	.addr = {
> +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> +	}
> +};
> +#endif
> +
> +/**
>   * Matching pattern item definition.
>   *
>   * A pattern is formed by stacking items starting from the lowest protocol
> -- 
> 2.7.4
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v2 2/4] ether: add flow last hit query support
  @ 2018-04-11 16:31  3%     ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-11 16:31 UTC (permalink / raw)
  To: Qi Zhang
  Cc: dev, declan.doherty, sugesh.chandran, michael.j.glynn, yu.y.liu,
	konstantin.ananyev, bruce.richardson

On Sun, Apr 01, 2018 at 05:19:20PM -0400, Qi Zhang wrote:
> Enhanced the action RTE_FLOW_TYPE_ACTION_COUNT, number of
> milliseconds since last hit can be queried.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>

Please confirm whether existing devices have the ability to report time
elapsed since last hit, or if PMDs are supposed to take care of that
entirely on their own in software?

If the latter, I suggest to drop this patch and let applications check
counters regularly on their own. Unlike applications, PMDs do not easily
have access to a reliable time source.

Otherwise, the patch looks acceptable but I can't tell if milliseconds are
the right unit for such information. Same issue as mbuf timestamps [1]
basically. As a 64-bit field, a precision down to the nanosecond is a
possibility so perhaps like mbufs, the reference and precision should be
undefined in the API in order to be processed by a PMD callback?

More comments below.

[1] commit 918ae9dc775e ("mbuf: add a timestamp field")

> ---
>  lib/librte_ether/rte_flow.h | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 1080086..8f75db0 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -1054,9 +1054,11 @@ struct rte_flow_query_count {
>  	uint32_t reset:1; /**< Reset counters after query [in]. */
>  	uint32_t hits_set:1; /**< hits field is set [out]. */
>  	uint32_t bytes_set:1; /**< bytes field is set [out]. */
> +	uint32_t last_hit_set:1; /**< last_hit field is set [out]. */
>  	uint32_t reserved:29; /**< Reserved, must be zero [in, out]. */

You need to decrement reserved bits.

>  	uint64_t hits; /**< Number of hits for this rule [out]. */
>  	uint64_t bytes; /**< Number of bytes through this rule [out]. */
> +	uint64_t last_hit; /**< Number of milliseconds since last hit [out]. */
>  };

Doing so impacts ABI compatibility. While normally frowned upon for
rte_flow, it's OK for 18.05 because we already destroyed it. You still need
to mention what functions are impacted by this change as in "ethdev: add
encap level to RSS flow API action" [2] and update the .map files where
necessary.

In this case at least rte_flow_query() is impacted.

Please update doc/guides/prog_guide/rte_flow.rst as well (look for "COUNT
query").

[2] http://dpdk.org/ml/archives/dev/2018-April/096531.html

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in flow API
  2018-04-09 14:42  0%       ` Adrien Mazarguil
@ 2018-04-11 13:21  0%         ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 13:21 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Xueming Li, Wenzhuo Lu,
	Jingjing Wu, Beilei Xing, Qi Zhang, Konstantin Ananyev,
	Nelio Laranjeiro, Yongseok Koh, Pascal Mazon, Radu Nicolau,
	Akhil Goyal, Ivan Malov

On 04/09/2018 05:42 PM, Adrien Mazarguil wrote:
> On Sat, Apr 07, 2018 at 12:05:51PM +0300, Andrew Rybchenko wrote:
>> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
>>> Since its inception, the rte_flow RSS action has been relying in part on
>>> external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
>>> This structure lacks parameters such as the hash algorithm to use, and more
>>> recently, a method to tell which layer RSS should be performed on [1].
>>>
>>> Given struct rte_eth_rss_conf will never be flexible enough to represent a
>>> complete RSS configuration (e.g. RETA table), this patch supersedes it by
>>> extending the rte_flow RSS action directly.
>>>
>>> A subsequent patch will add a field to use a non-default RSS hash
>>> algorithm. To that end, a field named "types" replaces the field formerly
>>> known as "rss_hf" and standing for "RSS hash functions" as it was
>>> confusing. Actual RSS hash function types are defined by enum
>>> rte_eth_hash_function.
>>> This patch updates all PMDs and example applications accordingly.
>>>
>>> It breaks ABI compatibility for the following public functions:
>>>
>>> - rte_flow_copy()
>>> - rte_flow_create()
>>> - rte_flow_query()
>>> - rte_flow_validate()
>>>
>>> [1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
>>>       configuration")
>>>
>>> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
>>> Cc: Xueming Li <xuemingl@mellanox.com>
>>> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
>>> Cc: Thomas Monjalon <thomas@monjalon.net>
>>> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
>>> Cc: Jingjing Wu <jingjing.wu@intel.com>
>>> Cc: Beilei Xing <beilei.xing@intel.com>
>>> Cc: Qi Zhang <qi.z.zhang@intel.com>
>>> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
>>> Cc: Yongseok Koh <yskoh@mellanox.com>
>>> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
>>> Cc: Pascal Mazon <pascal.mazon@6wind.com>
>>> Cc: Radu Nicolau <radu.nicolau@intel.com>
>>> Cc: Akhil Goyal <akhil.goyal@nxp.com>
>>> ---
>>>    app/test-pmd/cmdline_flow.c        |  59 +++++-----
>>>    app/test-pmd/config.c              |  39 +++----
>>>    doc/guides/prog_guide/rte_flow.rst |  22 ++--
>>>    drivers/net/e1000/e1000_ethdev.h   |  13 ++-
>>>    drivers/net/e1000/igb_ethdev.c     |   4 +-
>>>    drivers/net/e1000/igb_flow.c       |  31 ++---
>>>    drivers/net/e1000/igb_rxtx.c       |  51 +++++++--
>>>    drivers/net/i40e/i40e_ethdev.c     |  53 +++++++--
>>>    drivers/net/i40e/i40e_ethdev.h     |  15 ++-
>>>    drivers/net/i40e/i40e_flow.c       |  47 ++++----
>>>    drivers/net/ixgbe/ixgbe_ethdev.c   |   4 +-
>>>    drivers/net/ixgbe/ixgbe_ethdev.h   |  13 ++-
>>>    drivers/net/ixgbe/ixgbe_flow.c     |  30 ++---
>>>    drivers/net/ixgbe/ixgbe_rxtx.c     |  51 +++++++--
>>>    drivers/net/mlx4/mlx4.c            |   2 +-
>>>    drivers/net/mlx4/mlx4_flow.c       |  61 +++++-----
>>>    drivers/net/mlx4/mlx4_flow.h       |   2 +-
>>>    drivers/net/mlx4/mlx4_rxq.c        |   2 +-
>>>    drivers/net/mlx4/mlx4_rxtx.h       |   2 +-
>>>    drivers/net/mlx5/mlx5_flow.c       | 193 +++++++++++++++-----------------
>>>    drivers/net/mlx5/mlx5_rxq.c        |  22 ++--
>>>    drivers/net/mlx5/mlx5_rxtx.h       |  26 +++--
>>>    drivers/net/sfc/sfc_flow.c         |  21 ++--
>>>    drivers/net/tap/tap_flow.c         |   8 +-
>>>    examples/ipsec-secgw/ipsec.c       |  10 +-
>>>    lib/librte_ether/rte_flow.c        |  39 +++----
>>>    lib/librte_ether/rte_flow.h        |   6 +-
>>>    27 files changed, 473 insertions(+), 353 deletions(-)
>> <...>
>>
>>> diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
>>> index 056405515..1a2c0299c 100644
>>> --- a/drivers/net/sfc/sfc_flow.c
>>> +++ b/drivers/net/sfc/sfc_flow.c
>>> @@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>>>    	struct sfc_rxq *rxq;
>>>    	unsigned int rxq_hw_index_min;
>>>    	unsigned int rxq_hw_index_max;
>>> -	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
>>> -	uint64_t rss_hf;
>>> -	uint8_t *rss_key = NULL;
>>> +	const uint8_t *rss_key;
>>>    	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
>>>    	unsigned int i;
>>> -	if (rss->num == 0)
>>> +	if (rss->queue_num == 0)
>>>    		return -EINVAL;
>>>    	rxq_sw_index = sa->rxq_count - 1;
>>> @@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>>>    	rxq_hw_index_min = rxq->hw_index;
>>>    	rxq_hw_index_max = 0;
>>> -	for (i = 0; i < rss->num; ++i) {
>>> +	for (i = 0; i < rss->queue_num; ++i) {
>>>    		rxq_sw_index = rss->queue[i];
>>>    		if (rxq_sw_index >= sa->rxq_count)
>>> @@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>>>    			rxq_hw_index_max = rxq->hw_index;
>>>    	}
>>> -	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;
>> Here we had a fallback to default rss_hf (now types) if rss_conf is
>> unspecified.
> Thing is, rss_action->conf was never supposed to be NULL in the first
> place. Crashing on a NULL configuration has always been fine, but until
> recently prevented validation with testpmd's broken implementation. This
> problem was addressed in a prior series [1][2][3].
>
> Since a value is now always provided, no need for a fallback.

testpmd is not the only application. But in any case I agree that it was
possible have rss_hf==0 before. So, no big changes.

> [1] "app/testpmd: fix lack of flow action configuration"
>      http://dpdk.org/ml/archives/dev/2018-April/095280.html
> [2] "app/testpmd: fix RSS flow action configuration"
>      http://dpdk.org/ml/archives/dev/2018-April/095281.html
> [3] "app/testpmd: fix missing RSS fields in flow action"
>      http://dpdk.org/ml/archives/dev/2018-April/095282.html
>
>>> -	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
>>> +	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
>>>    		return -EINVAL;
>>> -	if (rss_conf != NULL) {
>>> -		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
>>> +	if (rss->key_len) {
>>> +		if (rss->key_len != sizeof(sa->rss_key))
>>>    			return -EINVAL;
>>> -		rss_key = rss_conf->rss_key;
>>> +		rss_key = rss->key;
>>>    	} else {
>>>    		rss_key = sa->rss_key;
>>>    	}
>>> @@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>>>    	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
>>>    	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
>>> -	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
>>> +	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
>> Now types go directly to mapping function and unspecified types (0)
>> will result in 0 rss_hash_types. Of course, it is a question how to treat
>> types==0. It is possible to say that it no RSS, but it does not make sense.
>> So, real options are device defaults (regardless configured on device level)
>> or device config (rx_adv.conf.rss_conf.rss_hf). I would prefer the later.
>> Please, document the intended behaviour in rte_flow.rst.
> Granted the existing documentation doesn't say much on that topic, but a 0
> value for rss_hf does actually mean "no RSS" [4]:
>
>   "The *rss_hf* field of the *rss_conf* structure indicates the different
>    types of IPv4/IPv6 packets to which the RSS hashing must be applied.
>    Supplying an *rss_hf* equal to zero disables the RSS feature."
>
> Now since this action doesn't use struct rte_eth_rss_conf anymore, we could
> define 0 as a PMD-specific behavior, which could be no RSS. It would make
> the API easier to use for applications that don't care about the RSS
> capabilities of each underlying adapter, 0 would just work everywhere as a
> safe default.

PMD-specific is fine with some limits. It should be either device RSS 
config or
device defaults. I think it is bad idea to allow types=0 disable RSS as 
an option
of the PMD-specific behaviour.

> [4] https://dpdk.org/doc/api/structrte__eth__rss__conf.html
>
>> If the later is chosen, above we'll have a bug since fallback to fixed
>> default.
>> Just use sa->rss_hash_types as fallback. Something like:
>> if (rss->types)
>>      sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
>> else
>>      sfc_rss_conf->rss_hash_types =sa->rss_hash_types;
> Looks like the previous code didn't provide a fallback when rss_hf was 0,
> only when rss_conf itself was NULL. So this is not a new issue introduced by
> this patch.

Yes, I agree.

> I will update documentation to define 0 as described above for the
> convenience of application writers and leave the existing code in place.
> PMD maintainers will be free to enhance it as they wish later.
> Just remember testpmd now always provides a default value for it after
> querying the device [2].

Many thanks.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in flow API
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
@ 2018-04-11 13:06  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 13:06 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh, Pascal Mazon,
	Radu Nicolau, Akhil Goyal

On 04/10/2018 07:36 PM, Adrien Mazarguil wrote:
> Since its inception, the rte_flow RSS action has been relying in part on
> external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
> This structure lacks parameters such as the hash algorithm to use, and more
> recently, a method to tell which layer RSS should be performed on [1].
>
> Given struct rte_eth_rss_conf will never be flexible enough to represent a
> complete RSS configuration (e.g. RETA table), this patch supersedes it by
> extending the rte_flow RSS action directly.
>
> A subsequent patch will add a field to use a non-default RSS hash
> algorithm. To that end, a field named "types" replaces the field formerly
> known as "rss_hf" and standing for "RSS hash functions" as it was
> confusing. Actual RSS hash function types are defined by enum
> rte_eth_hash_function.
>
> This patch updates all PMDs and example applications accordingly.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> [1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
>      configuration")
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Xueming Li <xuemingl@mellanox.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
> Cc: Radu Nicolau <radu.nicolau@intel.com>
> Cc: Akhil Goyal <akhil.goyal@nxp.com>
>
> ---
>
> v3 changes:
>
> Documentation update regarding the meaning of a 0 value for RSS types in
> flow rules.
>
> It used to implicitly mean "no RSS" but is redefined as requesting a kind
> of "best-effort" mode from PMDs, i.e. anything ranging from empty to
> all-inclusive RSS; what matters is it provides safe defaults that will work
> regardless of PMD capabilities.
> ---
>   app/test-pmd/cmdline_flow.c                 |  48 +++---
>   app/test-pmd/config.c                       |  39 ++---
>   doc/guides/prog_guide/rte_flow.rst          |  28 ++--
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |   6 +-
>   drivers/net/e1000/e1000_ethdev.h            |  13 +-
>   drivers/net/e1000/igb_ethdev.c              |   4 +-
>   drivers/net/e1000/igb_flow.c                |  31 ++--
>   drivers/net/e1000/igb_rxtx.c                |  51 +++++-
>   drivers/net/i40e/i40e_ethdev.c              |  53 +++++--
>   drivers/net/i40e/i40e_ethdev.h              |  15 +-
>   drivers/net/i40e/i40e_flow.c                |  57 ++++---
>   drivers/net/ixgbe/ixgbe_ethdev.c            |   4 +-
>   drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
>   drivers/net/ixgbe/ixgbe_flow.c              |  30 ++--
>   drivers/net/ixgbe/ixgbe_rxtx.c              |  51 +++++-
>   drivers/net/mlx4/mlx4.c                     |   2 +-
>   drivers/net/mlx4/mlx4_flow.c                |  61 +++----
>   drivers/net/mlx4/mlx4_flow.h                |   2 +-
>   drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
>   drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
>   drivers/net/mlx5/mlx5_flow.c                | 193 +++++++++++------------
>   drivers/net/mlx5/mlx5_rxq.c                 |  22 +--
>   drivers/net/mlx5/mlx5_rxtx.h                |  26 +--
>   drivers/net/sfc/sfc_flow.c                  |  21 ++-
>   drivers/net/tap/tap_flow.c                  |   8 +-
>   examples/ipsec-secgw/ipsec.c                |  10 +-
>   lib/librte_ether/rte_flow.c                 |  39 ++---
>   lib/librte_ether/rte_flow.h                 |  12 +-
>   28 files changed, 484 insertions(+), 359 deletions(-)

Generic and net/sfc;
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and action to flow API
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and " Adrien Mazarguil
@ 2018-04-11 13:02  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 13:02 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Zhang, Qi Z, Declan Doherty

On 04/10/2018 07:37 PM, Adrien Mazarguil wrote:
> RTE_FLOW_ACTION_TYPE_PORT_ID brings the ability to inject matching traffic
> into a different device, as identified by its DPDK port ID.
>
> This is normally only supported when the target port ID has some kind of
> relationship with the port ID the flow rule is created against, such as
> being exposed by a common physical device (e.g. a different port of an
> Ethernet switch).
>
> The converse pattern item, RTE_FLOW_ITEM_TYPE_PORT_ID, makes the resulting
> flow rule match traffic whose origin is the specified port ID. Note that
> specifying a port ID that differs from the one the flow rule is created
> against is normally meaningless (if even accepted), but can make sense if
> combined with the transfer attribute.
>
> These must not be confused with their PHY_PORT counterparts, which refer to
> physical ports using device-specific indices, but unlike PORT_ID are not
> necessarily tied to DPDK port IDs.
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Reviewed-by: Qi Zhang <qi.z.zhang@intel.com>
> Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
> Cc: Declan Doherty <declan.doherty@intel.com>
>
> ---
>
> This patch provides the same functionality and supersedes Qi Zhang's
> "ether: add flow action to redirect packet to a port" [1].
>
> The main differences are:
>
> - Action is named PORT_ID instead of PORT.
> - Addition of a PORT_ID pattern item.
> - More extensive documentation.
> - Testpmd support.
> - rte_flow_copy() support.
>
> [1] http://dpdk.org/ml/archives/dev/2018-April/094648.html
> ---
>   app/test-pmd/cmdline_flow.c                 | 57 ++++++++++++++++++++++++
>   app/test-pmd/config.c                       |  2 +
>   doc/guides/prog_guide/rte_flow.rst          | 48 ++++++++++++++++++++
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  9 ++++
>   lib/librte_ether/rte_flow.c                 |  2 +
>   lib/librte_ether/rte_flow.h                 | 56 +++++++++++++++++++++++
>   6 files changed, 174 insertions(+)

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to flow API
  2018-04-10 16:37  3%     ` [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to " Adrien Mazarguil
@ 2018-04-11 13:00  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 13:00 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z

On 04/10/2018 07:37 PM, Adrien Mazarguil wrote:
> This patch adds the missing action counterpart to the PHY_PORT pattern
> item, that is, the ability to directly inject matching traffic into a
> physical port of the underlying device.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
>   app/test-pmd/config.c                       |  1 +
>   doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
>   lib/librte_ether/rte_flow.c                 |  1 +
>   lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
>   6 files changed, 84 insertions(+)

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item in flow API
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item " Adrien Mazarguil
@ 2018-04-11 12:57  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 12:57 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev

On 04/10/2018 07:37 PM, Adrien Mazarguil wrote:
> While RTE_FLOW_ITEM_TYPE_PORT refers to physical ports of the underlying
> device using specific identifiers, these are often confused with DPDK port
> IDs exposed to applications in the global name space.
>
> Since this pattern item is seldom used, rename it RTE_FLOW_ITEM_PHY_PORT
> for better clarity.
>
> No ABI impact.
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 27 +++++++++++----------
>   app/test-pmd/config.c                       |  2 +-
>   doc/guides/prog_guide/rte_flow.rst          | 22 ++++++++---------
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 +-
>   lib/librte_ether/rte_flow.c                 |  2 +-
>   lib/librte_ether/rte_flow.h                 | 31 ++++++++++--------------
>   6 files changed, 41 insertions(+), 45 deletions(-)

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
@ 2018-04-11 12:45  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 12:45 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Ajit Khaparde, Somnath Kotur,
	John Daley, Hyong Youb Kim, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Tomasz Duszynski, Dmitri Epshtein, Natalie Samsonov, Jianbo Liu,
	Pascal Mazon

On 04/10/2018 07:36 PM, Adrien Mazarguil wrote:
> TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
> consistent with the normal stacking order of pattern items, which is
> confusing to applications.
>
> Problem is that when followed by one of these layers, the EtherType field
> of the preceding layer keeps its "inner" definition, and the "outer" TPID
> is provided by the subsequent layer, the reverse of how a packet looks like
> on the wire:
>
>   Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
>   rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]
>
> Worse, when QinQ is involved, the stacking order of VLAN layers is
> unspecified. It is unclear whether it should be reversed (innermost to
> outermost) as well given TPID applies to the previous layer:
>
>   Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
>   rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
>   rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]
>
> While specifying EtherType/TPID is hopefully rarely necessary, the stacking
> order in case of QinQ and the lack of documentation remain an issue.
>
> This patch replaces TPID in the VLAN pattern item with an inner
> EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
> clarifies documentation and updates all relevant code.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
> items:
>
> - bnxt: EtherType matching is supported with and without VLAN, but TPID
>    matching is not and triggers an error.
>
> - e1000: EtherType matching is only supported with the ETHERTYPE filter,
>    which does not support VLAN matching, therefore no impact.
>
> - enic: same as bnxt.
>
> - i40e: same as bnxt with existing FDIR limitations on allowed EtherType
>    values. The remaining filter types (VXLAN, NVGRE, QINQ) do not support
>    EtherType matching.
>
> - ixgbe: same as e1000, with additional minor change to rely on the new
>    E-Tag macro definition.
>
> - mlx4: EtherType/TPID matching is not supported, no impact.
>
> - mlx5: same as bnxt.
>
> - mvpp2: same as bnxt.
>
> - sfc: same as bnxt.
>
> - tap: same as bnxt.
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> Cc: Somnath Kotur <somnath.kotur@broadcom.com>
> Cc: John Daley <johndale@cisco.com>
> Cc: Hyong Youb Kim <hyonkim@cisco.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Tomasz Duszynski <tdu@semihalf.com>
> Cc: Dmitri Epshtein <dima@marvell.com>
> Cc: Natalie Samsonov <nsamsono@marvell.com>
> Cc: Jianbo Liu <jianbo.liu@arm.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
>
> ---
>
> v3 changes:
>
> Updated mrvl to mvpp2.
>
> Moved unrelated default TCI mask update to separate patch.
>
> Fixed sfc according to Andrew's comments [1], which made so much sense that
> I standardized on the same behavior for all other PMDs: matching outer TPID
> is never supported when a VLAN pattern item is present.
>
> This is done because many devices accept several TPIDs but do not provide
> means to match a given one explicitly, it's all or nothing, and that makes
> the resulting flow rule inaccurate.
>
> [1] http://dpdk.org/ml/archives/dev/2018-April/095870.html
> ---
>   app/test-pmd/cmdline_flow.c                 | 17 +++----
>   doc/guides/nics/tap.rst                     |  2 +-
>   doc/guides/prog_guide/rte_flow.rst          | 19 ++++++--
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
>   drivers/net/bnxt/bnxt_filter.c              | 35 +++++++++++---
>   drivers/net/enic/enic_flow.c                | 19 +++++---
>   drivers/net/i40e/i40e_flow.c                | 60 ++++++++++++++++++++----
>   drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
>   drivers/net/mlx5/mlx5_flow.c                | 13 ++++-
>   drivers/net/mvpp2/mrvl_flow.c               | 26 +++++++---
>   drivers/net/sfc/sfc_flow.c                  | 18 +++++++
>   drivers/net/tap/tap_flow.c                  | 14 ++++--
>   lib/librte_ether/rte_flow.h                 | 22 ++++++---
>   lib/librte_net/rte_ether.h                  |  1 +
>   14 files changed, 198 insertions(+), 55 deletions(-)

Generic and net/sfc
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action
  2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
@ 2018-04-11 12:40  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 12:40 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh, Pascal Mazon

On 04/10/2018 07:36 PM, Adrien Mazarguil wrote:
> By definition, RSS involves some kind of hash algorithm, usually Toeplitz.
>
> Until now it could not be modified on a flow rule basis and PMDs had to
> always assume RTE_ETH_HASH_FUNCTION_DEFAULT, which remains the default
> behavior when unspecified (0).
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
>
> ---
>
> v3 changes:
>
> - Although RTE_ETH_HASH_FUNCTION_DEFAULT is defined as 0, made comparisons
>    more explicit where doing so would clarify the code.
>
> - Updated sfc to include Toeplitz as the other allowed value.
>
> Both according to Andrew's suggestions [1].
>
> [1] http://dpdk.org/ml/archives/dev/2018-April/095840.html
> ---
>   app/test-pmd/cmdline_flow.c                 | 72 ++++++++++++++++++++++++
>   app/test-pmd/config.c                       |  1 +
>   doc/guides/prog_guide/rte_flow.rst          |  2 +
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  3 +
>   drivers/net/e1000/igb_flow.c                |  4 ++
>   drivers/net/e1000/igb_rxtx.c                |  4 +-
>   drivers/net/i40e/i40e_ethdev.c              |  4 +-
>   drivers/net/i40e/i40e_flow.c                |  4 ++
>   drivers/net/ixgbe/ixgbe_flow.c              |  4 ++
>   drivers/net/ixgbe/ixgbe_rxtx.c              |  4 +-
>   drivers/net/mlx4/mlx4_flow.c                |  7 +++
>   drivers/net/mlx5/mlx5_flow.c                | 13 +++++
>   drivers/net/sfc/sfc_flow.c                  |  8 +++
>   drivers/net/tap/tap_flow.c                  |  6 ++
>   lib/librte_ether/rte_flow.c                 |  1 +
>   lib/librte_ether/rte_flow.h                 |  2 +
>   16 files changed, 136 insertions(+), 3 deletions(-)

Generic and net/sfc
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 2/5] ethdev: introduce new tunnel VXLAN-GPE
  2018-04-11  9:59  5%   ` Adrien Mazarguil
@ 2018-04-11 12:04  0%     ` Xueming(Steven) Li
  0 siblings, 0 replies; 200+ results
From: Xueming(Steven) Li @ 2018-04-11 12:04 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Nélio Laranjeiro,
	Shahaf Shuler, dev, Olivier Matz

Hi Adrien,

> -----Original Message-----
> From: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Sent: Wednesday, April 11, 2018 5:59 PM
> To: Xueming(Steven) Li <xuemingl@mellanox.com>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>; Jingjing Wu <jingjing.wu@intel.com>;
> Thomas Monjalon <thomas@monjalon.net>; Nélio Laranjeiro
> <nelio.laranjeiro@6wind.com>; Shahaf Shuler <shahafs@mellanox.com>;
> dev@dpdk.org; Olivier Matz <olivier.matz@6wind.com>
> Subject: Re: [dpdk-dev] [PATCH v2 2/5] ethdev: introduce new tunnel VXLAN-
> GPE
> 
> On Tue, Apr 10, 2018 at 09:00:33PM +0800, Xueming Li wrote:
> > VXLAN-GPE enables VXLAN for all protocols. Protocol link:
> > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdat
> > atracker.ietf.org%2Fdoc%2Fdraft-ietf-nvo3-vxlan-gpe%2F&data=02%7C01%7C
> > xuemingl%40mellanox.com%7Ce69c95d98f5f457c966908d59f92e393%7Ca652971c7
> > d2e4d9ba6a4d149256f461b%7C0%7C0%7C636590375601220397&sdata=XZ6kpgEIrbB
> > wHrpODaZiByf6a2NQl4J6MadYYAsuNsc%3D&reserved=0
> >
> > Signed-off-by: Xueming Li <xuemingl@mellanox.com>
> 
> Adding a new rte_flow pattern item in the middle of enum
> rte_flow_item_type breaks ABI compatibility. It's fine for 18.05 because
> prior series already destroyed it, however for this patch you need to
> choose between:
> 
> - Adding the new entry at the end of the enum and modifying the rest of
> the
>   code to follow the same order (preferred approach when not doing a full
>   API overhaul).
> 
> *or*
> 
> - Stating in the commit log what functions are impacted by ABI changes as
> in
>   "ethdev: remove DUP action from flow API" [1].
> 
> Also you must add a new "Item: ``VXLAN_GPE``" section to
> doc/guides/prog_guide/rte_flow.rst (look for "VXLAN" for clues).
> 
> Otherwise patch is mostly fine, just a few comments below.
> 
> [1]
> https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fdpdk.org
> %2Fml%2Farchives%2Fdev%2F2018-
> April%2F096526.html&data=02%7C01%7Cxuemingl%40mellanox.com%7Ce69c95d98f5f4
> 57c966908d59f92e393%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636590375
> 601220397&sdata=8Q%2FMigA8hSHmM25UWUvhTOtVuit%2FxQRFBA6iF6lYxv8%3D&reserve
> d=0
> 

Thanks, I've update code according to option 1.

> > ---
> >  lib/librte_ether/rte_eth_ctrl.h  |  3 ++-
> >  lib/librte_ether/rte_flow.c      |  1 +
> >  lib/librte_ether/rte_flow.h      | 27 +++++++++++++++++++++++++++
> >  lib/librte_mbuf/rte_mbuf.c       |  3 +++
> >  lib/librte_mbuf/rte_mbuf.h       |  1 +
> >  lib/librte_mbuf/rte_mbuf_ptype.c |  1 +
> > lib/librte_mbuf/rte_mbuf_ptype.h | 13 +++++++++++++
> >  lib/librte_net/rte_ether.h       | 25 +++++++++++++++++++++++++
> >  8 files changed, 73 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/librte_ether/rte_eth_ctrl.h
> > b/lib/librte_ether/rte_eth_ctrl.h index 668f59acb..5ea8ae24c 100644
> > --- a/lib/librte_ether/rte_eth_ctrl.h
> > +++ b/lib/librte_ether/rte_eth_ctrl.h
> > @@ -54,7 +54,8 @@ extern "C" {
> >  #define RTE_ETH_FLOW_VXLAN              19 /**< VXLAN protocol based
> flow */
> >  #define RTE_ETH_FLOW_GENEVE             20 /**< GENEVE protocol based
> flow */
> >  #define RTE_ETH_FLOW_NVGRE              21 /**< NVGRE protocol based
> flow */
> > -#define RTE_ETH_FLOW_MAX                22
> > +#define RTE_ETH_FLOW_VXLAN_GPE          22 /**< VXLAN-GPE protocol
> based flow */
> > +#define RTE_ETH_FLOW_MAX                23
> >
> >  /**
> >   * Feature filter types
> > diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
> > index 3d8116ebd..fb710fff7 100644
> > --- a/lib/librte_ether/rte_flow.c
> > +++ b/lib/librte_ether/rte_flow.c
> > @@ -50,6 +50,7 @@ static const struct rte_flow_desc_data
> rte_flow_desc_item[] = {
> >  	MK_FLOW_ITEM(TCP, sizeof(struct rte_flow_item_tcp)),
> >  	MK_FLOW_ITEM(SCTP, sizeof(struct rte_flow_item_sctp)),
> >  	MK_FLOW_ITEM(VXLAN, sizeof(struct rte_flow_item_vxlan)),
> > +	MK_FLOW_ITEM(VXLAN_GPE, sizeof(struct rte_flow_item_vxlan_gpe)),
> 
> Should be at the end of this array if you choose to not impact ABI.
> 
> >  	MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)),
> >  	MK_FLOW_ITEM(GRE, sizeof(struct rte_flow_item_gre)),
> >  	MK_FLOW_ITEM(E_TAG, sizeof(struct rte_flow_item_e_tag)), diff --git
> > a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h index
> > bed727df8..c7cfc201a 100644
> > --- a/lib/librte_ether/rte_flow.h
> > +++ b/lib/librte_ether/rte_flow.h
> > @@ -256,6 +256,13 @@ enum rte_flow_item_type {
> >  	RTE_FLOW_ITEM_TYPE_VXLAN,
> >
> >  	/**
> > +	 * Matches a VXLAN-GPE header.
> > +	 *
> > +	 * See struct rte_flow_item_vxlan_gpe.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_VXLAN_GPE,
> > +
> > +	/**
> 
> Ditto for the enum definition.
> 
> >  	 * Matches a E_TAG header.
> >  	 *
> >  	 * See struct rte_flow_item_e_tag.
> > @@ -676,6 +683,26 @@ static const struct rte_flow_item_vxlan
> > rte_flow_item_vxlan_mask = {  #endif
> >
> >  /**
> > + * RTE_FLOW_ITEM_TYPE_VXLAN_GPE.
> > + *
> > + * Matches a VXLAN-GPE header.
> 
> You should name the current IETF draft pending a proper RFC:
> 
>  Matches a VXLAN-GPE header (draft-ietf-nvo3-vxlan-gpe-05).
> 
> > + */
> > +struct rte_flow_item_vxlan_gpe {
> > +	uint8_t flags; /**< Normally 0x0c (I and P flag). */
> > +	uint8_t rsvd0[2]; /**< Reserved, normally 0x0000. */
> > +	uint8_t protocol; /**< Protocol type. */
> > +	uint8_t vni[3]; /**< VXLAN identifier. */
> > +	uint8_t rsvd1; /**< Reserved, normally 0x00. */ };
> > +
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_VXLAN_GPE. */ #ifndef
> > +__cplusplus static const struct rte_flow_item_vxlan_gpe
> > +rte_flow_item_vxlan_gpe_mask = {
> > +	.vni = "\xff\xff\xff",
> > +};
> > +#endif
> 
> Again if you choose to not impact ABI, this should be moved further down,
> after the last item definition for consistency.
> 
> > +
> > +/**
> >   * RTE_FLOW_ITEM_TYPE_E_TAG.
> >   *
> >   * Matches a E-tag header.
> > diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> > index 091d388d3..dc90379e5 100644
> > --- a/lib/librte_mbuf/rte_mbuf.c
> > +++ b/lib/librte_mbuf/rte_mbuf.c
> > @@ -405,6 +405,7 @@ const char *rte_get_tx_ol_flag_name(uint64_t mask)
> >  	case PKT_TX_TUNNEL_IPIP: return "PKT_TX_TUNNEL_IPIP";
> >  	case PKT_TX_TUNNEL_GENEVE: return "PKT_TX_TUNNEL_GENEVE";
> >  	case PKT_TX_TUNNEL_MPLSINUDP: return "PKT_TX_TUNNEL_MPLSINUDP";
> > +	case PKT_TX_TUNNEL_VXLAN_GPE: return "PKT_TX_TUNNEL_VXLAN_GPE";
> >  	case PKT_TX_MACSEC: return "PKT_TX_MACSEC";
> >  	case PKT_TX_SEC_OFFLOAD: return "PKT_TX_SEC_OFFLOAD";
> >  	default: return NULL;
> > @@ -439,6 +440,8 @@ rte_get_tx_ol_flag_list(uint64_t mask, char *buf,
> size_t buflen)
> >  		  "PKT_TX_TUNNEL_NONE" },
> >  		{ PKT_TX_TUNNEL_MPLSINUDP, PKT_TX_TUNNEL_MASK,
> >  		  "PKT_TX_TUNNEL_NONE" },
> > +		{ PKT_TX_TUNNEL_VXLAN_GPE, PKT_TX_TUNNEL_MASK,
> > +		  "PKT_TX_TUNNEL_NONE" },
> >  		{ PKT_TX_MACSEC, PKT_TX_MACSEC, NULL },
> >  		{ PKT_TX_SEC_OFFLOAD, PKT_TX_SEC_OFFLOAD, NULL },
> >  	};
> > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> > index 62740254d..1839cf2ed 100644
> > --- a/lib/librte_mbuf/rte_mbuf.h
> > +++ b/lib/librte_mbuf/rte_mbuf.h
> > @@ -210,6 +210,7 @@ extern "C" {
> >  #define PKT_TX_TUNNEL_GENEVE  (0x4ULL << 45)  /**< TX packet with
> > MPLS-in-UDP RFC 7510 header. */  #define PKT_TX_TUNNEL_MPLSINUDP
> > (0x5ULL << 45)
> > +#define PKT_TX_TUNNEL_VXLAN_GPE (0x6ULL << 45)
> >  /* add new TX TUNNEL type here */
> >  #define PKT_TX_TUNNEL_MASK    (0xFULL << 45)
> >
> > diff --git a/lib/librte_mbuf/rte_mbuf_ptype.c
> > b/lib/librte_mbuf/rte_mbuf_ptype.c
> > index 1feefacc6..49106c7df 100644
> > --- a/lib/librte_mbuf/rte_mbuf_ptype.c
> > +++ b/lib/librte_mbuf/rte_mbuf_ptype.c
> > @@ -65,6 +65,7 @@ const char *rte_get_ptype_tunnel_name(uint32_t ptype)
> >  	case RTE_PTYPE_TUNNEL_GTPU: return "TUNNEL_GTPU";
> >  	case RTE_PTYPE_TUNNEL_ESP: return "TUNNEL_ESP";
> >  	case RTE_PTYPE_TUNNEL_L2TP: return "TUNNEL_L2TP";
> > +	case RTE_PTYPE_TUNNEL_VXLAN_GPE: return "TUNNEL_VXLAN_GPE";
> >  	default: return "TUNNEL_UNKNOWN";
> >  	}
> >  }
> > diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h
> > b/lib/librte_mbuf/rte_mbuf_ptype.h
> > index b9a338110..7caf83312 100644
> > --- a/lib/librte_mbuf/rte_mbuf_ptype.h
> > +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
> > @@ -423,6 +423,19 @@ extern "C" {
> >   */
> >  #define RTE_PTYPE_TUNNEL_L2TP               0x0000a000
> >  /**
> > + * VXLAN-GPE (VXLAN Generic Protocol Extension) tunneling packet type.
> > + *
> > + * Packet format:
> > + * <'ether type'=0x0800
> > + * | 'version'=4, 'protocol'=17
> > + * | 'destination port'=4790>
> > + * or,
> > + * <'ether type'=0x86DD
> > + * | 'version'=6, 'next header'=17
> > + * | 'destination port'=4790>
> > + */
> > +#define RTE_PTYPE_TUNNEL_VXLAN_GPE          0x0000b000
> > +/**
> >   * Mask of tunneling packet types.
> >   */
> >  #define RTE_PTYPE_TUNNEL_MASK               0x0000f000
> > diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h
> > index a271d1c86..a64814179 100644
> > --- a/lib/librte_net/rte_ether.h
> > +++ b/lib/librte_net/rte_ether.h
> > @@ -311,6 +311,31 @@ struct vxlan_hdr {  /**< VXLAN tunnel header
> > length. */
> >
> >  /**
> > + * VXLAN-GPE protocol header.
> > + * Contains the 8-bit flag, 8-bit next-protocol, 24-bit VXLAN Network
> > + * Identifier and Reserved fields (16 bits and 8 bits).
> 
> Another reference to the current IETF draft here shouldn't hurt.
> 
> > + */
> > +struct vxlan_gpe_hdr {
> > +	uint8_t vx_flags; /**< flag (8). */
> > +	uint8_t reserved[2]; /**< Reserved (16). */
> > +	uint8_t proto; /**< next-protocol (8). */
> > +	uint32_t vx_vni;   /**< VNI (24) + Reserved (8). */
> > +} __attribute__((__packed__));
> > +
> > +/* VXLAN-GPE next protocol types */
> > +#define VXLAN_GPE_TYPE_IPv4 1 /**< IPv4 Protocol. */ #define
> > +VXLAN_GPE_TYPE_IPv6 2 /**< IPv6 Protocol. */ #define
> > +VXLAN_GPE_TYPE_ETH  3 /**< Ethernet Protocol. */ #define
> > +VXLAN_GPE_TYPE_NSH  4 /**< NSH Protocol. */ #define
> > +VXLAN_GPE_TYPE_MPLS 5 /**< MPLS Protocol. */ #define
> > +VXLAN_GPE_TYPE_GBP  6 /**< GBP Protocol. */ #define
> > +VXLAN_GPE_TYPE_VBNG 7 /**< vBNG Protocol. */
> > +
> > +#define ETHER_VXLAN_GPE_HLEN (sizeof(struct udp_hdr) + \
> > +			      sizeof(struct vxlan_gpe_hdr)) /**< VXLAN-GPE
> tunnel header
> > +length. */
> > +
> > +/**
> >   * Extract VLAN tag information into mbuf
> >   *
> >   * Software version of VLAN stripping
> > --
> > 2.13.3
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-11  5:33  0%               ` Yongseok Koh
@ 2018-04-11 11:39  0%                 ` Ananyev, Konstantin
  2018-04-11 17:08  0%                   ` Yongseok Koh
  0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2018-04-11 11:39 UTC (permalink / raw)
  To: Yongseok Koh
  Cc: Olivier Matz, Lu, Wenzhuo, Wu, Jingjing, Adrien Mazarguil,
	Nélio Laranjeiro, dev


Hi Yongseok,

> > >
> > > On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > > > Hi Yongseok,
> > > >
> > > > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > > > Hi,
> > > > > >
> > > > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > > > rte_pktmbuf_attach_at().
> > > > > > >
> > > > > > > Possible use-cases could be:
> > > > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > > > >
> > > > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > > > >
> > > > > > I think the current API is already able to do what you want.
> > > > > >
> > > > > > 1/ Here is a mbuf m with its data
> > > > > >
> > > > > >                off
> > > > > >                <-->
> > > > > >                       len
> > > > > >           +----+   <---------->
> > > > > >           |    |
> > > > > >         +-|----v----------------------+
> > > > > >         | |    -----------------------|
> > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > >         |      -----------------------|
> > > > > >         +-----------------------------+
> > > > > >
> > > > > >
> > > > > > 2/ clone m:
> > > > > >
> > > > > >   c = rte_pktmbuf_alloc(pool);
> > > > > >   rte_pktmbuf_attach(c, m);
> > > > > >
> > > > > >   Note that c has its own offset and length fields.
> > > > > >
> > > > > >
> > > > > >                off
> > > > > >                <-->
> > > > > >                       len
> > > > > >           +----+   <---------->
> > > > > >           |    |
> > > > > >         +-|----v----------------------+
> > > > > >         | |    -----------------------|
> > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > >         |      -----------------------|
> > > > > >         +------^----------------------+
> > > > > >                |
> > > > > >           +----+
> > > > > > indirect  |
> > > > > >         +-|---------------------------+
> > > > > >         | |    -----------------------|
> > > > > > c       | buf  |                     ||
> > > > > >         |      -----------------------|
> > > > > >         +-----------------------------+
> > > > > >
> > > > > >                 off    len
> > > > > >                 <--><---------->
> > > > > >
> > > > > >
> > > > > > 3/ remove some data from c without changing m
> > > > > >
> > > > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > > > >
> > > > > >
> > > > > > Please let me know if it fits your needs.
> > > > >
> > > > > No, it doesn't.
> > > > >
> > > > > Trimming head and tail with the current APIs removes data and make the space
> > > > > available. Adjusting packet head means giving more headroom, not shifting the
> > > > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > > > difference offsets in m,
> > > > >
> > > > > rte_pktmbuf_adj(c1, 10);
> > > > > rte_pktmbuf_adj(c2, 20);
> > > > >
> > > > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > > > wants to attach outer header, it will overwrite the headroom even though the
> > > > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > > > header should be linked by h1->next = c2.
> > > >
> > > > Yes, after these operations c1, c2 and m should become read-only. So, to
> > > > prepend headers, another mbuf has to be inserted before as you suggest. It
> > > > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > > > length) that will:
> > > >   - alloc and attach indirect mbuf for each segment of m that is
> > > >     in the range [offset : length+offset].
> > > >   - prepend an empty and writable mbuf for the headers
> > > >
> > > > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > > > which actually shrink the headroom, this case can be properly handled.
> > > >
> > > > What do you mean by properly handled?
> > > >
> > > > Yes, prepending data or adding data in the indirect mbuf won't override
> > > > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > > > won't be protected.
> > > >
> > > > From an application point of view, indirect mbufs, or direct mbufs that
> > > > have refcnt != 1, should be both considered as read-only because they
> > > > may share their data. How an application can know if the data is shared
> > > > or not?
> > > >
> > > > Maybe we need a flag to differentiate mbufs that are read-only
> > > > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > > > understanding is correct, you want to have indirect mbufs with RW data.
> > >
> > > Agree that indirect mbuf must be treated as read-only, Then the current code is
> > > enough to handle that use-case.
> > >
> > > > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > > > indirect referencing.
> >
> > But just to make HW to RX multiple packets into one mbuf,
> > data_off inside indirect mbuf should be enough, correct?
> Right. Current max buffer len of mbuf is 64kB (16bits) but it is enough for mlx5
> to reach to 100Gbps with 64B traffic (149Mpps). I made mlx5 HW put 16 packets in
> a buffer. So, it needs ~32kB buffer. Having more bits in length fields would be
> better but 16-bit is good enough to overcome the PCIe Gen3 bottleneck in order
> to saturate the network link.

There were few complains that 64KB max is a limitation for some use-cases.
I am not against increasing it, but I don't think we have free space on first cache-line for that
without another big rework of mbuf layout. 
Considering that we need to increase size for buf_len, data_off, data_len, and probably priv_size too. 

> 
> > As I understand, what you'd like to achieve with this new field -
> > ability to manipulate packet boundaries after RX, probably at upper layer.
> > As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
> > indirect mbufs trying to modify same direct buffer.
> 
> I agree that there's an implication that indirect mbuf or mbuf having refcnt > 1
> is read-only. What that means, all the entities which own such mbufs have to be
> aware of that and keep the principle as DPDK can't enforce the rule and there
> can't be such sanity check. In this sense, HW doesn't violate it because the
> direct mbuf is injected to HW before indirection. When packets are written by
> HW, PMD attaches indirect mbufs to the direct mbuf and deliver those to
> application layer with freeing the original direct mbuf (decrement refcnt by 1).
> So, HW doesn't touch the direct buffer once it reaches to upper layer.

Yes, I understand that. But as I can see you introduced functions to adjust head and tail,
which implies that it should be possible by some entity (upper layer?) to manipulate these
indirect mbufs.
And we don't know how exactly it will be done.

> The direct buffer will be freed and get available for reuse when all the attached
> indirect mbufs are freed.
> 
> > Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
> > Fields for indirect mbufs, straight after attach()?
> 
> Good point.
> Actually that was my draft (Mellanox internal) version of this patch :-) But I
> had to consider a case where priv_size is really given by user. Even though it
> is less likely, but if original priv_size is quite big, it can't cover entire
> buf_len. For this, I had to increase priv_size to 32-bit but adding another
> 16bit field (buf_off) looked more plausible.

As I remember, we can't have mbufs bigger then 64K,
so priv_size + buf_len should be always less than 64K, correct?
Konstantin  

> 
> Thanks for good comments,
> Yongseok
> 
> > > > >
> > > > > Does this make sense?
> > > >
> > > > I understand the need.
> > > >
> > > > Another option would be to make the mbuf->buffer point to an external
> > > > buffer (not inside the direct mbuf). This would require to add a
> > > > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > > > a quick overview.
> > > >
> > > > [1]
> > >
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdpdksummit.com%2FArchive%2Fpdf%2F2016Userspace%2FDay01
> > > -Session05-OlivierMatz-
> > >
> Userspace2016.pdf&data=02%7C01%7Cyskoh%40mellanox.com%7Ca5405edb36e445e6540808d59e339a38%7Ca652971c7d2e4d9ba6a4d
> > > 149256f461b%7C0%7C0%7C636588866861082855&sdata=llw%2BwiY5cC56naOUhBbIg8TKtfFN6VZcIRY5PV7VqZs%3D&reserved=0
> > > >
> > > > The advantage is that it does not require the large data to be inside a
> > > > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > > > allocated from a mempool). On the other hand, it is maybe more complex
> > > > to implement compared to your solution.
> > >
> > > I knew that you presented the slides and frankly, I had considered that option
> > > at first. But even with that option, metadata to store refcnt should also be
> > > allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> > > end of the data segment. Even though it could have smaller metadata structure,
> > > I just wanted to make full use of the existing framework because it is less
> > > complex as you mentioned. Given that you presented the idea of external data
> > > buffer in 2016 and there hasn't been many follow-up discussions/activities so
> > > far, I thought the demand isn't so big yet thus I wanted to make this patch
> > > simpler.  I personally think that we can take the idea of external data seg when
> > > more demands come from users in the future as it would be a huge change and may
> > > break current ABI/API. When the day comes, I'll gladly participate in the
> > > discussions and write codes for it if I can be helpful.
> > >
> > > Do you think this patch is okay for now?
> > >
> > >
> > > Thanks for your comments,
> > > Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 4/5] app/testpmd: introduce new tunnel VXLAN-GPE
  @ 2018-04-11  9:59  3%   ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-11  9:59 UTC (permalink / raw)
  To: Xueming Li
  Cc: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Nelio Laranjeiro,
	Shahaf Shuler, dev, Olivier Matz

On Tue, Apr 10, 2018 at 09:00:35PM +0800, Xueming Li wrote:
> Add VXLAN-GPE support to csum forwarding engine and rte flow.
> 
> Signed-off-by: Xueming Li <xuemingl@mellanox.com>

Depending on whether you chose to impact ABI compatibility in the second
patch of the series, you may need to reorder all VXLAN_GPE definitions in
this patch to match that of the rte_flow API.

A few more comments below.

> ---
>  app/test-pmd/cmdline_flow.c           | 24 ++++++++++
>  app/test-pmd/config.c                 |  2 +
>  app/test-pmd/csumonly.c               | 83 +++++++++++++++++++++++++++++++++--
>  app/test-pmd/parameters.c             | 12 ++++-
>  app/test-pmd/testpmd.h                |  2 +
>  doc/guides/testpmd_app_ug/run_app.rst |  5 +++
>  6 files changed, 124 insertions(+), 4 deletions(-)
> 
> diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
> index f85c1c57f..f5abd589d 100644
> --- a/app/test-pmd/cmdline_flow.c
> +++ b/app/test-pmd/cmdline_flow.c
> @@ -137,6 +137,8 @@ enum index {
>  	ITEM_SCTP_CKSUM,
>  	ITEM_VXLAN,
>  	ITEM_VXLAN_VNI,
> +	ITEM_VXLAN_GPE,
> +	ITEM_VXLAN_GPE_VNI,
>  	ITEM_E_TAG,
>  	ITEM_E_TAG_GRP_ECID_B,
>  	ITEM_NVGRE,
> @@ -461,6 +463,7 @@ static const enum index next_item[] = {
>  	ITEM_TCP,
>  	ITEM_SCTP,
>  	ITEM_VXLAN,
> +	ITEM_VXLAN_GPE,
>  	ITEM_E_TAG,
>  	ITEM_NVGRE,
>  	ITEM_MPLS,
> @@ -589,6 +592,12 @@ static const enum index item_vxlan[] = {
>  	ZERO,
>  };
>  
> +static const enum index item_vxlan_gpe[] = {
> +	ITEM_VXLAN_GPE_VNI,
> +	ITEM_NEXT,
> +	ZERO,
> +};
> +
>  static const enum index item_e_tag[] = {
>  	ITEM_E_TAG_GRP_ECID_B,
>  	ITEM_NEXT,
> @@ -1441,6 +1450,21 @@ static const struct token token_list[] = {
>  		.next = NEXT(item_vxlan, NEXT_ENTRY(UNSIGNED), item_param),
>  		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vxlan, vni)),
>  	},
> +	[ITEM_VXLAN_GPE] = {
> +		.name = "vxlan-gpe",
> +		.help = "match VXLAN-GPE header",
> +		.priv = PRIV_ITEM(VXLAN_GPE,
> +				  sizeof(struct rte_flow_item_vxlan_gpe)),
> +		.next = NEXT(item_vxlan_gpe),
> +		.call = parse_vc,
> +	},
> +	[ITEM_VXLAN_GPE_VNI] = {
> +		.name = "vni",
> +		.help = "VXLAN-GPE identifier",
> +		.next = NEXT(item_vxlan_gpe, NEXT_ENTRY(UNSIGNED), item_param),
> +		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vxlan_gpe,
> +					     vni)),
> +	},
>  	[ITEM_E_TAG] = {
>  		.name = "e_tag",
>  		.help = "match E-Tag header",
> diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
> index 4a273eff7..1a9bc37ed 100644
> --- a/app/test-pmd/config.c
> +++ b/app/test-pmd/config.c
> @@ -972,6 +972,7 @@ static const struct {
>  	MK_FLOW_ITEM(TCP, sizeof(struct rte_flow_item_tcp)),
>  	MK_FLOW_ITEM(SCTP, sizeof(struct rte_flow_item_sctp)),
>  	MK_FLOW_ITEM(VXLAN, sizeof(struct rte_flow_item_vxlan)),
> +	MK_FLOW_ITEM(VXLAN_GPE, sizeof(struct rte_flow_item_vxlan_gpe)),
>  	MK_FLOW_ITEM(E_TAG, sizeof(struct rte_flow_item_e_tag)),
>  	MK_FLOW_ITEM(NVGRE, sizeof(struct rte_flow_item_nvgre)),
>  	MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)),

My first comment applies to all the above hunks.

> @@ -3080,6 +3081,7 @@ flowtype_to_str(uint16_t flow_type)
>  		{"l2_payload", RTE_ETH_FLOW_L2_PAYLOAD},
>  		{"port", RTE_ETH_FLOW_PORT},
>  		{"vxlan", RTE_ETH_FLOW_VXLAN},
> +		{"vxlan-gpe", RTE_ETH_FLOW_VXLAN_GPE},
>  		{"geneve", RTE_ETH_FLOW_GENEVE},
>  		{"nvgre", RTE_ETH_FLOW_NVGRE},
>  	};
> diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
> index 5f5ab64aa..213888374 100644
> --- a/app/test-pmd/csumonly.c
> +++ b/app/test-pmd/csumonly.c
> @@ -60,6 +60,8 @@
>  #define _htons(x) (x)
>  #endif
>  
> +uint16_t vxlan_gpe_udp_port = 4790;
> +
>  /* structure that caches offload info for the current packet */
>  struct testpmd_offload_info {
>  	uint16_t ethertype;
> @@ -194,6 +196,70 @@ parse_vxlan(struct udp_hdr *udp_hdr,
>  	info->l2_len += ETHER_VXLAN_HLEN; /* add udp + vxlan */
>  }
>  
> +/* Parse a vxlan-gpe header */
> +static void
> +parse_vxlan_gpe(struct udp_hdr *udp_hdr,
> +	    struct testpmd_offload_info *info)
> +{
> +	struct ether_hdr *eth_hdr;
> +	struct ipv4_hdr *ipv4_hdr;
> +	struct ipv6_hdr *ipv6_hdr;
> +	struct vxlan_gpe_hdr *vxlan_gpe_hdr;
> +	uint8_t vxlan_gpe_len = sizeof(*vxlan_gpe_hdr);
> +
> +	/* check udp destination port, 4790 is the default vxlan-gpe port */
> +	if (udp_hdr->dst_port != _htons(vxlan_gpe_udp_port))
> +		return;
> +
> +	vxlan_gpe_hdr = (struct vxlan_gpe_hdr *)((char *)udp_hdr +
> +				sizeof(struct udp_hdr));
> +
> +	if (!vxlan_gpe_hdr->proto || vxlan_gpe_hdr->proto ==
> +	    VXLAN_GPE_TYPE_IPv4) {
> +		info->is_tunnel = 1;
> +		info->outer_ethertype = info->ethertype;
> +		info->outer_l2_len = info->l2_len;
> +		info->outer_l3_len = info->l3_len;
> +		info->outer_l4_proto = info->l4_proto;
> +
> +		ipv4_hdr = (struct ipv4_hdr *)((char *)vxlan_gpe_hdr +
> +			   vxlan_gpe_len);
> +
> +		parse_ipv4(ipv4_hdr, info);
> +		info->ethertype = _htons(ETHER_TYPE_IPv4);
> +		info->l2_len = 0;
> +
> +	} else if (vxlan_gpe_hdr->proto == VXLAN_GPE_TYPE_IPv6) {
> +		info->is_tunnel = 1;
> +		info->outer_ethertype = info->ethertype;
> +		info->outer_l2_len = info->l2_len;
> +		info->outer_l3_len = info->l3_len;
> +		info->outer_l4_proto = info->l4_proto;
> +
> +		ipv6_hdr = (struct ipv6_hdr *)((char *)vxlan_gpe_hdr +
> +			   vxlan_gpe_len);
> +
> +		info->ethertype = _htons(ETHER_TYPE_IPv6);
> +		parse_ipv6(ipv6_hdr, info);
> +		info->l2_len = 0;
> +
> +	} else if (vxlan_gpe_hdr->proto == VXLAN_GPE_TYPE_ETH) {
> +		info->is_tunnel = 1;
> +		info->outer_ethertype = info->ethertype;
> +		info->outer_l2_len = info->l2_len;
> +		info->outer_l3_len = info->l3_len;
> +		info->outer_l4_proto = info->l4_proto;
> +
> +		eth_hdr = (struct ether_hdr *)((char *)vxlan_gpe_hdr +
> +			  vxlan_gpe_len);
> +
> +		parse_ethernet(eth_hdr, info);
> +	} else
> +		return;
> +
> +	info->l2_len += ETHER_VXLAN_GPE_HLEN;
> +}
> +
>  /* Parse a gre header */
>  static void
>  parse_gre(struct simple_gre_hdr *gre_hdr, struct testpmd_offload_info *info)
> @@ -588,6 +654,10 @@ pkt_copy_split(const struct rte_mbuf *pkt)
>   *   Ether / (vlan) / IP|IP6 / UDP|TCP|SCTP .
>   *   Ether / (vlan) / outer IP|IP6 / outer UDP / VxLAN / Ether / IP|IP6 /
>   *           UDP|TCP|SCTP
> + *   Ether / (vlan) / outer IP|IP6 / outer UDP / VXLAN-GPE / Ether / IP|IP6 /
> + *           UDP|TCP|SCTP
> + *   Ether / (vlan) / outer IP|IP6 / outer UDP / VXLAN-GPE / IP|IP6 /
> + *           UDP|TCP|SCTP
>   *   Ether / (vlan) / outer IP|IP6 / GRE / Ether / IP|IP6 / UDP|TCP|SCTP
>   *   Ether / (vlan) / outer IP|IP6 / GRE / IP|IP6 / UDP|TCP|SCTP
>   *   Ether / (vlan) / outer IP|IP6 / IP|IP6 / UDP|TCP|SCTP
> @@ -691,9 +761,16 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
>  
>  				udp_hdr = (struct udp_hdr *)((char *)l3_hdr +
>  					info.l3_len);
> -				parse_vxlan(udp_hdr, &info, m->packet_type);
> -				if (info.is_tunnel)
> -					tx_ol_flags |= PKT_TX_TUNNEL_VXLAN;
> +				parse_vxlan_gpe(udp_hdr, &info);
> +				if (info.is_tunnel) {
> +					tx_ol_flags |= PKT_TX_TUNNEL_VXLAN_GPE;
> +				} else {
> +					parse_vxlan(udp_hdr, &info,
> +						    m->packet_type);
> +					if (info.is_tunnel)
> +						tx_ol_flags |=
> +							PKT_TX_TUNNEL_VXLAN;
> +				}
>  			} else if (info.l4_proto == IPPROTO_GRE) {
>  				struct simple_gre_hdr *gre_hdr;
>  
> diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
> index 2192bdcdf..68063b7a4 100644
> --- a/app/test-pmd/parameters.c
> +++ b/app/test-pmd/parameters.c
> @@ -70,7 +70,7 @@ usage(char* progname)
>  	       "--rss-ip | --rss-udp | "
>  	       "--rxpt= | --rxht= | --rxwt= | --rxfreet= | "
>  	       "--txpt= | --txht= | --txwt= | --txfreet= | "
> -	       "--txrst= | --tx-offloads ]\n",
> +	       "--txrst= | --tx-offloads= | --vxlan-gpe-port= ]\n",
>  	       progname);
>  #ifdef RTE_LIBRTE_CMDLINE
>  	printf("  --interactive: run in interactive mode.\n");
> @@ -186,6 +186,7 @@ usage(char* progname)
>  	printf("  --flow-isolate-all: "
>  	       "requests flow API isolated mode on all ports at initialization time.\n");
>  	printf("  --tx-offloads=0xXXXXXXXX: hexadecimal bitmask of TX queue offloads\n");
> +	printf("  --vxlan-gpe-port=N: UPD port of tunnel VXLAN-GPE\n");
>  }
>  
>  #ifdef RTE_LIBRTE_CMDLINE
> @@ -621,6 +622,7 @@ launch_args_parse(int argc, char** argv)
>  		{ "print-event",		1, 0, 0 },
>  		{ "mask-event",			1, 0, 0 },
>  		{ "tx-offloads",		1, 0, 0 },
> +		{ "vxlan-gpe-port",		1, 0, 0 },
>  		{ 0, 0, 0, 0 },
>  	};
>  
> @@ -1091,6 +1093,14 @@ launch_args_parse(int argc, char** argv)
>  					rte_exit(EXIT_FAILURE,
>  						 "tx-offloads must be >= 0\n");
>  			}
> +			if (!strcmp(lgopts[opt_idx].name, "vxlan-gpe-port")) {
> +				n = atoi(optarg);
> +				if (n >= 0)
> +					vxlan_gpe_udp_port = (uint16_t)n;
> +				else
> +					rte_exit(EXIT_FAILURE,
> +						 "vxlan-gpe-port must be >= 0\n");
> +			}
>  			if (!strcmp(lgopts[opt_idx].name, "print-event"))
>  				if (parse_event_printing_config(optarg, 1)) {
>  					rte_exit(EXIT_FAILURE,
> diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
> index 593ae2160..b270602d9 100644
> --- a/app/test-pmd/testpmd.h
> +++ b/app/test-pmd/testpmd.h
> @@ -446,6 +446,8 @@ extern uint32_t retry_enabled;
>  extern struct fwd_lcore  **fwd_lcores;
>  extern struct fwd_stream **fwd_streams;
>  
> +extern uint16_t vxlan_gpe_udp_port; /**< UDP port of tunnel VXLAN-GPE. */
> +
>  extern portid_t nb_peer_eth_addrs; /**< Number of peer ethernet addresses. */
>  extern struct ether_addr peer_eth_addrs[RTE_MAX_ETHPORTS];
>  
> diff --git a/doc/guides/testpmd_app_ug/run_app.rst b/doc/guides/testpmd_app_ug/run_app.rst
> index 1fd53958a..2e8690f41 100644
> --- a/doc/guides/testpmd_app_ug/run_app.rst
> +++ b/doc/guides/testpmd_app_ug/run_app.rst
> @@ -479,3 +479,8 @@ The commandline options are:
>  
>      Set the hexadecimal bitmask of TX queue offloads.
>      The default value is 0.
> +
> +*   ``--vxlan-gpe-port=N``
> +
> +    Set the UDP port number of tunnel VXLAN-GPE to N.
> +    The default value is 4790.

You need to update the "Pattern items" section of the flow command
documentation as well.

> -- 
> 2.13.3
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v2 2/5] ethdev: introduce new tunnel VXLAN-GPE
  @ 2018-04-11  9:59  5%   ` Adrien Mazarguil
  2018-04-11 12:04  0%     ` Xueming(Steven) Li
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-11  9:59 UTC (permalink / raw)
  To: Xueming Li
  Cc: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Nelio Laranjeiro,
	Shahaf Shuler, dev, Olivier Matz

On Tue, Apr 10, 2018 at 09:00:33PM +0800, Xueming Li wrote:
> VXLAN-GPE enables VXLAN for all protocols. Protocol link:
> https://datatracker.ietf.org/doc/draft-ietf-nvo3-vxlan-gpe/
> 
> Signed-off-by: Xueming Li <xuemingl@mellanox.com>

Adding a new rte_flow pattern item in the middle of enum rte_flow_item_type
breaks ABI compatibility. It's fine for 18.05 because prior series already
destroyed it, however for this patch you need to choose between:

- Adding the new entry at the end of the enum and modifying the rest of the
  code to follow the same order (preferred approach when not doing a full
  API overhaul).

*or*

- Stating in the commit log what functions are impacted by ABI changes as in
  "ethdev: remove DUP action from flow API" [1].

Also you must add a new "Item: ``VXLAN_GPE``" section to
doc/guides/prog_guide/rte_flow.rst (look for "VXLAN" for clues).

Otherwise patch is mostly fine, just a few comments below.

[1] http://dpdk.org/ml/archives/dev/2018-April/096526.html

> ---
>  lib/librte_ether/rte_eth_ctrl.h  |  3 ++-
>  lib/librte_ether/rte_flow.c      |  1 +
>  lib/librte_ether/rte_flow.h      | 27 +++++++++++++++++++++++++++
>  lib/librte_mbuf/rte_mbuf.c       |  3 +++
>  lib/librte_mbuf/rte_mbuf.h       |  1 +
>  lib/librte_mbuf/rte_mbuf_ptype.c |  1 +
>  lib/librte_mbuf/rte_mbuf_ptype.h | 13 +++++++++++++
>  lib/librte_net/rte_ether.h       | 25 +++++++++++++++++++++++++
>  8 files changed, 73 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_ether/rte_eth_ctrl.h b/lib/librte_ether/rte_eth_ctrl.h
> index 668f59acb..5ea8ae24c 100644
> --- a/lib/librte_ether/rte_eth_ctrl.h
> +++ b/lib/librte_ether/rte_eth_ctrl.h
> @@ -54,7 +54,8 @@ extern "C" {
>  #define RTE_ETH_FLOW_VXLAN              19 /**< VXLAN protocol based flow */
>  #define RTE_ETH_FLOW_GENEVE             20 /**< GENEVE protocol based flow */
>  #define RTE_ETH_FLOW_NVGRE              21 /**< NVGRE protocol based flow */
> -#define RTE_ETH_FLOW_MAX                22
> +#define RTE_ETH_FLOW_VXLAN_GPE          22 /**< VXLAN-GPE protocol based flow */
> +#define RTE_ETH_FLOW_MAX                23
>  
>  /**
>   * Feature filter types
> diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
> index 3d8116ebd..fb710fff7 100644
> --- a/lib/librte_ether/rte_flow.c
> +++ b/lib/librte_ether/rte_flow.c
> @@ -50,6 +50,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
>  	MK_FLOW_ITEM(TCP, sizeof(struct rte_flow_item_tcp)),
>  	MK_FLOW_ITEM(SCTP, sizeof(struct rte_flow_item_sctp)),
>  	MK_FLOW_ITEM(VXLAN, sizeof(struct rte_flow_item_vxlan)),
> +	MK_FLOW_ITEM(VXLAN_GPE, sizeof(struct rte_flow_item_vxlan_gpe)),

Should be at the end of this array if you choose to not impact ABI.

>  	MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)),
>  	MK_FLOW_ITEM(GRE, sizeof(struct rte_flow_item_gre)),
>  	MK_FLOW_ITEM(E_TAG, sizeof(struct rte_flow_item_e_tag)),
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index bed727df8..c7cfc201a 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -256,6 +256,13 @@ enum rte_flow_item_type {
>  	RTE_FLOW_ITEM_TYPE_VXLAN,
>  
>  	/**
> +	 * Matches a VXLAN-GPE header.
> +	 *
> +	 * See struct rte_flow_item_vxlan_gpe.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_VXLAN_GPE,
> +
> +	/**

Ditto for the enum definition.

>  	 * Matches a E_TAG header.
>  	 *
>  	 * See struct rte_flow_item_e_tag.
> @@ -676,6 +683,26 @@ static const struct rte_flow_item_vxlan rte_flow_item_vxlan_mask = {
>  #endif
>  
>  /**
> + * RTE_FLOW_ITEM_TYPE_VXLAN_GPE.
> + *
> + * Matches a VXLAN-GPE header.

You should name the current IETF draft pending a proper RFC:

 Matches a VXLAN-GPE header (draft-ietf-nvo3-vxlan-gpe-05).

> + */
> +struct rte_flow_item_vxlan_gpe {
> +	uint8_t flags; /**< Normally 0x0c (I and P flag). */
> +	uint8_t rsvd0[2]; /**< Reserved, normally 0x0000. */
> +	uint8_t protocol; /**< Protocol type. */
> +	uint8_t vni[3]; /**< VXLAN identifier. */
> +	uint8_t rsvd1; /**< Reserved, normally 0x00. */
> +};
> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_VXLAN_GPE. */
> +#ifndef __cplusplus
> +static const struct rte_flow_item_vxlan_gpe rte_flow_item_vxlan_gpe_mask = {
> +	.vni = "\xff\xff\xff",
> +};
> +#endif

Again if you choose to not impact ABI, this should be moved further down,
after the last item definition for consistency.

> +
> +/**
>   * RTE_FLOW_ITEM_TYPE_E_TAG.
>   *
>   * Matches a E-tag header.
> diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> index 091d388d3..dc90379e5 100644
> --- a/lib/librte_mbuf/rte_mbuf.c
> +++ b/lib/librte_mbuf/rte_mbuf.c
> @@ -405,6 +405,7 @@ const char *rte_get_tx_ol_flag_name(uint64_t mask)
>  	case PKT_TX_TUNNEL_IPIP: return "PKT_TX_TUNNEL_IPIP";
>  	case PKT_TX_TUNNEL_GENEVE: return "PKT_TX_TUNNEL_GENEVE";
>  	case PKT_TX_TUNNEL_MPLSINUDP: return "PKT_TX_TUNNEL_MPLSINUDP";
> +	case PKT_TX_TUNNEL_VXLAN_GPE: return "PKT_TX_TUNNEL_VXLAN_GPE";
>  	case PKT_TX_MACSEC: return "PKT_TX_MACSEC";
>  	case PKT_TX_SEC_OFFLOAD: return "PKT_TX_SEC_OFFLOAD";
>  	default: return NULL;
> @@ -439,6 +440,8 @@ rte_get_tx_ol_flag_list(uint64_t mask, char *buf, size_t buflen)
>  		  "PKT_TX_TUNNEL_NONE" },
>  		{ PKT_TX_TUNNEL_MPLSINUDP, PKT_TX_TUNNEL_MASK,
>  		  "PKT_TX_TUNNEL_NONE" },
> +		{ PKT_TX_TUNNEL_VXLAN_GPE, PKT_TX_TUNNEL_MASK,
> +		  "PKT_TX_TUNNEL_NONE" },
>  		{ PKT_TX_MACSEC, PKT_TX_MACSEC, NULL },
>  		{ PKT_TX_SEC_OFFLOAD, PKT_TX_SEC_OFFLOAD, NULL },
>  	};
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 62740254d..1839cf2ed 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -210,6 +210,7 @@ extern "C" {
>  #define PKT_TX_TUNNEL_GENEVE  (0x4ULL << 45)
>  /**< TX packet with MPLS-in-UDP RFC 7510 header. */
>  #define PKT_TX_TUNNEL_MPLSINUDP (0x5ULL << 45)
> +#define PKT_TX_TUNNEL_VXLAN_GPE (0x6ULL << 45)
>  /* add new TX TUNNEL type here */
>  #define PKT_TX_TUNNEL_MASK    (0xFULL << 45)
>  
> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.c b/lib/librte_mbuf/rte_mbuf_ptype.c
> index 1feefacc6..49106c7df 100644
> --- a/lib/librte_mbuf/rte_mbuf_ptype.c
> +++ b/lib/librte_mbuf/rte_mbuf_ptype.c
> @@ -65,6 +65,7 @@ const char *rte_get_ptype_tunnel_name(uint32_t ptype)
>  	case RTE_PTYPE_TUNNEL_GTPU: return "TUNNEL_GTPU";
>  	case RTE_PTYPE_TUNNEL_ESP: return "TUNNEL_ESP";
>  	case RTE_PTYPE_TUNNEL_L2TP: return "TUNNEL_L2TP";
> +	case RTE_PTYPE_TUNNEL_VXLAN_GPE: return "TUNNEL_VXLAN_GPE";
>  	default: return "TUNNEL_UNKNOWN";
>  	}
>  }
> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h b/lib/librte_mbuf/rte_mbuf_ptype.h
> index b9a338110..7caf83312 100644
> --- a/lib/librte_mbuf/rte_mbuf_ptype.h
> +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
> @@ -423,6 +423,19 @@ extern "C" {
>   */
>  #define RTE_PTYPE_TUNNEL_L2TP               0x0000a000
>  /**
> + * VXLAN-GPE (VXLAN Generic Protocol Extension) tunneling packet type.
> + *
> + * Packet format:
> + * <'ether type'=0x0800
> + * | 'version'=4, 'protocol'=17
> + * | 'destination port'=4790>
> + * or,
> + * <'ether type'=0x86DD
> + * | 'version'=6, 'next header'=17
> + * | 'destination port'=4790>
> + */
> +#define RTE_PTYPE_TUNNEL_VXLAN_GPE          0x0000b000
> +/**
>   * Mask of tunneling packet types.
>   */
>  #define RTE_PTYPE_TUNNEL_MASK               0x0000f000
> diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h
> index a271d1c86..a64814179 100644
> --- a/lib/librte_net/rte_ether.h
> +++ b/lib/librte_net/rte_ether.h
> @@ -311,6 +311,31 @@ struct vxlan_hdr {
>  /**< VXLAN tunnel header length. */
>  
>  /**
> + * VXLAN-GPE protocol header.
> + * Contains the 8-bit flag, 8-bit next-protocol, 24-bit VXLAN Network
> + * Identifier and Reserved fields (16 bits and 8 bits).

Another reference to the current IETF draft here shouldn't hurt.

> + */
> +struct vxlan_gpe_hdr {
> +	uint8_t vx_flags; /**< flag (8). */
> +	uint8_t reserved[2]; /**< Reserved (16). */
> +	uint8_t proto; /**< next-protocol (8). */
> +	uint32_t vx_vni;   /**< VNI (24) + Reserved (8). */
> +} __attribute__((__packed__));
> +
> +/* VXLAN-GPE next protocol types */
> +#define VXLAN_GPE_TYPE_IPv4 1 /**< IPv4 Protocol. */
> +#define VXLAN_GPE_TYPE_IPv6 2 /**< IPv6 Protocol. */
> +#define VXLAN_GPE_TYPE_ETH  3 /**< Ethernet Protocol. */
> +#define VXLAN_GPE_TYPE_NSH  4 /**< NSH Protocol. */
> +#define VXLAN_GPE_TYPE_MPLS 5 /**< MPLS Protocol. */
> +#define VXLAN_GPE_TYPE_GBP  6 /**< GBP Protocol. */
> +#define VXLAN_GPE_TYPE_VBNG 7 /**< vBNG Protocol. */
> +
> +#define ETHER_VXLAN_GPE_HLEN (sizeof(struct udp_hdr) + \
> +			      sizeof(struct vxlan_gpe_hdr))
> +/**< VXLAN-GPE tunnel header length. */
> +
> +/**
>   * Extract VLAN tag information into mbuf
>   *
>   * Software version of VLAN stripping
> -- 
> 2.13.3

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 5%]

* Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
  2018-04-11  2:48  4%                   ` Jerin Jacob
@ 2018-04-11  8:40  0%                     ` Ananyev, Konstantin
  0 siblings, 0 replies; 200+ results
From: Ananyev, Konstantin @ 2018-04-11  8:40 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: Olivier Matz, dev, Richardson, Bruce

Hi Jerin,

> -----Original Message-----
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Wednesday, April 11, 2018 3:49 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> 
> -----Original Message-----
> > Date: Wed, 11 Apr 2018 00:33:14 +0000
> > From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > CC: Olivier Matz <olivier.matz@6wind.com>, "dev@dpdk.org" <dev@dpdk.org>,
> >  "Richardson, Bruce" <bruce.richardson@intel.com>
> > Subject: RE: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> >  structure
> >
> 
> Hi Konstantin,
> 
> >
> > > -----Original Message-----
> > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > Sent: Friday, April 6, 2018 2:26 AM
> > > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > Cc: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > >
> > > -----Original Message-----
> > >
> > > Hi Konstantin,
> > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > > > Sent: Thursday, April 5, 2018 9:02 AM
> > > > > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > > Cc: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > > > >
> > > > > -----Original Message-----
> > > > > > Date: Wed, 4 Apr 2018 23:38:41 +0000
> > > > > > From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> > > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>, Olivier Matz
> > > > > >  <olivier.matz@6wind.com>
> > > > > > CC: "dev@dpdk.org" <dev@dpdk.org>, "Richardson, Bruce"
> > > > > >  <bruce.richardson@intel.com>
> > > > > > Subject: RE: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > >  structure
> > > > > >
> > > > > > Hi lads,
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > > > > > Sent: Tuesday, April 3, 2018 5:43 PM
> > > > > > > To: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > Cc: dev@dpdk.org; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> > > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > > > > > >
> > > > > > > -----Original Message-----
> > > > > > > > Date: Tue, 3 Apr 2018 17:56:01 +0200
> > > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > > > > > > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > > > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > > >  structure
> > > > > > > > User-Agent: NeoMutt/20170113 (1.7.2)
> > > > > > > >
> > > > > > > > On Tue, Apr 03, 2018 at 09:07:04PM +0530, Jerin Jacob wrote:
> > > > > > > > > -----Original Message-----
> > > > > > > > > > Date: Tue, 3 Apr 2018 17:25:17 +0200
> > > > > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > > > > > > > > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > > > > > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > > > > >  structure
> > > > > > > > > > User-Agent: NeoMutt/20170113 (1.7.2)
> > > > > > > > > >
> > > > > > > > > > On Tue, Apr 03, 2018 at 08:37:23PM +0530, Jerin Jacob wrote:
> > > > > > > > > > > -----Original Message-----
> > > > > > > > > > > > Date: Tue, 3 Apr 2018 15:26:44 +0200
> > > > > > > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > > > > > > To: dev@dpdk.org
> > > > > > > > > > > > Subject: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > > > > > > >  structure
> > > > > > > > > > > > X-Mailer: git-send-email 2.11.0
> > > > > > > > > > > >
> > > > > > > > > > > > The initial objective of
> > > > > > > > > > > > commit d9f0d3a1ffd4 ("ring: remove split cacheline build setting")
> > > > > > > > > > > > was to add an empty cache line betwee, the producer and consumer
> > > > > > > > > > > > data (on platform with cache line size = 64B), preventing from
> > > > > > > > > > > > having them on adjacent cache lines.
> > > > > > > > > > > >
> > > > > > > > > > > > Following discussion on the mailing list, it appears that this
> > > > > > > > > > > > also imposes an alignment constraint that is not required.
> > > > > > > > > > > >
> > > > > > > > > > > > This patch removes the extra alignment constraint and adds the
> > > > > > > > > > > > empty cache lines using padding fields in the structure. The
> > > > > > > > > > > > size of rte_ring structure and the offset of the fields remain
> > > > > > > > > > > > the same on platforms with cache line size = 64B:
> > > > > > > > > > > >
> > > > > > > > > > > >   rte_ring = 384
> > > > > > > > > > > >   rte_ring.name = 0
> > > > > > > > > > > >   rte_ring.flags = 32
> > > > > > > > > > > >   rte_ring.memzone = 40
> > > > > > > > > > > >   rte_ring.size = 48
> > > > > > > > > > > >   rte_ring.mask = 52
> > > > > > > > > > > >   rte_ring.prod = 128
> > > > > > > > > > > >   rte_ring.cons = 256
> > > > > > > > > > > >
> > > > > > > > > > > > But it has an impact on platform where cache line size is 128B:
> > > > > > > > > > > >
> > > > > > > > > > > >   rte_ring = 384        -> 768
> > > > > > > > > > > >   rte_ring.name = 0
> > > > > > > > > > > >   rte_ring.flags = 32
> > > > > > > > > > > >   rte_ring.memzone = 40
> > > > > > > > > > > >   rte_ring.size = 48
> > > > > > > > > > > >   rte_ring.mask = 52
> > > > > > > > > > > >   rte_ring.prod = 128   -> 256
> > > > > > > > > > > >   rte_ring.cons = 256   -> 512
> > > > > > > > > > >
> > > > > > > > > > > Are we leaving TWO cacheline to make sure, HW prefetch don't load
> > > > > > > > > > > the adjust cacheline(consumer)?
> > > > > > > > > > >
> > > > > > > > > > > If so, Will it have impact on those machine where it is 128B Cache line
> > > > > > > > > > > and the HW prefetcher is not loading the next caching explicitly. Right?
> > > > > > > > > >
> > > > > > > > > > The impact on machines that have a 128B cache line is that an unused
> > > > > > > > > > cache line will be added between the producer and consumer data. I
> > > > > > > > > > expect that the impact is positive in case there is a hw prefetcher, and
> > > > > > > > > > null in case there is no such prefetcher.
> > > > > > > > >
> > > > > > > > > It is not NULL, Right? You are loosing 256B for each ring.
> > > > > > > >
> > > > > > > > Is it really that important?
> > > > > > >
> > > > > > > Pipeline or eventdev SW cases there could more rings in the system.
> > > > > > > I don't see any downside of having config option which is enabled
> > > > > > > default.
> > > > > > >
> > > > > > > In my view, such config options are good, as in embedded usecases, customers
> > > > > > > can really fine tune the target for the need. In server usecases, let the default
> > > > > > > of option be enabled, no harm.
> > > > > >
> > > > > > But that would mean we have to maintain two layouts for the rte_ring structure.
> > > > >
> > > > > Is there any downside of having two configurable layout? meaning, we are not
> > > > > transferring rte_ring structure over network etc(ie no interoperability
> > > > > issue). Does it really matter? May I am missing something here.
> > > >
> > > > My concern about potential compatibility problems we are introducing -
> > > > library build with 'y', while app wit 'n', or visa-versa.
> > >
> > > Got it.
> > >
> > > > I wonder are there really a lot of users who would be interested in such savings?
> > > > Could it happen that this new option would sit here unused and untested?
> > >
> > > OK. Fair enough. I have no objections for Olivier patch.
> > >
> > > As a suggestion, may be we can move "char name[RTE_MEMZONE_NAMESIZE]" in the
> > > struct rte_ring in place of " empty cacheline" to save 32B. No strong option
> > > though.
> >
> > That sounds like a good idea to me...
> > But I suppose in that case we need to move to that empty cacheline all fields that precede prod?
> 
> Even though those fields are read only in fastpath,I suppose moving all
> the fields(used in fast path) after prod, prefetch _cons_ cache line in cross
> CPU case.

Ah yes, you right, missed that.
Konstantin

> 
> I think, following comment can be addressed in code as it is an ABI change.
>         /*
>          * Note: this field kept the RTE_MEMZONE_NAMESIZE size due to
>          * ABI
>          * compatibility requirements, it could be changed to
>          * RTE_RING_NAMESIZE
>          * next time the ABI changes
>          */
>         char name[RTE_MEMZONE_NAMESIZE] __rte_cache_aligned; /**< Name of the ring. */
> 
> 
> > Otherwise there will be not much advantage in such move.
> >
> >

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-11  0:25  0%             ` Ananyev, Konstantin
@ 2018-04-11  5:33  0%               ` Yongseok Koh
  2018-04-11 11:39  0%                 ` Ananyev, Konstantin
  0 siblings, 1 reply; 200+ results
From: Yongseok Koh @ 2018-04-11  5:33 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Olivier Matz, Lu, Wenzhuo, Wu, Jingjing, Adrien Mazarguil,
	Nélio Laranjeiro, dev

On Tue, Apr 10, 2018 at 05:25:31PM -0700, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Yongseok Koh
> > Sent: Tuesday, April 10, 2018 2:59 AM
> > To: Olivier Matz <olivier.matz@6wind.com>
> > Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; adrien.mazarguil@6wind.com;
> > nelio.laranjeiro@6wind.com; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
> > 
> > On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > > Hi Yongseok,
> > >
> > > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > > Hi,
> > > > >
> > > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > > rte_pktmbuf_attach_at().
> > > > > >
> > > > > > Possible use-cases could be:
> > > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > > >
> > > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > > >
> > > > > I think the current API is already able to do what you want.
> > > > >
> > > > > 1/ Here is a mbuf m with its data
> > > > >
> > > > >                off
> > > > >                <-->
> > > > >                       len
> > > > >           +----+   <---------->
> > > > >           |    |
> > > > >         +-|----v----------------------+
> > > > >         | |    -----------------------|
> > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > >         |      -----------------------|
> > > > >         +-----------------------------+
> > > > >
> > > > >
> > > > > 2/ clone m:
> > > > >
> > > > >   c = rte_pktmbuf_alloc(pool);
> > > > >   rte_pktmbuf_attach(c, m);
> > > > >
> > > > >   Note that c has its own offset and length fields.
> > > > >
> > > > >
> > > > >                off
> > > > >                <-->
> > > > >                       len
> > > > >           +----+   <---------->
> > > > >           |    |
> > > > >         +-|----v----------------------+
> > > > >         | |    -----------------------|
> > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > >         |      -----------------------|
> > > > >         +------^----------------------+
> > > > >                |
> > > > >           +----+
> > > > > indirect  |
> > > > >         +-|---------------------------+
> > > > >         | |    -----------------------|
> > > > > c       | buf  |                     ||
> > > > >         |      -----------------------|
> > > > >         +-----------------------------+
> > > > >
> > > > >                 off    len
> > > > >                 <--><---------->
> > > > >
> > > > >
> > > > > 3/ remove some data from c without changing m
> > > > >
> > > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > > >
> > > > >
> > > > > Please let me know if it fits your needs.
> > > >
> > > > No, it doesn't.
> > > >
> > > > Trimming head and tail with the current APIs removes data and make the space
> > > > available. Adjusting packet head means giving more headroom, not shifting the
> > > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > > difference offsets in m,
> > > >
> > > > rte_pktmbuf_adj(c1, 10);
> > > > rte_pktmbuf_adj(c2, 20);
> > > >
> > > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > > wants to attach outer header, it will overwrite the headroom even though the
> > > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > > header should be linked by h1->next = c2.
> > >
> > > Yes, after these operations c1, c2 and m should become read-only. So, to
> > > prepend headers, another mbuf has to be inserted before as you suggest. It
> > > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > > length) that will:
> > >   - alloc and attach indirect mbuf for each segment of m that is
> > >     in the range [offset : length+offset].
> > >   - prepend an empty and writable mbuf for the headers
> > >
> > > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > > which actually shrink the headroom, this case can be properly handled.
> > >
> > > What do you mean by properly handled?
> > >
> > > Yes, prepending data or adding data in the indirect mbuf won't override
> > > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > > won't be protected.
> > >
> > > From an application point of view, indirect mbufs, or direct mbufs that
> > > have refcnt != 1, should be both considered as read-only because they
> > > may share their data. How an application can know if the data is shared
> > > or not?
> > >
> > > Maybe we need a flag to differentiate mbufs that are read-only
> > > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > > understanding is correct, you want to have indirect mbufs with RW data.
> > 
> > Agree that indirect mbuf must be treated as read-only, Then the current code is
> > enough to handle that use-case.
> > 
> > > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > > indirect referencing.
> 
> But just to make HW to RX multiple packets into one mbuf,
> data_off inside indirect mbuf should be enough, correct?
Right. Current max buffer len of mbuf is 64kB (16bits) but it is enough for mlx5
to reach to 100Gbps with 64B traffic (149Mpps). I made mlx5 HW put 16 packets in
a buffer. So, it needs ~32kB buffer. Having more bits in length fields would be
better but 16-bit is good enough to overcome the PCIe Gen3 bottleneck in order
to saturate the network link.

> As I understand, what you'd like to achieve with this new field -
> ability to manipulate packet boundaries after RX, probably at upper layer.
> As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
> indirect mbufs trying to modify same direct buffer.

I agree that there's an implication that indirect mbuf or mbuf having refcnt > 1
is read-only. What that means, all the entities which own such mbufs have to be
aware of that and keep the principle as DPDK can't enforce the rule and there
can't be such sanity check. In this sense, HW doesn't violate it because the
direct mbuf is injected to HW before indirection. When packets are written by
HW, PMD attaches indirect mbufs to the direct mbuf and deliver those to
application layer with freeing the original direct mbuf (decrement refcnt by 1).
So, HW doesn't touch the direct buffer once it reaches to upper layer. The
direct buffer will be freed and get available for reuse when all the attached
indirect mbufs are freed.

> Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
> Fields for indirect mbufs, straight after attach()?

Good point.
Actually that was my draft (Mellanox internal) version of this patch :-) But I
had to consider a case where priv_size is really given by user. Even though it
is less likely, but if original priv_size is quite big, it can't cover entire
buf_len. For this, I had to increase priv_size to 32-bit but adding another
16bit field (buf_off) looked more plausible.

Thanks for good comments,
Yongseok

> > > >
> > > > Does this make sense?
> > >
> > > I understand the need.
> > >
> > > Another option would be to make the mbuf->buffer point to an external
> > > buffer (not inside the direct mbuf). This would require to add a
> > > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > > a quick overview.
> > >
> > > [1]
> > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdpdksummit.com%2FArchive%2Fpdf%2F2016Userspace%2FDay01
> > -Session05-OlivierMatz-
> > Userspace2016.pdf&data=02%7C01%7Cyskoh%40mellanox.com%7Ca5405edb36e445e6540808d59e339a38%7Ca652971c7d2e4d9ba6a4d
> > 149256f461b%7C0%7C0%7C636588866861082855&sdata=llw%2BwiY5cC56naOUhBbIg8TKtfFN6VZcIRY5PV7VqZs%3D&reserved=0
> > >
> > > The advantage is that it does not require the large data to be inside a
> > > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > > allocated from a mempool). On the other hand, it is maybe more complex
> > > to implement compared to your solution.
> > 
> > I knew that you presented the slides and frankly, I had considered that option
> > at first. But even with that option, metadata to store refcnt should also be
> > allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> > end of the data segment. Even though it could have smaller metadata structure,
> > I just wanted to make full use of the existing framework because it is less
> > complex as you mentioned. Given that you presented the idea of external data
> > buffer in 2016 and there hasn't been many follow-up discussions/activities so
> > far, I thought the demand isn't so big yet thus I wanted to make this patch
> > simpler.  I personally think that we can take the idea of external data seg when
> > more demands come from users in the future as it would be a huge change and may
> > break current ABI/API. When the day comes, I'll gladly participate in the
> > discussions and write codes for it if I can be helpful.
> > 
> > Do you think this patch is okay for now?
> > 
> > 
> > Thanks for your comments,
> > Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
  @ 2018-04-11  2:48  4%                   ` Jerin Jacob
  2018-04-11  8:40  0%                     ` Ananyev, Konstantin
  0 siblings, 1 reply; 200+ results
From: Jerin Jacob @ 2018-04-11  2:48 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: Olivier Matz, dev, Richardson, Bruce

-----Original Message-----
> Date: Wed, 11 Apr 2018 00:33:14 +0000
> From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> CC: Olivier Matz <olivier.matz@6wind.com>, "dev@dpdk.org" <dev@dpdk.org>,
>  "Richardson, Bruce" <bruce.richardson@intel.com>
> Subject: RE: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
>  structure
> 

Hi Konstantin,

> 
> > -----Original Message-----
> > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > Sent: Friday, April 6, 2018 2:26 AM
> > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Cc: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > 
> > -----Original Message-----
> > 
> > Hi Konstantin,
> > 
> > >
> > > > -----Original Message-----
> > > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > > Sent: Thursday, April 5, 2018 9:02 AM
> > > > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > Cc: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > > >
> > > > -----Original Message-----
> > > > > Date: Wed, 4 Apr 2018 23:38:41 +0000
> > > > > From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>, Olivier Matz
> > > > >  <olivier.matz@6wind.com>
> > > > > CC: "dev@dpdk.org" <dev@dpdk.org>, "Richardson, Bruce"
> > > > >  <bruce.richardson@intel.com>
> > > > > Subject: RE: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > >  structure
> > > > >
> > > > > Hi lads,
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > > > > Sent: Tuesday, April 3, 2018 5:43 PM
> > > > > > To: Olivier Matz <olivier.matz@6wind.com>
> > > > > > Cc: dev@dpdk.org; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > > > > >
> > > > > > -----Original Message-----
> > > > > > > Date: Tue, 3 Apr 2018 17:56:01 +0200
> > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > > > > > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > >  structure
> > > > > > > User-Agent: NeoMutt/20170113 (1.7.2)
> > > > > > >
> > > > > > > On Tue, Apr 03, 2018 at 09:07:04PM +0530, Jerin Jacob wrote:
> > > > > > > > -----Original Message-----
> > > > > > > > > Date: Tue, 3 Apr 2018 17:25:17 +0200
> > > > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > > > > > > > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > > > > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > > > >  structure
> > > > > > > > > User-Agent: NeoMutt/20170113 (1.7.2)
> > > > > > > > >
> > > > > > > > > On Tue, Apr 03, 2018 at 08:37:23PM +0530, Jerin Jacob wrote:
> > > > > > > > > > -----Original Message-----
> > > > > > > > > > > Date: Tue, 3 Apr 2018 15:26:44 +0200
> > > > > > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > > > > > To: dev@dpdk.org
> > > > > > > > > > > Subject: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > > > > > >  structure
> > > > > > > > > > > X-Mailer: git-send-email 2.11.0
> > > > > > > > > > >
> > > > > > > > > > > The initial objective of
> > > > > > > > > > > commit d9f0d3a1ffd4 ("ring: remove split cacheline build setting")
> > > > > > > > > > > was to add an empty cache line betwee, the producer and consumer
> > > > > > > > > > > data (on platform with cache line size = 64B), preventing from
> > > > > > > > > > > having them on adjacent cache lines.
> > > > > > > > > > >
> > > > > > > > > > > Following discussion on the mailing list, it appears that this
> > > > > > > > > > > also imposes an alignment constraint that is not required.
> > > > > > > > > > >
> > > > > > > > > > > This patch removes the extra alignment constraint and adds the
> > > > > > > > > > > empty cache lines using padding fields in the structure. The
> > > > > > > > > > > size of rte_ring structure and the offset of the fields remain
> > > > > > > > > > > the same on platforms with cache line size = 64B:
> > > > > > > > > > >
> > > > > > > > > > >   rte_ring = 384
> > > > > > > > > > >   rte_ring.name = 0
> > > > > > > > > > >   rte_ring.flags = 32
> > > > > > > > > > >   rte_ring.memzone = 40
> > > > > > > > > > >   rte_ring.size = 48
> > > > > > > > > > >   rte_ring.mask = 52
> > > > > > > > > > >   rte_ring.prod = 128
> > > > > > > > > > >   rte_ring.cons = 256
> > > > > > > > > > >
> > > > > > > > > > > But it has an impact on platform where cache line size is 128B:
> > > > > > > > > > >
> > > > > > > > > > >   rte_ring = 384        -> 768
> > > > > > > > > > >   rte_ring.name = 0
> > > > > > > > > > >   rte_ring.flags = 32
> > > > > > > > > > >   rte_ring.memzone = 40
> > > > > > > > > > >   rte_ring.size = 48
> > > > > > > > > > >   rte_ring.mask = 52
> > > > > > > > > > >   rte_ring.prod = 128   -> 256
> > > > > > > > > > >   rte_ring.cons = 256   -> 512
> > > > > > > > > >
> > > > > > > > > > Are we leaving TWO cacheline to make sure, HW prefetch don't load
> > > > > > > > > > the adjust cacheline(consumer)?
> > > > > > > > > >
> > > > > > > > > > If so, Will it have impact on those machine where it is 128B Cache line
> > > > > > > > > > and the HW prefetcher is not loading the next caching explicitly. Right?
> > > > > > > > >
> > > > > > > > > The impact on machines that have a 128B cache line is that an unused
> > > > > > > > > cache line will be added between the producer and consumer data. I
> > > > > > > > > expect that the impact is positive in case there is a hw prefetcher, and
> > > > > > > > > null in case there is no such prefetcher.
> > > > > > > >
> > > > > > > > It is not NULL, Right? You are loosing 256B for each ring.
> > > > > > >
> > > > > > > Is it really that important?
> > > > > >
> > > > > > Pipeline or eventdev SW cases there could more rings in the system.
> > > > > > I don't see any downside of having config option which is enabled
> > > > > > default.
> > > > > >
> > > > > > In my view, such config options are good, as in embedded usecases, customers
> > > > > > can really fine tune the target for the need. In server usecases, let the default
> > > > > > of option be enabled, no harm.
> > > > >
> > > > > But that would mean we have to maintain two layouts for the rte_ring structure.
> > > >
> > > > Is there any downside of having two configurable layout? meaning, we are not
> > > > transferring rte_ring structure over network etc(ie no interoperability
> > > > issue). Does it really matter? May I am missing something here.
> > >
> > > My concern about potential compatibility problems we are introducing -
> > > library build with 'y', while app wit 'n', or visa-versa.
> > 
> > Got it.
> > 
> > > I wonder are there really a lot of users who would be interested in such savings?
> > > Could it happen that this new option would sit here unused and untested?
> > 
> > OK. Fair enough. I have no objections for Olivier patch.
> > 
> > As a suggestion, may be we can move "char name[RTE_MEMZONE_NAMESIZE]" in the
> > struct rte_ring in place of " empty cacheline" to save 32B. No strong option
> > though.
> 
> That sounds like a good idea to me...
> But I suppose in that case we need to move to that empty cacheline all fields that precede prod?

Even though those fields are read only in fastpath,I suppose moving all
the fields(used in fast path) after prod, prefetch _cons_ cache line in cross
CPU case.

I think, following comment can be addressed in code as it is an ABI change.
        /*
         * Note: this field kept the RTE_MEMZONE_NAMESIZE size due to
         * ABI
         * compatibility requirements, it could be changed to
         * RTE_RING_NAMESIZE
         * next time the ABI changes
         */
        char name[RTE_MEMZONE_NAMESIZE] __rte_cache_aligned; /**< Name of the ring. */


> Otherwise there will be not much advantage in such move.
> 
> 

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-10  1:59  3%           ` Yongseok Koh
@ 2018-04-11  0:25  0%             ` Ananyev, Konstantin
  2018-04-11  5:33  0%               ` Yongseok Koh
  0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2018-04-11  0:25 UTC (permalink / raw)
  To: Yongseok Koh, Olivier Matz
  Cc: Lu, Wenzhuo, Wu, Jingjing, adrien.mazarguil, nelio.laranjeiro, dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Yongseok Koh
> Sent: Tuesday, April 10, 2018 2:59 AM
> To: Olivier Matz <olivier.matz@6wind.com>
> Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; adrien.mazarguil@6wind.com;
> nelio.laranjeiro@6wind.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
> 
> On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > Hi Yongseok,
> >
> > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > Hi,
> > > >
> > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > rte_pktmbuf_attach_at().
> > > > >
> > > > > Possible use-cases could be:
> > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > >
> > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > >
> > > > I think the current API is already able to do what you want.
> > > >
> > > > 1/ Here is a mbuf m with its data
> > > >
> > > >                off
> > > >                <-->
> > > >                       len
> > > >           +----+   <---------->
> > > >           |    |
> > > >         +-|----v----------------------+
> > > >         | |    -----------------------|
> > > > m       | buf  |    XXXXXXXXXXX      ||
> > > >         |      -----------------------|
> > > >         +-----------------------------+
> > > >
> > > >
> > > > 2/ clone m:
> > > >
> > > >   c = rte_pktmbuf_alloc(pool);
> > > >   rte_pktmbuf_attach(c, m);
> > > >
> > > >   Note that c has its own offset and length fields.
> > > >
> > > >
> > > >                off
> > > >                <-->
> > > >                       len
> > > >           +----+   <---------->
> > > >           |    |
> > > >         +-|----v----------------------+
> > > >         | |    -----------------------|
> > > > m       | buf  |    XXXXXXXXXXX      ||
> > > >         |      -----------------------|
> > > >         +------^----------------------+
> > > >                |
> > > >           +----+
> > > > indirect  |
> > > >         +-|---------------------------+
> > > >         | |    -----------------------|
> > > > c       | buf  |                     ||
> > > >         |      -----------------------|
> > > >         +-----------------------------+
> > > >
> > > >                 off    len
> > > >                 <--><---------->
> > > >
> > > >
> > > > 3/ remove some data from c without changing m
> > > >
> > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > >
> > > >
> > > > Please let me know if it fits your needs.
> > >
> > > No, it doesn't.
> > >
> > > Trimming head and tail with the current APIs removes data and make the space
> > > available. Adjusting packet head means giving more headroom, not shifting the
> > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > difference offsets in m,
> > >
> > > rte_pktmbuf_adj(c1, 10);
> > > rte_pktmbuf_adj(c2, 20);
> > >
> > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > wants to attach outer header, it will overwrite the headroom even though the
> > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > header should be linked by h1->next = c2.
> >
> > Yes, after these operations c1, c2 and m should become read-only. So, to
> > prepend headers, another mbuf has to be inserted before as you suggest. It
> > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > length) that will:
> >   - alloc and attach indirect mbuf for each segment of m that is
> >     in the range [offset : length+offset].
> >   - prepend an empty and writable mbuf for the headers
> >
> > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > which actually shrink the headroom, this case can be properly handled.
> >
> > What do you mean by properly handled?
> >
> > Yes, prepending data or adding data in the indirect mbuf won't override
> > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > won't be protected.
> >
> > From an application point of view, indirect mbufs, or direct mbufs that
> > have refcnt != 1, should be both considered as read-only because they
> > may share their data. How an application can know if the data is shared
> > or not?
> >
> > Maybe we need a flag to differentiate mbufs that are read-only
> > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > understanding is correct, you want to have indirect mbufs with RW data.
> 
> Agree that indirect mbuf must be treated as read-only, Then the current code is
> enough to handle that use-case.
> 
> > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > indirect referencing.

But just to make HW to RX multiple packets into one mbuf,
data_off inside indirect mbuf should be enough, correct?
As I understand, what you'd like to achieve with this new field -
ability to manipulate packet boundaries after RX, probably at upper layer.
As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
indirect mbufs trying to modify same direct buffer.
Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
Fields for indirect mbufs, straight after attach()?
Konstantin

> > >
> > > Does this make sense?
> >
> > I understand the need.
> >
> > Another option would be to make the mbuf->buffer point to an external
> > buffer (not inside the direct mbuf). This would require to add a
> > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > a quick overview.
> >
> > [1]
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdpdksummit.com%2FArchive%2Fpdf%2F2016Userspace%2FDay01
> -Session05-OlivierMatz-
> Userspace2016.pdf&data=02%7C01%7Cyskoh%40mellanox.com%7Ca5405edb36e445e6540808d59e339a38%7Ca652971c7d2e4d9ba6a4d
> 149256f461b%7C0%7C0%7C636588866861082855&sdata=llw%2BwiY5cC56naOUhBbIg8TKtfFN6VZcIRY5PV7VqZs%3D&reserved=0
> >
> > The advantage is that it does not require the large data to be inside a
> > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > allocated from a mempool). On the other hand, it is maybe more complex
> > to implement compared to your solution.
> 
> I knew that you presented the slides and frankly, I had considered that option
> at first. But even with that option, metadata to store refcnt should also be
> allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> end of the data segment. Even though it could have smaller metadata structure,
> I just wanted to make full use of the existing framework because it is less
> complex as you mentioned. Given that you presented the idea of external data
> buffer in 2016 and there hasn't been many follow-up discussions/activities so
> far, I thought the demand isn't so big yet thus I wanted to make this patch
> simpler.  I personally think that we can take the idea of external data seg when
> more demands come from users in the future as it would be a huge change and may
> break current ABI/API. When the day comes, I'll gladly participate in the
> discussions and write codes for it if I can be helpful.
> 
> Do you think this patch is okay for now?
> 
> 
> Thanks for your comments,
> Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v6 0/4] ethdev: add per-PMD tuning of RxTx parmeters
  2018-04-10  9:43  4% ` [dpdk-dev] [PATCH v6 " Remy Horton
  2018-04-10  9:43  7%   ` [dpdk-dev] [PATCH v6 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
@ 2018-04-10 18:56  0%   ` Ferruh Yigit
  1 sibling, 0 replies; 200+ results
From: Ferruh Yigit @ 2018-04-10 18:56 UTC (permalink / raw)
  To: Remy Horton, dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

On 4/10/2018 10:43 AM, Remy Horton wrote:
> The optimal values of several transmission & reception related parameters,
> such as burst sizes, descriptor ring sizes, and number of queues, varies
> between different network interface devices. This patchset allows individual
> PMDs to specify their preferred parameter values, and if so indicated by an
> application, for them to be used automatically by the ethdev layer.
> 
> rte_eth_dev_configure() has been changed so that specifying zero for both
> nb_rx_q AND nb_tx_q causes it to use driver preferred values, and if these
> are not available, falls back to EAL defaults. Setting one (but not both)
> to zero does not cause the use of defaults, as having one of them zeroed is
> a valid setup.
> 
> This patchset includes per-PMD values for e1000 and i40e but it is expected
> that subsequent patchsets will cover other PMDs. A deprecation notice
> covering the API/ABI change is in place.
> 
> Changes in v6:
> * Updated/corrected testpmd documentation
> * Carried forward acks/review
> * Rebased to d218a4d060de
> 
> Changes in v5:
> * uint_16_t corrected to uint16_t
> 
> Changes in v4:
> * Added API/ABI change documentation
> * Rebased to 78f5a2e93d74
> 
> Changes in v3:
> * Changed formatting around new rte_eth_dev_info fields
> * Added Doxygen documentation to struct rte_eth_dev_portconf
> * Testpmd "port config all burst 0" and --burst=0 uses PMD 
>   Rx burst recommendations.
> * Added to release notes
> * Rebased to 8ea081f38161
> 
> Changes in v2:
> * Rebased to master
> * Removed fallback values from rte_eth_dev_info_get()
> * Added fallback values to rte_rte_[rt]x_queue_setup()
> * Added fallback values to rte_eth_dev_configure()
> * Corrected comment
> * Removed deprecation notice
> * Split RX and Tx into seperate structures
> * Changed parameter names
> 
> 
> Remy Horton (4):
>   ethdev: add support for PMD-tuned Tx/Rx parameters
>   net/e1000: add TxRx tuning parameters
>   net/i40e: add TxRx tuning parameters
>   testpmd: make use of per-PMD TxRx parameters

Series applied to dpdk-next-net/master, thanks.

(Thomas' ack added into ethdev patch)

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and action to flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (11 preceding siblings ...)
  2018-04-10 16:37  3%     ` [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to " Adrien Mazarguil
@ 2018-04-10 16:37  2%     ` Adrien Mazarguil
  2018-04-11 13:02  0%       ` Andrew Rybchenko
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:37 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z, Declan Doherty

RTE_FLOW_ACTION_TYPE_PORT_ID brings the ability to inject matching traffic
into a different device, as identified by its DPDK port ID.

This is normally only supported when the target port ID has some kind of
relationship with the port ID the flow rule is created against, such as
being exposed by a common physical device (e.g. a different port of an
Ethernet switch).

The converse pattern item, RTE_FLOW_ITEM_TYPE_PORT_ID, makes the resulting
flow rule match traffic whose origin is the specified port ID. Note that
specifying a port ID that differs from the one the flow rule is created
against is normally meaningless (if even accepted), but can make sense if
combined with the transfer attribute.

These must not be confused with their PHY_PORT counterparts, which refer to
physical ports using device-specific indices, but unlike PORT_ID are not
necessarily tied to DPDK port IDs.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Qi Zhang <qi.z.zhang@intel.com>
Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
Cc: Declan Doherty <declan.doherty@intel.com>

---

This patch provides the same functionality and supersedes Qi Zhang's
"ether: add flow action to redirect packet to a port" [1].

The main differences are:

- Action is named PORT_ID instead of PORT.
- Addition of a PORT_ID pattern item.
- More extensive documentation.
- Testpmd support.
- rte_flow_copy() support.

[1] http://dpdk.org/ml/archives/dev/2018-April/094648.html
---
 app/test-pmd/cmdline_flow.c                 | 57 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  2 +
 doc/guides/prog_guide/rte_flow.rst          | 48 ++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  9 ++++
 lib/librte_ether/rte_flow.c                 |  2 +
 lib/librte_ether/rte_flow.h                 | 56 +++++++++++++++++++++++
 6 files changed, 174 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index cc78b4f2c..fae3c4b12 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -89,6 +89,8 @@ enum index {
 	ITEM_VF_ID,
 	ITEM_PHY_PORT,
 	ITEM_PHY_PORT_INDEX,
+	ITEM_PORT_ID,
+	ITEM_PORT_ID_ID,
 	ITEM_RAW,
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -185,6 +187,9 @@ enum index {
 	ACTION_PHY_PORT,
 	ACTION_PHY_PORT_ORIGINAL,
 	ACTION_PHY_PORT_INDEX,
+	ACTION_PORT_ID,
+	ACTION_PORT_ID_ORIGINAL,
+	ACTION_PORT_ID_ID,
 	ACTION_METER,
 	ACTION_METER_ID,
 };
@@ -445,6 +450,7 @@ static const enum index next_item[] = {
 	ITEM_PF,
 	ITEM_VF,
 	ITEM_PHY_PORT,
+	ITEM_PORT_ID,
 	ITEM_RAW,
 	ITEM_ETH,
 	ITEM_VLAN,
@@ -491,6 +497,12 @@ static const enum index item_phy_port[] = {
 	ZERO,
 };
 
+static const enum index item_port_id[] = {
+	ITEM_PORT_ID_ID,
+	ITEM_NEXT,
+	ZERO,
+};
+
 static const enum index item_raw[] = {
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -627,6 +639,7 @@ static const enum index next_action[] = {
 	ACTION_PF,
 	ACTION_VF,
 	ACTION_PHY_PORT,
+	ACTION_PORT_ID,
 	ACTION_METER,
 	ZERO,
 };
@@ -668,6 +681,13 @@ static const enum index action_phy_port[] = {
 	ZERO,
 };
 
+static const enum index action_port_id[] = {
+	ACTION_PORT_ID_ORIGINAL,
+	ACTION_PORT_ID_ID,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static const enum index action_meter[] = {
 	ACTION_METER_ID,
 	ACTION_NEXT,
@@ -1084,6 +1104,20 @@ static const struct token token_list[] = {
 		.next = NEXT(item_phy_port, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_phy_port, index)),
 	},
+	[ITEM_PORT_ID] = {
+		.name = "port_id",
+		.help = "match traffic from/to a given DPDK port ID",
+		.priv = PRIV_ITEM(PORT_ID,
+				  sizeof(struct rte_flow_item_port_id)),
+		.next = NEXT(item_port_id),
+		.call = parse_vc,
+	},
+	[ITEM_PORT_ID_ID] = {
+		.name = "id",
+		.help = "DPDK port ID",
+		.next = NEXT(item_port_id, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_port_id, id)),
+	},
 	[ITEM_RAW] = {
 		.name = "raw",
 		.help = "match an arbitrary byte string",
@@ -1749,6 +1783,29 @@ static const struct token token_list[] = {
 					index)),
 		.call = parse_vc_conf,
 	},
+	[ACTION_PORT_ID] = {
+		.name = "port_id",
+		.help = "direct matching traffic to a given DPDK port ID",
+		.priv = PRIV_ACTION(PORT_ID,
+				    sizeof(struct rte_flow_action_port_id)),
+		.next = NEXT(action_port_id),
+		.call = parse_vc,
+	},
+	[ACTION_PORT_ID_ORIGINAL] = {
+		.name = "original",
+		.help = "use original DPDK port ID if possible",
+		.next = NEXT(action_port_id, NEXT_ENTRY(BOOLEAN)),
+		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_port_id,
+					   original, 1)),
+		.call = parse_vc_conf,
+	},
+	[ACTION_PORT_ID_ID] = {
+		.name = "id",
+		.help = "DPDK port ID",
+		.next = NEXT(action_port_id, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_port_id, id)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_METER] = {
 		.name = "meter",
 		.help = "meter the directed packets at given id",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index effb4ff81..4a273eff7 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -961,6 +961,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
+	MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
@@ -1059,6 +1060,7 @@ static const struct {
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
+	MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
 };
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a39c1e1b0..2fb8e9c3f 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -617,6 +617,36 @@ associated with a port_id should be retrieved by other means.
    | ``mask`` | ``index`` | zeroed to match any port index |
    +----------+-----------+--------------------------------+
 
+Item: ``PORT_ID``
+^^^^^^^^^^^^^^^^^
+
+Matches traffic originating from (ingress) or going to (egress) a given DPDK
+port ID.
+
+Normally only supported if the port ID in question is known by the
+underlying PMD and related to the device the flow rule is created against.
+
+This must not be confused with `Item: PHY_PORT`_ which refers to the
+physical port of a device, whereas `Item: PORT_ID`_ refers to a ``struct
+rte_eth_dev`` object on the application side (also known as "port
+representor" depending on the kind of underlying device).
+
+- Default ``mask`` matches the specified DPDK port ID.
+
+.. _table_rte_flow_item_port_id:
+
+.. table:: PORT_ID
+
+   +----------+----------+-----------------------------+
+   | Field    | Subfield | Value                       |
+   +==========+==========+=============================+
+   | ``spec`` | ``id``   | DPDK port ID                |
+   +----------+----------+-----------------------------+
+   | ``last`` | ``id``   | upper range value           |
+   +----------+----------+-----------------------------+
+   | ``mask`` | ``id``   | zeroed to match any port ID |
+   +----------+----------+-----------------------------+
+
 Data matching item types
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1453,6 +1483,24 @@ See `Item: PHY_PORT`_.
    | ``index``    | physical port index                 |
    +--------------+-------------------------------------+
 
+Action: ``PORT_ID``
+^^^^^^^^^^^^^^^^^^^
+Directs matching traffic to a given DPDK port ID.
+
+See `Item: PORT_ID`_.
+
+.. _table_rte_flow_action_port_id:
+
+.. table:: PORT_ID
+
+   +--------------+---------------------------------------+
+   | Field        | Value                                 |
+   +==============+=======================================+
+   | ``original`` | use original DPDK port ID if possible |
+   +--------------+---------------------------------------+
+   | ``id``       | DPDK port ID                          |
+   +--------------+---------------------------------------+
+
 Action: ``METER``
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 64d8dfddb..bfb5ad027 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3212,6 +3212,10 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``index {unsigned}``: physical port index.
 
+- ``port_id``: match traffic from/to a given DPDK port ID.
+
+  - ``id {unsigned}``: DPDK port ID.
+
 - ``raw``: match an arbitrary byte string.
 
   - ``relative {boolean}``: look for pattern after the previous item.
@@ -3428,6 +3432,11 @@ This section lists supported actions and their attributes, if any.
   - ``original {boolean}``: use original port index if possible.
   - ``index {unsigned}``: physical port index.
 
+- ``port_id``: direct matching traffic to a given DPDK port ID.
+
+  - ``original {boolean}``: use original DPDK port ID if possible.
+  - ``id {unsigned}``: DPDK port ID.
+
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index e0fd78dd5..3d8116ebd 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,6 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
+	MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
@@ -77,6 +78,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
+	MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
 };
 
 static int
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index c3ae0c6a8..29a3b26e3 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -180,6 +180,16 @@ enum rte_flow_item_type {
 	RTE_FLOW_ITEM_TYPE_PHY_PORT,
 
 	/**
+	 * [META]
+	 *
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * given DPDK port ID.
+	 *
+	 * See struct rte_flow_item_port_id.
+	 */
+	RTE_FLOW_ITEM_TYPE_PORT_ID,
+
+	/**
 	 * Matches a byte string of a given length at a given offset.
 	 *
 	 * See struct rte_flow_item_raw.
@@ -414,6 +424,32 @@ static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = {
 #endif
 
 /**
+ * RTE_FLOW_ITEM_TYPE_PORT_ID
+ *
+ * Matches traffic originating from (ingress) or going to (egress) a given
+ * DPDK port ID.
+ *
+ * Normally only supported if the port ID in question is known by the
+ * underlying PMD and related to the device the flow rule is created
+ * against.
+ *
+ * This must not be confused with @p PHY_PORT which refers to the physical
+ * port of a device, whereas @p PORT_ID refers to a struct rte_eth_dev
+ * object on the application side (also known as "port representor"
+ * depending on the kind of underlying device).
+ */
+struct rte_flow_item_port_id {
+	uint32_t id; /**< DPDK port ID. */
+};
+
+/** Default mask for RTE_FLOW_ITEM_TYPE_PORT_ID. */
+#ifndef __cplusplus
+static const struct rte_flow_item_port_id rte_flow_item_port_id_mask = {
+	.id = 0xffffffff,
+};
+#endif
+
+/**
  * RTE_FLOW_ITEM_TYPE_RAW
  *
  * Matches a byte string of a given length at a given offset.
@@ -997,6 +1033,13 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_PHY_PORT,
 
 	/**
+	 * Directs matching traffic to a given DPDK port ID.
+	 *
+	 * See struct rte_flow_action_port_id.
+	 */
+	RTE_FLOW_ACTION_TYPE_PORT_ID,
+
+	/**
 	 * Traffic metering and policing (MTR).
 	 *
 	 * See struct rte_flow_action_meter.
@@ -1134,6 +1177,19 @@ struct rte_flow_action_phy_port {
 };
 
 /**
+ * RTE_FLOW_ACTION_TYPE_PORT_ID
+ *
+ * Directs matching traffic to a given DPDK port ID.
+ *
+ * @see RTE_FLOW_ITEM_TYPE_PORT_ID
+ */
+struct rte_flow_action_port_id {
+	uint32_t original:1; /**< Use original DPDK port ID if possible. */
+	uint32_t reserved:31; /**< Reserved, must be zero. */
+	uint32_t id; /**< DPDK port ID. */
+};
+
+/**
  * RTE_FLOW_ACTION_TYPE_METER
  *
  * Traffic metering and policing (MTR).
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (10 preceding siblings ...)
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item " Adrien Mazarguil
@ 2018-04-10 16:37  3%     ` Adrien Mazarguil
  2018-04-11 13:00  0%       ` Andrew Rybchenko
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and " Adrien Mazarguil
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:37 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z

This patch adds the missing action counterpart to the PHY_PORT pattern
item, that is, the ability to directly inject matching traffic into a
physical port of the underlying device.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
 6 files changed, 84 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index a0dbec119..cc78b4f2c 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -182,6 +182,9 @@ enum index {
 	ACTION_VF,
 	ACTION_VF_ORIGINAL,
 	ACTION_VF_ID,
+	ACTION_PHY_PORT,
+	ACTION_PHY_PORT_ORIGINAL,
+	ACTION_PHY_PORT_INDEX,
 	ACTION_METER,
 	ACTION_METER_ID,
 };
@@ -623,6 +626,7 @@ static const enum index next_action[] = {
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
+	ACTION_PHY_PORT,
 	ACTION_METER,
 	ZERO,
 };
@@ -657,6 +661,13 @@ static const enum index action_vf[] = {
 	ZERO,
 };
 
+static const enum index action_phy_port[] = {
+	ACTION_PHY_PORT_ORIGINAL,
+	ACTION_PHY_PORT_INDEX,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static const enum index action_meter[] = {
 	ACTION_METER_ID,
 	ACTION_NEXT,
@@ -1714,6 +1725,30 @@ static const struct token token_list[] = {
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
 		.call = parse_vc_conf,
 	},
+	[ACTION_PHY_PORT] = {
+		.name = "phy_port",
+		.help = "direct packets to physical port index",
+		.priv = PRIV_ACTION(PHY_PORT,
+				    sizeof(struct rte_flow_action_phy_port)),
+		.next = NEXT(action_phy_port),
+		.call = parse_vc,
+	},
+	[ACTION_PHY_PORT_ORIGINAL] = {
+		.name = "original",
+		.help = "use original port index if possible",
+		.next = NEXT(action_phy_port, NEXT_ENTRY(BOOLEAN)),
+		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_phy_port,
+					   original, 1)),
+		.call = parse_vc_conf,
+	},
+	[ACTION_PHY_PORT_INDEX] = {
+		.name = "index",
+		.help = "physical port index",
+		.next = NEXT(action_phy_port, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_phy_port,
+					index)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_METER] = {
 		.name = "meter",
 		.help = "meter the directed packets at given id",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 9f968919e..effb4ff81 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1058,6 +1058,7 @@ static const struct {
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
+	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
 };
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 4e053c24b..a39c1e1b0 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1433,6 +1433,26 @@ See `Item: VF`_.
    | ``id``       | VF ID                          |
    +--------------+--------------------------------+
 
+Action: ``PHY_PORT``
+^^^^^^^^^^^^^^^^^^^^
+
+Directs matching traffic to a given physical port index of the underlying
+device.
+
+See `Item: PHY_PORT`_.
+
+.. _table_rte_flow_action_phy_port:
+
+.. table:: PHY_PORT
+
+   +--------------+-------------------------------------+
+   | Field        | Value                               |
+   +==============+=====================================+
+   | ``original`` | use original port index if possible |
+   +--------------+-------------------------------------+
+   | ``index``    | physical port index                 |
+   +--------------+-------------------------------------+
+
 Action: ``METER``
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index a2bbd1930..64d8dfddb 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3423,6 +3423,11 @@ This section lists supported actions and their attributes, if any.
   - ``original {boolean}``: use original VF ID if possible.
   - ``id {unsigned}``: VF ID.
 
+- ``phy_port``: direct packets to physical port index.
+
+  - ``original {boolean}``: use original port index if possible.
+  - ``index {unsigned}``: physical port index.
+
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 6d4d7f5ed..e0fd78dd5 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -76,6 +76,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
+	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
 };
 
 static int
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index ed211a8eb..c3ae0c6a8 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -989,6 +989,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VF,
 
 	/**
+	 * Directs packets to a given physical port index of the underlying
+	 * device.
+	 *
+	 * See struct rte_flow_action_phy_port.
+	 */
+	RTE_FLOW_ACTION_TYPE_PHY_PORT,
+
+	/**
 	 * Traffic metering and policing (MTR).
 	 *
 	 * See struct rte_flow_action_meter.
@@ -1112,6 +1120,20 @@ struct rte_flow_action_vf {
 };
 
 /**
+ * RTE_FLOW_ACTION_TYPE_PHY_PORT
+ *
+ * Directs packets to a given physical port index of the underlying
+ * device.
+ *
+ * @see RTE_FLOW_ITEM_TYPE_PHY_PORT
+ */
+struct rte_flow_action_phy_port {
+	uint32_t original:1; /**< Use original port index if possible. */
+	uint32_t reserved:31; /**< Reserved, must be zero. */
+	uint32_t index; /**< Physical port index. */
+};
+
+/**
  * RTE_FLOW_ACTION_TYPE_METER
  *
  * Traffic metering and policing (MTR).
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item in flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (9 preceding siblings ...)
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
@ 2018-04-10 16:37  2%     ` Adrien Mazarguil
  2018-04-11 12:57  0%       ` Andrew Rybchenko
  2018-04-10 16:37  3%     ` [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to " Adrien Mazarguil
                       ` (2 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:37 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

While RTE_FLOW_ITEM_TYPE_PORT refers to physical ports of the underlying
device using specific identifiers, these are often confused with DPDK port
IDs exposed to applications in the global name space.

Since this pattern item is seldom used, rename it RTE_FLOW_ITEM_PHY_PORT
for better clarity.

No ABI impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 27 +++++++++++----------
 app/test-pmd/config.c                       |  2 +-
 doc/guides/prog_guide/rte_flow.rst          | 22 ++++++++---------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 +-
 lib/librte_ether/rte_flow.c                 |  2 +-
 lib/librte_ether/rte_flow.h                 | 31 ++++++++++--------------
 6 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index af0631036..a0dbec119 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -87,8 +87,8 @@ enum index {
 	ITEM_PF,
 	ITEM_VF,
 	ITEM_VF_ID,
-	ITEM_PORT,
-	ITEM_PORT_INDEX,
+	ITEM_PHY_PORT,
+	ITEM_PHY_PORT_INDEX,
 	ITEM_RAW,
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -441,7 +441,7 @@ static const enum index next_item[] = {
 	ITEM_ANY,
 	ITEM_PF,
 	ITEM_VF,
-	ITEM_PORT,
+	ITEM_PHY_PORT,
 	ITEM_RAW,
 	ITEM_ETH,
 	ITEM_VLAN,
@@ -482,8 +482,8 @@ static const enum index item_vf[] = {
 	ZERO,
 };
 
-static const enum index item_port[] = {
-	ITEM_PORT_INDEX,
+static const enum index item_phy_port[] = {
+	ITEM_PHY_PORT_INDEX,
 	ITEM_NEXT,
 	ZERO,
 };
@@ -1059,18 +1059,19 @@ static const struct token token_list[] = {
 		.next = NEXT(item_vf, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_vf, id)),
 	},
-	[ITEM_PORT] = {
-		.name = "port",
-		.help = "device-specific physical port index to use",
-		.priv = PRIV_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-		.next = NEXT(item_port),
+	[ITEM_PHY_PORT] = {
+		.name = "phy_port",
+		.help = "match traffic from/to a specific physical port",
+		.priv = PRIV_ITEM(PHY_PORT,
+				  sizeof(struct rte_flow_item_phy_port)),
+		.next = NEXT(item_phy_port),
 		.call = parse_vc,
 	},
-	[ITEM_PORT_INDEX] = {
+	[ITEM_PHY_PORT_INDEX] = {
 		.name = "index",
 		.help = "physical port index",
-		.next = NEXT(item_port, NEXT_ENTRY(UNSIGNED), item_param),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_port, index)),
+		.next = NEXT(item_phy_port, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_phy_port, index)),
 	},
 	[ITEM_RAW] = {
 		.name = "raw",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 49ef87782..9f968919e 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -960,7 +960,7 @@ static const struct {
 	MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)),
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
-	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
+	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a0a124aa2..4e053c24b 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -195,8 +195,8 @@ When supported, this effectively enables an application to reroute traffic
 not necessarily intended for it (e.g. coming from or addressed to different
 physical ports, VFs or applications) at the device level.
 
-It complements the behavior of some pattern items such as `Item: PORT`_ and
-is meaningless without them.
+It complements the behavior of some pattern items such as `Item: PHY_PORT`_
+and is meaningless without them.
 
 When transferring flow rules, **ingress** and **egress** attributes
 (`Attribute: Traffic direction`_) keep their original meaning, as if
@@ -583,15 +583,15 @@ separate entities, should be addressed through their own DPDK port IDs.
    | ``mask`` | ``id``   | zeroed to match any VF ID |
    +----------+----------+---------------------------+
 
-Item: ``PORT``
-^^^^^^^^^^^^^^
+Item: ``PHY_PORT``
+^^^^^^^^^^^^^^^^^^
 
-Matches packets coming from the specified physical port of the underlying
-device.
+Matches traffic originating from (ingress) or going to (egress) a physical
+port of the underlying device.
 
-The first PORT item overrides the physical port normally associated with the
-specified DPDK input port (port_id). This item can be provided several times
-to match additional physical ports.
+The first PHY_PORT item overrides the physical port normally associated with
+the specified DPDK input port (port_id). This item can be provided several
+times to match additional physical ports.
 
 Note that physical ports are not necessarily tied to DPDK input ports
 (port_id) when those are not under DPDK control. Possible values are
@@ -603,9 +603,9 @@ associated with a port_id should be retrieved by other means.
 
 - Default ``mask`` matches any port index.
 
-.. _table_rte_flow_item_port:
+.. _table_rte_flow_item_phy_port:
 
-.. table:: PORT
+.. table:: PHY_PORT
 
    +----------+-----------+--------------------------------+
    | Field    | Subfield  | Value                          |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index af37c3d82..a2bbd1930 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3208,7 +3208,7 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``id {unsigned}``: VF ID.
 
-- ``port``: device-specific physical port index to use.
+- ``phy_port``: match traffic from/to a specific physical port.
 
   - ``index {unsigned}``: physical port index.
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 1f247d656..6d4d7f5ed 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -38,7 +38,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)),
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
-	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
+	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index f3db2ec01..ed211a8eb 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -84,7 +84,7 @@ struct rte_flow_attr {
 	 * applications) at the device level.
 	 *
 	 * It complements the behavior of some pattern items such as
-	 * RTE_FLOW_ITEM_TYPE_PORT and is meaningless without them.
+	 * RTE_FLOW_ITEM_TYPE_PHY_PORT and is meaningless without them.
 	 *
 	 * When transferring flow rules, ingress and egress attributes keep
 	 * their original meaning, as if processing traffic emitted or
@@ -172,17 +172,12 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets coming from the specified physical port of the
-	 * underlying device.
-	 *
-	 * The first PORT item overrides the physical port normally
-	 * associated with the specified DPDK input port (port_id). This
-	 * item can be provided several times to match additional physical
-	 * ports.
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * physical port of the underlying device.
 	 *
-	 * See struct rte_flow_item_port.
+	 * See struct rte_flow_item_phy_port.
 	 */
-	RTE_FLOW_ITEM_TYPE_PORT,
+	RTE_FLOW_ITEM_TYPE_PHY_PORT,
 
 	/**
 	 * Matches a byte string of a given length at a given offset.
@@ -388,13 +383,13 @@ static const struct rte_flow_item_vf rte_flow_item_vf_mask = {
 #endif
 
 /**
- * RTE_FLOW_ITEM_TYPE_PORT
+ * RTE_FLOW_ITEM_TYPE_PHY_PORT
  *
- * Matches packets coming from the specified physical port of the underlying
- * device.
+ * Matches traffic originating from (ingress) or going to (egress) a
+ * physical port of the underlying device.
  *
- * The first PORT item overrides the physical port normally associated with
- * the specified DPDK input port (port_id). This item can be provided
+ * The first PHY_PORT item overrides the physical port normally associated
+ * with the specified DPDK input port (port_id). This item can be provided
  * several times to match additional physical ports.
  *
  * Note that physical ports are not necessarily tied to DPDK input ports
@@ -407,13 +402,13 @@ static const struct rte_flow_item_vf rte_flow_item_vf_mask = {
  *
  * A zeroed mask can be used to match any port index.
  */
-struct rte_flow_item_port {
+struct rte_flow_item_phy_port {
 	uint32_t index; /**< Physical port index. */
 };
 
-/** Default mask for RTE_FLOW_ITEM_TYPE_PORT. */
+/** Default mask for RTE_FLOW_ITEM_TYPE_PHY_PORT. */
 #ifndef __cplusplus
-static const struct rte_flow_item_port rte_flow_item_port_mask = {
+static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = {
 	.index = 0x00000000,
 };
 #endif
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 13/16] ethdev: update behavior of VF/PF in flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (8 preceding siblings ...)
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
@ 2018-04-10 16:37  2%     ` Adrien Mazarguil
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item " Adrien Mazarguil
                       ` (3 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:37 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Somnath Kotur, Beilei Xing, Qi Zhang

Contrary to all other pattern items, these are inconsistently documented as
affecting traffic instead of simply matching its origin, without provision
for the latter.

This commit clarifies documentation and updates PMDs since the original
behavior now has to be explicitly requested using the new transfer
attribute.

It breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

Impacted PMDs are bnxt and i40e, for which the VF pattern item is now only
supported when a transfer attribute is also present.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Somnath Kotur <somnath.kotur@broadcom.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 12 +++---
 doc/guides/prog_guide/rte_flow.rst          | 36 +++++++++---------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 12 +++---
 drivers/net/bnxt/bnxt_filter.c              | 22 ++++++-----
 drivers/net/i40e/i40e_flow.c                | 23 +++++++-----
 lib/librte_ether/rte_flow.h                 | 47 ++++++++++--------------
 6 files changed, 77 insertions(+), 75 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index a06f3f82b..af0631036 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -1041,21 +1041,21 @@ static const struct token token_list[] = {
 	},
 	[ITEM_PF] = {
 		.name = "pf",
-		.help = "match packets addressed to the physical function",
+		.help = "match traffic from/to the physical function",
 		.priv = PRIV_ITEM(PF, 0),
 		.next = NEXT(NEXT_ENTRY(ITEM_NEXT)),
 		.call = parse_vc,
 	},
 	[ITEM_VF] = {
 		.name = "vf",
-		.help = "match packets addressed to a virtual function ID",
+		.help = "match traffic from/to a virtual function ID",
 		.priv = PRIV_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 		.next = NEXT(item_vf),
 		.call = parse_vc,
 	},
 	[ITEM_VF_ID] = {
 		.name = "id",
-		.help = "destination VF ID",
+		.help = "VF ID",
 		.next = NEXT(item_vf, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_vf, id)),
 	},
@@ -1686,14 +1686,14 @@ static const struct token token_list[] = {
 	},
 	[ACTION_PF] = {
 		.name = "pf",
-		.help = "redirect packets to physical device function",
+		.help = "direct traffic to physical function",
 		.priv = PRIV_ACTION(PF, 0),
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
 	[ACTION_VF] = {
 		.name = "vf",
-		.help = "redirect packets to virtual device function",
+		.help = "direct traffic to a virtual function ID",
 		.priv = PRIV_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 		.next = NEXT(action_vf),
 		.call = parse_vc,
@@ -1708,7 +1708,7 @@ static const struct token token_list[] = {
 	},
 	[ACTION_VF_ID] = {
 		.name = "id",
-		.help = "VF ID to redirect packets to",
+		.help = "VF ID",
 		.next = NEXT(action_vf, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
 		.call = parse_vc_conf,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 550a4c95b..a0a124aa2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -528,15 +528,12 @@ Usage example, matching non-TCPv4 packets only:
 Item: ``PF``
 ^^^^^^^^^^^^
 
-Matches packets addressed to the physical function of the device.
+Matches traffic originating from (ingress) or going to (egress) the physical
+function of the current device.
 
-If the underlying device function differs from the one that would normally
-receive the matched traffic, specifying this item prevents it from reaching
-that device unless the flow rule contains a `Action: PF`_. Packets are not
-duplicated between device instances by default.
+If supported, should work even if the physical function is not managed by
+the application and thus not associated with a DPDK port ID.
 
-- Likely to return an error or never match any traffic if applied to a VF
-  device.
 - Can be combined with any number of `Item: VF`_ to match both PF and VF
   traffic.
 - ``spec``, ``last`` and ``mask`` must not be set.
@@ -558,15 +555,15 @@ duplicated between device instances by default.
 Item: ``VF``
 ^^^^^^^^^^^^
 
-Matches packets addressed to a virtual function ID of the device.
+Matches traffic originating from (ingress) or going to (egress) a given
+virtual function of the current device.
 
-If the underlying device function differs from the one that would normally
-receive the matched traffic, specifying this item prevents it from reaching
-that device unless the flow rule contains a `Action: VF`_. Packets are not
-duplicated between device instances by default.
+If supported, should work even if the virtual function is not managed by the
+application and thus not associated with a DPDK port ID.
+
+Note this pattern item does not match VF representors traffic which, as
+separate entities, should be addressed through their own DPDK port IDs.
 
-- Likely to return an error or never match any traffic if this causes a VF
-  device to match traffic addressed to a different VF.
 - Can be specified multiple times to match traffic addressed to several VF
   IDs.
 - Can be combined with a PF item to match both PF and VF traffic.
@@ -1395,7 +1392,10 @@ only matching traffic goes through.
 Action: ``PF``
 ^^^^^^^^^^^^^^
 
-Redirects packets to the physical function (PF) of the current device.
+Directs matching traffic to the physical function (PF) of the current
+device.
+
+See `Item: PF`_.
 
 - No configurable properties.
 
@@ -1412,13 +1412,15 @@ Redirects packets to the physical function (PF) of the current device.
 Action: ``VF``
 ^^^^^^^^^^^^^^
 
-Redirects packets to a virtual function (VF) of the current device.
+Directs matching traffic to a given virtual function of the current device.
 
 Packets matched by a VF pattern item can be redirected to their original VF
 ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
+See `Item: VF`_.
+
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1428,7 +1430,7 @@ rule or if packets are not addressed to a VF in the first place.
    +==============+================================+
    | ``original`` | use original VF ID if possible |
    +--------------+--------------------------------+
-   | ``vf``       | VF ID to redirect packets to   |
+   | ``id``       | VF ID                          |
    +--------------+--------------------------------+
 
 Action: ``METER``
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 0bf6c33c9..af37c3d82 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3202,11 +3202,11 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``num {unsigned}``: number of layers covered.
 
-- ``pf``: match packets addressed to the physical function.
+- ``pf``: match traffic from/to the physical function.
 
-- ``vf``: match packets addressed to a virtual function ID.
+- ``vf``: match traffic from/to a virtual function ID.
 
-  - ``id {unsigned}``: destination VF ID.
+  - ``id {unsigned}``: VF ID.
 
 - ``port``: device-specific physical port index to use.
 
@@ -3416,12 +3416,12 @@ This section lists supported actions and their attributes, if any.
 
   - ``queues [{unsigned} [...]] end``: queue indices to use.
 
-- ``pf``: redirect packets to physical device function.
+- ``pf``: direct traffic to physical function.
 
-- ``vf``: redirect packets to virtual device function.
+- ``vf``: direct traffic to a virtual function ID.
 
   - ``original {boolean}``: use original VF ID if possible.
-  - ``id {unsigned}``: VF ID to redirect packets to.
+  - ``id {unsigned}``: VF ID.
 
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index bd166370a..f964b5ea4 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -275,6 +275,7 @@ bnxt_filter_type_check(const struct rte_flow_item pattern[],
 
 static int
 bnxt_validate_and_parse_flow_type(struct bnxt *bp,
+				  const struct rte_flow_attr *attr,
 				  const struct rte_flow_item pattern[],
 				  struct rte_flow_error *error,
 				  struct bnxt_filter_info *filter)
@@ -699,6 +700,16 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 				return -rte_errno;
 			}
 
+			if (!attr->transfer) {
+				rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ITEM,
+					   item,
+					   "Matching VF traffic without"
+					   " affecting it (transfer attribute)"
+					   " is unsupported");
+				return -rte_errno;
+			}
+
 			filter->mirror_vnic_id =
 			dflt_vnic = bnxt_hwrm_func_qcfg_vf_dflt_vnic_id(bp, vf);
 			if (dflt_vnic < 0) {
@@ -746,14 +757,6 @@ bnxt_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
-	if (attr->transfer) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
-				   attr, "No support for transfer.");
-		return -rte_errno;
-	}
-
-	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -833,7 +836,8 @@ bnxt_validate_and_parse_flow(struct rte_eth_dev *dev,
 		goto ret;
 	}
 
-	rc = bnxt_validate_and_parse_flow_type(bp, pattern, error, filter);
+	rc = bnxt_validate_and_parse_flow_type(bp, attr, pattern, error,
+					       filter);
 	if (rc != 0)
 		goto ret;
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index b004357f1..b0aee0ef7 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -54,6 +54,7 @@ static int i40e_flow_parse_ethertype_action(struct rte_eth_dev *dev,
 				    struct rte_flow_error *error,
 				    struct rte_eth_ethertype_filter *filter);
 static int i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
+					const struct rte_flow_attr *attr,
 					const struct rte_flow_item *pattern,
 					struct rte_flow_error *error,
 					struct i40e_fdir_filter_conf *filter);
@@ -1918,14 +1919,6 @@ i40e_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
-	if (attr->transfer) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
-				   attr, "No support for transfer.");
-		return -rte_errno;
-	}
-
-	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -2429,6 +2422,7 @@ i40e_flow_fdir_get_pctype_value(struct i40e_pf *pf,
  */
 static int
 i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
+			     const struct rte_flow_attr *attr,
 			     const struct rte_flow_item *pattern,
 			     struct rte_flow_error *error,
 			     struct i40e_fdir_filter_conf *filter)
@@ -2966,6 +2960,16 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 			break;
 		case RTE_FLOW_ITEM_TYPE_VF:
 			vf_spec = item->spec;
+			if (!attr->transfer) {
+				rte_flow_error_set(error, ENOTSUP,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "Matching VF traffic"
+						   " without affecting it"
+						   " (transfer attribute)"
+						   " is unsupported");
+				return -rte_errno;
+			}
 			filter->input.flow_ext.is_vf = 1;
 			filter->input.flow_ext.dst_id = vf_spec->id;
 			if (filter->input.flow_ext.is_vf &&
@@ -3128,7 +3132,8 @@ i40e_flow_parse_fdir_filter(struct rte_eth_dev *dev,
 		&filter->fdir_filter;
 	int ret;
 
-	ret = i40e_flow_parse_fdir_pattern(dev, pattern, error, fdir_filter);
+	ret = i40e_flow_parse_fdir_pattern(dev, attr, pattern, error,
+					   fdir_filter);
 	if (ret)
 		return ret;
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index fc7df68d3..f3db2ec01 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -152,13 +152,8 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets addressed to the physical function of the device.
-	 *
-	 * If the underlying device function differs from the one that would
-	 * normally receive the matched traffic, specifying this item
-	 * prevents it from reaching that device unless the flow rule
-	 * contains a PF action. Packets are not duplicated between device
-	 * instances by default.
+	 * Matches traffic originating from (ingress) or going to (egress)
+	 * the physical function of the current device.
 	 *
 	 * No associated specification structure.
 	 */
@@ -167,13 +162,8 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets addressed to a virtual function ID of the device.
-	 *
-	 * If the underlying device function differs from the one that would
-	 * normally receive the matched traffic, specifying this item
-	 * prevents it from reaching that device unless the flow rule
-	 * contains a VF action. Packets are not duplicated between device
-	 * instances by default.
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * given virtual function of the current device.
 	 *
 	 * See struct rte_flow_item_vf.
 	 */
@@ -371,15 +361,15 @@ static const struct rte_flow_item_any rte_flow_item_any_mask = {
 /**
  * RTE_FLOW_ITEM_TYPE_VF
  *
- * Matches packets addressed to a virtual function ID of the device.
+ * Matches traffic originating from (ingress) or going to (egress) a given
+ * virtual function of the current device.
  *
- * If the underlying device function differs from the one that would
- * normally receive the matched traffic, specifying this item prevents it
- * from reaching that device unless the flow rule contains a VF
- * action. Packets are not duplicated between device instances by default.
+ * If supported, should work even if the virtual function is not managed by
+ * the application and thus not associated with a DPDK port ID.
+ *
+ * Note this pattern item does not match VF representors traffic which, as
+ * separate entities, should be addressed through their own DPDK port IDs.
  *
- * - Likely to return an error or never match any traffic if this causes a
- *   VF device to match traffic addressed to a different VF.
  * - Can be specified multiple times to match traffic addressed to several
  *   VF IDs.
  * - Can be combined with a PF item to match both PF and VF traffic.
@@ -387,7 +377,7 @@ static const struct rte_flow_item_any rte_flow_item_any_mask = {
  * A zeroed mask can be used to match any VF ID.
  */
 struct rte_flow_item_vf {
-	uint32_t id; /**< Destination VF ID. */
+	uint32_t id; /**< VF ID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_VF. */
@@ -988,16 +978,16 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_RSS,
 
 	/**
-	 * Redirects packets to the physical function (PF) of the current
-	 * device.
+	 * Directs matching traffic to the physical function (PF) of the
+	 * current device.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PF,
 
 	/**
-	 * Redirects packets to the virtual function (VF) of the current
-	 * device with the specified ID.
+	 * Directs matching traffic to a given virtual function of the
+	 * current device.
 	 *
 	 * See struct rte_flow_action_vf.
 	 */
@@ -1111,7 +1101,8 @@ struct rte_flow_action_rss {
 /**
  * RTE_FLOW_ACTION_TYPE_VF
  *
- * Redirects packets to a virtual function (VF) of the current device.
+ * Directs matching traffic to a given virtual function of the current
+ * device.
  *
  * Packets matched by a VF pattern item can be redirected to their original
  * VF ID instead of the specified one. This parameter may not be available
@@ -1122,7 +1113,7 @@ struct rte_flow_action_rss {
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
 	uint32_t reserved:31; /**< Reserved, must be zero. */
-	uint32_t id; /**< VF ID to redirect packets to. */
+	uint32_t id; /**< VF ID. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 12/16] ethdev: add transfer attribute to flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (7 preceding siblings ...)
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
@ 2018-04-10 16:37  2%     ` Adrien Mazarguil
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
                       ` (4 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:37 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Andrew Rybchenko

This new attribute enables applications to create flow rules that do not
simply match traffic whose origin is specified in the pattern (e.g. some
non-default physical port or VF), but actively affect it by applying the
flow rule at the lowest possible level in the underlying device.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>

---

v3 changes:

Clarified definition for ingress and egress following Andrew's comment on
subsequent patch.

[1] http://dpdk.org/ml/archives/dev/2018-April/095961.html
---
 app/test-pmd/cmdline_flow.c                 | 11 +++++
 app/test-pmd/config.c                       |  6 ++-
 doc/guides/prog_guide/rte_flow.rst          | 26 +++++++++++-
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 11 ++---
 drivers/net/bnxt/bnxt_filter.c              |  8 ++++
 drivers/net/e1000/igb_flow.c                | 44 ++++++++++++++++++++
 drivers/net/enic/enic_flow.c                |  6 +++
 drivers/net/i40e/i40e_flow.c                |  8 ++++
 drivers/net/ixgbe/ixgbe_flow.c              | 53 ++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_flow.c                |  4 ++
 drivers/net/mlx5/mlx5_flow.c                |  7 ++++
 drivers/net/mvpp2/mrvl_flow.c               |  6 +++
 drivers/net/sfc/sfc_flow.c                  |  6 +++
 drivers/net/tap/tap_flow.c                  |  6 +++
 lib/librte_ether/rte_flow.h                 | 22 +++++++++-
 15 files changed, 215 insertions(+), 9 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 49217d5bc..a06f3f82b 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -69,6 +69,7 @@ enum index {
 	PRIORITY,
 	INGRESS,
 	EGRESS,
+	TRANSFER,
 
 	/* Validate/create pattern. */
 	PATTERN,
@@ -407,6 +408,7 @@ static const enum index next_vc_attr[] = {
 	PRIORITY,
 	INGRESS,
 	EGRESS,
+	TRANSFER,
 	PATTERN,
 	ZERO,
 };
@@ -960,6 +962,12 @@ static const struct token token_list[] = {
 		.next = NEXT(next_vc_attr),
 		.call = parse_vc,
 	},
+	[TRANSFER] = {
+		.name = "transfer",
+		.help = "apply rule directly to endpoints found in pattern",
+		.next = NEXT(next_vc_attr),
+		.call = parse_vc,
+	},
 	/* Validate/create pattern. */
 	[PATTERN] = {
 		.name = "pattern",
@@ -1945,6 +1953,9 @@ parse_vc(struct context *ctx, const struct token *token,
 	case EGRESS:
 		out->args.vc.attr.egress = 1;
 		return len;
+	case TRANSFER:
+		out->args.vc.attr.transfer = 1;
+		return len;
 	case PATTERN:
 		out->args.vc.pattern =
 			(void *)RTE_ALIGN_CEIL((uintptr_t)(out + 1),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index c0fefe475..49ef87782 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1223,6 +1223,7 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
 		[RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
+		[RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
 		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
@@ -1488,12 +1489,13 @@ port_flow_list(portid_t port_id, uint32_t n, const uint32_t group[n])
 		const struct rte_flow_item *item = pf->pattern;
 		const struct rte_flow_action *action = pf->actions;
 
-		printf("%" PRIu32 "\t%" PRIu32 "\t%" PRIu32 "\t%c%c\t",
+		printf("%" PRIu32 "\t%" PRIu32 "\t%" PRIu32 "\t%c%c%c\t",
 		       pf->id,
 		       pf->attr.group,
 		       pf->attr.priority,
 		       pf->attr.ingress ? 'i' : '-',
-		       pf->attr.egress ? 'e' : '-');
+		       pf->attr.egress ? 'e' : '-',
+		       pf->attr.transfer ? 't' : '-');
 		while (item->type != RTE_FLOW_ITEM_TYPE_END) {
 			if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
 				printf("%s ", flow_item[item->type].name);
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index c62a80566..550a4c95b 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -170,7 +170,13 @@ Note that support for more than a single priority level is not guaranteed.
 Attribute: Traffic direction
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Flow rules can apply to inbound and/or outbound traffic (ingress/egress).
+Flow rule patterns apply to inbound and/or outbound traffic.
+
+In the context of this API, **ingress** and **egress** respectively stand
+for **inbound** and **outbound** based on the standpoint of the application
+creating a flow rule.
+
+There are no exceptions to this definition.
 
 Several pattern items and actions are valid and can be used in both
 directions. At least one direction must be specified.
@@ -178,6 +184,24 @@ directions. At least one direction must be specified.
 Specifying both directions at once for a given rule is not recommended but
 may be valid in a few cases (e.g. shared counters).
 
+Attribute: Transfer
+^^^^^^^^^^^^^^^^^^^
+
+Instead of simply matching the properties of traffic as it would appear on a
+given DPDK port ID, enabling this attribute transfers a flow rule to the
+lowest possible level of any device endpoints found in the pattern.
+
+When supported, this effectively enables an application to reroute traffic
+not necessarily intended for it (e.g. coming from or addressed to different
+physical ports, VFs or applications) at the device level.
+
+It complements the behavior of some pattern items such as `Item: PORT`_ and
+is meaningless without them.
+
+When transferring flow rules, **ingress** and **egress** attributes
+(`Attribute: Traffic direction`_) keep their original meaning, as if
+processing traffic emitted or received by the application.
+
 Pattern item
 ~~~~~~~~~~~~
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 923664f7d..0bf6c33c9 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -2970,14 +2970,14 @@ following sections.
 - Check whether a flow rule can be created::
 
    flow validate {port_id}
-       [group {group_id}] [priority {level}] [ingress] [egress]
+       [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
        pattern {item} [/ {item} [...]] / end
        actions {action} [/ {action} [...]] / end
 
 - Create a flow rule::
 
    flow create {port_id}
-       [group {group_id}] [priority {level}] [ingress] [egress]
+       [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
        pattern {item} [/ {item} [...]] / end
        actions {action} [/ {action} [...]] / end
 
@@ -3010,7 +3010,7 @@ underlying device in its current state but stops short of creating it. It is
 bound to ``rte_flow_validate()``::
 
    flow validate {port_id}
-      [group {group_id}] [priority {level}] [ingress] [egress]
+      [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
       pattern {item} [/ {item} [...]] / end
       actions {action} [/ {action} [...]] / end
 
@@ -3047,7 +3047,7 @@ Creating flow rules
 to ``rte_flow_create()``::
 
    flow create {port_id}
-      [group {group_id}] [priority {level}] [ingress] [egress]
+      [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
       pattern {item} [/ {item} [...]] / end
       actions {action} [/ {action} [...]] / end
 
@@ -3061,7 +3061,7 @@ Otherwise it will show an error message of the form::
 
 Parameters describe in the following order:
 
-- Attributes (*group*, *priority*, *ingress*, *egress* tokens).
+- Attributes (*group*, *priority*, *ingress*, *egress*, *transfer* tokens).
 - A matching pattern, starting with the *pattern* token and terminated by an
   *end* pattern item.
 - Actions, starting with the *actions* token and terminated by an *end*
@@ -3089,6 +3089,7 @@ specified before the ``pattern`` token.
 - ``priority {level}``: priority level within group.
 - ``ingress``: rule applies to ingress traffic.
 - ``egress``: rule applies to egress traffic.
+- ``transfer``: apply rule directly to endpoints found in pattern.
 
 Each instance of an attribute specified several times overrides the previous
 value as shown below (group 4 is used)::
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 9bb1575cb..bd166370a 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -746,6 +746,14 @@ bnxt_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index d1c0b4b8d..073852913 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -379,6 +379,15 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -624,6 +633,14 @@ cons_parse_ethertype_filter(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -923,6 +940,15 @@ cons_parse_syn_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_syn_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	/* Support 2 priorities, the lowest or highest. */
 	if (!attr->priority) {
 		filter->hig_pri = 0;
@@ -1211,6 +1237,15 @@ cons_parse_flex_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_flex_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_flex_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -1361,6 +1396,15 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(rss_conf, 0, sizeof(struct igb_rte_flow_rss_conf));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(rss_conf, 0, sizeof(struct igb_rte_flow_rss_conf));
 		rte_flow_error_set(error, EINVAL,
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index 20d6b9d59..3a0086399 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -1318,6 +1318,12 @@ enic_flow_parse(struct rte_eth_dev *dev,
 					   NULL,
 					   "egress is not supported");
 			return -rte_errno;
+		} else if (attrs->transfer) {
+			rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+					   NULL,
+					   "transfer is not supported");
+			return -rte_errno;
 		} else if (!attrs->ingress) {
 			rte_flow_error_set(error, ENOTSUP,
 					   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index e3d83eac7..b004357f1 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -1918,6 +1918,14 @@ i40e_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 438bfcdfb..eb0644c82 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -557,6 +557,15 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -787,6 +796,14 @@ cons_parse_ethertype_filter(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -1078,6 +1095,15 @@ cons_parse_syn_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_syn_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	/* Support 2 priorities, the lowest or highest. */
 	if (!attr->priority) {
 		filter->hig_pri = 0;
@@ -1250,6 +1276,15 @@ cons_parse_l2_tn_filter(struct rte_eth_dev *dev,
 	}
 
 	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_l2_tunnel_conf));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* not supported */
 	if (attr->priority) {
 		memset(filter, 0, sizeof(struct rte_eth_l2_tunnel_conf));
 		rte_flow_error_set(error, EINVAL,
@@ -1354,6 +1389,15 @@ ixgbe_parse_fdir_act_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* not supported */
+	if (attr->transfer) {
+		memset(rule, 0, sizeof(struct ixgbe_fdir_rule));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* not supported */
 	if (attr->priority) {
 		memset(rule, 0, sizeof(struct ixgbe_fdir_rule));
 		rte_flow_error_set(error, EINVAL,
@@ -2829,6 +2873,15 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(rss_conf, 0, sizeof(struct ixgbe_rte_flow_rss_conf));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(rss_conf, 0, sizeof(struct ixgbe_rte_flow_rss_conf));
 		rte_flow_error_set(error, EINVAL,
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 779641e11..480442f87 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -652,6 +652,10 @@ mlx4_flow_prepare(struct priv *priv,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
 			 NULL, "egress is not supported");
+	if (attr->transfer)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			 NULL, "transfer is not supported");
 	if (!attr->ingress)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 292e579d1..de8ac9610 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -568,6 +568,13 @@ mlx5_flow_convert_attributes(const struct rte_flow_attr *attr,
 				   "egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   NULL,
+				   "transfer is not supported");
+		return -rte_errno;
+	}
 	if (!attr->ingress) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/mvpp2/mrvl_flow.c b/drivers/net/mvpp2/mrvl_flow.c
index 6478eb2fe..a2e2129cc 100644
--- a/drivers/net/mvpp2/mrvl_flow.c
+++ b/drivers/net/mvpp2/mrvl_flow.c
@@ -2187,6 +2187,12 @@ mrvl_flow_parse_attr(struct mrvl_priv *priv __rte_unused,
 				   "Egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, NULL,
+				   "Transfer is not supported");
+		return -rte_errno;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index cd6a61b39..bcde2c2f7 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1116,6 +1116,12 @@ sfc_flow_parse_attr(const struct rte_flow_attr *attr,
 				   "Egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer != 0) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, attr,
+				   "Transfer is not supported");
+		return -rte_errno;
+	}
 	if (attr->ingress == 0) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, attr,
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index e90e5165f..dc1491990 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1039,6 +1039,12 @@ priv_flow_process(struct pmd_internals *pmd,
 	};
 	int action = 0; /* Only one action authorized for now */
 
+	if (attr->transfer) {
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			NULL, "transfer is not supported");
+		return -rte_errno;
+	}
 	if (attr->group > MAX_GROUP) {
 		rte_flow_error_set(
 			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 73d29ed32..fc7df68d3 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -72,7 +72,26 @@ struct rte_flow_attr {
 	uint32_t priority; /**< Priority level within group. */
 	uint32_t ingress:1; /**< Rule applies to ingress traffic. */
 	uint32_t egress:1; /**< Rule applies to egress traffic. */
-	uint32_t reserved:30; /**< Reserved, must be zero. */
+	/**
+	 * Instead of simply matching the properties of traffic as it would
+	 * appear on a given DPDK port ID, enabling this attribute transfers
+	 * a flow rule to the lowest possible level of any device endpoints
+	 * found in the pattern.
+	 *
+	 * When supported, this effectively enables an application to
+	 * re-route traffic not necessarily intended for it (e.g. coming
+	 * from or addressed to different physical ports, VFs or
+	 * applications) at the device level.
+	 *
+	 * It complements the behavior of some pattern items such as
+	 * RTE_FLOW_ITEM_TYPE_PORT and is meaningless without them.
+	 *
+	 * When transferring flow rules, ingress and egress attributes keep
+	 * their original meaning, as if processing traffic emitted or
+	 * received by the application.
+	 */
+	uint32_t transfer:1;
+	uint32_t reserved:29; /**< Reserved, must be zero. */
 };
 
 /**
@@ -1181,6 +1200,7 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, /**< Priority field. */
 	RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, /**< Ingress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
+	RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, /**< Transfer field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
 	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (6 preceding siblings ...)
  2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 09/16] ethdev: add encap level " Adrien Mazarguil
@ 2018-04-10 16:36  1%     ` Adrien Mazarguil
  2018-04-11 12:45  0%       ` Andrew Rybchenko
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
                       ` (5 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Ajit Khaparde, Somnath Kotur,
	John Daley, Hyong Youb Kim, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Tomasz Duszynski, Dmitri Epshtein, Natalie Samsonov, Jianbo Liu,
	Andrew Rybchenko, Pascal Mazon

TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
consistent with the normal stacking order of pattern items, which is
confusing to applications.

Problem is that when followed by one of these layers, the EtherType field
of the preceding layer keeps its "inner" definition, and the "outer" TPID
is provided by the subsequent layer, the reverse of how a packet looks like
on the wire:

 Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
 rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]

Worse, when QinQ is involved, the stacking order of VLAN layers is
unspecified. It is unclear whether it should be reversed (innermost to
outermost) as well given TPID applies to the previous layer:

 Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
 rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
 rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]

While specifying EtherType/TPID is hopefully rarely necessary, the stacking
order in case of QinQ and the lack of documentation remain an issue.

This patch replaces TPID in the VLAN pattern item with an inner
EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
clarifies documentation and updates all relevant code.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
items:

- bnxt: EtherType matching is supported with and without VLAN, but TPID
  matching is not and triggers an error.

- e1000: EtherType matching is only supported with the ETHERTYPE filter,
  which does not support VLAN matching, therefore no impact.

- enic: same as bnxt.

- i40e: same as bnxt with existing FDIR limitations on allowed EtherType
  values. The remaining filter types (VXLAN, NVGRE, QINQ) do not support
  EtherType matching.

- ixgbe: same as e1000, with additional minor change to rely on the new
  E-Tag macro definition.

- mlx4: EtherType/TPID matching is not supported, no impact.

- mlx5: same as bnxt.

- mvpp2: same as bnxt.

- sfc: same as bnxt.

- tap: same as bnxt.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Somnath Kotur <somnath.kotur@broadcom.com>
Cc: John Daley <johndale@cisco.com>
Cc: Hyong Youb Kim <hyonkim@cisco.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Tomasz Duszynski <tdu@semihalf.com>
Cc: Dmitri Epshtein <dima@marvell.com>
Cc: Natalie Samsonov <nsamsono@marvell.com>
Cc: Jianbo Liu <jianbo.liu@arm.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>

---

v3 changes:

Updated mrvl to mvpp2.

Moved unrelated default TCI mask update to separate patch.

Fixed sfc according to Andrew's comments [1], which made so much sense that
I standardized on the same behavior for all other PMDs: matching outer TPID
is never supported when a VLAN pattern item is present.

This is done because many devices accept several TPIDs but do not provide
means to match a given one explicitly, it's all or nothing, and that makes
the resulting flow rule inaccurate.

[1] http://dpdk.org/ml/archives/dev/2018-April/095870.html
---
 app/test-pmd/cmdline_flow.c                 | 17 +++----
 doc/guides/nics/tap.rst                     |  2 +-
 doc/guides/prog_guide/rte_flow.rst          | 19 ++++++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
 drivers/net/bnxt/bnxt_filter.c              | 35 +++++++++++---
 drivers/net/enic/enic_flow.c                | 19 +++++---
 drivers/net/i40e/i40e_flow.c                | 60 ++++++++++++++++++++----
 drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
 drivers/net/mlx5/mlx5_flow.c                | 13 ++++-
 drivers/net/mvpp2/mrvl_flow.c               | 26 +++++++---
 drivers/net/sfc/sfc_flow.c                  | 18 +++++++
 drivers/net/tap/tap_flow.c                  | 14 ++++--
 lib/librte_ether/rte_flow.h                 | 22 ++++++---
 lib/librte_net/rte_ether.h                  |  1 +
 14 files changed, 198 insertions(+), 55 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 9b6004176..49217d5bc 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -99,11 +99,11 @@ enum index {
 	ITEM_ETH_SRC,
 	ITEM_ETH_TYPE,
 	ITEM_VLAN,
-	ITEM_VLAN_TPID,
 	ITEM_VLAN_TCI,
 	ITEM_VLAN_PCP,
 	ITEM_VLAN_DEI,
 	ITEM_VLAN_VID,
+	ITEM_VLAN_INNER_TYPE,
 	ITEM_IPV4,
 	ITEM_IPV4_TOS,
 	ITEM_IPV4_TTL,
@@ -505,11 +505,11 @@ static const enum index item_eth[] = {
 };
 
 static const enum index item_vlan[] = {
-	ITEM_VLAN_TPID,
 	ITEM_VLAN_TCI,
 	ITEM_VLAN_PCP,
 	ITEM_VLAN_DEI,
 	ITEM_VLAN_VID,
+	ITEM_VLAN_INNER_TYPE,
 	ITEM_NEXT,
 	ZERO,
 };
@@ -1142,12 +1142,6 @@ static const struct token token_list[] = {
 		.next = NEXT(item_vlan),
 		.call = parse_vc,
 	},
-	[ITEM_VLAN_TPID] = {
-		.name = "tpid",
-		.help = "tag protocol identifier",
-		.next = NEXT(item_vlan, NEXT_ENTRY(UNSIGNED), item_param),
-		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vlan, tpid)),
-	},
 	[ITEM_VLAN_TCI] = {
 		.name = "tci",
 		.help = "tag control information",
@@ -1175,6 +1169,13 @@ static const struct token token_list[] = {
 		.args = ARGS(ARGS_ENTRY_MASK_HTON(struct rte_flow_item_vlan,
 						  tci, "\x0f\xff")),
 	},
+	[ITEM_VLAN_INNER_TYPE] = {
+		.name = "inner_type",
+		.help = "inner EtherType",
+		.next = NEXT(item_vlan, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vlan,
+					     inner_type)),
+	},
 	[ITEM_IPV4] = {
 		.name = "ipv4",
 		.help = "match IPv4 header",
diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index c97786aca..3f7a15147 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -108,7 +108,7 @@ The kernel support can be checked with this command::
 Supported items:
 
 - eth: src and dst (with variable masks), and eth_type (0xffff mask).
-- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9)
+- vlan: vid, pcp, but not eid. (requires kernel 4.9)
 - ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
 - udp/tcp: src and dst port (0xffff) mask.
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 1a09e8a0f..fd317b48c 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -784,9 +784,15 @@ Item: ``ETH``
 
 Matches an Ethernet header.
 
+The ``type`` field either stands for "EtherType" or "TPID" when followed by
+so-called layer 2.5 pattern items such as ``RTE_FLOW_ITEM_TYPE_VLAN``. In
+the latter case, ``type`` refers to that of the outer header, with the inner
+EtherType/TPID provided by the subsequent pattern item. This is the same
+order as on the wire.
+
 - ``dst``: destination MAC.
 - ``src``: source MAC.
-- ``type``: EtherType.
+- ``type``: EtherType or TPID.
 - Default ``mask`` matches destination and source addresses only.
 
 Item: ``VLAN``
@@ -794,8 +800,12 @@ Item: ``VLAN``
 
 Matches an 802.1Q/ad VLAN tag.
 
-- ``tpid``: tag protocol identifier.
+The corresponding standard outer EtherType (TPID) values are
+``ETHER_TYPE_VLAN`` or ``ETHER_TYPE_QINQ``. It can be overridden by the
+preceding pattern item.
+
 - ``tci``: tag control information.
+- ``inner_type``: inner EtherType or TPID.
 - Default ``mask`` matches TCI only.
 
 Item: ``IPV4``
@@ -866,12 +876,15 @@ Item: ``E_TAG``
 
 Matches an IEEE 802.1BR E-Tag header.
 
-- ``tpid``: tag protocol identifier (0x893F)
+The corresponding standard outer EtherType (TPID) value is
+``ETHER_TYPE_ETAG``. It can be overridden by the preceding pattern item.
+
 - ``epcp_edei_in_ecid_b``: E-Tag control information (E-TCI), E-PCP (3b),
   E-DEI (1b), ingress E-CID base (12b).
 - ``rsvd_grp_ecid_b``: reserved (2b), GRP (2b), E-CID base (12b).
 - ``in_ecid_e``: ingress E-CID ext.
 - ``ecid_e``: E-CID ext.
+- ``inner_type``: inner EtherType or TPID.
 - Default ``mask`` simultaneously matches GRP and E-CID base.
 
 Item: ``NVGRE``
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 3b1073bfc..923664f7d 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3223,15 +3223,15 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``dst {MAC-48}``: destination MAC.
   - ``src {MAC-48}``: source MAC.
-  - ``type {unsigned}``: EtherType.
+  - ``type {unsigned}``: EtherType or TPID.
 
 - ``vlan``: match 802.1Q/ad VLAN tag.
 
-  - ``tpid {unsigned}``: tag protocol identifier.
   - ``tci {unsigned}``: tag control information.
   - ``pcp {unsigned}``: priority code point.
   - ``dei {unsigned}``: drop eligible indicator.
   - ``vid {unsigned}``: VLAN identifier.
+  - ``inner_type {unsigned}``: inner EtherType or TPID.
 
 - ``ipv4``: match IPv4 header.
 
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 0f9c1c9ae..9bb1575cb 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -299,6 +299,7 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 	uint32_t vf = 0;
 	int use_ntuple;
 	uint32_t en = 0;
+	uint32_t en_ethertype;
 	int dflt_vnic;
 
 	use_ntuple = bnxt_filter_type_check(pattern, error);
@@ -308,6 +309,9 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 
 	filter->filter_type = use_ntuple ?
 		HWRM_CFA_NTUPLE_FILTER : HWRM_CFA_EM_FILTER;
+	en_ethertype = use_ntuple ?
+		NTUPLE_FLTR_ALLOC_INPUT_EN_ETHERTYPE :
+		EM_FLOW_ALLOC_INPUT_EN_ETHERTYPE;
 
 	while (item->type != RTE_FLOW_ITEM_TYPE_END) {
 		if (item->last) {
@@ -377,30 +381,49 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 			if (eth_mask->type) {
 				filter->ethertype =
 					rte_be_to_cpu_16(eth_spec->type);
-				en |= use_ntuple ?
-					NTUPLE_FLTR_ALLOC_INPUT_EN_ETHERTYPE :
-					EM_FLOW_ALLOC_INPUT_EN_ETHERTYPE;
+				en |= en_ethertype;
 			}
 
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
+			if (en & en_ethertype) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "VLAN TPID matching is not"
+						   " supported");
+				return -rte_errno;
+			}
 			if (vlan_mask->tci &&
-			    vlan_mask->tci == RTE_BE16(0x0fff) &&
-			    !vlan_mask->tpid) {
+			    vlan_mask->tci == RTE_BE16(0x0fff)) {
 				/* Only the VLAN ID can be matched. */
 				filter->l2_ovlan =
 					rte_be_to_cpu_16(vlan_spec->tci &
 							 RTE_BE16(0x0fff));
 				en |= EM_FLOW_ALLOC_INPUT_EN_OVLAN_VID;
-			} else if (vlan_mask->tci || vlan_mask->tpid) {
+			} else if (vlan_mask->tci) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
 						   "VLAN mask is invalid");
 				return -rte_errno;
 			}
+			if (vlan_mask->inner_type &&
+			    vlan_mask->inner_type != RTE_BE16(0xffff)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "inner ethertype mask not"
+						   " valid");
+				return -rte_errno;
+			}
+			if (vlan_mask->inner_type) {
+				filter->ethertype =
+					rte_be_to_cpu_16(vlan_spec->inner_type);
+				en |= en_ethertype;
+			}
 
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV4:
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index a5c6a1670..20d6b9d59 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -557,16 +557,21 @@ enic_copy_item_vlan_v2(const struct rte_flow_item *item,
 	if (!spec)
 		return 0;
 
-	/* Don't support filtering in tpid */
-	if (mask) {
-		if (mask->tpid != 0)
-			return ENOTSUP;
-	} else {
+	if (!mask)
 		mask = &rte_flow_item_vlan_mask;
-		RTE_ASSERT(mask->tpid == 0);
-	}
 
 	if (*inner_ofst == 0) {
+		struct ether_hdr *eth_mask =
+			(void *)gp->layer[FILTER_GENERIC_1_L2].mask;
+		struct ether_hdr *eth_val =
+			(void *)gp->layer[FILTER_GENERIC_1_L2].val;
+
+		/* Outer TPID cannot be matched */
+		if (eth_mask->ether_type)
+			return ENOTSUP;
+		eth_mask->ether_type = mask->inner_type;
+		eth_val->ether_type = spec->inner_type;
+
 		/* Outer header. Use the vlan mask/val fields */
 		gp->mask_vlan = mask->tci;
 		gp->val_vlan = spec->tci;
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index fef812c6b..e3d83eac7 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -10,6 +10,7 @@
 #include <unistd.h>
 #include <stdarg.h>
 
+#include <rte_debug.h>
 #include <rte_ether.h>
 #include <rte_ethdev_driver.h>
 #include <rte_log.h>
@@ -2491,16 +2492,22 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						      "Invalid MAC_addr mask.");
 					return -rte_errno;
 				}
+			}
+			if (eth_spec && eth_mask && eth_mask->type) {
+				enum rte_flow_item_type next = (item + 1)->type;
 
-				if ((eth_mask->type & UINT16_MAX) ==
-				    UINT16_MAX) {
-					input_set |= I40E_INSET_LAST_ETHER_TYPE;
-					filter->input.flow.l2_flow.ether_type =
-						eth_spec->type;
+				if (eth_mask->type != RTE_BE16(0xffff)) {
+					rte_flow_error_set(error, EINVAL,
+						      RTE_FLOW_ERROR_TYPE_ITEM,
+						      item,
+						      "Invalid type mask.");
+					return -rte_errno;
 				}
 
 				ether_type = rte_be_to_cpu_16(eth_spec->type);
-				if (ether_type == ETHER_TYPE_IPv4 ||
+
+				if (next == RTE_FLOW_ITEM_TYPE_VLAN ||
+				    ether_type == ETHER_TYPE_IPv4 ||
 				    ether_type == ETHER_TYPE_IPv6 ||
 				    ether_type == ETHER_TYPE_ARP ||
 				    ether_type == outer_tpid) {
@@ -2510,6 +2517,9 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						     "Unsupported ether_type.");
 					return -rte_errno;
 				}
+				input_set |= I40E_INSET_LAST_ETHER_TYPE;
+				filter->input.flow.l2_flow.ether_type =
+					eth_spec->type;
 			}
 
 			pctype = I40E_FILTER_PCTYPE_L2_PAYLOAD;
@@ -2519,6 +2529,8 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
+
+			RTE_ASSERT(!(input_set & I40E_INSET_LAST_ETHER_TYPE));
 			if (vlan_spec && vlan_mask) {
 				if (vlan_mask->tci ==
 				    rte_cpu_to_be_16(I40E_TCI_MASK)) {
@@ -2527,6 +2539,33 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						vlan_spec->tci;
 				}
 			}
+			if (vlan_spec && vlan_mask && vlan_mask->inner_type) {
+				if (vlan_mask->inner_type != RTE_BE16(0xffff)) {
+					rte_flow_error_set(error, EINVAL,
+						      RTE_FLOW_ERROR_TYPE_ITEM,
+						      item,
+						      "Invalid inner_type"
+						      " mask.");
+					return -rte_errno;
+				}
+
+				ether_type =
+					rte_be_to_cpu_16(vlan_spec->inner_type);
+
+				if (ether_type == ETHER_TYPE_IPv4 ||
+				    ether_type == ETHER_TYPE_IPv6 ||
+				    ether_type == ETHER_TYPE_ARP ||
+				    ether_type == outer_tpid) {
+					rte_flow_error_set(error, EINVAL,
+						     RTE_FLOW_ERROR_TYPE_ITEM,
+						     item,
+						     "Unsupported inner_type.");
+					return -rte_errno;
+				}
+				input_set |= I40E_INSET_LAST_ETHER_TYPE;
+				filter->input.flow.l2_flow.ether_type =
+					vlan_spec->inner_type;
+			}
 
 			pctype = I40E_FILTER_PCTYPE_L2_PAYLOAD;
 			layer_idx = I40E_FLXPLD_L2_IDX;
@@ -3285,7 +3324,8 @@ i40e_flow_parse_vxlan_pattern(__rte_unused struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
@@ -3515,7 +3555,8 @@ i40e_flow_parse_nvgre_pattern(__rte_unused struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
@@ -4023,7 +4064,8 @@ i40e_flow_parse_qinq_pattern(__rte_unused struct rte_eth_dev *dev,
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
 
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 					   RTE_FLOW_ERROR_TYPE_ITEM,
 					   item,
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index ea3624ba4..94fba2908 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -115,7 +115,6 @@
 
 #define IXGBE_VT_CTL_POOLING_MODE_MASK         0x00030000
 #define IXGBE_VT_CTL_POOLING_MODE_ETAG         0x00010000
-#define DEFAULT_ETAG_ETYPE                     0x893f
 #define IXGBE_ETAG_ETYPE                       0x00005084
 #define IXGBE_ETAG_ETYPE_MASK                  0x0000ffff
 #define IXGBE_ETAG_ETYPE_VALID                 0x80000000
@@ -1481,7 +1480,7 @@ static int ixgbe_l2_tn_filter_init(struct rte_eth_dev *eth_dev)
 	}
 	l2_tn_info->e_tag_en = FALSE;
 	l2_tn_info->e_tag_fwd_en = FALSE;
-	l2_tn_info->e_tag_ether_type = DEFAULT_ETAG_ETYPE;
+	l2_tn_info->e_tag_ether_type = ETHER_TYPE_ETAG;
 
 	return 0;
 }
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index bc1176819..292e579d1 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@
 #endif
 
 #include <rte_common.h>
+#include <rte_ether.h>
 #include <rte_eth_ctrl.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow.h>
@@ -306,6 +307,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.actions = valid_actions,
 		.mask = &(const struct rte_flow_item_vlan){
 			.tci = -1,
+			.inner_type = -1,
 		},
 		.default_mask = &rte_flow_item_vlan_mask,
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
@@ -1285,6 +1287,7 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 	struct mlx5_flow_parse *parser = data->parser;
 	struct ibv_flow_spec_eth *eth;
 	const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
+	const char *msg = "VLAN cannot be empty";
 
 	if (spec) {
 		unsigned int i;
@@ -1306,12 +1309,20 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 			 */
 			if (!eth->mask.vlan_tag)
 				goto error;
+			/* Outer TPID cannot be matched. */
+			if (eth->mask.ether_type) {
+				msg = "VLAN TPID matching is not supported";
+				goto error;
+			}
+			eth->val.ether_type = spec->inner_type;
+			eth->mask.ether_type = mask->inner_type;
+			eth->val.ether_type &= eth->mask.ether_type;
 		}
 		return 0;
 	}
 error:
 	return rte_flow_error_set(data->error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM,
-				  item, "VLAN cannot be empty");
+				  item, msg);
 }
 
 /**
diff --git a/drivers/net/mvpp2/mrvl_flow.c b/drivers/net/mvpp2/mrvl_flow.c
index 8fd4dbfb1..6478eb2fe 100644
--- a/drivers/net/mvpp2/mrvl_flow.c
+++ b/drivers/net/mvpp2/mrvl_flow.c
@@ -1091,12 +1091,6 @@ mrvl_parse_vlan(const struct rte_flow_item *item,
 	if (ret)
 		return ret;
 
-	if (mask->tpid) {
-		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM,
-				   NULL, "Not supported by classifier\n");
-		return -rte_errno;
-	}
-
 	m = rte_be_to_cpu_16(mask->tci);
 	if (m & MRVL_VLAN_ID_MASK) {
 		RTE_LOG(WARNING, PMD, "vlan id mask is ignored\n");
@@ -1112,6 +1106,26 @@ mrvl_parse_vlan(const struct rte_flow_item *item,
 			goto out;
 	}
 
+	if (flow->pattern & F_TYPE) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "VLAN TPID matching is not supported\n");
+		return -rte_errno;
+	}
+	if (mask->inner_type) {
+		struct rte_flow_item_eth spec_eth = {
+			.type = spec->inner_type,
+		};
+		struct rte_flow_item_eth mask_eth = {
+			.type = mask->inner_type,
+		};
+
+		RTE_LOG(WARNING, PMD, "inner eth type mask is ignored\n");
+		ret = mrvl_parse_type(spec_eth, mask_eth, flow);
+		if (ret)
+			goto out;
+	}
+
 	return 0;
 out:
 	rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 3028efbf9..cd6a61b39 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -7,6 +7,7 @@
  * for Solarflare) and Solarflare Communications, Inc.
  */
 
+#include <rte_byteorder.h>
 #include <rte_tailq.h>
 #include <rte_common.h>
 #include <rte_ethdev_driver.h>
@@ -351,6 +352,7 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
 	const struct rte_flow_item_vlan *mask = NULL;
 	const struct rte_flow_item_vlan supp_mask = {
 		.tci = rte_cpu_to_be_16(ETH_VLAN_ID_MAX),
+		.inner_type = RTE_BE16(0xffff),
 	};
 
 	rc = sfc_flow_parse_init(item,
@@ -393,6 +395,22 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
 		return -rte_errno;
 	}
 
+	if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "VLAN TPID matching is not supported");
+		return -rte_errno;
+	}
+	if (mask->inner_type == supp_mask.inner_type) {
+		efx_spec->efs_match_flags |= EFX_FILTER_MATCH_ETHER_TYPE;
+		efx_spec->efs_ether_type = rte_bswap16(spec->inner_type);
+	} else if (mask->inner_type) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "Bad mask for VLAN inner_type");
+		return -rte_errno;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 1caefff43..e90e5165f 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -270,13 +270,13 @@ static const struct tap_flow_items tap_flow_items[] = {
 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
 			       RTE_FLOW_ITEM_TYPE_IPV6),
 		.mask = &(const struct rte_flow_item_vlan){
-			.tpid = -1,
 			/* DEI matching is not supported */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 			.tci = 0xffef,
 #else
 			.tci = 0xefff,
 #endif
+			.inner_type = -1,
 		},
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
 		.default_mask = &rte_flow_item_vlan_mask,
@@ -578,13 +578,19 @@ tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
 	/* use default mask if none provided */
 	if (!mask)
 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
-	/* TC does not support tpid masking. Only accept if exact match. */
-	if (mask->tpid && mask->tpid != 0xffff)
+	/* Outer TPID cannot be matched. */
+	if (info->eth_type)
 		return -1;
 	/* Double-tagging not supported. */
-	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+	if (info->vlan)
 		return -1;
 	info->vlan = 1;
+	if (mask->inner_type) {
+		/* TC does not support partial eth_type masking */
+		if (mask->inner_type != RTE_BE16(0xffff))
+			return -1;
+		info->eth_type = spec->inner_type;
+	}
 	if (!flow)
 		return 0;
 	msg = &flow->msg;
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index cf4a3faf2..f6ee28929 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -454,11 +454,17 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
  * RTE_FLOW_ITEM_TYPE_ETH
  *
  * Matches an Ethernet header.
+ *
+ * The @p type field either stands for "EtherType" or "TPID" when followed
+ * by so-called layer 2.5 pattern items such as RTE_FLOW_ITEM_TYPE_VLAN. In
+ * the latter case, @p type refers to that of the outer header, with the
+ * inner EtherType/TPID provided by the subsequent pattern item. This is the
+ * same order as on the wire.
  */
 struct rte_flow_item_eth {
 	struct ether_addr dst; /**< Destination MAC. */
 	struct ether_addr src; /**< Source MAC. */
-	rte_be16_t type; /**< EtherType. */
+	rte_be16_t type; /**< EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_ETH. */
@@ -475,19 +481,20 @@ static const struct rte_flow_item_eth rte_flow_item_eth_mask = {
  *
  * Matches an 802.1Q/ad VLAN tag.
  *
- * This type normally follows either RTE_FLOW_ITEM_TYPE_ETH or
- * RTE_FLOW_ITEM_TYPE_VLAN.
+ * The corresponding standard outer EtherType (TPID) values are
+ * ETHER_TYPE_VLAN or ETHER_TYPE_QINQ. It can be overridden by the preceding
+ * pattern item.
  */
 struct rte_flow_item_vlan {
-	rte_be16_t tpid; /**< Tag protocol identifier. */
 	rte_be16_t tci; /**< Tag control information. */
+	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_VLAN. */
 #ifndef __cplusplus
 static const struct rte_flow_item_vlan rte_flow_item_vlan_mask = {
-	.tpid = RTE_BE16(0x0000),
 	.tci = RTE_BE16(0xffff),
+	.inner_type = RTE_BE16(0x0000),
 };
 #endif
 
@@ -636,9 +643,11 @@ static const struct rte_flow_item_vxlan rte_flow_item_vxlan_mask = {
  * RTE_FLOW_ITEM_TYPE_E_TAG.
  *
  * Matches a E-tag header.
+ *
+ * The corresponding standard outer EtherType (TPID) value is
+ * ETHER_TYPE_ETAG. It can be overridden by the preceding pattern item.
  */
 struct rte_flow_item_e_tag {
-	rte_be16_t tpid; /**< Tag protocol identifier (0x893F). */
 	/**
 	 * E-Tag control information (E-TCI).
 	 * E-PCP (3b), E-DEI (1b), ingress E-CID base (12b).
@@ -648,6 +657,7 @@ struct rte_flow_item_e_tag {
 	rte_be16_t rsvd_grp_ecid_b;
 	uint8_t in_ecid_e; /**< Ingress E-CID ext. */
 	uint8_t ecid_e; /**< E-CID ext. */
+	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_E_TAG. */
diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h
index 45daa911a..a271d1c86 100644
--- a/lib/librte_net/rte_ether.h
+++ b/lib/librte_net/rte_ether.h
@@ -301,6 +301,7 @@ struct vxlan_hdr {
 #define ETHER_TYPE_RARP 0x8035 /**< Reverse Arp Protocol. */
 #define ETHER_TYPE_VLAN 0x8100 /**< IEEE 802.1Q VLAN tagging. */
 #define ETHER_TYPE_QINQ 0x88A8 /**< IEEE 802.1ad QinQ tagging. */
+#define ETHER_TYPE_ETAG 0x893F /**< IEEE 802.1BR E-Tag. */
 #define ETHER_TYPE_1588 0x88F7 /**< IEEE 802.1AS 1588 Precise Time Protocol. */
 #define ETHER_TYPE_SLOW 0x8809 /**< Slow protocols (LACP and Marker). */
 #define ETHER_TYPE_TEB  0x6558 /**< Transparent Ethernet Bridging. */
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v3 09/16] ethdev: add encap level to RSS flow API action
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (5 preceding siblings ...)
  2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
@ 2018-04-10 16:36  3%     ` Adrien Mazarguil
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
                       ` (6 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

RSS hash types (ETH_RSS_* macros defined in rte_ethdev.h) describe the
protocol header fields of a packet that must be taken into account while
computing RSS.

When facing encapsulated (e.g. tunneled) packets, there is an ambiguity as
to whether these should apply to inner or outer packets. Applications need
the ability to tell exactly "where" RSS must be performed.

This is addressed by adding encapsulation level information to the RSS flow
action. Its default value is 0 and stands for the usual unspecified
behavior. Other values provide a specific encapsulation level.

Contrary to the change announced by commit 676b605182a5 ("doc: announce
ethdev API change for RSS configuration"), this patch does not affect
struct rte_eth_rss_conf but struct rte_flow_action_rss as the former is not
used anymore by the RSS flow action. ABI impact is therefore limited to
rte_flow.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 13 ++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 24 ++++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 ++
 drivers/net/e1000/igb_flow.c                |  4 ++++
 drivers/net/e1000/igb_rxtx.c                |  2 ++
 drivers/net/i40e/i40e_ethdev.c              |  2 ++
 drivers/net/i40e/i40e_flow.c                |  4 ++++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  2 ++
 drivers/net/mlx4/mlx4_flow.c                |  6 ++++++
 drivers/net/mlx5/mlx5_flow.c                | 11 ++++++++++
 drivers/net/sfc/sfc_flow.c                  |  3 +++
 drivers/net/tap/tap_flow.c                  |  6 +++++-
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 26 ++++++++++++++++++++++++
 16 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 34f33f671..9b6004176 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -167,6 +167,7 @@ enum index {
 	ACTION_COUNT,
 	ACTION_RSS,
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_FUNC_DEFAULT,
 	ACTION_RSS_FUNC_TOEPLITZ,
 	ACTION_RSS_FUNC_SIMPLE_XOR,
@@ -638,6 +639,7 @@ static const enum index action_queue[] = {
 
 static const enum index action_rss[] = {
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -1616,6 +1618,16 @@ static const struct token token_list[] = {
 		.help = "simple XOR hash function",
 		.call = parse_vc_action_rss_func,
 	},
+	[ACTION_RSS_LEVEL] = {
+		.name = "level",
+		.help = "encapsulation level for \"types\"",
+		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, level),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     level))),
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "specific RSS hash types",
@@ -2107,6 +2119,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
 			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+			.level = 0,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index b258c93e8..c0fefe475 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1085,6 +1085,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index e0c68495c..1a09e8a0f 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1311,6 +1311,28 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
+Also, regarding packet encapsulation ``level``:
+
+- ``0`` requests the default behavior. Depending on the packet type, it can
+  mean outermost, innermost, anything in between or even no RSS.
+
+  It basically stands for the innermost encapsulation level RSS can be
+  performed on according to PMD and device capabilities.
+
+- ``1`` requests RSS to be performed on the outermost packet encapsulation
+  level.
+
+- ``2`` and subsequent values request RSS to be performed on the specified
+   inner packet encapsulation level, from outermost to innermost (lower to
+   higher values).
+
+Values other than ``0`` are not necessarily supported.
+
+Requesting a specific RSS level on unrecognized traffic results in undefined
+behavior. For predictable results, it is recommended to make the flow rule
+pattern match packet headers up to the requested encapsulation level so that
+only matching traffic goes through.
+
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1320,6 +1342,8 @@ field only, both can be requested simultaneously.
    +===============+=============================================+
    | ``func``      | RSS hash function to apply                  |
    +---------------+---------------------------------------------+
+   | ``level``     | encapsulation level for ``types``           |
+   +---------------+---------------------------------------------+
    | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
    +---------------+---------------------------------------------+
    | ``key_len``   | hash key length in bytes                    |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 546ef3ab7..3b1073bfc 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3401,6 +3401,8 @@ This section lists supported actions and their attributes, if any.
   - ``func {hash function}``: RSS hash function to apply, allowed tokens are
     the same as `set_hash_global_config`_.
 
+  - ``level {unsigned}``: encapsulation level for ``types``.
+
   - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
     tokens are the same as `set_hash_input_set`_, except that an empty list
     does not disable RSS but instead requests unspecified "best-effort"
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 82307ec5d..d1c0b4b8d 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1314,6 +1314,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index d5c1cd3d3..a3776a0d7 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2906,6 +2906,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2921,6 +2922,7 @@ igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 5e313950c..b104b551c 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11975,6 +11975,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -11990,6 +11991,7 @@ i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 33f77cc80..fef812c6b 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4356,6 +4356,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 00d975b93..438bfcdfb 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2783,6 +2783,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index e17f5a433..23af21712 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5683,6 +5683,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5698,6 +5699,7 @@ ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index dcaf8df44..779641e11 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -796,6 +796,11 @@ mlx4_flow_prepare(struct priv *priv,
 					" is Toeplitz";
 				goto exit_action_not_supported;
 			}
+			if (rss->level) {
+				msg = "a nonzero RSS encapsulation level is"
+					" not supported";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1290,6 +1295,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 0771ad339..bc1176819 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -644,6 +644,14 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " function is Toeplitz");
 				return -rte_errno;
 			}
+			if (rss->level) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "a nonzero RSS encapsulation"
+						   " level is not supported");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,6 +702,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
 				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+				.level = 0,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1927,6 +1936,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2442,6 +2452,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 779edad0c..3028efbf9 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1269,6 +1269,9 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 		return -EINVAL;
 	}
 
+	if (rss->level)
+		return -EINVAL;
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 7abf49ab1..1caefff43 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,11 +2055,15 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
-	/* Check supported hash functions */
+	/* Check supported RSS features */
 	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "a nonzero RSS encapsulation level is not supported");
 
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 0a2c0ac00..1f247d656 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -331,6 +331,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index acf6031ec..cf4a3faf2 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1046,6 +1046,32 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
+	/**
+	 * Packet encapsulation level RSS hash @p types apply to.
+	 *
+	 * - @p 0 requests the default behavior. Depending on the packet
+	 *   type, it can mean outermost, innermost, anything in between or
+	 *   even no RSS.
+	 *
+	 *   It basically stands for the innermost encapsulation level RSS
+	 *   can be performed on according to PMD and device capabilities.
+	 *
+	 * - @p 1 requests RSS to be performed on the outermost packet
+	 *   encapsulation level.
+	 *
+	 * - @p 2 and subsequent values request RSS to be performed on the
+	 *   specified inner packet encapsulation level, from outermost to
+	 *   innermost (lower to higher values).
+	 *
+	 * Values other than @p 0 are not necessarily supported.
+	 *
+	 * Requesting a specific RSS level on unrecognized traffic results
+	 * in undefined behavior. For predictable results, it is recommended
+	 * to make the flow rule pattern match packet headers up to the
+	 * requested encapsulation level so that only matching traffic goes
+	 * through.
+	 */
+	uint32_t level;
 	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (4 preceding siblings ...)
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
@ 2018-04-10 16:36  2%     ` Adrien Mazarguil
  2018-04-11 12:40  0%       ` Andrew Rybchenko
  2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 09/16] ethdev: add encap level " Adrien Mazarguil
                       ` (7 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

By definition, RSS involves some kind of hash algorithm, usually Toeplitz.

Until now it could not be modified on a flow rule basis and PMDs had to
always assume RTE_ETH_HASH_FUNCTION_DEFAULT, which remains the default
behavior when unspecified (0).

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>

---

v3 changes:

- Although RTE_ETH_HASH_FUNCTION_DEFAULT is defined as 0, made comparisons
  more explicit where doing so would clarify the code.

- Updated sfc to include Toeplitz as the other allowed value.

Both according to Andrew's suggestions [1].

[1] http://dpdk.org/ml/archives/dev/2018-April/095840.html
---
 app/test-pmd/cmdline_flow.c                 | 72 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          |  2 +
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  3 +
 drivers/net/e1000/igb_flow.c                |  4 ++
 drivers/net/e1000/igb_rxtx.c                |  4 +-
 drivers/net/i40e/i40e_ethdev.c              |  4 +-
 drivers/net/i40e/i40e_flow.c                |  4 ++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  4 +-
 drivers/net/mlx4/mlx4_flow.c                |  7 +++
 drivers/net/mlx5/mlx5_flow.c                | 13 +++++
 drivers/net/sfc/sfc_flow.c                  |  8 +++
 drivers/net/tap/tap_flow.c                  |  6 ++
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 |  2 +
 16 files changed, 136 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index f6b73ca6e..34f33f671 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -14,6 +14,7 @@
 #include <sys/socket.h>
 
 #include <rte_common.h>
+#include <rte_eth_ctrl.h>
 #include <rte_ethdev.h>
 #include <rte_byteorder.h>
 #include <cmdline_parse.h>
@@ -165,6 +166,10 @@ enum index {
 	ACTION_DROP,
 	ACTION_COUNT,
 	ACTION_RSS,
+	ACTION_RSS_FUNC,
+	ACTION_RSS_FUNC_DEFAULT,
+	ACTION_RSS_FUNC_TOEPLITZ,
+	ACTION_RSS_FUNC_SIMPLE_XOR,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
 	ACTION_RSS_KEY,
@@ -632,6 +637,7 @@ static const enum index action_queue[] = {
 };
 
 static const enum index action_rss[] = {
+	ACTION_RSS_FUNC,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -666,6 +672,9 @@ static int parse_vc_conf(struct context *, const struct token *,
 static int parse_vc_action_rss(struct context *, const struct token *,
 			       const char *, unsigned int, void *,
 			       unsigned int);
+static int parse_vc_action_rss_func(struct context *, const struct token *,
+				    const char *, unsigned int, void *,
+				    unsigned int);
 static int parse_vc_action_rss_type(struct context *, const struct token *,
 				    const char *, unsigned int, void *,
 				    unsigned int);
@@ -1584,6 +1593,29 @@ static const struct token token_list[] = {
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
+	[ACTION_RSS_FUNC] = {
+		.name = "func",
+		.help = "RSS hash function to apply",
+		.next = NEXT(action_rss,
+			     NEXT_ENTRY(ACTION_RSS_FUNC_DEFAULT,
+					ACTION_RSS_FUNC_TOEPLITZ,
+					ACTION_RSS_FUNC_SIMPLE_XOR)),
+	},
+	[ACTION_RSS_FUNC_DEFAULT] = {
+		.name = "default",
+		.help = "default hash function",
+		.call = parse_vc_action_rss_func,
+	},
+	[ACTION_RSS_FUNC_TOEPLITZ] = {
+		.name = "toeplitz",
+		.help = "Toeplitz hash function",
+		.call = parse_vc_action_rss_func,
+	},
+	[ACTION_RSS_FUNC_SIMPLE_XOR] = {
+		.name = "simple_xor",
+		.help = "simple XOR hash function",
+		.call = parse_vc_action_rss_func,
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "specific RSS hash types",
@@ -2074,6 +2106,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	action_rss_data = ctx->object;
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
+			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
@@ -2099,6 +2132,45 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 }
 
 /**
+ * Parse func field for RSS action.
+ *
+ * The RTE_ETH_HASH_FUNCTION_* value to assign is derived from the
+ * ACTION_RSS_FUNC_* index that called this function.
+ */
+static int
+parse_vc_action_rss_func(struct context *ctx, const struct token *token,
+			 const char *str, unsigned int len,
+			 void *buf, unsigned int size)
+{
+	struct action_rss_data *action_rss_data;
+	enum rte_eth_hash_function func;
+
+	(void)buf;
+	(void)size;
+	/* Token name must match. */
+	if (parse_default(ctx, token, str, len, NULL, 0) < 0)
+		return -1;
+	switch (ctx->curr) {
+	case ACTION_RSS_FUNC_DEFAULT:
+		func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+		break;
+	case ACTION_RSS_FUNC_TOEPLITZ:
+		func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+		break;
+	case ACTION_RSS_FUNC_SIMPLE_XOR:
+		func = RTE_ETH_HASH_FUNCTION_SIMPLE_XOR;
+		break;
+	default:
+		return -1;
+	}
+	if (!ctx->object)
+		return len;
+	action_rss_data = ctx->object;
+	action_rss_data->conf.func = func;
+	return len;
+}
+
+/**
  * Parse type field for RSS action.
  *
  * Valid tokens are type field names and the "end" token.
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 717f31774..b258c93e8 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1084,6 +1084,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
+				.func = src.rss->func,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index cf252eeba..e0c68495c 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1318,6 +1318,8 @@ field only, both can be requested simultaneously.
    +---------------+---------------------------------------------+
    | Field         | Value                                       |
    +===============+=============================================+
+   | ``func``      | RSS hash function to apply                  |
+   +---------------+---------------------------------------------+
    | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
    +---------------+---------------------------------------------+
    | ``key_len``   | hash key length in bytes                    |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 17336d163..546ef3ab7 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3398,6 +3398,9 @@ This section lists supported actions and their attributes, if any.
 
 - ``rss``: spread packets among several queues.
 
+  - ``func {hash function}``: RSS hash function to apply, allowed tokens are
+    the same as `set_hash_global_config`_.
+
   - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
     tokens are the same as `set_hash_input_set`_, except that an empty list
     does not disable RSS but instead requests unspecified "best-effort"
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 8dc5f75f2..82307ec5d 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1310,6 +1310,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 45bb3455c..d5c1cd3d3 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2905,6 +2905,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2919,7 +2920,8 @@ int
 igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 0242b5d59..5e313950c 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11974,6 +11974,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -11988,7 +11989,8 @@ int
 i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index db708fb5b..33f77cc80 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4352,6 +4352,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 4e31c7c56..00d975b93 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2779,6 +2779,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 94ea7444d..e17f5a433 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5682,6 +5682,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5696,7 +5697,8 @@ int
 ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 4dbcaa39c..dcaf8df44 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -790,6 +790,12 @@ mlx4_flow_prepare(struct priv *priv,
 					" of the context size";
 				goto exit_action_not_supported;
 			}
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				msg = "the only supported RSS hash function"
+					" is Toeplitz";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1283,6 +1289,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 7798052f9..0771ad339 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@
 #endif
 
 #include <rte_common.h>
+#include <rte_eth_ctrl.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
@@ -634,6 +635,15 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			if (overlap & FATE)
 				goto exit_action_overlap;
 			overlap |= FATE;
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "the only supported RSS hash"
+						   " function is Toeplitz");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -683,6 +693,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				}
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
+				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1915,6 +1926,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	/* Copy configuration. */
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2429,6 +2441,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	};
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 1a2c0299c..779edad0c 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1261,6 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 			rxq_hw_index_max = rxq->hw_index;
 	}
 
+	switch (rss->func) {
+	case RTE_ETH_HASH_FUNCTION_DEFAULT:
+	case RTE_ETH_HASH_FUNCTION_TOEPLITZ:
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 78f20913f..7abf49ab1 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,6 +2055,12 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
+	/* Check supported hash functions */
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "non-default RSS hash functions are not supported");
+
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
 	if (err < 0) {
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 2fabc9a29..0a2c0ac00 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -330,6 +330,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
+				.func = src.rss->func,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 4385e7eaa..acf6031ec 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -19,6 +19,7 @@
 
 #include <rte_arp.h>
 #include <rte_ether.h>
+#include <rte_eth_ctrl.h>
 #include <rte_icmp.h>
 #include <rte_ip.h>
 #include <rte_sctp.h>
@@ -1044,6 +1045,7 @@ struct rte_flow_query_count {
  * both can be requested simultaneously.
  */
 struct rte_flow_action_rss {
+	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
 	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (3 preceding siblings ...)
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
@ 2018-04-10 16:36  1%     ` Adrien Mazarguil
  2018-04-11 13:06  0%       ` Andrew Rybchenko
  2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
                       ` (8 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon, Radu Nicolau, Akhil Goyal

Since its inception, the rte_flow RSS action has been relying in part on
external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
This structure lacks parameters such as the hash algorithm to use, and more
recently, a method to tell which layer RSS should be performed on [1].

Given struct rte_eth_rss_conf will never be flexible enough to represent a
complete RSS configuration (e.g. RETA table), this patch supersedes it by
extending the rte_flow RSS action directly.

A subsequent patch will add a field to use a non-default RSS hash
algorithm. To that end, a field named "types" replaces the field formerly
known as "rss_hf" and standing for "RSS hash functions" as it was
confusing. Actual RSS hash function types are defined by enum
rte_eth_hash_function.

This patch updates all PMDs and example applications accordingly.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

[1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
Cc: Radu Nicolau <radu.nicolau@intel.com>
Cc: Akhil Goyal <akhil.goyal@nxp.com>

---

v3 changes:

Documentation update regarding the meaning of a 0 value for RSS types in
flow rules.

It used to implicitly mean "no RSS" but is redefined as requesting a kind
of "best-effort" mode from PMDs, i.e. anything ranging from empty to
all-inclusive RSS; what matters is it provides safe defaults that will work
regardless of PMD capabilities.
---
 app/test-pmd/cmdline_flow.c                 |  48 +++---
 app/test-pmd/config.c                       |  39 ++---
 doc/guides/prog_guide/rte_flow.rst          |  28 ++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   6 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  31 ++--
 drivers/net/e1000/igb_rxtx.c                |  51 +++++-
 drivers/net/i40e/i40e_ethdev.c              |  53 +++++--
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                |  57 ++++---
 drivers/net/ixgbe/ixgbe_ethdev.c            |   4 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  30 ++--
 drivers/net/ixgbe/ixgbe_rxtx.c              |  51 +++++-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                |  61 +++----
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 193 +++++++++++------------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +--
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +--
 drivers/net/sfc/sfc_flow.c                  |  21 ++-
 drivers/net/tap/tap_flow.c                  |   8 +-
 examples/ipsec-secgw/ipsec.c                |  10 +-
 lib/librte_ether/rte_flow.c                 |  39 ++---
 lib/librte_ether/rte_flow.h                 |  12 +-
 28 files changed, 484 insertions(+), 359 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index acf19eb8a..f6b73ca6e 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -192,9 +192,8 @@ enum index {
 /** Storage for struct rte_flow_action_rss including external data. */
 struct action_rss_data {
 	struct rte_flow_action_rss conf;
+	uint8_t key[RSS_HASH_KEY_LENGTH];
 	uint16_t queue[ACTION_RSS_QUEUE_NUM];
-	struct rte_eth_rss_conf rss_conf;
-	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -1587,7 +1586,7 @@ static const struct token token_list[] = {
 	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
-		.help = "RSS hash types",
+		.help = "specific RSS hash types",
 		.next = NEXT(action_rss, NEXT_ENTRY(ACTION_RSS_TYPE)),
 	},
 	[ACTION_RSS_TYPE] = {
@@ -1602,21 +1601,21 @@ static const struct token token_list[] = {
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
 		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
 			     ARGS_ENTRY_ARB
-			     (offsetof(struct action_rss_data, rss_conf) +
-			      offsetof(struct rte_eth_rss_conf, rss_key_len),
-			      sizeof(((struct rte_eth_rss_conf *)0)->
-				     rss_key_len)),
-			     ARGS_ENTRY(struct action_rss_data, rss_key)),
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, key_len),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     key_len)),
+			     ARGS_ENTRY(struct action_rss_data, key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (offsetof(struct action_rss_data, rss_conf) +
-			      offsetof(struct rte_eth_rss_conf, rss_key_len),
-			      sizeof(((struct rte_eth_rss_conf *)0)->
-				     rss_key_len),
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, key_len),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     key_len),
 			      0,
 			      RSS_HASH_KEY_LENGTH)),
 	},
@@ -2075,27 +2074,24 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	action_rss_data = ctx->object;
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->rss_conf,
-			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.types = rss_hf,
+			.key_len = sizeof(action_rss_data->key),
+			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.key = action_rss_data->key,
 			.queue = action_rss_data->queue,
 		},
+		.key = "testpmd's default RSS hash key",
 		.queue = { 0 },
-		.rss_conf = (struct rte_eth_rss_conf){
-			.rss_key = action_rss_data->rss_key,
-			.rss_key_len = sizeof(action_rss_data->rss_key),
-			.rss_hf = rss_hf,
-		},
-		.rss_key = "testpmd's default RSS hash key",
 	};
-	for (i = 0; i < action_rss_data->conf.num; ++i)
+	for (i = 0; i < action_rss_data->conf.queue_num; ++i)
 		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
 		struct rte_eth_dev_info info;
 
 		rte_eth_dev_info_get(ctx->port, &info);
-		action_rss_data->rss_conf.rss_key_len =
-			RTE_MIN(sizeof(action_rss_data->rss_key),
+		action_rss_data->conf.key_len =
+			RTE_MIN(sizeof(action_rss_data->key),
 				info.hash_key_size);
 	}
 	action->conf = &action_rss_data->conf;
@@ -2123,7 +2119,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->rss_conf.rss_hf = 0;
+		action_rss_data->conf.types = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2142,7 +2138,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->conf.types |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2192,7 +2188,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue_num = i;
 	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 052163357..717f31774 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1084,40 +1084,27 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
-				.num = src.rss->num,
+				.types = src.rss->types,
+				.key_len = src.rss->key_len,
+				.queue_num = src.rss->queue_num,
 			};
 		off += sizeof(*src.rss);
-		if (src.rss->num) {
+		if (src.rss->key_len) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->queue) * src.rss->num;
+			size = sizeof(*src.rss->key) * src.rss->key_len;
 			if (dst.rss)
-				dst.rss->queue = memcpy
+				dst.rss->key = memcpy
 					((void *)((uintptr_t)dst.rss + off),
-					 src.rss->queue, size);
+					 src.rss->key, size);
 			off += size;
 		}
-		off = RTE_ALIGN_CEIL(off, sizeof(double));
-		if (dst.rss) {
-			dst.rss->rss_conf = (void *)((uintptr_t)dst.rss + off);
-			*(struct rte_eth_rss_conf *)(uintptr_t)
-				dst.rss->rss_conf = (struct rte_eth_rss_conf){
-				.rss_key_len = src.rss->rss_conf->rss_key_len,
-				.rss_hf = src.rss->rss_conf->rss_hf,
-			};
-		}
-		off += sizeof(*src.rss->rss_conf);
-		if (src.rss->rss_conf->rss_key_len) {
+		if (src.rss->queue_num) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->rss_conf->rss_key) *
-				src.rss->rss_conf->rss_key_len;
-			if (dst.rss) {
-				((struct rte_eth_rss_conf *)(uintptr_t)
-				 dst.rss->rss_conf)->rss_key =
-					(void *)((uintptr_t)dst.rss + off);
-				memcpy(dst.rss->rss_conf->rss_key,
-				       src.rss->rss_conf->rss_key,
-				       size);
-			}
+			size = sizeof(*src.rss->queue) * src.rss->queue_num;
+			if (dst.rss)
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		size = off;
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index acbeaacbd..cf252eeba 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1301,6 +1301,12 @@ Action: ``RSS``
 Similar to QUEUE, except RSS is additionally performed on packets to spread
 them among several queues according to the provided parameters.
 
+Unlike global RSS settings used by other DPDK APIs, unsetting the ``types``
+field does not disable RSS in a flow rule. Doing so instead requests safe
+unspecified "best-effort" settings from the underlying PMD, which depending
+on the flow rule, may result in anything ranging from empty (single queue)
+to all-inclusive RSS.
+
 Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
@@ -1309,15 +1315,19 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+--------------------------------+
-   | Field        | Value                          |
-   +==============+================================+
-   | ``rss_conf`` | RSS parameters                 |
-   +--------------+--------------------------------+
-   | ``num``      | number of entries in ``queue`` |
-   +--------------+--------------------------------+
-   | ``queue``    | queue indices to use           |
-   +--------------+--------------------------------+
+   +---------------+---------------------------------------------+
+   | Field         | Value                                       |
+   +===============+=============================================+
+   | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
+   +---------------+---------------------------------------------+
+   | ``key_len``   | hash key length in bytes                    |
+   +---------------+---------------------------------------------+
+   | ``queue_num`` | number of entries in ``queue``              |
+   +---------------+---------------------------------------------+
+   | ``key``       | hash key                                    |
+   +---------------+---------------------------------------------+
+   | ``queue``     | queue indices to use                        |
+   +---------------+---------------------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index a015d02a4..17336d163 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3398,8 +3398,10 @@ This section lists supported actions and their attributes, if any.
 
 - ``rss``: spread packets among several queues.
 
-  - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
-    are the same as `set_hash_input_set`_, an empty list means none (0).
+  - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
+    tokens are the same as `set_hash_input_set`_, except that an empty list
+    does not disable RSS but instead requests unspecified "best-effort"
+    settings.
 
   - ``key {string}``: RSS hash key, overrides ``key_len``.
 
diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 6354b894a..902001f36 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -4,6 +4,10 @@
 
 #ifndef _E1000_ETHDEV_H_
 #define _E1000_ETHDEV_H_
+
+#include <stdint.h>
+
+#include <rte_flow.h>
 #include <rte_time.h>
 #include <rte_pci.h>
 
@@ -27,6 +31,7 @@
 #define E1000_CTRL_EXT_EXTEND_VLAN  (1<<26)    /* EXTENDED VLAN */
 #define IGB_VFTA_SIZE 128
 
+#define IGB_HKEY_MAX_INDEX             10
 #define IGB_MAX_RX_QUEUE_NUM           8
 #define IGB_MAX_RX_QUEUE_NUM_82576     16
 
@@ -229,8 +234,8 @@ struct igb_ethertype_filter {
 };
 
 struct igb_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
+	uint8_t key[IGB_HKEY_MAX_INDEX * sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[IGB_MAX_RX_QUEUE_NUM]; /**< Queues indices to use. */
 };
 
@@ -501,6 +506,10 @@ int eth_igb_syn_filter_set(struct rte_eth_dev *dev,
 int eth_igb_add_del_flex_filter(struct rte_eth_dev *dev,
 			struct rte_eth_flex_filter *filter,
 			bool add);
+int igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
+		      const struct rte_flow_action_rss *in);
+int igb_action_rss_same(const struct rte_flow_action_rss *comp,
+			const struct rte_flow_action_rss *with);
 int igb_config_rss_filter(struct rte_eth_dev *dev,
 			struct igb_rte_flow_rss_conf *conf,
 			bool add);
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 8d4226676..7a431ac33 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -41,8 +41,6 @@
 #define IGB_DEFAULT_TX_HTHRESH      1
 #define IGB_DEFAULT_TX_WTHRESH      ((hw->mac.type == e1000_82576) ? 1 : 16)
 
-#define IGB_HKEY_MAX_INDEX 10
-
 /* Bit shift and mask */
 #define IGB_4_BIT_WIDTH  (CHAR_BIT / 2)
 #define IGB_4_BIT_MASK   RTE_LEN2MASK(IGB_4_BIT_WIDTH, uint8_t)
@@ -5576,7 +5574,7 @@ igb_rss_filter_restore(struct rte_eth_dev *dev)
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		igb_config_rss_filter(dev, &filter_info->rss_info, TRUE);
 }
 
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index c0f5b5190..8dc5f75f2 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1292,7 +1292,7 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 
 	rss = (const struct rte_flow_action_rss *)act->conf;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -1300,7 +1300,7 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -1310,14 +1310,18 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
-	if (rss->rss_conf)
-		rss_conf->rss_conf = *rss->rss_conf;
-	else
-		rss_conf->rss_conf.rss_hf = IGB_RSS_OFFLOAD_ALL;
-
-	for (n = 0; n < rss->num; ++n)
-		rss_conf->queue[n] = rss->queue[n];
-	rss_conf->num = rss->num;
+	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key must be exactly 40 bytes");
+	if (rss->queue_num > RTE_DIM(rss_conf->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (igb_rss_conf_init(rss_conf, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
 	/* check if the next not void item is END */
 	index++;
@@ -1518,9 +1522,8 @@ igb_flow_create(struct rte_eth_dev *dev,
 				PMD_DRV_LOG(ERR, "failed to allocate memory");
 				goto out;
 			}
-			rte_memcpy(&rss_filter_ptr->filter_info,
-				&rss_conf,
-				sizeof(struct igb_rte_flow_rss_conf));
+			igb_rss_conf_init(&rss_filter_ptr->filter_info,
+					  &rss_conf.conf);
 			TAILQ_INSERT_TAIL(&igb_filter_rss_list,
 				rss_filter_ptr, entries);
 			flow->rule = rss_filter_ptr;
@@ -1757,7 +1760,7 @@ igb_clear_rss_filter(struct rte_eth_dev *dev)
 	struct e1000_filter_info *filter =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter->rss_info.num)
+	if (filter->rss_info.conf.queue_num)
 		igb_config_rss_filter(dev, &filter->rss_info, FALSE);
 }
 
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 323913f0d..45bb3455c 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2898,12 +2898,47 @@ igb_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 }
 
 int
+igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
+		  const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+igb_action_rss_same(const struct rte_flow_action_rss *comp,
+		    const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 igb_config_rss_filter(struct rte_eth_dev *dev,
 		struct igb_rte_flow_rss_conf *conf, bool add)
 {
 	uint32_t shift;
 	uint16_t i, j;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
@@ -2911,8 +2946,8 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
 	if (!add) {
-		if (memcmp(conf, &filter_info->rss_info,
-			sizeof(struct igb_rte_flow_rss_conf)) == 0) {
+		if (igb_action_rss_same(&filter_info->rss_info.conf,
+					&conf->conf)) {
 			igb_rss_disable(dev);
 			memset(&filter_info->rss_info, 0,
 				sizeof(struct igb_rte_flow_rss_conf));
@@ -2921,7 +2956,7 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		return -EINVAL;
 
 	/* Fill in redirection table. */
@@ -2933,9 +2968,9 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		} reta;
 		uint8_t q_idx;
 
-		if (j == conf->num)
+		if (j == conf->conf.queue_num)
 			j = 0;
-		q_idx = conf->queue[j];
+		q_idx = conf->conf.queue[j];
 		reta.bytes[i & 3] = (uint8_t)(q_idx << shift);
 		if ((i & 3) == 3)
 			E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
@@ -2952,8 +2987,8 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		rss_conf.rss_key = rss_intel_key; /* Default hash key */
 	igb_hw_rss_hash_set(hw, &rss_conf);
 
-	rte_memcpy(&filter_info->rss_info,
-		conf, sizeof(struct igb_rte_flow_rss_conf));
+	if (igb_rss_conf_init(&filter_info->rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 6e06f8a2b..0242b5d59 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11,6 +11,7 @@
 #include <inttypes.h>
 #include <assert.h>
 
+#include <rte_common.h>
 #include <rte_eal.h>
 #include <rte_string_fns.h>
 #include <rte_pci.h>
@@ -11467,7 +11468,7 @@ i40e_rss_filter_restore(struct i40e_pf *pf)
 {
 	struct i40e_rte_flow_rss_conf *conf =
 					&pf->rss_info;
-	if (conf->num)
+	if (conf->conf.queue_num)
 		i40e_config_rss_filter(pf, conf, TRUE);
 }
 
@@ -11966,18 +11967,52 @@ i40e_cloud_filter_qinq_create(struct i40e_pf *pf)
 }
 
 int
+i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
+		   const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+i40e_action_rss_same(const struct rte_flow_action_rss *comp,
+		     const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 i40e_config_rss_filter(struct i40e_pf *pf,
 		struct i40e_rte_flow_rss_conf *conf, bool add)
 {
 	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
 	uint32_t i, lut = 0;
 	uint16_t j, num;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct i40e_rte_flow_rss_conf *rss_info = &pf->rss_info;
 
 	if (!add) {
-		if (memcmp(conf, rss_info,
-			sizeof(struct i40e_rte_flow_rss_conf)) == 0) {
+		if (i40e_action_rss_same(&rss_info->conf, &conf->conf)) {
 			i40e_pf_disable_rss(pf);
 			memset(rss_info, 0,
 				sizeof(struct i40e_rte_flow_rss_conf));
@@ -11986,7 +12021,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 		return -EINVAL;
 	}
 
-	if (rss_info->num)
+	if (rss_info->conf.queue_num)
 		return -EINVAL;
 
 	/* If both VMDQ and RSS enabled, not all of PF queues are configured.
@@ -11997,7 +12032,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 	else
 		num = pf->dev_data->nb_rx_queues;
 
-	num = RTE_MIN(num, conf->num);
+	num = RTE_MIN(num, conf->conf.queue_num);
 	PMD_DRV_LOG(INFO, "Max of contiguous %u PF queues are configured",
 			num);
 
@@ -12010,7 +12045,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 	for (i = 0, j = 0; i < hw->func_caps.rss_table_size; i++, j++) {
 		if (j == num)
 			j = 0;
-		lut = (lut << 8) | (conf->queue[j] & ((0x1 <<
+		lut = (lut << 8) | (conf->conf.queue[j] & ((0x1 <<
 			hw->func_caps.rss_table_entry_width) - 1));
 		if ((i & 3) == 3)
 			I40E_WRITE_REG(hw, I40E_PFQF_HLUT(i >> 2), lut);
@@ -12035,8 +12070,8 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 
 	i40e_hw_rss_hash_set(pf, &rss_conf);
 
-	rte_memcpy(rss_info,
-		conf, sizeof(struct i40e_rte_flow_rss_conf));
+	if (i40e_rss_conf_init(rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index 151ed1a8c..5c02b37a0 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -5,13 +5,18 @@
 #ifndef _I40E_ETHDEV_H_
 #define _I40E_ETHDEV_H_
 
+#include <stdint.h>
+
 #include <rte_eth_ctrl.h>
 #include <rte_time.h>
 #include <rte_kvargs.h>
 #include <rte_hash.h>
+#include <rte_flow.h>
 #include <rte_flow_driver.h>
 #include <rte_tm_driver.h>
 
+#include "base/i40e_register.h"
+
 #define I40E_VLAN_TAG_SIZE        4
 
 #define I40E_AQ_LEN               32
@@ -877,9 +882,11 @@ struct i40e_customized_pctype {
 };
 
 struct i40e_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
 	uint16_t queue_region_conf; /**< Queue region config flag */
-	uint16_t num; /**< Number of entries in queue[]. */
+	uint8_t key[(I40E_VFQF_HKEY_MAX_INDEX > I40E_PFQF_HKEY_MAX_INDEX ?
+		     I40E_VFQF_HKEY_MAX_INDEX : I40E_PFQF_HKEY_MAX_INDEX) + 1 *
+		    sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[I40E_MAX_Q_PER_TC]; /**< Queues indices to use. */
 };
 
@@ -1217,6 +1224,10 @@ void i40e_init_queue_region_conf(struct rte_eth_dev *dev);
 void i40e_flex_payload_reg_set_default(struct i40e_hw *hw);
 int i40e_set_rss_key(struct i40e_vsi *vsi, uint8_t *key, uint8_t key_len);
 int i40e_set_rss_lut(struct i40e_vsi *vsi, uint8_t *lut, uint16_t lut_size);
+int i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
+		       const struct rte_flow_action_rss *in);
+int i40e_action_rss_same(const struct rte_flow_action_rss *comp,
+			 const struct rte_flow_action_rss *with);
 int i40e_config_rss_filter(struct i40e_pf *pf,
 		struct i40e_rte_flow_rss_conf *conf, bool add);
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index a32ad9b58..db708fb5b 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4207,7 +4207,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 
 	if (action_flag) {
 		for (n = 0; n < 64; n++) {
-			if (rss->rss_conf->rss_hf & (hf_bit << n)) {
+			if (rss->types & (hf_bit << n)) {
 				conf_info->region[0].hw_flowtype[0] = n;
 				conf_info->region[0].flowtype_num = 1;
 				conf_info->queue_region_number = 1;
@@ -4217,12 +4217,12 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	}
 
 	if (conf_info->queue_region_number) {
-		for (i = 0; i < rss->num; i++) {
-			for (j = 0; j < rss_info->num; j++) {
-				if (rss->queue[i] == rss_info->queue[j])
+		for (i = 0; i < rss->queue_num; i++) {
+			for (j = 0; j < rss_info->conf.queue_num; j++) {
+				if (rss->queue[i] == rss_info->conf.queue[j])
 					break;
 			}
-			if (j == rss_info->num) {
+			if (j == rss_info->conf.queue_num) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
 					act,
@@ -4231,7 +4231,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			}
 		}
 
-		for (i = 0; i < rss->num - 1; i++) {
+		for (i = 0; i < rss->queue_num - 1; i++) {
 			if (rss->queue[i + 1] != rss->queue[i] + 1) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
@@ -4245,8 +4245,8 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	for (n = 0; n < conf_info->queue_region_number; n++) {
 		if (conf_info->region[n].user_priority_num ||
 				conf_info->region[n].flowtype_num) {
-			if (!((rte_is_power_of_2(rss->num)) &&
-					rss->num <= 64)) {
+			if (!((rte_is_power_of_2(rss->queue_num)) &&
+					rss->queue_num <= 64)) {
 				PMD_DRV_LOG(ERR, "The region sizes should be any of the following values: 1, 2, 4, 8, 16, 32, 64 as long as the "
 				"total number of queues do not exceed the VSI allocation");
 				return -rte_errno;
@@ -4264,10 +4264,11 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 				return -rte_errno;
 			}
 
-			if (rss_info->num < rss->num ||
-				rss->queue[0] < rss_info->queue[0] ||
-				(rss->queue[0] + rss->num >
-					rss_info->num + rss_info->queue[0])) {
+			if (rss_info->conf.queue_num < rss->queue_num ||
+				rss->queue[0] < rss_info->conf.queue[0] ||
+				(rss->queue[0] + rss->queue_num >
+					rss_info->conf.queue_num +
+					rss_info->conf.queue[0])) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
 					act,
@@ -4276,7 +4277,8 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			}
 
 			for (i = 0; i < info->queue_region_number; i++) {
-				if (info->region[i].queue_num == rss->num &&
+				if (info->region[i].queue_num ==
+				    rss->queue_num &&
 					info->region[i].queue_start_index ==
 						rss->queue[0])
 					break;
@@ -4289,7 +4291,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 				}
 
 				info->region[i].queue_num =
-					rss->num;
+					rss->queue_num;
 				info->region[i].queue_start_index =
 					rss->queue[0];
 				info->region[i].region_id =
@@ -4332,7 +4334,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	if (rss_config->queue_region_conf)
 		return 0;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -4340,7 +4342,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -4349,15 +4351,20 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			return -rte_errno;
 		}
 	}
-	if (rss->rss_conf)
-		rss_config->rss_conf = *rss->rss_conf;
-	else
-		rss_config->rss_conf.rss_hf =
-			pf->adapter->flow_types_mask;
 
-	for (n = 0; n < rss->num; ++n)
-		rss_config->queue[n] = rss->queue[n];
-	rss_config->num = rss->num;
+	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key too large");
+	if (rss->queue_num > RTE_DIM(rss_config->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (i40e_rss_conf_init(rss_config, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
+
 	index++;
 
 	/* check if the next not void action is END */
@@ -4877,7 +4884,7 @@ i40e_flow_flush_rss_filter(struct rte_eth_dev *dev)
 
 	ret = i40e_flush_queue_region_all_conf(dev, hw, pf, 0);
 
-	if (rss_info->num)
+	if (rss_info->conf.queue_num)
 		ret = i40e_config_rss_filter(pf, rss_info, FALSE);
 	return ret;
 }
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 752a17af0..ea3624ba4 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -100,8 +100,6 @@
 
 #define IXGBE_QUEUE_STAT_COUNTERS (sizeof(hw_stats->qprc) / sizeof(hw_stats->qprc[0]))
 
-#define IXGBE_HKEY_MAX_INDEX 10
-
 /* Additional timesync values. */
 #define NSEC_PER_SEC             1000000000L
 #define IXGBE_INCVAL_10GB        0x66666666
@@ -8276,7 +8274,7 @@ ixgbe_rss_filter_restore(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		ixgbe_config_rss_filter(dev,
 			&filter_info->rss_info, TRUE);
 }
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index 655077700..9491b03f4 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -4,6 +4,9 @@
 
 #ifndef _IXGBE_ETHDEV_H_
 #define _IXGBE_ETHDEV_H_
+
+#include <stdint.h>
+
 #include "base/ixgbe_type.h"
 #include "base/ixgbe_dcb.h"
 #include "base/ixgbe_dcb_82599.h"
@@ -12,6 +15,7 @@
 #ifdef RTE_LIBRTE_SECURITY
 #include "ixgbe_ipsec.h"
 #endif
+#include <rte_flow.h>
 #include <rte_time.h>
 #include <rte_hash.h>
 #include <rte_pci.h>
@@ -39,6 +43,7 @@
 #define IXGBE_EXTENDED_VLAN	  (uint32_t)(1 << 26) /* EXTENDED VLAN ENABLE */
 #define IXGBE_VFTA_SIZE 128
 #define IXGBE_VLAN_TAG_SIZE 4
+#define IXGBE_HKEY_MAX_INDEX 10
 #define IXGBE_MAX_RX_QUEUE_NUM	128
 #define IXGBE_MAX_INTR_QUEUE_NUM	15
 #define IXGBE_VMDQ_DCB_NB_QUEUES     IXGBE_MAX_RX_QUEUE_NUM
@@ -196,8 +201,8 @@ struct ixgbe_hw_fdir_info {
 };
 
 struct ixgbe_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
+	uint8_t key[IXGBE_HKEY_MAX_INDEX * sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[IXGBE_MAX_RX_QUEUE_NUM]; /**< Queues indices to use. */
 };
 
@@ -696,6 +701,10 @@ void ixgbe_tm_conf_init(struct rte_eth_dev *dev);
 void ixgbe_tm_conf_uninit(struct rte_eth_dev *dev);
 int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev, uint16_t queue_idx,
 			       uint16_t tx_rate);
+int ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
+			const struct rte_flow_action_rss *in);
+int ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
+			  const struct rte_flow_action_rss *with);
 int ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		struct ixgbe_rte_flow_rss_conf *conf, bool add);
 
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index abdeac28b..4e31c7c56 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2761,7 +2761,7 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 
 	rss = (const struct rte_flow_action_rss *)act->conf;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -2769,7 +2769,7 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -2778,14 +2778,19 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 			return -rte_errno;
 		}
 	}
-	if (rss->rss_conf)
-		rss_conf->rss_conf = *rss->rss_conf;
-	else
-		rss_conf->rss_conf.rss_hf = IXGBE_RSS_OFFLOAD_ALL;
 
-	for (n = 0; n < rss->num; ++n)
-		rss_conf->queue[n] = rss->queue[n];
-	rss_conf->num = rss->num;
+	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key must be exactly 40 bytes");
+	if (rss->queue_num > RTE_DIM(rss_conf->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (ixgbe_rss_conf_init(rss_conf, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
 	/* check if the next not void item is END */
 	act = next_no_void_action(actions, act);
@@ -2834,7 +2839,7 @@ ixgbe_clear_rss_filter(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		ixgbe_config_rss_filter(dev, &filter_info->rss_info, FALSE);
 }
 
@@ -3153,9 +3158,8 @@ ixgbe_flow_create(struct rte_eth_dev *dev,
 				PMD_DRV_LOG(ERR, "failed to allocate memory");
 				goto out;
 			}
-			rte_memcpy(&rss_filter_ptr->filter_info,
-				&rss_conf,
-				sizeof(struct ixgbe_rte_flow_rss_conf));
+			ixgbe_rss_conf_init(&rss_filter_ptr->filter_info,
+					    &rss_conf.conf);
 			TAILQ_INSERT_TAIL(&filter_rss_list,
 				rss_filter_ptr, entries);
 			flow->rule = rss_filter_ptr;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 7511e183f..94ea7444d 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5675,6 +5675,36 @@ ixgbevf_dev_rxtx_start(struct rte_eth_dev *dev)
 }
 
 int
+ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
+		    const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
+		      const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		struct ixgbe_rte_flow_rss_conf *conf, bool add)
 {
@@ -5684,7 +5714,12 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	uint16_t j;
 	uint16_t sp_reta_size;
 	uint32_t reta_reg;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
@@ -5694,8 +5729,8 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	sp_reta_size = ixgbe_reta_size_get(hw->mac.type);
 
 	if (!add) {
-		if (memcmp(conf, &filter_info->rss_info,
-			sizeof(struct ixgbe_rte_flow_rss_conf)) == 0) {
+		if (ixgbe_action_rss_same(&filter_info->rss_info.conf,
+					  &conf->conf)) {
 			ixgbe_rss_disable(dev);
 			memset(&filter_info->rss_info, 0,
 				sizeof(struct ixgbe_rte_flow_rss_conf));
@@ -5704,7 +5739,7 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		return -EINVAL;
 	/* Fill in redirection table
 	 * The byte-swap is needed because NIC registers are in
@@ -5714,9 +5749,9 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	for (i = 0, j = 0; i < sp_reta_size; i++, j++) {
 		reta_reg = ixgbe_reta_reg_get(hw->mac.type, i);
 
-		if (j == conf->num)
+		if (j == conf->conf.queue_num)
 			j = 0;
-		reta = (reta << 8) | conf->queue[j];
+		reta = (reta << 8) | conf->conf.queue[j];
 		if ((i & 3) == 3)
 			IXGBE_WRITE_REG(hw, reta_reg,
 					rte_bswap32(reta));
@@ -5733,8 +5768,8 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		rss_conf.rss_key = rss_intel_key; /* Default hash key */
 	ixgbe_hw_rss_hash_set(hw, &rss_conf);
 
-	rte_memcpy(&filter_info->rss_info,
-		conf, sizeof(struct ixgbe_rte_flow_rss_conf));
+	if (ixgbe_rss_conf_init(&filter_info->rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index fb8a8b848..c7854bead 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -569,7 +569,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			     " for UDP RSS and inner VXLAN RSS");
 			/* Fake support for all possible RSS hash fields. */
 			priv->hw_rss_sup = ~UINT64_C(0);
-			priv->hw_rss_sup = mlx4_conv_rss_hf(priv, -1);
+			priv->hw_rss_sup = mlx4_conv_rss_types(priv, -1);
 			/* Filter out known unsupported fields. */
 			priv->hw_rss_sup &=
 				~(uint64_t)(IBV_RX_HASH_SRC_PORT_UDP |
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 5a1b7dedd..4dbcaa39c 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -76,22 +76,22 @@ struct mlx4_drop {
 };
 
 /**
- * Convert DPDK RSS hash fields to their Verbs equivalent.
+ * Convert DPDK RSS hash types to their Verbs equivalent.
  *
- * This function returns the supported (default) set when @p rss_hf has
+ * This function returns the supported (default) set when @p types has
  * special value (uint64_t)-1.
  *
  * @param priv
  *   Pointer to private structure.
- * @param rss_hf
- *   Hash fields in DPDK format (see struct rte_eth_rss_conf).
+ * @param types
+ *   Hash types in DPDK format (see struct rte_eth_rss_conf).
  *
  * @return
  *   A valid Verbs RSS hash fields mask for mlx4 on success, (uint64_t)-1
  *   otherwise and rte_errno is set.
  */
 uint64_t
-mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
+mlx4_conv_rss_types(struct priv *priv, uint64_t types)
 {
 	enum { IPV4, IPV6, TCP, UDP, };
 	const uint64_t in[] = {
@@ -126,17 +126,17 @@ mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
 	unsigned int i;
 
 	for (i = 0; i != RTE_DIM(in); ++i)
-		if (rss_hf & in[i]) {
-			seen |= rss_hf & in[i];
+		if (types & in[i]) {
+			seen |= types & in[i];
 			conv |= out[i];
 		}
 	if ((conv & priv->hw_rss_sup) == conv) {
-		if (rss_hf == (uint64_t)-1) {
+		if (types == (uint64_t)-1) {
 			/* Include inner RSS by default if supported. */
 			conv |= priv->hw_rss_sup & IBV_RX_HASH_INNER;
 			return conv;
 		}
-		if (!(rss_hf & ~seen))
+		if (!(types & ~seen))
 			return conv;
 	}
 	rte_errno = ENOTSUP;
@@ -717,7 +717,8 @@ mlx4_flow_prepare(struct priv *priv,
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
-			const struct rte_eth_rss_conf *rss_conf;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
 			uint64_t fields;
 			unsigned int i;
 
@@ -747,58 +748,56 @@ mlx4_flow_prepare(struct priv *priv,
 				break;
 			rss = action->conf;
 			/* Default RSS configuration if none is provided. */
-			rss_conf =
-				rss->rss_conf ?
-				rss->rss_conf :
-				&(struct rte_eth_rss_conf){
-					.rss_key = mlx4_rss_hash_key_default,
-					.rss_key_len = MLX4_RSS_HASH_KEY_SIZE,
-					.rss_hf = -1,
-				};
+			if (rss->key_len) {
+				rss_key = rss->key;
+				rss_key_len = rss->key_len;
+			} else {
+				rss_key = mlx4_rss_hash_key_default;
+				rss_key_len = MLX4_RSS_HASH_KEY_SIZE;
+			}
 			/* Sanity checks. */
-			for (i = 0; i < rss->num; ++i)
+			for (i = 0; i < rss->queue_num; ++i)
 				if (rss->queue[i] >=
 				    priv->dev->data->nb_rx_queues)
 					break;
-			if (i != rss->num) {
+			if (i != rss->queue_num) {
 				msg = "queue index target beyond number of"
 					" configured Rx queues";
 				goto exit_action_not_supported;
 			}
-			if (!rte_is_power_of_2(rss->num)) {
+			if (!rte_is_power_of_2(rss->queue_num)) {
 				msg = "for RSS, mlx4 requires the number of"
 					" queues to be a power of two";
 				goto exit_action_not_supported;
 			}
-			if (rss_conf->rss_key_len !=
-			    sizeof(flow->rss->key)) {
+			if (rss_key_len != sizeof(flow->rss->key)) {
 				msg = "mlx4 supports exactly one RSS hash key"
 					" length: "
 					MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE);
 				goto exit_action_not_supported;
 			}
-			for (i = 1; i < rss->num; ++i)
+			for (i = 1; i < rss->queue_num; ++i)
 				if (rss->queue[i] - rss->queue[i - 1] != 1)
 					break;
-			if (i != rss->num) {
+			if (i != rss->queue_num) {
 				msg = "mlx4 requires RSS contexts to use"
 					" consecutive queue indices only";
 				goto exit_action_not_supported;
 			}
-			if (rss->queue[0] % rss->num) {
+			if (rss->queue[0] % rss->queue_num) {
 				msg = "mlx4 requires the first queue of a RSS"
 					" context to be aligned on a multiple"
 					" of the context size";
 				goto exit_action_not_supported;
 			}
 			rte_errno = 0;
-			fields = mlx4_conv_rss_hf(priv, rss_conf->rss_hf);
+			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
 				msg = "unsupported RSS hash type requested";
 				goto exit_action_not_supported;
 			}
 			flow->rss = mlx4_rss_get
-				(priv, fields, rss_conf->rss_key, rss->num,
+				(priv, fields, rss_key, rss->queue_num,
 				 rss->queue);
 			if (!flow->rss) {
 				msg = "either invalid parameters or not enough"
@@ -1284,8 +1283,10 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
+		.types = -1,
+		.key_len = MLX4_RSS_HASH_KEY_SIZE,
+		.queue_num = queues,
+		.key = mlx4_rss_hash_key_default,
 		.queue = queue,
 	};
 	struct rte_flow_action actions[] = {
diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h
index 00188a65c..f71078ecc 100644
--- a/drivers/net/mlx4/mlx4_flow.h
+++ b/drivers/net/mlx4/mlx4_flow.h
@@ -47,7 +47,7 @@ struct rte_flow {
 
 /* mlx4_flow.c */
 
-uint64_t mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf);
+uint64_t mlx4_conv_rss_types(struct priv *priv, uint64_t rss_hf);
 int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error);
 void mlx4_flow_clean(struct priv *priv);
 int mlx4_filter_ctrl(struct rte_eth_dev *dev,
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 7a036ed83..474614e4d 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -88,7 +88,7 @@ mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = {
  */
 struct mlx4_rss *
 mlx4_rss_get(struct priv *priv, uint64_t fields,
-	     uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+	     const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 	     uint16_t queues, const uint16_t queue_id[])
 {
 	struct mlx4_rss *rss;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index dd46ac006..521267724 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -126,7 +126,7 @@ uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
 int mlx4_rss_init(struct priv *priv);
 void mlx4_rss_deinit(struct priv *priv);
 struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
-			      uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+			      const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 			      uint16_t queues, const uint16_t queue_id[]);
 void mlx4_rss_put(struct mlx4_rss *rss);
 int mlx4_rss_attach(struct mlx4_rss *rss);
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index a52dcf263..7798052f9 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -214,9 +214,8 @@ struct rte_flow {
 	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
 	uint32_t mark:1; /**< Set if the flow is marked. */
 	uint32_t drop:1; /**< Drop queue. */
-	uint16_t queues_n; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss rss_conf; /**< RSS configuration */
 	uint16_t (*queues)[]; /**< Queues indexes to use. */
-	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	struct ibv_counter_set *cs; /**< Holds the counters for the rule. */
 	struct mlx5_flow_counter_stats counter_stats;/**<The counter stats. */
@@ -406,9 +405,8 @@ struct mlx5_flow_parse {
 	uint32_t mark:1; /**< Mark is present in the flow. */
 	uint32_t count:1; /**< Count is present in the flow. */
 	uint32_t mark_id; /**< Mark identifier. */
+	struct rte_flow_action_rss rss_conf; /**< RSS configuration */
 	uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queues indexes to use. */
-	uint16_t queues_n; /**< Number of entries in queue[]. */
-	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	enum hash_rxq_type layer; /**< Last pattern layer detected. */
 	struct ibv_counter_set *cs; /**< Holds the counter set for the rule */
@@ -532,47 +530,6 @@ mlx5_flow_item_validate(const struct rte_flow_item *item,
 }
 
 /**
- * Copy the RSS configuration from the user ones, of the rss_conf is null,
- * uses the driver one.
- *
- * @param parser
- *   Internal parser structure.
- * @param rss_conf
- *   User RSS configuration to save.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_flow_convert_rss_conf(struct mlx5_flow_parse *parser,
-			   const struct rte_eth_rss_conf *rss_conf)
-{
-	/*
-	 * This function is also called at the beginning of
-	 * mlx5_flow_convert_actions() to initialize the parser with the
-	 * device default RSS configuration.
-	 */
-	if (rss_conf) {
-		if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) {
-			rte_errno = EINVAL;
-			return -rte_errno;
-		}
-		if (rss_conf->rss_key_len != 40) {
-			rte_errno = EINVAL;
-			return -rte_errno;
-		}
-		if (rss_conf->rss_key_len && rss_conf->rss_key) {
-			parser->rss_conf.rss_key_len = rss_conf->rss_key_len;
-			memcpy(parser->rss_key, rss_conf->rss_key,
-			       rss_conf->rss_key_len);
-			parser->rss_conf.rss_key = parser->rss_key;
-		}
-		parser->rss_conf.rss_hf = rss_conf->rss_hf;
-	}
-	return 0;
-}
-
-/**
  * Extract attribute to the parser.
  *
  * @param[in] attr
@@ -642,17 +599,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	enum { FATE = 1, MARK = 2, COUNT = 4, };
 	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
-	int ret;
 
-	/*
-	 * Add default RSS configuration necessary for Verbs to create QP even
-	 * if no RSS is necessary.
-	 */
-	ret = mlx5_flow_convert_rss_conf(parser,
-					 (const struct rte_eth_rss_conf *)
-					 &priv->rss_conf);
-	if (ret)
-		return ret;
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
@@ -671,25 +618,53 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			parser->queues_n = 1;
 			parser->queues[0] = queue->index;
+			parser->rss_conf = (struct rte_flow_action_rss){
+				.queue_num = 1,
+				.queue = parser->queues,
+			};
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
 			uint16_t n;
 
 			if (overlap & FATE)
 				goto exit_action_overlap;
 			overlap |= FATE;
-			if (!rss || !rss->num) {
+			if (rss->types & MLX5_RSS_HF_MASK) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "unsupported RSS type"
+						   " requested");
+				return -rte_errno;
+			}
+			if (rss->key_len) {
+				rss_key_len = rss->key_len;
+				rss_key = rss->key;
+			} else {
+				rss_key_len = rss_hash_default_key_len;
+				rss_key = rss_hash_default_key;
+			}
+			if (rss_key_len != RTE_DIM(parser->rss_key)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "RSS hash key must be"
+						   " exactly 40 bytes long");
+				return -rte_errno;
+			}
+			if (!rss->queue_num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (rss->num > RTE_DIM(parser->queues)) {
+			if (rss->queue_num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
@@ -697,7 +672,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " context");
 				return -rte_errno;
 			}
-			for (n = 0; n < rss->num; ++n) {
+			for (n = 0; n < rss->queue_num; ++n) {
 				if (rss->queue[n] >= priv->rxqs_n) {
 					rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -707,16 +682,16 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 					return -rte_errno;
 				}
 			}
-			for (n = 0; n < rss->num; ++n)
-				parser->queues[n] = rss->queue[n];
-			parser->queues_n = rss->num;
-			if (mlx5_flow_convert_rss_conf(parser, rss->rss_conf)) {
-				rte_flow_error_set(error, EINVAL,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "wrong RSS configuration");
-				return -rte_errno;
-			}
+			parser->rss_conf = (struct rte_flow_action_rss){
+				.types = rss->types,
+				.key_len = rss_key_len,
+				.queue_num = rss->queue_num,
+				.key = memcpy(parser->rss_key, rss_key,
+					      sizeof(*rss_key) * rss_key_len),
+				.queue = memcpy(parser->queues, rss->queue,
+						sizeof(*rss->queue) *
+						rss->queue_num),
+			};
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) {
 			const struct rte_flow_action_mark *mark =
 				(const struct rte_flow_action_mark *)
@@ -761,7 +736,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
-	if (!parser->queues_n && !parser->drop) {
+	if (!parser->rss_conf.queue_num && !parser->drop) {
 		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "no valid action");
 		return -rte_errno;
@@ -941,7 +916,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 	unsigned int i;
 
 	/* Remove any other flow not matching the pattern. */
-	if (parser->queues_n == 1 && !parser->rss_conf.rss_hf) {
+	if (parser->rss_conf.queue_num == 1 && !parser->rss_conf.types) {
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			if (i == HASH_RXQ_ETH)
 				continue;
@@ -969,7 +944,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 	}
 	/* Remove impossible flow according to the RSS configuration. */
 	if (hash_rxq_init[parser->layer].dpdk_rss_hf &
-	    parser->rss_conf.rss_hf) {
+	    parser->rss_conf.types) {
 		/* Remove any other flow. */
 		for (i = hmin; i != (hmax + 1); ++i) {
 			if ((i == parser->layer) ||
@@ -980,7 +955,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 		}
 	} else  if (!parser->queue[ip].ibv_attr) {
 		/* no RSS possible with the current configuration. */
-		parser->queues_n = 1;
+		parser->rss_conf.queue_num = 1;
 		return;
 	}
 fill:
@@ -1109,7 +1084,7 @@ mlx5_flow_convert(struct rte_eth_dev *dev,
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			unsigned int offset;
 
-			if (!(parser->rss_conf.rss_hf &
+			if (!(parser->rss_conf.types &
 			      hash_rxq_init[i].dpdk_rss_hf) &&
 			    (i != HASH_RXQ_ETH))
 				continue;
@@ -1777,20 +1752,20 @@ mlx5_flow_create_action_queue_rss(struct rte_eth_dev *dev,
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_get(dev,
-				      parser->rss_conf.rss_key,
-				      parser->rss_conf.rss_key_len,
+				      parser->rss_conf.key,
+				      parser->rss_conf.key_len,
 				      hash_fields,
-				      parser->queues,
-				      parser->queues_n);
+				      parser->rss_conf.queue,
+				      parser->rss_conf.queue_num);
 		if (flow->frxq[i].hrxq)
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_new(dev,
-				      parser->rss_conf.rss_key,
-				      parser->rss_conf.rss_key_len,
+				      parser->rss_conf.key,
+				      parser->rss_conf.key_len,
 				      hash_fields,
-				      parser->queues,
-				      parser->queues_n);
+				      parser->rss_conf.queue,
+				      parser->rss_conf.queue_num);
 		if (!flow->frxq[i].hrxq) {
 			return rte_flow_error_set(error, ENOMEM,
 						  RTE_FLOW_ERROR_TYPE_HANDLE,
@@ -1861,9 +1836,9 @@ mlx5_flow_create_action_queue(struct rte_eth_dev *dev,
 				   NULL, "internal error in flow creation");
 		goto error;
 	}
-	for (i = 0; i != parser->queues_n; ++i) {
+	for (i = 0; i != parser->rss_conf.queue_num; ++i) {
 		struct mlx5_rxq_data *q =
-			(*priv->rxqs)[parser->queues[i]];
+			(*priv->rxqs)[parser->rss_conf.queue[i]];
 
 		q->mark |= parser->mark;
 	}
@@ -1927,7 +1902,8 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	if (ret)
 		goto exit;
 	flow = rte_calloc(__func__, 1,
-			  sizeof(*flow) + parser.queues_n * sizeof(uint16_t),
+			  sizeof(*flow) +
+			  parser.rss_conf.queue_num * sizeof(uint16_t),
 			  0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM,
@@ -1936,15 +1912,20 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 				   "cannot allocate flow memory");
 		return NULL;
 	}
-	/* Copy queues configuration. */
+	/* Copy configuration. */
 	flow->queues = (uint16_t (*)[])(flow + 1);
-	memcpy(flow->queues, parser.queues, parser.queues_n * sizeof(uint16_t));
-	flow->queues_n = parser.queues_n;
+	flow->rss_conf = (struct rte_flow_action_rss){
+		.types = parser.rss_conf.types,
+		.key_len = parser.rss_conf.key_len,
+		.queue_num = parser.rss_conf.queue_num,
+		.key = memcpy(flow->rss_key, parser.rss_conf.key,
+			      sizeof(*parser.rss_conf.key) *
+			      parser.rss_conf.key_len),
+		.queue = memcpy(flow->queues, parser.rss_conf.queue,
+				sizeof(*parser.rss_conf.queue) *
+				parser.rss_conf.queue_num),
+	};
 	flow->mark = parser.mark;
-	/* Copy RSS configuration. */
-	flow->rss_conf = parser.rss_conf;
-	flow->rss_conf.rss_key = flow->rss_key;
-	memcpy(flow->rss_key, parser.rss_key, parser.rss_conf.rss_key_len);
 	/* finalise the flow. */
 	if (parser.drop)
 		ret = mlx5_flow_create_action_queue_drop(dev, &parser, flow,
@@ -2024,7 +2005,7 @@ mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
 
 	if (flow->drop || !flow->mark)
 		goto free;
-	for (i = 0; i != flow->queues_n; ++i) {
+	for (i = 0; i != flow->rss_conf.queue_num; ++i) {
 		struct rte_flow *tmp;
 		int mark = 0;
 
@@ -2334,19 +2315,19 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 			if (!flow->frxq[i].ibv_attr)
 				continue;
 			flow->frxq[i].hrxq =
-				mlx5_hrxq_get(dev, flow->rss_conf.rss_key,
-					      flow->rss_conf.rss_key_len,
+				mlx5_hrxq_get(dev, flow->rss_conf.key,
+					      flow->rss_conf.key_len,
 					      hash_rxq_init[i].hash_fields,
-					      (*flow->queues),
-					      flow->queues_n);
+					      flow->rss_conf.queue,
+					      flow->rss_conf.queue_num);
 			if (flow->frxq[i].hrxq)
 				goto flow_create;
 			flow->frxq[i].hrxq =
-				mlx5_hrxq_new(dev, flow->rss_conf.rss_key,
-					      flow->rss_conf.rss_key_len,
+				mlx5_hrxq_new(dev, flow->rss_conf.key,
+					      flow->rss_conf.key_len,
 					      hash_rxq_init[i].hash_fields,
-					      (*flow->queues),
-					      flow->queues_n);
+					      flow->rss_conf.queue,
+					      flow->rss_conf.queue_num);
 			if (!flow->frxq[i].hrxq) {
 				DRV_LOG(DEBUG,
 					"port %u flow %p cannot be applied",
@@ -2370,8 +2351,8 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 		}
 		if (!flow->mark)
 			continue;
-		for (i = 0; i != flow->queues_n; ++i)
-			(*priv->rxqs)[(*flow->queues)[i]]->mark = 1;
+		for (i = 0; i != flow->rss_conf.queue_num; ++i)
+			(*priv->rxqs)[flow->rss_conf.queue[i]]->mark = 1;
 	}
 	return 0;
 }
@@ -2448,8 +2429,10 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	};
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
-		.rss_conf = &priv->rss_conf,
-		.num = priv->reta_idx_n,
+		.types = priv->rss_conf.rss_hf,
+		.key_len = priv->rss_conf.rss_key_len,
+		.queue_num = priv->reta_idx_n,
+		.key = priv->rss_conf.rss_key,
 		.queue = queue,
 	};
 	struct rte_flow_action actions[] = {
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 1b4570586..1e4354ab3 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1218,8 +1218,8 @@ mlx5_rxq_verify(struct rte_eth_dev *dev)
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
  */
 struct mlx5_ind_table_ibv *
-mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, uint16_t queues[],
-		       uint16_t queues_n)
+mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, const uint16_t *queues,
+		       uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_ind_table_ibv *ind_tbl;
@@ -1286,8 +1286,8 @@ mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, uint16_t queues[],
  *   An indirection table if found.
  */
 struct mlx5_ind_table_ibv *
-mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, uint16_t queues[],
-		       uint16_t queues_n)
+mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, const uint16_t *queues,
+		       uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_ind_table_ibv *ind_tbl;
@@ -1391,8 +1391,10 @@ mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev)
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
  */
 struct mlx5_hrxq *
-mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
-	      uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+mlx5_hrxq_new(struct rte_eth_dev *dev,
+	      const uint8_t *rss_key, uint32_t rss_key_len,
+	      uint64_t hash_fields,
+	      const uint16_t *queues, uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
@@ -1419,7 +1421,7 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
 			.rx_hash_conf = (struct ibv_rx_hash_conf){
 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
 				.rx_hash_key_len = rss_key_len,
-				.rx_hash_key = rss_key,
+				.rx_hash_key = (void *)(uintptr_t)rss_key,
 				.rx_hash_fields_mask = hash_fields,
 			},
 			.rwq_ind_tbl = ind_tbl->ind_table,
@@ -1469,8 +1471,10 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
  *   An hash Rx queue on success.
  */
 struct mlx5_hrxq *
-mlx5_hrxq_get(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
-	      uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+mlx5_hrxq_get(struct rte_eth_dev *dev,
+	      const uint8_t *rss_key, uint32_t rss_key_len,
+	      uint64_t hash_fields,
+	      const uint16_t *queues, uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index f5af43735..a702cb603 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -134,7 +134,7 @@ struct mlx5_ind_table_ibv {
 	LIST_ENTRY(mlx5_ind_table_ibv) next; /* Pointer to the next element. */
 	rte_atomic32_t refcnt; /* Reference counter. */
 	struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */
-	uint16_t queues_n; /**< Number of queues in the list. */
+	uint32_t queues_n; /**< Number of queues in the list. */
 	uint16_t queues[]; /**< Queue list. */
 };
 
@@ -145,7 +145,7 @@ struct mlx5_hrxq {
 	struct mlx5_ind_table_ibv *ind_table; /* Indirection table. */
 	struct ibv_qp *qp; /* Verbs queue pair. */
 	uint64_t hash_fields; /* Verbs Hash fields. */
-	uint8_t rss_key_len; /* Hash key length in bytes. */
+	uint32_t rss_key_len; /* Hash key length in bytes. */
 	uint8_t rss_key[]; /* Hash key. */
 };
 
@@ -237,20 +237,22 @@ int mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx);
 int mlx5_rxq_verify(struct rte_eth_dev *dev);
 int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl);
 struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_new(struct rte_eth_dev *dev,
-						  uint16_t queues[],
-						  uint16_t queues_n);
+						  const uint16_t *queues,
+						  uint32_t queues_n);
 struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_get(struct rte_eth_dev *dev,
-						  uint16_t queues[],
-						  uint16_t queues_n);
+						  const uint16_t *queues,
+						  uint32_t queues_n);
 int mlx5_ind_table_ibv_release(struct rte_eth_dev *dev,
 			       struct mlx5_ind_table_ibv *ind_tbl);
 int mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev);
-struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key,
-				uint8_t rss_key_len, uint64_t hash_fields,
-				uint16_t queues[], uint16_t queues_n);
-struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev, uint8_t *rss_key,
-				uint8_t rss_key_len, uint64_t hash_fields,
-				uint16_t queues[], uint16_t queues_n);
+struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev,
+				const uint8_t *rss_key, uint32_t rss_key_len,
+				uint64_t hash_fields,
+				const uint16_t *queues, uint32_t queues_n);
+struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev,
+				const uint8_t *rss_key, uint32_t rss_key_len,
+				uint64_t hash_fields,
+				const uint16_t *queues, uint32_t queues_n);
 int mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hxrq);
 int mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev);
 uint64_t mlx5_get_rx_port_offloads(void);
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 056405515..1a2c0299c 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	struct sfc_rxq *rxq;
 	unsigned int rxq_hw_index_min;
 	unsigned int rxq_hw_index_max;
-	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
-	uint64_t rss_hf;
-	uint8_t *rss_key = NULL;
+	const uint8_t *rss_key;
 	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
 	unsigned int i;
 
-	if (rss->num == 0)
+	if (rss->queue_num == 0)
 		return -EINVAL;
 
 	rxq_sw_index = sa->rxq_count - 1;
@@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	rxq_hw_index_min = rxq->hw_index;
 	rxq_hw_index_max = 0;
 
-	for (i = 0; i < rss->num; ++i) {
+	for (i = 0; i < rss->queue_num; ++i) {
 		rxq_sw_index = rss->queue[i];
 
 		if (rxq_sw_index >= sa->rxq_count)
@@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 			rxq_hw_index_max = rxq->hw_index;
 	}
 
-	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;
-	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
+	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
-	if (rss_conf != NULL) {
-		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
+	if (rss->key_len) {
+		if (rss->key_len != sizeof(sa->rss_key))
 			return -EINVAL;
 
-		rss_key = rss_conf->rss_key;
+		rss_key = rss->key;
 	} else {
 		rss_key = sa->rss_key;
 	}
@@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 
 	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
 	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
-	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
+	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
 	rte_memcpy(sfc_rss_conf->rss_key, rss_key, sizeof(sa->rss_key));
 
 	for (i = 0; i < RTE_DIM(sfc_rss_conf->rss_tbl); ++i) {
-		unsigned int rxq_sw_index = rss->queue[i % rss->num];
+		unsigned int rxq_sw_index = rss->queue[i % rss->queue_num];
 		struct sfc_rxq *rxq = sa->rxq_info[rxq_sw_index].rxq;
 
 		sfc_rss_conf->rss_tbl[i] = rxq->hw_index - rxq_hw_index_min;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index aea3462a6..78f20913f 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1215,7 +1215,7 @@ priv_flow_process(struct pmd_internals *pmd,
 				if (err)
 					goto exit_action_not_supported;
 			}
-			if (flow && rss)
+			if (flow)
 				err = rss_add_actions(flow, pmd, rss, error);
 		} else {
 			goto exit_action_not_supported;
@@ -2050,7 +2050,7 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 			   struct rte_flow_error *error)
 {
 	/* 4096 is the maximum number of instructions for a BPF program */
-	int i;
+	unsigned int i;
 	int err;
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
@@ -2066,8 +2066,8 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	}
 
 	/* Update RSS map entry with queues */
-	rss_entry.nb_queues = rss->num;
-	for (i = 0; i < rss->num; i++)
+	rss_entry.nb_queues = rss->queue_num;
+	for (i = 0; i < rss->queue_num; i++)
 		rss_entry.queues[i] = rss->queue[i];
 	rss_entry.hash_fields =
 		(1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 8b2047adb..3ce76c413 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -202,9 +202,13 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
 						queue[j++] = i;
-				action_rss.rss_conf = &rss_conf;
-				action_rss.num = j;
-				action_rss.queue = queue;
+				action_rss = (struct rte_flow_action_rss){
+					.types = rss_conf.rss_hf,
+					.key_len = rss_conf.rss_key_len,
+					.queue_num = j,
+					.key = rss_key,
+					.queue = queue,
+				};
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 550086411..2fabc9a29 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -330,40 +330,27 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
-				.num = src.rss->num,
+				.types = src.rss->types,
+				.key_len = src.rss->key_len,
+				.queue_num = src.rss->queue_num,
 			};
 		off += sizeof(*src.rss);
-		if (src.rss->num) {
+		if (src.rss->key_len) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->queue) * src.rss->num;
+			size = sizeof(*src.rss->key) * src.rss->key_len;
 			if (dst.rss)
-				dst.rss->queue = memcpy
+				dst.rss->key = memcpy
 					((void *)((uintptr_t)dst.rss + off),
-					 src.rss->queue, size);
+					 src.rss->key, size);
 			off += size;
 		}
-		off = RTE_ALIGN_CEIL(off, sizeof(double));
-		if (dst.rss) {
-			dst.rss->rss_conf = (void *)((uintptr_t)dst.rss + off);
-			*(struct rte_eth_rss_conf *)(uintptr_t)
-				dst.rss->rss_conf = (struct rte_eth_rss_conf){
-				.rss_key_len = src.rss->rss_conf->rss_key_len,
-				.rss_hf = src.rss->rss_conf->rss_hf,
-			};
-		}
-		off += sizeof(*src.rss->rss_conf);
-		if (src.rss->rss_conf->rss_key_len) {
+		if (src.rss->queue_num) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->rss_conf->rss_key) *
-				src.rss->rss_conf->rss_key_len;
-			if (dst.rss) {
-				((struct rte_eth_rss_conf *)(uintptr_t)
-				 dst.rss->rss_conf)->rss_key =
-					(void *)((uintptr_t)dst.rss + off);
-				memcpy(dst.rss->rss_conf->rss_key,
-				       src.rss->rss_conf->rss_key,
-				       size);
-			}
+			size = sizeof(*src.rss->queue) * src.rss->queue_num;
+			if (dst.rss)
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		size = off;
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 895feb1a3..4385e7eaa 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1033,13 +1033,21 @@ struct rte_flow_query_count {
  * Similar to QUEUE, except RSS is additionally performed on packets to
  * spread them among several queues according to the provided parameters.
  *
+ * Unlike global RSS settings used by other DPDK APIs, unsetting the
+ * @p types field does not disable RSS in a flow rule. Doing so instead
+ * requests safe unspecified "best-effort" settings from the underlying PMD,
+ * which depending on the flow rule, may result in anything ranging from
+ * empty (single queue) to all-inclusive RSS.
+ *
  * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps
  * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only,
  * both can be requested simultaneously.
  */
 struct rte_flow_action_rss {
-	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in @p queue. */
+	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
+	uint32_t key_len; /**< Hash key length in bytes. */
+	uint32_t queue_num; /**< Number of entries in @p queue. */
+	const uint8_t *key; /**< Hash key. */
 	const uint16_t *queue; /**< Queue indices to use. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v3 06/16] ethdev: remove C99 flexible arrays from flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (2 preceding siblings ...)
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
@ 2018-04-10 16:36  1%     ` Adrien Mazarguil
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
                       ` (9 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

This patch replaces C99-style flexible arrays in struct rte_flow_action_rss
and struct rte_flow_item_raw with standard pointers to the same data.

They proved difficult to use in the field (e.g. no possibility of static
initialization) and unsuitable for C++ applications.

Affected PMDs and examples are updated accordingly.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c        | 117 +++++++++++++++++---------------
 app/test-pmd/config.c              |  25 ++++---
 doc/guides/prog_guide/rte_flow.rst |  18 ++---
 drivers/net/mlx4/mlx4_flow.c       |  22 +++---
 drivers/net/mlx5/mlx5_flow.c       |  20 +++---
 examples/ipsec-secgw/ipsec.c       |  17 ++---
 lib/librte_ether/rte_flow.c        |  25 ++++---
 lib/librte_ether/rte_flow.h        |   8 ++-
 8 files changed, 135 insertions(+), 117 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index be867b0ec..acf19eb8a 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -179,25 +179,22 @@ enum index {
 	ACTION_METER_ID,
 };
 
-/** Size of pattern[] field in struct rte_flow_item_raw. */
-#define ITEM_RAW_PATTERN_SIZE 36
+/** Maximum size for pattern in struct rte_flow_item_raw. */
+#define ITEM_RAW_PATTERN_SIZE 40
 
 /** Storage size for struct rte_flow_item_raw including pattern. */
 #define ITEM_RAW_SIZE \
-	(offsetof(struct rte_flow_item_raw, pattern) + ITEM_RAW_PATTERN_SIZE)
+	(sizeof(struct rte_flow_item_raw) + ITEM_RAW_PATTERN_SIZE)
 
 /** Maximum number of queue indices in struct rte_flow_action_rss. */
 #define ACTION_RSS_QUEUE_NUM 32
 
 /** Storage for struct rte_flow_action_rss including external data. */
-union action_rss_data {
+struct action_rss_data {
 	struct rte_flow_action_rss conf;
-	struct {
-		uint8_t conf_data[offsetof(struct rte_flow_action_rss, queue)];
-		uint16_t queue[ACTION_RSS_QUEUE_NUM];
-		struct rte_eth_rss_conf rss_conf;
-		uint8_t rss_key[RSS_HASH_KEY_LENGTH];
-	} s;
+	uint16_t queue[ACTION_RSS_QUEUE_NUM];
+	struct rte_eth_rss_conf rss_conf;
+	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -320,13 +317,6 @@ struct token {
 		.size = sizeof(*((s *)0)->f), \
 	})
 
-/** Static initializer for ARGS() with arbitrary size. */
-#define ARGS_ENTRY_USZ(s, f, sz) \
-	(&(const struct arg){ \
-		.offset = offsetof(s, f), \
-		.size = (sz), \
-	})
-
 /** Static initializer for ARGS() with arbitrary offset and size. */
 #define ARGS_ENTRY_ARB(o, s) \
 	(&(const struct arg){ \
@@ -1105,9 +1095,9 @@ static const struct token token_list[] = {
 			     NEXT_ENTRY(ITEM_PARAM_IS,
 					ITEM_PARAM_SPEC,
 					ITEM_PARAM_MASK)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, length),
-			     ARGS_ENTRY_USZ(struct rte_flow_item_raw,
-					    pattern,
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, pattern),
+			     ARGS_ENTRY(struct rte_flow_item_raw, length),
+			     ARGS_ENTRY_ARB(sizeof(struct rte_flow_item_raw),
 					    ITEM_RAW_PATTERN_SIZE)),
 	},
 	[ITEM_ETH] = {
@@ -1591,7 +1581,7 @@ static const struct token token_list[] = {
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
-		.priv = PRIV_ACTION(RSS, sizeof(union action_rss_data)),
+		.priv = PRIV_ACTION(RSS, sizeof(struct action_rss_data)),
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
@@ -1610,23 +1600,21 @@ static const struct token token_list[] = {
 		.name = "key",
 		.help = "RSS hash key",
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
-		.args = ARGS(ARGS_ENTRY_ARB
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
+			     ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len)),
-			     ARGS_ENTRY_ARB
-			     (((uintptr_t)((union action_rss_data *)0)->
-			       s.rss_key),
-			      RSS_HASH_KEY_LENGTH)),
+			     ARGS_ENTRY(struct action_rss_data, rss_key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len),
 			      0,
@@ -2067,7 +2055,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 {
 	struct buffer *out = buf;
 	struct rte_flow_action *action;
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 	int ret;
 
@@ -2085,29 +2073,29 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	ctx->objmask = NULL;
 	/* Set up default configuration. */
 	action_rss_data = ctx->object;
-	*action_rss_data = (union action_rss_data){
+	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->s.rss_conf,
+			.rss_conf = &action_rss_data->rss_conf,
 			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.queue = action_rss_data->queue,
 		},
+		.queue = { 0 },
+		.rss_conf = (struct rte_eth_rss_conf){
+			.rss_key = action_rss_data->rss_key,
+			.rss_key_len = sizeof(action_rss_data->rss_key),
+			.rss_hf = rss_hf,
+		},
+		.rss_key = "testpmd's default RSS hash key",
 	};
-	action_rss_data->s.rss_conf = (struct rte_eth_rss_conf){
-		.rss_key = action_rss_data->s.rss_key,
-		.rss_key_len = sizeof(action_rss_data->s.rss_key),
-		.rss_hf = rss_hf,
-	};
-	strncpy((void *)action_rss_data->s.rss_key,
-		"testpmd's default RSS hash key",
-		sizeof(action_rss_data->s.rss_key));
 	for (i = 0; i < action_rss_data->conf.num; ++i)
-		action_rss_data->conf.queue[i] = i;
+		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
 		struct rte_eth_dev_info info;
 
 		rte_eth_dev_info_get(ctx->port, &info);
-		action_rss_data->s.rss_conf.rss_key_len =
-			RTE_MIN(sizeof(action_rss_data->s.rss_key),
+		action_rss_data->rss_conf.rss_key_len =
+			RTE_MIN(sizeof(action_rss_data->rss_key),
 				info.hash_key_size);
 	}
 	action->conf = &action_rss_data->conf;
@@ -2125,7 +2113,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_TYPE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 
 	(void)token;
@@ -2135,7 +2123,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->s.rss_conf.rss_hf = 0;
+		action_rss_data->rss_conf.rss_hf = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2154,7 +2142,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->s.rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2169,7 +2157,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_QUEUE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	int ret;
 	int i;
 
@@ -2186,10 +2174,9 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (i >= ACTION_RSS_QUEUE_NUM)
 		return -1;
 	if (push_args(ctx,
-		      ARGS_ENTRY_ARB(offsetof(struct rte_flow_action_rss,
-					      queue) +
-				     i * sizeof(action_rss_data->s.queue[i]),
-				     sizeof(action_rss_data->s.queue[i]))))
+		      ARGS_ENTRY_ARB(offsetof(struct action_rss_data, queue) +
+				     i * sizeof(action_rss_data->queue[i]),
+				     sizeof(action_rss_data->queue[i]))))
 		return -1;
 	ret = parse_int(ctx, token, str, len, NULL, 0);
 	if (ret < 0) {
@@ -2206,6 +2193,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 		return len;
 	action_rss_data = ctx->object;
 	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
 
@@ -2483,8 +2471,8 @@ parse_int(struct context *ctx, const struct token *token,
 /**
  * Parse a string.
  *
- * Two arguments (ctx->args) are retrieved from the stack to store data and
- * its length (in that order).
+ * Three arguments (ctx->args) are retrieved from the stack to store data,
+ * its actual length and address (in that order).
  */
 static int
 parse_string(struct context *ctx, const struct token *token,
@@ -2493,6 +2481,7 @@ parse_string(struct context *ctx, const struct token *token,
 {
 	const struct arg *arg_data = pop_args(ctx);
 	const struct arg *arg_len = pop_args(ctx);
+	const struct arg *arg_addr = pop_args(ctx);
 	char tmp[16]; /* Ought to be enough. */
 	int ret;
 
@@ -2503,6 +2492,11 @@ parse_string(struct context *ctx, const struct token *token,
 		push_args(ctx, arg_data);
 		return -1;
 	}
+	if (!arg_addr) {
+		push_args(ctx, arg_len);
+		push_args(ctx, arg_data);
+		return -1;
+	}
 	size = arg_data->size;
 	/* Bit-mask fill is not supported. */
 	if (arg_data->mask || size < len)
@@ -2525,8 +2519,23 @@ parse_string(struct context *ctx, const struct token *token,
 	memset((uint8_t *)buf + len, 0x00, size - len);
 	if (ctx->objmask)
 		memset((uint8_t *)ctx->objmask + arg_data->offset, 0xff, len);
+	/* Save address if requested. */
+	if (arg_addr->size) {
+		memcpy((uint8_t *)ctx->object + arg_addr->offset,
+		       (void *[]){
+			(uint8_t *)ctx->object + arg_data->offset
+		       },
+		       arg_addr->size);
+		if (ctx->objmask)
+			memcpy((uint8_t *)ctx->objmask + arg_addr->offset,
+			       (void *[]){
+				(uint8_t *)ctx->objmask + arg_data->offset
+			       },
+			       arg_addr->size);
+	}
 	return len;
 error:
+	push_args(ctx, arg_addr);
 	push_args(ctx, arg_len);
 	push_args(ctx, arg_data);
 	return -1;
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 8d42ea9a9..052163357 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -961,7 +961,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -1010,14 +1010,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = flow_item[item->type].size;
@@ -1049,7 +1055,7 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
@@ -1080,11 +1086,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 80360d068..acbeaacbd 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1309,15 +1309,15 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+------------------------------+
-   | Field        | Value                        |
-   +==============+==============================+
-   | ``rss_conf`` | RSS parameters               |
-   +--------------+------------------------------+
-   | ``num``      | number of entries in queue[] |
-   +--------------+------------------------------+
-   | ``queue[]``  | queue indices to use         |
-   +--------------+------------------------------+
+   +--------------+--------------------------------+
+   | Field        | Value                          |
+   +==============+================================+
+   | ``rss_conf`` | RSS parameters                 |
+   +--------------+--------------------------------+
+   | ``num``      | number of entries in ``queue`` |
+   +--------------+--------------------------------+
+   | ``queue``    | queue indices to use           |
+   +--------------+--------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 582483076..5a1b7dedd 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -1282,14 +1282,16 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	 */
 	uint32_t queues =
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
-	alignas(struct rte_flow_action_rss) uint8_t rss_conf_data
-		[offsetof(struct rte_flow_action_rss, queue) +
-		 sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues];
-	struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data;
+	uint16_t queue[queues];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = NULL, /* Rely on default fallback settings. */
+		.num = queues,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
-			.conf = rss_conf,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -1311,12 +1313,8 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	if (!queues)
 		goto error;
 	/* Prepare default RSS configuration. */
-	*rss_conf = (struct rte_flow_action_rss){
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
-	};
 	for (i = 0; i != queues; ++i)
-		rss_conf->queue[i] = i;
+		queue[i] = i;
 	/*
 	 * Set up VLAN item if filtering is enabled and at least one VLAN
 	 * filter is configured.
@@ -1375,7 +1373,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 			if (j != sizeof(mac->addr_bytes))
 				continue;
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				continue;
 			break;
@@ -1415,7 +1413,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		if (flow && flow->internal) {
 			assert(flow->rss);
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				flow = NULL;
 		}
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 84d6f9b92..a52dcf263 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -2446,9 +2446,16 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 			.type = RTE_FLOW_ITEM_TYPE_END,
 		},
 	};
+	uint16_t queue[priv->reta_idx_n];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = &priv->rss_conf,
+		.num = priv->reta_idx_n,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -2457,24 +2464,13 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	struct rte_flow *flow;
 	struct rte_flow_error error;
 	unsigned int i;
-	union {
-		struct rte_flow_action_rss rss;
-		struct {
-			const struct rte_eth_rss_conf *rss_conf;
-			uint16_t num;
-			uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-		} local;
-	} action_rss;
 
 	if (!priv->reta_idx_n) {
 		rte_errno = EINVAL;
 		return -rte_errno;
 	}
 	for (i = 0; i != priv->reta_idx_n; ++i)
-		action_rss.local.queue[i] = (*priv->reta_idx)[i];
-	action_rss.local.rss_conf = &priv->rss_conf;
-	action_rss.local.num = priv->reta_idx_n;
-	actions[0].conf = (const void *)&action_rss.rss;
+		queue[i] = (*priv->reta_idx)[i];
 	flow = mlx5_flow_list_create(dev, &priv->ctrl_flows, &attr, items,
 				     actions, &error);
 	if (!flow)
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 5fb5bc16e..8b2047adb 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -186,14 +186,8 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 					.rss_key_len = 40,
 				};
 				struct rte_eth_dev *eth_dev;
-				union {
-					struct rte_flow_action_rss rss;
-					struct {
-					const struct rte_eth_rss_conf *rss_conf;
-					uint16_t num;
-					uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-					} local;
-				} action_rss;
+				uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
+				struct rte_flow_action_rss action_rss;
 				unsigned int i;
 				unsigned int j;
 
@@ -207,9 +201,10 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				for (i = 0, j = 0;
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
-						action_rss.local.queue[j++] = i;
-				action_rss.local.num = j;
-				action_rss.local.rss_conf = &rss_conf;
+						queue[j++] = i;
+				action_rss.rss_conf = &rss_conf;
+				action_rss.num = j;
+				action_rss.queue = queue;
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index db04c4f94..550086411 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,7 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -73,7 +73,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 };
@@ -282,14 +282,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = rte_flow_desc_item[item->type].size;
@@ -326,11 +332,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index af9b14a4d..895feb1a3 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -14,6 +14,7 @@
  * associated actions in hardware through flow rules.
  */
 
+#include <stddef.h>
 #include <stdint.h>
 
 #include <rte_arp.h>
@@ -432,7 +433,7 @@ struct rte_flow_item_raw {
 	int32_t offset; /**< Absolute or relative offset for pattern. */
 	uint16_t limit; /**< Search area limit for start of pattern. */
 	uint16_t length; /**< Pattern length. */
-	uint8_t pattern[]; /**< Byte string to look for. */
+	const uint8_t *pattern; /**< Byte string to look for. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_RAW. */
@@ -444,6 +445,7 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
 	.offset = 0xffffffff,
 	.limit = 0xffff,
 	.length = 0xffff,
+	.pattern = NULL,
 };
 #endif
 
@@ -1037,8 +1039,8 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
-	uint16_t queue[]; /**< Queues indices to use. */
+	uint16_t num; /**< Number of entries in @p queue. */
+	const uint16_t *queue; /**< Queue indices to use. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v3 05/16] ethdev: alter behavior of flow API actions
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 01/16] ethdev: add error types to flow API Adrien Mazarguil
  2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 04/16] ethdev: remove DUP action from " Adrien Mazarguil
@ 2018-04-10 16:36  1%     ` Adrien Mazarguil
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
                       ` (10 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Wenzhuo Lu, John Daley, Gaetan Rivet, Beilei Xing,
	Konstantin Ananyev, Nelio Laranjeiro, Andrew Rybchenko,
	Pascal Mazon

This patch makes the following changes to flow rule actions:

- List order now matters, they are redefined as performed first to last
  instead of "all simultaneously".

- Repeated actions are now supported (e.g. specifying QUEUE multiple times
  now duplicates traffic among them). Previously only the last action of
  any given kind was taken into account.

- No more distinction between terminating/non-terminating/meta actions.
  Flow rules themselves are now defined as always terminating unless a
  PASSTHRU action is specified.

These changes alter the behavior of flow rules in corner cases in order to
prepare the flow API for actions that modify traffic contents or properties
(e.g. encapsulation, compression) and for which order matter when combined.

Previously one would have to do so through multiple flow rules by combining
PASSTRHU with priority levels, however this proved overly complex to
implement at the PMD level, hence this simpler approach.

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

PMDs with rte_flow support are modified accordingly:

- bnxt: no change, implementation already forbids multiple actions and does
  not support PASSTHRU.

- e1000: no change, same as bnxt.

- enic: modified to forbid redundant actions, no support for default drop.

- failsafe: no change needed.

- i40e: no change, implementation already forbids multiple actions.

- ixgbe: same as i40e.

- mlx4: modified to forbid multiple fate-deciding actions and drop when
  unspecified.

- mlx5: same as mlx4, with other redundant actions also forbidden.

- sfc: same as mlx4.

- tap: implementation already complies with the new behavior except for
  the default pass-through modified as a default drop.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@oktetlabs.ru>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: John Daley <johndale@cisco.com>
Cc: Gaetan Rivet <gaetan.rivet@6wind.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 doc/guides/prog_guide/rte_flow.rst | 67 +++++++++++++-------------------
 drivers/net/enic/enic_flow.c       | 25 ++++++++++++
 drivers/net/mlx4/mlx4_flow.c       | 21 +++++++---
 drivers/net/mlx5/mlx5_flow.c       | 69 ++++++++++++++-------------------
 drivers/net/sfc/sfc_flow.c         | 22 +++++++----
 drivers/net/tap/tap_flow.c         | 11 ++++++
 lib/librte_ether/rte_flow.h        | 54 +++++++-------------------
 7 files changed, 138 insertions(+), 131 deletions(-)

diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a237e4fd2..80360d068 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -995,28 +995,27 @@ Actions
 
 Each possible action is represented by a type. Some have associated
 configuration structures. Several actions combined in a list can be assigned
-to a flow rule. That list is not ordered.
+to a flow rule and are performed in order.
 
 They fall in three categories:
 
-- Terminating actions that prevent processing matched packets by subsequent
-  flow rules, unless overridden with PASSTHRU.
+- Actions that modify the fate of matching traffic, for instance by dropping
+  or assigning it a specific destination.
 
-- Non-terminating actions that leave matched packets up for additional
-  processing by subsequent flow rules.
+- Actions that modify matching traffic contents or its properties. This
+  includes adding/removing encapsulation, encryption, compression and marks.
 
-- Other non-terminating meta actions that do not affect the fate of packets.
+- Actions related to the flow rule itself, such as updating counters or
+  making it non-terminating.
 
-When several actions are combined in a flow rule, they should all have
-different types (e.g. dropping a packet twice is not possible).
+Flow rules being terminating by default, not specifying any action of the
+fate kind results in undefined behavior. This applies to both ingress and
+egress.
 
-Only the last action of a given type is taken into account. PMDs still
-perform error checking on the entire list.
+PASSTHRU, when supported, makes a flow rule non-terminating.
 
 Like matching patterns, action lists are terminated by END items.
 
-*Note that PASSTHRU is the only action able to override a terminating rule.*
-
 Example of action that redirects packets to queue index 10:
 
 .. _table_rte_flow_action_example:
@@ -1029,12 +1028,11 @@ Example of action that redirects packets to queue index 10:
    | ``index`` | 10    |
    +-----------+-------+
 
-Action lists examples, their order is not significant, applications must
-consider all actions to be performed simultaneously:
+Actions are performed in list order:
 
-.. _table_rte_flow_count_and_drop:
+.. _table_rte_flow_count_then_drop:
 
-.. table:: Count and drop
+.. table:: Count then drop
 
    +-------+--------+
    | Index | Action |
@@ -1050,7 +1048,7 @@ consider all actions to be performed simultaneously:
 
 .. _table_rte_flow_mark_count_redirect:
 
-.. table:: Mark, count and redirect
+.. table:: Mark, count then redirect
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1080,12 +1078,15 @@ consider all actions to be performed simultaneously:
    | 2     | END                        |
    +-------+----------------------------+
 
-In the above example, considering both actions are performed simultaneously,
-the end result is that only QUEUE has any effect.
+In the above example, while DROP and QUEUE must be performed in order, both
+have to happen before reaching END. Only QUEUE has a visible effect.
+
+Note that such a list may be thought as ambiguous and rejected on that
+basis.
 
-.. _table_rte_flow_redirect_queue_3:
+.. _table_rte_flow_redirect_queue_5_3:
 
-.. table:: Redirect to queue 3
+.. table:: Redirect to queues 5 and 3
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1099,9 +1100,9 @@ the end result is that only QUEUE has any effect.
    | 3     | END                        |
    +-------+----------------------------+
 
-As previously described, only the last action of a given type found in the
-list is taken into account. The above example also shows that VOID is
-ignored.
+As previously described, all actions must be taken into account. This
+effectively duplicates traffic to both queues. The above example also shows
+that VOID is ignored.
 
 Action types
 ~~~~~~~~~~~~
@@ -1151,9 +1152,8 @@ PMDs.
 Action: ``PASSTHRU``
 ^^^^^^^^^^^^^^^^^^^^
 
-Leaves packets up for additional processing by subsequent flow rules. This
-is the default when a rule does not contain a terminating action, but can be
-specified to force a rule to become non-terminating.
+Leaves traffic up for additional processing by subsequent flow rules; makes
+a flow rule non-terminating.
 
 - No configurable properties.
 
@@ -1227,8 +1227,6 @@ Action: ``QUEUE``
 
 Assigns packets to a given queue index.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_queue:
 
 .. table:: QUEUE
@@ -1245,8 +1243,6 @@ Action: ``DROP``
 Drop packets.
 
 - No configurable properties.
-- Terminating by default.
-- PASSTHRU overrides this action if both are specified.
 
 .. _table_rte_flow_action_drop:
 
@@ -1309,8 +1305,6 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1331,7 +1325,6 @@ Action: ``PF``
 Redirects packets to the physical function (PF) of the current device.
 
 - No configurable properties.
-- Terminating by default.
 
 .. _table_rte_flow_action_pf:
 
@@ -1353,8 +1346,6 @@ ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1378,8 +1369,6 @@ action parameter. More than one flow can use the same MTR object through
 the meter action. The MTR object can be further updated or queried using
 the rte_mtr* API.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_meter:
 
 .. table:: METER
@@ -1415,8 +1404,6 @@ direction.
 
 Multiple flows can be configured to use the same security session.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_security:
 
 .. table:: SECURITY
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index b9f36587c..a5c6a1670 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -3,6 +3,7 @@
  */
 
 #include <errno.h>
+#include <stdint.h>
 #include <rte_log.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow_driver.h>
@@ -964,6 +965,9 @@ static int
 enic_copy_action_v1(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -975,6 +979,10 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			break;
@@ -984,6 +992,8 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!overlap & FATE)
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_RQ_STEERING;
 	return 0;
 }
@@ -1001,6 +1011,9 @@ static int
 enic_copy_action_v2(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, MARK = 2, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -1009,6 +1022,10 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			enic_action->flags |= FILTER_ACTION_RQ_STEERING_FLAG;
@@ -1019,6 +1036,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			/* ENIC_MAGIC_FILTER_ID is reserved and is the highest
 			 * in the range of allows mark ids.
 			 */
@@ -1029,6 +1049,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 		case RTE_FLOW_ACTION_TYPE_FLAG: {
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			enic_action->filter_id = ENIC_MAGIC_FILTER_ID;
 			enic_action->flags |= FILTER_ACTION_FILTER_ID_FLAG;
 			break;
@@ -1044,6 +1067,8 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!overlap & FATE)
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_V2;
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 4d26df326..582483076 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -637,6 +637,7 @@ mlx4_flow_prepare(struct priv *priv,
 	struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
 	struct rte_flow *flow = &temp;
 	const char *msg = NULL;
+	int overlap;
 
 	if (attr->group)
 		return rte_flow_error_set
@@ -656,6 +657,7 @@ mlx4_flow_prepare(struct priv *priv,
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
 			 NULL, "only ingress is supported");
 fill:
+	overlap = 0;
 	proc = mlx4_flow_proc_item_list;
 	/* Go over pattern. */
 	for (item = pattern; item->type; ++item) {
@@ -702,6 +704,16 @@ mlx4_flow_prepare(struct priv *priv,
 	}
 	/* Go over actions list. */
 	for (action = actions; action->type; ++action) {
+		/* This one may appear anywhere multiple times. */
+		if (action->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (overlap) {
+			msg = "cannot combine several fate-deciding actions,"
+				" choose between DROP, QUEUE or RSS";
+			goto exit_action_not_supported;
+		}
+		overlap = 1;
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
@@ -709,8 +721,6 @@ mlx4_flow_prepare(struct priv *priv,
 			uint64_t fields;
 			unsigned int i;
 
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			continue;
 		case RTE_FLOW_ACTION_TYPE_DROP:
 			flow->drop = 1;
 			break;
@@ -801,10 +811,9 @@ mlx4_flow_prepare(struct priv *priv,
 			goto exit_action_not_supported;
 		}
 	}
-	if (!flow->rss && !flow->drop)
-		return rte_flow_error_set
-			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-			 NULL, "no valid action");
+	/* When fate is unknown, drop traffic. */
+	if (!overlap)
+		flow->drop = 1;
 	/* Validation ends here. */
 	if (!addr) {
 		if (flow->rss)
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index f051fbef5..84d6f9b92 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -4,6 +4,7 @@
  */
 
 #include <sys/queue.h>
+#include <stdint.h>
 #include <string.h>
 
 /* Verbs header. */
@@ -638,6 +639,8 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			  struct rte_flow_error *error,
 			  struct mlx5_flow_parse *parser)
 {
+	enum { FATE = 1, MARK = 2, COUNT = 4, };
+	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
 	int ret;
 
@@ -654,39 +657,31 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			parser->drop = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
-			uint16_t n;
-			uint16_t found = 0;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			for (n = 0; n < parser->queues_n; ++n) {
-				if (parser->queues[n] == queue->index) {
-					found = 1;
-					break;
-				}
-			}
-			if (parser->queues_n > 1 && !found) {
-				rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions,
-					   "queue action not in RSS queues");
-				return -rte_errno;
-			}
-			if (!found) {
-				parser->queues_n = 1;
-				parser->queues[0] = queue->index;
-			}
+			parser->queues_n = 1;
+			parser->queues[0] = queue->index;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
 			uint16_t n;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!rss || !rss->num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,26 +689,6 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (parser->queues_n == 1) {
-				uint16_t found = 0;
-
-				assert(parser->queues_n);
-				for (n = 0; n < rss->num; ++n) {
-					if (parser->queues[0] ==
-					    rss->queue[n]) {
-						found = 1;
-						break;
-					}
-				}
-				if (!found) {
-					rte_flow_error_set(error, ENOTSUP,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "queue action not in RSS"
-						   " queues");
-					return -rte_errno;
-				}
-			}
 			if (rss->num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -747,6 +722,9 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			if (!mark) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -764,14 +742,23 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			parser->mark = 1;
 			parser->mark_id = mark->id;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_FLAG) {
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			parser->mark = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT &&
 			   priv->config.flow_counter_en) {
+			if (overlap & COUNT)
+				goto exit_action_overlap;
+			overlap |= COUNT;
 			parser->count = 1;
 		} else {
 			goto exit_action_not_supported;
 		}
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!overlap & FATE)
+		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
 	if (!parser->queues_n && !parser->drop) {
@@ -784,6 +771,10 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
 			   actions, "action not supported");
 	return -rte_errno;
+exit_action_overlap:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "overlapping actions are not supported");
+	return -rte_errno;
 }
 
 /**
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index fe4c0b0c5..056405515 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1467,10 +1467,19 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 	}
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		/* This one may appear anywhere multiple times. */
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (is_specified) {
+			rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				 actions,
+				 "Cannot combine several fate-deciding actions,"
+				 "choose between QUEUE, RSS or DROP");
+			return -rte_errno;
+		}
 		switch (actions->type) {
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			break;
-
 		case RTE_FLOW_ACTION_TYPE_QUEUE:
 			rc = sfc_flow_parse_queue(sa, actions->conf, flow);
 			if (rc != 0) {
@@ -1512,11 +1521,10 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 		}
 	}
 
+	/* When fate is unknown, drop traffic. */
 	if (!is_specified) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ACTION_NUM, actions,
-				   "Action is unspecified");
-		return -rte_errno;
+		flow->spec.template.efs_dmaq_id =
+			EFX_FILTER_SPEC_RX_DMAQ_ID_DROP;
 	}
 
 	return 0;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 551b2d83d..aea3462a6 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1140,6 +1140,7 @@ priv_flow_process(struct pmd_internals *pmd,
 		else
 			goto end;
 	}
+actions:
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		int err = 0;
 
@@ -1222,6 +1223,16 @@ priv_flow_process(struct pmd_internals *pmd,
 		if (err)
 			goto exit_action_not_supported;
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!action) {
+		static const struct rte_flow_action drop[] = {
+			{ .type = RTE_FLOW_ACTION_TYPE_DROP, },
+			{ .type = RTE_FLOW_ACTION_TYPE_END, },
+		};
+
+		actions = drop;
+		goto actions;
+	}
 end:
 	if (flow)
 		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index aab637a2c..af9b14a4d 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -859,32 +859,28 @@ struct rte_flow_item {
  *
  * Each possible action is represented by a type. Some have associated
  * configuration structures. Several actions combined in a list can be
- * affected to a flow rule. That list is not ordered.
+ * assigned to a flow rule and are performed in order.
  *
  * They fall in three categories:
  *
- * - Terminating actions that prevent processing matched packets by
- *   subsequent flow rules, unless overridden with PASSTHRU.
+ * - Actions that modify the fate of matching traffic, for instance by
+ *   dropping or assigning it a specific destination.
  *
- * - Non terminating actions that leave matched packets up for additional
- *   processing by subsequent flow rules.
+ * - Actions that modify matching traffic contents or its properties. This
+ *   includes adding/removing encapsulation, encryption, compression and
+ *   marks.
  *
- * - Other non terminating meta actions that do not affect the fate of
- *   packets.
+ * - Actions related to the flow rule itself, such as updating counters or
+ *   making it non-terminating.
  *
- * When several actions are combined in a flow rule, they should all have
- * different types (e.g. dropping a packet twice is not possible).
+ * Flow rules being terminating by default, not specifying any action of the
+ * fate kind results in undefined behavior. This applies to both ingress and
+ * egress.
  *
- * Only the last action of a given type is taken into account. PMDs still
- * perform error checking on the entire list.
- *
- * Note that PASSTHRU is the only action able to override a terminating
- * rule.
+ * PASSTHRU, when supported, makes a flow rule non-terminating.
  */
 enum rte_flow_action_type {
 	/**
-	 * [META]
-	 *
 	 * End marker for action lists. Prevents further processing of
 	 * actions, thereby ending the list.
 	 *
@@ -893,8 +889,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_END,
 
 	/**
-	 * [META]
-	 *
 	 * Used as a placeholder for convenience. It is ignored and simply
 	 * discarded by PMDs.
 	 *
@@ -903,18 +897,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VOID,
 
 	/**
-	 * Leaves packets up for additional processing by subsequent flow
-	 * rules. This is the default when a rule does not contain a
-	 * terminating action, but can be specified to force a rule to
-	 * become non-terminating.
+	 * Leaves traffic up for additional processing by subsequent flow
+	 * rules; makes a flow rule non-terminating.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PASSTHRU,
 
 	/**
-	 * [META]
-	 *
 	 * Attaches an integer value to packets and sets PKT_RX_FDIR and
 	 * PKT_RX_FDIR_ID mbuf flags.
 	 *
@@ -923,8 +913,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_MARK,
 
 	/**
-	 * [META]
-	 *
 	 * Flags packets. Similar to MARK without a specific value; only
 	 * sets the PKT_RX_FDIR mbuf flag.
 	 *
@@ -949,9 +937,7 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_DROP,
 
 	/**
-	 * [META]
-	 *
-	 * Enables counters for this rule.
+	 * Enables counters for this flow rule.
 	 *
 	 * These counters can be retrieved and reset through rte_flow_query(),
 	 * see struct rte_flow_query_count.
@@ -1020,8 +1006,6 @@ struct rte_flow_action_mark {
  * RTE_FLOW_ACTION_TYPE_QUEUE
  *
  * Assign packets to a given queue index.
- *
- * Terminating by default.
  */
 struct rte_flow_action_queue {
 	uint16_t index; /**< Queue index to use. */
@@ -1050,8 +1034,6 @@ struct rte_flow_query_count {
  * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps
  * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only,
  * both can be requested simultaneously.
- *
- * Terminating by default.
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
@@ -1069,8 +1051,6 @@ struct rte_flow_action_rss {
  * and is not guaranteed to work properly if the VF part is matched by a
  * prior flow rule or if packets are not addressed to a VF in the first
  * place.
- *
- * Terminating by default.
  */
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
@@ -1085,8 +1065,6 @@ struct rte_flow_action_vf {
  *
  * Packets matched by items of this type can be either dropped or passed to the
  * next item with their color set by the MTR object.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_meter {
 	uint32_t mtr_id; /**< MTR object ID created with rte_mtr_create(). */
@@ -1116,8 +1094,6 @@ struct rte_flow_action_meter {
  * direction.
  *
  * Multiple flows can be configured to use the same security session.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_security {
 	void *security_session; /**< Pointer to security session structure. */
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v3 04/16] ethdev: remove DUP action from flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 01/16] ethdev: add error types to flow API Adrien Mazarguil
@ 2018-04-10 16:36  2%     ` Adrien Mazarguil
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
                       ` (11 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

Upcoming changes in relation to the handling of actions list will make the
DUP action redundant as specifying several QUEUE actions will achieve the
same behavior. Besides, no PMD implements this action.

By removing an entry from enum rte_flow_action_type, this patch breaks ABI
compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/cmdline_flow.c                 | 23 -----------------------
 app/test-pmd/config.c                       |  1 -
 doc/guides/prog_guide/rte_flow.rst          | 23 -----------------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  8 --------
 lib/librte_ether/rte_ethdev_version.map     |  2 +-
 lib/librte_ether/rte_flow.c                 |  1 -
 lib/librte_ether/rte_flow.h                 | 24 ------------------------
 7 files changed, 1 insertion(+), 81 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index ac4b51a8a..be867b0ec 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -164,8 +164,6 @@ enum index {
 	ACTION_QUEUE_INDEX,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
-	ACTION_DUP_INDEX,
 	ACTION_RSS,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
@@ -625,7 +623,6 @@ static const enum index next_action[] = {
 	ACTION_QUEUE,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
@@ -645,12 +642,6 @@ static const enum index action_queue[] = {
 	ZERO,
 };
 
-static const enum index action_dup[] = {
-	ACTION_DUP_INDEX,
-	ACTION_NEXT,
-	ZERO,
-};
-
 static const enum index action_rss[] = {
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
@@ -1597,20 +1588,6 @@ static const struct token token_list[] = {
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
-	[ACTION_DUP] = {
-		.name = "dup",
-		.help = "duplicate packets to a given queue index",
-		.priv = PRIV_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
-		.next = NEXT(action_dup),
-		.call = parse_vc,
-	},
-	[ACTION_DUP_INDEX] = {
-		.name = "index",
-		.help = "queue index to duplicate packets to",
-		.next = NEXT(action_dup, NEXT_ENTRY(UNSIGNED)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_dup, index)),
-		.call = parse_vc_conf,
-	},
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 7ae0295f6..8d42ea9a9 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1049,7 +1049,6 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 51826d04c..a237e4fd2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1299,26 +1299,6 @@ Query structure to retrieve and reset flow rule counters:
    | ``bytes``     | out | number of bytes through this rule |
    +---------------+-----+-----------------------------------+
 
-Action: ``DUP``
-^^^^^^^^^^^^^^^
-
-Duplicates packets to a given queue index.
-
-This is normally combined with QUEUE, however when used alone, it is
-actually similar to QUEUE + PASSTHRU.
-
-- Non-terminating by default.
-
-.. _table_rte_flow_action_dup:
-
-.. table:: DUP
-
-   +-----------+------------------------------------+
-   | Field     | Value                              |
-   +===========+====================================+
-   | ``index`` | queue index to duplicate packet to |
-   +-----------+------------------------------------+
-
 Action: ``RSS``
 ^^^^^^^^^^^^^^^
 
@@ -2010,9 +1990,6 @@ Unsupported actions
   and tagging (`Action: MARK`_ or `Action: FLAG`_) may be implemented in
   software as long as the target queue is used by a single rule.
 
-- A rule specifying both `Action: DUP`_ + `Action: QUEUE`_ may be translated
-  to two hidden rules combining `Action: QUEUE`_ and `Action: PASSTHRU`_.
-
 - When a single target queue is provided, `Action: RSS`_ can also be
   implemented through `Action: QUEUE`_.
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index cb6f201e1..a015d02a4 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3363,10 +3363,6 @@ actions can sometimes be combined when the end result is unambiguous::
 
 ::
 
-   drop / dup index 6 / end # same as above
-
-::
-
    queue index 6 / rss queues 6 7 8 / end # queue has no effect
 
 ::
@@ -3400,10 +3396,6 @@ This section lists supported actions and their attributes, if any.
 
 - ``count``: enable counters for this rule.
 
-- ``dup``: duplicate packets to a given queue index.
-
-  - ``index {unsigned}``: queue index to duplicate packets to.
-
 - ``rss``: spread packets among several queues.
 
   - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index e915e7929..8f1ae5ed2 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -147,7 +147,6 @@ DPDK_17.08 {
 
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
-	rte_flow_copy;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -199,6 +198,7 @@ DPDK_18.02 {
 DPDK_18.05 {
 	global:
 
+	rte_flow_copy;
 	rte_flow_create;
 	rte_flow_destroy;
 	rte_flow_error_set;
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index ba6feddee..db04c4f94 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -73,7 +73,6 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 36fd38ffa..aab637a2c 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -961,16 +961,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_COUNT,
 
 	/**
-	 * Duplicates packets to a given queue index.
-	 *
-	 * This is normally combined with QUEUE, however when used alone, it
-	 * is actually similar to QUEUE + PASSTHRU.
-	 *
-	 * See struct rte_flow_action_dup.
-	 */
-	RTE_FLOW_ACTION_TYPE_DUP,
-
-	/**
 	 * Similar to QUEUE, except RSS is additionally performed on packets
 	 * to spread them among several queues according to the provided
 	 * parameters.
@@ -1052,20 +1042,6 @@ struct rte_flow_query_count {
 };
 
 /**
- * RTE_FLOW_ACTION_TYPE_DUP
- *
- * Duplicates packets to a given queue index.
- *
- * This is normally combined with QUEUE, however when used alone, it is
- * actually similar to QUEUE + PASSTHRU.
- *
- * Non-terminating by default.
- */
-struct rte_flow_action_dup {
-	uint16_t index; /**< Queue index to duplicate packets to. */
-};
-
-/**
  * RTE_FLOW_ACTION_TYPE_RSS
  *
  * Similar to QUEUE, except RSS is additionally performed on packets to
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 01/16] ethdev: add error types to flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
@ 2018-04-10 16:36  3%     ` Adrien Mazarguil
  2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 04/16] ethdev: remove DUP action from " Adrien Mazarguil
                       ` (12 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

These enable more precise reporting of objects responsible for errors.

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_destroy()
- rte_flow_error_set()
- rte_flow_flush()
- rte_flow_isolate()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/config.c                   |  4 ++++
 lib/librte_ether/rte_ethdev_version.map | 20 +++++++++++++-------
 lib/librte_ether/rte_flow.h             |  4 ++++
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 2058e6ec8..7ae0295f6 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1228,8 +1228,12 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
+		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
+		[RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
+		[RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
 		[RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
 		[RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
+		[RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
 		[RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
 	};
 	const char *errstr;
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index 34df6c8b5..e915e7929 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -127,11 +127,6 @@ DPDK_17.02 {
 
 	_rte_eth_dev_reset;
 	rte_eth_dev_fw_version_get;
-	rte_flow_create;
-	rte_flow_destroy;
-	rte_flow_flush;
-	rte_flow_query;
-	rte_flow_validate;
 
 } DPDK_16.07;
 
@@ -153,7 +148,6 @@ DPDK_17.08 {
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
 	rte_flow_copy;
-	rte_flow_isolate;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -192,7 +186,6 @@ DPDK_17.11 {
 	rte_eth_dev_get_sec_ctx;
 	rte_eth_dev_pool_ops_supported;
 	rte_eth_dev_reset;
-	rte_flow_error_set;
 
 } DPDK_17.08;
 
@@ -203,6 +196,19 @@ DPDK_18.02 {
 
 } DPDK_17.11;
 
+DPDK_18.05 {
+	global:
+
+	rte_flow_create;
+	rte_flow_destroy;
+	rte_flow_error_set;
+	rte_flow_flush;
+	rte_flow_isolate;
+	rte_flow_query;
+	rte_flow_validate;
+
+} DPDK_18.02;
+
 EXPERIMENTAL {
 	global:
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index cdaaa3a5b..95799fd9c 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1186,8 +1186,12 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
+	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
+	RTE_FLOW_ERROR_TYPE_ITEM_LAST, /**< Item specification range. */
+	RTE_FLOW_ERROR_TYPE_ITEM_MASK, /**< Item specification mask. */
 	RTE_FLOW_ERROR_TYPE_ITEM, /**< Specific pattern item. */
 	RTE_FLOW_ERROR_TYPE_ACTION_NUM, /**< Number of actions. */
+	RTE_FLOW_ERROR_TYPE_ACTION_CONF, /**< Action configuration. */
 	RTE_FLOW_ERROR_TYPE_ACTION, /**< Specific action. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads
                       ` (7 preceding siblings ...)
  @ 2018-04-10 16:36  4%   ` Adrien Mazarguil
  2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 01/16] ethdev: add error types to flow API Adrien Mazarguil
                       ` (13 more replies)
  8 siblings, 14 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

As summarized in a prior RFC [1], the flow API (rte_flow) was chosen as a
means to manage switch offloads supported by many devices (usually going by
names such as E-Switch or vSwitch) through user-specified flow rules.

Combined with the need to support encap/decap actions, this requires a
change in the way flow actions are processed (in order and possibly
repeated) which modifies the behavior of some of the existing actions, thus
warranting a major ABI breakage.

Given this ABI breakage is also required by other work submitted for the
current release [2][3], this series addresses various longstanding issues
with the flow API and makes minor improvements in preparation for upcoming
features.

Changes summary:

- Additional error types.
- Clearer documentation.
- Improved C++ compatibility.
- Exhaustive RSS action.
- Consistent behavior of VLAN pattern item.
- New "transfer" attribute bringing consistency to VF/PF pattern items.
- Confusing "PORT" pattern item renamed "PHY_PORT", with new action
  counterpart.
- New "PORT_ID" pattern item and action to be used with port representors.

This series piggybacks on the major ABI update introduced by a prior
commit [4] for DPDK 18.05 and depends on several fixes [5] which must be
applied first.

[1] "[RFC] Switch device offload with DPDK"
    http://dpdk.org/ml/archives/dev/2018-March/092513.html

[2] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

[3] "[PATCH v1 00/21] MLX5 tunnel Rx offloading"
    http://dpdk.org/ml/archives/dev/2018-March/092264.html

[4] commit 653e038efc9b ("ethdev: remove versioning of filter control
    function")

[5] "[PATCH v4 00/11] Bunch of flow API-related fixes"
    http://dpdk.org/ml/archives/dev/2018-April/096509.html

v3 changes:

- Rebased series, fixed latest conflicts.
- Addressed Andrew's comments, see affected patches for details:
  - Empty RSS types in flow rule means PMD-specific RSS instead of no RSS.
  - RSS hash function now explicitly compared against
    RTE_ETH_HASH_FUNCTION_DEFAULT instead of 0 in all PMDs.
  - sfc PMD updated to also accept Toeplitz.
  - Implicit VLAN TPID matching now removed from all PMDs.
  - Default mask upate for VLAN TCI now split as separate patch #11.
  - Ingress/egress definition clarified in patch #12.

v2 changes:

- Squashed "ethdev: update ABI for flow API functions" in subsequent
  patches.
- Emphasized ABI impact in relevant commit logs.
- Modified documentation in "ethdev: alter behavior of flow API actions" to
  describe how terminating flow rules without any action of the fate kind
  result in undefined behavior instead of dropping traffic.
- Fixed other minor documentation formatting issues.
- Modified "ethdev: refine TPID handling in flow API" as follows:
  - Using standard macro definitions for VLAN, QinQ and E-Tag EtherTypes.
  - Fixed endian conversion in sfc.
  - Replaced a condition in VLAN pattern item processing with an assertion
    check for i40e.

Adrien Mazarguil (16):
  ethdev: add error types to flow API
  ethdev: clarify flow API pattern items and actions
  doc: remove flow API migration section
  ethdev: remove DUP action from flow API
  ethdev: alter behavior of flow API actions
  ethdev: remove C99 flexible arrays from flow API
  ethdev: flatten RSS configuration in flow API
  ethdev: add hash function to RSS flow API action
  ethdev: add encap level to RSS flow API action
  ethdev: refine TPID handling in flow API
  ethdev: limit default VLAN TCI mask in flow API
  ethdev: add transfer attribute to flow API
  ethdev: update behavior of VF/PF in flow API
  ethdev: rename physical port item in flow API
  ethdev: add physical port action to flow API
  ethdev: add port ID item and action to flow API

 app/test-pmd/cmdline_flow.c                 | 394 +++++++++++----
 app/test-pmd/config.c                       |  78 +--
 doc/guides/nics/tap.rst                     |   2 +-
 doc/guides/prog_guide/rte_flow.rst          | 618 ++++++++---------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  60 ++-
 drivers/net/bnxt/bnxt_filter.c              |  49 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  83 ++-
 drivers/net/e1000/igb_rxtx.c                |  55 +-
 drivers/net/enic/enic_flow.c                |  50 +-
 drivers/net/i40e/i40e_ethdev.c              |  57 ++-
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                | 140 +++--
 drivers/net/ixgbe/ixgbe_ethdev.c            |   7 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  91 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  55 +-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                | 117 +++--
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 316 ++++++------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +-
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +-
 drivers/net/mvpp2/mrvl_flow.c               |  32 +-
 drivers/net/sfc/sfc_flow.c                  |  78 ++-
 drivers/net/tap/tap_flow.c                  |  49 +-
 examples/ipsec-secgw/ipsec.c                |  21 +-
 lib/librte_ether/rte_ethdev_version.map     |  22 +-
 lib/librte_ether/rte_flow.c                 |  68 +--
 lib/librte_ether/rte_flow.h                 | 339 ++++++++-----
 lib/librte_net/rte_ether.h                  |   1 +
 34 files changed, 1756 insertions(+), 1127 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v4 00/11] Bunch of flow API-related fixes
  @ 2018-04-10 16:34  3% ` Adrien Mazarguil
  2018-04-16 16:21  3%   ` [dpdk-dev] [PATCH v5 " Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:34 UTC (permalink / raw)
  To: dev

This series contains several fixes for rte_flow and its implementation in
PMDs and testpmd. Upcoming work on the flow API depends on it.

v4 changes:

- Rebased again.
- The reliance on rte_eth_dev_rss_hash_conf_get() was removed from patch #7,
  see updated patch for details.

v3 changes:

- Rebased series.
- Dropped unnecessary "net/sfc: fix endian conversions in flow API".
- Dropped "ethdev: fix ABI version in meson build", handled by prior commit
  d9736a248785 ("ethdev: fix library version in meson build").

v2 changes:

- mlx5 fix (patch #3).
- bnxt fix (patch #4).
- sfc fix (patch #6).
- Missing include (patch #13).

Adrien Mazarguil (11):
  net/mlx4: fix RSS resource leak in case of error
  net/mlx4: fix ignored RSS hash types
  net/mlx5: fix RSS flow action bounds check
  net/bnxt: fix matching of flow API item masks
  app/testpmd: fix flow completion for RSS queues
  app/testpmd: fix lack of flow action configuration
  app/testpmd: fix RSS flow action configuration
  app/testpmd: fix missing RSS fields in flow action
  ethdev: fix shallow copy of flow API RSS action
  ethdev: fix missing boolean values in flow command
  ethdev: fix missing include in flow API

 app/test-pmd/cmdline.c                      |   2 +
 app/test-pmd/cmdline_flow.c                 | 252 ++++++++++++++++++++---
 app/test-pmd/config.c                       | 160 +++++++++-----
 app/test-pmd/testpmd.h                      |  13 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   8 +
 drivers/net/bnxt/bnxt_filter.c              |  14 +-
 drivers/net/mlx4/mlx4_flow.c                |  17 +-
 drivers/net/mlx5/mlx5_flow.c                |   9 +
 lib/librte_ether/rte_flow.c                 | 145 +++++++++----
 lib/librte_ether/rte_flow.h                 |   2 +
 10 files changed, 494 insertions(+), 128 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH] app/test: enhance power manager unit tests
  @ 2018-04-10 14:19  3% ` Hunt, David
  0 siblings, 0 replies; 200+ results
From: Hunt, David @ 2018-04-10 14:19 UTC (permalink / raw)
  To: Reshma Pattan, dev; +Cc: jananeex.m.parthasarathy

Hi Reshma,


On 6/4/2018 2:51 PM, Reshma Pattan wrote:
> Unit Testcases are added for power_acpi_cpu_freq,
> power_kvm_vm_test to improve coverage
>
> Signed-off-by: Jananee Parthasarathy <jananeex.m.parthasarathy@intel.com>
> ---
>   test/test/test_power_acpi_cpufreq.c |  2 +-
>   test/test/test_power_kvm_vm.c       | 62 +++++++++++++++++++++++++++++++++----
>   2 files changed, 57 insertions(+), 7 deletions(-)
>
> diff --git a/test/test/test_power_acpi_cpufreq.c b/test/test/test_power_acpi_cpufreq.c
> index 3bfd033..8da2dcc 100644
> --- a/test/test/test_power_acpi_cpufreq.c
> +++ b/test/test/test_power_acpi_cpufreq.c
> @@ -27,7 +27,7 @@
>   #define TEST_POWER_FREQS_NUM_MAX ((unsigned)RTE_MAX_LCORE_FREQS)
>   
>   #define TEST_POWER_SYSFILE_CUR_FREQ \
> -	"/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq"
> +	"/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_cur_freq"

This change is OK with me, from what I can see using cpuinfo_cur_freq 
instead of
scaling_cpu_freq gives us more compatibility on a wider selection of 
operating systems.

>   
>   static uint32_t total_freq_num;
>   static uint32_t freqs[TEST_POWER_FREQS_NUM_MAX];
> diff --git a/test/test/test_power_kvm_vm.c b/test/test/test_power_kvm_vm.c
> index 91b31c4..012ad82 100644
> --- a/test/test/test_power_kvm_vm.c
> +++ b/test/test/test_power_kvm_vm.c
> @@ -25,12 +25,19 @@
>   #define TEST_POWER_VM_LCORE_ID            0U
>   #define TEST_POWER_VM_LCORE_OUT_OF_BOUNDS (RTE_MAX_LCORE+1)
>   #define TEST_POWER_VM_LCORE_INVALID       1U
> +#define TEMP_POWER_MANAGER_FILE_PATH  "/tmp/testpm"
> +
> +int guest_channel_host_connect(const char *path, unsigned int lcore_id);
> +int power_kvm_vm_enable_turbo(unsigned int lcore_id);
> +int power_kvm_vm_disable_turbo(unsigned int lcore_id);

I see here you are calling guest_channel_host_connect to "emulate" a 
virtio-serial connection
to a host. While I am not a huge fan of faking functionality, I feel 
that having these unit tests
check ABI and API breakages is more beneficial, so I'm good with it for 
this reason.

However, there's no need to have the power_kvm_vm_enable/disable_turbo() 
prototypes, as you
can just use rte_power_freq_enable_turbo() and 
rte_power_freq_disable_turbo(), which in turn
call power_kvm_vm_enable_turbo() and power_kvm_vm_disable_turbo()

>   
>   static int
>   test_power_kvm_vm(void)
>   {
>   	int ret;
>   	enum power_management_env env;
> +	char fPath[PATH_MAX];
> +	FILE *fPtr = NULL;
>   
>   	ret = rte_power_set_env(PM_ENV_KVM_VM);
>   	if (ret != 0) {
> @@ -95,12 +102,31 @@
>   	/* Test initialisation of a valid lcore */
>   	ret = rte_power_init(TEST_POWER_VM_LCORE_ID);
>   	if (ret < 0) {
> -		printf("Cannot initialise power management for lcore %u, this "
> -				"may occur if environment is not configured "
> -				"correctly(KVM VM) or operating in another valid "
> -				"Power management environment\n", TEST_POWER_VM_LCORE_ID);
> -		rte_power_unset_env();
> -		return -1;
> +		printf("rte_power_init failed as expected in host\n");
> +		/* This test would be successful when run on VM,
> +		 * in order to run in Host itself, temporary file path
> +		 * is created and same is used for further communication
> +		 */
> +
> +		snprintf(fPath, PATH_MAX, "%s.%u",
> +			TEMP_POWER_MANAGER_FILE_PATH, TEST_POWER_VM_LCORE_ID);
> +		fPtr = fopen(fPath, "w");
> +		if (fPtr == NULL) {
> +			printf(" Unable to create file\n");
> +			rte_power_unset_env();
> +			return -1;
> +		}
> +		ret = guest_channel_host_connect(TEMP_POWER_MANAGER_FILE_PATH,
> +			TEST_POWER_VM_LCORE_ID);
> +		if (ret == 0)
> +			printf("guest_channel_host_connect successful\n");
> +		else {
> +			printf("guest_channel_host_connect failed\n");
> +			rte_power_unset_env();
> +			fclose(fPtr);
> +			remove(fPath);
> +			return -1;
> +		}
>   	}
>   
>   	/* Test initialisation of previously initialised lcore */
> @@ -175,6 +201,22 @@
>   		goto fail_all;
>   	}
>   
> +	/* Test KVM_VM Enable Turbo of valid core */
> +	ret = power_kvm_vm_enable_turbo(TEST_POWER_VM_LCORE_ID);

see comment above about using rte_power_freq_enable_turbo()

> +	if (ret == -1) {
> +		printf("power_kvm_vm_enable_turbo failed on valid lcore"
> +			"%u\n", TEST_POWER_VM_LCORE_ID);
> +		goto fail_all;
> +	}
> +
> +	/* Test KVM_VM Disable Turbo of valid core */
> +	ret = power_kvm_vm_disable_turbo(TEST_POWER_VM_LCORE_ID);

see comment above about using rte_power_freq_disable_turbo()

> +	if (ret == -1) {
> +		printf("power_kvm_vm_disable_turbo failed on valid lcore"
> +		"%u\n", TEST_POWER_VM_LCORE_ID);
> +		goto fail_all;
> +	}
> +
>   	/* Test frequency up of valid lcore */
>   	ret = rte_power_freq_up(TEST_POWER_VM_LCORE_ID);
>   	if (ret != 1) {
> @@ -274,10 +316,18 @@
>   		return -1;
>   	}
>   	rte_power_unset_env();
> +	if (fPtr != NULL) {
> +		fclose(fPtr);
> +		remove(fPath);
> +	}
>   	return 0;
>   fail_all:
>   	rte_power_exit(TEST_POWER_VM_LCORE_ID);
>   	rte_power_unset_env();
> +	if (fPtr != NULL) {
> +		fclose(fPtr);
> +		remove(fPath);
> +	}
>   	return -1;
>   }
>   #endif

With the changes described above:

Acked-by: David Hunt <david.hunt@intel.com>

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 17:26  0%           ` Dumitrescu, Cristian
@ 2018-04-10 12:32  0%             ` Van Haaren, Harry
  0 siblings, 0 replies; 200+ results
From: Van Haaren, Harry @ 2018-04-10 12:32 UTC (permalink / raw)
  To: Dumitrescu, Cristian, Neil Horman
  Cc: dev, Ananyev, Konstantin, Stephen Hemminger, Singh, Jasvinder,
	Richardson, Bruce

+CC Neil from other reply

> From: Dumitrescu, Cristian
> Sent: Monday, April 9, 2018 6:27 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Van Haaren, Harry
> <harry.van.haaren@intel.com>; Stephen Hemminger <stephen@networkplumber.org>;
> Singh, Jasvinder <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Cc: dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> 
> > >
> > > If people think that this function conversion is not nice, it can be
> reworked
> > in multiple ways at the expense of API (but not ABI) change:
> > > 1. Define the hash function field in the table parameter structure as
> > opaque void * rather than 4-parameter version.
> > > 2. Create a separate parameter structure just for this hash table type.
> >
> > Why just not define your f_hash member as a union:
> >
> > struct rte_table_hash_params {
> > ...
> > union {
> >     rte_table_hash_op_hash  f_hash_4params;
> >     rte_hash_function f_hash_3_params;
> > };
> >
> > ?
> >
> 
> Yes, agreed, this is yet another way to handle this, thanks Konstantin.

Agree that this solution is a lot better than raw casting.

The issue I have with casting is that it doesn't explicitly show that the signature is different, and that the code must be aware of that fact. With a union, at least the code explicitly states that there is a difference in signature, and that this is being handled by the code, so this looks a better solution.

Neil proposed an alternative solution using a bit to indicate calling params in a separate reply - another possibility.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 16:38  4%     ` Van Haaren, Harry
  2018-04-09 16:43  0%       ` Ferruh Yigit
  2018-04-09 17:02  4%       ` Dumitrescu, Cristian
@ 2018-04-10 11:43  0%       ` Neil Horman
  2 siblings, 0 replies; 200+ results
From: Neil Horman @ 2018-04-10 11:43 UTC (permalink / raw)
  To: Van Haaren, Harry
  Cc: Dumitrescu, Cristian, Stephen Hemminger, Singh, Jasvinder,
	Richardson, Bruce, dev

On Mon, Apr 09, 2018 at 04:38:11PM +0000, Van Haaren, Harry wrote:
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu, Cristian
> > Sent: Monday, April 9, 2018 4:59 PM
> > To: Stephen Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
> > <jasvinder.singh@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> > 
> > 
> > 
> > > -----Original Message-----
> > > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > > Sent: Monday, April 9, 2018 4:10 PM
> > > To: Singh, Jasvinder <jasvinder.singh@intel.com>
> > > Cc: dev@dpdk.org; Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> > > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> > >
> > > On Mon,  9 Apr 2018 13:49:48 +0100
> > > Jasvinder Singh <jasvinder.singh@intel.com> wrote:
> > >
> > > > Fix build error with gcc 8.0 due to cast between function types.
> > > > Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
> > > >
> > > > Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > > > ---
> > > >  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
> > > >  1 file changed, 3 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
> > > b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > index dcb4fe9..f7eae27 100644
> > > > --- a/lib/librte_table/rte_table_hash_cuckoo.c
> > > > +++ b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void *params,
> > > >  		return NULL;
> > > >  	}
> > > >
> > > > +	void *hash_func = p->f_hash;
> > > > +
> > > >  	/* Create cuckoo hash table */
> > > >  	struct rte_hash_parameters hash_cuckoo_params = {
> > > >  		.entries = p->n_keys,
> > > >  		.key_len = p->key_size,
> > > > -		.hash_func = (rte_hash_function)(p->f_hash),
> > > > +		.hash_func = (rte_hash_function) hash_func,
> > > >  		.hash_func_init_val = p->seed,
> > > >  		.socket_id = socket_id,
> > > >  		.name = p->name
> > >
> > > This is just tricking the compiler into not complaining.
> > > I would really rather see the two hash functions made the same.
> > 
> > (Adding Bruce as well to consolidate all conversations in a single thread.)
> > 
> > What we want to do here is be able to use the librte_hash under the same API
> > as the several hash table flavors implemented in librte_table.
> > 
> > Both of these libraries allow configuring the hash function per each hash
> > table instance. Problem is: hash function in librte_hash has only 3 parameters
> > (no key mask), while hash function in librte_table has 4 parameters (includes
> > key mask). The key mask helps a lot for practical protocol implementations by
> > avoiding key copy & pre-process on lookup.
> > 
> > So then: how to plug in librte_hash under the same API as the suite of hash
> > tables in librte_table? We don't want to re-implement cuckoo hash from
> > librte_hash, we simply want to invoke it as a low-level primitive, similarly
> > to how the LPM and ACL tables are plugged into librte_table.
> > 
> > Solution is: as an exception, pass a 3-parameter hash function to cuckoo hash
> > flavor under the librte_table. Maybe this should be documented better. This
> > currently triggers a build warning with gcc 8, which is easy to fix, hence
> > this trivial patch.
> > 
> > Ideally, for every 3-parameter hash function, I would like to generate the
> > corresponding 4-parameter hash function on-the-fly, but unfortunately this is
> > not what C language can do.
> > 
> > Of course, IMO the best solution is to add key mask support to librte_hash.
> 
> 
> Looking at the previous discussion I see the following as a possible solution;
> 
> Given the current code looks broken it should be fixed in this release.
> Given the actual code fix is an API / ABI break (depending on solution) it cannot be merged official in this release.
> We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at compile time.
> 
> With the above 3 points, I think the best solution is to correctly fix the problem that GCC 8 is identifying, and putting that new API inside the NEXT_ macros.
> 
> In this case, we can preserve backwards (buggy) behavior if required, and provide correct (but API/ABI breaking) code as well. This is a tough decision - particularly for distros - what do they package?
> 
> Given the current code, I don't see a better solution - but I hope I'm wrong :)
> 
Why not make the hash_func pointer in the rte_hash_parameters structure an
anonymous union, and reserve a bit in the extra_flag field to denote if the
function pointer has 3 arguments or 4?  Then rte_hash_hash can use the
appropriate calling convention on hash_func.

Neil

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions
  2018-04-10 10:19  0%       ` Adrien Mazarguil
@ 2018-04-10 11:06  0%         ` Shahaf Shuler
  0 siblings, 0 replies; 200+ results
From: Shahaf Shuler @ 2018-04-10 11:06 UTC (permalink / raw)
  To: Adrien Mazarguil, Mohammad Abdul Awal; +Cc: Declan Doherty, dev, Alex Rosenbaum

Hi,

Adding small comment on top of Adrien's

Tuesday, April 10, 2018 1:20 PM, Adrien Mazarguil:
> On Mon, Apr 09, 2018 at 05:10:35PM +0100, Mohammad Abdul Awal wrote:
> > On 06/04/2018 21:26, Adrien Mazarguil wrote:
> > > On Fri, Apr 06, 2018 at 01:24:01PM +0100, Declan Doherty wrote:
> > > > Add new flow action types and associated action data structures to
> > > > support the encapsulation and decapsulation of the virtual tunnel
> > > > endpoints.
> > > >
> > > > The RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP action will cause the
> > > > matching flow to be encapsulated in the virtual tunnel endpoint
> > > > overlay defined in the tunnel_encap action data.
> > > >
> > > > The RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP action will cause all
> > > > virtual tunnel endpoint overlays up to and including the first
> > > > instance of the flow item type defined in the tunnel_decap action
> > > > data for the matching flows.
> > > >
> > > > Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> > > This generic approach looks flexible enough to cover the use cases
> > > that immediately come to mind (VLAN, VXLAN), its design is sound.
> > >
> > > However, while I'm aware it's not a concern at this point, it won't
> > > be able to deal with stateful tunnel or encapsulation types (e.g.
> > > IPsec or TCP) which will require additional meta data or some
> > > run-time assistance from the application.
> > >
> > > Eventually for more complex use cases, dedicated encap/decap actions
> > > will have to appear, so the issue I wanted to raise before going further is
> this:
> > >
> > > Going generic inevitably trades some of the usability; flat
> > > structures dedicated to VXLAN encap/decap with only the needed info
> > > to get the job done would likely be easier to implement in PMDs and
> > > use in applications. Any number of such actions can be added to rte_flow
> without ABI impact.
> > >
> > > If VXLAN is the only use case at this point, my suggestion would be
> > > to go with simpler RTE_FLOW_ACTION_TYPE_VXLAN_(ENCAP|DECAP)
> actions,
> > > with fixed
> > > L2/L3/L4/L5 header definitions to prepend according to RFC 7348.
> > We can go this way and this will increase the action for more and more
> > tunneling protocols being added. Current proposal is already a generic
> > approach which specifies as a tunnel for all the tunneling protocols.
> 
> Right, on the other hand there are not that many standard encapsulations
> offloaded by existing devices. rte_flow could easily handle dedicated actions
> for each of them without problem.
> 
> My point is that many of those (will eventually) have their own quirks to
> manage when doing encap/decap, it's not just a matter of prepending or
> removing a bunch of header definitions, otherwise we could as well let
> applications simply provide an arbitrary buffer to prepend.
> 
> Consider that the "generic" part is already built into rte_flow as the way
> patterns and action are handled. Adding another generic layer on top of that
> could make things more inconvenient than necessary to applications (my
> main concern).
> 
> You'd need another layer of validation/error reporting machinery to properly
> let applications know they cannot encap VXLAN on top of TCP on top of
> QinQinQinQinQ for instance. Either a single bounded encapsulation definition
> or a combination at the action list level is needed to avoid that.
> 
> > > Now we can start with the generic approach, see how it fares and add
> > > dedicated encap/decap later as needed.
> > >
> > > More comments below.
> <snip>
> > > > +Action: ``TUNNEL_ENCAP``
> > > > +^^^^^^^^^^^^^^^^^^^^^^

The ENCAP/DECAP doesn't have to be in the context of tunnel.
For example - let's take GRE - application may want to decap the GRE and encap it with L2. The L2 encapsulation is not related to any tunnel. 
Same for the other direction - VM sends Eth frame, and we want to decap the Eth and encap with GRE.

I think those action should be free from the tunnel association and just provide flow items we want to encap/decap or in a more generic way offset to the packet headers and buffer to encap (not sure how many devices supports that, may be overkill at this point). 

> > > > +
> > > > +Performs an encapsulation action by encapsulating the flows
> > > > +matched by the pattern items according to the network overlay
> > > > +defined in the ``rte_flow_action_tunnel_encap`` pattern items.
> > > > +
> > > > +This action modifies the payload of matched flows. The pattern
> > > > +items specified in the ``rte_flow_action_tunnel_encap`` action
> > > > +structure must defined a valid set of overlay headers, from the
> > > > +Ethernet header up to the overlay header. The pattern must be
> terminated with the RTE_FLOW_ITEM_TYPE_END item type.
> > > Regarding the use of a pattern list, if you consider PMDs are
> > > already iterating on a list of actions when encountering
> > > RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, it adds yet another inner
> loop.
> > We understand that it is implementation specifics. If we do not go for
> > another inner loop, all the bundling need to be handled in the same
> > function, which seems more clumsy to me. This also breaks the tunnel
> > endpoint concept.
> > >
> > > How about making each encountered
> RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP
> > > provide exactly one item instead (in encap, i.e. reverse order)?
> > Again, if we have tunnel action, security action, and other actions,
> > all the processing and tracking need to be done in one function. Now
> > we will need ETH_ENCAP/DECAP, UDP_ENCAP/DECAP,
> NVGRE_ENCAP/DECAP, etc.
> 
> Well, the number of DECAP actions doesn't need to perfectly reflect that of
> ENCAP since it implies all preceding layers. No problem with that.
> 
> Regarding multiple dedicated actions, my suggestion was for a single generic
> one as in this patch, but each instance on the ENCAP side would deal with a
> single protocol layer, instead of having a single ENCAP action with multiple
> inner layers (and thus an inner loop).
> 
> PMDs also gain the ability to precisely report which encap step fails by making
> rte_flow_error point to the problematic object to ease debugging of flow
> rules on the application side.
> 
> Why would that break the tunnel idea and more importantly, how would it
> prevent PMD developers from splitting their processing into multiple
> functions?
> 
> > >
> > > In which case perhaps "GENERIC" would be a better fit than "TUNNEL".
> > >
> <snip>
> > > > +
> > > > +   +-------+--------------------------+------------+
> > > > +   | Index | Flow Item Type           | Flow Item  |
> > > > +   +=======+==========================+============+
> > > > +   | 0     | RTE_FLOW_ITEM_TYPE_ETH   | eth item   |
> > > > +   +-------+--------------------------+------------+
> > > > +   | 1     | RTE_FLOW_ITEM_TYPE_IPV4  | ipv4 item  |
> > > > +   +-------+--------------------------+------------+
> > > > +   | 2     | RTE_FLOW_ITEM_TYPE_UDP   | udp item   |
> > > > +   +-------+--------------------------+------------+
> > > > +   | 3     | RTE_FLOW_ITEM_TYPE_VXLAN | vxlan item |
> > > > +   +-------+--------------------------+------------+
> > > > +   | 4     | RTE_FLOW_ITEM_TYPE_END   | NULL       |
> > > > +   +-------+--------------------------+------------+
> > > One possible issue is that it relies on objects normally found on
> > > the pattern side of flow rules. Those are supposed to match
> > > something, they are not intended for packet header generation. While
> their "spec" and "mask"
> > > fields might make sense in this context, the "last" field is odd.
> > >
> > > You must define them without leaving anything open for
> > > interpretation by PMDs and users alike. Defining things as
> > > "undefined" is fine as long as it's covered.
> > Please note that the "void *item" in the
> > "rte_flow_action_tunnel_encap.pattern" points to the data structure
> > defined for the corresponding rte_flow_item_type instead of a
> > rte_flow_item structure. As an example, for the rte_flow_item_eth type,
> the "void *item"
> > will point to a "struct rte_flow_item_eth" instance. Thats why we have
> > defined struct rte_flow_action_item inside struct
> > rte_flow_action_tunnel_encap. So, no question of spec, mask, last
> anymore.
> 
> Right, I noticed that after commenting its structure definition below.
> 
> I think I won't be the only one confused by this approach, also because a
> mask is needed in addition to a specification structure, otherwise how do you
> plan to tell what fields are relevant in application-provided protocol headers?
> 
> An application might set unusual IPv4/UDP/VXLAN fields and expect them to
> be part of the encapsulated traffic. Without a mask, a PMD must take
> headers verbatim, and I don't think many devices are ready for that yet.
> 
> Hence my other suggestion: defining inflexible $PROTOCOL_(ENCAP|DECAP)
> actions that do not allow more than what's defined by official RFCs for
> $PROTOCOL.
> 
> <snip>
> > > > + */
> > > > +struct rte_flow_action_tunnel_encap {
> > > > +	struct rte_flow_action_item {
> > > > +		enum rte_flow_item_type type;
> > > > +		/**< Flow item type. */
> > > > +		const void *item;
> > > > +		/**< Flow item definition which points to the data of
> > > > +		 * corresponding rte_flow_item_type.
> > > > +		 */
> > > I see it's a new action type, albeit a bit confusing (there is no
> > > RTE_FLOW_ACTION_TYPE_ITEM).
> > >
> > > I suggest the standard pattern item type since you're going with
> > > enum rte_flow_item_type anyway. Keep in mind you need some kind of
> > > mask to tell what fields are relevant. An application might
> > > otherwise want to encap with unsupported properties (e.g. specific IPv4
> ToS field and whatnot).
> > >
> > > How about a single "struct rte_flow_pattern_item item", neither
> > > const and neither a pointer. It's generic enough, enclosed
> > > spec/last/mask pointers take care of the specifics. You just need to
> > > define what's supposed to happen when "last" is set.
> > Please see the comment above regarding this field.
> 
> Point still stands, if you need to distinguish spec and mask, a more complete
> structure is needed. Instead of adding a new (confusing) type, you should
> use rte_flow_item and define what happens when "last" is set.
> 
> You should define it as reserved for now, any non-NULL value is an error. This
> field might also be useful later.
> 
> <snip>
> > > > +};
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ACTION_TYP_TUNNEL_DECAP
> > > > + *
> > > > + * Virtual tunnel end-point decapsulation action data.
> > > > + *
> > > > + * Non-terminating action by default.
> > > > + */
> > > > +struct rte_flow_action_tunnel_decap {
> > > > +	enum rte_flow_item_type type;
> > > > +	/**<
> > > > +	 * Flow item type of virtual tunnel end-point to be decapsulated
> > > > +	 */
> > > > +};
> > > Note that contrary to ENCAP, DECAP wouldn't necessarily need
> > > repeated actions to peel each layer off. The current definition is fine.
> > To clarify, the the decap is upto the PMD to remove all the header for
> > a specified type. For example, for
> >
> > rte_flow_item_type type=RTE_FLOW_ITEM_TYPE_VXLAN, the PMD will
> peel off (ETH, IPV4, UDP, VXLAN) header all together.
> 
> Yep, that's fine, whether we use multiple actions or a single one doing
> multiple things, a single DECAP can peel them off all at once :)
> 
> > >
> > > > +
> > > > +/**
> > > >    * Definition of a single action.
> > > >    *
> > > >    * A list of actions is terminated by a END action.
> > > > --
> > > > 2.7.4
> > > >
> 
> If the reasons I gave did not manage to convince you about choosing
> between
> either fixed (VLAN|VXLAN)_(ENCAP|DECAP) actions or generic encap/decap
> actions that deal with a single protocol layer at once instead of the
> proposed approach, I'm ready to try it out as an experimental API (all new
> objects tagged as experimental) *if* you address the lack of mask, which
> remains an open issue.
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions
  2018-04-09 16:10  0%     ` Mohammad Abdul Awal
@ 2018-04-10 10:19  0%       ` Adrien Mazarguil
  2018-04-10 11:06  0%         ` Shahaf Shuler
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 10:19 UTC (permalink / raw)
  To: Mohammad Abdul Awal; +Cc: Declan Doherty, dev

On Mon, Apr 09, 2018 at 05:10:35PM +0100, Mohammad Abdul Awal wrote:
> On 06/04/2018 21:26, Adrien Mazarguil wrote:
> > On Fri, Apr 06, 2018 at 01:24:01PM +0100, Declan Doherty wrote:
> > > Add new flow action types and associated action data structures to
> > > support the encapsulation and decapsulation of the virtual tunnel
> > > endpoints.
> > > 
> > > The RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP action will cause the matching
> > > flow to be encapsulated in the virtual tunnel endpoint overlay
> > > defined in the tunnel_encap action data.
> > > 
> > > The RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP action will cause all virtual
> > > tunnel endpoint overlays up to and including the first instance of
> > > the flow item type defined in the tunnel_decap action data for the
> > > matching flows.
> > > 
> > > Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> > This generic approach looks flexible enough to cover the use cases that
> > immediately come to mind (VLAN, VXLAN), its design is sound.
> > 
> > However, while I'm aware it's not a concern at this point, it won't be able
> > to deal with stateful tunnel or encapsulation types (e.g. IPsec or TCP)
> > which will require additional meta data or some run-time assistance from the
> > application.
> > 
> > Eventually for more complex use cases, dedicated encap/decap actions will
> > have to appear, so the issue I wanted to raise before going further is this:
> > 
> > Going generic inevitably trades some of the usability; flat structures
> > dedicated to VXLAN encap/decap with only the needed info to get the job done
> > would likely be easier to implement in PMDs and use in applications. Any
> > number of such actions can be added to rte_flow without ABI impact.
> > 
> > If VXLAN is the only use case at this point, my suggestion would be to go
> > with simpler RTE_FLOW_ACTION_TYPE_VXLAN_(ENCAP|DECAP) actions, with fixed
> > L2/L3/L4/L5 header definitions to prepend according to RFC 7348.
> We can go this way and this will increase the action for more and more
> tunneling protocols being added. Current proposal is already a generic
> approach which specifies as a tunnel for all the tunneling protocols.

Right, on the other hand there are not that many standard encapsulations
offloaded by existing devices. rte_flow could easily handle dedicated
actions for each of them without problem.

My point is that many of those (will eventually) have their own quirks to
manage when doing encap/decap, it's not just a matter of prepending or
removing a bunch of header definitions, otherwise we could as well let
applications simply provide an arbitrary buffer to prepend.

Consider that the "generic" part is already built into rte_flow as the way
patterns and action are handled. Adding another generic layer on top of that
could make things more inconvenient than necessary to applications (my main
concern).

You'd need another layer of validation/error reporting machinery to properly
let applications know they cannot encap VXLAN on top of TCP on top of
QinQinQinQinQ for instance. Either a single bounded encapsulation definition
or a combination at the action list level is needed to avoid that.

> > Now we can start with the generic approach, see how it fares and add
> > dedicated encap/decap later as needed.
> > 
> > More comments below.
<snip>
> > > +Action: ``TUNNEL_ENCAP``
> > > +^^^^^^^^^^^^^^^^^^^^^^
> > > +
> > > +Performs an encapsulation action by encapsulating the flows matched by the
> > > +pattern items according to the network overlay defined in the
> > > +``rte_flow_action_tunnel_encap`` pattern items.
> > > +
> > > +This action modifies the payload of matched flows. The pattern items specified
> > > +in the ``rte_flow_action_tunnel_encap`` action structure must defined a valid
> > > +set of overlay headers, from the Ethernet header up to the overlay header. The
> > > +pattern must be terminated with the RTE_FLOW_ITEM_TYPE_END item type.
> > Regarding the use of a pattern list, if you consider PMDs are already
> > iterating on a list of actions when encountering
> > RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, it adds yet another inner loop.
> We understand that it is implementation specifics. If we do not go for
> another inner loop, all the bundling need to be handled in the same
> function, which seems more clumsy to me. This also breaks the tunnel
> endpoint concept.
> > 
> > How about making each encountered RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP provide
> > exactly one item instead (in encap, i.e. reverse order)?
> Again, if we have tunnel action, security action, and other actions, all the
> processing and tracking need to be done in one function. Now we will need
> ETH_ENCAP/DECAP, UDP_ENCAP/DECAP, NVGRE_ENCAP/DECAP, etc.

Well, the number of DECAP actions doesn't need to perfectly reflect that of
ENCAP since it implies all preceding layers. No problem with that.

Regarding multiple dedicated actions, my suggestion was for a single generic
one as in this patch, but each instance on the ENCAP side would deal with a
single protocol layer, instead of having a single ENCAP action with multiple
inner layers (and thus an inner loop).

PMDs also gain the ability to precisely report which encap step fails by
making rte_flow_error point to the problematic object to ease debugging of
flow rules on the application side.

Why would that break the tunnel idea and more importantly, how would it
prevent PMD developers from splitting their processing into multiple
functions?

> > 
> > In which case perhaps "GENERIC" would be a better fit than "TUNNEL".
> > 
<snip>
> > > +
> > > +   +-------+--------------------------+------------+
> > > +   | Index | Flow Item Type           | Flow Item  |
> > > +   +=======+==========================+============+
> > > +   | 0     | RTE_FLOW_ITEM_TYPE_ETH   | eth item   |
> > > +   +-------+--------------------------+------------+
> > > +   | 1     | RTE_FLOW_ITEM_TYPE_IPV4  | ipv4 item  |
> > > +   +-------+--------------------------+------------+
> > > +   | 2     | RTE_FLOW_ITEM_TYPE_UDP   | udp item   |
> > > +   +-------+--------------------------+------------+
> > > +   | 3     | RTE_FLOW_ITEM_TYPE_VXLAN | vxlan item |
> > > +   +-------+--------------------------+------------+
> > > +   | 4     | RTE_FLOW_ITEM_TYPE_END   | NULL       |
> > > +   +-------+--------------------------+------------+
> > One possible issue is that it relies on objects normally found on the
> > pattern side of flow rules. Those are supposed to match something, they are
> > not intended for packet header generation. While their "spec" and "mask"
> > fields might make sense in this context, the "last" field is odd.
> > 
> > You must define them without leaving anything open for interpretation by
> > PMDs and users alike. Defining things as "undefined" is fine as long as it's
> > covered.
> Please note that the "void *item" in the
> "rte_flow_action_tunnel_encap.pattern" points to the data structure defined
> for the corresponding rte_flow_item_type instead of a rte_flow_item
> structure. As an example, for the rte_flow_item_eth type, the "void *item"
> will point to a "struct rte_flow_item_eth" instance. Thats why we have
> defined struct rte_flow_action_item inside struct
> rte_flow_action_tunnel_encap. So, no question of spec, mask, last anymore.

Right, I noticed that after commenting its structure definition below.

I think I won't be the only one confused by this approach, also because a
mask is needed in addition to a specification structure, otherwise how do
you plan to tell what fields are relevant in application-provided protocol
headers?

An application might set unusual IPv4/UDP/VXLAN fields and expect them to be
part of the encapsulated traffic. Without a mask, a PMD must take headers
verbatim, and I don't think many devices are ready for that yet.

Hence my other suggestion: defining inflexible $PROTOCOL_(ENCAP|DECAP)
actions that do not allow more than what's defined by official RFCs for
$PROTOCOL.

<snip>
> > > + */
> > > +struct rte_flow_action_tunnel_encap {
> > > +	struct rte_flow_action_item {
> > > +		enum rte_flow_item_type type;
> > > +		/**< Flow item type. */
> > > +		const void *item;
> > > +		/**< Flow item definition which points to the data of
> > > +		 * corresponding rte_flow_item_type.
> > > +		 */
> > I see it's a new action type, albeit a bit confusing (there is no
> > RTE_FLOW_ACTION_TYPE_ITEM).
> > 
> > I suggest the standard pattern item type since you're going with enum
> > rte_flow_item_type anyway. Keep in mind you need some kind of mask to tell
> > what fields are relevant. An application might otherwise want to encap with
> > unsupported properties (e.g. specific IPv4 ToS field and whatnot).
> > 
> > How about a single "struct rte_flow_pattern_item item", neither const and
> > neither a pointer. It's generic enough, enclosed spec/last/mask pointers
> > take care of the specifics. You just need to define what's supposed to
> > happen when "last" is set.
> Please see the comment above regarding this field.

Point still stands, if you need to distinguish spec and mask, a more
complete structure is needed. Instead of adding a new (confusing) type, you
should use rte_flow_item and define what happens when "last" is set.

You should define it as reserved for now, any non-NULL value is an
error. This field might also be useful later.

<snip>
> > > +};
> > > +
> > > +/**
> > > + * RTE_FLOW_ACTION_TYP_TUNNEL_DECAP
> > > + *
> > > + * Virtual tunnel end-point decapsulation action data.
> > > + *
> > > + * Non-terminating action by default.
> > > + */
> > > +struct rte_flow_action_tunnel_decap {
> > > +	enum rte_flow_item_type type;
> > > +	/**<
> > > +	 * Flow item type of virtual tunnel end-point to be decapsulated
> > > +	 */
> > > +};
> > Note that contrary to ENCAP, DECAP wouldn't necessarily need repeated
> > actions to peel each layer off. The current definition is fine.
> To clarify, the the decap is upto the PMD to remove all the header for a
> specified type. For example, for
> 
> rte_flow_item_type type=RTE_FLOW_ITEM_TYPE_VXLAN, the PMD will peel off (ETH, IPV4, UDP, VXLAN) header all together.

Yep, that's fine, whether we use multiple actions or a single one doing
multiple things, a single DECAP can peel them off all at once :)

> > 
> > > +
> > > +/**
> > >    * Definition of a single action.
> > >    *
> > >    * A list of actions is terminated by a END action.
> > > -- 
> > > 2.7.4
> > > 

If the reasons I gave did not manage to convince you about choosing between
either fixed (VLAN|VXLAN)_(ENCAP|DECAP) actions or generic encap/decap
actions that deal with a single protocol layer at once instead of the
proposed approach, I'm ready to try it out as an experimental API (all new
objects tagged as experimental) *if* you address the lack of mask, which
remains an open issue.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v6 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters
  2018-04-10  9:43  4% ` [dpdk-dev] [PATCH v6 " Remy Horton
@ 2018-04-10  9:43  7%   ` Remy Horton
  2018-04-10 18:56  0%   ` [dpdk-dev] [PATCH v6 0/4] ethdev: add per-PMD tuning of RxTx parmeters Ferruh Yigit
  1 sibling, 0 replies; 200+ results
From: Remy Horton @ 2018-04-10  9:43 UTC (permalink / raw)
  To: dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

The optimal values of several transmission & reception related
parameters, such as burst sizes, descriptor ring sizes, and number
of queues, varies between different network interface devices. This
patch allows individual PMDs to specify preferred parameter values.

Signed-off-by: Remy Horton <remy.horton@intel.com>
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
Acked-by: Shreyansh Jain <shreyansh.jain@nxp.com>
---
 doc/guides/rel_notes/deprecation.rst   | 13 ----------
 doc/guides/rel_notes/release_18_05.rst | 35 +++++++++++++++++++++++++++
 lib/librte_ether/rte_ethdev.c          | 44 +++++++++++++++++++++++++++-------
 lib/librte_ether/rte_ethdev.h          | 25 +++++++++++++++++++
 4 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ec70b5f..d13077d 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -112,19 +112,6 @@ Deprecation Notices
   The new API add rss_level field to ``rte_eth_rss_conf`` to enable a choice
   of RSS hash calculation on outer or inner header of tunneled packet.
 
-* ethdev:  Currently, if the  rte_eth_rx_burst() function returns a value less
-  than *nb_pkts*, the application will assume that no more packets are present.
-  Some of the hw queue based hardware can only support smaller burst for RX
-  and TX and thus break the expectation of the rx_burst API. Similar is the
-  case for TX burst as well as ring sizes. ``rte_eth_dev_info`` will be added
-  with following new parameters so as to support semantics for drivers to
-  define a preferred size for Rx/Tx burst and rings.
-
-  - Member ``struct preferred_size`` would be added to enclose all preferred
-    size to be fetched from driver/implementation.
-  - Members ``uint16_t rx_burst``,  ``uint16_t tx_burst``, ``uint16_t rx_ring``,
-    and ``uint16_t tx_ring`` would be added to ``struct preferred_size``.
-
 * ethdev: A work is being planned for 18.05 to expose VF port representors
   as a mean to perform control and data path operation on the different VFs.
   As VF representor is an ethdev port, new fields are needed in order to map
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index abc1c17..0d57a63 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -58,6 +58,11 @@ New Features
   * Added support for NVGRE, VXLAN and GENEVE filters in flow API.
   * Added support for DROP action in flow API.
 
+* **Added PMD-recommended Tx and Rx parameters**
+
+  Applications can now query drivers for device-tuned values of
+  ring sizes, burst sizes, and number of queues.
+
 
 API Changes
 -----------
@@ -83,6 +88,29 @@ API Changes
   memory footprint which helps in better cache utilization when large number
   of meter objects are used.
 
+* **Changes to semantics of rte_eth_dev_configure() parameters.**
+
+   If both the ``nb_rx_q`` and ``nb_tx_q`` parameters are zero,
+   ``rte_eth_dev_configure`` will now use PMD-recommended queue sizes, or if
+   recommendations are not provided by the PMD the function will use ethdev
+   fall-back values. Previously setting both of the parameters to zero would
+   have resulted in ``-EINVAL`` being returned.
+
+* **Changes to semantics of rte_eth_rx_queue_setup() parameters.**
+
+   If the ``nb_rx_desc`` parameter is zero, ``rte_eth_rx_queue_setup`` will
+   now use the PMD-recommended Rx ring size, or in the case where the PMD
+   does not provide a recommendation, will use an ethdev-provided
+   fall-back value. Previously, setting ``nb_rx_desc`` to zero would have
+   resulted in an error.
+
+* **Changes to semantics of rte_eth_tx_queue_setup() parameters.**
+
+   If the ``nb_tx_desc`` parameter is zero, ``rte_eth_tx_queue_setup`` will
+   now use the PMD-recommended Tx ring size, or in the case where the PMD
+   does not provide a recoomendation, will use an ethdev-provided
+   fall-back value. Previously, setting ``nb_tx_desc`` to zero would have
+   resulted in an error.
 
 ABI Changes
 -----------
@@ -97,6 +125,13 @@ ABI Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Additional fields in rte_eth_dev_info.**
+
+  The ``rte_eth_dev_info`` structure has had two extra entries appended to the
+  end of it: ``default_rxportconf`` and ``default_txportconf``. Each of these
+  in turn are ``rte_eth_dev_portconf`` structures containing three fields of
+  type ``uint16_t``: ``burst_size``, ``ring_size``, and ``nb_queues``. These
+  are parameter values recommended for use by the PMD.
 
 Removed Items
 -------------
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e..209796d 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -1061,6 +1061,26 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 
 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
 
+	dev = &rte_eth_devices[port_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
+	(*dev->dev_ops->dev_infos_get)(dev, &dev_info);
+
+	/* If number of queues specified by application for both Rx and Tx is
+	 * zero, use driver preferred values. This cannot be done individually
+	 * as it is valid for either Tx or Rx (but not both) to be zero.
+	 * If driver does not provide any preferred valued, fall back on
+	 * EAL defaults.
+	 */
+	if (nb_rx_q == 0 && nb_tx_q == 0) {
+		nb_rx_q = dev_info.default_rxportconf.nb_queues;
+		if (nb_rx_q == 0)
+			nb_rx_q = RTE_ETH_DEV_FALLBACK_RX_NBQUEUES;
+		nb_tx_q = dev_info.default_txportconf.nb_queues;
+		if (nb_tx_q == 0)
+			nb_tx_q = RTE_ETH_DEV_FALLBACK_TX_NBQUEUES;
+	}
+
 	if (nb_rx_q > RTE_MAX_QUEUES_PER_PORT) {
 		RTE_PMD_DEBUG_TRACE(
 			"Number of RX queues requested (%u) is greater than max supported(%d)\n",
@@ -1075,8 +1095,6 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 		return -EINVAL;
 	}
 
-	dev = &rte_eth_devices[port_id];
-
 	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
 	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
 
@@ -1106,13 +1124,6 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 	 * than the maximum number of RX and TX queues supported by the
 	 * configured device.
 	 */
-	(*dev->dev_ops->dev_infos_get)(dev, &dev_info);
-
-	if (nb_rx_q == 0 && nb_tx_q == 0) {
-		RTE_PMD_DEBUG_TRACE("ethdev port_id=%d both rx and tx queue cannot be 0\n", port_id);
-		return -EINVAL;
-	}
-
 	if (nb_rx_q > dev_info.max_rx_queues) {
 		RTE_PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_queues=%d > %d\n",
 				port_id, nb_rx_q, dev_info.max_rx_queues);
@@ -1477,6 +1488,14 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
 		return -EINVAL;
 	}
 
+	/* Use default specified by driver, if nb_rx_desc is zero */
+	if (nb_rx_desc == 0) {
+		nb_rx_desc = dev_info.default_rxportconf.ring_size;
+		/* If driver default is also zero, fall back on EAL default */
+		if (nb_rx_desc == 0)
+			nb_rx_desc = RTE_ETH_DEV_FALLBACK_RX_RINGSIZE;
+	}
+
 	if (nb_rx_desc > dev_info.rx_desc_lim.nb_max ||
 			nb_rx_desc < dev_info.rx_desc_lim.nb_min ||
 			nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) {
@@ -1600,6 +1619,13 @@ rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
 
 	rte_eth_dev_info_get(port_id, &dev_info);
 
+	/* Use default specified by driver, if nb_tx_desc is zero */
+	if (nb_tx_desc == 0) {
+		nb_tx_desc = dev_info.default_txportconf.ring_size;
+		/* If driver default is zero, fall back on EAL default */
+		if (nb_tx_desc == 0)
+			nb_tx_desc = RTE_ETH_DEV_FALLBACK_TX_RINGSIZE;
+	}
 	if (nb_tx_desc > dev_info.tx_desc_lim.nb_max ||
 	    nb_tx_desc < dev_info.tx_desc_lim.nb_min ||
 	    nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) {
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 5e13dca..685145f 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -988,6 +988,27 @@ struct rte_eth_conf {
 
 struct rte_pci_device;
 
+/*
+ * Fallback default preferred Rx/Tx port parameters.
+ * These are used if an application requests default parameters
+ * but the PMD does not provide preferred values.
+ */
+#define RTE_ETH_DEV_FALLBACK_RX_RINGSIZE 512
+#define RTE_ETH_DEV_FALLBACK_TX_RINGSIZE 512
+#define RTE_ETH_DEV_FALLBACK_RX_NBQUEUES 1
+#define RTE_ETH_DEV_FALLBACK_TX_NBQUEUES 1
+
+/**
+ * Preferred Rx/Tx port parameters.
+ * There are separate instances of this structure for transmission
+ * and reception respectively.
+ */
+struct rte_eth_dev_portconf {
+	uint16_t burst_size; /**< Device-preferred burst size */
+	uint16_t ring_size; /**< Device-preferred size of queue rings */
+	uint16_t nb_queues; /**< Device-preferred number of queues */
+};
+
 /**
  * Ethernet device information
  */
@@ -1029,6 +1050,10 @@ struct rte_eth_dev_info {
 	/** Configured number of rx/tx queues */
 	uint16_t nb_rx_queues; /**< Number of RX queues. */
 	uint16_t nb_tx_queues; /**< Number of TX queues. */
+	/** Rx parameter recommendations */
+	struct rte_eth_dev_portconf default_rxportconf;
+	/** Tx parameter recommendations */
+	struct rte_eth_dev_portconf default_txportconf;
 };
 
 /**
-- 
2.9.5

^ permalink raw reply	[relevance 7%]

* [dpdk-dev] [PATCH v6 0/4] ethdev: add per-PMD tuning of RxTx parmeters
    2018-04-06 17:01  0% ` Ferruh Yigit
@ 2018-04-10  9:43  4% ` Remy Horton
  2018-04-10  9:43  7%   ` [dpdk-dev] [PATCH v6 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
  2018-04-10 18:56  0%   ` [dpdk-dev] [PATCH v6 0/4] ethdev: add per-PMD tuning of RxTx parmeters Ferruh Yigit
  1 sibling, 2 replies; 200+ results
From: Remy Horton @ 2018-04-10  9:43 UTC (permalink / raw)
  To: dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

The optimal values of several transmission & reception related parameters,
such as burst sizes, descriptor ring sizes, and number of queues, varies
between different network interface devices. This patchset allows individual
PMDs to specify their preferred parameter values, and if so indicated by an
application, for them to be used automatically by the ethdev layer.

rte_eth_dev_configure() has been changed so that specifying zero for both
nb_rx_q AND nb_tx_q causes it to use driver preferred values, and if these
are not available, falls back to EAL defaults. Setting one (but not both)
to zero does not cause the use of defaults, as having one of them zeroed is
a valid setup.

This patchset includes per-PMD values for e1000 and i40e but it is expected
that subsequent patchsets will cover other PMDs. A deprecation notice
covering the API/ABI change is in place.

Changes in v6:
* Updated/corrected testpmd documentation
* Carried forward acks/review
* Rebased to d218a4d060de

Changes in v5:
* uint_16_t corrected to uint16_t

Changes in v4:
* Added API/ABI change documentation
* Rebased to 78f5a2e93d74

Changes in v3:
* Changed formatting around new rte_eth_dev_info fields
* Added Doxygen documentation to struct rte_eth_dev_portconf
* Testpmd "port config all burst 0" and --burst=0 uses PMD 
  Rx burst recommendations.
* Added to release notes
* Rebased to 8ea081f38161

Changes in v2:
* Rebased to master
* Removed fallback values from rte_eth_dev_info_get()
* Added fallback values to rte_rte_[rt]x_queue_setup()
* Added fallback values to rte_eth_dev_configure()
* Corrected comment
* Removed deprecation notice
* Split RX and Tx into seperate structures
* Changed parameter names


Remy Horton (4):
  ethdev: add support for PMD-tuned Tx/Rx parameters
  net/e1000: add TxRx tuning parameters
  net/i40e: add TxRx tuning parameters
  testpmd: make use of per-PMD TxRx parameters

 app/test-pmd/cmdline.c                 | 31 +++++++++++++++++++++---
 app/test-pmd/parameters.c              | 38 +++++++++++++++++++++++++----
 app/test-pmd/testpmd.c                 |  5 ++--
 doc/guides/rel_notes/deprecation.rst   | 13 ----------
 doc/guides/rel_notes/release_18_05.rst | 35 +++++++++++++++++++++++++++
 doc/guides/testpmd_app_ug/run_app.rst  |  4 +++-
 drivers/net/e1000/em_ethdev.c          |  6 +++++
 drivers/net/i40e/i40e_ethdev.c         | 33 ++++++++++++++++++++++---
 lib/librte_ether/rte_ethdev.c          | 44 +++++++++++++++++++++++++++-------
 lib/librte_ether/rte_ethdev.h          | 25 +++++++++++++++++++
 10 files changed, 198 insertions(+), 36 deletions(-)

-- 
2.9.5

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  @ 2018-04-10  1:59  3%           ` Yongseok Koh
  2018-04-11  0:25  0%             ` Ananyev, Konstantin
  0 siblings, 1 reply; 200+ results
From: Yongseok Koh @ 2018-04-10  1:59 UTC (permalink / raw)
  To: Olivier Matz
  Cc: wenzhuo.lu, jingjing.wu, adrien.mazarguil, nelio.laranjeiro, dev

On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> Hi Yongseok,
> 
> On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > Hi,
> > > 
> > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > rte_pktmbuf_attach_at().
> > > > 
> > > > Possible use-cases could be:
> > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > >   buffers can reference different layers of the encapsulated packet.
> > > > - A large direct mbuf can even contain multiple packets in series and
> > > >   each packet can be referenced by multiple mbuf indirections.
> > > > 
> > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > 
> > > I think the current API is already able to do what you want.
> > > 
> > > 1/ Here is a mbuf m with its data
> > > 
> > >                off
> > >                <-->
> > >                       len
> > >           +----+   <---------->
> > >           |    |
> > >         +-|----v----------------------+
> > >         | |    -----------------------|
> > > m       | buf  |    XXXXXXXXXXX      ||
> > >         |      -----------------------|
> > >         +-----------------------------+
> > > 
> > > 
> > > 2/ clone m:
> > > 
> > >   c = rte_pktmbuf_alloc(pool);
> > >   rte_pktmbuf_attach(c, m);
> > > 
> > >   Note that c has its own offset and length fields.
> > > 
> > > 
> > >                off
> > >                <-->
> > >                       len
> > >           +----+   <---------->
> > >           |    |
> > >         +-|----v----------------------+
> > >         | |    -----------------------|
> > > m       | buf  |    XXXXXXXXXXX      ||
> > >         |      -----------------------|
> > >         +------^----------------------+
> > >                |
> > >           +----+
> > > indirect  |
> > >         +-|---------------------------+
> > >         | |    -----------------------|
> > > c       | buf  |                     ||
> > >         |      -----------------------|
> > >         +-----------------------------+
> > > 
> > >                 off    len
> > >                 <--><---------->
> > > 
> > > 
> > > 3/ remove some data from c without changing m
> > > 
> > >    rte_pktmbuf_adj(c, 10)   // at head
> > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > 
> > > 
> > > Please let me know if it fits your needs.
> > 
> > No, it doesn't.
> > 
> > Trimming head and tail with the current APIs removes data and make the space
> > available. Adjusting packet head means giving more headroom, not shifting the
> > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > difference offsets in m,
> > 
> > rte_pktmbuf_adj(c1, 10);
> > rte_pktmbuf_adj(c2, 20);
> > 
> > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > wants to attach outer header, it will overwrite the headroom even though the
> > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > header should be linked by h1->next = c2.
> 
> Yes, after these operations c1, c2 and m should become read-only. So, to
> prepend headers, another mbuf has to be inserted before as you suggest. It
> is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> length) that will:
>   - alloc and attach indirect mbuf for each segment of m that is
>     in the range [offset : length+offset].
>   - prepend an empty and writable mbuf for the headers
> 
> > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > which actually shrink the headroom, this case can be properly handled.
> 
> What do you mean by properly handled?
> 
> Yes, prepending data or adding data in the indirect mbuf won't override
> the direct mbuf. But prepending data or adding data in the direct mbuf m
> won't be protected.
> 
> From an application point of view, indirect mbufs, or direct mbufs that
> have refcnt != 1, should be both considered as read-only because they
> may share their data. How an application can know if the data is shared
> or not?
> 
> Maybe we need a flag to differentiate mbufs that are read-only
> (something like SHARED_DATA, or simply READONLY). In your case, if my
> understanding is correct, you want to have indirect mbufs with RW data.

Agree that indirect mbuf must be treated as read-only, Then the current code is
enough to handle that use-case.

> > And another use-case (this is my actual use-case) is to make a large mbuf have
> > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > because it transfers multiple packets to a single large buffer to reduce PCIe
> > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > indirect referencing.
> > 
> > Does this make sense?
> 
> I understand the need.
> 
> Another option would be to make the mbuf->buffer point to an external
> buffer (not inside the direct mbuf). This would require to add a
> mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> a quick overview.
> 
> [1] https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdpdksummit.com%2FArchive%2Fpdf%2F2016Userspace%2FDay01-Session05-OlivierMatz-Userspace2016.pdf&data=02%7C01%7Cyskoh%40mellanox.com%7Ca5405edb36e445e6540808d59e339a38%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636588866861082855&sdata=llw%2BwiY5cC56naOUhBbIg8TKtfFN6VZcIRY5PV7VqZs%3D&reserved=0
> 
> The advantage is that it does not require the large data to be inside a
> mbuf (requiring a mbuf structure before the buffer, and requiring to be
> allocated from a mempool). On the other hand, it is maybe more complex
> to implement compared to your solution.

I knew that you presented the slides and frankly, I had considered that option
at first. But even with that option, metadata to store refcnt should also be
allocated and managed anyway. Kernel also maintains the skb_shared_info at the
end of the data segment. Even though it could have smaller metadata structure,
I just wanted to make full use of the existing framework because it is less
complex as you mentioned. Given that you presented the idea of external data
buffer in 2016 and there hasn't been many follow-up discussions/activities so
far, I thought the demand isn't so big yet thus I wanted to make this patch
simpler.  I personally think that we can take the idea of external data seg when
more demands come from users in the future as it would be a huge change and may
break current ABI/API. When the day comes, I'll gladly participate in the
discussions and write codes for it if I can be helpful.

Do you think this patch is okay for now?


Thanks for your comments,
Yongseok

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 17:09  0%         ` Ananyev, Konstantin
@ 2018-04-09 17:26  0%           ` Dumitrescu, Cristian
  2018-04-10 12:32  0%             ` Van Haaren, Harry
  0 siblings, 1 reply; 200+ results
From: Dumitrescu, Cristian @ 2018-04-09 17:26 UTC (permalink / raw)
  To: Ananyev, Konstantin, Van Haaren, Harry, Stephen Hemminger, Singh,
	Jasvinder, Richardson, Bruce
  Cc: dev

> >
> > If people think that this function conversion is not nice, it can be reworked
> in multiple ways at the expense of API (but not ABI) change:
> > 1. Define the hash function field in the table parameter structure as
> opaque void * rather than 4-parameter version.
> > 2. Create a separate parameter structure just for this hash table type.
> 
> Why just not define your f_hash member as a union:
> 
> struct rte_table_hash_params {
> ...
> union {
>     rte_table_hash_op_hash  f_hash_4params;
>     rte_hash_function f_hash_3_params;
> };
> 
> ?
> 

Yes, agreed, this is yet another way to handle this, thanks Konstantin.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 17:02  4%       ` Dumitrescu, Cristian
@ 2018-04-09 17:09  0%         ` Ananyev, Konstantin
  2018-04-09 17:26  0%           ` Dumitrescu, Cristian
  0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2018-04-09 17:09 UTC (permalink / raw)
  To: Dumitrescu, Cristian, Van Haaren, Harry, Stephen Hemminger,
	Singh, Jasvinder, Richardson, Bruce
  Cc: dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu, Cristian
> Sent: Monday, April 9, 2018 6:02 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>; Stephen Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
> <jasvinder.singh@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> 
> 
> 
> > -----Original Message-----
> > From: Van Haaren, Harry
> > Sent: Monday, April 9, 2018 5:38 PM
> > To: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>; Stephen
> > Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
> > <jasvinder.singh@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> >
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu,
> > Cristian
> > > Sent: Monday, April 9, 2018 4:59 PM
> > > To: Stephen Hemminger <stephen@networkplumber.org>; Singh,
> > Jasvinder
> > > <jasvinder.singh@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>
> > > Cc: dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > > > Sent: Monday, April 9, 2018 4:10 PM
> > > > To: Singh, Jasvinder <jasvinder.singh@intel.com>
> > > > Cc: dev@dpdk.org; Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> > > > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> > > >
> > > > On Mon,  9 Apr 2018 13:49:48 +0100
> > > > Jasvinder Singh <jasvinder.singh@intel.com> wrote:
> > > >
> > > > > Fix build error with gcc 8.0 due to cast between function types.
> > > > > Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
> > > > >
> > > > > Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > > > > ---
> > > > >  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
> > > > >  1 file changed, 3 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
> > > > b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > > index dcb4fe9..f7eae27 100644
> > > > > --- a/lib/librte_table/rte_table_hash_cuckoo.c
> > > > > +++ b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > > @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void
> > *params,
> > > > >  		return NULL;
> > > > >  	}
> > > > >
> > > > > +	void *hash_func = p->f_hash;
> > > > > +
> > > > >  	/* Create cuckoo hash table */
> > > > >  	struct rte_hash_parameters hash_cuckoo_params = {
> > > > >  		.entries = p->n_keys,
> > > > >  		.key_len = p->key_size,
> > > > > -		.hash_func = (rte_hash_function)(p->f_hash),
> > > > > +		.hash_func = (rte_hash_function) hash_func,
> > > > >  		.hash_func_init_val = p->seed,
> > > > >  		.socket_id = socket_id,
> > > > >  		.name = p->name
> > > >
> > > > This is just tricking the compiler into not complaining.
> > > > I would really rather see the two hash functions made the same.
> > >
> > > (Adding Bruce as well to consolidate all conversations in a single thread.)
> > >
> > > What we want to do here is be able to use the librte_hash under the same
> > API
> > > as the several hash table flavors implemented in librte_table.
> > >
> > > Both of these libraries allow configuring the hash function per each hash
> > > table instance. Problem is: hash function in librte_hash has only 3
> > parameters
> > > (no key mask), while hash function in librte_table has 4 parameters
> > (includes
> > > key mask). The key mask helps a lot for practical protocol implementations
> > by
> > > avoiding key copy & pre-process on lookup.
> > >
> > > So then: how to plug in librte_hash under the same API as the suite of
> > hash
> > > tables in librte_table? We don't want to re-implement cuckoo hash from
> > > librte_hash, we simply want to invoke it as a low-level primitive, similarly
> > > to how the LPM and ACL tables are plugged into librte_table.
> > >
> > > Solution is: as an exception, pass a 3-parameter hash function to cuckoo
> > hash
> > > flavor under the librte_table. Maybe this should be documented better.
> > This
> > > currently triggers a build warning with gcc 8, which is easy to fix, hence
> > > this trivial patch.
> > >
> > > Ideally, for every 3-parameter hash function, I would like to generate the
> > > corresponding 4-parameter hash function on-the-fly, but unfortunately this
> > is
> > > not what C language can do.
> > >
> > > Of course, IMO the best solution is to add key mask support to librte_hash.
> >
> >
> > Looking at the previous discussion I see the following as a possible solution;
> >
> > Given the current code looks broken it should be fixed in this release.
> 
> The code is not broken. This is not a bug, it is a limitation for that particular table type. The only gap that I see is adding a Doxygen
> comment in the API header file.
> 
> User explicitly picks the hash table type it wants; when using this particular hash table type, that pointer needs to point to a 3-parameter
> function instead of 4. Given the limitation is clearly documented in Doxygen (current gap that we can quickly address), I don't see any
> problem.
> 
> If people think that this function conversion is not nice, it can be reworked in multiple ways at the expense of API (but not ABI) change:
> 1. Define the hash function field in the table parameter structure as opaque void * rather than 4-parameter version.
> 2. Create a separate parameter structure just for this hash table type.

Why just not define your f_hash member as a union:

struct rte_table_hash_params {
...
union {
    rte_table_hash_op_hash  f_hash_4params;
    rte_hash_function f_hash_3_params;
}; 

?

> 
> > Given the actual code fix is an API / ABI break (depending on solution) it
> > cannot be merged official in this release.
> > We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at
> > compile time.
> 
> This is not new code introduced in this release cycle, this is just fixing the compiler warning, I fail to see how your ABI breakage mention is
> applicable.
> 
> Maybe we should talk more specifics over the code, where exactly in the code would you like to place your NEXT_ABI macro?
> 
> >
> > With the above 3 points, I think the best solution is to correctly fix the
> > problem that GCC 8 is identifying, and putting that new API inside the NEXT_
> > macros.
> >
> > In this case, we can preserve backwards (buggy) behavior if required, and
> > provide correct (but API/ABI breaking) code as well. This is a tough decision -
> > particularly for distros - what do they package?
> >
> > Given the current code, I don't see a better solution - but I hope I'm wrong :)

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 16:43  0%       ` Ferruh Yigit
@ 2018-04-09 17:05  0%         ` Dumitrescu, Cristian
  0 siblings, 0 replies; 200+ results
From: Dumitrescu, Cristian @ 2018-04-09 17:05 UTC (permalink / raw)
  To: Yigit, Ferruh, Van Haaren, Harry, Stephen Hemminger, Singh,
	Jasvinder, Richardson, Bruce
  Cc: dev



> -----Original Message-----
> From: Yigit, Ferruh
> Sent: Monday, April 9, 2018 5:43 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>; Dumitrescu, Cristian
> <cristian.dumitrescu@intel.com>; Stephen Hemminger
> <stephen@networkplumber.org>; Singh, Jasvinder
> <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> 
> On 4/9/2018 5:38 PM, Van Haaren, Harry wrote:
> >> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu,
> Cristian
> >> Sent: Monday, April 9, 2018 4:59 PM
> >> To: Stephen Hemminger <stephen@networkplumber.org>; Singh,
> Jasvinder
> >> <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> >> Cc: dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> >>
> >>
> >>
> >>> -----Original Message-----
> >>> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> >>> Sent: Monday, April 9, 2018 4:10 PM
> >>> To: Singh, Jasvinder <jasvinder.singh@intel.com>
> >>> Cc: dev@dpdk.org; Dumitrescu, Cristian
> <cristian.dumitrescu@intel.com>
> >>> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> >>>
> >>> On Mon,  9 Apr 2018 13:49:48 +0100
> >>> Jasvinder Singh <jasvinder.singh@intel.com> wrote:
> >>>
> >>>> Fix build error with gcc 8.0 due to cast between function types.
> >>>> Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
> >>>>
> >>>> Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> >>>> ---
> >>>>  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
> >>>>  1 file changed, 3 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
> >>> b/lib/librte_table/rte_table_hash_cuckoo.c
> >>>> index dcb4fe9..f7eae27 100644
> >>>> --- a/lib/librte_table/rte_table_hash_cuckoo.c
> >>>> +++ b/lib/librte_table/rte_table_hash_cuckoo.c
> >>>> @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void
> *params,
> >>>>  		return NULL;
> >>>>  	}
> >>>>
> >>>> +	void *hash_func = p->f_hash;
> >>>> +
> >>>>  	/* Create cuckoo hash table */
> >>>>  	struct rte_hash_parameters hash_cuckoo_params = {
> >>>>  		.entries = p->n_keys,
> >>>>  		.key_len = p->key_size,
> >>>> -		.hash_func = (rte_hash_function)(p->f_hash),
> >>>> +		.hash_func = (rte_hash_function) hash_func,
> >>>>  		.hash_func_init_val = p->seed,
> >>>>  		.socket_id = socket_id,
> >>>>  		.name = p->name
> >>>
> >>> This is just tricking the compiler into not complaining.
> >>> I would really rather see the two hash functions made the same.
> >>
> >> (Adding Bruce as well to consolidate all conversations in a single thread.)
> >>
> >> What we want to do here is be able to use the librte_hash under the
> same API
> >> as the several hash table flavors implemented in librte_table.
> >>
> >> Both of these libraries allow configuring the hash function per each hash
> >> table instance. Problem is: hash function in librte_hash has only 3
> parameters
> >> (no key mask), while hash function in librte_table has 4 parameters
> (includes
> >> key mask). The key mask helps a lot for practical protocol
> implementations by
> >> avoiding key copy & pre-process on lookup.
> >>
> >> So then: how to plug in librte_hash under the same API as the suite of
> hash
> >> tables in librte_table? We don't want to re-implement cuckoo hash from
> >> librte_hash, we simply want to invoke it as a low-level primitive, similarly
> >> to how the LPM and ACL tables are plugged into librte_table.
> >>
> >> Solution is: as an exception, pass a 3-parameter hash function to cuckoo
> hash
> >> flavor under the librte_table. Maybe this should be documented better.
> This
> >> currently triggers a build warning with gcc 8, which is easy to fix, hence
> >> this trivial patch.
> >>
> >> Ideally, for every 3-parameter hash function, I would like to generate the
> >> corresponding 4-parameter hash function on-the-fly, but unfortunately
> this is
> >> not what C language can do.
> >>
> >> Of course, IMO the best solution is to add key mask support to
> librte_hash.
> >
> >
> > Looking at the previous discussion I see the following as a possible
> solution;
> >
> > Given the current code looks broken it should be fixed in this release.
> > Given the actual code fix is an API / ABI break (depending on solution) it
> cannot be merged official in this release.
> > We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at
> compile time.
> >
> > With the above 3 points, I think the best solution is to correctly fix the
> problem that GCC 8 is identifying, and putting that new API inside the NEXT_
> macros.
> >
> > In this case, we can preserve backwards (buggy) behavior if required, and
> provide correct (but API/ABI breaking) code as well. This is a tough decision -
> particularly for distros - what do they package?
> 
> +1 to use RTE_NEXT_ABI and deliver fixed code, and agree this is kind of
> pushing
> decision to distros.
> 

Again, where is the bug, and where exactly in the code you want to put RTE_NEXT_ABI macro, and what is the problem fixed by using this macro?

As stated in the reply to Harry, this could be reworked in multiple ways if people think the function pointer conversion is misleading to the user.

> >
> > Given the current code, I don't see a better solution - but I hope I'm wrong
> :)
> >


^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 16:38  4%     ` Van Haaren, Harry
  2018-04-09 16:43  0%       ` Ferruh Yigit
@ 2018-04-09 17:02  4%       ` Dumitrescu, Cristian
  2018-04-09 17:09  0%         ` Ananyev, Konstantin
  2018-04-10 11:43  0%       ` Neil Horman
  2 siblings, 1 reply; 200+ results
From: Dumitrescu, Cristian @ 2018-04-09 17:02 UTC (permalink / raw)
  To: Van Haaren, Harry, Stephen Hemminger, Singh, Jasvinder,
	Richardson, Bruce
  Cc: dev



> -----Original Message-----
> From: Van Haaren, Harry
> Sent: Monday, April 9, 2018 5:38 PM
> To: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>; Stephen
> Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
> <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Cc: dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> 
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu,
> Cristian
> > Sent: Monday, April 9, 2018 4:59 PM
> > To: Stephen Hemminger <stephen@networkplumber.org>; Singh,
> Jasvinder
> > <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> >
> >
> >
> > > -----Original Message-----
> > > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > > Sent: Monday, April 9, 2018 4:10 PM
> > > To: Singh, Jasvinder <jasvinder.singh@intel.com>
> > > Cc: dev@dpdk.org; Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> > > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> > >
> > > On Mon,  9 Apr 2018 13:49:48 +0100
> > > Jasvinder Singh <jasvinder.singh@intel.com> wrote:
> > >
> > > > Fix build error with gcc 8.0 due to cast between function types.
> > > > Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
> > > >
> > > > Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > > > ---
> > > >  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
> > > >  1 file changed, 3 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
> > > b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > index dcb4fe9..f7eae27 100644
> > > > --- a/lib/librte_table/rte_table_hash_cuckoo.c
> > > > +++ b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void
> *params,
> > > >  		return NULL;
> > > >  	}
> > > >
> > > > +	void *hash_func = p->f_hash;
> > > > +
> > > >  	/* Create cuckoo hash table */
> > > >  	struct rte_hash_parameters hash_cuckoo_params = {
> > > >  		.entries = p->n_keys,
> > > >  		.key_len = p->key_size,
> > > > -		.hash_func = (rte_hash_function)(p->f_hash),
> > > > +		.hash_func = (rte_hash_function) hash_func,
> > > >  		.hash_func_init_val = p->seed,
> > > >  		.socket_id = socket_id,
> > > >  		.name = p->name
> > >
> > > This is just tricking the compiler into not complaining.
> > > I would really rather see the two hash functions made the same.
> >
> > (Adding Bruce as well to consolidate all conversations in a single thread.)
> >
> > What we want to do here is be able to use the librte_hash under the same
> API
> > as the several hash table flavors implemented in librte_table.
> >
> > Both of these libraries allow configuring the hash function per each hash
> > table instance. Problem is: hash function in librte_hash has only 3
> parameters
> > (no key mask), while hash function in librte_table has 4 parameters
> (includes
> > key mask). The key mask helps a lot for practical protocol implementations
> by
> > avoiding key copy & pre-process on lookup.
> >
> > So then: how to plug in librte_hash under the same API as the suite of
> hash
> > tables in librte_table? We don't want to re-implement cuckoo hash from
> > librte_hash, we simply want to invoke it as a low-level primitive, similarly
> > to how the LPM and ACL tables are plugged into librte_table.
> >
> > Solution is: as an exception, pass a 3-parameter hash function to cuckoo
> hash
> > flavor under the librte_table. Maybe this should be documented better.
> This
> > currently triggers a build warning with gcc 8, which is easy to fix, hence
> > this trivial patch.
> >
> > Ideally, for every 3-parameter hash function, I would like to generate the
> > corresponding 4-parameter hash function on-the-fly, but unfortunately this
> is
> > not what C language can do.
> >
> > Of course, IMO the best solution is to add key mask support to librte_hash.
> 
> 
> Looking at the previous discussion I see the following as a possible solution;
> 
> Given the current code looks broken it should be fixed in this release.

The code is not broken. This is not a bug, it is a limitation for that particular table type. The only gap that I see is adding a Doxygen comment in the API header file.

User explicitly picks the hash table type it wants; when using this particular hash table type, that pointer needs to point to a 3-parameter function instead of 4. Given the limitation is clearly documented in Doxygen (current gap that we can quickly address), I don't see any problem.

If people think that this function conversion is not nice, it can be reworked in multiple ways at the expense of API (but not ABI) change:
1. Define the hash function field in the table parameter structure as opaque void * rather than 4-parameter version.
2. Create a separate parameter structure just for this hash table type.

> Given the actual code fix is an API / ABI break (depending on solution) it
> cannot be merged official in this release.
> We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at
> compile time.

This is not new code introduced in this release cycle, this is just fixing the compiler warning, I fail to see how your ABI breakage mention is applicable.

Maybe we should talk more specifics over the code, where exactly in the code would you like to place your NEXT_ABI macro?

> 
> With the above 3 points, I think the best solution is to correctly fix the
> problem that GCC 8 is identifying, and putting that new API inside the NEXT_
> macros.
> 
> In this case, we can preserve backwards (buggy) behavior if required, and
> provide correct (but API/ABI breaking) code as well. This is a tough decision -
> particularly for distros - what do they package?
> 
> Given the current code, I don't see a better solution - but I hope I'm wrong :)

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 16:38  4%     ` Van Haaren, Harry
@ 2018-04-09 16:43  0%       ` Ferruh Yigit
  2018-04-09 17:05  0%         ` Dumitrescu, Cristian
  2018-04-09 17:02  4%       ` Dumitrescu, Cristian
  2018-04-10 11:43  0%       ` Neil Horman
  2 siblings, 1 reply; 200+ results
From: Ferruh Yigit @ 2018-04-09 16:43 UTC (permalink / raw)
  To: Van Haaren, Harry, Dumitrescu, Cristian, Stephen Hemminger,
	Singh, Jasvinder, Richardson, Bruce
  Cc: dev

On 4/9/2018 5:38 PM, Van Haaren, Harry wrote:
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu, Cristian
>> Sent: Monday, April 9, 2018 4:59 PM
>> To: Stephen Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
>> <jasvinder.singh@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
>> Cc: dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
>>
>>
>>
>>> -----Original Message-----
>>> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
>>> Sent: Monday, April 9, 2018 4:10 PM
>>> To: Singh, Jasvinder <jasvinder.singh@intel.com>
>>> Cc: dev@dpdk.org; Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
>>> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
>>>
>>> On Mon,  9 Apr 2018 13:49:48 +0100
>>> Jasvinder Singh <jasvinder.singh@intel.com> wrote:
>>>
>>>> Fix build error with gcc 8.0 due to cast between function types.
>>>> Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
>>>>
>>>> Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
>>>> ---
>>>>  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
>>>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
>>> b/lib/librte_table/rte_table_hash_cuckoo.c
>>>> index dcb4fe9..f7eae27 100644
>>>> --- a/lib/librte_table/rte_table_hash_cuckoo.c
>>>> +++ b/lib/librte_table/rte_table_hash_cuckoo.c
>>>> @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void *params,
>>>>  		return NULL;
>>>>  	}
>>>>
>>>> +	void *hash_func = p->f_hash;
>>>> +
>>>>  	/* Create cuckoo hash table */
>>>>  	struct rte_hash_parameters hash_cuckoo_params = {
>>>>  		.entries = p->n_keys,
>>>>  		.key_len = p->key_size,
>>>> -		.hash_func = (rte_hash_function)(p->f_hash),
>>>> +		.hash_func = (rte_hash_function) hash_func,
>>>>  		.hash_func_init_val = p->seed,
>>>>  		.socket_id = socket_id,
>>>>  		.name = p->name
>>>
>>> This is just tricking the compiler into not complaining.
>>> I would really rather see the two hash functions made the same.
>>
>> (Adding Bruce as well to consolidate all conversations in a single thread.)
>>
>> What we want to do here is be able to use the librte_hash under the same API
>> as the several hash table flavors implemented in librte_table.
>>
>> Both of these libraries allow configuring the hash function per each hash
>> table instance. Problem is: hash function in librte_hash has only 3 parameters
>> (no key mask), while hash function in librte_table has 4 parameters (includes
>> key mask). The key mask helps a lot for practical protocol implementations by
>> avoiding key copy & pre-process on lookup.
>>
>> So then: how to plug in librte_hash under the same API as the suite of hash
>> tables in librte_table? We don't want to re-implement cuckoo hash from
>> librte_hash, we simply want to invoke it as a low-level primitive, similarly
>> to how the LPM and ACL tables are plugged into librte_table.
>>
>> Solution is: as an exception, pass a 3-parameter hash function to cuckoo hash
>> flavor under the librte_table. Maybe this should be documented better. This
>> currently triggers a build warning with gcc 8, which is easy to fix, hence
>> this trivial patch.
>>
>> Ideally, for every 3-parameter hash function, I would like to generate the
>> corresponding 4-parameter hash function on-the-fly, but unfortunately this is
>> not what C language can do.
>>
>> Of course, IMO the best solution is to add key mask support to librte_hash.
> 
> 
> Looking at the previous discussion I see the following as a possible solution;
> 
> Given the current code looks broken it should be fixed in this release.
> Given the actual code fix is an API / ABI break (depending on solution) it cannot be merged official in this release.
> We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at compile time.
> 
> With the above 3 points, I think the best solution is to correctly fix the problem that GCC 8 is identifying, and putting that new API inside the NEXT_ macros.
> 
> In this case, we can preserve backwards (buggy) behavior if required, and provide correct (but API/ABI breaking) code as well. This is a tough decision - particularly for distros - what do they package?

+1 to use RTE_NEXT_ABI and deliver fixed code, and agree this is kind of pushing
decision to distros.

> 
> Given the current code, I don't see a better solution - but I hope I'm wrong :)
> 

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  @ 2018-04-09 16:38  4%     ` Van Haaren, Harry
  2018-04-09 16:43  0%       ` Ferruh Yigit
                         ` (2 more replies)
  0 siblings, 3 replies; 200+ results
From: Van Haaren, Harry @ 2018-04-09 16:38 UTC (permalink / raw)
  To: Dumitrescu, Cristian, Stephen Hemminger, Singh, Jasvinder,
	Richardson, Bruce
  Cc: dev

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu, Cristian
> Sent: Monday, April 9, 2018 4:59 PM
> To: Stephen Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
> <jasvinder.singh@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> 
> 
> 
> > -----Original Message-----
> > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > Sent: Monday, April 9, 2018 4:10 PM
> > To: Singh, Jasvinder <jasvinder.singh@intel.com>
> > Cc: dev@dpdk.org; Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> >
> > On Mon,  9 Apr 2018 13:49:48 +0100
> > Jasvinder Singh <jasvinder.singh@intel.com> wrote:
> >
> > > Fix build error with gcc 8.0 due to cast between function types.
> > > Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
> > >
> > > Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > > ---
> > >  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
> > >  1 file changed, 3 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
> > b/lib/librte_table/rte_table_hash_cuckoo.c
> > > index dcb4fe9..f7eae27 100644
> > > --- a/lib/librte_table/rte_table_hash_cuckoo.c
> > > +++ b/lib/librte_table/rte_table_hash_cuckoo.c
> > > @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void *params,
> > >  		return NULL;
> > >  	}
> > >
> > > +	void *hash_func = p->f_hash;
> > > +
> > >  	/* Create cuckoo hash table */
> > >  	struct rte_hash_parameters hash_cuckoo_params = {
> > >  		.entries = p->n_keys,
> > >  		.key_len = p->key_size,
> > > -		.hash_func = (rte_hash_function)(p->f_hash),
> > > +		.hash_func = (rte_hash_function) hash_func,
> > >  		.hash_func_init_val = p->seed,
> > >  		.socket_id = socket_id,
> > >  		.name = p->name
> >
> > This is just tricking the compiler into not complaining.
> > I would really rather see the two hash functions made the same.
> 
> (Adding Bruce as well to consolidate all conversations in a single thread.)
> 
> What we want to do here is be able to use the librte_hash under the same API
> as the several hash table flavors implemented in librte_table.
> 
> Both of these libraries allow configuring the hash function per each hash
> table instance. Problem is: hash function in librte_hash has only 3 parameters
> (no key mask), while hash function in librte_table has 4 parameters (includes
> key mask). The key mask helps a lot for practical protocol implementations by
> avoiding key copy & pre-process on lookup.
> 
> So then: how to plug in librte_hash under the same API as the suite of hash
> tables in librte_table? We don't want to re-implement cuckoo hash from
> librte_hash, we simply want to invoke it as a low-level primitive, similarly
> to how the LPM and ACL tables are plugged into librte_table.
> 
> Solution is: as an exception, pass a 3-parameter hash function to cuckoo hash
> flavor under the librte_table. Maybe this should be documented better. This
> currently triggers a build warning with gcc 8, which is easy to fix, hence
> this trivial patch.
> 
> Ideally, for every 3-parameter hash function, I would like to generate the
> corresponding 4-parameter hash function on-the-fly, but unfortunately this is
> not what C language can do.
> 
> Of course, IMO the best solution is to add key mask support to librte_hash.


Looking at the previous discussion I see the following as a possible solution;

Given the current code looks broken it should be fixed in this release.
Given the actual code fix is an API / ABI break (depending on solution) it cannot be merged official in this release.
We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at compile time.

With the above 3 points, I think the best solution is to correctly fix the problem that GCC 8 is identifying, and putting that new API inside the NEXT_ macros.

In this case, we can preserve backwards (buggy) behavior if required, and provide correct (but API/ABI breaking) code as well. This is a tough decision - particularly for distros - what do they package?

Given the current code, I don't see a better solution - but I hope I'm wrong :)

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions
  2018-04-06 20:26  2%   ` Adrien Mazarguil
@ 2018-04-09 16:10  0%     ` Mohammad Abdul Awal
  2018-04-10 10:19  0%       ` Adrien Mazarguil
  2018-04-17 14:58  3%     ` Doherty, Declan
  1 sibling, 1 reply; 200+ results
From: Mohammad Abdul Awal @ 2018-04-09 16:10 UTC (permalink / raw)
  To: Adrien Mazarguil, Declan Doherty; +Cc: dev



On 06/04/2018 21:26, Adrien Mazarguil wrote:
> On Fri, Apr 06, 2018 at 01:24:01PM +0100, Declan Doherty wrote:
>> Add new flow action types and associated action data structures to
>> support the encapsulation and decapsulation of the virtual tunnel
>> endpoints.
>>
>> The RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP action will cause the matching
>> flow to be encapsulated in the virtual tunnel endpoint overlay
>> defined in the tunnel_encap action data.
>>
>> The RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP action will cause all virtual
>> tunnel endpoint overlays up to and including the first instance of
>> the flow item type defined in the tunnel_decap action data for the
>> matching flows.
>>
>> Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> This generic approach looks flexible enough to cover the use cases that
> immediately come to mind (VLAN, VXLAN), its design is sound.
>
> However, while I'm aware it's not a concern at this point, it won't be able
> to deal with stateful tunnel or encapsulation types (e.g. IPsec or TCP)
> which will require additional meta data or some run-time assistance from the
> application.
>
> Eventually for more complex use cases, dedicated encap/decap actions will
> have to appear, so the issue I wanted to raise before going further is this:
>
> Going generic inevitably trades some of the usability; flat structures
> dedicated to VXLAN encap/decap with only the needed info to get the job done
> would likely be easier to implement in PMDs and use in applications. Any
> number of such actions can be added to rte_flow without ABI impact.
>
> If VXLAN is the only use case at this point, my suggestion would be to go
> with simpler RTE_FLOW_ACTION_TYPE_VXLAN_(ENCAP|DECAP) actions, with fixed
> L2/L3/L4/L5 header definitions to prepend according to RFC 7348.
We can go this way and this will increase the action for more and more 
tunneling protocols being added. Current proposal is already a generic 
approach which specifies as a tunnel for all the tunneling protocols.

> Now we can start with the generic approach, see how it fares and add
> dedicated encap/decap later as needed.
>
> More comments below.
>
>> ---
>>   doc/guides/prog_guide/rte_flow.rst | 77 ++++++++++++++++++++++++++++++++--
>>   lib/librte_ether/rte_flow.h        | 84 ++++++++++++++++++++++++++++++++++++--
>>   2 files changed, 155 insertions(+), 6 deletions(-)
>>
>> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
>> index fd33d19..106fb93 100644
>> --- a/doc/guides/prog_guide/rte_flow.rst
>> +++ b/doc/guides/prog_guide/rte_flow.rst
>> @@ -997,9 +997,11 @@ Actions
>>   
>>   Each possible action is represented by a type. Some have associated
>>   configuration structures. Several actions combined in a list can be assigned
>> -to a flow rule. That list is not ordered.
>> +to a flow rule. That list is not ordered, with the exception of  actions which
>> +modify the packet itself, these packet modification actions must be specified
>> +in the explicit order in which they are to be executed.
>>   
>> -They fall in three categories:
>> +They fall in four categories:
>>   
>>   - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
>>     processing matched packets by subsequent flow rules, unless overridden
>> @@ -1008,8 +1010,11 @@ They fall in three categories:
>>   - Non-terminating actions (PASSTHRU, DUP) that leave matched packets up for
>>     additional processing by subsequent flow rules.
>>   
>> +- Non-terminating meta actions that do not affect the fate of packets but result
>> +  in modification of the packet itself (SECURITY, TUNNEL_ENCAP, TUNNEL_DECAP).
>> +
>>   - Other non-terminating meta actions that do not affect the fate of packets
>> -  (END, VOID, MARK, FLAG, COUNT, SECURITY).
>> +  (END, VOID, MARK, FLAG, COUNT).
> The above changes are not necessary anymore [1][2].
>
> [1] "ethdev: clarify flow API pattern items and actions"
>      https://dpdk.org/ml/archives/dev/2018-April/095776.html
> [2] "ethdev: alter behavior of flow API actions"
>      https://dpdk.org/ml/archives/dev/2018-April/095779.html
OK, we can undo some changes here.
>
>>   When several actions are combined in a flow rule, they should all have
>>   different types (e.g. dropping a packet twice is not possible).
>> @@ -1486,6 +1491,72 @@ fields in the pattern items.
>>      | 1     | END      |
>>      +-------+----------+
>>   
>> +
> Nit: titles in this file are separated by a single empty line.
Fixed.
>
>> +Action: ``TUNNEL_ENCAP``
>> +^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +Performs an encapsulation action by encapsulating the flows matched by the
>> +pattern items according to the network overlay defined in the
>> +``rte_flow_action_tunnel_encap`` pattern items.
>> +
>> +This action modifies the payload of matched flows. The pattern items specified
>> +in the ``rte_flow_action_tunnel_encap`` action structure must defined a valid
>> +set of overlay headers, from the Ethernet header up to the overlay header. The
>> +pattern must be terminated with the RTE_FLOW_ITEM_TYPE_END item type.
> Regarding the use of a pattern list, if you consider PMDs are already
> iterating on a list of actions when encountering
> RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, it adds yet another inner loop.
We understand that it is implementation specifics. If we do not go for 
another inner loop, all the bundling need to be handled in the same 
function, which seems more clumsy to me. This also breaks the tunnel 
endpoint concept.
>
> How about making each encountered RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP provide
> exactly one item instead (in encap, i.e. reverse order)?
Again, if we have tunnel action, security action, and other actions, all 
the processing and tracking need to be done in one function. Now we will 
need ETH_ENCAP/DECAP, UDP_ENCAP/DECAP, NVGRE_ENCAP/DECAP, etc.
>
> In which case perhaps "GENERIC" would be a better fit than "TUNNEL".
>
>> +
>> +- Non-terminating by default.
> There's no such property anymore [2].
Removed.
>
>> +
>> +.. _table_rte_flow_action_tunnel_encap:
>> +
>> +.. table:: TUNNEL_ENCAP
>> +
>> +   +-------------+---------------------------------------------+
>> +   | Field       | Value                                       |
>> +   +=============+=============================================+
>> +   | ``pattern`` | Virtual tunnel end-point pattern definition |
>> +   +-------------+---------------------------------------------+
>> +
>> +
>> +.. _table_rte_flow_action_tunnel_encap_example:
>> +
>> +.. table:: IPv4 VxLAN flow pattern example.
> VxLAN => VXLAN
Fixed.
>
>> +
>> +   +-------+--------------------------+------------+
>> +   | Index | Flow Item Type           | Flow Item  |
>> +   +=======+==========================+============+
>> +   | 0     | RTE_FLOW_ITEM_TYPE_ETH   | eth item   |
>> +   +-------+--------------------------+------------+
>> +   | 1     | RTE_FLOW_ITEM_TYPE_IPV4  | ipv4 item  |
>> +   +-------+--------------------------+------------+
>> +   | 2     | RTE_FLOW_ITEM_TYPE_UDP   | udp item   |
>> +   +-------+--------------------------+------------+
>> +   | 3     | RTE_FLOW_ITEM_TYPE_VXLAN | vxlan item |
>> +   +-------+--------------------------+------------+
>> +   | 4     | RTE_FLOW_ITEM_TYPE_END   | NULL       |
>> +   +-------+--------------------------+------------+
> One possible issue is that it relies on objects normally found on the
> pattern side of flow rules. Those are supposed to match something, they are
> not intended for packet header generation. While their "spec" and "mask"
> fields might make sense in this context, the "last" field is odd.
>
> You must define them without leaving anything open for interpretation by
> PMDs and users alike. Defining things as "undefined" is fine as long as it's
> covered.
Please note that the "void *item" in the 
"rte_flow_action_tunnel_encap.pattern" points to the data structure 
defined for the corresponding rte_flow_item_type instead of a 
rte_flow_item structure. As an example, for the rte_flow_item_eth type, 
the "void *item" will point to a "struct rte_flow_item_eth" instance. 
Thats why we have defined struct rte_flow_action_item inside struct 
rte_flow_action_tunnel_encap. So, no question of spec, mask, last anymore.

>
>> +
>> +
> Nit: only one empty line necessary here.
Fixed.
>
>> +Action: ``TUNNEL_DECAP``
>> +^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +Performs a decapsulation action by stripping all headers of the virtual tunnel
>> +end-point overlay up to the header defined by the flow item type of flows
>> +matched by the pattern items.
> Not necessarily, for instance if one guarantees that flowing traffic only
> consists of decap'able packets. You must avoid mandatory dependencies
> between patterns and actions since they are normally unrelated.
>
> What you can document on the other hand is that the behavior is undefined
> when processing traffic on which the action can't be applied. This is
> how RSS level is documented [3].
>
> [3] https://dpdk.org/ml/archives/dev/2018-April/095783.html
Fixed per your suggestion.
>
>> +
>> +This action modifies the payload of matched flows. The flow item type specified
>> +in the ``rte_flow_action_tunnel_decap`` action structure must defined a valid
>> +set of overlay header type.
>> +
>> +- Non-terminating by default.
> See [2].
Removed.
>
>> +
>> +.. _table_rte_flow_action_tunnel_decap:
>> +
>> +   +---------------+----------------------------------------------+
>> +   | Field         | Value                                        |
>> +   +===============+==============================================+
>> +   | ``item type`` | Item type of tunnel end-point to decapsulate |
>> +   +---------------+----------------------------------------------+
> "item type" should be the exact name used in the structure.
Fixed.
>
>> +
>>   Negative types
>>   ~~~~~~~~~~~~~~
>>   
>> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
>> index 7d1f89d..6d94423 100644
>> --- a/lib/librte_ether/rte_flow.h
>> +++ b/lib/librte_ether/rte_flow.h
>> @@ -854,14 +854,17 @@ struct rte_flow_item {
>>   	const void *mask; /**< Bit-mask applied to spec and last. */
>>   };
>>   
>> +
> Unnecessary empty line.
Fixed.
>
>>   /**
>>    * Action types.
>>    *
>>    * Each possible action is represented by a type. Some have associated
>>    * configuration structures. Several actions combined in a list can be
>> - * affected to a flow rule. That list is not ordered.
>> + * affected to a flow rule. That list is not ordered, with the exception of
>> + * actions which modify the packet itself, these packet modification actions
>> + * must be specified in the explicit order in which they are to be executed.
>>    *
>> - * They fall in three categories:
>> + * They fall in four categories:
>>    *
>>    * - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
>>    *   processing matched packets by subsequent flow rules, unless overridden
>> @@ -870,6 +873,10 @@ struct rte_flow_item {
>>    * - Non terminating actions (PASSTHRU, DUP) that leave matched packets up
>>    *   for additional processing by subsequent flow rules.
>>    *
>> + * - Non terminating meta actions that do not affect the fate of
>> + *   packets but result in modification of the packet itself (SECURITY,
>> + *   TUNNEL_ENCAP, TUNNEL_DECAP).
>> + *
> Same comment as above [1][2].
Will be revised and undo.
>
>>    * - Other non terminating meta actions that do not affect the fate of
>>    *   packets (END, VOID, MARK, FLAG, COUNT).
>>    *
>> @@ -1022,7 +1029,42 @@ enum rte_flow_action_type {
>>   	 *
>>   	 * See struct rte_flow_action_group_count.
>>   	 */
>> -	RTE_FLOW_ACTION_TYPE_GROUP_COUNT
>> +	RTE_FLOW_ACTION_TYPE_GROUP_COUNT,
> An empty line would have been needed here (if we agree about no more
> GROUP_COUNT.)
Fixed.
>
>> +	/**
>> +	 * Encapsulate flow with tunnel defined in
>> +	 * rte_flow_action_tunnel_encap structure.
>> +	 *
>> +	 * See struct rte_flow_action_tunnel_encap.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP,
>> +
>> +	/**
>> +	 * Decapsulate all the headers of the tunnel
>> +	 *
>> +	 * See struct rte_flow_action_tunnel_decap.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP,
>> +
>> +	/**
>> +	 * Redirects packets to the logical group of the current device.
>> +	 *
>> +	 * In a logical hierarchy of groups, which can be used to represent a
>> +	 * physical of logical chaining of flow tables, this action allows the
>> +	 * terminating action to be a logical group of the same device.
>> +	 *
>> +	 * See struct rte_flow_action_group.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_GROUP,
>> +
>> +	/**
>> +	 * [META]
>> +	 *
>> +	 * Set specific metadata field associated with packet which is then
>> +	 * available to further pipeline stages.
>> +	 *
>> +	 * See struct rte_flow_action_metadata.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_METADATA
> These two actions should be part of the next patch, I won't comment them
> here.
It was my mistake. I will fix them.
>
>>   };
>>   
>>   /**
>> @@ -1173,6 +1215,42 @@ struct rte_flow_action_group_count {
>>   };
>>   
>>   /**
>> + * RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP
>> + *
>> + * Virtual tunnel end-point encapsulation action data.
>> + *
>> + * Non-terminating action by default.
> See [2].
Fixed.
>
>> + */
>> +struct rte_flow_action_tunnel_encap {
>> +	struct rte_flow_action_item {
>> +		enum rte_flow_item_type type;
>> +		/**< Flow item type. */
>> +		const void *item;
>> +		/**< Flow item definition which points to the data of
>> +		 * corresponding rte_flow_item_type.
>> +		 */
> I see it's a new action type, albeit a bit confusing (there is no
> RTE_FLOW_ACTION_TYPE_ITEM).
>
> I suggest the standard pattern item type since you're going with enum
> rte_flow_item_type anyway. Keep in mind you need some kind of mask to tell
> what fields are relevant. An application might otherwise want to encap with
> unsupported properties (e.g. specific IPv4 ToS field and whatnot).
>
> How about a single "struct rte_flow_pattern_item item", neither const and
> neither a pointer. It's generic enough, enclosed spec/last/mask pointers
> take care of the specifics. You just need to define what's supposed to
> happen when "last" is set.
Please see the comment above regarding this field.

>
>> +	} *pattern;
>> +	/**<
>> +	 * Tunnel pattern specification (list terminated by the END pattern
>> +	 * item).
>> +	 */
> As previously suggested, how about a single item per encap?
Please see the comment above regarding this field.
>
>> +};
>> +
>> +/**
>> + * RTE_FLOW_ACTION_TYP_TUNNEL_DECAP
>> + *
>> + * Virtual tunnel end-point decapsulation action data.
>> + *
>> + * Non-terminating action by default.
>> + */
>> +struct rte_flow_action_tunnel_decap {
>> +	enum rte_flow_item_type type;
>> +	/**<
>> +	 * Flow item type of virtual tunnel end-point to be decapsulated
>> +	 */
>> +};
> Note that contrary to ENCAP, DECAP wouldn't necessarily need repeated
> actions to peel each layer off. The current definition is fine.
To clarify, the the decap is upto the PMD to remove all the header for a 
specified type. For example, for

rte_flow_item_type type=RTE_FLOW_ITEM_TYPE_VXLAN, the PMD will peel off (ETH, IPV4, UDP, VXLAN) header all together.

>
>> +
>> +/**
>>    * Definition of a single action.
>>    *
>>    * A list of actions is terminated by a END action.
>> -- 
>> 2.7.4
>>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 1/4] ethdev: add group counter support to rte_flow
  2018-04-09 14:22  0%     ` Mohammad Abdul Awal
@ 2018-04-09 15:23  0%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-09 15:23 UTC (permalink / raw)
  To: Mohammad Abdul Awal; +Cc: Declan Doherty, dev

Hi Mohammad,

On Mon, Apr 09, 2018 at 03:22:45PM +0100, Mohammad Abdul Awal wrote:
> Hi Adrien,
> 
> 
> On 06/04/2018 21:26, Adrien Mazarguil wrote:
> > On Fri, Apr 06, 2018 at 01:24:00PM +0100, Declan Doherty wrote:
> > > Add new RTE_FLOW_ACTION_TYPE_GROUP_COUNT action type to enable shared
> > > counters across multiple flows on a single port or across multiple
> > > flows on multiple ports within the same switch domain.
> > > 
> > > Introduce new API rte_flow_query_group_count to allow querying of group
> > > counters.
> > > 
> > > Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> > Both features are definitely needed, however I suggest to enhance the
> > existing action type and query function instead, given the rte_flow ABI
> > won't be maintained for the 18.05 release [1].
> > 
> > Counters and query support were defined as a kind of PoC in preparation for
> > future requirements back in DPDK 17.02 and so far few PMDs have implemented
> > the query callback (mlx5 and failsafe, and the latter isn't really a PMD).
> > 
> > Due to the behavior change of action lists [2], providing an action type as
> > a query parameter is not specific enough anymore, for instance if a list
> > contains multiple COUNT, the application should be able to tell which needs
> > to be queried.
> > 
> > Therefore I suggest to redefine the query function as follows:
> > 
> >   int
> >   rte_flow_query(uint16_t port_id,
> >                  struct rte_flow *flow,
> >                  const struct rte_flow_action *action,
> >                  void *data,
> >                  struct rte_flow_error *error);
> > 
> > Third argument is an action definition with the same configuration (if any)
> > as previously defined in the action list originally used to create the flow
> > rule (not necessarily the same pointer, only the contents matter).
> > 
> > It means two perfectly identical actions can't be distinguished, and that's
> > how group counters will work.
> > Instead of adding a new action type to distinguish groups, a configuration
> > structure is added to the existing RTE_FLOW_ACTION_TYPE_COUNT, with
> > non-shared counters as a default behavior:
> > 
> >   struct rte_flow_action_count {
> >           uint32_t shared:1; /**< Share counter ID with other flow rules. */
> >           uint32_t reserved:31; /**< Reserved, must be zero. */
> >           uint32_t id; /**< Counter ID. */
> >   };
> > 
> > Doing so will impact some existing code in mlx5 and librte_flow_classify,
> > but that shouldn't be much of an issue.
> > 
> > Keep in mind testpmd and its documentation must be updated as well.
> > 
> > Thoughts?
> Please correct me if I am wrong but I think we are talking two different
> things here.
> If I understood you correctly, you are proposing to pass a list of actions
> (instead if a action type) in the third parameter to perform multiple
> actions in the same query call. Lets take an example for 100 ingress flows.
> So, if we want to query the counter for all the flows, we can get them by a
> single query providing a list (of size 100) of action_count in the 3rd
> param.

Whoa no! I'm only suggesting a pointer to a single action as a replacement
for the basic action type, not a *list* of actions. I hope this addresses
all concerns :)

The fact the action in question would refer to a shared counter (see struct
above) with a given ID would be enough to make all counters with the same ID
refer to the same internal PMD object.

> On the other hand, we are saying that all the flows are belongs to same
> tunnel end-point (we are talking only 1 TEP here), then the PMD will be
> responsible to increment the counter of TEP for matching all the flows (100
> flows). So, using one group query by passing one action_count in 3rd param,
> we can get the count of the TEP.
> 
> This case is generic enough for sure for simple flows but may not be
> suitable for tunnel cases, as application needs to track the counters for
> all the flows, and needs to build the list of action each time the flows
> added/deleted.

I think we're on the same page. I'm only suggesting to define a
configuration structure for the COUNT action (which currently doesn't take
any) as a replacement for GROUP_COUNT, and tweak the query callback to be
able to tell a specific counter to query instead of adding a new query
callback and leaving the existing one broken by design.

> > A few nits below for the sake of commenting.
> > 
> > [1] "Flow API overhaul for switch offloads"
> >      http://dpdk.org/ml/archives/dev/2018-April/095774.html
> > [2] "ethdev: alter behavior of flow API actions"
> >      http://dpdk.org/ml/archives/dev/2018-April/095779.html

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 14/15] ethdev: add physical port action to flow API
  2018-04-07  9:51  0%     ` Andrew Rybchenko
@ 2018-04-09 15:00  0%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-09 15:00 UTC (permalink / raw)
  To: Andrew Rybchenko; +Cc: Thomas Monjalon, Ferruh Yigit, dev, Zhang, Qi Z

On Sat, Apr 07, 2018 at 12:51:40PM +0300, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> > This patch adds the missing action counterpart to the PHY_PORT pattern
> > item, that is, the ability to directly inject matching traffic into a
> > physical port of the underlying device.
> 
> Does it mean that if it is applied on ingress (incoming packet from network)
> it will simply send packets back to network (specified physical port)?

Precisely.

> And if it is applied on egress (outgoing from device to network) it will
> be directed to possibly different physical port and sent to network.

Right, note it gives applications the ability to express that wish, the fact
PMDs support it is another matter :)

In any case, this action is added for API completeness but should be rarely
necessary since we chose to go with port representors.

Port representors will expose valid DPDK port IDs, therefore applications
will simply have to create ingress/egress flow rules on the right DPDK port
targeting different port IDs through the PORT_ID action.

> > It breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_copy()
> > - rte_flow_create()
> > - rte_flow_query()
> > - rte_flow_validate()
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
> > ---
> >   app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
> >   app/test-pmd/config.c                       |  1 +
> >   doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
> >   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
> >   lib/librte_ether/rte_flow.c                 |  1 +
> >   lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
> >   6 files changed, 84 insertions(+)
> 
> <...>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 12/15] ethdev: update behavior of VF/PF in flow API
  2018-04-07  9:41  0%     ` Andrew Rybchenko
@ 2018-04-09 14:49  0%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-09 14:49 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Ajit Khaparde, Somnath Kotur,
	Beilei Xing, Qi Zhang

On Sat, Apr 07, 2018 at 12:41:17PM +0300, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> > Contrary to all other pattern items, these are inconsistently documented as
> > affecting traffic instead of simply matching its origin, without provision
> > for the latter.
> > 
> > This commit clarifies documentation and updates PMDs since the original
> > behavior now has to be explicitly requested using the new transfer
> > attribute.
> > 
> > It breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_create()
> > - rte_flow_validate()
> > 
> > Impacted PMDs are bnxt and i40e, for which the VF pattern item is now only
> > supported when a transfer attribute is also present.
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> > Cc: Somnath Kotur <somnath.kotur@broadcom.com>
> > Cc: Beilei Xing <beilei.xing@intel.com>
> > Cc: Qi Zhang <qi.z.zhang@intel.com>
> > ---
> >   app/test-pmd/cmdline_flow.c                 | 12 +++---
> >   doc/guides/prog_guide/rte_flow.rst          | 36 +++++++++---------
> >   doc/guides/testpmd_app_ug/testpmd_funcs.rst | 12 +++---
> >   drivers/net/bnxt/bnxt_filter.c              | 22 ++++++-----
> >   drivers/net/i40e/i40e_flow.c                | 23 +++++++-----
> >   lib/librte_ether/rte_flow.h                 | 47 ++++++++++--------------
> >   6 files changed, 77 insertions(+), 75 deletions(-)
> 
> <...>
> 
> > diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> > index 735ce6323..beedc713b 100644
> > --- a/doc/guides/prog_guide/rte_flow.rst
> > +++ b/doc/guides/prog_guide/rte_flow.rst
> > @@ -518,15 +518,12 @@ Usage example, matching non-TCPv4 packets only:
> >   Item: ``PF``
> >   ^^^^^^^^^^^^
> > -Matches packets addressed to the physical function of the device.
> > +Matches traffic originating from (ingress) or going to (egress) the physical
> > +function of the current device.
> 
> Not sure that I understand above. It looks like ingress and egress are
> misplaced.
> There many similar cases below.

In this API, "ingress" and "egress" are always defined as relative to the
application creating the flow rule. In that sense they are respectively
synonyms for "from" and "to".

I agree they are not properly defined in this document, in fact ingress and
egress were clarified (with diagrams and all) in the RFC submitted prior to
this patch [1].

I will update the "Attribute: Traffic direction" section in my next update.

[1] See "Traffic direction" in
    http://dpdk.org/ml/archives/dev/2018-March/092513.html

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in flow API
  2018-04-07  9:05  0%     ` Andrew Rybchenko
@ 2018-04-09 14:42  0%       ` Adrien Mazarguil
  2018-04-11 13:21  0%         ` Andrew Rybchenko
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-09 14:42 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Xueming Li, Wenzhuo Lu,
	Jingjing Wu, Beilei Xing, Qi Zhang, Konstantin Ananyev,
	Nelio Laranjeiro, Yongseok Koh, Pascal Mazon, Radu Nicolau,
	Akhil Goyal, Ivan Malov

On Sat, Apr 07, 2018 at 12:05:51PM +0300, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> > Since its inception, the rte_flow RSS action has been relying in part on
> > external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
> > This structure lacks parameters such as the hash algorithm to use, and more
> > recently, a method to tell which layer RSS should be performed on [1].
> > 
> > Given struct rte_eth_rss_conf will never be flexible enough to represent a
> > complete RSS configuration (e.g. RETA table), this patch supersedes it by
> > extending the rte_flow RSS action directly.
> > 
> > A subsequent patch will add a field to use a non-default RSS hash
> > algorithm. To that end, a field named "types" replaces the field formerly
> > known as "rss_hf" and standing for "RSS hash functions" as it was
> > confusing. Actual RSS hash function types are defined by enum
> > rte_eth_hash_function.
> > This patch updates all PMDs and example applications accordingly.
> > 
> > It breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_copy()
> > - rte_flow_create()
> > - rte_flow_query()
> > - rte_flow_validate()
> > 
> > [1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
> >      configuration")
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: Xueming Li <xuemingl@mellanox.com>
> > Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> > Cc: Thomas Monjalon <thomas@monjalon.net>
> > Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > Cc: Jingjing Wu <jingjing.wu@intel.com>
> > Cc: Beilei Xing <beilei.xing@intel.com>
> > Cc: Qi Zhang <qi.z.zhang@intel.com>
> > Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > Cc: Yongseok Koh <yskoh@mellanox.com>
> > Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> > Cc: Pascal Mazon <pascal.mazon@6wind.com>
> > Cc: Radu Nicolau <radu.nicolau@intel.com>
> > Cc: Akhil Goyal <akhil.goyal@nxp.com>
> > ---
> >   app/test-pmd/cmdline_flow.c        |  59 +++++-----
> >   app/test-pmd/config.c              |  39 +++----
> >   doc/guides/prog_guide/rte_flow.rst |  22 ++--
> >   drivers/net/e1000/e1000_ethdev.h   |  13 ++-
> >   drivers/net/e1000/igb_ethdev.c     |   4 +-
> >   drivers/net/e1000/igb_flow.c       |  31 ++---
> >   drivers/net/e1000/igb_rxtx.c       |  51 +++++++--
> >   drivers/net/i40e/i40e_ethdev.c     |  53 +++++++--
> >   drivers/net/i40e/i40e_ethdev.h     |  15 ++-
> >   drivers/net/i40e/i40e_flow.c       |  47 ++++----
> >   drivers/net/ixgbe/ixgbe_ethdev.c   |   4 +-
> >   drivers/net/ixgbe/ixgbe_ethdev.h   |  13 ++-
> >   drivers/net/ixgbe/ixgbe_flow.c     |  30 ++---
> >   drivers/net/ixgbe/ixgbe_rxtx.c     |  51 +++++++--
> >   drivers/net/mlx4/mlx4.c            |   2 +-
> >   drivers/net/mlx4/mlx4_flow.c       |  61 +++++-----
> >   drivers/net/mlx4/mlx4_flow.h       |   2 +-
> >   drivers/net/mlx4/mlx4_rxq.c        |   2 +-
> >   drivers/net/mlx4/mlx4_rxtx.h       |   2 +-
> >   drivers/net/mlx5/mlx5_flow.c       | 193 +++++++++++++++-----------------
> >   drivers/net/mlx5/mlx5_rxq.c        |  22 ++--
> >   drivers/net/mlx5/mlx5_rxtx.h       |  26 +++--
> >   drivers/net/sfc/sfc_flow.c         |  21 ++--
> >   drivers/net/tap/tap_flow.c         |   8 +-
> >   examples/ipsec-secgw/ipsec.c       |  10 +-
> >   lib/librte_ether/rte_flow.c        |  39 +++----
> >   lib/librte_ether/rte_flow.h        |   6 +-
> >   27 files changed, 473 insertions(+), 353 deletions(-)
> 
> <...>
> 
> > diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> > index 056405515..1a2c0299c 100644
> > --- a/drivers/net/sfc/sfc_flow.c
> > +++ b/drivers/net/sfc/sfc_flow.c
> > @@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
> >   	struct sfc_rxq *rxq;
> >   	unsigned int rxq_hw_index_min;
> >   	unsigned int rxq_hw_index_max;
> > -	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
> > -	uint64_t rss_hf;
> > -	uint8_t *rss_key = NULL;
> > +	const uint8_t *rss_key;
> >   	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
> >   	unsigned int i;
> > -	if (rss->num == 0)
> > +	if (rss->queue_num == 0)
> >   		return -EINVAL;
> >   	rxq_sw_index = sa->rxq_count - 1;
> > @@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
> >   	rxq_hw_index_min = rxq->hw_index;
> >   	rxq_hw_index_max = 0;
> > -	for (i = 0; i < rss->num; ++i) {
> > +	for (i = 0; i < rss->queue_num; ++i) {
> >   		rxq_sw_index = rss->queue[i];
> >   		if (rxq_sw_index >= sa->rxq_count)
> > @@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
> >   			rxq_hw_index_max = rxq->hw_index;
> >   	}
> > -	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;
> 
> Here we had a fallback to default rss_hf (now types) if rss_conf is
> unspecified.

Thing is, rss_action->conf was never supposed to be NULL in the first
place. Crashing on a NULL configuration has always been fine, but until
recently prevented validation with testpmd's broken implementation. This
problem was addressed in a prior series [1][2][3].

Since a value is now always provided, no need for a fallback.

[1] "app/testpmd: fix lack of flow action configuration"
    http://dpdk.org/ml/archives/dev/2018-April/095280.html
[2] "app/testpmd: fix RSS flow action configuration"
    http://dpdk.org/ml/archives/dev/2018-April/095281.html
[3] "app/testpmd: fix missing RSS fields in flow action"
    http://dpdk.org/ml/archives/dev/2018-April/095282.html

> > -	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
> > +	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
> >   		return -EINVAL;
> > -	if (rss_conf != NULL) {
> > -		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
> > +	if (rss->key_len) {
> > +		if (rss->key_len != sizeof(sa->rss_key))
> >   			return -EINVAL;
> > -		rss_key = rss_conf->rss_key;
> > +		rss_key = rss->key;
> >   	} else {
> >   		rss_key = sa->rss_key;
> >   	}
> > @@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
> >   	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
> >   	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
> > -	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
> > +	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
> 
> Now types go directly to mapping function and unspecified types (0)
> will result in 0 rss_hash_types. Of course, it is a question how to treat
> types==0. It is possible to say that it no RSS, but it does not make sense.
> So, real options are device defaults (regardless configured on device level)
> or device config (rx_adv.conf.rss_conf.rss_hf). I would prefer the later.
> Please, document the intended behaviour in rte_flow.rst.

Granted the existing documentation doesn't say much on that topic, but a 0
value for rss_hf does actually mean "no RSS" [4]:

 "The *rss_hf* field of the *rss_conf* structure indicates the different
  types of IPv4/IPv6 packets to which the RSS hashing must be applied.
  Supplying an *rss_hf* equal to zero disables the RSS feature."

Now since this action doesn't use struct rte_eth_rss_conf anymore, we could
define 0 as a PMD-specific behavior, which could be no RSS. It would make
the API easier to use for applications that don't care about the RSS
capabilities of each underlying adapter, 0 would just work everywhere as a
safe default.

[4] https://dpdk.org/doc/api/structrte__eth__rss__conf.html

> If the later is chosen, above we'll have a bug since fallback to fixed
> default.
> Just use sa->rss_hash_types as fallback. Something like:
> if (rss->types)
>     sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
> else
>     sfc_rss_conf->rss_hash_types =sa->rss_hash_types;

Looks like the previous code didn't provide a fallback when rss_hf was 0,
only when rss_conf itself was NULL. So this is not a new issue introduced by
this patch.

I will update documentation to define 0 as described above for the
convenience of application writers and leave the existing code in place.
PMD maintainers will be free to enhance it as they wish later.
Just remember testpmd now always provides a default value for it after
querying the device [2].

> >   	rte_memcpy(sfc_rss_conf->rss_key, rss_key, sizeof(sa->rss_key));
> >   	for (i = 0; i < RTE_DIM(sfc_rss_conf->rss_tbl); ++i) {
> > -		unsigned int rxq_sw_index = rss->queue[i % rss->num];
> > +		unsigned int rxq_sw_index = rss->queue[i % rss->queue_num];
> >   		struct sfc_rxq *rxq = sa->rxq_info[rxq_sw_index].rxq;
> >   		sfc_rss_conf->rss_tbl[i] = rxq->hw_index - rxq_hw_index_min;
> 
> <...>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 10/15] ethdev: refine TPID handling in flow API
  2018-04-06 17:11  0%     ` Andrew Rybchenko
@ 2018-04-09 14:42  0%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-09 14:42 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Wenzhuo Lu, Jingjing Wu,
	Ajit Khaparde, Somnath Kotur, John Daley, Hyong Youb Kim,
	Beilei Xing, Qi Zhang, Konstantin Ananyev, Nelio Laranjeiro,
	Yongseok Koh, Tomasz Duszynski, Dmitri Epshtein,
	Natalie Samsonov, Jianbo Liu, Pascal Mazon

On Fri, Apr 06, 2018 at 08:11:38PM +0300, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> > TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
> > consistent with the normal stacking order of pattern items, which is
> > confusing to applications.
> > 
> > Problem is that when followed by one of these layers, the EtherType field
> > of the preceding layer keeps its "inner" definition, and the "outer" TPID
> > is provided by the subsequent layer, the reverse of how a packet looks like
> > on the wire:
> > 
> >   Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
> >   rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]
> > 
> > Worse, when QinQ is involved, the stacking order of VLAN layers is
> > unspecified. It is unclear whether it should be reversed (innermost to
> > outermost) as well given TPID applies to the previous layer:
> > 
> >   Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
> >   rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
> >   rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]
> > 
> > While specifying EtherType/TPID is hopefully rarely necessary, the stacking
> > order in case of QinQ and the lack of documentation remain an issue.
> > 
> > This patch replaces TPID in the VLAN pattern item with an inner
> > EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
> > clarifies documentation and updates all relevant code.
> > 
> > It breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_copy()
> > - rte_flow_create()
> > - rte_flow_query()
> > - rte_flow_validate()
> > 
> > Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
> > items:
> > 
> > - bnxt: EtherType matching is supported, and vlan->inner_type overrides
> >    eth->type if the latter has standard TPID value 0x8100, otherwise an
> >    error is triggered.
> > 
> > - e1000: EtherType matching is only supported with the ETHERTYPE filter,
> >    which does not support VLAN matching, therefore no impact.
> > 
> > - enic: same as bnxt.
> > 
> > - i40e: same as bnxt with a configurable TPID value for the FDIR filter,
> >    with existing limitations on allowed EtherType values. The remaining
> >    filter types (VXLAN, NVGRE, QINQ) do not support EtherType matching.
> > 
> > - ixgbe: same as e1000, with additional minor change to rely on the new
> >    E-Tag macro definition.
> > 
> > - mlx4: EtherType/TPID matching is not supported, no impact.
> > 
> > - mlx5: same as bnxt.
> > 
> > - mrvl: EtherType matching is supported but eth->type cannot be specified
> >    when a VLAN item is present. However vlan->inner_type is used if
> >    specified.
> > 
> > - sfc: same as bnxt with QinQ TPID value 0x88a8 additionally supported.
> > 
> > - tap: same as bnxt.
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> > Cc: Thomas Monjalon <thomas@monjalon.net>
> > Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > Cc: Jingjing Wu <jingjing.wu@intel.com>
> > Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> > Cc: Somnath Kotur <somnath.kotur@broadcom.com>
> > Cc: John Daley <johndale@cisco.com>
> > Cc: Hyong Youb Kim <hyonkim@cisco.com>
> > Cc: Beilei Xing <beilei.xing@intel.com>
> > Cc: Qi Zhang <qi.z.zhang@intel.com>
> > Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > Cc: Yongseok Koh <yskoh@mellanox.com>
> > Cc: Tomasz Duszynski <tdu@semihalf.com>
> > Cc: Dmitri Epshtein <dima@marvell.com>
> > Cc: Natalie Samsonov <nsamsono@marvell.com>
> > Cc: Jianbo Liu <jianbo.liu@arm.com>
> > Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> > Cc: Pascal Mazon <pascal.mazon@6wind.com>
> > 
> > ---
> > 
> > Hi PMD maintainers, while I'm pretty confident in these changes, I could
> > not validate them with all devices.
> > 
> > It would be great if you could apply this patch, run testpmd, create VLAN
> > flow rules with/without inner EtherType as described and send matching
> > traffic while making sure nothing was broken in the process.
> > 
> > Thanks!
> > ---
> >   app/test-pmd/cmdline_flow.c                 | 17 +++---
> >   doc/guides/nics/tap.rst                     |  2 +-
> >   doc/guides/prog_guide/rte_flow.rst          | 21 ++++++--
> >   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
> >   drivers/net/bnxt/bnxt_filter.c              | 39 +++++++++++---
> >   drivers/net/enic/enic_flow.c                | 22 +++++---
> >   drivers/net/i40e/i40e_flow.c                | 69 +++++++++++++++++++-----
> >   drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
> >   drivers/net/mlx5/mlx5_flow.c                | 16 +++++-
> >   drivers/net/mvpp2/mrvl_flow.c               | 27 +++++++---
> >   drivers/net/sfc/sfc_flow.c                  | 28 ++++++++++
> >   drivers/net/tap/tap_flow.c                  | 16 ++++--
> >   lib/librte_ether/rte_flow.h                 | 24 ++++++---
> >   lib/librte_net/rte_ether.h                  |  1 +
> >   14 files changed, 229 insertions(+), 60 deletions(-)
> 
> <...>
> 
> > diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> > index bc4974edf..f61d4ec92 100644
> > --- a/drivers/net/sfc/sfc_flow.c
> > +++ b/drivers/net/sfc/sfc_flow.c
> > @@ -7,6 +7,7 @@
> >    * for Solarflare) and Solarflare Communications, Inc.
> >    */
> > +#include <rte_byteorder.h>
> >   #include <rte_tailq.h>
> >   #include <rte_common.h>
> >   #include <rte_ethdev_driver.h>
> > @@ -351,6 +352,7 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
> >   	const struct rte_flow_item_vlan *mask = NULL;
> >   	const struct rte_flow_item_vlan supp_mask = {
> >   		.tci = rte_cpu_to_be_16(ETH_VLAN_ID_MAX),
> > +		.inner_type = RTE_BE16(0xffff),
> >   	};
> >   	rc = sfc_flow_parse_init(item,
> > @@ -393,6 +395,32 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
> >   		return -rte_errno;
> >   	}
> > +	/*
> > +	 * If an EtherType was already specified, make sure it is a valid
> > +	 * TPID for the current VLAN layer before overwriting it with the
> > +	 * specified inner type.
> > +	 */
> > +	if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE &&
> > +	    efx_spec->efs_ether_type != RTE_BE16(ETHER_TYPE_VLAN) &&
> > +	    efx_spec->efs_ether_type != RTE_BE16(ETHER_TYPE_QINQ)) {
> 
> 1. efs_ether_type is host-endian

Whoops, looks like I only half-fixed that endian issue in v2.

> 2. HW recognizes more TPIDs (0x9100, 0x9200, 0x9300) as VLAN
> 3. However, if some TPID is specified, user may expect that only VLAN
> packets
>     with specified TPID match. It is false expectation since the information
> is not
>     passed to HW to match (and there is no way to match).
>     So, it is safer to deny TPID specification (i.e. keep the first
> condition only).
>     From the flexibility point of view it is possible to allow any, but it
> should be
>     documented that exact match is not checked in fact.

Thanks for pointing this out. I've decided to update all PMDs to disallow
TPID matching because many devices support multiple concurrent TPIDs and
there's no way to match a given one explicitly. This should make the patch
simpler as well.

> > +		rte_flow_error_set(error, EINVAL,
> > +				   RTE_FLOW_ERROR_TYPE_ITEM, item,
> > +				   "Unsupported outer TPID");
> > +		return -rte_errno;
> > +	}
> > +	if (!mask->inner_type) {
> > +		efx_spec->efs_match_flags &= ~EFX_FILTER_MATCH_ETHER_TYPE;
> > +		efx_spec->efs_ether_type = RTE_BE16(0x0000);
> 
> Nothing should be done here if above is done.
> 
> > +	} else if (mask->inner_type == supp_mask.inner_type) {
> > +		efx_spec->efs_match_flags |= EFX_FILTER_MATCH_ETHER_TYPE;
> > +		efx_spec->efs_ether_type = rte_bswap16(spec->inner_type);
> > +	} else {
> > +		rte_flow_error_set(error, EINVAL,
> > +				   RTE_FLOW_ERROR_TYPE_ITEM, item,
> > +				   "Bad mask for VLAN inner_type");
> > +		return -rte_errno;
> > +	}
> > +
> >   	return 0;
> >   }
> 
> <...>
> 
> > diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> > index fc7e6705d..b13b0e2e6 100644
> > --- a/lib/librte_ether/rte_flow.h
> > +++ b/lib/librte_ether/rte_flow.h
> > @@ -475,19 +481,20 @@ static const struct rte_flow_item_eth rte_flow_item_eth_mask = {
> >    *
> >    * Matches an 802.1Q/ad VLAN tag.
> >    *
> > - * This type normally follows either RTE_FLOW_ITEM_TYPE_ETH or
> > - * RTE_FLOW_ITEM_TYPE_VLAN.
> > + * The corresponding standard outer EtherType (TPID) values are
> > + * ETHER_TYPE_VLAN or ETHER_TYPE_QINQ. It can be overridden by the preceding
> > + * pattern item.
> >    */
> >   struct rte_flow_item_vlan {
> > -	rte_be16_t tpid; /**< Tag protocol identifier. */
> >   	rte_be16_t tci; /**< Tag control information. */
> > +	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
> >   };
> >   /** Default mask for RTE_FLOW_ITEM_TYPE_VLAN. */
> >   #ifndef __cplusplus
> >   static const struct rte_flow_item_vlan rte_flow_item_vlan_mask = {
> > -	.tpid = RTE_BE16(0x0000),
> > -	.tci = RTE_BE16(0xffff),
> > +	.tci = RTE_BE16(0x0fff),
> 
> It looks like unrelated change.

Yep, it should have been in a separate patch. I'll split it in my next
update. Thanks for reviewing.

> 
> > +	.inner_type = RTE_BE16(0x0000),
> >   };
> >   #endif
> 
> <...>
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 08/15] ethdev: add hash function to RSS flow API action
  @ 2018-04-09 14:41  0%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-09 14:41 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Wenzhuo Lu, Jingjing Wu,
	Beilei Xing, Qi Zhang, Konstantin Ananyev, Nelio Laranjeiro,
	Yongseok Koh, Pascal Mazon

On Fri, Apr 06, 2018 at 06:41:35PM +0300, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> > By definition, RSS involves some kind of hash algorithm, usually Toeplitz.
> > 
> > Until now it could not be modified on a flow rule basis and PMDs had to
> > always assume RTE_ETH_HASH_FUNCTION_DEFAULT, which remains the default
> > behavior when unspecified (0).
> > 
> > This breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_copy()
> > - rte_flow_create()
> > - rte_flow_query()
> > - rte_flow_validate()
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> > Cc: Thomas Monjalon <thomas@monjalon.net>
> > Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > Cc: Jingjing Wu <jingjing.wu@intel.com>
> > Cc: Beilei Xing <beilei.xing@intel.com>
> > Cc: Qi Zhang <qi.z.zhang@intel.com>
> > Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > Cc: Yongseok Koh <yskoh@mellanox.com>
> > Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> > Cc: Pascal Mazon <pascal.mazon@6wind.com>
> > ---
> >   app/test-pmd/cmdline_flow.c                 | 72 ++++++++++++++++++++++++
> >   app/test-pmd/config.c                       |  1 +
> >   doc/guides/prog_guide/rte_flow.rst          |  2 +
> >   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  3 +
> >   drivers/net/e1000/igb_flow.c                |  4 ++
> >   drivers/net/e1000/igb_rxtx.c                |  4 +-
> >   drivers/net/i40e/i40e_ethdev.c              |  4 +-
> >   drivers/net/i40e/i40e_flow.c                |  4 ++
> >   drivers/net/ixgbe/ixgbe_flow.c              |  4 ++
> >   drivers/net/ixgbe/ixgbe_rxtx.c              |  4 +-
> >   drivers/net/mlx4/mlx4_flow.c                |  7 +++
> >   drivers/net/mlx5/mlx5_flow.c                | 13 +++++
> >   drivers/net/sfc/sfc_flow.c                  |  3 +
> >   drivers/net/tap/tap_flow.c                  |  6 ++
> >   lib/librte_ether/rte_flow.c                 |  1 +
> >   lib/librte_ether/rte_flow.h                 |  2 +
> >   16 files changed, 131 insertions(+), 3 deletions(-)
> 
> <...>
> 
> > diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> > index 1a2c0299c..dbe4c2baa 100644
> > --- a/drivers/net/sfc/sfc_flow.c
> > +++ b/drivers/net/sfc/sfc_flow.c
> > @@ -1261,6 +1261,9 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
> >   			rxq_hw_index_max = rxq->hw_index;
> >   	}
> > +	if (rss->func)
> 
> May be it is better to compare with RTE_ETH_HASH_FUNCTION_DEFAULT
> explicitly? I think it is more readable. If so, it is applicable to all
> similar checks
> in the patch.

Good suggestion, although RTE_ETH_HASH_FUNCTION_DEFAULT can't be anything
other than 0 for various reasons, I'll clarify (most of) the code in my next
update.

> In the case of sfc, please, allow RTE_ETH_HASH_FUNCTION_TOEPLITZ as well.
> I'd suggest:
> switch (rss->func) {
> case RTE_ETH_HASH_FUNCTION_DEFAULT:
> case RTE_ETH_HASH_FUNCTION_TOEPLITZ:
>       break;
> default:
>       return -EINVAL;
> }

I'll add it, thanks.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 1/4] ethdev: add group counter support to rte_flow
  2018-04-06 20:26  3%   ` Adrien Mazarguil
@ 2018-04-09 14:22  0%     ` Mohammad Abdul Awal
  2018-04-09 15:23  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Mohammad Abdul Awal @ 2018-04-09 14:22 UTC (permalink / raw)
  To: Adrien Mazarguil, Declan Doherty; +Cc: dev

Hi Adrien,


On 06/04/2018 21:26, Adrien Mazarguil wrote:
> On Fri, Apr 06, 2018 at 01:24:00PM +0100, Declan Doherty wrote:
>> Add new RTE_FLOW_ACTION_TYPE_GROUP_COUNT action type to enable shared
>> counters across multiple flows on a single port or across multiple
>> flows on multiple ports within the same switch domain.
>>
>> Introduce new API rte_flow_query_group_count to allow querying of group
>> counters.
>>
>> Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> Both features are definitely needed, however I suggest to enhance the
> existing action type and query function instead, given the rte_flow ABI
> won't be maintained for the 18.05 release [1].
>
> Counters and query support were defined as a kind of PoC in preparation for
> future requirements back in DPDK 17.02 and so far few PMDs have implemented
> the query callback (mlx5 and failsafe, and the latter isn't really a PMD).
>
> Due to the behavior change of action lists [2], providing an action type as
> a query parameter is not specific enough anymore, for instance if a list
> contains multiple COUNT, the application should be able to tell which needs
> to be queried.
>
> Therefore I suggest to redefine the query function as follows:
>
>   int
>   rte_flow_query(uint16_t port_id,
>                  struct rte_flow *flow,
>                  const struct rte_flow_action *action,
>                  void *data,
>                  struct rte_flow_error *error);
>
> Third argument is an action definition with the same configuration (if any)
> as previously defined in the action list originally used to create the flow
> rule (not necessarily the same pointer, only the contents matter).
>
> It means two perfectly identical actions can't be distinguished, and that's
> how group counters will work.
> Instead of adding a new action type to distinguish groups, a configuration
> structure is added to the existing RTE_FLOW_ACTION_TYPE_COUNT, with
> non-shared counters as a default behavior:
>
>   struct rte_flow_action_count {
>           uint32_t shared:1; /**< Share counter ID with other flow rules. */
>           uint32_t reserved:31; /**< Reserved, must be zero. */
>           uint32_t id; /**< Counter ID. */
>   };
>
> Doing so will impact some existing code in mlx5 and librte_flow_classify,
> but that shouldn't be much of an issue.
>
> Keep in mind testpmd and its documentation must be updated as well.
>
> Thoughts?
Please correct me if I am wrong but I think we are talking two different 
things here.
If I understood you correctly, you are proposing to pass a list of 
actions (instead if a action type) in the third parameter to perform 
multiple actions in the same query call. Lets take an example for 100 
ingress flows. So, if we want to query the counter for all the flows, we 
can get them by a single query providing a list (of size 100) of 
action_count in the 3rd param.

On the other hand, we are saying that all the flows are belongs to same 
tunnel end-point (we are talking only 1 TEP here), then the PMD will be 
responsible to increment the counter of TEP for matching all the flows 
(100 flows). So, using one group query by passing one action_count in 
3rd param, we can get the count of the TEP.

This case is generic enough for sure for simple flows but may not be 
suitable for tunnel cases, as application needs to track the counters 
for all the flows, and needs to build the list of action each time the 
flows added/deleted.

>
> A few nits below for the sake of commenting.
>
> [1] "Flow API overhaul for switch offloads"
>      http://dpdk.org/ml/archives/dev/2018-April/095774.html
> [2] "ethdev: alter behavior of flow API actions"
>      http://dpdk.org/ml/archives/dev/2018-April/095779.html
>
>> ---
>>   doc/guides/prog_guide/rte_flow.rst      | 35 +++++++++++++++++++++
>>   lib/librte_ether/rte_ethdev_version.map |  8 +++++
>>   lib/librte_ether/rte_flow.c             | 21 +++++++++++++
>>   lib/librte_ether/rte_flow.h             | 56 ++++++++++++++++++++++++++++++++-
>>   lib/librte_ether/rte_flow_driver.h      |  6 ++++
>>   5 files changed, 125 insertions(+), 1 deletion(-)
>>
>> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
>> index 961943d..fd33d19 100644
>> --- a/doc/guides/prog_guide/rte_flow.rst
>> +++ b/doc/guides/prog_guide/rte_flow.rst
>> @@ -1698,6 +1698,41 @@ Return values:
>>   
>>   - 0 on success, a negative errno value otherwise and ``rte_errno`` is set.
>>   
>> +
> Unnecessary empty line.
I will take into account all the comments.

Regards,
Awal.

>
>> +Group Count Query
>> +~~~~~~~~~~~~~~~~~
>> +
>> +Query group counter which can be associated with multiple flows on a specified
>> +port.
>> +
>> +This function allows retrieving of group counters. A group counter is a
>> +counter which can be shared among multiple flows on a single port or among
>> +multiple flows on multiple ports within the same switch domain. Data is
>> +gathered by special actions which must be present in the flow rule
>> +definition.
>> +
>> +.. code-block:: c
>> +
>> +   int
>> +   rte_flow_query_group_count(uint16_t port_id,
>> +			   uint32_t group_counter_id,
>> +			   struct rte_flow_query_count *count,
>> +               struct rte_flow_error *error);
>> +
>> +Arguments:
>> +
>> +- ``port_id``: port identifier of Ethernet device.
>> +- ``group_counter_id``: group counter identifier.
>> +- ``count``: group counter parameters.
>> +- ``error``: perform verbose error reporting if not NULL. PMDs initialize
>> +  this structure in case of error only.
>> +
>> +Return values:
>> +
>> +- 0 on success, a negative errno value otherwise and ``rte_errno`` is set.
>> +
>> +
>> +
> More unnecessary empty lines.
>
>>   Isolated mode
>>   -------------
>>   
>> diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
>> index 34df6c8..cff6807 100644
>> --- a/lib/librte_ether/rte_ethdev_version.map
>> +++ b/lib/librte_ether/rte_ethdev_version.map
>> @@ -229,3 +229,11 @@ EXPERIMENTAL {
>>   	rte_mtr_stats_update;
>>   
>>   } DPDK_17.11;
>> +
>> +
> One more.
>
>> +EXPERIMENTAL {
>> +	global:
>> +
>> +	rte_flow_query_group_count
>> +
>> +} DPDK_18.05;
>> diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
>> index 38f2d27..e10b1d0 100644
>> --- a/lib/librte_ether/rte_flow.c
>> +++ b/lib/librte_ether/rte_flow.c
>> @@ -418,3 +418,24 @@ rte_flow_copy(struct rte_flow_desc *desc, size_t len,
>>   	}
>>   	return 0;
>>   }
>> +
>> +int __rte_experimental
>> +rte_flow_query_group_count(uint16_t port_id,
>> +	uint32_t group_count_id,
>> +	struct rte_flow_query_count *count,
>> +	struct rte_flow_error *error)
> This function lacks a short documentation comment (see rte_flow_query()).
>
>> +{
>> +	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
>> +	const struct rte_flow_ops *ops = rte_flow_ops_get(port_id, error);
>> +
>> +	if (!ops)
>> +		return -rte_errno;
>> +	if (likely(!!ops->query_group_count))
>> +		return flow_err(port_id,
>> +				ops->query_group_count(dev, group_count_id,
>> +						       count, error),
>> +				error);
>> +	return rte_flow_error_set(error, ENOSYS,
>> +				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
>> +				  NULL, rte_strerror(ENOSYS));
>> +}
>> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
>> index 13e4202..7d1f89d 100644
>> --- a/lib/librte_ether/rte_flow.h
>> +++ b/lib/librte_ether/rte_flow.h
>> @@ -1010,7 +1010,19 @@ enum rte_flow_action_type {
>>   	 *
>>   	 * See struct rte_flow_action_security.
>>   	 */
>> -	RTE_FLOW_ACTION_TYPE_SECURITY
>> +	RTE_FLOW_ACTION_TYPE_SECURITY,
>> +
>> +	/**
>> +	 * Enable a shared flow group counter for flow. Group counters can be
>> +	 * associated with multiples flows on the same port or on port within
>> +	 * the same switch domain if supported by that device.
>> +	 *
>> +	 * Group counters can be retrieved and reset through
>> +	 * rte_flow_query_group_count()
>> +	 *
>> +	 * See struct rte_flow_action_group_count.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_GROUP_COUNT
> Don't forget the trailing comma.
>
>>   };
>>   
>>   /**
>> @@ -1149,6 +1161,18 @@ struct rte_flow_action_security {
>>   };
>>   
>>   /**
>> + * RTE_FLOW_ACTION_TYPE_GROUP_COUNT
>> + *
>> + * A packet/byte counter which can be shared across a group of flows programmed
>> + * on the same port/switch domain.
>> + *
>> + * Non-terminating by default.
>> + */
>> +struct rte_flow_action_group_count {
>> +	uint32_t id;
>> +};
>> +
>> +/**
>>    * Definition of a single action.
>>    *
>>    * A list of actions is terminated by a END action.
>> @@ -1476,6 +1500,36 @@ rte_flow_copy(struct rte_flow_desc *fd, size_t len,
>>   	      const struct rte_flow_item *items,
>>   	      const struct rte_flow_action *actions);
>>   
>> +
> Caught another empty line.
>
>> +/**
>> + * Get hit/bytes count for group counter.
>> + *
>> + * A group counter is a counter which can be shared among multiple flows on a
>> + * single port or among multiple flows on multiple ports within the same
>> + * switch domain.
>> + *
>> + * In the case of ports within the same switch domain a global name space is
>> + * assumed for group_count_id value.
>> + *
>> + * @param[in]	port_id
>> + *   Port identifier of Ethernet device.
>> + * @param[in]	group_count_id
>> + *   Group counter identifier to query
>> + * @param[out]	count
>> + *   Group counter value
>> + * @param[out]	error
>> + *   Perform verbose error reporting if not NULL. PMDs initialize this
>> + *   structure in case of error only.
>> + *
>> + * @return
>> + *   Negative error code (errno value) and rte_errno is set.
>> + */
>> +int __rte_experimental
>> +rte_flow_query_group_count(uint16_t port_id,
>> +			   uint32_t group_count_id,
>> +			   struct rte_flow_query_count *count,
>> +			   struct rte_flow_error *error);
>> +
>>   #ifdef __cplusplus
>>   }
>>   #endif
>> diff --git a/lib/librte_ether/rte_flow_driver.h b/lib/librte_ether/rte_flow_driver.h
>> index 7778c8e..ef09465 100644
>> --- a/lib/librte_ether/rte_flow_driver.h
>> +++ b/lib/librte_ether/rte_flow_driver.h
>> @@ -96,6 +96,12 @@ struct rte_flow_ops {
>>   		(struct rte_eth_dev *,
>>   		 int,
>>   		 struct rte_flow_error *);
>> +	/** See rte_flow_query_group_count(). */
>> +	int (*query_group_count)
>> +		(struct rte_eth_dev *,
>> +		 uint32_t,
>> +		 struct rte_flow_query_count *,
>> +		 struct rte_flow_error *);
>>   };
>>   
>>   /**
>> -- 
>> 2.7.4
>>

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v1] doc:  add SPDX Licence to doc files
@ 2018-04-09 13:11  5% Marko Kovacevic
  0 siblings, 0 replies; 200+ results
From: Marko Kovacevic @ 2018-04-09 13:11 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, john.mcnamara, hemant.agrawal, Marko Kovacevic

Added SPDX headers to doc files to have them aligned with
the other doc files.

Signed-off-by: Marko Kovacevic <marko.kovacevic@intel.com>
---
 doc/guides/contributing/cheatsheet.rst           | 3 +++
 doc/guides/contributing/coding_style.rst         | 3 +++
 doc/guides/contributing/design.rst               | 3 +++
 doc/guides/contributing/documentation.rst        | 3 +++
 doc/guides/contributing/img/patch_cheatsheet.svg | 3 ++-
 doc/guides/contributing/index.rst                | 3 +++
 doc/guides/contributing/patches.rst              | 3 +++
 doc/guides/contributing/stable.rst               | 3 +++
 doc/guides/contributing/versioning.rst           | 3 +++
 doc/guides/linux_gsg/nic_perf_intel_platform.rst | 3 +++
 doc/guides/rel_notes/deprecation.rst             | 3 +++
 doc/guides/rel_notes/release_16_04.rst           | 3 +++
 doc/guides/rel_notes/release_16_07.rst           | 3 +++
 doc/guides/rel_notes/release_16_11.rst           | 3 +++
 doc/guides/rel_notes/release_17_02.rst           | 3 +++
 doc/guides/rel_notes/release_17_05.rst           | 3 +++
 doc/guides/rel_notes/release_17_08.rst           | 3 +++
 doc/guides/rel_notes/release_17_11.rst           | 3 +++
 doc/guides/rel_notes/release_18_02.rst           | 3 +++
 doc/guides/rel_notes/release_2_2.rst             | 3 +++
 20 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/doc/guides/contributing/cheatsheet.rst b/doc/guides/contributing/cheatsheet.rst
index 7bc0771..97512d7 100644
--- a/doc/guides/contributing/cheatsheet.rst
+++ b/doc/guides/contributing/cheatsheet.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 Patch Cheatsheet
 ================
 
diff --git a/doc/guides/contributing/coding_style.rst b/doc/guides/contributing/coding_style.rst
index b0f0adb..e285fc8 100644
--- a/doc/guides/contributing/coding_style.rst
+++ b/doc/guides/contributing/coding_style.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 .. _coding_style:
 
 DPDK Coding Style
diff --git a/doc/guides/contributing/design.rst b/doc/guides/contributing/design.rst
index 88d3a43..bbe219e 100644
--- a/doc/guides/contributing/design.rst
+++ b/doc/guides/contributing/design.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 Design
 ======
 
diff --git a/doc/guides/contributing/documentation.rst b/doc/guides/contributing/documentation.rst
index 82f2e1b..7ac1b41 100644
--- a/doc/guides/contributing/documentation.rst
+++ b/doc/guides/contributing/documentation.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 .. _doc_guidelines:
 
 DPDK Documentation Guidelines
diff --git a/doc/guides/contributing/img/patch_cheatsheet.svg b/doc/guides/contributing/img/patch_cheatsheet.svg
index 8522592..af2473d 100644
--- a/doc/guides/contributing/img/patch_cheatsheet.svg
+++ b/doc/guides/contributing/img/patch_cheatsheet.svg
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<!-- SPDX-License-Identifier: BSD-3-Clause -->
+<!-- Copyright(c) 2018 Intel Corporation -->
 
 <svg
    xmlns:dc="http://purl.org/dc/elements/1.1/"
diff --git a/doc/guides/contributing/index.rst b/doc/guides/contributing/index.rst
index 329b678..9fa0076 100644
--- a/doc/guides/contributing/index.rst
+++ b/doc/guides/contributing/index.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 Contributor's Guidelines
 ========================
 
diff --git a/doc/guides/contributing/patches.rst b/doc/guides/contributing/patches.rst
index 2287835..494cdf4 100644
--- a/doc/guides/contributing/patches.rst
+++ b/doc/guides/contributing/patches.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 .. submitting_patches:
 
 Contributing Code to DPDK
diff --git a/doc/guides/contributing/stable.rst b/doc/guides/contributing/stable.rst
index 0f2f1f3..16518b0 100644
--- a/doc/guides/contributing/stable.rst
+++ b/doc/guides/contributing/stable.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 .. stable_lts_releases:
 
 DPDK Stable Releases and Long Term Support
diff --git a/doc/guides/contributing/versioning.rst b/doc/guides/contributing/versioning.rst
index c495294d..6e2f073 100644
--- a/doc/guides/contributing/versioning.rst
+++ b/doc/guides/contributing/versioning.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 Managing ABI updates
 ====================
 
diff --git a/doc/guides/linux_gsg/nic_perf_intel_platform.rst b/doc/guides/linux_gsg/nic_perf_intel_platform.rst
index 987cd0a..7bd05b6 100644
--- a/doc/guides/linux_gsg/nic_perf_intel_platform.rst
+++ b/doc/guides/linux_gsg/nic_perf_intel_platform.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 How to get best performance with NICs on Intel platforms
 ========================================================
 
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ec70b5f..a8bd787 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 ABI and API Deprecation
 =======================
 
diff --git a/doc/guides/rel_notes/release_16_04.rst b/doc/guides/rel_notes/release_16_04.rst
index d0a09ef..072b294 100644
--- a/doc/guides/rel_notes/release_16_04.rst
+++ b/doc/guides/rel_notes/release_16_04.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 16.04
 ==================
 
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index a8a3fc1..e8aea7a 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 16.07
 ==================
 
diff --git a/doc/guides/rel_notes/release_16_11.rst b/doc/guides/rel_notes/release_16_11.rst
index 8c9ec65..551e1d5 100644
--- a/doc/guides/rel_notes/release_16_11.rst
+++ b/doc/guides/rel_notes/release_16_11.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 16.11
 ==================
 
diff --git a/doc/guides/rel_notes/release_17_02.rst b/doc/guides/rel_notes/release_17_02.rst
index 357965a..f6c1b91 100644
--- a/doc/guides/rel_notes/release_17_02.rst
+++ b/doc/guides/rel_notes/release_17_02.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 17.02
 ==================
 
diff --git a/doc/guides/rel_notes/release_17_05.rst b/doc/guides/rel_notes/release_17_05.rst
index 6892284..9550cd5 100644
--- a/doc/guides/rel_notes/release_17_05.rst
+++ b/doc/guides/rel_notes/release_17_05.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 17.05
 ==================
 
diff --git a/doc/guides/rel_notes/release_17_08.rst b/doc/guides/rel_notes/release_17_08.rst
index 0bcdfb7..f2cf434 100644
--- a/doc/guides/rel_notes/release_17_08.rst
+++ b/doc/guides/rel_notes/release_17_08.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 17.08
 ==================
 
diff --git a/doc/guides/rel_notes/release_17_11.rst b/doc/guides/rel_notes/release_17_11.rst
index c2e3fc3..d0e2c6f 100644
--- a/doc/guides/rel_notes/release_17_11.rst
+++ b/doc/guides/rel_notes/release_17_11.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 17.11
 ==================
 
diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 44b7de5..b2b207a 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 18.02
 ==================
 
diff --git a/doc/guides/rel_notes/release_2_2.rst b/doc/guides/rel_notes/release_2_2.rst
index bb7d15a..8baccfb 100644
--- a/doc/guides/rel_notes/release_2_2.rst
+++ b/doc/guides/rel_notes/release_2_2.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 2.2
 ================
 
-- 
2.9.5

^ permalink raw reply	[relevance 5%]

* [dpdk-dev] [PATCH v6] ethdev: replace bus specific struct with generic dev
  @ 2018-04-09 12:09  2% ` Ferruh Yigit
  0 siblings, 0 replies; 200+ results
From: Ferruh Yigit @ 2018-04-09 12:09 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, Ferruh Yigit, Shreyansh Jain, Allain Legacy,
	Tomasz Duszynski, Santosh Shukla, David Marchand

Public struct rte_eth_dev_info has a "struct rte_pci_device" field in it
although it is common for all ethdev in all buses.

Replacing pci specific struct with generic device struct and updating
places that are using pci device in a way to get this information from
generic device.

Signed-off-by: Ferruh Yigit <ferruh.yigit@intel.com>
Reviewed-by: David Marchand <david.marchand@6wind.com>
Acked-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
---
Cc: Shreyansh Jain <shreyansh.jain@nxp.com>
Cc: Allain Legacy <allain.legacy@windriver.com>
Cc: Tomasz Duszynski <tdu@semihalf.com>
Cc: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Cc: David Marchand <david.marchand@6wind.com>

v2:
- prevent possible crash while getting bus (Pablo)
- Remove unnecessary __rte_unused
- Some PMD info_dev->device was assigned to NULL, fixed them

v3:
- rebased on latest next-net

v4:
- Move dev_info->device assignment to ethdev layer

v5:
- Document API change in related section in release notes

v6:
- Rebase on latest next-net, ip_pipeline updated
- Update axgbe too
---
 app/test-pmd/config.c                   | 18 +++++++++++++++-
 app/test-pmd/testpmd.h                  | 38 +++++++++++++++++++++++++++------
 doc/guides/rel_notes/release_18_05.rst  |  3 +++
 drivers/net/ark/ark_ethdev.c            |  1 -
 drivers/net/avf/avf_ethdev.c            |  1 -
 drivers/net/avp/avp_ethdev.c            |  1 -
 drivers/net/axgbe/axgbe_ethdev.c        |  4 +---
 drivers/net/bnx2x/bnx2x_ethdev.c        |  1 -
 drivers/net/bnxt/bnxt_ethdev.c          |  2 --
 drivers/net/cxgbe/cxgbe_ethdev.c        |  2 --
 drivers/net/e1000/em_ethdev.c           |  1 -
 drivers/net/e1000/igb_ethdev.c          |  2 --
 drivers/net/ena/ena_ethdev.c            |  2 --
 drivers/net/enic/enic_ethdev.c          |  1 -
 drivers/net/fm10k/fm10k_ethdev.c        |  1 -
 drivers/net/i40e/i40e_ethdev.c          |  1 -
 drivers/net/i40e/i40e_ethdev_vf.c       |  1 -
 drivers/net/ixgbe/ixgbe_ethdev.c        |  2 --
 drivers/net/kni/rte_eth_kni.c           |  1 -
 drivers/net/liquidio/lio_ethdev.c       |  2 --
 drivers/net/mlx4/mlx4_ethdev.c          |  1 -
 drivers/net/mlx5/mlx5_ethdev.c          |  1 -
 drivers/net/nfp/nfp_net.c               |  1 -
 drivers/net/octeontx/octeontx_ethdev.c  |  1 -
 drivers/net/qede/qede_ethdev.c          |  1 -
 drivers/net/sfc/sfc_ethdev.c            |  1 -
 drivers/net/szedata2/rte_eth_szedata2.c |  1 -
 drivers/net/tap/rte_eth_tap.c           |  1 -
 drivers/net/thunderx/nicvf_ethdev.c     |  2 --
 drivers/net/virtio/virtio_ethdev.c      |  1 -
 drivers/net/vmxnet3/vmxnet3_ethdev.c    |  4 +---
 examples/ethtool/lib/rte_ethtool.c      | 16 ++++++++------
 examples/ip_pipeline/kni.c              | 11 ++++++++--
 examples/kni/main.c                     | 11 +++++++---
 lib/librte_ether/rte_ethdev.c           |  1 +
 lib/librte_ether/rte_ethdev.h           |  2 +-
 test/test/test_kni.c                    | 35 ++++++++++++++++++++++++------
 37 files changed, 112 insertions(+), 64 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 4bb255c62..dd051f5ca 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -754,6 +754,8 @@ vlan_id_is_invalid(uint16_t vlan_id)
 static int
 port_reg_off_is_invalid(portid_t port_id, uint32_t reg_off)
 {
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 	uint64_t pci_len;
 
 	if (reg_off & 0x3) {
@@ -762,7 +764,21 @@ port_reg_off_is_invalid(portid_t port_id, uint32_t reg_off)
 		       (unsigned)reg_off);
 		return 1;
 	}
-	pci_len = ports[port_id].dev_info.pci_dev->mem_resource[0].len;
+
+	if (!ports[port_id].dev_info.device) {
+		printf("Invalid device\n");
+		return 0;
+	}
+
+	bus = rte_bus_find_by_device(ports[port_id].dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(ports[port_id].dev_info.device);
+	} else {
+		printf("Not a PCI device\n");
+		return 1;
+	}
+
+	pci_len = pci_dev->mem_resource[0].len;
 	if (reg_off >= pci_len) {
 		printf("Port %d: register offset %u (0x%X) out of port PCI "
 		       "resource (length=%"PRIu64")\n",
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 153abea05..4d84e7b00 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -500,12 +500,25 @@ mbuf_pool_find(unsigned int sock_id)
 static inline uint32_t
 port_pci_reg_read(struct rte_port *port, uint32_t reg_off)
 {
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 	void *reg_addr;
 	uint32_t reg_v;
 
-	reg_addr = (void *)
-		((char *)port->dev_info.pci_dev->mem_resource[0].addr +
-			reg_off);
+	if (!port->dev_info.device) {
+		printf("Invalid device\n");
+		return 0;
+	}
+
+	bus = rte_bus_find_by_device(port->dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(port->dev_info.device);
+	} else {
+		printf("Not a PCI device\n");
+		return 0;
+	}
+
+	reg_addr = ((char *)pci_dev->mem_resource[0].addr + reg_off);
 	reg_v = *((volatile uint32_t *)reg_addr);
 	return rte_le_to_cpu_32(reg_v);
 }
@@ -516,11 +529,24 @@ port_pci_reg_read(struct rte_port *port, uint32_t reg_off)
 static inline void
 port_pci_reg_write(struct rte_port *port, uint32_t reg_off, uint32_t reg_v)
 {
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 	void *reg_addr;
 
-	reg_addr = (void *)
-		((char *)port->dev_info.pci_dev->mem_resource[0].addr +
-			reg_off);
+	if (!port->dev_info.device) {
+		printf("Invalid device\n");
+		return;
+	}
+
+	bus = rte_bus_find_by_device(port->dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(port->dev_info.device);
+	} else {
+		printf("Not a PCI device\n");
+		return;
+	}
+
+	reg_addr = ((char *)pci_dev->mem_resource[0].addr + reg_off);
 	*((volatile uint32_t *)reg_addr) = rte_cpu_to_le_32(reg_v);
 }
 
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 0f3d00972..4b6bc9bf5 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -88,6 +88,9 @@ API Changes
   memory footprint which helps in better cache utilization when large number
   of meter objects are used.
 
+* ethdev, in struct ``struct rte_eth_dev_info``, field ``rte_pci_device *pci_dev``
+  replaced with field ``struct rte_device *device``.
+
 
 ABI Changes
 -----------
diff --git a/drivers/net/ark/ark_ethdev.c b/drivers/net/ark/ark_ethdev.c
index ff87c20e2..c9d541921 100644
--- a/drivers/net/ark/ark_ethdev.c
+++ b/drivers/net/ark/ark_ethdev.c
@@ -771,7 +771,6 @@ eth_ark_dev_info_get(struct rte_eth_dev *dev,
 				ETH_LINK_SPEED_40G |
 				ETH_LINK_SPEED_50G |
 				ETH_LINK_SPEED_100G);
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 }
 
 static int
diff --git a/drivers/net/avf/avf_ethdev.c b/drivers/net/avf/avf_ethdev.c
index b59e3cf79..8e2a1b066 100644
--- a/drivers/net/avf/avf_ethdev.c
+++ b/drivers/net/avf/avf_ethdev.c
@@ -507,7 +507,6 @@ avf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct avf_info *vf = AVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
 
 	memset(dev_info, 0, sizeof(*dev_info));
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->max_tx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->min_rx_bufsize = AVF_BUF_SIZE_MIN;
diff --git a/drivers/net/avp/avp_ethdev.c b/drivers/net/avp/avp_ethdev.c
index a07a288ed..5b3c4cebf 100644
--- a/drivers/net/avp/avp_ethdev.c
+++ b/drivers/net/avp/avp_ethdev.c
@@ -2172,7 +2172,6 @@ avp_dev_info_get(struct rte_eth_dev *eth_dev,
 {
 	struct avp_dev *avp = AVP_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 	dev_info->max_rx_queues = avp->max_rx_queues;
 	dev_info->max_tx_queues = avp->max_tx_queues;
 	dev_info->min_rx_bufsize = AVP_MIN_RX_BUFSIZE;
diff --git a/drivers/net/axgbe/axgbe_ethdev.c b/drivers/net/axgbe/axgbe_ethdev.c
index 07c1337ac..2a68ccbf5 100644
--- a/drivers/net/axgbe/axgbe_ethdev.c
+++ b/drivers/net/axgbe/axgbe_ethdev.c
@@ -349,12 +349,10 @@ axgbe_dev_stats_reset(struct rte_eth_dev *dev)
 }
 
 static void
-axgbe_dev_info_get(struct rte_eth_dev *dev,
-		   struct rte_eth_dev_info *dev_info)
+axgbe_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct axgbe_port *pdata = dev->data->dev_private;
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = pdata->tx_ring_count;
 	dev_info->max_tx_queues = pdata->rx_ring_count;
 	dev_info->min_rx_bufsize = AXGBE_RX_MIN_BUF_SIZE;
diff --git a/drivers/net/bnx2x/bnx2x_ethdev.c b/drivers/net/bnx2x/bnx2x_ethdev.c
index 483d5a17c..8726b357a 100644
--- a/drivers/net/bnx2x/bnx2x_ethdev.c
+++ b/drivers/net/bnx2x/bnx2x_ethdev.c
@@ -447,7 +447,6 @@ static void
 bnx2x_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct bnx2x_softc *sc = dev->data->dev_private;
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues  = sc->max_rx_queues;
 	dev_info->max_tx_queues  = sc->max_tx_queues;
 	dev_info->min_rx_bufsize = BNX2X_MIN_RX_BUF_SIZE;
diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 7c007c8f9..c447cd727 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -379,8 +379,6 @@ static void bnxt_dev_info_get_op(struct rte_eth_dev *eth_dev,
 	uint16_t max_vnics, i, j, vpool, vrxq;
 	unsigned int max_rx_rings;
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
-
 	/* MAC Specifics */
 	dev_info->max_mac_addrs = bp->max_l2_ctx;
 	dev_info->max_hash_mac_addrs = 0;
diff --git a/drivers/net/cxgbe/cxgbe_ethdev.c b/drivers/net/cxgbe/cxgbe_ethdev.c
index 581a1f33a..24c9a9323 100644
--- a/drivers/net/cxgbe/cxgbe_ethdev.c
+++ b/drivers/net/cxgbe/cxgbe_ethdev.c
@@ -134,8 +134,6 @@ void cxgbe_dev_info_get(struct rte_eth_dev *eth_dev,
 		.nb_align = 1,
 	};
 
-	device_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
-
 	device_info->min_rx_bufsize = CXGBE_MIN_RX_BUFSIZE;
 	device_info->max_rx_pktlen = CXGBE_MAX_RX_PKTLEN;
 	device_info->max_rx_queues = max_queues;
diff --git a/drivers/net/e1000/em_ethdev.c b/drivers/net/e1000/em_ethdev.c
index 087c192d5..c6062468c 100644
--- a/drivers/net/e1000/em_ethdev.c
+++ b/drivers/net/e1000/em_ethdev.c
@@ -1070,7 +1070,6 @@ eth_em_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->min_rx_bufsize = 256; /* See BSIZE field of RCTL register. */
 	dev_info->max_rx_pktlen = em_get_max_pktlen(dev);
 	dev_info->max_mac_addrs = hw->mac.rar_entry_count;
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 8d4226676..872357146 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -2144,7 +2144,6 @@ eth_igb_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->min_rx_bufsize = 256; /* See BSIZE field of RCTL register. */
 	dev_info->max_rx_pktlen  = 0x3FFF; /* See RLPML register. */
 	dev_info->max_mac_addrs = hw->mac.rar_entry_count;
@@ -2269,7 +2268,6 @@ eth_igbvf_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->min_rx_bufsize = 256; /* See BSIZE field of RCTL register. */
 	dev_info->max_rx_pktlen  = 0x3FFF; /* See RLPML register. */
 	dev_info->max_mac_addrs = hw->mac.rar_entry_count;
diff --git a/drivers/net/ena/ena_ethdev.c b/drivers/net/ena/ena_ethdev.c
index 34b2a8d78..a15436c99 100644
--- a/drivers/net/ena/ena_ethdev.c
+++ b/drivers/net/ena/ena_ethdev.c
@@ -1527,8 +1527,6 @@ static void ena_infos_get(struct rte_eth_dev *dev,
 	ena_dev = &adapter->ena_dev;
 	ena_assert_msg(ena_dev != NULL, "Uninitialized device");
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
 	dev_info->speed_capa =
 			ETH_LINK_SPEED_1G   |
 			ETH_LINK_SPEED_2_5G |
diff --git a/drivers/net/enic/enic_ethdev.c b/drivers/net/enic/enic_ethdev.c
index 03f0c2547..801f4704c 100644
--- a/drivers/net/enic/enic_ethdev.c
+++ b/drivers/net/enic/enic_ethdev.c
@@ -471,7 +471,6 @@ static void enicpmd_dev_info_get(struct rte_eth_dev *eth_dev,
 	struct enic *enic = pmd_priv(eth_dev);
 
 	ENICPMD_FUNC_TRACE();
-	device_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 	/* Scattered Rx uses two receive queues per rx queue exposed to dpdk */
 	device_info->max_rx_queues = enic->conf_rq_count / 2;
 	device_info->max_tx_queues = enic->conf_wq_count;
diff --git a/drivers/net/fm10k/fm10k_ethdev.c b/drivers/net/fm10k/fm10k_ethdev.c
index 61de4d772..34affd1cc 100644
--- a/drivers/net/fm10k/fm10k_ethdev.c
+++ b/drivers/net/fm10k/fm10k_ethdev.c
@@ -1404,7 +1404,6 @@ fm10k_dev_infos_get(struct rte_eth_dev *dev,
 
 	PMD_INIT_FUNC_TRACE();
 
-	dev_info->pci_dev            = pdev;
 	dev_info->min_rx_bufsize     = FM10K_MIN_RX_BUF_SIZE;
 	dev_info->max_rx_pktlen      = FM10K_MAX_PKT_SIZE;
 	dev_info->max_rx_queues      = hw->mac.max_queues;
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 6e06f8a2b..6a8a2cd2a 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -3212,7 +3212,6 @@ i40e_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct i40e_vsi *vsi = pf->main_vsi;
 	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 
-	dev_info->pci_dev = pci_dev;
 	dev_info->max_rx_queues = vsi->nb_qps;
 	dev_info->max_tx_queues = vsi->nb_qps;
 	dev_info->min_rx_bufsize = I40E_BUF_SIZE_MIN;
diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c
index 2908c87e0..f6d7f40b1 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2183,7 +2183,6 @@ i40evf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
 
 	memset(dev_info, 0, sizeof(*dev_info));
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->max_tx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->min_rx_bufsize = I40E_BUF_SIZE_MIN;
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index fbc048f7d..bd1773978 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -3593,7 +3593,6 @@ ixgbe_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
 
-	dev_info->pci_dev = pci_dev;
 	dev_info->max_rx_queues = (uint16_t)hw->mac.max_rx_queues;
 	dev_info->max_tx_queues = (uint16_t)hw->mac.max_tx_queues;
 	if (RTE_ETH_DEV_SRIOV(dev).active == 0) {
@@ -3712,7 +3711,6 @@ ixgbevf_dev_info_get(struct rte_eth_dev *dev,
 	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = pci_dev;
 	dev_info->max_rx_queues = (uint16_t)hw->mac.max_rx_queues;
 	dev_info->max_tx_queues = (uint16_t)hw->mac.max_tx_queues;
 	dev_info->min_rx_bufsize = 1024; /* cf BSIZEPACKET in SRRCTL reg */
diff --git a/drivers/net/kni/rte_eth_kni.c b/drivers/net/kni/rte_eth_kni.c
index dc4e65f5d..c10e970c2 100644
--- a/drivers/net/kni/rte_eth_kni.c
+++ b/drivers/net/kni/rte_eth_kni.c
@@ -201,7 +201,6 @@ eth_kni_dev_info(struct rte_eth_dev *dev __rte_unused,
 	dev_info->max_rx_queues = KNI_MAX_QUEUE_PER_PORT;
 	dev_info->max_tx_queues = KNI_MAX_QUEUE_PER_PORT;
 	dev_info->min_rx_bufsize = 0;
-	dev_info->pci_dev = NULL;
 }
 
 static int
diff --git a/drivers/net/liquidio/lio_ethdev.c b/drivers/net/liquidio/lio_ethdev.c
index eeb8350e4..a13a566f9 100644
--- a/drivers/net/liquidio/lio_ethdev.c
+++ b/drivers/net/liquidio/lio_ethdev.c
@@ -373,8 +373,6 @@ lio_dev_info_get(struct rte_eth_dev *eth_dev,
 	struct lio_device *lio_dev = LIO_DEV(eth_dev);
 	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 
-	devinfo->pci_dev = pci_dev;
-
 	switch (pci_dev->id.subsystem_device_id) {
 	/* CN23xx 10G cards */
 	case PCI_SUBSYS_DEV_ID_CN2350_210:
diff --git a/drivers/net/mlx4/mlx4_ethdev.c b/drivers/net/mlx4/mlx4_ethdev.c
index 5f731e023..636100b23 100644
--- a/drivers/net/mlx4/mlx4_ethdev.c
+++ b/drivers/net/mlx4/mlx4_ethdev.c
@@ -556,7 +556,6 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	unsigned int max;
 	char ifname[IF_NAMESIZE];
 
-	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	/* FIXME: we should ask the device for these values. */
 	info->min_rx_bufsize = 32;
 	info->max_rx_pktlen = 65536;
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index cc85f76c0..44cdbb622 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -405,7 +405,6 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	unsigned int max;
 	char ifname[IF_NAMESIZE];
 
-	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	/* FIXME: we should ask the device for these values. */
 	info->min_rx_bufsize = 32;
 	info->max_rx_pktlen = 65536;
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index 606cd3dc2..e030bbf9f 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -1253,7 +1253,6 @@ nfp_net_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = (uint16_t)hw->max_rx_queues;
 	dev_info->max_tx_queues = (uint16_t)hw->max_tx_queues;
 	dev_info->min_rx_bufsize = ETHER_MIN_MTU;
diff --git a/drivers/net/octeontx/octeontx_ethdev.c b/drivers/net/octeontx/octeontx_ethdev.c
index 1406e4e19..f829e0ca9 100644
--- a/drivers/net/octeontx/octeontx_ethdev.c
+++ b/drivers/net/octeontx/octeontx_ethdev.c
@@ -616,7 +616,6 @@ octeontx_dev_info(struct rte_eth_dev *dev,
 	dev_info->max_rx_queues = 1;
 	dev_info->max_tx_queues = PKO_MAX_NUM_DQ;
 	dev_info->min_rx_bufsize = 0;
-	dev_info->pci_dev = NULL;
 
 	dev_info->default_rxconf = (struct rte_eth_rxconf) {
 		.rx_free_thresh = 0,
diff --git a/drivers/net/qede/qede_ethdev.c b/drivers/net/qede/qede_ethdev.c
index a4e9e753e..13c2a3b87 100644
--- a/drivers/net/qede/qede_ethdev.c
+++ b/drivers/net/qede/qede_ethdev.c
@@ -1549,7 +1549,6 @@ qede_dev_info_get(struct rte_eth_dev *eth_dev,
 
 	PMD_INIT_FUNC_TRACE(edev);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 	dev_info->min_rx_bufsize = (uint32_t)QEDE_MIN_RX_BUFF_SIZE;
 	dev_info->max_rx_pktlen = (uint32_t)ETH_TX_MAX_NON_LSO_PKT_LEN;
 	dev_info->rx_desc_lim = qede_rx_desc_lim;
diff --git a/drivers/net/sfc/sfc_ethdev.c b/drivers/net/sfc/sfc_ethdev.c
index 2af898e08..6631c5a7e 100644
--- a/drivers/net/sfc/sfc_ethdev.c
+++ b/drivers/net/sfc/sfc_ethdev.c
@@ -89,7 +89,6 @@ sfc_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	sfc_log_init(sa, "entry");
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_pktlen = EFX_MAC_PDU_MAX;
 
 	/* Autonegotiation may be disabled */
diff --git a/drivers/net/szedata2/rte_eth_szedata2.c b/drivers/net/szedata2/rte_eth_szedata2.c
index fb9aac04b..41a6fb427 100644
--- a/drivers/net/szedata2/rte_eth_szedata2.c
+++ b/drivers/net/szedata2/rte_eth_szedata2.c
@@ -1015,7 +1015,6 @@ eth_dev_info(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *internals = dev->data->dev_private;
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->if_index = 0;
 	dev_info->max_mac_addrs = 1;
 	dev_info->max_rx_pktlen = (uint32_t)-1;
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 61d646558..54c7c2b0f 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -760,7 +760,6 @@ tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
 	dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
 	dev_info->min_rx_bufsize = 0;
-	dev_info->pci_dev = NULL;
 	dev_info->speed_capa = tap_dev_speed_capa();
 	dev_info->rx_queue_offload_capa = tap_rx_offload_get_queue_capa();
 	dev_info->rx_offload_capa = tap_rx_offload_get_port_capa() |
diff --git a/drivers/net/thunderx/nicvf_ethdev.c b/drivers/net/thunderx/nicvf_ethdev.c
index 067f2243b..75e9d16c5 100644
--- a/drivers/net/thunderx/nicvf_ethdev.c
+++ b/drivers/net/thunderx/nicvf_ethdev.c
@@ -1400,8 +1400,6 @@ nicvf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	PMD_INIT_FUNC_TRACE();
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
 	/* Autonegotiation may be disabled */
 	dev_info->speed_capa = ETH_LINK_SPEED_FIXED;
 	dev_info->speed_capa |= ETH_LINK_SPEED_10M | ETH_LINK_SPEED_100M |
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 11f758929..d7c81747e 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -2064,7 +2064,6 @@ virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	dev_info->speed_capa = ETH_LINK_SPEED_10G; /* fake value */
 
-	dev_info->pci_dev = dev->device ? RTE_ETH_DEV_TO_PCI(dev) : NULL;
 	dev_info->max_rx_queues =
 		RTE_MIN(hw->max_queue_pairs, VIRTIO_MAX_RX_QUEUES);
 	dev_info->max_tx_queues =
diff --git a/drivers/net/vmxnet3/vmxnet3_ethdev.c b/drivers/net/vmxnet3/vmxnet3_ethdev.c
index 426008722..01b4802e0 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethdev.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethdev.c
@@ -1022,11 +1022,9 @@ vmxnet3_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 }
 
 static void
-vmxnet3_dev_info_get(struct rte_eth_dev *dev,
+vmxnet3_dev_info_get(struct rte_eth_dev *dev __rte_unused,
 		     struct rte_eth_dev_info *dev_info)
 {
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
 	dev_info->max_rx_queues = VMXNET3_MAX_RX_QUEUES;
 	dev_info->max_tx_queues = VMXNET3_MAX_TX_QUEUES;
 	dev_info->min_rx_bufsize = 1518 + RTE_PKTMBUF_HEADROOM;
diff --git a/examples/ethtool/lib/rte_ethtool.c b/examples/ethtool/lib/rte_ethtool.c
index 90dfbb739..d519a50db 100644
--- a/examples/ethtool/lib/rte_ethtool.c
+++ b/examples/ethtool/lib/rte_ethtool.c
@@ -22,6 +22,8 @@ rte_ethtool_get_drvinfo(uint16_t port_id, struct ethtool_drvinfo *drvinfo)
 {
 	struct rte_eth_dev_info dev_info;
 	struct rte_dev_reg_info reg_info;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus = NULL;
 	int n;
 	int ret;
 
@@ -46,15 +48,17 @@ rte_ethtool_get_drvinfo(uint16_t port_id, struct ethtool_drvinfo *drvinfo)
 	snprintf(drvinfo->version, sizeof(drvinfo->version), "%s",
 		rte_version());
 	/* TODO: replace bus_info by rte_devargs.name */
-	if (dev_info.pci_dev)
+	if (dev_info.device)
+		bus = rte_bus_find_by_device(dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
 		snprintf(drvinfo->bus_info, sizeof(drvinfo->bus_info),
 			"%04x:%02x:%02x.%x",
-			dev_info.pci_dev->addr.domain,
-			dev_info.pci_dev->addr.bus,
-			dev_info.pci_dev->addr.devid,
-			dev_info.pci_dev->addr.function);
-	else
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function);
+	} else {
 		snprintf(drvinfo->bus_info, sizeof(drvinfo->bus_info), "N/A");
+	}
 
 	memset(&reg_info, 0, sizeof(reg_info));
 	rte_eth_dev_get_reg_info(port_id, &reg_info);
diff --git a/examples/ip_pipeline/kni.c b/examples/ip_pipeline/kni.c
index ebc8c7904..712775338 100644
--- a/examples/ip_pipeline/kni.c
+++ b/examples/ip_pipeline/kni.c
@@ -106,6 +106,8 @@ kni_create(const char *name, struct kni_params *params)
 	struct mempool *mempool;
 	struct link *link;
 	struct rte_kni *k;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus = NULL;
 
 	/* Check input params */
 	if ((name == NULL) ||
@@ -128,8 +130,13 @@ kni_create(const char *name, struct kni_params *params)
 	kni_conf.core_id = params->thread_id;
 	kni_conf.group_id = link->port_id;
 	kni_conf.mbuf_size = mempool->buffer_size;
-	kni_conf.addr = dev_info.pci_dev->addr;
-	kni_conf.id = dev_info.pci_dev->id;
+	if (dev_info.device)
+		bus = rte_bus_find_by_device(dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
+		kni_conf.addr = pci_dev->addr;
+		kni_conf.id = pci_dev->id;
+	}
 
 	memset(&kni_ops, 0, sizeof(kni_ops));
 	kni_ops.port_id = link->port_id;
diff --git a/examples/kni/main.c b/examples/kni/main.c
index 0d9980ee1..aebfedd59 100644
--- a/examples/kni/main.c
+++ b/examples/kni/main.c
@@ -834,13 +834,18 @@ kni_alloc(uint16_t port_id)
 		if (i == 0) {
 			struct rte_kni_ops ops;
 			struct rte_eth_dev_info dev_info;
+			const struct rte_pci_device *pci_dev;
+			const struct rte_bus *bus = NULL;
 
 			memset(&dev_info, 0, sizeof(dev_info));
 			rte_eth_dev_info_get(port_id, &dev_info);
 
-			if (dev_info.pci_dev) {
-				conf.addr = dev_info.pci_dev->addr;
-				conf.id = dev_info.pci_dev->id;
+			if (dev_info.device)
+				bus = rte_bus_find_by_device(dev_info.device);
+			if (bus && !strcmp(bus->name, "pci")) {
+				pci_dev = RTE_DEV_TO_PCI(dev_info.device);
+				conf.addr = pci_dev->addr;
+				conf.id = pci_dev->id;
 			}
 			/* Get the interface default mac address */
 			rte_eth_macaddr_get(port_id,
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e04..90c47ad12 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -2395,6 +2395,7 @@ rte_eth_dev_info_get(uint16_t port_id, struct rte_eth_dev_info *dev_info)
 	memset(dev_info, 0, sizeof(struct rte_eth_dev_info));
 	dev_info->rx_desc_lim = lim;
 	dev_info->tx_desc_lim = lim;
+	dev_info->device = dev->device;
 
 	RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_infos_get);
 	(*dev->dev_ops->dev_infos_get)(dev, dev_info);
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 5e13dca6a..784c6faa4 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -992,7 +992,7 @@ struct rte_pci_device;
  * Ethernet device information
  */
 struct rte_eth_dev_info {
-	struct rte_pci_device *pci_dev; /**< Device PCI information. */
+	struct rte_device *device; /** Generic device information */
 	const char *driver_name; /**< Device Driver name. */
 	unsigned int if_index; /**< Index to bound host interface, or 0 if none.
 		Use if_indextoname() to translate into an interface name. */
diff --git a/test/test/test_kni.c b/test/test/test_kni.c
index e4839cdb7..3d1be56a9 100644
--- a/test/test/test_kni.c
+++ b/test/test/test_kni.c
@@ -357,6 +357,8 @@ test_kni_processing(uint16_t port_id, struct rte_mempool *mp)
 	struct rte_kni_conf conf;
 	struct rte_eth_dev_info info;
 	struct rte_kni_ops ops;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus = NULL;
 
 	if (!mp)
 		return -1;
@@ -366,8 +368,13 @@ test_kni_processing(uint16_t port_id, struct rte_mempool *mp)
 	memset(&ops, 0, sizeof(ops));
 
 	rte_eth_dev_info_get(port_id, &info);
-	conf.addr = info.pci_dev->addr;
-	conf.id = info.pci_dev->id;
+	if (info.device)
+		bus = rte_bus_find_by_device(info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(info.device);
+		conf.addr = pci_dev->addr;
+		conf.id = pci_dev->id;
+	}
 	snprintf(conf.name, sizeof(conf.name), TEST_KNI_PORT);
 
 	/* core id 1 configured for kernel thread */
@@ -465,6 +472,8 @@ test_kni(void)
 	struct rte_kni_conf conf;
 	struct rte_eth_dev_info info;
 	struct rte_kni_ops ops;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 
 	/* Initialize KNI subsytem */
 	rte_kni_init(KNI_TEST_MAX_PORTS);
@@ -523,8 +532,15 @@ test_kni(void)
 	memset(&conf, 0, sizeof(conf));
 	memset(&ops, 0, sizeof(ops));
 	rte_eth_dev_info_get(port_id, &info);
-	conf.addr = info.pci_dev->addr;
-	conf.id = info.pci_dev->id;
+	if (info.device)
+		bus = rte_bus_find_by_device(info.device);
+	else
+		bus = NULL;
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(info.device);
+		conf.addr = pci_dev->addr;
+		conf.id = pci_dev->id;
+	}
 	conf.group_id = port_id;
 	conf.mbuf_size = MAX_PACKET_SZ;
 
@@ -552,8 +568,15 @@ test_kni(void)
 	memset(&info, 0, sizeof(info));
 	memset(&ops, 0, sizeof(ops));
 	rte_eth_dev_info_get(port_id, &info);
-	conf.addr = info.pci_dev->addr;
-	conf.id = info.pci_dev->id;
+	if (info.device)
+		bus = rte_bus_find_by_device(info.device);
+	else
+		bus = NULL;
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(info.device);
+		conf.addr = pci_dev->addr;
+		conf.id = pci_dev->id;
+	}
 	conf.group_id = port_id;
 	conf.mbuf_size = MAX_PACKET_SZ;
 
-- 
2.14.3

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v2 14/15] ethdev: add physical port action to flow API
  @ 2018-04-07  9:51  0%     ` Andrew Rybchenko
  2018-04-09 15:00  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:51 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> This patch adds the missing action counterpart to the PHY_PORT pattern
> item, that is, the ability to directly inject matching traffic into a
> physical port of the underlying device.

Does it mean that if it is applied on ingress (incoming packet from network)
it will simply send packets back to network (specified physical port)?
And if it is applied on egress (outgoing from device to network) it will
be directed to possibly different physical port and sent to network.

> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
>   app/test-pmd/config.c                       |  1 +
>   doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
>   lib/librte_ether/rte_flow.c                 |  1 +
>   lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
>   6 files changed, 84 insertions(+)

<...>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 12/15] ethdev: update behavior of VF/PF in flow API
  @ 2018-04-07  9:41  0%     ` Andrew Rybchenko
  2018-04-09 14:49  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:41 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Somnath Kotur, Beilei Xing, Qi Zhang

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> Contrary to all other pattern items, these are inconsistently documented as
> affecting traffic instead of simply matching its origin, without provision
> for the latter.
>
> This commit clarifies documentation and updates PMDs since the original
> behavior now has to be explicitly requested using the new transfer
> attribute.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_create()
> - rte_flow_validate()
>
> Impacted PMDs are bnxt and i40e, for which the VF pattern item is now only
> supported when a transfer attribute is also present.
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> Cc: Somnath Kotur <somnath.kotur@broadcom.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 12 +++---
>   doc/guides/prog_guide/rte_flow.rst          | 36 +++++++++---------
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst | 12 +++---
>   drivers/net/bnxt/bnxt_filter.c              | 22 ++++++-----
>   drivers/net/i40e/i40e_flow.c                | 23 +++++++-----
>   lib/librte_ether/rte_flow.h                 | 47 ++++++++++--------------
>   6 files changed, 77 insertions(+), 75 deletions(-)

<...>

> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> index 735ce6323..beedc713b 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -518,15 +518,12 @@ Usage example, matching non-TCPv4 packets only:
>   Item: ``PF``
>   ^^^^^^^^^^^^
>   
> -Matches packets addressed to the physical function of the device.
> +Matches traffic originating from (ingress) or going to (egress) the physical
> +function of the current device.

Not sure that I understand above. It looks like ingress and egress are 
misplaced.
There many similar cases below.

<...>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 04/15] ethdev: remove DUP action from flow API
  @ 2018-04-07  9:23  0%     ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:23 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> Upcoming changes in relation to the handling of actions list will make the
> DUP action redundant as specifying several QUEUE actions will achieve the
> same behavior. Besides, no PMD implements this action.
>
> By removing an entry from enum rte_flow_action_type, this patch breaks ABI
> compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 23 -----------------------
>   app/test-pmd/config.c                       |  1 -
>   doc/guides/prog_guide/rte_flow.rst          | 23 -----------------------
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  8 --------
>   lib/librte_ether/rte_ethdev_version.map     |  2 +-
>   lib/librte_ether/rte_flow.c                 |  1 -
>   lib/librte_ether/rte_flow.h                 | 24 ------------------------
>   7 files changed, 1 insertion(+), 81 deletions(-)

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API
  2018-04-07  9:15  0%     ` Andrew Rybchenko
@ 2018-04-07  9:18  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:18 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev

On 04/07/2018 12:15 PM, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
>> These enable more precise reporting of objects responsible for errors.
>>
>> This breaks ABI compatibility for the following public functions:
>>
>> - rte_flow_create()
>> - rte_flow_destroy()
>> - rte_flow_error_set()
>> - rte_flow_flush()
>> - rte_flow_isolate()
>> - rte_flow_query()
>> - rte_flow_validate()
>>
>> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
>> ---
>>   app/test-pmd/config.c                   |  4 ++++
>>   lib/librte_ether/rte_ethdev_version.map | 20 +++++++++++++-------
>>   lib/librte_ether/rte_flow.h             |  4 ++++
>>   3 files changed, 21 insertions(+), 7 deletions(-)
>
> I think PMD maintainers with flow API support should be additionally
> notified and encouraged to refine error reporting.

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API
  @ 2018-04-07  9:15  0%     ` Andrew Rybchenko
  2018-04-07  9:18  0%       ` Andrew Rybchenko
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:15 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> These enable more precise reporting of objects responsible for errors.
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_create()
> - rte_flow_destroy()
> - rte_flow_error_set()
> - rte_flow_flush()
> - rte_flow_isolate()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>   app/test-pmd/config.c                   |  4 ++++
>   lib/librte_ether/rte_ethdev_version.map | 20 +++++++++++++-------
>   lib/librte_ether/rte_flow.h             |  4 ++++
>   3 files changed, 21 insertions(+), 7 deletions(-)

I think PMD maintainers with flow API support should be additionally
notified and encouraged to refine error reporting.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in flow API
  @ 2018-04-07  9:05  0%     ` Andrew Rybchenko
  2018-04-09 14:42  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:05 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh, Pascal Mazon,
	Radu Nicolau, Akhil Goyal, Ivan Malov

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> Since its inception, the rte_flow RSS action has been relying in part on
> external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
> This structure lacks parameters such as the hash algorithm to use, and more
> recently, a method to tell which layer RSS should be performed on [1].
>
> Given struct rte_eth_rss_conf will never be flexible enough to represent a
> complete RSS configuration (e.g. RETA table), this patch supersedes it by
> extending the rte_flow RSS action directly.
>
> A subsequent patch will add a field to use a non-default RSS hash
> algorithm. To that end, a field named "types" replaces the field formerly
> known as "rss_hf" and standing for "RSS hash functions" as it was
> confusing. Actual RSS hash function types are defined by enum
> rte_eth_hash_function.
> This patch updates all PMDs and example applications accordingly.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> [1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
>      configuration")
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Xueming Li <xuemingl@mellanox.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
> Cc: Radu Nicolau <radu.nicolau@intel.com>
> Cc: Akhil Goyal <akhil.goyal@nxp.com>
> ---
>   app/test-pmd/cmdline_flow.c        |  59 +++++-----
>   app/test-pmd/config.c              |  39 +++----
>   doc/guides/prog_guide/rte_flow.rst |  22 ++--
>   drivers/net/e1000/e1000_ethdev.h   |  13 ++-
>   drivers/net/e1000/igb_ethdev.c     |   4 +-
>   drivers/net/e1000/igb_flow.c       |  31 ++---
>   drivers/net/e1000/igb_rxtx.c       |  51 +++++++--
>   drivers/net/i40e/i40e_ethdev.c     |  53 +++++++--
>   drivers/net/i40e/i40e_ethdev.h     |  15 ++-
>   drivers/net/i40e/i40e_flow.c       |  47 ++++----
>   drivers/net/ixgbe/ixgbe_ethdev.c   |   4 +-
>   drivers/net/ixgbe/ixgbe_ethdev.h   |  13 ++-
>   drivers/net/ixgbe/ixgbe_flow.c     |  30 ++---
>   drivers/net/ixgbe/ixgbe_rxtx.c     |  51 +++++++--
>   drivers/net/mlx4/mlx4.c            |   2 +-
>   drivers/net/mlx4/mlx4_flow.c       |  61 +++++-----
>   drivers/net/mlx4/mlx4_flow.h       |   2 +-
>   drivers/net/mlx4/mlx4_rxq.c        |   2 +-
>   drivers/net/mlx4/mlx4_rxtx.h       |   2 +-
>   drivers/net/mlx5/mlx5_flow.c       | 193 +++++++++++++++-----------------
>   drivers/net/mlx5/mlx5_rxq.c        |  22 ++--
>   drivers/net/mlx5/mlx5_rxtx.h       |  26 +++--
>   drivers/net/sfc/sfc_flow.c         |  21 ++--
>   drivers/net/tap/tap_flow.c         |   8 +-
>   examples/ipsec-secgw/ipsec.c       |  10 +-
>   lib/librte_ether/rte_flow.c        |  39 +++----
>   lib/librte_ether/rte_flow.h        |   6 +-
>   27 files changed, 473 insertions(+), 353 deletions(-)

<...>

> diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> index 056405515..1a2c0299c 100644
> --- a/drivers/net/sfc/sfc_flow.c
> +++ b/drivers/net/sfc/sfc_flow.c
> @@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>   	struct sfc_rxq *rxq;
>   	unsigned int rxq_hw_index_min;
>   	unsigned int rxq_hw_index_max;
> -	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
> -	uint64_t rss_hf;
> -	uint8_t *rss_key = NULL;
> +	const uint8_t *rss_key;
>   	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
>   	unsigned int i;
>   
> -	if (rss->num == 0)
> +	if (rss->queue_num == 0)
>   		return -EINVAL;
>   
>   	rxq_sw_index = sa->rxq_count - 1;
> @@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>   	rxq_hw_index_min = rxq->hw_index;
>   	rxq_hw_index_max = 0;
>   
> -	for (i = 0; i < rss->num; ++i) {
> +	for (i = 0; i < rss->queue_num; ++i) {
>   		rxq_sw_index = rss->queue[i];
>   
>   		if (rxq_sw_index >= sa->rxq_count)
> @@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>   			rxq_hw_index_max = rxq->hw_index;
>   	}
>   
> -	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;

Here we had a fallback to default rss_hf (now types) if rss_conf is 
unspecified.

> -	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
> +	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
>   		return -EINVAL;
>   
> -	if (rss_conf != NULL) {
> -		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
> +	if (rss->key_len) {
> +		if (rss->key_len != sizeof(sa->rss_key))
>   			return -EINVAL;
>   
> -		rss_key = rss_conf->rss_key;
> +		rss_key = rss->key;
>   	} else {
>   		rss_key = sa->rss_key;
>   	}
> @@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>   
>   	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
>   	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
> -	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
> +	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);

Now types go directly to mapping function and unspecified types (0)
will result in 0 rss_hash_types. Of course, it is a question how to treat
types==0. It is possible to say that it no RSS, but it does not make sense.
So, real options are device defaults (regardless configured on device level)
or device config (rx_adv.conf.rss_conf.rss_hf). I would prefer the later.
Please, document the intended behaviour in rte_flow.rst.

If the later is chosen, above we'll have a bug since fallback to fixed 
default.
Just use sa->rss_hash_types as fallback. Something like:
if (rss->types)
     sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
else
     sfc_rss_conf->rss_hash_types =sa->rss_hash_types;

>   	rte_memcpy(sfc_rss_conf->rss_key, rss_key, sizeof(sa->rss_key));
>   
>   	for (i = 0; i < RTE_DIM(sfc_rss_conf->rss_tbl); ++i) {
> -		unsigned int rxq_sw_index = rss->queue[i % rss->num];
> +		unsigned int rxq_sw_index = rss->queue[i % rss->queue_num];
>   		struct sfc_rxq *rxq = sa->rxq_info[rxq_sw_index].rxq;
>   
>   		sfc_rss_conf->rss_tbl[i] = rxq->hw_index - rxq_hw_index_min;

<...>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 09/15] ethdev: add encap level to RSS flow API action
  @ 2018-04-07  8:27  0%     ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-07  8:27 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh, Pascal Mazon

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> RSS hash types (ETH_RSS_* macros defined in rte_ethdev.h) describe the
> protocol header fields of a packet that must be taken into account while
> computing RSS.
>
> When facing encapsulated (e.g. tunneled) packets, there is an ambiguity as
> to whether these should apply to inner or outer packets. Applications need
> the ability to tell exactly "where" RSS must be performed.
>
> This is addressed by adding encapsulation level information to the RSS flow
> action. Its default value is 0 and stands for the usual unspecified
> behavior. Other values provide a specific encapsulation level.
>
> Contrary to the change announced by commit 676b605182a5 ("doc: announce
> ethdev API change for RSS configuration"), this patch does not affect
> struct rte_eth_rss_conf but struct rte_flow_action_rss as the former is not
> used anymore by the RSS flow action. ABI impact is therefore limited to
> rte_flow.
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Xueming Li <xuemingl@mellanox.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 13 ++++++++++++
>   app/test-pmd/config.c                       |  1 +
>   doc/guides/prog_guide/rte_flow.rst          | 24 ++++++++++++++++++++++
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 ++
>   drivers/net/e1000/igb_flow.c                |  4 ++++
>   drivers/net/e1000/igb_rxtx.c                |  2 ++
>   drivers/net/i40e/i40e_ethdev.c              |  2 ++
>   drivers/net/i40e/i40e_flow.c                |  4 ++++
>   drivers/net/ixgbe/ixgbe_flow.c              |  4 ++++
>   drivers/net/ixgbe/ixgbe_rxtx.c              |  2 ++
>   drivers/net/mlx4/mlx4_flow.c                |  6 ++++++
>   drivers/net/mlx5/mlx5_flow.c                | 11 ++++++++++
>   drivers/net/sfc/sfc_flow.c                  |  3 +++
>   drivers/net/tap/tap_flow.c                  |  6 +++++-
>   lib/librte_ether/rte_flow.c                 |  1 +
>   lib/librte_ether/rte_flow.h                 | 26 ++++++++++++++++++++++++
>   16 files changed, 110 insertions(+), 1 deletion(-)

Generic and sfc parts
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 3/4] ethdev: Add group action type to rte_flow
  @ 2018-04-06 20:26  3%   ` Adrien Mazarguil
  2018-04-17 14:40  0%     ` Doherty, Declan
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 20:26 UTC (permalink / raw)
  To: Declan Doherty; +Cc: dev

On Fri, Apr 06, 2018 at 01:24:02PM +0100, Declan Doherty wrote:
> Add group action type which defines a terminating action which
> allows a matched flow to be redirect to a group. This allows logical
> flow table hierarchies to be managed through rte_flow.
> 
> Signed-off-by: Declan Doherty <declan.doherty@intel.com>

OK, I'm wondering if perhaps with the addition of this action, we should
redefine groups as unlinked by default?

Currently traffic enters through the flow rule with the lowest priority of
the group with the lowest ID and iterates through subsequent flow rules and
groups until matched by a flow rule without PASSTHRU (according to latest
definition [1]).

This would make jumps between groups always explicit, not necessarily a bad
idea given no PMD implements groups as of yet. Thoughts?

Also as a rather fundamental API addition, I suggest to add it after
RTE_FLOW_ACTION_TYPE_PASSTHRU. It's OK because ABI is already broken. You
just need to mention it in the commit log [1].

Another suggestion would be to rename it "JUMP" (reasons below).

[1] "ethdev: alter behavior of flow API actions"
    http://dpdk.org/ml/archives/dev/2018-April/095779.html

> ---
>  doc/guides/prog_guide/rte_flow.rst | 23 +++++++++++++++++++++++
>  lib/librte_ether/rte_flow.h        | 15 +++++++++++++++
>  2 files changed, 38 insertions(+)
> 
> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> index 106fb93..2f0a47a 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -1557,6 +1557,29 @@ set of overlay header type.
>     | ``item type`` | Item type of tunnel end-point to decapsulate |
>     +---------------+----------------------------------------------+
>  
> +

Unnecessary empty line.

> +Action: ``GROUP``
> +^^^^^^^^^^^^^^^^^
> +
> +Redirects packets to a group on the current device.
> +
> +In a hierarchy of groups, which can be used to represent physical or logical
> +flow tables on the device, this action allows the terminating action to be a
> +group on that device.
> +
> +- Terminating by default.

Keep in mind there's no such thing as a terminating action anymore [1].

> +
> +.. _table_rte_flow_action_group:
> +
> +.. table:: GROUP
> +
> +   +--------------+---------------------------------+
> +   | Field        | Value                           |
> +   +==============+=================================+
> +   | ``id``       | Group ID to redirect packets to |
> +   +--------------+---------------------------------+

"Field" column can be shrunk somewhat.

> +
> +
>  Negative types
>  ~~~~~~~~~~~~~~
>  
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 6d94423..968a23b 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -1251,6 +1251,21 @@ struct rte_flow_action_tunnel_decap {
>  };
>  
>  /**
> + * RTE_FLOW_ACTION_TYPE_GROUP

Its addition to enum rte_flow_action_type should be part of this commit.

> + *
> + * Redirects packets to a group on the current device.
> + *
> + * In a hierarchy of groups, which can be used to represent physical or logical
> + * flow tables on the device, this action allows the terminating action to be a
> + * group on that device.
> + *
> + * Terminating by default.

See [1].

> + */
> +struct rte_flow_action_group {
> +	uint32_t id;

Assuming this structure is named rte_flow_action_jump, naming this field
"group" would match the attribute of the same name.

> +};
> +
> +/**
>   * Definition of a single action.
>   *
>   * A list of actions is terminated by a END action.
> -- 
> 2.7.4
> 

Don't forget testpmd code and documentation update.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions
  @ 2018-04-06 20:26  2%   ` Adrien Mazarguil
  2018-04-09 16:10  0%     ` Mohammad Abdul Awal
  2018-04-17 14:58  3%     ` Doherty, Declan
  0 siblings, 2 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-06 20:26 UTC (permalink / raw)
  To: Declan Doherty; +Cc: dev

On Fri, Apr 06, 2018 at 01:24:01PM +0100, Declan Doherty wrote:
> Add new flow action types and associated action data structures to
> support the encapsulation and decapsulation of the virtual tunnel
> endpoints.
> 
> The RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP action will cause the matching
> flow to be encapsulated in the virtual tunnel endpoint overlay
> defined in the tunnel_encap action data.
> 
> The RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP action will cause all virtual
> tunnel endpoint overlays up to and including the first instance of
> the flow item type defined in the tunnel_decap action data for the
> matching flows.
> 
> Signed-off-by: Declan Doherty <declan.doherty@intel.com>

This generic approach looks flexible enough to cover the use cases that
immediately come to mind (VLAN, VXLAN), its design is sound.

However, while I'm aware it's not a concern at this point, it won't be able
to deal with stateful tunnel or encapsulation types (e.g. IPsec or TCP)
which will require additional meta data or some run-time assistance from the
application.

Eventually for more complex use cases, dedicated encap/decap actions will
have to appear, so the issue I wanted to raise before going further is this:

Going generic inevitably trades some of the usability; flat structures
dedicated to VXLAN encap/decap with only the needed info to get the job done
would likely be easier to implement in PMDs and use in applications. Any
number of such actions can be added to rte_flow without ABI impact.

If VXLAN is the only use case at this point, my suggestion would be to go
with simpler RTE_FLOW_ACTION_TYPE_VXLAN_(ENCAP|DECAP) actions, with fixed
L2/L3/L4/L5 header definitions to prepend according to RFC 7348.

Now we can start with the generic approach, see how it fares and add
dedicated encap/decap later as needed.

More comments below.

> ---
>  doc/guides/prog_guide/rte_flow.rst | 77 ++++++++++++++++++++++++++++++++--
>  lib/librte_ether/rte_flow.h        | 84 ++++++++++++++++++++++++++++++++++++--
>  2 files changed, 155 insertions(+), 6 deletions(-)
> 
> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> index fd33d19..106fb93 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -997,9 +997,11 @@ Actions
>  
>  Each possible action is represented by a type. Some have associated
>  configuration structures. Several actions combined in a list can be assigned
> -to a flow rule. That list is not ordered.
> +to a flow rule. That list is not ordered, with the exception of  actions which
> +modify the packet itself, these packet modification actions must be specified
> +in the explicit order in which they are to be executed.
>  
> -They fall in three categories:
> +They fall in four categories:
>  
>  - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
>    processing matched packets by subsequent flow rules, unless overridden
> @@ -1008,8 +1010,11 @@ They fall in three categories:
>  - Non-terminating actions (PASSTHRU, DUP) that leave matched packets up for
>    additional processing by subsequent flow rules.
>  
> +- Non-terminating meta actions that do not affect the fate of packets but result
> +  in modification of the packet itself (SECURITY, TUNNEL_ENCAP, TUNNEL_DECAP).
> +
>  - Other non-terminating meta actions that do not affect the fate of packets
> -  (END, VOID, MARK, FLAG, COUNT, SECURITY).
> +  (END, VOID, MARK, FLAG, COUNT).

The above changes are not necessary anymore [1][2].

[1] "ethdev: clarify flow API pattern items and actions"
    https://dpdk.org/ml/archives/dev/2018-April/095776.html
[2] "ethdev: alter behavior of flow API actions"
    https://dpdk.org/ml/archives/dev/2018-April/095779.html

>  When several actions are combined in a flow rule, they should all have
>  different types (e.g. dropping a packet twice is not possible).
> @@ -1486,6 +1491,72 @@ fields in the pattern items.
>     | 1     | END      |
>     +-------+----------+
>  
> +

Nit: titles in this file are separated by a single empty line.

> +Action: ``TUNNEL_ENCAP``
> +^^^^^^^^^^^^^^^^^^^^^^
> +
> +Performs an encapsulation action by encapsulating the flows matched by the
> +pattern items according to the network overlay defined in the
> +``rte_flow_action_tunnel_encap`` pattern items.
> +
> +This action modifies the payload of matched flows. The pattern items specified
> +in the ``rte_flow_action_tunnel_encap`` action structure must defined a valid
> +set of overlay headers, from the Ethernet header up to the overlay header. The
> +pattern must be terminated with the RTE_FLOW_ITEM_TYPE_END item type.

Regarding the use of a pattern list, if you consider PMDs are already
iterating on a list of actions when encountering
RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, it adds yet another inner loop.

How about making each encountered RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP provide
exactly one item instead (in encap, i.e. reverse order)?

In which case perhaps "GENERIC" would be a better fit than "TUNNEL".

> +
> +- Non-terminating by default.

There's no such property anymore [2].

> +
> +.. _table_rte_flow_action_tunnel_encap:
> +
> +.. table:: TUNNEL_ENCAP
> +
> +   +-------------+---------------------------------------------+
> +   | Field       | Value                                       |
> +   +=============+=============================================+
> +   | ``pattern`` | Virtual tunnel end-point pattern definition |
> +   +-------------+---------------------------------------------+
> +
> +
> +.. _table_rte_flow_action_tunnel_encap_example:
> +
> +.. table:: IPv4 VxLAN flow pattern example.

VxLAN => VXLAN

> +
> +   +-------+--------------------------+------------+
> +   | Index | Flow Item Type           | Flow Item  |
> +   +=======+==========================+============+
> +   | 0     | RTE_FLOW_ITEM_TYPE_ETH   | eth item   |
> +   +-------+--------------------------+------------+
> +   | 1     | RTE_FLOW_ITEM_TYPE_IPV4  | ipv4 item  |
> +   +-------+--------------------------+------------+
> +   | 2     | RTE_FLOW_ITEM_TYPE_UDP   | udp item   |
> +   +-------+--------------------------+------------+
> +   | 3     | RTE_FLOW_ITEM_TYPE_VXLAN | vxlan item |
> +   +-------+--------------------------+------------+
> +   | 4     | RTE_FLOW_ITEM_TYPE_END   | NULL       |
> +   +-------+--------------------------+------------+

One possible issue is that it relies on objects normally found on the
pattern side of flow rules. Those are supposed to match something, they are
not intended for packet header generation. While their "spec" and "mask"
fields might make sense in this context, the "last" field is odd.

You must define them without leaving anything open for interpretation by
PMDs and users alike. Defining things as "undefined" is fine as long as it's
covered.

> +
> +

Nit: only one empty line necessary here.

> +Action: ``TUNNEL_DECAP``
> +^^^^^^^^^^^^^^^^^^^^^^
> +
> +Performs a decapsulation action by stripping all headers of the virtual tunnel
> +end-point overlay up to the header defined by the flow item type of flows
> +matched by the pattern items.

Not necessarily, for instance if one guarantees that flowing traffic only
consists of decap'able packets. You must avoid mandatory dependencies
between patterns and actions since they are normally unrelated.

What you can document on the other hand is that the behavior is undefined
when processing traffic on which the action can't be applied. This is
how RSS level is documented [3].

[3] https://dpdk.org/ml/archives/dev/2018-April/095783.html

> +
> +This action modifies the payload of matched flows. The flow item type specified
> +in the ``rte_flow_action_tunnel_decap`` action structure must defined a valid
> +set of overlay header type.
> +
> +- Non-terminating by default.

See [2].

> +
> +.. _table_rte_flow_action_tunnel_decap:
> +
> +   +---------------+----------------------------------------------+
> +   | Field         | Value                                        |
> +   +===============+==============================================+
> +   | ``item type`` | Item type of tunnel end-point to decapsulate |
> +   +---------------+----------------------------------------------+

"item type" should be the exact name used in the structure.

> +
>  Negative types
>  ~~~~~~~~~~~~~~
>  
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 7d1f89d..6d94423 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -854,14 +854,17 @@ struct rte_flow_item {
>  	const void *mask; /**< Bit-mask applied to spec and last. */
>  };
>  
> +

Unnecessary empty line.

>  /**
>   * Action types.
>   *
>   * Each possible action is represented by a type. Some have associated
>   * configuration structures. Several actions combined in a list can be
> - * affected to a flow rule. That list is not ordered.
> + * affected to a flow rule. That list is not ordered, with the exception of
> + * actions which modify the packet itself, these packet modification actions
> + * must be specified in the explicit order in which they are to be executed.
>   *
> - * They fall in three categories:
> + * They fall in four categories:
>   *
>   * - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
>   *   processing matched packets by subsequent flow rules, unless overridden
> @@ -870,6 +873,10 @@ struct rte_flow_item {
>   * - Non terminating actions (PASSTHRU, DUP) that leave matched packets up
>   *   for additional processing by subsequent flow rules.
>   *
> + * - Non terminating meta actions that do not affect the fate of
> + *   packets but result in modification of the packet itself (SECURITY,
> + *   TUNNEL_ENCAP, TUNNEL_DECAP).
> + *

Same comment as above [1][2].

>   * - Other non terminating meta actions that do not affect the fate of
>   *   packets (END, VOID, MARK, FLAG, COUNT).
>   *
> @@ -1022,7 +1029,42 @@ enum rte_flow_action_type {
>  	 *
>  	 * See struct rte_flow_action_group_count.
>  	 */
> -	RTE_FLOW_ACTION_TYPE_GROUP_COUNT
> +	RTE_FLOW_ACTION_TYPE_GROUP_COUNT,

An empty line would have been needed here (if we agree about no more
GROUP_COUNT.)

> +	/**
> +	 * Encapsulate flow with tunnel defined in
> +	 * rte_flow_action_tunnel_encap structure.
> +	 *
> +	 * See struct rte_flow_action_tunnel_encap.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP,
> +
> +	/**
> +	 * Decapsulate all the headers of the tunnel
> +	 *
> +	 * See struct rte_flow_action_tunnel_decap.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP,
> +
> +	/**
> +	 * Redirects packets to the logical group of the current device.
> +	 *
> +	 * In a logical hierarchy of groups, which can be used to represent a
> +	 * physical of logical chaining of flow tables, this action allows the
> +	 * terminating action to be a logical group of the same device.
> +	 *
> +	 * See struct rte_flow_action_group.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_GROUP,
> +
> +	/**
> +	 * [META]
> +	 *
> +	 * Set specific metadata field associated with packet which is then
> +	 * available to further pipeline stages.
> +	 *
> +	 * See struct rte_flow_action_metadata.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_METADATA

These two actions should be part of the next patch, I won't comment them
here.

>  };
>  
>  /**
> @@ -1173,6 +1215,42 @@ struct rte_flow_action_group_count {
>  };
>  
>  /**
> + * RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP
> + *
> + * Virtual tunnel end-point encapsulation action data.
> + *
> + * Non-terminating action by default.

See [2].

> + */
> +struct rte_flow_action_tunnel_encap {
> +	struct rte_flow_action_item {
> +		enum rte_flow_item_type type;
> +		/**< Flow item type. */
> +		const void *item;
> +		/**< Flow item definition which points to the data of
> +		 * corresponding rte_flow_item_type.
> +		 */

I see it's a new action type, albeit a bit confusing (there is no
RTE_FLOW_ACTION_TYPE_ITEM).

I suggest the standard pattern item type since you're going with enum
rte_flow_item_type anyway. Keep in mind you need some kind of mask to tell
what fields are relevant. An application might otherwise want to encap with
unsupported properties (e.g. specific IPv4 ToS field and whatnot).

How about a single "struct rte_flow_pattern_item item", neither const and
neither a pointer. It's generic enough, enclosed spec/last/mask pointers
take care of the specifics. You just need to define what's supposed to
happen when "last" is set.

> +	} *pattern;
> +	/**<
> +	 * Tunnel pattern specification (list terminated by the END pattern
> +	 * item).
> +	 */

As previously suggested, how about a single item per encap?

> +};
> +
> +/**
> + * RTE_FLOW_ACTION_TYP_TUNNEL_DECAP
> + *
> + * Virtual tunnel end-point decapsulation action data.
> + *
> + * Non-terminating action by default.
> + */
> +struct rte_flow_action_tunnel_decap {
> +	enum rte_flow_item_type type;
> +	/**<
> +	 * Flow item type of virtual tunnel end-point to be decapsulated
> +	 */
> +};

Note that contrary to ENCAP, DECAP wouldn't necessarily need repeated
actions to peel each layer off. The current definition is fine.

> +
> +/**
>   * Definition of a single action.
>   *
>   * A list of actions is terminated by a END action.
> -- 
> 2.7.4
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v3 1/4] ethdev: add group counter support to rte_flow
  @ 2018-04-06 20:26  3%   ` Adrien Mazarguil
  2018-04-09 14:22  0%     ` Mohammad Abdul Awal
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 20:26 UTC (permalink / raw)
  To: Declan Doherty; +Cc: dev

On Fri, Apr 06, 2018 at 01:24:00PM +0100, Declan Doherty wrote:
> Add new RTE_FLOW_ACTION_TYPE_GROUP_COUNT action type to enable shared
> counters across multiple flows on a single port or across multiple
> flows on multiple ports within the same switch domain.
> 
> Introduce new API rte_flow_query_group_count to allow querying of group
> counters.
> 
> Signed-off-by: Declan Doherty <declan.doherty@intel.com>

Both features are definitely needed, however I suggest to enhance the
existing action type and query function instead, given the rte_flow ABI
won't be maintained for the 18.05 release [1].

Counters and query support were defined as a kind of PoC in preparation for
future requirements back in DPDK 17.02 and so far few PMDs have implemented
the query callback (mlx5 and failsafe, and the latter isn't really a PMD).

Due to the behavior change of action lists [2], providing an action type as
a query parameter is not specific enough anymore, for instance if a list
contains multiple COUNT, the application should be able to tell which needs
to be queried.

Therefore I suggest to redefine the query function as follows:

 int
 rte_flow_query(uint16_t port_id,
                struct rte_flow *flow,
                const struct rte_flow_action *action,
                void *data,
                struct rte_flow_error *error);

Third argument is an action definition with the same configuration (if any)
as previously defined in the action list originally used to create the flow
rule (not necessarily the same pointer, only the contents matter).

It means two perfectly identical actions can't be distinguished, and that's
how group counters will work.

Instead of adding a new action type to distinguish groups, a configuration
structure is added to the existing RTE_FLOW_ACTION_TYPE_COUNT, with
non-shared counters as a default behavior:

 struct rte_flow_action_count {
         uint32_t shared:1; /**< Share counter ID with other flow rules. */
         uint32_t reserved:31; /**< Reserved, must be zero. */
         uint32_t id; /**< Counter ID. */
 };

Doing so will impact some existing code in mlx5 and librte_flow_classify,
but that shouldn't be much of an issue.

Keep in mind testpmd and its documentation must be updated as well.

Thoughts?

A few nits below for the sake of commenting.

[1] "Flow API overhaul for switch offloads"
    http://dpdk.org/ml/archives/dev/2018-April/095774.html
[2] "ethdev: alter behavior of flow API actions"
    http://dpdk.org/ml/archives/dev/2018-April/095779.html

> ---
>  doc/guides/prog_guide/rte_flow.rst      | 35 +++++++++++++++++++++
>  lib/librte_ether/rte_ethdev_version.map |  8 +++++
>  lib/librte_ether/rte_flow.c             | 21 +++++++++++++
>  lib/librte_ether/rte_flow.h             | 56 ++++++++++++++++++++++++++++++++-
>  lib/librte_ether/rte_flow_driver.h      |  6 ++++
>  5 files changed, 125 insertions(+), 1 deletion(-)
> 
> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> index 961943d..fd33d19 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -1698,6 +1698,41 @@ Return values:
>  
>  - 0 on success, a negative errno value otherwise and ``rte_errno`` is set.
>  
> +

Unnecessary empty line.

> +Group Count Query
> +~~~~~~~~~~~~~~~~~
> +
> +Query group counter which can be associated with multiple flows on a specified
> +port.
> +
> +This function allows retrieving of group counters. A group counter is a
> +counter which can be shared among multiple flows on a single port or among
> +multiple flows on multiple ports within the same switch domain. Data is
> +gathered by special actions which must be present in the flow rule
> +definition.
> +
> +.. code-block:: c
> +
> +   int
> +   rte_flow_query_group_count(uint16_t port_id,
> +			   uint32_t group_counter_id,
> +			   struct rte_flow_query_count *count,
> +               struct rte_flow_error *error);
> +
> +Arguments:
> +
> +- ``port_id``: port identifier of Ethernet device.
> +- ``group_counter_id``: group counter identifier.
> +- ``count``: group counter parameters.
> +- ``error``: perform verbose error reporting if not NULL. PMDs initialize
> +  this structure in case of error only.
> +
> +Return values:
> +
> +- 0 on success, a negative errno value otherwise and ``rte_errno`` is set.
> +
> +
> +

More unnecessary empty lines.

>  Isolated mode
>  -------------
>  
> diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
> index 34df6c8..cff6807 100644
> --- a/lib/librte_ether/rte_ethdev_version.map
> +++ b/lib/librte_ether/rte_ethdev_version.map
> @@ -229,3 +229,11 @@ EXPERIMENTAL {
>  	rte_mtr_stats_update;
>  
>  } DPDK_17.11;
> +
> +

One more.

> +EXPERIMENTAL {
> +	global:
> +
> +	rte_flow_query_group_count
> +
> +} DPDK_18.05;
> diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
> index 38f2d27..e10b1d0 100644
> --- a/lib/librte_ether/rte_flow.c
> +++ b/lib/librte_ether/rte_flow.c
> @@ -418,3 +418,24 @@ rte_flow_copy(struct rte_flow_desc *desc, size_t len,
>  	}
>  	return 0;
>  }
> +
> +int __rte_experimental
> +rte_flow_query_group_count(uint16_t port_id,
> +	uint32_t group_count_id,
> +	struct rte_flow_query_count *count,
> +	struct rte_flow_error *error)

This function lacks a short documentation comment (see rte_flow_query()).

> +{
> +	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> +	const struct rte_flow_ops *ops = rte_flow_ops_get(port_id, error);
> +
> +	if (!ops)
> +		return -rte_errno;
> +	if (likely(!!ops->query_group_count))
> +		return flow_err(port_id,
> +				ops->query_group_count(dev, group_count_id,
> +						       count, error),
> +				error);
> +	return rte_flow_error_set(error, ENOSYS,
> +				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +				  NULL, rte_strerror(ENOSYS));
> +}
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 13e4202..7d1f89d 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -1010,7 +1010,19 @@ enum rte_flow_action_type {
>  	 *
>  	 * See struct rte_flow_action_security.
>  	 */
> -	RTE_FLOW_ACTION_TYPE_SECURITY
> +	RTE_FLOW_ACTION_TYPE_SECURITY,
> +
> +	/**
> +	 * Enable a shared flow group counter for flow. Group counters can be
> +	 * associated with multiples flows on the same port or on port within
> +	 * the same switch domain if supported by that device.
> +	 *
> +	 * Group counters can be retrieved and reset through
> +	 * rte_flow_query_group_count()
> +	 *
> +	 * See struct rte_flow_action_group_count.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_GROUP_COUNT

Don't forget the trailing comma.

>  };
>  
>  /**
> @@ -1149,6 +1161,18 @@ struct rte_flow_action_security {
>  };
>  
>  /**
> + * RTE_FLOW_ACTION_TYPE_GROUP_COUNT
> + *
> + * A packet/byte counter which can be shared across a group of flows programmed
> + * on the same port/switch domain.
> + *
> + * Non-terminating by default.
> + */
> +struct rte_flow_action_group_count {
> +	uint32_t id;
> +};
> +
> +/**
>   * Definition of a single action.
>   *
>   * A list of actions is terminated by a END action.
> @@ -1476,6 +1500,36 @@ rte_flow_copy(struct rte_flow_desc *fd, size_t len,
>  	      const struct rte_flow_item *items,
>  	      const struct rte_flow_action *actions);
>  
> +

Caught another empty line.

> +/**
> + * Get hit/bytes count for group counter.
> + *
> + * A group counter is a counter which can be shared among multiple flows on a
> + * single port or among multiple flows on multiple ports within the same
> + * switch domain.
> + *
> + * In the case of ports within the same switch domain a global name space is
> + * assumed for group_count_id value.
> + *
> + * @param[in]	port_id
> + *   Port identifier of Ethernet device.
> + * @param[in]	group_count_id
> + *   Group counter identifier to query
> + * @param[out]	count
> + *   Group counter value
> + * @param[out]	error
> + *   Perform verbose error reporting if not NULL. PMDs initialize this
> + *   structure in case of error only.
> + *
> + * @return
> + *   Negative error code (errno value) and rte_errno is set.
> + */
> +int __rte_experimental
> +rte_flow_query_group_count(uint16_t port_id,
> +			   uint32_t group_count_id,
> +			   struct rte_flow_query_count *count,
> +			   struct rte_flow_error *error);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_ether/rte_flow_driver.h b/lib/librte_ether/rte_flow_driver.h
> index 7778c8e..ef09465 100644
> --- a/lib/librte_ether/rte_flow_driver.h
> +++ b/lib/librte_ether/rte_flow_driver.h
> @@ -96,6 +96,12 @@ struct rte_flow_ops {
>  		(struct rte_eth_dev *,
>  		 int,
>  		 struct rte_flow_error *);
> +	/** See rte_flow_query_group_count(). */
> +	int (*query_group_count)
> +		(struct rte_eth_dev *,
> +		 uint32_t,
> +		 struct rte_flow_query_count *,
> +		 struct rte_flow_error *);
>  };
>  
>  /**
> -- 
> 2.7.4
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 02/10] bpf: add BPF loading and execution framework
  @ 2018-04-06 18:49  2% ` Konstantin Ananyev
  0 siblings, 0 replies; 200+ results
From: Konstantin Ananyev @ 2018-04-06 18:49 UTC (permalink / raw)
  To: dev; +Cc: Konstantin Ananyev

librte_bpf provides a framework to load and execute eBPF bytecode
inside user-space dpdk based applications.
It supports basic set of features from eBPF spec
(https://www.kernel.org/doc/Documentation/networking/filter.txt).

Not currently supported features:
 - JIT
 - cBPF
 - tail-pointer call
 - eBPF MAP
 - skb

It also adds dependency on libelf.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 config/common_base                 |   5 +
 lib/Makefile                       |   2 +
 lib/librte_bpf/Makefile            |  30 +++
 lib/librte_bpf/bpf.c               |  59 +++++
 lib/librte_bpf/bpf_exec.c          | 452 +++++++++++++++++++++++++++++++++++++
 lib/librte_bpf/bpf_impl.h          |  41 ++++
 lib/librte_bpf/bpf_load.c          | 386 +++++++++++++++++++++++++++++++
 lib/librte_bpf/bpf_validate.c      |  55 +++++
 lib/librte_bpf/meson.build         |  18 ++
 lib/librte_bpf/rte_bpf.h           | 170 ++++++++++++++
 lib/librte_bpf/rte_bpf_version.map |  12 +
 lib/meson.build                    |   2 +-
 mk/rte.app.mk                      |   2 +
 13 files changed, 1233 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_bpf/Makefile
 create mode 100644 lib/librte_bpf/bpf.c
 create mode 100644 lib/librte_bpf/bpf_exec.c
 create mode 100644 lib/librte_bpf/bpf_impl.h
 create mode 100644 lib/librte_bpf/bpf_load.c
 create mode 100644 lib/librte_bpf/bpf_validate.c
 create mode 100644 lib/librte_bpf/meson.build
 create mode 100644 lib/librte_bpf/rte_bpf.h
 create mode 100644 lib/librte_bpf/rte_bpf_version.map

diff --git a/config/common_base b/config/common_base
index c09c7cf88..d68c2e211 100644
--- a/config/common_base
+++ b/config/common_base
@@ -821,3 +821,8 @@ CONFIG_RTE_APP_CRYPTO_PERF=y
 # Compile the eventdev application
 #
 CONFIG_RTE_APP_EVENTDEV=y
+
+#
+# Compile librte_bpf
+#
+CONFIG_RTE_LIBRTE_BPF=y
diff --git a/lib/Makefile b/lib/Makefile
index ec965a606..a4a2329f9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -97,6 +97,8 @@ DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ether
 DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
 DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ether librte_net
 DEPDIRS-librte_gso += librte_mempool
+DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf
+DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf librte_ether
 
 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
 DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_bpf/Makefile b/lib/librte_bpf/Makefile
new file mode 100644
index 000000000..e0f434e77
--- /dev/null
+++ b/lib/librte_bpf/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_bpf.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+LDLIBS += -lrte_net -lrte_eal
+LDLIBS += -lrte_mempool -lrte_ring
+LDLIBS += -lrte_mbuf -lrte_ethdev
+LDLIBS += -lelf
+
+EXPORT_MAP := rte_bpf_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_exec.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_load.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_validate.c
+
+# install header files
+SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += rte_bpf.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_bpf/bpf.c b/lib/librte_bpf/bpf.c
new file mode 100644
index 000000000..d7f68c017
--- /dev/null
+++ b/lib/librte_bpf/bpf.c
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#include "bpf_impl.h"
+
+int rte_bpf_logtype;
+
+__rte_experimental void
+rte_bpf_destroy(struct rte_bpf *bpf)
+{
+	if (bpf != NULL) {
+		if (bpf->jit.func != NULL)
+			munmap(bpf->jit.func, bpf->jit.sz);
+		munmap(bpf, bpf->sz);
+	}
+}
+
+__rte_experimental int
+rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit)
+{
+	if (bpf == NULL || jit == NULL)
+		return -EINVAL;
+
+	jit[0] = bpf->jit;
+	return 0;
+}
+
+int
+bpf_jit(struct rte_bpf *bpf)
+{
+	int32_t rc;
+
+	rc = -ENOTSUP;
+	if (rc != 0)
+		RTE_BPF_LOG(WARNING, "%s(%p) failed, error code: %d;\n",
+			__func__, bpf, rc);
+	return rc;
+}
+
+RTE_INIT(rte_bpf_init_log);
+
+static void
+rte_bpf_init_log(void)
+{
+	rte_bpf_logtype = rte_log_register("lib.bpf");
+	if (rte_bpf_logtype >= 0)
+		rte_log_set_level(rte_bpf_logtype, RTE_LOG_INFO);
+}
diff --git a/lib/librte_bpf/bpf_exec.c b/lib/librte_bpf/bpf_exec.c
new file mode 100644
index 000000000..0382ade98
--- /dev/null
+++ b/lib/librte_bpf/bpf_exec.c
@@ -0,0 +1,452 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_byteorder.h>
+
+#include "bpf_impl.h"
+
+#define BPF_JMP_UNC(ins)	((ins) += (ins)->off)
+
+#define BPF_JMP_CND_REG(reg, ins, op, type)	\
+	((ins) += \
+		((type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg]) ? \
+		(ins)->off : 0)
+
+#define BPF_JMP_CND_IMM(reg, ins, op, type)	\
+	((ins) += \
+		((type)(reg)[(ins)->dst_reg] op (type)(ins)->imm) ? \
+		(ins)->off : 0)
+
+#define BPF_NEG_ALU(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(-(reg)[(ins)->dst_reg]))
+
+#define BPF_MOV_ALU_REG(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(reg)[(ins)->src_reg])
+
+#define BPF_OP_ALU_REG(reg, ins, op, type)	\
+	((reg)[(ins)->dst_reg] = \
+		(type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg])
+
+#define BPF_MOV_ALU_IMM(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(ins)->imm)
+
+#define BPF_OP_ALU_IMM(reg, ins, op, type)	\
+	((reg)[(ins)->dst_reg] = \
+		(type)(reg)[(ins)->dst_reg] op (type)(ins)->imm)
+
+#define BPF_DIV_ZERO_CHECK(bpf, reg, ins, type) do { \
+	if ((type)(reg)[(ins)->src_reg] == 0) { \
+		RTE_BPF_LOG(ERR, \
+			"%s(%p): division by 0 at pc: %#zx;\n", \
+			__func__, bpf, \
+			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.ins); \
+		return 0; \
+	} \
+} while (0)
+
+#define BPF_LD_REG(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = \
+		*(type *)(uintptr_t)((reg)[(ins)->src_reg] + (ins)->off))
+
+#define BPF_ST_IMM(reg, ins, type)	\
+	(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \
+		(type)(ins)->imm)
+
+#define BPF_ST_REG(reg, ins, type)	\
+	(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \
+		(type)(reg)[(ins)->src_reg])
+
+#define BPF_ST_XADD_REG(reg, ins, tp)	\
+	(rte_atomic##tp##_add((rte_atomic##tp##_t *) \
+		(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off), \
+		reg[ins->src_reg]))
+
+static inline void
+bpf_alu_be(uint64_t reg[MAX_BPF_REG], const struct bpf_insn *ins)
+{
+	uint64_t *v;
+
+	v = reg + ins->dst_reg;
+	switch (ins->imm) {
+	case 16:
+		*v = rte_cpu_to_be_16(*v);
+		break;
+	case 32:
+		*v = rte_cpu_to_be_32(*v);
+		break;
+	case 64:
+		*v = rte_cpu_to_be_64(*v);
+		break;
+	}
+}
+
+static inline void
+bpf_alu_le(uint64_t reg[MAX_BPF_REG], const struct bpf_insn *ins)
+{
+	uint64_t *v;
+
+	v = reg + ins->dst_reg;
+	switch (ins->imm) {
+	case 16:
+		*v = rte_cpu_to_le_16(*v);
+		break;
+	case 32:
+		*v = rte_cpu_to_le_32(*v);
+		break;
+	case 64:
+		*v = rte_cpu_to_le_64(*v);
+		break;
+	}
+}
+
+static inline uint64_t
+bpf_exec(const struct rte_bpf *bpf, uint64_t reg[MAX_BPF_REG])
+{
+	const struct bpf_insn *ins;
+
+	for (ins = bpf->prm.ins; ; ins++) {
+		switch (ins->code) {
+		/* 32 bit ALU IMM operations */
+		case (BPF_ALU | BPF_ADD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, +, uint32_t);
+			break;
+		case (BPF_ALU | BPF_SUB | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, -, uint32_t);
+			break;
+		case (BPF_ALU | BPF_AND | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, &, uint32_t);
+			break;
+		case (BPF_ALU | BPF_OR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, |, uint32_t);
+			break;
+		case (BPF_ALU | BPF_LSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, <<, uint32_t);
+			break;
+		case (BPF_ALU | BPF_RSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, uint32_t);
+			break;
+		case (BPF_ALU | BPF_XOR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, ^, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MUL | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, *, uint32_t);
+			break;
+		case (BPF_ALU | BPF_DIV | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, /, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, %, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOV | BPF_K):
+			BPF_MOV_ALU_IMM(reg, ins, uint32_t);
+			break;
+		/* 32 bit ALU REG operations */
+		case (BPF_ALU | BPF_ADD | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, +, uint32_t);
+			break;
+		case (BPF_ALU | BPF_SUB | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, -, uint32_t);
+			break;
+		case (BPF_ALU | BPF_AND | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, &, uint32_t);
+			break;
+		case (BPF_ALU | BPF_OR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, |, uint32_t);
+			break;
+		case (BPF_ALU | BPF_LSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, <<, uint32_t);
+			break;
+		case (BPF_ALU | BPF_RSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, uint32_t);
+			break;
+		case (BPF_ALU | BPF_XOR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, ^, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MUL | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, *, uint32_t);
+			break;
+		case (BPF_ALU | BPF_DIV | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t);
+			BPF_OP_ALU_REG(reg, ins, /, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOD | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t);
+			BPF_OP_ALU_REG(reg, ins, %, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOV | BPF_X):
+			BPF_MOV_ALU_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_ALU | BPF_NEG):
+			BPF_NEG_ALU(reg, ins, uint32_t);
+			break;
+		case (BPF_ALU | BPF_END | BPF_TO_BE):
+			bpf_alu_be(reg, ins);
+			break;
+		case (BPF_ALU | BPF_END | BPF_TO_LE):
+			bpf_alu_le(reg, ins);
+			break;
+		/* 64 bit ALU IMM operations */
+		case (BPF_ALU64 | BPF_ADD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, +, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_SUB | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, -, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_AND | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, &, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_OR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, |, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_LSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, <<, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_RSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_ARSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, int64_t);
+			break;
+		case (BPF_ALU64 | BPF_XOR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, ^, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MUL | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, *, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_DIV | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, /, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, %, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOV | BPF_K):
+			BPF_MOV_ALU_IMM(reg, ins, uint64_t);
+			break;
+		/* 64 bit ALU REG operations */
+		case (BPF_ALU64 | BPF_ADD | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, +, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_SUB | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, -, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_AND | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, &, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_OR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, |, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_LSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, <<, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_RSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_ARSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, int64_t);
+			break;
+		case (BPF_ALU64 | BPF_XOR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, ^, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MUL | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, *, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_DIV | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t);
+			BPF_OP_ALU_REG(reg, ins, /, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOD | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t);
+			BPF_OP_ALU_REG(reg, ins, %, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOV | BPF_X):
+			BPF_MOV_ALU_REG(reg, ins, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_NEG):
+			BPF_NEG_ALU(reg, ins, uint64_t);
+			break;
+		/* load instructions */
+		case (BPF_LDX | BPF_MEM | BPF_B):
+			BPF_LD_REG(reg, ins, uint8_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_H):
+			BPF_LD_REG(reg, ins, uint16_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_W):
+			BPF_LD_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_DW):
+			BPF_LD_REG(reg, ins, uint64_t);
+			break;
+		/* load 64 bit immediate value */
+		case (BPF_LD | BPF_IMM | BPF_DW):
+			reg[ins->dst_reg] = (uint32_t)ins[0].imm |
+				(uint64_t)(uint32_t)ins[1].imm << 32;
+			ins++;
+			break;
+		/* store instructions */
+		case (BPF_STX | BPF_MEM | BPF_B):
+			BPF_ST_REG(reg, ins, uint8_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_H):
+			BPF_ST_REG(reg, ins, uint16_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_W):
+			BPF_ST_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_DW):
+			BPF_ST_REG(reg, ins, uint64_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_B):
+			BPF_ST_IMM(reg, ins, uint8_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_H):
+			BPF_ST_IMM(reg, ins, uint16_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_W):
+			BPF_ST_IMM(reg, ins, uint32_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_DW):
+			BPF_ST_IMM(reg, ins, uint64_t);
+			break;
+		/* atomic add instructions */
+		case (BPF_STX | BPF_XADD | BPF_W):
+			BPF_ST_XADD_REG(reg, ins, 32);
+			break;
+		case (BPF_STX | BPF_XADD | BPF_DW):
+			BPF_ST_XADD_REG(reg, ins, 64);
+			break;
+		/* jump instructions */
+		case (BPF_JMP | BPF_JA):
+			BPF_JMP_UNC(ins);
+			break;
+		/* jump IMM instructions */
+		case (BPF_JMP | BPF_JEQ | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, ==, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JNE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, !=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JSGT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSGE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSET | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, &, uint64_t);
+			break;
+		/* jump REG instructions */
+		case (BPF_JMP | BPF_JEQ | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, ==, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JNE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, !=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JSGT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSGE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSET | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, &, uint64_t);
+			break;
+		/* call instructions */
+		case (BPF_JMP | BPF_CALL):
+			reg[BPF_REG_0] = bpf->prm.xsym[ins->imm].func(
+				reg[BPF_REG_1], reg[BPF_REG_2], reg[BPF_REG_3],
+				reg[BPF_REG_4], reg[BPF_REG_5]);
+			break;
+		/* return instruction */
+		case (BPF_JMP | BPF_EXIT):
+			return reg[BPF_REG_0];
+		default:
+			RTE_BPF_LOG(ERR,
+				"%s(%p): invalid opcode %#x at pc: %#zx;\n",
+				__func__, bpf, ins->code,
+				(uintptr_t)ins - (uintptr_t)bpf->prm.ins);
+			return 0;
+		}
+	}
+
+	/* should never be reached */
+	RTE_VERIFY(0);
+	return 0;
+}
+
+__rte_experimental uint32_t
+rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
+	uint32_t num)
+{
+	uint32_t i;
+	uint64_t reg[MAX_BPF_REG];
+	uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)];
+
+	for (i = 0; i != num; i++) {
+
+		reg[BPF_REG_1] = (uintptr_t)ctx[i];
+		reg[BPF_REG_10] = (uintptr_t)(stack + RTE_DIM(stack));
+
+		rc[i] = bpf_exec(bpf, reg);
+	}
+
+	return i;
+}
+
+__rte_experimental uint64_t
+rte_bpf_exec(const struct rte_bpf *bpf, void *ctx)
+{
+	uint64_t rc;
+
+	rte_bpf_exec_burst(bpf, &ctx, &rc, 1);
+	return rc;
+}
diff --git a/lib/librte_bpf/bpf_impl.h b/lib/librte_bpf/bpf_impl.h
new file mode 100644
index 000000000..5d7e65c31
--- /dev/null
+++ b/lib/librte_bpf/bpf_impl.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _BPF_H_
+#define _BPF_H_
+
+#include <rte_bpf.h>
+#include <sys/mman.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_BPF_STACK_SIZE	0x200
+
+struct rte_bpf {
+	struct rte_bpf_prm prm;
+	struct rte_bpf_jit jit;
+	size_t sz;
+	uint32_t stack_sz;
+};
+
+extern int bpf_validate(struct rte_bpf *bpf);
+
+extern int bpf_jit(struct rte_bpf *bpf);
+
+#ifdef RTE_ARCH_X86_64
+extern int bpf_jit_x86(struct rte_bpf *);
+#endif
+
+extern int rte_bpf_logtype;
+
+#define	RTE_BPF_LOG(lvl, fmt, args...) \
+	rte_log(RTE_LOG_## lvl, rte_bpf_logtype, fmt, ##args)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BPF_H_ */
diff --git a/lib/librte_bpf/bpf_load.c b/lib/librte_bpf/bpf_load.c
new file mode 100644
index 000000000..3c7279a6c
--- /dev/null
+++ b/lib/librte_bpf/bpf_load.c
@@ -0,0 +1,386 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <fcntl.h>
+
+#include <libelf.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_byteorder.h>
+#include <rte_errno.h>
+
+#include "bpf_impl.h"
+
+/* To overcome compatibility issue */
+#ifndef EM_BPF
+#define	EM_BPF	247
+#endif
+
+static uint32_t
+bpf_find_xsym(const char *sn, enum rte_bpf_xtype type,
+	const struct rte_bpf_xsym fp[], uint32_t fn)
+{
+	uint32_t i;
+
+	if (sn == NULL || fp == NULL)
+		return UINT32_MAX;
+
+	for (i = 0; i != fn; i++) {
+		if (fp[i].type == type && strcmp(sn, fp[i].name) == 0)
+			break;
+	}
+
+	return (i != fn) ? i : UINT32_MAX;
+}
+
+/*
+ * update BPF code at offset *ofs* with a proper address(index) for external
+ * symbol *sn*
+ */
+static int
+resolve_xsym(const char *sn, size_t ofs, struct bpf_insn *ins, size_t ins_sz,
+	const struct rte_bpf_prm *prm)
+{
+	uint32_t idx, fidx;
+	enum rte_bpf_xtype type;
+
+	if (ofs % sizeof(ins[0]) != 0 || ofs >= ins_sz)
+		return -EINVAL;
+
+	idx = ofs / sizeof(ins[0]);
+	if (ins[idx].code == (BPF_JMP | BPF_CALL))
+		type = RTE_BPF_XTYPE_FUNC;
+	else if (ins[idx].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+			ofs < ins_sz - sizeof(ins[idx]))
+		type = RTE_BPF_XTYPE_VAR;
+	else
+		return -EINVAL;
+
+	fidx = bpf_find_xsym(sn, type, prm->xsym, prm->nb_xsym);
+	if (fidx == UINT32_MAX)
+		return -ENOENT;
+
+	/* for function we just need an index in our xsym table */
+	if (type == RTE_BPF_XTYPE_FUNC)
+		ins[idx].imm = fidx;
+	/* for variable we need to store its absolute address */
+	else {
+		ins[idx].imm = (uintptr_t)prm->xsym[fidx].var;
+		ins[idx + 1].imm =
+			(uint64_t)(uintptr_t)prm->xsym[fidx].var >> 32;
+	}
+
+	return 0;
+}
+
+static int
+check_elf_header(const Elf64_Ehdr * eh)
+{
+	const char *err;
+
+	err = NULL;
+
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	if (eh->e_ident[EI_DATA] != ELFDATA2LSB)
+#else
+	if (eh->e_ident[EI_DATA] != ELFDATA2MSB)
+#endif
+		err = "not native byte order";
+	else if (eh->e_ident[EI_OSABI] != ELFOSABI_NONE)
+		err = "unexpected OS ABI";
+	else if (eh->e_type != ET_REL)
+		err = "unexpected ELF type";
+	else if (eh->e_machine != EM_NONE && eh->e_machine != EM_BPF)
+		err = "unexpected machine type";
+
+	if (err != NULL) {
+		RTE_BPF_LOG(ERR, "%s(): %s\n", __func__, err);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * helper function, find executable section by name.
+ */
+static int
+find_elf_code(Elf *elf, const char *section, Elf_Data **psd, size_t *pidx)
+{
+	Elf_Scn *sc;
+	const Elf64_Ehdr *eh;
+	const Elf64_Shdr *sh;
+	Elf_Data *sd;
+	const char *sn;
+	int32_t rc;
+
+	eh = elf64_getehdr(elf);
+	if (eh == NULL) {
+		rc = elf_errno();
+		RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n",
+			__func__, elf, section, rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	if (check_elf_header(eh) != 0)
+		return -EINVAL;
+
+	/* find given section by name */
+	for (sc = elf_nextscn(elf, NULL); sc != NULL;
+			sc = elf_nextscn(elf, sc)) {
+		sh = elf64_getshdr(sc);
+		sn = elf_strptr(elf, eh->e_shstrndx, sh->sh_name);
+		if (sn != NULL && strcmp(section, sn) == 0 &&
+				sh->sh_type == SHT_PROGBITS &&
+				sh->sh_flags == (SHF_ALLOC | SHF_EXECINSTR))
+			break;
+	}
+
+	sd = elf_getdata(sc, NULL);
+	if (sd == NULL || sd->d_size == 0 ||
+			sd->d_size % sizeof(struct bpf_insn) != 0) {
+		rc = elf_errno();
+		RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n",
+			__func__, elf, section, rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	*psd = sd;
+	*pidx = elf_ndxscn(sc);
+	return 0;
+}
+
+/*
+ * helper function to process data from relocation table.
+ */
+static int
+process_reloc(Elf *elf, size_t sym_idx, Elf64_Rel *re, size_t re_sz,
+	struct bpf_insn *ins, size_t ins_sz, const struct rte_bpf_prm *prm)
+{
+	int32_t rc;
+	uint32_t i, n;
+	size_t ofs, sym;
+	const char *sn;
+	const Elf64_Ehdr *eh;
+	Elf_Scn *sc;
+	const Elf_Data *sd;
+	Elf64_Sym *sm;
+
+	eh = elf64_getehdr(elf);
+
+	/* get symtable by section index */
+	sc = elf_getscn(elf, sym_idx);
+	sd = elf_getdata(sc, NULL);
+	if (sd == NULL)
+		return -EINVAL;
+	sm = sd->d_buf;
+
+	n = re_sz / sizeof(re[0]);
+	for (i = 0; i != n; i++) {
+
+		ofs = re[i].r_offset;
+
+		/* retrieve index in the symtable */
+		sym = ELF64_R_SYM(re[i].r_info);
+		if (sym * sizeof(sm[0]) >= sd->d_size)
+			return -EINVAL;
+
+		sn = elf_strptr(elf, eh->e_shstrndx, sm[sym].st_name);
+
+		rc = resolve_xsym(sn, ofs, ins, ins_sz, prm);
+		if (rc != 0) {
+			RTE_BPF_LOG(ERR,
+				"resolve_xsym(%s, %zu) error code: %d\n",
+				sn, ofs, rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * helper function, find relocation information (if any)
+ * and update bpf code.
+ */
+static int
+elf_reloc_code(Elf *elf, Elf_Data *ed, size_t sidx,
+	const struct rte_bpf_prm *prm)
+{
+	Elf64_Rel *re;
+	Elf_Scn *sc;
+	const Elf64_Shdr *sh;
+	const Elf_Data *sd;
+	int32_t rc;
+
+	rc = 0;
+
+	/* walk through all sections */
+	for (sc = elf_nextscn(elf, NULL); sc != NULL && rc == 0;
+			sc = elf_nextscn(elf, sc)) {
+
+		sh = elf64_getshdr(sc);
+
+		/* relocation data for our code section */
+		if (sh->sh_type == SHT_REL && sh->sh_info == sidx) {
+			sd = elf_getdata(sc, NULL);
+			if (sd == NULL || sd->d_size == 0 ||
+					sd->d_size % sizeof(re[0]) != 0)
+				return -EINVAL;
+			rc = process_reloc(elf, sh->sh_link,
+				sd->d_buf, sd->d_size, ed->d_buf, ed->d_size,
+				prm);
+		}
+	}
+
+	return rc;
+}
+
+static struct rte_bpf *
+bpf_load(const struct rte_bpf_prm *prm)
+{
+	uint8_t *buf;
+	struct rte_bpf *bpf;
+	size_t sz, bsz, insz, xsz;
+
+	xsz =  prm->nb_xsym * sizeof(prm->xsym[0]);
+	insz = prm->nb_ins * sizeof(prm->ins[0]);
+	bsz = sizeof(bpf[0]);
+	sz = insz + xsz + bsz;
+
+	buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (buf == MAP_FAILED)
+		return NULL;
+
+	bpf = (void *)buf;
+	bpf->sz = sz;
+
+	memcpy(&bpf->prm, prm, sizeof(bpf->prm));
+
+	memcpy(buf + bsz, prm->xsym, xsz);
+	memcpy(buf + bsz + xsz, prm->ins, insz);
+
+	bpf->prm.xsym = (void *)(buf + bsz);
+	bpf->prm.ins = (void *)(buf + bsz + xsz);
+
+	return bpf;
+}
+
+__rte_experimental struct rte_bpf *
+rte_bpf_load(const struct rte_bpf_prm *prm)
+{
+	struct rte_bpf *bpf;
+	int32_t rc;
+
+	if (prm == NULL || prm->ins == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	bpf = bpf_load(prm);
+	if (bpf == NULL) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+
+	rc = bpf_validate(bpf);
+	if (rc == 0) {
+		bpf_jit(bpf);
+		if (mprotect(bpf, bpf->sz, PROT_READ) != 0)
+			rc = -ENOMEM;
+	}
+
+	if (rc != 0) {
+		rte_bpf_destroy(bpf);
+		rte_errno = -rc;
+		return NULL;
+	}
+
+	return bpf;
+}
+
+static struct rte_bpf *
+bpf_load_elf(const struct rte_bpf_prm *prm, int32_t fd, const char *section)
+{
+	Elf *elf;
+	Elf_Data *sd;
+	size_t sidx;
+	int32_t rc;
+	struct rte_bpf *bpf;
+	struct rte_bpf_prm np;
+
+	elf_version(EV_CURRENT);
+	elf = elf_begin(fd, ELF_C_READ, NULL);
+
+	rc = find_elf_code(elf, section, &sd, &sidx);
+	if (rc == 0)
+		rc = elf_reloc_code(elf, sd, sidx, prm);
+
+	if (rc == 0) {
+		np = prm[0];
+		np.ins = sd->d_buf;
+		np.nb_ins = sd->d_size / sizeof(struct bpf_insn);
+		bpf = rte_bpf_load(&np);
+	} else {
+		bpf = NULL;
+		rte_errno = -rc;
+	}
+
+	elf_end(elf);
+	return bpf;
+}
+
+__rte_experimental struct rte_bpf *
+rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
+	const char *sname)
+{
+	int32_t fd, rc;
+	struct rte_bpf *bpf;
+
+	if (prm == NULL || fname == NULL || sname == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	fd = open(fname, O_RDONLY);
+	if (fd < 0) {
+		rc = errno;
+		RTE_BPF_LOG(ERR, "%s(%s) error code: %d(%s)\n",
+			__func__, fname, rc, strerror(rc));
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	bpf = bpf_load_elf(prm, fd, sname);
+	close(fd);
+
+	if (bpf == NULL) {
+		RTE_BPF_LOG(ERR,
+			"%s(fname=\"%s\", sname=\"%s\") failed, "
+			"error code: %d\n",
+			__func__, fname, sname, rte_errno);
+		return NULL;
+	}
+
+	RTE_BPF_LOG(INFO, "%s(fname=\"%s\", sname=\"%s\") "
+		"successfully creates %p(jit={.func=%p,.sz=%zu});\n",
+		__func__, fname, sname, bpf, bpf->jit.func, bpf->jit.sz);
+	return bpf;
+}
diff --git a/lib/librte_bpf/bpf_validate.c b/lib/librte_bpf/bpf_validate.c
new file mode 100644
index 000000000..1911e1381
--- /dev/null
+++ b/lib/librte_bpf/bpf_validate.c
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#include "bpf_impl.h"
+
+/*
+ * dummy one for now, need more work.
+ */
+int
+bpf_validate(struct rte_bpf *bpf)
+{
+	int32_t rc, ofs, stack_sz;
+	uint32_t i, op, dr;
+	const struct bpf_insn *ins;
+
+	rc = 0;
+	stack_sz = 0;
+	for (i = 0; i != bpf->prm.nb_ins; i++) {
+
+		ins = bpf->prm.ins + i;
+		op = ins->code;
+		dr = ins->dst_reg;
+		ofs = ins->off;
+
+		if ((BPF_CLASS(op) == BPF_STX || BPF_CLASS(op) == BPF_ST) &&
+				dr == BPF_REG_10) {
+			ofs -= sizeof(uint64_t);
+			stack_sz = RTE_MIN(ofs, stack_sz);
+		}
+	}
+
+	if (stack_sz != 0) {
+		stack_sz = -stack_sz;
+		if (stack_sz > MAX_BPF_STACK_SIZE)
+			rc = -ERANGE;
+		else
+			bpf->stack_sz = stack_sz;
+	}
+
+	if (rc != 0)
+		RTE_BPF_LOG(ERR, "%s(%p) failed, error code: %d;\n",
+			__func__, bpf, rc);
+	return rc;
+}
diff --git a/lib/librte_bpf/meson.build b/lib/librte_bpf/meson.build
new file mode 100644
index 000000000..05c48c7ff
--- /dev/null
+++ b/lib/librte_bpf/meson.build
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+allow_experimental_apis = true
+sources = files('bpf.c',
+		'bpf_exec.c',
+		'bpf_load.c',
+		'bpf_validate.c')
+
+install_headers = files('rte_bpf.h')
+
+deps += ['mbuf', 'net']
+
+dep = dependency('libelf', required: false)
+if dep.found() == false
+	build = false
+endif
+ext_deps += dep
diff --git a/lib/librte_bpf/rte_bpf.h b/lib/librte_bpf/rte_bpf.h
new file mode 100644
index 000000000..825621404
--- /dev/null
+++ b/lib/librte_bpf/rte_bpf.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_BPF_H_
+#define _RTE_BPF_H_
+
+/**
+ * @file
+ *
+ * RTE BPF support.
+ * librte_bpf provides a framework to load and execute eBPF bytecode
+ * inside user-space dpdk based applications.
+ * It supports basic set of features from eBPF spec
+ * (https://www.kernel.org/doc/Documentation/networking/filter.txt).
+ */
+
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <bpf_def.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Possible types for external symbols.
+ */
+enum rte_bpf_xtype {
+	RTE_BPF_XTYPE_FUNC, /**< function */
+	RTE_BPF_XTYPE_VAR, /**< variable */
+	RTE_BPF_XTYPE_NUM
+};
+
+/**
+ * Definition for external symbols available in the BPF program.
+ */
+struct rte_bpf_xsym {
+	const char *name;        /**< name */
+	enum rte_bpf_xtype type; /**< type */
+	union {
+		uint64_t (*func)(uint64_t, uint64_t, uint64_t,
+				uint64_t, uint64_t);
+		void *var;
+	}; /**< value */
+};
+
+/**
+ * Possible BPF program types.
+ * Use negative values for DPDK specific prog-types, to make sure they will
+ * not interfere with Linux related ones.
+ */
+enum rte_bpf_prog_type {
+	RTE_BPF_PROG_TYPE_UNSPEC = BPF_PROG_TYPE_UNSPEC,
+	/**< input is a pointer to raw data */
+	RTE_BPF_PROG_TYPE_MBUF = INT32_MIN,
+	/**< input is a pointer to rte_mbuf */
+};
+
+/**
+ * Input parameters for loading eBPF code.
+ */
+struct rte_bpf_prm {
+	const struct bpf_insn *ins; /**< array of eBPF instructions */
+	uint32_t nb_ins;            /**< number of instructions in ins */
+	const struct rte_bpf_xsym *xsym;
+	/**< array of external symbols that eBPF code is allowed to reference */
+	uint32_t nb_xsym; /**< number of elements in xsym */
+	enum rte_bpf_prog_type prog_type; /**< eBPF program type */
+};
+
+/**
+ * Information about compiled into native ISA eBPF code.
+ */
+struct rte_bpf_jit {
+	uint64_t (*func)(void *); /**< JIT-ed native code */
+	size_t sz;                /**< size of JIT-ed code */
+};
+
+struct rte_bpf;
+
+/**
+ * De-allocate all memory used by this eBPF execution context.
+ *
+ * @param bpf
+ *   BPF handle to destroy.
+ */
+void rte_bpf_destroy(struct rte_bpf *bpf);
+
+/**
+ * Create a new eBPF execution context and load given BPF code into it.
+ *
+ * @param prm
+ *  Parameters used to create and initialise the BPF exeution context.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENOMEM - can't reserve enough memory
+ */
+struct rte_bpf *rte_bpf_load(const struct rte_bpf_prm *prm);
+
+/**
+ * Create a new eBPF execution context and load BPF code from given ELF
+ * file into it.
+ *
+ * @param prm
+ *  Parameters used to create and initialise the BPF exeution context.
+ * @param fname
+ *  Pathname for a ELF file.
+ * @param sname
+ *  Name of the executable section within the file to load.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENOMEM - can't reserve enough memory
+ */
+struct rte_bpf *rte_bpf_elf_load(const struct rte_bpf_prm *prm,
+	const char *fname, const char *sname);
+
+/**
+ * Execute given BPF bytecode.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   pointer to input context.
+ * @return
+ *   BPF execution return value.
+ */
+uint64_t rte_bpf_exec(const struct rte_bpf *bpf, void *ctx);
+
+/**
+ * Execute given BPF bytecode over a set of input contexts.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   array of pointers to the input contexts.
+ * @param rc
+ *   array of return values (one per input).
+ * @param num
+ *   number of elements in ctx[] (and rc[]).
+ * @return
+ *   number of successfully processed inputs.
+ */
+uint32_t rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[],
+	uint64_t rc[], uint32_t num);
+
+/**
+ * Provide information about natively compield code for given BPF handle.
+ *
+ * @param bpf
+ *   handle for the BPF code.
+ * @param jit
+ *   pointer to the rte_bpf_jit structure to be filled with related data.
+ * @return
+ *   - -EINVAL if the parameters are invalid.
+ *   - Zero if operation completed successfully.
+ */
+int rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_BPF_H_ */
diff --git a/lib/librte_bpf/rte_bpf_version.map b/lib/librte_bpf/rte_bpf_version.map
new file mode 100644
index 000000000..ff65144df
--- /dev/null
+++ b/lib/librte_bpf/rte_bpf_version.map
@@ -0,0 +1,12 @@
+EXPERIMENTAL {
+	global:
+
+	rte_bpf_destroy;
+	rte_bpf_elf_load;
+	rte_bpf_exec;
+	rte_bpf_exec_burst;
+	rte_bpf_get_jit;
+	rte_bpf_load;
+
+	local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index ef6159170..7ff7aaaa5 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -23,7 +23,7 @@ libraries = [ 'compat', # just a header, used for versioning
 	# add pkt framework libs which use other libs from above
 	'port', 'table', 'pipeline',
 	# flow_classify lib depends on pkt framework table lib
-	'flow_classify']
+	'flow_classify', 'bpf']
 
 foreach l:libraries
 	build = true
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 258590819..405a13147 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -83,6 +83,8 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_POWER)          += -lrte_power
 _LDLIBS-$(CONFIG_RTE_LIBRTE_TIMER)          += -lrte_timer
 _LDLIBS-$(CONFIG_RTE_LIBRTE_EFD)            += -lrte_efd
 
+_LDLIBS-$(CONFIG_RTE_LIBRTE_BPF)            += -lrte_bpf -lelf
+
 _LDLIBS-y += --whole-archive
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
-- 
2.13.6

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v2 10/15] ethdev: refine TPID handling in flow API
  @ 2018-04-06 17:11  0%     ` Andrew Rybchenko
  2018-04-09 14:42  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-06 17:11 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Ajit Khaparde, Somnath Kotur,
	John Daley, Hyong Youb Kim, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Tomasz Duszynski, Dmitri Epshtein, Natalie Samsonov, Jianbo Liu,
	Pascal Mazon

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
> consistent with the normal stacking order of pattern items, which is
> confusing to applications.
>
> Problem is that when followed by one of these layers, the EtherType field
> of the preceding layer keeps its "inner" definition, and the "outer" TPID
> is provided by the subsequent layer, the reverse of how a packet looks like
> on the wire:
>
>   Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
>   rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]
>
> Worse, when QinQ is involved, the stacking order of VLAN layers is
> unspecified. It is unclear whether it should be reversed (innermost to
> outermost) as well given TPID applies to the previous layer:
>
>   Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
>   rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
>   rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]
>
> While specifying EtherType/TPID is hopefully rarely necessary, the stacking
> order in case of QinQ and the lack of documentation remain an issue.
>
> This patch replaces TPID in the VLAN pattern item with an inner
> EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
> clarifies documentation and updates all relevant code.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
> items:
>
> - bnxt: EtherType matching is supported, and vlan->inner_type overrides
>    eth->type if the latter has standard TPID value 0x8100, otherwise an
>    error is triggered.
>
> - e1000: EtherType matching is only supported with the ETHERTYPE filter,
>    which does not support VLAN matching, therefore no impact.
>
> - enic: same as bnxt.
>
> - i40e: same as bnxt with a configurable TPID value for the FDIR filter,
>    with existing limitations on allowed EtherType values. The remaining
>    filter types (VXLAN, NVGRE, QINQ) do not support EtherType matching.
>
> - ixgbe: same as e1000, with additional minor change to rely on the new
>    E-Tag macro definition.
>
> - mlx4: EtherType/TPID matching is not supported, no impact.
>
> - mlx5: same as bnxt.
>
> - mrvl: EtherType matching is supported but eth->type cannot be specified
>    when a VLAN item is present. However vlan->inner_type is used if
>    specified.
>
> - sfc: same as bnxt with QinQ TPID value 0x88a8 additionally supported.
>
> - tap: same as bnxt.
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> Cc: Somnath Kotur <somnath.kotur@broadcom.com>
> Cc: John Daley <johndale@cisco.com>
> Cc: Hyong Youb Kim <hyonkim@cisco.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Tomasz Duszynski <tdu@semihalf.com>
> Cc: Dmitri Epshtein <dima@marvell.com>
> Cc: Natalie Samsonov <nsamsono@marvell.com>
> Cc: Jianbo Liu <jianbo.liu@arm.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
>
> ---
>
> Hi PMD maintainers, while I'm pretty confident in these changes, I could
> not validate them with all devices.
>
> It would be great if you could apply this patch, run testpmd, create VLAN
> flow rules with/without inner EtherType as described and send matching
> traffic while making sure nothing was broken in the process.
>
> Thanks!
> ---
>   app/test-pmd/cmdline_flow.c                 | 17 +++---
>   doc/guides/nics/tap.rst                     |  2 +-
>   doc/guides/prog_guide/rte_flow.rst          | 21 ++++++--
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
>   drivers/net/bnxt/bnxt_filter.c              | 39 +++++++++++---
>   drivers/net/enic/enic_flow.c                | 22 +++++---
>   drivers/net/i40e/i40e_flow.c                | 69 +++++++++++++++++++-----
>   drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
>   drivers/net/mlx5/mlx5_flow.c                | 16 +++++-
>   drivers/net/mvpp2/mrvl_flow.c               | 27 +++++++---
>   drivers/net/sfc/sfc_flow.c                  | 28 ++++++++++
>   drivers/net/tap/tap_flow.c                  | 16 ++++--
>   lib/librte_ether/rte_flow.h                 | 24 ++++++---
>   lib/librte_net/rte_ether.h                  |  1 +
>   14 files changed, 229 insertions(+), 60 deletions(-)

<...>

> diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> index bc4974edf..f61d4ec92 100644
> --- a/drivers/net/sfc/sfc_flow.c
> +++ b/drivers/net/sfc/sfc_flow.c
> @@ -7,6 +7,7 @@
>    * for Solarflare) and Solarflare Communications, Inc.
>    */
>   
> +#include <rte_byteorder.h>
>   #include <rte_tailq.h>
>   #include <rte_common.h>
>   #include <rte_ethdev_driver.h>
> @@ -351,6 +352,7 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
>   	const struct rte_flow_item_vlan *mask = NULL;
>   	const struct rte_flow_item_vlan supp_mask = {
>   		.tci = rte_cpu_to_be_16(ETH_VLAN_ID_MAX),
> +		.inner_type = RTE_BE16(0xffff),
>   	};
>   
>   	rc = sfc_flow_parse_init(item,
> @@ -393,6 +395,32 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
>   		return -rte_errno;
>   	}
>   
> +	/*
> +	 * If an EtherType was already specified, make sure it is a valid
> +	 * TPID for the current VLAN layer before overwriting it with the
> +	 * specified inner type.
> +	 */
> +	if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE &&
> +	    efx_spec->efs_ether_type != RTE_BE16(ETHER_TYPE_VLAN) &&
> +	    efx_spec->efs_ether_type != RTE_BE16(ETHER_TYPE_QINQ)) {

1. efs_ether_type is host-endian
2. HW recognizes more TPIDs (0x9100, 0x9200, 0x9300) as VLAN
3. However, if some TPID is specified, user may expect that only VLAN 
packets
     with specified TPID match. It is false expectation since the 
information is not
     passed to HW to match (and there is no way to match).
     So, it is safer to deny TPID specification (i.e. keep the first 
condition only).
     From the flexibility point of view it is possible to allow any, but 
it should be
     documented that exact match is not checked in fact.

> +		rte_flow_error_set(error, EINVAL,
> +				   RTE_FLOW_ERROR_TYPE_ITEM, item,
> +				   "Unsupported outer TPID");
> +		return -rte_errno;
> +	}
> +	if (!mask->inner_type) {
> +		efx_spec->efs_match_flags &= ~EFX_FILTER_MATCH_ETHER_TYPE;
> +		efx_spec->efs_ether_type = RTE_BE16(0x0000);

Nothing should be done here if above is done.

> +	} else if (mask->inner_type == supp_mask.inner_type) {
> +		efx_spec->efs_match_flags |= EFX_FILTER_MATCH_ETHER_TYPE;
> +		efx_spec->efs_ether_type = rte_bswap16(spec->inner_type);
> +	} else {
> +		rte_flow_error_set(error, EINVAL,
> +				   RTE_FLOW_ERROR_TYPE_ITEM, item,
> +				   "Bad mask for VLAN inner_type");
> +		return -rte_errno;
> +	}
> +
>   	return 0;
>   }

<...>

> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index fc7e6705d..b13b0e2e6 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -475,19 +481,20 @@ static const struct rte_flow_item_eth rte_flow_item_eth_mask = {
>    *
>    * Matches an 802.1Q/ad VLAN tag.
>    *
> - * This type normally follows either RTE_FLOW_ITEM_TYPE_ETH or
> - * RTE_FLOW_ITEM_TYPE_VLAN.
> + * The corresponding standard outer EtherType (TPID) values are
> + * ETHER_TYPE_VLAN or ETHER_TYPE_QINQ. It can be overridden by the preceding
> + * pattern item.
>    */
>   struct rte_flow_item_vlan {
> -	rte_be16_t tpid; /**< Tag protocol identifier. */
>   	rte_be16_t tci; /**< Tag control information. */
> +	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
>   };
>   
>   /** Default mask for RTE_FLOW_ITEM_TYPE_VLAN. */
>   #ifndef __cplusplus
>   static const struct rte_flow_item_vlan rte_flow_item_vlan_mask = {
> -	.tpid = RTE_BE16(0x0000),
> -	.tci = RTE_BE16(0xffff),
> +	.tci = RTE_BE16(0x0fff),

It looks like unrelated change.

> +	.inner_type = RTE_BE16(0x0000),
>   };
>   #endif

<...>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters
  @ 2018-04-06 17:01  0% ` Ferruh Yigit
  2018-04-10  9:43  4% ` [dpdk-dev] [PATCH v6 " Remy Horton
  1 sibling, 0 replies; 200+ results
From: Ferruh Yigit @ 2018-04-06 17:01 UTC (permalink / raw)
  To: Remy Horton, dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

On 4/6/2018 3:49 PM, Remy Horton wrote:
> The optimal values of several transmission & reception related parameters,
> such as burst sizes, descriptor ring sizes, and number of queues, varies
> between different network interface devices. This patchset allows individual
> PMDs to specify their preferred parameter values, and if so indicated by an
> application, for them to be used automatically by the ethdev layer.
> 
> rte_eth_dev_configure() has been changed so that specifying zero for both
> nb_rx_q AND nb_tx_q causes it to use driver preferred values, and if these
> are not available, falls back to EAL defaults. Setting one (but not both)
> to zero does not cause the use of defaults, as having one of them zeroed is
> a valid setup.
> 
> This patchset includes per-PMD values for e1000 and i40e but it is expected
> that subsequent patchsets will cover other PMDs. A deprecation notice
> covering the API/ABI change is in place.
> 
> Changes in v5:
> * uint_16_t corrected to uint16_t
> 
> Changes in v4:
> * Added API/ABI change documentation
> * Rebased to 78f5a2e93d74
> 
> Changes in v3:
> * Changed formatting around new rte_eth_dev_info fields
> * Added Doxygen documentation to struct rte_eth_dev_portconf
> * Testpmd "port config all burst 0" and --burst=0 uses PMD 
>   Rx burst recommendations.
> * Added to release notes
> * Rebased to 8ea081f38161
> 
> Changes in v2:
> * Rebased to master
> * Removed fallback values from rte_eth_dev_info_get()
> * Added fallback values to rte_rte_[rt]x_queue_setup()
> * Added fallback values to rte_eth_dev_configure()
> * Corrected comment
> * Removed deprecation notice
> * Split RX and Tx into seperate structures
> * Changed parameter names
> 
> 
> Remy Horton (4):
>   ethdev: add support for PMD-tuned Tx/Rx parameters
>   net/e1000: add TxRx tuning parameters
>   net/i40e: add TxRx tuning parameters
>   testpmd: make use of per-PMD TxRx parameters

For series,
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>

^ permalink raw reply	[relevance 0%]

Results 9401-9600 of ~18000   |  | reverse | sort options + mbox downloads above
-- links below jump to the message on this page --
2017-11-09 13:43     [dpdk-dev] [RFC] cmdline: rework as a wrapper to libedit Adrien Mazarguil
2018-04-17 15:21  1% ` [dpdk-dev] [PATCH v1] " Adrien Mazarguil
2018-04-17 15:59  0%   ` Burakov, Anatoly
2018-04-19 15:13  1%   ` [dpdk-dev] [PATCH v2] " Adrien Mazarguil
2017-11-24 16:06     [dpdk-dev] [RFC PATCH 0/6] mempool: add bucket mempool driver Andrew Rybchenko
2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
2018-04-16 15:33  0%     ` Olivier Matz
2018-04-16 15:41  0%       ` Andrew Rybchenko
2018-04-17 10:23  0%     ` Burakov, Anatoly
2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 05/11] mempool: add op to populate objects using provided memory Andrew Rybchenko
2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 06/11] mempool: remove callback to get capabilities Andrew Rybchenko
2018-04-16 13:24  4%   ` [dpdk-dev] [PATCH v4 07/11] mempool: deprecate xmem functions Andrew Rybchenko
2018-04-16 13:24  8%   ` [dpdk-dev] [PATCH v4 10/11] mempool: remove callback to register memory area Andrew Rybchenko
2018-04-16 13:33  3% ` [dpdk-dev] [PATCH v2 0/6] mempool: add bucket driver Andrew Rybchenko
2018-04-16 13:33  4%   ` [dpdk-dev] [PATCH v2 3/6] mempool: support block dequeue operation Andrew Rybchenko
2018-01-23 13:15     [dpdk-dev] [RFC v2 00/17] mempool: add bucket mempool driver Andrew Rybchenko
2018-03-26 16:09     ` [dpdk-dev] [PATCH v3 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
2018-03-26 16:09       ` [dpdk-dev] [PATCH v3 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
2018-04-12 15:22  0%     ` Burakov, Anatoly
2018-03-26 16:12     ` [dpdk-dev] [PATCH v1 0/6] mempool: add bucket driver Andrew Rybchenko
2018-03-26 16:12       ` [dpdk-dev] [PATCH v1 2/6] mempool: implement abstract mempool info API Andrew Rybchenko
2018-04-19 16:42  5%     ` Olivier Matz
2018-04-19 16:41  0%   ` [dpdk-dev] [PATCH v1 0/6] mempool: add bucket driver Olivier Matz
2018-02-26 15:09     [dpdk-dev] [PATCH 01/18] ethdev: support tunnel RSS level Xueming Li
2018-04-10 13:00     ` [dpdk-dev] [PATCH v2 2/5] ethdev: introduce new tunnel VXLAN-GPE Xueming Li
2018-04-11  9:59  5%   ` Adrien Mazarguil
2018-04-11 12:04  0%     ` Xueming(Steven) Li
2018-04-10 13:00     ` [dpdk-dev] [PATCH v2 4/5] app/testpmd: " Xueming Li
2018-04-11  9:59  3%   ` Adrien Mazarguil
2018-04-12  7:33  3% ` [dpdk-dev] [PATCH v3 0/5] introduce new tunnel types Xueming Li
2018-04-13 11:02  3% ` [dpdk-dev] [PATCH v4 " Xueming Li
2018-04-17 15:04  3%   ` [dpdk-dev] [PATCH v5 0/4] " Xueming Li
2018-04-17 16:05  0%     ` Iremonger, Bernard
2018-04-18 11:55  0%       ` Xueming(Steven) Li
2018-04-18 15:11  0%         ` Iremonger, Bernard
2018-04-19 14:24  0%           ` Xueming(Steven) Li
2018-04-20 11:56  3%   ` [dpdk-dev] [PATCH v6 0/5] " Xueming Li
2018-04-23 12:16  3%     ` [dpdk-dev] [PATCH v7 " Xueming Li
2018-03-10  1:25     [dpdk-dev] [PATCH v1 0/6] net/mlx5: add Multi-Packet Rx support Yongseok Koh
2018-04-02 18:50     ` [dpdk-dev] [PATCH v2 " Yongseok Koh
2018-04-02 18:50       ` [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection Yongseok Koh
2018-04-03  8:26         ` Olivier Matz
2018-04-04  0:12           ` Yongseok Koh
2018-04-09 16:04             ` Olivier Matz
2018-04-10  1:59  3%           ` Yongseok Koh
2018-04-11  0:25  0%             ` Ananyev, Konstantin
2018-04-11  5:33  0%               ` Yongseok Koh
2018-04-11 11:39  0%                 ` Ananyev, Konstantin
2018-04-11 17:08  0%                   ` Yongseok Koh
2018-04-12 16:34  0%                     ` Ananyev, Konstantin
2018-04-12 18:58  0%                       ` Yongseok Koh
2018-04-19  1:11     ` [dpdk-dev] [PATCH v3 1/2] mbuf: support attaching external buffer to mbuf Yongseok Koh
2018-04-23 16:18  4%   ` Olivier Matz
2018-03-21 11:11     [dpdk-dev] handle seq no overflow in IPsec offload Anoob Joseph
2018-04-11  6:40     ` [dpdk-dev] [PATCH v4 0/5] " Anoob Joseph
2018-04-11  6:40       ` [dpdk-dev] [PATCH v4 1/5] lib/ethdev: support for inline IPsec events Anoob Joseph
2018-04-19  9:15         ` Anoob Joseph
2018-04-20 15:14  3%       ` Stephen Hemminger
2018-03-27 15:17     [dpdk-dev] [PATCH] vhost: add virtio configuration space messages Tomasz Kulasek
2018-03-27 15:35     ` [dpdk-dev] [PATCH v2] " Tomasz Kulasek
2018-03-28  9:11       ` Maxime Coquelin
2018-03-28  9:50         ` Liu, Changpeng
2018-03-28  9:57           ` Maxime Coquelin
2018-03-28 10:03             ` Liu, Changpeng
2018-03-28 10:11               ` Maxime Coquelin
2018-03-28 10:23                 ` Liu, Changpeng
2018-03-28 10:56                   ` Maxime Coquelin
2018-04-19 14:39  0%                 ` Maxime Coquelin
2018-04-20  0:32  0%                   ` Liu, Changpeng
2018-03-28 23:29     [dpdk-dev] [PATCH 0/4] rte_flow extension for vSwitch acceleration Qi Zhang
2018-04-01 21:19     ` [dpdk-dev] [PATCH v2 " Qi Zhang
2018-04-01 21:19       ` [dpdk-dev] [PATCH v2 2/4] ether: add flow last hit query support Qi Zhang
2018-04-11 16:31  3%     ` Adrien Mazarguil
2018-04-01 21:19       ` [dpdk-dev] [PATCH v2 3/4] ether: add more protocol support in flow API Qi Zhang
2018-04-11 16:32  2%     ` Adrien Mazarguil
2018-04-12  5:12  0%       ` Zhang, Qi Z
2018-04-12  9:19  0%         ` Adrien Mazarguil
2018-04-12 10:00  0%           ` Zhang, Qi Z
2018-04-16  6:10     ` [dpdk-dev] [PATCH v3 0/4] rte_flow extension for vSwitch acceleration Qi Zhang
2018-04-16  6:10       ` [dpdk-dev] [PATCH v3 2/4] ethdev: add packet field set aciton in flow API Qi Zhang
2018-04-19 14:48         ` Adrien Mazarguil
2018-04-20  2:24           ` Zhang, Qi Z
2018-04-20  8:54  3%         ` Adrien Mazarguil
2018-03-30 17:32     [dpdk-dev] [PATCH v2 1/7] net: move BPF related definitions into librte_net Konstantin Ananyev
2018-04-06 18:49  2% ` [dpdk-dev] [PATCH v3 02/10] bpf: add BPF loading and execution framework Konstantin Ananyev
2018-04-03  9:43     [dpdk-dev] [PATCH v6 00/10] crypto: add virtio poll mode driver Jay Zhou
2018-04-14  9:34     ` [dpdk-dev] [PATCH v8 00/11] " Jay Zhou
2018-04-14  9:34  1%   ` [dpdk-dev] [PATCH v8 02/11] crypto/virtio: support virtio device init Jay Zhou
2018-04-15  8:51     ` [dpdk-dev] [PATCH v9 00/11] crypto: add virtio poll mode driver Jay Zhou
2018-04-15  8:51  1%   ` [dpdk-dev] [PATCH v9 02/11] crypto/virtio: support virtio device init Jay Zhou
2018-04-16  2:21     ` [dpdk-dev] [PATCH v10 00/10] crypto: add virtio poll mode driver Jay Zhou
2018-04-16  2:21  1%   ` [dpdk-dev] [PATCH v10 02/10] crypto/virtio: support virtio device init Jay Zhou
2018-04-17  9:23     ` [dpdk-dev] [PATCH v11 00/10] crypto: add virtio poll mode driver Jay Zhou
2018-04-17  9:23  1%   ` [dpdk-dev] [PATCH v11 02/10] crypto/virtio: support virtio device init Jay Zhou
2018-04-03 15:07     [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure Jerin Jacob
2018-04-03 15:25     ` Olivier Matz
2018-04-03 15:37       ` Jerin Jacob
2018-04-03 15:56         ` Olivier Matz
2018-04-03 16:42           ` Jerin Jacob
2018-04-04 23:38             ` Ananyev, Konstantin
2018-04-05  8:01               ` Jerin Jacob
2018-04-05 13:49                 ` Ananyev, Konstantin
2018-04-06  1:26                   ` Jerin Jacob
2018-04-11  0:33                     ` Ananyev, Konstantin
2018-04-11  2:48  4%                   ` Jerin Jacob
2018-04-11  8:40  0%                     ` Ananyev, Konstantin
2018-04-04 15:56     [dpdk-dev] [PATCH v1 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
2018-04-06 13:25     ` [dpdk-dev] [PATCH v2 00/15] " Adrien Mazarguil
2018-04-06 13:25       ` [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API Adrien Mazarguil
2018-04-07  9:15  0%     ` Andrew Rybchenko
2018-04-07  9:18  0%       ` Andrew Rybchenko
2018-04-06 13:25       ` [dpdk-dev] [PATCH v2 04/15] ethdev: remove DUP action from " Adrien Mazarguil
2018-04-07  9:23  0%     ` Andrew Rybchenko
2018-04-06 13:25       ` [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in " Adrien Mazarguil
2018-04-07  9:05  0%     ` Andrew Rybchenko
2018-04-09 14:42  0%       ` Adrien Mazarguil
2018-04-11 13:21  0%         ` Andrew Rybchenko
2018-04-06 13:25       ` [dpdk-dev] [PATCH v2 08/15] ethdev: add hash function to RSS flow API action Adrien Mazarguil
2018-04-06 15:41         ` Andrew Rybchenko
2018-04-09 14:41  0%       ` Adrien Mazarguil
2018-04-06 13:25       ` [dpdk-dev] [PATCH v2 09/15] ethdev: add encap level " Adrien Mazarguil
2018-04-07  8:27  0%     ` Andrew Rybchenko
2018-04-06 13:25       ` [dpdk-dev] [PATCH v2 10/15] ethdev: refine TPID handling in flow API Adrien Mazarguil
2018-04-06 17:11  0%     ` Andrew Rybchenko
2018-04-09 14:42  0%       ` Adrien Mazarguil
2018-04-06 13:25       ` [dpdk-dev] [PATCH v2 12/15] ethdev: update behavior of VF/PF " Adrien Mazarguil
2018-04-07  9:41  0%     ` Andrew Rybchenko
2018-04-09 14:49  0%       ` Adrien Mazarguil
2018-04-06 13:25       ` [dpdk-dev] [PATCH v2 14/15] ethdev: add physical port action to " Adrien Mazarguil
2018-04-07  9:51  0%     ` Andrew Rybchenko
2018-04-09 15:00  0%       ` Adrien Mazarguil
2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 01/16] ethdev: add error types to flow API Adrien Mazarguil
2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 04/16] ethdev: remove DUP action from " Adrien Mazarguil
2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
2018-04-11 13:06  0%       ` Andrew Rybchenko
2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
2018-04-11 12:40  0%       ` Andrew Rybchenko
2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 09/16] ethdev: add encap level " Adrien Mazarguil
2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
2018-04-11 12:45  0%       ` Andrew Rybchenko
2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item " Adrien Mazarguil
2018-04-11 12:57  0%       ` Andrew Rybchenko
2018-04-10 16:37  3%     ` [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to " Adrien Mazarguil
2018-04-11 13:00  0%       ` Andrew Rybchenko
2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and " Adrien Mazarguil
2018-04-11 13:02  0%       ` Andrew Rybchenko
2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API Adrien Mazarguil
2018-04-17 19:37  0%         ` Ferruh Yigit
2018-04-18  8:41  5%           ` Adrien Mazarguil
2018-04-18  9:24  3%             ` Ferruh Yigit
2018-04-19  9:48  5%               ` Adrien Mazarguil
2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 04/16] ethdev: remove DUP action from " Adrien Mazarguil
2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
2018-04-18 12:26  0%         ` Andrew Rybchenko
2018-04-18 14:58  0%           ` Adrien Mazarguil
2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
2018-04-17 20:18  0%         ` Thomas Monjalon
2018-04-18  6:45  0%           ` Nélio Laranjeiro
2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 09/16] ethdev: add encap level " Adrien Mazarguil
2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 14/16] ethdev: rename physical port item " Adrien Mazarguil
2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 15/16] ethdev: add physical port action to " Adrien Mazarguil
2018-04-17  9:08  0%         ` Mohammad Abdul Awal
2018-04-16 16:23  2%       ` [dpdk-dev] [PATCH v4 16/16] ethdev: add port ID item and " Adrien Mazarguil
2018-04-19 10:16  4%       ` [dpdk-dev] [PATCH v5 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
2018-04-19 10:16  3%         ` [dpdk-dev] [PATCH v5 01/16] ethdev: add error types to flow API Adrien Mazarguil
2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 04/16] ethdev: remove DUP action from " Adrien Mazarguil
2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
2018-04-23 15:05  0%           ` Nélio Laranjeiro
2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
2018-04-19 10:16  3%         ` [dpdk-dev] [PATCH v5 09/16] ethdev: add encap level " Adrien Mazarguil
2018-04-19 10:16  1%         ` [dpdk-dev] [PATCH v5 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 14/16] ethdev: rename physical port item " Adrien Mazarguil
2018-04-19 10:16  3%         ` [dpdk-dev] [PATCH v5 15/16] ethdev: add physical port action to " Adrien Mazarguil
2018-04-19 10:16  2%         ` [dpdk-dev] [PATCH v5 16/16] ethdev: add port ID item and " Adrien Mazarguil
2018-04-04 22:01     [dpdk-dev] [PATCH v2 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
2018-04-13  9:16  0% ` Burakov, Anatoly
2018-04-05 16:40     [dpdk-dev] [PATCH v5] ethdev: replace bus specific struct with generic dev Ferruh Yigit
2018-04-09 12:09  2% ` [dpdk-dev] [PATCH v6] " Ferruh Yigit
2018-04-05 18:07     [dpdk-dev] [PATCH v5 3/4] net/ifcvf: add ifcvf vdpa driver Xiao Wang
2018-04-12  7:19     ` [dpdk-dev] [PATCH v6 0/4] " Xiao Wang
2018-04-12  7:19  3%   ` [dpdk-dev] [PATCH v6 4/4] doc: add ifcvf driver document and release note Xiao Wang
2018-04-06 12:23     [dpdk-dev] [PATCH v3 0/4] ethdev: Additions to support tunnel encap/decap offload Declan Doherty
2018-04-06 12:24     ` [dpdk-dev] [PATCH v3 1/4] ethdev: add group counter support to rte_flow Declan Doherty
2018-04-06 20:26  3%   ` Adrien Mazarguil
2018-04-09 14:22  0%     ` Mohammad Abdul Awal
2018-04-09 15:23  0%       ` Adrien Mazarguil
2018-04-06 12:24     ` [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions Declan Doherty
2018-04-06 20:26  2%   ` Adrien Mazarguil
2018-04-09 16:10  0%     ` Mohammad Abdul Awal
2018-04-10 10:19  0%       ` Adrien Mazarguil
2018-04-10 11:06  0%         ` Shahaf Shuler
2018-04-17 14:58  3%     ` Doherty, Declan
2018-04-06 12:24     ` [dpdk-dev] [PATCH v3 3/4] ethdev: Add group action type to rte_flow Declan Doherty
2018-04-06 20:26  3%   ` Adrien Mazarguil
2018-04-17 14:40  0%     ` Doherty, Declan
2018-04-18 21:04     ` [dpdk-dev] [PATCH v4 0/6] additions to support tunnel encap/decap Declan Doherty
2018-04-18 21:04       ` [dpdk-dev] [PATCH v4 2/6] ethdev: Add jump action type to rte_flow Declan Doherty
2018-04-19 13:03  3%     ` Adrien Mazarguil
2018-04-23 15:56       ` [dpdk-dev] [PATCH v5 0/4] ethdev additions to support tunnel encap/decap Declan Doherty
2018-04-23 15:56  2%     ` [dpdk-dev] [PATCH v5 2/4] ethdev: Add group JUMP action Declan Doherty
2018-04-06 13:22     [dpdk-dev] [PATCH v3 00/11] Bunch of flow API-related fixes Adrien Mazarguil
2018-04-10 16:34  3% ` [dpdk-dev] [PATCH v4 " Adrien Mazarguil
2018-04-16 16:21  3%   ` [dpdk-dev] [PATCH v5 " Adrien Mazarguil
2018-04-17  9:17  0%     ` Ferruh Yigit
2018-04-19 10:07  3%     ` [dpdk-dev] [PATCH v6 " Adrien Mazarguil
2018-04-19 14:03  0%       ` Ferruh Yigit
2018-04-19 14:07  0%         ` Ferruh Yigit
2018-04-06 13:51     [dpdk-dev] [PATCH] app/test: enhance power manager unit tests Reshma Pattan
2018-04-10 14:19  3% ` Hunt, David
2018-04-06 14:49     [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters Remy Horton
2018-04-06 17:01  0% ` Ferruh Yigit
2018-04-10  9:43  4% ` [dpdk-dev] [PATCH v6 " Remy Horton
2018-04-10  9:43  7%   ` [dpdk-dev] [PATCH v6 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
2018-04-10 18:56  0%   ` [dpdk-dev] [PATCH v6 0/4] ethdev: add per-PMD tuning of RxTx parmeters Ferruh Yigit
2018-04-06 18:49     [dpdk-dev] [PATCH v3 01/10] net: move BPF related definitions into librte_net Konstantin Ananyev
2018-04-13 14:43  2% ` [dpdk-dev] [PATCH v4 02/10] bpf: add BPF loading and execution framework Konstantin Ananyev
2018-04-09 12:10     [dpdk-dev] [PATCH v1 1/2] ethdev: add supported hash function check Xueming Li
2018-04-20 13:06     ` [dpdk-dev] [PATCH v5 1/2] ethdev: introduce generic IP/UDP tunnel checksum and TSO Xueming Li
2018-04-20 13:48  3%   ` Ferruh Yigit
2018-04-20 14:23  0%     ` Ferruh Yigit
2018-04-20 14:23  0%     ` Xueming(Steven) Li
2018-04-09 12:49     [dpdk-dev] [PATCH] table: fix build error with gcc 8 Jasvinder Singh
2018-04-09 15:09     ` Stephen Hemminger
2018-04-09 15:58       ` Dumitrescu, Cristian
2018-04-09 16:38  4%     ` Van Haaren, Harry
2018-04-09 16:43  0%       ` Ferruh Yigit
2018-04-09 17:05  0%         ` Dumitrescu, Cristian
2018-04-09 17:02  4%       ` Dumitrescu, Cristian
2018-04-09 17:09  0%         ` Ananyev, Konstantin
2018-04-09 17:26  0%           ` Dumitrescu, Cristian
2018-04-10 12:32  0%             ` Van Haaren, Harry
2018-04-10 11:43  0%       ` Neil Horman
2018-04-09 13:11  5% [dpdk-dev] [PATCH v1] doc: add SPDX Licence to doc files Marko Kovacevic
2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 03/13] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 04/13] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 06/13] kni: replace rte_panic instances in kni Arnon Warshavsky
2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 11/13] eal: replace rte_panic instances in ethdev Arnon Warshavsky
2018-04-13 18:30  2% ` [dpdk-dev] [PATCH v3 12/13] eal: replace rte_panic instances in init sequence Arnon Warshavsky
2018-04-16 11:22  0% ` [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Burakov, Anatoly
2018-04-17 15:14     [dpdk-dev] [PATCH v4 00/11] mlx5 Rx tunnel offloading Xueming Li
2018-04-20 12:23     ` [dpdk-dev] [PATCH v5 04/11] net/mlx5: support Rx tunnel type identification Xueming Li
2018-04-23  7:40  4%   ` Nélio Laranjeiro
2018-04-23  7:56  0%     ` Xueming(Steven) Li
2018-04-17 18:31     [dpdk-dev] Retire x86 32 bit? Stephen Hemminger
2018-04-17 18:39     ` David Harton (dharton)
2018-04-17 20:01       ` Jim Murphy
2018-04-17 20:46         ` Stephen Hemminger
2018-04-17 21:18  3%       ` Roger B Melton
2018-04-18 17:40  0%         ` Jim Murphy
2018-04-19  6:00  3% [dpdk-dev] [PATCH v4 00/11] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 02/11] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
2018-04-19 17:25  0%   ` Kevin Traynor
2018-04-20 13:13  0%     ` Arnon Warshavsky
2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 03/11] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
2018-04-19 17:25  0%   ` Kevin Traynor
2018-04-20 13:14  0%     ` Arnon Warshavsky
2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 04/11] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
2018-04-19 17:26  0%   ` Kevin Traynor
2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 06/11] kni: replace rte_panic instances in kni Arnon Warshavsky
2018-04-19  6:01  3% ` [dpdk-dev] [PATCH v4 09/11] eal: replace rte_panic instances in ethdev Arnon Warshavsky
2018-04-19 17:27  0%   ` Kevin Traynor
2018-04-19  6:01  2% ` [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence Arnon Warshavsky
2018-04-19 14:39  3%   ` Burakov, Anatoly
2018-04-19 14:48  0%     ` Arnon Warshavsky
2018-04-19 17:48  0%   ` Aaron Conole
2018-04-20 13:55  4%     ` Arnon Warshavsky
2018-04-20 14:53  4%       ` Aaron Conole
2018-04-23  8:07  4%         ` Arnon Warshavsky
2018-04-23  7:27  3% [dpdk-dev] [PATCH] lib/ethdev: moving IPsec event enum to the end Anoob Joseph
2018-04-23  8:02  0% ` De Lara Guarch, Pablo
2018-04-23 21:28     [dpdk-dev] [PATCH v5 00/11] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
2018-04-23 21:28  3% ` [dpdk-dev] [PATCH v5 02/11] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
2018-04-23 21:28  3% ` [dpdk-dev] [PATCH v5 03/11] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
2018-04-23 21:28  3% ` [dpdk-dev] [PATCH v5 04/11] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).