DPDK patches and discussions
* [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                         ` (2 preceding siblings ...)
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
@ 2018-04-16 16:22  1%       ` Adrien Mazarguil
  3 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

This patch replaces C99-style flexible arrays in struct rte_flow_action_rss
and struct rte_flow_item_raw with standard pointers to the same data.

They proved difficult to use in the field (e.g. no possibility of static
initialization) and unsuitable for C++ applications.
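
As a rough illustration of what this enables, a RAW pattern item can now be
initialized statically, with its data kept in separate storage (a minimal
sketch based on the definitions below; names and values are illustrative
only):

  static const uint8_t raw_spec_pattern[] = { 0xca, 0xfe, 0xba, 0xbe };

  static const struct rte_flow_item_raw raw_spec = {
          .offset = 0,
          .limit = 0,
          .length = sizeof(raw_spec_pattern),
          .pattern = raw_spec_pattern, /* pointer, no flexible array */
  };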

Affected PMDs and examples are updated accordingly.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c        | 117 +++++++++++++++++---------------
 app/test-pmd/config.c              |  25 ++++---
 doc/guides/prog_guide/rte_flow.rst |  18 ++---
 drivers/net/mlx4/mlx4_flow.c       |  22 +++---
 drivers/net/mlx5/mlx5_flow.c       |  20 +++---
 examples/ipsec-secgw/ipsec.c       |  17 ++---
 lib/librte_ether/rte_flow.c        |  25 ++++---
 lib/librte_ether/rte_flow.h        |   8 ++-
 8 files changed, 135 insertions(+), 117 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 2ddb08feb..798b7948d 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -179,25 +179,22 @@ enum index {
 	ACTION_METER_ID,
 };
 
-/** Size of pattern[] field in struct rte_flow_item_raw. */
-#define ITEM_RAW_PATTERN_SIZE 36
+/** Maximum size for pattern in struct rte_flow_item_raw. */
+#define ITEM_RAW_PATTERN_SIZE 40
 
 /** Storage size for struct rte_flow_item_raw including pattern. */
 #define ITEM_RAW_SIZE \
-	(offsetof(struct rte_flow_item_raw, pattern) + ITEM_RAW_PATTERN_SIZE)
+	(sizeof(struct rte_flow_item_raw) + ITEM_RAW_PATTERN_SIZE)
 
 /** Maximum number of queue indices in struct rte_flow_action_rss. */
 #define ACTION_RSS_QUEUE_NUM 32
 
 /** Storage for struct rte_flow_action_rss including external data. */
-union action_rss_data {
+struct action_rss_data {
 	struct rte_flow_action_rss conf;
-	struct {
-		uint8_t conf_data[offsetof(struct rte_flow_action_rss, queue)];
-		uint16_t queue[ACTION_RSS_QUEUE_NUM];
-		struct rte_eth_rss_conf rss_conf;
-		uint8_t rss_key[RSS_HASH_KEY_LENGTH];
-	} s;
+	uint16_t queue[ACTION_RSS_QUEUE_NUM];
+	struct rte_eth_rss_conf rss_conf;
+	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -320,13 +317,6 @@ struct token {
 		.size = sizeof(*((s *)0)->f), \
 	})
 
-/** Static initializer for ARGS() with arbitrary size. */
-#define ARGS_ENTRY_USZ(s, f, sz) \
-	(&(const struct arg){ \
-		.offset = offsetof(s, f), \
-		.size = (sz), \
-	})
-
 /** Static initializer for ARGS() with arbitrary offset and size. */
 #define ARGS_ENTRY_ARB(o, s) \
 	(&(const struct arg){ \
@@ -1105,9 +1095,9 @@ static const struct token token_list[] = {
 			     NEXT_ENTRY(ITEM_PARAM_IS,
 					ITEM_PARAM_SPEC,
 					ITEM_PARAM_MASK)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, length),
-			     ARGS_ENTRY_USZ(struct rte_flow_item_raw,
-					    pattern,
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, pattern),
+			     ARGS_ENTRY(struct rte_flow_item_raw, length),
+			     ARGS_ENTRY_ARB(sizeof(struct rte_flow_item_raw),
 					    ITEM_RAW_PATTERN_SIZE)),
 	},
 	[ITEM_ETH] = {
@@ -1591,7 +1581,7 @@ static const struct token token_list[] = {
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
-		.priv = PRIV_ACTION(RSS, sizeof(union action_rss_data)),
+		.priv = PRIV_ACTION(RSS, sizeof(struct action_rss_data)),
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
@@ -1610,23 +1600,21 @@ static const struct token token_list[] = {
 		.name = "key",
 		.help = "RSS hash key",
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
-		.args = ARGS(ARGS_ENTRY_ARB
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
+			     ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len)),
-			     ARGS_ENTRY_ARB
-			     (((uintptr_t)((union action_rss_data *)0)->
-			       s.rss_key),
-			      RSS_HASH_KEY_LENGTH)),
+			     ARGS_ENTRY(struct action_rss_data, rss_key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len),
 			      0,
@@ -2067,7 +2055,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 {
 	struct buffer *out = buf;
 	struct rte_flow_action *action;
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 	int ret;
 
@@ -2085,29 +2073,29 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	ctx->objmask = NULL;
 	/* Set up default configuration. */
 	action_rss_data = ctx->object;
-	*action_rss_data = (union action_rss_data){
+	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->s.rss_conf,
+			.rss_conf = &action_rss_data->rss_conf,
 			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.queue = action_rss_data->queue,
 		},
+		.queue = { 0 },
+		.rss_conf = (struct rte_eth_rss_conf){
+			.rss_key = action_rss_data->rss_key,
+			.rss_key_len = sizeof(action_rss_data->rss_key),
+			.rss_hf = rss_hf,
+		},
+		.rss_key = "testpmd's default RSS hash key",
 	};
-	action_rss_data->s.rss_conf = (struct rte_eth_rss_conf){
-		.rss_key = action_rss_data->s.rss_key,
-		.rss_key_len = sizeof(action_rss_data->s.rss_key),
-		.rss_hf = rss_hf,
-	};
-	strncpy((void *)action_rss_data->s.rss_key,
-		"testpmd's default RSS hash key",
-		sizeof(action_rss_data->s.rss_key));
 	for (i = 0; i < action_rss_data->conf.num; ++i)
-		action_rss_data->conf.queue[i] = i;
+		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
 		struct rte_eth_dev_info info;
 
 		rte_eth_dev_info_get(ctx->port, &info);
-		action_rss_data->s.rss_conf.rss_key_len =
-			RTE_MIN(sizeof(action_rss_data->s.rss_key),
+		action_rss_data->rss_conf.rss_key_len =
+			RTE_MIN(sizeof(action_rss_data->rss_key),
 				info.hash_key_size);
 	}
 	action->conf = &action_rss_data->conf;
@@ -2125,7 +2113,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_TYPE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 
 	(void)token;
@@ -2135,7 +2123,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->s.rss_conf.rss_hf = 0;
+		action_rss_data->rss_conf.rss_hf = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2154,7 +2142,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->s.rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2169,7 +2157,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_QUEUE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	int ret;
 	int i;
 
@@ -2186,10 +2174,9 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (i >= ACTION_RSS_QUEUE_NUM)
 		return -1;
 	if (push_args(ctx,
-		      ARGS_ENTRY_ARB(offsetof(struct rte_flow_action_rss,
-					      queue) +
-				     i * sizeof(action_rss_data->s.queue[i]),
-				     sizeof(action_rss_data->s.queue[i]))))
+		      ARGS_ENTRY_ARB(offsetof(struct action_rss_data, queue) +
+				     i * sizeof(action_rss_data->queue[i]),
+				     sizeof(action_rss_data->queue[i]))))
 		return -1;
 	ret = parse_int(ctx, token, str, len, NULL, 0);
 	if (ret < 0) {
@@ -2206,6 +2193,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 		return len;
 	action_rss_data = ctx->object;
 	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
 
@@ -2483,8 +2471,8 @@ parse_int(struct context *ctx, const struct token *token,
 /**
  * Parse a string.
  *
- * Two arguments (ctx->args) are retrieved from the stack to store data and
- * its length (in that order).
+ * Three arguments (ctx->args) are retrieved from the stack to store data,
+ * its actual length and address (in that order).
  */
 static int
 parse_string(struct context *ctx, const struct token *token,
@@ -2493,6 +2481,7 @@ parse_string(struct context *ctx, const struct token *token,
 {
 	const struct arg *arg_data = pop_args(ctx);
 	const struct arg *arg_len = pop_args(ctx);
+	const struct arg *arg_addr = pop_args(ctx);
 	char tmp[16]; /* Ought to be enough. */
 	int ret;
 
@@ -2503,6 +2492,11 @@ parse_string(struct context *ctx, const struct token *token,
 		push_args(ctx, arg_data);
 		return -1;
 	}
+	if (!arg_addr) {
+		push_args(ctx, arg_len);
+		push_args(ctx, arg_data);
+		return -1;
+	}
 	size = arg_data->size;
 	/* Bit-mask fill is not supported. */
 	if (arg_data->mask || size < len)
@@ -2525,8 +2519,23 @@ parse_string(struct context *ctx, const struct token *token,
 	memset((uint8_t *)buf + len, 0x00, size - len);
 	if (ctx->objmask)
 		memset((uint8_t *)ctx->objmask + arg_data->offset, 0xff, len);
+	/* Save address if requested. */
+	if (arg_addr->size) {
+		memcpy((uint8_t *)ctx->object + arg_addr->offset,
+		       (void *[]){
+			(uint8_t *)ctx->object + arg_data->offset
+		       },
+		       arg_addr->size);
+		if (ctx->objmask)
+			memcpy((uint8_t *)ctx->objmask + arg_addr->offset,
+			       (void *[]){
+				(uint8_t *)ctx->objmask + arg_data->offset
+			       },
+			       arg_addr->size);
+	}
 	return len;
 error:
+	push_args(ctx, arg_addr);
 	push_args(ctx, arg_len);
 	push_args(ctx, arg_data);
 	return -1;
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index d0d372797..95618e4eb 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -977,7 +977,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -1026,14 +1026,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = flow_item[item->type].size;
@@ -1065,7 +1071,7 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
@@ -1096,11 +1102,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 80360d068..acbeaacbd 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1309,15 +1309,15 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+------------------------------+
-   | Field        | Value                        |
-   +==============+==============================+
-   | ``rss_conf`` | RSS parameters               |
-   +--------------+------------------------------+
-   | ``num``      | number of entries in queue[] |
-   +--------------+------------------------------+
-   | ``queue[]``  | queue indices to use         |
-   +--------------+------------------------------+
+   +--------------+--------------------------------+
+   | Field        | Value                          |
+   +==============+================================+
+   | ``rss_conf`` | RSS parameters                 |
+   +--------------+--------------------------------+
+   | ``num``      | number of entries in ``queue`` |
+   +--------------+--------------------------------+
+   | ``queue``    | queue indices to use           |
+   +--------------+--------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 15cdf07b7..8feb6ae31 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -1282,14 +1282,16 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	 */
 	uint32_t queues =
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
-	alignas(struct rte_flow_action_rss) uint8_t rss_conf_data
-		[offsetof(struct rte_flow_action_rss, queue) +
-		 sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues];
-	struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data;
+	uint16_t queue[queues];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = NULL, /* Rely on default fallback settings. */
+		.num = queues,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
-			.conf = rss_conf,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -1311,12 +1313,8 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	if (!queues)
 		goto error;
 	/* Prepare default RSS configuration. */
-	*rss_conf = (struct rte_flow_action_rss){
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
-	};
 	for (i = 0; i != queues; ++i)
-		rss_conf->queue[i] = i;
+		queue[i] = i;
 	/*
 	 * Set up VLAN item if filtering is enabled and at least one VLAN
 	 * filter is configured.
@@ -1375,7 +1373,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 			if (j != sizeof(mac->addr_bytes))
 				continue;
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				continue;
 			break;
@@ -1415,7 +1413,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		if (flow && flow->internal) {
 			assert(flow->rss);
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				flow = NULL;
 		}
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 9923bfa59..75ea0cbcb 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -2446,9 +2446,16 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 			.type = RTE_FLOW_ITEM_TYPE_END,
 		},
 	};
+	uint16_t queue[priv->reta_idx_n];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = &priv->rss_conf,
+		.num = priv->reta_idx_n,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -2457,24 +2464,13 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	struct rte_flow *flow;
 	struct rte_flow_error error;
 	unsigned int i;
-	union {
-		struct rte_flow_action_rss rss;
-		struct {
-			const struct rte_eth_rss_conf *rss_conf;
-			uint16_t num;
-			uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-		} local;
-	} action_rss;
 
 	if (!priv->reta_idx_n) {
 		rte_errno = EINVAL;
 		return -rte_errno;
 	}
 	for (i = 0; i != priv->reta_idx_n; ++i)
-		action_rss.local.queue[i] = (*priv->reta_idx)[i];
-	action_rss.local.rss_conf = &priv->rss_conf;
-	action_rss.local.num = priv->reta_idx_n;
-	actions[0].conf = (const void *)&action_rss.rss;
+		queue[i] = (*priv->reta_idx)[i];
 	flow = mlx5_flow_list_create(dev, &priv->ctrl_flows, &attr, items,
 				     actions, &error);
 	if (!flow)
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 5fb5bc16e..8b2047adb 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -186,14 +186,8 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 					.rss_key_len = 40,
 				};
 				struct rte_eth_dev *eth_dev;
-				union {
-					struct rte_flow_action_rss rss;
-					struct {
-					const struct rte_eth_rss_conf *rss_conf;
-					uint16_t num;
-					uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-					} local;
-				} action_rss;
+				uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
+				struct rte_flow_action_rss action_rss;
 				unsigned int i;
 				unsigned int j;
 
@@ -207,9 +201,10 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				for (i = 0, j = 0;
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
-						action_rss.local.queue[j++] = i;
-				action_rss.local.num = j;
-				action_rss.local.rss_conf = &rss_conf;
+						queue[j++] = i;
+				action_rss.rss_conf = &rss_conf;
+				action_rss.num = j;
+				action_rss.queue = queue;
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 80f9cb6cb..bb19e28c6 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,7 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -73,7 +73,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 };
@@ -282,14 +282,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = rte_flow_desc_item[item->type].size;
@@ -326,11 +332,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 96184f030..ad2e55b8e 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -14,6 +14,7 @@
  * associated actions in hardware through flow rules.
  */
 
+#include <stddef.h>
 #include <stdint.h>
 
 #include <rte_arp.h>
@@ -432,7 +433,7 @@ struct rte_flow_item_raw {
 	int32_t offset; /**< Absolute or relative offset for pattern. */
 	uint16_t limit; /**< Search area limit for start of pattern. */
 	uint16_t length; /**< Pattern length. */
-	uint8_t pattern[]; /**< Byte string to look for. */
+	const uint8_t *pattern; /**< Byte string to look for. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_RAW. */
@@ -444,6 +445,7 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
 	.offset = 0xffffffff,
 	.limit = 0xffff,
 	.length = 0xffff,
+	.pattern = NULL,
 };
 #endif
 
@@ -1037,8 +1039,8 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
-	uint16_t queue[]; /**< Queues indices to use. */
+	uint16_t num; /**< Number of entries in @p queue. */
+	const uint16_t *queue; /**< Queue indices to use. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API Adrien Mazarguil
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 04/16] ethdev: remove DUP action from " Adrien Mazarguil
@ 2018-04-16 16:22  1%       ` Adrien Mazarguil
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
  3 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Wenzhuo Lu, John Daley, Gaetan Rivet, Beilei Xing,
	Konstantin Ananyev, Nelio Laranjeiro, Andrew Rybchenko,
	Pascal Mazon

This patch makes the following changes to flow rule actions:

- List order now matters: actions are redefined as performed first to last
  instead of "all simultaneously".

- Repeated actions are now supported (e.g. specifying QUEUE multiple times
  now duplicates traffic among them). Previously only the last action of
  any given kind was taken into account.

- No more distinction between terminating/non-terminating/meta actions.
  Flow rules themselves are now defined as always terminating unless a
  PASSTHRU action is specified.

These changes alter the behavior of flow rules in corner cases in order to
prepare the flow API for actions that modify traffic contents or properties
(e.g. encapsulation, compression) and for which order matters when combined.

Previously one would have had to do so through multiple flow rules by
combining PASSTHRU with priority levels; however, this proved overly complex
to implement at the PMD level, hence this simpler approach.
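
As a rough example of the new semantics, the following action list counts
matching packets and then delivers copies to queues 5 and 3, something the
previous "only the last action of a given type counts" rule could not
express (sketch only; queue indices are arbitrary):

  struct rte_flow_action_queue q5 = { .index = 5 };
  struct rte_flow_action_queue q3 = { .index = 3 };
  struct rte_flow_action actions[] = {
          { .type = RTE_FLOW_ACTION_TYPE_COUNT },
          { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q5 },
          { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q3 },
          { .type = RTE_FLOW_ACTION_TYPE_END },
  };

Actions are performed in that order; since no PASSTHRU is present, the rule
remains terminating.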

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

PMDs with rte_flow support are modified accordingly:

- bnxt: no change, implementation already forbids multiple actions and does
  not support PASSTHRU.

- e1000: no change, same as bnxt.

- enic: modified to forbid redundant actions, no support for default drop.

- failsafe: no change needed.

- i40e: no change, implementation already forbids multiple actions.

- ixgbe: same as i40e.

- mlx4: modified to forbid multiple fate-deciding actions and drop when
  unspecified.

- mlx5: same as mlx4, with other redundant actions also forbidden.

- sfc: same as mlx4.

- tap: implementation already complies with the new behavior except for
  the default pass-through modified as a default drop.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@oktetlabs.ru>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: John Daley <johndale@cisco.com>
Cc: Gaetan Rivet <gaetan.rivet@6wind.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 doc/guides/prog_guide/rte_flow.rst | 67 +++++++++++++-------------------
 drivers/net/enic/enic_flow.c       | 25 ++++++++++++
 drivers/net/mlx4/mlx4_flow.c       | 21 +++++++---
 drivers/net/mlx5/mlx5_flow.c       | 69 ++++++++++++++-------------------
 drivers/net/sfc/sfc_flow.c         | 22 +++++++----
 drivers/net/tap/tap_flow.c         | 11 ++++++
 lib/librte_ether/rte_flow.h        | 54 +++++++-------------------
 7 files changed, 138 insertions(+), 131 deletions(-)

diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a237e4fd2..80360d068 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -995,28 +995,27 @@ Actions
 
 Each possible action is represented by a type. Some have associated
 configuration structures. Several actions combined in a list can be assigned
-to a flow rule. That list is not ordered.
+to a flow rule and are performed in order.
 
 They fall in three categories:
 
-- Terminating actions that prevent processing matched packets by subsequent
-  flow rules, unless overridden with PASSTHRU.
+- Actions that modify the fate of matching traffic, for instance by dropping
+  or assigning it a specific destination.
 
-- Non-terminating actions that leave matched packets up for additional
-  processing by subsequent flow rules.
+- Actions that modify matching traffic contents or its properties. This
+  includes adding/removing encapsulation, encryption, compression and marks.
 
-- Other non-terminating meta actions that do not affect the fate of packets.
+- Actions related to the flow rule itself, such as updating counters or
+  making it non-terminating.
 
-When several actions are combined in a flow rule, they should all have
-different types (e.g. dropping a packet twice is not possible).
+Flow rules being terminating by default, not specifying any action of the
+fate kind results in undefined behavior. This applies to both ingress and
+egress.
 
-Only the last action of a given type is taken into account. PMDs still
-perform error checking on the entire list.
+PASSTHRU, when supported, makes a flow rule non-terminating.
 
 Like matching patterns, action lists are terminated by END items.
 
-*Note that PASSTHRU is the only action able to override a terminating rule.*
-
 Example of action that redirects packets to queue index 10:
 
 .. _table_rte_flow_action_example:
@@ -1029,12 +1028,11 @@ Example of action that redirects packets to queue index 10:
    | ``index`` | 10    |
    +-----------+-------+
 
-Action lists examples, their order is not significant, applications must
-consider all actions to be performed simultaneously:
+Actions are performed in list order:
 
-.. _table_rte_flow_count_and_drop:
+.. _table_rte_flow_count_then_drop:
 
-.. table:: Count and drop
+.. table:: Count then drop
 
    +-------+--------+
    | Index | Action |
@@ -1050,7 +1048,7 @@ consider all actions to be performed simultaneously:
 
 .. _table_rte_flow_mark_count_redirect:
 
-.. table:: Mark, count and redirect
+.. table:: Mark, count then redirect
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1080,12 +1078,15 @@ consider all actions to be performed simultaneously:
    | 2     | END                        |
    +-------+----------------------------+
 
-In the above example, considering both actions are performed simultaneously,
-the end result is that only QUEUE has any effect.
+In the above example, while DROP and QUEUE must be performed in order, both
+have to happen before reaching END. Only QUEUE has a visible effect.
+
+Note that such a list may be thought as ambiguous and rejected on that
+basis.
 
-.. _table_rte_flow_redirect_queue_3:
+.. _table_rte_flow_redirect_queue_5_3:
 
-.. table:: Redirect to queue 3
+.. table:: Redirect to queues 5 and 3
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1099,9 +1100,9 @@ the end result is that only QUEUE has any effect.
    | 3     | END                        |
    +-------+----------------------------+
 
-As previously described, only the last action of a given type found in the
-list is taken into account. The above example also shows that VOID is
-ignored.
+As previously described, all actions must be taken into account. This
+effectively duplicates traffic to both queues. The above example also shows
+that VOID is ignored.
 
 Action types
 ~~~~~~~~~~~~
@@ -1151,9 +1152,8 @@ PMDs.
 Action: ``PASSTHRU``
 ^^^^^^^^^^^^^^^^^^^^
 
-Leaves packets up for additional processing by subsequent flow rules. This
-is the default when a rule does not contain a terminating action, but can be
-specified to force a rule to become non-terminating.
+Leaves traffic up for additional processing by subsequent flow rules; makes
+a flow rule non-terminating.
 
 - No configurable properties.
 
@@ -1227,8 +1227,6 @@ Action: ``QUEUE``
 
 Assigns packets to a given queue index.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_queue:
 
 .. table:: QUEUE
@@ -1245,8 +1243,6 @@ Action: ``DROP``
 Drop packets.
 
 - No configurable properties.
-- Terminating by default.
-- PASSTHRU overrides this action if both are specified.
 
 .. _table_rte_flow_action_drop:
 
@@ -1309,8 +1305,6 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1331,7 +1325,6 @@ Action: ``PF``
 Redirects packets to the physical function (PF) of the current device.
 
 - No configurable properties.
-- Terminating by default.
 
 .. _table_rte_flow_action_pf:
 
@@ -1353,8 +1346,6 @@ ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1378,8 +1369,6 @@ action parameter. More than one flow can use the same MTR object through
 the meter action. The MTR object can be further updated or queried using
 the rte_mtr* API.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_meter:
 
 .. table:: METER
@@ -1415,8 +1404,6 @@ direction.
 
 Multiple flows can be configured to use the same security session.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_security:
 
 .. table:: SECURITY
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index b9f36587c..a5c6a1670 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -3,6 +3,7 @@
  */
 
 #include <errno.h>
+#include <stdint.h>
 #include <rte_log.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow_driver.h>
@@ -964,6 +965,9 @@ static int
 enic_copy_action_v1(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -975,6 +979,10 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			break;
@@ -984,6 +992,8 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!overlap & FATE)
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_RQ_STEERING;
 	return 0;
 }
@@ -1001,6 +1011,9 @@ static int
 enic_copy_action_v2(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, MARK = 2, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -1009,6 +1022,10 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			enic_action->flags |= FILTER_ACTION_RQ_STEERING_FLAG;
@@ -1019,6 +1036,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			/* ENIC_MAGIC_FILTER_ID is reserved and is the highest
 			 * in the range of allows mark ids.
 			 */
@@ -1029,6 +1049,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 		case RTE_FLOW_ACTION_TYPE_FLAG: {
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			enic_action->filter_id = ENIC_MAGIC_FILTER_ID;
 			enic_action->flags |= FILTER_ACTION_FILTER_ID_FLAG;
 			break;
@@ -1044,6 +1067,8 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!overlap & FATE)
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_V2;
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 67fd568bc..15cdf07b7 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -637,6 +637,7 @@ mlx4_flow_prepare(struct priv *priv,
 	struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
 	struct rte_flow *flow = &temp;
 	const char *msg = NULL;
+	int overlap;
 
 	if (attr->group)
 		return rte_flow_error_set
@@ -656,6 +657,7 @@ mlx4_flow_prepare(struct priv *priv,
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
 			 NULL, "only ingress is supported");
 fill:
+	overlap = 0;
 	proc = mlx4_flow_proc_item_list;
 	/* Go over pattern. */
 	for (item = pattern; item->type; ++item) {
@@ -702,6 +704,16 @@ mlx4_flow_prepare(struct priv *priv,
 	}
 	/* Go over actions list. */
 	for (action = actions; action->type; ++action) {
+		/* This one may appear anywhere multiple times. */
+		if (action->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (overlap) {
+			msg = "cannot combine several fate-deciding actions,"
+				" choose between DROP, QUEUE or RSS";
+			goto exit_action_not_supported;
+		}
+		overlap = 1;
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
@@ -709,8 +721,6 @@ mlx4_flow_prepare(struct priv *priv,
 			uint64_t fields;
 			unsigned int i;
 
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			continue;
 		case RTE_FLOW_ACTION_TYPE_DROP:
 			flow->drop = 1;
 			break;
@@ -801,10 +811,9 @@ mlx4_flow_prepare(struct priv *priv,
 			goto exit_action_not_supported;
 		}
 	}
-	if (!flow->rss && !flow->drop)
-		return rte_flow_error_set
-			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-			 NULL, "no valid action");
+	/* When fate is unknown, drop traffic. */
+	if (!overlap)
+		flow->drop = 1;
 	/* Validation ends here. */
 	if (!addr) {
 		if (flow->rss)
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 1ca413e32..9923bfa59 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -4,6 +4,7 @@
  */
 
 #include <sys/queue.h>
+#include <stdint.h>
 #include <string.h>
 
 /* Verbs header. */
@@ -638,6 +639,8 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			  struct rte_flow_error *error,
 			  struct mlx5_flow_parse *parser)
 {
+	enum { FATE = 1, MARK = 2, COUNT = 4, };
+	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
 	int ret;
 
@@ -654,39 +657,31 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			parser->drop = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
-			uint16_t n;
-			uint16_t found = 0;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			for (n = 0; n < parser->queues_n; ++n) {
-				if (parser->queues[n] == queue->index) {
-					found = 1;
-					break;
-				}
-			}
-			if (parser->queues_n > 1 && !found) {
-				rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions,
-					   "queue action not in RSS queues");
-				return -rte_errno;
-			}
-			if (!found) {
-				parser->queues_n = 1;
-				parser->queues[0] = queue->index;
-			}
+			parser->queues_n = 1;
+			parser->queues[0] = queue->index;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
 			uint16_t n;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!rss || !rss->num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,26 +689,6 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (parser->queues_n == 1) {
-				uint16_t found = 0;
-
-				assert(parser->queues_n);
-				for (n = 0; n < rss->num; ++n) {
-					if (parser->queues[0] ==
-					    rss->queue[n]) {
-						found = 1;
-						break;
-					}
-				}
-				if (!found) {
-					rte_flow_error_set(error, ENOTSUP,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "queue action not in RSS"
-						   " queues");
-					return -rte_errno;
-				}
-			}
 			if (rss->num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -747,6 +722,9 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			if (!mark) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -764,14 +742,23 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			parser->mark = 1;
 			parser->mark_id = mark->id;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_FLAG) {
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			parser->mark = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT &&
 			   priv->config.flow_counter_en) {
+			if (overlap & COUNT)
+				goto exit_action_overlap;
+			overlap |= COUNT;
 			parser->count = 1;
 		} else {
 			goto exit_action_not_supported;
 		}
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!overlap & FATE)
+		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
 	if (!parser->queues_n && !parser->drop) {
@@ -784,6 +771,10 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
 			   actions, "action not supported");
 	return -rte_errno;
+exit_action_overlap:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "overlapping actions are not supported");
+	return -rte_errno;
 }
 
 /**
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index fe4c0b0c5..056405515 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1467,10 +1467,19 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 	}
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		/* This one may appear anywhere multiple times. */
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (is_specified) {
+			rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				 actions,
+				 "Cannot combine several fate-deciding actions,"
+				 "choose between QUEUE, RSS or DROP");
+			return -rte_errno;
+		}
 		switch (actions->type) {
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			break;
-
 		case RTE_FLOW_ACTION_TYPE_QUEUE:
 			rc = sfc_flow_parse_queue(sa, actions->conf, flow);
 			if (rc != 0) {
@@ -1512,11 +1521,10 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 		}
 	}
 
+	/* When fate is unknown, drop traffic. */
 	if (!is_specified) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ACTION_NUM, actions,
-				   "Action is unspecified");
-		return -rte_errno;
+		flow->spec.template.efs_dmaq_id =
+			EFX_FILTER_SPEC_RX_DMAQ_ID_DROP;
 	}
 
 	return 0;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 3b7a960b0..fe2f94010 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1140,6 +1140,7 @@ priv_flow_process(struct pmd_internals *pmd,
 		else
 			goto end;
 	}
+actions:
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		int err = 0;
 
@@ -1222,6 +1223,16 @@ priv_flow_process(struct pmd_internals *pmd,
 		if (err)
 			goto exit_action_not_supported;
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!action) {
+		static const struct rte_flow_action drop[] = {
+			{ .type = RTE_FLOW_ACTION_TYPE_DROP, },
+			{ .type = RTE_FLOW_ACTION_TYPE_END, },
+		};
+
+		actions = drop;
+		goto actions;
+	}
 end:
 	if (flow)
 		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 6ace24ff4..96184f030 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -859,32 +859,28 @@ struct rte_flow_item {
  *
  * Each possible action is represented by a type. Some have associated
  * configuration structures. Several actions combined in a list can be
- * affected to a flow rule. That list is not ordered.
+ * assigned to a flow rule and are performed in order.
  *
  * They fall in three categories:
  *
- * - Terminating actions that prevent processing matched packets by
- *   subsequent flow rules, unless overridden with PASSTHRU.
+ * - Actions that modify the fate of matching traffic, for instance by
+ *   dropping or assigning it a specific destination.
  *
- * - Non terminating actions that leave matched packets up for additional
- *   processing by subsequent flow rules.
+ * - Actions that modify matching traffic contents or its properties. This
+ *   includes adding/removing encapsulation, encryption, compression and
+ *   marks.
  *
- * - Other non terminating meta actions that do not affect the fate of
- *   packets.
+ * - Actions related to the flow rule itself, such as updating counters or
+ *   making it non-terminating.
  *
- * When several actions are combined in a flow rule, they should all have
- * different types (e.g. dropping a packet twice is not possible).
+ * Flow rules being terminating by default, not specifying any action of the
+ * fate kind results in undefined behavior. This applies to both ingress and
+ * egress.
  *
- * Only the last action of a given type is taken into account. PMDs still
- * perform error checking on the entire list.
- *
- * Note that PASSTHRU is the only action able to override a terminating
- * rule.
+ * PASSTHRU, when supported, makes a flow rule non-terminating.
  */
 enum rte_flow_action_type {
 	/**
-	 * [META]
-	 *
 	 * End marker for action lists. Prevents further processing of
 	 * actions, thereby ending the list.
 	 *
@@ -893,8 +889,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_END,
 
 	/**
-	 * [META]
-	 *
 	 * Used as a placeholder for convenience. It is ignored and simply
 	 * discarded by PMDs.
 	 *
@@ -903,18 +897,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VOID,
 
 	/**
-	 * Leaves packets up for additional processing by subsequent flow
-	 * rules. This is the default when a rule does not contain a
-	 * terminating action, but can be specified to force a rule to
-	 * become non-terminating.
+	 * Leaves traffic up for additional processing by subsequent flow
+	 * rules; makes a flow rule non-terminating.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PASSTHRU,
 
 	/**
-	 * [META]
-	 *
 	 * Attaches an integer value to packets and sets PKT_RX_FDIR and
 	 * PKT_RX_FDIR_ID mbuf flags.
 	 *
@@ -923,8 +913,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_MARK,
 
 	/**
-	 * [META]
-	 *
 	 * Flags packets. Similar to MARK without a specific value; only
 	 * sets the PKT_RX_FDIR mbuf flag.
 	 *
@@ -949,9 +937,7 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_DROP,
 
 	/**
-	 * [META]
-	 *
-	 * Enables counters for this rule.
+	 * Enables counters for this flow rule.
 	 *
 	 * These counters can be retrieved and reset through rte_flow_query(),
 	 * see struct rte_flow_query_count.
@@ -1020,8 +1006,6 @@ struct rte_flow_action_mark {
  * RTE_FLOW_ACTION_TYPE_QUEUE
  *
  * Assign packets to a given queue index.
- *
- * Terminating by default.
  */
 struct rte_flow_action_queue {
 	uint16_t index; /**< Queue index to use. */
@@ -1050,8 +1034,6 @@ struct rte_flow_query_count {
  * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps
  * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only,
  * both can be requested simultaneously.
- *
- * Terminating by default.
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
@@ -1069,8 +1051,6 @@ struct rte_flow_action_rss {
  * and is not guaranteed to work properly if the VF part is matched by a
  * prior flow rule or if packets are not addressed to a VF in the first
  * place.
- *
- * Terminating by default.
  */
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
@@ -1085,8 +1065,6 @@ struct rte_flow_action_vf {
  *
  * Packets matched by items of this type can be either dropped or passed to the
  * next item with their color set by the MTR object.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_meter {
 	uint32_t mtr_id; /**< MTR object ID created with rte_mtr_create(). */
@@ -1116,8 +1094,6 @@ struct rte_flow_action_meter {
  * direction.
  *
  * Multiple flows can be configured to use the same security session.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_security {
 	void *security_session; /**< Pointer to security session structure. */
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v4 04/16] ethdev: remove DUP action from flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API Adrien Mazarguil
@ 2018-04-16 16:22  2%       ` Adrien Mazarguil
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
  2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
  3 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

Upcoming changes to the handling of action lists will make the DUP action
redundant, as specifying several QUEUE actions will achieve the same
behavior. Besides, no PMD implements this action.
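
For instance, once the ordered action lists introduced later in this series
are in place, a rule that previously needed QUEUE plus DUP can simply
specify QUEUE twice (a rough sketch; queue indices and the former
combination are illustrative only):

  struct rte_flow_action_queue q6 = { .index = 6 };
  struct rte_flow_action_queue q7 = { .index = 7 };
  const struct rte_flow_action actions[] = {
          /* Stands in for the former "queue index 6" + "dup index 7". */
          { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q6 },
          { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q7 },
          { .type = RTE_FLOW_ACTION_TYPE_END },
  };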

By removing an entry from enum rte_flow_action_type, this patch breaks ABI
compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/cmdline_flow.c                 | 23 -----------------------
 app/test-pmd/config.c                       |  1 -
 doc/guides/prog_guide/rte_flow.rst          | 23 -----------------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  8 --------
 lib/librte_ether/rte_ethdev_version.map     |  2 +-
 lib/librte_ether/rte_flow.c                 |  1 -
 lib/librte_ether/rte_flow.h                 | 24 ------------------------
 7 files changed, 1 insertion(+), 81 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index f0b4b7bc4..2ddb08feb 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -164,8 +164,6 @@ enum index {
 	ACTION_QUEUE_INDEX,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
-	ACTION_DUP_INDEX,
 	ACTION_RSS,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
@@ -625,7 +623,6 @@ static const enum index next_action[] = {
 	ACTION_QUEUE,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
@@ -645,12 +642,6 @@ static const enum index action_queue[] = {
 	ZERO,
 };
 
-static const enum index action_dup[] = {
-	ACTION_DUP_INDEX,
-	ACTION_NEXT,
-	ZERO,
-};
-
 static const enum index action_rss[] = {
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
@@ -1597,20 +1588,6 @@ static const struct token token_list[] = {
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
-	[ACTION_DUP] = {
-		.name = "dup",
-		.help = "duplicate packets to a given queue index",
-		.priv = PRIV_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
-		.next = NEXT(action_dup),
-		.call = parse_vc,
-	},
-	[ACTION_DUP_INDEX] = {
-		.name = "index",
-		.help = "queue index to duplicate packets to",
-		.next = NEXT(action_dup, NEXT_ENTRY(UNSIGNED)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_dup, index)),
-		.call = parse_vc_conf,
-	},
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index a7645adb8..d0d372797 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1065,7 +1065,6 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 51826d04c..a237e4fd2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1299,26 +1299,6 @@ Query structure to retrieve and reset flow rule counters:
    | ``bytes``     | out | number of bytes through this rule |
    +---------------+-----+-----------------------------------+
 
-Action: ``DUP``
-^^^^^^^^^^^^^^^
-
-Duplicates packets to a given queue index.
-
-This is normally combined with QUEUE, however when used alone, it is
-actually similar to QUEUE + PASSTHRU.
-
-- Non-terminating by default.
-
-.. _table_rte_flow_action_dup:
-
-.. table:: DUP
-
-   +-----------+------------------------------------+
-   | Field     | Value                              |
-   +===========+====================================+
-   | ``index`` | queue index to duplicate packet to |
-   +-----------+------------------------------------+
-
 Action: ``RSS``
 ^^^^^^^^^^^^^^^
 
@@ -2010,9 +1990,6 @@ Unsupported actions
   and tagging (`Action: MARK`_ or `Action: FLAG`_) may be implemented in
   software as long as the target queue is used by a single rule.
 
-- A rule specifying both `Action: DUP`_ + `Action: QUEUE`_ may be translated
-  to two hidden rules combining `Action: QUEUE`_ and `Action: PASSTHRU`_.
-
 - When a single target queue is provided, `Action: RSS`_ can also be
   implemented through `Action: QUEUE`_.
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index cb6f201e1..a015d02a4 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3363,10 +3363,6 @@ actions can sometimes be combined when the end result is unambiguous::
 
 ::
 
-   drop / dup index 6 / end # same as above
-
-::
-
    queue index 6 / rss queues 6 7 8 / end # queue has no effect
 
 ::
@@ -3400,10 +3396,6 @@ This section lists supported actions and their attributes, if any.
 
 - ``count``: enable counters for this rule.
 
-- ``dup``: duplicate packets to a given queue index.
-
-  - ``index {unsigned}``: queue index to duplicate packets to.
-
 - ``rss``: spread packets among several queues.
 
   - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index e915e7929..8f1ae5ed2 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -147,7 +147,6 @@ DPDK_17.08 {
 
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
-	rte_flow_copy;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -199,6 +198,7 @@ DPDK_18.02 {
 DPDK_18.05 {
 	global:
 
+	rte_flow_copy;
 	rte_flow_create;
 	rte_flow_destroy;
 	rte_flow_error_set;
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index ada280810..80f9cb6cb 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -73,7 +73,6 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index d28a2a473..6ace24ff4 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -961,16 +961,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_COUNT,
 
 	/**
-	 * Duplicates packets to a given queue index.
-	 *
-	 * This is normally combined with QUEUE, however when used alone, it
-	 * is actually similar to QUEUE + PASSTHRU.
-	 *
-	 * See struct rte_flow_action_dup.
-	 */
-	RTE_FLOW_ACTION_TYPE_DUP,
-
-	/**
 	 * Similar to QUEUE, except RSS is additionally performed on packets
 	 * to spread them among several queues according to the provided
 	 * parameters.
@@ -1052,20 +1042,6 @@ struct rte_flow_query_count {
 };
 
 /**
- * RTE_FLOW_ACTION_TYPE_DUP
- *
- * Duplicates packets to a given queue index.
- *
- * This is normally combined with QUEUE, however when used alone, it is
- * actually similar to QUEUE + PASSTHRU.
- *
- * Non-terminating by default.
- */
-struct rte_flow_action_dup {
-	uint16_t index; /**< Queue index to duplicate packets to. */
-};
-
-/**
  * RTE_FLOW_ACTION_TYPE_RSS
  *
  * Similar to QUEUE, except RSS is additionally performed on packets to
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
@ 2018-04-16 16:22  3%       ` Adrien Mazarguil
  2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 04/16] ethdev: remove DUP action from " Adrien Mazarguil
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

These enable more precise reporting of objects responsible for errors.

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_destroy()
- rte_flow_error_set()
- rte_flow_flush()
- rte_flow_isolate()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/config.c                   |  4 ++++
 lib/librte_ether/rte_ethdev_version.map | 20 +++++++++++++-------
 lib/librte_ether/rte_flow.h             |  4 ++++
 3 files changed, 21 insertions(+), 7 deletions(-)
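
For illustration, a minimal sketch of how a PMD validation path could use the finer-grained error types added below; the helper and the specific check are hypothetical, only rte_flow_error_set() and the new enum values come from this patch:

#include <errno.h>
#include <rte_flow.h>

/* Hypothetical check inside a PMD's flow validate callback. */
static int
example_validate_rss(const struct rte_flow_action *action,
		     struct rte_flow_error *error)
{
	const struct rte_flow_action_rss *rss = action->conf;

	if (rss == NULL)
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
					  NULL,
					  "RSS action requires a configuration");
	return 0;
}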

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 5daa93bb3..a7645adb8 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1244,8 +1244,12 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
+		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
+		[RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
+		[RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
 		[RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
 		[RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
+		[RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
 		[RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
 	};
 	const char *errstr;
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index 34df6c8b5..e915e7929 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -127,11 +127,6 @@ DPDK_17.02 {
 
 	_rte_eth_dev_reset;
 	rte_eth_dev_fw_version_get;
-	rte_flow_create;
-	rte_flow_destroy;
-	rte_flow_flush;
-	rte_flow_query;
-	rte_flow_validate;
 
 } DPDK_16.07;
 
@@ -153,7 +148,6 @@ DPDK_17.08 {
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
 	rte_flow_copy;
-	rte_flow_isolate;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -192,7 +186,6 @@ DPDK_17.11 {
 	rte_eth_dev_get_sec_ctx;
 	rte_eth_dev_pool_ops_supported;
 	rte_eth_dev_reset;
-	rte_flow_error_set;
 
 } DPDK_17.08;
 
@@ -203,6 +196,19 @@ DPDK_18.02 {
 
 } DPDK_17.11;
 
+DPDK_18.05 {
+	global:
+
+	rte_flow_create;
+	rte_flow_destroy;
+	rte_flow_error_set;
+	rte_flow_flush;
+	rte_flow_isolate;
+	rte_flow_query;
+	rte_flow_validate;
+
+} DPDK_18.02;
+
 EXPERIMENTAL {
 	global:
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 44ae19d3b..26b95c772 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1186,8 +1186,12 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
+	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
+	RTE_FLOW_ERROR_TYPE_ITEM_LAST, /**< Item specification range. */
+	RTE_FLOW_ERROR_TYPE_ITEM_MASK, /**< Item specification mask. */
 	RTE_FLOW_ERROR_TYPE_ITEM, /**< Specific pattern item. */
 	RTE_FLOW_ERROR_TYPE_ACTION_NUM, /**< Number of actions. */
+	RTE_FLOW_ERROR_TYPE_ACTION_CONF, /**< Action configuration. */
 	RTE_FLOW_ERROR_TYPE_ACTION, /**< Specific action. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (12 preceding siblings ...)
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and " Adrien Mazarguil
@ 2018-04-16 16:22  4%     ` Adrien Mazarguil
  2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API Adrien Mazarguil
                         ` (3 more replies)
  13 siblings, 4 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:22 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

As summarized in a prior RFC [1], the flow API (rte_flow) was chosen as a
means to manage switch offloads supported by many devices (usually going by
names such as E-Switch or vSwitch) through user-specified flow rules.

Combined with the need to support encap/decap actions, this requires a
change in the way flow actions are processed (in order and possibly
repeated) which modifies the behavior of some of the existing actions, thus
warranting a major ABI breakage.
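
To make the ordering change concrete, here is a minimal sketch (not tied to any particular PMD) of a rule whose actions are performed in the order listed, counting matching packets before assigning them to a queue; the field and enum names used are the pre-existing flow API ones:

#include <rte_flow.h>

static struct rte_flow *
count_then_queue(uint16_t port_id, uint16_t queue_idx,
		 struct rte_flow_error *error)
{
	struct rte_flow_attr attr = { .ingress = 1 };
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action_queue queue = { .index = queue_idx };
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_COUNT },
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	/* Actions are applied in the listed order; QUEUE is the fate action. */
	return rte_flow_create(port_id, &attr, pattern, actions, error);
}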

Given this ABI breakage is also required by other work submitted for the
current release [2][3], this series addresses various longstanding issues
with the flow API and makes minor improvements in preparation for upcoming
features.

Changes summary:

- Additional error types.
- Clearer documentation.
- Improved C++ compatibility.
- Exhaustive RSS action.
- Consistent behavior of VLAN pattern item.
- New "transfer" attribute bringing consistency to VF/PF pattern items.
- Confusing "PORT" pattern item renamed "PHY_PORT", with new action
  counterpart.
- New "PORT_ID" pattern item and action to be used with port representors.

This series piggybacks on the major ABI update introduced by a prior
commit [4] for DPDK 18.05 and depends on several fixes [5] which must be
applied first.

[1] "[RFC] Switch device offload with DPDK"
    http://dpdk.org/ml/archives/dev/2018-March/092513.html

[2] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

[3] "[PATCH v1 00/21] MLX5 tunnel Rx offloading"
    http://dpdk.org/ml/archives/dev/2018-March/092264.html

[4] commit 653e038efc9b ("ethdev: remove versioning of filter control
    function")

[5] "[PATCH v4 00/11] Bunch of flow API-related fixes"
    http://dpdk.org/ml/archives/dev/2018-April/096509.html

v4 changes:

- No change besides new acked-by lines, rebased series to address conflicts.

v3 changes:

- Rebased series, fixed latest conflicts.
- Addressed Andrew's comments, see affected patches for details:
  - Empty RSS types in flow rule means PMD-specific RSS instead of no RSS.
  - RSS hash function now explicitly compared against
    RTE_ETH_HASH_FUNCTION_DEFAULT instead of 0 in all PMDs.
  - sfc PMD updated to also accept Toeplitz.
  - Implicit VLAN TPID matching now removed from all PMDs.
  - Default mask update for VLAN TCI now split as separate patch #11.
  - Ingress/egress definition clarified in patch #12.

v2 changes:

- Squashed "ethdev: update ABI for flow API functions" into subsequent
  patches.
- Emphasized ABI impact in relevant commit logs.
- Modified documentation in "ethdev: alter behavior of flow API actions" to
  describe how terminating flow rules without any action of the fate kind
  result in undefined behavior instead of dropping traffic.
- Fixed other minor documentation formatting issues.
- Modified "ethdev: refine TPID handling in flow API" as follows:
  - Using standard macro definitions for VLAN, QinQ and E-Tag EtherTypes.
  - Fixed endian conversion in sfc.
  - Replaced a condition in VLAN pattern item processing with an assertion
    check for i40e.

Adrien Mazarguil (16):
  ethdev: add error types to flow API
  ethdev: clarify flow API pattern items and actions
  doc: remove flow API migration section
  ethdev: remove DUP action from flow API
  ethdev: alter behavior of flow API actions
  ethdev: remove C99 flexible arrays from flow API
  ethdev: flatten RSS configuration in flow API
  ethdev: add hash function to RSS flow API action
  ethdev: add encap level to RSS flow API action
  ethdev: refine TPID handling in flow API
  ethdev: limit default VLAN TCI mask in flow API
  ethdev: add transfer attribute to flow API
  ethdev: update behavior of VF/PF in flow API
  ethdev: rename physical port item in flow API
  ethdev: add physical port action to flow API
  ethdev: add port ID item and action to flow API

 app/test-pmd/cmdline_flow.c                 | 394 +++++++++++----
 app/test-pmd/config.c                       |  78 +--
 doc/guides/nics/tap.rst                     |   2 +-
 doc/guides/prog_guide/rte_flow.rst          | 618 ++++++++---------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  60 ++-
 drivers/net/bnxt/bnxt_filter.c              |  49 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  83 ++-
 drivers/net/e1000/igb_rxtx.c                |  55 +-
 drivers/net/enic/enic_flow.c                |  50 +-
 drivers/net/i40e/i40e_ethdev.c              |  57 ++-
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                | 130 +++--
 drivers/net/ixgbe/ixgbe_ethdev.c            |   7 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  91 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  55 +-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                | 117 +++--
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 316 ++++++------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +-
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +-
 drivers/net/mvpp2/mrvl_flow.c               |  32 +-
 drivers/net/sfc/sfc_flow.c                  |  78 ++-
 drivers/net/tap/tap_flow.c                  |  49 +-
 examples/ipsec-secgw/ipsec.c                |  21 +-
 lib/librte_ether/rte_ethdev_version.map     |  22 +-
 lib/librte_ether/rte_flow.c                 |  68 +--
 lib/librte_ether/rte_flow.h                 | 339 ++++++++-----
 lib/librte_net/rte_ether.h                  |   1 +
 34 files changed, 1750 insertions(+), 1123 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v5 00/11] Bunch of flow API-related fixes
  2018-04-10 16:34  3%     ` [dpdk-dev] [PATCH v4 " Adrien Mazarguil
@ 2018-04-16 16:21  3%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-16 16:21 UTC (permalink / raw)
  To: dev

This series contains several fixes for rte_flow and its implementation in
PMDs and testpmd. Upcoming work on the flow API depends on it.

v5 changes:

- No change, rebased series to address conflicts.

v4 changes:

- Rebased again.
- The reliance on rte_eth_dev_rss_hash_conf_get() was removed from patch #7,
  see updated patch for details.

v3 changes:

- Rebased series.
- Dropped unnecessary "net/sfc: fix endian conversions in flow API".
- Dropped "ethdev: fix ABI version in meson build", handled by prior commit
  d9736a248785 ("ethdev: fix library version in meson build").

v2 changes:

- mlx5 fix (patch #3).
- bnxt fix (patch #4).
- sfc fix (patch #6).
- Missing include (patch #13).

Adrien Mazarguil (11):
  net/mlx4: fix RSS resource leak in case of error
  net/mlx4: fix ignored RSS hash types
  net/mlx5: fix RSS flow action bounds check
  net/bnxt: fix matching of flow API item masks
  app/testpmd: fix flow completion for RSS queues
  app/testpmd: fix lack of flow action configuration
  app/testpmd: fix RSS flow action configuration
  app/testpmd: fix missing RSS fields in flow action
  ethdev: fix shallow copy of flow API RSS action
  ethdev: fix missing boolean values in flow command
  ethdev: fix missing include in flow API

 app/test-pmd/cmdline.c                      |   2 +
 app/test-pmd/cmdline_flow.c                 | 252 ++++++++++++++++++++---
 app/test-pmd/config.c                       | 160 +++++++++-----
 app/test-pmd/testpmd.h                      |  13 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   8 +
 drivers/net/bnxt/bnxt_filter.c              |  14 +-
 drivers/net/mlx4/mlx4_flow.c                |  17 +-
 drivers/net/mlx5/mlx5_flow.c                |   9 +
 lib/librte_ether/rte_flow.c                 | 145 +++++++++----
 lib/librte_ether/rte_flow.h                 |   2 +
 10 files changed, 494 insertions(+), 128 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated
  2018-04-16 15:33  0%     ` Olivier Matz
@ 2018-04-16 15:41  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 15:41 UTC (permalink / raw)
  To: Olivier Matz; +Cc: dev, Anatoly Burakov

On 04/16/2018 06:33 PM, Olivier Matz wrote:
> On Mon, Apr 16, 2018 at 02:24:33PM +0100, Andrew Rybchenko wrote:
>> Size of memory chunk required to populate mempool objects depends
>> on how objects are stored in the memory. Different mempool drivers
>> may have different requirements and a new operation allows to
>> calculate memory size in accordance with driver requirements and
>> advertise requirements on minimum memory chunk size and alignment
>> in a generic way.
>>
>> Bump ABI version since the patch breaks it.
>>
>> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
>> Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
> [...]
>
>> @@ -643,39 +633,35 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>>   	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
>>   	 * memory, then we'll go and reserve space page-by-page.
>>   	 */
>> -	no_pageshift = no_contig || force_contig ||
>> -			rte_eal_iova_mode() == RTE_IOVA_VA;
>> +	no_pageshift = no_contig || rte_eal_iova_mode() == RTE_IOVA_VA;
>>   	try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages();
> In case there is a v5 for another reason, I think the last line is
> equivalent to:
>
>    try_contig = !no_pageshift && rte_eal_has_hugepages();

Agree. As I understand it, this is true before my patch as well.

> Otherwise:
> Acked-by: Olivier Matz <olivier.matz@6wind.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated
  2018-04-16 13:24  7%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
@ 2018-04-16 15:33  0%     ` Olivier Matz
  2018-04-16 15:41  0%       ` Andrew Rybchenko
  0 siblings, 1 reply; 200+ results
From: Olivier Matz @ 2018-04-16 15:33 UTC (permalink / raw)
  To: Andrew Rybchenko; +Cc: dev, Anatoly Burakov

On Mon, Apr 16, 2018 at 02:24:33PM +0100, Andrew Rybchenko wrote:
> Size of memory chunk required to populate mempool objects depends
> on how objects are stored in the memory. Different mempool drivers
> may have different requirements and a new operation allows to
> calculate memory size in accordance with driver requirements and
> advertise requirements on minimum memory chunk size and alignment
> in a generic way.
> 
> Bump ABI version since the patch breaks it.
> 
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>

[...]

> @@ -643,39 +633,35 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>  	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
>  	 * memory, then we'll go and reserve space page-by-page.
>  	 */
> -	no_pageshift = no_contig || force_contig ||
> -			rte_eal_iova_mode() == RTE_IOVA_VA;
> +	no_pageshift = no_contig || rte_eal_iova_mode() == RTE_IOVA_VA;
>  	try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages();

In case there is a v5 for another reason, I think the last line is
equivalent to:

  try_contig = !no_pageshift && rte_eal_has_hugepages();


Otherwise:
Acked-by: Olivier Matz <olivier.matz@6wind.com>

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v2 3/6] mempool: support block dequeue operation
  2018-04-16 13:33  3% ` [dpdk-dev] [PATCH v2 0/6] mempool: add bucket driver Andrew Rybchenko
@ 2018-04-16 13:33  4%   ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:33 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ, Artem V. Andreev

From: "Artem V. Andreev" <Artem.Andreev@oktetlabs.ru>

If the mempool manager supports object blocks (physically and virtually
contiguous sets of objects), it is sufficient to get only the first
object, and the function makes it possible to avoid filling in
information about each block member.

Signed-off-by: Artem V. Andreev <Artem.Andreev@oktetlabs.ru>
Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 doc/guides/rel_notes/deprecation.rst       |   7 --
 lib/librte_mempool/Makefile                |   1 +
 lib/librte_mempool/meson.build             |   2 +
 lib/librte_mempool/rte_mempool.c           |  39 ++++++++
 lib/librte_mempool/rte_mempool.h           | 151 ++++++++++++++++++++++++++++-
 lib/librte_mempool/rte_mempool_ops.c       |   1 +
 lib/librte_mempool/rte_mempool_version.map |   1 +
 7 files changed, 194 insertions(+), 8 deletions(-)
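
As a usage sketch of the new operation (assuming rte_mempool_ops_get_info() reports a missing callback with a negative value), an application would first check that the driver advertises a non-zero contiguous block size and only then dequeue blocks:

#include <errno.h>
#include <rte_mempool.h>

/* Sketch: dequeue up to n contiguous blocks if the driver supports it. */
static int
get_blocks_example(struct rte_mempool *mp, void **first_objs, unsigned int n)
{
	struct rte_mempool_info info;

	if (rte_mempool_ops_get_info(mp, &info) < 0 ||
	    info.contig_block_size == 0)
		return -EOPNOTSUPP; /* no block dequeue support */

	/* On success each first_objs[i] points at the first of
	 * info.contig_block_size contiguous objects.
	 */
	return rte_mempool_get_contig_blocks(mp, first_objs, n);
}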

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 6d9a0c8..f3284c5 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -47,13 +47,6 @@ Deprecation Notices
 
   - ``rte_eal_mbuf_default_mempool_ops``
 
-* mempool: several API and ABI changes are planned in v18.05.
-
-  The following changes are planned:
-
-  - addition of new op to allocate contiguous
-    block of objects if underlying driver supports it.
-
 * mbuf: The opaque ``mbuf->hash.sched`` field will be updated to support generic
   definition in line with the ethdev TM and MTR APIs. Currently, this field
   is defined in librte_sched in a non-generic way. The new generic format
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 7f19f00..e3c32b1 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -10,6 +10,7 @@ CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
 # Allow deprecated symbol to use deprecated rte_mempool_populate_iova_tab()
 # from earlier deprecated rte_mempool_populate_phys_tab()
 CFLAGS += -Wno-deprecated-declarations
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 LDLIBS += -lrte_eal -lrte_ring
 
 EXPORT_MAP := rte_mempool_version.map
diff --git a/lib/librte_mempool/meson.build b/lib/librte_mempool/meson.build
index baf2d24..d507e55 100644
--- a/lib/librte_mempool/meson.build
+++ b/lib/librte_mempool/meson.build
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
+allow_experimental_apis = true
+
 extra_flags = []
 
 # Allow deprecated symbol to use deprecated rte_mempool_populate_iova_tab()
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 84b3d64..cf5d124 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -1255,6 +1255,36 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
 #endif
 }
 
+void
+rte_mempool_contig_blocks_check_cookies(const struct rte_mempool *mp,
+	void * const *first_obj_table_const, unsigned int n, int free)
+{
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	struct rte_mempool_info info;
+	const size_t total_elt_sz =
+		mp->header_size + mp->elt_size + mp->trailer_size;
+	unsigned int i, j;
+
+	rte_mempool_ops_get_info(mp, &info);
+
+	for (i = 0; i < n; ++i) {
+		void *first_obj = first_obj_table_const[i];
+
+		for (j = 0; j < info.contig_block_size; ++j) {
+			void *obj;
+
+			obj = (void *)((uintptr_t)first_obj + j * total_elt_sz);
+			rte_mempool_check_cookies(mp, &obj, 1, free);
+		}
+	}
+#else
+	RTE_SET_USED(mp);
+	RTE_SET_USED(first_obj_table_const);
+	RTE_SET_USED(n);
+	RTE_SET_USED(free);
+#endif
+}
+
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
 static void
 mempool_obj_audit(struct rte_mempool *mp, __rte_unused void *opaque,
@@ -1320,6 +1350,7 @@ void
 rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 {
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	struct rte_mempool_info info;
 	struct rte_mempool_debug_stats sum;
 	unsigned lcore_id;
 #endif
@@ -1361,6 +1392,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 
 	/* sum and dump statistics */
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
 		sum.put_bulk += mp->stats[lcore_id].put_bulk;
@@ -1369,6 +1401,8 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 		sum.get_success_objs += mp->stats[lcore_id].get_success_objs;
 		sum.get_fail_bulk += mp->stats[lcore_id].get_fail_bulk;
 		sum.get_fail_objs += mp->stats[lcore_id].get_fail_objs;
+		sum.get_success_blks += mp->stats[lcore_id].get_success_blks;
+		sum.get_fail_blks += mp->stats[lcore_id].get_fail_blks;
 	}
 	fprintf(f, "  stats:\n");
 	fprintf(f, "    put_bulk=%"PRIu64"\n", sum.put_bulk);
@@ -1377,6 +1411,11 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 	fprintf(f, "    get_success_objs=%"PRIu64"\n", sum.get_success_objs);
 	fprintf(f, "    get_fail_bulk=%"PRIu64"\n", sum.get_fail_bulk);
 	fprintf(f, "    get_fail_objs=%"PRIu64"\n", sum.get_fail_objs);
+	if (info.contig_block_size > 0) {
+		fprintf(f, "    get_success_blks=%"PRIu64"\n",
+			sum.get_success_blks);
+		fprintf(f, "    get_fail_blks=%"PRIu64"\n", sum.get_fail_blks);
+	}
 #else
 	fprintf(f, "  no statistics available\n");
 #endif
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 1ac2f57..3cab3a0 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -70,6 +70,10 @@ struct rte_mempool_debug_stats {
 	uint64_t get_success_objs; /**< Objects successfully allocated. */
 	uint64_t get_fail_bulk;    /**< Failed allocation number. */
 	uint64_t get_fail_objs;    /**< Objects that failed to be allocated. */
+	/** Successful allocation number of contiguous blocks. */
+	uint64_t get_success_blks;
+	/** Failed allocation number of contiguous blocks. */
+	uint64_t get_fail_blks;
 } __rte_cache_aligned;
 #endif
 
@@ -195,7 +199,10 @@ struct rte_mempool_memhdr {
  *
  * Additional information about the mempool
  */
-struct rte_mempool_info;
+struct rte_mempool_info {
+	/** Number of objects in the contiguous block */
+	unsigned int contig_block_size;
+};
 
 /**
  * The RTE mempool structure.
@@ -273,8 +280,16 @@ struct rte_mempool {
 			mp->stats[__lcore_id].name##_bulk += 1;	\
 		}                                               \
 	} while(0)
+#define __MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, name, n) do {                    \
+		unsigned int __lcore_id = rte_lcore_id();       \
+		if (__lcore_id < RTE_MAX_LCORE) {               \
+			mp->stats[__lcore_id].name##_blks += n;	\
+			mp->stats[__lcore_id].name##_bulk += 1;	\
+		}                                               \
+	} while (0)
 #else
 #define __MEMPOOL_STAT_ADD(mp, name, n) do {} while(0)
+#define __MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, name, n) do {} while (0)
 #endif
 
 /**
@@ -342,6 +357,38 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
 #define __mempool_check_cookies(mp, obj_table_const, n, free) do {} while(0)
 #endif /* RTE_LIBRTE_MEMPOOL_DEBUG */
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @internal Check contiguous object blocks and update cookies or panic.
+ *
+ * @param mp
+ *   Pointer to the memory pool.
+ * @param first_obj_table_const
+ *   Pointer to a table of void * pointers (first object of the contiguous
+ *   object blocks).
+ * @param n
+ *   Number of contiguous object blocks.
+ * @param free
+ *   - 0: object is supposed to be allocated, mark it as free
+ *   - 1: object is supposed to be free, mark it as allocated
+ *   - 2: just check that cookie is valid (free or allocated)
+ */
+void rte_mempool_contig_blocks_check_cookies(const struct rte_mempool *mp,
+	void * const *first_obj_table_const, unsigned int n, int free);
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#define __mempool_contig_blocks_check_cookies(mp, first_obj_table_const, n, \
+					      free) \
+	rte_mempool_contig_blocks_check_cookies(mp, first_obj_table_const, n, \
+						free)
+#else
+#define __mempool_contig_blocks_check_cookies(mp, first_obj_table_const, n, \
+					      free) \
+	do {} while (0)
+#endif /* RTE_LIBRTE_MEMPOOL_DEBUG */
+
 #define RTE_MEMPOOL_OPS_NAMESIZE 32 /**< Max length of ops struct name. */
 
 /**
@@ -374,6 +421,15 @@ typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
 		void **obj_table, unsigned int n);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Dequeue a number of contiguous object blocks from the external pool.
+ */
+typedef int (*rte_mempool_dequeue_contig_blocks_t)(struct rte_mempool *mp,
+		 void **first_obj_table, unsigned int n);
+
+/**
  * Return the number of available objects in the external pool.
  */
 typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
@@ -539,6 +595,10 @@ struct rte_mempool_ops {
 	 * Get mempool info
 	 */
 	rte_mempool_get_info_t get_info;
+	/**
+	 * Dequeue a number of contiguous object blocks.
+	 */
+	rte_mempool_dequeue_contig_blocks_t dequeue_contig_blocks;
 } __rte_cache_aligned;
 
 #define RTE_MEMPOOL_MAX_OPS_IDX 16  /**< Max registered ops structs */
@@ -617,6 +677,30 @@ rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp,
 }
 
 /**
+ * @internal Wrapper for mempool_ops dequeue_contig_blocks callback.
+ *
+ * @param[in] mp
+ *   Pointer to the memory pool.
+ * @param[out] first_obj_table
+ *   Pointer to a table of void * pointers (first objects).
+ * @param[in] n
+ *   Number of blocks to get.
+ * @return
+ *   - 0: Success; got n objects.
+ *   - <0: Error; code of dequeue function.
+ */
+static inline int
+rte_mempool_ops_dequeue_contig_blocks(struct rte_mempool *mp,
+		void **first_obj_table, unsigned int n)
+{
+	struct rte_mempool_ops *ops;
+
+	ops = rte_mempool_get_ops(mp->ops_index);
+	RTE_ASSERT(ops->dequeue_contig_blocks != NULL);
+	return ops->dequeue_contig_blocks(mp, first_obj_table, n);
+}
+
+/**
  * @internal wrapper for mempool_ops enqueue callback.
  *
  * @param mp
@@ -1531,6 +1615,71 @@ rte_mempool_get(struct rte_mempool *mp, void **obj_p)
 }
 
 /**
+ * @internal Get contiguous blocks of objects from the pool. Used internally.
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param first_obj_table
+ *   A pointer to a pointer to the first object in each block.
+ * @param n
+ *   A number of blocks to get.
+ * @return
+ *   - >0: Success
+ *   - <0: Error
+ */
+static __rte_always_inline int
+__mempool_generic_get_contig_blocks(struct rte_mempool *mp,
+				    void **first_obj_table, unsigned int n)
+{
+	int ret;
+
+	ret = rte_mempool_ops_dequeue_contig_blocks(mp, first_obj_table, n);
+	if (ret < 0)
+		__MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, get_fail, n);
+	else
+		__MEMPOOL_CONTIG_BLOCKS_STAT_ADD(mp, get_success, n);
+
+	return ret;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get contiguous blocks of objects from the mempool.
+ *
+ * If cache is enabled, consider to flush it first, to reuse objects
+ * as soon as possible.
+ *
+ * The application should check that the driver supports the operation
+ * by calling rte_mempool_ops_get_info() and checking that `contig_block_size`
+ * is not zero.
+ *
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @param first_obj_table
+ *   A pointer to a pointer to the first object in each block.
+ * @param n
+ *   The number of blocks to get from mempool.
+ * @return
+ *   - 0: Success; blocks taken.
+ *   - -ENOBUFS: Not enough entries in the mempool; no object is retrieved.
+ *   - -EOPNOTSUPP: The mempool driver does not support block dequeue
+ */
+static __rte_always_inline int
+__rte_experimental
+rte_mempool_get_contig_blocks(struct rte_mempool *mp,
+			      void **first_obj_table, unsigned int n)
+{
+	int ret;
+
+	ret = __mempool_generic_get_contig_blocks(mp, first_obj_table, n);
+	if (ret == 0)
+		__mempool_contig_blocks_check_cookies(mp, first_obj_table, n,
+						      1);
+	return ret;
+}
+
+/**
  * Return the number of entries in the mempool.
  *
  * When cache is enabled, this function has to browse the length of
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index efc1c08..a27e1fa 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -60,6 +60,7 @@ rte_mempool_register_ops(const struct rte_mempool_ops *h)
 	ops->calc_mem_size = h->calc_mem_size;
 	ops->populate = h->populate;
 	ops->get_info = h->get_info;
+	ops->dequeue_contig_blocks = h->dequeue_contig_blocks;
 
 	rte_spinlock_unlock(&rte_mempool_ops_table.sl);
 
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index c9d16ec..1c406b5 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -53,6 +53,7 @@ DPDK_17.11 {
 DPDK_18.05 {
 	global:
 
+	rte_mempool_contig_blocks_check_cookies;
 	rte_mempool_op_calc_mem_size_default;
 	rte_mempool_op_populate_default;
 
-- 
2.7.4

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v2 0/6] mempool: add bucket driver
    2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
@ 2018-04-16 13:33  3% ` Andrew Rybchenko
  2018-04-16 13:33  4%   ` [dpdk-dev] [PATCH v2 3/6] mempool: support block dequeue operation Andrew Rybchenko
  1 sibling, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:33 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ

The initial patch series [1] (RFCv1 is [2]) is split into two to simplify
processing. This is the second part, which relies on the first one [3].

It should be applied on top of [3].

The patch series adds a bucket mempool driver which allows allocating
(both physically and virtually) contiguous blocks of objects and adds
a mempool API to do it. The driver is still capable of providing separate
objects, but it is definitely more heavyweight than the ring/stack drivers.
The driver will be used by the future Solarflare driver enhancements
which allow to utilize physical contiguous blocks in the NIC firmware.

The target use case is dequeuing in blocks and enqueuing separate objects
back (which are collected into buckets before being dequeued again). So,
the memory pool with the bucket driver is created by an application and
provided to a networking PMD receive queue. The choice of the bucket driver
is done using rte_eth_dev_pool_ops_supported(). A PMD that relies upon
contiguous block allocation should report the bucket driver as the only
supported and preferred one.
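
A minimal sketch of that choice from the application side, assuming the driver registers under the ops name "bucket" and using rte_pktmbuf_pool_create_by_ops() to create the Rx pool (falling back to the usual ring-based ops otherwise):

#include <rte_ethdev.h>
#include <rte_mbuf.h>

static struct rte_mempool *
rx_pool_create_example(uint16_t port_id, unsigned int nb_mbufs,
		       uint16_t data_room, int socket_id)
{
	const char *ops = "ring_mp_mc";

	/* 1 means "best/preferred choice", 0 means merely supported. */
	if (rte_eth_dev_pool_ops_supported(port_id, "bucket") == 1)
		ops = "bucket";

	return rte_pktmbuf_pool_create_by_ops("rx_pool", nb_mbufs, 256, 0,
					      data_room, socket_id, ops);
}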

Introduction of the contiguous block dequeue operation is proven by
performance measurements using autotest with minor enhancements:
 - in the original test, bulk sizes are powers of two, which is unacceptable
   for us, so they are changed to multiples of contig_block_size;
 - the test code is duplicated to support plain dequeue and
   dequeue_contig_blocks;
 - all the extra test variations (with/without cache etc) are eliminated;
 - a fake read from the dequeued buffer is added (in both cases) to
   simulate mbufs access.

start performance test for bucket (without cache)
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Srate_persec=   111935488
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Srate_persec=   115290931
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Srate_persec=   353055539
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Srate_persec=   353330790
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Srate_persec=   224657407
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Srate_persec=   230411468
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Srate_persec=   706700902
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Srate_persec=   703673139
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Srate_persec=   425236887
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Srate_persec=   437295512
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Srate_persec=  1343409356
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Srate_persec=  1336567397
start performance test for bucket (without cache + contiguous dequeue)
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Crate_persec=   122945536
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Crate_persec=   126458265
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Crate_persec=   374262988
mempool_autotest cache=   0 cores= 1 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Crate_persec=   377316966
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Crate_persec=   244842496
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Crate_persec=   251618917
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Crate_persec=   751226060
mempool_autotest cache=   0 cores= 2 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Crate_persec=   756233010
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  30 Crate_persec=   462068120
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=   1 n_keep=  60 Crate_persec=   476997221
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  30 Crate_persec=  1432171313
mempool_autotest cache=   0 cores= 4 n_get_bulk=  15 n_put_bulk=  15 n_keep=  60 Crate_persec=  1438829771

The number of objects in a contiguous block is a function of the bucket
memory size (.config option) and the total element size. In the future,
an additional API that allows passing parameters at mempool allocation
time may be added.

It breaks the ABI since it changes rte_mempool_ops. The ABI version is
already bumped in [4].


[1] https://dpdk.org/ml/archives/dev/2018-January/088698.html
[2] https://dpdk.org/ml/archives/dev/2017-November/082335.html
[3] https://dpdk.org/ml/archives/dev/2018-April/097354.html
[4] https://dpdk.org/ml/archives/dev/2018-April/097352.html

v1 -> v2:
  - just rebase

RFCv2 -> v1:
  - rebased on top of [3]
  - cleanup deprecation notice when it is done
  - mark a new API experimental
  - move contig blocks dequeue debug checks/processing to the library function
  - add contig blocks get stats
  - add release notes

RFCv1 -> RFCv2:
  - change the info API to retrieve from the driver the information the
    API user needs to know the contiguous block size
  - use SPDX tags
  - avoid all objects affinity to single lcore
  - fix bucket get_count
  - fix NO_CACHE_ALIGN case in bucket mempool



Andrew Rybchenko (1):
  doc: advertise bucket mempool driver

Artem V. Andreev (5):
  mempool/bucket: implement bucket mempool manager
  mempool: implement abstract mempool info API
  mempool: support block dequeue operation
  mempool/bucket: implement block dequeue operation
  mempool/bucket: do not allow one lcore to grab all buckets

 MAINTAINERS                                        |   9 +
 config/common_base                                 |   2 +
 doc/guides/rel_notes/deprecation.rst               |   7 -
 doc/guides/rel_notes/release_18_05.rst             |   9 +
 drivers/mempool/Makefile                           |   1 +
 drivers/mempool/bucket/Makefile                    |  27 +
 drivers/mempool/bucket/meson.build                 |   9 +
 drivers/mempool/bucket/rte_mempool_bucket.c        | 627 +++++++++++++++++++++
 .../mempool/bucket/rte_mempool_bucket_version.map  |   4 +
 lib/librte_mempool/Makefile                        |   1 +
 lib/librte_mempool/meson.build                     |   2 +
 lib/librte_mempool/rte_mempool.c                   |  39 ++
 lib/librte_mempool/rte_mempool.h                   | 190 +++++++
 lib/librte_mempool/rte_mempool_ops.c               |  16 +
 lib/librte_mempool/rte_mempool_version.map         |   8 +
 mk/rte.app.mk                                      |   1 +
 16 files changed, 945 insertions(+), 7 deletions(-)
 create mode 100644 drivers/mempool/bucket/Makefile
 create mode 100644 drivers/mempool/bucket/meson.build
 create mode 100644 drivers/mempool/bucket/rte_mempool_bucket.c
 create mode 100644 drivers/mempool/bucket/rte_mempool_bucket_version.map

-- 
2.7.4

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 07/11] mempool: deprecate xmem functions
  2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
                     ` (2 preceding siblings ...)
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 06/11] mempool: remove callback to get capabilities Andrew Rybchenko
@ 2018-04-16 13:24  4%   ` Andrew Rybchenko
  2018-04-16 13:24  8%   ` [dpdk-dev] [PATCH v4 10/11] mempool: remove callback to register memory area Andrew Rybchenko
  4 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ, Thomas Monjalon

Move the rte_mempool_xmem_size() code to an internal helper function
since it is required in two places: the deprecated rte_mempool_xmem_size()
and the non-deprecated rte_mempool_op_calc_mem_size_default().

Suggested-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
v2 -> v3:
 - none

v1 -> v2:
 - deprecate rte_mempool_populate_iova_tab()
 - add -Wno-deprecated-declarations to fix build errors because of
   rte_mempool_populate_iova_tab() deprecation
 - add @deprecated to deprecated functions description

RFCv2 -> v1:
 - advertise deprecation in release notes
 - factor out default memory size calculation into non-deprecated
   internal function to avoid usage of deprecated function internally
 - remove test for deprecated functions to address build issue because
   of usage of deprecated functions (it is easy to allow usage of
   deprecated function in Makefile, but very complicated in meson)

 doc/guides/rel_notes/deprecation.rst         |  7 -------
 doc/guides/rel_notes/release_18_05.rst       | 11 ++++++++++
 lib/librte_mempool/Makefile                  |  3 +++
 lib/librte_mempool/meson.build               | 12 +++++++++++
 lib/librte_mempool/rte_mempool.c             | 19 ++++++++++++++---
 lib/librte_mempool/rte_mempool.h             | 30 +++++++++++++++++++++++++++
 lib/librte_mempool/rte_mempool_ops_default.c |  4 ++--
 test/test/test_mempool.c                     | 31 ----------------------------
 8 files changed, 74 insertions(+), 43 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 99a0b01..8d1b362 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -48,13 +48,6 @@ Deprecation Notices
   - ``rte_eal_mbuf_default_mempool_ops``
 
 * mempool: several API and ABI changes are planned in v18.05.
-  The following functions, introduced for Xen, which is not supported
-  anymore since v17.11, are hard to use, not used anywhere else in DPDK.
-  Therefore they will be deprecated in v18.05 and removed in v18.08:
-
-  - ``rte_mempool_xmem_create``
-  - ``rte_mempool_xmem_size``
-  - ``rte_mempool_xmem_usage``
 
   The following changes are planned:
 
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index f481eea..3869d04 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -181,6 +181,17 @@ API Changes
   Now the new driver callbacks ``calc_mem_size`` and ``populate`` may be
   used to achieve it without specific knowledge in the generic code.
 
+* **Deprecated mempool xmem functions.**
+
+  The following functions, introduced for Xen, which is not supported
+  anymore since v17.11, are hard to use, not used anywhere else in DPDK.
+  Therefore they were deprecated in v18.05 and will be removed in v18.08:
+
+  - ``rte_mempool_xmem_create``
+  - ``rte_mempool_xmem_size``
+  - ``rte_mempool_xmem_usage``
+  - ``rte_mempool_populate_iova_tab``
+
 
 ABI Changes
 -----------
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 421e2a7..7f19f00 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -7,6 +7,9 @@ include $(RTE_SDK)/mk/rte.vars.mk
 LIB = librte_mempool.a
 
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
+# Allow deprecated symbol to use deprecated rte_mempool_populate_iova_tab()
+# from earlier deprecated rte_mempool_populate_phys_tab()
+CFLAGS += -Wno-deprecated-declarations
 LDLIBS += -lrte_eal -lrte_ring
 
 EXPORT_MAP := rte_mempool_version.map
diff --git a/lib/librte_mempool/meson.build b/lib/librte_mempool/meson.build
index 6181ad8..baf2d24 100644
--- a/lib/librte_mempool/meson.build
+++ b/lib/librte_mempool/meson.build
@@ -1,6 +1,18 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
+extra_flags = []
+
+# Allow deprecated symbol to use deprecated rte_mempool_populate_iova_tab()
+# from earlier deprecated rte_mempool_populate_phys_tab()
+extra_flags += '-Wno-deprecated-declarations'
+
+foreach flag: extra_flags
+	if cc.has_argument(flag)
+		cflags += flag
+	endif
+endforeach
+
 version = 4
 sources = files('rte_mempool.c', 'rte_mempool_ops.c',
 		'rte_mempool_ops_default.c')
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 5c75c16..c63c363 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -227,11 +227,13 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 
 
 /*
- * Calculate maximum amount of memory required to store given number of objects.
+ * Internal function to calculate required memory chunk size shared
+ * by default implementation of the corresponding callback and
+ * deprecated external function.
  */
 size_t
-rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift,
-		      __rte_unused unsigned int flags)
+rte_mempool_calc_mem_size_helper(uint32_t elt_num, size_t total_elt_sz,
+				 uint32_t pg_shift)
 {
 	size_t obj_per_page, pg_num, pg_sz;
 
@@ -251,6 +253,17 @@ rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift,
 }
 
 /*
+ * Calculate maximum amount of memory required to store given number of objects.
+ */
+size_t
+rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift,
+		      __rte_unused unsigned int flags)
+{
+	return rte_mempool_calc_mem_size_helper(elt_num, total_elt_sz,
+						pg_shift);
+}
+
+/*
  * Calculate how much memory would be actually required with the
  * given memory footprint to store required number of elements.
  */
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 0b83d5e..9107f5a 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -427,6 +427,28 @@ ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 		size_t *min_chunk_size, size_t *align);
 
 /**
+ * @internal Helper function to calculate memory size required to store
+ * specified number of objects in assumption that the memory buffer will
+ * be aligned at page boundary.
+ *
+ * Note that if object size is bigger than page size, then it assumes
+ * that pages are grouped in subsets of physically continuous pages big
+ * enough to store at least one object.
+ *
+ * @param elt_num
+ *   Number of elements.
+ * @param total_elt_sz
+ *   The size of each element, including header and trailer, as returned
+ *   by rte_mempool_calc_obj_size().
+ * @param pg_shift
+ *   LOG2 of the physical pages size. If set to 0, ignore page boundaries.
+ * @return
+ *   Required memory size aligned at page boundary.
+ */
+size_t rte_mempool_calc_mem_size_helper(uint32_t elt_num, size_t total_elt_sz,
+		uint32_t pg_shift);
+
+/**
  * Function to be called for each populated object.
  *
  * @param[in] mp
@@ -855,6 +877,7 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
 		   int socket_id, unsigned flags);
 
 /**
+ * @deprecated
  * Create a new mempool named *name* in memory.
  *
  * The pool contains n elements of elt_size. Its size is set to n.
@@ -912,6 +935,7 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
  *   The pointer to the new allocated mempool, on success. NULL on error
  *   with rte_errno set appropriately. See rte_mempool_create() for details.
  */
+__rte_deprecated
 struct rte_mempool *
 rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
 		unsigned cache_size, unsigned private_data_size,
@@ -1008,6 +1032,7 @@ int rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr,
 	void *opaque);
 
 /**
+ * @deprecated
  * Add physical memory for objects in the pool at init
  *
  * Add a virtually contiguous memory chunk in the pool where objects can
@@ -1033,6 +1058,7 @@ int rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr,
  *   On error, the chunks are not added in the memory list of the
  *   mempool and a negative errno is returned.
  */
+__rte_deprecated
 int rte_mempool_populate_iova_tab(struct rte_mempool *mp, char *vaddr,
 	const rte_iova_t iova[], uint32_t pg_num, uint32_t pg_shift,
 	rte_mempool_memchunk_free_cb_t *free_cb, void *opaque);
@@ -1652,6 +1678,7 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 	struct rte_mempool_objsz *sz);
 
 /**
+ * @deprecated
  * Get the size of memory required to store mempool elements.
  *
  * Calculate the maximum amount of memory required to store given number
@@ -1674,10 +1701,12 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
  * @return
  *   Required memory size aligned at page boundary.
  */
+__rte_deprecated
 size_t rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz,
 	uint32_t pg_shift, unsigned int flags);
 
 /**
+ * @deprecated
  * Get the size of memory required to store mempool elements.
  *
  * Calculate how much memory would be actually required with the given
@@ -1705,6 +1734,7 @@ size_t rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz,
  *   buffer is too small, return a negative value whose absolute value
  *   is the actual number of elements that can be stored in that buffer.
  */
+__rte_deprecated
 ssize_t rte_mempool_xmem_usage(void *vaddr, uint32_t elt_num,
 	size_t total_elt_sz, const rte_iova_t iova[], uint32_t pg_num,
 	uint32_t pg_shift, unsigned int flags);
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 3defc15..fd63ca1 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -16,8 +16,8 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
-	mem_size = rte_mempool_xmem_size(obj_num, total_elt_sz, pg_shift,
-					 mp->flags);
+	mem_size = rte_mempool_calc_mem_size_helper(obj_num, total_elt_sz,
+						    pg_shift);
 
 	*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
 
diff --git a/test/test/test_mempool.c b/test/test/test_mempool.c
index 63f921e..8d29af2 100644
--- a/test/test/test_mempool.c
+++ b/test/test/test_mempool.c
@@ -444,34 +444,6 @@ test_mempool_same_name_twice_creation(void)
 	return 0;
 }
 
-/*
- * Basic test for mempool_xmem functions.
- */
-static int
-test_mempool_xmem_misc(void)
-{
-	uint32_t elt_num, total_size;
-	size_t sz;
-	ssize_t usz;
-
-	elt_num = MAX_KEEP;
-	total_size = rte_mempool_calc_obj_size(MEMPOOL_ELT_SIZE, 0, NULL);
-	sz = rte_mempool_xmem_size(elt_num, total_size, MEMPOOL_PG_SHIFT_MAX,
-					0);
-
-	usz = rte_mempool_xmem_usage(NULL, elt_num, total_size, 0, 1,
-		MEMPOOL_PG_SHIFT_MAX, 0);
-
-	if (sz != (size_t)usz)  {
-		printf("failure @ %s: rte_mempool_xmem_usage(%u, %u) "
-			"returns: %#zx, while expected: %#zx;\n",
-			__func__, elt_num, total_size, sz, (size_t)usz);
-		return -1;
-	}
-
-	return 0;
-}
-
 static void
 walk_cb(struct rte_mempool *mp, void *userdata __rte_unused)
 {
@@ -596,9 +568,6 @@ test_mempool(void)
 	if (test_mempool_same_name_twice_creation() < 0)
 		goto err;
 
-	if (test_mempool_xmem_misc() < 0)
-		goto err;
-
 	/* test the stack handler */
 	if (test_mempool_basic(mp_stack, 1) < 0)
 		goto err;
-- 
2.7.4

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v4 10/11] mempool: remove callback to register memory area
  2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
                     ` (3 preceding siblings ...)
  2018-04-16 13:24  4%   ` [dpdk-dev] [PATCH v4 07/11] mempool: deprecate xmem functions Andrew Rybchenko
@ 2018-04-16 13:24  8%   ` Andrew Rybchenko
  4 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ

The callback is not required any more since there is a new callback
to populate objects using a provided memory area, which supplies
the same information.

Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Santosh Shukla <Santosh.Shukla@caviumnetworks.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
v3 -> v4:
 - none

v2 -> v3:
 - none

v1 -> v2:
 - none

RFCv2 -> v1:
 - advertise ABI changes in release notes

 doc/guides/rel_notes/deprecation.rst       |  1 -
 doc/guides/rel_notes/release_18_05.rst     |  2 ++
 lib/librte_mempool/rte_mempool.c           |  5 -----
 lib/librte_mempool/rte_mempool.h           | 31 ------------------------------
 lib/librte_mempool/rte_mempool_ops.c       | 14 --------------
 lib/librte_mempool/rte_mempool_version.map |  1 -
 6 files changed, 2 insertions(+), 52 deletions(-)
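
For drivers that used register_memory_area only to learn the chunk's address and length, a hedged sketch of the replacement follows; the populate prototype is assumed from the earlier patch in this series adding the populate op, and rte_mempool_op_populate_default() is the default helper it can defer to:

#include <rte_mempool.h>

/* Hypothetical driver callback replacing register_memory_area. */
static int
example_populate(struct rte_mempool *mp, unsigned int max_objs,
		 void *vaddr, rte_iova_t iova, size_t len,
		 rte_mempool_populate_obj_cb_t obj_cb, void *obj_cb_arg)
{
	/*
	 * The memory area (vaddr/iova/len) formerly delivered through
	 * register_memory_area is available right here; a driver would
	 * record it in its private data before populating objects.
	 */
	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
					       obj_cb, obj_cb_arg);
}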

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 8d1b362..02ffcd4 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -51,7 +51,6 @@ Deprecation Notices
 
   The following changes are planned:
 
-  - substitute ``register_memory_area`` with ``populate`` ops.
   - addition of new op to allocate contiguous
     block of objects if underlying driver supports it.
 
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 3869d04..3ed4aae 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -223,6 +223,8 @@ ABI Changes
   Callback ``get_capabilities`` has been removed from ``rte_mempool_ops``
   since its features are covered by ``calc_mem_size`` and ``populate``
   callbacks.
+  Callback ``register_memory_area`` has been removed from ``rte_mempool_ops``
+  since the new callback ``populate`` may be used instead of it.
 
 
 Removed Items
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index c63c363..84b3d64 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -378,11 +378,6 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	if (ret != 0)
 		return ret;
 
-	/* Notify memory area to mempool */
-	ret = rte_mempool_ops_register_memory_area(mp, vaddr, iova, len);
-	if (ret != -ENOTSUP && ret < 0)
-		return ret;
-
 	/* mempool is already populated */
 	if (mp->populated_size >= mp->size)
 		return -ENOSPC;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 9107f5a..314f909 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -371,12 +371,6 @@ typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
 typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
 
 /**
- * Notify new memory area to mempool.
- */
-typedef int (*rte_mempool_ops_register_memory_area_t)
-(const struct rte_mempool *mp, char *vaddr, rte_iova_t iova, size_t len);
-
-/**
  * Calculate memory size required to store given number of objects.
  *
  * If mempool objects are not required to be IOVA-contiguous
@@ -514,10 +508,6 @@ struct rte_mempool_ops {
 	rte_mempool_dequeue_t dequeue;   /**< Dequeue an object. */
 	rte_mempool_get_count get_count; /**< Get qty of available objs. */
 	/**
-	 * Notify new memory area to mempool
-	 */
-	rte_mempool_ops_register_memory_area_t register_memory_area;
-	/**
 	 * Optional callback to calculate memory size required to
 	 * store specified number of objects.
 	 */
@@ -639,27 +629,6 @@ unsigned
 rte_mempool_ops_get_count(const struct rte_mempool *mp);
 
 /**
- * @internal wrapper for mempool_ops register_memory_area callback.
- * API to notify the mempool handler when a new memory area is added to pool.
- *
- * @param mp
- *   Pointer to the memory pool.
- * @param vaddr
- *   Pointer to the buffer virtual address.
- * @param iova
- *   Pointer to the buffer IO address.
- * @param len
- *   Pool size.
- * @return
- *   - 0: Success;
- *   - -ENOTSUP - doesn't support register_memory_area ops (valid error case).
- *   - Otherwise, rte_mempool_populate_phys fails thus pool create fails.
- */
-int
-rte_mempool_ops_register_memory_area(const struct rte_mempool *mp,
-				char *vaddr, rte_iova_t iova, size_t len);
-
-/**
  * @internal wrapper for mempool_ops calc_mem_size callback.
  * API to calculate size of memory required to store specified number of
  * object.
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index 6ac669a..ea9be1e 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -57,7 +57,6 @@ rte_mempool_register_ops(const struct rte_mempool_ops *h)
 	ops->enqueue = h->enqueue;
 	ops->dequeue = h->dequeue;
 	ops->get_count = h->get_count;
-	ops->register_memory_area = h->register_memory_area;
 	ops->calc_mem_size = h->calc_mem_size;
 	ops->populate = h->populate;
 
@@ -99,19 +98,6 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
 }
 
 /* wrapper to notify new memory area to external mempool */
-int
-rte_mempool_ops_register_memory_area(const struct rte_mempool *mp, char *vaddr,
-					rte_iova_t iova, size_t len)
-{
-	struct rte_mempool_ops *ops;
-
-	ops = rte_mempool_get_ops(mp->ops_index);
-
-	RTE_FUNC_PTR_OR_ERR_RET(ops->register_memory_area, -ENOTSUP);
-	return ops->register_memory_area(mp, vaddr, iova, len);
-}
-
-/* wrapper to notify new memory area to external mempool */
 ssize_t
 rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 				uint32_t obj_num, uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 637f73f..cf375db 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -45,7 +45,6 @@ DPDK_16.07 {
 DPDK_17.11 {
 	global:
 
-	rte_mempool_ops_register_memory_area;
 	rte_mempool_populate_iova;
 	rte_mempool_populate_iova_tab;
 
-- 
2.7.4

^ permalink raw reply	[relevance 8%]

* [dpdk-dev] [PATCH v4 06/11] mempool: remove callback to get capabilities
  2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
  2018-04-16 13:24  7%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 05/11] mempool: add op to populate objects using provided memory Andrew Rybchenko
@ 2018-04-16 13:24  6%   ` Andrew Rybchenko
  2018-04-16 13:24  4%   ` [dpdk-dev] [PATCH v4 07/11] mempool: deprecate xmem functions Andrew Rybchenko
  2018-04-16 13:24  8%   ` [dpdk-dev] [PATCH v4 10/11] mempool: remove callback to register memory area Andrew Rybchenko
  4 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ, Santosh Shukla, Jerin Jacob

The callback was introduced to let the generic code know the octeontx
mempool driver requirements: a single physically contiguous memory chunk
to store all objects, with object addresses aligned to the total object
size. These requirements are now met using the new callbacks which
calculate the required memory chunk size and populate objects using the
provided memory chunk.

These capability flags are not used anywhere else.

Restricting capabilities to flags is not generic and is likely to be
insufficient to describe mempool driver features. If required in the
future, an API which returns structured information may be added.
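
Not part of this patch, but as a rough sketch of the conversion pattern:
a driver that previously advertised MEMPOOL_F_CAPA_PHYS_CONTIG can express
the same constraint through calc_mem_size (the example_ name is
hypothetical; the octeontx change below does effectively this):

#include <rte_mempool.h>

static ssize_t
example_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
		      uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
{
	ssize_t mem_size;

	/* Let the default callback compute the total memory size. */
	mem_size = rte_mempool_op_calc_mem_size_default(mp, obj_num, pg_shift,
							min_chunk_size, align);
	/* The whole area holding the objects must be physically contiguous. */
	if (mem_size >= 0)
		*min_chunk_size = mem_size;

	return mem_size;
}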

Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Santosh Shukla <Santosh.Shukla@caviumnetworks.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
v3 -> v4:
 - rebase

v2 -> v3:
 - none

v1 -> v2:
 - fix typo
 - rebase on top of patch which renames MEMPOOL_F_NO_PHYS_CONTIG

RFCv2 -> v1:
 - squash mempool/octeontx patches to add calc_mem_size and populate
   callbacks to this one in order to avoid breakages in the middle of
   patchset
 - advertise API changes in release notes

 doc/guides/rel_notes/deprecation.rst            |  1 -
 doc/guides/rel_notes/release_18_05.rst          | 11 +++++
 drivers/mempool/octeontx/rte_mempool_octeontx.c | 59 +++++++++++++++++++++----
 lib/librte_mempool/rte_mempool.c                | 34 ++------------
 lib/librte_mempool/rte_mempool.h                | 52 +---------------------
 lib/librte_mempool/rte_mempool_ops.c            | 14 ------
 lib/librte_mempool/rte_mempool_ops_default.c    | 15 +------
 lib/librte_mempool/rte_mempool_version.map      |  1 -
 8 files changed, 68 insertions(+), 119 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 575da18..99a0b01 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -58,7 +58,6 @@ Deprecation Notices
 
   The following changes are planned:
 
-  - removal of ``get_capabilities`` mempool ops and related flags.
   - substitute ``register_memory_area`` with ``populate`` ops.
   - addition of new op to allocate contiguous
     block of objects if underlying driver supports it.
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 5c6588e..f481eea 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -173,6 +173,14 @@ API Changes
    fall-back value. Previously, setting ``nb_tx_desc`` to zero would have
    resulted in an error.
 
+* **Removed mempool capability flags and related functions.**
+
+  Flags ``MEMPOOL_F_CAPA_PHYS_CONTIG`` and
+  ``MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS`` were used by octeontx mempool
+  driver to customize generic mempool library behaviour.
+  Now the new driver callbacks ``calc_mem_size`` and ``populate`` may be
+  used to achieve it without specific knowledge in the generic code.
+
 
 ABI Changes
 -----------
@@ -201,6 +209,9 @@ ABI Changes
   to allow to customize required memory size calculation.
   A new callback ``populate`` has been added to ``rte_mempool_ops``
   to allow to customize objects population.
+  Callback ``get_capabilities`` has been removed from ``rte_mempool_ops``
+  since its features are covered by ``calc_mem_size`` and ``populate``
+  callbacks.
 
 
 Removed Items
diff --git a/drivers/mempool/octeontx/rte_mempool_octeontx.c b/drivers/mempool/octeontx/rte_mempool_octeontx.c
index d143d05..64ed528 100644
--- a/drivers/mempool/octeontx/rte_mempool_octeontx.c
+++ b/drivers/mempool/octeontx/rte_mempool_octeontx.c
@@ -126,14 +126,29 @@ octeontx_fpavf_get_count(const struct rte_mempool *mp)
 	return octeontx_fpa_bufpool_free_count(pool);
 }
 
-static int
-octeontx_fpavf_get_capabilities(const struct rte_mempool *mp,
-				unsigned int *flags)
+static ssize_t
+octeontx_fpavf_calc_mem_size(const struct rte_mempool *mp,
+			     uint32_t obj_num, uint32_t pg_shift,
+			     size_t *min_chunk_size, size_t *align)
 {
-	RTE_SET_USED(mp);
-	*flags |= (MEMPOOL_F_CAPA_PHYS_CONTIG |
-			MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS);
-	return 0;
+	ssize_t mem_size;
+
+	/*
+	 * Simply need space for one more object to be able to
+	 * fulfil alignment requirements.
+	 */
+	mem_size = rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
+							pg_shift,
+							min_chunk_size, align);
+	if (mem_size >= 0) {
+		/*
+		 * Memory area which contains objects must be physically
+		 * contiguous.
+		 */
+		*min_chunk_size = mem_size;
+	}
+
+	return mem_size;
 }
 
 static int
@@ -150,6 +165,33 @@ octeontx_fpavf_register_memory_area(const struct rte_mempool *mp,
 	return octeontx_fpavf_pool_set_range(pool_bar, len, vaddr, gpool);
 }
 
+static int
+octeontx_fpavf_populate(struct rte_mempool *mp, unsigned int max_objs,
+			void *vaddr, rte_iova_t iova, size_t len,
+			rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
+{
+	size_t total_elt_sz;
+	size_t off;
+
+	if (iova == RTE_BAD_IOVA)
+		return -EINVAL;
+
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	/* align object start address to a multiple of total_elt_sz */
+	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
+
+	if (len < off)
+		return -EINVAL;
+
+	vaddr = (char *)vaddr + off;
+	iova += off;
+	len -= off;
+
+	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
+					       obj_cb, obj_cb_arg);
+}
+
 static struct rte_mempool_ops octeontx_fpavf_ops = {
 	.name = "octeontx_fpavf",
 	.alloc = octeontx_fpavf_alloc,
@@ -157,8 +199,9 @@ static struct rte_mempool_ops octeontx_fpavf_ops = {
 	.enqueue = octeontx_fpavf_enqueue,
 	.dequeue = octeontx_fpavf_dequeue,
 	.get_count = octeontx_fpavf_get_count,
-	.get_capabilities = octeontx_fpavf_get_capabilities,
 	.register_memory_area = octeontx_fpavf_register_memory_area,
+	.calc_mem_size = octeontx_fpavf_calc_mem_size,
+	.populate = octeontx_fpavf_populate,
 };
 
 MEMPOOL_REGISTER_OPS(octeontx_fpavf_ops);
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 68ae12f..5c75c16 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -231,15 +231,9 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
  */
 size_t
 rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift,
-		      unsigned int flags)
+		      __rte_unused unsigned int flags)
 {
 	size_t obj_per_page, pg_num, pg_sz;
-	unsigned int mask;
-
-	mask = MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS | MEMPOOL_F_CAPA_PHYS_CONTIG;
-	if ((flags & mask) == mask)
-		/* alignment need one additional object */
-		elt_num += 1;
 
 	if (total_elt_sz == 0)
 		return 0;
@@ -263,18 +257,12 @@ rte_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift,
 ssize_t
 rte_mempool_xmem_usage(__rte_unused void *vaddr, uint32_t elt_num,
 	size_t total_elt_sz, const rte_iova_t iova[], uint32_t pg_num,
-	uint32_t pg_shift, unsigned int flags)
+	uint32_t pg_shift, __rte_unused unsigned int flags)
 {
 	uint32_t elt_cnt = 0;
 	rte_iova_t start, end;
 	uint32_t iova_idx;
 	size_t pg_sz = (size_t)1 << pg_shift;
-	unsigned int mask;
-
-	mask = MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS | MEMPOOL_F_CAPA_PHYS_CONTIG;
-	if ((flags & mask) == mask)
-		/* alignment need one additional object */
-		elt_num += 1;
 
 	/* if iova is NULL, assume contiguous memory */
 	if (iova == NULL) {
@@ -368,8 +356,6 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	rte_iova_t iova, size_t len, rte_mempool_memchunk_free_cb_t *free_cb,
 	void *opaque)
 {
-	unsigned total_elt_sz;
-	unsigned int mp_capa_flags;
 	unsigned i = 0;
 	size_t off;
 	struct rte_mempool_memhdr *memhdr;
@@ -388,17 +374,6 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	if (mp->populated_size >= mp->size)
 		return -ENOSPC;
 
-	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
-
-	/* Get mempool capabilities */
-	mp_capa_flags = 0;
-	ret = rte_mempool_ops_get_capabilities(mp, &mp_capa_flags);
-	if ((ret < 0) && (ret != -ENOTSUP))
-		return ret;
-
-	/* update mempool capabilities */
-	mp->flags |= mp_capa_flags;
-
 	memhdr = rte_zmalloc("MEMPOOL_MEMHDR", sizeof(*memhdr), 0);
 	if (memhdr == NULL)
 		return -ENOMEM;
@@ -410,10 +385,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	memhdr->free_cb = free_cb;
 	memhdr->opaque = opaque;
 
-	if (mp_capa_flags & MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS)
-		/* align object start address to a multiple of total_elt_sz */
-		off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
-	else if (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN)
+	if (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN)
 		off = RTE_PTR_ALIGN_CEIL(vaddr, 8) - vaddr;
 	else
 		off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_CACHE_LINE_SIZE) - vaddr;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 754261e..0b83d5e 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -246,24 +246,6 @@ struct rte_mempool {
 #define MEMPOOL_F_POOL_CREATED   0x0010 /**< Internal: pool is created. */
 #define MEMPOOL_F_NO_IOVA_CONTIG 0x0020 /**< Don't need IOVA contiguous objs. */
 #define MEMPOOL_F_NO_PHYS_CONTIG MEMPOOL_F_NO_IOVA_CONTIG /* deprecated */
-/**
- * This capability flag is advertised by a mempool handler, if the whole
- * memory area containing the objects must be physically contiguous.
- * Note: This flag should not be passed by application.
- */
-#define MEMPOOL_F_CAPA_PHYS_CONTIG 0x0040
-/**
- * This capability flag is advertised by a mempool handler. Used for a case
- * where mempool driver wants object start address(vaddr) aligned to block
- * size(/ total element size).
- *
- * Note:
- * - This flag should not be passed by application.
- *   Flag used for mempool driver only.
- * - Mempool driver must also set MEMPOOL_F_CAPA_PHYS_CONTIG flag along with
- *   MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS.
- */
-#define MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS 0x0080
 
 /**
  * @internal When debug is enabled, store some statistics.
@@ -389,12 +371,6 @@ typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
 typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
 
 /**
- * Get the mempool capabilities.
- */
-typedef int (*rte_mempool_get_capabilities_t)(const struct rte_mempool *mp,
-		unsigned int *flags);
-
-/**
  * Notify new memory area to mempool.
  */
 typedef int (*rte_mempool_ops_register_memory_area_t)
@@ -440,13 +416,7 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
  * that pages are grouped in subsets of physically continuous pages big
  * enough to store at least one object.
  *
- * If mempool driver requires object addresses to be block size aligned
- * (MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS), space for one extra element is
- * reserved to be able to meet the requirement.
- *
- * Minimum size of memory chunk is either all required space, if
- * capabilities say that whole memory area must be physically contiguous
- * (MEMPOOL_F_CAPA_PHYS_CONTIG), or a maximum of the page size and total
+ * Minimum size of memory chunk is a maximum of the page size and total
  * element size.
  *
  * Required memory chunk alignment is a maximum of page size and cache
@@ -522,10 +492,6 @@ struct rte_mempool_ops {
 	rte_mempool_dequeue_t dequeue;   /**< Dequeue an object. */
 	rte_mempool_get_count get_count; /**< Get qty of available objs. */
 	/**
-	 * Get the mempool capabilities
-	 */
-	rte_mempool_get_capabilities_t get_capabilities;
-	/**
 	 * Notify new memory area to mempool
 	 */
 	rte_mempool_ops_register_memory_area_t register_memory_area;
@@ -651,22 +617,6 @@ unsigned
 rte_mempool_ops_get_count(const struct rte_mempool *mp);
 
 /**
- * @internal wrapper for mempool_ops get_capabilities callback.
- *
- * @param mp [in]
- *   Pointer to the memory pool.
- * @param flags [out]
- *   Pointer to the mempool flags.
- * @return
- *   - 0: Success; The mempool driver has advertised his pool capabilities in
- *   flags param.
- *   - -ENOTSUP - doesn't support get_capabilities ops (valid case).
- *   - Otherwise, pool create fails.
- */
-int
-rte_mempool_ops_get_capabilities(const struct rte_mempool *mp,
-					unsigned int *flags);
-/**
  * @internal wrapper for mempool_ops register_memory_area callback.
  * API to notify the mempool handler when a new memory area is added to pool.
  *
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index 1a7f39f..6ac669a 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -57,7 +57,6 @@ rte_mempool_register_ops(const struct rte_mempool_ops *h)
 	ops->enqueue = h->enqueue;
 	ops->dequeue = h->dequeue;
 	ops->get_count = h->get_count;
-	ops->get_capabilities = h->get_capabilities;
 	ops->register_memory_area = h->register_memory_area;
 	ops->calc_mem_size = h->calc_mem_size;
 	ops->populate = h->populate;
@@ -99,19 +98,6 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
 	return ops->get_count(mp);
 }
 
-/* wrapper to get external mempool capabilities. */
-int
-rte_mempool_ops_get_capabilities(const struct rte_mempool *mp,
-					unsigned int *flags)
-{
-	struct rte_mempool_ops *ops;
-
-	ops = rte_mempool_get_ops(mp->ops_index);
-
-	RTE_FUNC_PTR_OR_ERR_RET(ops->get_capabilities, -ENOTSUP);
-	return ops->get_capabilities(mp, flags);
-}
-
 /* wrapper to notify new memory area to external mempool */
 int
 rte_mempool_ops_register_memory_area(const struct rte_mempool *mp, char *vaddr,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 57295f7..3defc15 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -11,26 +11,15 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 				     uint32_t obj_num, uint32_t pg_shift,
 				     size_t *min_chunk_size, size_t *align)
 {
-	unsigned int mp_flags;
-	int ret;
 	size_t total_elt_sz;
 	size_t mem_size;
 
-	/* Get mempool capabilities */
-	mp_flags = 0;
-	ret = rte_mempool_ops_get_capabilities(mp, &mp_flags);
-	if ((ret < 0) && (ret != -ENOTSUP))
-		return ret;
-
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
 	mem_size = rte_mempool_xmem_size(obj_num, total_elt_sz, pg_shift,
-					 mp->flags | mp_flags);
+					 mp->flags);
 
-	if (mp_flags & MEMPOOL_F_CAPA_PHYS_CONTIG)
-		*min_chunk_size = mem_size;
-	else
-		*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
+	*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
 
 	*align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift);
 
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 41a0b09..637f73f 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -45,7 +45,6 @@ DPDK_16.07 {
 DPDK_17.11 {
 	global:
 
-	rte_mempool_ops_get_capabilities;
 	rte_mempool_ops_register_memory_area;
 	rte_mempool_populate_iova;
 	rte_mempool_populate_iova_tab;
-- 
2.7.4

^ permalink raw reply	[relevance 6%]

* [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver
  @ 2018-04-16 13:24  2% ` Andrew Rybchenko
  2018-04-16 13:24  7%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
                     ` (4 more replies)
  2018-04-16 13:33  3% ` [dpdk-dev] [PATCH v2 0/6] mempool: add bucket driver Andrew Rybchenko
  1 sibling, 5 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev
  Cc: Olivier MATZ, Thomas Monjalon, Anatoly Burakov, Santosh Shukla,
	Jerin Jacob, Hemant Agrawal, Shreyansh Jain

The initial patch series [1] is split into two to simplify processing.
The second series relies on this one and will add bucket mempool driver
and related ops.

The patch series has generic enhancements suggested by Olivier.
Basically it adds driver callbacks to calculate the required memory size
and to populate objects using the provided memory area. This allows
removal of the so-called capability flags previously used to tell the
generic code how to allocate and slice memory into mempool objects.
The clean-up which removes get_capabilities and register_memory_area is
not strictly required, but I think it is the right thing to do.
Existing mempool drivers are updated.
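
As a hedged illustration of the end state (the example_ handler names are
placeholders for a driver's existing callbacks, not symbols from this
series), a driver now advertises its layout requirements through the two
new ops instead of capability flags when registering its ops:

static struct rte_mempool_ops example_ops = {
	.name		= "example",
	.alloc		= example_alloc,
	.free		= example_free,
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.get_count	= example_get_count,
	/* get_capabilities/register_memory_area are gone; instead: */
	.calc_mem_size	= example_calc_mem_size,
	.populate	= example_populate,
};

MEMPOOL_REGISTER_OPS(example_ops);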

rte_mempool_populate_iova_tab() is also deprecated in v2 as agreed in [2].
Unfortunately this requires adding the -Wno-deprecated-declarations flag
to librte_mempool since the function is used by the earlier deprecated
rte_mempool_populate_phys_tab(). If the latter may be removed in this
release, we can avoid adding the flag that allows usage of deprecated
functions.

A new patch is added to the series in v3 to rename MEMPOOL_F_NO_PHYS_CONTIG
to MEMPOOL_F_NO_IOVA_CONTIG as agreed in [3].
MEMPOOL_F_CAPA_PHYS_CONTIG is not renamed since it is removed in this
patchset.

It breaks the ABI since it changes rte_mempool_ops. It also removes
rte_mempool_ops_register_memory_area() and
rte_mempool_ops_get_capabilities() since the corresponding callbacks are
removed.

Internal global functions are not listed in the map file since they are
not part of the external API.

[1] https://dpdk.org/ml/archives/dev/2018-January/088698.html
[2] https://dpdk.org/ml/archives/dev/2018-March/093186.html
[3] https://dpdk.org/ml/archives/dev/2018-March/093345.html

v3 -> v4:
  - rebase on memory rework

v2 -> v3:
  - fix build error in mempool/dpaa: prepare to remove register memory area op

v1 -> v2:
  - deprecate rte_mempool_populate_iova_tab()
  - add patch to fix memory leak if no objects are populated
  - add patch to rename MEMPOOL_F_NO_PHYS_CONTIG
  - minor fixes (typos, blank line at the end of file)
  - highlight meaning of min_chunk_size (when it is virtual or
    physical contiguous)
  - make sure that mempool is initialized in rte_mempool_populate_anon()
  - move patch to ensure that mempool is initialized earlier in the series

RFCv2 -> v1:
  - split the series in two
  - squash octeontx patches which implement calc_mem_size and populate
    callbacks into the patch which removes get_capabilities since it is
    the easiest way to untangle the tangle of tightly related library
    functions and flags advertised by the driver
  - consistently name default callbacks
  - move default callbacks to dedicated file
  - see detailed description in patches

RFCv1 -> RFCv2:
  - add driver ops to calculate required memory size and populate
    mempool objects, remove extra flags which were required before
    to control it
  - transition of octeontx and dpaa drivers to the new callbacks
  - change info API to get information from driver required to
    API user to know contiguous block size
  - remove get_capabilities (not required any more and may be
    substituted with more in info get API)
  - remove register_memory_area since it is substituted with
    populate callback which can do more
  - use SPDX tags
  - avoid all objects affinity to single lcore
  - fix bucket get_count
  - deprecate XMEM API
  - avoid introduction of a new function to flush cache
  - fix NO_CACHE_ALIGN case in bucket mempool


Andrew Rybchenko (9):
  mempool: fix memhdr leak when no objects are populated
  mempool: rename flag to control IOVA-contiguous objects
  mempool: add op to calculate memory size to be allocated
  mempool: add op to populate objects using provided memory
  mempool: remove callback to get capabilities
  mempool: deprecate xmem functions
  mempool/octeontx: prepare to remove register memory area op
  mempool/dpaa: prepare to remove register memory area op
  mempool: remove callback to register memory area

Artem V. Andreev (2):
  mempool: ensure the mempool is initialized before populating
  mempool: support flushing the default cache of the mempool

 doc/guides/rel_notes/deprecation.rst            |  12 +-
 doc/guides/rel_notes/release_18_05.rst          |  34 ++-
 drivers/mempool/dpaa/dpaa_mempool.c             |  13 +-
 drivers/mempool/octeontx/rte_mempool_octeontx.c |  64 ++++--
 drivers/net/thunderx/nicvf_ethdev.c             |   2 +-
 lib/librte_mempool/Makefile                     |   6 +-
 lib/librte_mempool/meson.build                  |  17 +-
 lib/librte_mempool/rte_mempool.c                | 240 ++++++++++----------
 lib/librte_mempool/rte_mempool.h                | 280 +++++++++++++++++-------
 lib/librte_mempool/rte_mempool_ops.c            |  37 ++--
 lib/librte_mempool/rte_mempool_ops_default.c    |  51 +++++
 lib/librte_mempool/rte_mempool_version.map      |  10 +-
 test/test/test_mempool.c                        |  31 ---
 13 files changed, 528 insertions(+), 269 deletions(-)
 create mode 100644 lib/librte_mempool/rte_mempool_ops_default.c

-- 
2.7.4

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated
  2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
@ 2018-04-16 13:24  7%   ` Andrew Rybchenko
  2018-04-16 15:33  0%     ` Olivier Matz
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 05/11] mempool: add op to populate objects using provided memory Andrew Rybchenko
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ, Anatoly Burakov

The size of the memory chunk required to populate mempool objects depends
on how the objects are stored in memory. Different mempool drivers may
have different requirements, and the new operation allows the memory size
to be calculated in accordance with driver requirements, while advertising
the minimum memory chunk size and alignment requirements in a generic way.

Bump ABI version since the patch breaks it.
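
A minimal sketch of how the generic code is expected to consume the new
op's outputs (simplified from the rte_mempool_populate_default() changes
below; the helper name and error handling are illustrative only):

#include <errno.h>
#include <rte_mempool.h>
#include <rte_memzone.h>

static int
example_reserve_chunk(struct rte_mempool *mp, unsigned int n,
		      uint32_t pg_shift, const char *mz_name,
		      const struct rte_memzone **mzp)
{
	size_t min_chunk_size, align;
	ssize_t mem_size;
	const struct rte_memzone *mz;

	/* Ask the driver how much memory it needs and how to align it. */
	mem_size = rte_mempool_ops_calc_mem_size(mp, n, pg_shift,
						 &min_chunk_size, &align);
	if (mem_size < 0)
		return mem_size;

	mz = rte_memzone_reserve_aligned(mz_name, mem_size, mp->socket_id,
					 RTE_MEMZONE_SIZE_HINT_ONLY, align);
	if (mz == NULL)
		return -ENOMEM;
	/* The driver's minimum contiguous chunk must still fit. */
	if (mz->len < min_chunk_size) {
		rte_memzone_free(mz);
		return -ENOMEM;
	}

	*mzp = mz;
	return 0;
}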

Suggested-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
v3 -> v4:
 - rebased on top of memory rework
 - dropped previous Ack's since rebase is not trivial
 - check size calculation failure in rte_mempool_populate_anon() and
   rte_mempool_memchunk_anon_free()

v2 -> v3:
 - none

v1 -> v2:
 - clarify min_chunk_size meaning
 - rebase on top of patch series which fixes library version in meson
   build

RFCv2 -> v1:
 - move default calc_mem_size callback to rte_mempool_ops_default.c
 - add ABI changes to release notes
 - name default callback consistently: rte_mempool_op_<callback>_default()
 - bump ABI version since it is the first patch which breaks ABI
 - describe default callback behaviour in details
 - avoid introduction of internal function to cope with deprecation
   (keep it to deprecation patch)
 - move cache-line or page boundary chunk alignment to default callback
 - highlight that min_chunk_size and align parameters are output only

 doc/guides/rel_notes/deprecation.rst         |   3 +-
 doc/guides/rel_notes/release_18_05.rst       |   8 +-
 lib/librte_mempool/Makefile                  |   3 +-
 lib/librte_mempool/meson.build               |   5 +-
 lib/librte_mempool/rte_mempool.c             | 114 +++++++++++++++------------
 lib/librte_mempool/rte_mempool.h             |  86 +++++++++++++++++++-
 lib/librte_mempool/rte_mempool_ops.c         |  18 +++++
 lib/librte_mempool/rte_mempool_ops_default.c |  38 +++++++++
 lib/librte_mempool/rte_mempool_version.map   |   7 ++
 9 files changed, 225 insertions(+), 57 deletions(-)
 create mode 100644 lib/librte_mempool/rte_mempool_ops_default.c

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index c929dcc..2aa5ef3 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -60,8 +60,7 @@ Deprecation Notices
 
   - removal of ``get_capabilities`` mempool ops and related flags.
   - substitute ``register_memory_area`` with ``populate`` ops.
-  - addition of new ops to customize required memory chunk calculation,
-    customize objects population and allocate contiguous
+  - addition of new ops to customize objects population and allocate contiguous
     block of objects if underlying driver supports it.
 
 * mbuf: The opaque ``mbuf->hash.sched`` field will be updated to support generic
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 84295e4..7dbe7ac 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -195,6 +195,12 @@ ABI Changes
   type ``uint16_t``: ``burst_size``, ``ring_size``, and ``nb_queues``. These
   are parameter values recommended for use by the PMD.
 
+* **Changed rte_mempool_ops structure.**
+
+  A new callback ``calc_mem_size`` has been added to ``rte_mempool_ops``
+  to allow to customize required memory size calculation.
+
+
 Removed Items
 -------------
 
@@ -267,7 +273,7 @@ The libraries prepended with a plus sign were incremented in this version.
      librte_latencystats.so.1
      librte_lpm.so.2
    + librte_mbuf.so.4
-     librte_mempool.so.3
+   + librte_mempool.so.4
    + librte_meter.so.2
      librte_metrics.so.1
      librte_net.so.1
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 1f85d34..421e2a7 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -11,7 +11,7 @@ LDLIBS += -lrte_eal -lrte_ring
 
 EXPORT_MAP := rte_mempool_version.map
 
-LIBABIVER := 3
+LIBABIVER := 4
 
 # memseg walk is not yet part of stable API
 CFLAGS += -DALLOW_EXPERIMENTAL_API
@@ -19,6 +19,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) +=  rte_mempool.c
 SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) +=  rte_mempool_ops.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) +=  rte_mempool_ops_default.c
 # install includes
 SYMLINK-$(CONFIG_RTE_LIBRTE_MEMPOOL)-include := rte_mempool.h
 
diff --git a/lib/librte_mempool/meson.build b/lib/librte_mempool/meson.build
index 89506c5..6181ad8 100644
--- a/lib/librte_mempool/meson.build
+++ b/lib/librte_mempool/meson.build
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
-version = 3
-sources = files('rte_mempool.c', 'rte_mempool_ops.c')
+version = 4
+sources = files('rte_mempool.c', 'rte_mempool_ops.c',
+		'rte_mempool_ops_default.c')
 headers = files('rte_mempool.h')
 deps += ['ring']
 
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index b15b79b..fdcee05 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -574,12 +574,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	unsigned int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
 	char mz_name[RTE_MEMZONE_NAMESIZE];
 	const struct rte_memzone *mz;
-	size_t size, total_elt_sz, align, pg_sz, pg_shift;
+	ssize_t mem_size;
+	size_t align, pg_sz, pg_shift;
 	rte_iova_t iova;
 	unsigned mz_id, n;
-	unsigned int mp_flags;
 	int ret;
-	bool force_contig, no_contig, try_contig, no_pageshift;
+	bool no_contig, try_contig, no_pageshift;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -589,22 +589,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	if (mp->nb_mem_chunks != 0)
 		return -EEXIST;
 
-	/* Get mempool capabilities */
-	mp_flags = 0;
-	ret = rte_mempool_ops_get_capabilities(mp, &mp_flags);
-	if ((ret < 0) && (ret != -ENOTSUP))
-		return ret;
-
-	/* update mempool capabilities */
-	mp->flags |= mp_flags;
-
 	no_contig = mp->flags & MEMPOOL_F_NO_IOVA_CONTIG;
-	force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
 
 	/*
 	 * the following section calculates page shift and page size values.
 	 *
-	 * these values impact the result of rte_mempool_xmem_size(), which
+	 * these values impact the result of calc_mem_size operation, which
 	 * returns the amount of memory that should be allocated to store the
 	 * desired number of objects. when not zero, it allocates more memory
 	 * for the padding between objects, to ensure that an object does not
@@ -625,7 +615,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 *
 	 * if our IO addresses are virtual, not actual physical (IOVA as VA
 	 * case), then no page shift needed - our memory allocation will give us
-	 * contiguous physical memory as far as the hardware is concerned, so
+	 * contiguous IO memory as far as the hardware is concerned, so
 	 * act as if we're getting contiguous memory.
 	 *
 	 * if our IO addresses are physical, we may get memory from bigger
@@ -643,39 +633,35 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
 	 * memory, then we'll go and reserve space page-by-page.
 	 */
-	no_pageshift = no_contig || force_contig ||
-			rte_eal_iova_mode() == RTE_IOVA_VA;
+	no_pageshift = no_contig || rte_eal_iova_mode() == RTE_IOVA_VA;
 	try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages();
-	if (force_contig)
-		mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
 
 	if (no_pageshift) {
 		pg_sz = 0;
 		pg_shift = 0;
-		align = RTE_CACHE_LINE_SIZE;
 	} else if (try_contig) {
 		pg_sz = get_min_page_size();
 		pg_shift = rte_bsf32(pg_sz);
-		/* we're trying to reserve contiguous memzone first, so try
-		 * align to cache line; if we fail to reserve a contiguous
-		 * memzone, we'll adjust alignment to equal pagesize later.
-		 */
-		align = RTE_CACHE_LINE_SIZE;
 	} else {
 		pg_sz = getpagesize();
 		pg_shift = rte_bsf32(pg_sz);
-		align = pg_sz;
 	}
 
-	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
+		size_t min_chunk_size;
 		unsigned int flags;
+
 		if (try_contig || no_pageshift)
-			size = rte_mempool_xmem_size(n, total_elt_sz, 0,
-				mp->flags);
+			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
+					0, &min_chunk_size, &align);
 		else
-			size = rte_mempool_xmem_size(n, total_elt_sz, pg_shift,
-				mp->flags);
+			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
+					pg_shift, &min_chunk_size, &align);
+
+		if (mem_size < 0) {
+			ret = mem_size;
+			goto fail;
+		}
 
 		ret = snprintf(mz_name, sizeof(mz_name),
 			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
@@ -692,27 +678,31 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		if (try_contig)
 			flags |= RTE_MEMZONE_IOVA_CONTIG;
 
-		mz = rte_memzone_reserve_aligned(mz_name, size, mp->socket_id,
-				flags, align);
+		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
+				mp->socket_id, flags, align);
 
-		/* if we were trying to allocate contiguous memory, adjust
-		 * memzone size and page size to fit smaller page sizes, and
-		 * try again.
+		/* if we were trying to allocate contiguous memory, failed and
+		 * minimum required contiguous chunk fits minimum page, adjust
+		 * memzone size to the page size, and try again.
 		 */
-		if (mz == NULL && try_contig) {
+		if (mz == NULL && try_contig && min_chunk_size <= pg_sz) {
 			try_contig = false;
 			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
-			align = pg_sz;
-			size = rte_mempool_xmem_size(n, total_elt_sz,
-				pg_shift, mp->flags);
 
-			mz = rte_memzone_reserve_aligned(mz_name, size,
+			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
+					pg_shift, &min_chunk_size, &align);
+			if (mem_size < 0) {
+				ret = mem_size;
+				goto fail;
+			}
+
+			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
 				mp->socket_id, flags, align);
 		}
 		/* don't try reserving with 0 size if we were asked to reserve
 		 * IOVA-contiguous memory.
 		 */
-		if (!force_contig && mz == NULL) {
+		if (min_chunk_size < (size_t)mem_size && mz == NULL) {
 			/* not enough memory, retry with the biggest zone we
 			 * have
 			 */
@@ -724,6 +714,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			goto fail;
 		}
 
+		if (mz->len < min_chunk_size) {
+			rte_memzone_free(mz);
+			ret = -ENOMEM;
+			goto fail;
+		}
+
 		if (no_contig)
 			iova = RTE_BAD_IOVA;
 		else
@@ -753,16 +749,18 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 }
 
 /* return the memory size required for mempool objects in anonymous mem */
-static size_t
+static ssize_t
 get_anon_size(const struct rte_mempool *mp)
 {
-	size_t size, total_elt_sz, pg_sz, pg_shift;
+	ssize_t size;
+	size_t pg_sz, pg_shift;
+	size_t min_chunk_size;
+	size_t align;
 
 	pg_sz = getpagesize();
 	pg_shift = rte_bsf32(pg_sz);
-	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
-	size = rte_mempool_xmem_size(mp->size, total_elt_sz, pg_shift,
-					mp->flags);
+	size = rte_mempool_ops_calc_mem_size(mp, mp->size, pg_shift,
+					     &min_chunk_size, &align);
 
 	return size;
 }
@@ -772,14 +770,25 @@ static void
 rte_mempool_memchunk_anon_free(struct rte_mempool_memhdr *memhdr,
 	void *opaque)
 {
-	munmap(opaque, get_anon_size(memhdr->mp));
+	ssize_t size;
+
+	/*
+	 * Calculate size since memhdr->len has contiguous chunk length
+	 * which may be smaller if anon map is split into many contiguous
+	 * chunks. Result must be the same as we calculated on populate.
+	 */
+	size = get_anon_size(memhdr->mp);
+	if (size < 0)
+		return;
+
+	munmap(opaque, size);
 }
 
 /* populate the mempool with an anonymous mapping */
 int
 rte_mempool_populate_anon(struct rte_mempool *mp)
 {
-	size_t size;
+	ssize_t size;
 	int ret;
 	char *addr;
 
@@ -793,8 +802,13 @@ rte_mempool_populate_anon(struct rte_mempool *mp)
 	if (ret != 0)
 		return ret;
 
-	/* get chunk of virtually continuous memory */
 	size = get_anon_size(mp);
+	if (size < 0) {
+		rte_errno = -size;
+		return 0;
+	}
+
+	/* get chunk of virtually continuous memory */
 	addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
 		MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 	if (addr == MAP_FAILED) {
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index e531a15..191255d 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -400,6 +400,62 @@ typedef int (*rte_mempool_get_capabilities_t)(const struct rte_mempool *mp,
 typedef int (*rte_mempool_ops_register_memory_area_t)
 (const struct rte_mempool *mp, char *vaddr, rte_iova_t iova, size_t len);
 
+/**
+ * Calculate memory size required to store given number of objects.
+ *
+ * If mempool objects are not required to be IOVA-contiguous
+ * (the flag MEMPOOL_F_NO_IOVA_CONTIG is set), min_chunk_size defines
+ * virtually contiguous chunk size. Otherwise, if mempool objects must
+ * be IOVA-contiguous (the flag MEMPOOL_F_NO_IOVA_CONTIG is clear),
+ * min_chunk_size defines IOVA-contiguous chunk size.
+ *
+ * @param[in] mp
+ *   Pointer to the memory pool.
+ * @param[in] obj_num
+ *   Number of objects.
+ * @param[in] pg_shift
+ *   LOG2 of the physical pages size. If set to 0, ignore page boundaries.
+ * @param[out] min_chunk_size
+ *   Location for minimum size of the memory chunk which may be used to
+ *   store memory pool objects.
+ * @param[out] align
+ *   Location for required memory chunk alignment.
+ * @return
+ *   Required memory size aligned at page boundary.
+ */
+typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
+		uint32_t obj_num,  uint32_t pg_shift,
+		size_t *min_chunk_size, size_t *align);
+
+/**
+ * Default way to calculate memory size required to store given number of
+ * objects.
+ *
+ * If page boundaries may be ignored, it is just a product of total
+ * object size including header and trailer and number of objects.
+ * Otherwise, it is a number of pages required to store given number of
+ * objects without crossing page boundary.
+ *
+ * Note that if object size is bigger than page size, then it assumes
+ * that pages are grouped in subsets of physically continuous pages big
+ * enough to store at least one object.
+ *
+ * If mempool driver requires object addresses to be block size aligned
+ * (MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS), space for one extra element is
+ * reserved to be able to meet the requirement.
+ *
+ * Minimum size of memory chunk is either all required space, if
+ * capabilities say that whole memory area must be physically contiguous
+ * (MEMPOOL_F_CAPA_PHYS_CONTIG), or a maximum of the page size and total
+ * element size.
+ *
+ * Required memory chunk alignment is a maximum of page size and cache
+ * line size.
+ */
+ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
+		uint32_t obj_num, uint32_t pg_shift,
+		size_t *min_chunk_size, size_t *align);
+
 /** Structure defining mempool operations structure */
 struct rte_mempool_ops {
 	char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
@@ -416,6 +472,11 @@ struct rte_mempool_ops {
 	 * Notify new memory area to mempool
 	 */
 	rte_mempool_ops_register_memory_area_t register_memory_area;
+	/**
+	 * Optional callback to calculate memory size required to
+	 * store specified number of objects.
+	 */
+	rte_mempool_calc_mem_size_t calc_mem_size;
 } __rte_cache_aligned;
 
 #define RTE_MEMPOOL_MAX_OPS_IDX 16  /**< Max registered ops structs */
@@ -565,6 +626,29 @@ rte_mempool_ops_register_memory_area(const struct rte_mempool *mp,
 				char *vaddr, rte_iova_t iova, size_t len);
 
 /**
+ * @internal wrapper for mempool_ops calc_mem_size callback.
+ * API to calculate size of memory required to store specified number of
+ * object.
+ *
+ * @param[in] mp
+ *   Pointer to the memory pool.
+ * @param[in] obj_num
+ *   Number of objects.
+ * @param[in] pg_shift
+ *   LOG2 of the physical pages size. If set to 0, ignore page boundaries.
+ * @param[out] min_chunk_size
+ *   Location for minimum size of the memory chunk which may be used to
+ *   store memory pool objects.
+ * @param[out] align
+ *   Location for required memory chunk alignment.
+ * @return
+ *   Required memory size aligned at page boundary.
+ */
+ssize_t rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
+				      uint32_t obj_num, uint32_t pg_shift,
+				      size_t *min_chunk_size, size_t *align);
+
+/**
  * @internal wrapper for mempool_ops free callback.
  *
  * @param mp
@@ -1534,7 +1618,7 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
  * of objects. Assume that the memory buffer will be aligned at page
  * boundary.
  *
- * Note that if object size is bigger then page size, then it assumes
+ * Note that if object size is bigger than page size, then it assumes
  * that pages are grouped in subsets of physically continuous pages big
  * enough to store at least one object.
  *
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index 0732255..26908cc 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -59,6 +59,7 @@ rte_mempool_register_ops(const struct rte_mempool_ops *h)
 	ops->get_count = h->get_count;
 	ops->get_capabilities = h->get_capabilities;
 	ops->register_memory_area = h->register_memory_area;
+	ops->calc_mem_size = h->calc_mem_size;
 
 	rte_spinlock_unlock(&rte_mempool_ops_table.sl);
 
@@ -123,6 +124,23 @@ rte_mempool_ops_register_memory_area(const struct rte_mempool *mp, char *vaddr,
 	return ops->register_memory_area(mp, vaddr, iova, len);
 }
 
+/* wrapper for mempool_ops calc_mem_size callback */
+ssize_t
+rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
+				uint32_t obj_num, uint32_t pg_shift,
+				size_t *min_chunk_size, size_t *align)
+{
+	struct rte_mempool_ops *ops;
+
+	ops = rte_mempool_get_ops(mp->ops_index);
+
+	if (ops->calc_mem_size == NULL)
+		return rte_mempool_op_calc_mem_size_default(mp, obj_num,
+				pg_shift, min_chunk_size, align);
+
+	return ops->calc_mem_size(mp, obj_num, pg_shift, min_chunk_size, align);
+}
+
 /* sets mempool ops previously registered by rte_mempool_register_ops. */
 int
 rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
new file mode 100644
index 0000000..57fe79b
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016 Intel Corporation.
+ * Copyright(c) 2016 6WIND S.A.
+ * Copyright(c) 2018 Solarflare Communications Inc.
+ */
+
+#include <rte_mempool.h>
+
+ssize_t
+rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
+				     uint32_t obj_num, uint32_t pg_shift,
+				     size_t *min_chunk_size, size_t *align)
+{
+	unsigned int mp_flags;
+	int ret;
+	size_t total_elt_sz;
+	size_t mem_size;
+
+	/* Get mempool capabilities */
+	mp_flags = 0;
+	ret = rte_mempool_ops_get_capabilities(mp, &mp_flags);
+	if ((ret < 0) && (ret != -ENOTSUP))
+		return ret;
+
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	mem_size = rte_mempool_xmem_size(obj_num, total_elt_sz, pg_shift,
+					 mp->flags | mp_flags);
+
+	if (mp_flags & MEMPOOL_F_CAPA_PHYS_CONTIG)
+		*min_chunk_size = mem_size;
+	else
+		*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
+
+	*align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift);
+
+	return mem_size;
+}
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 62b76f9..cb38189 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -51,3 +51,10 @@ DPDK_17.11 {
 	rte_mempool_populate_iova_tab;
 
 } DPDK_16.07;
+
+DPDK_18.05 {
+	global:
+
+	rte_mempool_op_calc_mem_size_default;
+
+} DPDK_17.11;
-- 
2.7.4

^ permalink raw reply	[relevance 7%]

* [dpdk-dev] [PATCH v4 05/11] mempool: add op to populate objects using provided memory
  2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
  2018-04-16 13:24  7%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
@ 2018-04-16 13:24  6%   ` Andrew Rybchenko
  2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 06/11] mempool: remove callback to get capabilities Andrew Rybchenko
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-16 13:24 UTC (permalink / raw)
  To: dev; +Cc: Olivier MATZ

The callback allows customizing how objects are stored in the memory
chunk. A default implementation of the callback, which simply places
objects one by one, is available.
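
As a hedged sketch of a driver-specific populate callback (mirroring the
octeontx conversion earlier in this series; the example_ name is
hypothetical), a driver that needs object start addresses aligned to the
total element size can adjust the offset itself and then delegate to the
default implementation:

#include <errno.h>
#include <rte_mempool.h>

static int
example_populate(struct rte_mempool *mp, unsigned int max_objs,
		 void *vaddr, rte_iova_t iova, size_t len,
		 rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
{
	size_t total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
	/* Skip forward to the next multiple of total_elt_sz. */
	size_t off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);

	if (iova == RTE_BAD_IOVA || len < off)
		return -EINVAL;

	return rte_mempool_op_populate_default(mp, max_objs,
					       (char *)vaddr + off, iova + off,
					       len - off, obj_cb, obj_cb_arg);
}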

Suggested-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Santosh Shukla <Santosh.Shukla@caviumnetworks.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
v3 -> v4:
 - none

v2 -> v3:
 - none

v1 -> v2:
 - fix memory leak if off is bigger than len

RFCv2 -> v1:
 - advertise ABI changes in release notes
 - use consistent name for default callback:
   rte_mempool_op_<callback>_default()
 - add opaque data pointer to populated object callback
 - move default callback to dedicated file

 doc/guides/rel_notes/deprecation.rst         |  2 +-
 doc/guides/rel_notes/release_18_05.rst       |  2 +
 lib/librte_mempool/rte_mempool.c             | 23 ++++---
 lib/librte_mempool/rte_mempool.h             | 90 ++++++++++++++++++++++++++++
 lib/librte_mempool/rte_mempool_ops.c         | 21 +++++++
 lib/librte_mempool/rte_mempool_ops_default.c | 24 ++++++++
 lib/librte_mempool/rte_mempool_version.map   |  1 +
 7 files changed, 149 insertions(+), 14 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 2aa5ef3..575da18 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -60,7 +60,7 @@ Deprecation Notices
 
   - removal of ``get_capabilities`` mempool ops and related flags.
   - substitute ``register_memory_area`` with ``populate`` ops.
-  - addition of new ops to customize objects population and allocate contiguous
+  - addition of new op to allocate contiguous
     block of objects if underlying driver supports it.
 
 * mbuf: The opaque ``mbuf->hash.sched`` field will be updated to support generic
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 7dbe7ac..5c6588e 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -199,6 +199,8 @@ ABI Changes
 
   A new callback ``calc_mem_size`` has been added to ``rte_mempool_ops``
   to allow to customize required memory size calculation.
+  A new callback ``populate`` has been added to ``rte_mempool_ops``
+  to allow to customize objects population.
 
 
 Removed Items
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index fdcee05..68ae12f 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -122,7 +122,8 @@ get_min_page_size(void)
 
 
 static void
-mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
+mempool_add_elem(struct rte_mempool *mp, __rte_unused void *opaque,
+		 void *obj, rte_iova_t iova)
 {
 	struct rte_mempool_objhdr *hdr;
 	struct rte_mempool_objtlr *tlr __rte_unused;
@@ -139,9 +140,6 @@ mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
 	tlr = __mempool_get_trailer(obj);
 	tlr->cookie = RTE_MEMPOOL_TRAILER_COOKIE;
 #endif
-
-	/* enqueue in ring */
-	rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
 }
 
 /* call obj_cb() for each mempool element */
@@ -420,17 +418,16 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	else
 		off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_CACHE_LINE_SIZE) - vaddr;
 
-	while (off + total_elt_sz <= len && mp->populated_size < mp->size) {
-		off += mp->header_size;
-		if (iova == RTE_BAD_IOVA)
-			mempool_add_elem(mp, (char *)vaddr + off,
-				RTE_BAD_IOVA);
-		else
-			mempool_add_elem(mp, (char *)vaddr + off, iova + off);
-		off += mp->elt_size + mp->trailer_size;
-		i++;
+	if (off > len) {
+		ret = -EINVAL;
+		goto fail;
 	}
 
+	i = rte_mempool_ops_populate(mp, mp->size - mp->populated_size,
+		(char *)vaddr + off,
+		(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off),
+		len - off, mempool_add_elem, NULL);
+
 	/* not enough room to store one object */
 	if (i == 0) {
 		ret = -EINVAL;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 191255d..754261e 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -456,6 +456,63 @@ ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 		uint32_t obj_num, uint32_t pg_shift,
 		size_t *min_chunk_size, size_t *align);
 
+/**
+ * Function to be called for each populated object.
+ *
+ * @param[in] mp
+ *   A pointer to the mempool structure.
+ * @param[in] opaque
+ *   An opaque pointer passed to iterator.
+ * @param[in] vaddr
+ *   Object virtual address.
+ * @param[in] iova
+ *   Input/output virtual address of the object or RTE_BAD_IOVA.
+ */
+typedef void (rte_mempool_populate_obj_cb_t)(struct rte_mempool *mp,
+		void *opaque, void *vaddr, rte_iova_t iova);
+
+/**
+ * Populate memory pool objects using provided memory chunk.
+ *
+ * Populated objects should be enqueued to the pool, e.g. using
+ * rte_mempool_ops_enqueue_bulk().
+ *
+ * If the given IO address is unknown (iova = RTE_BAD_IOVA),
+ * the chunk doesn't need to be physically contiguous (only virtually),
+ * and allocated objects may span two pages.
+ *
+ * @param[in] mp
+ *   A pointer to the mempool structure.
+ * @param[in] max_objs
+ *   Maximum number of objects to be populated.
+ * @param[in] vaddr
+ *   The virtual address of memory that should be used to store objects.
+ * @param[in] iova
+ *   The IO address
+ * @param[in] len
+ *   The length of memory in bytes.
+ * @param[in] obj_cb
+ *   Callback function to be executed for each populated object.
+ * @param[in] obj_cb_arg
+ *   An opaque pointer passed to the callback function.
+ * @return
+ *   The number of objects added on success.
+ *   On error, no objects are populated and a negative errno is returned.
+ */
+typedef int (*rte_mempool_populate_t)(struct rte_mempool *mp,
+		unsigned int max_objs,
+		void *vaddr, rte_iova_t iova, size_t len,
+		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
+
+/**
+ * Default way to populate memory pool object using provided memory
+ * chunk: just slice objects one by one.
+ */
+int rte_mempool_op_populate_default(struct rte_mempool *mp,
+		unsigned int max_objs,
+		void *vaddr, rte_iova_t iova, size_t len,
+		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
+
 /** Structure defining mempool operations structure */
 struct rte_mempool_ops {
 	char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
@@ -477,6 +534,11 @@ struct rte_mempool_ops {
 	 * store specified number of objects.
 	 */
 	rte_mempool_calc_mem_size_t calc_mem_size;
+	/**
+	 * Optional callback to populate mempool objects using
+	 * provided memory chunk.
+	 */
+	rte_mempool_populate_t populate;
 } __rte_cache_aligned;
 
 #define RTE_MEMPOOL_MAX_OPS_IDX 16  /**< Max registered ops structs */
@@ -649,6 +711,34 @@ ssize_t rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 				      size_t *min_chunk_size, size_t *align);
 
 /**
+ * @internal wrapper for mempool_ops populate callback.
+ *
+ * Populate memory pool objects using provided memory chunk.
+ *
+ * @param[in] mp
+ *   A pointer to the mempool structure.
+ * @param[in] max_objs
+ *   Maximum number of objects to be populated.
+ * @param[in] vaddr
+ *   The virtual address of memory that should be used to store objects.
+ * @param[in] iova
+ *   The IO address
+ * @param[in] len
+ *   The length of memory in bytes.
+ * @param[in] obj_cb
+ *   Callback function to be executed for each populated object.
+ * @param[in] obj_cb_arg
+ *   An opaque pointer passed to the callback function.
+ * @return
+ *   The number of objects added on success.
+ *   On error, no objects are populated and a negative errno is returned.
+ */
+int rte_mempool_ops_populate(struct rte_mempool *mp, unsigned int max_objs,
+			     void *vaddr, rte_iova_t iova, size_t len,
+			     rte_mempool_populate_obj_cb_t *obj_cb,
+			     void *obj_cb_arg);
+
+/**
  * @internal wrapper for mempool_ops free callback.
  *
  * @param mp
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index 26908cc..1a7f39f 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -60,6 +60,7 @@ rte_mempool_register_ops(const struct rte_mempool_ops *h)
 	ops->get_capabilities = h->get_capabilities;
 	ops->register_memory_area = h->register_memory_area;
 	ops->calc_mem_size = h->calc_mem_size;
+	ops->populate = h->populate;
 
 	rte_spinlock_unlock(&rte_mempool_ops_table.sl);
 
@@ -141,6 +142,26 @@ rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 	return ops->calc_mem_size(mp, obj_num, pg_shift, min_chunk_size, align);
 }
 
+/* wrapper to populate memory pool objects using provided memory chunk */
+int
+rte_mempool_ops_populate(struct rte_mempool *mp, unsigned int max_objs,
+				void *vaddr, rte_iova_t iova, size_t len,
+				rte_mempool_populate_obj_cb_t *obj_cb,
+				void *obj_cb_arg)
+{
+	struct rte_mempool_ops *ops;
+
+	ops = rte_mempool_get_ops(mp->ops_index);
+
+	if (ops->populate == NULL)
+		return rte_mempool_op_populate_default(mp, max_objs, vaddr,
+						       iova, len, obj_cb,
+						       obj_cb_arg);
+
+	return ops->populate(mp, max_objs, vaddr, iova, len, obj_cb,
+			     obj_cb_arg);
+}
+
 /* sets mempool ops previously registered by rte_mempool_register_ops. */
 int
 rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 57fe79b..57295f7 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -36,3 +36,27 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 
 	return mem_size;
 }
+
+int
+rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
+		void *vaddr, rte_iova_t iova, size_t len,
+		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
+{
+	size_t total_elt_sz;
+	size_t off;
+	unsigned int i;
+	void *obj;
+
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+		off += mp->header_size;
+		obj = (char *)vaddr + off;
+		obj_cb(mp, obj_cb_arg, obj,
+		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
+		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
+		off += mp->elt_size + mp->trailer_size;
+	}
+
+	return i;
+}
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index cb38189..41a0b09 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -56,5 +56,6 @@ DPDK_18.05 {
 	global:
 
 	rte_mempool_op_calc_mem_size_default;
+	rte_mempool_op_populate_default;
 
 } DPDK_17.11;
-- 
2.7.4

^ permalink raw reply	[relevance 6%]

* Re: [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (5 preceding siblings ...)
  2018-04-13 18:30  2% ` [dpdk-dev] [PATCH v3 12/13] eal: replace rte_panic instances in init sequence Arnon Warshavsky
@ 2018-04-16 11:22  0% ` Burakov, Anatoly
  6 siblings, 0 replies; 200+ results
From: Burakov, Anatoly @ 2018-04-16 11:22 UTC (permalink / raw)
  To: Arnon Warshavsky, thomas, wenzhuo.lu, declan.doherty,
	jerin.jacob, bruce.richardson, ferruh.yigit
  Cc: dev

On 13-Apr-18 7:30 PM, Arnon Warshavsky wrote:
> The purpose of this patch series is to cleanup the library code
> from paths that end up aborting the process,
> and move to checking error values, in order to allow the running process
> perform an orderly teardown or other mitigation of the event.
> 
> This patch modifies the majority of rte_panic calls
> under lib and drivers, and replaces them with a log message
> and an error return code according to context,
> that can be propagated up the call stack.
> 
> - Focus was given to the dpdk initialization path
> - Some of the panic calls within drivers were left in place where
>    the call is from within an interrupt or calls that are
>    on the data path, where there is no simple applicative
>    route to propagate the error to termination.
>    These should be handled by the driver maintainers.
> - In order to avoid breaking ABI where panic was called from public
>    void functions, a panic state variable was introduced so that
>    it can be queried after calling these void functions.
>    This took place for a single function call.
> - local void functions with no API were changed to return a value
>    where needed
> - No change took place in example and test files
> - No change took place for debug assertions calling panic
> - A new function was added to devtools/checkpatches.sh
>    in order to prevent new additions of calls to rte_panic
>    under lib and drivers.
> 
> Keep calm and don't panic
> 
> ---
> 
> v2:
> - reformat error messages so that literal strings are in the same line
> - fix typo in commit message
> - add new return code to doxygen of rte_memzone_free()
> 
> v3:
> - submit  all 13 patches changed and unchanged in the same patchset
> 

This patchset needs to be rebased. There were a few changes that make 
some of the patches unnecessary.

Changes in patches 7 and 9 were addressed in an earlier memory hotplug
patchset, and are no longer applicable. Some things may have changed for
patch 12 as well.

-- 
Thanks,
Anatoly

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v10 02/10] crypto/virtio: support virtio device init
  @ 2018-04-16  2:21  1%   ` Jay Zhou
  0 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-04-16  2:21 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

This patch implements the initialization of the virtio crypto device.
The virtio crypto device conforms to virtio-1.0, so this patch only
supports modern mode operation.
The cryptodev is created at the virtio crypto PCI device probing stage.
The function virtio_crypto_pkt_tx_burst() is used to transmit bursts of
packets and virtio_crypto_pkt_rx_burst() is used to receive bursts of
packets.
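
For context, a hedged usage sketch (not part of this patch; dev_id and
qp_id are assumed to come from the device and queue pair setup added in
later patches of this series) of how an application reaches these burst
functions through the standard cryptodev API:

#include <rte_cryptodev.h>

static uint16_t
example_crypto_burst(uint8_t dev_id, uint16_t qp_id,
		     struct rte_crypto_op **ops, uint16_t nb_ops)
{
	uint16_t nb_enq, nb_deq = 0;

	/* Enqueue is expected to be served by virtio_crypto_pkt_tx_burst(). */
	nb_enq = rte_cryptodev_enqueue_burst(dev_id, qp_id, ops, nb_ops);

	/* Dequeue is expected to be served by virtio_crypto_pkt_rx_burst(). */
	if (nb_enq > 0)
		nb_deq = rte_cryptodev_dequeue_burst(dev_id, qp_id, ops, nb_enq);

	return nb_deq;
}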

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Fan Zhang <roy.fan.zhang@intel.com>
---
 drivers/crypto/virtio/Makefile           |   3 +
 drivers/crypto/virtio/meson.build        |   3 +-
 drivers/crypto/virtio/virtio_cryptodev.c | 245 +++++++++++++++-
 drivers/crypto/virtio/virtio_cryptodev.h |  13 +
 drivers/crypto/virtio/virtio_logs.h      |  91 ++++++
 drivers/crypto/virtio/virtio_pci.c       | 462 +++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h       | 252 +++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h      | 137 +++++++++
 drivers/crypto/virtio/virtio_rxtx.c      |  26 ++
 drivers/crypto/virtio/virtqueue.c        |  43 +++
 drivers/crypto/virtio/virtqueue.h        | 171 ++++++++++++
 11 files changed, 1443 insertions(+), 3 deletions(-)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtio_rxtx.c
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/drivers/crypto/virtio/Makefile b/drivers/crypto/virtio/Makefile
index 2f04f0c..786afb8 100644
--- a/drivers/crypto/virtio/Makefile
+++ b/drivers/crypto/virtio/Makefile
@@ -18,6 +18,9 @@ LIBABIVER := 1
 #
 # all source are stored in SRCS-y
 #
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtqueue.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_pci.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_cryptodev.c
 
 # this lib depends upon:
diff --git a/drivers/crypto/virtio/meson.build b/drivers/crypto/virtio/meson.build
index 57d84c4..cee77cc 100644
--- a/drivers/crypto/virtio/meson.build
+++ b/drivers/crypto/virtio/meson.build
@@ -6,6 +6,7 @@ if not dep.found()
 	build = false
 endif
 deps += ['bus_pci']
-sources = files('virtio_cryptodev.c')
+sources = files('virtio_cryptodev.c', 'virtio_pci.c',
+		'virtio_rxtx.c', 'virtqueue.c')
 ext_deps += dep
 pkgconfig_extra_libs += '-lcrypto'
diff --git a/drivers/crypto/virtio/virtio_cryptodev.c b/drivers/crypto/virtio/virtio_cryptodev.c
index 3e54942..3fe2c80 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.c
+++ b/drivers/crypto/virtio/virtio_cryptodev.c
@@ -3,27 +3,240 @@
  */
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
 #include <rte_cryptodev_pmd.h>
+#include <rte_eal.h>
 #include "virtio_cryptodev.h"
+#include "virtqueue.h"
+
+int virtio_crypto_logtype_init;
+int virtio_crypto_logtype_session;
+int virtio_crypto_logtype_rx;
+int virtio_crypto_logtype_tx;
+int virtio_crypto_logtype_driver;
+
+/*
+ * The set of PCI devices this driver supports
+ */
+static const struct rte_pci_id pci_id_virtio_crypto_map[] = {
+	{ RTE_PCI_DEVICE(VIRTIO_CRYPTO_PCI_VENDORID,
+				VIRTIO_CRYPTO_PCI_DEVICEID) },
+	{ .vendor_id = 0, /* sentinel */ },
+};
 
 uint8_t cryptodev_virtio_driver_id;
 
+/*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+static struct rte_cryptodev_ops virtio_crypto_dev_ops = {
+	/* Device related operations */
+	.dev_configure			 = NULL,
+	.dev_start			 = NULL,
+	.dev_stop			 = NULL,
+	.dev_close			 = NULL,
+	.dev_infos_get			 = NULL,
+
+	.stats_get			 = NULL,
+	.stats_reset			 = NULL,
+
+	.queue_pair_setup                = NULL,
+	.queue_pair_release              = NULL,
+	.queue_pair_start                = NULL,
+	.queue_pair_stop                 = NULL,
+	.queue_pair_count                = NULL,
+
+	/* Crypto related operations */
+	.session_get_size	= NULL,
+	.session_configure	= NULL,
+	.session_clear		= NULL,
+	.qp_attach_session = NULL,
+	.qp_detach_session = NULL
+};
+
+static int
+virtio_negotiate_features(struct virtio_crypto_hw *hw, uint64_t req_features)
+{
+	uint64_t host_features;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Prepare guest_features: feature that driver wants to support */
+	VIRTIO_CRYPTO_INIT_LOG_DBG("guest_features before negotiate = %" PRIx64,
+		req_features);
+
+	/* Read device(host) feature bits */
+	host_features = VTPCI_OPS(hw)->get_features(hw);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("host_features before negotiate = %" PRIx64,
+		host_features);
+
+	/*
+	 * Negotiate features: Subset of device feature bits are written back
+	 * guest feature bits.
+	 */
+	hw->guest_features = req_features;
+	hw->guest_features = vtpci_cryptodev_negotiate_features(hw,
+							host_features);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("features after negotiate = %" PRIx64,
+		hw->guest_features);
+
+	if (hw->modern) {
+		if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"VIRTIO_F_VERSION_1 features is not enabled.");
+			return -1;
+		}
+		vtpci_cryptodev_set_status(hw,
+			VIRTIO_CONFIG_STATUS_FEATURES_OK);
+		if (!(vtpci_cryptodev_get_status(hw) &
+			VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR("failed to set FEATURES_OK "
+						"status!");
+			return -1;
+		}
+	}
+
+	hw->req_guest_features = req_features;
+
+	return 0;
+}
+
+/* reset device and renegotiate features if needed */
+static int
+virtio_crypto_init_device(struct rte_cryptodev *cryptodev,
+	uint64_t req_features)
+{
+	struct virtio_crypto_hw *hw = cryptodev->data->dev_private;
+	struct virtio_crypto_config local_config;
+	struct virtio_crypto_config *config = &local_config;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Reset the device although not necessary at startup */
+	vtpci_cryptodev_reset(hw);
+
+	/* Tell the host we've noticed this device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
+
+	/* Tell the host we've known how to drive the device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
+	if (virtio_negotiate_features(hw, req_features) < 0)
+		return -1;
+
+	/* Get status of the device */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, status),
+		&config->status, sizeof(config->status));
+	if (config->status != VIRTIO_CRYPTO_S_HW_READY) {
+		VIRTIO_CRYPTO_DRV_LOG_ERR("accelerator hardware is "
+				"not ready");
+		return -1;
+	}
+
+	/* Get number of data queues */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, max_dataqueues),
+		&config->max_dataqueues,
+		sizeof(config->max_dataqueues));
+	hw->max_dataqueues = config->max_dataqueues;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("hw->max_dataqueues=%d",
+		hw->max_dataqueues);
+
+	return 0;
+}
+
+/*
+ * This function is based on the probe() function.
+ * It returns 0 on success.
+ */
+static int
+crypto_virtio_create(const char *name, struct rte_pci_device *pci_dev,
+		struct rte_cryptodev_pmd_init_params *init_params)
+{
+	struct rte_cryptodev *cryptodev;
+	struct virtio_crypto_hw *hw;
+
+	PMD_INIT_FUNC_TRACE();
+
+	cryptodev = rte_cryptodev_pmd_create(name, &pci_dev->device,
+					init_params);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
+	cryptodev->driver_id = cryptodev_virtio_driver_id;
+	cryptodev->dev_ops = &virtio_crypto_dev_ops;
+
+	cryptodev->enqueue_burst = virtio_crypto_pkt_tx_burst;
+	cryptodev->dequeue_burst = virtio_crypto_pkt_rx_burst;
+
+	cryptodev->feature_flags = RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO |
+		RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING;
+
+	hw = cryptodev->data->dev_private;
+	hw->dev_id = cryptodev->data->dev_id;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("dev %d vendorID=0x%x deviceID=0x%x",
+		cryptodev->data->dev_id, pci_dev->id.vendor_id,
+		pci_dev->id.device_id);
+
+	/* pci device init */
+	if (vtpci_cryptodev_init(pci_dev, hw))
+		return -1;
+
+	if (virtio_crypto_init_device(cryptodev,
+			VIRTIO_CRYPTO_PMD_GUEST_FEATURES) < 0)
+		return -1;
+
+	return 0;
+}
+
 static int
 crypto_virtio_pci_probe(
 	struct rte_pci_driver *pci_drv __rte_unused,
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
-	return 0;
+	struct rte_cryptodev_pmd_init_params init_params = {
+		.name = "",
+		.socket_id = rte_socket_id(),
+		.private_data_size = sizeof(struct virtio_crypto_hw),
+		.max_nb_sessions = RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS
+	};
+	char name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	VIRTIO_CRYPTO_DRV_LOG_DBG("Found Crypto device at %02x:%02x.%x",
+			pci_dev->addr.bus,
+			pci_dev->addr.devid,
+			pci_dev->addr.function);
+
+	rte_pci_device_name(&pci_dev->addr, name, sizeof(name));
+
+	return crypto_virtio_create(name, pci_dev, &init_params);
 }
 
 static int
 crypto_virtio_pci_remove(
 	struct rte_pci_device *pci_dev __rte_unused)
 {
+	struct rte_cryptodev *cryptodev;
+	char cryptodev_name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	rte_pci_device_name(&pci_dev->addr, cryptodev_name,
+			sizeof(cryptodev_name));
+
+	cryptodev = rte_cryptodev_pmd_get_named_dev(cryptodev_name);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
 	return 0;
 }
 
 static struct rte_pci_driver rte_virtio_crypto_driver = {
+	.id_table = pci_id_virtio_crypto_map,
+	.drv_flags = 0,
 	.probe = crypto_virtio_pci_probe,
 	.remove = crypto_virtio_pci_remove
 };
@@ -34,3 +247,31 @@
 RTE_PMD_REGISTER_CRYPTO_DRIVER(virtio_crypto_drv,
 	rte_virtio_crypto_driver.driver,
 	cryptodev_virtio_driver_id);
+
+RTE_INIT(virtio_crypto_init_log);
+static void
+virtio_crypto_init_log(void)
+{
+	virtio_crypto_logtype_init = rte_log_register("pmd.crypto.virtio.init");
+	if (virtio_crypto_logtype_init >= 0)
+		rte_log_set_level(virtio_crypto_logtype_init, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_session =
+		rte_log_register("pmd.crypto.virtio.session");
+	if (virtio_crypto_logtype_session >= 0)
+		rte_log_set_level(virtio_crypto_logtype_session,
+				RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_rx = rte_log_register("pmd.crypto.virtio.rx");
+	if (virtio_crypto_logtype_rx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_rx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_tx = rte_log_register("pmd.crypto.virtio.tx");
+	if (virtio_crypto_logtype_tx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_tx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_driver =
+		rte_log_register("pmd.crypto.virtio.driver");
+	if (virtio_crypto_logtype_driver >= 0)
+		rte_log_set_level(virtio_crypto_logtype_driver, RTE_LOG_NOTICE);
+}
diff --git a/drivers/crypto/virtio/virtio_cryptodev.h b/drivers/crypto/virtio/virtio_cryptodev.h
index 44517b8..392db4a 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.h
+++ b/drivers/crypto/virtio/virtio_cryptodev.h
@@ -5,6 +5,19 @@
 #ifndef _VIRTIO_CRYPTODEV_H_
 #define _VIRTIO_CRYPTODEV_H_
 
+#include <rte_cryptodev.h>
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_CRYPTO_PMD_GUEST_FEATURES (1ULL << VIRTIO_F_VERSION_1)
+
 #define CRYPTODEV_NAME_VIRTIO_PMD crypto_virtio
 
+uint16_t virtio_crypto_pkt_tx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
+uint16_t virtio_crypto_pkt_rx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* _VIRTIO_CRYPTODEV_H_ */
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..832c465
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * Following macros are derived from linux/pci_regs.h, however,
+ * we can't simply include that header here, as there is no such
+ * file for non-Linux platform.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PFN register is 32bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t avail_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability,
+			 * that's why we also check if msix is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is error mapping with VFIO/UIO.
+ *   if port map error when driver type is KDRV_NONE.
+ *   if whitelisted but driver type is KDRV_UNKNOWN.
+ * Return 1 if kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Check whether we can read the virtio PCI caps, which exist
+	 * only on modern PCI devices. If that fails, fall back to legacy
+	 * virtio handling.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..d4cefb2
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure stores
+ * some infos that may vary in the multiple process model locally.
+ * For example, the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other side, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
diff --git a/drivers/crypto/virtio/virtio_rxtx.c b/drivers/crypto/virtio/virtio_rxtx.c
new file mode 100644
index 0000000..51f6e09
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_rxtx.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+#include "virtio_cryptodev.h"
+
+uint16_t
+virtio_crypto_pkt_rx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **rx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_rx = 0;
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_crypto_pkt_tx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **tx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_tx = 0;
+
+	return nb_tx;
+}
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint host
+	 * not to interrupt when it consumes packets
+	 * Note: this is only considered a hint to the host
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..9c905d5
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring*/
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notification is through an io port operation
+	 * which is a serialization instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
-- 
1.8.3.1

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v9 02/11] crypto/virtio: support virtio device init
  @ 2018-04-15  8:51  1%   ` Jay Zhou
  0 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-04-15  8:51 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

This patch implements the initialization of the virtio crypto device.
The virtio crypto device conforms to virtio-1.0, so this patch only
supports modern mode operation.
The cryptodev is created at the virtio crypto pci device probing stage.
The virtio_crypto_pkt_tx_burst() function is used to transmit bursts of
packets and virtio_crypto_pkt_rx_burst() is used to receive bursts of packets.

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Fan Zhang <roy.fan.zhang@intel.com>
---
 drivers/crypto/virtio/Makefile           |   3 +
 drivers/crypto/virtio/virtio_cryptodev.c | 245 +++++++++++++++-
 drivers/crypto/virtio/virtio_cryptodev.h |  13 +
 drivers/crypto/virtio/virtio_logs.h      |  91 ++++++
 drivers/crypto/virtio/virtio_pci.c       | 462 +++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h       | 252 +++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h      | 137 +++++++++
 drivers/crypto/virtio/virtio_rxtx.c      |  26 ++
 drivers/crypto/virtio/virtqueue.c        |  43 +++
 drivers/crypto/virtio/virtqueue.h        | 171 ++++++++++++
 10 files changed, 1441 insertions(+), 2 deletions(-)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtio_rxtx.c
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/drivers/crypto/virtio/Makefile b/drivers/crypto/virtio/Makefile
index 2f04f0c..786afb8 100644
--- a/drivers/crypto/virtio/Makefile
+++ b/drivers/crypto/virtio/Makefile
@@ -18,6 +18,9 @@ LIBABIVER := 1
 #
 # all source are stored in SRCS-y
 #
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtqueue.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_pci.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_cryptodev.c
 
 # this lib depends upon:
diff --git a/drivers/crypto/virtio/virtio_cryptodev.c b/drivers/crypto/virtio/virtio_cryptodev.c
index 3e54942..3fe2c80 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.c
+++ b/drivers/crypto/virtio/virtio_cryptodev.c
@@ -3,27 +3,240 @@
  */
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
 #include <rte_cryptodev_pmd.h>
+#include <rte_eal.h>
 #include "virtio_cryptodev.h"
+#include "virtqueue.h"
+
+int virtio_crypto_logtype_init;
+int virtio_crypto_logtype_session;
+int virtio_crypto_logtype_rx;
+int virtio_crypto_logtype_tx;
+int virtio_crypto_logtype_driver;
+
+/*
+ * The set of PCI devices this driver supports
+ */
+static const struct rte_pci_id pci_id_virtio_crypto_map[] = {
+	{ RTE_PCI_DEVICE(VIRTIO_CRYPTO_PCI_VENDORID,
+				VIRTIO_CRYPTO_PCI_DEVICEID) },
+	{ .vendor_id = 0, /* sentinel */ },
+};
 
 uint8_t cryptodev_virtio_driver_id;
 
+/*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+static struct rte_cryptodev_ops virtio_crypto_dev_ops = {
+	/* Device related operations */
+	.dev_configure			 = NULL,
+	.dev_start			 = NULL,
+	.dev_stop			 = NULL,
+	.dev_close			 = NULL,
+	.dev_infos_get			 = NULL,
+
+	.stats_get			 = NULL,
+	.stats_reset			 = NULL,
+
+	.queue_pair_setup                = NULL,
+	.queue_pair_release              = NULL,
+	.queue_pair_start                = NULL,
+	.queue_pair_stop                 = NULL,
+	.queue_pair_count                = NULL,
+
+	/* Crypto related operations */
+	.session_get_size	= NULL,
+	.session_configure	= NULL,
+	.session_clear		= NULL,
+	.qp_attach_session = NULL,
+	.qp_detach_session = NULL
+};
+
+static int
+virtio_negotiate_features(struct virtio_crypto_hw *hw, uint64_t req_features)
+{
+	uint64_t host_features;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Prepare guest_features: feature that driver wants to support */
+	VIRTIO_CRYPTO_INIT_LOG_DBG("guest_features before negotiate = %" PRIx64,
+		req_features);
+
+	/* Read device(host) feature bits */
+	host_features = VTPCI_OPS(hw)->get_features(hw);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("host_features before negotiate = %" PRIx64,
+		host_features);
+
+	/*
+	 * Negotiate features: Subset of device feature bits are written back
+	 * guest feature bits.
+	 */
+	hw->guest_features = req_features;
+	hw->guest_features = vtpci_cryptodev_negotiate_features(hw,
+							host_features);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("features after negotiate = %" PRIx64,
+		hw->guest_features);
+
+	if (hw->modern) {
+		if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"VIRTIO_F_VERSION_1 features is not enabled.");
+			return -1;
+		}
+		vtpci_cryptodev_set_status(hw,
+			VIRTIO_CONFIG_STATUS_FEATURES_OK);
+		if (!(vtpci_cryptodev_get_status(hw) &
+			VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR("failed to set FEATURES_OK "
+						"status!");
+			return -1;
+		}
+	}
+
+	hw->req_guest_features = req_features;
+
+	return 0;
+}
+
+/* reset device and renegotiate features if needed */
+static int
+virtio_crypto_init_device(struct rte_cryptodev *cryptodev,
+	uint64_t req_features)
+{
+	struct virtio_crypto_hw *hw = cryptodev->data->dev_private;
+	struct virtio_crypto_config local_config;
+	struct virtio_crypto_config *config = &local_config;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Reset the device although not necessary at startup */
+	vtpci_cryptodev_reset(hw);
+
+	/* Tell the host we've noticed this device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
+
+	/* Tell the host we've known how to drive the device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
+	if (virtio_negotiate_features(hw, req_features) < 0)
+		return -1;
+
+	/* Get status of the device */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, status),
+		&config->status, sizeof(config->status));
+	if (config->status != VIRTIO_CRYPTO_S_HW_READY) {
+		VIRTIO_CRYPTO_DRV_LOG_ERR("accelerator hardware is "
+				"not ready");
+		return -1;
+	}
+
+	/* Get number of data queues */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, max_dataqueues),
+		&config->max_dataqueues,
+		sizeof(config->max_dataqueues));
+	hw->max_dataqueues = config->max_dataqueues;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("hw->max_dataqueues=%d",
+		hw->max_dataqueues);
+
+	return 0;
+}
+
+/*
+ * This function is based on the probe() function.
+ * It returns 0 on success.
+ */
+static int
+crypto_virtio_create(const char *name, struct rte_pci_device *pci_dev,
+		struct rte_cryptodev_pmd_init_params *init_params)
+{
+	struct rte_cryptodev *cryptodev;
+	struct virtio_crypto_hw *hw;
+
+	PMD_INIT_FUNC_TRACE();
+
+	cryptodev = rte_cryptodev_pmd_create(name, &pci_dev->device,
+					init_params);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
+	cryptodev->driver_id = cryptodev_virtio_driver_id;
+	cryptodev->dev_ops = &virtio_crypto_dev_ops;
+
+	cryptodev->enqueue_burst = virtio_crypto_pkt_tx_burst;
+	cryptodev->dequeue_burst = virtio_crypto_pkt_rx_burst;
+
+	cryptodev->feature_flags = RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO |
+		RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING;
+
+	hw = cryptodev->data->dev_private;
+	hw->dev_id = cryptodev->data->dev_id;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("dev %d vendorID=0x%x deviceID=0x%x",
+		cryptodev->data->dev_id, pci_dev->id.vendor_id,
+		pci_dev->id.device_id);
+
+	/* pci device init */
+	if (vtpci_cryptodev_init(pci_dev, hw))
+		return -1;
+
+	if (virtio_crypto_init_device(cryptodev,
+			VIRTIO_CRYPTO_PMD_GUEST_FEATURES) < 0)
+		return -1;
+
+	return 0;
+}
+
 static int
 crypto_virtio_pci_probe(
 	struct rte_pci_driver *pci_drv __rte_unused,
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
-	return 0;
+	struct rte_cryptodev_pmd_init_params init_params = {
+		.name = "",
+		.socket_id = rte_socket_id(),
+		.private_data_size = sizeof(struct virtio_crypto_hw),
+		.max_nb_sessions = RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS
+	};
+	char name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	VIRTIO_CRYPTO_DRV_LOG_DBG("Found Crypto device at %02x:%02x.%x",
+			pci_dev->addr.bus,
+			pci_dev->addr.devid,
+			pci_dev->addr.function);
+
+	rte_pci_device_name(&pci_dev->addr, name, sizeof(name));
+
+	return crypto_virtio_create(name, pci_dev, &init_params);
 }
 
 static int
 crypto_virtio_pci_remove(
 	struct rte_pci_device *pci_dev __rte_unused)
 {
+	struct rte_cryptodev *cryptodev;
+	char cryptodev_name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	rte_pci_device_name(&pci_dev->addr, cryptodev_name,
+			sizeof(cryptodev_name));
+
+	cryptodev = rte_cryptodev_pmd_get_named_dev(cryptodev_name);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
 	return 0;
 }
 
 static struct rte_pci_driver rte_virtio_crypto_driver = {
+	.id_table = pci_id_virtio_crypto_map,
+	.drv_flags = 0,
 	.probe = crypto_virtio_pci_probe,
 	.remove = crypto_virtio_pci_remove
 };
@@ -34,3 +247,31 @@
 RTE_PMD_REGISTER_CRYPTO_DRIVER(virtio_crypto_drv,
 	rte_virtio_crypto_driver.driver,
 	cryptodev_virtio_driver_id);
+
+RTE_INIT(virtio_crypto_init_log);
+static void
+virtio_crypto_init_log(void)
+{
+	virtio_crypto_logtype_init = rte_log_register("pmd.crypto.virtio.init");
+	if (virtio_crypto_logtype_init >= 0)
+		rte_log_set_level(virtio_crypto_logtype_init, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_session =
+		rte_log_register("pmd.crypto.virtio.session");
+	if (virtio_crypto_logtype_session >= 0)
+		rte_log_set_level(virtio_crypto_logtype_session,
+				RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_rx = rte_log_register("pmd.crypto.virtio.rx");
+	if (virtio_crypto_logtype_rx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_rx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_tx = rte_log_register("pmd.crypto.virtio.tx");
+	if (virtio_crypto_logtype_tx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_tx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_driver =
+		rte_log_register("pmd.crypto.virtio.driver");
+	if (virtio_crypto_logtype_driver >= 0)
+		rte_log_set_level(virtio_crypto_logtype_driver, RTE_LOG_NOTICE);
+}
diff --git a/drivers/crypto/virtio/virtio_cryptodev.h b/drivers/crypto/virtio/virtio_cryptodev.h
index 44517b8..392db4a 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.h
+++ b/drivers/crypto/virtio/virtio_cryptodev.h
@@ -5,6 +5,19 @@
 #ifndef _VIRTIO_CRYPTODEV_H_
 #define _VIRTIO_CRYPTODEV_H_
 
+#include <rte_cryptodev.h>
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_CRYPTO_PMD_GUEST_FEATURES (1ULL << VIRTIO_F_VERSION_1)
+
 #define CRYPTODEV_NAME_VIRTIO_PMD crypto_virtio
 
+uint16_t virtio_crypto_pkt_tx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
+uint16_t virtio_crypto_pkt_rx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* _VIRTIO_CRYPTODEV_H_ */
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..832c465
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * The following macros are derived from linux/pci_regs.h; however,
+ * we can't simply include that header here, as there is no such
+ * file on non-Linux platforms.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PFN register is 32 bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t aval_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability,
+			 * which is why we also check whether msix is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is an error mapping with VFIO/UIO.
+ *   if the port map fails when the driver type is KDRV_NONE.
+ *   if the device is whitelisted but the driver type is KDRV_UNKNOWN.
+ * Return 1 if kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Try to read the virtio pci caps, which exist only on modern
+	 * pci devices. If that fails, fall back to legacy virtio
+	 * handling.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..d4cefb2
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure stores
+ * information that may vary locally in the multi-process model,
+ * for example the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declarations from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other side, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
diff --git a/drivers/crypto/virtio/virtio_rxtx.c b/drivers/crypto/virtio/virtio_rxtx.c
new file mode 100644
index 0000000..51f6e09
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_rxtx.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+#include "virtio_cryptodev.h"
+
+uint16_t
+virtio_crypto_pkt_rx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **rx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_rx = 0;
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_crypto_pkt_tx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **tx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_tx = 0;
+
+	return nb_tx;
+}
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint host
+	 * not to interrupt when it consumes packets
+	 * Note: this is only considered a hint to the host
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..9c905d5
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring*/
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notification is through io port operation
+	 * which is a serialization instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
-- 
1.8.3.1

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v8 02/11] crypto/virtio: support virtio device init
  @ 2018-04-14  9:34  1%   ` Jay Zhou
  0 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-04-14  9:34 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

This patch implements the initialization of the virtio crypto device.
The virtio crypto device conforms to virtio-1.0, so this patch only
supports modern mode operation.
The cryptodev is created at the virtio crypto pci device probing stage.
The virtio_crypto_pkt_tx_burst() function is used to transmit packets in
bursts and virtio_crypto_pkt_rx_burst() is used to receive packets in bursts.
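
Below is a minimal usage sketch (not part of this patch) of how an
application could exercise that burst API once a device and queue pair are
configured; the dev_id, qp_id, ops and nb_ops parameters are hypothetical
placeholders.

	#include <rte_cryptodev.h>

	static void
	example_burst(uint8_t dev_id, uint16_t qp_id,
			struct rte_crypto_op **ops, uint16_t nb_ops)
	{
		uint16_t nb_enq, nb_deq;

		/* Dispatches to cryptodev->enqueue_burst, i.e.
		 * virtio_crypto_pkt_tx_burst() for this PMD.
		 */
		nb_enq = rte_cryptodev_enqueue_burst(dev_id, qp_id, ops, nb_ops);

		/* Dispatches to cryptodev->dequeue_burst, i.e.
		 * virtio_crypto_pkt_rx_burst() for this PMD.
		 */
		nb_deq = rte_cryptodev_dequeue_burst(dev_id, qp_id, ops, nb_enq);
		(void)nb_deq;
	}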

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Fan Zhang <roy.fan.zhang@intel.com>
---
 drivers/crypto/virtio/Makefile           |   3 +
 drivers/crypto/virtio/virtio_cryptodev.c | 245 +++++++++++++++-
 drivers/crypto/virtio/virtio_cryptodev.h |  13 +
 drivers/crypto/virtio/virtio_logs.h      |  91 ++++++
 drivers/crypto/virtio/virtio_pci.c       | 460 +++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h       | 252 +++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h      | 137 +++++++++
 drivers/crypto/virtio/virtio_rxtx.c      |  26 ++
 drivers/crypto/virtio/virtqueue.c        |  43 +++
 drivers/crypto/virtio/virtqueue.h        | 171 ++++++++++++
 10 files changed, 1439 insertions(+), 2 deletions(-)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtio_rxtx.c
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/drivers/crypto/virtio/Makefile b/drivers/crypto/virtio/Makefile
index a3b44e9..c4727ea 100644
--- a/drivers/crypto/virtio/Makefile
+++ b/drivers/crypto/virtio/Makefile
@@ -18,6 +18,9 @@ LIBABIVER := 1
 #
 # all source are stored in SRCS-y
 #
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtqueue.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_pci.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_cryptodev.c
 
 # this lib depends upon:
diff --git a/drivers/crypto/virtio/virtio_cryptodev.c b/drivers/crypto/virtio/virtio_cryptodev.c
index 3e54942..3fe2c80 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.c
+++ b/drivers/crypto/virtio/virtio_cryptodev.c
@@ -3,27 +3,240 @@
  */
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
 #include <rte_cryptodev_pmd.h>
+#include <rte_eal.h>
 #include "virtio_cryptodev.h"
+#include "virtqueue.h"
+
+int virtio_crypto_logtype_init;
+int virtio_crypto_logtype_session;
+int virtio_crypto_logtype_rx;
+int virtio_crypto_logtype_tx;
+int virtio_crypto_logtype_driver;
+
+/*
+ * The set of PCI devices this driver supports
+ */
+static const struct rte_pci_id pci_id_virtio_crypto_map[] = {
+	{ RTE_PCI_DEVICE(VIRTIO_CRYPTO_PCI_VENDORID,
+				VIRTIO_CRYPTO_PCI_DEVICEID) },
+	{ .vendor_id = 0, /* sentinel */ },
+};
 
 uint8_t cryptodev_virtio_driver_id;
 
+/*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+static struct rte_cryptodev_ops virtio_crypto_dev_ops = {
+	/* Device related operations */
+	.dev_configure			 = NULL,
+	.dev_start			 = NULL,
+	.dev_stop			 = NULL,
+	.dev_close			 = NULL,
+	.dev_infos_get			 = NULL,
+
+	.stats_get			 = NULL,
+	.stats_reset			 = NULL,
+
+	.queue_pair_setup                = NULL,
+	.queue_pair_release              = NULL,
+	.queue_pair_start                = NULL,
+	.queue_pair_stop                 = NULL,
+	.queue_pair_count                = NULL,
+
+	/* Crypto related operations */
+	.session_get_size	= NULL,
+	.session_configure	= NULL,
+	.session_clear		= NULL,
+	.qp_attach_session = NULL,
+	.qp_detach_session = NULL
+};
+
+static int
+virtio_negotiate_features(struct virtio_crypto_hw *hw, uint64_t req_features)
+{
+	uint64_t host_features;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Prepare guest_features: features that the driver wants to support */
+	VIRTIO_CRYPTO_INIT_LOG_DBG("guest_features before negotiate = %" PRIx64,
+		req_features);
+
+	/* Read device(host) feature bits */
+	host_features = VTPCI_OPS(hw)->get_features(hw);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("host_features before negotiate = %" PRIx64,
+		host_features);
+
+	/*
+	 * Negotiate features: a subset of the device feature bits is
+	 * written back as the guest feature bits.
+	 */
+	hw->guest_features = req_features;
+	hw->guest_features = vtpci_cryptodev_negotiate_features(hw,
+							host_features);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("features after negotiate = %" PRIx64,
+		hw->guest_features);
+
+	if (hw->modern) {
+		if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"VIRTIO_F_VERSION_1 features is not enabled.");
+			return -1;
+		}
+		vtpci_cryptodev_set_status(hw,
+			VIRTIO_CONFIG_STATUS_FEATURES_OK);
+		if (!(vtpci_cryptodev_get_status(hw) &
+			VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR("failed to set FEATURES_OK "
+						"status!");
+			return -1;
+		}
+	}
+
+	hw->req_guest_features = req_features;
+
+	return 0;
+}
+
+/* reset device and renegotiate features if needed */
+static int
+virtio_crypto_init_device(struct rte_cryptodev *cryptodev,
+	uint64_t req_features)
+{
+	struct virtio_crypto_hw *hw = cryptodev->data->dev_private;
+	struct virtio_crypto_config local_config;
+	struct virtio_crypto_config *config = &local_config;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Reset the device although not necessary at startup */
+	vtpci_cryptodev_reset(hw);
+
+	/* Tell the host we've noticed this device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
+
+	/* Tell the host we know how to drive the device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
+	if (virtio_negotiate_features(hw, req_features) < 0)
+		return -1;
+
+	/* Get status of the device */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, status),
+		&config->status, sizeof(config->status));
+	if (config->status != VIRTIO_CRYPTO_S_HW_READY) {
+		VIRTIO_CRYPTO_DRV_LOG_ERR("accelerator hardware is "
+				"not ready");
+		return -1;
+	}
+
+	/* Get number of data queues */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, max_dataqueues),
+		&config->max_dataqueues,
+		sizeof(config->max_dataqueues));
+	hw->max_dataqueues = config->max_dataqueues;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("hw->max_dataqueues=%d",
+		hw->max_dataqueues);
+
+	return 0;
+}
+
+/*
+ * This function is based on the probe() function.
+ * It returns 0 on success.
+ */
+static int
+crypto_virtio_create(const char *name, struct rte_pci_device *pci_dev,
+		struct rte_cryptodev_pmd_init_params *init_params)
+{
+	struct rte_cryptodev *cryptodev;
+	struct virtio_crypto_hw *hw;
+
+	PMD_INIT_FUNC_TRACE();
+
+	cryptodev = rte_cryptodev_pmd_create(name, &pci_dev->device,
+					init_params);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
+	cryptodev->driver_id = cryptodev_virtio_driver_id;
+	cryptodev->dev_ops = &virtio_crypto_dev_ops;
+
+	cryptodev->enqueue_burst = virtio_crypto_pkt_tx_burst;
+	cryptodev->dequeue_burst = virtio_crypto_pkt_rx_burst;
+
+	cryptodev->feature_flags = RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO |
+		RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING;
+
+	hw = cryptodev->data->dev_private;
+	hw->dev_id = cryptodev->data->dev_id;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("dev %d vendorID=0x%x deviceID=0x%x",
+		cryptodev->data->dev_id, pci_dev->id.vendor_id,
+		pci_dev->id.device_id);
+
+	/* pci device init */
+	if (vtpci_cryptodev_init(pci_dev, hw))
+		return -1;
+
+	if (virtio_crypto_init_device(cryptodev,
+			VIRTIO_CRYPTO_PMD_GUEST_FEATURES) < 0)
+		return -1;
+
+	return 0;
+}
+
 static int
 crypto_virtio_pci_probe(
 	struct rte_pci_driver *pci_drv __rte_unused,
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
-	return 0;
+	struct rte_cryptodev_pmd_init_params init_params = {
+		.name = "",
+		.socket_id = rte_socket_id(),
+		.private_data_size = sizeof(struct virtio_crypto_hw),
+		.max_nb_sessions = RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS
+	};
+	char name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	VIRTIO_CRYPTO_DRV_LOG_DBG("Found Crypto device at %02x:%02x.%x",
+			pci_dev->addr.bus,
+			pci_dev->addr.devid,
+			pci_dev->addr.function);
+
+	rte_pci_device_name(&pci_dev->addr, name, sizeof(name));
+
+	return crypto_virtio_create(name, pci_dev, &init_params);
 }
 
 static int
 crypto_virtio_pci_remove(
 	struct rte_pci_device *pci_dev __rte_unused)
 {
+	struct rte_cryptodev *cryptodev;
+	char cryptodev_name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	rte_pci_device_name(&pci_dev->addr, cryptodev_name,
+			sizeof(cryptodev_name));
+
+	cryptodev = rte_cryptodev_pmd_get_named_dev(cryptodev_name);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
 	return 0;
 }
 
 static struct rte_pci_driver rte_virtio_crypto_driver = {
+	.id_table = pci_id_virtio_crypto_map,
+	.drv_flags = 0,
 	.probe = crypto_virtio_pci_probe,
 	.remove = crypto_virtio_pci_remove
 };
@@ -34,3 +247,31 @@
 RTE_PMD_REGISTER_CRYPTO_DRIVER(virtio_crypto_drv,
 	rte_virtio_crypto_driver.driver,
 	cryptodev_virtio_driver_id);
+
+RTE_INIT(virtio_crypto_init_log);
+static void
+virtio_crypto_init_log(void)
+{
+	virtio_crypto_logtype_init = rte_log_register("pmd.crypto.virtio.init");
+	if (virtio_crypto_logtype_init >= 0)
+		rte_log_set_level(virtio_crypto_logtype_init, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_session =
+		rte_log_register("pmd.crypto.virtio.session");
+	if (virtio_crypto_logtype_session >= 0)
+		rte_log_set_level(virtio_crypto_logtype_session,
+				RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_rx = rte_log_register("pmd.crypto.virtio.rx");
+	if (virtio_crypto_logtype_rx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_rx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_tx = rte_log_register("pmd.crypto.virtio.tx");
+	if (virtio_crypto_logtype_tx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_tx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_driver =
+		rte_log_register("pmd.crypto.virtio.driver");
+	if (virtio_crypto_logtype_driver >= 0)
+		rte_log_set_level(virtio_crypto_logtype_driver, RTE_LOG_NOTICE);
+}
diff --git a/drivers/crypto/virtio/virtio_cryptodev.h b/drivers/crypto/virtio/virtio_cryptodev.h
index 44517b8..392db4a 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.h
+++ b/drivers/crypto/virtio/virtio_cryptodev.h
@@ -5,6 +5,19 @@
 #ifndef _VIRTIO_CRYPTODEV_H_
 #define _VIRTIO_CRYPTODEV_H_
 
+#include <rte_cryptodev.h>
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_CRYPTO_PMD_GUEST_FEATURES (1ULL << VIRTIO_F_VERSION_1)
+
 #define CRYPTODEV_NAME_VIRTIO_PMD crypto_virtio
 
+uint16_t virtio_crypto_pkt_tx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
+uint16_t virtio_crypto_pkt_rx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* _VIRTIO_CRYPTODEV_H_ */
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..43ec1a4
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,460 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * The following macros are derived from linux/pci_regs.h; however,
+ * we can't simply include that header here, as there is no such
+ * file on non-Linux platforms.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PFN register is 32 bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t aval_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability,
+			 * that's why we also check if msix is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is an error mapping with VFIO/UIO.
+ *   if a port map error occurs when the driver type is KDRV_NONE.
+ *   if the device is whitelisted but the driver type is KDRV_UNKNOWN.
+ * Return 1 if a kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Check whether we can read the virtio PCI caps, which exist
+	 * only on modern PCI devices. If that fails, fall back to
+	 * legacy virtio handling.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
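
For reference, the virtio 1.0 status sequence these helpers are meant to drive
looks roughly like the sketch below. This is only an illustration, not part of
the patch: the probe function name, its arguments, and the assumption that
hw->guest_features already holds the requested feature set are hypothetical,
and error handling is elided.

#include "virtio_pci.h"

static int
example_crypto_probe(struct rte_pci_device *pci_dev,
		     struct virtio_crypto_hw *hw)
{
	uint64_t host_features;

	/* only modern (virtio 1.0) devices are accepted */
	if (vtpci_cryptodev_init(pci_dev, hw) < 0)
		return -1;

	vtpci_cryptodev_reset(hw);
	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);

	/* intersect host features with the features requested in
	 * hw->guest_features (assumed to be set by the caller) */
	host_features = VTPCI_OPS(hw)->get_features(hw);
	hw->guest_features = vtpci_cryptodev_negotiate_features(hw,
			host_features);

	if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1))
		return -1;

	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_FEATURES_OK);

	/* DRIVER_OK: the device may now be used */
	vtpci_cryptodev_reinit_complete(hw);

	return 0;
}
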
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..d4cefb2
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure stores
+ * some infos that may vary in the multiple process model locally.
+ * For example, the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other side, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
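
To make the layout comment above concrete, a quick back-of-the-envelope check
(illustrative only, not part of the patch; the helper name is hypothetical):

#include <assert.h>
#include "virtio_ring.h"

static void
example_vring_layout_check(void)
{
	/*
	 * 256-entry ring with the 4096-byte PCI alignment:
	 *   desc:  256 * 16      = 4096 bytes
	 *   avail: 4 + 256 * 2   =  516 -> 4612, aligned up to 8192
	 *   used:  4 + 256 * 8   = 2052 -> 10244 bytes total
	 * (the event-index words mentioned in the layout comment are
	 *  not counted, matching vring_size() above)
	 */
	assert(vring_size(256, 4096) == 10244);

	/*
	 * vring_need_event(): with event_idx = 10, old = 9 and
	 * new_idx = 11, (11 - 10 - 1) = 0 < (11 - 9) = 2, so the
	 * other side should be notified.
	 */
	assert(vring_need_event(10, 11, 9));
}
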
diff --git a/drivers/crypto/virtio/virtio_rxtx.c b/drivers/crypto/virtio/virtio_rxtx.c
new file mode 100644
index 0000000..51f6e09
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_rxtx.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+#include "virtio_cryptodev.h"
+
+uint16_t
+virtio_crypto_pkt_rx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **rx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_rx = 0;
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_crypto_pkt_tx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **tx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_tx = 0;
+
+	return nb_tx;
+}
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint the host not to
+	 * interrupt when it consumes packets.
+	 * Note: this is only a hint to the host.
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..9c905d5
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <stdint.h>
+#include <virtio_crypto.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring */
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notification is through an io port operation
+	 * which is a serialization instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
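
For context, the submit/notify sequence these inline helpers support would
look roughly as sketched below. The function name is hypothetical and
descriptor setup is elided (desc_idx is assumed to be the head of a chain
already written to vq->vq_ring.desc[]); the actual crypto data path is still
a stub in this patch (see virtio_rxtx.c above).

#include "virtqueue.h"

static void
example_submit_and_kick(struct virtqueue *vq, uint16_t desc_idx)
{
	/* expose the chain head in the avail ring ... */
	vq_update_avail_ring(vq, desc_idx);

	/* ... then publish the new avail index (includes a write barrier) */
	vq_update_avail_idx(vq);

	/* kick the device only if it has not suppressed notifications */
	if (virtqueue_kick_prepare(vq))
		virtqueue_notify(vq);
}
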
-- 
1.8.3.1

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v3 12/13] eal: replace rte_panic instances in init sequence
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (4 preceding siblings ...)
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 11/13] eal: replace rte_panic instances in ethdev Arnon Warshavsky
@ 2018-04-13 18:30  2% ` Arnon Warshavsky
  2018-04-16 11:22  0% ` [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Burakov, Anatoly
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

The functions changed from void to int are local
to this file, so those changes are non-abi-breaking.
The one function that cannot change from void to int
due to abi is called from a single place, so on failure
it now sets a state variable that is checked
right after the call.
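
The resulting pattern for that single call site is roughly
(simplified, see the eal.c hunk below):

	eal_thread_init_master(rte_config.master_lcore); /* still void */
	if (rte_get_panic_state())
		return -1; /* failure was flagged instead of panicking */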

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_eal/bsdapp/eal/eal.c           |  87 ++++++++++++++-------
 lib/librte_eal/bsdapp/eal/eal_thread.c    |  65 +++++++++++-----
 lib/librte_eal/common/eal_common_launch.c |  21 ++++++
 lib/librte_eal/common/include/rte_debug.h |  12 +++
 lib/librte_eal/linuxapp/eal/eal.c         | 121 ++++++++++++++++++++----------
 lib/librte_eal/linuxapp/eal/eal_thread.c  |  65 +++++++++++-----
 6 files changed, 272 insertions(+), 99 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 4eafcb5..f6aa3b2 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -150,7 +150,7 @@ enum rte_iova_mode
  * We also don't lock the whole file, so that in future we can use read-locks
  * on other parts, e.g. memzones, to detect if there are running secondary
  * processes. */
-static void
+static int
 rte_eal_config_create(void)
 {
 	void *rte_mem_cfg_addr;
@@ -159,60 +159,79 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	if (mem_cfg_fd < 0){
 		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+					__func__, pathname);
+			return -1;
+		}
 	}
 
 	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
 	if (retval < 0){
 		close(mem_cfg_fd);
-		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
 	if (retval < 0){
 		close(mem_cfg_fd);
-		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
-				"process running?\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'."
+				" Is another primary process running?\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 
 	if (rte_mem_cfg_addr == MAP_FAILED){
-		rte_panic("Cannot mmap memory for rte_config\n");
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
+				__func__);
+		return -1;
 	}
 	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
 	rte_config.mem_config = rte_mem_cfg_addr;
+
+	return 0;
 }
 
 /* attach to an existing shared memory config */
-static void
+static int
 rte_eal_config_attach(void)
 {
 	void *rte_mem_cfg_addr;
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	if (mem_cfg_fd < 0){
 		mem_cfg_fd = open(pathname, O_RDWR);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+					__func__, pathname);
+			return -1;
+		}
 	}
 
 	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 	close(mem_cfg_fd);
-	if (rte_mem_cfg_addr == MAP_FAILED)
-		rte_panic("Cannot mmap memory for rte_config\n");
+	if (rte_mem_cfg_addr == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
+				__func__);
+		return -1;
+	}
 
 	rte_config.mem_config = rte_mem_cfg_addr;
+
+	return 0;
 }
 
 /* Detect if we are a primary or a secondary process */
@@ -236,23 +255,28 @@ enum rte_proc_type_t
 }
 
 /* Sets up rte_config structure with the pointer to shared memory config.*/
-static void
+static int
 rte_config_init(void)
 {
 	rte_config.process_type = internal_config.process_type;
 
 	switch (rte_config.process_type){
 	case RTE_PROC_PRIMARY:
-		rte_eal_config_create();
+		if (rte_eal_config_create())
+			return -1;
 		break;
 	case RTE_PROC_SECONDARY:
-		rte_eal_config_attach();
+		if (rte_eal_config_attach())
+			return -1;
 		rte_eal_mcfg_wait_complete(rte_config.mem_config);
 		break;
 	case RTE_PROC_AUTO:
 	case RTE_PROC_INVALID:
-		rte_panic("Invalid process type\n");
+		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
+				__func__, rte_config.process_type);
+		return -1;
 	}
+	return 0;
 }
 
 /* display usage */
@@ -583,7 +607,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	rte_srand(rte_rdtsc());
 
-	rte_config_init();
+	if (rte_config_init() != 0)
+		return -1;
 
 	if (rte_mp_channel_init() < 0) {
 		rte_eal_init_alert("failed to init mp channel\n");
@@ -630,7 +655,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	eal_check_mem_on_local_socket();
 
-	eal_thread_init_master(rte_config.master_lcore);
+	if (eal_thread_init_master(rte_config.master_lcore) != 0)
+		return -1;
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -644,18 +670,27 @@ static void rte_eal_init_alert(const char *msg)
 		 * create communication pipes between master thread
 		 * and children
 		 */
-		if (pipe(lcore_config[i].pipe_master2slave) < 0)
-			rte_panic("Cannot create pipe\n");
-		if (pipe(lcore_config[i].pipe_slave2master) < 0)
-			rte_panic("Cannot create pipe\n");
+		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
+		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
 
 		lcore_config[i].state = WAIT;
 
 		/* create a thread for each lcore */
 		ret = pthread_create(&lcore_config[i].thread_id, NULL,
 				     eal_thread_loop, NULL);
-		if (ret != 0)
-			rte_panic("Cannot create thread\n");
+		if (ret != 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
+					__func__);
+			return -1;
+		}
 
 		/* Set thread_name for aid in debugging. */
 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
diff --git a/lib/librte_eal/bsdapp/eal/eal_thread.c b/lib/librte_eal/bsdapp/eal/eal_thread.c
index d602daf..5c3947c 100644
--- a/lib/librte_eal/bsdapp/eal/eal_thread.c
+++ b/lib/librte_eal/bsdapp/eal/eal_thread.c
@@ -51,16 +51,22 @@
 	n = 0;
 	while (n == 0 || (n < 0 && errno == EINTR))
 		n = write(m2s, &c, 1);
-	if (n < 0)
-		rte_panic("cannot write on configuration pipe\n");
+	if (n < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	/* wait ack */
 	do {
 		n = read(s2m, &c, 1);
 	} while (n < 0 && errno == EINTR);
 
-	if (n <= 0)
-		rte_panic("cannot read on configuration pipe\n");
+	if (n <= 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	return 0;
 }
@@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		rte_move_to_panic_state();
+	}
+}
+
+/* move to panic state and do not return */
+static __attribute__((noreturn)) void
+defunct_and_remain_in_endless_loop(void)
+{
+	rte_move_to_panic_state();
+	while (1)
+		sleep(1);
 }
 
 /* main loop of threads */
@@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
 		if (thread_id == lcore_config[lcore_id].thread_id)
 			break;
 	}
-	if (lcore_id == RTE_MAX_LCORE)
-		rte_panic("cannot retrieve lcore id\n");
+	if (lcore_id == RTE_MAX_LCORE) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
+				__func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	m2s = lcore_config[lcore_id].pipe_master2slave[0];
 	s2m = lcore_config[lcore_id].pipe_slave2master[1];
@@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
 			n = read(m2s, &c, 1);
 		} while (n < 0 && errno == EINTR);
 
-		if (n <= 0)
-			rte_panic("cannot read on configuration pipe\n");
+		if (n <= 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		lcore_config[lcore_id].state = RUNNING;
 
@@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
 		n = 0;
 		while (n == 0 || (n < 0 && errno == EINTR))
 			n = write(s2m, &c, 1);
-		if (n < 0)
-			rte_panic("cannot write on configuration pipe\n");
-
-		if (lcore_config[lcore_id].f == NULL)
-			rte_panic("NULL function pointer\n");
+		if (n < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
+
+		if (lcore_config[lcore_id].f == NULL) {
+			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		/* call the function and store the return value */
 		fct_arg = lcore_config[lcore_id].arg;
diff --git a/lib/librte_eal/common/eal_common_launch.c b/lib/librte_eal/common/eal_common_launch.c
index fe0ba3f..6f8bd46 100644
--- a/lib/librte_eal/common/eal_common_launch.c
+++ b/lib/librte_eal/common/eal_common_launch.c
@@ -14,6 +14,7 @@
 #include <rte_pause.h>
 #include <rte_per_lcore.h>
 #include <rte_lcore.h>
+#include <rte_debug.h>
 
 /*
  * Wait until a lcore finished its job.
@@ -88,3 +89,23 @@ enum rte_lcore_state_t
 		rte_eal_wait_lcore(lcore_id);
 	}
 }
+
+/* panic state */
+static int _panic_state;
+
+/**
+ * Check if the system is in panic state.
+ * @return 1 if a panic condition has been flagged, 0 otherwise.
+ */
+int rte_get_panic_state(void)
+{
+	return _panic_state;
+}
+
+/**
+ * Move the system into panic state.
+ */
+void rte_move_to_panic_state(void)
+{
+	_panic_state = 1;
+}
diff --git a/lib/librte_eal/common/include/rte_debug.h b/lib/librte_eal/common/include/rte_debug.h
index 272df49..b421d33 100644
--- a/lib/librte_eal/common/include/rte_debug.h
+++ b/lib/librte_eal/common/include/rte_debug.h
@@ -79,4 +79,16 @@ void __rte_panic(const char *funcname , const char *format, ...)
 }
 #endif
 
+/**
+ * Check if the system is in panic state.
+ * @return 1 if a panic condition has been flagged, 0 otherwise.
+ */
+int rte_get_panic_state(void);
+
+/**
+ * Move the system into panic state.
+ */
+void rte_move_to_panic_state(void);
+
+
 #endif /* _RTE_DEBUG_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 2ecd07b..b7b950a 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -160,7 +160,7 @@ enum rte_iova_mode
  * We also don't lock the whole file, so that in future we can use read-locks
  * on other parts, e.g. memzones, to detect if there are running secondary
  * processes. */
-static void
+static int
 rte_eal_config_create(void)
 {
 	void *rte_mem_cfg_addr;
@@ -169,7 +169,7 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	/* map the config before hugepage address so that we don't waste a page */
 	if (internal_config.base_virtaddr != 0)
@@ -179,30 +179,39 @@ enum rte_iova_mode
 	else
 		rte_mem_cfg_addr = NULL;
 
-	if (mem_cfg_fd < 0){
+	if (mem_cfg_fd < 0) {
 		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for "
+					"rte_mem_config\n", __func__, pathname);
+			return -1;
+		}
 	}
 
 	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
-	if (retval < 0){
+	if (retval < 0) {
 		close(mem_cfg_fd);
-		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
-	if (retval < 0){
+	if (retval < 0) {
 		close(mem_cfg_fd);
-		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
-				"process running?\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'."
+				" Is another primary process running?\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 
-	if (rte_mem_cfg_addr == MAP_FAILED){
-		rte_panic("Cannot mmap memory for rte_config\n");
+	if (rte_mem_cfg_addr == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+				"rte_config\n", __func__);
+		return -1;
 	}
 	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
 	rte_config.mem_config = rte_mem_cfg_addr;
@@ -211,10 +220,11 @@ enum rte_iova_mode
 	 * processes could later map the config into this exact location */
 	rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
 
+	return 0;
 }
 
 /* attach to an existing shared memory config */
-static void
+static int
 rte_eal_config_attach(void)
 {
 	struct rte_mem_config *mem_config;
@@ -222,33 +232,41 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
-	if (mem_cfg_fd < 0){
+	if (mem_cfg_fd < 0) {
 		mem_cfg_fd = open(pathname, O_RDWR);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+						__func__, pathname);
+			return -1;
+		}
 	}
 
 	/* map it as read-only first */
 	mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
 			PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
-	if (mem_config == MAP_FAILED)
-		rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
-			  errno, strerror(errno));
+	if (mem_config == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+				"rte_config! error %i (%s)\n",
+				__func__, errno, strerror(errno));
+		return -1;
+	}
 
 	rte_config.mem_config = mem_config;
+
+	return 0;
 }
 
 /* reattach the shared config at exact memory location primary process has it */
-static void
+static int
 rte_eal_config_reattach(void)
 {
 	struct rte_mem_config *mem_config;
 	void *rte_mem_cfg_addr;
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	/* save the address primary process has mapped shared config to */
 	rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
@@ -263,16 +281,21 @@ enum rte_iova_mode
 	if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
 		if (mem_config != MAP_FAILED)
 			/* errno is stale, don't use */
-			rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]"
-				  " - please use '--base-virtaddr' option\n",
-				  rte_mem_cfg_addr, mem_config);
+			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+					"rte_config at [%p], got [%p] - please use "
+					"'--base-virtaddr' option\n",
+					__func__, rte_mem_cfg_addr, mem_config);
 		else
-			rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
-				  errno, strerror(errno));
+			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+					"rte_config! error %i (%s)\n",
+					__func__, errno, strerror(errno));
+		return -1;
 	}
 	close(mem_cfg_fd);
 
 	rte_config.mem_config = mem_config;
+
+	return 0;
 }
 
 /* Detect if we are a primary or a secondary process */
@@ -296,24 +319,31 @@ enum rte_proc_type_t
 }
 
 /* Sets up rte_config structure with the pointer to shared memory config.*/
-static void
+static int
 rte_config_init(void)
 {
 	rte_config.process_type = internal_config.process_type;
 
 	switch (rte_config.process_type){
 	case RTE_PROC_PRIMARY:
-		rte_eal_config_create();
+		if (rte_eal_config_create() != 0)
+			return -1;
 		break;
 	case RTE_PROC_SECONDARY:
-		rte_eal_config_attach();
+		if (rte_eal_config_attach() != 0)
+			return -1;
 		rte_eal_mcfg_wait_complete(rte_config.mem_config);
-		rte_eal_config_reattach();
+		if (rte_eal_config_reattach() != 0)
+			return -1;
 		break;
 	case RTE_PROC_AUTO:
 	case RTE_PROC_INVALID:
-		rte_panic("Invalid process type\n");
+		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
+				__func__, rte_config.process_type);
+		return -1;
 	}
+
+	return 0;
 }
 
 /* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
@@ -827,7 +857,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	rte_srand(rte_rdtsc());
 
-	rte_config_init();
+	if (rte_config_init() != 0)
+		return -1;
 
 	if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
 		rte_eal_init_alert("Cannot init logging.");
@@ -890,6 +921,9 @@ static void rte_eal_init_alert(const char *msg)
 
 	eal_thread_init_master(rte_config.master_lcore);
 
+	if (rte_get_panic_state())
+		return -1;
+
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
 	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
@@ -907,18 +941,27 @@ static void rte_eal_init_alert(const char *msg)
 		 * create communication pipes between master thread
 		 * and children
 		 */
-		if (pipe(lcore_config[i].pipe_master2slave) < 0)
-			rte_panic("Cannot create pipe\n");
-		if (pipe(lcore_config[i].pipe_slave2master) < 0)
-			rte_panic("Cannot create pipe\n");
+		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
+		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
 
 		lcore_config[i].state = WAIT;
 
 		/* create a thread for each lcore */
 		ret = pthread_create(&lcore_config[i].thread_id, NULL,
 				     eal_thread_loop, NULL);
-		if (ret != 0)
-			rte_panic("Cannot create thread\n");
+		if (ret != 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
+					__func__);
+			return -1;
+		}
 
 		/* Set thread_name for aid in debugging. */
 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
index 08e150b..3afcee5 100644
--- a/lib/librte_eal/linuxapp/eal/eal_thread.c
+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c
@@ -51,16 +51,22 @@
 	n = 0;
 	while (n == 0 || (n < 0 && errno == EINTR))
 		n = write(m2s, &c, 1);
-	if (n < 0)
-		rte_panic("cannot write on configuration pipe\n");
+	if (n < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	/* wait ack */
 	do {
 		n = read(s2m, &c, 1);
 	} while (n < 0 && errno == EINTR);
 
-	if (n <= 0)
-		rte_panic("cannot read on configuration pipe\n");
+	if (n <= 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	return 0;
 }
@@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		rte_move_to_panic_state();
+	}
+}
+
+/* move to panic state and do not return */
+static __attribute__((noreturn)) void
+defunct_and_remain_in_endless_loop(void)
+{
+	rte_move_to_panic_state();
+	while (1)
+		sleep(1);
 }
 
 /* main loop of threads */
@@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
 		if (thread_id == lcore_config[lcore_id].thread_id)
 			break;
 	}
-	if (lcore_id == RTE_MAX_LCORE)
-		rte_panic("cannot retrieve lcore id\n");
+	if (lcore_id == RTE_MAX_LCORE) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
+				__func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	m2s = lcore_config[lcore_id].pipe_master2slave[0];
 	s2m = lcore_config[lcore_id].pipe_slave2master[1];
@@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
 			n = read(m2s, &c, 1);
 		} while (n < 0 && errno == EINTR);
 
-		if (n <= 0)
-			rte_panic("cannot read on configuration pipe\n");
+		if (n <= 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		lcore_config[lcore_id].state = RUNNING;
 
@@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
 		n = 0;
 		while (n == 0 || (n < 0 && errno == EINTR))
 			n = write(s2m, &c, 1);
-		if (n < 0)
-			rte_panic("cannot write on configuration pipe\n");
-
-		if (lcore_config[lcore_id].f == NULL)
-			rte_panic("NULL function pointer\n");
+		if (n < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
+
+		if (lcore_config[lcore_id].f == NULL) {
+			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		/* call the function and store the return value */
 		fct_arg = lcore_config[lcore_id].arg;
-- 
1.8.3.1

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 11/13] eal: replace rte_panic instances in ethdev
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (3 preceding siblings ...)
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 06/13] kni: replace rte_panic instances in kni Arnon Warshavsky
@ 2018-04-13 18:30  3% ` Arnon Warshavsky
  2018-04-13 18:30  2% ` [dpdk-dev] [PATCH v3 12/13] eal: replace rte_panic instances in init sequence Arnon Warshavsky
  2018-04-16 11:22  0% ` [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Burakov, Anatoly
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

The function changed from void to int is local
to this file, so the change is non-abi-breaking.

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_ether/rte_ethdev.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e..57e1e6b 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -194,7 +194,7 @@ enum {
 	return port_id;
 }
 
-static void
+static int
 rte_eth_dev_shared_data_prepare(void)
 {
 	const unsigned flags = 0;
@@ -210,8 +210,12 @@ enum {
 					rte_socket_id(), flags);
 		} else
 			mz = rte_memzone_lookup(MZ_RTE_ETH_DEV_DATA);
-		if (mz == NULL)
-			rte_panic("Cannot allocate ethdev shared data\n");
+		if (mz == NULL) {
+			rte_spinlock_unlock(&rte_eth_shared_data_lock);
+			RTE_LOG(CRIT, EAL, "%s(): Cannot allocate ethdev shared data\n",
+					__func__);
+			return -1;
+		}
 
 		rte_eth_dev_shared_data = mz->addr;
 		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
@@ -224,6 +228,8 @@ enum {
 	}
 
 	rte_spinlock_unlock(&rte_eth_shared_data_lock);
+
+	return 0;
 }
 
 struct rte_eth_dev *
@@ -274,7 +280,8 @@ struct rte_eth_dev *
 	uint16_t port_id;
 	struct rte_eth_dev *eth_dev = NULL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return NULL;
 
 	/* Synchronize port creation between primary and secondary threads. */
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
@@ -317,7 +324,8 @@ struct rte_eth_dev *
 	uint16_t i;
 	struct rte_eth_dev *eth_dev = NULL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return NULL;
 
 	/* Synchronize port attachment to primary port creation and release. */
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
@@ -345,7 +353,8 @@ struct rte_eth_dev *
 	if (eth_dev == NULL)
 		return -EINVAL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -399,7 +408,8 @@ struct rte_eth_dev *
 int __rte_experimental
 rte_eth_dev_owner_new(uint64_t *owner_id)
 {
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -450,7 +460,8 @@ struct rte_eth_dev *
 {
 	int ret;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -467,7 +478,8 @@ struct rte_eth_dev *
 			{.id = RTE_ETH_DEV_NO_OWNER, .name = ""};
 	int ret;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -482,7 +494,8 @@ struct rte_eth_dev *
 {
 	uint16_t port_id;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -502,7 +515,8 @@ struct rte_eth_dev *
 {
 	int ret = 0;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 06/13] kni: replace rte_panic instances in kni
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (2 preceding siblings ...)
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 04/13] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
@ 2018-04-13 18:30  3% ` Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 11/13] eal: replace rte_panic instances in ethdev Arnon Warshavsky
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Replace panic calls with a log message and a return value.
The function changed from void to int is local
to this file, so the change is non-abi-breaking.
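
The fifo size check that now returns an error instead of panicking is
the usual power-of-two test; a couple of hypothetical values for
illustration:

	/* size & (size - 1) == 0 only for powers of two */
	kni_fifo_init(fifo, 1024); /* 1024 & 1023 == 0 -> returns 0 */
	kni_fifo_init(fifo, 1000); /* 1000 &  999 != 0 -> logs and returns -1 */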

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_kni/rte_kni.c      | 18 ++++++++++++------
 lib/librte_kni/rte_kni_fifo.h | 11 ++++++++---
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 2867411..54050c8 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -353,37 +353,43 @@ struct rte_kni *
 	/* TX RING */
 	mz = slot->m_tx_q;
 	ctx->tx_q = mz->addr;
-	kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.tx_phys = mz->phys_addr;
 
 	/* RX RING */
 	mz = slot->m_rx_q;
 	ctx->rx_q = mz->addr;
-	kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.rx_phys = mz->phys_addr;
 
 	/* ALLOC RING */
 	mz = slot->m_alloc_q;
 	ctx->alloc_q = mz->addr;
-	kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.alloc_phys = mz->phys_addr;
 
 	/* FREE RING */
 	mz = slot->m_free_q;
 	ctx->free_q = mz->addr;
-	kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.free_phys = mz->phys_addr;
 
 	/* Request RING */
 	mz = slot->m_req_q;
 	ctx->req_q = mz->addr;
-	kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.req_phys = mz->phys_addr;
 
 	/* Response RING */
 	mz = slot->m_resp_q;
 	ctx->resp_q = mz->addr;
-	kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.resp_phys = mz->phys_addr;
 
 	/* Req/Resp sync mem area */
diff --git a/lib/librte_kni/rte_kni_fifo.h b/lib/librte_kni/rte_kni_fifo.h
index ac26a8c..5052015 100644
--- a/lib/librte_kni/rte_kni_fifo.h
+++ b/lib/librte_kni/rte_kni_fifo.h
@@ -7,17 +7,22 @@
 /**
  * Initializes the kni fifo structure
  */
-static void
+static int
 kni_fifo_init(struct rte_kni_fifo *fifo, unsigned size)
 {
 	/* Ensure size is power of 2 */
-	if (size & (size - 1))
-		rte_panic("KNI fifo size must be power of 2\n");
+	if (size & (size - 1)) {
+		RTE_LOG(CRIT, EAL, "%s(): KNI fifo size must be power of 2\n",
+				__func__);
+		return -1;
+	}
 
 	fifo->write = 0;
 	fifo->read = 0;
 	fifo->len = size;
 	fifo->elem_size = sizeof(void *);
+
+	return 0;
 }
 
 /**
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 04/13] ixgbe: replace rte_panic instances in ixgbe driver
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 03/13] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
@ 2018-04-13 18:30  3% ` Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 06/13] kni: replace rte_panic instances in kni Arnon Warshavsky
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Replace panic calls with a log message and a return value.
The function changed from void to int is local to this file,
so the change is not ABI-breaking.

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/ixgbe/ixgbe_ethdev.c |  3 ++-
 drivers/net/ixgbe/ixgbe_ethdev.h |  2 +-
 drivers/net/ixgbe/ixgbe_pf.c     | 13 +++++++++----
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 4df5c75..96188dc 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -1224,7 +1224,8 @@ struct rte_ixgbe_xstats_name_off {
 	memset(hwstrip, 0, sizeof(*hwstrip));
 
 	/* initialize PF if max_vfs not zero */
-	ixgbe_pf_host_init(eth_dev);
+	if (ixgbe_pf_host_init(eth_dev) != 0)
+		return -1;
 
 	ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT);
 	/* let hardware know driver is loaded */
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index c56d652..82d7fd2 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -663,7 +663,7 @@ int ixgbe_fdir_filter_program(struct rte_eth_dev *dev,
 
 void ixgbe_vlan_hw_strip_disable_all(struct rte_eth_dev *dev);
 
-void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
+int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
 
 void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev);
 
diff --git a/drivers/net/ixgbe/ixgbe_pf.c b/drivers/net/ixgbe/ixgbe_pf.c
index ea99737..5c25de0 100644
--- a/drivers/net/ixgbe/ixgbe_pf.c
+++ b/drivers/net/ixgbe/ixgbe_pf.c
@@ -66,7 +66,7 @@ int ixgbe_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
 	return 0;
 }
 
-void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
+int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 {
 	struct ixgbe_vf_info **vfinfo =
 		IXGBE_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
@@ -84,11 +84,14 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
 	vf_num = dev_num_vf(eth_dev);
 	if (vf_num == 0)
-		return;
+		return 0;
 
 	*vfinfo = rte_zmalloc("vf_info", sizeof(struct ixgbe_vf_info) * vf_num, 0);
-	if (*vfinfo == NULL)
-		rte_panic("Cannot allocate memory for private VF data\n");
+	if (*vfinfo == NULL) {
+		RTE_LOG(ERR, PMD, "%s() Cannot allocate memory for private VF data\n",
+				__func__);
+		return -1;
+	}
 
 	memset(mirror_info, 0, sizeof(struct ixgbe_mirror_info));
 	memset(uta_info, 0, sizeof(struct ixgbe_uta_info));
@@ -116,6 +119,8 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 
 	/* set mb interrupt mask */
 	ixgbe_mb_intr_setup(eth_dev);
+
+	return 0;
 }
 
 void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev)
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 03/13] e1000: replace rte_panic instances in e1000 driver
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
@ 2018-04-13 18:30  3% ` Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 04/13] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Replace panic calls with a log message and a return value.
The function changed from void to int is local to this file,
so the change is not ABI-breaking.

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/e1000/e1000_ethdev.h |  2 +-
 drivers/net/e1000/igb_ethdev.c   |  3 ++-
 drivers/net/e1000/igb_pf.c       | 15 +++++++++------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 23b089c..a66ff42 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -405,7 +405,7 @@ int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
 /*
  * misc function prototypes
  */
-void igb_pf_host_init(struct rte_eth_dev *eth_dev);
+int igb_pf_host_init(struct rte_eth_dev *eth_dev);
 
 void igb_pf_mbx_process(struct rte_eth_dev *eth_dev);
 
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index d7eef9a..994bb5a 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -833,7 +833,8 @@ static int igb_flex_filter_uninit(struct rte_eth_dev *eth_dev)
 	}
 
 	/* initialize PF if max_vfs not zero */
-	igb_pf_host_init(eth_dev);
+	if (igb_pf_host_init(eth_dev) != 0)
+		goto err_late;
 
 	ctrl_ext = E1000_READ_REG(hw, E1000_CTRL_EXT);
 	/* Set PF Reset Done bit so PF/VF Mail Ops can work */
diff --git a/drivers/net/e1000/igb_pf.c b/drivers/net/e1000/igb_pf.c
index b9f2e53..dfa63c9 100644
--- a/drivers/net/e1000/igb_pf.c
+++ b/drivers/net/e1000/igb_pf.c
@@ -63,7 +63,7 @@ int igb_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
 	return 0;
 }
 
-void igb_pf_host_init(struct rte_eth_dev *eth_dev)
+int igb_pf_host_init(struct rte_eth_dev *eth_dev)
 {
 	struct e1000_vf_info **vfinfo =
 		E1000_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
@@ -74,7 +74,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 
 	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
 	if (0 == (vf_num = dev_num_vf(eth_dev)))
-		return;
+		return 0;
 
 	if (hw->mac.type == e1000_i350)
 		nb_queue = 1;
@@ -82,11 +82,14 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 		/* per datasheet, it should be 2, but 1 seems correct */
 		nb_queue = 1;
 	else
-		return;
+		return 0;
 
 	*vfinfo = rte_zmalloc("vf_info", sizeof(struct e1000_vf_info) * vf_num, 0);
-	if (*vfinfo == NULL)
-		rte_panic("Cannot allocate memory for private VF data\n");
+	if (*vfinfo == NULL) {
+		RTE_LOG(CRIT, PMD, "%s(): Cannot allocate memory for private "
+				"VF data\n", __func__);
+		return -1;
+	}
 
 	RTE_ETH_DEV_SRIOV(eth_dev).active = ETH_8_POOLS;
 	RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool = nb_queue;
@@ -98,7 +101,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 	/* set mb interrupt mask */
 	igb_mb_intr_setup(eth_dev);
 
-	return;
+	return 0;
 }
 
 void igb_pf_host_uninit(struct rte_eth_dev *dev)
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 02/13] bond: replace rte_panic instances in bonding driver
  2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
@ 2018-04-13 18:30  3% ` Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 03/13] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Replace panic calls with a log message and a return value.
The functions changed from void to int are local to this file,
so the change is not ABI-breaking.

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/bonding/rte_eth_bond_8023ad.c         | 30 +++++++++++++++--------
 drivers/net/bonding/rte_eth_bond_8023ad_private.h |  2 +-
 drivers/net/bonding/rte_eth_bond_api.c            | 20 ++++++++++-----
 drivers/net/bonding/rte_eth_bond_pmd.c            | 10 +++++---
 drivers/net/bonding/rte_eth_bond_private.h        |  2 +-
 5 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c b/drivers/net/bonding/rte_eth_bond_8023ad.c
index c452318..310118c 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad.c
+++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
@@ -893,7 +893,7 @@
 			bond_mode_8023ad_periodic_cb, arg);
 }
 
-void
+int
 bond_mode_8023ad_activate_slave(struct rte_eth_dev *bond_dev,
 				uint16_t slave_id)
 {
@@ -939,7 +939,7 @@
 	timer_cancel(&port->warning_timer);
 
 	if (port->mbuf_pool != NULL)
-		return;
+		return 0;
 
 	RTE_ASSERT(port->rx_ring == NULL);
 	RTE_ASSERT(port->tx_ring == NULL);
@@ -968,8 +968,10 @@
 	/* Any memory allocation failure in initialization is critical because
 	 * resources can't be free, so reinitialization is impossible. */
 	if (port->mbuf_pool == NULL) {
-		rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
-			slave_id, mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create memory"
+				" pool '%s': %s\n", __func__,
+				slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
 
 	snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_rx", slave_id);
@@ -977,8 +979,9 @@
 			rte_align32pow2(BOND_MODE_8023AX_SLAVE_RX_PKTS), socket_id, 0);
 
 	if (port->rx_ring == NULL) {
-		rte_panic("Slave %u: Failed to create rx ring '%s': %s\n", slave_id,
-			mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create rx ring '%s': %s\n",
+			__func__, slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
 
 	/* TX ring is at least one pkt longer to make room for marker packet. */
@@ -987,9 +990,13 @@
 			rte_align32pow2(BOND_MODE_8023AX_SLAVE_TX_PKTS + 1), socket_id, 0);
 
 	if (port->tx_ring == NULL) {
-		rte_panic("Slave %u: Failed to create tx ring '%s': %s\n", slave_id,
-			mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Fail to create tx ring "
+				"'%s': %s\n", __func__,
+				slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
+
+	return 0;
 }
 
 int
@@ -1143,9 +1150,12 @@
 	struct bond_dev_private *internals = bond_dev->data->dev_private;
 	uint8_t i;
 
-	for (i = 0; i < internals->active_slave_count; i++)
-		bond_mode_8023ad_activate_slave(bond_dev,
+	for (i = 0; i < internals->active_slave_count; i++) {
+		int rc = bond_mode_8023ad_activate_slave(bond_dev,
 				internals->active_slaves[i]);
+		if (rc != 0)
+			return rc;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/bonding/rte_eth_bond_8023ad_private.h b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
index 0f490a5..96a42f2 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad_private.h
+++ b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
@@ -263,7 +263,7 @@ struct mode8023ad_private {
  * @return
  *  0 on success, negative value otherwise.
  */
-void
+int
 bond_mode_8023ad_activate_slave(struct rte_eth_dev *dev, uint16_t port_id);
 
 /**
diff --git a/drivers/net/bonding/rte_eth_bond_api.c b/drivers/net/bonding/rte_eth_bond_api.c
index f854b73..6bc5887 100644
--- a/drivers/net/bonding/rte_eth_bond_api.c
+++ b/drivers/net/bonding/rte_eth_bond_api.c
@@ -69,14 +69,15 @@
 	return 0;
 }
 
-void
+int
 activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id)
 {
 	struct bond_dev_private *internals = eth_dev->data->dev_private;
 	uint8_t active_count = internals->active_slave_count;
 
 	if (internals->mode == BONDING_MODE_8023AD)
-		bond_mode_8023ad_activate_slave(eth_dev, port_id);
+		if (bond_mode_8023ad_activate_slave(eth_dev, port_id) != 0)
+			return -1;
 
 	if (internals->mode == BONDING_MODE_TLB
 			|| internals->mode == BONDING_MODE_ALB) {
@@ -349,10 +350,17 @@
 				bond_ethdev_primary_set(internals,
 							slave_port_id);
 
-			if (find_slave_by_id(internals->active_slaves,
-					     internals->active_slave_count,
-					     slave_port_id) == internals->active_slave_count)
-				activate_slave(bonded_eth_dev, slave_port_id);
+			int rc =
+				find_slave_by_id(internals->active_slaves,
+					internals->active_slave_count,
+					slave_port_id);
+
+			if (rc == internals->active_slave_count) {
+				int rc = activate_slave(bonded_eth_dev,
+							slave_port_id);
+				if (rc != 0)
+					return -1;
+			}
 		}
 	}
 
diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c b/drivers/net/bonding/rte_eth_bond_pmd.c
index b59ba9f..96f8b1a 100644
--- a/drivers/net/bonding/rte_eth_bond_pmd.c
+++ b/drivers/net/bonding/rte_eth_bond_pmd.c
@@ -1740,8 +1740,11 @@ struct bwg_slave {
 		/* Any memory allocation failure in initialization is critical because
 		 * resources can't be free, so reinitialization is impossible. */
 		if (port->slow_pool == NULL) {
-			rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
-				slave_id, mem_name, rte_strerror(rte_errno));
+			RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create"
+					" memory pool '%s': %s\n",
+					__func__, slave_id, mem_name,
+					rte_strerror(rte_errno));
+			return -1;
 		}
 	}
 
@@ -2660,7 +2663,8 @@ struct bwg_slave {
 			mac_address_slaves_update(bonded_eth_dev);
 		}
 
-		activate_slave(bonded_eth_dev, port_id);
+		if (activate_slave(bonded_eth_dev, port_id) != 0)
+			return -1;
 
 		/* If user has defined the primary port then default to using it */
 		if (internals->user_defined_primary_port &&
diff --git a/drivers/net/bonding/rte_eth_bond_private.h b/drivers/net/bonding/rte_eth_bond_private.h
index 92e15f8..65453aa 100644
--- a/drivers/net/bonding/rte_eth_bond_private.h
+++ b/drivers/net/bonding/rte_eth_bond_private.h
@@ -185,7 +185,7 @@ struct bond_dev_private {
 void
 deactivate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
 
-void
+int
 activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
 
 void
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances
@ 2018-04-13 18:30  3% Arnon Warshavsky
  2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
                   ` (6 more replies)
  0 siblings, 7 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-13 18:30 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

The purpose of this patch series is to clean up the library code
from paths that end up aborting the process,
and move to checking error values, in order to allow the running process
to perform an orderly teardown or other mitigation of the event.

This patch modifies the majority of rte_panic calls
under lib and drivers, and replaces them with a log message
and an error return code according to context,
that can be propagated up the call stack.

- Focus was given to the DPDK initialization path
- Some of the panic calls within drivers were left in place where
  the call is made from within an interrupt handler or on the data path,
  where there is no simple applicative route to propagate the error
  for termination.
  These should be handled by the driver maintainers.
- In order to avoid breaking ABI where panic was called from public
  void functions, a panic state variable was introduced so that
  it can be queried after calling these void functions.
  This took place for a single function call.
- Local void functions with no API were changed to return a value
  where needed
- No change took place in example and test files
- No change took place for debug assertions calling panic
- A new function was added to devtools/checkpatches.sh
  in order to prevent new additions of calls to rte_panic
  under lib and drivers.

Keep calm and don't panic
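
For illustration, here is a minimal sketch of the conversion pattern used
throughout the series. The structure, function and buffer-size names below
are hypothetical and not taken from any particular patch:

#include <rte_malloc.h>
#include <rte_log.h>
#include <rte_debug.h>

#define FOO_BUF_SIZE 1024

struct foo {
	void *buf;
};

/* Before: any allocation failure aborts the whole process. */
static void
foo_init_panic(struct foo *f)
{
	f->buf = rte_zmalloc("foo_buf", FOO_BUF_SIZE, 0);
	if (f->buf == NULL)
		rte_panic("Cannot allocate foo buffer\n");
}

/* After: log the failure and return an error code that the caller can
 * propagate up the init path for an orderly teardown.
 */
static int
foo_init(struct foo *f)
{
	f->buf = rte_zmalloc("foo_buf", FOO_BUF_SIZE, 0);
	if (f->buf == NULL) {
		RTE_LOG(ERR, EAL, "%s(): cannot allocate foo buffer\n",
			__func__);
		return -1;
	}
	return 0;
}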

---

v2:
- reformat error messages so that literal strings are on the same line
- fix typo in commit message
- add new return code to doxygen of rte_memzone_free()

v3:
- submit all 13 patches, changed and unchanged, in the same patchset

Arnon Warshavsky (13):
  crypto: replace rte_panic instances in crypto driver
  bond: replace rte_panic instances in bonding driver
  e1000: replace rte_panic instances in e1000 driver
  ixgbe: replace rte_panic instances in ixgbe driver
  eal: replace rte_panic instances in eventdev
  kni: replace rte_panic instances in kni
  e1000: replace rte_panic instances in e1000 driver
  eal: replace rte_panic instances in hugepage_info
  eal: replace rte_panic instances in common_memzone
  eal: replace rte_panic instances in interrupts thread
  eal: replace rte_panic instances in ethdev
  eal: replace rte_panic instances in init sequence
  devtools: prevent new instances of rte_panic and rte_exit

 devtools/checkpatches.sh                          |  94 ++++++++++++++++-
 drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c       |   8 +-
 drivers/crypto/dpaa_sec/dpaa_sec.c                |   8 +-
 drivers/net/bonding/rte_eth_bond_8023ad.c         |  30 ++++--
 drivers/net/bonding/rte_eth_bond_8023ad_private.h |   2 +-
 drivers/net/bonding/rte_eth_bond_api.c            |  20 ++--
 drivers/net/bonding/rte_eth_bond_pmd.c            |  10 +-
 drivers/net/bonding/rte_eth_bond_private.h        |   2 +-
 drivers/net/e1000/e1000_ethdev.h                  |   2 +-
 drivers/net/e1000/igb_ethdev.c                    |   3 +-
 drivers/net/e1000/igb_pf.c                        |  15 +--
 drivers/net/ixgbe/ixgbe_ethdev.c                  |   3 +-
 drivers/net/ixgbe/ixgbe_ethdev.h                  |   2 +-
 drivers/net/ixgbe/ixgbe_pf.c                      |  13 ++-
 lib/librte_eal/bsdapp/eal/eal.c                   |  87 +++++++++++-----
 lib/librte_eal/bsdapp/eal/eal_thread.c            |  65 +++++++++---
 lib/librte_eal/common/eal_common_launch.c         |  21 ++++
 lib/librte_eal/common/eal_common_memzone.c        |   3 +-
 lib/librte_eal/common/include/rte_debug.h         |  12 +++
 lib/librte_eal/common/include/rte_memzone.h       |   1 +
 lib/librte_eal/common/rte_malloc.c                |   7 +-
 lib/librte_eal/linuxapp/eal/eal.c                 | 121 +++++++++++++++-------
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c   |  21 ++--
 lib/librte_eal/linuxapp/eal/eal_interrupts.c      |  27 +++--
 lib/librte_eal/linuxapp/eal/eal_thread.c          |  65 +++++++++---
 lib/librte_ether/rte_ethdev.c                     |  36 +++++--
 lib/librte_eventdev/rte_eventdev_pmd_pci.h        |   8 +-
 lib/librte_eventdev/rte_eventdev_pmd_vdev.h       |   8 +-
 lib/librte_kni/rte_kni.c                          |  18 ++--
 lib/librte_kni/rte_kni_fifo.h                     |  11 +-
 30 files changed, 540 insertions(+), 183 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v4 02/10] bpf: add BPF loading and execution framework
  @ 2018-04-13 14:43  2% ` Konstantin Ananyev
  0 siblings, 0 replies; 200+ results
From: Konstantin Ananyev @ 2018-04-13 14:43 UTC (permalink / raw)
  To: dev; +Cc: Konstantin Ananyev

librte_bpf provides a framework to load and execute eBPF bytecode
inside user-space DPDK-based applications.
It supports a basic set of features from the eBPF spec
(https://www.kernel.org/doc/Documentation/networking/filter.txt).

Not currently supported features:
 - JIT
 - cBPF
 - tail-pointer call
 - eBPF MAP
 - skb
 - function calls for 32-bit apps

It also adds dependency on libelf.
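
As a rough usage sketch of the API added by this patch (the object file name
"filter.o", the ".text" section name and the absence of external symbols are
illustrative assumptions only, not part of the patch):

#include <rte_bpf.h>

/* Load an eBPF object built elsewhere and run it once over a raw buffer. */
static uint64_t
run_filter(void *data)
{
	struct rte_bpf_prm prm = {
		.xsym = NULL,	/* no external symbols referenced */
		.nb_xsym = 0,
		.prog_type = RTE_BPF_PROG_TYPE_UNSPEC,
	};
	struct rte_bpf *bpf;
	uint64_t rc;

	bpf = rte_bpf_elf_load(&prm, "filter.o", ".text");
	if (bpf == NULL)
		return 0; /* rte_errno holds the failure reason */

	rc = rte_bpf_exec(bpf, data);
	rte_bpf_destroy(bpf);
	return rc;
}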

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 config/common_base                 |   5 +
 lib/Makefile                       |   2 +
 lib/librte_bpf/Makefile            |  30 +++
 lib/librte_bpf/bpf.c               |  59 +++++
 lib/librte_bpf/bpf_exec.c          | 452 +++++++++++++++++++++++++++++++++++++
 lib/librte_bpf/bpf_impl.h          |  41 ++++
 lib/librte_bpf/bpf_load.c          | 386 +++++++++++++++++++++++++++++++
 lib/librte_bpf/bpf_validate.c      |  55 +++++
 lib/librte_bpf/meson.build         |  18 ++
 lib/librte_bpf/rte_bpf.h           | 170 ++++++++++++++
 lib/librte_bpf/rte_bpf_version.map |  12 +
 lib/meson.build                    |   2 +-
 mk/rte.app.mk                      |   2 +
 13 files changed, 1233 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_bpf/Makefile
 create mode 100644 lib/librte_bpf/bpf.c
 create mode 100644 lib/librte_bpf/bpf_exec.c
 create mode 100644 lib/librte_bpf/bpf_impl.h
 create mode 100644 lib/librte_bpf/bpf_load.c
 create mode 100644 lib/librte_bpf/bpf_validate.c
 create mode 100644 lib/librte_bpf/meson.build
 create mode 100644 lib/librte_bpf/rte_bpf.h
 create mode 100644 lib/librte_bpf/rte_bpf_version.map

diff --git a/config/common_base b/config/common_base
index c09c7cf88..d68c2e211 100644
--- a/config/common_base
+++ b/config/common_base
@@ -821,3 +821,8 @@ CONFIG_RTE_APP_CRYPTO_PERF=y
 # Compile the eventdev application
 #
 CONFIG_RTE_APP_EVENTDEV=y
+
+#
+# Compile librte_bpf
+#
+CONFIG_RTE_LIBRTE_BPF=y
diff --git a/lib/Makefile b/lib/Makefile
index ec965a606..a4a2329f9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -97,6 +97,8 @@ DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ether
 DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
 DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ether librte_net
 DEPDIRS-librte_gso += librte_mempool
+DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf
+DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf librte_ether
 
 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
 DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_bpf/Makefile b/lib/librte_bpf/Makefile
new file mode 100644
index 000000000..e0f434e77
--- /dev/null
+++ b/lib/librte_bpf/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_bpf.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+LDLIBS += -lrte_net -lrte_eal
+LDLIBS += -lrte_mempool -lrte_ring
+LDLIBS += -lrte_mbuf -lrte_ethdev
+LDLIBS += -lelf
+
+EXPORT_MAP := rte_bpf_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_exec.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_load.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_validate.c
+
+# install header files
+SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += rte_bpf.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_bpf/bpf.c b/lib/librte_bpf/bpf.c
new file mode 100644
index 000000000..d7f68c017
--- /dev/null
+++ b/lib/librte_bpf/bpf.c
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#include "bpf_impl.h"
+
+int rte_bpf_logtype;
+
+__rte_experimental void
+rte_bpf_destroy(struct rte_bpf *bpf)
+{
+	if (bpf != NULL) {
+		if (bpf->jit.func != NULL)
+			munmap(bpf->jit.func, bpf->jit.sz);
+		munmap(bpf, bpf->sz);
+	}
+}
+
+__rte_experimental int
+rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit)
+{
+	if (bpf == NULL || jit == NULL)
+		return -EINVAL;
+
+	jit[0] = bpf->jit;
+	return 0;
+}
+
+int
+bpf_jit(struct rte_bpf *bpf)
+{
+	int32_t rc;
+
+	rc = -ENOTSUP;
+	if (rc != 0)
+		RTE_BPF_LOG(WARNING, "%s(%p) failed, error code: %d;\n",
+			__func__, bpf, rc);
+	return rc;
+}
+
+RTE_INIT(rte_bpf_init_log);
+
+static void
+rte_bpf_init_log(void)
+{
+	rte_bpf_logtype = rte_log_register("lib.bpf");
+	if (rte_bpf_logtype >= 0)
+		rte_log_set_level(rte_bpf_logtype, RTE_LOG_INFO);
+}
diff --git a/lib/librte_bpf/bpf_exec.c b/lib/librte_bpf/bpf_exec.c
new file mode 100644
index 000000000..0382ade98
--- /dev/null
+++ b/lib/librte_bpf/bpf_exec.c
@@ -0,0 +1,452 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_byteorder.h>
+
+#include "bpf_impl.h"
+
+#define BPF_JMP_UNC(ins)	((ins) += (ins)->off)
+
+#define BPF_JMP_CND_REG(reg, ins, op, type)	\
+	((ins) += \
+		((type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg]) ? \
+		(ins)->off : 0)
+
+#define BPF_JMP_CND_IMM(reg, ins, op, type)	\
+	((ins) += \
+		((type)(reg)[(ins)->dst_reg] op (type)(ins)->imm) ? \
+		(ins)->off : 0)
+
+#define BPF_NEG_ALU(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(-(reg)[(ins)->dst_reg]))
+
+#define BPF_MOV_ALU_REG(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(reg)[(ins)->src_reg])
+
+#define BPF_OP_ALU_REG(reg, ins, op, type)	\
+	((reg)[(ins)->dst_reg] = \
+		(type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg])
+
+#define BPF_MOV_ALU_IMM(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(ins)->imm)
+
+#define BPF_OP_ALU_IMM(reg, ins, op, type)	\
+	((reg)[(ins)->dst_reg] = \
+		(type)(reg)[(ins)->dst_reg] op (type)(ins)->imm)
+
+#define BPF_DIV_ZERO_CHECK(bpf, reg, ins, type) do { \
+	if ((type)(reg)[(ins)->src_reg] == 0) { \
+		RTE_BPF_LOG(ERR, \
+			"%s(%p): division by 0 at pc: %#zx;\n", \
+			__func__, bpf, \
+			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.ins); \
+		return 0; \
+	} \
+} while (0)
+
+#define BPF_LD_REG(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = \
+		*(type *)(uintptr_t)((reg)[(ins)->src_reg] + (ins)->off))
+
+#define BPF_ST_IMM(reg, ins, type)	\
+	(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \
+		(type)(ins)->imm)
+
+#define BPF_ST_REG(reg, ins, type)	\
+	(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \
+		(type)(reg)[(ins)->src_reg])
+
+#define BPF_ST_XADD_REG(reg, ins, tp)	\
+	(rte_atomic##tp##_add((rte_atomic##tp##_t *) \
+		(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off), \
+		reg[ins->src_reg]))
+
+static inline void
+bpf_alu_be(uint64_t reg[MAX_BPF_REG], const struct bpf_insn *ins)
+{
+	uint64_t *v;
+
+	v = reg + ins->dst_reg;
+	switch (ins->imm) {
+	case 16:
+		*v = rte_cpu_to_be_16(*v);
+		break;
+	case 32:
+		*v = rte_cpu_to_be_32(*v);
+		break;
+	case 64:
+		*v = rte_cpu_to_be_64(*v);
+		break;
+	}
+}
+
+static inline void
+bpf_alu_le(uint64_t reg[MAX_BPF_REG], const struct bpf_insn *ins)
+{
+	uint64_t *v;
+
+	v = reg + ins->dst_reg;
+	switch (ins->imm) {
+	case 16:
+		*v = rte_cpu_to_le_16(*v);
+		break;
+	case 32:
+		*v = rte_cpu_to_le_32(*v);
+		break;
+	case 64:
+		*v = rte_cpu_to_le_64(*v);
+		break;
+	}
+}
+
+static inline uint64_t
+bpf_exec(const struct rte_bpf *bpf, uint64_t reg[MAX_BPF_REG])
+{
+	const struct bpf_insn *ins;
+
+	for (ins = bpf->prm.ins; ; ins++) {
+		switch (ins->code) {
+		/* 32 bit ALU IMM operations */
+		case (BPF_ALU | BPF_ADD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, +, uint32_t);
+			break;
+		case (BPF_ALU | BPF_SUB | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, -, uint32_t);
+			break;
+		case (BPF_ALU | BPF_AND | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, &, uint32_t);
+			break;
+		case (BPF_ALU | BPF_OR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, |, uint32_t);
+			break;
+		case (BPF_ALU | BPF_LSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, <<, uint32_t);
+			break;
+		case (BPF_ALU | BPF_RSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, uint32_t);
+			break;
+		case (BPF_ALU | BPF_XOR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, ^, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MUL | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, *, uint32_t);
+			break;
+		case (BPF_ALU | BPF_DIV | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, /, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, %, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOV | BPF_K):
+			BPF_MOV_ALU_IMM(reg, ins, uint32_t);
+			break;
+		/* 32 bit ALU REG operations */
+		case (BPF_ALU | BPF_ADD | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, +, uint32_t);
+			break;
+		case (BPF_ALU | BPF_SUB | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, -, uint32_t);
+			break;
+		case (BPF_ALU | BPF_AND | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, &, uint32_t);
+			break;
+		case (BPF_ALU | BPF_OR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, |, uint32_t);
+			break;
+		case (BPF_ALU | BPF_LSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, <<, uint32_t);
+			break;
+		case (BPF_ALU | BPF_RSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, uint32_t);
+			break;
+		case (BPF_ALU | BPF_XOR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, ^, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MUL | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, *, uint32_t);
+			break;
+		case (BPF_ALU | BPF_DIV | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t);
+			BPF_OP_ALU_REG(reg, ins, /, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOD | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t);
+			BPF_OP_ALU_REG(reg, ins, %, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOV | BPF_X):
+			BPF_MOV_ALU_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_ALU | BPF_NEG):
+			BPF_NEG_ALU(reg, ins, uint32_t);
+			break;
+		case (BPF_ALU | BPF_END | BPF_TO_BE):
+			bpf_alu_be(reg, ins);
+			break;
+		case (BPF_ALU | BPF_END | BPF_TO_LE):
+			bpf_alu_le(reg, ins);
+			break;
+		/* 64 bit ALU IMM operations */
+		case (BPF_ALU64 | BPF_ADD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, +, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_SUB | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, -, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_AND | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, &, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_OR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, |, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_LSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, <<, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_RSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_ARSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, int64_t);
+			break;
+		case (BPF_ALU64 | BPF_XOR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, ^, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MUL | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, *, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_DIV | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, /, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, %, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOV | BPF_K):
+			BPF_MOV_ALU_IMM(reg, ins, uint64_t);
+			break;
+		/* 64 bit ALU REG operations */
+		case (BPF_ALU64 | BPF_ADD | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, +, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_SUB | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, -, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_AND | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, &, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_OR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, |, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_LSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, <<, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_RSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_ARSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, int64_t);
+			break;
+		case (BPF_ALU64 | BPF_XOR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, ^, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MUL | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, *, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_DIV | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t);
+			BPF_OP_ALU_REG(reg, ins, /, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOD | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t);
+			BPF_OP_ALU_REG(reg, ins, %, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOV | BPF_X):
+			BPF_MOV_ALU_REG(reg, ins, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_NEG):
+			BPF_NEG_ALU(reg, ins, uint64_t);
+			break;
+		/* load instructions */
+		case (BPF_LDX | BPF_MEM | BPF_B):
+			BPF_LD_REG(reg, ins, uint8_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_H):
+			BPF_LD_REG(reg, ins, uint16_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_W):
+			BPF_LD_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_DW):
+			BPF_LD_REG(reg, ins, uint64_t);
+			break;
+		/* load 64 bit immediate value */
+		case (BPF_LD | BPF_IMM | BPF_DW):
+			reg[ins->dst_reg] = (uint32_t)ins[0].imm |
+				(uint64_t)(uint32_t)ins[1].imm << 32;
+			ins++;
+			break;
+		/* store instructions */
+		case (BPF_STX | BPF_MEM | BPF_B):
+			BPF_ST_REG(reg, ins, uint8_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_H):
+			BPF_ST_REG(reg, ins, uint16_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_W):
+			BPF_ST_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_DW):
+			BPF_ST_REG(reg, ins, uint64_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_B):
+			BPF_ST_IMM(reg, ins, uint8_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_H):
+			BPF_ST_IMM(reg, ins, uint16_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_W):
+			BPF_ST_IMM(reg, ins, uint32_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_DW):
+			BPF_ST_IMM(reg, ins, uint64_t);
+			break;
+		/* atomic add instructions */
+		case (BPF_STX | BPF_XADD | BPF_W):
+			BPF_ST_XADD_REG(reg, ins, 32);
+			break;
+		case (BPF_STX | BPF_XADD | BPF_DW):
+			BPF_ST_XADD_REG(reg, ins, 64);
+			break;
+		/* jump instructions */
+		case (BPF_JMP | BPF_JA):
+			BPF_JMP_UNC(ins);
+			break;
+		/* jump IMM instructions */
+		case (BPF_JMP | BPF_JEQ | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, ==, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JNE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, !=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JSGT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSGE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSET | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, &, uint64_t);
+			break;
+		/* jump REG instructions */
+		case (BPF_JMP | BPF_JEQ | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, ==, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JNE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, !=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JSGT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSGE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSET | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, &, uint64_t);
+			break;
+		/* call instructions */
+		case (BPF_JMP | BPF_CALL):
+			reg[BPF_REG_0] = bpf->prm.xsym[ins->imm].func(
+				reg[BPF_REG_1], reg[BPF_REG_2], reg[BPF_REG_3],
+				reg[BPF_REG_4], reg[BPF_REG_5]);
+			break;
+		/* return instruction */
+		case (BPF_JMP | BPF_EXIT):
+			return reg[BPF_REG_0];
+		default:
+			RTE_BPF_LOG(ERR,
+				"%s(%p): invalid opcode %#x at pc: %#zx;\n",
+				__func__, bpf, ins->code,
+				(uintptr_t)ins - (uintptr_t)bpf->prm.ins);
+			return 0;
+		}
+	}
+
+	/* should never be reached */
+	RTE_VERIFY(0);
+	return 0;
+}
+
+__rte_experimental uint32_t
+rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
+	uint32_t num)
+{
+	uint32_t i;
+	uint64_t reg[MAX_BPF_REG];
+	uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)];
+
+	for (i = 0; i != num; i++) {
+
+		reg[BPF_REG_1] = (uintptr_t)ctx[i];
+		reg[BPF_REG_10] = (uintptr_t)(stack + RTE_DIM(stack));
+
+		rc[i] = bpf_exec(bpf, reg);
+	}
+
+	return i;
+}
+
+__rte_experimental uint64_t
+rte_bpf_exec(const struct rte_bpf *bpf, void *ctx)
+{
+	uint64_t rc;
+
+	rte_bpf_exec_burst(bpf, &ctx, &rc, 1);
+	return rc;
+}
diff --git a/lib/librte_bpf/bpf_impl.h b/lib/librte_bpf/bpf_impl.h
new file mode 100644
index 000000000..5d7e65c31
--- /dev/null
+++ b/lib/librte_bpf/bpf_impl.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _BPF_H_
+#define _BPF_H_
+
+#include <rte_bpf.h>
+#include <sys/mman.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_BPF_STACK_SIZE	0x200
+
+struct rte_bpf {
+	struct rte_bpf_prm prm;
+	struct rte_bpf_jit jit;
+	size_t sz;
+	uint32_t stack_sz;
+};
+
+extern int bpf_validate(struct rte_bpf *bpf);
+
+extern int bpf_jit(struct rte_bpf *bpf);
+
+#ifdef RTE_ARCH_X86_64
+extern int bpf_jit_x86(struct rte_bpf *);
+#endif
+
+extern int rte_bpf_logtype;
+
+#define	RTE_BPF_LOG(lvl, fmt, args...) \
+	rte_log(RTE_LOG_## lvl, rte_bpf_logtype, fmt, ##args)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BPF_H_ */
diff --git a/lib/librte_bpf/bpf_load.c b/lib/librte_bpf/bpf_load.c
new file mode 100644
index 000000000..3c7279a6c
--- /dev/null
+++ b/lib/librte_bpf/bpf_load.c
@@ -0,0 +1,386 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <fcntl.h>
+
+#include <libelf.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_byteorder.h>
+#include <rte_errno.h>
+
+#include "bpf_impl.h"
+
+/* To overcome compatibility issue */
+#ifndef EM_BPF
+#define	EM_BPF	247
+#endif
+
+static uint32_t
+bpf_find_xsym(const char *sn, enum rte_bpf_xtype type,
+	const struct rte_bpf_xsym fp[], uint32_t fn)
+{
+	uint32_t i;
+
+	if (sn == NULL || fp == NULL)
+		return UINT32_MAX;
+
+	for (i = 0; i != fn; i++) {
+		if (fp[i].type == type && strcmp(sn, fp[i].name) == 0)
+			break;
+	}
+
+	return (i != fn) ? i : UINT32_MAX;
+}
+
+/*
+ * update BPF code at offset *ofs* with a proper address(index) for external
+ * symbol *sn*
+ */
+static int
+resolve_xsym(const char *sn, size_t ofs, struct bpf_insn *ins, size_t ins_sz,
+	const struct rte_bpf_prm *prm)
+{
+	uint32_t idx, fidx;
+	enum rte_bpf_xtype type;
+
+	if (ofs % sizeof(ins[0]) != 0 || ofs >= ins_sz)
+		return -EINVAL;
+
+	idx = ofs / sizeof(ins[0]);
+	if (ins[idx].code == (BPF_JMP | BPF_CALL))
+		type = RTE_BPF_XTYPE_FUNC;
+	else if (ins[idx].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+			ofs < ins_sz - sizeof(ins[idx]))
+		type = RTE_BPF_XTYPE_VAR;
+	else
+		return -EINVAL;
+
+	fidx = bpf_find_xsym(sn, type, prm->xsym, prm->nb_xsym);
+	if (fidx == UINT32_MAX)
+		return -ENOENT;
+
+	/* for function we just need an index in our xsym table */
+	if (type == RTE_BPF_XTYPE_FUNC)
+		ins[idx].imm = fidx;
+	/* for variable we need to store its absolute address */
+	else {
+		ins[idx].imm = (uintptr_t)prm->xsym[fidx].var;
+		ins[idx + 1].imm =
+			(uint64_t)(uintptr_t)prm->xsym[fidx].var >> 32;
+	}
+
+	return 0;
+}
+
+static int
+check_elf_header(const Elf64_Ehdr * eh)
+{
+	const char *err;
+
+	err = NULL;
+
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	if (eh->e_ident[EI_DATA] != ELFDATA2LSB)
+#else
+	if (eh->e_ident[EI_DATA] != ELFDATA2MSB)
+#endif
+		err = "not native byte order";
+	else if (eh->e_ident[EI_OSABI] != ELFOSABI_NONE)
+		err = "unexpected OS ABI";
+	else if (eh->e_type != ET_REL)
+		err = "unexpected ELF type";
+	else if (eh->e_machine != EM_NONE && eh->e_machine != EM_BPF)
+		err = "unexpected machine type";
+
+	if (err != NULL) {
+		RTE_BPF_LOG(ERR, "%s(): %s\n", __func__, err);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * helper function, find executable section by name.
+ */
+static int
+find_elf_code(Elf *elf, const char *section, Elf_Data **psd, size_t *pidx)
+{
+	Elf_Scn *sc;
+	const Elf64_Ehdr *eh;
+	const Elf64_Shdr *sh;
+	Elf_Data *sd;
+	const char *sn;
+	int32_t rc;
+
+	eh = elf64_getehdr(elf);
+	if (eh == NULL) {
+		rc = elf_errno();
+		RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n",
+			__func__, elf, section, rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	if (check_elf_header(eh) != 0)
+		return -EINVAL;
+
+	/* find given section by name */
+	for (sc = elf_nextscn(elf, NULL); sc != NULL;
+			sc = elf_nextscn(elf, sc)) {
+		sh = elf64_getshdr(sc);
+		sn = elf_strptr(elf, eh->e_shstrndx, sh->sh_name);
+		if (sn != NULL && strcmp(section, sn) == 0 &&
+				sh->sh_type == SHT_PROGBITS &&
+				sh->sh_flags == (SHF_ALLOC | SHF_EXECINSTR))
+			break;
+	}
+
+	sd = elf_getdata(sc, NULL);
+	if (sd == NULL || sd->d_size == 0 ||
+			sd->d_size % sizeof(struct bpf_insn) != 0) {
+		rc = elf_errno();
+		RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n",
+			__func__, elf, section, rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	*psd = sd;
+	*pidx = elf_ndxscn(sc);
+	return 0;
+}
+
+/*
+ * helper function to process data from relocation table.
+ */
+static int
+process_reloc(Elf *elf, size_t sym_idx, Elf64_Rel *re, size_t re_sz,
+	struct bpf_insn *ins, size_t ins_sz, const struct rte_bpf_prm *prm)
+{
+	int32_t rc;
+	uint32_t i, n;
+	size_t ofs, sym;
+	const char *sn;
+	const Elf64_Ehdr *eh;
+	Elf_Scn *sc;
+	const Elf_Data *sd;
+	Elf64_Sym *sm;
+
+	eh = elf64_getehdr(elf);
+
+	/* get symtable by section index */
+	sc = elf_getscn(elf, sym_idx);
+	sd = elf_getdata(sc, NULL);
+	if (sd == NULL)
+		return -EINVAL;
+	sm = sd->d_buf;
+
+	n = re_sz / sizeof(re[0]);
+	for (i = 0; i != n; i++) {
+
+		ofs = re[i].r_offset;
+
+		/* retrieve index in the symtable */
+		sym = ELF64_R_SYM(re[i].r_info);
+		if (sym * sizeof(sm[0]) >= sd->d_size)
+			return -EINVAL;
+
+		sn = elf_strptr(elf, eh->e_shstrndx, sm[sym].st_name);
+
+		rc = resolve_xsym(sn, ofs, ins, ins_sz, prm);
+		if (rc != 0) {
+			RTE_BPF_LOG(ERR,
+				"resolve_xsym(%s, %zu) error code: %d\n",
+				sn, ofs, rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * helper function, find relocation information (if any)
+ * and update bpf code.
+ */
+static int
+elf_reloc_code(Elf *elf, Elf_Data *ed, size_t sidx,
+	const struct rte_bpf_prm *prm)
+{
+	Elf64_Rel *re;
+	Elf_Scn *sc;
+	const Elf64_Shdr *sh;
+	const Elf_Data *sd;
+	int32_t rc;
+
+	rc = 0;
+
+	/* walk through all sections */
+	for (sc = elf_nextscn(elf, NULL); sc != NULL && rc == 0;
+			sc = elf_nextscn(elf, sc)) {
+
+		sh = elf64_getshdr(sc);
+
+		/* relocation data for our code section */
+		if (sh->sh_type == SHT_REL && sh->sh_info == sidx) {
+			sd = elf_getdata(sc, NULL);
+			if (sd == NULL || sd->d_size == 0 ||
+					sd->d_size % sizeof(re[0]) != 0)
+				return -EINVAL;
+			rc = process_reloc(elf, sh->sh_link,
+				sd->d_buf, sd->d_size, ed->d_buf, ed->d_size,
+				prm);
+		}
+	}
+
+	return rc;
+}
+
+static struct rte_bpf *
+bpf_load(const struct rte_bpf_prm *prm)
+{
+	uint8_t *buf;
+	struct rte_bpf *bpf;
+	size_t sz, bsz, insz, xsz;
+
+	xsz =  prm->nb_xsym * sizeof(prm->xsym[0]);
+	insz = prm->nb_ins * sizeof(prm->ins[0]);
+	bsz = sizeof(bpf[0]);
+	sz = insz + xsz + bsz;
+
+	buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (buf == MAP_FAILED)
+		return NULL;
+
+	bpf = (void *)buf;
+	bpf->sz = sz;
+
+	memcpy(&bpf->prm, prm, sizeof(bpf->prm));
+
+	memcpy(buf + bsz, prm->xsym, xsz);
+	memcpy(buf + bsz + xsz, prm->ins, insz);
+
+	bpf->prm.xsym = (void *)(buf + bsz);
+	bpf->prm.ins = (void *)(buf + bsz + xsz);
+
+	return bpf;
+}
+
+__rte_experimental struct rte_bpf *
+rte_bpf_load(const struct rte_bpf_prm *prm)
+{
+	struct rte_bpf *bpf;
+	int32_t rc;
+
+	if (prm == NULL || prm->ins == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	bpf = bpf_load(prm);
+	if (bpf == NULL) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+
+	rc = bpf_validate(bpf);
+	if (rc == 0) {
+		bpf_jit(bpf);
+		if (mprotect(bpf, bpf->sz, PROT_READ) != 0)
+			rc = -ENOMEM;
+	}
+
+	if (rc != 0) {
+		rte_bpf_destroy(bpf);
+		rte_errno = -rc;
+		return NULL;
+	}
+
+	return bpf;
+}
+
+static struct rte_bpf *
+bpf_load_elf(const struct rte_bpf_prm *prm, int32_t fd, const char *section)
+{
+	Elf *elf;
+	Elf_Data *sd;
+	size_t sidx;
+	int32_t rc;
+	struct rte_bpf *bpf;
+	struct rte_bpf_prm np;
+
+	elf_version(EV_CURRENT);
+	elf = elf_begin(fd, ELF_C_READ, NULL);
+
+	rc = find_elf_code(elf, section, &sd, &sidx);
+	if (rc == 0)
+		rc = elf_reloc_code(elf, sd, sidx, prm);
+
+	if (rc == 0) {
+		np = prm[0];
+		np.ins = sd->d_buf;
+		np.nb_ins = sd->d_size / sizeof(struct bpf_insn);
+		bpf = rte_bpf_load(&np);
+	} else {
+		bpf = NULL;
+		rte_errno = -rc;
+	}
+
+	elf_end(elf);
+	return bpf;
+}
+
+__rte_experimental struct rte_bpf *
+rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
+	const char *sname)
+{
+	int32_t fd, rc;
+	struct rte_bpf *bpf;
+
+	if (prm == NULL || fname == NULL || sname == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	fd = open(fname, O_RDONLY);
+	if (fd < 0) {
+		rc = errno;
+		RTE_BPF_LOG(ERR, "%s(%s) error code: %d(%s)\n",
+			__func__, fname, rc, strerror(rc));
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	bpf = bpf_load_elf(prm, fd, sname);
+	close(fd);
+
+	if (bpf == NULL) {
+		RTE_BPF_LOG(ERR,
+			"%s(fname=\"%s\", sname=\"%s\") failed, "
+			"error code: %d\n",
+			__func__, fname, sname, rte_errno);
+		return NULL;
+	}
+
+	RTE_BPF_LOG(INFO, "%s(fname=\"%s\", sname=\"%s\") "
+		"successfully creates %p(jit={.func=%p,.sz=%zu});\n",
+		__func__, fname, sname, bpf, bpf->jit.func, bpf->jit.sz);
+	return bpf;
+}
diff --git a/lib/librte_bpf/bpf_validate.c b/lib/librte_bpf/bpf_validate.c
new file mode 100644
index 000000000..1911e1381
--- /dev/null
+++ b/lib/librte_bpf/bpf_validate.c
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#include "bpf_impl.h"
+
+/*
+ * dummy one for now, need more work.
+ */
+int
+bpf_validate(struct rte_bpf *bpf)
+{
+	int32_t rc, ofs, stack_sz;
+	uint32_t i, op, dr;
+	const struct bpf_insn *ins;
+
+	rc = 0;
+	stack_sz = 0;
+	for (i = 0; i != bpf->prm.nb_ins; i++) {
+
+		ins = bpf->prm.ins + i;
+		op = ins->code;
+		dr = ins->dst_reg;
+		ofs = ins->off;
+
+		if ((BPF_CLASS(op) == BPF_STX || BPF_CLASS(op) == BPF_ST) &&
+				dr == BPF_REG_10) {
+			ofs -= sizeof(uint64_t);
+			stack_sz = RTE_MIN(ofs, stack_sz);
+		}
+	}
+
+	if (stack_sz != 0) {
+		stack_sz = -stack_sz;
+		if (stack_sz > MAX_BPF_STACK_SIZE)
+			rc = -ERANGE;
+		else
+			bpf->stack_sz = stack_sz;
+	}
+
+	if (rc != 0)
+		RTE_BPF_LOG(ERR, "%s(%p) failed, error code: %d;\n",
+			__func__, bpf, rc);
+	return rc;
+}
diff --git a/lib/librte_bpf/meson.build b/lib/librte_bpf/meson.build
new file mode 100644
index 000000000..05c48c7ff
--- /dev/null
+++ b/lib/librte_bpf/meson.build
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+allow_experimental_apis = true
+sources = files('bpf.c',
+		'bpf_exec.c',
+		'bpf_load.c',
+		'bpf_validate.c')
+
+install_headers = files('rte_bpf.h')
+
+deps += ['mbuf', 'net']
+
+dep = dependency('libelf', required: false)
+if dep.found() == false
+	build = false
+endif
+ext_deps += dep
diff --git a/lib/librte_bpf/rte_bpf.h b/lib/librte_bpf/rte_bpf.h
new file mode 100644
index 000000000..825621404
--- /dev/null
+++ b/lib/librte_bpf/rte_bpf.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_BPF_H_
+#define _RTE_BPF_H_
+
+/**
+ * @file
+ *
+ * RTE BPF support.
+ * librte_bpf provides a framework to load and execute eBPF bytecode
+ * inside user-space dpdk based applications.
+ * It supports basic set of features from eBPF spec
+ * (https://www.kernel.org/doc/Documentation/networking/filter.txt).
+ */
+
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <bpf_def.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Possible types for external symbols.
+ */
+enum rte_bpf_xtype {
+	RTE_BPF_XTYPE_FUNC, /**< function */
+	RTE_BPF_XTYPE_VAR, /**< variable */
+	RTE_BPF_XTYPE_NUM
+};
+
+/**
+ * Definition for external symbols available in the BPF program.
+ */
+struct rte_bpf_xsym {
+	const char *name;        /**< name */
+	enum rte_bpf_xtype type; /**< type */
+	union {
+		uint64_t (*func)(uint64_t, uint64_t, uint64_t,
+				uint64_t, uint64_t);
+		void *var;
+	}; /**< value */
+};
+
+/**
+ * Possible BPF program types.
+ * Use negative values for DPDK specific prog-types, to make sure they will
+ * not interfere with Linux related ones.
+ */
+enum rte_bpf_prog_type {
+	RTE_BPF_PROG_TYPE_UNSPEC = BPF_PROG_TYPE_UNSPEC,
+	/**< input is a pointer to raw data */
+	RTE_BPF_PROG_TYPE_MBUF = INT32_MIN,
+	/**< input is a pointer to rte_mbuf */
+};
+
+/**
+ * Input parameters for loading eBPF code.
+ */
+struct rte_bpf_prm {
+	const struct bpf_insn *ins; /**< array of eBPF instructions */
+	uint32_t nb_ins;            /**< number of instructions in ins */
+	const struct rte_bpf_xsym *xsym;
+	/**< array of external symbols that eBPF code is allowed to reference */
+	uint32_t nb_xsym; /**< number of elements in xsym */
+	enum rte_bpf_prog_type prog_type; /**< eBPF program type */
+};
+
+/**
+ * Information about compiled into native ISA eBPF code.
+ */
+struct rte_bpf_jit {
+	uint64_t (*func)(void *); /**< JIT-ed native code */
+	size_t sz;                /**< size of JIT-ed code */
+};
+
+struct rte_bpf;
+
+/**
+ * De-allocate all memory used by this eBPF execution context.
+ *
+ * @param bpf
+ *   BPF handle to destroy.
+ */
+void rte_bpf_destroy(struct rte_bpf *bpf);
+
+/**
+ * Create a new eBPF execution context and load given BPF code into it.
+ *
+ * @param prm
+ *  Parameters used to create and initialise the BPF exeution context.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENOMEM - can't reserve enough memory
+ */
+struct rte_bpf *rte_bpf_load(const struct rte_bpf_prm *prm);
+
+/**
+ * Create a new eBPF execution context and load BPF code from given ELF
+ * file into it.
+ *
+ * @param prm
+ *  Parameters used to create and initialise the BPF exeution context.
+ * @param fname
+ *  Pathname for a ELF file.
+ * @param sname
+ *  Name of the executable section within the file to load.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENOMEM - can't reserve enough memory
+ */
+struct rte_bpf *rte_bpf_elf_load(const struct rte_bpf_prm *prm,
+	const char *fname, const char *sname);
+
+/**
+ * Execute given BPF bytecode.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   pointer to input context.
+ * @return
+ *   BPF execution return value.
+ */
+uint64_t rte_bpf_exec(const struct rte_bpf *bpf, void *ctx);
+
+/**
+ * Execute given BPF bytecode over a set of input contexts.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   array of pointers to the input contexts.
+ * @param rc
+ *   array of return values (one per input).
+ * @param num
+ *   number of elements in ctx[] (and rc[]).
+ * @return
+ *   number of successfully processed inputs.
+ */
+uint32_t rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[],
+	uint64_t rc[], uint32_t num);
+
+/**
+ * Provide information about natively compield code for given BPF handle.
+ *
+ * @param bpf
+ *   handle for the BPF code.
+ * @param jit
+ *   pointer to the rte_bpf_jit structure to be filled with related data.
+ * @return
+ *   - -EINVAL if the parameters are invalid.
+ *   - Zero if operation completed successfully.
+ */
+int rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_BPF_H_ */
diff --git a/lib/librte_bpf/rte_bpf_version.map b/lib/librte_bpf/rte_bpf_version.map
new file mode 100644
index 000000000..ff65144df
--- /dev/null
+++ b/lib/librte_bpf/rte_bpf_version.map
@@ -0,0 +1,12 @@
+EXPERIMENTAL {
+	global:
+
+	rte_bpf_destroy;
+	rte_bpf_elf_load;
+	rte_bpf_exec;
+	rte_bpf_exec_burst;
+	rte_bpf_get_jit;
+	rte_bpf_load;
+
+	local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index ef6159170..7ff7aaaa5 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -23,7 +23,7 @@ libraries = [ 'compat', # just a header, used for versioning
 	# add pkt framework libs which use other libs from above
 	'port', 'table', 'pipeline',
 	# flow_classify lib depends on pkt framework table lib
-	'flow_classify']
+	'flow_classify', 'bpf']
 
 foreach l:libraries
 	build = true
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 258590819..405a13147 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -83,6 +83,8 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_POWER)          += -lrte_power
 _LDLIBS-$(CONFIG_RTE_LIBRTE_TIMER)          += -lrte_timer
 _LDLIBS-$(CONFIG_RTE_LIBRTE_EFD)            += -lrte_efd
 
+_LDLIBS-$(CONFIG_RTE_LIBRTE_BPF)            += -lrte_bpf -lelf
+
 _LDLIBS-y += --whole-archive
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
-- 
2.13.6

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v4 0/5] introduce new tunnel types
                     ` (2 preceding siblings ...)
  2018-04-12  7:33  3% ` [dpdk-dev] [PATCH v3 0/5] introduce new tunnel types Xueming Li
@ 2018-04-13 11:02  3% ` Xueming Li
  3 siblings, 0 replies; 200+ results
From: Xueming Li @ 2018-04-13 11:02 UTC (permalink / raw)
  To: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Adrien Mazarguil
  Cc: Xueming Li, Nelio Laranjeiro, Shahaf Shuler, dev, Olivier Matz

v4:
- Update testpmd doc for the flow VXLAN-GPE parameter.
v3:
- Change VXLAN-GPE definition order to avoid an ABI compatibility issue.
v2:
- Split the patch set into two series, public and mlx5; this one is the first.
v1:
- Support new tunnel types MPLS-in-GRE and MPLS-in-UDP
- Remove the deprecation notice for RSS level

This patchset introduces new tunnel types and related testpmd code:
- New tunnel type VXLAN-GPE
  https://datatracker.ietf.org/doc/draft-ietf-nvo3-vxlan-gpe/
- New tunnel type MPLS-in-GRE
  https://tools.ietf.org/html/rfc4023
- New tunnel type MPLS-in-UDP
  https://tools.ietf.org/html/rfc7510
- Support GRE extension in testpmd csum forwarding engine
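
For applications consuming these types, here is a minimal sketch of checking
the reported tunnel packet type on a received mbuf. The RTE_PTYPE_TUNNEL_*
names are assumed from the commit titles and should be checked against the
rte_mbuf_ptype.h changes in this series:

#include <rte_mbuf.h>
#include <rte_mbuf_ptype.h>

/* Return 1 if the PMD classified the packet as one of the tunnel
 * encapsulations introduced by this series (type names assumed, see above).
 */
static int
pkt_is_new_tunnel(const struct rte_mbuf *m)
{
	uint32_t tnl = m->packet_type & RTE_PTYPE_TUNNEL_MASK;

	return tnl == RTE_PTYPE_TUNNEL_VXLAN_GPE ||
		tnl == RTE_PTYPE_TUNNEL_MPLS_IN_GRE ||
		tnl == RTE_PTYPE_TUNNEL_MPLS_IN_UDP;
}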

Xueming Li (5):
  doc: remove RSS configuration change announcement
  ethdev: introduce new tunnel VXLAN-GPE
  ethdev: introduce tunnel type MPLS-in-GRE and MPLS-in-UDP
  app/testpmd: introduce new tunnel VXLAN-GPE
  app/testpmd: add more GRE extension support to csum engine

 app/test-pmd/cmdline_flow.c                 |  24 +++++++
 app/test-pmd/config.c                       |   2 +
 app/test-pmd/csumonly.c                     | 103 +++++++++++++++++++++++++---
 app/test-pmd/parameters.c                   |  12 +++-
 app/test-pmd/testpmd.h                      |   2 +
 doc/guides/prog_guide/rte_flow.rst          |  12 ++++
 doc/guides/rel_notes/deprecation.rst        |   4 --
 doc/guides/testpmd_app_ug/run_app.rst       |   5 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   4 ++
 lib/librte_ether/rte_eth_ctrl.h             |   3 +-
 lib/librte_ether/rte_flow.c                 |   1 +
 lib/librte_ether/rte_flow.h                 |  27 ++++++++
 lib/librte_mbuf/rte_mbuf.c                  |   3 +
 lib/librte_mbuf/rte_mbuf.h                  |   1 +
 lib/librte_mbuf/rte_mbuf_ptype.c            |   3 +
 lib/librte_mbuf/rte_mbuf_ptype.h            |  47 +++++++++++++
 lib/librte_net/rte_ether.h                  |  25 +++++++
 17 files changed, 261 insertions(+), 17 deletions(-)

-- 
2.13.3

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v2 00/13] eal: replace calls to rte_panic and refrain from new instances
  2018-04-04 22:01  3% [dpdk-dev] [PATCH v2 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
@ 2018-04-13  9:16  0% ` Burakov, Anatoly
  0 siblings, 0 replies; 200+ results
From: Burakov, Anatoly @ 2018-04-13  9:16 UTC (permalink / raw)
  To: Arnon Warshavsky, thomas, wenzhuo.lu, declan.doherty,
	jerin.jacob, bruce.richardson, ferruh.yigit
  Cc: dev

On 04-Apr-18 11:01 PM, Arnon Warshavsky wrote:
> 
> The purpose of this patch series is to clean up the library code
> from paths that end up aborting the process,
> and move to checking error values, in order to allow the running process
> to perform an orderly teardown or other mitigation of the event.
> 
> This patch modifies the majority of rte_panic calls
> under lib and drivers, and replaces them with a log message
> and an error return code according to context,
> that can be propagated up the call stack.
> 
> - Focus was given to the dpdk initialization path
> - Some of the panic calls within drivers were left in place where
>    the call is from within an interrupt or calls that are
>    on the data path, where there is no simple applicative
>    route to propagate the error to termination.
>    These should be handled by the driver maintainers.
> - In order to avoid breaking ABI where panic was called from public
>    void functions, a panic state variable was introduced so that
>    it can be queried after calling these void functions.
>    This took place for a single function call.
> - local void functions with no API were changed to return a value
>    where needed
> - No change took place in example and test files
> - No change took place for debug assertions calling panic
> - A new function was added to devtools/checkpatches.sh
>    in order to prevent new additions of calls to rte_panic
>    under lib and drivers.
> 
> Keep calm and don't panic
> 
> ---
> 
> v2:
> - reformat error messages so that literal string are in the same line
> - fix typo in commit message
> - add new return code to doxygen of rte_memzone_free()

Hi Arnon,

When sending new versions, the entire patchset must be sent, as that makes it
easier for maintainers to apply patches.

-- 
Thanks,
Anatoly

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-12 16:34  0%                     ` Ananyev, Konstantin
@ 2018-04-12 18:58  0%                       ` Yongseok Koh
  0 siblings, 0 replies; 200+ results
From: Yongseok Koh @ 2018-04-12 18:58 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Olivier Matz, Lu, Wenzhuo, Wu, Jingjing, Adrien Mazarguil,
	Nélio Laranjeiro, dev

On Thu, Apr 12, 2018 at 04:34:56PM +0000, Ananyev, Konstantin wrote:
> > >
> > > > > >
> > > > > > On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > > > > > > Hi Yongseok,
> > > > > > >
> > > > > > > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > > > > > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > > > > > > Hi,
> > > > > > > > >
> > > > > > > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > > > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > > > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > > > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > > > > > > rte_pktmbuf_attach_at().
> > > > > > > > > >
> > > > > > > > > > Possible use-cases could be:
> > > > > > > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > > > > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > > > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > > > > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > > > > > > >
> > > > > > > > > I think the current API is already able to do what you want.
> > > > > > > > >
> > > > > > > > > 1/ Here is a mbuf m with its data
> > > > > > > > >
> > > > > > > > >                off
> > > > > > > > >                <-->
> > > > > > > > >                       len
> > > > > > > > >           +----+   <---------->
> > > > > > > > >           |    |
> > > > > > > > >         +-|----v----------------------+
> > > > > > > > >         | |    -----------------------|
> > > > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > > > >         |      -----------------------|
> > > > > > > > >         +-----------------------------+
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > 2/ clone m:
> > > > > > > > >
> > > > > > > > >   c = rte_pktmbuf_alloc(pool);
> > > > > > > > >   rte_pktmbuf_attach(c, m);
> > > > > > > > >
> > > > > > > > >   Note that c has its own offset and length fields.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >                off
> > > > > > > > >                <-->
> > > > > > > > >                       len
> > > > > > > > >           +----+   <---------->
> > > > > > > > >           |    |
> > > > > > > > >         +-|----v----------------------+
> > > > > > > > >         | |    -----------------------|
> > > > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > > > >         |      -----------------------|
> > > > > > > > >         +------^----------------------+
> > > > > > > > >                |
> > > > > > > > >           +----+
> > > > > > > > > indirect  |
> > > > > > > > >         +-|---------------------------+
> > > > > > > > >         | |    -----------------------|
> > > > > > > > > c       | buf  |                     ||
> > > > > > > > >         |      -----------------------|
> > > > > > > > >         +-----------------------------+
> > > > > > > > >
> > > > > > > > >                 off    len
> > > > > > > > >                 <--><---------->
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > 3/ remove some data from c without changing m
> > > > > > > > >
> > > > > > > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > > > > > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Please let me know if it fits your needs.
> > > > > > > >
> > > > > > > > No, it doesn't.
> > > > > > > >
> > > > > > > > Trimming head and tail with the current APIs removes data and make the space
> > > > > > > > available. Adjusting packet head means giving more headroom, not shifting the
> > > > > > > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > > > > > > difference offsets in m,
> > > > > > > >
> > > > > > > > rte_pktmbuf_adj(c1, 10);
> > > > > > > > rte_pktmbuf_adj(c2, 20);
> > > > > > > >
> > > > > > > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > > > > > > wants to attach outer header, it will overwrite the headroom even though the
> > > > > > > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > > > > > > header should be linked by h1->next = c2.
> > > > > > >
> > > > > > > Yes, after these operations c1, c2 and m should become read-only. So, to
> > > > > > > prepend headers, another mbuf has to be inserted before as you suggest. It
> > > > > > > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > > > > > > length) that will:
> > > > > > >   - alloc and attach indirect mbuf for each segment of m that is
> > > > > > >     in the range [offset : length+offset].
> > > > > > >   - prepend an empty and writable mbuf for the headers
> > > > > > >
> > > > > > > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > > > > > > which actually shrink the headroom, this case can be properly handled.
> > > > > > >
> > > > > > > What do you mean by properly handled?
> > > > > > >
> > > > > > > Yes, prepending data or adding data in the indirect mbuf won't override
> > > > > > > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > > > > > > won't be protected.
> > > > > > >
> > > > > > > From an application point of view, indirect mbufs, or direct mbufs that
> > > > > > > have refcnt != 1, should be both considered as read-only because they
> > > > > > > may share their data. How an application can know if the data is shared
> > > > > > > or not?
> > > > > > >
> > > > > > > Maybe we need a flag to differentiate mbufs that are read-only
> > > > > > > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > > > > > > understanding is correct, you want to have indirect mbufs with RW data.
> > > > > >
> > > > > > Agree that indirect mbuf must be treated as read-only, Then the current code is
> > > > > > enough to handle that use-case.
> > > > > >
> > > > > > > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > > > > > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > > > > > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > > > > > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > > > > > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > > > > > > indirect referencing.
> > > > >
> > > > > But just to make HW to RX multiple packets into one mbuf,
> > > > > data_off inside indirect mbuf should be enough, correct?
> > > > Right. Current max buffer len of mbuf is 64kB (16bits) but it is enough for mlx5
> > > > to reach to 100Gbps with 64B traffic (149Mpps). I made mlx5 HW put 16 packets in
> > > > a buffer. So, it needs ~32kB buffer. Having more bits in length fields would be
> > > > better but 16-bit is good enough to overcome the PCIe Gen3 bottleneck in order
> > > > to saturate the network link.
> > >
> > > There were few complains that 64KB max is a limitation for some use-cases.
> > > I am not against increasing it, but I don't think we have free space on first cache-line for that
> > > without another big rework of mbuf layout.
> > > Considering that we need to increase size for buf_len, data_off, data_len, and probably priv_size too.
> > >
> > > >
> > > > > As I understand, what you'd like to achieve with this new field -
> > > > > ability to manipulate packet boundaries after RX, probably at upper layer.
> > > > > As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
> > > > > indirect mbufs trying to modify same direct buffer.
> > > >
> > > > I agree that there's an implication that indirect mbuf or mbuf having refcnt > 1
> > > > is read-only. What that means, all the entities which own such mbufs have to be
> > > > aware of that and keep the principle as DPDK can't enforce the rule and there
> > > > can't be such sanity check. In this sense, HW doesn't violate it because the
> > > > direct mbuf is injected to HW before indirection. When packets are written by
> > > > HW, PMD attaches indirect mbufs to the direct mbuf and deliver those to
> > > > application layer with freeing the original direct mbuf (decrement refcnt by 1).
> > > > So, HW doesn't touch the direct buffer once it reaches to upper layer.
> > >
> > > Yes, I understand that. But as I can see you introduced functions to adjust head and tail,
> > > which implies that it should be possible by some entity (upper layer?) to manipulate these
> > > indirect mbufs.
> > > And we don't know how exactly it will be done.
> > 
> > That's a valid concern. I can make it private by merging into the _attach_to()
> > func, or I just can add a comment in the API doc. However, if users are aware
> > that a mbuf is read-only and we expect them to keep it intact by their own
> > judgement, they would/should not use those APIs. We can't stop them modifying
> > content or the buffer itself anyway. Will add more comments of this discussion
> > regarding read-only mode.
> 
> Ok, so these functions are intended to be used only at the PMD level?
> But in that case do you need them at all?
> Isn't it possible to implement the same thing with just data_off?
> I mean your PMD knows in advance what the buf_len of the mbuf is, and at startup
> time it can decide how it is going to slice it into multiple packets.
> So each offset is known in advance and you don't need to worry that you'll overwrite
> a neighbor packet's data.

Since Olivier's last comment, I've been thinking about the approach all over
again. It looks like I'm trapped in self-contradiction. The reason why I didn't
want to use data_off was to provide valid headroom for each Rx packet and let
users freely write the headroom. But, given that indirect mbuf should be
considered read-only, this isn't the right approach. Instead of slicing a buffer
with mbuf indirection and manipulating boundaries, the idea of external data (as
Olivier suggested) would fit better. Even though it is more complex, it is
doable. I summarized ideas yesterday and will come up with a new patch soon.

Briefly, I think reserved bit 61 of ol_flags can be used to indicate an externally
attached mbuf. The following is my initial thought.

#define EXT_ATTACHED_MBUF    (1ULL << 61)

struct rte_pktmbuf_ext_shared_info {
	rte_atomic16_t refcnt;
	void (*free_cb)(void *buf_addr, void *opaque);
	void *opaque; /* argument for free_cb() */
};

rte_pktmbuf_get_ext_shinfo(struct rte_mbuf *m) {
	/* Put shared info at the end of external buffer */
	return (struct rte_pktmbuf_ext_shared_info *)(m->buf_addr + m->buf_len);
}

rte_pktmbuf_attach_ext_buf(m, buf_addr, buf_len, free_cb, opaque) {
	struct rte_pktmbuf_ext_shared_info *shinfo;

	m->buf_addr = buf_addr;
	m->buf_iova = rte_mempool_virt2iova(buf_addr);
	/* Have to add some calculation for alignment */
	m->buf_len = buf_len - sizeof (*shinfo);
	shinfo = m->buf_addr + m->buf_len;
	...
	m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m->buf_len);
	m->ol_flags |= EXT_ATTACHED_MBUF;
	atomic set shinfo->refcnt = 1;

	shinfo->free_cb = free_cb;
	shinfo->opaque = opaque;

	...
}
rte_pktmbuf_detach_ext_buf(m)

#define RTE_MBUF_EXT(mb)   ((mb)->ol_flags & EXT_ATTACHED_MBUF)

In rte_pktmbuf_prefree_seg(),

		if (RTE_MBUF_INDIRECT(m))
			rte_pktmbuf_detach(m);
		else if (RTE_MBUF_EXT(m))
			rte_pktmbuf_detach_ext_buf(m);

And in rte_pktmbuf_attach(), if the mbuf being attached to is externally attached,
then just increase refcnt in shinfo so that multiple mbufs can refer to the same
external buffer.
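
For illustration, a purely hypothetical usage sketch based on the names above
(rte_pktmbuf_attach_ext_buf() does not exist yet and the free_cb() signature is
only an assumption on my side):

/* Hypothetical example only, relying on the proposed API sketched above. */
static void
ext_buf_free_cb(void *buf_addr, void *opaque)
{
	RTE_SET_USED(opaque);
	/* Assuming the external area was allocated with rte_malloc(). */
	rte_free(buf_addr);
}

static struct rte_mbuf *
mbuf_with_ext_buf(struct rte_mempool *mp, void *buf, uint16_t buf_len)
{
	struct rte_mbuf *m = rte_pktmbuf_alloc(mp);

	if (m == NULL)
		return NULL;
	/* shinfo (refcnt, free_cb, opaque) is placed at the end of buf as
	 * described above; refcnt starts at 1 and buf is released through
	 * free_cb() once the last reference is gone. */
	rte_pktmbuf_attach_ext_buf(m, buf, buf_len, ext_buf_free_cb, NULL);
	return m;
}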

Please feel free to share any concern/idea.

> > > > The direct buffer will be freed and get available for reuse when all the attached
> > > > indirect mbufs are freed.
> > > >
> > > > > Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
> > > > > Fields for indirect mbufs, straight after attach()?
> > > >
> > > > Good point.
> > > > Actually that was my draft (Mellanox internal) version of this patch :-) But I
> > > > had to consider a case where priv_size is really given by user. Even though it
> > > > is less likely, but if original priv_size is quite big, it can't cover entire
> > > > buf_len. For this, I had to increase priv_size to 32-bit but adding another
> > > > 16bit field (buf_off) looked more plausible.
> > >
> > > As I remember, we can't have mbufs bigger then 64K,
> > > so priv_size + buf_len should be always less than 64K, correct?
> > 
> > Can you let me know where I can find the constraint? I checked
> > rte_pktmbuf_pool_create() and rte_pktmbuf_init() again to not make any mistake
> > but there's no such limitation.
> > 
> > 	elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size +
> > 		(unsigned)data_room_size;
> 
> 
> Ok I scanned through librte_mbuf and didn't find any limitations.
> Seems like a false impression from my side.
> Anyway that seems like a corner case to have priv_size + buf_len > 64KB.
> Do you really need to support it?

If a user must have a 64kB buffer (which is valid, no violation) and priv_size is
just a few bytes, does the library have to force the user to sacrifice a few
bytes for priv_size? Do you really think it's just a corner case? Either way,
using priv_size for this doesn't seem to be a good idea.

Yongseok

> > The max of data_room_size is 64kB, so is priv_size. m->buf_addr starts from 'm +
> > sizeof(*m) + priv_size' and m->buf_len can't be larger than UINT16_MAX. So,
> > priv_size couldn't be used for this purpose.
> > 
> > Yongseok
> > 
> > > > > > > >
> > > > > > > > Does this make sense?
> > > > > > >
> > > > > > > I understand the need.
> > > > > > >
> > > > > > > Another option would be to make the mbuf->buffer point to an external
> > > > > > > buffer (not inside the direct mbuf). This would require to add a
> > > > > > > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > > > > > > a quick overview.
> > > > > > >
> > > > > > > [1]
> > > > > > > https://dpdksummit.com/Archive/pdf/2016Userspace/Day01-Session05-OlivierMatz-Userspace2016.pdf
> > > > > > >
> > > > > > > The advantage is that it does not require the large data to be inside a
> > > > > > > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > > > > > > allocated from a mempool). On the other hand, it is maybe more complex
> > > > > > > to implement compared to your solution.
> > > > > >
> > > > > > I knew that you presented the slides and frankly, I had considered that option
> > > > > > at first. But even with that option, metadata to store refcnt should also be
> > > > > > allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> > > > > > end of the data segment. Even though it could have smaller metadata structure,
> > > > > > I just wanted to make full use of the existing framework because it is less
> > > > > > complex as you mentioned. Given that you presented the idea of external data
> > > > > > buffer in 2016 and there hasn't been many follow-up discussions/activities so
> > > > > > far, I thought the demand isn't so big yet thus I wanted to make this patch
> > > > > > simpler.  I personally think that we can take the idea of external data seg when
> > > > > > more demands come from users in the future as it would be a huge change and may
> > > > > > break current ABI/API. When the day comes, I'll gladly participate in the
> > > > > > discussions and write codes for it if I can be helpful.
> > > > > >
> > > > > > Do you think this patch is okay for now?
> > > > > >
> > > > > >
> > > > > > Thanks for your comments,
> > > > > > Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-11 17:08  0%                   ` Yongseok Koh
@ 2018-04-12 16:34  0%                     ` Ananyev, Konstantin
  2018-04-12 18:58  0%                       ` Yongseok Koh
  0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2018-04-12 16:34 UTC (permalink / raw)
  To: Yongseok Koh
  Cc: Olivier Matz, Lu, Wenzhuo, Wu, Jingjing, Adrien Mazarguil,
	Nélio Laranjeiro, dev

> >
> > > > >
> > > > > On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > > > > > Hi Yongseok,
> > > > > >
> > > > > > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > > > > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > > > > > Hi,
> > > > > > > >
> > > > > > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > > > > > rte_pktmbuf_attach_at().
> > > > > > > > >
> > > > > > > > > Possible use-cases could be:
> > > > > > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > > > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > > > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > > > > > >
> > > > > > > > I think the current API is already able to do what you want.
> > > > > > > >
> > > > > > > > 1/ Here is a mbuf m with its data
> > > > > > > >
> > > > > > > >                off
> > > > > > > >                <-->
> > > > > > > >                       len
> > > > > > > >           +----+   <---------->
> > > > > > > >           |    |
> > > > > > > >         +-|----v----------------------+
> > > > > > > >         | |    -----------------------|
> > > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > > >         |      -----------------------|
> > > > > > > >         +-----------------------------+
> > > > > > > >
> > > > > > > >
> > > > > > > > 2/ clone m:
> > > > > > > >
> > > > > > > >   c = rte_pktmbuf_alloc(pool);
> > > > > > > >   rte_pktmbuf_attach(c, m);
> > > > > > > >
> > > > > > > >   Note that c has its own offset and length fields.
> > > > > > > >
> > > > > > > >
> > > > > > > >                off
> > > > > > > >                <-->
> > > > > > > >                       len
> > > > > > > >           +----+   <---------->
> > > > > > > >           |    |
> > > > > > > >         +-|----v----------------------+
> > > > > > > >         | |    -----------------------|
> > > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > > >         |      -----------------------|
> > > > > > > >         +------^----------------------+
> > > > > > > >                |
> > > > > > > >           +----+
> > > > > > > > indirect  |
> > > > > > > >         +-|---------------------------+
> > > > > > > >         | |    -----------------------|
> > > > > > > > c       | buf  |                     ||
> > > > > > > >         |      -----------------------|
> > > > > > > >         +-----------------------------+
> > > > > > > >
> > > > > > > >                 off    len
> > > > > > > >                 <--><---------->
> > > > > > > >
> > > > > > > >
> > > > > > > > 3/ remove some data from c without changing m
> > > > > > > >
> > > > > > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > > > > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > > > > > >
> > > > > > > >
> > > > > > > > Please let me know if it fits your needs.
> > > > > > >
> > > > > > > No, it doesn't.
> > > > > > >
> > > > > > > Trimming head and tail with the current APIs removes data and make the space
> > > > > > > available. Adjusting packet head means giving more headroom, not shifting the
> > > > > > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > > > > > difference offsets in m,
> > > > > > >
> > > > > > > rte_pktmbuf_adj(c1, 10);
> > > > > > > rte_pktmbuf_adj(c2, 20);
> > > > > > >
> > > > > > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > > > > > wants to attach outer header, it will overwrite the headroom even though the
> > > > > > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > > > > > header should be linked by h1->next = c2.
> > > > > >
> > > > > > Yes, after these operations c1, c2 and m should become read-only. So, to
> > > > > > prepend headers, another mbuf has to be inserted before as you suggest. It
> > > > > > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > > > > > length) that will:
> > > > > >   - alloc and attach indirect mbuf for each segment of m that is
> > > > > >     in the range [offset : length+offset].
> > > > > >   - prepend an empty and writable mbuf for the headers
> > > > > >
> > > > > > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > > > > > which actually shrink the headroom, this case can be properly handled.
> > > > > >
> > > > > > What do you mean by properly handled?
> > > > > >
> > > > > > Yes, prepending data or adding data in the indirect mbuf won't override
> > > > > > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > > > > > won't be protected.
> > > > > >
> > > > > > From an application point of view, indirect mbufs, or direct mbufs that
> > > > > > have refcnt != 1, should be both considered as read-only because they
> > > > > > may share their data. How an application can know if the data is shared
> > > > > > or not?
> > > > > >
> > > > > > Maybe we need a flag to differentiate mbufs that are read-only
> > > > > > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > > > > > understanding is correct, you want to have indirect mbufs with RW data.
> > > > >
> > > > > Agree that indirect mbuf must be treated as read-only, Then the current code is
> > > > > enough to handle that use-case.
> > > > >
> > > > > > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > > > > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > > > > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > > > > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > > > > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > > > > > indirect referencing.
> > > >
> > > > But just to make HW to RX multiple packets into one mbuf,
> > > > data_off inside indirect mbuf should be enough, correct?
> > > Right. Current max buffer len of mbuf is 64kB (16bits) but it is enough for mlx5
> > > to reach to 100Gbps with 64B traffic (149Mpps). I made mlx5 HW put 16 packets in
> > > a buffer. So, it needs ~32kB buffer. Having more bits in length fields would be
> > > better but 16-bit is good enough to overcome the PCIe Gen3 bottleneck in order
> > > to saturate the network link.
> >
> > There were few complains that 64KB max is a limitation for some use-cases.
> > I am not against increasing it, but I don't think we have free space on first cache-line for that
> > without another big rework of mbuf layout.
> > Considering that we need to increase size for buf_len, data_off, data_len, and probably priv_size too.
> >
> > >
> > > > As I understand, what you'd like to achieve with this new field -
> > > > ability to manipulate packet boundaries after RX, probably at upper layer.
> > > > As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
> > > > indirect mbufs trying to modify same direct buffer.
> > >
> > > I agree that there's an implication that indirect mbuf or mbuf having refcnt > 1
> > > is read-only. What that means, all the entities which own such mbufs have to be
> > > aware of that and keep the principle as DPDK can't enforce the rule and there
> > > can't be such sanity check. In this sense, HW doesn't violate it because the
> > > direct mbuf is injected to HW before indirection. When packets are written by
> > > HW, PMD attaches indirect mbufs to the direct mbuf and deliver those to
> > > application layer with freeing the original direct mbuf (decrement refcnt by 1).
> > > So, HW doesn't touch the direct buffer once it reaches to upper layer.
> >
> > Yes, I understand that. But as I can see you introduced functions to adjust head and tail,
> > which implies that it should be possible by some entity (upper layer?) to manipulate these
> > indirect mbufs.
> > And we don't know how exactly it will be done.
> 
> That's a valid concern. I can make it private by merging into the _attach_to()
> func, or I just can add a comment in the API doc. However, if users are aware
> that a mbuf is read-only and we expect them to keep it intact by their own
> judgement, they would/should not use those APIs. We can't stop them modifying
> content or the buffer itself anyway. Will add more comments of this discussion
> regarding read-only mode.

Ok, so these functions are intended to be used only at the PMD level?
But in that case do you need them at all?
Isn't it possible to implement the same thing with just data_off?
I mean your PMD knows in advance what the buf_len of the mbuf is, and at startup
time it can decide how it is going to slice it into multiple packets.
So each offset is known in advance and you don't need to worry that you'll overwrite
a neighbor packet's data.
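
Just to illustrate what I mean, a rough and untested sketch using only the
existing mbuf fields (no new API):

/* Carve one big direct mbuf into fixed-size packet slots using only
 * data_off on indirect mbufs; offsets are decided once at startup. */
static int
slice_direct_mbuf(struct rte_mbuf *md, struct rte_mempool *mp,
		struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t slot_size)
{
	uint16_t i;

	for (i = 0; i != nb_pkts; i++) {
		struct rte_mbuf *mi = rte_pktmbuf_alloc(mp);

		if (mi == NULL)
			return -ENOMEM;
		rte_pktmbuf_attach(mi, md);
		/* Slots never overlap since each offset is known in advance. */
		mi->data_off = i * slot_size;
		mi->data_len = 0; /* filled in once HW has written the packet */
		pkts[i] = mi;
	}
	/* Drop the initial reference; md is actually freed only when the
	 * last indirect mbuf is freed. */
	rte_pktmbuf_free(md);
	return 0;
}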

> 
> > > The direct buffer will be freed and get available for reuse when all the attached
> > > indirect mbufs are freed.
> > >
> > > > Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
> > > > Fields for indirect mbufs, straight after attach()?
> > >
> > > Good point.
> > > Actually that was my draft (Mellanox internal) version of this patch :-) But I
> > > had to consider a case where priv_size is really given by user. Even though it
> > > is less likely, but if original priv_size is quite big, it can't cover entire
> > > buf_len. For this, I had to increase priv_size to 32-bit but adding another
> > > 16bit field (buf_off) looked more plausible.
> >
> > As I remember, we can't have mbufs bigger then 64K,
> > so priv_size + buf_len should be always less than 64K, correct?
> 
> Can you let me know where I can find the constraint? I checked
> rte_pktmbuf_pool_create() and rte_pktmbuf_init() again to not make any mistake
> but there's no such limitation.
> 
> 	elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size +
> 		(unsigned)data_room_size;


Ok I scanned through librte_mbuf and didn't find any limitations.
Seems like a false impression from my side.
Anyway that seems like a corner case to have priv_size + buf_len > 64KB.
Do you really need to support it?

Konstantin

> 
> The max of data_room_size is 64kB, so is priv_size. m->buf_addr starts from 'm +
> sizeof(*m) + priv_size' and m->buf_len can't be larger than UINT16_MAX. So,
> priv_size couldn't be used for this purpose.
> 
> Yongseok
> 
> > > > > > >
> > > > > > > Does this make sense?
> > > > > >
> > > > > > I understand the need.
> > > > > >
> > > > > > Another option would be to make the mbuf->buffer point to an external
> > > > > > buffer (not inside the direct mbuf). This would require to add a
> > > > > > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > > > > > a quick overview.
> > > > > >
> > > > > > [1]
> > > > > > https://dpdksummit.com/Archive/pdf/2016Userspace/Day01-Session05-OlivierMatz-Userspace2016.pdf
> > > > > >
> > > > > > The advantage is that it does not require the large data to be inside a
> > > > > > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > > > > > allocated from a mempool). On the other hand, it is maybe more complex
> > > > > > to implement compared to your solution.
> > > > >
> > > > > I knew that you presented the slides and frankly, I had considered that option
> > > > > at first. But even with that option, metadata to store refcnt should also be
> > > > > allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> > > > > end of the data segment. Even though it could have smaller metadata structure,
> > > > > I just wanted to make full use of the existing framework because it is less
> > > > > complex as you mentioned. Given that you presented the idea of external data
> > > > > buffer in 2016 and there hasn't been many follow-up discussions/activities so
> > > > > far, I thought the demand isn't so big yet thus I wanted to make this patch
> > > > > simpler.  I personally think that we can take the idea of external data seg when
> > > > > more demands come from users in the future as it would be a huge change and may
> > > > > break current ABI/API. When the day comes, I'll gladly participate in the
> > > > > discussions and write codes for it if I can be helpful.
> > > > >
> > > > > Do you think this patch is okay for now?
> > > > >
> > > > >
> > > > > Thanks for your comments,
> > > > > Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 04/11] mempool: add op to calculate memory size to be allocated
    2018-04-04 15:08  0%     ` santosh
  2018-04-06 15:51  0%     ` Olivier Matz
@ 2018-04-12 15:22  0%     ` Burakov, Anatoly
  2 siblings, 0 replies; 200+ results
From: Burakov, Anatoly @ 2018-04-12 15:22 UTC (permalink / raw)
  To: Andrew Rybchenko, dev; +Cc: Olivier MATZ

On 26-Mar-18 5:09 PM, Andrew Rybchenko wrote:
> Size of memory chunk required to populate mempool objects depends
> on how objects are stored in the memory. Different mempool drivers
> may have different requirements and a new operation allows to
> calculate memory size in accordance with driver requirements and
> advertise requirements on minimum memory chunk size and alignment
> in a generic way.
> 
> Bump ABI version since the patch breaks it.
> 
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
> ---

Hi Andrew,

<...>

> -	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>   	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> -		size = rte_mempool_xmem_size(n, total_elt_sz, pg_shift,
> -						mp->flags);
> +		size_t min_chunk_size;
> +
> +		mem_size = rte_mempool_ops_calc_mem_size(mp, n, pg_shift,
> +				&min_chunk_size, &align);
> +		if (mem_size < 0) {
> +			ret = mem_size;
> +			goto fail;
> +		}
>   
>   		ret = snprintf(mz_name, sizeof(mz_name),
>   			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
> @@ -606,7 +600,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   			goto fail;
>   		}
>   
> -		mz = rte_memzone_reserve_aligned(mz_name, size,
> +		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
>   			mp->socket_id, mz_flags, align);
>   		/* not enough memory, retry with the biggest zone we have */
>   		if (mz == NULL)
> @@ -617,6 +611,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   			goto fail;
>   		}
>   
> +		if (mz->len < min_chunk_size) {
> +			rte_memzone_free(mz);
> +			ret = -ENOMEM;
> +			goto fail;
> +		}
> +
>   		if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
>   			iova = RTE_BAD_IOVA;

OK by me, but needs to be rebased.

>   		else
> @@ -649,13 +649,14 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   static size_t
>   get_anon_size(const struct rte_mempool *mp)
>   {
> -	size_t size, total_elt_sz, pg_sz, pg_shift;
> +	size_t size, pg_sz, pg_shift;
> +	size_t min_chunk_size;
> +	size_t align;
>   
>   	pg_sz = getpagesize();

<...>

>   
> +/**
> + * Calculate memory size required to store given number of objects.
> + *
> + * If mempool objects are not required to be IOVA-contiguous
> + * (the flag MEMPOOL_F_NO_IOVA_CONTIG is set), min_chunk_size defines
> + * virtually contiguous chunk size. Otherwise, if mempool objects must
> + * be IOVA-contiguous (the flag MEMPOOL_F_NO_IOVA_CONTIG is clear),
> + * min_chunk_size defines IOVA-contiguous chunk size.
> + *
> + * @param[in] mp
> + *   Pointer to the memory pool.
> + * @param[in] obj_num
> + *   Number of objects.
> + * @param[in] pg_shift
> + *   LOG2 of the physical pages size. If set to 0, ignore page boundaries.
> + * @param[out] min_chunk_size
> + *   Location for minimum size of the memory chunk which may be used to
> + *   store memory pool objects.
> + * @param[out] align
> + *   Location for required memory chunk alignment.
> + * @return
> + *   Required memory size aligned at page boundary.
> + */
> +typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
> +		uint32_t obj_num,  uint32_t pg_shift,
> +		size_t *min_chunk_size, size_t *align);
> +
> +/**
> + * Default way to calculate memory size required to store given number of
> + * objects.
> + *
> + * If page boundaries may be ignored, it is just a product of total
> + * object size including header and trailer and number of objects.
> + * Otherwise, it is a number of pages required to store given number of
> + * objects without crossing page boundary.
> + *
> + * Note that if object size is bigger than page size, then it assumes
> + * that pages are grouped in subsets of physically continuous pages big
> + * enough to store at least one object.
> + *
> + * If mempool driver requires object addresses to be block size aligned
> + * (MEMPOOL_F_CAPA_BLK_ALIGNED_OBJECTS), space for one extra element is
> + * reserved to be able to meet the requirement.
> + *
> + * Minimum size of memory chunk is either all required space, if
> + * capabilities say that whole memory area must be physically contiguous
> + * (MEMPOOL_F_CAPA_PHYS_CONTIG), or a maximum of the page size and total
> + * element size.
> + *
> + * Required memory chunk alignment is a maximum of page size and cache
> + * line size.
> + */
> +ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> +		uint32_t obj_num, uint32_t pg_shift,
> +		size_t *min_chunk_size, size_t *align);
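
Just to check I read the default computation right: with e.g. 4 KiB pages
(pg_shift = 12) and total_elt_sz = 1536 B, two objects fit per page, so 1000
objects come out as 500 pages (2 MiB); min_chunk_size would then be
max(page size, total element size) = 4 KiB unless MEMPOOL_F_CAPA_PHYS_CONTIG
requires the whole area, and align max(page size, cache line size) = 4 KiB.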

For API docs and wording,

Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>

Should be pretty straightforward to rebase, so you probably should keep 
my ack for v4.

-- 
Thanks,
Anatoly

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 3/4] ether: add more protocol support in flow API
  2018-04-12  9:19  0%         ` Adrien Mazarguil
@ 2018-04-12 10:00  0%           ` Zhang, Qi Z
  0 siblings, 0 replies; 200+ results
From: Zhang, Qi Z @ 2018-04-12 10:00 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Doherty, Declan, Chandran, Sugesh, Glynn, Michael J, Liu,
	Yu Y, Ananyev, Konstantin, Richardson, Bruce



> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Thursday, April 12, 2018 5:20 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>
> Cc: dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>; Chandran,
> Sugesh <sugesh.chandran@intel.com>; Glynn, Michael J
> <michael.j.glynn@intel.com>; Liu, Yu Y <yu.y.liu@intel.com>; Ananyev,
> Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Subject: Re: [PATCH v2 3/4] ether: add more protocol support in flow API
> 
> On Thu, Apr 12, 2018 at 05:12:08AM +0000, Zhang, Qi Z wrote:
> > Hi Adrien:
> >
> > 	Thank you so much for your careful review and helpful suggestions!
> > 	I agree with most of your comments, except a couple of questions about
> RTE_FLOW_ITEM_TYPE_TGT_ADDR and RTE_FLOW_ITEM_IPV6_EXT_HDR
> > 	Please see my comment inline.
> >
> > Thanks!
> > Qi
> 
> Thanks, replying inline also.
> 
> > > -----Original Message-----
> > > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > > Sent: Thursday, April 12, 2018 12:32 AM
> > > To: Zhang, Qi Z <qi.z.zhang@intel.com>
> > > Cc: dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>;
> > > Chandran, Sugesh <sugesh.chandran@intel.com>; Glynn, Michael J
> > > <michael.j.glynn@intel.com>; Liu, Yu Y <yu.y.liu@intel.com>;
> > > Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson,
> > > Bruce <bruce.richardson@intel.com>
> > > Subject: Re: [PATCH v2 3/4] ether: add more protocol support in flow
> > > API
> > >
> > > On Sun, Apr 01, 2018 at 05:19:21PM -0400, Qi Zhang wrote:
> > > > Add new protocol header match support as below
> > > >
> > > > RTE_FLOW_ITEM_TYPE_ARP
> > > > 	- match IPv4 ARP header
> > > > RTE_FLOW_ITEM_TYPE_EXT_HDR_ANY
> > > > 	- match any IPv6 extension header
> > >
> > > While properly defined in the patch, "IPV6" is missing here.
> > >
> > > > RTE_FLOW_ITEM_TYPE_ICMPV6
> > > > 	- match IPv6 ICMP header
> > > > RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > > > 	- match IPv6 ICMP Target address
> > > > RTE_FLOW_ITEM_TYPE_ICMPV6_SSL
> > > > 	- match IPv6 ICMP Source Link-layer address
> > > > RTE_FLOW_ITEM_TYPE_ICMPV6_TTL
> > > > 	- match IPv6 ICMP Target Link-layer address
> > > >
> > > > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > >
> > > First, since they are added at the end of enum rte_flow_item_type,
> > > no ABI breakage notice is necessary.
> > >
> > > However testpmd implementation [1][2] and documentation update
> > > [3][4] are mandatory for all new pattern items and actions.
> >
> > OK, will add this into v2.
> >
> > >
> > > More comments below regarding these definitions.
> > >
> > > [1] flow_item[] in app/test-pmd/config.c [2] using ITEM_ICMP as an
> > > example in app/test-pmd/cmdline_flow.c [3] "Pattern items" section
> > > in doc/guides/testpmd_app_ug/testpmd_funcs.rst
> > > [4] using "Item: ``ICMP``" section as an example in
> > >     doc/guides/prog_guide/rte_flow.rst
> > >
> > > > ---
> > > >  lib/librte_ether/rte_flow.h | 160
> > > > ++++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 160 insertions(+)
> > > >
> > > > diff --git a/lib/librte_ether/rte_flow.h
> > > > b/lib/librte_ether/rte_flow.h index 8f75db0..a8ec780 100644
> > > > --- a/lib/librte_ether/rte_flow.h
> > > > +++ b/lib/librte_ether/rte_flow.h
> > > > @@ -323,6 +323,49 @@ enum rte_flow_item_type {
> > > >  	 * See struct rte_flow_item_geneve.
> > > >  	 */
> > > >  	RTE_FLOW_ITEM_TYPE_GENEVE,
> > > > +
> > > > +	/**
> > > > +	 * Matches ARP IPv4 header.
> > >
> > > => Matches an IPv4 ARP header.
> > >
> > > > +	 *
> > > > +	 * See struct rte_flow_item_arp.
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_ARP,
> > >
> > > While you're right to make "IPv4" clear since ARP is also used for
> > > other protocols DPDK doesn't support (and likely never will), the
> > > ARP header has both a fixed and a variably-sized part.
> > >
> > > Ideally an ARP pattern item should match the fixed part only and a
> > > separate
> > > ARP_IPV4 match its payload, somewhat like you did for ICMPv6/NDP
> below.
> > >
> > > Problem is that in DPDK, struct arp_hdr includes struct arp_ipv4, so
> > > one suggestion would be to rename this pattern item ARP_IPV4 directly:
> > >
> > > => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> > >
> > > > +
> > > > +	/**
> > > > +	 * Matches any IPv6 Extension header.
> > >
> > > => Matches an IPv6 extension header.
> > >
> > > > +	 *
> > > > +	 * See struct rte_flow_item_ipv6_ext_any.
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY,
> > >
> > > I'm not sure this definition is necessary, more below about that.
> > >
> > > Also I don't see a benefit in having "ANY" part of the name, if you
> > > want to keep it, I suggest the simpler:
> > >
> > > => RTE_FLOW_ITEM_TYPE_IPV6_EXT
> > >
> > > > +
> > > > +	/**
> > > > +	 * Matches ICMPv6 header.
> > >
> > > => Matches an ICMPv6 header.
> > >
> > > > +	 *
> > > > +	 * See struct rte_flow_item_icmpv6
> > >
> > > Missing "."
> > >
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_ICMPV6,
> > > > +
> > >
> > > Before entering NDP territory below, I understand those should be
> > > stacked on top of RTE_FLOW_ITEM_TYPE_ICMPV6. It's fine but for
> > > clarity they should be named after the NDP types they represent, not inner
> data fields.
> > >
> > > Also I think we should consider NDP as a protocol sitting on top of
> > > ICMPv6. We could therefore drop "ICMP" from these definitions.
> > >
> > > Since "ND" is a common shorthand for this protocol and "6" another
> > > when doing something related to IPv6, I suggest to use "ND6" to name
> > > he related pattern items.
> >
> > I agree.
> >
> > >
> > > These are the reasons behind my next suggestions:
> > >
> > > > +	/**
> > > > +	 * Match ICMPv6 target address.
> > > > +	 *
> > > > +	 * See struct rte_flow_item_icmpv6_tgt_addr.
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR,
> > >
> > > => Matches an IPv6 network discovery router solicitation.
> > > => See struct rte_flow_item_nd6_rs.
> > > => RTE_FLOW_ITEM_TYPE_ND6_RS,
> 
> By the way, I wrote "router solicitation" (RS) here but it should have been
> "neighbor solicitation" (NS) obviously.
> 
> > >
> > > You should add another item for neighbor advertisement messages
> > > using the same template:
> > >
> > > => Match an IPv6 network discovery neighbor advertisement.
> > > => See struct rte_flow_item_nd6_na.
> > > => RTE_FLOW_ITEM_TYPE_ND6_NA,
> >
> > The purpose of RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR is to match a
> "target address"
> > according to IPv6 ND spec https://tools.ietf.org/html/rfc4861#page-22,
> > when type = 135/136
> >
> > so do you mean we should have RTE_FLOW_ITEM_TYPE_ND6_NS (Neighbor
> > Solicitation)  and RTE_FLOW_ITEM_TYPE_ND6_NA (Neighbor
> Advertisement)
> > here, and with the same template (an IPV6 addr) for
> rte_flow_item_icmpv6_tgt_addr?
> 
> The rationale is that while they share a similar format, they are in fact different
> messages that applications could want to match more conveniently than
> providing ICMP type/code values. It would be done for consistency given the
> same RFC also defines router solicitation/advertisement messages.
> 
> However a problem remains since these messages are part of the ICMP format
> whose "reserved" field sometimes contains message flags, particularly with RA.
> These structures would lack that data.
> 
> Honestly your approach makes sense, but it shouldn't be possible to mix target
> addresses with RA/RS and it should be convenient to match these messages
> without specifically matching their contents.
> 
> So another suggestion would be to define new types at the ICMPv6 level to use
> directly on top of ETH for each possible message and define separate
> structures for options.
> 
> First let's drop one character here and in all other definitions in this
> patch:
> 
>  ICMPV6 => ICMP6
> 
> Then the new items would respectively be:
> 
>  RTE_FLOW_ITEM_TYPE_ICMP6
>  RTE_FLOW_ITEM_TYPE_ICMP6_ND_NA
>  RTE_FLOW_ITEM_TYPE_ICMP6_ND_NS
>  RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_SLA
>  RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_TLA
> 
> All the related structure definitions would include the ICMPv6 header part
> defined according to the RFC and except for RTE_FLOW_ITEM_TYPE_ICMP6, a
> default mask that excludes type/code since they are implicit:
> 
>  struct rte_flow_item_icmp6_nd_na {
>       uint8_t type; /**< ICMPv6 type, normally 136. */
>       uint8_t code; /**< ICMPv6 code, normally 0. */
>       rte_be16_t checksum; /**< ICMPv6 checksum. */
>       /**
>        * Router flag (1b), solicited flag (1b), override flag (1b),
>        * reserved (29b).
>        */
>       rte_be32_t rso_reserved;
>       uint8_t target[16]; /**< Target address. */  };
> 
>  static const struct rte_flow_item_icmp6_nd_na
> rte_flow_item_icmp6_nd_na_mask = {
>      .target =
>           "\xff\xff\xff\xff\xff\xff\xff\xff"
>           "\xff\xff\xff\xff\xff\xff\xff\xff",
>  };
> 
> Also notice how uint(16|32)_t were modified as rte_be(16|32)_t while there.
> 
> What's your opinion?

OK, I will take this method, it looks good, thanks 
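
Just to confirm my understanding, the source link-layer option would then
tentatively follow the same template (names and default mask below are only my
guess, to be finalized in the next revision):

struct rte_flow_item_icmp6_nd_opt_sla_eth {
	uint8_t type; /**< ND option type, normally 1. */
	uint8_t length; /**< ND option length, normally 1. */
	struct ether_addr sla; /**< Source Ethernet link-layer address. */
};

/* Default mask matches the address only, type/length being implicit. */
static const struct rte_flow_item_icmp6_nd_opt_sla_eth
rte_flow_item_icmp6_nd_opt_sla_eth_mask = {
	.sla.addr_bytes = "\xff\xff\xff\xff\xff\xff",
};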

> 
> >
> > >
> > > The following are possible options for these headers, if specified
> > > they must be found afterward. Also since IPv6 may run on top of
> > > protocols other than Ethernet, you need to clarify these link-layer
> > > addresses use the Ethernet
> > > format:
> > >
> > > > +
> > > > +	/**
> > > > +	 * Match ICMPv6 Source Link-Layer Address.
> > > > +	 *
> > > > +	 * See struct rte_flow_item_icmpv6_sll.
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_SLL,
> > >
> > > => Matches an IPv6 network discovery source Ethernet link-layer
> > > address option.
> > > => See struct rte_flow_item_nd6_opt_sla_eth.
> > > => RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH,
> > >
> > > > +
> > > > +	/**
> > > > +	 * Match ICMPv6 Target Link-Layer Address.
> > > > +	 *
> > > > +	 * See struct rte_flow_item_icmpv6_tll.
> > > > +	 */
> > > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TLL,
> > >
> > > => Matches an IPv6 network discovery target Ethernet link-layer
> > > address option.
> > > => See struct rte_flow_item_nd6_opt_tla_eth.
> > > => RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH,
> > >
> >
> > Agree to rename.
> >
> > > > +
> > >
> > > Unnecessary empty line.
> > >
> > > >  };
> > > >
> > > >  /**
> > > > @@ -815,6 +858,123 @@ static const struct rte_flow_item_geneve
> > > > rte_flow_item_geneve_mask = {  #endif
> > > >
> > > >  /**
> > > > + * RTE_FLOW_ITEM_TYPE_ARP
> > > > + *
> > > > + * Matches IPv4 ARP packet header
> > >
> > > As above:
> > >
> > > => Matches an IPv4 ARP header.
> > > => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> > >
> > > > + */
> > > > +struct rte_flow_item_arp {
> > > > +	struct arp_hdr hdr;
> > > > +};
> > >
> > > Needs #include <rte_arp.h> and a Doxygen comment next to hdr for
> > > consistency, see ICMP and other definitions.
> > >
> > > > +
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ARP. */ #ifndef
> > > > +__cplusplus static const struct rte_flow_item_arp
> rte_flow_item_arp_mask = {
> > > > +	.hdr = {
> > > > +		.arp_data = {
> > > > +			.arp_sha = {
> > > > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > > +			},
> > > > +			.arp_sip = RTE_BE32(0xffffffff),
> > > > +			.arp_tha = {
> > > > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > > +			},
> > > > +			.arp_tip = RTE_BE32(0xffffffff),
> > > > +		},
> > > > +	},
> > > > +};
> > > > +#endif
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY
> > > > + *
> > > > + * Matches any IPv6 extension header.
> > > > + */
> > > > +struct rte_flow_item_ipv6_ext_hdr_any {
> > > > +	uint8_t next_hdr;
> > > > +};
> > >
> > > So what's the point? next_hdr is already part of either struct
> > > ipv6_hdr
> > > ("proto") and individual extension headers. Moreover it's implicit
> > > if an extension header is provided in a pattern.
> > >
> > > How about removing it?
> >
> > We need this to match a packet that has extension headers. For example:
> > IPV6(proto = 43, <Routing EH>) / EXT_HDR(next_head = 60, <Destination EH>) /
> > EXT_HDR(next_head = 44, <Fragment EH>) / EXT_HDR(next_head = 6, <TCP>) /
> > TCP ...
> >
> > I use "ANY" to match any extension header regardless of its content.
> > There is no conflict if we add multiple RTE_FLOW_ITEM_EXT_HDR_XXX items
> > in the future
> 
> I see, makes sense. How about doing it like ICMPv6 above? The generic item uses the
> base name and can only match the generic part specifically (next_hdr), while
> specific items don't match the generic part but whatever additions their
> dedicated structures define, i.e.:
> 
>  RTE_FLOW_ITEM_TYPE_IPV6_EXT
>  RTE_FLOW_ITEM_TYPE_IPV6_EXT_HBH
>  RTE_FLOW_ITEM_TYPE_IPV6_EXT_DEST
>  RTE_FLOW_ITEM_TYPE_IPV6_EXT_RTHDR
>  RTE_FLOW_ITEM_TYPE_IPV6_EXT_FRAG
>  ...

Yes, agree.
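
So the generic item would basically keep the structure from this patch under
the shorter name, e.g. (tentative, only to confirm the direction):

struct rte_flow_item_ipv6_ext {
	uint8_t next_hdr; /**< Next header. */
};

static const struct rte_flow_item_ipv6_ext rte_flow_item_ipv6_ext_mask = {
	.next_hdr = 0xff,
};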

> 
> No need to define them all if you only need EXT, this is just to describe the idea
> (it's also OK if you want to define them while you're at it).
> 
> >
> > >
> > > > +
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY. */
> > > #ifndef
> > > > +__cplusplus static const struct rte_flow_item_ipv6_ext_hdr_any
> > > > +rte_flow_item_ipv6_ext_any_mask = {
> > > > +	.next_hdr = 0xff,
> > > > +};
> > > > +#endif
> > >
> > > Ditto.
> > >
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ITEM_TYPE_ICMPV6
> > > > + *
> > > > + * Matches ICMPv6 header.
> > >
> > > => Matches an ICMPv6 header.
> > >
> > > > + */
> > > > +struct rte_flow_item_icmpv6 {
> > > > +	uint8_t type;
> > > > +	uint8_t code;
> > > > +	uint16_t checksum;
> > >
> > > The last 32-bit "reserved" data field is missing.
> > >
> > > > +};
> > >
> > > Too bad there is no struct icmp6_hdr definition in rte_icmp.h. You could
> add it.
> > > In any case Doxygen comments are missing, please add them (see other
> > > structure definitions for examples).
> 
> No need to rely on an external definition due to the above suggestions by the
> way.
> 
> > >
> > > > +
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6 */
> > >
> > > Missing "."
> > >
> > > > +#ifndef __cplusplus
> > > > +static const struct rte_flow_item_icmpv6 rte_flow_item_icmpv6_mask =
> {
> > > > +	.type = 0xff,
> > > > +	.code = 0xff,
> > > > +	.checksum = RTE_BE16(0xffff),
> > > > +};
> > > > +#endif
> > >
> > > You must remove checksum matching from the default mask. That's the
> > > last thing an application might want to match exactly :)
> > >
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > > > + *
> > > > + * Matches ICMPv6's Target Address.
> > > > + */
> > > > +struct rte_flow_item_icmpv6_tgt_addr {
> > > > +	uint8_t addr[16];
> > > > +};
> > >
> > > You need to expand this as two items, see prior comments regarding
> > > RTE_FLOW_ITEM_TYPE_ND6_RS, RTE_FLOW_ITEM_TYPE_ND6_NA and
> their
> > > respective structs rte_flow_item_nd6_rs and rte_flow_item_nd6_na.
> > >
> > > Also Doxygen documentation is missing for the addr field and you
> > > need to describe that these are only valid when used after
> > > RTE_FLOW_ITEM_TYPE_ICMPV6.
> > >
> > > > +
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR */
> > >
> > > Missing "."
> > >
> > > > +#ifndef __cplusplus
> > > > +static const
> > > > +struct rte_flow_item_icmpv6_tgt_addr
> > > rte_flow_item_icmpv6_tgt_addr_mask = {
> > > > +	.addr =
> > > > +		"\xff\xff\xff\xff\xff\xff\xff\xff"
> > > > +		"\xff\xff\xff\xff\xff\xff\xff\xff",
> > > > +};
> > > > +#endif
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ITEM_TYPE_ICPMV6_SLL.
> > > > + *
> > > > + * Matches ICMPv6 Source Link-Layer address.
> > > > + */
> > > > +struct rte_flow_item_icmpv6_sll {
> > > > +	struct ether_addr addr;
> > > > +};
> > >
> > > See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH
> and
> > > struct rte_flow_item_type_nd6_opt_sla_eth.
> > >
> > > Also Doxygen documentation is missing for the addr field and you
> > > need to describe that it is only valid when found after either
> > > RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> > >
> > > Also missing empty line here.
> > >
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_SLL */
> > >
> > > Missing "."
> > >
> > > > +#ifndef __cplusplus
> > > > +static const struct rte_flow_item_icmpv6_sll
> > > rte_flow_item_icmpv6_sll_mask = {
> > > > +	.addr = {
> > > > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > > +	}
> > > > +};
> > > > +#endif
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TLL.
> > > > + *
> > > > + * Matches ICMPv6 Target Link-Layer address.
> > > > + */
> > > > +struct rte_flow_item_icmpv6_tll {
> > > > +	struct ether_addr addr;
> > > > +};
> > >
> > > See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH
> > > and struct rte_flow_item_type_nd6_opt_tla_eth.
> > >
> > > Also Doxygen documentation is missing for the addr field and you
> > > need to describe that it is only valid when found after either
> > > RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> > >
> > > Also missing empty line here.
> > >
> > > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TLL */
> > >
> > > Missing "."
> > >
> > > > +#ifndef __cplusplus
> > > > +static const struct rte_flow_item_icmpv6_tll
> > > rte_flow_item_icmpv6_tll_mask = {
> > > > +	.addr = {
> > > > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > > +	}
> > > > +};
> > > > +#endif
> > > > +
> > > > +/**
> > > >   * Matching pattern item definition.
> > > >   *
> > > >   * A pattern is formed by stacking items starting from the lowest
> > > > protocol
> > > > --
> > > > 2.7.4
> > > >
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 3/4] ether: add more protocol support in flow API
  2018-04-12  5:12  0%       ` Zhang, Qi Z
@ 2018-04-12  9:19  0%         ` Adrien Mazarguil
  2018-04-12 10:00  0%           ` Zhang, Qi Z
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-12  9:19 UTC (permalink / raw)
  To: Zhang, Qi Z
  Cc: dev, Doherty, Declan, Chandran, Sugesh, Glynn, Michael J, Liu,
	Yu Y, Ananyev, Konstantin, Richardson, Bruce

On Thu, Apr 12, 2018 at 05:12:08AM +0000, Zhang, Qi Z wrote:
> Hi Adrien:
> 
> 	Thank you so much for your careful review and helpful suggestions!
> 	I agree with most of your comments, except a couple of questions about RTE_FLOW_ITEM_TYPE_TGT_ADDR and RTE_FLOW_ITEM_IPV6_EXT_HDR
> 	Please see my comment inline.
> 
> Thanks!
> Qi

Thanks, replying inline also.

> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Thursday, April 12, 2018 12:32 AM
> > To: Zhang, Qi Z <qi.z.zhang@intel.com>
> > Cc: dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>; Chandran,
> > Sugesh <sugesh.chandran@intel.com>; Glynn, Michael J
> > <michael.j.glynn@intel.com>; Liu, Yu Y <yu.y.liu@intel.com>; Ananyev,
> > Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>
> > Subject: Re: [PATCH v2 3/4] ether: add more protocol support in flow API
> > 
> > On Sun, Apr 01, 2018 at 05:19:21PM -0400, Qi Zhang wrote:
> > > Add new protocol header match support as below
> > >
> > > RTE_FLOW_ITEM_TYPE_ARP
> > > 	- match IPv4 ARP header
> > > RTE_FLOW_ITEM_TYPE_EXT_HDR_ANY
> > > 	- match any IPv6 extension header
> > 
> > While properly defined in the patch, "IPV6" is missing here.
> > 
> > > RTE_FLOW_ITEM_TYPE_ICMPV6
> > > 	- match IPv6 ICMP header
> > > RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > > 	- match IPv6 ICMP Target address
> > > RTE_FLOW_ITEM_TYPE_ICMPV6_SSL
> > > 	- match IPv6 ICMP Source Link-layer address
> > > RTE_FLOW_ITEM_TYPE_ICMPV6_TTL
> > > 	- match IPv6 ICMP Target Link-layer address
> > >
> > > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > 
> > First, since they are added at the end of enum rte_flow_item_type, no ABI
> > breakage notice is necessary.
> > 
> > However testpmd implementation [1][2] and documentation update [3][4] are
> > mandatory for all new pattern items and actions.
> 
> OK, will add this into v2.
> 
> > 
> > More comments below regarding these definitions.
> > 
> > [1] flow_item[] in app/test-pmd/config.c [2] using ITEM_ICMP as an example
> > in app/test-pmd/cmdline_flow.c [3] "Pattern items" section in
> > doc/guides/testpmd_app_ug/testpmd_funcs.rst
> > [4] using "Item: ``ICMP``" section as an example in
> >     doc/guides/prog_guide/rte_flow.rst
> > 
> > > ---
> > >  lib/librte_ether/rte_flow.h | 160
> > > ++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 160 insertions(+)
> > >
> > > diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> > > index 8f75db0..a8ec780 100644
> > > --- a/lib/librte_ether/rte_flow.h
> > > +++ b/lib/librte_ether/rte_flow.h
> > > @@ -323,6 +323,49 @@ enum rte_flow_item_type {
> > >  	 * See struct rte_flow_item_geneve.
> > >  	 */
> > >  	RTE_FLOW_ITEM_TYPE_GENEVE,
> > > +
> > > +	/**
> > > +	 * Matches ARP IPv4 header.
> > 
> > => Matches an IPv4 ARP header.
> > 
> > > +	 *
> > > +	 * See struct rte_flow_item_arp.
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_ARP,
> > 
> > While you're right to make "IPv4" clear since ARP is also used for other
> > protocols DPDK doesn't support (and likely never will), the ARP header has
> > both a fixed and a variably-sized part.
> > 
> > Ideally an ARP pattern item should match the fixed part only and a separate
> > ARP_IPV4 match its payload, somewhat like you did for ICMPv6/NDP below.
> > 
> > Problem is that in DPDK, struct arp_hdr includes struct arp_ipv4, so one
> > suggestion would be to rename this pattern item ARP_IPV4 directly:
> > 
> > => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> > 
> > > +
> > > +	/**
> > > +	 * Matches any IPv6 Extension header.
> > 
> > => Matches an IPv6 extension header.
> > 
> > > +	 *
> > > +	 * See struct rte_flow_item_ipv6_ext_any.
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY,
> > 
> > I'm not sure this definition is necessary, more below about that.
> > 
> > Also I don't see a benefit in having "ANY" part of the name, if you want to keep
> > it, I suggest the simpler:
> > 
> > => RTE_FLOW_ITEM_TYPE_IPV6_EXT
> > 
> > > +
> > > +	/**
> > > +	 * Matches ICMPv6 header.
> > 
> > => Matches an ICMPv6 header.
> > 
> > > +	 *
> > > +	 * See struct rte_flow_item_icmpv6
> > 
> > Missing "."
> > 
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_ICMPV6,
> > > +
> > 
> > Before entering NDP territory below, I understand those should be stacked on
> > top of RTE_FLOW_ITEM_TYPE_ICMPV6. It's fine but for clarity they should be
> > named after the NDP types they represent, not inner data fields.
> > 
> > Also I think we should consider NDP as a protocol sitting on top of ICMPv6. We
> > could therefore drop "ICMP" from these definitions.
> > 
> > Since "ND" is a common shorthand for this protocol and "6" another when
> > doing something related to IPv6, I suggest using "ND6" to name the related
> > pattern items.
> 
> I agree.
> 
> > 
> > These are the reasons behind my next suggestions:
> > 
> > > +	/**
> > > +	 * Match ICMPv6 target address.
> > > +	 *
> > > +	 * See struct rte_flow_item_icmpv6_tgt_addr.
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR,
> > 
> > => Matches an IPv6 network discovery router solicitation.
> > => See struct rte_flow_item_nd6_rs.
> > => RTE_FLOW_ITEM_TYPE_ND6_RS,

By the way, I wrote "router solicitation" (RS) here but it should have been
"neighbor solicitation" (NS) obviously.

> > 
> > You should add another item for neighbor advertisement messages using the
> > same template:
> > 
> > => Match an IPv6 network discovery neighbor advertisement.
> > => See struct rte_flow_item_nd6_na.
> > => RTE_FLOW_ITEM_TYPE_ND6_NA,
> 
> The purpose of RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR is to match a "target address"
> according to IPv6 ND spec https://tools.ietf.org/html/rfc4861#page-22, when type = 135/136
> 
> so do you mean we should have RTE_FLOW_ITEM_TYPE_ND6_NS (Neighbor Solicitation)
>  and RTE_FLOW_ITEM_TYPE_ND6_NA (Neighbor Advertisement) here,
> and with the same template (an IPV6 addr) for rte_flow_item_icmpv6_tgt_addr?

The rationale is that while they share a similar format, they are in fact
different messages that applications could want to match more conveniently
than providing ICMP type/code values. It would be done for consistency given
the same RFC also defines router solicitation/advertisement messages.

However a problem remains since these messages are part of the ICMP format
whose "reserved" field sometimes contains message flags, particularly with
RA. These structures would lack that data.

Honestly your approach makes sense, but it shouldn't be possible to mix
target addresses with RA/RS and it should be convenient to match these
messages without specifically matching their contents.

So another suggestion would be to define new types at the ICMPv6 level to
use directly on top of ETH for each possible message and define separate
structures for options.

First let's drop one character here and in all other definitions in this
patch:

 ICMPV6 => ICMP6 

Then the new items would respectively be:

 RTE_FLOW_ITEM_TYPE_ICMP6
 RTE_FLOW_ITEM_TYPE_ICMP6_ND_NA
 RTE_FLOW_ITEM_TYPE_ICMP6_ND_NS
 RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_SLA
 RTE_FLOW_ITEM_TYPE_ICMP6_ND_OPT_TLA

All the related structure definitions would include the ICMPv6 header part
defined according to the RFC and except for RTE_FLOW_ITEM_TYPE_ICMP6, a
default mask that excludes type/code since they are implicit:

 struct rte_flow_item_icmp6_nd_na {
      uint8_t type; /**< ICMPv6 type, normally 136. */
      uint8_t code; /**< ICMPv6 code, normally 0. */
      rte_be16_t checksum; /**< ICMPv6 checksum. */
      /**
       * Router flag (1b), solicited flag (1b), override flag (1b),
       * reserved (29b).
       */
      rte_be32_t rso_reserved;
      uint8_t target[16]; /**< Target address. */
 };

 static const struct rte_flow_item_icmp6_nd_na rte_flow_item_icmp6_nd_na_mask = {
     .target =
          "\xff\xff\xff\xff\xff\xff\xff\xff"
          "\xff\xff\xff\xff\xff\xff\xff\xff",
 };

Also notice how uint(16|32)_t were changed to rte_be(16|32)_t while at it.
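
For the option items, a rough sketch (exact struct/field names still open,
e.g. whether to keep an "_ETH" suffix) could look like:

 struct rte_flow_item_icmp6_nd_opt_sla {
      uint8_t type; /**< ND option type, normally 1 (source link-layer). */
      uint8_t length; /**< ND option length, normally 1 (in units of 8B). */
      struct ether_addr sla; /**< Source Ethernet link-layer address. */
 };

 static const struct rte_flow_item_icmp6_nd_opt_sla
 rte_flow_item_icmp6_nd_opt_sla_mask = {
      .sla = {
           .addr_bytes = "\xff\xff\xff\xff\xff\xff",
      },
 };

The TLA counterpart would be identical, with type normally 2 and a "tla"
field instead.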

What's your opinion?

> 
> > 
> > The following are possible options for these headers, if specified they must be
> > found afterward. Also since IPv6 may run on top of protocols other than
> > Ethernet, you need to clarify these link-layer addresses use the Ethernet
> > format:
> > 
> > > +
> > > +	/**
> > > +	 * Match ICMPv6 Source Link-Layer Address.
> > > +	 *
> > > +	 * See struct rte_flow_item_icmpv6_sll.
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_SLL,
> > 
> > => Matches an IPv6 network discovery source Ethernet link-layer address
> > option.
> > => See struct rte_flow_item_nd6_opt_sla_eth.
> > => RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH,
> > 
> > > +
> > > +	/**
> > > +	 * Match ICMPv6 Target Link-Layer Address.
> > > +	 *
> > > +	 * See struct rte_flow_item_icmpv6_tll.
> > > +	 */
> > > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TLL,
> > 
> > => Matches an IPv6 network discovery target Ethernet link-layer address
> > option.
> > => See struct rte_flow_item_nd6_opt_tla_eth.
> > => RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH,
> > 
> 
> Agree to rename.
> 
> > > +
> > 
> > Unnecessary empty line.
> > 
> > >  };
> > >
> > >  /**
> > > @@ -815,6 +858,123 @@ static const struct rte_flow_item_geneve
> > > rte_flow_item_geneve_mask = {  #endif
> > >
> > >  /**
> > > + * RTE_FLOW_ITEM_TYPE_ARP
> > > + *
> > > + * Matches IPv4 ARP packet header
> > 
> > As above:
> > 
> > => Matches an IPv4 ARP header.
> > => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> > 
> > > + */
> > > +struct rte_flow_item_arp {
> > > +	struct arp_hdr hdr;
> > > +};
> > 
> > Needs #include <rte_arp.h> and a Doxygen comment next to hdr for
> > consistency, see ICMP and other definitions.
> > 
> > > +
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ARP. */ #ifndef __cplusplus
> > > +static const struct rte_flow_item_arp rte_flow_item_arp_mask = {
> > > +	.hdr = {
> > > +		.arp_data = {
> > > +			.arp_sha = {
> > > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > +			},
> > > +			.arp_sip = RTE_BE32(0xffffffff),
> > > +			.arp_tha = {
> > > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > +			},
> > > +			.arp_tip = RTE_BE32(0xffffffff),
> > > +		},
> > > +	},
> > > +};
> > > +#endif
> > > +
> > > +/**
> > > + * RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY
> > > + *
> > > + * Matches any IPv6 extension header.
> > > + */
> > > +struct rte_flow_item_ipv6_ext_hdr_any {
> > > +	uint8_t next_hdr;
> > > +};
> > 
> > So what's the point? next_hdr is already part of either struct ipv6_hdr
> > ("proto") and individual extension headers. Moreover it's implicit if an
> > extension header is provided in a pattern.
> > 
> > How about removing it?
> 
> We need this to match a packet that has extension headers.
> For example:
> IPV6(proto = 43, <Routing EH >) / EXT_HDR (next_head = 60 <Destination EH>) / EXT_HDR (next_head = 44, <Fragment EH)/ EXT_HDR (next_head = 6 <tcp>) / TCP ...
> 
> I use "ANY" to match any extend header regardless their content.
> There is no conflict if we can add multiple RTE_FLOW_ITEM_EXT_HDR_XXX in futures

I see, makes sense. How about doing it like ICMPv6 above? The generic item
uses the base name and can only match the generic part (next_hdr), while
specific items don't match the generic part but whatever additions their
dedicated structures define, i.e.:

 RTE_FLOW_ITEM_TYPE_IPV6_EXT
 RTE_FLOW_ITEM_TYPE_IPV6_EXT_HBH
 RTE_FLOW_ITEM_TYPE_IPV6_EXT_DEST
 RTE_FLOW_ITEM_TYPE_IPV6_EXT_RTHDR
 RTE_FLOW_ITEM_TYPE_IPV6_EXT_FRAG
 ...

No need to define them all if you only need EXT, this is just to describe
the idea (it's also OK if you want to define them while you're at it).
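
Just to illustrate the idea with a sketch (reusing your next_hdr field), the
generic item would only carry the common part:

 struct rte_flow_item_ipv6_ext {
      uint8_t next_hdr; /**< Next header. */
 };

 static const struct rte_flow_item_ipv6_ext rte_flow_item_ipv6_ext_mask = {
      .next_hdr = 0xff,
 };

Specific items (FRAG and so on) would then only add their own fields on top.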

> 
> > 
> > > +
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY. */
> > #ifndef
> > > +__cplusplus static const struct rte_flow_item_ipv6_ext_hdr_any
> > > +rte_flow_item_ipv6_ext_any_mask = {
> > > +	.next_hdr = 0xff,
> > > +};
> > > +#endif
> > 
> > Ditto.
> > 
> > > +
> > > +/**
> > > + * RTE_FLOW_ITEM_TYPE_ICMPV6
> > > + *
> > > + * Matches ICMPv6 header.
> > 
> > => Matches an ICMPv6 header.
> > 
> > > + */
> > > +struct rte_flow_item_icmpv6 {
> > > +	uint8_t type;
> > > +	uint8_t code;
> > > +	uint16_t checksum;
> > 
> > The last 32-bit "reserved" data field is missing.
> > 
> > > +};
> > 
> > Too bad there is no struct icmp6_hdr definition in rte_icmp.h. You could add it.
> > In any case Doxygen comments are missing, please add them (see other
> > structure definitions for examples).

No need to rely on an external definition due to the above suggestions by
the way.

> > 
> > > +
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6 */
> > 
> > Missing "."
> > 
> > > +#ifndef __cplusplus
> > > +static const struct rte_flow_item_icmpv6 rte_flow_item_icmpv6_mask = {
> > > +	.type = 0xff,
> > > +	.code = 0xff,
> > > +	.checksum = RTE_BE16(0xffff),
> > > +};
> > > +#endif
> > 
> > You must remove checksum matching from the default mask. That's the last
> > thing an application might want to match exactly :)
> > 
> > > +
> > > +/**
> > > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > > + *
> > > + * Matches ICMPv6's Target Address.
> > > + */
> > > +struct rte_flow_item_icmpv6_tgt_addr {
> > > +	uint8_t addr[16];
> > > +};
> > 
> > You need to expand this as two items, see prior comments regarding
> > RTE_FLOW_ITEM_TYPE_ND6_RS, RTE_FLOW_ITEM_TYPE_ND6_NA and their
> > respective structs rte_flow_item_nd6_rs and rte_flow_item_nd6_na.
> > 
> > Also Doxygen documentation is missing for the addr field and you need to
> > describe that these are only valid when used after
> > RTE_FLOW_ITEM_TYPE_ICMPV6.
> > 
> > > +
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR */
> > 
> > Missing "."
> > 
> > > +#ifndef __cplusplus
> > > +static const
> > > +struct rte_flow_item_icmpv6_tgt_addr
> > rte_flow_item_icmpv6_tgt_addr_mask = {
> > > +	.addr =
> > > +		"\xff\xff\xff\xff\xff\xff\xff\xff"
> > > +		"\xff\xff\xff\xff\xff\xff\xff\xff",
> > > +};
> > > +#endif
> > > +
> > > +/**
> > > + * RTE_FLOW_ITEM_TYPE_ICPMV6_SLL.
> > > + *
> > > + * Matches ICMPv6 Source Link-Layer address.
> > > + */
> > > +struct rte_flow_item_icmpv6_sll {
> > > +	struct ether_addr addr;
> > > +};
> > 
> > See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH and
> > struct rte_flow_item_type_nd6_opt_sla_eth.
> > 
> > Also Doxygen documentation is missing for the addr field and you need to
> > describe that it is only valid when found after either
> > RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> > 
> > Also missing empty line here.
> > 
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_SLL */
> > 
> > Missing "."
> > 
> > > +#ifndef __cplusplus
> > > +static const struct rte_flow_item_icmpv6_sll
> > rte_flow_item_icmpv6_sll_mask = {
> > > +	.addr = {
> > > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > +	}
> > > +};
> > > +#endif
> > > +
> > > +/**
> > > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TLL.
> > > + *
> > > + * Matches ICMPv6 Target Link-Layer address.
> > > + */
> > > +struct rte_flow_item_icmpv6_tll {
> > > +	struct ether_addr addr;
> > > +};
> > 
> > See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH
> > and struct rte_flow_item_type_nd6_opt_tla_eth.
> > 
> > Also Doxygen documentation is missing for the addr field and you need to
> > describe that it is only valid when found after either
> > RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> > 
> > Also missing empty line here.
> > 
> > > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TLL */
> > 
> > Missing "."
> > 
> > > +#ifndef __cplusplus
> > > +static const struct rte_flow_item_icmpv6_tll
> > rte_flow_item_icmpv6_tll_mask = {
> > > +	.addr = {
> > > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > > +	}
> > > +};
> > > +#endif
> > > +
> > > +/**
> > >   * Matching pattern item definition.
> > >   *
> > >   * A pattern is formed by stacking items starting from the lowest
> > > protocol
> > > --
> > > 2.7.4
> > >

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v3 0/5] introduce new tunnel types
      @ 2018-04-12  7:33  3% ` Xueming Li
  2018-04-13 11:02  3% ` [dpdk-dev] [PATCH v4 " Xueming Li
  3 siblings, 0 replies; 200+ results
From: Xueming Li @ 2018-04-12  7:33 UTC (permalink / raw)
  To: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Adrien Mazarguil
  Cc: Xueming Li, Nelio Laranjeiro, Shahaf Shuler, dev, Olivier Matz

v3:
- Change VXLAN-GPE definition order to avoid ABI compatibility issue.
v2:
- Split patch set into public and mlx5 two series, this one is the first.
v1:
- Support new tunnel type MPLS-in-GRE and MPLS-in-UDP
- Remove deprecation notes of rss level

This patchset introduced new tunnel type and related testpmd code:
- New tunnel type VXLAN-GPE
  https://datatracker.ietf.org/doc/draft-ietf-nvo3-vxlan-gpe/
- New tunnel type MPLS-in-GRE
  https://tools.ietf.org/html/rfc4023
- New tunnel type MPLS-in-UDP
  https://tools.ietf.org/html/rfc7510
- Support GRE extension in testpmd csum forwarding engine


Xueming Li (5):
  doc: remove RSS configuration change announcement
  ethdev: introduce new tunnel VXLAN-GPE
  ethdev: introduce tunnel type MPLS-in-GRE and MPLS-in-UDP
  app/testpmd: introduce new tunnel VXLAN-GPE
  app/testpmd: add more GRE extension support to csum engine

 app/test-pmd/cmdline_flow.c           |  24 ++++++++
 app/test-pmd/config.c                 |   2 +
 app/test-pmd/csumonly.c               | 103 ++++++++++++++++++++++++++++++----
 app/test-pmd/parameters.c             |  12 +++-
 app/test-pmd/testpmd.h                |   2 +
 doc/guides/prog_guide/rte_flow.rst    |  12 ++++
 doc/guides/rel_notes/deprecation.rst  |   4 --
 doc/guides/testpmd_app_ug/run_app.rst |   5 ++
 lib/librte_ether/rte_eth_ctrl.h       |   3 +-
 lib/librte_ether/rte_flow.c           |   1 +
 lib/librte_ether/rte_flow.h           |  27 +++++++++
 lib/librte_mbuf/rte_mbuf.c            |   3 +
 lib/librte_mbuf/rte_mbuf.h            |   1 +
 lib/librte_mbuf/rte_mbuf_ptype.c      |   3 +
 lib/librte_mbuf/rte_mbuf_ptype.h      |  47 ++++++++++++++++
 lib/librte_net/rte_ether.h            |  25 +++++++++
 16 files changed, 257 insertions(+), 17 deletions(-)

-- 
2.13.3

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v6 4/4] doc: add ifcvf driver document and release note
  @ 2018-04-12  7:19  3%   ` Xiao Wang
  0 siblings, 0 replies; 200+ results
From: Xiao Wang @ 2018-04-12  7:19 UTC (permalink / raw)
  To: ferruh.yigit
  Cc: dev, maxime.coquelin, zhihong.wang, tiwei.bie, jianfeng.tan,
	cunming.liang, dan.daly, thomas, gaetan.rivet, anatoly.burakov,
	hemant.agrawal, Xiao Wang

Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
---
 doc/guides/nics/features/ifcvf.ini     |  8 +++
 doc/guides/nics/ifcvf.rst              | 98 ++++++++++++++++++++++++++++++++++
 doc/guides/nics/index.rst              |  1 +
 doc/guides/rel_notes/release_18_05.rst |  9 ++++
 4 files changed, 116 insertions(+)
 create mode 100644 doc/guides/nics/features/ifcvf.ini
 create mode 100644 doc/guides/nics/ifcvf.rst

diff --git a/doc/guides/nics/features/ifcvf.ini b/doc/guides/nics/features/ifcvf.ini
new file mode 100644
index 000000000..ef1fc4711
--- /dev/null
+++ b/doc/guides/nics/features/ifcvf.ini
@@ -0,0 +1,8 @@
+;
+; Supported features of the 'ifcvf' vDPA driver.
+;
+; Refer to default.ini for the full list of available PMD features.
+;
+[Features]
+x86-32               = Y
+x86-64               = Y
diff --git a/doc/guides/nics/ifcvf.rst b/doc/guides/nics/ifcvf.rst
new file mode 100644
index 000000000..d7e76353c
--- /dev/null
+++ b/doc/guides/nics/ifcvf.rst
@@ -0,0 +1,98 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright(c) 2018 Intel Corporation.
+
+IFCVF vDPA driver
+=================
+
+The IFCVF vDPA (vhost data path acceleration) driver provides support for the
+Intel FPGA 100G VF (IFCVF). IFCVF's datapath is virtio ring compatible; it
+works as a HW vhost backend which can send/receive packets to/from virtio
+directly by DMA. It also supports dirty page logging and device state
+report/restore. This driver enables its vDPA functionality with the live
+migration feature.
+
+
+Pre-Installation Configuration
+------------------------------
+
+Config File Options
+~~~~~~~~~~~~~~~~~~~
+
+The following option can be modified in the ``config`` file.
+
+- ``CONFIG_RTE_LIBRTE_IFCVF_VDPA_PMD`` (default ``y`` for linux)
+
+  Toggle compilation of the ``librte_ifcvf_vdpa`` driver.
+
+
+IFCVF vDPA Implementation
+-------------------------
+
+IFCVF's vendor ID and device ID are the same as those of the virtio net PCI
+device, with its own specific subsystem vendor ID and device ID. To let the
+device be probed by the IFCVF driver, adding the "vdpa=1" parameter specifies
+that this device is to be used in vDPA mode rather than polling mode; the
+virtio PMD will skip the device when it detects this parameter.
+
+Different VF devices serve different virtio frontends which are in different
+VMs, so each VF needs to have its own DMA address translation service. During
+the driver probe a new container is created for this device, with this
+container vDPA driver can program DMA remapping table with the VM's memory
+region information.
+
+Key IFCVF vDPA driver ops
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- ifcvf_dev_config:
+  Enable VF data path with virtio information provided by vhost lib, including
+  IOMMU programming to enable VF DMA to VM's memory, VFIO interrupt setup to
+  route HW interrupt to virtio driver, create notify relay thread to translate
+  virtio driver's kick to a MMIO write onto HW, HW queues configuration.
+
+  This function gets called to set up HW data path backend when virtio driver
+  in VM gets ready.
+
+- ifcvf_dev_close:
+  Revoke all the setup in ifcvf_dev_config.
+
+  This function gets called when virtio driver stops device in VM.
+
+To create a vhost port with IFC VF
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Create a vhost socket and assign a VF's device ID to this socket via
+  vhost API. When QEMU vhost connection gets ready, the assigned VF will
+  get configured automatically.
+
+
+Features
+--------
+
+Features of the IFCVF driver are:
+
+- Compatibility with virtio 0.95 and 1.0.
+- Live migration.
+
+
+Prerequisites
+-------------
+
+- Platform with IOMMU feature. IFC VF needs address translation service to
+  Rx/Tx directly with virtio driver in VM.
+
+
+Limitations
+-----------
+
+Dependency on vfio-pci
+~~~~~~~~~~~~~~~~~~~~~~
+
+vDPA driver needs to setup VF MSIX interrupts, each queue's interrupt vector
+is mapped to a callfd associated with a virtio ring. Currently only vfio-pci
+allows multiple interrupts, so the IFCVF driver is dependent on vfio-pci.
+
+Live Migration with VIRTIO_NET_F_GUEST_ANNOUNCE
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+IFC VF doesn't support RARP packet generation, virtio frontend supporting
+VIRTIO_NET_F_GUEST_ANNOUNCE feature can help to do that.
diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst
index 51c453d9c..a294ab389 100644
--- a/doc/guides/nics/index.rst
+++ b/doc/guides/nics/index.rst
@@ -44,6 +44,7 @@ Network Interface Controller Drivers
     vmxnet3
     pcap_ring
     fail_safe
+    ifcvf
 
 **Figures**
 
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 3e1ae0cfd..1bf609f6b 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -84,6 +84,15 @@ API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Added IFCVF vDPA driver.**
+
+  Added the IFCVF vDPA driver to support the Intel FPGA 100G VF device. IFCVF
+  works as a HW vhost data path accelerator; it supports live migration and is
+  compatible with virtio 0.95 and 1.0. This driver registers the ifcvf vDPA
+  driver with the vhost lib. When virtio is connected, with the help of the
+  registered vDPA driver, the assigned VF gets configured to Rx/Tx directly
+  to/from the VM's virtio vrings.
+
 
 ABI Changes
 -----------
-- 
2.15.1

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v2 3/4] ether: add more protocol support in flow API
  2018-04-11 16:32  2%     ` Adrien Mazarguil
@ 2018-04-12  5:12  0%       ` Zhang, Qi Z
  2018-04-12  9:19  0%         ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Zhang, Qi Z @ 2018-04-12  5:12 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: dev, Doherty, Declan, Chandran, Sugesh, Glynn, Michael J, Liu,
	Yu Y, Ananyev, Konstantin, Richardson, Bruce

Hi Adrien:

	Thank you so much for your careful review and helpful suggestions!
	I agree with most of your comments, except for a couple of questions about RTE_FLOW_ITEM_TYPE_TGT_ADDR and RTE_FLOW_ITEM_IPV6_EXT_HDR
	Please see my comment inline.

Thanks!
Qi

> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Thursday, April 12, 2018 12:32 AM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>
> Cc: dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>; Chandran,
> Sugesh <sugesh.chandran@intel.com>; Glynn, Michael J
> <michael.j.glynn@intel.com>; Liu, Yu Y <yu.y.liu@intel.com>; Ananyev,
> Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Subject: Re: [PATCH v2 3/4] ether: add more protocol support in flow API
> 
> On Sun, Apr 01, 2018 at 05:19:21PM -0400, Qi Zhang wrote:
> > Add new protocol header match support as below
> >
> > RTE_FLOW_ITEM_TYPE_ARP
> > 	- match IPv4 ARP header
> > RTE_FLOW_ITEM_TYPE_EXT_HDR_ANY
> > 	- match any IPv6 extension header
> 
> While properly defined in the patch, "IPV6" is missing here.
> 
> > RTE_FLOW_ITEM_TYPE_ICMPV6
> > 	- match IPv6 ICMP header
> > RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > 	- match IPv6 ICMP Target address
> > RTE_FLOW_ITEM_TYPE_ICMPV6_SSL
> > 	- match IPv6 ICMP Source Link-layer address
> > RTE_FLOW_ITEM_TYPE_ICMPV6_TTL
> > 	- match IPv6 ICMP Target Link-layer address
> >
> > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> 
> First, since they are added at the end of enum rte_flow_item_type, no ABI
> breakage notice is necessary.
> 
> However testpmd implementation [1][2] and documentation update [3][4] are
> mandatory for all new pattern items and actions.

OK, will add this into v2.

> 
> More comments below regarding these definitions.
> 
> [1] flow_item[] in app/test-pmd/config.c [2] using ITEM_ICMP as an example
> in app/test-pmd/cmdline_flow.c [3] "Pattern items" section in
> doc/guides/testpmd_app_ug/testpmd_funcs.rst
> [4] using "Item: ``ICMP``" section as an example in
>     doc/guides/prog_guide/rte_flow.rst
> 
> > ---
> >  lib/librte_ether/rte_flow.h | 160
> > ++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 160 insertions(+)
> >
> > diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> > index 8f75db0..a8ec780 100644
> > --- a/lib/librte_ether/rte_flow.h
> > +++ b/lib/librte_ether/rte_flow.h
> > @@ -323,6 +323,49 @@ enum rte_flow_item_type {
> >  	 * See struct rte_flow_item_geneve.
> >  	 */
> >  	RTE_FLOW_ITEM_TYPE_GENEVE,
> > +
> > +	/**
> > +	 * Matches ARP IPv4 header.
> 
> => Matches an IPv4 ARP header.
> 
> > +	 *
> > +	 * See struct rte_flow_item_arp.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_ARP,
> 
> While you're right to make "IPv4" clear since ARP is also used for other
> protocols DPDK doesn't support (and likely never will), the ARP header has
> both a fixed and a variably-sized part.
> 
> Ideally an ARP pattern item should match the fixed part only and a separate
> ARP_IPV4 match its payload, somewhat like you did for ICMPv6/NDP below.
> 
> Problem is that in DPDK, struct arp_hdr includes struct arp_ipv4, so one
> suggestion would be to rename this pattern item ARP_IPV4 directly:
> 
> => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> 
> > +
> > +	/**
> > +	 * Matches any IPv6 Extension header.
> 
> => Matches an IPv6 extension header.
> 
> > +	 *
> > +	 * See struct rte_flow_item_ipv6_ext_any.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY,
> 
> I'm not sure this definition is necessary, more below about that.
> 
> Also I don't see a benefit in having "ANY" part of the name, if you want to keep
> it, I suggest the simpler:
> 
> => RTE_FLOW_ITEM_TYPE_IPV6_EXT
> 
> > +
> > +	/**
> > +	 * Matches ICMPv6 header.
> 
> => Matches an ICMPv6 header.
> 
> > +	 *
> > +	 * See struct rte_flow_item_icmpv6
> 
> Missing "."
> 
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_ICMPV6,
> > +
> 
> Before entering NDP territory below, I understand those should be stacked on
> top of RTE_FLOW_ITEM_TYPE_ICMPV6. It's fine but for clarity they should be
> named after the NDP types they represent, not inner data fields.
> 
> Also I think we should consider NDP as a protocol sitting on top of ICMPv6. We
> could therefore drop "ICMP" from these definitions.
> 
> Since "ND" is a common shorthand for this protocol and "6" another when
> doing something related to IPv6, I suggest using "ND6" to name the related
> pattern items.

I agree.

> 
> These are the reasons behind my next suggestions:
> 
> > +	/**
> > +	 * Match ICMPv6 target address.
> > +	 *
> > +	 * See struct rte_flow_item_icmpv6_tgt_addr.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR,
> 
> => Matches an IPv6 network discovery router solicitation.
> => See struct rte_flow_item_nd6_rs.
> => RTE_FLOW_ITEM_TYPE_ND6_RS,
> 
> You should add another item for neighbor advertisement messages using the
> same template:
> 
> => Match an IPv6 network discovery neighbor advertisement.
> => See struct rte_flow_item_nd6_na.
> => RTE_FLOW_ITEM_TYPE_ND6_NA,

The purpose of RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR is to match a "target address"
according to IPv6 ND spec https://tools.ietf.org/html/rfc4861#page-22, when type = 135/136

so do you mean we should have RTE_FLOW_ITEM_TYPE_ND6_NS (Neighbor Solicitation)
 and RTE_FLOW_ITEM_TYPE_ND6_NA (Neighbor Advertisement) here,
and with the same template (an IPV6 addr) for rte_flow_item_icmpv6_tgt_addr?

> 
> The following are possible options for these headers, if specified they must be
> found afterward. Also since IPv6 may run on top of protocols other than
> Ethernet, you need to clarify these link-layer addresses use the Ethernet
> format:
> 
> > +
> > +	/**
> > +	 * Match ICMPv6 Source Link-Layer Address.
> > +	 *
> > +	 * See struct rte_flow_item_icmpv6_sll.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_ICMPV6_SLL,
> 
> => Matches an IPv6 network discovery source Ethernet link-layer address
> option.
> => See struct rte_flow_item_nd6_opt_sla_eth.
> => RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH,
> 
> > +
> > +	/**
> > +	 * Match ICMPv6 Target Link-Layer Address.
> > +	 *
> > +	 * See struct rte_flow_item_icmpv6_tll.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_ICMPV6_TLL,
> 
> => Matches an IPv6 network discovery target Ethernet link-layer address
> option.
> => See struct rte_flow_item_nd6_opt_tla_eth.
> => RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH,
> 

Agree to rename.

> > +
> 
> Unnecessary empty line.
> 
> >  };
> >
> >  /**
> > @@ -815,6 +858,123 @@ static const struct rte_flow_item_geneve
> > rte_flow_item_geneve_mask = {  #endif
> >
> >  /**
> > + * RTE_FLOW_ITEM_TYPE_ARP
> > + *
> > + * Matches IPv4 ARP packet header
> 
> As above:
> 
> => Matches an IPv4 ARP header.
> => RTE_FLOW_ITEM_TYPE_ARP_IPV4
> 
> > + */
> > +struct rte_flow_item_arp {
> > +	struct arp_hdr hdr;
> > +};
> 
> Needs #include <rte_arp.h> and a Doxygen comment next to hdr for
> consistency, see ICMP and other definitions.
> 
> > +
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_ARP. */ #ifndef __cplusplus
> > +static const struct rte_flow_item_arp rte_flow_item_arp_mask = {
> > +	.hdr = {
> > +		.arp_data = {
> > +			.arp_sha = {
> > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > +			},
> > +			.arp_sip = RTE_BE32(0xffffffff),
> > +			.arp_tha = {
> > +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > +			},
> > +			.arp_tip = RTE_BE32(0xffffffff),
> > +		},
> > +	},
> > +};
> > +#endif
> > +
> > +/**
> > + * RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY
> > + *
> > + * Matches any IPv6 extension header.
> > + */
> > +struct rte_flow_item_ipv6_ext_hdr_any {
> > +	uint8_t next_hdr;
> > +};
> 
> So what's the point? next_hdr is already part of either struct ipv6_hdr
> ("proto") and individual extension headers. Moreover it's implicit if an
> extension header is provided in a pattern.
> 
> How about removing it?

We need this to match a packet that has extension headers.
For example:
IPV6(proto = 43, <Routing EH >) / EXT_HDR (next_head = 60 <Destination EH>) / EXT_HDR (next_head = 44, <Fragment EH)/ EXT_HDR (next_head = 6 <tcp>) / TCP ...

I use "ANY" to match any extend header regardless their content.
There is no conflict if we can add multiple RTE_FLOW_ITEM_EXT_HDR_XXX in futures

> 
> > +
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY. */
> #ifndef
> > +__cplusplus static const struct rte_flow_item_ipv6_ext_hdr_any
> > +rte_flow_item_ipv6_ext_any_mask = {
> > +	.next_hdr = 0xff,
> > +};
> > +#endif
> 
> Ditto.
> 
> > +
> > +/**
> > + * RTE_FLOW_ITEM_TYPE_ICMPV6
> > + *
> > + * Matches ICMPv6 header.
> 
> => Matches an ICMPv6 header.
> 
> > + */
> > +struct rte_flow_item_icmpv6 {
> > +	uint8_t type;
> > +	uint8_t code;
> > +	uint16_t checksum;
> 
> The last 32-bit "reserved" data field is missing.
> 
> > +};
> 
> Too bad there is no struct icmp6_hdr definition in rte_icmp.h. You could add it.
> In any case Doxygen comments are missing, please add them (see other
> structure definitions for examples).
> 
> > +
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6 */
> 
> Missing "."
> 
> > +#ifndef __cplusplus
> > +static const struct rte_flow_item_icmpv6 rte_flow_item_icmpv6_mask = {
> > +	.type = 0xff,
> > +	.code = 0xff,
> > +	.checksum = RTE_BE16(0xffff),
> > +};
> > +#endif
> 
> You must remove checksum matching from the default mask. That's the last
> thing an application might want to match exactly :)
> 
> > +
> > +/**
> > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> > + *
> > + * Matches ICMPv6's Target Address.
> > + */
> > +struct rte_flow_item_icmpv6_tgt_addr {
> > +	uint8_t addr[16];
> > +};
> 
> You need to expand this as two items, see prior comments regarding
> RTE_FLOW_ITEM_TYPE_ND6_RS, RTE_FLOW_ITEM_TYPE_ND6_NA and their
> respective structs rte_flow_item_nd6_rs and rte_flow_item_nd6_na.
> 
> Also Doxygen documentation is missing for the addr field and you need to
> describe that these are only valid when used after
> RTE_FLOW_ITEM_TYPE_ICMPV6.
> 
> > +
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR */
> 
> Missing "."
> 
> > +#ifndef __cplusplus
> > +static const
> > +struct rte_flow_item_icmpv6_tgt_addr
> rte_flow_item_icmpv6_tgt_addr_mask = {
> > +	.addr =
> > +		"\xff\xff\xff\xff\xff\xff\xff\xff"
> > +		"\xff\xff\xff\xff\xff\xff\xff\xff",
> > +};
> > +#endif
> > +
> > +/**
> > + * RTE_FLOW_ITEM_TYPE_ICPMV6_SLL.
> > + *
> > + * Matches ICMPv6 Source Link-Layer address.
> > + */
> > +struct rte_flow_item_icmpv6_sll {
> > +	struct ether_addr addr;
> > +};
> 
> See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH and
> struct rte_flow_item_type_nd6_opt_sla_eth.
> 
> Also Doxygen documentation is missing for the addr field and you need to
> describe that it is only valid when found after either
> RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> 
> Also missing empty line here.
> 
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_SLL */
> 
> Missing "."
> 
> > +#ifndef __cplusplus
> > +static const struct rte_flow_item_icmpv6_sll
> rte_flow_item_icmpv6_sll_mask = {
> > +	.addr = {
> > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > +	}
> > +};
> > +#endif
> > +
> > +/**
> > + * RTE_FLOW_ITEM_TYPE_ICMPV6_TLL.
> > + *
> > + * Matches ICMPv6 Target Link-Layer address.
> > + */
> > +struct rte_flow_item_icmpv6_tll {
> > +	struct ether_addr addr;
> > +};
> 
> See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH
> and struct rte_flow_item_type_nd6_opt_tla_eth.
> 
> Also Doxygen documentation is missing for the addr field and you need to
> describe that it is only valid when found after either
> RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.
> 
> Also missing empty line here.
> 
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TLL */
> 
> Missing "."
> 
> > +#ifndef __cplusplus
> > +static const struct rte_flow_item_icmpv6_tll
> rte_flow_item_icmpv6_tll_mask = {
> > +	.addr = {
> > +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > +	}
> > +};
> > +#endif
> > +
> > +/**
> >   * Matching pattern item definition.
> >   *
> >   * A pattern is formed by stacking items starting from the lowest
> > protocol
> > --
> > 2.7.4
> >
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-11 11:39  0%                 ` Ananyev, Konstantin
@ 2018-04-11 17:08  0%                   ` Yongseok Koh
  2018-04-12 16:34  0%                     ` Ananyev, Konstantin
  0 siblings, 1 reply; 200+ results
From: Yongseok Koh @ 2018-04-11 17:08 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Olivier Matz, Lu, Wenzhuo, Wu, Jingjing, Adrien Mazarguil,
	Nélio Laranjeiro, dev

On Wed, Apr 11, 2018 at 11:39:47AM +0000, Ananyev, Konstantin wrote:
> 
> Hi Yongseok,
> 
> > > >
> > > > On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > > > > Hi Yongseok,
> > > > >
> > > > > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > > > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > > > > Hi,
> > > > > > >
> > > > > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > > > > rte_pktmbuf_attach_at().
> > > > > > > >
> > > > > > > > Possible use-cases could be:
> > > > > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > > > > >
> > > > > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > > > > >
> > > > > > > I think the current API is already able to do what you want.
> > > > > > >
> > > > > > > 1/ Here is a mbuf m with its data
> > > > > > >
> > > > > > >                off
> > > > > > >                <-->
> > > > > > >                       len
> > > > > > >           +----+   <---------->
> > > > > > >           |    |
> > > > > > >         +-|----v----------------------+
> > > > > > >         | |    -----------------------|
> > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > >         |      -----------------------|
> > > > > > >         +-----------------------------+
> > > > > > >
> > > > > > >
> > > > > > > 2/ clone m:
> > > > > > >
> > > > > > >   c = rte_pktmbuf_alloc(pool);
> > > > > > >   rte_pktmbuf_attach(c, m);
> > > > > > >
> > > > > > >   Note that c has its own offset and length fields.
> > > > > > >
> > > > > > >
> > > > > > >                off
> > > > > > >                <-->
> > > > > > >                       len
> > > > > > >           +----+   <---------->
> > > > > > >           |    |
> > > > > > >         +-|----v----------------------+
> > > > > > >         | |    -----------------------|
> > > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > > >         |      -----------------------|
> > > > > > >         +------^----------------------+
> > > > > > >                |
> > > > > > >           +----+
> > > > > > > indirect  |
> > > > > > >         +-|---------------------------+
> > > > > > >         | |    -----------------------|
> > > > > > > c       | buf  |                     ||
> > > > > > >         |      -----------------------|
> > > > > > >         +-----------------------------+
> > > > > > >
> > > > > > >                 off    len
> > > > > > >                 <--><---------->
> > > > > > >
> > > > > > >
> > > > > > > 3/ remove some data from c without changing m
> > > > > > >
> > > > > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > > > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > > > > >
> > > > > > >
> > > > > > > Please let me know if it fits your needs.
> > > > > >
> > > > > > No, it doesn't.
> > > > > >
> > > > > > Trimming head and tail with the current APIs removes data and make the space
> > > > > > available. Adjusting packet head means giving more headroom, not shifting the
> > > > > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > > > > difference offsets in m,
> > > > > >
> > > > > > rte_pktmbuf_adj(c1, 10);
> > > > > > rte_pktmbuf_adj(c2, 20);
> > > > > >
> > > > > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > > > > wants to attach outer header, it will overwrite the headroom even though the
> > > > > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > > > > header should be linked by h1->next = c2.
> > > > >
> > > > > Yes, after these operations c1, c2 and m should become read-only. So, to
> > > > > prepend headers, another mbuf has to be inserted before as you suggest. It
> > > > > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > > > > length) that will:
> > > > >   - alloc and attach indirect mbuf for each segment of m that is
> > > > >     in the range [offset : length+offset].
> > > > >   - prepend an empty and writable mbuf for the headers
> > > > >
> > > > > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > > > > which actually shrink the headroom, this case can be properly handled.
> > > > >
> > > > > What do you mean by properly handled?
> > > > >
> > > > > Yes, prepending data or adding data in the indirect mbuf won't override
> > > > > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > > > > won't be protected.
> > > > >
> > > > > From an application point of view, indirect mbufs, or direct mbufs that
> > > > > have refcnt != 1, should be both considered as read-only because they
> > > > > may share their data. How an application can know if the data is shared
> > > > > or not?
> > > > >
> > > > > Maybe we need a flag to differentiate mbufs that are read-only
> > > > > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > > > > understanding is correct, you want to have indirect mbufs with RW data.
> > > >
> > > > Agree that indirect mbuf must be treated as read-only, Then the current code is
> > > > enough to handle that use-case.
> > > >
> > > > > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > > > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > > > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > > > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > > > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > > > > indirect referencing.
> > >
> > > But just to make HW to RX multiple packets into one mbuf,
> > > data_off inside indirect mbuf should be enough, correct?
> > Right. Current max buffer len of mbuf is 64kB (16bits) but it is enough for mlx5
> > to reach to 100Gbps with 64B traffic (149Mpps). I made mlx5 HW put 16 packets in
> > a buffer. So, it needs ~32kB buffer. Having more bits in length fields would be
> > better but 16-bit is good enough to overcome the PCIe Gen3 bottleneck in order
> > to saturate the network link.
> 
> There were few complains that 64KB max is a limitation for some use-cases.
> I am not against increasing it, but I don't think we have free space on first cache-line for that
> without another big rework of mbuf layout. 
> Considering that we need to increase size for buf_len, data_off, data_len, and probably priv_size too. 
> 
> > 
> > > As I understand, what you'd like to achieve with this new field -
> > > ability to manipulate packet boundaries after RX, probably at upper layer.
> > > As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
> > > indirect mbufs trying to modify same direct buffer.
> > 
> > I agree that there's an implication that indirect mbuf or mbuf having refcnt > 1
> > is read-only. What that means, all the entities which own such mbufs have to be
> > aware of that and keep the principle as DPDK can't enforce the rule and there
> > can't be such sanity check. In this sense, HW doesn't violate it because the
> > direct mbuf is injected to HW before indirection. When packets are written by
> > HW, PMD attaches indirect mbufs to the direct mbuf and deliver those to
> > application layer with freeing the original direct mbuf (decrement refcnt by 1).
> > So, HW doesn't touch the direct buffer once it reaches to upper layer.
> 
> Yes, I understand that. But as I can see you introduced functions to adjust head and tail,
> which implies that it should be possible by some entity (upper layer?) to manipulate these
> indirect mbufs.
> And we don't know how exactly it will be done.

That's a valid concern. I can make it private by merging it into the
_attach_to() func, or I can just add a comment in the API doc. However, if
users are aware that a mbuf is read-only and we expect them to keep it intact
by their own judgement, they would/should not use those APIs. We can't stop
them from modifying the content or the buffer itself anyway. Will add more
comments about the read-only assumption based on this discussion.
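
For what it's worth, an application that wants to stay on the safe side could
use a check like the following before writing into a buffer (a sketch relying
only on existing mbuf helpers, nothing added by this patch):

 #include <rte_mbuf.h>

 /* Data may be shared if the mbuf is a clone (indirect) or if its
  * reference counter shows more than one owner. */
 static inline int
 mbuf_data_is_shared(const struct rte_mbuf *m)
 {
 	return RTE_MBUF_INDIRECT(m) || rte_mbuf_refcnt_read(m) > 1;
 }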

> > The direct buffer will be freed and get available for reuse when all the attached
> > indirect mbufs are freed.
> > 
> > > Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
> > > Fields for indirect mbufs, straight after attach()?
> > 
> > Good point.
> > Actually that was my draft (Mellanox internal) version of this patch :-) But I
> > had to consider a case where priv_size is really given by user. Even though it
> > is less likely, but if original priv_size is quite big, it can't cover entire
> > buf_len. For this, I had to increase priv_size to 32-bit but adding another
> > 16bit field (buf_off) looked more plausible.
> 
> As I remember, we can't have mbufs bigger then 64K,
> so priv_size + buf_len should be always less than 64K, correct?

Can you let me know where I can find the constraint? I checked
rte_pktmbuf_pool_create() and rte_pktmbuf_init() again to not make any mistake
but there's no such limitation.

	elt_size = sizeof(struct rte_mbuf) + (unsigned)priv_size +
		(unsigned)data_room_size;

The max of data_room_size is 64kB, and so is priv_size. m->buf_addr starts at 'm +
sizeof(*m) + priv_size' and m->buf_len can't be larger than UINT16_MAX. So,
priv_size can't be used for this purpose.
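
Just to illustrate the layout from the code above (a rough sketch of how one
element is carved up):

 /*
  *  elt_size = sizeof(struct rte_mbuf) + priv_size + data_room_size
  *
  *  +------------------+-----------+--------------------------------+
  *  | struct rte_mbuf  | priv_size | data room (buf_len <= 64kB)    |
  *  +------------------+-----------+--------------------------------+
  *  m                              m->buf_addr = m + sizeof(*m) + priv_size
  *
  * priv_size and data_room_size are each capped at 64kB, but nothing
  * caps elt_size as a whole.
  */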

Yongseok

> > > > > >
> > > > > > Does this make sense?
> > > > >
> > > > > I understand the need.
> > > > >
> > > > > Another option would be to make the mbuf->buffer point to an external
> > > > > buffer (not inside the direct mbuf). This would require to add a
> > > > > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > > > > a quick overview.
> > > > >
> > > > > [1]
> > > >
> > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdpdksummit.com%2FArchive%2Fpdf%2F2016Userspace%2FDay01
> > > > -Session05-OlivierMatz-
> > > >
> > Userspace2016.pdf&data=02%7C01%7Cyskoh%40mellanox.com%7Ca5405edb36e445e6540808d59e339a38%7Ca652971c7d2e4d9ba6a4d
> > > > 149256f461b%7C0%7C0%7C636588866861082855&sdata=llw%2BwiY5cC56naOUhBbIg8TKtfFN6VZcIRY5PV7VqZs%3D&reserved=0
> > > > >
> > > > > The advantage is that it does not require the large data to be inside a
> > > > > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > > > > allocated from a mempool). On the other hand, it is maybe more complex
> > > > > to implement compared to your solution.
> > > >
> > > > I knew that you presented the slides and frankly, I had considered that option
> > > > at first. But even with that option, metadata to store refcnt should also be
> > > > allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> > > > end of the data segment. Even though it could have smaller metadata structure,
> > > > I just wanted to make full use of the existing framework because it is less
> > > > complex as you mentioned. Given that you presented the idea of external data
> > > > buffer in 2016 and there hasn't been many follow-up discussions/activities so
> > > > far, I thought the demand isn't so big yet thus I wanted to make this patch
> > > > simpler.  I personally think that we can take the idea of external data seg when
> > > > more demands come from users in the future as it would be a huge change and may
> > > > break current ABI/API. When the day comes, I'll gladly participate in the
> > > > discussions and write codes for it if I can be helpful.
> > > >
> > > > Do you think this patch is okay for now?
> > > >
> > > >
> > > > Thanks for your comments,
> > > > Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 3/4] ether: add more protocol support in flow API
  @ 2018-04-11 16:32  2%     ` Adrien Mazarguil
  2018-04-12  5:12  0%       ` Zhang, Qi Z
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-11 16:32 UTC (permalink / raw)
  To: Qi Zhang
  Cc: dev, declan.doherty, sugesh.chandran, michael.j.glynn, yu.y.liu,
	konstantin.ananyev, bruce.richardson

On Sun, Apr 01, 2018 at 05:19:21PM -0400, Qi Zhang wrote:
> Add new protocol header match support as below
> 
> RTE_FLOW_ITEM_TYPE_ARP
> 	- match IPv4 ARP header
> RTE_FLOW_ITEM_TYPE_EXT_HDR_ANY
> 	- match any IPv6 extension header

While properly defined in the patch, "IPV6" is missing here.

> RTE_FLOW_ITEM_TYPE_ICMPV6
> 	- match IPv6 ICMP header
> RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> 	- match IPv6 ICMP Target address
> RTE_FLOW_ITEM_TYPE_ICMPV6_SSL
> 	- match IPv6 ICMP Source Link-layer address
> RTE_FLOW_ITEM_TYPE_ICMPV6_TTL
> 	- match IPv6 ICMP Target Link-layer address
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>

First, since they are added at the end of enum rte_flow_item_type, no ABI
breakage notice is necessary.

However testpmd implementation [1][2] and documentation update [3][4] are
mandatory for all new pattern items and actions.

More comments below regarding these definitions.

[1] flow_item[] in app/test-pmd/config.c
[2] using ITEM_ICMP as an example in app/test-pmd/cmdline_flow.c
[3] "Pattern items" section in doc/guides/testpmd_app_ug/testpmd_funcs.rst
[4] using "Item: ``ICMP``" section as an example in
    doc/guides/prog_guide/rte_flow.rst

> ---
>  lib/librte_ether/rte_flow.h | 160 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 160 insertions(+)
> 
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 8f75db0..a8ec780 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -323,6 +323,49 @@ enum rte_flow_item_type {
>  	 * See struct rte_flow_item_geneve.
>  	 */
>  	RTE_FLOW_ITEM_TYPE_GENEVE,
> +
> +	/**
> +	 * Matches ARP IPv4 header.

=> Matches an IPv4 ARP header.

> +	 *
> +	 * See struct rte_flow_item_arp.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_ARP,

While you're right to make "IPv4" clear since ARP is also used for other
protocols DPDK doesn't support (and likely never will), the ARP header has
both a fixed and a variably-sized part.

Ideally an ARP pattern item should match the fixed part only and a separate
ARP_IPV4 match its payload, somewhat like you did for ICMPv6/NDP below.

Problem is that in DPDK, struct arp_hdr includes struct arp_ipv4, so one
suggestion would be to rename this pattern item ARP_IPV4 directly:

=> RTE_FLOW_ITEM_TYPE_ARP_IPV4

> +
> +	/**
> +	 * Matches any IPv6 Extension header.

=> Matches an IPv6 extension header.

> +	 *
> +	 * See struct rte_flow_item_ipv6_ext_any.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY,

I'm not sure this definition is necessary, more below about that.

Also I don't see a benefit in having "ANY" part of the name, if you want to
keep it, I suggest the simpler:

=> RTE_FLOW_ITEM_TYPE_IPV6_EXT

> +
> +	/**
> +	 * Matches ICMPv6 header.

=> Matches an ICMPv6 header.

> +	 *
> +	 * See struct rte_flow_item_icmpv6

Missing "."

> +	 */
> +	RTE_FLOW_ITEM_TYPE_ICMPV6,
> +

Before entering NDP territory below, I understand those should be stacked on
top of RTE_FLOW_ITEM_TYPE_ICMPV6. It's fine but for clarity they should be
named after the NDP types they represent, not inner data fields.

Also I think we should consider NDP as a protocol sitting on top of
ICMPv6. We could therefore drop "ICMP" from these definitions.

Since "ND" is a common shorthand for this protocol and "6" another when
doing something related to IPv6, I suggest using "ND6" to name the related
pattern items.

These are the reasons behind my next suggestions:

> +	/**
> +	 * Match ICMPv6 target address.
> +	 *
> +	 * See struct rte_flow_item_icmpv6_tgt_addr.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR,

=> Matches an IPv6 network discovery router solicitation.
=> See struct rte_flow_item_nd6_rs.
=> RTE_FLOW_ITEM_TYPE_ND6_RS,

You should add another item for neighbor advertisement messages using the
same template:

=> Match an IPv6 network discovery neighbor advertisement.
=> See struct rte_flow_item_nd6_na.
=> RTE_FLOW_ITEM_TYPE_ND6_NA,

The following are possible options for these headers, if specified they must
be found afterward. Also since IPv6 may run on top of protocols other than
Ethernet, you need to clarify these link-layer addresses use the Ethernet
format:

> +
> +	/**
> +	 * Match ICMPv6 Source Link-Layer Address.
> +	 *
> +	 * See struct rte_flow_item_icmpv6_sll.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_ICMPV6_SLL,

=> Matches an IPv6 network discovery source Ethernet link-layer address option.
=> See struct rte_flow_item_nd6_opt_sla_eth.
=> RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH,

> +
> +	/**
> +	 * Match ICMPv6 Target Link-Layer Address.
> +	 *
> +	 * See struct rte_flow_item_icmpv6_tll.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_ICMPV6_TLL,

=> Matches an IPv6 network discovery target Ethernet link-layer address option.
=> See struct rte_flow_item_nd6_opt_tla_eth.
=> RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH,

> +

Unnecessary empty line.

>  };
>  
>  /**
> @@ -815,6 +858,123 @@ static const struct rte_flow_item_geneve rte_flow_item_geneve_mask = {
>  #endif
>  
>  /**
> + * RTE_FLOW_ITEM_TYPE_ARP
> + *
> + * Matches IPv4 ARP packet header

As above:

=> Matches an IPv4 ARP header.
=> RTE_FLOW_ITEM_TYPE_ARP_IPV4

> + */
> +struct rte_flow_item_arp {
> +	struct arp_hdr hdr;
> +};

Needs #include <rte_arp.h> and a Doxygen comment next to hdr for
consistency, see ICMP and other definitions.

> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_ARP. */
> +#ifndef __cplusplus
> +static const struct rte_flow_item_arp rte_flow_item_arp_mask = {
> +	.hdr = {
> +		.arp_data = {
> +			.arp_sha = {
> +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> +			},
> +			.arp_sip = RTE_BE32(0xffffffff),
> +			.arp_tha = {
> +				.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> +			},
> +			.arp_tip = RTE_BE32(0xffffffff),
> +		},
> +	},
> +};
> +#endif
> +
> +/**
> + * RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY
> + *
> + * Matches any IPv6 extension header.
> + */
> +struct rte_flow_item_ipv6_ext_hdr_any {
> +	uint8_t next_hdr;
> +};

So what's the point? next_hdr is already part of either struct ipv6_hdr
("proto") and individual extension headers. Moreover it's implicit if an
extension header is provided in a pattern.

How about removing it?

> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_IPV6_EXT_HDR_ANY. */
> +#ifndef __cplusplus
> +static const
> +struct rte_flow_item_ipv6_ext_hdr_any rte_flow_item_ipv6_ext_any_mask = {
> +	.next_hdr = 0xff,
> +};
> +#endif

Ditto.

> +
> +/**
> + * RTE_FLOW_ITEM_TYPE_ICMPV6
> + *
> + * Matches ICMPv6 header.

=> Matches an ICMPv6 header.

> + */
> +struct rte_flow_item_icmpv6 {
> +	uint8_t type;
> +	uint8_t code;
> +	uint16_t checksum;

The last 32-bit "reserved" data field is missing.

> +};

Too bad there is no struct icmp6_hdr definition in rte_icmp.h. You could add
it. In any case Doxygen comments are missing, please add them (see other
structure definitions for examples).

> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6 */

Missing "."

> +#ifndef __cplusplus
> +static const struct rte_flow_item_icmpv6 rte_flow_item_icmpv6_mask = {
> +	.type = 0xff,
> +	.code = 0xff,
> +	.checksum = RTE_BE16(0xffff),
> +};
> +#endif

You must remove checksum matching from the default mask. That's the last
thing an application might want to match exactly :)

> +
> +/**
> + * RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR
> + *
> + * Matches ICMPv6's Target Address.
> + */
> +struct rte_flow_item_icmpv6_tgt_addr {
> +	uint8_t addr[16];
> +};

You need to expand this as two items, see prior comments regarding
RTE_FLOW_ITEM_TYPE_ND6_RS, RTE_FLOW_ITEM_TYPE_ND6_NA and their respective
structs rte_flow_item_nd6_rs and rte_flow_item_nd6_na.

Also Doxygen documentation is missing for the addr field and you need to
describe that these are only valid when used after
RTE_FLOW_ITEM_TYPE_ICMPV6.

> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TGT_ADDR */

Missing "."

> +#ifndef __cplusplus
> +static const
> +struct rte_flow_item_icmpv6_tgt_addr rte_flow_item_icmpv6_tgt_addr_mask = {
> +	.addr =
> +		"\xff\xff\xff\xff\xff\xff\xff\xff"
> +		"\xff\xff\xff\xff\xff\xff\xff\xff",
> +};
> +#endif
> +
> +/**
> + * RTE_FLOW_ITEM_TYPE_ICPMV6_SLL.
> + *
> + * Matches ICMPv6 Source Link-Layer address.
> + */
> +struct rte_flow_item_icmpv6_sll {
> +	struct ether_addr addr;
> +};

See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH and struct
rte_flow_item_nd6_opt_sla_eth.

Also Doxygen documentation is missing for the addr field and you need to
describe that it is only valid when found after either
RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.

Also missing empty line here.
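
To make the expectation concrete, the item could look roughly like this
(sketch only; the field name is a placeholder, and the TLA counterpart
would be symmetrical):

/**
 * RTE_FLOW_ITEM_TYPE_ND6_OPT_SLA_ETH
 *
 * Matches an IPv6 network discovery source Ethernet link-layer address
 * option.
 *
 * Only valid when found after RTE_FLOW_ITEM_TYPE_ND6_RS or
 * RTE_FLOW_ITEM_TYPE_ND6_NA.
 */
struct rte_flow_item_nd6_opt_sla_eth {
	struct ether_addr sla; /**< Source Ethernet link-layer address. */
};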

> +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_SLL */

Missing "."

> +#ifndef __cplusplus
> +static const struct rte_flow_item_icmpv6_sll rte_flow_item_icmpv6_sll_mask = {
> +	.addr = {
> +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> +	}
> +};
> +#endif
> +
> +/**
> + * RTE_FLOW_ITEM_TYPE_ICMPV6_TLL.
> + *
> + * Matches ICMPv6 Target Link-Layer address.
> + */
> +struct rte_flow_item_icmpv6_tll {
> +	struct ether_addr addr;
> +};

See prior comments regarding RTE_FLOW_ITEM_TYPE_ND6_OPT_TLA_ETH and struct
rte_flow_item_nd6_opt_tla_eth.

Also Doxygen documentation is missing for the addr field and you need to
describe that it is only valid when found after either
RTE_FLOW_ITEM_TYPE_ND6_RS or RTE_FLOW_ITEM_TYPE_ND6_NA.

Also missing empty line here.

> +/** Default mask for RTE_FLOW_ITEM_TYPE_ICMPV6_TLL */

Missing "."

> +#ifndef __cplusplus
> +static const struct rte_flow_item_icmpv6_tll rte_flow_item_icmpv6_tll_mask = {
> +	.addr = {
> +		.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> +	}
> +};
> +#endif
> +
> +/**
>   * Matching pattern item definition.
>   *
>   * A pattern is formed by stacking items starting from the lowest protocol
> -- 
> 2.7.4
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v2 2/4] ether: add flow last hit query support
  @ 2018-04-11 16:31  3%     ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-11 16:31 UTC (permalink / raw)
  To: Qi Zhang
  Cc: dev, declan.doherty, sugesh.chandran, michael.j.glynn, yu.y.liu,
	konstantin.ananyev, bruce.richardson

On Sun, Apr 01, 2018 at 05:19:20PM -0400, Qi Zhang wrote:
> Enhanced the action RTE_FLOW_TYPE_ACTION_COUNT, number of
> milliseconds since last hit can be queried.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>

Please confirm whether existing devices have the ability to report the time
elapsed since the last hit, or whether PMDs are supposed to take care of
that entirely on their own in software.

If the latter, I suggest dropping this patch and letting applications check
counters regularly on their own. Unlike applications, PMDs do not easily
have access to a reliable time source.
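
For the record, an application can already approximate this on its own,
e.g. (rough sketch, not part of the patch; assumes a COUNT action in the
rule and the current rte_flow_query() signature taking an action type):

#include <rte_cycles.h>
#include <rte_flow.h>

static uint64_t prev_hits;
static uint64_t last_hit_cycles;

static void
poll_last_hit(uint16_t port_id, struct rte_flow *flow)
{
	struct rte_flow_query_count query = { .reset = 0 };
	struct rte_flow_error error;

	if (rte_flow_query(port_id, flow, RTE_FLOW_ACTION_TYPE_COUNT,
			   &query, &error) == 0 &&
	    query.hits_set && query.hits != prev_hits) {
		prev_hits = query.hits;
		last_hit_cycles = rte_rdtsc(); /* application time source */
	}
}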

Otherwise, the patch looks acceptable, but I can't tell whether milliseconds
are the right unit for such information. It is basically the same issue as
mbuf timestamps [1]. As a 64-bit field, precision down to the nanosecond is
possible, so perhaps, like mbufs, the reference and precision should be
left undefined in the API and handled by a PMD callback?

More comments below.

[1] commit 918ae9dc775e ("mbuf: add a timestamp field")

> ---
>  lib/librte_ether/rte_flow.h | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 1080086..8f75db0 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -1054,9 +1054,11 @@ struct rte_flow_query_count {
>  	uint32_t reset:1; /**< Reset counters after query [in]. */
>  	uint32_t hits_set:1; /**< hits field is set [out]. */
>  	uint32_t bytes_set:1; /**< bytes field is set [out]. */
> +	uint32_t last_hit_set:1; /**< last_hit field is set [out]. */
>  	uint32_t reserved:29; /**< Reserved, must be zero [in, out]. */

You need to decrement reserved bits.
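
I.e. with the new bit accounted for:

	uint32_t reset:1; /**< Reset counters after query [in]. */
	uint32_t hits_set:1; /**< hits field is set [out]. */
	uint32_t bytes_set:1; /**< bytes field is set [out]. */
	uint32_t last_hit_set:1; /**< last_hit field is set [out]. */
	uint32_t reserved:28; /**< Reserved, must be zero [in, out]. */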

>  	uint64_t hits; /**< Number of hits for this rule [out]. */
>  	uint64_t bytes; /**< Number of bytes through this rule [out]. */
> +	uint64_t last_hit; /**< Number of milliseconds since last hit [out]. */
>  };

Doing so impacts ABI compatibility. While that is normally frowned upon for
rte_flow, it's OK for 18.05 because ABI compatibility is already broken
there. You still need to mention which functions are impacted by this
change, as in "ethdev: add encap level to RSS flow API action" [2], and
update the .map files where necessary.

In this case at least rte_flow_query() is impacted.

Please update doc/guides/prog_guide/rte_flow.rst as well (look for "COUNT
query").

[2] http://dpdk.org/ml/archives/dev/2018-April/096531.html

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in flow API
  2018-04-09 14:42  0%       ` Adrien Mazarguil
@ 2018-04-11 13:21  0%         ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 13:21 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Xueming Li, Wenzhuo Lu,
	Jingjing Wu, Beilei Xing, Qi Zhang, Konstantin Ananyev,
	Nelio Laranjeiro, Yongseok Koh, Pascal Mazon, Radu Nicolau,
	Akhil Goyal, Ivan Malov

On 04/09/2018 05:42 PM, Adrien Mazarguil wrote:
> On Sat, Apr 07, 2018 at 12:05:51PM +0300, Andrew Rybchenko wrote:
>> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
>>> Since its inception, the rte_flow RSS action has been relying in part on
>>> external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
>>> This structure lacks parameters such as the hash algorithm to use, and more
>>> recently, a method to tell which layer RSS should be performed on [1].
>>>
>>> Given struct rte_eth_rss_conf will never be flexible enough to represent a
>>> complete RSS configuration (e.g. RETA table), this patch supersedes it by
>>> extending the rte_flow RSS action directly.
>>>
>>> A subsequent patch will add a field to use a non-default RSS hash
>>> algorithm. To that end, a field named "types" replaces the field formerly
>>> known as "rss_hf" and standing for "RSS hash functions" as it was
>>> confusing. Actual RSS hash function types are defined by enum
>>> rte_eth_hash_function.
>>> This patch updates all PMDs and example applications accordingly.
>>>
>>> It breaks ABI compatibility for the following public functions:
>>>
>>> - rte_flow_copy()
>>> - rte_flow_create()
>>> - rte_flow_query()
>>> - rte_flow_validate()
>>>
>>> [1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
>>>       configuration")
>>>
>>> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
>>> Cc: Xueming Li <xuemingl@mellanox.com>
>>> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
>>> Cc: Thomas Monjalon <thomas@monjalon.net>
>>> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
>>> Cc: Jingjing Wu <jingjing.wu@intel.com>
>>> Cc: Beilei Xing <beilei.xing@intel.com>
>>> Cc: Qi Zhang <qi.z.zhang@intel.com>
>>> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
>>> Cc: Yongseok Koh <yskoh@mellanox.com>
>>> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
>>> Cc: Pascal Mazon <pascal.mazon@6wind.com>
>>> Cc: Radu Nicolau <radu.nicolau@intel.com>
>>> Cc: Akhil Goyal <akhil.goyal@nxp.com>
>>> ---
>>>    app/test-pmd/cmdline_flow.c        |  59 +++++-----
>>>    app/test-pmd/config.c              |  39 +++----
>>>    doc/guides/prog_guide/rte_flow.rst |  22 ++--
>>>    drivers/net/e1000/e1000_ethdev.h   |  13 ++-
>>>    drivers/net/e1000/igb_ethdev.c     |   4 +-
>>>    drivers/net/e1000/igb_flow.c       |  31 ++---
>>>    drivers/net/e1000/igb_rxtx.c       |  51 +++++++--
>>>    drivers/net/i40e/i40e_ethdev.c     |  53 +++++++--
>>>    drivers/net/i40e/i40e_ethdev.h     |  15 ++-
>>>    drivers/net/i40e/i40e_flow.c       |  47 ++++----
>>>    drivers/net/ixgbe/ixgbe_ethdev.c   |   4 +-
>>>    drivers/net/ixgbe/ixgbe_ethdev.h   |  13 ++-
>>>    drivers/net/ixgbe/ixgbe_flow.c     |  30 ++---
>>>    drivers/net/ixgbe/ixgbe_rxtx.c     |  51 +++++++--
>>>    drivers/net/mlx4/mlx4.c            |   2 +-
>>>    drivers/net/mlx4/mlx4_flow.c       |  61 +++++-----
>>>    drivers/net/mlx4/mlx4_flow.h       |   2 +-
>>>    drivers/net/mlx4/mlx4_rxq.c        |   2 +-
>>>    drivers/net/mlx4/mlx4_rxtx.h       |   2 +-
>>>    drivers/net/mlx5/mlx5_flow.c       | 193 +++++++++++++++-----------------
>>>    drivers/net/mlx5/mlx5_rxq.c        |  22 ++--
>>>    drivers/net/mlx5/mlx5_rxtx.h       |  26 +++--
>>>    drivers/net/sfc/sfc_flow.c         |  21 ++--
>>>    drivers/net/tap/tap_flow.c         |   8 +-
>>>    examples/ipsec-secgw/ipsec.c       |  10 +-
>>>    lib/librte_ether/rte_flow.c        |  39 +++----
>>>    lib/librte_ether/rte_flow.h        |   6 +-
>>>    27 files changed, 473 insertions(+), 353 deletions(-)
>> <...>
>>
>>> diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
>>> index 056405515..1a2c0299c 100644
>>> --- a/drivers/net/sfc/sfc_flow.c
>>> +++ b/drivers/net/sfc/sfc_flow.c
>>> @@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>>>    	struct sfc_rxq *rxq;
>>>    	unsigned int rxq_hw_index_min;
>>>    	unsigned int rxq_hw_index_max;
>>> -	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
>>> -	uint64_t rss_hf;
>>> -	uint8_t *rss_key = NULL;
>>> +	const uint8_t *rss_key;
>>>    	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
>>>    	unsigned int i;
>>> -	if (rss->num == 0)
>>> +	if (rss->queue_num == 0)
>>>    		return -EINVAL;
>>>    	rxq_sw_index = sa->rxq_count - 1;
>>> @@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>>>    	rxq_hw_index_min = rxq->hw_index;
>>>    	rxq_hw_index_max = 0;
>>> -	for (i = 0; i < rss->num; ++i) {
>>> +	for (i = 0; i < rss->queue_num; ++i) {
>>>    		rxq_sw_index = rss->queue[i];
>>>    		if (rxq_sw_index >= sa->rxq_count)
>>> @@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>>>    			rxq_hw_index_max = rxq->hw_index;
>>>    	}
>>> -	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;
>> Here we had a fallback to default rss_hf (now types) if rss_conf is
>> unspecified.
> Thing is, rss_action->conf was never supposed to be NULL in the first
> place. Crashing on a NULL configuration has always been fine, but until
> recently prevented validation with testpmd's broken implementation. This
> problem was addressed in a prior series [1][2][3].
>
> Since a value is now always provided, no need for a fallback.

testpmd is not the only application. But in any case I agree that it was
possible to have rss_hf==0 before. So, no big changes.

> [1] "app/testpmd: fix lack of flow action configuration"
>      http://dpdk.org/ml/archives/dev/2018-April/095280.html
> [2] "app/testpmd: fix RSS flow action configuration"
>      http://dpdk.org/ml/archives/dev/2018-April/095281.html
> [3] "app/testpmd: fix missing RSS fields in flow action"
>      http://dpdk.org/ml/archives/dev/2018-April/095282.html
>
>>> -	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
>>> +	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
>>>    		return -EINVAL;
>>> -	if (rss_conf != NULL) {
>>> -		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
>>> +	if (rss->key_len) {
>>> +		if (rss->key_len != sizeof(sa->rss_key))
>>>    			return -EINVAL;
>>> -		rss_key = rss_conf->rss_key;
>>> +		rss_key = rss->key;
>>>    	} else {
>>>    		rss_key = sa->rss_key;
>>>    	}
>>> @@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>>>    	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
>>>    	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
>>> -	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
>>> +	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
>> Now types go directly to mapping function and unspecified types (0)
>> will result in 0 rss_hash_types. Of course, it is a question how to treat
>> types==0. It is possible to say that it no RSS, but it does not make sense.
>> So, real options are device defaults (regardless configured on device level)
>> or device config (rx_adv.conf.rss_conf.rss_hf). I would prefer the later.
>> Please, document the intended behaviour in rte_flow.rst.
> Granted the existing documentation doesn't say much on that topic, but a 0
> value for rss_hf does actually mean "no RSS" [4]:
>
>   "The *rss_hf* field of the *rss_conf* structure indicates the different
>    types of IPv4/IPv6 packets to which the RSS hashing must be applied.
>    Supplying an *rss_hf* equal to zero disables the RSS feature."
>
> Now since this action doesn't use struct rte_eth_rss_conf anymore, we could
> define 0 as a PMD-specific behavior, which could be no RSS. It would make
> the API easier to use for applications that don't care about the RSS
> capabilities of each underlying adapter, 0 would just work everywhere as a
> safe default.

PMD-specific is fine within some limits. It should be either the device RSS
config or the device defaults. I think it is a bad idea to allow types=0 to
disable RSS as an option of the PMD-specific behaviour.

> [4] https://dpdk.org/doc/api/structrte__eth__rss__conf.html
>
>> If the later is chosen, above we'll have a bug since fallback to fixed
>> default.
>> Just use sa->rss_hash_types as fallback. Something like:
>> if (rss->types)
>>      sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
>> else
>>      sfc_rss_conf->rss_hash_types =sa->rss_hash_types;
> Looks like the previous code didn't provide a fallback when rss_hf was 0,
> only when rss_conf itself was NULL. So this is not a new issue introduced by
> this patch.

Yes, I agree.

> I will update documentation to define 0 as described above for the
> convenience of application writers and leave the existing code in place.
> PMD maintainers will be free to enhance it as they wish later.
> Just remember testpmd now always provides a default value for it after
> querying the device [2].

Many thanks.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in flow API
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
@ 2018-04-11 13:06  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 13:06 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh, Pascal Mazon,
	Radu Nicolau, Akhil Goyal

On 04/10/2018 07:36 PM, Adrien Mazarguil wrote:
> Since its inception, the rte_flow RSS action has been relying in part on
> external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
> This structure lacks parameters such as the hash algorithm to use, and more
> recently, a method to tell which layer RSS should be performed on [1].
>
> Given struct rte_eth_rss_conf will never be flexible enough to represent a
> complete RSS configuration (e.g. RETA table), this patch supersedes it by
> extending the rte_flow RSS action directly.
>
> A subsequent patch will add a field to use a non-default RSS hash
> algorithm. To that end, a field named "types" replaces the field formerly
> known as "rss_hf" and standing for "RSS hash functions" as it was
> confusing. Actual RSS hash function types are defined by enum
> rte_eth_hash_function.
>
> This patch updates all PMDs and example applications accordingly.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> [1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
>      configuration")
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Xueming Li <xuemingl@mellanox.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
> Cc: Radu Nicolau <radu.nicolau@intel.com>
> Cc: Akhil Goyal <akhil.goyal@nxp.com>
>
> ---
>
> v3 changes:
>
> Documentation update regarding the meaning of a 0 value for RSS types in
> flow rules.
>
> It used to implicitly mean "no RSS" but is redefined as requesting a kind
> of "best-effort" mode from PMDs, i.e. anything ranging from empty to
> all-inclusive RSS; what matters is it provides safe defaults that will work
> regardless of PMD capabilities.
> ---
>   app/test-pmd/cmdline_flow.c                 |  48 +++---
>   app/test-pmd/config.c                       |  39 ++---
>   doc/guides/prog_guide/rte_flow.rst          |  28 ++--
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |   6 +-
>   drivers/net/e1000/e1000_ethdev.h            |  13 +-
>   drivers/net/e1000/igb_ethdev.c              |   4 +-
>   drivers/net/e1000/igb_flow.c                |  31 ++--
>   drivers/net/e1000/igb_rxtx.c                |  51 +++++-
>   drivers/net/i40e/i40e_ethdev.c              |  53 +++++--
>   drivers/net/i40e/i40e_ethdev.h              |  15 +-
>   drivers/net/i40e/i40e_flow.c                |  57 ++++---
>   drivers/net/ixgbe/ixgbe_ethdev.c            |   4 +-
>   drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
>   drivers/net/ixgbe/ixgbe_flow.c              |  30 ++--
>   drivers/net/ixgbe/ixgbe_rxtx.c              |  51 +++++-
>   drivers/net/mlx4/mlx4.c                     |   2 +-
>   drivers/net/mlx4/mlx4_flow.c                |  61 +++----
>   drivers/net/mlx4/mlx4_flow.h                |   2 +-
>   drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
>   drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
>   drivers/net/mlx5/mlx5_flow.c                | 193 +++++++++++------------
>   drivers/net/mlx5/mlx5_rxq.c                 |  22 +--
>   drivers/net/mlx5/mlx5_rxtx.h                |  26 +--
>   drivers/net/sfc/sfc_flow.c                  |  21 ++-
>   drivers/net/tap/tap_flow.c                  |   8 +-
>   examples/ipsec-secgw/ipsec.c                |  10 +-
>   lib/librte_ether/rte_flow.c                 |  39 ++---
>   lib/librte_ether/rte_flow.h                 |  12 +-
>   28 files changed, 484 insertions(+), 359 deletions(-)

Generic and net/sfc;
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and action to flow API
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and " Adrien Mazarguil
@ 2018-04-11 13:02  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 13:02 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Zhang, Qi Z, Declan Doherty

On 04/10/2018 07:37 PM, Adrien Mazarguil wrote:
> RTE_FLOW_ACTION_TYPE_PORT_ID brings the ability to inject matching traffic
> into a different device, as identified by its DPDK port ID.
>
> This is normally only supported when the target port ID has some kind of
> relationship with the port ID the flow rule is created against, such as
> being exposed by a common physical device (e.g. a different port of an
> Ethernet switch).
>
> The converse pattern item, RTE_FLOW_ITEM_TYPE_PORT_ID, makes the resulting
> flow rule match traffic whose origin is the specified port ID. Note that
> specifying a port ID that differs from the one the flow rule is created
> against is normally meaningless (if even accepted), but can make sense if
> combined with the transfer attribute.
>
> These must not be confused with their PHY_PORT counterparts, which refer to
> physical ports using device-specific indices, but unlike PORT_ID are not
> necessarily tied to DPDK port IDs.
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Reviewed-by: Qi Zhang <qi.z.zhang@intel.com>
> Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
> Cc: Declan Doherty <declan.doherty@intel.com>
>
> ---
>
> This patch provides the same functionality and supersedes Qi Zhang's
> "ether: add flow action to redirect packet to a port" [1].
>
> The main differences are:
>
> - Action is named PORT_ID instead of PORT.
> - Addition of a PORT_ID pattern item.
> - More extensive documentation.
> - Testpmd support.
> - rte_flow_copy() support.
>
> [1] http://dpdk.org/ml/archives/dev/2018-April/094648.html
> ---
>   app/test-pmd/cmdline_flow.c                 | 57 ++++++++++++++++++++++++
>   app/test-pmd/config.c                       |  2 +
>   doc/guides/prog_guide/rte_flow.rst          | 48 ++++++++++++++++++++
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  9 ++++
>   lib/librte_ether/rte_flow.c                 |  2 +
>   lib/librte_ether/rte_flow.h                 | 56 +++++++++++++++++++++++
>   6 files changed, 174 insertions(+)

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to flow API
  2018-04-10 16:37  3%     ` [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to " Adrien Mazarguil
@ 2018-04-11 13:00  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 13:00 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z

On 04/10/2018 07:37 PM, Adrien Mazarguil wrote:
> This patch adds the missing action counterpart to the PHY_PORT pattern
> item, that is, the ability to directly inject matching traffic into a
> physical port of the underlying device.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
>   app/test-pmd/config.c                       |  1 +
>   doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
>   lib/librte_ether/rte_flow.c                 |  1 +
>   lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
>   6 files changed, 84 insertions(+)

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item in flow API
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item " Adrien Mazarguil
@ 2018-04-11 12:57  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 12:57 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev

On 04/10/2018 07:37 PM, Adrien Mazarguil wrote:
> While RTE_FLOW_ITEM_TYPE_PORT refers to physical ports of the underlying
> device using specific identifiers, these are often confused with DPDK port
> IDs exposed to applications in the global name space.
>
> Since this pattern item is seldom used, rename it RTE_FLOW_ITEM_PHY_PORT
> for better clarity.
>
> No ABI impact.
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 27 +++++++++++----------
>   app/test-pmd/config.c                       |  2 +-
>   doc/guides/prog_guide/rte_flow.rst          | 22 ++++++++---------
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 +-
>   lib/librte_ether/rte_flow.c                 |  2 +-
>   lib/librte_ether/rte_flow.h                 | 31 ++++++++++--------------
>   6 files changed, 41 insertions(+), 45 deletions(-)

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
@ 2018-04-11 12:45  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 12:45 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Ajit Khaparde, Somnath Kotur,
	John Daley, Hyong Youb Kim, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Tomasz Duszynski, Dmitri Epshtein, Natalie Samsonov, Jianbo Liu,
	Pascal Mazon

On 04/10/2018 07:36 PM, Adrien Mazarguil wrote:
> TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
> consistent with the normal stacking order of pattern items, which is
> confusing to applications.
>
> Problem is that when followed by one of these layers, the EtherType field
> of the preceding layer keeps its "inner" definition, and the "outer" TPID
> is provided by the subsequent layer, the reverse of how a packet looks like
> on the wire:
>
>   Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
>   rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]
>
> Worse, when QinQ is involved, the stacking order of VLAN layers is
> unspecified. It is unclear whether it should be reversed (innermost to
> outermost) as well given TPID applies to the previous layer:
>
>   Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
>   rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
>   rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]
>
> While specifying EtherType/TPID is hopefully rarely necessary, the stacking
> order in case of QinQ and the lack of documentation remain an issue.
>
> This patch replaces TPID in the VLAN pattern item with an inner
> EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
> clarifies documentation and updates all relevant code.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
> items:
>
> - bnxt: EtherType matching is supported with and without VLAN, but TPID
>    matching is not and triggers an error.
>
> - e1000: EtherType matching is only supported with the ETHERTYPE filter,
>    which does not support VLAN matching, therefore no impact.
>
> - enic: same as bnxt.
>
> - i40e: same as bnxt with existing FDIR limitations on allowed EtherType
>    values. The remaining filter types (VXLAN, NVGRE, QINQ) do not support
>    EtherType matching.
>
> - ixgbe: same as e1000, with additional minor change to rely on the new
>    E-Tag macro definition.
>
> - mlx4: EtherType/TPID matching is not supported, no impact.
>
> - mlx5: same as bnxt.
>
> - mvpp2: same as bnxt.
>
> - sfc: same as bnxt.
>
> - tap: same as bnxt.
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> Cc: Somnath Kotur <somnath.kotur@broadcom.com>
> Cc: John Daley <johndale@cisco.com>
> Cc: Hyong Youb Kim <hyonkim@cisco.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Tomasz Duszynski <tdu@semihalf.com>
> Cc: Dmitri Epshtein <dima@marvell.com>
> Cc: Natalie Samsonov <nsamsono@marvell.com>
> Cc: Jianbo Liu <jianbo.liu@arm.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
>
> ---
>
> v3 changes:
>
> Updated mrvl to mvpp2.
>
> Moved unrelated default TCI mask update to separate patch.
>
> Fixed sfc according to Andrew's comments [1], which made so much sense that
> I standardized on the same behavior for all other PMDs: matching outer TPID
> is never supported when a VLAN pattern item is present.
>
> This is done because many devices accept several TPIDs but do not provide
> means to match a given one explicitly, it's all or nothing, and that makes
> the resulting flow rule inaccurate.
>
> [1] http://dpdk.org/ml/archives/dev/2018-April/095870.html
> ---
>   app/test-pmd/cmdline_flow.c                 | 17 +++----
>   doc/guides/nics/tap.rst                     |  2 +-
>   doc/guides/prog_guide/rte_flow.rst          | 19 ++++++--
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
>   drivers/net/bnxt/bnxt_filter.c              | 35 +++++++++++---
>   drivers/net/enic/enic_flow.c                | 19 +++++---
>   drivers/net/i40e/i40e_flow.c                | 60 ++++++++++++++++++++----
>   drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
>   drivers/net/mlx5/mlx5_flow.c                | 13 ++++-
>   drivers/net/mvpp2/mrvl_flow.c               | 26 +++++++---
>   drivers/net/sfc/sfc_flow.c                  | 18 +++++++
>   drivers/net/tap/tap_flow.c                  | 14 ++++--
>   lib/librte_ether/rte_flow.h                 | 22 ++++++---
>   lib/librte_net/rte_ether.h                  |  1 +
>   14 files changed, 198 insertions(+), 55 deletions(-)

Generic and net/sfc
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action
  2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
@ 2018-04-11 12:40  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-11 12:40 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh, Pascal Mazon

On 04/10/2018 07:36 PM, Adrien Mazarguil wrote:
> By definition, RSS involves some kind of hash algorithm, usually Toeplitz.
>
> Until now it could not be modified on a flow rule basis and PMDs had to
> always assume RTE_ETH_HASH_FUNCTION_DEFAULT, which remains the default
> behavior when unspecified (0).
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
>
> ---
>
> v3 changes:
>
> - Although RTE_ETH_HASH_FUNCTION_DEFAULT is defined as 0, made comparisons
>    more explicit where doing so would clarify the code.
>
> - Updated sfc to include Toeplitz as the other allowed value.
>
> Both according to Andrew's suggestions [1].
>
> [1] http://dpdk.org/ml/archives/dev/2018-April/095840.html
> ---
>   app/test-pmd/cmdline_flow.c                 | 72 ++++++++++++++++++++++++
>   app/test-pmd/config.c                       |  1 +
>   doc/guides/prog_guide/rte_flow.rst          |  2 +
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  3 +
>   drivers/net/e1000/igb_flow.c                |  4 ++
>   drivers/net/e1000/igb_rxtx.c                |  4 +-
>   drivers/net/i40e/i40e_ethdev.c              |  4 +-
>   drivers/net/i40e/i40e_flow.c                |  4 ++
>   drivers/net/ixgbe/ixgbe_flow.c              |  4 ++
>   drivers/net/ixgbe/ixgbe_rxtx.c              |  4 +-
>   drivers/net/mlx4/mlx4_flow.c                |  7 +++
>   drivers/net/mlx5/mlx5_flow.c                | 13 +++++
>   drivers/net/sfc/sfc_flow.c                  |  8 +++
>   drivers/net/tap/tap_flow.c                  |  6 ++
>   lib/librte_ether/rte_flow.c                 |  1 +
>   lib/librte_ether/rte_flow.h                 |  2 +
>   16 files changed, 136 insertions(+), 3 deletions(-)

Generic and net/sfc
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 2/5] ethdev: introduce new tunnel VXLAN-GPE
  2018-04-11  9:59  5%   ` Adrien Mazarguil
@ 2018-04-11 12:04  0%     ` Xueming(Steven) Li
  0 siblings, 0 replies; 200+ results
From: Xueming(Steven) Li @ 2018-04-11 12:04 UTC (permalink / raw)
  To: Adrien Mazarguil
  Cc: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Nélio Laranjeiro,
	Shahaf Shuler, dev, Olivier Matz

Hi Adrien,

> -----Original Message-----
> From: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Sent: Wednesday, April 11, 2018 5:59 PM
> To: Xueming(Steven) Li <xuemingl@mellanox.com>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>; Jingjing Wu <jingjing.wu@intel.com>;
> Thomas Monjalon <thomas@monjalon.net>; Nélio Laranjeiro
> <nelio.laranjeiro@6wind.com>; Shahaf Shuler <shahafs@mellanox.com>;
> dev@dpdk.org; Olivier Matz <olivier.matz@6wind.com>
> Subject: Re: [dpdk-dev] [PATCH v2 2/5] ethdev: introduce new tunnel VXLAN-
> GPE
> 
> On Tue, Apr 10, 2018 at 09:00:33PM +0800, Xueming Li wrote:
> > VXLAN-GPE enables VXLAN for all protocols. Protocol link:
> > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdat
> > atracker.ietf.org%2Fdoc%2Fdraft-ietf-nvo3-vxlan-gpe%2F&data=02%7C01%7C
> > xuemingl%40mellanox.com%7Ce69c95d98f5f457c966908d59f92e393%7Ca652971c7
> > d2e4d9ba6a4d149256f461b%7C0%7C0%7C636590375601220397&sdata=XZ6kpgEIrbB
> > wHrpODaZiByf6a2NQl4J6MadYYAsuNsc%3D&reserved=0
> >
> > Signed-off-by: Xueming Li <xuemingl@mellanox.com>
> 
> Adding a new rte_flow pattern item in the middle of enum
> rte_flow_item_type breaks ABI compatibility. It's fine for 18.05 because
> prior series already destroyed it, however for this patch you need to
> choose between:
> 
> - Adding the new entry at the end of the enum and modifying the rest of
> the
>   code to follow the same order (preferred approach when not doing a full
>   API overhaul).
> 
> *or*
> 
> - Stating in the commit log what functions are impacted by ABI changes as
> in
>   "ethdev: remove DUP action from flow API" [1].
> 
> Also you must add a new "Item: ``VXLAN_GPE``" section to
> doc/guides/prog_guide/rte_flow.rst (look for "VXLAN" for clues).
> 
> Otherwise patch is mostly fine, just a few comments below.
> 
> [1]
> https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fdpdk.org
> %2Fml%2Farchives%2Fdev%2F2018-
> April%2F096526.html&data=02%7C01%7Cxuemingl%40mellanox.com%7Ce69c95d98f5f4
> 57c966908d59f92e393%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636590375
> 601220397&sdata=8Q%2FMigA8hSHmM25UWUvhTOtVuit%2FxQRFBA6iF6lYxv8%3D&reserve
> d=0
> 

Thanks, I've updated the code according to option 1.
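
I.e. roughly (illustration only, not the actual updated patch):

	enum rte_flow_item_type {
		...
		/* Existing entries unchanged, new one appended last. */
		RTE_FLOW_ITEM_TYPE_VXLAN_GPE,
	};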

> > ---
> >  lib/librte_ether/rte_eth_ctrl.h  |  3 ++-
> >  lib/librte_ether/rte_flow.c      |  1 +
> >  lib/librte_ether/rte_flow.h      | 27 +++++++++++++++++++++++++++
> >  lib/librte_mbuf/rte_mbuf.c       |  3 +++
> >  lib/librte_mbuf/rte_mbuf.h       |  1 +
> >  lib/librte_mbuf/rte_mbuf_ptype.c |  1 +
> > lib/librte_mbuf/rte_mbuf_ptype.h | 13 +++++++++++++
> >  lib/librte_net/rte_ether.h       | 25 +++++++++++++++++++++++++
> >  8 files changed, 73 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/librte_ether/rte_eth_ctrl.h
> > b/lib/librte_ether/rte_eth_ctrl.h index 668f59acb..5ea8ae24c 100644
> > --- a/lib/librte_ether/rte_eth_ctrl.h
> > +++ b/lib/librte_ether/rte_eth_ctrl.h
> > @@ -54,7 +54,8 @@ extern "C" {
> >  #define RTE_ETH_FLOW_VXLAN              19 /**< VXLAN protocol based
> flow */
> >  #define RTE_ETH_FLOW_GENEVE             20 /**< GENEVE protocol based
> flow */
> >  #define RTE_ETH_FLOW_NVGRE              21 /**< NVGRE protocol based
> flow */
> > -#define RTE_ETH_FLOW_MAX                22
> > +#define RTE_ETH_FLOW_VXLAN_GPE          22 /**< VXLAN-GPE protocol
> based flow */
> > +#define RTE_ETH_FLOW_MAX                23
> >
> >  /**
> >   * Feature filter types
> > diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
> > index 3d8116ebd..fb710fff7 100644
> > --- a/lib/librte_ether/rte_flow.c
> > +++ b/lib/librte_ether/rte_flow.c
> > @@ -50,6 +50,7 @@ static const struct rte_flow_desc_data
> rte_flow_desc_item[] = {
> >  	MK_FLOW_ITEM(TCP, sizeof(struct rte_flow_item_tcp)),
> >  	MK_FLOW_ITEM(SCTP, sizeof(struct rte_flow_item_sctp)),
> >  	MK_FLOW_ITEM(VXLAN, sizeof(struct rte_flow_item_vxlan)),
> > +	MK_FLOW_ITEM(VXLAN_GPE, sizeof(struct rte_flow_item_vxlan_gpe)),
> 
> Should be at the end of this array if you choose to not impact ABI.
> 
> >  	MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)),
> >  	MK_FLOW_ITEM(GRE, sizeof(struct rte_flow_item_gre)),
> >  	MK_FLOW_ITEM(E_TAG, sizeof(struct rte_flow_item_e_tag)), diff --git
> > a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h index
> > bed727df8..c7cfc201a 100644
> > --- a/lib/librte_ether/rte_flow.h
> > +++ b/lib/librte_ether/rte_flow.h
> > @@ -256,6 +256,13 @@ enum rte_flow_item_type {
> >  	RTE_FLOW_ITEM_TYPE_VXLAN,
> >
> >  	/**
> > +	 * Matches a VXLAN-GPE header.
> > +	 *
> > +	 * See struct rte_flow_item_vxlan_gpe.
> > +	 */
> > +	RTE_FLOW_ITEM_TYPE_VXLAN_GPE,
> > +
> > +	/**
> 
> Ditto for the enum definition.
> 
> >  	 * Matches a E_TAG header.
> >  	 *
> >  	 * See struct rte_flow_item_e_tag.
> > @@ -676,6 +683,26 @@ static const struct rte_flow_item_vxlan
> > rte_flow_item_vxlan_mask = {  #endif
> >
> >  /**
> > + * RTE_FLOW_ITEM_TYPE_VXLAN_GPE.
> > + *
> > + * Matches a VXLAN-GPE header.
> 
> You should name the current IETF draft pending a proper RFC:
> 
>  Matches a VXLAN-GPE header (draft-ietf-nvo3-vxlan-gpe-05).
> 
> > + */
> > +struct rte_flow_item_vxlan_gpe {
> > +	uint8_t flags; /**< Normally 0x0c (I and P flag). */
> > +	uint8_t rsvd0[2]; /**< Reserved, normally 0x0000. */
> > +	uint8_t protocol; /**< Protocol type. */
> > +	uint8_t vni[3]; /**< VXLAN identifier. */
> > +	uint8_t rsvd1; /**< Reserved, normally 0x00. */ };
> > +
> > +/** Default mask for RTE_FLOW_ITEM_TYPE_VXLAN_GPE. */ #ifndef
> > +__cplusplus static const struct rte_flow_item_vxlan_gpe
> > +rte_flow_item_vxlan_gpe_mask = {
> > +	.vni = "\xff\xff\xff",
> > +};
> > +#endif
> 
> Again if you choose to not impact ABI, this should be moved further down,
> after the last item definition for consistency.
> 
> > +
> > +/**
> >   * RTE_FLOW_ITEM_TYPE_E_TAG.
> >   *
> >   * Matches a E-tag header.
> > diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> > index 091d388d3..dc90379e5 100644
> > --- a/lib/librte_mbuf/rte_mbuf.c
> > +++ b/lib/librte_mbuf/rte_mbuf.c
> > @@ -405,6 +405,7 @@ const char *rte_get_tx_ol_flag_name(uint64_t mask)
> >  	case PKT_TX_TUNNEL_IPIP: return "PKT_TX_TUNNEL_IPIP";
> >  	case PKT_TX_TUNNEL_GENEVE: return "PKT_TX_TUNNEL_GENEVE";
> >  	case PKT_TX_TUNNEL_MPLSINUDP: return "PKT_TX_TUNNEL_MPLSINUDP";
> > +	case PKT_TX_TUNNEL_VXLAN_GPE: return "PKT_TX_TUNNEL_VXLAN_GPE";
> >  	case PKT_TX_MACSEC: return "PKT_TX_MACSEC";
> >  	case PKT_TX_SEC_OFFLOAD: return "PKT_TX_SEC_OFFLOAD";
> >  	default: return NULL;
> > @@ -439,6 +440,8 @@ rte_get_tx_ol_flag_list(uint64_t mask, char *buf,
> size_t buflen)
> >  		  "PKT_TX_TUNNEL_NONE" },
> >  		{ PKT_TX_TUNNEL_MPLSINUDP, PKT_TX_TUNNEL_MASK,
> >  		  "PKT_TX_TUNNEL_NONE" },
> > +		{ PKT_TX_TUNNEL_VXLAN_GPE, PKT_TX_TUNNEL_MASK,
> > +		  "PKT_TX_TUNNEL_NONE" },
> >  		{ PKT_TX_MACSEC, PKT_TX_MACSEC, NULL },
> >  		{ PKT_TX_SEC_OFFLOAD, PKT_TX_SEC_OFFLOAD, NULL },
> >  	};
> > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> > index 62740254d..1839cf2ed 100644
> > --- a/lib/librte_mbuf/rte_mbuf.h
> > +++ b/lib/librte_mbuf/rte_mbuf.h
> > @@ -210,6 +210,7 @@ extern "C" {
> >  #define PKT_TX_TUNNEL_GENEVE  (0x4ULL << 45)  /**< TX packet with
> > MPLS-in-UDP RFC 7510 header. */  #define PKT_TX_TUNNEL_MPLSINUDP
> > (0x5ULL << 45)
> > +#define PKT_TX_TUNNEL_VXLAN_GPE (0x6ULL << 45)
> >  /* add new TX TUNNEL type here */
> >  #define PKT_TX_TUNNEL_MASK    (0xFULL << 45)
> >
> > diff --git a/lib/librte_mbuf/rte_mbuf_ptype.c
> > b/lib/librte_mbuf/rte_mbuf_ptype.c
> > index 1feefacc6..49106c7df 100644
> > --- a/lib/librte_mbuf/rte_mbuf_ptype.c
> > +++ b/lib/librte_mbuf/rte_mbuf_ptype.c
> > @@ -65,6 +65,7 @@ const char *rte_get_ptype_tunnel_name(uint32_t ptype)
> >  	case RTE_PTYPE_TUNNEL_GTPU: return "TUNNEL_GTPU";
> >  	case RTE_PTYPE_TUNNEL_ESP: return "TUNNEL_ESP";
> >  	case RTE_PTYPE_TUNNEL_L2TP: return "TUNNEL_L2TP";
> > +	case RTE_PTYPE_TUNNEL_VXLAN_GPE: return "TUNNEL_VXLAN_GPE";
> >  	default: return "TUNNEL_UNKNOWN";
> >  	}
> >  }
> > diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h
> > b/lib/librte_mbuf/rte_mbuf_ptype.h
> > index b9a338110..7caf83312 100644
> > --- a/lib/librte_mbuf/rte_mbuf_ptype.h
> > +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
> > @@ -423,6 +423,19 @@ extern "C" {
> >   */
> >  #define RTE_PTYPE_TUNNEL_L2TP               0x0000a000
> >  /**
> > + * VXLAN-GPE (VXLAN Generic Protocol Extension) tunneling packet type.
> > + *
> > + * Packet format:
> > + * <'ether type'=0x0800
> > + * | 'version'=4, 'protocol'=17
> > + * | 'destination port'=4790>
> > + * or,
> > + * <'ether type'=0x86DD
> > + * | 'version'=6, 'next header'=17
> > + * | 'destination port'=4790>
> > + */
> > +#define RTE_PTYPE_TUNNEL_VXLAN_GPE          0x0000b000
> > +/**
> >   * Mask of tunneling packet types.
> >   */
> >  #define RTE_PTYPE_TUNNEL_MASK               0x0000f000
> > diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h
> > index a271d1c86..a64814179 100644
> > --- a/lib/librte_net/rte_ether.h
> > +++ b/lib/librte_net/rte_ether.h
> > @@ -311,6 +311,31 @@ struct vxlan_hdr {  /**< VXLAN tunnel header
> > length. */
> >
> >  /**
> > + * VXLAN-GPE protocol header.
> > + * Contains the 8-bit flag, 8-bit next-protocol, 24-bit VXLAN Network
> > + * Identifier and Reserved fields (16 bits and 8 bits).
> 
> Another reference to the current IETF draft here shouldn't hurt.
> 
> > + */
> > +struct vxlan_gpe_hdr {
> > +	uint8_t vx_flags; /**< flag (8). */
> > +	uint8_t reserved[2]; /**< Reserved (16). */
> > +	uint8_t proto; /**< next-protocol (8). */
> > +	uint32_t vx_vni;   /**< VNI (24) + Reserved (8). */
> > +} __attribute__((__packed__));
> > +
> > +/* VXLAN-GPE next protocol types */
> > +#define VXLAN_GPE_TYPE_IPv4 1 /**< IPv4 Protocol. */ #define
> > +VXLAN_GPE_TYPE_IPv6 2 /**< IPv6 Protocol. */ #define
> > +VXLAN_GPE_TYPE_ETH  3 /**< Ethernet Protocol. */ #define
> > +VXLAN_GPE_TYPE_NSH  4 /**< NSH Protocol. */ #define
> > +VXLAN_GPE_TYPE_MPLS 5 /**< MPLS Protocol. */ #define
> > +VXLAN_GPE_TYPE_GBP  6 /**< GBP Protocol. */ #define
> > +VXLAN_GPE_TYPE_VBNG 7 /**< vBNG Protocol. */
> > +
> > +#define ETHER_VXLAN_GPE_HLEN (sizeof(struct udp_hdr) + \
> > +			      sizeof(struct vxlan_gpe_hdr)) /**< VXLAN-GPE
> tunnel header
> > +length. */
> > +
> > +/**
> >   * Extract VLAN tag information into mbuf
> >   *
> >   * Software version of VLAN stripping
> > --
> > 2.13.3
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-11  5:33  0%               ` Yongseok Koh
@ 2018-04-11 11:39  0%                 ` Ananyev, Konstantin
  2018-04-11 17:08  0%                   ` Yongseok Koh
  0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2018-04-11 11:39 UTC (permalink / raw)
  To: Yongseok Koh
  Cc: Olivier Matz, Lu, Wenzhuo, Wu, Jingjing, Adrien Mazarguil,
	Nélio Laranjeiro, dev


Hi Yongseok,

> > >
> > > On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > > > Hi Yongseok,
> > > >
> > > > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > > > Hi,
> > > > > >
> > > > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > > > rte_pktmbuf_attach_at().
> > > > > > >
> > > > > > > Possible use-cases could be:
> > > > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > > > >
> > > > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > > > >
> > > > > > I think the current API is already able to do what you want.
> > > > > >
> > > > > > 1/ Here is a mbuf m with its data
> > > > > >
> > > > > >                off
> > > > > >                <-->
> > > > > >                       len
> > > > > >           +----+   <---------->
> > > > > >           |    |
> > > > > >         +-|----v----------------------+
> > > > > >         | |    -----------------------|
> > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > >         |      -----------------------|
> > > > > >         +-----------------------------+
> > > > > >
> > > > > >
> > > > > > 2/ clone m:
> > > > > >
> > > > > >   c = rte_pktmbuf_alloc(pool);
> > > > > >   rte_pktmbuf_attach(c, m);
> > > > > >
> > > > > >   Note that c has its own offset and length fields.
> > > > > >
> > > > > >
> > > > > >                off
> > > > > >                <-->
> > > > > >                       len
> > > > > >           +----+   <---------->
> > > > > >           |    |
> > > > > >         +-|----v----------------------+
> > > > > >         | |    -----------------------|
> > > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > > >         |      -----------------------|
> > > > > >         +------^----------------------+
> > > > > >                |
> > > > > >           +----+
> > > > > > indirect  |
> > > > > >         +-|---------------------------+
> > > > > >         | |    -----------------------|
> > > > > > c       | buf  |                     ||
> > > > > >         |      -----------------------|
> > > > > >         +-----------------------------+
> > > > > >
> > > > > >                 off    len
> > > > > >                 <--><---------->
> > > > > >
> > > > > >
> > > > > > 3/ remove some data from c without changing m
> > > > > >
> > > > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > > > >
> > > > > >
> > > > > > Please let me know if it fits your needs.
> > > > >
> > > > > No, it doesn't.
> > > > >
> > > > > Trimming head and tail with the current APIs removes data and make the space
> > > > > available. Adjusting packet head means giving more headroom, not shifting the
> > > > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > > > difference offsets in m,
> > > > >
> > > > > rte_pktmbuf_adj(c1, 10);
> > > > > rte_pktmbuf_adj(c2, 20);
> > > > >
> > > > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > > > wants to attach outer header, it will overwrite the headroom even though the
> > > > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > > > header should be linked by h1->next = c2.
> > > >
> > > > Yes, after these operations c1, c2 and m should become read-only. So, to
> > > > prepend headers, another mbuf has to be inserted before as you suggest. It
> > > > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > > > length) that will:
> > > >   - alloc and attach indirect mbuf for each segment of m that is
> > > >     in the range [offset : length+offset].
> > > >   - prepend an empty and writable mbuf for the headers
> > > >
> > > > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > > > which actually shrink the headroom, this case can be properly handled.
> > > >
> > > > What do you mean by properly handled?
> > > >
> > > > Yes, prepending data or adding data in the indirect mbuf won't override
> > > > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > > > won't be protected.
> > > >
> > > > From an application point of view, indirect mbufs, or direct mbufs that
> > > > have refcnt != 1, should be both considered as read-only because they
> > > > may share their data. How an application can know if the data is shared
> > > > or not?
> > > >
> > > > Maybe we need a flag to differentiate mbufs that are read-only
> > > > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > > > understanding is correct, you want to have indirect mbufs with RW data.
> > >
> > > Agree that indirect mbuf must be treated as read-only, Then the current code is
> > > enough to handle that use-case.
> > >
> > > > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > > > indirect referencing.
> >
> > But just to make HW to RX multiple packets into one mbuf,
> > data_off inside indirect mbuf should be enough, correct?
> Right. Current max buffer len of mbuf is 64kB (16bits) but it is enough for mlx5
> to reach to 100Gbps with 64B traffic (149Mpps). I made mlx5 HW put 16 packets in
> a buffer. So, it needs ~32kB buffer. Having more bits in length fields would be
> better but 16-bit is good enough to overcome the PCIe Gen3 bottleneck in order
> to saturate the network link.

There were a few complaints that the 64KB max is a limitation for some
use-cases. I am not against increasing it, but I don't think we have free
space on the first cache line for that without another big rework of the
mbuf layout, considering that we would need to increase the size of
buf_len, data_off, data_len, and probably priv_size too.

> 
> > As I understand, what you'd like to achieve with this new field -
> > ability to manipulate packet boundaries after RX, probably at upper layer.
> > As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
> > indirect mbufs trying to modify same direct buffer.
> 
> I agree that there's an implication that indirect mbuf or mbuf having refcnt > 1
> is read-only. What that means, all the entities which own such mbufs have to be
> aware of that and keep the principle as DPDK can't enforce the rule and there
> can't be such sanity check. In this sense, HW doesn't violate it because the
> direct mbuf is injected to HW before indirection. When packets are written by
> HW, PMD attaches indirect mbufs to the direct mbuf and deliver those to
> application layer with freeing the original direct mbuf (decrement refcnt by 1).
> So, HW doesn't touch the direct buffer once it reaches to upper layer.

Yes, I understand that. But as far as I can see, you introduced functions to
adjust the head and tail, which implies that some entity (the upper layer?)
is expected to manipulate these indirect mbufs, and we don't know exactly
how that will be done.

> The direct buffer will be freed and get available for reuse when all the attached
> indirect mbufs are freed.
> 
> > Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
> > Fields for indirect mbufs, straight after attach()?
> 
> Good point.
> Actually that was my draft (Mellanox internal) version of this patch :-) But I
> had to consider a case where priv_size is really given by user. Even though it
> is less likely, but if original priv_size is quite big, it can't cover entire
> buf_len. For this, I had to increase priv_size to 32-bit but adding another
> 16bit field (buf_off) looked more plausible.

As I remember, we can't have mbufs bigger than 64K,
so priv_size + buf_len should always be less than 64K, correct?
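
For illustration, shifting the indirect mbuf's view of the direct buffer
right after attaching could look roughly like this (untested sketch; the
exact fields to touch are debatable, and "off" is the desired offset into
the direct buffer):

	rte_pktmbuf_attach(c, m);
	c->buf_addr = (char *)c->buf_addr + off;
	c->buf_iova += off;	/* keep VA and IOVA views consistent */
	c->buf_len -= off;	/* shrink the visible buffer accordingly */
	c->data_off = 0;	/* data now starts at the new buf_addr */
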
Konstantin  

> 
> Thanks for good comments,
> Yongseok
> 
> > > > >
> > > > > Does this make sense?
> > > >
> > > > I understand the need.
> > > >
> > > > Another option would be to make the mbuf->buffer point to an external
> > > > buffer (not inside the direct mbuf). This would require to add a
> > > > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > > > a quick overview.
> > > >
> > > > [1]
> > >
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdpdksummit.com%2FArchive%2Fpdf%2F2016Userspace%2FDay01
> > > -Session05-OlivierMatz-
> > >
> Userspace2016.pdf&data=02%7C01%7Cyskoh%40mellanox.com%7Ca5405edb36e445e6540808d59e339a38%7Ca652971c7d2e4d9ba6a4d
> > > 149256f461b%7C0%7C0%7C636588866861082855&sdata=llw%2BwiY5cC56naOUhBbIg8TKtfFN6VZcIRY5PV7VqZs%3D&reserved=0
> > > >
> > > > The advantage is that it does not require the large data to be inside a
> > > > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > > > allocated from a mempool). On the other hand, it is maybe more complex
> > > > to implement compared to your solution.
> > >
> > > I knew that you presented the slides and frankly, I had considered that option
> > > at first. But even with that option, metadata to store refcnt should also be
> > > allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> > > end of the data segment. Even though it could have smaller metadata structure,
> > > I just wanted to make full use of the existing framework because it is less
> > > complex as you mentioned. Given that you presented the idea of external data
> > > buffer in 2016 and there hasn't been many follow-up discussions/activities so
> > > far, I thought the demand isn't so big yet thus I wanted to make this patch
> > > simpler.  I personally think that we can take the idea of external data seg when
> > > more demands come from users in the future as it would be a huge change and may
> > > break current ABI/API. When the day comes, I'll gladly participate in the
> > > discussions and write codes for it if I can be helpful.
> > >
> > > Do you think this patch is okay for now?
> > >
> > >
> > > Thanks for your comments,
> > > Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 4/5] app/testpmd: introduce new tunnel VXLAN-GPE
  @ 2018-04-11  9:59  3%   ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-11  9:59 UTC (permalink / raw)
  To: Xueming Li
  Cc: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Nelio Laranjeiro,
	Shahaf Shuler, dev, Olivier Matz

On Tue, Apr 10, 2018 at 09:00:35PM +0800, Xueming Li wrote:
> Add VXLAN-GPE support to csum forwarding engine and rte flow.
> 
> Signed-off-by: Xueming Li <xuemingl@mellanox.com>

Depending on whether you chose to impact ABI compatibility in the second
patch of the series, you may need to reorder all VXLAN_GPE definitions in
this patch to match that of the rte_flow API.

A few more comments below.

> ---
>  app/test-pmd/cmdline_flow.c           | 24 ++++++++++
>  app/test-pmd/config.c                 |  2 +
>  app/test-pmd/csumonly.c               | 83 +++++++++++++++++++++++++++++++++--
>  app/test-pmd/parameters.c             | 12 ++++-
>  app/test-pmd/testpmd.h                |  2 +
>  doc/guides/testpmd_app_ug/run_app.rst |  5 +++
>  6 files changed, 124 insertions(+), 4 deletions(-)
> 
> diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
> index f85c1c57f..f5abd589d 100644
> --- a/app/test-pmd/cmdline_flow.c
> +++ b/app/test-pmd/cmdline_flow.c
> @@ -137,6 +137,8 @@ enum index {
>  	ITEM_SCTP_CKSUM,
>  	ITEM_VXLAN,
>  	ITEM_VXLAN_VNI,
> +	ITEM_VXLAN_GPE,
> +	ITEM_VXLAN_GPE_VNI,
>  	ITEM_E_TAG,
>  	ITEM_E_TAG_GRP_ECID_B,
>  	ITEM_NVGRE,
> @@ -461,6 +463,7 @@ static const enum index next_item[] = {
>  	ITEM_TCP,
>  	ITEM_SCTP,
>  	ITEM_VXLAN,
> +	ITEM_VXLAN_GPE,
>  	ITEM_E_TAG,
>  	ITEM_NVGRE,
>  	ITEM_MPLS,
> @@ -589,6 +592,12 @@ static const enum index item_vxlan[] = {
>  	ZERO,
>  };
>  
> +static const enum index item_vxlan_gpe[] = {
> +	ITEM_VXLAN_GPE_VNI,
> +	ITEM_NEXT,
> +	ZERO,
> +};
> +
>  static const enum index item_e_tag[] = {
>  	ITEM_E_TAG_GRP_ECID_B,
>  	ITEM_NEXT,
> @@ -1441,6 +1450,21 @@ static const struct token token_list[] = {
>  		.next = NEXT(item_vxlan, NEXT_ENTRY(UNSIGNED), item_param),
>  		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vxlan, vni)),
>  	},
> +	[ITEM_VXLAN_GPE] = {
> +		.name = "vxlan-gpe",
> +		.help = "match VXLAN-GPE header",
> +		.priv = PRIV_ITEM(VXLAN_GPE,
> +				  sizeof(struct rte_flow_item_vxlan_gpe)),
> +		.next = NEXT(item_vxlan_gpe),
> +		.call = parse_vc,
> +	},
> +	[ITEM_VXLAN_GPE_VNI] = {
> +		.name = "vni",
> +		.help = "VXLAN-GPE identifier",
> +		.next = NEXT(item_vxlan_gpe, NEXT_ENTRY(UNSIGNED), item_param),
> +		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vxlan_gpe,
> +					     vni)),
> +	},
>  	[ITEM_E_TAG] = {
>  		.name = "e_tag",
>  		.help = "match E-Tag header",
> diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
> index 4a273eff7..1a9bc37ed 100644
> --- a/app/test-pmd/config.c
> +++ b/app/test-pmd/config.c
> @@ -972,6 +972,7 @@ static const struct {
>  	MK_FLOW_ITEM(TCP, sizeof(struct rte_flow_item_tcp)),
>  	MK_FLOW_ITEM(SCTP, sizeof(struct rte_flow_item_sctp)),
>  	MK_FLOW_ITEM(VXLAN, sizeof(struct rte_flow_item_vxlan)),
> +	MK_FLOW_ITEM(VXLAN_GPE, sizeof(struct rte_flow_item_vxlan_gpe)),
>  	MK_FLOW_ITEM(E_TAG, sizeof(struct rte_flow_item_e_tag)),
>  	MK_FLOW_ITEM(NVGRE, sizeof(struct rte_flow_item_nvgre)),
>  	MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)),

My first comment applies to all the above hunks.

> @@ -3080,6 +3081,7 @@ flowtype_to_str(uint16_t flow_type)
>  		{"l2_payload", RTE_ETH_FLOW_L2_PAYLOAD},
>  		{"port", RTE_ETH_FLOW_PORT},
>  		{"vxlan", RTE_ETH_FLOW_VXLAN},
> +		{"vxlan-gpe", RTE_ETH_FLOW_VXLAN_GPE},
>  		{"geneve", RTE_ETH_FLOW_GENEVE},
>  		{"nvgre", RTE_ETH_FLOW_NVGRE},
>  	};
> diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
> index 5f5ab64aa..213888374 100644
> --- a/app/test-pmd/csumonly.c
> +++ b/app/test-pmd/csumonly.c
> @@ -60,6 +60,8 @@
>  #define _htons(x) (x)
>  #endif
>  
> +uint16_t vxlan_gpe_udp_port = 4790;
> +
>  /* structure that caches offload info for the current packet */
>  struct testpmd_offload_info {
>  	uint16_t ethertype;
> @@ -194,6 +196,70 @@ parse_vxlan(struct udp_hdr *udp_hdr,
>  	info->l2_len += ETHER_VXLAN_HLEN; /* add udp + vxlan */
>  }
>  
> +/* Parse a vxlan-gpe header */
> +static void
> +parse_vxlan_gpe(struct udp_hdr *udp_hdr,
> +	    struct testpmd_offload_info *info)
> +{
> +	struct ether_hdr *eth_hdr;
> +	struct ipv4_hdr *ipv4_hdr;
> +	struct ipv6_hdr *ipv6_hdr;
> +	struct vxlan_gpe_hdr *vxlan_gpe_hdr;
> +	uint8_t vxlan_gpe_len = sizeof(*vxlan_gpe_hdr);
> +
> +	/* check udp destination port, 4790 is the default vxlan-gpe port */
> +	if (udp_hdr->dst_port != _htons(vxlan_gpe_udp_port))
> +		return;
> +
> +	vxlan_gpe_hdr = (struct vxlan_gpe_hdr *)((char *)udp_hdr +
> +				sizeof(struct udp_hdr));
> +
> +	if (!vxlan_gpe_hdr->proto || vxlan_gpe_hdr->proto ==
> +	    VXLAN_GPE_TYPE_IPv4) {
> +		info->is_tunnel = 1;
> +		info->outer_ethertype = info->ethertype;
> +		info->outer_l2_len = info->l2_len;
> +		info->outer_l3_len = info->l3_len;
> +		info->outer_l4_proto = info->l4_proto;
> +
> +		ipv4_hdr = (struct ipv4_hdr *)((char *)vxlan_gpe_hdr +
> +			   vxlan_gpe_len);
> +
> +		parse_ipv4(ipv4_hdr, info);
> +		info->ethertype = _htons(ETHER_TYPE_IPv4);
> +		info->l2_len = 0;
> +
> +	} else if (vxlan_gpe_hdr->proto == VXLAN_GPE_TYPE_IPv6) {
> +		info->is_tunnel = 1;
> +		info->outer_ethertype = info->ethertype;
> +		info->outer_l2_len = info->l2_len;
> +		info->outer_l3_len = info->l3_len;
> +		info->outer_l4_proto = info->l4_proto;
> +
> +		ipv6_hdr = (struct ipv6_hdr *)((char *)vxlan_gpe_hdr +
> +			   vxlan_gpe_len);
> +
> +		info->ethertype = _htons(ETHER_TYPE_IPv6);
> +		parse_ipv6(ipv6_hdr, info);
> +		info->l2_len = 0;
> +
> +	} else if (vxlan_gpe_hdr->proto == VXLAN_GPE_TYPE_ETH) {
> +		info->is_tunnel = 1;
> +		info->outer_ethertype = info->ethertype;
> +		info->outer_l2_len = info->l2_len;
> +		info->outer_l3_len = info->l3_len;
> +		info->outer_l4_proto = info->l4_proto;
> +
> +		eth_hdr = (struct ether_hdr *)((char *)vxlan_gpe_hdr +
> +			  vxlan_gpe_len);
> +
> +		parse_ethernet(eth_hdr, info);
> +	} else
> +		return;
> +
> +	info->l2_len += ETHER_VXLAN_GPE_HLEN;
> +}
> +
>  /* Parse a gre header */
>  static void
>  parse_gre(struct simple_gre_hdr *gre_hdr, struct testpmd_offload_info *info)
> @@ -588,6 +654,10 @@ pkt_copy_split(const struct rte_mbuf *pkt)
>   *   Ether / (vlan) / IP|IP6 / UDP|TCP|SCTP .
>   *   Ether / (vlan) / outer IP|IP6 / outer UDP / VxLAN / Ether / IP|IP6 /
>   *           UDP|TCP|SCTP
> + *   Ether / (vlan) / outer IP|IP6 / outer UDP / VXLAN-GPE / Ether / IP|IP6 /
> + *           UDP|TCP|SCTP
> + *   Ether / (vlan) / outer IP|IP6 / outer UDP / VXLAN-GPE / IP|IP6 /
> + *           UDP|TCP|SCTP
>   *   Ether / (vlan) / outer IP|IP6 / GRE / Ether / IP|IP6 / UDP|TCP|SCTP
>   *   Ether / (vlan) / outer IP|IP6 / GRE / IP|IP6 / UDP|TCP|SCTP
>   *   Ether / (vlan) / outer IP|IP6 / IP|IP6 / UDP|TCP|SCTP
> @@ -691,9 +761,16 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
>  
>  				udp_hdr = (struct udp_hdr *)((char *)l3_hdr +
>  					info.l3_len);
> -				parse_vxlan(udp_hdr, &info, m->packet_type);
> -				if (info.is_tunnel)
> -					tx_ol_flags |= PKT_TX_TUNNEL_VXLAN;
> +				parse_vxlan_gpe(udp_hdr, &info);
> +				if (info.is_tunnel) {
> +					tx_ol_flags |= PKT_TX_TUNNEL_VXLAN_GPE;
> +				} else {
> +					parse_vxlan(udp_hdr, &info,
> +						    m->packet_type);
> +					if (info.is_tunnel)
> +						tx_ol_flags |=
> +							PKT_TX_TUNNEL_VXLAN;
> +				}
>  			} else if (info.l4_proto == IPPROTO_GRE) {
>  				struct simple_gre_hdr *gre_hdr;
>  
> diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
> index 2192bdcdf..68063b7a4 100644
> --- a/app/test-pmd/parameters.c
> +++ b/app/test-pmd/parameters.c
> @@ -70,7 +70,7 @@ usage(char* progname)
>  	       "--rss-ip | --rss-udp | "
>  	       "--rxpt= | --rxht= | --rxwt= | --rxfreet= | "
>  	       "--txpt= | --txht= | --txwt= | --txfreet= | "
> -	       "--txrst= | --tx-offloads ]\n",
> +	       "--txrst= | --tx-offloads= | --vxlan-gpe-port= ]\n",
>  	       progname);
>  #ifdef RTE_LIBRTE_CMDLINE
>  	printf("  --interactive: run in interactive mode.\n");
> @@ -186,6 +186,7 @@ usage(char* progname)
>  	printf("  --flow-isolate-all: "
>  	       "requests flow API isolated mode on all ports at initialization time.\n");
>  	printf("  --tx-offloads=0xXXXXXXXX: hexadecimal bitmask of TX queue offloads\n");
> +	printf("  --vxlan-gpe-port=N: UDP port of tunnel VXLAN-GPE\n");
>  }
>  
>  #ifdef RTE_LIBRTE_CMDLINE
> @@ -621,6 +622,7 @@ launch_args_parse(int argc, char** argv)
>  		{ "print-event",		1, 0, 0 },
>  		{ "mask-event",			1, 0, 0 },
>  		{ "tx-offloads",		1, 0, 0 },
> +		{ "vxlan-gpe-port",		1, 0, 0 },
>  		{ 0, 0, 0, 0 },
>  	};
>  
> @@ -1091,6 +1093,14 @@ launch_args_parse(int argc, char** argv)
>  					rte_exit(EXIT_FAILURE,
>  						 "tx-offloads must be >= 0\n");
>  			}
> +			if (!strcmp(lgopts[opt_idx].name, "vxlan-gpe-port")) {
> +				n = atoi(optarg);
> +				if (n >= 0)
> +					vxlan_gpe_udp_port = (uint16_t)n;
> +				else
> +					rte_exit(EXIT_FAILURE,
> +						 "vxlan-gpe-port must be >= 0\n");
> +			}
>  			if (!strcmp(lgopts[opt_idx].name, "print-event"))
>  				if (parse_event_printing_config(optarg, 1)) {
>  					rte_exit(EXIT_FAILURE,
> diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
> index 593ae2160..b270602d9 100644
> --- a/app/test-pmd/testpmd.h
> +++ b/app/test-pmd/testpmd.h
> @@ -446,6 +446,8 @@ extern uint32_t retry_enabled;
>  extern struct fwd_lcore  **fwd_lcores;
>  extern struct fwd_stream **fwd_streams;
>  
> +extern uint16_t vxlan_gpe_udp_port; /**< UDP port of tunnel VXLAN-GPE. */
> +
>  extern portid_t nb_peer_eth_addrs; /**< Number of peer ethernet addresses. */
>  extern struct ether_addr peer_eth_addrs[RTE_MAX_ETHPORTS];
>  
> diff --git a/doc/guides/testpmd_app_ug/run_app.rst b/doc/guides/testpmd_app_ug/run_app.rst
> index 1fd53958a..2e8690f41 100644
> --- a/doc/guides/testpmd_app_ug/run_app.rst
> +++ b/doc/guides/testpmd_app_ug/run_app.rst
> @@ -479,3 +479,8 @@ The commandline options are:
>  
>      Set the hexadecimal bitmask of TX queue offloads.
>      The default value is 0.
> +
> +*   ``--vxlan-gpe-port=N``
> +
> +    Set the UDP port number of tunnel VXLAN-GPE to N.
> +    The default value is 4790.

You need to update the "Pattern items" section of the flow command
documentation as well.

> -- 
> 2.13.3
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v2 2/5] ethdev: introduce new tunnel VXLAN-GPE
  @ 2018-04-11  9:59  5%   ` Adrien Mazarguil
  2018-04-11 12:04  0%     ` Xueming(Steven) Li
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-11  9:59 UTC (permalink / raw)
  To: Xueming Li
  Cc: Wenzhuo Lu, Jingjing Wu, Thomas Monjalon, Nelio Laranjeiro,
	Shahaf Shuler, dev, Olivier Matz

On Tue, Apr 10, 2018 at 09:00:33PM +0800, Xueming Li wrote:
> VXLAN-GPE enables VXLAN for all protocols. Protocol link:
> https://datatracker.ietf.org/doc/draft-ietf-nvo3-vxlan-gpe/
> 
> Signed-off-by: Xueming Li <xuemingl@mellanox.com>

Adding a new rte_flow pattern item in the middle of enum rte_flow_item_type
breaks ABI compatibility. It's fine for 18.05 because prior series already
destroyed it, however for this patch you need to choose between:

- Adding the new entry at the end of the enum and modifying the rest of the
  code to follow the same order (preferred approach when not doing a full
  API overhaul).

*or*

- Stating in the commit log what functions are impacted by ABI changes as in
  "ethdev: remove DUP action from flow API" [1].

Also you must add a new "Item: ``VXLAN_GPE``" section to
doc/guides/prog_guide/rte_flow.rst (look for "VXLAN" for clues).

Otherwise patch is mostly fine, just a few comments below.

[1] http://dpdk.org/ml/archives/dev/2018-April/096526.html
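
For the first option, the change boils down to appending the new entry so
that all existing values keep their positions, along the lines of (rough
sketch, not the actual diff):

 enum rte_flow_item_type {
 	RTE_FLOW_ITEM_TYPE_END = 0,
 	RTE_FLOW_ITEM_TYPE_VOID,
 	/* ... all existing entries keep their current values ... */
 	RTE_FLOW_ITEM_TYPE_VXLAN_GPE, /* new entry appended last */
 };

The same ordering then applies to rte_flow_desc_item[] and to the placement
of the default mask.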

> ---
>  lib/librte_ether/rte_eth_ctrl.h  |  3 ++-
>  lib/librte_ether/rte_flow.c      |  1 +
>  lib/librte_ether/rte_flow.h      | 27 +++++++++++++++++++++++++++
>  lib/librte_mbuf/rte_mbuf.c       |  3 +++
>  lib/librte_mbuf/rte_mbuf.h       |  1 +
>  lib/librte_mbuf/rte_mbuf_ptype.c |  1 +
>  lib/librte_mbuf/rte_mbuf_ptype.h | 13 +++++++++++++
>  lib/librte_net/rte_ether.h       | 25 +++++++++++++++++++++++++
>  8 files changed, 73 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_ether/rte_eth_ctrl.h b/lib/librte_ether/rte_eth_ctrl.h
> index 668f59acb..5ea8ae24c 100644
> --- a/lib/librte_ether/rte_eth_ctrl.h
> +++ b/lib/librte_ether/rte_eth_ctrl.h
> @@ -54,7 +54,8 @@ extern "C" {
>  #define RTE_ETH_FLOW_VXLAN              19 /**< VXLAN protocol based flow */
>  #define RTE_ETH_FLOW_GENEVE             20 /**< GENEVE protocol based flow */
>  #define RTE_ETH_FLOW_NVGRE              21 /**< NVGRE protocol based flow */
> -#define RTE_ETH_FLOW_MAX                22
> +#define RTE_ETH_FLOW_VXLAN_GPE          22 /**< VXLAN-GPE protocol based flow */
> +#define RTE_ETH_FLOW_MAX                23
>  
>  /**
>   * Feature filter types
> diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
> index 3d8116ebd..fb710fff7 100644
> --- a/lib/librte_ether/rte_flow.c
> +++ b/lib/librte_ether/rte_flow.c
> @@ -50,6 +50,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
>  	MK_FLOW_ITEM(TCP, sizeof(struct rte_flow_item_tcp)),
>  	MK_FLOW_ITEM(SCTP, sizeof(struct rte_flow_item_sctp)),
>  	MK_FLOW_ITEM(VXLAN, sizeof(struct rte_flow_item_vxlan)),
> +	MK_FLOW_ITEM(VXLAN_GPE, sizeof(struct rte_flow_item_vxlan_gpe)),

Should be at the end of this array if you choose to not impact ABI.

>  	MK_FLOW_ITEM(MPLS, sizeof(struct rte_flow_item_mpls)),
>  	MK_FLOW_ITEM(GRE, sizeof(struct rte_flow_item_gre)),
>  	MK_FLOW_ITEM(E_TAG, sizeof(struct rte_flow_item_e_tag)),
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index bed727df8..c7cfc201a 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -256,6 +256,13 @@ enum rte_flow_item_type {
>  	RTE_FLOW_ITEM_TYPE_VXLAN,
>  
>  	/**
> +	 * Matches a VXLAN-GPE header.
> +	 *
> +	 * See struct rte_flow_item_vxlan_gpe.
> +	 */
> +	RTE_FLOW_ITEM_TYPE_VXLAN_GPE,
> +
> +	/**

Ditto for the enum definition.

>  	 * Matches a E_TAG header.
>  	 *
>  	 * See struct rte_flow_item_e_tag.
> @@ -676,6 +683,26 @@ static const struct rte_flow_item_vxlan rte_flow_item_vxlan_mask = {
>  #endif
>  
>  /**
> + * RTE_FLOW_ITEM_TYPE_VXLAN_GPE.
> + *
> + * Matches a VXLAN-GPE header.

You should name the current IETF draft pending a proper RFC:

 Matches a VXLAN-GPE header (draft-ietf-nvo3-vxlan-gpe-05).

> + */
> +struct rte_flow_item_vxlan_gpe {
> +	uint8_t flags; /**< Normally 0x0c (I and P flag). */
> +	uint8_t rsvd0[2]; /**< Reserved, normally 0x0000. */
> +	uint8_t protocol; /**< Protocol type. */
> +	uint8_t vni[3]; /**< VXLAN identifier. */
> +	uint8_t rsvd1; /**< Reserved, normally 0x00. */
> +};
> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_VXLAN_GPE. */
> +#ifndef __cplusplus
> +static const struct rte_flow_item_vxlan_gpe rte_flow_item_vxlan_gpe_mask = {
> +	.vni = "\xff\xff\xff",
> +};
> +#endif

Again if you choose to not impact ABI, this should be moved further down,
after the last item definition for consistency.

> +
> +/**
>   * RTE_FLOW_ITEM_TYPE_E_TAG.
>   *
>   * Matches a E-tag header.
> diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> index 091d388d3..dc90379e5 100644
> --- a/lib/librte_mbuf/rte_mbuf.c
> +++ b/lib/librte_mbuf/rte_mbuf.c
> @@ -405,6 +405,7 @@ const char *rte_get_tx_ol_flag_name(uint64_t mask)
>  	case PKT_TX_TUNNEL_IPIP: return "PKT_TX_TUNNEL_IPIP";
>  	case PKT_TX_TUNNEL_GENEVE: return "PKT_TX_TUNNEL_GENEVE";
>  	case PKT_TX_TUNNEL_MPLSINUDP: return "PKT_TX_TUNNEL_MPLSINUDP";
> +	case PKT_TX_TUNNEL_VXLAN_GPE: return "PKT_TX_TUNNEL_VXLAN_GPE";
>  	case PKT_TX_MACSEC: return "PKT_TX_MACSEC";
>  	case PKT_TX_SEC_OFFLOAD: return "PKT_TX_SEC_OFFLOAD";
>  	default: return NULL;
> @@ -439,6 +440,8 @@ rte_get_tx_ol_flag_list(uint64_t mask, char *buf, size_t buflen)
>  		  "PKT_TX_TUNNEL_NONE" },
>  		{ PKT_TX_TUNNEL_MPLSINUDP, PKT_TX_TUNNEL_MASK,
>  		  "PKT_TX_TUNNEL_NONE" },
> +		{ PKT_TX_TUNNEL_VXLAN_GPE, PKT_TX_TUNNEL_MASK,
> +		  "PKT_TX_TUNNEL_NONE" },
>  		{ PKT_TX_MACSEC, PKT_TX_MACSEC, NULL },
>  		{ PKT_TX_SEC_OFFLOAD, PKT_TX_SEC_OFFLOAD, NULL },
>  	};
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 62740254d..1839cf2ed 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -210,6 +210,7 @@ extern "C" {
>  #define PKT_TX_TUNNEL_GENEVE  (0x4ULL << 45)
>  /**< TX packet with MPLS-in-UDP RFC 7510 header. */
>  #define PKT_TX_TUNNEL_MPLSINUDP (0x5ULL << 45)
> +#define PKT_TX_TUNNEL_VXLAN_GPE (0x6ULL << 45)
>  /* add new TX TUNNEL type here */
>  #define PKT_TX_TUNNEL_MASK    (0xFULL << 45)
>  
> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.c b/lib/librte_mbuf/rte_mbuf_ptype.c
> index 1feefacc6..49106c7df 100644
> --- a/lib/librte_mbuf/rte_mbuf_ptype.c
> +++ b/lib/librte_mbuf/rte_mbuf_ptype.c
> @@ -65,6 +65,7 @@ const char *rte_get_ptype_tunnel_name(uint32_t ptype)
>  	case RTE_PTYPE_TUNNEL_GTPU: return "TUNNEL_GTPU";
>  	case RTE_PTYPE_TUNNEL_ESP: return "TUNNEL_ESP";
>  	case RTE_PTYPE_TUNNEL_L2TP: return "TUNNEL_L2TP";
> +	case RTE_PTYPE_TUNNEL_VXLAN_GPE: return "TUNNEL_VXLAN_GPE";
>  	default: return "TUNNEL_UNKNOWN";
>  	}
>  }
> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h b/lib/librte_mbuf/rte_mbuf_ptype.h
> index b9a338110..7caf83312 100644
> --- a/lib/librte_mbuf/rte_mbuf_ptype.h
> +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
> @@ -423,6 +423,19 @@ extern "C" {
>   */
>  #define RTE_PTYPE_TUNNEL_L2TP               0x0000a000
>  /**
> + * VXLAN-GPE (VXLAN Generic Protocol Extension) tunneling packet type.
> + *
> + * Packet format:
> + * <'ether type'=0x0800
> + * | 'version'=4, 'protocol'=17
> + * | 'destination port'=4790>
> + * or,
> + * <'ether type'=0x86DD
> + * | 'version'=6, 'next header'=17
> + * | 'destination port'=4790>
> + */
> +#define RTE_PTYPE_TUNNEL_VXLAN_GPE          0x0000b000
> +/**
>   * Mask of tunneling packet types.
>   */
>  #define RTE_PTYPE_TUNNEL_MASK               0x0000f000
> diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h
> index a271d1c86..a64814179 100644
> --- a/lib/librte_net/rte_ether.h
> +++ b/lib/librte_net/rte_ether.h
> @@ -311,6 +311,31 @@ struct vxlan_hdr {
>  /**< VXLAN tunnel header length. */
>  
>  /**
> + * VXLAN-GPE protocol header.
> + * Contains the 8-bit flag, 8-bit next-protocol, 24-bit VXLAN Network
> + * Identifier and Reserved fields (16 bits and 8 bits).

Another reference to the current IETF draft here shouldn't hurt.

> + */
> +struct vxlan_gpe_hdr {
> +	uint8_t vx_flags; /**< flag (8). */
> +	uint8_t reserved[2]; /**< Reserved (16). */
> +	uint8_t proto; /**< next-protocol (8). */
> +	uint32_t vx_vni;   /**< VNI (24) + Reserved (8). */
> +} __attribute__((__packed__));
> +
> +/* VXLAN-GPE next protocol types */
> +#define VXLAN_GPE_TYPE_IPv4 1 /**< IPv4 Protocol. */
> +#define VXLAN_GPE_TYPE_IPv6 2 /**< IPv6 Protocol. */
> +#define VXLAN_GPE_TYPE_ETH  3 /**< Ethernet Protocol. */
> +#define VXLAN_GPE_TYPE_NSH  4 /**< NSH Protocol. */
> +#define VXLAN_GPE_TYPE_MPLS 5 /**< MPLS Protocol. */
> +#define VXLAN_GPE_TYPE_GBP  6 /**< GBP Protocol. */
> +#define VXLAN_GPE_TYPE_VBNG 7 /**< vBNG Protocol. */
> +
> +#define ETHER_VXLAN_GPE_HLEN (sizeof(struct udp_hdr) + \
> +			      sizeof(struct vxlan_gpe_hdr))
> +/**< VXLAN-GPE tunnel header length. */
> +
> +/**
>   * Extract VLAN tag information into mbuf
>   *
>   * Software version of VLAN stripping
> -- 
> 2.13.3

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 5%]

* Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
  2018-04-11  2:48  4%                       ` Jerin Jacob
@ 2018-04-11  8:40  0%                         ` Ananyev, Konstantin
  0 siblings, 0 replies; 200+ results
From: Ananyev, Konstantin @ 2018-04-11  8:40 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: Olivier Matz, dev, Richardson, Bruce

Hi Jerin,

> -----Original Message-----
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Wednesday, April 11, 2018 3:49 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> 
> -----Original Message-----
> > Date: Wed, 11 Apr 2018 00:33:14 +0000
> > From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > CC: Olivier Matz <olivier.matz@6wind.com>, "dev@dpdk.org" <dev@dpdk.org>,
> >  "Richardson, Bruce" <bruce.richardson@intel.com>
> > Subject: RE: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> >  structure
> >
> 
> Hi Konstantin,
> 
> >
> > > -----Original Message-----
> > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > Sent: Friday, April 6, 2018 2:26 AM
> > > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > Cc: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > >
> > > -----Original Message-----
> > >
> > > Hi Konstantin,
> > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > > > Sent: Thursday, April 5, 2018 9:02 AM
> > > > > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > > Cc: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > > > >
> > > > > -----Original Message-----
> > > > > > Date: Wed, 4 Apr 2018 23:38:41 +0000
> > > > > > From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> > > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>, Olivier Matz
> > > > > >  <olivier.matz@6wind.com>
> > > > > > CC: "dev@dpdk.org" <dev@dpdk.org>, "Richardson, Bruce"
> > > > > >  <bruce.richardson@intel.com>
> > > > > > Subject: RE: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > >  structure
> > > > > >
> > > > > > Hi lads,
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > > > > > Sent: Tuesday, April 3, 2018 5:43 PM
> > > > > > > To: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > Cc: dev@dpdk.org; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> > > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > > > > > >
> > > > > > > -----Original Message-----
> > > > > > > > Date: Tue, 3 Apr 2018 17:56:01 +0200
> > > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > > > > > > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > > > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > > >  structure
> > > > > > > > User-Agent: NeoMutt/20170113 (1.7.2)
> > > > > > > >
> > > > > > > > On Tue, Apr 03, 2018 at 09:07:04PM +0530, Jerin Jacob wrote:
> > > > > > > > > -----Original Message-----
> > > > > > > > > > Date: Tue, 3 Apr 2018 17:25:17 +0200
> > > > > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > > > > > > > > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > > > > > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > > > > >  structure
> > > > > > > > > > User-Agent: NeoMutt/20170113 (1.7.2)
> > > > > > > > > >
> > > > > > > > > > On Tue, Apr 03, 2018 at 08:37:23PM +0530, Jerin Jacob wrote:
> > > > > > > > > > > -----Original Message-----
> > > > > > > > > > > > Date: Tue, 3 Apr 2018 15:26:44 +0200
> > > > > > > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > > > > > > To: dev@dpdk.org
> > > > > > > > > > > > Subject: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > > > > > > >  structure
> > > > > > > > > > > > X-Mailer: git-send-email 2.11.0
> > > > > > > > > > > >
> > > > > > > > > > > > The initial objective of
> > > > > > > > > > > > commit d9f0d3a1ffd4 ("ring: remove split cacheline build setting")
> > > > > > > > > > > > was to add an empty cache line betwee, the producer and consumer
> > > > > > > > > > > > data (on platform with cache line size = 64B), preventing from
> > > > > > > > > > > > having them on adjacent cache lines.
> > > > > > > > > > > >
> > > > > > > > > > > > Following discussion on the mailing list, it appears that this
> > > > > > > > > > > > also imposes an alignment constraint that is not required.
> > > > > > > > > > > >
> > > > > > > > > > > > This patch removes the extra alignment constraint and adds the
> > > > > > > > > > > > empty cache lines using padding fields in the structure. The
> > > > > > > > > > > > size of rte_ring structure and the offset of the fields remain
> > > > > > > > > > > > the same on platforms with cache line size = 64B:
> > > > > > > > > > > >
> > > > > > > > > > > >   rte_ring = 384
> > > > > > > > > > > >   rte_ring.name = 0
> > > > > > > > > > > >   rte_ring.flags = 32
> > > > > > > > > > > >   rte_ring.memzone = 40
> > > > > > > > > > > >   rte_ring.size = 48
> > > > > > > > > > > >   rte_ring.mask = 52
> > > > > > > > > > > >   rte_ring.prod = 128
> > > > > > > > > > > >   rte_ring.cons = 256
> > > > > > > > > > > >
> > > > > > > > > > > > But it has an impact on platform where cache line size is 128B:
> > > > > > > > > > > >
> > > > > > > > > > > >   rte_ring = 384        -> 768
> > > > > > > > > > > >   rte_ring.name = 0
> > > > > > > > > > > >   rte_ring.flags = 32
> > > > > > > > > > > >   rte_ring.memzone = 40
> > > > > > > > > > > >   rte_ring.size = 48
> > > > > > > > > > > >   rte_ring.mask = 52
> > > > > > > > > > > >   rte_ring.prod = 128   -> 256
> > > > > > > > > > > >   rte_ring.cons = 256   -> 512
> > > > > > > > > > >
> > > > > > > > > > > Are we leaving TWO cacheline to make sure, HW prefetch don't load
> > > > > > > > > > > the adjust cacheline(consumer)?
> > > > > > > > > > >
> > > > > > > > > > > If so, Will it have impact on those machine where it is 128B Cache line
> > > > > > > > > > > and the HW prefetcher is not loading the next caching explicitly. Right?
> > > > > > > > > >
> > > > > > > > > > The impact on machines that have a 128B cache line is that an unused
> > > > > > > > > > cache line will be added between the producer and consumer data. I
> > > > > > > > > > expect that the impact is positive in case there is a hw prefetcher, and
> > > > > > > > > > null in case there is no such prefetcher.
> > > > > > > > >
> > > > > > > > > It is not NULL, Right? You are loosing 256B for each ring.
> > > > > > > >
> > > > > > > > Is it really that important?
> > > > > > >
> > > > > > > Pipeline or eventdev SW cases there could more rings in the system.
> > > > > > > I don't see any downside of having config option which is enabled
> > > > > > > default.
> > > > > > >
> > > > > > > In my view, such config options are good, as in embedded usecases, customers
> > > > > > > can really fine tune the target for the need. In server usecases, let the default
> > > > > > > of option be enabled, no harm.
> > > > > >
> > > > > > But that would mean we have to maintain two layouts for the rte_ring structure.
> > > > >
> > > > > Is there any downside of having two configurable layout? meaning, we are not
> > > > > transferring rte_ring structure over network etc(ie no interoperability
> > > > > issue). Does it really matter? May I am missing something here.
> > > >
> > > > My concern about potential compatibility problems we are introducing -
> > > > library build with 'y', while app wit 'n', or visa-versa.
> > >
> > > Got it.
> > >
> > > > I wonder are there really a lot of users who would be interested in such savings?
> > > > Could it happen that this new option would sit here unused and untested?
> > >
> > > OK. Fair enough. I have no objections for Olivier patch.
> > >
> > > As a suggestion, may be we can move "char name[RTE_MEMZONE_NAMESIZE]" in the
> > > struct rte_ring in place of " empty cacheline" to save 32B. No strong option
> > > though.
> >
> > That sounds like a good idea to me...
> > But I suppose in that case we need to move to that empty cacheline all fields that precede prod?
> 
> Even though those fields are read-only in the fast path, I suppose moving all
> the fields (used in the fast path) after prod would prefetch the _cons_ cache
> line in the cross-CPU case.

Ah yes, you're right, missed that.
Konstantin

> 
> I think, following comment can be addressed in code as it is an ABI change.
>         /*
>          * Note: this field kept the RTE_MEMZONE_NAMESIZE size due to
>          * ABI
>          * compatibility requirements, it could be changed to
>          * RTE_RING_NAMESIZE
>          * next time the ABI changes
>          */
>         char name[RTE_MEMZONE_NAMESIZE] __rte_cache_aligned; /**< Name of the ring. */
> 
> 
> > Otherwise there will be not much advantage in such move.
> >
> >

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-11  0:25  0%             ` Ananyev, Konstantin
@ 2018-04-11  5:33  0%               ` Yongseok Koh
  2018-04-11 11:39  0%                 ` Ananyev, Konstantin
  0 siblings, 1 reply; 200+ results
From: Yongseok Koh @ 2018-04-11  5:33 UTC (permalink / raw)
  To: Ananyev, Konstantin
  Cc: Olivier Matz, Lu, Wenzhuo, Wu, Jingjing, Adrien Mazarguil,
	Nélio Laranjeiro, dev

On Tue, Apr 10, 2018 at 05:25:31PM -0700, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Yongseok Koh
> > Sent: Tuesday, April 10, 2018 2:59 AM
> > To: Olivier Matz <olivier.matz@6wind.com>
> > Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; adrien.mazarguil@6wind.com;
> > nelio.laranjeiro@6wind.com; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
> > 
> > On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > > Hi Yongseok,
> > >
> > > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > > Hi,
> > > > >
> > > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > > rte_pktmbuf_attach_at().
> > > > > >
> > > > > > Possible use-cases could be:
> > > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > > >
> > > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > > >
> > > > > I think the current API is already able to do what you want.
> > > > >
> > > > > 1/ Here is a mbuf m with its data
> > > > >
> > > > >                off
> > > > >                <-->
> > > > >                       len
> > > > >           +----+   <---------->
> > > > >           |    |
> > > > >         +-|----v----------------------+
> > > > >         | |    -----------------------|
> > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > >         |      -----------------------|
> > > > >         +-----------------------------+
> > > > >
> > > > >
> > > > > 2/ clone m:
> > > > >
> > > > >   c = rte_pktmbuf_alloc(pool);
> > > > >   rte_pktmbuf_attach(c, m);
> > > > >
> > > > >   Note that c has its own offset and length fields.
> > > > >
> > > > >
> > > > >                off
> > > > >                <-->
> > > > >                       len
> > > > >           +----+   <---------->
> > > > >           |    |
> > > > >         +-|----v----------------------+
> > > > >         | |    -----------------------|
> > > > > m       | buf  |    XXXXXXXXXXX      ||
> > > > >         |      -----------------------|
> > > > >         +------^----------------------+
> > > > >                |
> > > > >           +----+
> > > > > indirect  |
> > > > >         +-|---------------------------+
> > > > >         | |    -----------------------|
> > > > > c       | buf  |                     ||
> > > > >         |      -----------------------|
> > > > >         +-----------------------------+
> > > > >
> > > > >                 off    len
> > > > >                 <--><---------->
> > > > >
> > > > >
> > > > > 3/ remove some data from c without changing m
> > > > >
> > > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > > >
> > > > >
> > > > > Please let me know if it fits your needs.
> > > >
> > > > No, it doesn't.
> > > >
> > > > Trimming head and tail with the current APIs removes data and make the space
> > > > available. Adjusting packet head means giving more headroom, not shifting the
> > > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > > difference offsets in m,
> > > >
> > > > rte_pktmbuf_adj(c1, 10);
> > > > rte_pktmbuf_adj(c2, 20);
> > > >
> > > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > > wants to attach outer header, it will overwrite the headroom even though the
> > > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > > header should be linked by h1->next = c2.
> > >
> > > Yes, after these operations c1, c2 and m should become read-only. So, to
> > > prepend headers, another mbuf has to be inserted before as you suggest. It
> > > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > > length) that will:
> > >   - alloc and attach indirect mbuf for each segment of m that is
> > >     in the range [offset : length+offset].
> > >   - prepend an empty and writable mbuf for the headers
> > >
> > > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > > which actually shrink the headroom, this case can be properly handled.
> > >
> > > What do you mean by properly handled?
> > >
> > > Yes, prepending data or adding data in the indirect mbuf won't override
> > > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > > won't be protected.
> > >
> > > From an application point of view, indirect mbufs, or direct mbufs that
> > > have refcnt != 1, should be both considered as read-only because they
> > > may share their data. How an application can know if the data is shared
> > > or not?
> > >
> > > Maybe we need a flag to differentiate mbufs that are read-only
> > > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > > understanding is correct, you want to have indirect mbufs with RW data.
> > 
> > Agree that indirect mbuf must be treated as read-only, Then the current code is
> > enough to handle that use-case.
> > 
> > > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > > indirect referencing.
> 
> But just to make HW to RX multiple packets into one mbuf,
> data_off inside indirect mbuf should be enough, correct?
Right. The current max buffer length of an mbuf is 64kB (16 bits), but it is
enough for mlx5 to reach 100Gbps with 64B traffic (149Mpps). I made the mlx5 HW
put 16 packets in a buffer, so it needs a ~32kB buffer. Having more bits in the
length fields would be better, but 16 bits is good enough to overcome the PCIe
Gen3 bottleneck and saturate the network link.
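
Just to illustrate that point (hypothetical helper, not code from this
series), carving the i-th packet out of such a multi-packet buffer can be
done with the existing fields only:

/* requires <rte_mbuf.h> */
static struct rte_mbuf *
clone_pkt_at(struct rte_mempool *mp, struct rte_mbuf *m,
	     uint16_t off, uint16_t len)
{
	struct rte_mbuf *mi = rte_pktmbuf_alloc(mp);

	if (mi == NULL)
		return NULL;
	rte_pktmbuf_attach(mi, m); /* share m's buffer, refcnt(m) += 1 */
	mi->data_off = off;        /* byte offset from m->buf_addr */
	mi->data_len = len;
	mi->pkt_len = len;
	return mi;
}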

> As I understand, what you'd like to achieve with this new field -
> ability to manipulate packet boundaries after RX, probably at upper layer.
> As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
> indirect mbufs trying to modify same direct buffer.

I agree that there's an implication that an indirect mbuf, or an mbuf having
refcnt > 1, is read-only. What that means is that all the entities which own
such mbufs have to be aware of it and keep to the principle, as DPDK can't
enforce the rule and there can't be such a sanity check. In this sense, HW
doesn't violate it because the direct mbuf is injected to HW before
indirection. When packets are written by HW, the PMD attaches indirect mbufs
to the direct mbuf and delivers those to the application layer, freeing the
original direct mbuf (decrementing refcnt by 1). So, HW doesn't touch the
direct buffer once it reaches the upper layer. The direct buffer will be freed
and become available for reuse when all the attached indirect mbufs are freed.

> Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
> Fields for indirect mbufs, straight after attach()?

Good point.
Actually that was my draft (Mellanox internal) version of this patch :-) But I
had to consider a case where priv_size is really given by the user. Even though
it is less likely, if the original priv_size is quite big, it can't cover the
entire buf_len. For this, I had to increase priv_size to 32-bit, but adding
another 16-bit field (buf_off) looked more plausible.
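
For reference, that draft/post-attach approach would have looked roughly like
this (sketch only, not from any posted patch):

static void
attach_at(struct rte_mbuf *mi, struct rte_mbuf *m, uint16_t off)
{
	rte_pktmbuf_attach(mi, m);
	mi->buf_addr = (char *)mi->buf_addr + off;
	mi->buf_iova += off;
	mi->buf_len -= off;
	/* off plus the pool's private area size must still fit in the
	 * 16-bit priv_size field for rte_mbuf_from_indirect() to work. */
	mi->priv_size += off;
}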

Thanks for good comments,
Yongseok

> > > >
> > > > Does this make sense?
> > >
> > > I understand the need.
> > >
> > > Another option would be to make the mbuf->buffer point to an external
> > > buffer (not inside the direct mbuf). This would require to add a
> > > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > > a quick overview.
> > >
> > > [1]
> > > https://dpdksummit.com/Archive/pdf/2016Userspace/Day01-Session05-OlivierMatz-Userspace2016.pdf
> > >
> > > The advantage is that it does not require the large data to be inside a
> > > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > > allocated from a mempool). On the other hand, it is maybe more complex
> > > to implement compared to your solution.
> > 
> > I knew that you presented the slides and frankly, I had considered that option
> > at first. But even with that option, metadata to store refcnt should also be
> > allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> > end of the data segment. Even though it could have smaller metadata structure,
> > I just wanted to make full use of the existing framework because it is less
> > complex as you mentioned. Given that you presented the idea of external data
> > buffer in 2016 and there hasn't been many follow-up discussions/activities so
> > far, I thought the demand isn't so big yet thus I wanted to make this patch
> > simpler.  I personally think that we can take the idea of external data seg when
> > more demands come from users in the future as it would be a huge change and may
> > break current ABI/API. When the day comes, I'll gladly participate in the
> > discussions and write codes for it if I can be helpful.
> > 
> > Do you think this patch is okay for now?
> > 
> > 
> > Thanks for your comments,
> > Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
  @ 2018-04-11  2:48  4%                       ` Jerin Jacob
  2018-04-11  8:40  0%                         ` Ananyev, Konstantin
  0 siblings, 1 reply; 200+ results
From: Jerin Jacob @ 2018-04-11  2:48 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: Olivier Matz, dev, Richardson, Bruce

-----Original Message-----
> Date: Wed, 11 Apr 2018 00:33:14 +0000
> From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> CC: Olivier Matz <olivier.matz@6wind.com>, "dev@dpdk.org" <dev@dpdk.org>,
>  "Richardson, Bruce" <bruce.richardson@intel.com>
> Subject: RE: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
>  structure
> 

Hi Konstantin,

> 
> > -----Original Message-----
> > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > Sent: Friday, April 6, 2018 2:26 AM
> > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Cc: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > 
> > -----Original Message-----
> > 
> > Hi Konstantin,
> > 
> > >
> > > > -----Original Message-----
> > > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > > Sent: Thursday, April 5, 2018 9:02 AM
> > > > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > Cc: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > > >
> > > > -----Original Message-----
> > > > > Date: Wed, 4 Apr 2018 23:38:41 +0000
> > > > > From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>, Olivier Matz
> > > > >  <olivier.matz@6wind.com>
> > > > > CC: "dev@dpdk.org" <dev@dpdk.org>, "Richardson, Bruce"
> > > > >  <bruce.richardson@intel.com>
> > > > > Subject: RE: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > >  structure
> > > > >
> > > > > Hi lads,
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > > > > Sent: Tuesday, April 3, 2018 5:43 PM
> > > > > > To: Olivier Matz <olivier.matz@6wind.com>
> > > > > > Cc: dev@dpdk.org; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> > > > > >
> > > > > > -----Original Message-----
> > > > > > > Date: Tue, 3 Apr 2018 17:56:01 +0200
> > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > > > > > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > >  structure
> > > > > > > User-Agent: NeoMutt/20170113 (1.7.2)
> > > > > > >
> > > > > > > On Tue, Apr 03, 2018 at 09:07:04PM +0530, Jerin Jacob wrote:
> > > > > > > > -----Original Message-----
> > > > > > > > > Date: Tue, 3 Apr 2018 17:25:17 +0200
> > > > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > > > > > > > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > > > > > > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > > > >  structure
> > > > > > > > > User-Agent: NeoMutt/20170113 (1.7.2)
> > > > > > > > >
> > > > > > > > > On Tue, Apr 03, 2018 at 08:37:23PM +0530, Jerin Jacob wrote:
> > > > > > > > > > -----Original Message-----
> > > > > > > > > > > Date: Tue, 3 Apr 2018 15:26:44 +0200
> > > > > > > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > > > > > > To: dev@dpdk.org
> > > > > > > > > > > Subject: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > > > > > > >  structure
> > > > > > > > > > > X-Mailer: git-send-email 2.11.0
> > > > > > > > > > >
> > > > > > > > > > > The initial objective of
> > > > > > > > > > > commit d9f0d3a1ffd4 ("ring: remove split cacheline build setting")
> > > > > > > > > > > was to add an empty cache line betwee, the producer and consumer
> > > > > > > > > > > data (on platform with cache line size = 64B), preventing from
> > > > > > > > > > > having them on adjacent cache lines.
> > > > > > > > > > >
> > > > > > > > > > > Following discussion on the mailing list, it appears that this
> > > > > > > > > > > also imposes an alignment constraint that is not required.
> > > > > > > > > > >
> > > > > > > > > > > This patch removes the extra alignment constraint and adds the
> > > > > > > > > > > empty cache lines using padding fields in the structure. The
> > > > > > > > > > > size of rte_ring structure and the offset of the fields remain
> > > > > > > > > > > the same on platforms with cache line size = 64B:
> > > > > > > > > > >
> > > > > > > > > > >   rte_ring = 384
> > > > > > > > > > >   rte_ring.name = 0
> > > > > > > > > > >   rte_ring.flags = 32
> > > > > > > > > > >   rte_ring.memzone = 40
> > > > > > > > > > >   rte_ring.size = 48
> > > > > > > > > > >   rte_ring.mask = 52
> > > > > > > > > > >   rte_ring.prod = 128
> > > > > > > > > > >   rte_ring.cons = 256
> > > > > > > > > > >
> > > > > > > > > > > But it has an impact on platform where cache line size is 128B:
> > > > > > > > > > >
> > > > > > > > > > >   rte_ring = 384        -> 768
> > > > > > > > > > >   rte_ring.name = 0
> > > > > > > > > > >   rte_ring.flags = 32
> > > > > > > > > > >   rte_ring.memzone = 40
> > > > > > > > > > >   rte_ring.size = 48
> > > > > > > > > > >   rte_ring.mask = 52
> > > > > > > > > > >   rte_ring.prod = 128   -> 256
> > > > > > > > > > >   rte_ring.cons = 256   -> 512
> > > > > > > > > >
> > > > > > > > > > Are we leaving TWO cacheline to make sure, HW prefetch don't load
> > > > > > > > > > the adjust cacheline(consumer)?
> > > > > > > > > >
> > > > > > > > > > If so, Will it have impact on those machine where it is 128B Cache line
> > > > > > > > > > and the HW prefetcher is not loading the next caching explicitly. Right?
> > > > > > > > >
> > > > > > > > > The impact on machines that have a 128B cache line is that an unused
> > > > > > > > > cache line will be added between the producer and consumer data. I
> > > > > > > > > expect that the impact is positive in case there is a hw prefetcher, and
> > > > > > > > > null in case there is no such prefetcher.
> > > > > > > >
> > > > > > > > It is not NULL, Right? You are loosing 256B for each ring.
> > > > > > >
> > > > > > > Is it really that important?
> > > > > >
> > > > > > Pipeline or eventdev SW cases there could more rings in the system.
> > > > > > I don't see any downside of having config option which is enabled
> > > > > > default.
> > > > > >
> > > > > > In my view, such config options are good, as in embedded usecases, customers
> > > > > > can really fine tune the target for the need. In server usecases, let the default
> > > > > > of option be enabled, no harm.
> > > > >
> > > > > But that would mean we have to maintain two layouts for the rte_ring structure.
> > > >
> > > > Is there any downside of having two configurable layout? meaning, we are not
> > > > transferring rte_ring structure over network etc(ie no interoperability
> > > > issue). Does it really matter? May I am missing something here.
> > >
> > > My concern about potential compatibility problems we are introducing -
> > > library build with 'y', while app wit 'n', or visa-versa.
> > 
> > Got it.
> > 
> > > I wonder are there really a lot of users who would be interested in such savings?
> > > Could it happen that this new option would sit here unused and untested?
> > 
> > OK. Fair enough. I have no objections for Olivier patch.
> > 
> > As a suggestion, may be we can move "char name[RTE_MEMZONE_NAMESIZE]" in the
> > struct rte_ring in place of " empty cacheline" to save 32B. No strong option
> > though.
> 
> That sounds like a good idea to me...
> But I suppose in that case we need to move to that empty cacheline all fields that precede prod?

Even though those fields are read-only in the fast path, I suppose moving all
the fields (used in the fast path) after prod would prefetch the _cons_ cache
line in the cross-CPU case.

I think, following comment can be addressed in code as it is an ABI change.
        /*
         * Note: this field kept the RTE_MEMZONE_NAMESIZE size due to
         * ABI
         * compatibility requirements, it could be changed to
         * RTE_RING_NAMESIZE
         * next time the ABI changes
         */
        char name[RTE_MEMZONE_NAMESIZE] __rte_cache_aligned; /**< Name of the ring. */
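
For reference, the padded layout under discussion is roughly the following
(sketch only, assuming 64B cache lines; matches the offsets quoted earlier in
the thread):

struct rte_ring {
	char name[RTE_MEMZONE_NAMESIZE] __rte_cache_aligned;
	int flags;
	const struct rte_memzone *memzone;
	uint32_t size;
	uint32_t mask;
	uint32_t capacity;

	char pad0 __rte_cache_aligned; /* empty cache line */
	struct rte_ring_headtail prod __rte_cache_aligned;
	char pad1 __rte_cache_aligned; /* empty cache line */
	struct rte_ring_headtail cons __rte_cache_aligned;
	char pad2 __rte_cache_aligned; /* empty cache line */
};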


> Otherwise there will be not much advantage in such move.
> 
> 

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  2018-04-10  1:59  3%           ` Yongseok Koh
@ 2018-04-11  0:25  0%             ` Ananyev, Konstantin
  2018-04-11  5:33  0%               ` Yongseok Koh
  0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2018-04-11  0:25 UTC (permalink / raw)
  To: Yongseok Koh, Olivier Matz
  Cc: Lu, Wenzhuo, Wu, Jingjing, adrien.mazarguil, nelio.laranjeiro, dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Yongseok Koh
> Sent: Tuesday, April 10, 2018 2:59 AM
> To: Olivier Matz <olivier.matz@6wind.com>
> Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; adrien.mazarguil@6wind.com;
> nelio.laranjeiro@6wind.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
> 
> On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> > Hi Yongseok,
> >
> > On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > > Hi,
> > > >
> > > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > > rte_pktmbuf_attach_at().
> > > > >
> > > > > Possible use-cases could be:
> > > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > > >   buffers can reference different layers of the encapsulated packet.
> > > > > - A large direct mbuf can even contain multiple packets in series and
> > > > >   each packet can be referenced by multiple mbuf indirections.
> > > > >
> > > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > >
> > > > I think the current API is already able to do what you want.
> > > >
> > > > 1/ Here is a mbuf m with its data
> > > >
> > > >                off
> > > >                <-->
> > > >                       len
> > > >           +----+   <---------->
> > > >           |    |
> > > >         +-|----v----------------------+
> > > >         | |    -----------------------|
> > > > m       | buf  |    XXXXXXXXXXX      ||
> > > >         |      -----------------------|
> > > >         +-----------------------------+
> > > >
> > > >
> > > > 2/ clone m:
> > > >
> > > >   c = rte_pktmbuf_alloc(pool);
> > > >   rte_pktmbuf_attach(c, m);
> > > >
> > > >   Note that c has its own offset and length fields.
> > > >
> > > >
> > > >                off
> > > >                <-->
> > > >                       len
> > > >           +----+   <---------->
> > > >           |    |
> > > >         +-|----v----------------------+
> > > >         | |    -----------------------|
> > > > m       | buf  |    XXXXXXXXXXX      ||
> > > >         |      -----------------------|
> > > >         +------^----------------------+
> > > >                |
> > > >           +----+
> > > > indirect  |
> > > >         +-|---------------------------+
> > > >         | |    -----------------------|
> > > > c       | buf  |                     ||
> > > >         |      -----------------------|
> > > >         +-----------------------------+
> > > >
> > > >                 off    len
> > > >                 <--><---------->
> > > >
> > > >
> > > > 3/ remove some data from c without changing m
> > > >
> > > >    rte_pktmbuf_adj(c, 10)   // at head
> > > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > >
> > > >
> > > > Please let me know if it fits your needs.
> > >
> > > No, it doesn't.
> > >
> > > Trimming head and tail with the current APIs removes data and make the space
> > > available. Adjusting packet head means giving more headroom, not shifting the
> > > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > > difference offsets in m,
> > >
> > > rte_pktmbuf_adj(c1, 10);
> > > rte_pktmbuf_adj(c2, 20);
> > >
> > > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > > wants to attach outer header, it will overwrite the headroom even though the
> > > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > > header should be linked by h1->next = c2.
> >
> > Yes, after these operations c1, c2 and m should become read-only. So, to
> > prepend headers, another mbuf has to be inserted before as you suggest. It
> > is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> > length) that will:
> >   - alloc and attach indirect mbuf for each segment of m that is
> >     in the range [offset : length+offset].
> >   - prepend an empty and writable mbuf for the headers
> >
> > > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > > which actually shrink the headroom, this case can be properly handled.
> >
> > What do you mean by properly handled?
> >
> > Yes, prepending data or adding data in the indirect mbuf won't override
> > the direct mbuf. But prepending data or adding data in the direct mbuf m
> > won't be protected.
> >
> > From an application point of view, indirect mbufs, or direct mbufs that
> > have refcnt != 1, should be both considered as read-only because they
> > may share their data. How an application can know if the data is shared
> > or not?
> >
> > Maybe we need a flag to differentiate mbufs that are read-only
> > (something like SHARED_DATA, or simply READONLY). In your case, if my
> > understanding is correct, you want to have indirect mbufs with RW data.
> 
> Agree that indirect mbuf must be treated as read-only, Then the current code is
> enough to handle that use-case.
> 
> > > And another use-case (this is my actual use-case) is to make a large mbuf have
> > > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > > because it transfers multiple packets to a single large buffer to reduce PCIe
> > > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > > indirect referencing.

But just to make HW to RX multiple packets into one mbuf,
data_off inside indirect mbuf should be enough, correct?
As I understand, what you'd like to achieve with this new field -
ability to manipulate packet boundaries after RX, probably at upper layer.
As Olivier pointed above, that doesn't sound as safe approach - as you have multiple
indirect mbufs trying to modify same direct buffer.
Though if you really need to do that, why it can be achieved by updating buf_len and priv_size
Fields for indirect mbufs, straight after attach()?
Konstantin

> > >
> > > Does this make sense?
> >
> > I understand the need.
> >
> > Another option would be to make the mbuf->buffer point to an external
> > buffer (not inside the direct mbuf). This would require to add a
> > mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> > a quick overview.
> >
> > [1]
> > https://dpdksummit.com/Archive/pdf/2016Userspace/Day01-Session05-OlivierMatz-Userspace2016.pdf
> >
> > The advantage is that it does not require the large data to be inside a
> > mbuf (requiring a mbuf structure before the buffer, and requiring to be
> > allocated from a mempool). On the other hand, it is maybe more complex
> > to implement compared to your solution.
> 
> I knew that you presented the slides and frankly, I had considered that option
> at first. But even with that option, metadata to store refcnt should also be
> allocated and managed anyway. Kernel also maintains the skb_shared_info at the
> end of the data segment. Even though it could have smaller metadata structure,
> I just wanted to make full use of the existing framework because it is less
> complex as you mentioned. Given that you presented the idea of external data
> buffer in 2016 and there hasn't been many follow-up discussions/activities so
> far, I thought the demand isn't so big yet thus I wanted to make this patch
> simpler.  I personally think that we can take the idea of external data seg when
> more demands come from users in the future as it would be a huge change and may
> break current ABI/API. When the day comes, I'll gladly participate in the
> discussions and write codes for it if I can be helpful.
> 
> Do you think this patch is okay for now?
> 
> 
> Thanks for your comments,
> Yongseok

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v6 0/4] ethdev: add per-PMD tuning of RxTx parmeters
  2018-04-10  9:43  4%     ` [dpdk-dev] [PATCH v6 " Remy Horton
  2018-04-10  9:43  7%       ` [dpdk-dev] [PATCH v6 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
@ 2018-04-10 18:56  0%       ` Ferruh Yigit
  1 sibling, 0 replies; 200+ results
From: Ferruh Yigit @ 2018-04-10 18:56 UTC (permalink / raw)
  To: Remy Horton, dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

On 4/10/2018 10:43 AM, Remy Horton wrote:
> The optimal values of several transmission & reception related parameters,
> such as burst sizes, descriptor ring sizes, and number of queues, varies
> between different network interface devices. This patchset allows individual
> PMDs to specify their preferred parameter values, and if so indicated by an
> application, for them to be used automatically by the ethdev layer.
> 
> rte_eth_dev_configure() has been changed so that specifying zero for both
> nb_rx_q AND nb_tx_q causes it to use driver preferred values, and if these
> are not available, falls back to EAL defaults. Setting one (but not both)
> to zero does not cause the use of defaults, as having one of them zeroed is
> a valid setup.
> 
> This patchset includes per-PMD values for e1000 and i40e but it is expected
> that subsequent patchsets will cover other PMDs. A deprecation notice
> covering the API/ABI change is in place.
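
A minimal usage sketch of the configure semantics described above, assuming a
zeroed struct rte_eth_conf is acceptable for the device in question:

#include <string.h>
#include <rte_ethdev.h>

/* Let the ethdev layer pick the PMD-preferred (or EAL default) queue
 * counts by passing zero for both nb_rx_q and nb_tx_q. */
static int
configure_with_pmd_defaults(uint16_t port_id)
{
	struct rte_eth_conf conf;

	memset(&conf, 0, sizeof(conf));
	return rte_eth_dev_configure(port_id, 0, 0, &conf);
}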
> 
> Changes in v6:
> * Updated/corrected testpmd documentation
> * Carried forward acks/review
> * Rebased to d218a4d060de
> 
> Changes in v5:
> * uint_16_t corrected to uint16_t
> 
> Changes in v4:
> * Added API/ABI change documentation
> * Rebased to 78f5a2e93d74
> 
> Changes in v3:
> * Changed formatting around new rte_eth_dev_info fields
> * Added Doxygen documentation to struct rte_eth_dev_portconf
> * Testpmd "port config all burst 0" and --burst=0 use PMD
>   Rx burst recommendations.
> * Added to release notes
> * Rebased to 8ea081f38161
> 
> Changes in v2:
> * Rebased to master
> * Removed fallback values from rte_eth_dev_info_get()
> * Added fallback values to rte_rte_[rt]x_queue_setup()
> * Added fallback values to rte_eth_dev_configure()
> * Corrected comment
> * Removed deprecation notice
> * Split Rx and Tx into separate structures
> * Changed parameter names
> 
> 
> Remy Horton (4):
>   ethdev: add support for PMD-tuned Tx/Rx parameters
>   net/e1000: add TxRx tuning parameters
>   net/i40e: add TxRx tuning parameters
>   testpmd: make use of per-PMD TxRx parameters

Series applied to dpdk-next-net/master, thanks.

(Thomas' ack added into ethdev patch)

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and action to flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (11 preceding siblings ...)
  2018-04-10 16:37  3%     ` [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to " Adrien Mazarguil
@ 2018-04-10 16:37  2%     ` Adrien Mazarguil
  2018-04-11 13:02  0%       ` Andrew Rybchenko
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:37 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z, Declan Doherty

RTE_FLOW_ACTION_TYPE_PORT_ID brings the ability to inject matching traffic
into a different device, as identified by its DPDK port ID.

This is normally only supported when the target port ID has some kind of
relationship with the port ID the flow rule is created against, such as
being exposed by a common physical device (e.g. a different port of an
Ethernet switch).

The converse pattern item, RTE_FLOW_ITEM_TYPE_PORT_ID, makes the resulting
flow rule match traffic whose origin is the specified port ID. Note that
specifying a port ID that differs from the one the flow rule is created
against is normally meaningless (if even accepted), but can make sense if
combined with the transfer attribute.

These must not be confused with their PHY_PORT counterparts, which refer to
physical ports using device-specific indices, but unlike PORT_ID are not
necessarily tied to DPDK port IDs.
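
A hedged illustration of how an application might combine the new item and
action (assuming port_id and peer_id belong to the same underlying switch
domain; error handling is left to the caller):

#include <rte_flow.h>

/* Redirect traffic entering port_id to DPDK port peer_id using the new
 * PORT_ID item and action, applied at the device level (transfer). */
static struct rte_flow *
redirect_to_peer(uint16_t port_id, uint32_t peer_id,
		 struct rte_flow_error *error)
{
	struct rte_flow_attr attr = { .ingress = 1, .transfer = 1 };
	struct rte_flow_item_port_id src = { .id = port_id };
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_PORT_ID, .spec = &src },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action_port_id dst = { .id = peer_id };
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &dst },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	return rte_flow_create(port_id, &attr, pattern, actions, error);
}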

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Qi Zhang <qi.z.zhang@intel.com>
Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
Cc: Declan Doherty <declan.doherty@intel.com>

---

This patch provides the same functionality and supersedes Qi Zhang's
"ether: add flow action to redirect packet to a port" [1].

The main differences are:

- Action is named PORT_ID instead of PORT.
- Addition of a PORT_ID pattern item.
- More extensive documentation.
- Testpmd support.
- rte_flow_copy() support.

[1] http://dpdk.org/ml/archives/dev/2018-April/094648.html
---
 app/test-pmd/cmdline_flow.c                 | 57 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  2 +
 doc/guides/prog_guide/rte_flow.rst          | 48 ++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  9 ++++
 lib/librte_ether/rte_flow.c                 |  2 +
 lib/librte_ether/rte_flow.h                 | 56 +++++++++++++++++++++++
 6 files changed, 174 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index cc78b4f2c..fae3c4b12 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -89,6 +89,8 @@ enum index {
 	ITEM_VF_ID,
 	ITEM_PHY_PORT,
 	ITEM_PHY_PORT_INDEX,
+	ITEM_PORT_ID,
+	ITEM_PORT_ID_ID,
 	ITEM_RAW,
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -185,6 +187,9 @@ enum index {
 	ACTION_PHY_PORT,
 	ACTION_PHY_PORT_ORIGINAL,
 	ACTION_PHY_PORT_INDEX,
+	ACTION_PORT_ID,
+	ACTION_PORT_ID_ORIGINAL,
+	ACTION_PORT_ID_ID,
 	ACTION_METER,
 	ACTION_METER_ID,
 };
@@ -445,6 +450,7 @@ static const enum index next_item[] = {
 	ITEM_PF,
 	ITEM_VF,
 	ITEM_PHY_PORT,
+	ITEM_PORT_ID,
 	ITEM_RAW,
 	ITEM_ETH,
 	ITEM_VLAN,
@@ -491,6 +497,12 @@ static const enum index item_phy_port[] = {
 	ZERO,
 };
 
+static const enum index item_port_id[] = {
+	ITEM_PORT_ID_ID,
+	ITEM_NEXT,
+	ZERO,
+};
+
 static const enum index item_raw[] = {
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -627,6 +639,7 @@ static const enum index next_action[] = {
 	ACTION_PF,
 	ACTION_VF,
 	ACTION_PHY_PORT,
+	ACTION_PORT_ID,
 	ACTION_METER,
 	ZERO,
 };
@@ -668,6 +681,13 @@ static const enum index action_phy_port[] = {
 	ZERO,
 };
 
+static const enum index action_port_id[] = {
+	ACTION_PORT_ID_ORIGINAL,
+	ACTION_PORT_ID_ID,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static const enum index action_meter[] = {
 	ACTION_METER_ID,
 	ACTION_NEXT,
@@ -1084,6 +1104,20 @@ static const struct token token_list[] = {
 		.next = NEXT(item_phy_port, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_phy_port, index)),
 	},
+	[ITEM_PORT_ID] = {
+		.name = "port_id",
+		.help = "match traffic from/to a given DPDK port ID",
+		.priv = PRIV_ITEM(PORT_ID,
+				  sizeof(struct rte_flow_item_port_id)),
+		.next = NEXT(item_port_id),
+		.call = parse_vc,
+	},
+	[ITEM_PORT_ID_ID] = {
+		.name = "id",
+		.help = "DPDK port ID",
+		.next = NEXT(item_port_id, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_port_id, id)),
+	},
 	[ITEM_RAW] = {
 		.name = "raw",
 		.help = "match an arbitrary byte string",
@@ -1749,6 +1783,29 @@ static const struct token token_list[] = {
 					index)),
 		.call = parse_vc_conf,
 	},
+	[ACTION_PORT_ID] = {
+		.name = "port_id",
+		.help = "direct matching traffic to a given DPDK port ID",
+		.priv = PRIV_ACTION(PORT_ID,
+				    sizeof(struct rte_flow_action_port_id)),
+		.next = NEXT(action_port_id),
+		.call = parse_vc,
+	},
+	[ACTION_PORT_ID_ORIGINAL] = {
+		.name = "original",
+		.help = "use original DPDK port ID if possible",
+		.next = NEXT(action_port_id, NEXT_ENTRY(BOOLEAN)),
+		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_port_id,
+					   original, 1)),
+		.call = parse_vc_conf,
+	},
+	[ACTION_PORT_ID_ID] = {
+		.name = "id",
+		.help = "DPDK port ID",
+		.next = NEXT(action_port_id, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_port_id, id)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_METER] = {
 		.name = "meter",
 		.help = "meter the directed packets at given id",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index effb4ff81..4a273eff7 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -961,6 +961,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
+	MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
@@ -1059,6 +1060,7 @@ static const struct {
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
+	MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
 };
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a39c1e1b0..2fb8e9c3f 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -617,6 +617,36 @@ associated with a port_id should be retrieved by other means.
    | ``mask`` | ``index`` | zeroed to match any port index |
    +----------+-----------+--------------------------------+
 
+Item: ``PORT_ID``
+^^^^^^^^^^^^^^^^^
+
+Matches traffic originating from (ingress) or going to (egress) a given DPDK
+port ID.
+
+Normally only supported if the port ID in question is known by the
+underlying PMD and related to the device the flow rule is created against.
+
+This must not be confused with `Item: PHY_PORT`_ which refers to the
+physical port of a device, whereas `Item: PORT_ID`_ refers to a ``struct
+rte_eth_dev`` object on the application side (also known as "port
+representor" depending on the kind of underlying device).
+
+- Default ``mask`` matches the specified DPDK port ID.
+
+.. _table_rte_flow_item_port_id:
+
+.. table:: PORT_ID
+
+   +----------+----------+-----------------------------+
+   | Field    | Subfield | Value                       |
+   +==========+==========+=============================+
+   | ``spec`` | ``id``   | DPDK port ID                |
+   +----------+----------+-----------------------------+
+   | ``last`` | ``id``   | upper range value           |
+   +----------+----------+-----------------------------+
+   | ``mask`` | ``id``   | zeroed to match any port ID |
+   +----------+----------+-----------------------------+
+
 Data matching item types
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1453,6 +1483,24 @@ See `Item: PHY_PORT`_.
    | ``index``    | physical port index                 |
    +--------------+-------------------------------------+
 
+Action: ``PORT_ID``
+^^^^^^^^^^^^^^^^^^^
+Directs matching traffic to a given DPDK port ID.
+
+See `Item: PORT_ID`_.
+
+.. _table_rte_flow_action_port_id:
+
+.. table:: PORT_ID
+
+   +--------------+---------------------------------------+
+   | Field        | Value                                 |
+   +==============+=======================================+
+   | ``original`` | use original DPDK port ID if possible |
+   +--------------+---------------------------------------+
+   | ``id``       | DPDK port ID                          |
+   +--------------+---------------------------------------+
+
 Action: ``METER``
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 64d8dfddb..bfb5ad027 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3212,6 +3212,10 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``index {unsigned}``: physical port index.
 
+- ``port_id``: match traffic from/to a given DPDK port ID.
+
+  - ``id {unsigned}``: DPDK port ID.
+
 - ``raw``: match an arbitrary byte string.
 
   - ``relative {boolean}``: look for pattern after the previous item.
@@ -3428,6 +3432,11 @@ This section lists supported actions and their attributes, if any.
   - ``original {boolean}``: use original port index if possible.
   - ``index {unsigned}``: physical port index.
 
+- ``port_id``: direct matching traffic to a given DPDK port ID.
+
+  - ``original {boolean}``: use original DPDK port ID if possible.
+  - ``id {unsigned}``: DPDK port ID.
+
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index e0fd78dd5..3d8116ebd 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,6 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
+	MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
@@ -77,6 +78,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
+	MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
 };
 
 static int
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index c3ae0c6a8..29a3b26e3 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -180,6 +180,16 @@ enum rte_flow_item_type {
 	RTE_FLOW_ITEM_TYPE_PHY_PORT,
 
 	/**
+	 * [META]
+	 *
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * given DPDK port ID.
+	 *
+	 * See struct rte_flow_item_port_id.
+	 */
+	RTE_FLOW_ITEM_TYPE_PORT_ID,
+
+	/**
 	 * Matches a byte string of a given length at a given offset.
 	 *
 	 * See struct rte_flow_item_raw.
@@ -414,6 +424,32 @@ static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = {
 #endif
 
 /**
+ * RTE_FLOW_ITEM_TYPE_PORT_ID
+ *
+ * Matches traffic originating from (ingress) or going to (egress) a given
+ * DPDK port ID.
+ *
+ * Normally only supported if the port ID in question is known by the
+ * underlying PMD and related to the device the flow rule is created
+ * against.
+ *
+ * This must not be confused with @p PHY_PORT which refers to the physical
+ * port of a device, whereas @p PORT_ID refers to a struct rte_eth_dev
+ * object on the application side (also known as "port representor"
+ * depending on the kind of underlying device).
+ */
+struct rte_flow_item_port_id {
+	uint32_t id; /**< DPDK port ID. */
+};
+
+/** Default mask for RTE_FLOW_ITEM_TYPE_PORT_ID. */
+#ifndef __cplusplus
+static const struct rte_flow_item_port_id rte_flow_item_port_id_mask = {
+	.id = 0xffffffff,
+};
+#endif
+
+/**
  * RTE_FLOW_ITEM_TYPE_RAW
  *
  * Matches a byte string of a given length at a given offset.
@@ -997,6 +1033,13 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_PHY_PORT,
 
 	/**
+	 * Directs matching traffic to a given DPDK port ID.
+	 *
+	 * See struct rte_flow_action_port_id.
+	 */
+	RTE_FLOW_ACTION_TYPE_PORT_ID,
+
+	/**
 	 * Traffic metering and policing (MTR).
 	 *
 	 * See struct rte_flow_action_meter.
@@ -1134,6 +1177,19 @@ struct rte_flow_action_phy_port {
 };
 
 /**
+ * RTE_FLOW_ACTION_TYPE_PORT_ID
+ *
+ * Directs matching traffic to a given DPDK port ID.
+ *
+ * @see RTE_FLOW_ITEM_TYPE_PORT_ID
+ */
+struct rte_flow_action_port_id {
+	uint32_t original:1; /**< Use original DPDK port ID if possible. */
+	uint32_t reserved:31; /**< Reserved, must be zero. */
+	uint32_t id; /**< DPDK port ID. */
+};
+
+/**
  * RTE_FLOW_ACTION_TYPE_METER
  *
  * Traffic metering and policing (MTR).
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (10 preceding siblings ...)
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item " Adrien Mazarguil
@ 2018-04-10 16:37  3%     ` Adrien Mazarguil
  2018-04-11 13:00  0%       ` Andrew Rybchenko
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and " Adrien Mazarguil
  2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:37 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z

This patch adds the missing action counterpart to the PHY_PORT pattern
item, that is, the ability to directly inject matching traffic into a
physical port of the underlying device.
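
A small sketch of how the new action structure might be filled in (the
physical port index 1 is a hypothetical value assumed valid for the device):

#include <rte_flow.h>

/* Direct matching traffic to physical port index 1 of the device. */
static const struct rte_flow_action_phy_port phy_conf = {
	.original = 0, /* do not fall back to the original port index */
	.index = 1,    /* hypothetical physical port index */
};
static const struct rte_flow_action phy_action = {
	.type = RTE_FLOW_ACTION_TYPE_PHY_PORT,
	.conf = &phy_conf,
};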

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
 6 files changed, 84 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index a0dbec119..cc78b4f2c 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -182,6 +182,9 @@ enum index {
 	ACTION_VF,
 	ACTION_VF_ORIGINAL,
 	ACTION_VF_ID,
+	ACTION_PHY_PORT,
+	ACTION_PHY_PORT_ORIGINAL,
+	ACTION_PHY_PORT_INDEX,
 	ACTION_METER,
 	ACTION_METER_ID,
 };
@@ -623,6 +626,7 @@ static const enum index next_action[] = {
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
+	ACTION_PHY_PORT,
 	ACTION_METER,
 	ZERO,
 };
@@ -657,6 +661,13 @@ static const enum index action_vf[] = {
 	ZERO,
 };
 
+static const enum index action_phy_port[] = {
+	ACTION_PHY_PORT_ORIGINAL,
+	ACTION_PHY_PORT_INDEX,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static const enum index action_meter[] = {
 	ACTION_METER_ID,
 	ACTION_NEXT,
@@ -1714,6 +1725,30 @@ static const struct token token_list[] = {
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
 		.call = parse_vc_conf,
 	},
+	[ACTION_PHY_PORT] = {
+		.name = "phy_port",
+		.help = "direct packets to physical port index",
+		.priv = PRIV_ACTION(PHY_PORT,
+				    sizeof(struct rte_flow_action_phy_port)),
+		.next = NEXT(action_phy_port),
+		.call = parse_vc,
+	},
+	[ACTION_PHY_PORT_ORIGINAL] = {
+		.name = "original",
+		.help = "use original port index if possible",
+		.next = NEXT(action_phy_port, NEXT_ENTRY(BOOLEAN)),
+		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_phy_port,
+					   original, 1)),
+		.call = parse_vc_conf,
+	},
+	[ACTION_PHY_PORT_INDEX] = {
+		.name = "index",
+		.help = "physical port index",
+		.next = NEXT(action_phy_port, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_phy_port,
+					index)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_METER] = {
 		.name = "meter",
 		.help = "meter the directed packets at given id",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 9f968919e..effb4ff81 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1058,6 +1058,7 @@ static const struct {
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
+	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
 };
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 4e053c24b..a39c1e1b0 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1433,6 +1433,26 @@ See `Item: VF`_.
    | ``id``       | VF ID                          |
    +--------------+--------------------------------+
 
+Action: ``PHY_PORT``
+^^^^^^^^^^^^^^^^^^^^
+
+Directs matching traffic to a given physical port index of the underlying
+device.
+
+See `Item: PHY_PORT`_.
+
+.. _table_rte_flow_action_phy_port:
+
+.. table:: PHY_PORT
+
+   +--------------+-------------------------------------+
+   | Field        | Value                               |
+   +==============+=====================================+
+   | ``original`` | use original port index if possible |
+   +--------------+-------------------------------------+
+   | ``index``    | physical port index                 |
+   +--------------+-------------------------------------+
+
 Action: ``METER``
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index a2bbd1930..64d8dfddb 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3423,6 +3423,11 @@ This section lists supported actions and their attributes, if any.
   - ``original {boolean}``: use original VF ID if possible.
   - ``id {unsigned}``: VF ID.
 
+- ``phy_port``: direct packets to physical port index.
+
+  - ``original {boolean}``: use original port index if possible.
+  - ``index {unsigned}``: physical port index.
+
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 6d4d7f5ed..e0fd78dd5 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -76,6 +76,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
+	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
 };
 
 static int
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index ed211a8eb..c3ae0c6a8 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -989,6 +989,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VF,
 
 	/**
+	 * Directs packets to a given physical port index of the underlying
+	 * device.
+	 *
+	 * See struct rte_flow_action_phy_port.
+	 */
+	RTE_FLOW_ACTION_TYPE_PHY_PORT,
+
+	/**
 	 * Traffic metering and policing (MTR).
 	 *
 	 * See struct rte_flow_action_meter.
@@ -1112,6 +1120,20 @@ struct rte_flow_action_vf {
 };
 
 /**
+ * RTE_FLOW_ACTION_TYPE_PHY_PORT
+ *
+ * Directs packets to a given physical port index of the underlying
+ * device.
+ *
+ * @see RTE_FLOW_ITEM_TYPE_PHY_PORT
+ */
+struct rte_flow_action_phy_port {
+	uint32_t original:1; /**< Use original port index if possible. */
+	uint32_t reserved:31; /**< Reserved, must be zero. */
+	uint32_t index; /**< Physical port index. */
+};
+
+/**
  * RTE_FLOW_ACTION_TYPE_METER
  *
  * Traffic metering and policing (MTR).
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item in flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (9 preceding siblings ...)
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
@ 2018-04-10 16:37  2%     ` Adrien Mazarguil
  2018-04-11 12:57  0%       ` Andrew Rybchenko
  2018-04-10 16:37  3%     ` [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to " Adrien Mazarguil
                       ` (2 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:37 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

While RTE_FLOW_ITEM_TYPE_PORT refers to physical ports of the underlying
device using specific identifiers, these are often confused with DPDK port
IDs exposed to applications in the global name space.

Since this pattern item is seldom used, rename it to
RTE_FLOW_ITEM_TYPE_PHY_PORT for better clarity.

No ABI impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 27 +++++++++++----------
 app/test-pmd/config.c                       |  2 +-
 doc/guides/prog_guide/rte_flow.rst          | 22 ++++++++---------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 +-
 lib/librte_ether/rte_flow.c                 |  2 +-
 lib/librte_ether/rte_flow.h                 | 31 ++++++++++--------------
 6 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index af0631036..a0dbec119 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -87,8 +87,8 @@ enum index {
 	ITEM_PF,
 	ITEM_VF,
 	ITEM_VF_ID,
-	ITEM_PORT,
-	ITEM_PORT_INDEX,
+	ITEM_PHY_PORT,
+	ITEM_PHY_PORT_INDEX,
 	ITEM_RAW,
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -441,7 +441,7 @@ static const enum index next_item[] = {
 	ITEM_ANY,
 	ITEM_PF,
 	ITEM_VF,
-	ITEM_PORT,
+	ITEM_PHY_PORT,
 	ITEM_RAW,
 	ITEM_ETH,
 	ITEM_VLAN,
@@ -482,8 +482,8 @@ static const enum index item_vf[] = {
 	ZERO,
 };
 
-static const enum index item_port[] = {
-	ITEM_PORT_INDEX,
+static const enum index item_phy_port[] = {
+	ITEM_PHY_PORT_INDEX,
 	ITEM_NEXT,
 	ZERO,
 };
@@ -1059,18 +1059,19 @@ static const struct token token_list[] = {
 		.next = NEXT(item_vf, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_vf, id)),
 	},
-	[ITEM_PORT] = {
-		.name = "port",
-		.help = "device-specific physical port index to use",
-		.priv = PRIV_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-		.next = NEXT(item_port),
+	[ITEM_PHY_PORT] = {
+		.name = "phy_port",
+		.help = "match traffic from/to a specific physical port",
+		.priv = PRIV_ITEM(PHY_PORT,
+				  sizeof(struct rte_flow_item_phy_port)),
+		.next = NEXT(item_phy_port),
 		.call = parse_vc,
 	},
-	[ITEM_PORT_INDEX] = {
+	[ITEM_PHY_PORT_INDEX] = {
 		.name = "index",
 		.help = "physical port index",
-		.next = NEXT(item_port, NEXT_ENTRY(UNSIGNED), item_param),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_port, index)),
+		.next = NEXT(item_phy_port, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_phy_port, index)),
 	},
 	[ITEM_RAW] = {
 		.name = "raw",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 49ef87782..9f968919e 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -960,7 +960,7 @@ static const struct {
 	MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)),
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
-	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
+	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a0a124aa2..4e053c24b 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -195,8 +195,8 @@ When supported, this effectively enables an application to reroute traffic
 not necessarily intended for it (e.g. coming from or addressed to different
 physical ports, VFs or applications) at the device level.
 
-It complements the behavior of some pattern items such as `Item: PORT`_ and
-is meaningless without them.
+It complements the behavior of some pattern items such as `Item: PHY_PORT`_
+and is meaningless without them.
 
 When transferring flow rules, **ingress** and **egress** attributes
 (`Attribute: Traffic direction`_) keep their original meaning, as if
@@ -583,15 +583,15 @@ separate entities, should be addressed through their own DPDK port IDs.
    | ``mask`` | ``id``   | zeroed to match any VF ID |
    +----------+----------+---------------------------+
 
-Item: ``PORT``
-^^^^^^^^^^^^^^
+Item: ``PHY_PORT``
+^^^^^^^^^^^^^^^^^^
 
-Matches packets coming from the specified physical port of the underlying
-device.
+Matches traffic originating from (ingress) or going to (egress) a physical
+port of the underlying device.
 
-The first PORT item overrides the physical port normally associated with the
-specified DPDK input port (port_id). This item can be provided several times
-to match additional physical ports.
+The first PHY_PORT item overrides the physical port normally associated with
+the specified DPDK input port (port_id). This item can be provided several
+times to match additional physical ports.
 
 Note that physical ports are not necessarily tied to DPDK input ports
 (port_id) when those are not under DPDK control. Possible values are
@@ -603,9 +603,9 @@ associated with a port_id should be retrieved by other means.
 
 - Default ``mask`` matches any port index.
 
-.. _table_rte_flow_item_port:
+.. _table_rte_flow_item_phy_port:
 
-.. table:: PORT
+.. table:: PHY_PORT
 
    +----------+-----------+--------------------------------+
    | Field    | Subfield  | Value                          |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index af37c3d82..a2bbd1930 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3208,7 +3208,7 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``id {unsigned}``: VF ID.
 
-- ``port``: device-specific physical port index to use.
+- ``phy_port``: match traffic from/to a specific physical port.
 
   - ``index {unsigned}``: physical port index.
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 1f247d656..6d4d7f5ed 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -38,7 +38,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)),
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
-	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
+	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index f3db2ec01..ed211a8eb 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -84,7 +84,7 @@ struct rte_flow_attr {
 	 * applications) at the device level.
 	 *
 	 * It complements the behavior of some pattern items such as
-	 * RTE_FLOW_ITEM_TYPE_PORT and is meaningless without them.
+	 * RTE_FLOW_ITEM_TYPE_PHY_PORT and is meaningless without them.
 	 *
 	 * When transferring flow rules, ingress and egress attributes keep
 	 * their original meaning, as if processing traffic emitted or
@@ -172,17 +172,12 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets coming from the specified physical port of the
-	 * underlying device.
-	 *
-	 * The first PORT item overrides the physical port normally
-	 * associated with the specified DPDK input port (port_id). This
-	 * item can be provided several times to match additional physical
-	 * ports.
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * physical port of the underlying device.
 	 *
-	 * See struct rte_flow_item_port.
+	 * See struct rte_flow_item_phy_port.
 	 */
-	RTE_FLOW_ITEM_TYPE_PORT,
+	RTE_FLOW_ITEM_TYPE_PHY_PORT,
 
 	/**
 	 * Matches a byte string of a given length at a given offset.
@@ -388,13 +383,13 @@ static const struct rte_flow_item_vf rte_flow_item_vf_mask = {
 #endif
 
 /**
- * RTE_FLOW_ITEM_TYPE_PORT
+ * RTE_FLOW_ITEM_TYPE_PHY_PORT
  *
- * Matches packets coming from the specified physical port of the underlying
- * device.
+ * Matches traffic originating from (ingress) or going to (egress) a
+ * physical port of the underlying device.
  *
- * The first PORT item overrides the physical port normally associated with
- * the specified DPDK input port (port_id). This item can be provided
+ * The first PHY_PORT item overrides the physical port normally associated
+ * with the specified DPDK input port (port_id). This item can be provided
  * several times to match additional physical ports.
  *
  * Note that physical ports are not necessarily tied to DPDK input ports
@@ -407,13 +402,13 @@ static const struct rte_flow_item_vf rte_flow_item_vf_mask = {
  *
  * A zeroed mask can be used to match any port index.
  */
-struct rte_flow_item_port {
+struct rte_flow_item_phy_port {
 	uint32_t index; /**< Physical port index. */
 };
 
-/** Default mask for RTE_FLOW_ITEM_TYPE_PORT. */
+/** Default mask for RTE_FLOW_ITEM_TYPE_PHY_PORT. */
 #ifndef __cplusplus
-static const struct rte_flow_item_port rte_flow_item_port_mask = {
+static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = {
 	.index = 0x00000000,
 };
 #endif
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 13/16] ethdev: update behavior of VF/PF in flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (8 preceding siblings ...)
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
@ 2018-04-10 16:37  2%     ` Adrien Mazarguil
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item " Adrien Mazarguil
                       ` (3 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:37 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Somnath Kotur, Beilei Xing, Qi Zhang

Contrary to all other pattern items, these are inconsistently documented as
affecting traffic instead of simply matching its origin, without provision
for the latter.

This commit clarifies documentation and updates PMDs since the original
behavior now has to be explicitly requested using the new transfer
attribute.

It breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

Impacted PMDs are bnxt and i40e, for which the VF pattern item is now only
supported when a transfer attribute is also present.
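
From an application's perspective, the new requirement might look like the
sketch below (the VF ID and the queue action are hypothetical; the transfer
attribute is the point being illustrated):

#include <rte_flow.h>

/* Matching VF traffic on bnxt/i40e now requires the transfer attribute. */
static int
validate_vf_match(uint16_t port_id, struct rte_flow_error *error)
{
	struct rte_flow_attr attr = { .ingress = 1, .transfer = 1 };
	struct rte_flow_item_vf vf_spec = { .id = 3 }; /* hypothetical VF ID */
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_VF, .spec = &vf_spec },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action_queue queue = { .index = 0 };
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	return rte_flow_validate(port_id, &attr, pattern, actions, error);
}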

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Somnath Kotur <somnath.kotur@broadcom.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 12 +++---
 doc/guides/prog_guide/rte_flow.rst          | 36 +++++++++---------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 12 +++---
 drivers/net/bnxt/bnxt_filter.c              | 22 ++++++-----
 drivers/net/i40e/i40e_flow.c                | 23 +++++++-----
 lib/librte_ether/rte_flow.h                 | 47 ++++++++++--------------
 6 files changed, 77 insertions(+), 75 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index a06f3f82b..af0631036 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -1041,21 +1041,21 @@ static const struct token token_list[] = {
 	},
 	[ITEM_PF] = {
 		.name = "pf",
-		.help = "match packets addressed to the physical function",
+		.help = "match traffic from/to the physical function",
 		.priv = PRIV_ITEM(PF, 0),
 		.next = NEXT(NEXT_ENTRY(ITEM_NEXT)),
 		.call = parse_vc,
 	},
 	[ITEM_VF] = {
 		.name = "vf",
-		.help = "match packets addressed to a virtual function ID",
+		.help = "match traffic from/to a virtual function ID",
 		.priv = PRIV_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 		.next = NEXT(item_vf),
 		.call = parse_vc,
 	},
 	[ITEM_VF_ID] = {
 		.name = "id",
-		.help = "destination VF ID",
+		.help = "VF ID",
 		.next = NEXT(item_vf, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_vf, id)),
 	},
@@ -1686,14 +1686,14 @@ static const struct token token_list[] = {
 	},
 	[ACTION_PF] = {
 		.name = "pf",
-		.help = "redirect packets to physical device function",
+		.help = "direct traffic to physical function",
 		.priv = PRIV_ACTION(PF, 0),
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
 	[ACTION_VF] = {
 		.name = "vf",
-		.help = "redirect packets to virtual device function",
+		.help = "direct traffic to a virtual function ID",
 		.priv = PRIV_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 		.next = NEXT(action_vf),
 		.call = parse_vc,
@@ -1708,7 +1708,7 @@ static const struct token token_list[] = {
 	},
 	[ACTION_VF_ID] = {
 		.name = "id",
-		.help = "VF ID to redirect packets to",
+		.help = "VF ID",
 		.next = NEXT(action_vf, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
 		.call = parse_vc_conf,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 550a4c95b..a0a124aa2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -528,15 +528,12 @@ Usage example, matching non-TCPv4 packets only:
 Item: ``PF``
 ^^^^^^^^^^^^
 
-Matches packets addressed to the physical function of the device.
+Matches traffic originating from (ingress) or going to (egress) the physical
+function of the current device.
 
-If the underlying device function differs from the one that would normally
-receive the matched traffic, specifying this item prevents it from reaching
-that device unless the flow rule contains a `Action: PF`_. Packets are not
-duplicated between device instances by default.
+If supported, should work even if the physical function is not managed by
+the application and thus not associated with a DPDK port ID.
 
-- Likely to return an error or never match any traffic if applied to a VF
-  device.
 - Can be combined with any number of `Item: VF`_ to match both PF and VF
   traffic.
 - ``spec``, ``last`` and ``mask`` must not be set.
@@ -558,15 +555,15 @@ duplicated between device instances by default.
 Item: ``VF``
 ^^^^^^^^^^^^
 
-Matches packets addressed to a virtual function ID of the device.
+Matches traffic originating from (ingress) or going to (egress) a given
+virtual function of the current device.
 
-If the underlying device function differs from the one that would normally
-receive the matched traffic, specifying this item prevents it from reaching
-that device unless the flow rule contains a `Action: VF`_. Packets are not
-duplicated between device instances by default.
+If supported, should work even if the virtual function is not managed by the
+application and thus not associated with a DPDK port ID.
+
+Note this pattern item does not match VF representors traffic which, as
+separate entities, should be addressed through their own DPDK port IDs.
 
-- Likely to return an error or never match any traffic if this causes a VF
-  device to match traffic addressed to a different VF.
 - Can be specified multiple times to match traffic addressed to several VF
   IDs.
 - Can be combined with a PF item to match both PF and VF traffic.
@@ -1395,7 +1392,10 @@ only matching traffic goes through.
 Action: ``PF``
 ^^^^^^^^^^^^^^
 
-Redirects packets to the physical function (PF) of the current device.
+Directs matching traffic to the physical function (PF) of the current
+device.
+
+See `Item: PF`_.
 
 - No configurable properties.
 
@@ -1412,13 +1412,15 @@ Redirects packets to the physical function (PF) of the current device.
 Action: ``VF``
 ^^^^^^^^^^^^^^
 
-Redirects packets to a virtual function (VF) of the current device.
+Directs matching traffic to a given virtual function of the current device.
 
 Packets matched by a VF pattern item can be redirected to their original VF
 ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
+See `Item: VF`_.
+
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1428,7 +1430,7 @@ rule or if packets are not addressed to a VF in the first place.
    +==============+================================+
    | ``original`` | use original VF ID if possible |
    +--------------+--------------------------------+
-   | ``vf``       | VF ID to redirect packets to   |
+   | ``id``       | VF ID                          |
    +--------------+--------------------------------+
 
 Action: ``METER``
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 0bf6c33c9..af37c3d82 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3202,11 +3202,11 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``num {unsigned}``: number of layers covered.
 
-- ``pf``: match packets addressed to the physical function.
+- ``pf``: match traffic from/to the physical function.
 
-- ``vf``: match packets addressed to a virtual function ID.
+- ``vf``: match traffic from/to a virtual function ID.
 
-  - ``id {unsigned}``: destination VF ID.
+  - ``id {unsigned}``: VF ID.
 
 - ``port``: device-specific physical port index to use.
 
@@ -3416,12 +3416,12 @@ This section lists supported actions and their attributes, if any.
 
   - ``queues [{unsigned} [...]] end``: queue indices to use.
 
-- ``pf``: redirect packets to physical device function.
+- ``pf``: direct traffic to physical function.
 
-- ``vf``: redirect packets to virtual device function.
+- ``vf``: direct traffic to a virtual function ID.
 
   - ``original {boolean}``: use original VF ID if possible.
-  - ``id {unsigned}``: VF ID to redirect packets to.
+  - ``id {unsigned}``: VF ID.
 
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index bd166370a..f964b5ea4 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -275,6 +275,7 @@ bnxt_filter_type_check(const struct rte_flow_item pattern[],
 
 static int
 bnxt_validate_and_parse_flow_type(struct bnxt *bp,
+				  const struct rte_flow_attr *attr,
 				  const struct rte_flow_item pattern[],
 				  struct rte_flow_error *error,
 				  struct bnxt_filter_info *filter)
@@ -699,6 +700,16 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 				return -rte_errno;
 			}
 
+			if (!attr->transfer) {
+				rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ITEM,
+					   item,
+					   "Matching VF traffic without"
+					   " affecting it (transfer attribute)"
+					   " is unsupported");
+				return -rte_errno;
+			}
+
 			filter->mirror_vnic_id =
 			dflt_vnic = bnxt_hwrm_func_qcfg_vf_dflt_vnic_id(bp, vf);
 			if (dflt_vnic < 0) {
@@ -746,14 +757,6 @@ bnxt_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
-	if (attr->transfer) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
-				   attr, "No support for transfer.");
-		return -rte_errno;
-	}
-
-	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -833,7 +836,8 @@ bnxt_validate_and_parse_flow(struct rte_eth_dev *dev,
 		goto ret;
 	}
 
-	rc = bnxt_validate_and_parse_flow_type(bp, pattern, error, filter);
+	rc = bnxt_validate_and_parse_flow_type(bp, attr, pattern, error,
+					       filter);
 	if (rc != 0)
 		goto ret;
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index b004357f1..b0aee0ef7 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -54,6 +54,7 @@ static int i40e_flow_parse_ethertype_action(struct rte_eth_dev *dev,
 				    struct rte_flow_error *error,
 				    struct rte_eth_ethertype_filter *filter);
 static int i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
+					const struct rte_flow_attr *attr,
 					const struct rte_flow_item *pattern,
 					struct rte_flow_error *error,
 					struct i40e_fdir_filter_conf *filter);
@@ -1918,14 +1919,6 @@ i40e_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
-	if (attr->transfer) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
-				   attr, "No support for transfer.");
-		return -rte_errno;
-	}
-
-	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -2429,6 +2422,7 @@ i40e_flow_fdir_get_pctype_value(struct i40e_pf *pf,
  */
 static int
 i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
+			     const struct rte_flow_attr *attr,
 			     const struct rte_flow_item *pattern,
 			     struct rte_flow_error *error,
 			     struct i40e_fdir_filter_conf *filter)
@@ -2966,6 +2960,16 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 			break;
 		case RTE_FLOW_ITEM_TYPE_VF:
 			vf_spec = item->spec;
+			if (!attr->transfer) {
+				rte_flow_error_set(error, ENOTSUP,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "Matching VF traffic"
+						   " without affecting it"
+						   " (transfer attribute)"
+						   " is unsupported");
+				return -rte_errno;
+			}
 			filter->input.flow_ext.is_vf = 1;
 			filter->input.flow_ext.dst_id = vf_spec->id;
 			if (filter->input.flow_ext.is_vf &&
@@ -3128,7 +3132,8 @@ i40e_flow_parse_fdir_filter(struct rte_eth_dev *dev,
 		&filter->fdir_filter;
 	int ret;
 
-	ret = i40e_flow_parse_fdir_pattern(dev, pattern, error, fdir_filter);
+	ret = i40e_flow_parse_fdir_pattern(dev, attr, pattern, error,
+					   fdir_filter);
 	if (ret)
 		return ret;
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index fc7df68d3..f3db2ec01 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -152,13 +152,8 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets addressed to the physical function of the device.
-	 *
-	 * If the underlying device function differs from the one that would
-	 * normally receive the matched traffic, specifying this item
-	 * prevents it from reaching that device unless the flow rule
-	 * contains a PF action. Packets are not duplicated between device
-	 * instances by default.
+	 * Matches traffic originating from (ingress) or going to (egress)
+	 * the physical function of the current device.
 	 *
 	 * No associated specification structure.
 	 */
@@ -167,13 +162,8 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets addressed to a virtual function ID of the device.
-	 *
-	 * If the underlying device function differs from the one that would
-	 * normally receive the matched traffic, specifying this item
-	 * prevents it from reaching that device unless the flow rule
-	 * contains a VF action. Packets are not duplicated between device
-	 * instances by default.
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * given virtual function of the current device.
 	 *
 	 * See struct rte_flow_item_vf.
 	 */
@@ -371,15 +361,15 @@ static const struct rte_flow_item_any rte_flow_item_any_mask = {
 /**
  * RTE_FLOW_ITEM_TYPE_VF
  *
- * Matches packets addressed to a virtual function ID of the device.
+ * Matches traffic originating from (ingress) or going to (egress) a given
+ * virtual function of the current device.
  *
- * If the underlying device function differs from the one that would
- * normally receive the matched traffic, specifying this item prevents it
- * from reaching that device unless the flow rule contains a VF
- * action. Packets are not duplicated between device instances by default.
+ * If supported, should work even if the virtual function is not managed by
+ * the application and thus not associated with a DPDK port ID.
+ *
+ * Note this pattern item does not match VF representors traffic which, as
+ * separate entities, should be addressed through their own DPDK port IDs.
  *
- * - Likely to return an error or never match any traffic if this causes a
- *   VF device to match traffic addressed to a different VF.
  * - Can be specified multiple times to match traffic addressed to several
  *   VF IDs.
  * - Can be combined with a PF item to match both PF and VF traffic.
@@ -387,7 +377,7 @@ static const struct rte_flow_item_any rte_flow_item_any_mask = {
  * A zeroed mask can be used to match any VF ID.
  */
 struct rte_flow_item_vf {
-	uint32_t id; /**< Destination VF ID. */
+	uint32_t id; /**< VF ID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_VF. */
@@ -988,16 +978,16 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_RSS,
 
 	/**
-	 * Redirects packets to the physical function (PF) of the current
-	 * device.
+	 * Directs matching traffic to the physical function (PF) of the
+	 * current device.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PF,
 
 	/**
-	 * Redirects packets to the virtual function (VF) of the current
-	 * device with the specified ID.
+	 * Directs matching traffic to a given virtual function of the
+	 * current device.
 	 *
 	 * See struct rte_flow_action_vf.
 	 */
@@ -1111,7 +1101,8 @@ struct rte_flow_action_rss {
 /**
  * RTE_FLOW_ACTION_TYPE_VF
  *
- * Redirects packets to a virtual function (VF) of the current device.
+ * Directs matching traffic to a given virtual function of the current
+ * device.
  *
  * Packets matched by a VF pattern item can be redirected to their original
  * VF ID instead of the specified one. This parameter may not be available
@@ -1122,7 +1113,7 @@ struct rte_flow_action_rss {
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
 	uint32_t reserved:31; /**< Reserved, must be zero. */
-	uint32_t id; /**< VF ID to redirect packets to. */
+	uint32_t id; /**< VF ID. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 12/16] ethdev: add transfer attribute to flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (7 preceding siblings ...)
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
@ 2018-04-10 16:37  2%     ` Adrien Mazarguil
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
                       ` (4 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:37 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Andrew Rybchenko

This new attribute enables applications to create flow rules that do not
simply match traffic whose origin is specified in the pattern (e.g. some
non-default physical port or VF), but actively affect it by applying the
flow rule at the lowest possible level in the underlying device.

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>

---

v3 changes:

Clarified definition for ingress and egress following Andrew's comment on
subsequent patch.

[1] http://dpdk.org/ml/archives/dev/2018-April/095961.html
---
 app/test-pmd/cmdline_flow.c                 | 11 +++++
 app/test-pmd/config.c                       |  6 ++-
 doc/guides/prog_guide/rte_flow.rst          | 26 +++++++++++-
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 11 ++---
 drivers/net/bnxt/bnxt_filter.c              |  8 ++++
 drivers/net/e1000/igb_flow.c                | 44 ++++++++++++++++++++
 drivers/net/enic/enic_flow.c                |  6 +++
 drivers/net/i40e/i40e_flow.c                |  8 ++++
 drivers/net/ixgbe/ixgbe_flow.c              | 53 ++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_flow.c                |  4 ++
 drivers/net/mlx5/mlx5_flow.c                |  7 ++++
 drivers/net/mvpp2/mrvl_flow.c               |  6 +++
 drivers/net/sfc/sfc_flow.c                  |  6 +++
 drivers/net/tap/tap_flow.c                  |  6 +++
 lib/librte_ether/rte_flow.h                 | 22 +++++++++-
 15 files changed, 215 insertions(+), 9 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 49217d5bc..a06f3f82b 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -69,6 +69,7 @@ enum index {
 	PRIORITY,
 	INGRESS,
 	EGRESS,
+	TRANSFER,
 
 	/* Validate/create pattern. */
 	PATTERN,
@@ -407,6 +408,7 @@ static const enum index next_vc_attr[] = {
 	PRIORITY,
 	INGRESS,
 	EGRESS,
+	TRANSFER,
 	PATTERN,
 	ZERO,
 };
@@ -960,6 +962,12 @@ static const struct token token_list[] = {
 		.next = NEXT(next_vc_attr),
 		.call = parse_vc,
 	},
+	[TRANSFER] = {
+		.name = "transfer",
+		.help = "apply rule directly to endpoints found in pattern",
+		.next = NEXT(next_vc_attr),
+		.call = parse_vc,
+	},
 	/* Validate/create pattern. */
 	[PATTERN] = {
 		.name = "pattern",
@@ -1945,6 +1953,9 @@ parse_vc(struct context *ctx, const struct token *token,
 	case EGRESS:
 		out->args.vc.attr.egress = 1;
 		return len;
+	case TRANSFER:
+		out->args.vc.attr.transfer = 1;
+		return len;
 	case PATTERN:
 		out->args.vc.pattern =
 			(void *)RTE_ALIGN_CEIL((uintptr_t)(out + 1),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index c0fefe475..49ef87782 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1223,6 +1223,7 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
 		[RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
+		[RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
 		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
@@ -1488,12 +1489,13 @@ port_flow_list(portid_t port_id, uint32_t n, const uint32_t group[n])
 		const struct rte_flow_item *item = pf->pattern;
 		const struct rte_flow_action *action = pf->actions;
 
-		printf("%" PRIu32 "\t%" PRIu32 "\t%" PRIu32 "\t%c%c\t",
+		printf("%" PRIu32 "\t%" PRIu32 "\t%" PRIu32 "\t%c%c%c\t",
 		       pf->id,
 		       pf->attr.group,
 		       pf->attr.priority,
 		       pf->attr.ingress ? 'i' : '-',
-		       pf->attr.egress ? 'e' : '-');
+		       pf->attr.egress ? 'e' : '-',
+		       pf->attr.transfer ? 't' : '-');
 		while (item->type != RTE_FLOW_ITEM_TYPE_END) {
 			if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
 				printf("%s ", flow_item[item->type].name);
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index c62a80566..550a4c95b 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -170,7 +170,13 @@ Note that support for more than a single priority level is not guaranteed.
 Attribute: Traffic direction
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Flow rules can apply to inbound and/or outbound traffic (ingress/egress).
+Flow rule patterns apply to inbound and/or outbound traffic.
+
+In the context of this API, **ingress** and **egress** respectively stand
+for **inbound** and **outbound** based on the standpoint of the application
+creating a flow rule.
+
+There are no exceptions to this definition.
 
 Several pattern items and actions are valid and can be used in both
 directions. At least one direction must be specified.
@@ -178,6 +184,24 @@ directions. At least one direction must be specified.
 Specifying both directions at once for a given rule is not recommended but
 may be valid in a few cases (e.g. shared counters).
 
+Attribute: Transfer
+^^^^^^^^^^^^^^^^^^^
+
+Instead of simply matching the properties of traffic as it would appear on a
+given DPDK port ID, enabling this attribute transfers a flow rule to the
+lowest possible level of any device endpoints found in the pattern.
+
+When supported, this effectively enables an application to reroute traffic
+not necessarily intended for it (e.g. coming from or addressed to different
+physical ports, VFs or applications) at the device level.
+
+It complements the behavior of some pattern items such as `Item: PORT`_ and
+is meaningless without them.
+
+When transferring flow rules, **ingress** and **egress** attributes
+(`Attribute: Traffic direction`_) keep their original meaning, as if
+processing traffic emitted or received by the application.
+
 Pattern item
 ~~~~~~~~~~~~
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 923664f7d..0bf6c33c9 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -2970,14 +2970,14 @@ following sections.
 - Check whether a flow rule can be created::
 
    flow validate {port_id}
-       [group {group_id}] [priority {level}] [ingress] [egress]
+       [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
        pattern {item} [/ {item} [...]] / end
        actions {action} [/ {action} [...]] / end
 
 - Create a flow rule::
 
    flow create {port_id}
-       [group {group_id}] [priority {level}] [ingress] [egress]
+       [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
        pattern {item} [/ {item} [...]] / end
        actions {action} [/ {action} [...]] / end
 
@@ -3010,7 +3010,7 @@ underlying device in its current state but stops short of creating it. It is
 bound to ``rte_flow_validate()``::
 
    flow validate {port_id}
-      [group {group_id}] [priority {level}] [ingress] [egress]
+      [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
       pattern {item} [/ {item} [...]] / end
       actions {action} [/ {action} [...]] / end
 
@@ -3047,7 +3047,7 @@ Creating flow rules
 to ``rte_flow_create()``::
 
    flow create {port_id}
-      [group {group_id}] [priority {level}] [ingress] [egress]
+      [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
       pattern {item} [/ {item} [...]] / end
       actions {action} [/ {action} [...]] / end
 
@@ -3061,7 +3061,7 @@ Otherwise it will show an error message of the form::
 
 Parameters describe in the following order:
 
-- Attributes (*group*, *priority*, *ingress*, *egress* tokens).
+- Attributes (*group*, *priority*, *ingress*, *egress*, *transfer* tokens).
 - A matching pattern, starting with the *pattern* token and terminated by an
   *end* pattern item.
 - Actions, starting with the *actions* token and terminated by an *end*
@@ -3089,6 +3089,7 @@ specified before the ``pattern`` token.
 - ``priority {level}``: priority level within group.
 - ``ingress``: rule applies to ingress traffic.
 - ``egress``: rule applies to egress traffic.
+- ``transfer``: apply rule directly to endpoints found in pattern.
 
 Each instance of an attribute specified several times overrides the previous
 value as shown below (group 4 is used)::
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 9bb1575cb..bd166370a 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -746,6 +746,14 @@ bnxt_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index d1c0b4b8d..073852913 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -379,6 +379,15 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -624,6 +633,14 @@ cons_parse_ethertype_filter(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -923,6 +940,15 @@ cons_parse_syn_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_syn_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	/* Support 2 priorities, the lowest or highest. */
 	if (!attr->priority) {
 		filter->hig_pri = 0;
@@ -1211,6 +1237,15 @@ cons_parse_flex_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_flex_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_flex_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -1361,6 +1396,15 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(rss_conf, 0, sizeof(struct igb_rte_flow_rss_conf));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(rss_conf, 0, sizeof(struct igb_rte_flow_rss_conf));
 		rte_flow_error_set(error, EINVAL,
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index 20d6b9d59..3a0086399 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -1318,6 +1318,12 @@ enic_flow_parse(struct rte_eth_dev *dev,
 					   NULL,
 					   "egress is not supported");
 			return -rte_errno;
+		} else if (attrs->transfer) {
+			rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+					   NULL,
+					   "transfer is not supported");
+			return -rte_errno;
 		} else if (!attrs->ingress) {
 			rte_flow_error_set(error, ENOTSUP,
 					   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index e3d83eac7..b004357f1 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -1918,6 +1918,14 @@ i40e_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 438bfcdfb..eb0644c82 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -557,6 +557,15 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -787,6 +796,14 @@ cons_parse_ethertype_filter(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -1078,6 +1095,15 @@ cons_parse_syn_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_syn_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	/* Support 2 priorities, the lowest or highest. */
 	if (!attr->priority) {
 		filter->hig_pri = 0;
@@ -1250,6 +1276,15 @@ cons_parse_l2_tn_filter(struct rte_eth_dev *dev,
 	}
 
 	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_l2_tunnel_conf));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* not supported */
 	if (attr->priority) {
 		memset(filter, 0, sizeof(struct rte_eth_l2_tunnel_conf));
 		rte_flow_error_set(error, EINVAL,
@@ -1354,6 +1389,15 @@ ixgbe_parse_fdir_act_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* not supported */
+	if (attr->transfer) {
+		memset(rule, 0, sizeof(struct ixgbe_fdir_rule));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* not supported */
 	if (attr->priority) {
 		memset(rule, 0, sizeof(struct ixgbe_fdir_rule));
 		rte_flow_error_set(error, EINVAL,
@@ -2829,6 +2873,15 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(rss_conf, 0, sizeof(struct ixgbe_rte_flow_rss_conf));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(rss_conf, 0, sizeof(struct ixgbe_rte_flow_rss_conf));
 		rte_flow_error_set(error, EINVAL,
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 779641e11..480442f87 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -652,6 +652,10 @@ mlx4_flow_prepare(struct priv *priv,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
 			 NULL, "egress is not supported");
+	if (attr->transfer)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			 NULL, "transfer is not supported");
 	if (!attr->ingress)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 292e579d1..de8ac9610 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -568,6 +568,13 @@ mlx5_flow_convert_attributes(const struct rte_flow_attr *attr,
 				   "egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   NULL,
+				   "transfer is not supported");
+		return -rte_errno;
+	}
 	if (!attr->ingress) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/mvpp2/mrvl_flow.c b/drivers/net/mvpp2/mrvl_flow.c
index 6478eb2fe..a2e2129cc 100644
--- a/drivers/net/mvpp2/mrvl_flow.c
+++ b/drivers/net/mvpp2/mrvl_flow.c
@@ -2187,6 +2187,12 @@ mrvl_flow_parse_attr(struct mrvl_priv *priv __rte_unused,
 				   "Egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, NULL,
+				   "Transfer is not supported");
+		return -rte_errno;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index cd6a61b39..bcde2c2f7 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1116,6 +1116,12 @@ sfc_flow_parse_attr(const struct rte_flow_attr *attr,
 				   "Egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer != 0) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, attr,
+				   "Transfer is not supported");
+		return -rte_errno;
+	}
 	if (attr->ingress == 0) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, attr,
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index e90e5165f..dc1491990 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1039,6 +1039,12 @@ priv_flow_process(struct pmd_internals *pmd,
 	};
 	int action = 0; /* Only one action authorized for now */
 
+	if (attr->transfer) {
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			NULL, "transfer is not supported");
+		return -rte_errno;
+	}
 	if (attr->group > MAX_GROUP) {
 		rte_flow_error_set(
 			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 73d29ed32..fc7df68d3 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -72,7 +72,26 @@ struct rte_flow_attr {
 	uint32_t priority; /**< Priority level within group. */
 	uint32_t ingress:1; /**< Rule applies to ingress traffic. */
 	uint32_t egress:1; /**< Rule applies to egress traffic. */
-	uint32_t reserved:30; /**< Reserved, must be zero. */
+	/**
+	 * Instead of simply matching the properties of traffic as it would
+	 * appear on a given DPDK port ID, enabling this attribute transfers
+	 * a flow rule to the lowest possible level of any device endpoints
+	 * found in the pattern.
+	 *
+	 * When supported, this effectively enables an application to
+	 * re-route traffic not necessarily intended for it (e.g. coming
+	 * from or addressed to different physical ports, VFs or
+	 * applications) at the device level.
+	 *
+	 * It complements the behavior of some pattern items such as
+	 * RTE_FLOW_ITEM_TYPE_PORT and is meaningless without them.
+	 *
+	 * When transferring flow rules, ingress and egress attributes keep
+	 * their original meaning, as if processing traffic emitted or
+	 * received by the application.
+	 */
+	uint32_t transfer:1;
+	uint32_t reserved:29; /**< Reserved, must be zero. */
 };
 
 /**
@@ -1181,6 +1200,7 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, /**< Priority field. */
 	RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, /**< Ingress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
+	RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, /**< Transfer field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
 	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (6 preceding siblings ...)
  2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 09/16] ethdev: add encap level " Adrien Mazarguil
@ 2018-04-10 16:36  1%     ` Adrien Mazarguil
  2018-04-11 12:45  0%       ` Andrew Rybchenko
  2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
                       ` (5 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Ajit Khaparde, Somnath Kotur,
	John Daley, Hyong Youb Kim, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Tomasz Duszynski, Dmitri Epshtein, Natalie Samsonov, Jianbo Liu,
	Andrew Rybchenko, Pascal Mazon

TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
consistent with the normal stacking order of pattern items, which is
confusing to applications.

The problem is that when followed by one of these layers, the EtherType
field of the preceding layer keeps its "inner" definition, while the
"outer" TPID is provided by the subsequent layer; this is the reverse of
how a packet appears on the wire:

 Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
 rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]

Worse, when QinQ is involved, the stacking order of VLAN layers is
unspecified. It is unclear whether it should be reversed (innermost to
outermost) as well, given that TPID applies to the previous layer:

 Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
 rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
 rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]

While specifying EtherType/TPID is hopefully rarely necessary, the stacking
order in the case of QinQ and the lack of documentation remain issues.

This patch replaces TPID in the VLAN pattern item with an inner
EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
clarifies documentation and updates all relevant code.
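
For illustration only, here is a minimal sketch (not part of this patch) of
how an application could match a single 802.1Q tag over IPv4 with the
reworked definitions, following the same order as on the wire; the TCI
value below is an arbitrary placeholder:

 /* Hypothetical spec structures for ETH and VLAN pattern items. */
 static const struct rte_flow_item_eth eth_spec = {
 	.type = RTE_BE16(ETHER_TYPE_VLAN), /* outer TPID */
 };
 static const struct rte_flow_item_vlan vlan_spec = {
 	.tci = RTE_BE16(0x0001),                 /* PCP/DEI/VID to match */
 	.inner_type = RTE_BE16(ETHER_TYPE_IPv4), /* inner EtherType */
 };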

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
items:

- bnxt: EtherType matching is supported with and without VLAN, but TPID
  matching is not and triggers an error.

- e1000: EtherType matching is only supported with the ETHERTYPE filter,
  which does not support VLAN matching, therefore no impact.

- enic: same as bnxt.

- i40e: same as bnxt with existing FDIR limitations on allowed EtherType
  values. The remaining filter types (VXLAN, NVGRE, QINQ) do not support
  EtherType matching.

- ixgbe: same as e1000, with additional minor change to rely on the new
  E-Tag macro definition.

- mlx4: EtherType/TPID matching is not supported, no impact.

- mlx5: same as bnxt.

- mvpp2: same as bnxt.

- sfc: same as bnxt.

- tap: same as bnxt.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Somnath Kotur <somnath.kotur@broadcom.com>
Cc: John Daley <johndale@cisco.com>
Cc: Hyong Youb Kim <hyonkim@cisco.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Tomasz Duszynski <tdu@semihalf.com>
Cc: Dmitri Epshtein <dima@marvell.com>
Cc: Natalie Samsonov <nsamsono@marvell.com>
Cc: Jianbo Liu <jianbo.liu@arm.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>

---

v3 changes:

Updated mrvl to mvpp2.

Moved unrelated default TCI mask update to separate patch.

Fixed sfc according to Andrew's comments [1], which made so much sense that
I standardized on the same behavior for all other PMDs: matching outer TPID
is never supported when a VLAN pattern item is present.

This is done because many devices accept several TPIDs but do not provide a
means to match a given one explicitly; it is all or nothing, which makes
the resulting flow rule inaccurate.

[1] http://dpdk.org/ml/archives/dev/2018-April/095870.html
---
 app/test-pmd/cmdline_flow.c                 | 17 +++----
 doc/guides/nics/tap.rst                     |  2 +-
 doc/guides/prog_guide/rte_flow.rst          | 19 ++++++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
 drivers/net/bnxt/bnxt_filter.c              | 35 +++++++++++---
 drivers/net/enic/enic_flow.c                | 19 +++++---
 drivers/net/i40e/i40e_flow.c                | 60 ++++++++++++++++++++----
 drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
 drivers/net/mlx5/mlx5_flow.c                | 13 ++++-
 drivers/net/mvpp2/mrvl_flow.c               | 26 +++++++---
 drivers/net/sfc/sfc_flow.c                  | 18 +++++++
 drivers/net/tap/tap_flow.c                  | 14 ++++--
 lib/librte_ether/rte_flow.h                 | 22 ++++++---
 lib/librte_net/rte_ether.h                  |  1 +
 14 files changed, 198 insertions(+), 55 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 9b6004176..49217d5bc 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -99,11 +99,11 @@ enum index {
 	ITEM_ETH_SRC,
 	ITEM_ETH_TYPE,
 	ITEM_VLAN,
-	ITEM_VLAN_TPID,
 	ITEM_VLAN_TCI,
 	ITEM_VLAN_PCP,
 	ITEM_VLAN_DEI,
 	ITEM_VLAN_VID,
+	ITEM_VLAN_INNER_TYPE,
 	ITEM_IPV4,
 	ITEM_IPV4_TOS,
 	ITEM_IPV4_TTL,
@@ -505,11 +505,11 @@ static const enum index item_eth[] = {
 };
 
 static const enum index item_vlan[] = {
-	ITEM_VLAN_TPID,
 	ITEM_VLAN_TCI,
 	ITEM_VLAN_PCP,
 	ITEM_VLAN_DEI,
 	ITEM_VLAN_VID,
+	ITEM_VLAN_INNER_TYPE,
 	ITEM_NEXT,
 	ZERO,
 };
@@ -1142,12 +1142,6 @@ static const struct token token_list[] = {
 		.next = NEXT(item_vlan),
 		.call = parse_vc,
 	},
-	[ITEM_VLAN_TPID] = {
-		.name = "tpid",
-		.help = "tag protocol identifier",
-		.next = NEXT(item_vlan, NEXT_ENTRY(UNSIGNED), item_param),
-		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vlan, tpid)),
-	},
 	[ITEM_VLAN_TCI] = {
 		.name = "tci",
 		.help = "tag control information",
@@ -1175,6 +1169,13 @@ static const struct token token_list[] = {
 		.args = ARGS(ARGS_ENTRY_MASK_HTON(struct rte_flow_item_vlan,
 						  tci, "\x0f\xff")),
 	},
+	[ITEM_VLAN_INNER_TYPE] = {
+		.name = "inner_type",
+		.help = "inner EtherType",
+		.next = NEXT(item_vlan, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vlan,
+					     inner_type)),
+	},
 	[ITEM_IPV4] = {
 		.name = "ipv4",
 		.help = "match IPv4 header",
diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index c97786aca..3f7a15147 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -108,7 +108,7 @@ The kernel support can be checked with this command::
 Supported items:
 
 - eth: src and dst (with variable masks), and eth_type (0xffff mask).
-- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9)
+- vlan: vid, pcp, but not eid. (requires kernel 4.9)
 - ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
 - udp/tcp: src and dst port (0xffff) mask.
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 1a09e8a0f..fd317b48c 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -784,9 +784,15 @@ Item: ``ETH``
 
 Matches an Ethernet header.
 
+The ``type`` field either stands for "EtherType" or "TPID" when followed by
+so-called layer 2.5 pattern items such as ``RTE_FLOW_ITEM_TYPE_VLAN``. In
+the latter case, ``type`` refers to that of the outer header, with the inner
+EtherType/TPID provided by the subsequent pattern item. This is the same
+order as on the wire.
+
 - ``dst``: destination MAC.
 - ``src``: source MAC.
-- ``type``: EtherType.
+- ``type``: EtherType or TPID.
 - Default ``mask`` matches destination and source addresses only.
 
 Item: ``VLAN``
@@ -794,8 +800,12 @@ Item: ``VLAN``
 
 Matches an 802.1Q/ad VLAN tag.
 
-- ``tpid``: tag protocol identifier.
+The corresponding standard outer EtherType (TPID) values are
+``ETHER_TYPE_VLAN`` or ``ETHER_TYPE_QINQ``. It can be overridden by the
+preceding pattern item.
+
 - ``tci``: tag control information.
+- ``inner_type``: inner EtherType or TPID.
 - Default ``mask`` matches TCI only.
 
 Item: ``IPV4``
@@ -866,12 +876,15 @@ Item: ``E_TAG``
 
 Matches an IEEE 802.1BR E-Tag header.
 
-- ``tpid``: tag protocol identifier (0x893F)
+The corresponding standard outer EtherType (TPID) value is
+``ETHER_TYPE_ETAG``. It can be overridden by the preceding pattern item.
+
 - ``epcp_edei_in_ecid_b``: E-Tag control information (E-TCI), E-PCP (3b),
   E-DEI (1b), ingress E-CID base (12b).
 - ``rsvd_grp_ecid_b``: reserved (2b), GRP (2b), E-CID base (12b).
 - ``in_ecid_e``: ingress E-CID ext.
 - ``ecid_e``: E-CID ext.
+- ``inner_type``: inner EtherType or TPID.
 - Default ``mask`` simultaneously matches GRP and E-CID base.
 
 Item: ``NVGRE``
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 3b1073bfc..923664f7d 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3223,15 +3223,15 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``dst {MAC-48}``: destination MAC.
   - ``src {MAC-48}``: source MAC.
-  - ``type {unsigned}``: EtherType.
+  - ``type {unsigned}``: EtherType or TPID.
 
 - ``vlan``: match 802.1Q/ad VLAN tag.
 
-  - ``tpid {unsigned}``: tag protocol identifier.
   - ``tci {unsigned}``: tag control information.
   - ``pcp {unsigned}``: priority code point.
   - ``dei {unsigned}``: drop eligible indicator.
   - ``vid {unsigned}``: VLAN identifier.
+  - ``inner_type {unsigned}``: inner EtherType or TPID.
 
 - ``ipv4``: match IPv4 header.
 
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 0f9c1c9ae..9bb1575cb 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -299,6 +299,7 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 	uint32_t vf = 0;
 	int use_ntuple;
 	uint32_t en = 0;
+	uint32_t en_ethertype;
 	int dflt_vnic;
 
 	use_ntuple = bnxt_filter_type_check(pattern, error);
@@ -308,6 +309,9 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 
 	filter->filter_type = use_ntuple ?
 		HWRM_CFA_NTUPLE_FILTER : HWRM_CFA_EM_FILTER;
+	en_ethertype = use_ntuple ?
+		NTUPLE_FLTR_ALLOC_INPUT_EN_ETHERTYPE :
+		EM_FLOW_ALLOC_INPUT_EN_ETHERTYPE;
 
 	while (item->type != RTE_FLOW_ITEM_TYPE_END) {
 		if (item->last) {
@@ -377,30 +381,49 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 			if (eth_mask->type) {
 				filter->ethertype =
 					rte_be_to_cpu_16(eth_spec->type);
-				en |= use_ntuple ?
-					NTUPLE_FLTR_ALLOC_INPUT_EN_ETHERTYPE :
-					EM_FLOW_ALLOC_INPUT_EN_ETHERTYPE;
+				en |= en_ethertype;
 			}
 
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
+			if (en & en_ethertype) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "VLAN TPID matching is not"
+						   " supported");
+				return -rte_errno;
+			}
 			if (vlan_mask->tci &&
-			    vlan_mask->tci == RTE_BE16(0x0fff) &&
-			    !vlan_mask->tpid) {
+			    vlan_mask->tci == RTE_BE16(0x0fff)) {
 				/* Only the VLAN ID can be matched. */
 				filter->l2_ovlan =
 					rte_be_to_cpu_16(vlan_spec->tci &
 							 RTE_BE16(0x0fff));
 				en |= EM_FLOW_ALLOC_INPUT_EN_OVLAN_VID;
-			} else if (vlan_mask->tci || vlan_mask->tpid) {
+			} else if (vlan_mask->tci) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
 						   "VLAN mask is invalid");
 				return -rte_errno;
 			}
+			if (vlan_mask->inner_type &&
+			    vlan_mask->inner_type != RTE_BE16(0xffff)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "inner ethertype mask not"
+						   " valid");
+				return -rte_errno;
+			}
+			if (vlan_mask->inner_type) {
+				filter->ethertype =
+					rte_be_to_cpu_16(vlan_spec->inner_type);
+				en |= en_ethertype;
+			}
 
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV4:
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index a5c6a1670..20d6b9d59 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -557,16 +557,21 @@ enic_copy_item_vlan_v2(const struct rte_flow_item *item,
 	if (!spec)
 		return 0;
 
-	/* Don't support filtering in tpid */
-	if (mask) {
-		if (mask->tpid != 0)
-			return ENOTSUP;
-	} else {
+	if (!mask)
 		mask = &rte_flow_item_vlan_mask;
-		RTE_ASSERT(mask->tpid == 0);
-	}
 
 	if (*inner_ofst == 0) {
+		struct ether_hdr *eth_mask =
+			(void *)gp->layer[FILTER_GENERIC_1_L2].mask;
+		struct ether_hdr *eth_val =
+			(void *)gp->layer[FILTER_GENERIC_1_L2].val;
+
+		/* Outer TPID cannot be matched */
+		if (eth_mask->ether_type)
+			return ENOTSUP;
+		eth_mask->ether_type = mask->inner_type;
+		eth_val->ether_type = spec->inner_type;
+
 		/* Outer header. Use the vlan mask/val fields */
 		gp->mask_vlan = mask->tci;
 		gp->val_vlan = spec->tci;
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index fef812c6b..e3d83eac7 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -10,6 +10,7 @@
 #include <unistd.h>
 #include <stdarg.h>
 
+#include <rte_debug.h>
 #include <rte_ether.h>
 #include <rte_ethdev_driver.h>
 #include <rte_log.h>
@@ -2491,16 +2492,22 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						      "Invalid MAC_addr mask.");
 					return -rte_errno;
 				}
+			}
+			if (eth_spec && eth_mask && eth_mask->type) {
+				enum rte_flow_item_type next = (item + 1)->type;
 
-				if ((eth_mask->type & UINT16_MAX) ==
-				    UINT16_MAX) {
-					input_set |= I40E_INSET_LAST_ETHER_TYPE;
-					filter->input.flow.l2_flow.ether_type =
-						eth_spec->type;
+				if (eth_mask->type != RTE_BE16(0xffff)) {
+					rte_flow_error_set(error, EINVAL,
+						      RTE_FLOW_ERROR_TYPE_ITEM,
+						      item,
+						      "Invalid type mask.");
+					return -rte_errno;
 				}
 
 				ether_type = rte_be_to_cpu_16(eth_spec->type);
-				if (ether_type == ETHER_TYPE_IPv4 ||
+
+				if (next == RTE_FLOW_ITEM_TYPE_VLAN ||
+				    ether_type == ETHER_TYPE_IPv4 ||
 				    ether_type == ETHER_TYPE_IPv6 ||
 				    ether_type == ETHER_TYPE_ARP ||
 				    ether_type == outer_tpid) {
@@ -2510,6 +2517,9 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						     "Unsupported ether_type.");
 					return -rte_errno;
 				}
+				input_set |= I40E_INSET_LAST_ETHER_TYPE;
+				filter->input.flow.l2_flow.ether_type =
+					eth_spec->type;
 			}
 
 			pctype = I40E_FILTER_PCTYPE_L2_PAYLOAD;
@@ -2519,6 +2529,8 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
+
+			RTE_ASSERT(!(input_set & I40E_INSET_LAST_ETHER_TYPE));
 			if (vlan_spec && vlan_mask) {
 				if (vlan_mask->tci ==
 				    rte_cpu_to_be_16(I40E_TCI_MASK)) {
@@ -2527,6 +2539,33 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						vlan_spec->tci;
 				}
 			}
+			if (vlan_spec && vlan_mask && vlan_mask->inner_type) {
+				if (vlan_mask->inner_type != RTE_BE16(0xffff)) {
+					rte_flow_error_set(error, EINVAL,
+						      RTE_FLOW_ERROR_TYPE_ITEM,
+						      item,
+						      "Invalid inner_type"
+						      " mask.");
+					return -rte_errno;
+				}
+
+				ether_type =
+					rte_be_to_cpu_16(vlan_spec->inner_type);
+
+				if (ether_type == ETHER_TYPE_IPv4 ||
+				    ether_type == ETHER_TYPE_IPv6 ||
+				    ether_type == ETHER_TYPE_ARP ||
+				    ether_type == outer_tpid) {
+					rte_flow_error_set(error, EINVAL,
+						     RTE_FLOW_ERROR_TYPE_ITEM,
+						     item,
+						     "Unsupported inner_type.");
+					return -rte_errno;
+				}
+				input_set |= I40E_INSET_LAST_ETHER_TYPE;
+				filter->input.flow.l2_flow.ether_type =
+					vlan_spec->inner_type;
+			}
 
 			pctype = I40E_FILTER_PCTYPE_L2_PAYLOAD;
 			layer_idx = I40E_FLXPLD_L2_IDX;
@@ -3285,7 +3324,8 @@ i40e_flow_parse_vxlan_pattern(__rte_unused struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
@@ -3515,7 +3555,8 @@ i40e_flow_parse_nvgre_pattern(__rte_unused struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
@@ -4023,7 +4064,8 @@ i40e_flow_parse_qinq_pattern(__rte_unused struct rte_eth_dev *dev,
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
 
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 					   RTE_FLOW_ERROR_TYPE_ITEM,
 					   item,
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index ea3624ba4..94fba2908 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -115,7 +115,6 @@
 
 #define IXGBE_VT_CTL_POOLING_MODE_MASK         0x00030000
 #define IXGBE_VT_CTL_POOLING_MODE_ETAG         0x00010000
-#define DEFAULT_ETAG_ETYPE                     0x893f
 #define IXGBE_ETAG_ETYPE                       0x00005084
 #define IXGBE_ETAG_ETYPE_MASK                  0x0000ffff
 #define IXGBE_ETAG_ETYPE_VALID                 0x80000000
@@ -1481,7 +1480,7 @@ static int ixgbe_l2_tn_filter_init(struct rte_eth_dev *eth_dev)
 	}
 	l2_tn_info->e_tag_en = FALSE;
 	l2_tn_info->e_tag_fwd_en = FALSE;
-	l2_tn_info->e_tag_ether_type = DEFAULT_ETAG_ETYPE;
+	l2_tn_info->e_tag_ether_type = ETHER_TYPE_ETAG;
 
 	return 0;
 }
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index bc1176819..292e579d1 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@
 #endif
 
 #include <rte_common.h>
+#include <rte_ether.h>
 #include <rte_eth_ctrl.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow.h>
@@ -306,6 +307,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.actions = valid_actions,
 		.mask = &(const struct rte_flow_item_vlan){
 			.tci = -1,
+			.inner_type = -1,
 		},
 		.default_mask = &rte_flow_item_vlan_mask,
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
@@ -1285,6 +1287,7 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 	struct mlx5_flow_parse *parser = data->parser;
 	struct ibv_flow_spec_eth *eth;
 	const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
+	const char *msg = "VLAN cannot be empty";
 
 	if (spec) {
 		unsigned int i;
@@ -1306,12 +1309,20 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 			 */
 			if (!eth->mask.vlan_tag)
 				goto error;
+			/* Outer TPID cannot be matched. */
+			if (eth->mask.ether_type) {
+				msg = "VLAN TPID matching is not supported";
+				goto error;
+			}
+			eth->val.ether_type = spec->inner_type;
+			eth->mask.ether_type = mask->inner_type;
+			eth->val.ether_type &= eth->mask.ether_type;
 		}
 		return 0;
 	}
 error:
 	return rte_flow_error_set(data->error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM,
-				  item, "VLAN cannot be empty");
+				  item, msg);
 }
 
 /**
diff --git a/drivers/net/mvpp2/mrvl_flow.c b/drivers/net/mvpp2/mrvl_flow.c
index 8fd4dbfb1..6478eb2fe 100644
--- a/drivers/net/mvpp2/mrvl_flow.c
+++ b/drivers/net/mvpp2/mrvl_flow.c
@@ -1091,12 +1091,6 @@ mrvl_parse_vlan(const struct rte_flow_item *item,
 	if (ret)
 		return ret;
 
-	if (mask->tpid) {
-		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM,
-				   NULL, "Not supported by classifier\n");
-		return -rte_errno;
-	}
-
 	m = rte_be_to_cpu_16(mask->tci);
 	if (m & MRVL_VLAN_ID_MASK) {
 		RTE_LOG(WARNING, PMD, "vlan id mask is ignored\n");
@@ -1112,6 +1106,26 @@ mrvl_parse_vlan(const struct rte_flow_item *item,
 			goto out;
 	}
 
+	if (flow->pattern & F_TYPE) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "VLAN TPID matching is not supported\n");
+		return -rte_errno;
+	}
+	if (mask->inner_type) {
+		struct rte_flow_item_eth spec_eth = {
+			.type = spec->inner_type,
+		};
+		struct rte_flow_item_eth mask_eth = {
+			.type = mask->inner_type,
+		};
+
+		RTE_LOG(WARNING, PMD, "inner eth type mask is ignored\n");
+		ret = mrvl_parse_type(spec_eth, mask_eth, flow);
+		if (ret)
+			goto out;
+	}
+
 	return 0;
 out:
 	rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 3028efbf9..cd6a61b39 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -7,6 +7,7 @@
  * for Solarflare) and Solarflare Communications, Inc.
  */
 
+#include <rte_byteorder.h>
 #include <rte_tailq.h>
 #include <rte_common.h>
 #include <rte_ethdev_driver.h>
@@ -351,6 +352,7 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
 	const struct rte_flow_item_vlan *mask = NULL;
 	const struct rte_flow_item_vlan supp_mask = {
 		.tci = rte_cpu_to_be_16(ETH_VLAN_ID_MAX),
+		.inner_type = RTE_BE16(0xffff),
 	};
 
 	rc = sfc_flow_parse_init(item,
@@ -393,6 +395,22 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
 		return -rte_errno;
 	}
 
+	if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "VLAN TPID matching is not supported");
+		return -rte_errno;
+	}
+	if (mask->inner_type == supp_mask.inner_type) {
+		efx_spec->efs_match_flags |= EFX_FILTER_MATCH_ETHER_TYPE;
+		efx_spec->efs_ether_type = rte_bswap16(spec->inner_type);
+	} else if (mask->inner_type) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "Bad mask for VLAN inner_type");
+		return -rte_errno;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 1caefff43..e90e5165f 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -270,13 +270,13 @@ static const struct tap_flow_items tap_flow_items[] = {
 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
 			       RTE_FLOW_ITEM_TYPE_IPV6),
 		.mask = &(const struct rte_flow_item_vlan){
-			.tpid = -1,
 			/* DEI matching is not supported */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 			.tci = 0xffef,
 #else
 			.tci = 0xefff,
 #endif
+			.inner_type = -1,
 		},
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
 		.default_mask = &rte_flow_item_vlan_mask,
@@ -578,13 +578,19 @@ tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
 	/* use default mask if none provided */
 	if (!mask)
 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
-	/* TC does not support tpid masking. Only accept if exact match. */
-	if (mask->tpid && mask->tpid != 0xffff)
+	/* Outer TPID cannot be matched. */
+	if (info->eth_type)
 		return -1;
 	/* Double-tagging not supported. */
-	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+	if (info->vlan)
 		return -1;
 	info->vlan = 1;
+	if (mask->inner_type) {
+		/* TC does not support partial eth_type masking */
+		if (mask->inner_type != RTE_BE16(0xffff))
+			return -1;
+		info->eth_type = spec->inner_type;
+	}
 	if (!flow)
 		return 0;
 	msg = &flow->msg;
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index cf4a3faf2..f6ee28929 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -454,11 +454,17 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
  * RTE_FLOW_ITEM_TYPE_ETH
  *
  * Matches an Ethernet header.
+ *
+ * The @p type field either stands for "EtherType" or "TPID" when followed
+ * by so-called layer 2.5 pattern items such as RTE_FLOW_ITEM_TYPE_VLAN. In
+ * the latter case, @p type refers to that of the outer header, with the
+ * inner EtherType/TPID provided by the subsequent pattern item. This is the
+ * same order as on the wire.
  */
 struct rte_flow_item_eth {
 	struct ether_addr dst; /**< Destination MAC. */
 	struct ether_addr src; /**< Source MAC. */
-	rte_be16_t type; /**< EtherType. */
+	rte_be16_t type; /**< EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_ETH. */
@@ -475,19 +481,20 @@ static const struct rte_flow_item_eth rte_flow_item_eth_mask = {
  *
  * Matches an 802.1Q/ad VLAN tag.
  *
- * This type normally follows either RTE_FLOW_ITEM_TYPE_ETH or
- * RTE_FLOW_ITEM_TYPE_VLAN.
+ * The corresponding standard outer EtherType (TPID) values are
+ * ETHER_TYPE_VLAN or ETHER_TYPE_QINQ. It can be overridden by the preceding
+ * pattern item.
  */
 struct rte_flow_item_vlan {
-	rte_be16_t tpid; /**< Tag protocol identifier. */
 	rte_be16_t tci; /**< Tag control information. */
+	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_VLAN. */
 #ifndef __cplusplus
 static const struct rte_flow_item_vlan rte_flow_item_vlan_mask = {
-	.tpid = RTE_BE16(0x0000),
 	.tci = RTE_BE16(0xffff),
+	.inner_type = RTE_BE16(0x0000),
 };
 #endif
 
@@ -636,9 +643,11 @@ static const struct rte_flow_item_vxlan rte_flow_item_vxlan_mask = {
  * RTE_FLOW_ITEM_TYPE_E_TAG.
  *
  * Matches a E-tag header.
+ *
+ * The corresponding standard outer EtherType (TPID) value is
+ * ETHER_TYPE_ETAG. It can be overridden by the preceding pattern item.
  */
 struct rte_flow_item_e_tag {
-	rte_be16_t tpid; /**< Tag protocol identifier (0x893F). */
 	/**
 	 * E-Tag control information (E-TCI).
 	 * E-PCP (3b), E-DEI (1b), ingress E-CID base (12b).
@@ -648,6 +657,7 @@ struct rte_flow_item_e_tag {
 	rte_be16_t rsvd_grp_ecid_b;
 	uint8_t in_ecid_e; /**< Ingress E-CID ext. */
 	uint8_t ecid_e; /**< E-CID ext. */
+	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_E_TAG. */
diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h
index 45daa911a..a271d1c86 100644
--- a/lib/librte_net/rte_ether.h
+++ b/lib/librte_net/rte_ether.h
@@ -301,6 +301,7 @@ struct vxlan_hdr {
 #define ETHER_TYPE_RARP 0x8035 /**< Reverse Arp Protocol. */
 #define ETHER_TYPE_VLAN 0x8100 /**< IEEE 802.1Q VLAN tagging. */
 #define ETHER_TYPE_QINQ 0x88A8 /**< IEEE 802.1ad QinQ tagging. */
+#define ETHER_TYPE_ETAG 0x893F /**< IEEE 802.1BR E-Tag. */
 #define ETHER_TYPE_1588 0x88F7 /**< IEEE 802.1AS 1588 Precise Time Protocol. */
 #define ETHER_TYPE_SLOW 0x8809 /**< Slow protocols (LACP and Marker). */
 #define ETHER_TYPE_TEB  0x6558 /**< Transparent Ethernet Bridging. */
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v3 09/16] ethdev: add encap level to RSS flow API action
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (5 preceding siblings ...)
  2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
@ 2018-04-10 16:36  3%     ` Adrien Mazarguil
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
                       ` (6 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

RSS hash types (ETH_RSS_* macros defined in rte_ethdev.h) describe the
protocol header fields of a packet that must be taken into account while
computing RSS.

When packets are encapsulated (e.g. tunneled), it is ambiguous whether
these types should apply to the inner or the outer headers. Applications
need the ability to tell exactly "where" RSS must be performed.

This is addressed by adding encapsulation level information to the RSS flow
action. Its default value is 0 and stands for the usual unspecified
behavior. Other values provide a specific encapsulation level.
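
For illustration only, a minimal sketch (not part of this patch) of an RSS
action requesting the hash to be computed on the first inner encapsulation
level of matching tunneled traffic; the hash key and queue fields are
omitted for brevity:

 struct rte_flow_action_rss action_rss = {
 	.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 	.level = 2, /* 0 = default, 1 = outermost, 2 = first inner level */
 	.types = ETH_RSS_IP,
 };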

Contrary to the change announced by commit 676b605182a5 ("doc: announce
ethdev API change for RSS configuration"), this patch does not affect
struct rte_eth_rss_conf but struct rte_flow_action_rss as the former is not
used anymore by the RSS flow action. ABI impact is therefore limited to
rte_flow.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 13 ++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 24 ++++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 ++
 drivers/net/e1000/igb_flow.c                |  4 ++++
 drivers/net/e1000/igb_rxtx.c                |  2 ++
 drivers/net/i40e/i40e_ethdev.c              |  2 ++
 drivers/net/i40e/i40e_flow.c                |  4 ++++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  2 ++
 drivers/net/mlx4/mlx4_flow.c                |  6 ++++++
 drivers/net/mlx5/mlx5_flow.c                | 11 ++++++++++
 drivers/net/sfc/sfc_flow.c                  |  3 +++
 drivers/net/tap/tap_flow.c                  |  6 +++++-
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 26 ++++++++++++++++++++++++
 16 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 34f33f671..9b6004176 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -167,6 +167,7 @@ enum index {
 	ACTION_COUNT,
 	ACTION_RSS,
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_FUNC_DEFAULT,
 	ACTION_RSS_FUNC_TOEPLITZ,
 	ACTION_RSS_FUNC_SIMPLE_XOR,
@@ -638,6 +639,7 @@ static const enum index action_queue[] = {
 
 static const enum index action_rss[] = {
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -1616,6 +1618,16 @@ static const struct token token_list[] = {
 		.help = "simple XOR hash function",
 		.call = parse_vc_action_rss_func,
 	},
+	[ACTION_RSS_LEVEL] = {
+		.name = "level",
+		.help = "encapsulation level for \"types\"",
+		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, level),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     level))),
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "specific RSS hash types",
@@ -2107,6 +2119,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
 			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+			.level = 0,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index b258c93e8..c0fefe475 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1085,6 +1085,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index e0c68495c..1a09e8a0f 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1311,6 +1311,28 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
+Also, regarding packet encapsulation ``level``:
+
+- ``0`` requests the default behavior. Depending on the packet type, it can
+  mean outermost, innermost, anything in between or even no RSS.
+
+  It basically stands for the innermost encapsulation level RSS can be
+  performed on according to PMD and device capabilities.
+
+- ``1`` requests RSS to be performed on the outermost packet encapsulation
+  level.
+
+- ``2`` and subsequent values request RSS to be performed on the specified
+   inner packet encapsulation level, from outermost to innermost (lower to
+   higher values).
+
+Values other than ``0`` are not necessarily supported.
+
+Requesting a specific RSS level on unrecognized traffic results in undefined
+behavior. For predictable results, it is recommended to make the flow rule
+pattern match packet headers up to the requested encapsulation level so that
+only matching traffic goes through.
+
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1320,6 +1342,8 @@ field only, both can be requested simultaneously.
    +===============+=============================================+
    | ``func``      | RSS hash function to apply                  |
    +---------------+---------------------------------------------+
+   | ``level``     | encapsulation level for ``types``           |
+   +---------------+---------------------------------------------+
    | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
    +---------------+---------------------------------------------+
    | ``key_len``   | hash key length in bytes                    |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 546ef3ab7..3b1073bfc 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3401,6 +3401,8 @@ This section lists supported actions and their attributes, if any.
   - ``func {hash function}``: RSS hash function to apply, allowed tokens are
     the same as `set_hash_global_config`_.
 
+  - ``level {unsigned}``: encapsulation level for ``types``.
+
   - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
     tokens are the same as `set_hash_input_set`_, except that an empty list
     does not disable RSS but instead requests unspecified "best-effort"
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 82307ec5d..d1c0b4b8d 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1314,6 +1314,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index d5c1cd3d3..a3776a0d7 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2906,6 +2906,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2921,6 +2922,7 @@ igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 5e313950c..b104b551c 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11975,6 +11975,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -11990,6 +11991,7 @@ i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 33f77cc80..fef812c6b 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4356,6 +4356,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 00d975b93..438bfcdfb 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2783,6 +2783,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index e17f5a433..23af21712 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5683,6 +5683,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5698,6 +5699,7 @@ ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index dcaf8df44..779641e11 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -796,6 +796,11 @@ mlx4_flow_prepare(struct priv *priv,
 					" is Toeplitz";
 				goto exit_action_not_supported;
 			}
+			if (rss->level) {
+				msg = "a nonzero RSS encapsulation level is"
+					" not supported";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1290,6 +1295,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 0771ad339..bc1176819 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -644,6 +644,14 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " function is Toeplitz");
 				return -rte_errno;
 			}
+			if (rss->level) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "a nonzero RSS encapsulation"
+						   " level is not supported");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,6 +702,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
 				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+				.level = 0,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1927,6 +1936,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2442,6 +2452,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 779edad0c..3028efbf9 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1269,6 +1269,9 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 		return -EINVAL;
 	}
 
+	if (rss->level)
+		return -EINVAL;
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 7abf49ab1..1caefff43 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,11 +2055,15 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
-	/* Check supported hash functions */
+	/* Check supported RSS features */
 	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "a nonzero RSS encapsulation level is not supported");
 
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 0a2c0ac00..1f247d656 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -331,6 +331,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index acf6031ec..cf4a3faf2 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1046,6 +1046,32 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
+	/**
+	 * Packet encapsulation level RSS hash @p types apply to.
+	 *
+	 * - @p 0 requests the default behavior. Depending on the packet
+	 *   type, it can mean outermost, innermost, anything in between or
+	 *   even no RSS.
+	 *
+	 *   It basically stands for the innermost encapsulation level RSS
+	 *   can be performed on according to PMD and device capabilities.
+	 *
+	 * - @p 1 requests RSS to be performed on the outermost packet
+	 *   encapsulation level.
+	 *
+	 * - @p 2 and subsequent values request RSS to be performed on the
+	 *   specified inner packet encapsulation level, from outermost to
+	 *   innermost (lower to higher values).
+	 *
+	 * Values other than @p 0 are not necessarily supported.
+	 *
+	 * Requesting a specific RSS level on unrecognized traffic results
+	 * in undefined behavior. For predictable results, it is recommended
+	 * to make the flow rule pattern match packet headers up to the
+	 * requested encapsulation level so that only matching traffic goes
+	 * through.
+	 */
+	uint32_t level;
 	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 3%]
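
For illustration only -- this is not part of the patch -- an application
could request RSS on the first inner encapsulation level roughly as
follows, provided the flow rule pattern matches headers up to that level
and the PMD supports it (as the hunks above show, mlx4, mlx5, sfc and tap
still reject a nonzero level at this point). Hash types and queue indices
are arbitrary:

	#include <rte_ethdev.h>
	#include <rte_flow.h>

	static const uint16_t queue[] = { 0, 1, 2, 3 };
	static const struct rte_flow_action_rss action_rss = {
		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
		.level = 2, /* first inner encapsulation level */
		.types = ETH_RSS_IP, /* illustrative hash types */
		.key_len = 0, /* 0: keep the PMD's default hash key */
		.queue_num = 4,
		.key = NULL,
		.queue = queue,
	};

The action is then referenced from an RTE_FLOW_ACTION_TYPE_RSS entry in
the action array passed to rte_flow_validate()/rte_flow_create().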

* [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (4 preceding siblings ...)
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
@ 2018-04-10 16:36  2%     ` Adrien Mazarguil
  2018-04-11 12:40  0%       ` Andrew Rybchenko
  2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 09/16] ethdev: add encap level " Adrien Mazarguil
                       ` (7 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

By definition, RSS involves some kind of hash algorithm, usually Toeplitz.

Until now it could not be modified on a per-flow-rule basis, and PMDs
always had to assume RTE_ETH_HASH_FUNCTION_DEFAULT, which remains the
default behavior when unspecified (0).
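
For illustration only -- not part of the patch -- a flow rule can now pin
the algorithm explicitly, e.g. to Toeplitz (hash types and queue indices
below are arbitrary):

	#include <rte_ethdev.h>
	#include <rte_flow.h>

	static const uint16_t queue[] = { 0, 1 };
	static const struct rte_flow_action_rss action_rss = {
		.func = RTE_ETH_HASH_FUNCTION_TOEPLITZ,
		.types = ETH_RSS_NONFRAG_IPV4_TCP, /* illustrative */
		.key_len = 0, /* 0: keep the PMD's default hash key */
		.queue_num = 2,
		.key = NULL,
		.queue = queue,
	};

In testpmd the same is expressed through the new "func" token added to
the rss action by the cmdline_flow.c and documentation hunks below.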

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>

---

v3 changes:

- Although RTE_ETH_HASH_FUNCTION_DEFAULT is defined as 0, made comparisons
  more explicit where doing so would clarify the code.

- Updated sfc to include Toeplitz as the other allowed value.

Both according to Andrew's suggestions [1].

[1] http://dpdk.org/ml/archives/dev/2018-April/095840.html
---
 app/test-pmd/cmdline_flow.c                 | 72 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          |  2 +
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  3 +
 drivers/net/e1000/igb_flow.c                |  4 ++
 drivers/net/e1000/igb_rxtx.c                |  4 +-
 drivers/net/i40e/i40e_ethdev.c              |  4 +-
 drivers/net/i40e/i40e_flow.c                |  4 ++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  4 +-
 drivers/net/mlx4/mlx4_flow.c                |  7 +++
 drivers/net/mlx5/mlx5_flow.c                | 13 +++++
 drivers/net/sfc/sfc_flow.c                  |  8 +++
 drivers/net/tap/tap_flow.c                  |  6 ++
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 |  2 +
 16 files changed, 136 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index f6b73ca6e..34f33f671 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -14,6 +14,7 @@
 #include <sys/socket.h>
 
 #include <rte_common.h>
+#include <rte_eth_ctrl.h>
 #include <rte_ethdev.h>
 #include <rte_byteorder.h>
 #include <cmdline_parse.h>
@@ -165,6 +166,10 @@ enum index {
 	ACTION_DROP,
 	ACTION_COUNT,
 	ACTION_RSS,
+	ACTION_RSS_FUNC,
+	ACTION_RSS_FUNC_DEFAULT,
+	ACTION_RSS_FUNC_TOEPLITZ,
+	ACTION_RSS_FUNC_SIMPLE_XOR,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
 	ACTION_RSS_KEY,
@@ -632,6 +637,7 @@ static const enum index action_queue[] = {
 };
 
 static const enum index action_rss[] = {
+	ACTION_RSS_FUNC,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -666,6 +672,9 @@ static int parse_vc_conf(struct context *, const struct token *,
 static int parse_vc_action_rss(struct context *, const struct token *,
 			       const char *, unsigned int, void *,
 			       unsigned int);
+static int parse_vc_action_rss_func(struct context *, const struct token *,
+				    const char *, unsigned int, void *,
+				    unsigned int);
 static int parse_vc_action_rss_type(struct context *, const struct token *,
 				    const char *, unsigned int, void *,
 				    unsigned int);
@@ -1584,6 +1593,29 @@ static const struct token token_list[] = {
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
+	[ACTION_RSS_FUNC] = {
+		.name = "func",
+		.help = "RSS hash function to apply",
+		.next = NEXT(action_rss,
+			     NEXT_ENTRY(ACTION_RSS_FUNC_DEFAULT,
+					ACTION_RSS_FUNC_TOEPLITZ,
+					ACTION_RSS_FUNC_SIMPLE_XOR)),
+	},
+	[ACTION_RSS_FUNC_DEFAULT] = {
+		.name = "default",
+		.help = "default hash function",
+		.call = parse_vc_action_rss_func,
+	},
+	[ACTION_RSS_FUNC_TOEPLITZ] = {
+		.name = "toeplitz",
+		.help = "Toeplitz hash function",
+		.call = parse_vc_action_rss_func,
+	},
+	[ACTION_RSS_FUNC_SIMPLE_XOR] = {
+		.name = "simple_xor",
+		.help = "simple XOR hash function",
+		.call = parse_vc_action_rss_func,
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "specific RSS hash types",
@@ -2074,6 +2106,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	action_rss_data = ctx->object;
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
+			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
@@ -2099,6 +2132,45 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 }
 
 /**
+ * Parse func field for RSS action.
+ *
+ * The RTE_ETH_HASH_FUNCTION_* value to assign is derived from the
+ * ACTION_RSS_FUNC_* index that called this function.
+ */
+static int
+parse_vc_action_rss_func(struct context *ctx, const struct token *token,
+			 const char *str, unsigned int len,
+			 void *buf, unsigned int size)
+{
+	struct action_rss_data *action_rss_data;
+	enum rte_eth_hash_function func;
+
+	(void)buf;
+	(void)size;
+	/* Token name must match. */
+	if (parse_default(ctx, token, str, len, NULL, 0) < 0)
+		return -1;
+	switch (ctx->curr) {
+	case ACTION_RSS_FUNC_DEFAULT:
+		func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+		break;
+	case ACTION_RSS_FUNC_TOEPLITZ:
+		func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+		break;
+	case ACTION_RSS_FUNC_SIMPLE_XOR:
+		func = RTE_ETH_HASH_FUNCTION_SIMPLE_XOR;
+		break;
+	default:
+		return -1;
+	}
+	if (!ctx->object)
+		return len;
+	action_rss_data = ctx->object;
+	action_rss_data->conf.func = func;
+	return len;
+}
+
+/**
  * Parse type field for RSS action.
  *
  * Valid tokens are type field names and the "end" token.
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 717f31774..b258c93e8 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1084,6 +1084,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
+				.func = src.rss->func,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index cf252eeba..e0c68495c 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1318,6 +1318,8 @@ field only, both can be requested simultaneously.
    +---------------+---------------------------------------------+
    | Field         | Value                                       |
    +===============+=============================================+
+   | ``func``      | RSS hash function to apply                  |
+   +---------------+---------------------------------------------+
    | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
    +---------------+---------------------------------------------+
    | ``key_len``   | hash key length in bytes                    |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 17336d163..546ef3ab7 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3398,6 +3398,9 @@ This section lists supported actions and their attributes, if any.
 
 - ``rss``: spread packets among several queues.
 
+  - ``func {hash function}``: RSS hash function to apply, allowed tokens are
+    the same as `set_hash_global_config`_.
+
   - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
     tokens are the same as `set_hash_input_set`_, except that an empty list
     does not disable RSS but instead requests unspecified "best-effort"
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 8dc5f75f2..82307ec5d 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1310,6 +1310,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 45bb3455c..d5c1cd3d3 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2905,6 +2905,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2919,7 +2920,8 @@ int
 igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 0242b5d59..5e313950c 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11974,6 +11974,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -11988,7 +11989,8 @@ int
 i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index db708fb5b..33f77cc80 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4352,6 +4352,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 4e31c7c56..00d975b93 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2779,6 +2779,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 94ea7444d..e17f5a433 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5682,6 +5682,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5696,7 +5697,8 @@ int
 ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 4dbcaa39c..dcaf8df44 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -790,6 +790,12 @@ mlx4_flow_prepare(struct priv *priv,
 					" of the context size";
 				goto exit_action_not_supported;
 			}
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				msg = "the only supported RSS hash function"
+					" is Toeplitz";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1283,6 +1289,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 7798052f9..0771ad339 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@
 #endif
 
 #include <rte_common.h>
+#include <rte_eth_ctrl.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
@@ -634,6 +635,15 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			if (overlap & FATE)
 				goto exit_action_overlap;
 			overlap |= FATE;
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "the only supported RSS hash"
+						   " function is Toeplitz");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -683,6 +693,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				}
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
+				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1915,6 +1926,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	/* Copy configuration. */
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2429,6 +2441,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	};
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 1a2c0299c..779edad0c 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1261,6 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 			rxq_hw_index_max = rxq->hw_index;
 	}
 
+	switch (rss->func) {
+	case RTE_ETH_HASH_FUNCTION_DEFAULT:
+	case RTE_ETH_HASH_FUNCTION_TOEPLITZ:
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 78f20913f..7abf49ab1 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,6 +2055,12 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
+	/* Check supported hash functions */
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "non-default RSS hash functions are not supported");
+
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
 	if (err < 0) {
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 2fabc9a29..0a2c0ac00 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -330,6 +330,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
+				.func = src.rss->func,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 4385e7eaa..acf6031ec 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -19,6 +19,7 @@
 
 #include <rte_arp.h>
 #include <rte_ether.h>
+#include <rte_eth_ctrl.h>
 #include <rte_icmp.h>
 #include <rte_ip.h>
 #include <rte_sctp.h>
@@ -1044,6 +1045,7 @@ struct rte_flow_query_count {
  * both can be requested simultaneously.
  */
 struct rte_flow_action_rss {
+	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
 	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (3 preceding siblings ...)
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
@ 2018-04-10 16:36  1%     ` Adrien Mazarguil
  2018-04-11 13:06  0%       ` Andrew Rybchenko
  2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
                       ` (8 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon, Radu Nicolau, Akhil Goyal

Since its inception, the rte_flow RSS action has been relying in part on
external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
This structure lacks parameters such as the hash algorithm to use, and more
recently, a method to tell which layer RSS should be performed on [1].

Given struct rte_eth_rss_conf will never be flexible enough to represent a
complete RSS configuration (e.g. RETA table), this patch supersedes it by
extending the rte_flow RSS action directly.

A subsequent patch will add a field to select a non-default RSS hash
algorithm. To make room for it, a field named "types" replaces the field
formerly known as "rss_hf", whose name stood for "RSS hash functions" and
was confusing since actual RSS hash function types are defined by enum
rte_eth_hash_function.

This patch updates all PMDs and example applications accordingly.
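
For illustration only -- not part of the patch -- the flattened action can
then be set up directly, without allocating a separate struct
rte_eth_rss_conf (values are arbitrary):

	#include <rte_ethdev.h>
	#include <rte_flow.h>

	static const uint16_t queue[] = { 0, 1 };
	static const struct rte_flow_action_rss action_rss = {
		.types = ETH_RSS_IP, /* formerly rss_conf->rss_hf */
		.key_len = 0,        /* formerly rss_conf->rss_key_len */
		.queue_num = 2,      /* formerly "num" */
		.key = NULL,         /* formerly rss_conf->rss_key */
		.queue = queue,
	};

Leaving "types" empty (0) no longer disables RSS; it requests the
"best-effort" defaults described in the documentation hunks and v3 notes
below.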

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

[1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
Cc: Radu Nicolau <radu.nicolau@intel.com>
Cc: Akhil Goyal <akhil.goyal@nxp.com>

---

v3 changes:

Documentation update regarding the meaning of a 0 value for RSS types in
flow rules.

It used to implicitly mean "no RSS" but is redefined as requesting a kind
of "best-effort" mode from PMDs, i.e. anything ranging from empty to
all-inclusive RSS; what matters is that it provides safe defaults that
will work regardless of PMD capabilities.
---
 app/test-pmd/cmdline_flow.c                 |  48 +++---
 app/test-pmd/config.c                       |  39 ++---
 doc/guides/prog_guide/rte_flow.rst          |  28 ++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   6 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  31 ++--
 drivers/net/e1000/igb_rxtx.c                |  51 +++++-
 drivers/net/i40e/i40e_ethdev.c              |  53 +++++--
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                |  57 ++++---
 drivers/net/ixgbe/ixgbe_ethdev.c            |   4 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  30 ++--
 drivers/net/ixgbe/ixgbe_rxtx.c              |  51 +++++-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                |  61 +++----
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 193 +++++++++++------------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +--
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +--
 drivers/net/sfc/sfc_flow.c                  |  21 ++-
 drivers/net/tap/tap_flow.c                  |   8 +-
 examples/ipsec-secgw/ipsec.c                |  10 +-
 lib/librte_ether/rte_flow.c                 |  39 ++---
 lib/librte_ether/rte_flow.h                 |  12 +-
 28 files changed, 484 insertions(+), 359 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index acf19eb8a..f6b73ca6e 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -192,9 +192,8 @@ enum index {
 /** Storage for struct rte_flow_action_rss including external data. */
 struct action_rss_data {
 	struct rte_flow_action_rss conf;
+	uint8_t key[RSS_HASH_KEY_LENGTH];
 	uint16_t queue[ACTION_RSS_QUEUE_NUM];
-	struct rte_eth_rss_conf rss_conf;
-	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -1587,7 +1586,7 @@ static const struct token token_list[] = {
 	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
-		.help = "RSS hash types",
+		.help = "specific RSS hash types",
 		.next = NEXT(action_rss, NEXT_ENTRY(ACTION_RSS_TYPE)),
 	},
 	[ACTION_RSS_TYPE] = {
@@ -1602,21 +1601,21 @@ static const struct token token_list[] = {
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
 		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
 			     ARGS_ENTRY_ARB
-			     (offsetof(struct action_rss_data, rss_conf) +
-			      offsetof(struct rte_eth_rss_conf, rss_key_len),
-			      sizeof(((struct rte_eth_rss_conf *)0)->
-				     rss_key_len)),
-			     ARGS_ENTRY(struct action_rss_data, rss_key)),
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, key_len),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     key_len)),
+			     ARGS_ENTRY(struct action_rss_data, key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (offsetof(struct action_rss_data, rss_conf) +
-			      offsetof(struct rte_eth_rss_conf, rss_key_len),
-			      sizeof(((struct rte_eth_rss_conf *)0)->
-				     rss_key_len),
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, key_len),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     key_len),
 			      0,
 			      RSS_HASH_KEY_LENGTH)),
 	},
@@ -2075,27 +2074,24 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	action_rss_data = ctx->object;
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->rss_conf,
-			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.types = rss_hf,
+			.key_len = sizeof(action_rss_data->key),
+			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.key = action_rss_data->key,
 			.queue = action_rss_data->queue,
 		},
+		.key = "testpmd's default RSS hash key",
 		.queue = { 0 },
-		.rss_conf = (struct rte_eth_rss_conf){
-			.rss_key = action_rss_data->rss_key,
-			.rss_key_len = sizeof(action_rss_data->rss_key),
-			.rss_hf = rss_hf,
-		},
-		.rss_key = "testpmd's default RSS hash key",
 	};
-	for (i = 0; i < action_rss_data->conf.num; ++i)
+	for (i = 0; i < action_rss_data->conf.queue_num; ++i)
 		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
 		struct rte_eth_dev_info info;
 
 		rte_eth_dev_info_get(ctx->port, &info);
-		action_rss_data->rss_conf.rss_key_len =
-			RTE_MIN(sizeof(action_rss_data->rss_key),
+		action_rss_data->conf.key_len =
+			RTE_MIN(sizeof(action_rss_data->key),
 				info.hash_key_size);
 	}
 	action->conf = &action_rss_data->conf;
@@ -2123,7 +2119,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->rss_conf.rss_hf = 0;
+		action_rss_data->conf.types = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2142,7 +2138,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->conf.types |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2192,7 +2188,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue_num = i;
 	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 052163357..717f31774 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1084,40 +1084,27 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
-				.num = src.rss->num,
+				.types = src.rss->types,
+				.key_len = src.rss->key_len,
+				.queue_num = src.rss->queue_num,
 			};
 		off += sizeof(*src.rss);
-		if (src.rss->num) {
+		if (src.rss->key_len) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->queue) * src.rss->num;
+			size = sizeof(*src.rss->key) * src.rss->key_len;
 			if (dst.rss)
-				dst.rss->queue = memcpy
+				dst.rss->key = memcpy
 					((void *)((uintptr_t)dst.rss + off),
-					 src.rss->queue, size);
+					 src.rss->key, size);
 			off += size;
 		}
-		off = RTE_ALIGN_CEIL(off, sizeof(double));
-		if (dst.rss) {
-			dst.rss->rss_conf = (void *)((uintptr_t)dst.rss + off);
-			*(struct rte_eth_rss_conf *)(uintptr_t)
-				dst.rss->rss_conf = (struct rte_eth_rss_conf){
-				.rss_key_len = src.rss->rss_conf->rss_key_len,
-				.rss_hf = src.rss->rss_conf->rss_hf,
-			};
-		}
-		off += sizeof(*src.rss->rss_conf);
-		if (src.rss->rss_conf->rss_key_len) {
+		if (src.rss->queue_num) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->rss_conf->rss_key) *
-				src.rss->rss_conf->rss_key_len;
-			if (dst.rss) {
-				((struct rte_eth_rss_conf *)(uintptr_t)
-				 dst.rss->rss_conf)->rss_key =
-					(void *)((uintptr_t)dst.rss + off);
-				memcpy(dst.rss->rss_conf->rss_key,
-				       src.rss->rss_conf->rss_key,
-				       size);
-			}
+			size = sizeof(*src.rss->queue) * src.rss->queue_num;
+			if (dst.rss)
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		size = off;
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index acbeaacbd..cf252eeba 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1301,6 +1301,12 @@ Action: ``RSS``
 Similar to QUEUE, except RSS is additionally performed on packets to spread
 them among several queues according to the provided parameters.
 
+Unlike global RSS settings used by other DPDK APIs, unsetting the ``types``
+field does not disable RSS in a flow rule. Doing so instead requests safe
+unspecified "best-effort" settings from the underlying PMD, which depending
+on the flow rule, may result in anything ranging from empty (single queue)
+to all-inclusive RSS.
+
 Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
@@ -1309,15 +1315,19 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+--------------------------------+
-   | Field        | Value                          |
-   +==============+================================+
-   | ``rss_conf`` | RSS parameters                 |
-   +--------------+--------------------------------+
-   | ``num``      | number of entries in ``queue`` |
-   +--------------+--------------------------------+
-   | ``queue``    | queue indices to use           |
-   +--------------+--------------------------------+
+   +---------------+---------------------------------------------+
+   | Field         | Value                                       |
+   +===============+=============================================+
+   | ``types``     | specific RSS hash types (see ``ETH_RSS_*``) |
+   +---------------+---------------------------------------------+
+   | ``key_len``   | hash key length in bytes                    |
+   +---------------+---------------------------------------------+
+   | ``queue_num`` | number of entries in ``queue``              |
+   +---------------+---------------------------------------------+
+   | ``key``       | hash key                                    |
+   +---------------+---------------------------------------------+
+   | ``queue``     | queue indices to use                        |
+   +---------------+---------------------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index a015d02a4..17336d163 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3398,8 +3398,10 @@ This section lists supported actions and their attributes, if any.
 
 - ``rss``: spread packets among several queues.
 
-  - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
-    are the same as `set_hash_input_set`_, an empty list means none (0).
+  - ``types [{RSS hash type} [...]] end``: specific RSS hash types, allowed
+    tokens are the same as `set_hash_input_set`_, except that an empty list
+    does not disable RSS but instead requests unspecified "best-effort"
+    settings.
 
   - ``key {string}``: RSS hash key, overrides ``key_len``.
 
diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 6354b894a..902001f36 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -4,6 +4,10 @@
 
 #ifndef _E1000_ETHDEV_H_
 #define _E1000_ETHDEV_H_
+
+#include <stdint.h>
+
+#include <rte_flow.h>
 #include <rte_time.h>
 #include <rte_pci.h>
 
@@ -27,6 +31,7 @@
 #define E1000_CTRL_EXT_EXTEND_VLAN  (1<<26)    /* EXTENDED VLAN */
 #define IGB_VFTA_SIZE 128
 
+#define IGB_HKEY_MAX_INDEX             10
 #define IGB_MAX_RX_QUEUE_NUM           8
 #define IGB_MAX_RX_QUEUE_NUM_82576     16
 
@@ -229,8 +234,8 @@ struct igb_ethertype_filter {
 };
 
 struct igb_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
+	uint8_t key[IGB_HKEY_MAX_INDEX * sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[IGB_MAX_RX_QUEUE_NUM]; /**< Queues indices to use. */
 };
 
@@ -501,6 +506,10 @@ int eth_igb_syn_filter_set(struct rte_eth_dev *dev,
 int eth_igb_add_del_flex_filter(struct rte_eth_dev *dev,
 			struct rte_eth_flex_filter *filter,
 			bool add);
+int igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
+		      const struct rte_flow_action_rss *in);
+int igb_action_rss_same(const struct rte_flow_action_rss *comp,
+			const struct rte_flow_action_rss *with);
 int igb_config_rss_filter(struct rte_eth_dev *dev,
 			struct igb_rte_flow_rss_conf *conf,
 			bool add);
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 8d4226676..7a431ac33 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -41,8 +41,6 @@
 #define IGB_DEFAULT_TX_HTHRESH      1
 #define IGB_DEFAULT_TX_WTHRESH      ((hw->mac.type == e1000_82576) ? 1 : 16)
 
-#define IGB_HKEY_MAX_INDEX 10
-
 /* Bit shift and mask */
 #define IGB_4_BIT_WIDTH  (CHAR_BIT / 2)
 #define IGB_4_BIT_MASK   RTE_LEN2MASK(IGB_4_BIT_WIDTH, uint8_t)
@@ -5576,7 +5574,7 @@ igb_rss_filter_restore(struct rte_eth_dev *dev)
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		igb_config_rss_filter(dev, &filter_info->rss_info, TRUE);
 }
 
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index c0f5b5190..8dc5f75f2 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1292,7 +1292,7 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 
 	rss = (const struct rte_flow_action_rss *)act->conf;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -1300,7 +1300,7 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -1310,14 +1310,18 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
-	if (rss->rss_conf)
-		rss_conf->rss_conf = *rss->rss_conf;
-	else
-		rss_conf->rss_conf.rss_hf = IGB_RSS_OFFLOAD_ALL;
-
-	for (n = 0; n < rss->num; ++n)
-		rss_conf->queue[n] = rss->queue[n];
-	rss_conf->num = rss->num;
+	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key must be exactly 40 bytes");
+	if (rss->queue_num > RTE_DIM(rss_conf->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (igb_rss_conf_init(rss_conf, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
 	/* check if the next not void item is END */
 	index++;
@@ -1518,9 +1522,8 @@ igb_flow_create(struct rte_eth_dev *dev,
 				PMD_DRV_LOG(ERR, "failed to allocate memory");
 				goto out;
 			}
-			rte_memcpy(&rss_filter_ptr->filter_info,
-				&rss_conf,
-				sizeof(struct igb_rte_flow_rss_conf));
+			igb_rss_conf_init(&rss_filter_ptr->filter_info,
+					  &rss_conf.conf);
 			TAILQ_INSERT_TAIL(&igb_filter_rss_list,
 				rss_filter_ptr, entries);
 			flow->rule = rss_filter_ptr;
@@ -1757,7 +1760,7 @@ igb_clear_rss_filter(struct rte_eth_dev *dev)
 	struct e1000_filter_info *filter =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter->rss_info.num)
+	if (filter->rss_info.conf.queue_num)
 		igb_config_rss_filter(dev, &filter->rss_info, FALSE);
 }
 
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 323913f0d..45bb3455c 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2898,12 +2898,47 @@ igb_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 }
 
 int
+igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
+		  const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+igb_action_rss_same(const struct rte_flow_action_rss *comp,
+		    const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 igb_config_rss_filter(struct rte_eth_dev *dev,
 		struct igb_rte_flow_rss_conf *conf, bool add)
 {
 	uint32_t shift;
 	uint16_t i, j;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
@@ -2911,8 +2946,8 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
 	if (!add) {
-		if (memcmp(conf, &filter_info->rss_info,
-			sizeof(struct igb_rte_flow_rss_conf)) == 0) {
+		if (igb_action_rss_same(&filter_info->rss_info.conf,
+					&conf->conf)) {
 			igb_rss_disable(dev);
 			memset(&filter_info->rss_info, 0,
 				sizeof(struct igb_rte_flow_rss_conf));
@@ -2921,7 +2956,7 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		return -EINVAL;
 
 	/* Fill in redirection table. */
@@ -2933,9 +2968,9 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		} reta;
 		uint8_t q_idx;
 
-		if (j == conf->num)
+		if (j == conf->conf.queue_num)
 			j = 0;
-		q_idx = conf->queue[j];
+		q_idx = conf->conf.queue[j];
 		reta.bytes[i & 3] = (uint8_t)(q_idx << shift);
 		if ((i & 3) == 3)
 			E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
@@ -2952,8 +2987,8 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		rss_conf.rss_key = rss_intel_key; /* Default hash key */
 	igb_hw_rss_hash_set(hw, &rss_conf);
 
-	rte_memcpy(&filter_info->rss_info,
-		conf, sizeof(struct igb_rte_flow_rss_conf));
+	if (igb_rss_conf_init(&filter_info->rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 6e06f8a2b..0242b5d59 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11,6 +11,7 @@
 #include <inttypes.h>
 #include <assert.h>
 
+#include <rte_common.h>
 #include <rte_eal.h>
 #include <rte_string_fns.h>
 #include <rte_pci.h>
@@ -11467,7 +11468,7 @@ i40e_rss_filter_restore(struct i40e_pf *pf)
 {
 	struct i40e_rte_flow_rss_conf *conf =
 					&pf->rss_info;
-	if (conf->num)
+	if (conf->conf.queue_num)
 		i40e_config_rss_filter(pf, conf, TRUE);
 }
 
@@ -11966,18 +11967,52 @@ i40e_cloud_filter_qinq_create(struct i40e_pf *pf)
 }
 
 int
+i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
+		   const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+i40e_action_rss_same(const struct rte_flow_action_rss *comp,
+		     const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 i40e_config_rss_filter(struct i40e_pf *pf,
 		struct i40e_rte_flow_rss_conf *conf, bool add)
 {
 	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
 	uint32_t i, lut = 0;
 	uint16_t j, num;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct i40e_rte_flow_rss_conf *rss_info = &pf->rss_info;
 
 	if (!add) {
-		if (memcmp(conf, rss_info,
-			sizeof(struct i40e_rte_flow_rss_conf)) == 0) {
+		if (i40e_action_rss_same(&rss_info->conf, &conf->conf)) {
 			i40e_pf_disable_rss(pf);
 			memset(rss_info, 0,
 				sizeof(struct i40e_rte_flow_rss_conf));
@@ -11986,7 +12021,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 		return -EINVAL;
 	}
 
-	if (rss_info->num)
+	if (rss_info->conf.queue_num)
 		return -EINVAL;
 
 	/* If both VMDQ and RSS enabled, not all of PF queues are configured.
@@ -11997,7 +12032,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 	else
 		num = pf->dev_data->nb_rx_queues;
 
-	num = RTE_MIN(num, conf->num);
+	num = RTE_MIN(num, conf->conf.queue_num);
 	PMD_DRV_LOG(INFO, "Max of contiguous %u PF queues are configured",
 			num);
 
@@ -12010,7 +12045,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 	for (i = 0, j = 0; i < hw->func_caps.rss_table_size; i++, j++) {
 		if (j == num)
 			j = 0;
-		lut = (lut << 8) | (conf->queue[j] & ((0x1 <<
+		lut = (lut << 8) | (conf->conf.queue[j] & ((0x1 <<
 			hw->func_caps.rss_table_entry_width) - 1));
 		if ((i & 3) == 3)
 			I40E_WRITE_REG(hw, I40E_PFQF_HLUT(i >> 2), lut);
@@ -12035,8 +12070,8 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 
 	i40e_hw_rss_hash_set(pf, &rss_conf);
 
-	rte_memcpy(rss_info,
-		conf, sizeof(struct i40e_rte_flow_rss_conf));
+	if (i40e_rss_conf_init(rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index 151ed1a8c..5c02b37a0 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -5,13 +5,18 @@
 #ifndef _I40E_ETHDEV_H_
 #define _I40E_ETHDEV_H_
 
+#include <stdint.h>
+
 #include <rte_eth_ctrl.h>
 #include <rte_time.h>
 #include <rte_kvargs.h>
 #include <rte_hash.h>
+#include <rte_flow.h>
 #include <rte_flow_driver.h>
 #include <rte_tm_driver.h>
 
+#include "base/i40e_register.h"
+
 #define I40E_VLAN_TAG_SIZE        4
 
 #define I40E_AQ_LEN               32
@@ -877,9 +882,11 @@ struct i40e_customized_pctype {
 };
 
 struct i40e_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
 	uint16_t queue_region_conf; /**< Queue region config flag */
-	uint16_t num; /**< Number of entries in queue[]. */
+	uint8_t key[(I40E_VFQF_HKEY_MAX_INDEX > I40E_PFQF_HKEY_MAX_INDEX ?
+		     I40E_VFQF_HKEY_MAX_INDEX : I40E_PFQF_HKEY_MAX_INDEX) + 1 *
+		    sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[I40E_MAX_Q_PER_TC]; /**< Queues indices to use. */
 };
 
@@ -1217,6 +1224,10 @@ void i40e_init_queue_region_conf(struct rte_eth_dev *dev);
 void i40e_flex_payload_reg_set_default(struct i40e_hw *hw);
 int i40e_set_rss_key(struct i40e_vsi *vsi, uint8_t *key, uint8_t key_len);
 int i40e_set_rss_lut(struct i40e_vsi *vsi, uint8_t *lut, uint16_t lut_size);
+int i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
+		       const struct rte_flow_action_rss *in);
+int i40e_action_rss_same(const struct rte_flow_action_rss *comp,
+			 const struct rte_flow_action_rss *with);
 int i40e_config_rss_filter(struct i40e_pf *pf,
 		struct i40e_rte_flow_rss_conf *conf, bool add);
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index a32ad9b58..db708fb5b 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4207,7 +4207,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 
 	if (action_flag) {
 		for (n = 0; n < 64; n++) {
-			if (rss->rss_conf->rss_hf & (hf_bit << n)) {
+			if (rss->types & (hf_bit << n)) {
 				conf_info->region[0].hw_flowtype[0] = n;
 				conf_info->region[0].flowtype_num = 1;
 				conf_info->queue_region_number = 1;
@@ -4217,12 +4217,12 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	}
 
 	if (conf_info->queue_region_number) {
-		for (i = 0; i < rss->num; i++) {
-			for (j = 0; j < rss_info->num; j++) {
-				if (rss->queue[i] == rss_info->queue[j])
+		for (i = 0; i < rss->queue_num; i++) {
+			for (j = 0; j < rss_info->conf.queue_num; j++) {
+				if (rss->queue[i] == rss_info->conf.queue[j])
 					break;
 			}
-			if (j == rss_info->num) {
+			if (j == rss_info->conf.queue_num) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
 					act,
@@ -4231,7 +4231,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			}
 		}
 
-		for (i = 0; i < rss->num - 1; i++) {
+		for (i = 0; i < rss->queue_num - 1; i++) {
 			if (rss->queue[i + 1] != rss->queue[i] + 1) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
@@ -4245,8 +4245,8 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	for (n = 0; n < conf_info->queue_region_number; n++) {
 		if (conf_info->region[n].user_priority_num ||
 				conf_info->region[n].flowtype_num) {
-			if (!((rte_is_power_of_2(rss->num)) &&
-					rss->num <= 64)) {
+			if (!((rte_is_power_of_2(rss->queue_num)) &&
+					rss->queue_num <= 64)) {
 				PMD_DRV_LOG(ERR, "The region sizes should be any of the following values: 1, 2, 4, 8, 16, 32, 64 as long as the "
 				"total number of queues do not exceed the VSI allocation");
 				return -rte_errno;
@@ -4264,10 +4264,11 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 				return -rte_errno;
 			}
 
-			if (rss_info->num < rss->num ||
-				rss->queue[0] < rss_info->queue[0] ||
-				(rss->queue[0] + rss->num >
-					rss_info->num + rss_info->queue[0])) {
+			if (rss_info->conf.queue_num < rss->queue_num ||
+				rss->queue[0] < rss_info->conf.queue[0] ||
+				(rss->queue[0] + rss->queue_num >
+					rss_info->conf.queue_num +
+					rss_info->conf.queue[0])) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
 					act,
@@ -4276,7 +4277,8 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			}
 
 			for (i = 0; i < info->queue_region_number; i++) {
-				if (info->region[i].queue_num == rss->num &&
+				if (info->region[i].queue_num ==
+				    rss->queue_num &&
 					info->region[i].queue_start_index ==
 						rss->queue[0])
 					break;
@@ -4289,7 +4291,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 				}
 
 				info->region[i].queue_num =
-					rss->num;
+					rss->queue_num;
 				info->region[i].queue_start_index =
 					rss->queue[0];
 				info->region[i].region_id =
@@ -4332,7 +4334,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	if (rss_config->queue_region_conf)
 		return 0;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -4340,7 +4342,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -4349,15 +4351,20 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			return -rte_errno;
 		}
 	}
-	if (rss->rss_conf)
-		rss_config->rss_conf = *rss->rss_conf;
-	else
-		rss_config->rss_conf.rss_hf =
-			pf->adapter->flow_types_mask;
 
-	for (n = 0; n < rss->num; ++n)
-		rss_config->queue[n] = rss->queue[n];
-	rss_config->num = rss->num;
+	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key too large");
+	if (rss->queue_num > RTE_DIM(rss_config->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (i40e_rss_conf_init(rss_config, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
+
 	index++;
 
 	/* check if the next not void action is END */
@@ -4877,7 +4884,7 @@ i40e_flow_flush_rss_filter(struct rte_eth_dev *dev)
 
 	ret = i40e_flush_queue_region_all_conf(dev, hw, pf, 0);
 
-	if (rss_info->num)
+	if (rss_info->conf.queue_num)
 		ret = i40e_config_rss_filter(pf, rss_info, FALSE);
 	return ret;
 }
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 752a17af0..ea3624ba4 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -100,8 +100,6 @@
 
 #define IXGBE_QUEUE_STAT_COUNTERS (sizeof(hw_stats->qprc) / sizeof(hw_stats->qprc[0]))
 
-#define IXGBE_HKEY_MAX_INDEX 10
-
 /* Additional timesync values. */
 #define NSEC_PER_SEC             1000000000L
 #define IXGBE_INCVAL_10GB        0x66666666
@@ -8276,7 +8274,7 @@ ixgbe_rss_filter_restore(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		ixgbe_config_rss_filter(dev,
 			&filter_info->rss_info, TRUE);
 }
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index 655077700..9491b03f4 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -4,6 +4,9 @@
 
 #ifndef _IXGBE_ETHDEV_H_
 #define _IXGBE_ETHDEV_H_
+
+#include <stdint.h>
+
 #include "base/ixgbe_type.h"
 #include "base/ixgbe_dcb.h"
 #include "base/ixgbe_dcb_82599.h"
@@ -12,6 +15,7 @@
 #ifdef RTE_LIBRTE_SECURITY
 #include "ixgbe_ipsec.h"
 #endif
+#include <rte_flow.h>
 #include <rte_time.h>
 #include <rte_hash.h>
 #include <rte_pci.h>
@@ -39,6 +43,7 @@
 #define IXGBE_EXTENDED_VLAN	  (uint32_t)(1 << 26) /* EXTENDED VLAN ENABLE */
 #define IXGBE_VFTA_SIZE 128
 #define IXGBE_VLAN_TAG_SIZE 4
+#define IXGBE_HKEY_MAX_INDEX 10
 #define IXGBE_MAX_RX_QUEUE_NUM	128
 #define IXGBE_MAX_INTR_QUEUE_NUM	15
 #define IXGBE_VMDQ_DCB_NB_QUEUES     IXGBE_MAX_RX_QUEUE_NUM
@@ -196,8 +201,8 @@ struct ixgbe_hw_fdir_info {
 };
 
 struct ixgbe_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
+	uint8_t key[IXGBE_HKEY_MAX_INDEX * sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[IXGBE_MAX_RX_QUEUE_NUM]; /**< Queues indices to use. */
 };
 
@@ -696,6 +701,10 @@ void ixgbe_tm_conf_init(struct rte_eth_dev *dev);
 void ixgbe_tm_conf_uninit(struct rte_eth_dev *dev);
 int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev, uint16_t queue_idx,
 			       uint16_t tx_rate);
+int ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
+			const struct rte_flow_action_rss *in);
+int ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
+			  const struct rte_flow_action_rss *with);
 int ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		struct ixgbe_rte_flow_rss_conf *conf, bool add);
 
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index abdeac28b..4e31c7c56 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2761,7 +2761,7 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 
 	rss = (const struct rte_flow_action_rss *)act->conf;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -2769,7 +2769,7 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -2778,14 +2778,19 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 			return -rte_errno;
 		}
 	}
-	if (rss->rss_conf)
-		rss_conf->rss_conf = *rss->rss_conf;
-	else
-		rss_conf->rss_conf.rss_hf = IXGBE_RSS_OFFLOAD_ALL;
 
-	for (n = 0; n < rss->num; ++n)
-		rss_conf->queue[n] = rss->queue[n];
-	rss_conf->num = rss->num;
+	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key must be exactly 40 bytes");
+	if (rss->queue_num > RTE_DIM(rss_conf->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (ixgbe_rss_conf_init(rss_conf, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
 	/* check if the next not void item is END */
 	act = next_no_void_action(actions, act);
@@ -2834,7 +2839,7 @@ ixgbe_clear_rss_filter(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		ixgbe_config_rss_filter(dev, &filter_info->rss_info, FALSE);
 }
 
@@ -3153,9 +3158,8 @@ ixgbe_flow_create(struct rte_eth_dev *dev,
 				PMD_DRV_LOG(ERR, "failed to allocate memory");
 				goto out;
 			}
-			rte_memcpy(&rss_filter_ptr->filter_info,
-				&rss_conf,
-				sizeof(struct ixgbe_rte_flow_rss_conf));
+			ixgbe_rss_conf_init(&rss_filter_ptr->filter_info,
+					    &rss_conf.conf);
 			TAILQ_INSERT_TAIL(&filter_rss_list,
 				rss_filter_ptr, entries);
 			flow->rule = rss_filter_ptr;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 7511e183f..94ea7444d 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5675,6 +5675,36 @@ ixgbevf_dev_rxtx_start(struct rte_eth_dev *dev)
 }
 
 int
+ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
+		    const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
+		      const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		struct ixgbe_rte_flow_rss_conf *conf, bool add)
 {
@@ -5684,7 +5714,12 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	uint16_t j;
 	uint16_t sp_reta_size;
 	uint32_t reta_reg;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
@@ -5694,8 +5729,8 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	sp_reta_size = ixgbe_reta_size_get(hw->mac.type);
 
 	if (!add) {
-		if (memcmp(conf, &filter_info->rss_info,
-			sizeof(struct ixgbe_rte_flow_rss_conf)) == 0) {
+		if (ixgbe_action_rss_same(&filter_info->rss_info.conf,
+					  &conf->conf)) {
 			ixgbe_rss_disable(dev);
 			memset(&filter_info->rss_info, 0,
 				sizeof(struct ixgbe_rte_flow_rss_conf));
@@ -5704,7 +5739,7 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		return -EINVAL;
 	/* Fill in redirection table
 	 * The byte-swap is needed because NIC registers are in
@@ -5714,9 +5749,9 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	for (i = 0, j = 0; i < sp_reta_size; i++, j++) {
 		reta_reg = ixgbe_reta_reg_get(hw->mac.type, i);
 
-		if (j == conf->num)
+		if (j == conf->conf.queue_num)
 			j = 0;
-		reta = (reta << 8) | conf->queue[j];
+		reta = (reta << 8) | conf->conf.queue[j];
 		if ((i & 3) == 3)
 			IXGBE_WRITE_REG(hw, reta_reg,
 					rte_bswap32(reta));
@@ -5733,8 +5768,8 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		rss_conf.rss_key = rss_intel_key; /* Default hash key */
 	ixgbe_hw_rss_hash_set(hw, &rss_conf);
 
-	rte_memcpy(&filter_info->rss_info,
-		conf, sizeof(struct ixgbe_rte_flow_rss_conf));
+	if (ixgbe_rss_conf_init(&filter_info->rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index fb8a8b848..c7854bead 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -569,7 +569,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			     " for UDP RSS and inner VXLAN RSS");
 			/* Fake support for all possible RSS hash fields. */
 			priv->hw_rss_sup = ~UINT64_C(0);
-			priv->hw_rss_sup = mlx4_conv_rss_hf(priv, -1);
+			priv->hw_rss_sup = mlx4_conv_rss_types(priv, -1);
 			/* Filter out known unsupported fields. */
 			priv->hw_rss_sup &=
 				~(uint64_t)(IBV_RX_HASH_SRC_PORT_UDP |
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 5a1b7dedd..4dbcaa39c 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -76,22 +76,22 @@ struct mlx4_drop {
 };
 
 /**
- * Convert DPDK RSS hash fields to their Verbs equivalent.
+ * Convert DPDK RSS hash types to their Verbs equivalent.
  *
- * This function returns the supported (default) set when @p rss_hf has
+ * This function returns the supported (default) set when @p types has
  * special value (uint64_t)-1.
  *
  * @param priv
  *   Pointer to private structure.
- * @param rss_hf
- *   Hash fields in DPDK format (see struct rte_eth_rss_conf).
+ * @param types
+ *   Hash types in DPDK format (see struct rte_eth_rss_conf).
  *
  * @return
  *   A valid Verbs RSS hash fields mask for mlx4 on success, (uint64_t)-1
  *   otherwise and rte_errno is set.
  */
 uint64_t
-mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
+mlx4_conv_rss_types(struct priv *priv, uint64_t types)
 {
 	enum { IPV4, IPV6, TCP, UDP, };
 	const uint64_t in[] = {
@@ -126,17 +126,17 @@ mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
 	unsigned int i;
 
 	for (i = 0; i != RTE_DIM(in); ++i)
-		if (rss_hf & in[i]) {
-			seen |= rss_hf & in[i];
+		if (types & in[i]) {
+			seen |= types & in[i];
 			conv |= out[i];
 		}
 	if ((conv & priv->hw_rss_sup) == conv) {
-		if (rss_hf == (uint64_t)-1) {
+		if (types == (uint64_t)-1) {
 			/* Include inner RSS by default if supported. */
 			conv |= priv->hw_rss_sup & IBV_RX_HASH_INNER;
 			return conv;
 		}
-		if (!(rss_hf & ~seen))
+		if (!(types & ~seen))
 			return conv;
 	}
 	rte_errno = ENOTSUP;
@@ -717,7 +717,8 @@ mlx4_flow_prepare(struct priv *priv,
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
-			const struct rte_eth_rss_conf *rss_conf;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
 			uint64_t fields;
 			unsigned int i;
 
@@ -747,58 +748,56 @@ mlx4_flow_prepare(struct priv *priv,
 				break;
 			rss = action->conf;
 			/* Default RSS configuration if none is provided. */
-			rss_conf =
-				rss->rss_conf ?
-				rss->rss_conf :
-				&(struct rte_eth_rss_conf){
-					.rss_key = mlx4_rss_hash_key_default,
-					.rss_key_len = MLX4_RSS_HASH_KEY_SIZE,
-					.rss_hf = -1,
-				};
+			if (rss->key_len) {
+				rss_key = rss->key;
+				rss_key_len = rss->key_len;
+			} else {
+				rss_key = mlx4_rss_hash_key_default;
+				rss_key_len = MLX4_RSS_HASH_KEY_SIZE;
+			}
 			/* Sanity checks. */
-			for (i = 0; i < rss->num; ++i)
+			for (i = 0; i < rss->queue_num; ++i)
 				if (rss->queue[i] >=
 				    priv->dev->data->nb_rx_queues)
 					break;
-			if (i != rss->num) {
+			if (i != rss->queue_num) {
 				msg = "queue index target beyond number of"
 					" configured Rx queues";
 				goto exit_action_not_supported;
 			}
-			if (!rte_is_power_of_2(rss->num)) {
+			if (!rte_is_power_of_2(rss->queue_num)) {
 				msg = "for RSS, mlx4 requires the number of"
 					" queues to be a power of two";
 				goto exit_action_not_supported;
 			}
-			if (rss_conf->rss_key_len !=
-			    sizeof(flow->rss->key)) {
+			if (rss_key_len != sizeof(flow->rss->key)) {
 				msg = "mlx4 supports exactly one RSS hash key"
 					" length: "
 					MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE);
 				goto exit_action_not_supported;
 			}
-			for (i = 1; i < rss->num; ++i)
+			for (i = 1; i < rss->queue_num; ++i)
 				if (rss->queue[i] - rss->queue[i - 1] != 1)
 					break;
-			if (i != rss->num) {
+			if (i != rss->queue_num) {
 				msg = "mlx4 requires RSS contexts to use"
 					" consecutive queue indices only";
 				goto exit_action_not_supported;
 			}
-			if (rss->queue[0] % rss->num) {
+			if (rss->queue[0] % rss->queue_num) {
 				msg = "mlx4 requires the first queue of a RSS"
 					" context to be aligned on a multiple"
 					" of the context size";
 				goto exit_action_not_supported;
 			}
 			rte_errno = 0;
-			fields = mlx4_conv_rss_hf(priv, rss_conf->rss_hf);
+			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
 				msg = "unsupported RSS hash type requested";
 				goto exit_action_not_supported;
 			}
 			flow->rss = mlx4_rss_get
-				(priv, fields, rss_conf->rss_key, rss->num,
+				(priv, fields, rss_key, rss->queue_num,
 				 rss->queue);
 			if (!flow->rss) {
 				msg = "either invalid parameters or not enough"
@@ -1284,8 +1283,10 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
+		.types = -1,
+		.key_len = MLX4_RSS_HASH_KEY_SIZE,
+		.queue_num = queues,
+		.key = mlx4_rss_hash_key_default,
 		.queue = queue,
 	};
 	struct rte_flow_action actions[] = {
diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h
index 00188a65c..f71078ecc 100644
--- a/drivers/net/mlx4/mlx4_flow.h
+++ b/drivers/net/mlx4/mlx4_flow.h
@@ -47,7 +47,7 @@ struct rte_flow {
 
 /* mlx4_flow.c */
 
-uint64_t mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf);
+uint64_t mlx4_conv_rss_types(struct priv *priv, uint64_t rss_hf);
 int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error);
 void mlx4_flow_clean(struct priv *priv);
 int mlx4_filter_ctrl(struct rte_eth_dev *dev,
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 7a036ed83..474614e4d 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -88,7 +88,7 @@ mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = {
  */
 struct mlx4_rss *
 mlx4_rss_get(struct priv *priv, uint64_t fields,
-	     uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+	     const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 	     uint16_t queues, const uint16_t queue_id[])
 {
 	struct mlx4_rss *rss;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index dd46ac006..521267724 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -126,7 +126,7 @@ uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
 int mlx4_rss_init(struct priv *priv);
 void mlx4_rss_deinit(struct priv *priv);
 struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
-			      uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+			      const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 			      uint16_t queues, const uint16_t queue_id[]);
 void mlx4_rss_put(struct mlx4_rss *rss);
 int mlx4_rss_attach(struct mlx4_rss *rss);
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index a52dcf263..7798052f9 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -214,9 +214,8 @@ struct rte_flow {
 	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
 	uint32_t mark:1; /**< Set if the flow is marked. */
 	uint32_t drop:1; /**< Drop queue. */
-	uint16_t queues_n; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss rss_conf; /**< RSS configuration */
 	uint16_t (*queues)[]; /**< Queues indexes to use. */
-	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	struct ibv_counter_set *cs; /**< Holds the counters for the rule. */
 	struct mlx5_flow_counter_stats counter_stats;/**<The counter stats. */
@@ -406,9 +405,8 @@ struct mlx5_flow_parse {
 	uint32_t mark:1; /**< Mark is present in the flow. */
 	uint32_t count:1; /**< Count is present in the flow. */
 	uint32_t mark_id; /**< Mark identifier. */
+	struct rte_flow_action_rss rss_conf; /**< RSS configuration */
 	uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queues indexes to use. */
-	uint16_t queues_n; /**< Number of entries in queue[]. */
-	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	enum hash_rxq_type layer; /**< Last pattern layer detected. */
 	struct ibv_counter_set *cs; /**< Holds the counter set for the rule */
@@ -532,47 +530,6 @@ mlx5_flow_item_validate(const struct rte_flow_item *item,
 }
 
 /**
- * Copy the RSS configuration from the user ones, of the rss_conf is null,
- * uses the driver one.
- *
- * @param parser
- *   Internal parser structure.
- * @param rss_conf
- *   User RSS configuration to save.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_flow_convert_rss_conf(struct mlx5_flow_parse *parser,
-			   const struct rte_eth_rss_conf *rss_conf)
-{
-	/*
-	 * This function is also called at the beginning of
-	 * mlx5_flow_convert_actions() to initialize the parser with the
-	 * device default RSS configuration.
-	 */
-	if (rss_conf) {
-		if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) {
-			rte_errno = EINVAL;
-			return -rte_errno;
-		}
-		if (rss_conf->rss_key_len != 40) {
-			rte_errno = EINVAL;
-			return -rte_errno;
-		}
-		if (rss_conf->rss_key_len && rss_conf->rss_key) {
-			parser->rss_conf.rss_key_len = rss_conf->rss_key_len;
-			memcpy(parser->rss_key, rss_conf->rss_key,
-			       rss_conf->rss_key_len);
-			parser->rss_conf.rss_key = parser->rss_key;
-		}
-		parser->rss_conf.rss_hf = rss_conf->rss_hf;
-	}
-	return 0;
-}
-
-/**
  * Extract attribute to the parser.
  *
  * @param[in] attr
@@ -642,17 +599,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	enum { FATE = 1, MARK = 2, COUNT = 4, };
 	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
-	int ret;
 
-	/*
-	 * Add default RSS configuration necessary for Verbs to create QP even
-	 * if no RSS is necessary.
-	 */
-	ret = mlx5_flow_convert_rss_conf(parser,
-					 (const struct rte_eth_rss_conf *)
-					 &priv->rss_conf);
-	if (ret)
-		return ret;
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
@@ -671,25 +618,53 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			parser->queues_n = 1;
 			parser->queues[0] = queue->index;
+			parser->rss_conf = (struct rte_flow_action_rss){
+				.queue_num = 1,
+				.queue = parser->queues,
+			};
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
 			uint16_t n;
 
 			if (overlap & FATE)
 				goto exit_action_overlap;
 			overlap |= FATE;
-			if (!rss || !rss->num) {
+			if (rss->types & MLX5_RSS_HF_MASK) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "unsupported RSS type"
+						   " requested");
+				return -rte_errno;
+			}
+			if (rss->key_len) {
+				rss_key_len = rss->key_len;
+				rss_key = rss->key;
+			} else {
+				rss_key_len = rss_hash_default_key_len;
+				rss_key = rss_hash_default_key;
+			}
+			if (rss_key_len != RTE_DIM(parser->rss_key)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "RSS hash key must be"
+						   " exactly 40 bytes long");
+				return -rte_errno;
+			}
+			if (!rss->queue_num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (rss->num > RTE_DIM(parser->queues)) {
+			if (rss->queue_num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
@@ -697,7 +672,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " context");
 				return -rte_errno;
 			}
-			for (n = 0; n < rss->num; ++n) {
+			for (n = 0; n < rss->queue_num; ++n) {
 				if (rss->queue[n] >= priv->rxqs_n) {
 					rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -707,16 +682,16 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 					return -rte_errno;
 				}
 			}
-			for (n = 0; n < rss->num; ++n)
-				parser->queues[n] = rss->queue[n];
-			parser->queues_n = rss->num;
-			if (mlx5_flow_convert_rss_conf(parser, rss->rss_conf)) {
-				rte_flow_error_set(error, EINVAL,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "wrong RSS configuration");
-				return -rte_errno;
-			}
+			parser->rss_conf = (struct rte_flow_action_rss){
+				.types = rss->types,
+				.key_len = rss_key_len,
+				.queue_num = rss->queue_num,
+				.key = memcpy(parser->rss_key, rss_key,
+					      sizeof(*rss_key) * rss_key_len),
+				.queue = memcpy(parser->queues, rss->queue,
+						sizeof(*rss->queue) *
+						rss->queue_num),
+			};
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) {
 			const struct rte_flow_action_mark *mark =
 				(const struct rte_flow_action_mark *)
@@ -761,7 +736,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
-	if (!parser->queues_n && !parser->drop) {
+	if (!parser->rss_conf.queue_num && !parser->drop) {
 		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "no valid action");
 		return -rte_errno;
@@ -941,7 +916,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 	unsigned int i;
 
 	/* Remove any other flow not matching the pattern. */
-	if (parser->queues_n == 1 && !parser->rss_conf.rss_hf) {
+	if (parser->rss_conf.queue_num == 1 && !parser->rss_conf.types) {
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			if (i == HASH_RXQ_ETH)
 				continue;
@@ -969,7 +944,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 	}
 	/* Remove impossible flow according to the RSS configuration. */
 	if (hash_rxq_init[parser->layer].dpdk_rss_hf &
-	    parser->rss_conf.rss_hf) {
+	    parser->rss_conf.types) {
 		/* Remove any other flow. */
 		for (i = hmin; i != (hmax + 1); ++i) {
 			if ((i == parser->layer) ||
@@ -980,7 +955,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 		}
 	} else  if (!parser->queue[ip].ibv_attr) {
 		/* no RSS possible with the current configuration. */
-		parser->queues_n = 1;
+		parser->rss_conf.queue_num = 1;
 		return;
 	}
 fill:
@@ -1109,7 +1084,7 @@ mlx5_flow_convert(struct rte_eth_dev *dev,
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			unsigned int offset;
 
-			if (!(parser->rss_conf.rss_hf &
+			if (!(parser->rss_conf.types &
 			      hash_rxq_init[i].dpdk_rss_hf) &&
 			    (i != HASH_RXQ_ETH))
 				continue;
@@ -1777,20 +1752,20 @@ mlx5_flow_create_action_queue_rss(struct rte_eth_dev *dev,
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_get(dev,
-				      parser->rss_conf.rss_key,
-				      parser->rss_conf.rss_key_len,
+				      parser->rss_conf.key,
+				      parser->rss_conf.key_len,
 				      hash_fields,
-				      parser->queues,
-				      parser->queues_n);
+				      parser->rss_conf.queue,
+				      parser->rss_conf.queue_num);
 		if (flow->frxq[i].hrxq)
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_new(dev,
-				      parser->rss_conf.rss_key,
-				      parser->rss_conf.rss_key_len,
+				      parser->rss_conf.key,
+				      parser->rss_conf.key_len,
 				      hash_fields,
-				      parser->queues,
-				      parser->queues_n);
+				      parser->rss_conf.queue,
+				      parser->rss_conf.queue_num);
 		if (!flow->frxq[i].hrxq) {
 			return rte_flow_error_set(error, ENOMEM,
 						  RTE_FLOW_ERROR_TYPE_HANDLE,
@@ -1861,9 +1836,9 @@ mlx5_flow_create_action_queue(struct rte_eth_dev *dev,
 				   NULL, "internal error in flow creation");
 		goto error;
 	}
-	for (i = 0; i != parser->queues_n; ++i) {
+	for (i = 0; i != parser->rss_conf.queue_num; ++i) {
 		struct mlx5_rxq_data *q =
-			(*priv->rxqs)[parser->queues[i]];
+			(*priv->rxqs)[parser->rss_conf.queue[i]];
 
 		q->mark |= parser->mark;
 	}
@@ -1927,7 +1902,8 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	if (ret)
 		goto exit;
 	flow = rte_calloc(__func__, 1,
-			  sizeof(*flow) + parser.queues_n * sizeof(uint16_t),
+			  sizeof(*flow) +
+			  parser.rss_conf.queue_num * sizeof(uint16_t),
 			  0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM,
@@ -1936,15 +1912,20 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 				   "cannot allocate flow memory");
 		return NULL;
 	}
-	/* Copy queues configuration. */
+	/* Copy configuration. */
 	flow->queues = (uint16_t (*)[])(flow + 1);
-	memcpy(flow->queues, parser.queues, parser.queues_n * sizeof(uint16_t));
-	flow->queues_n = parser.queues_n;
+	flow->rss_conf = (struct rte_flow_action_rss){
+		.types = parser.rss_conf.types,
+		.key_len = parser.rss_conf.key_len,
+		.queue_num = parser.rss_conf.queue_num,
+		.key = memcpy(flow->rss_key, parser.rss_conf.key,
+			      sizeof(*parser.rss_conf.key) *
+			      parser.rss_conf.key_len),
+		.queue = memcpy(flow->queues, parser.rss_conf.queue,
+				sizeof(*parser.rss_conf.queue) *
+				parser.rss_conf.queue_num),
+	};
 	flow->mark = parser.mark;
-	/* Copy RSS configuration. */
-	flow->rss_conf = parser.rss_conf;
-	flow->rss_conf.rss_key = flow->rss_key;
-	memcpy(flow->rss_key, parser.rss_key, parser.rss_conf.rss_key_len);
 	/* finalise the flow. */
 	if (parser.drop)
 		ret = mlx5_flow_create_action_queue_drop(dev, &parser, flow,
@@ -2024,7 +2005,7 @@ mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
 
 	if (flow->drop || !flow->mark)
 		goto free;
-	for (i = 0; i != flow->queues_n; ++i) {
+	for (i = 0; i != flow->rss_conf.queue_num; ++i) {
 		struct rte_flow *tmp;
 		int mark = 0;
 
@@ -2334,19 +2315,19 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 			if (!flow->frxq[i].ibv_attr)
 				continue;
 			flow->frxq[i].hrxq =
-				mlx5_hrxq_get(dev, flow->rss_conf.rss_key,
-					      flow->rss_conf.rss_key_len,
+				mlx5_hrxq_get(dev, flow->rss_conf.key,
+					      flow->rss_conf.key_len,
 					      hash_rxq_init[i].hash_fields,
-					      (*flow->queues),
-					      flow->queues_n);
+					      flow->rss_conf.queue,
+					      flow->rss_conf.queue_num);
 			if (flow->frxq[i].hrxq)
 				goto flow_create;
 			flow->frxq[i].hrxq =
-				mlx5_hrxq_new(dev, flow->rss_conf.rss_key,
-					      flow->rss_conf.rss_key_len,
+				mlx5_hrxq_new(dev, flow->rss_conf.key,
+					      flow->rss_conf.key_len,
 					      hash_rxq_init[i].hash_fields,
-					      (*flow->queues),
-					      flow->queues_n);
+					      flow->rss_conf.queue,
+					      flow->rss_conf.queue_num);
 			if (!flow->frxq[i].hrxq) {
 				DRV_LOG(DEBUG,
 					"port %u flow %p cannot be applied",
@@ -2370,8 +2351,8 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 		}
 		if (!flow->mark)
 			continue;
-		for (i = 0; i != flow->queues_n; ++i)
-			(*priv->rxqs)[(*flow->queues)[i]]->mark = 1;
+		for (i = 0; i != flow->rss_conf.queue_num; ++i)
+			(*priv->rxqs)[flow->rss_conf.queue[i]]->mark = 1;
 	}
 	return 0;
 }
@@ -2448,8 +2429,10 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	};
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
-		.rss_conf = &priv->rss_conf,
-		.num = priv->reta_idx_n,
+		.types = priv->rss_conf.rss_hf,
+		.key_len = priv->rss_conf.rss_key_len,
+		.queue_num = priv->reta_idx_n,
+		.key = priv->rss_conf.rss_key,
 		.queue = queue,
 	};
 	struct rte_flow_action actions[] = {
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 1b4570586..1e4354ab3 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1218,8 +1218,8 @@ mlx5_rxq_verify(struct rte_eth_dev *dev)
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
  */
 struct mlx5_ind_table_ibv *
-mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, uint16_t queues[],
-		       uint16_t queues_n)
+mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, const uint16_t *queues,
+		       uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_ind_table_ibv *ind_tbl;
@@ -1286,8 +1286,8 @@ mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, uint16_t queues[],
  *   An indirection table if found.
  */
 struct mlx5_ind_table_ibv *
-mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, uint16_t queues[],
-		       uint16_t queues_n)
+mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, const uint16_t *queues,
+		       uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_ind_table_ibv *ind_tbl;
@@ -1391,8 +1391,10 @@ mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev)
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
  */
 struct mlx5_hrxq *
-mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
-	      uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+mlx5_hrxq_new(struct rte_eth_dev *dev,
+	      const uint8_t *rss_key, uint32_t rss_key_len,
+	      uint64_t hash_fields,
+	      const uint16_t *queues, uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
@@ -1419,7 +1421,7 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
 			.rx_hash_conf = (struct ibv_rx_hash_conf){
 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
 				.rx_hash_key_len = rss_key_len,
-				.rx_hash_key = rss_key,
+				.rx_hash_key = (void *)(uintptr_t)rss_key,
 				.rx_hash_fields_mask = hash_fields,
 			},
 			.rwq_ind_tbl = ind_tbl->ind_table,
@@ -1469,8 +1471,10 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
  *   An hash Rx queue on success.
  */
 struct mlx5_hrxq *
-mlx5_hrxq_get(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
-	      uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+mlx5_hrxq_get(struct rte_eth_dev *dev,
+	      const uint8_t *rss_key, uint32_t rss_key_len,
+	      uint64_t hash_fields,
+	      const uint16_t *queues, uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index f5af43735..a702cb603 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -134,7 +134,7 @@ struct mlx5_ind_table_ibv {
 	LIST_ENTRY(mlx5_ind_table_ibv) next; /* Pointer to the next element. */
 	rte_atomic32_t refcnt; /* Reference counter. */
 	struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */
-	uint16_t queues_n; /**< Number of queues in the list. */
+	uint32_t queues_n; /**< Number of queues in the list. */
 	uint16_t queues[]; /**< Queue list. */
 };
 
@@ -145,7 +145,7 @@ struct mlx5_hrxq {
 	struct mlx5_ind_table_ibv *ind_table; /* Indirection table. */
 	struct ibv_qp *qp; /* Verbs queue pair. */
 	uint64_t hash_fields; /* Verbs Hash fields. */
-	uint8_t rss_key_len; /* Hash key length in bytes. */
+	uint32_t rss_key_len; /* Hash key length in bytes. */
 	uint8_t rss_key[]; /* Hash key. */
 };
 
@@ -237,20 +237,22 @@ int mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx);
 int mlx5_rxq_verify(struct rte_eth_dev *dev);
 int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl);
 struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_new(struct rte_eth_dev *dev,
-						  uint16_t queues[],
-						  uint16_t queues_n);
+						  const uint16_t *queues,
+						  uint32_t queues_n);
 struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_get(struct rte_eth_dev *dev,
-						  uint16_t queues[],
-						  uint16_t queues_n);
+						  const uint16_t *queues,
+						  uint32_t queues_n);
 int mlx5_ind_table_ibv_release(struct rte_eth_dev *dev,
 			       struct mlx5_ind_table_ibv *ind_tbl);
 int mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev);
-struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key,
-				uint8_t rss_key_len, uint64_t hash_fields,
-				uint16_t queues[], uint16_t queues_n);
-struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev, uint8_t *rss_key,
-				uint8_t rss_key_len, uint64_t hash_fields,
-				uint16_t queues[], uint16_t queues_n);
+struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev,
+				const uint8_t *rss_key, uint32_t rss_key_len,
+				uint64_t hash_fields,
+				const uint16_t *queues, uint32_t queues_n);
+struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev,
+				const uint8_t *rss_key, uint32_t rss_key_len,
+				uint64_t hash_fields,
+				const uint16_t *queues, uint32_t queues_n);
 int mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hxrq);
 int mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev);
 uint64_t mlx5_get_rx_port_offloads(void);
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 056405515..1a2c0299c 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	struct sfc_rxq *rxq;
 	unsigned int rxq_hw_index_min;
 	unsigned int rxq_hw_index_max;
-	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
-	uint64_t rss_hf;
-	uint8_t *rss_key = NULL;
+	const uint8_t *rss_key;
 	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
 	unsigned int i;
 
-	if (rss->num == 0)
+	if (rss->queue_num == 0)
 		return -EINVAL;
 
 	rxq_sw_index = sa->rxq_count - 1;
@@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	rxq_hw_index_min = rxq->hw_index;
 	rxq_hw_index_max = 0;
 
-	for (i = 0; i < rss->num; ++i) {
+	for (i = 0; i < rss->queue_num; ++i) {
 		rxq_sw_index = rss->queue[i];
 
 		if (rxq_sw_index >= sa->rxq_count)
@@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 			rxq_hw_index_max = rxq->hw_index;
 	}
 
-	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;
-	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
+	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
-	if (rss_conf != NULL) {
-		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
+	if (rss->key_len) {
+		if (rss->key_len != sizeof(sa->rss_key))
 			return -EINVAL;
 
-		rss_key = rss_conf->rss_key;
+		rss_key = rss->key;
 	} else {
 		rss_key = sa->rss_key;
 	}
@@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 
 	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
 	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
-	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
+	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
 	rte_memcpy(sfc_rss_conf->rss_key, rss_key, sizeof(sa->rss_key));
 
 	for (i = 0; i < RTE_DIM(sfc_rss_conf->rss_tbl); ++i) {
-		unsigned int rxq_sw_index = rss->queue[i % rss->num];
+		unsigned int rxq_sw_index = rss->queue[i % rss->queue_num];
 		struct sfc_rxq *rxq = sa->rxq_info[rxq_sw_index].rxq;
 
 		sfc_rss_conf->rss_tbl[i] = rxq->hw_index - rxq_hw_index_min;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index aea3462a6..78f20913f 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1215,7 +1215,7 @@ priv_flow_process(struct pmd_internals *pmd,
 				if (err)
 					goto exit_action_not_supported;
 			}
-			if (flow && rss)
+			if (flow)
 				err = rss_add_actions(flow, pmd, rss, error);
 		} else {
 			goto exit_action_not_supported;
@@ -2050,7 +2050,7 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 			   struct rte_flow_error *error)
 {
 	/* 4096 is the maximum number of instructions for a BPF program */
-	int i;
+	unsigned int i;
 	int err;
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
@@ -2066,8 +2066,8 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	}
 
 	/* Update RSS map entry with queues */
-	rss_entry.nb_queues = rss->num;
-	for (i = 0; i < rss->num; i++)
+	rss_entry.nb_queues = rss->queue_num;
+	for (i = 0; i < rss->queue_num; i++)
 		rss_entry.queues[i] = rss->queue[i];
 	rss_entry.hash_fields =
 		(1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 8b2047adb..3ce76c413 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -202,9 +202,13 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
 						queue[j++] = i;
-				action_rss.rss_conf = &rss_conf;
-				action_rss.num = j;
-				action_rss.queue = queue;
+				action_rss = (struct rte_flow_action_rss){
+					.types = rss_conf.rss_hf,
+					.key_len = rss_conf.rss_key_len,
+					.queue_num = j,
+					.key = rss_key,
+					.queue = queue,
+				};
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 550086411..2fabc9a29 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -330,40 +330,27 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
-				.num = src.rss->num,
+				.types = src.rss->types,
+				.key_len = src.rss->key_len,
+				.queue_num = src.rss->queue_num,
 			};
 		off += sizeof(*src.rss);
-		if (src.rss->num) {
+		if (src.rss->key_len) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->queue) * src.rss->num;
+			size = sizeof(*src.rss->key) * src.rss->key_len;
 			if (dst.rss)
-				dst.rss->queue = memcpy
+				dst.rss->key = memcpy
 					((void *)((uintptr_t)dst.rss + off),
-					 src.rss->queue, size);
+					 src.rss->key, size);
 			off += size;
 		}
-		off = RTE_ALIGN_CEIL(off, sizeof(double));
-		if (dst.rss) {
-			dst.rss->rss_conf = (void *)((uintptr_t)dst.rss + off);
-			*(struct rte_eth_rss_conf *)(uintptr_t)
-				dst.rss->rss_conf = (struct rte_eth_rss_conf){
-				.rss_key_len = src.rss->rss_conf->rss_key_len,
-				.rss_hf = src.rss->rss_conf->rss_hf,
-			};
-		}
-		off += sizeof(*src.rss->rss_conf);
-		if (src.rss->rss_conf->rss_key_len) {
+		if (src.rss->queue_num) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->rss_conf->rss_key) *
-				src.rss->rss_conf->rss_key_len;
-			if (dst.rss) {
-				((struct rte_eth_rss_conf *)(uintptr_t)
-				 dst.rss->rss_conf)->rss_key =
-					(void *)((uintptr_t)dst.rss + off);
-				memcpy(dst.rss->rss_conf->rss_key,
-				       src.rss->rss_conf->rss_key,
-				       size);
-			}
+			size = sizeof(*src.rss->queue) * src.rss->queue_num;
+			if (dst.rss)
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		size = off;
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 895feb1a3..4385e7eaa 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1033,13 +1033,21 @@ struct rte_flow_query_count {
  * Similar to QUEUE, except RSS is additionally performed on packets to
  * spread them among several queues according to the provided parameters.
  *
+ * Unlike global RSS settings used by other DPDK APIs, unsetting the
+ * @p types field does not disable RSS in a flow rule. Doing so instead
+ * requests safe unspecified "best-effort" settings from the underlying PMD,
+ * which depending on the flow rule, may result in anything ranging from
+ * empty (single queue) to all-inclusive RSS.
+ *
  * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps
  * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only,
  * both can be requested simultaneously.
  */
 struct rte_flow_action_rss {
-	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in @p queue. */
+	uint64_t types; /**< Specific RSS hash types (see ETH_RSS_*). */
+	uint32_t key_len; /**< Hash key length in bytes. */
+	uint32_t queue_num; /**< Number of entries in @p queue. */
+	const uint8_t *key; /**< Hash key. */
 	const uint16_t *queue; /**< Queue indices to use. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 1%]
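
(Illustration only, not part of the patch above.) Given the flattened
struct rte_flow_action_rss shown in the rte_flow.h hunk, an application can
now fill the RSS action directly from plain arrays; the hash types, key and
queue values below are arbitrary placeholders:

  #include <rte_common.h>
  #include <rte_ethdev.h>
  #include <rte_flow.h>

  static const uint8_t rss_key[40] = { 0x6d, 0x5a }; /* placeholder Toeplitz key */
  static const uint16_t rss_queues[] = { 0, 1, 2, 3 };

  static const struct rte_flow_action_rss rss = {
  	.types = ETH_RSS_IP | ETH_RSS_TCP, /* 0 would request PMD "best-effort" defaults */
  	.key_len = sizeof(rss_key),
  	.queue_num = RTE_DIM(rss_queues),
  	.key = rss_key,
  	.queue = rss_queues,
  };

  static const struct rte_flow_action actions[] = {
  	{ .type = RTE_FLOW_ACTION_TYPE_RSS, .conf = &rss },
  	{ .type = RTE_FLOW_ACTION_TYPE_END },
  };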

* [dpdk-dev] [PATCH v3 06/16] ethdev: remove C99 flexible arrays from flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                       ` (2 preceding siblings ...)
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
@ 2018-04-10 16:36  1%     ` Adrien Mazarguil
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
                       ` (9 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

This patch replaces C99-style flexible arrays in struct rte_flow_action_rss
and struct rte_flow_item_raw with standard pointers to the same data.

They proved difficult to use in the field (e.g. no possibility of static
initialization) and unsuitable for C++ applications.
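
For illustration only (not part of this patch), a minimal sketch of the kind
of static initialization that the pointer-based definitions below make
possible; pattern and queue contents are arbitrary placeholders:

  #include <rte_common.h>
  #include <rte_flow.h>

  static const uint8_t raw_pattern[] = "GET /";
  static const struct rte_flow_item_raw item_raw_spec = {
  	.relative = 1, /* match relative to the previous item */
  	.length = sizeof(raw_pattern) - 1,
  	.pattern = raw_pattern,
  };

  static const uint16_t rss_queues[] = { 0, 1 };
  static const struct rte_flow_action_rss action_rss_conf = {
  	.rss_conf = NULL, /* rely on PMD default RSS settings */
  	.num = RTE_DIM(rss_queues),
  	.queue = rss_queues,
  };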

Affected PMDs and examples are updated accordingly.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c        | 117 +++++++++++++++++---------------
 app/test-pmd/config.c              |  25 ++++---
 doc/guides/prog_guide/rte_flow.rst |  18 ++---
 drivers/net/mlx4/mlx4_flow.c       |  22 +++---
 drivers/net/mlx5/mlx5_flow.c       |  20 +++---
 examples/ipsec-secgw/ipsec.c       |  17 ++---
 lib/librte_ether/rte_flow.c        |  25 ++++---
 lib/librte_ether/rte_flow.h        |   8 ++-
 8 files changed, 135 insertions(+), 117 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index be867b0ec..acf19eb8a 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -179,25 +179,22 @@ enum index {
 	ACTION_METER_ID,
 };
 
-/** Size of pattern[] field in struct rte_flow_item_raw. */
-#define ITEM_RAW_PATTERN_SIZE 36
+/** Maximum size for pattern in struct rte_flow_item_raw. */
+#define ITEM_RAW_PATTERN_SIZE 40
 
 /** Storage size for struct rte_flow_item_raw including pattern. */
 #define ITEM_RAW_SIZE \
-	(offsetof(struct rte_flow_item_raw, pattern) + ITEM_RAW_PATTERN_SIZE)
+	(sizeof(struct rte_flow_item_raw) + ITEM_RAW_PATTERN_SIZE)
 
 /** Maximum number of queue indices in struct rte_flow_action_rss. */
 #define ACTION_RSS_QUEUE_NUM 32
 
 /** Storage for struct rte_flow_action_rss including external data. */
-union action_rss_data {
+struct action_rss_data {
 	struct rte_flow_action_rss conf;
-	struct {
-		uint8_t conf_data[offsetof(struct rte_flow_action_rss, queue)];
-		uint16_t queue[ACTION_RSS_QUEUE_NUM];
-		struct rte_eth_rss_conf rss_conf;
-		uint8_t rss_key[RSS_HASH_KEY_LENGTH];
-	} s;
+	uint16_t queue[ACTION_RSS_QUEUE_NUM];
+	struct rte_eth_rss_conf rss_conf;
+	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -320,13 +317,6 @@ struct token {
 		.size = sizeof(*((s *)0)->f), \
 	})
 
-/** Static initializer for ARGS() with arbitrary size. */
-#define ARGS_ENTRY_USZ(s, f, sz) \
-	(&(const struct arg){ \
-		.offset = offsetof(s, f), \
-		.size = (sz), \
-	})
-
 /** Static initializer for ARGS() with arbitrary offset and size. */
 #define ARGS_ENTRY_ARB(o, s) \
 	(&(const struct arg){ \
@@ -1105,9 +1095,9 @@ static const struct token token_list[] = {
 			     NEXT_ENTRY(ITEM_PARAM_IS,
 					ITEM_PARAM_SPEC,
 					ITEM_PARAM_MASK)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, length),
-			     ARGS_ENTRY_USZ(struct rte_flow_item_raw,
-					    pattern,
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, pattern),
+			     ARGS_ENTRY(struct rte_flow_item_raw, length),
+			     ARGS_ENTRY_ARB(sizeof(struct rte_flow_item_raw),
 					    ITEM_RAW_PATTERN_SIZE)),
 	},
 	[ITEM_ETH] = {
@@ -1591,7 +1581,7 @@ static const struct token token_list[] = {
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
-		.priv = PRIV_ACTION(RSS, sizeof(union action_rss_data)),
+		.priv = PRIV_ACTION(RSS, sizeof(struct action_rss_data)),
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
@@ -1610,23 +1600,21 @@ static const struct token token_list[] = {
 		.name = "key",
 		.help = "RSS hash key",
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
-		.args = ARGS(ARGS_ENTRY_ARB
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
+			     ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len)),
-			     ARGS_ENTRY_ARB
-			     (((uintptr_t)((union action_rss_data *)0)->
-			       s.rss_key),
-			      RSS_HASH_KEY_LENGTH)),
+			     ARGS_ENTRY(struct action_rss_data, rss_key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len),
 			      0,
@@ -2067,7 +2055,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 {
 	struct buffer *out = buf;
 	struct rte_flow_action *action;
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 	int ret;
 
@@ -2085,29 +2073,29 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	ctx->objmask = NULL;
 	/* Set up default configuration. */
 	action_rss_data = ctx->object;
-	*action_rss_data = (union action_rss_data){
+	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->s.rss_conf,
+			.rss_conf = &action_rss_data->rss_conf,
 			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.queue = action_rss_data->queue,
 		},
+		.queue = { 0 },
+		.rss_conf = (struct rte_eth_rss_conf){
+			.rss_key = action_rss_data->rss_key,
+			.rss_key_len = sizeof(action_rss_data->rss_key),
+			.rss_hf = rss_hf,
+		},
+		.rss_key = "testpmd's default RSS hash key",
 	};
-	action_rss_data->s.rss_conf = (struct rte_eth_rss_conf){
-		.rss_key = action_rss_data->s.rss_key,
-		.rss_key_len = sizeof(action_rss_data->s.rss_key),
-		.rss_hf = rss_hf,
-	};
-	strncpy((void *)action_rss_data->s.rss_key,
-		"testpmd's default RSS hash key",
-		sizeof(action_rss_data->s.rss_key));
 	for (i = 0; i < action_rss_data->conf.num; ++i)
-		action_rss_data->conf.queue[i] = i;
+		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
 		struct rte_eth_dev_info info;
 
 		rte_eth_dev_info_get(ctx->port, &info);
-		action_rss_data->s.rss_conf.rss_key_len =
-			RTE_MIN(sizeof(action_rss_data->s.rss_key),
+		action_rss_data->rss_conf.rss_key_len =
+			RTE_MIN(sizeof(action_rss_data->rss_key),
 				info.hash_key_size);
 	}
 	action->conf = &action_rss_data->conf;
@@ -2125,7 +2113,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_TYPE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 
 	(void)token;
@@ -2135,7 +2123,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->s.rss_conf.rss_hf = 0;
+		action_rss_data->rss_conf.rss_hf = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2154,7 +2142,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->s.rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2169,7 +2157,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_QUEUE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	int ret;
 	int i;
 
@@ -2186,10 +2174,9 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (i >= ACTION_RSS_QUEUE_NUM)
 		return -1;
 	if (push_args(ctx,
-		      ARGS_ENTRY_ARB(offsetof(struct rte_flow_action_rss,
-					      queue) +
-				     i * sizeof(action_rss_data->s.queue[i]),
-				     sizeof(action_rss_data->s.queue[i]))))
+		      ARGS_ENTRY_ARB(offsetof(struct action_rss_data, queue) +
+				     i * sizeof(action_rss_data->queue[i]),
+				     sizeof(action_rss_data->queue[i]))))
 		return -1;
 	ret = parse_int(ctx, token, str, len, NULL, 0);
 	if (ret < 0) {
@@ -2206,6 +2193,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 		return len;
 	action_rss_data = ctx->object;
 	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
 
@@ -2483,8 +2471,8 @@ parse_int(struct context *ctx, const struct token *token,
 /**
  * Parse a string.
  *
- * Two arguments (ctx->args) are retrieved from the stack to store data and
- * its length (in that order).
+ * Three arguments (ctx->args) are retrieved from the stack to store data,
+ * its actual length and address (in that order).
  */
 static int
 parse_string(struct context *ctx, const struct token *token,
@@ -2493,6 +2481,7 @@ parse_string(struct context *ctx, const struct token *token,
 {
 	const struct arg *arg_data = pop_args(ctx);
 	const struct arg *arg_len = pop_args(ctx);
+	const struct arg *arg_addr = pop_args(ctx);
 	char tmp[16]; /* Ought to be enough. */
 	int ret;
 
@@ -2503,6 +2492,11 @@ parse_string(struct context *ctx, const struct token *token,
 		push_args(ctx, arg_data);
 		return -1;
 	}
+	if (!arg_addr) {
+		push_args(ctx, arg_len);
+		push_args(ctx, arg_data);
+		return -1;
+	}
 	size = arg_data->size;
 	/* Bit-mask fill is not supported. */
 	if (arg_data->mask || size < len)
@@ -2525,8 +2519,23 @@ parse_string(struct context *ctx, const struct token *token,
 	memset((uint8_t *)buf + len, 0x00, size - len);
 	if (ctx->objmask)
 		memset((uint8_t *)ctx->objmask + arg_data->offset, 0xff, len);
+	/* Save address if requested. */
+	if (arg_addr->size) {
+		memcpy((uint8_t *)ctx->object + arg_addr->offset,
+		       (void *[]){
+			(uint8_t *)ctx->object + arg_data->offset
+		       },
+		       arg_addr->size);
+		if (ctx->objmask)
+			memcpy((uint8_t *)ctx->objmask + arg_addr->offset,
+			       (void *[]){
+				(uint8_t *)ctx->objmask + arg_data->offset
+			       },
+			       arg_addr->size);
+	}
 	return len;
 error:
+	push_args(ctx, arg_addr);
 	push_args(ctx, arg_len);
 	push_args(ctx, arg_data);
 	return -1;
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 8d42ea9a9..052163357 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -961,7 +961,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -1010,14 +1010,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = flow_item[item->type].size;
@@ -1049,7 +1055,7 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
@@ -1080,11 +1086,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 80360d068..acbeaacbd 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1309,15 +1309,15 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+------------------------------+
-   | Field        | Value                        |
-   +==============+==============================+
-   | ``rss_conf`` | RSS parameters               |
-   +--------------+------------------------------+
-   | ``num``      | number of entries in queue[] |
-   +--------------+------------------------------+
-   | ``queue[]``  | queue indices to use         |
-   +--------------+------------------------------+
+   +--------------+--------------------------------+
+   | Field        | Value                          |
+   +==============+================================+
+   | ``rss_conf`` | RSS parameters                 |
+   +--------------+--------------------------------+
+   | ``num``      | number of entries in ``queue`` |
+   +--------------+--------------------------------+
+   | ``queue``    | queue indices to use           |
+   +--------------+--------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 582483076..5a1b7dedd 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -1282,14 +1282,16 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	 */
 	uint32_t queues =
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
-	alignas(struct rte_flow_action_rss) uint8_t rss_conf_data
-		[offsetof(struct rte_flow_action_rss, queue) +
-		 sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues];
-	struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data;
+	uint16_t queue[queues];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = NULL, /* Rely on default fallback settings. */
+		.num = queues,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
-			.conf = rss_conf,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -1311,12 +1313,8 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	if (!queues)
 		goto error;
 	/* Prepare default RSS configuration. */
-	*rss_conf = (struct rte_flow_action_rss){
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
-	};
 	for (i = 0; i != queues; ++i)
-		rss_conf->queue[i] = i;
+		queue[i] = i;
 	/*
 	 * Set up VLAN item if filtering is enabled and at least one VLAN
 	 * filter is configured.
@@ -1375,7 +1373,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 			if (j != sizeof(mac->addr_bytes))
 				continue;
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				continue;
 			break;
@@ -1415,7 +1413,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		if (flow && flow->internal) {
 			assert(flow->rss);
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				flow = NULL;
 		}
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 84d6f9b92..a52dcf263 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -2446,9 +2446,16 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 			.type = RTE_FLOW_ITEM_TYPE_END,
 		},
 	};
+	uint16_t queue[priv->reta_idx_n];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = &priv->rss_conf,
+		.num = priv->reta_idx_n,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -2457,24 +2464,13 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	struct rte_flow *flow;
 	struct rte_flow_error error;
 	unsigned int i;
-	union {
-		struct rte_flow_action_rss rss;
-		struct {
-			const struct rte_eth_rss_conf *rss_conf;
-			uint16_t num;
-			uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-		} local;
-	} action_rss;
 
 	if (!priv->reta_idx_n) {
 		rte_errno = EINVAL;
 		return -rte_errno;
 	}
 	for (i = 0; i != priv->reta_idx_n; ++i)
-		action_rss.local.queue[i] = (*priv->reta_idx)[i];
-	action_rss.local.rss_conf = &priv->rss_conf;
-	action_rss.local.num = priv->reta_idx_n;
-	actions[0].conf = (const void *)&action_rss.rss;
+		queue[i] = (*priv->reta_idx)[i];
 	flow = mlx5_flow_list_create(dev, &priv->ctrl_flows, &attr, items,
 				     actions, &error);
 	if (!flow)
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 5fb5bc16e..8b2047adb 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -186,14 +186,8 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 					.rss_key_len = 40,
 				};
 				struct rte_eth_dev *eth_dev;
-				union {
-					struct rte_flow_action_rss rss;
-					struct {
-					const struct rte_eth_rss_conf *rss_conf;
-					uint16_t num;
-					uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-					} local;
-				} action_rss;
+				uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
+				struct rte_flow_action_rss action_rss;
 				unsigned int i;
 				unsigned int j;
 
@@ -207,9 +201,10 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				for (i = 0, j = 0;
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
-						action_rss.local.queue[j++] = i;
-				action_rss.local.num = j;
-				action_rss.local.rss_conf = &rss_conf;
+						queue[j++] = i;
+				action_rss.rss_conf = &rss_conf;
+				action_rss.num = j;
+				action_rss.queue = queue;
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index db04c4f94..550086411 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,7 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -73,7 +73,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 };
@@ -282,14 +282,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = rte_flow_desc_item[item->type].size;
@@ -326,11 +332,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index af9b14a4d..895feb1a3 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -14,6 +14,7 @@
  * associated actions in hardware through flow rules.
  */
 
+#include <stddef.h>
 #include <stdint.h>
 
 #include <rte_arp.h>
@@ -432,7 +433,7 @@ struct rte_flow_item_raw {
 	int32_t offset; /**< Absolute or relative offset for pattern. */
 	uint16_t limit; /**< Search area limit for start of pattern. */
 	uint16_t length; /**< Pattern length. */
-	uint8_t pattern[]; /**< Byte string to look for. */
+	const uint8_t *pattern; /**< Byte string to look for. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_RAW. */
@@ -444,6 +445,7 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
 	.offset = 0xffffffff,
 	.limit = 0xffff,
 	.length = 0xffff,
+	.pattern = NULL,
 };
 #endif
 
@@ -1037,8 +1039,8 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
-	uint16_t queue[]; /**< Queues indices to use. */
+	uint16_t num; /**< Number of entries in @p queue. */
+	const uint16_t *queue; /**< Queue indices to use. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v3 05/16] ethdev: alter behavior of flow API actions
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 01/16] ethdev: add error types to flow API Adrien Mazarguil
  2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 04/16] ethdev: remove DUP action from " Adrien Mazarguil
@ 2018-04-10 16:36  1%     ` Adrien Mazarguil
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
                       ` (10 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Wenzhuo Lu, John Daley, Gaetan Rivet, Beilei Xing,
	Konstantin Ananyev, Nelio Laranjeiro, Andrew Rybchenko,
	Pascal Mazon

This patch makes the following changes to flow rule actions:

- List order now matters; actions are redefined as performed first to last
  instead of "all simultaneously".

- Repeated actions are now supported (e.g. specifying QUEUE multiple times
  now duplicates traffic among them). Previously only the last action of
  any given kind was taken into account.

- No more distinction between terminating/non-terminating/meta actions.
  Flow rules themselves are now defined as always terminating unless a
  PASSTHRU action is specified.

These changes alter the behavior of flow rules in corner cases in order to
prepare the flow API for actions that modify traffic contents or properties
(e.g. encapsulation, compression) and for which order matters when combined.

Previously one would have to do so through multiple flow rules by combining
PASSTHRU with priority levels; however, this proved overly complex to
implement at the PMD level, hence this simpler approach.
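
As a purely illustrative sketch of the new ordered semantics (the port,
pattern and queue indices below are hypothetical, and PMD support for
duplicating traffic through repeated QUEUE actions is assumed), a rule
counting matched packets before spreading them to two queues could look
like this:

#include <rte_flow.h>

static struct rte_flow *
count_then_duplicate(uint16_t port_id, struct rte_flow_error *error)
{
        struct rte_flow_attr attr = { .ingress = 1 };
        struct rte_flow_item pattern[] = {
                { .type = RTE_FLOW_ITEM_TYPE_ETH },
                { .type = RTE_FLOW_ITEM_TYPE_END },
        };
        struct rte_flow_action_queue q5 = { .index = 5 };
        struct rte_flow_action_queue q3 = { .index = 3 };
        struct rte_flow_action actions[] = {
                /* Performed first: count matched packets. */
                { .type = RTE_FLOW_ACTION_TYPE_COUNT },
                /* Then duplicate traffic to queues 5 and 3. */
                { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q5 },
                { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q3 },
                { .type = RTE_FLOW_ACTION_TYPE_END },
        };

        return rte_flow_create(port_id, &attr, pattern, actions, error);
}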

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

PMDs with rte_flow support are modified accordingly:

- bnxt: no change, implementation already forbids multiple actions and does
  not support PASSTHRU.

- e1000: no change, same as bnxt.

- enic: modified to forbid redundant actions, no support for default drop.

- failsafe: no change needed.

- i40e: no change, implementation already forbids multiple actions.

- ixgbe: same as i40e.

- mlx4: modified to forbid multiple fate-deciding actions and drop when
  unspecified.

- mlx5: same as mlx4, with other redundant actions also forbidden.

- sfc: same as mlx4.

- tap: implementation already complies with the new behavior except for
  the default pass-through, which is now modified into a default drop.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@oktetlabs.ru>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: John Daley <johndale@cisco.com>
Cc: Gaetan Rivet <gaetan.rivet@6wind.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 doc/guides/prog_guide/rte_flow.rst | 67 +++++++++++++-------------------
 drivers/net/enic/enic_flow.c       | 25 ++++++++++++
 drivers/net/mlx4/mlx4_flow.c       | 21 +++++++---
 drivers/net/mlx5/mlx5_flow.c       | 69 ++++++++++++++-------------------
 drivers/net/sfc/sfc_flow.c         | 22 +++++++----
 drivers/net/tap/tap_flow.c         | 11 ++++++
 lib/librte_ether/rte_flow.h        | 54 +++++++-------------------
 7 files changed, 138 insertions(+), 131 deletions(-)

diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a237e4fd2..80360d068 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -995,28 +995,27 @@ Actions
 
 Each possible action is represented by a type. Some have associated
 configuration structures. Several actions combined in a list can be assigned
-to a flow rule. That list is not ordered.
+to a flow rule and are performed in order.
 
 They fall in three categories:
 
-- Terminating actions that prevent processing matched packets by subsequent
-  flow rules, unless overridden with PASSTHRU.
+- Actions that modify the fate of matching traffic, for instance by dropping
+  or assigning it a specific destination.
 
-- Non-terminating actions that leave matched packets up for additional
-  processing by subsequent flow rules.
+- Actions that modify matching traffic contents or its properties. This
+  includes adding/removing encapsulation, encryption, compression and marks.
 
-- Other non-terminating meta actions that do not affect the fate of packets.
+- Actions related to the flow rule itself, such as updating counters or
+  making it non-terminating.
 
-When several actions are combined in a flow rule, they should all have
-different types (e.g. dropping a packet twice is not possible).
+Flow rules being terminating by default, not specifying any action of the
+fate kind results in undefined behavior. This applies to both ingress and
+egress.
 
-Only the last action of a given type is taken into account. PMDs still
-perform error checking on the entire list.
+PASSTHRU, when supported, makes a flow rule non-terminating.
 
 Like matching patterns, action lists are terminated by END items.
 
-*Note that PASSTHRU is the only action able to override a terminating rule.*
-
 Example of action that redirects packets to queue index 10:
 
 .. _table_rte_flow_action_example:
@@ -1029,12 +1028,11 @@ Example of action that redirects packets to queue index 10:
    | ``index`` | 10    |
    +-----------+-------+
 
-Action lists examples, their order is not significant, applications must
-consider all actions to be performed simultaneously:
+Actions are performed in list order:
 
-.. _table_rte_flow_count_and_drop:
+.. _table_rte_flow_count_then_drop:
 
-.. table:: Count and drop
+.. table:: Count then drop
 
    +-------+--------+
    | Index | Action |
@@ -1050,7 +1048,7 @@ consider all actions to be performed simultaneously:
 
 .. _table_rte_flow_mark_count_redirect:
 
-.. table:: Mark, count and redirect
+.. table:: Mark, count then redirect
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1080,12 +1078,15 @@ consider all actions to be performed simultaneously:
    | 2     | END                        |
    +-------+----------------------------+
 
-In the above example, considering both actions are performed simultaneously,
-the end result is that only QUEUE has any effect.
+In the above example, while DROP and QUEUE must be performed in order, both
+have to happen before reaching END. Only QUEUE has a visible effect.
+
+Note that such a list may be thought as ambiguous and rejected on that
+basis.
 
-.. _table_rte_flow_redirect_queue_3:
+.. _table_rte_flow_redirect_queue_5_3:
 
-.. table:: Redirect to queue 3
+.. table:: Redirect to queues 5 and 3
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1099,9 +1100,9 @@ the end result is that only QUEUE has any effect.
    | 3     | END                        |
    +-------+----------------------------+
 
-As previously described, only the last action of a given type found in the
-list is taken into account. The above example also shows that VOID is
-ignored.
+As previously described, all actions must be taken into account. This
+effectively duplicates traffic to both queues. The above example also shows
+that VOID is ignored.
 
 Action types
 ~~~~~~~~~~~~
@@ -1151,9 +1152,8 @@ PMDs.
 Action: ``PASSTHRU``
 ^^^^^^^^^^^^^^^^^^^^
 
-Leaves packets up for additional processing by subsequent flow rules. This
-is the default when a rule does not contain a terminating action, but can be
-specified to force a rule to become non-terminating.
+Leaves traffic up for additional processing by subsequent flow rules; makes
+a flow rule non-terminating.
 
 - No configurable properties.
 
@@ -1227,8 +1227,6 @@ Action: ``QUEUE``
 
 Assigns packets to a given queue index.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_queue:
 
 .. table:: QUEUE
@@ -1245,8 +1243,6 @@ Action: ``DROP``
 Drop packets.
 
 - No configurable properties.
-- Terminating by default.
-- PASSTHRU overrides this action if both are specified.
 
 .. _table_rte_flow_action_drop:
 
@@ -1309,8 +1305,6 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1331,7 +1325,6 @@ Action: ``PF``
 Redirects packets to the physical function (PF) of the current device.
 
 - No configurable properties.
-- Terminating by default.
 
 .. _table_rte_flow_action_pf:
 
@@ -1353,8 +1346,6 @@ ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1378,8 +1369,6 @@ action parameter. More than one flow can use the same MTR object through
 the meter action. The MTR object can be further updated or queried using
 the rte_mtr* API.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_meter:
 
 .. table:: METER
@@ -1415,8 +1404,6 @@ direction.
 
 Multiple flows can be configured to use the same security session.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_security:
 
 .. table:: SECURITY
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index b9f36587c..a5c6a1670 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -3,6 +3,7 @@
  */
 
 #include <errno.h>
+#include <stdint.h>
 #include <rte_log.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow_driver.h>
@@ -964,6 +965,9 @@ static int
 enic_copy_action_v1(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -975,6 +979,10 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			break;
@@ -984,6 +992,8 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!(overlap & FATE))
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_RQ_STEERING;
 	return 0;
 }
@@ -1001,6 +1011,9 @@ static int
 enic_copy_action_v2(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, MARK = 2, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -1009,6 +1022,10 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			enic_action->flags |= FILTER_ACTION_RQ_STEERING_FLAG;
@@ -1019,6 +1036,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			/* ENIC_MAGIC_FILTER_ID is reserved and is the highest
 			 * in the range of allows mark ids.
 			 */
@@ -1029,6 +1049,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 		case RTE_FLOW_ACTION_TYPE_FLAG: {
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			enic_action->filter_id = ENIC_MAGIC_FILTER_ID;
 			enic_action->flags |= FILTER_ACTION_FILTER_ID_FLAG;
 			break;
@@ -1044,6 +1067,8 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!(overlap & FATE))
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_V2;
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 4d26df326..582483076 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -637,6 +637,7 @@ mlx4_flow_prepare(struct priv *priv,
 	struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
 	struct rte_flow *flow = &temp;
 	const char *msg = NULL;
+	int overlap;
 
 	if (attr->group)
 		return rte_flow_error_set
@@ -656,6 +657,7 @@ mlx4_flow_prepare(struct priv *priv,
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
 			 NULL, "only ingress is supported");
 fill:
+	overlap = 0;
 	proc = mlx4_flow_proc_item_list;
 	/* Go over pattern. */
 	for (item = pattern; item->type; ++item) {
@@ -702,6 +704,16 @@ mlx4_flow_prepare(struct priv *priv,
 	}
 	/* Go over actions list. */
 	for (action = actions; action->type; ++action) {
+		/* This one may appear anywhere multiple times. */
+		if (action->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (overlap) {
+			msg = "cannot combine several fate-deciding actions,"
+				" choose between DROP, QUEUE or RSS";
+			goto exit_action_not_supported;
+		}
+		overlap = 1;
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
@@ -709,8 +721,6 @@ mlx4_flow_prepare(struct priv *priv,
 			uint64_t fields;
 			unsigned int i;
 
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			continue;
 		case RTE_FLOW_ACTION_TYPE_DROP:
 			flow->drop = 1;
 			break;
@@ -801,10 +811,9 @@ mlx4_flow_prepare(struct priv *priv,
 			goto exit_action_not_supported;
 		}
 	}
-	if (!flow->rss && !flow->drop)
-		return rte_flow_error_set
-			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-			 NULL, "no valid action");
+	/* When fate is unknown, drop traffic. */
+	if (!overlap)
+		flow->drop = 1;
 	/* Validation ends here. */
 	if (!addr) {
 		if (flow->rss)
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index f051fbef5..84d6f9b92 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -4,6 +4,7 @@
  */
 
 #include <sys/queue.h>
+#include <stdint.h>
 #include <string.h>
 
 /* Verbs header. */
@@ -638,6 +639,8 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			  struct rte_flow_error *error,
 			  struct mlx5_flow_parse *parser)
 {
+	enum { FATE = 1, MARK = 2, COUNT = 4, };
+	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
 	int ret;
 
@@ -654,39 +657,31 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			parser->drop = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
-			uint16_t n;
-			uint16_t found = 0;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			for (n = 0; n < parser->queues_n; ++n) {
-				if (parser->queues[n] == queue->index) {
-					found = 1;
-					break;
-				}
-			}
-			if (parser->queues_n > 1 && !found) {
-				rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions,
-					   "queue action not in RSS queues");
-				return -rte_errno;
-			}
-			if (!found) {
-				parser->queues_n = 1;
-				parser->queues[0] = queue->index;
-			}
+			parser->queues_n = 1;
+			parser->queues[0] = queue->index;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
 			uint16_t n;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!rss || !rss->num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,26 +689,6 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (parser->queues_n == 1) {
-				uint16_t found = 0;
-
-				assert(parser->queues_n);
-				for (n = 0; n < rss->num; ++n) {
-					if (parser->queues[0] ==
-					    rss->queue[n]) {
-						found = 1;
-						break;
-					}
-				}
-				if (!found) {
-					rte_flow_error_set(error, ENOTSUP,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "queue action not in RSS"
-						   " queues");
-					return -rte_errno;
-				}
-			}
 			if (rss->num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -747,6 +722,9 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			if (!mark) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -764,14 +742,23 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			parser->mark = 1;
 			parser->mark_id = mark->id;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_FLAG) {
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			parser->mark = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT &&
 			   priv->config.flow_counter_en) {
+			if (overlap & COUNT)
+				goto exit_action_overlap;
+			overlap |= COUNT;
 			parser->count = 1;
 		} else {
 			goto exit_action_not_supported;
 		}
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!(overlap & FATE))
+		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
 	if (!parser->queues_n && !parser->drop) {
@@ -784,6 +771,10 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
 			   actions, "action not supported");
 	return -rte_errno;
+exit_action_overlap:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "overlapping actions are not supported");
+	return -rte_errno;
 }
 
 /**
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index fe4c0b0c5..056405515 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1467,10 +1467,19 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 	}
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		/* This one may appear anywhere multiple times. */
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (is_specified) {
+			rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				 actions,
+				 "Cannot combine several fate-deciding actions,"
+				 " choose between QUEUE, RSS or DROP");
+			return -rte_errno;
+		}
 		switch (actions->type) {
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			break;
-
 		case RTE_FLOW_ACTION_TYPE_QUEUE:
 			rc = sfc_flow_parse_queue(sa, actions->conf, flow);
 			if (rc != 0) {
@@ -1512,11 +1521,10 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 		}
 	}
 
+	/* When fate is unknown, drop traffic. */
 	if (!is_specified) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ACTION_NUM, actions,
-				   "Action is unspecified");
-		return -rte_errno;
+		flow->spec.template.efs_dmaq_id =
+			EFX_FILTER_SPEC_RX_DMAQ_ID_DROP;
 	}
 
 	return 0;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 551b2d83d..aea3462a6 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1140,6 +1140,7 @@ priv_flow_process(struct pmd_internals *pmd,
 		else
 			goto end;
 	}
+actions:
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		int err = 0;
 
@@ -1222,6 +1223,16 @@ priv_flow_process(struct pmd_internals *pmd,
 		if (err)
 			goto exit_action_not_supported;
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!action) {
+		static const struct rte_flow_action drop[] = {
+			{ .type = RTE_FLOW_ACTION_TYPE_DROP, },
+			{ .type = RTE_FLOW_ACTION_TYPE_END, },
+		};
+
+		actions = drop;
+		goto actions;
+	}
 end:
 	if (flow)
 		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index aab637a2c..af9b14a4d 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -859,32 +859,28 @@ struct rte_flow_item {
  *
  * Each possible action is represented by a type. Some have associated
  * configuration structures. Several actions combined in a list can be
- * affected to a flow rule. That list is not ordered.
+ * assigned to a flow rule and are performed in order.
  *
  * They fall in three categories:
  *
- * - Terminating actions that prevent processing matched packets by
- *   subsequent flow rules, unless overridden with PASSTHRU.
+ * - Actions that modify the fate of matching traffic, for instance by
+ *   dropping or assigning it a specific destination.
  *
- * - Non terminating actions that leave matched packets up for additional
- *   processing by subsequent flow rules.
+ * - Actions that modify matching traffic contents or its properties. This
+ *   includes adding/removing encapsulation, encryption, compression and
+ *   marks.
  *
- * - Other non terminating meta actions that do not affect the fate of
- *   packets.
+ * - Actions related to the flow rule itself, such as updating counters or
+ *   making it non-terminating.
  *
- * When several actions are combined in a flow rule, they should all have
- * different types (e.g. dropping a packet twice is not possible).
+ * Flow rules being terminating by default, not specifying any action of the
+ * fate kind results in undefined behavior. This applies to both ingress and
+ * egress.
  *
- * Only the last action of a given type is taken into account. PMDs still
- * perform error checking on the entire list.
- *
- * Note that PASSTHRU is the only action able to override a terminating
- * rule.
+ * PASSTHRU, when supported, makes a flow rule non-terminating.
  */
 enum rte_flow_action_type {
 	/**
-	 * [META]
-	 *
 	 * End marker for action lists. Prevents further processing of
 	 * actions, thereby ending the list.
 	 *
@@ -893,8 +889,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_END,
 
 	/**
-	 * [META]
-	 *
 	 * Used as a placeholder for convenience. It is ignored and simply
 	 * discarded by PMDs.
 	 *
@@ -903,18 +897,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VOID,
 
 	/**
-	 * Leaves packets up for additional processing by subsequent flow
-	 * rules. This is the default when a rule does not contain a
-	 * terminating action, but can be specified to force a rule to
-	 * become non-terminating.
+	 * Leaves traffic up for additional processing by subsequent flow
+	 * rules; makes a flow rule non-terminating.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PASSTHRU,
 
 	/**
-	 * [META]
-	 *
 	 * Attaches an integer value to packets and sets PKT_RX_FDIR and
 	 * PKT_RX_FDIR_ID mbuf flags.
 	 *
@@ -923,8 +913,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_MARK,
 
 	/**
-	 * [META]
-	 *
 	 * Flags packets. Similar to MARK without a specific value; only
 	 * sets the PKT_RX_FDIR mbuf flag.
 	 *
@@ -949,9 +937,7 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_DROP,
 
 	/**
-	 * [META]
-	 *
-	 * Enables counters for this rule.
+	 * Enables counters for this flow rule.
 	 *
 	 * These counters can be retrieved and reset through rte_flow_query(),
 	 * see struct rte_flow_query_count.
@@ -1020,8 +1006,6 @@ struct rte_flow_action_mark {
  * RTE_FLOW_ACTION_TYPE_QUEUE
  *
  * Assign packets to a given queue index.
- *
- * Terminating by default.
  */
 struct rte_flow_action_queue {
 	uint16_t index; /**< Queue index to use. */
@@ -1050,8 +1034,6 @@ struct rte_flow_query_count {
  * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps
  * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only,
  * both can be requested simultaneously.
- *
- * Terminating by default.
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
@@ -1069,8 +1051,6 @@ struct rte_flow_action_rss {
  * and is not guaranteed to work properly if the VF part is matched by a
  * prior flow rule or if packets are not addressed to a VF in the first
  * place.
- *
- * Terminating by default.
  */
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
@@ -1085,8 +1065,6 @@ struct rte_flow_action_vf {
  *
  * Packets matched by items of this type can be either dropped or passed to the
  * next item with their color set by the MTR object.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_meter {
 	uint32_t mtr_id; /**< MTR object ID created with rte_mtr_create(). */
@@ -1116,8 +1094,6 @@ struct rte_flow_action_meter {
  * direction.
  *
  * Multiple flows can be configured to use the same security session.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_security {
 	void *security_session; /**< Pointer to security session structure. */
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v3 04/16] ethdev: remove DUP action from flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 01/16] ethdev: add error types to flow API Adrien Mazarguil
@ 2018-04-10 16:36  2%     ` Adrien Mazarguil
  2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
                       ` (11 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

Upcoming changes in relation to the handling of the actions list will make
the DUP action redundant, as specifying several QUEUE actions will achieve
the same behavior. Besides, no PMD implements this action.
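
As a hedged illustration (queue indices made up, assuming a PMD that
accepts repeated QUEUE actions under the new ordered semantics), a rule
previously written with DUP translates roughly as follows:

        /* Before: steer to queue 3 and duplicate to queue 6. */
        struct rte_flow_action_dup dup = { .index = 6 };
        struct rte_flow_action_queue queue = { .index = 3 };
        struct rte_flow_action old_actions[] = {
                { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
                { .type = RTE_FLOW_ACTION_TYPE_DUP, .conf = &dup },
                { .type = RTE_FLOW_ACTION_TYPE_END },
        };

        /* After: the same intent expressed with two QUEUE actions. */
        struct rte_flow_action_queue q3 = { .index = 3 };
        struct rte_flow_action_queue q6 = { .index = 6 };
        struct rte_flow_action new_actions[] = {
                { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q3 },
                { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q6 },
                { .type = RTE_FLOW_ACTION_TYPE_END },
        };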

By removing an entry from enum rte_flow_action_type, this patch breaks ABI
compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/cmdline_flow.c                 | 23 -----------------------
 app/test-pmd/config.c                       |  1 -
 doc/guides/prog_guide/rte_flow.rst          | 23 -----------------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  8 --------
 lib/librte_ether/rte_ethdev_version.map     |  2 +-
 lib/librte_ether/rte_flow.c                 |  1 -
 lib/librte_ether/rte_flow.h                 | 24 ------------------------
 7 files changed, 1 insertion(+), 81 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index ac4b51a8a..be867b0ec 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -164,8 +164,6 @@ enum index {
 	ACTION_QUEUE_INDEX,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
-	ACTION_DUP_INDEX,
 	ACTION_RSS,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
@@ -625,7 +623,6 @@ static const enum index next_action[] = {
 	ACTION_QUEUE,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
@@ -645,12 +642,6 @@ static const enum index action_queue[] = {
 	ZERO,
 };
 
-static const enum index action_dup[] = {
-	ACTION_DUP_INDEX,
-	ACTION_NEXT,
-	ZERO,
-};
-
 static const enum index action_rss[] = {
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
@@ -1597,20 +1588,6 @@ static const struct token token_list[] = {
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
-	[ACTION_DUP] = {
-		.name = "dup",
-		.help = "duplicate packets to a given queue index",
-		.priv = PRIV_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
-		.next = NEXT(action_dup),
-		.call = parse_vc,
-	},
-	[ACTION_DUP_INDEX] = {
-		.name = "index",
-		.help = "queue index to duplicate packets to",
-		.next = NEXT(action_dup, NEXT_ENTRY(UNSIGNED)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_dup, index)),
-		.call = parse_vc_conf,
-	},
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 7ae0295f6..8d42ea9a9 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1049,7 +1049,6 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 51826d04c..a237e4fd2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1299,26 +1299,6 @@ Query structure to retrieve and reset flow rule counters:
    | ``bytes``     | out | number of bytes through this rule |
    +---------------+-----+-----------------------------------+
 
-Action: ``DUP``
-^^^^^^^^^^^^^^^
-
-Duplicates packets to a given queue index.
-
-This is normally combined with QUEUE, however when used alone, it is
-actually similar to QUEUE + PASSTHRU.
-
-- Non-terminating by default.
-
-.. _table_rte_flow_action_dup:
-
-.. table:: DUP
-
-   +-----------+------------------------------------+
-   | Field     | Value                              |
-   +===========+====================================+
-   | ``index`` | queue index to duplicate packet to |
-   +-----------+------------------------------------+
-
 Action: ``RSS``
 ^^^^^^^^^^^^^^^
 
@@ -2010,9 +1990,6 @@ Unsupported actions
   and tagging (`Action: MARK`_ or `Action: FLAG`_) may be implemented in
   software as long as the target queue is used by a single rule.
 
-- A rule specifying both `Action: DUP`_ + `Action: QUEUE`_ may be translated
-  to two hidden rules combining `Action: QUEUE`_ and `Action: PASSTHRU`_.
-
 - When a single target queue is provided, `Action: RSS`_ can also be
   implemented through `Action: QUEUE`_.
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index cb6f201e1..a015d02a4 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3363,10 +3363,6 @@ actions can sometimes be combined when the end result is unambiguous::
 
 ::
 
-   drop / dup index 6 / end # same as above
-
-::
-
    queue index 6 / rss queues 6 7 8 / end # queue has no effect
 
 ::
@@ -3400,10 +3396,6 @@ This section lists supported actions and their attributes, if any.
 
 - ``count``: enable counters for this rule.
 
-- ``dup``: duplicate packets to a given queue index.
-
-  - ``index {unsigned}``: queue index to duplicate packets to.
-
 - ``rss``: spread packets among several queues.
 
   - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index e915e7929..8f1ae5ed2 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -147,7 +147,6 @@ DPDK_17.08 {
 
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
-	rte_flow_copy;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -199,6 +198,7 @@ DPDK_18.02 {
 DPDK_18.05 {
 	global:
 
+	rte_flow_copy;
 	rte_flow_create;
 	rte_flow_destroy;
 	rte_flow_error_set;
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index ba6feddee..db04c4f94 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -73,7 +73,6 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 36fd38ffa..aab637a2c 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -961,16 +961,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_COUNT,
 
 	/**
-	 * Duplicates packets to a given queue index.
-	 *
-	 * This is normally combined with QUEUE, however when used alone, it
-	 * is actually similar to QUEUE + PASSTHRU.
-	 *
-	 * See struct rte_flow_action_dup.
-	 */
-	RTE_FLOW_ACTION_TYPE_DUP,
-
-	/**
 	 * Similar to QUEUE, except RSS is additionally performed on packets
 	 * to spread them among several queues according to the provided
 	 * parameters.
@@ -1052,20 +1042,6 @@ struct rte_flow_query_count {
 };
 
 /**
- * RTE_FLOW_ACTION_TYPE_DUP
- *
- * Duplicates packets to a given queue index.
- *
- * This is normally combined with QUEUE, however when used alone, it is
- * actually similar to QUEUE + PASSTHRU.
- *
- * Non-terminating by default.
- */
-struct rte_flow_action_dup {
-	uint16_t index; /**< Queue index to duplicate packets to. */
-};
-
-/**
  * RTE_FLOW_ACTION_TYPE_RSS
  *
  * Similar to QUEUE, except RSS is additionally performed on packets to
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v3 01/16] ethdev: add error types to flow API
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
@ 2018-04-10 16:36  3%     ` Adrien Mazarguil
  2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 04/16] ethdev: remove DUP action from " Adrien Mazarguil
                       ` (12 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

These enable more precise reporting of objects responsible for errors.
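
For instance (a sketch only, with a made-up validation helper), a PMD can
now point at the exact offending part of a pattern item:

#include <errno.h>

#include <rte_errno.h>
#include <rte_flow.h>

static int
example_check_item(const struct rte_flow_item *item,
                   struct rte_flow_error *error)
{
        /* Hypothetical constraint: this item needs both spec and mask. */
        if (item->spec == NULL) {
                rte_flow_error_set(error, EINVAL,
                                   RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
                                   item, "missing item specification");
                return -rte_errno;
        }
        if (item->mask == NULL) {
                rte_flow_error_set(error, EINVAL,
                                   RTE_FLOW_ERROR_TYPE_ITEM_MASK,
                                   item, "missing item mask");
                return -rte_errno;
        }
        return 0;
}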

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_destroy()
- rte_flow_error_set()
- rte_flow_flush()
- rte_flow_isolate()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 app/test-pmd/config.c                   |  4 ++++
 lib/librte_ether/rte_ethdev_version.map | 20 +++++++++++++-------
 lib/librte_ether/rte_flow.h             |  4 ++++
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 2058e6ec8..7ae0295f6 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1228,8 +1228,12 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
+		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
+		[RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
+		[RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
 		[RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
 		[RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
+		[RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
 		[RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
 	};
 	const char *errstr;
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index 34df6c8b5..e915e7929 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -127,11 +127,6 @@ DPDK_17.02 {
 
 	_rte_eth_dev_reset;
 	rte_eth_dev_fw_version_get;
-	rte_flow_create;
-	rte_flow_destroy;
-	rte_flow_flush;
-	rte_flow_query;
-	rte_flow_validate;
 
 } DPDK_16.07;
 
@@ -153,7 +148,6 @@ DPDK_17.08 {
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
 	rte_flow_copy;
-	rte_flow_isolate;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -192,7 +186,6 @@ DPDK_17.11 {
 	rte_eth_dev_get_sec_ctx;
 	rte_eth_dev_pool_ops_supported;
 	rte_eth_dev_reset;
-	rte_flow_error_set;
 
 } DPDK_17.08;
 
@@ -203,6 +196,19 @@ DPDK_18.02 {
 
 } DPDK_17.11;
 
+DPDK_18.05 {
+	global:
+
+	rte_flow_create;
+	rte_flow_destroy;
+	rte_flow_error_set;
+	rte_flow_flush;
+	rte_flow_isolate;
+	rte_flow_query;
+	rte_flow_validate;
+
+} DPDK_18.02;
+
 EXPERIMENTAL {
 	global:
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index cdaaa3a5b..95799fd9c 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1186,8 +1186,12 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
+	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
+	RTE_FLOW_ERROR_TYPE_ITEM_LAST, /**< Item specification range. */
+	RTE_FLOW_ERROR_TYPE_ITEM_MASK, /**< Item specification mask. */
 	RTE_FLOW_ERROR_TYPE_ITEM, /**< Specific pattern item. */
 	RTE_FLOW_ERROR_TYPE_ACTION_NUM, /**< Number of actions. */
+	RTE_FLOW_ERROR_TYPE_ACTION_CONF, /**< Action configuration. */
 	RTE_FLOW_ERROR_TYPE_ACTION, /**< Specific action. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (12 preceding siblings ...)
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 15/15] ethdev: add port ID item and " Adrien Mazarguil
@ 2018-04-10 16:36  4%   ` Adrien Mazarguil
  2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 01/16] ethdev: add error types to flow API Adrien Mazarguil
                       ` (13 more replies)
  13 siblings, 14 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:36 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

As summarized in a prior RFC [1], the flow API (rte_flow) was chosen as a
means to manage switch offloads supported by many devices (usually going by
names such as E-Switch or vSwitch) through user-specified flow rules.

Combined with the need to support encap/decap actions, this requires a
change in the way flow actions are processed (in order and possibly
repeated), which modifies the behavior of some of the existing actions, thus
warranting a major ABI breakage.

Given this ABI breakage is also required by other work submitted for the
current release [2][3], this series addresses various longstanding issues
with the flow API and makes minor improvements in preparation for upcoming
features.

Changes summary:

- Additional error types.
- Clearer documentation.
- Improved C++ compatibility.
- Exhaustive RSS action.
- Consistent behavior of VLAN pattern item.
- New "transfer" attribute bringing consistency to VF/PF pattern items.
- Confusing "PORT" pattern item renamed "PHY_PORT", with new action
  counterpart.
- New "PORT_ID" pattern item and action to be used with port representors.

This series piggybacks on the major ABI update introduced by a prior
commit [4] for DPDK 18.05 and depends on several fixes [5] which must be
applied first.

[1] "[RFC] Switch device offload with DPDK"
    http://dpdk.org/ml/archives/dev/2018-March/092513.html

[2] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

[3] "[PATCH v1 00/21] MLX5 tunnel Rx offloading"
    http://dpdk.org/ml/archives/dev/2018-March/092264.html

[4] commit 653e038efc9b ("ethdev: remove versioning of filter control
    function")

[5] "[PATCH v4 00/11] Bunch of flow API-related fixes"
    http://dpdk.org/ml/archives/dev/2018-April/096509.html

v3 changes:

- Rebased series, fixed latest conflicts.
- Addressed Andrew's comments, see affected patches for details:
  - Empty RSS types in flow rule means PMD-specific RSS instead of no RSS.
  - RSS hash function now explicitly compared against
    RTE_ETH_HASH_FUNCTION_DEFAULT instead of 0 in all PMDs.
  - sfc PMD updated to also accept Toeplitz.
  - Implicit VLAN TPID matching now removed from all PMDs.
  - Default mask update for VLAN TCI now split as a separate patch #11.
  - Ingress/egress definition clarified in patch #12.

v2 changes:

- Squashed "ethdev: update ABI for flow API functions" in subsequent
  patches.
- Emphasized ABI impact in relevant commit logs.
- Modified documentation in "ethdev: alter behavior of flow API actions" to
  describe how terminating flow rules without any action of the fate kind
  result in undefined behavior instead of dropping traffic.
- Fixed other minor documentation formatting issues.
- Modified "ethdev: refine TPID handling in flow API" as follows:
  - Using standard macro definitions for VLAN, QinQ and E-Tag EtherTypes.
  - Fixed endian conversion in sfc.
  - Replaced a condition in VLAN pattern item processing with an assertion
    check for i40e.

Adrien Mazarguil (16):
  ethdev: add error types to flow API
  ethdev: clarify flow API pattern items and actions
  doc: remove flow API migration section
  ethdev: remove DUP action from flow API
  ethdev: alter behavior of flow API actions
  ethdev: remove C99 flexible arrays from flow API
  ethdev: flatten RSS configuration in flow API
  ethdev: add hash function to RSS flow API action
  ethdev: add encap level to RSS flow API action
  ethdev: refine TPID handling in flow API
  ethdev: limit default VLAN TCI mask in flow API
  ethdev: add transfer attribute to flow API
  ethdev: update behavior of VF/PF in flow API
  ethdev: rename physical port item in flow API
  ethdev: add physical port action to flow API
  ethdev: add port ID item and action to flow API

 app/test-pmd/cmdline_flow.c                 | 394 +++++++++++----
 app/test-pmd/config.c                       |  78 +--
 doc/guides/nics/tap.rst                     |   2 +-
 doc/guides/prog_guide/rte_flow.rst          | 618 ++++++++---------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  60 ++-
 drivers/net/bnxt/bnxt_filter.c              |  49 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  83 ++-
 drivers/net/e1000/igb_rxtx.c                |  55 +-
 drivers/net/enic/enic_flow.c                |  50 +-
 drivers/net/i40e/i40e_ethdev.c              |  57 ++-
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                | 140 +++--
 drivers/net/ixgbe/ixgbe_ethdev.c            |   7 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  91 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  55 +-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                | 117 +++--
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 316 ++++++------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +-
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +-
 drivers/net/mvpp2/mrvl_flow.c               |  32 +-
 drivers/net/sfc/sfc_flow.c                  |  78 ++-
 drivers/net/tap/tap_flow.c                  |  49 +-
 examples/ipsec-secgw/ipsec.c                |  21 +-
 lib/librte_ether/rte_ethdev_version.map     |  22 +-
 lib/librte_ether/rte_flow.c                 |  68 +--
 lib/librte_ether/rte_flow.h                 | 339 ++++++++-----
 lib/librte_net/rte_ether.h                  |   1 +
 34 files changed, 1756 insertions(+), 1127 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v4 00/11] Bunch of flow API-related fixes
  2018-04-06 13:22  3%   ` [dpdk-dev] [PATCH v3 00/11] Bunch of flow API-related fixes Adrien Mazarguil
@ 2018-04-10 16:34  3%     ` Adrien Mazarguil
  2018-04-16 16:21  3%       ` [dpdk-dev] [PATCH v5 " Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 16:34 UTC (permalink / raw)
  To: dev

This series contains several fixes for rte_flow and its implementation in
PMDs and testpmd. Upcoming work on the flow API depends on it.

v4 changes:

- Rebased again.
- The reliance on rte_eth_dev_rss_hash_conf_get() was removed from patch #7;
  see the updated patch for details.

v3 changes:

- Rebased series.
- Dropped unnecessary "net/sfc: fix endian conversions in flow API".
- Dropped "ethdev: fix ABI version in meson build", handled by prior commit
  d9736a248785 ("ethdev: fix library version in meson build").

v2 changes:

- mlx5 fix (patch #3).
- bnxt fix (patch #4).
- sfc fix (patch #6).
- Missing include (patch #13).

Adrien Mazarguil (11):
  net/mlx4: fix RSS resource leak in case of error
  net/mlx4: fix ignored RSS hash types
  net/mlx5: fix RSS flow action bounds check
  net/bnxt: fix matching of flow API item masks
  app/testpmd: fix flow completion for RSS queues
  app/testpmd: fix lack of flow action configuration
  app/testpmd: fix RSS flow action configuration
  app/testpmd: fix missing RSS fields in flow action
  ethdev: fix shallow copy of flow API RSS action
  ethdev: fix missing boolean values in flow command
  ethdev: fix missing include in flow API

 app/test-pmd/cmdline.c                      |   2 +
 app/test-pmd/cmdline_flow.c                 | 252 ++++++++++++++++++++---
 app/test-pmd/config.c                       | 160 +++++++++-----
 app/test-pmd/testpmd.h                      |  13 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   8 +
 drivers/net/bnxt/bnxt_filter.c              |  14 +-
 drivers/net/mlx4/mlx4_flow.c                |  17 +-
 drivers/net/mlx5/mlx5_flow.c                |   9 +
 lib/librte_ether/rte_flow.c                 | 145 +++++++++----
 lib/librte_ether/rte_flow.h                 |   2 +
 10 files changed, 494 insertions(+), 128 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH] app/test: enhance power manager unit tests
  @ 2018-04-10 14:19  3% ` Hunt, David
  0 siblings, 0 replies; 200+ results
From: Hunt, David @ 2018-04-10 14:19 UTC (permalink / raw)
  To: Reshma Pattan, dev; +Cc: jananeex.m.parthasarathy

Hi Reshma,


On 6/4/2018 2:51 PM, Reshma Pattan wrote:
> Unit Testcases are added for power_acpi_cpu_freq,
> power_kvm_vm_test to improve coverage
>
> Signed-off-by: Jananee Parthasarathy <jananeex.m.parthasarathy@intel.com>
> ---
>   test/test/test_power_acpi_cpufreq.c |  2 +-
>   test/test/test_power_kvm_vm.c       | 62 +++++++++++++++++++++++++++++++++----
>   2 files changed, 57 insertions(+), 7 deletions(-)
>
> diff --git a/test/test/test_power_acpi_cpufreq.c b/test/test/test_power_acpi_cpufreq.c
> index 3bfd033..8da2dcc 100644
> --- a/test/test/test_power_acpi_cpufreq.c
> +++ b/test/test/test_power_acpi_cpufreq.c
> @@ -27,7 +27,7 @@
>   #define TEST_POWER_FREQS_NUM_MAX ((unsigned)RTE_MAX_LCORE_FREQS)
>   
>   #define TEST_POWER_SYSFILE_CUR_FREQ \
> -	"/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq"
> +	"/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_cur_freq"

This change is OK with me. From what I can see, using cpuinfo_cur_freq
instead of scaling_cur_freq gives us more compatibility on a wider
selection of operating systems.

>   
>   static uint32_t total_freq_num;
>   static uint32_t freqs[TEST_POWER_FREQS_NUM_MAX];
> diff --git a/test/test/test_power_kvm_vm.c b/test/test/test_power_kvm_vm.c
> index 91b31c4..012ad82 100644
> --- a/test/test/test_power_kvm_vm.c
> +++ b/test/test/test_power_kvm_vm.c
> @@ -25,12 +25,19 @@
>   #define TEST_POWER_VM_LCORE_ID            0U
>   #define TEST_POWER_VM_LCORE_OUT_OF_BOUNDS (RTE_MAX_LCORE+1)
>   #define TEST_POWER_VM_LCORE_INVALID       1U
> +#define TEMP_POWER_MANAGER_FILE_PATH  "/tmp/testpm"
> +
> +int guest_channel_host_connect(const char *path, unsigned int lcore_id);
> +int power_kvm_vm_enable_turbo(unsigned int lcore_id);
> +int power_kvm_vm_disable_turbo(unsigned int lcore_id);

I see here you are calling guest_channel_host_connect to "emulate" a
virtio-serial connection to a host. While I am not a huge fan of faking
functionality, I feel that having these unit tests check ABI and API
breakages is more beneficial, so I'm good with it for this reason.

However, there's no need to have the power_kvm_vm_enable/disable_turbo()
prototypes, as you can just use rte_power_freq_enable_turbo() and
rte_power_freq_disable_turbo(), which in turn call
power_kvm_vm_enable_turbo() and power_kvm_vm_disable_turbo().
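
Something along these lines (untested sketch) should work without the
extra prototypes:

        /* Test KVM_VM Enable Turbo of valid core */
        ret = rte_power_freq_enable_turbo(TEST_POWER_VM_LCORE_ID);
        if (ret == -1) {
                printf("rte_power_freq_enable_turbo failed on valid lcore %u\n",
                        TEST_POWER_VM_LCORE_ID);
                goto fail_all;
        }

        /* Test KVM_VM Disable Turbo of valid core */
        ret = rte_power_freq_disable_turbo(TEST_POWER_VM_LCORE_ID);
        if (ret == -1) {
                printf("rte_power_freq_disable_turbo failed on valid lcore %u\n",
                        TEST_POWER_VM_LCORE_ID);
                goto fail_all;
        }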

>   
>   static int
>   test_power_kvm_vm(void)
>   {
>   	int ret;
>   	enum power_management_env env;
> +	char fPath[PATH_MAX];
> +	FILE *fPtr = NULL;
>   
>   	ret = rte_power_set_env(PM_ENV_KVM_VM);
>   	if (ret != 0) {
> @@ -95,12 +102,31 @@
>   	/* Test initialisation of a valid lcore */
>   	ret = rte_power_init(TEST_POWER_VM_LCORE_ID);
>   	if (ret < 0) {
> -		printf("Cannot initialise power management for lcore %u, this "
> -				"may occur if environment is not configured "
> -				"correctly(KVM VM) or operating in another valid "
> -				"Power management environment\n", TEST_POWER_VM_LCORE_ID);
> -		rte_power_unset_env();
> -		return -1;
> +		printf("rte_power_init failed as expected in host\n");
> +		/* This test would be successful when run on VM,
> +		 * in order to run in Host itself, temporary file path
> +		 * is created and same is used for further communication
> +		 */
> +
> +		snprintf(fPath, PATH_MAX, "%s.%u",
> +			TEMP_POWER_MANAGER_FILE_PATH, TEST_POWER_VM_LCORE_ID);
> +		fPtr = fopen(fPath, "w");
> +		if (fPtr == NULL) {
> +			printf(" Unable to create file\n");
> +			rte_power_unset_env();
> +			return -1;
> +		}
> +		ret = guest_channel_host_connect(TEMP_POWER_MANAGER_FILE_PATH,
> +			TEST_POWER_VM_LCORE_ID);
> +		if (ret == 0)
> +			printf("guest_channel_host_connect successful\n");
> +		else {
> +			printf("guest_channel_host_connect failed\n");
> +			rte_power_unset_env();
> +			fclose(fPtr);
> +			remove(fPath);
> +			return -1;
> +		}
>   	}
>   
>   	/* Test initialisation of previously initialised lcore */
> @@ -175,6 +201,22 @@
>   		goto fail_all;
>   	}
>   
> +	/* Test KVM_VM Enable Turbo of valid core */
> +	ret = power_kvm_vm_enable_turbo(TEST_POWER_VM_LCORE_ID);

see comment above about using rte_power_freq_enable_turbo()

> +	if (ret == -1) {
> +		printf("power_kvm_vm_enable_turbo failed on valid lcore"
> +			"%u\n", TEST_POWER_VM_LCORE_ID);
> +		goto fail_all;
> +	}
> +
> +	/* Test KVM_VM Disable Turbo of valid core */
> +	ret = power_kvm_vm_disable_turbo(TEST_POWER_VM_LCORE_ID);

see comment above about using rte_power_freq_disable_turbo()

> +	if (ret == -1) {
> +		printf("power_kvm_vm_disable_turbo failed on valid lcore"
> +		"%u\n", TEST_POWER_VM_LCORE_ID);
> +		goto fail_all;
> +	}
> +
>   	/* Test frequency up of valid lcore */
>   	ret = rte_power_freq_up(TEST_POWER_VM_LCORE_ID);
>   	if (ret != 1) {
> @@ -274,10 +316,18 @@
>   		return -1;
>   	}
>   	rte_power_unset_env();
> +	if (fPtr != NULL) {
> +		fclose(fPtr);
> +		remove(fPath);
> +	}
>   	return 0;
>   fail_all:
>   	rte_power_exit(TEST_POWER_VM_LCORE_ID);
>   	rte_power_unset_env();
> +	if (fPtr != NULL) {
> +		fclose(fPtr);
> +		remove(fPath);
> +	}
>   	return -1;
>   }
>   #endif

With the changes described above:

Acked-by: David Hunt <david.hunt@intel.com>

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 17:26  0%           ` Dumitrescu, Cristian
@ 2018-04-10 12:32  0%             ` Van Haaren, Harry
  0 siblings, 0 replies; 200+ results
From: Van Haaren, Harry @ 2018-04-10 12:32 UTC (permalink / raw)
  To: Dumitrescu, Cristian, Neil Horman
  Cc: dev, Ananyev, Konstantin, Stephen Hemminger, Singh, Jasvinder,
	Richardson, Bruce

+CC Neil from other reply

> From: Dumitrescu, Cristian
> Sent: Monday, April 9, 2018 6:27 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Van Haaren, Harry
> <harry.van.haaren@intel.com>; Stephen Hemminger <stephen@networkplumber.org>;
> Singh, Jasvinder <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Cc: dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> 
> > >
> > > If people think that this function conversion is not nice, it can be
> reworked
> > in multiple ways at the expense of API (but not ABI) change:
> > > 1. Define the hash function field in the table parameter structure as
> > opaque void * rather than 4-parameter version.
> > > 2. Create a separate parameter structure just for this hash table type.
> >
> > Why just not define your f_hash member as a union:
> >
> > struct rte_table_hash_params {
> > ...
> > union {
> >     rte_table_hash_op_hash  f_hash_4params;
> >     rte_hash_function f_hash_3_params;
> > };
> >
> > ?
> >
> 
> Yes, agreed, this is yet another way to handle this, thanks Konstantin.

Agree that this solution is a lot better than raw casting.

The issue I have with casting is that it doesn't explicitly show that the signature is different, and that the code must be aware of that fact. With a union, the code at least explicitly states that there is a difference in signature and that it is being handled, so this looks like a better solution.
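
As a rough illustration, the cuckoo flavor could then pick the 3-parameter member
directly at the call site, leaving no function-pointer cast for gcc 8 to flag
(member names as in the union snippet quoted above; this is a sketch, not the
actual patch):

	struct rte_hash_parameters hash_cuckoo_params = {
		.entries = p->n_keys,
		.key_len = p->key_size,
		.hash_func = p->f_hash_3_params,	/* 3-parameter librte_hash form */
		.hash_func_init_val = p->seed,
		.socket_id = socket_id,
		.name = p->name
	};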

Neil proposed an alternative solution using a bit to indicate calling params in a separate reply - another possibility.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 16:38  4%     ` Van Haaren, Harry
  2018-04-09 16:43  0%       ` Ferruh Yigit
  2018-04-09 17:02  4%       ` Dumitrescu, Cristian
@ 2018-04-10 11:43  0%       ` Neil Horman
  2 siblings, 0 replies; 200+ results
From: Neil Horman @ 2018-04-10 11:43 UTC (permalink / raw)
  To: Van Haaren, Harry
  Cc: Dumitrescu, Cristian, Stephen Hemminger, Singh, Jasvinder,
	Richardson, Bruce, dev

On Mon, Apr 09, 2018 at 04:38:11PM +0000, Van Haaren, Harry wrote:
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu, Cristian
> > Sent: Monday, April 9, 2018 4:59 PM
> > To: Stephen Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
> > <jasvinder.singh@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> > 
> > 
> > 
> > > -----Original Message-----
> > > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > > Sent: Monday, April 9, 2018 4:10 PM
> > > To: Singh, Jasvinder <jasvinder.singh@intel.com>
> > > Cc: dev@dpdk.org; Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> > > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> > >
> > > On Mon,  9 Apr 2018 13:49:48 +0100
> > > Jasvinder Singh <jasvinder.singh@intel.com> wrote:
> > >
> > > > Fix build error with gcc 8.0 due to cast between function types.
> > > > Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
> > > >
> > > > Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > > > ---
> > > >  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
> > > >  1 file changed, 3 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
> > > b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > index dcb4fe9..f7eae27 100644
> > > > --- a/lib/librte_table/rte_table_hash_cuckoo.c
> > > > +++ b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void *params,
> > > >  		return NULL;
> > > >  	}
> > > >
> > > > +	void *hash_func = p->f_hash;
> > > > +
> > > >  	/* Create cuckoo hash table */
> > > >  	struct rte_hash_parameters hash_cuckoo_params = {
> > > >  		.entries = p->n_keys,
> > > >  		.key_len = p->key_size,
> > > > -		.hash_func = (rte_hash_function)(p->f_hash),
> > > > +		.hash_func = (rte_hash_function) hash_func,
> > > >  		.hash_func_init_val = p->seed,
> > > >  		.socket_id = socket_id,
> > > >  		.name = p->name
> > >
> > > This is just tricking the compiler into not complaining.
> > > I would really rather see the two hash functions made the same.
> > 
> > (Adding Bruce as well to consolidate all conversations in a single thread.)
> > 
> > What we want to do here is be able to use the librte_hash under the same API
> > as the several hash table flavors implemented in librte_table.
> > 
> > Both of these libraries allow configuring the hash function per each hash
> > table instance. Problem is: hash function in librte_hash has only 3 parameters
> > (no key mask), while hash function in librte_table has 4 parameters (includes
> > key mask). The key mask helps a lot for practical protocol implementations by
> > avoiding key copy & pre-process on lookup.
> > 
> > So then: how to plug in librte_hash under the same API as the suite of hash
> > tables in librte_table? We don't want to re-implement cuckoo hash from
> > librte_hash, we simply want to invoke it as a low-level primitive, similarly
> > to how the LPM and ACL tables are plugged into librte_table.
> > 
> > Solution is: as an exception, pass a 3-parameter hash function to cuckoo hash
> > flavor under the librte_table. Maybe this should be documented better. This
> > currently triggers a build warning with gcc 8, which is easy to fix, hence
> > this trivial patch.
> > 
> > Ideally, for every 3-parameter hash function, I would like to generate the
> > corresponding 4-parameter hash function on-the-fly, but unfortunately this is
> > not what C language can do.
> > 
> > Of course, IMO the best solution is to add key mask support to librte_hash.
> 
> 
> Looking at the previous discussion I see the following as a possible solution;
> 
> Given the current code looks broken it should be fixed in this release.
> Given the actual code fix is an API / ABI break (depending on solution) it cannot be merged official in this release.
> We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at compile time.
> 
> With the above 3 points, I think the best solution is to correctly fix the problem that GCC 8 is identifying, and putting that new API inside the NEXT_ macros.
> 
> In this case, we can preserve backwards (buggy) behavior if required, and provide correct (but API/ABI breaking) code as well. This is a tough decision - particularly for distros - what do they package?
> 
> Given the current code, I don't see a better solution - but I hope I'm wrong :)
> 
Why not make the hash_func pointer in the rte_hash_parameters structure an
anonymous union, and reserve a bit in the extra_flag field to denote if the
function pointer has 3 arguments or 4?  Then rte_hash_hash can use the
appropriate calling convention on hash_func.
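
A rough sketch of that idea (the flag name, the 4-parameter union member and the
fields read inside rte_hash_hash() are invented here for illustration, they are
not existing rte_hash API):

	/* hypothetical extra_flag bit: "hash_func takes a key mask" */
	#define RTE_HASH_EXTRA_FLAGS_KEY_MASK_FUNC 0x10

	struct rte_hash_parameters {
		/* ... existing fields ... */
		union {
			rte_hash_function hash_func;           /* 3 parameters */
			rte_table_hash_op_hash hash_func_mask; /* 4 parameters, with key mask */
		};
		uint8_t extra_flag;
	};

	/* rte_hash_hash() then picks the calling convention, roughly: */
	if (h->use_key_mask_func)	/* set from extra_flag at creation time */
		sig = h->hash_func_mask(key, h->key_mask, h->key_len,
					h->hash_func_init_val);
	else
		sig = h->hash_func(key, h->key_len, h->hash_func_init_val);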

Neil

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions
  2018-04-10 10:19  0%       ` Adrien Mazarguil
@ 2018-04-10 11:06  0%         ` Shahaf Shuler
  0 siblings, 0 replies; 200+ results
From: Shahaf Shuler @ 2018-04-10 11:06 UTC (permalink / raw)
  To: Adrien Mazarguil, Mohammad Abdul Awal; +Cc: Declan Doherty, dev, Alex Rosenbaum

Hi,

Adding small comment on top of Adrien's

Tuesday, April 10, 2018 1:20 PM, Adrien Mazarguil:
> On Mon, Apr 09, 2018 at 05:10:35PM +0100, Mohammad Abdul Awal wrote:
> > On 06/04/2018 21:26, Adrien Mazarguil wrote:
> > > On Fri, Apr 06, 2018 at 01:24:01PM +0100, Declan Doherty wrote:
> > > > Add new flow action types and associated action data structures to
> > > > support the encapsulation and decapsulation of the virtual tunnel
> > > > endpoints.
> > > >
> > > > The RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP action will cause the
> > > > matching flow to be encapsulated in the virtual tunnel endpoint
> > > > overlay defined in the tunnel_encap action data.
> > > >
> > > > The RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP action will cause all
> > > > virtual tunnel endpoint overlays up to and including the first
> > > > instance of the flow item type defined in the tunnel_decap action
> > > > data for the matching flows.
> > > >
> > > > Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> > > This generic approach looks flexible enough to cover the use cases
> > > that immediately come to mind (VLAN, VXLAN), its design is sound.
> > >
> > > However, while I'm aware it's not a concern at this point, it won't
> > > be able to deal with stateful tunnel or encapsulation types (e.g.
> > > IPsec or TCP) which will require additional meta data or some
> > > run-time assistance from the application.
> > >
> > > Eventually for more complex use cases, dedicated encap/decap actions
> > > will have to appear, so the issue I wanted to raise before going further is
> this:
> > >
> > > Going generic inevitably trades some of the usability; flat
> > > structures dedicated to VXLAN encap/decap with only the needed info
> > > to get the job done would likely be easier to implement in PMDs and
> > > use in applications. Any number of such actions can be added to rte_flow
> without ABI impact.
> > >
> > > If VXLAN is the only use case at this point, my suggestion would be
> > > to go with simpler RTE_FLOW_ACTION_TYPE_VXLAN_(ENCAP|DECAP)
> actions,
> > > with fixed
> > > L2/L3/L4/L5 header definitions to prepend according to RFC 7348.
> > We can go this way and this will increase the action for more and more
> > tunneling protocols being added. Current proposal is already a generic
> > approach which specifies as a tunnel for all the tunneling protocols.
> 
> Right, on the other hand there are not that many standard encapsulations
> offloaded by existing devices. rte_flow could easily handle dedicated actions
> for each of them without problem.
> 
> My point is that many of those (will eventually) have their own quirks to
> manage when doing encap/decap, it's not just a matter of prepending or
> removing a bunch of header definitions, otherwise we could as well let
> applications simply provide an arbitrary buffer to prepend.
> 
> Consider that the "generic" part is already built into rte_flow as the way
> patterns and action are handled. Adding another generic layer on top of that
> could make things more inconvenient than necessary to applications (my
> main concern).
> 
> You'd need another layer of validation/error reporting machinery to properly
> let applications know they cannot encap VXLAN on top of TCP on top of
> QinQinQinQinQ for instance. Either a single bounded encapsulation definition
> or a combination at the action list level is needed to avoid that.
> 
> > > Now we can start with the generic approach, see how it fares and add
> > > dedicated encap/decap later as needed.
> > >
> > > More comments below.
> <snip>
> > > > +Action: ``TUNNEL_ENCAP``
> > > > +^^^^^^^^^^^^^^^^^^^^^^

The ENCAP/DECAP actions don't have to be in the context of a tunnel.
For example, take GRE: an application may want to decap the GRE and encap it with L2. The L2 encapsulation is not related to any tunnel.
Same for the other direction: the VM sends an Eth frame, and we want to decap the Eth and encap with GRE.

I think those actions should be free from the tunnel association and just provide the flow items we want to encap/decap, or in a more generic way an offset into the packet headers and a buffer to encap (not sure how many devices support that, may be overkill at this point).

> > > > +
> > > > +Performs an encapsulation action by encapsulating the flows
> > > > +matched by the pattern items according to the network overlay
> > > > +defined in the ``rte_flow_action_tunnel_encap`` pattern items.
> > > > +
> > > > +This action modifies the payload of matched flows. The pattern
> > > > +items specified in the ``rte_flow_action_tunnel_encap`` action
> > > > +structure must defined a valid set of overlay headers, from the
> > > > +Ethernet header up to the overlay header. The pattern must be
> terminated with the RTE_FLOW_ITEM_TYPE_END item type.
> > > Regarding the use of a pattern list, if you consider PMDs are
> > > already iterating on a list of actions when encountering
> > > RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, it adds yet another inner
> loop.
> > We understand that it is implementation specifics. If we do not go for
> > another inner loop, all the bundling need to be handled in the same
> > function, which seems more clumsy to me. This also breaks the tunnel
> > endpoint concept.
> > >
> > > How about making each encountered
> RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP
> > > provide exactly one item instead (in encap, i.e. reverse order)?
> > Again, if we have tunnel action, security action, and other actions,
> > all the processing and tracking need to be done in one function. Now
> > we will need ETH_ENCAP/DECAP, UDP_ENCAP/DECAP,
> NVGRE_ENCAP/DECAP, etc.
> 
> Well, the number of DECAP actions doesn't need to perfectly reflect that of
> ENCAP since it implies all preceding layers. No problem with that.
> 
> Regarding multiple dedicated actions, my suggestion was for a single generic
> one as in this patch, but each instance on the ENCAP side would deal with a
> single protocol layer, instead of having a single ENCAP action with multiple
> inner layers (and thus an inner loop).
> 
> PMDs also gain the ability to precisely report which encap step fails by making
> rte_flow_error point to the problematic object to ease debugging of flow
> rules on the application side.
> 
> Why would that break the tunnel idea and more importantly, how would it
> prevent PMD developers from splitting their processing into multiple
> functions?
> 
> > >
> > > In which case perhaps "GENERIC" would be a better fit than "TUNNEL".
> > >
> <snip>
> > > > +
> > > > +   +-------+--------------------------+------------+
> > > > +   | Index | Flow Item Type           | Flow Item  |
> > > > +   +=======+==========================+============+
> > > > +   | 0     | RTE_FLOW_ITEM_TYPE_ETH   | eth item   |
> > > > +   +-------+--------------------------+------------+
> > > > +   | 1     | RTE_FLOW_ITEM_TYPE_IPV4  | ipv4 item  |
> > > > +   +-------+--------------------------+------------+
> > > > +   | 2     | RTE_FLOW_ITEM_TYPE_UDP   | udp item   |
> > > > +   +-------+--------------------------+------------+
> > > > +   | 3     | RTE_FLOW_ITEM_TYPE_VXLAN | vxlan item |
> > > > +   +-------+--------------------------+------------+
> > > > +   | 4     | RTE_FLOW_ITEM_TYPE_END   | NULL       |
> > > > +   +-------+--------------------------+------------+
> > > One possible issue is that it relies on objects normally found on
> > > the pattern side of flow rules. Those are supposed to match
> > > something, they are not intended for packet header generation. While
> their "spec" and "mask"
> > > fields might make sense in this context, the "last" field is odd.
> > >
> > > You must define them without leaving anything open for
> > > interpretation by PMDs and users alike. Defining things as
> > > "undefined" is fine as long as it's covered.
> > Please note that the "void *item" in the
> > "rte_flow_action_tunnel_encap.pattern" points to the data structure
> > defined for the corresponding rte_flow_item_type instead of a
> > rte_flow_item structure. As an example, for the rte_flow_item_eth type,
> the "void *item"
> > will point to a "struct rte_flow_item_eth" instance. Thats why we have
> > defined struct rte_flow_action_item inside struct
> > rte_flow_action_tunnel_encap. So, no question of spec, mask, last
> anymore.
> 
> Right, I noticed that after commenting its structure definition below.
> 
> I think I won't be the only one confused by this approach, also because a
> mask is needed in addition to a specification structure, otherwise how do you
> plan to tell what fields are relevant in application-provided protocol headers?
> 
> An application might set unusual IPv4/UDP/VXLAN fields and expect them to
> be part of the encapsulated traffic. Without a mask, a PMD must take
> headers verbatim, and I don't think many devices are ready for that yet.
> 
> Hence my other suggestion: defining inflexible $PROTOCOL_(ENCAP|DECAP)
> actions that do not allow more than what's defined by official RFCs for
> $PROTOCOL.
> 
> <snip>
> > > > + */
> > > > +struct rte_flow_action_tunnel_encap {
> > > > +	struct rte_flow_action_item {
> > > > +		enum rte_flow_item_type type;
> > > > +		/**< Flow item type. */
> > > > +		const void *item;
> > > > +		/**< Flow item definition which points to the data of
> > > > +		 * corresponding rte_flow_item_type.
> > > > +		 */
> > > I see it's a new action type, albeit a bit confusing (there is no
> > > RTE_FLOW_ACTION_TYPE_ITEM).
> > >
> > > I suggest the standard pattern item type since you're going with
> > > enum rte_flow_item_type anyway. Keep in mind you need some kind of
> > > mask to tell what fields are relevant. An application might
> > > otherwise want to encap with unsupported properties (e.g. specific IPv4
> ToS field and whatnot).
> > >
> > > How about a single "struct rte_flow_pattern_item item", neither
> > > const and neither a pointer. It's generic enough, enclosed
> > > spec/last/mask pointers take care of the specifics. You just need to
> > > define what's supposed to happen when "last" is set.
> > Please see the comment above regarding this field.
> 
> Point still stands, if you need to distinguish spec and mask, a more complete
> structure is needed. Instead of adding a new (confusing) type, you should
> use rte_flow_item and define what happens when "last" is set.
> 
> You should define it as reserved for now, any non-NULL value is an error. This
> field might also be useful later.
> 
> <snip>
> > > > +};
> > > > +
> > > > +/**
> > > > + * RTE_FLOW_ACTION_TYP_TUNNEL_DECAP
> > > > + *
> > > > + * Virtual tunnel end-point decapsulation action data.
> > > > + *
> > > > + * Non-terminating action by default.
> > > > + */
> > > > +struct rte_flow_action_tunnel_decap {
> > > > +	enum rte_flow_item_type type;
> > > > +	/**<
> > > > +	 * Flow item type of virtual tunnel end-point to be decapsulated
> > > > +	 */
> > > > +};
> > > Note that contrary to ENCAP, DECAP wouldn't necessarily need
> > > repeated actions to peel each layer off. The current definition is fine.
> > To clarify, the decap is up to the PMD to remove all the headers for
> > a specified type. For example, for
> >
> > rte_flow_item_type type=RTE_FLOW_ITEM_TYPE_VXLAN, the PMD will
> peel off (ETH, IPV4, UDP, VXLAN) header all together.
> 
> Yep, that's fine, whether we use multiple actions or a single one doing
> multiple things, a single DECAP can peel them off all at once :)
> 
> > >
> > > > +
> > > > +/**
> > > >    * Definition of a single action.
> > > >    *
> > > >    * A list of actions is terminated by a END action.
> > > > --
> > > > 2.7.4
> > > >
> 
> If the reasons I gave did not manage to convince you about choosing
> between
> either fixed (VLAN|VXLAN)_(ENCAP|DECAP) actions or generic encap/decap
> actions that deal with a single protocol layer at once instead of the
> proposed approach, I'm ready to try it out as an experimental API (all new
> objects tagged as experimental) *if* you address the lack of mask, which
> remains an open issue.
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions
  2018-04-09 16:10  0%     ` Mohammad Abdul Awal
@ 2018-04-10 10:19  0%       ` Adrien Mazarguil
  2018-04-10 11:06  0%         ` Shahaf Shuler
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-10 10:19 UTC (permalink / raw)
  To: Mohammad Abdul Awal; +Cc: Declan Doherty, dev

On Mon, Apr 09, 2018 at 05:10:35PM +0100, Mohammad Abdul Awal wrote:
> On 06/04/2018 21:26, Adrien Mazarguil wrote:
> > On Fri, Apr 06, 2018 at 01:24:01PM +0100, Declan Doherty wrote:
> > > Add new flow action types and associated action data structures to
> > > support the encapsulation and decapsulation of the virtual tunnel
> > > endpoints.
> > > 
> > > The RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP action will cause the matching
> > > flow to be encapsulated in the virtual tunnel endpoint overlay
> > > defined in the tunnel_encap action data.
> > > 
> > > The RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP action will cause all virtual
> > > tunnel endpoint overlays up to and including the first instance of
> > > the flow item type defined in the tunnel_decap action data for the
> > > matching flows.
> > > 
> > > Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> > This generic approach looks flexible enough to cover the use cases that
> > immediately come to mind (VLAN, VXLAN), its design is sound.
> > 
> > However, while I'm aware it's not a concern at this point, it won't be able
> > to deal with stateful tunnel or encapsulation types (e.g. IPsec or TCP)
> > which will require additional meta data or some run-time assistance from the
> > application.
> > 
> > Eventually for more complex use cases, dedicated encap/decap actions will
> > have to appear, so the issue I wanted to raise before going further is this:
> > 
> > Going generic inevitably trades some of the usability; flat structures
> > dedicated to VXLAN encap/decap with only the needed info to get the job done
> > would likely be easier to implement in PMDs and use in applications. Any
> > number of such actions can be added to rte_flow without ABI impact.
> > 
> > If VXLAN is the only use case at this point, my suggestion would be to go
> > with simpler RTE_FLOW_ACTION_TYPE_VXLAN_(ENCAP|DECAP) actions, with fixed
> > L2/L3/L4/L5 header definitions to prepend according to RFC 7348.
> We can go this way and this will increase the action for more and more
> tunneling protocols being added. Current proposal is already a generic
> approach which specifies as a tunnel for all the tunneling protocols.

Right, on the other hand there are not that many standard encapsulations
offloaded by existing devices. rte_flow could easily handle dedicated
actions for each of them without problem.

My point is that many of those (will eventually) have their own quirks to
manage when doing encap/decap, it's not just a matter of prepending or
removing a bunch of header definitions, otherwise we could as well let
applications simply provide an arbitrary buffer to prepend.

Consider that the "generic" part is already built into rte_flow as the way
patterns and action are handled. Adding another generic layer on top of that
could make things more inconvenient than necessary to applications (my main
concern).

You'd need another layer of validation/error reporting machinery to properly
let applications know they cannot encap VXLAN on top of TCP on top of
QinQinQinQinQ for instance. Either a single bounded encapsulation definition
or a combination at the action list level is needed to avoid that.
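
To make the dedicated-action alternative concrete, a fixed VXLAN encapsulation
action along those lines could look roughly like this (hypothetical structure,
not an existing rte_flow definition):

	struct rte_flow_action_vxlan_encap {
		struct ether_hdr eth;   /* outer L2 */
		struct ipv4_hdr ipv4;   /* outer L3 */
		struct udp_hdr udp;     /* outer L4, dst port 4789 per RFC 7348 */
		struct vxlan_hdr vxlan; /* VXLAN flags and VNI */
	};

Validation then reduces to checking a single flat structure instead of walking
an arbitrary item list.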

> > Now we can start with the generic approach, see how it fares and add
> > dedicated encap/decap later as needed.
> > 
> > More comments below.
<snip>
> > > +Action: ``TUNNEL_ENCAP``
> > > +^^^^^^^^^^^^^^^^^^^^^^
> > > +
> > > +Performs an encapsulation action by encapsulating the flows matched by the
> > > +pattern items according to the network overlay defined in the
> > > +``rte_flow_action_tunnel_encap`` pattern items.
> > > +
> > > +This action modifies the payload of matched flows. The pattern items specified
> > > +in the ``rte_flow_action_tunnel_encap`` action structure must define a valid
> > > +set of overlay headers, from the Ethernet header up to the overlay header. The
> > > +pattern must be terminated with the RTE_FLOW_ITEM_TYPE_END item type.
> > Regarding the use of a pattern list, if you consider PMDs are already
> > iterating on a list of actions when encountering
> > RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, it adds yet another inner loop.
> We understand that it is implementation specifics. If we do not go for
> another inner loop, all the bundling need to be handled in the same
> function, which seems more clumsy to me. This also breaks the tunnel
> endpoint concept.
> > 
> > How about making each encountered RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP provide
> > exactly one item instead (in encap, i.e. reverse order)?
> Again, if we have tunnel action, security action, and other actions, all the
> processing and tracking need to be done in one function. Now we will need
> ETH_ENCAP/DECAP, UDP_ENCAP/DECAP, NVGRE_ENCAP/DECAP, etc.

Well, the number of DECAP actions doesn't need to perfectly reflect that of
ENCAP since it implies all preceding layers. No problem with that.

Regarding multiple dedicated actions, my suggestion was for a single generic
one as in this patch, but each instance on the ENCAP side would deal with a
single protocol layer, instead of having a single ENCAP action with multiple
inner layers (and thus an inner loop).

PMDs also gain the ability to precisely report which encap step fails by
making rte_flow_error point to the problematic object to ease debugging of
flow rules on the application side.

Why would that break the tunnel idea and more importantly, how would it
prevent PMD developers from splitting their processing into multiple
functions?
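
In other words, with one layer per action instance, the VXLAN example from the
patch would be expressed as an action list rather than an inner pattern list,
something like (purely illustrative, reusing the structures proposed in this
patch; each encap_* object describes exactly one protocol layer):

	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, .conf = &encap_eth },
		{ .type = RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, .conf = &encap_ipv4 },
		{ .type = RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, .conf = &encap_udp },
		{ .type = RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, .conf = &encap_vxlan },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

A PMD that fails on, say, the UDP layer can then point rte_flow_error at that
specific action entry.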

> > 
> > In which case perhaps "GENERIC" would be a better fit than "TUNNEL".
> > 
<snip>
> > > +
> > > +   +-------+--------------------------+------------+
> > > +   | Index | Flow Item Type           | Flow Item  |
> > > +   +=======+==========================+============+
> > > +   | 0     | RTE_FLOW_ITEM_TYPE_ETH   | eth item   |
> > > +   +-------+--------------------------+------------+
> > > +   | 1     | RTE_FLOW_ITEM_TYPE_IPV4  | ipv4 item  |
> > > +   +-------+--------------------------+------------+
> > > +   | 2     | RTE_FLOW_ITEM_TYPE_UDP   | udp item   |
> > > +   +-------+--------------------------+------------+
> > > +   | 3     | RTE_FLOW_ITEM_TYPE_VXLAN | vxlan item |
> > > +   +-------+--------------------------+------------+
> > > +   | 4     | RTE_FLOW_ITEM_TYPE_END   | NULL       |
> > > +   +-------+--------------------------+------------+
> > One possible issue is that it relies on objects normally found on the
> > pattern side of flow rules. Those are supposed to match something, they are
> > not intended for packet header generation. While their "spec" and "mask"
> > fields might make sense in this context, the "last" field is odd.
> > 
> > You must define them without leaving anything open for interpretation by
> > PMDs and users alike. Defining things as "undefined" is fine as long as it's
> > covered.
> Please note that the "void *item" in the
> "rte_flow_action_tunnel_encap.pattern" points to the data structure defined
> for the corresponding rte_flow_item_type instead of a rte_flow_item
> structure. As an example, for the rte_flow_item_eth type, the "void *item"
> will point to a "struct rte_flow_item_eth" instance. Thats why we have
> defined struct rte_flow_action_item inside struct
> rte_flow_action_tunnel_encap. So, no question of spec, mask, last anymore.

Right, I noticed that after commenting its structure definition below.

I think I won't be the only one confused by this approach, also because a
mask is needed in addition to a specification structure, otherwise how do
you plan to tell what fields are relevant in application-provided protocol
headers?

An application might set unusual IPv4/UDP/VXLAN fields and expect them to be
part of the encapsulated traffic. Without a mask, a PMD must take headers
verbatim, and I don't think many devices are ready for that yet.

Hence my other suggestion: defining inflexible $PROTOCOL_(ENCAP|DECAP)
actions that do not allow more than what's defined by official RFCs for
$PROTOCOL.

<snip>
> > > + */
> > > +struct rte_flow_action_tunnel_encap {
> > > +	struct rte_flow_action_item {
> > > +		enum rte_flow_item_type type;
> > > +		/**< Flow item type. */
> > > +		const void *item;
> > > +		/**< Flow item definition which points to the data of
> > > +		 * corresponding rte_flow_item_type.
> > > +		 */
> > I see it's a new action type, albeit a bit confusing (there is no
> > RTE_FLOW_ACTION_TYPE_ITEM).
> > 
> > I suggest the standard pattern item type since you're going with enum
> > rte_flow_item_type anyway. Keep in mind you need some kind of mask to tell
> > what fields are relevant. An application might otherwise want to encap with
> > unsupported properties (e.g. specific IPv4 ToS field and whatnot).
> > 
> > How about a single "struct rte_flow_pattern_item item", neither const and
> > neither a pointer. It's generic enough, enclosed spec/last/mask pointers
> > take care of the specifics. You just need to define what's supposed to
> > happen when "last" is set.
> Please see the comment above regarding this field.

Point still stands, if you need to distinguish spec and mask, a more
complete structure is needed. Instead of adding a new (confusing) type, you
should use rte_flow_item and define what happens when "last" is set.

You should define it as reserved for now, any non-NULL value is an
error. This field might also be useful later.
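
For instance, reusing the standard item structure gives each layer an explicit
spec/mask pair while keeping "last" reserved, along these lines (sketch only;
ipv4_spec/ipv4_mask are illustrative):

	struct rte_flow_item outer_ipv4 = {
		.type = RTE_FLOW_ITEM_TYPE_IPV4,
		.spec = &ipv4_spec,	/* header field values to write */
		.mask = &ipv4_mask,	/* which of those fields are relevant */
		.last = NULL,		/* reserved: any non-NULL value is an error */
	};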

<snip>
> > > +};
> > > +
> > > +/**
> > > + * RTE_FLOW_ACTION_TYP_TUNNEL_DECAP
> > > + *
> > > + * Virtual tunnel end-point decapsulation action data.
> > > + *
> > > + * Non-terminating action by default.
> > > + */
> > > +struct rte_flow_action_tunnel_decap {
> > > +	enum rte_flow_item_type type;
> > > +	/**<
> > > +	 * Flow item type of virtual tunnel end-point to be decapsulated
> > > +	 */
> > > +};
> > Note that contrary to ENCAP, DECAP wouldn't necessarily need repeated
> > actions to peel each layer off. The current definition is fine.
> To clarify, the decap is up to the PMD to remove all the headers for a
> specified type. For example, for
> 
> rte_flow_item_type type=RTE_FLOW_ITEM_TYPE_VXLAN, the PMD will peel off (ETH, IPV4, UDP, VXLAN) header all together.

Yep, that's fine, whether we use multiple actions or a single one doing
multiple things, a single DECAP can peel them off all at once :)

> > 
> > > +
> > > +/**
> > >    * Definition of a single action.
> > >    *
> > >    * A list of actions is terminated by a END action.
> > > -- 
> > > 2.7.4
> > > 

If the reasons I gave did not manage to convince you about choosing between
either fixed (VLAN|VXLAN)_(ENCAP|DECAP) actions or generic encap/decap
actions that deal with a single protocol layer at once instead of the
proposed approach, I'm ready to try it out as an experimental API (all new
objects tagged as experimental) *if* you address the lack of mask, which
remains an open issue.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v6 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters
  2018-04-10  9:43  4%     ` [dpdk-dev] [PATCH v6 " Remy Horton
@ 2018-04-10  9:43  7%       ` Remy Horton
  2018-04-10 18:56  0%       ` [dpdk-dev] [PATCH v6 0/4] ethdev: add per-PMD tuning of RxTx parmeters Ferruh Yigit
  1 sibling, 0 replies; 200+ results
From: Remy Horton @ 2018-04-10  9:43 UTC (permalink / raw)
  To: dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

The optimal values of several transmission & reception related
parameters, such as burst sizes, descriptor ring sizes, and number
of queues, vary between different network interface devices. This
patch allows individual PMDs to specify preferred parameter values.

Signed-off-by: Remy Horton <remy.horton@intel.com>
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
Acked-by: Shreyansh Jain <shreyansh.jain@nxp.com>
---
 doc/guides/rel_notes/deprecation.rst   | 13 ----------
 doc/guides/rel_notes/release_18_05.rst | 35 +++++++++++++++++++++++++++
 lib/librte_ether/rte_ethdev.c          | 44 +++++++++++++++++++++++++++-------
 lib/librte_ether/rte_ethdev.h          | 25 +++++++++++++++++++
 4 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ec70b5f..d13077d 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -112,19 +112,6 @@ Deprecation Notices
   The new API add rss_level field to ``rte_eth_rss_conf`` to enable a choice
   of RSS hash calculation on outer or inner header of tunneled packet.
 
-* ethdev:  Currently, if the  rte_eth_rx_burst() function returns a value less
-  than *nb_pkts*, the application will assume that no more packets are present.
-  Some of the hw queue based hardware can only support smaller burst for RX
-  and TX and thus break the expectation of the rx_burst API. Similar is the
-  case for TX burst as well as ring sizes. ``rte_eth_dev_info`` will be added
-  with following new parameters so as to support semantics for drivers to
-  define a preferred size for Rx/Tx burst and rings.
-
-  - Member ``struct preferred_size`` would be added to enclose all preferred
-    size to be fetched from driver/implementation.
-  - Members ``uint16_t rx_burst``,  ``uint16_t tx_burst``, ``uint16_t rx_ring``,
-    and ``uint16_t tx_ring`` would be added to ``struct preferred_size``.
-
 * ethdev: A work is being planned for 18.05 to expose VF port representors
   as a mean to perform control and data path operation on the different VFs.
   As VF representor is an ethdev port, new fields are needed in order to map
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index abc1c17..0d57a63 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -58,6 +58,11 @@ New Features
   * Added support for NVGRE, VXLAN and GENEVE filters in flow API.
   * Added support for DROP action in flow API.
 
+* **Added PMD-recommended Tx and Rx parameters**
+
+  Applications can now query drivers for device-tuned values of
+  ring sizes, burst sizes, and number of queues.
+
 
 API Changes
 -----------
@@ -83,6 +88,29 @@ API Changes
   memory footprint which helps in better cache utilization when large number
   of meter objects are used.
 
+* **Changes to semantics of rte_eth_dev_configure() parameters.**
+
+   If both the ``nb_rx_q`` and ``nb_tx_q`` parameters are zero,
+   ``rte_eth_dev_configure`` will now use PMD-recommended queue sizes, or if
+   recommendations are not provided by the PMD the function will use ethdev
+   fall-back values. Previously setting both of the parameters to zero would
+   have resulted in ``-EINVAL`` being returned.
+
+* **Changes to semantics of rte_eth_rx_queue_setup() parameters.**
+
+   If the ``nb_rx_desc`` parameter is zero, ``rte_eth_rx_queue_setup`` will
+   now use the PMD-recommended Rx ring size, or in the case where the PMD
+   does not provide a recommendation, will use an ethdev-provided
+   fall-back value. Previously, setting ``nb_rx_desc`` to zero would have
+   resulted in an error.
+
+* **Changes to semantics of rte_eth_tx_queue_setup() parameters.**
+
+   If the ``nb_tx_desc`` parameter is zero, ``rte_eth_tx_queue_setup`` will
+   now use the PMD-recommended Tx ring size, or in the case where the PMD
+   does not provide a recommendation, will use an ethdev-provided
+   fall-back value. Previously, setting ``nb_tx_desc`` to zero would have
+   resulted in an error.
 
 ABI Changes
 -----------
@@ -97,6 +125,13 @@ ABI Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Additional fields in rte_eth_dev_info.**
+
+  The ``rte_eth_dev_info`` structure has had two extra entries appended to the
+  end of it: ``default_rxportconf`` and ``default_txportconf``. Each of these
+  in turn are ``rte_eth_dev_portconf`` structures containing three fields of
+  type ``uint16_t``: ``burst_size``, ``ring_size``, and ``nb_queues``. These
+  are parameter values recommended for use by the PMD.
 
 Removed Items
 -------------
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e..209796d 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -1061,6 +1061,26 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 
 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
 
+	dev = &rte_eth_devices[port_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
+	(*dev->dev_ops->dev_infos_get)(dev, &dev_info);
+
+	/* If number of queues specified by application for both Rx and Tx is
+	 * zero, use driver preferred values. This cannot be done individually
+	 * as it is valid for either Tx or Rx (but not both) to be zero.
+	 * If driver does not provide any preferred values, fall back on
+	 * EAL defaults.
+	 */
+	if (nb_rx_q == 0 && nb_tx_q == 0) {
+		nb_rx_q = dev_info.default_rxportconf.nb_queues;
+		if (nb_rx_q == 0)
+			nb_rx_q = RTE_ETH_DEV_FALLBACK_RX_NBQUEUES;
+		nb_tx_q = dev_info.default_txportconf.nb_queues;
+		if (nb_tx_q == 0)
+			nb_tx_q = RTE_ETH_DEV_FALLBACK_TX_NBQUEUES;
+	}
+
 	if (nb_rx_q > RTE_MAX_QUEUES_PER_PORT) {
 		RTE_PMD_DEBUG_TRACE(
 			"Number of RX queues requested (%u) is greater than max supported(%d)\n",
@@ -1075,8 +1095,6 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 		return -EINVAL;
 	}
 
-	dev = &rte_eth_devices[port_id];
-
 	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
 	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
 
@@ -1106,13 +1124,6 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 	 * than the maximum number of RX and TX queues supported by the
 	 * configured device.
 	 */
-	(*dev->dev_ops->dev_infos_get)(dev, &dev_info);
-
-	if (nb_rx_q == 0 && nb_tx_q == 0) {
-		RTE_PMD_DEBUG_TRACE("ethdev port_id=%d both rx and tx queue cannot be 0\n", port_id);
-		return -EINVAL;
-	}
-
 	if (nb_rx_q > dev_info.max_rx_queues) {
 		RTE_PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_queues=%d > %d\n",
 				port_id, nb_rx_q, dev_info.max_rx_queues);
@@ -1477,6 +1488,14 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
 		return -EINVAL;
 	}
 
+	/* Use default specified by driver, if nb_rx_desc is zero */
+	if (nb_rx_desc == 0) {
+		nb_rx_desc = dev_info.default_rxportconf.ring_size;
+		/* If driver default is also zero, fall back on EAL default */
+		if (nb_rx_desc == 0)
+			nb_rx_desc = RTE_ETH_DEV_FALLBACK_RX_RINGSIZE;
+	}
+
 	if (nb_rx_desc > dev_info.rx_desc_lim.nb_max ||
 			nb_rx_desc < dev_info.rx_desc_lim.nb_min ||
 			nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) {
@@ -1600,6 +1619,13 @@ rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
 
 	rte_eth_dev_info_get(port_id, &dev_info);
 
+	/* Use default specified by driver, if nb_tx_desc is zero */
+	if (nb_tx_desc == 0) {
+		nb_tx_desc = dev_info.default_txportconf.ring_size;
+		/* If driver default is zero, fall back on EAL default */
+		if (nb_tx_desc == 0)
+			nb_tx_desc = RTE_ETH_DEV_FALLBACK_TX_RINGSIZE;
+	}
 	if (nb_tx_desc > dev_info.tx_desc_lim.nb_max ||
 	    nb_tx_desc < dev_info.tx_desc_lim.nb_min ||
 	    nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) {
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 5e13dca..685145f 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -988,6 +988,27 @@ struct rte_eth_conf {
 
 struct rte_pci_device;
 
+/*
+ * Fallback default preferred Rx/Tx port parameters.
+ * These are used if an application requests default parameters
+ * but the PMD does not provide preferred values.
+ */
+#define RTE_ETH_DEV_FALLBACK_RX_RINGSIZE 512
+#define RTE_ETH_DEV_FALLBACK_TX_RINGSIZE 512
+#define RTE_ETH_DEV_FALLBACK_RX_NBQUEUES 1
+#define RTE_ETH_DEV_FALLBACK_TX_NBQUEUES 1
+
+/**
+ * Preferred Rx/Tx port parameters.
+ * There are separate instances of this structure for transmission
+ * and reception respectively.
+ */
+struct rte_eth_dev_portconf {
+	uint16_t burst_size; /**< Device-preferred burst size */
+	uint16_t ring_size; /**< Device-preferred size of queue rings */
+	uint16_t nb_queues; /**< Device-preferred number of queues */
+};
+
 /**
  * Ethernet device information
  */
@@ -1029,6 +1050,10 @@ struct rte_eth_dev_info {
 	/** Configured number of rx/tx queues */
 	uint16_t nb_rx_queues; /**< Number of RX queues. */
 	uint16_t nb_tx_queues; /**< Number of TX queues. */
+	/** Rx parameter recommendations */
+	struct rte_eth_dev_portconf default_rxportconf;
+	/** Tx parameter recommendations */
+	struct rte_eth_dev_portconf default_txportconf;
 };
 
 /**
-- 
2.9.5
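
For reference, on the application side the new behaviour is requested simply by
passing zero counts; a minimal sketch (port_id and mbuf_pool assumed to be set
up elsewhere, error handling omitted):

	struct rte_eth_conf port_conf = { 0 };
	struct rte_eth_dev_info dev_info;
	uint16_t rx_burst;

	/* 0 Rx and 0 Tx queues: use PMD-preferred counts, else ethdev fall-backs */
	rte_eth_dev_configure(port_id, 0, 0, &port_conf);

	/* 0 descriptors: use PMD-preferred ring sizes, else ethdev fall-backs */
	rte_eth_rx_queue_setup(port_id, 0, 0, rte_eth_dev_socket_id(port_id),
			NULL, mbuf_pool);
	rte_eth_tx_queue_setup(port_id, 0, 0, rte_eth_dev_socket_id(port_id), NULL);

	/* the PMD-preferred burst size, if any, is reported via dev_info */
	rte_eth_dev_info_get(port_id, &dev_info);
	rx_burst = dev_info.default_rxportconf.burst_size;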

^ permalink raw reply	[relevance 7%]

* [dpdk-dev] [PATCH v6 0/4] ethdev: add per-PMD tuning of RxTx parmeters
  2018-04-06 14:49  4%   ` [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters Remy Horton
  2018-04-06 14:49  7%     ` [dpdk-dev] [PATCH v5 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
  2018-04-06 17:01  0%     ` [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters Ferruh Yigit
@ 2018-04-10  9:43  4%     ` Remy Horton
  2018-04-10  9:43  7%       ` [dpdk-dev] [PATCH v6 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
  2018-04-10 18:56  0%       ` [dpdk-dev] [PATCH v6 0/4] ethdev: add per-PMD tuning of RxTx parmeters Ferruh Yigit
  2 siblings, 2 replies; 200+ results
From: Remy Horton @ 2018-04-10  9:43 UTC (permalink / raw)
  To: dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

The optimal values of several transmission & reception related parameters,
such as burst sizes, descriptor ring sizes, and number of queues, vary
between different network interface devices. This patchset allows individual
PMDs to specify their preferred parameter values, and if so indicated by an
application, for them to be used automatically by the ethdev layer.

rte_eth_dev_configure() has been changed so that specifying zero for both
nb_rx_q AND nb_tx_q causes it to use driver preferred values, and if these
are not available, falls back to EAL defaults. Setting one (but not both)
to zero does not cause the use of defaults, as having one of them zeroed is
a valid setup.

This patchset includes per-PMD values for e1000 and i40e but it is expected
that subsequent patchsets will cover other PMDs. A deprecation notice
covering the API/ABI change is in place.

Changes in v6:
* Updated/corrected testpmd documentation
* Carried forward acks/review
* Rebased to d218a4d060de

Changes in v5:
* uint_16_t corrected to uint16_t

Changes in v4:
* Added API/ABI change documentation
* Rebased to 78f5a2e93d74

Changes in v3:
* Changed formatting around new rte_eth_dev_info fields
* Added Doxygen documentation to struct rte_eth_dev_portconf
* Testpmd "port config all burst 0" and --burst=0 uses PMD 
  Rx burst recommendations.
* Added to release notes
* Rebased to 8ea081f38161

Changes in v2:
* Rebased to master
* Removed fallback values from rte_eth_dev_info_get()
* Added fallback values to rte_rte_[rt]x_queue_setup()
* Added fallback values to rte_eth_dev_configure()
* Corrected comment
* Removed deprecation notice
* Split Rx and Tx into separate structures
* Changed parameter names


Remy Horton (4):
  ethdev: add support for PMD-tuned Tx/Rx parameters
  net/e1000: add TxRx tuning parameters
  net/i40e: add TxRx tuning parameters
  testpmd: make use of per-PMD TxRx parameters

 app/test-pmd/cmdline.c                 | 31 +++++++++++++++++++++---
 app/test-pmd/parameters.c              | 38 +++++++++++++++++++++++++----
 app/test-pmd/testpmd.c                 |  5 ++--
 doc/guides/rel_notes/deprecation.rst   | 13 ----------
 doc/guides/rel_notes/release_18_05.rst | 35 +++++++++++++++++++++++++++
 doc/guides/testpmd_app_ug/run_app.rst  |  4 +++-
 drivers/net/e1000/em_ethdev.c          |  6 +++++
 drivers/net/i40e/i40e_ethdev.c         | 33 ++++++++++++++++++++++---
 lib/librte_ether/rte_ethdev.c          | 44 +++++++++++++++++++++++++++-------
 lib/librte_ether/rte_ethdev.h          | 25 +++++++++++++++++++
 10 files changed, 198 insertions(+), 36 deletions(-)

-- 
2.9.5

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection
  @ 2018-04-10  1:59  3%           ` Yongseok Koh
  2018-04-11  0:25  0%             ` Ananyev, Konstantin
  0 siblings, 1 reply; 200+ results
From: Yongseok Koh @ 2018-04-10  1:59 UTC (permalink / raw)
  To: Olivier Matz
  Cc: wenzhuo.lu, jingjing.wu, adrien.mazarguil, nelio.laranjeiro, dev

On Mon, Apr 09, 2018 at 06:04:34PM +0200, Olivier Matz wrote:
> Hi Yongseok,
> 
> On Tue, Apr 03, 2018 at 05:12:06PM -0700, Yongseok Koh wrote:
> > On Tue, Apr 03, 2018 at 10:26:15AM +0200, Olivier Matz wrote:
> > > Hi,
> > > 
> > > On Mon, Apr 02, 2018 at 11:50:03AM -0700, Yongseok Koh wrote:
> > > > When attaching a mbuf, indirect mbuf has to point to start of buffer of
> > > > direct mbuf. By adding buf_off field to rte_mbuf, this becomes more
> > > > flexible. Indirect mbuf can point to any part of direct mbuf by calling
> > > > rte_pktmbuf_attach_at().
> > > > 
> > > > Possible use-cases could be:
> > > > - If a packet has multiple layers of encapsulation, multiple indirect
> > > >   buffers can reference different layers of the encapsulated packet.
> > > > - A large direct mbuf can even contain multiple packets in series and
> > > >   each packet can be referenced by multiple mbuf indirections.
> > > > 
> > > > Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> > > 
> > > I think the current API is already able to do what you want.
> > > 
> > > 1/ Here is a mbuf m with its data
> > > 
> > >                off
> > >                <-->
> > >                       len
> > >           +----+   <---------->
> > >           |    |
> > >         +-|----v----------------------+
> > >         | |    -----------------------|
> > > m       | buf  |    XXXXXXXXXXX      ||
> > >         |      -----------------------|
> > >         +-----------------------------+
> > > 
> > > 
> > > 2/ clone m:
> > > 
> > >   c = rte_pktmbuf_alloc(pool);
> > >   rte_pktmbuf_attach(c, m);
> > > 
> > >   Note that c has its own offset and length fields.
> > > 
> > > 
> > >                off
> > >                <-->
> > >                       len
> > >           +----+   <---------->
> > >           |    |
> > >         +-|----v----------------------+
> > >         | |    -----------------------|
> > > m       | buf  |    XXXXXXXXXXX      ||
> > >         |      -----------------------|
> > >         +------^----------------------+
> > >                |
> > >           +----+
> > > indirect  |
> > >         +-|---------------------------+
> > >         | |    -----------------------|
> > > c       | buf  |                     ||
> > >         |      -----------------------|
> > >         +-----------------------------+
> > > 
> > >                 off    len
> > >                 <--><---------->
> > > 
> > > 
> > > 3/ remove some data from c without changing m
> > > 
> > >    rte_pktmbuf_adj(c, 10)   // at head
> > >    rte_pktmbuf_trim(c, 10)  // at tail
> > > 
> > > 
> > > Please let me know if it fits your needs.
> > 
> > No, it doesn't.
> > 
> > Trimming head and tail with the current APIs removes data and make the space
> > available. Adjusting packet head means giving more headroom, not shifting the
> > buffer itself. If m has two indirect mbufs (c1 and c2) and those are pointing to
> > difference offsets in m,
> > 
> > rte_pktmbuf_adj(c1, 10);
> > rte_pktmbuf_adj(c2, 20);
> > 
> > then the owner of c2 regard the first (off+20)B as available headroom. If it
> > wants to attach outer header, it will overwrite the headroom even though the
> > owner of c1 is still accessing it. Instead, another mbuf (h1) for the outer
> > header should be linked by h1->next = c2.
> 
> Yes, after these operations c1, c2 and m should become read-only. So, to
> prepend headers, another mbuf has to be inserted before as you suggest. It
> is possible to wrap this in a function rte_pktmbuf_clone_area(m, offset,
> length) that will:
>   - alloc and attach indirect mbuf for each segment of m that is
>     in the range [offset : length+offset].
>   - prepend an empty and writable mbuf for the headers
> 
> > If c1 and c2 are attached with shifting buffer address by adjusting buf_off,
> > which actually shrink the headroom, this case can be properly handled.
> 
> What do you mean by properly handled?
> 
> Yes, prepending data or adding data in the indirect mbuf won't override
> the direct mbuf. But prepending data or adding data in the direct mbuf m
> won't be protected.
> 
> From an application point of view, indirect mbufs, or direct mbufs that
> have refcnt != 1, should be both considered as read-only because they
> may share their data. How an application can know if the data is shared
> or not?
> 
> Maybe we need a flag to differentiate mbufs that are read-only
> (something like SHARED_DATA, or simply READONLY). In your case, if my
> understanding is correct, you want to have indirect mbufs with RW data.

Agree that indirect mbuf must be treated as read-only, Then the current code is
enough to handle that use-case.
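
To make that concrete, the read-only sharing described above can already be
expressed with the existing API, e.g. (sketch; pool, m and the 20-byte offset
are illustrative, error checks omitted):

	struct rte_mbuf *c2 = rte_pktmbuf_alloc(pool);
	struct rte_mbuf *h1 = rte_pktmbuf_alloc(pool);
	struct ether_hdr *outer;

	rte_pktmbuf_attach(c2, m);	/* c2 shares m's data: treat it as read-only */
	rte_pktmbuf_adj(c2, 20);	/* c2 starts 20B into the shared data */

	/* the new outer header lives in a private, writable segment in front */
	outer = (struct ether_hdr *)rte_pktmbuf_append(h1, sizeof(*outer));
	/* ... fill in *outer ... */
	h1->next = c2;
	h1->pkt_len += c2->pkt_len;
	h1->nb_segs = c2->nb_segs + 1;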

> > And another use-case (this is my actual use-case) is to make a large mbuf have
> > multiple packets in series. AFAIK, this will also be helpful for some FPGA NICs
> > because it transfers multiple packets to a single large buffer to reduce PCIe
> > overhead for small packet traffic like the Multi-Packet Rx of mlx5 does.
> > Otherwise, packets should be memcpy'd to regular mbufs one by one instead of
> > indirect referencing.
> > 
> > Does this make sense?
> 
> I understand the need.
> 
> Another option would be to make the mbuf->buffer point to an external
> buffer (not inside the direct mbuf). This would require to add a
> mbuf->free_cb. See "Mbuf with external data buffer" (page 19) in [1] for
> a quick overview.
> 
> [1] https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdpdksummit.com%2FArchive%2Fpdf%2F2016Userspace%2FDay01-Session05-OlivierMatz-Userspace2016.pdf&data=02%7C01%7Cyskoh%40mellanox.com%7Ca5405edb36e445e6540808d59e339a38%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636588866861082855&sdata=llw%2BwiY5cC56naOUhBbIg8TKtfFN6VZcIRY5PV7VqZs%3D&reserved=0
> 
> The advantage is that it does not require the large data to be inside a
> mbuf (requiring a mbuf structure before the buffer, and requiring to be
> allocated from a mempool). On the other hand, it is maybe more complex
> to implement compared to your solution.

I knew that you presented the slides and frankly, I had considered that option
at first. But even with that option, metadata to store refcnt should also be
allocated and managed anyway. Kernel also maintains the skb_shared_info at the
end of the data segment. Even though it could have smaller metadata structure,
I just wanted to make full use of the existing framework because it is less
complex as you mentioned. Given that you presented the idea of external data
buffer in 2016 and there hasn't been many follow-up discussions/activities so
far, I thought the demand isn't so big yet thus I wanted to make this patch
simpler.  I personally think that we can take the idea of external data seg when
more demands come from users in the future as it would be a huge change and may
break current ABI/API. When the day comes, I'll gladly participate in the
discussions and write codes for it if I can be helpful.

Do you think this patch is okay for now?


Thanks for your comments,
Yongseok

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 17:09  0%         ` Ananyev, Konstantin
@ 2018-04-09 17:26  0%           ` Dumitrescu, Cristian
  2018-04-10 12:32  0%             ` Van Haaren, Harry
  0 siblings, 1 reply; 200+ results
From: Dumitrescu, Cristian @ 2018-04-09 17:26 UTC (permalink / raw)
  To: Ananyev, Konstantin, Van Haaren, Harry, Stephen Hemminger, Singh,
	Jasvinder, Richardson, Bruce
  Cc: dev

> >
> > If people think that this function conversion is not nice, it can be reworked
> in multiple ways at the expense of API (but not ABI) change:
> > 1. Define the hash function field in the table parameter structure as
> opaque void * rather than 4-parameter version.
> > 2. Create a separate parameter structure just for this hash table type.
> 
> Why just not define your f_hash member as a union:
> 
> struct rte_table_hash_params {
> ...
> union {
>     rte_table_hash_op_hash  f_hash_4params;
>     rte_hash_function f_hash_3_params;
> };
> 
> ?
> 

Yes, agreed, this is yet another way to handle this, thanks Konstantin.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 17:02  4%       ` Dumitrescu, Cristian
@ 2018-04-09 17:09  0%         ` Ananyev, Konstantin
  2018-04-09 17:26  0%           ` Dumitrescu, Cristian
  0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2018-04-09 17:09 UTC (permalink / raw)
  To: Dumitrescu, Cristian, Van Haaren, Harry, Stephen Hemminger,
	Singh, Jasvinder, Richardson, Bruce
  Cc: dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu, Cristian
> Sent: Monday, April 9, 2018 6:02 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>; Stephen Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
> <jasvinder.singh@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> 
> 
> 
> > -----Original Message-----
> > From: Van Haaren, Harry
> > Sent: Monday, April 9, 2018 5:38 PM
> > To: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>; Stephen
> > Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
> > <jasvinder.singh@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> >
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu,
> > Cristian
> > > Sent: Monday, April 9, 2018 4:59 PM
> > > To: Stephen Hemminger <stephen@networkplumber.org>; Singh,
> > Jasvinder
> > > <jasvinder.singh@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>
> > > Cc: dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > > > Sent: Monday, April 9, 2018 4:10 PM
> > > > To: Singh, Jasvinder <jasvinder.singh@intel.com>
> > > > Cc: dev@dpdk.org; Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> > > > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> > > >
> > > > On Mon,  9 Apr 2018 13:49:48 +0100
> > > > Jasvinder Singh <jasvinder.singh@intel.com> wrote:
> > > >
> > > > > Fix build error with gcc 8.0 due to cast between function types.
> > > > > Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
> > > > >
> > > > > Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > > > > ---
> > > > >  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
> > > > >  1 file changed, 3 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
> > > > b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > > index dcb4fe9..f7eae27 100644
> > > > > --- a/lib/librte_table/rte_table_hash_cuckoo.c
> > > > > +++ b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > > @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void
> > *params,
> > > > >  		return NULL;
> > > > >  	}
> > > > >
> > > > > +	void *hash_func = p->f_hash;
> > > > > +
> > > > >  	/* Create cuckoo hash table */
> > > > >  	struct rte_hash_parameters hash_cuckoo_params = {
> > > > >  		.entries = p->n_keys,
> > > > >  		.key_len = p->key_size,
> > > > > -		.hash_func = (rte_hash_function)(p->f_hash),
> > > > > +		.hash_func = (rte_hash_function) hash_func,
> > > > >  		.hash_func_init_val = p->seed,
> > > > >  		.socket_id = socket_id,
> > > > >  		.name = p->name
> > > >
> > > > This is just tricking the compiler into not complaining.
> > > > I would really rather see the two hash functions made the same.
> > >
> > > (Adding Bruce as well to consolidate all conversations in a single thread.)
> > >
> > > What we want to do here is be able to use the librte_hash under the same
> > API
> > > as the several hash table flavors implemented in librte_table.
> > >
> > > Both of these libraries allow configuring the hash function per each hash
> > > table instance. Problem is: hash function in librte_hash has only 3
> > parameters
> > > (no key mask), while hash function in librte_table has 4 parameters
> > (includes
> > > key mask). The key mask helps a lot for practical protocol implementations
> > by
> > > avoiding key copy & pre-process on lookup.
> > >
> > > So then: how to plug in librte_hash under the same API as the suite of
> > hash
> > > tables in librte_table? We don't want to re-implement cuckoo hash from
> > > librte_hash, we simply want to invoke it as a low-level primitive, similarly
> > > to how the LPM and ACL tables are plugged into librte_table.
> > >
> > > Solution is: as an exception, pass a 3-parameter hash function to cuckoo
> > hash
> > > flavor under the librte_table. Maybe this should be documented better.
> > This
> > > currently triggers a build warning with gcc 8, which is easy to fix, hence
> > > this trivial patch.
> > >
> > > Ideally, for every 3-parameter hash function, I would like to generate the
> > > corresponding 4-parameter hash function on-the-fly, but unfortunately this
> > is
> > > not what C language can do.
> > >
> > > Of course, IMO the best solution is to add key mask support to librte_hash.
> >
> >
> > Looking at the previous discussion I see the following as a possible solution;
> >
> > Given the current code looks broken it should be fixed in this release.
> 
> The code is not broken. This is not a bug, it is a limitation for that particular table type. The only gap that I see is adding a Doxygen
> comment in the API header file.
> 
> User explicitly picks the hash table type it wants; when using this particular hash table type, that pointer needs to point to a 3-parameter
> function instead of 4. Given the limitation is clearly documented in Doxygen (current gap that we can quickly address), I don't see any
> problem.
> 
> If people think that this function conversion is not nice, it can be reworked in multiple ways at the expense of API (but not ABI) change:
> 1. Define the hash function field in the table parameter structure as opaque void * rather than 4-parameter version.
> 2. Create a separate parameter structure just for this hash table type.

Why not just define your f_hash member as a union:

struct rte_table_hash_params {
	...
	union {
		rte_table_hash_op_hash f_hash_4_params;
		rte_hash_function f_hash_3_params;
	};
	...
};

?
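
For illustration, a rough sketch (the member names above are just placeholders) of how rte_table_hash_cuckoo_create() could then consume the 3-parameter member without any function-pointer cast, keeping the other fields as in the current code:

	/* Create cuckoo hash table (sketch only) */
	struct rte_hash_parameters hash_cuckoo_params = {
		.entries = p->n_keys,
		.key_len = p->key_size,
		.hash_func = p->f_hash_3_params,
		.hash_func_init_val = p->seed,
		.socket_id = socket_id,
		.name = p->name
	};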

> 
> > Given the actual code fix is an API / ABI break (depending on solution) it
> > cannot be merged official in this release.
> > We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at
> > compile time.
> 
> This is not new code introduced in this release cycle, this is just fixing the compiler warning, I fail to see how your ABI breakage mention is
> applicable.
> 
> Maybe we should talk more specifics over the code, where exactly in the code would you like to place your NEXT_ABI macro?
> 
> >
> > With the above 3 points, I think the best solution is to correctly fix the
> > problem that GCC 8 is identifying, and putting that new API inside the NEXT_
> > macros.
> >
> > In this case, we can preserve backwards (buggy) behavior if required, and
> > provide correct (but API/ABI breaking) code as well. This is a tough decision -
> > particularly for distros - what do they package?
> >
> > Given the current code, I don't see a better solution - but I hope I'm wrong :)

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 16:43  0%       ` Ferruh Yigit
@ 2018-04-09 17:05  0%         ` Dumitrescu, Cristian
  0 siblings, 0 replies; 200+ results
From: Dumitrescu, Cristian @ 2018-04-09 17:05 UTC (permalink / raw)
  To: Yigit, Ferruh, Van Haaren, Harry, Stephen Hemminger, Singh,
	Jasvinder, Richardson, Bruce
  Cc: dev



> -----Original Message-----
> From: Yigit, Ferruh
> Sent: Monday, April 9, 2018 5:43 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>; Dumitrescu, Cristian
> <cristian.dumitrescu@intel.com>; Stephen Hemminger
> <stephen@networkplumber.org>; Singh, Jasvinder
> <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> 
> On 4/9/2018 5:38 PM, Van Haaren, Harry wrote:
> >> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu,
> Cristian
> >> Sent: Monday, April 9, 2018 4:59 PM
> >> To: Stephen Hemminger <stephen@networkplumber.org>; Singh,
> Jasvinder
> >> <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> >> Cc: dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> >>
> >>
> >>
> >>> -----Original Message-----
> >>> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> >>> Sent: Monday, April 9, 2018 4:10 PM
> >>> To: Singh, Jasvinder <jasvinder.singh@intel.com>
> >>> Cc: dev@dpdk.org; Dumitrescu, Cristian
> <cristian.dumitrescu@intel.com>
> >>> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> >>>
> >>> On Mon,  9 Apr 2018 13:49:48 +0100
> >>> Jasvinder Singh <jasvinder.singh@intel.com> wrote:
> >>>
> >>>> Fix build error with gcc 8.0 due to cast between function types.
> >>>> Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
> >>>>
> >>>> Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> >>>> ---
> >>>>  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
> >>>>  1 file changed, 3 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
> >>> b/lib/librte_table/rte_table_hash_cuckoo.c
> >>>> index dcb4fe9..f7eae27 100644
> >>>> --- a/lib/librte_table/rte_table_hash_cuckoo.c
> >>>> +++ b/lib/librte_table/rte_table_hash_cuckoo.c
> >>>> @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void
> *params,
> >>>>  		return NULL;
> >>>>  	}
> >>>>
> >>>> +	void *hash_func = p->f_hash;
> >>>> +
> >>>>  	/* Create cuckoo hash table */
> >>>>  	struct rte_hash_parameters hash_cuckoo_params = {
> >>>>  		.entries = p->n_keys,
> >>>>  		.key_len = p->key_size,
> >>>> -		.hash_func = (rte_hash_function)(p->f_hash),
> >>>> +		.hash_func = (rte_hash_function) hash_func,
> >>>>  		.hash_func_init_val = p->seed,
> >>>>  		.socket_id = socket_id,
> >>>>  		.name = p->name
> >>>
> >>> This is just tricking the compiler into not complaining.
> >>> I would really rather see the two hash functions made the same.
> >>
> >> (Adding Bruce as well to consolidate all conversations in a single thread.)
> >>
> >> What we want to do here is be able to use the librte_hash under the
> same API
> >> as the several hash table flavors implemented in librte_table.
> >>
> >> Both of these libraries allow configuring the hash function per each hash
> >> table instance. Problem is: hash function in librte_hash has only 3
> parameters
> >> (no key mask), while hash function in librte_table has 4 parameters
> (includes
> >> key mask). The key mask helps a lot for practical protocol
> implementations by
> >> avoiding key copy & pre-process on lookup.
> >>
> >> So then: how to plug in librte_hash under the same API as the suite of
> hash
> >> tables in librte_table? We don't want to re-implement cuckoo hash from
> >> librte_hash, we simply want to invoke it as a low-level primitive, similarly
> >> to how the LPM and ACL tables are plugged into librte_table.
> >>
> >> Solution is: as an exception, pass a 3-parameter hash function to cuckoo
> hash
> >> flavor under the librte_table. Maybe this should be documented better.
> This
> >> currently triggers a build warning with gcc 8, which is easy to fix, hence
> >> this trivial patch.
> >>
> >> Ideally, for every 3-parameter hash function, I would like to generate the
> >> corresponding 4-parameter hash function on-the-fly, but unfortunately
> this is
> >> not what C language can do.
> >>
> >> Of course, IMO the best solution is to add key mask support to
> librte_hash.
> >
> >
> > Looking at the previous discussion I see the following as a possible
> solution;
> >
> > Given the current code looks broken it should be fixed in this release.
> > Given the actual code fix is an API / ABI break (depending on solution) it
> cannot be merged official in this release.
> > We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at
> compile time.
> >
> > With the above 3 points, I think the best solution is to correctly fix the
> problem that GCC 8 is identifying, and putting that new API inside the NEXT_
> macros.
> >
> > In this case, we can preserve backwards (buggy) behavior if required, and
> provide correct (but API/ABI breaking) code as well. This is a tough decision -
> particularly for distros - what do they package?
> 
> +1 to use RTE_NEXT_ABI and deliver fixed code, and agree this is kind of
> pushing
> decision to distros.
> 

Again, where is the bug, where exactly in the code do you want to put the RTE_NEXT_ABI macro, and what problem would using this macro fix?

As stated in the reply to Harry, this could be reworked in multiple ways if people think the function pointer conversion is misleading to the user.

> >
> > Given the current code, I don't see a better solution - but I hope I'm wrong
> :)
> >


^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 16:38  4%     ` Van Haaren, Harry
  2018-04-09 16:43  0%       ` Ferruh Yigit
@ 2018-04-09 17:02  4%       ` Dumitrescu, Cristian
  2018-04-09 17:09  0%         ` Ananyev, Konstantin
  2018-04-10 11:43  0%       ` Neil Horman
  2 siblings, 1 reply; 200+ results
From: Dumitrescu, Cristian @ 2018-04-09 17:02 UTC (permalink / raw)
  To: Van Haaren, Harry, Stephen Hemminger, Singh, Jasvinder,
	Richardson, Bruce
  Cc: dev



> -----Original Message-----
> From: Van Haaren, Harry
> Sent: Monday, April 9, 2018 5:38 PM
> To: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>; Stephen
> Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
> <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> Cc: dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> 
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu,
> Cristian
> > Sent: Monday, April 9, 2018 4:59 PM
> > To: Stephen Hemminger <stephen@networkplumber.org>; Singh,
> Jasvinder
> > <jasvinder.singh@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> >
> >
> >
> > > -----Original Message-----
> > > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > > Sent: Monday, April 9, 2018 4:10 PM
> > > To: Singh, Jasvinder <jasvinder.singh@intel.com>
> > > Cc: dev@dpdk.org; Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> > > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> > >
> > > On Mon,  9 Apr 2018 13:49:48 +0100
> > > Jasvinder Singh <jasvinder.singh@intel.com> wrote:
> > >
> > > > Fix build error with gcc 8.0 due to cast between function types.
> > > > Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
> > > >
> > > > Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > > > ---
> > > >  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
> > > >  1 file changed, 3 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
> > > b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > index dcb4fe9..f7eae27 100644
> > > > --- a/lib/librte_table/rte_table_hash_cuckoo.c
> > > > +++ b/lib/librte_table/rte_table_hash_cuckoo.c
> > > > @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void
> *params,
> > > >  		return NULL;
> > > >  	}
> > > >
> > > > +	void *hash_func = p->f_hash;
> > > > +
> > > >  	/* Create cuckoo hash table */
> > > >  	struct rte_hash_parameters hash_cuckoo_params = {
> > > >  		.entries = p->n_keys,
> > > >  		.key_len = p->key_size,
> > > > -		.hash_func = (rte_hash_function)(p->f_hash),
> > > > +		.hash_func = (rte_hash_function) hash_func,
> > > >  		.hash_func_init_val = p->seed,
> > > >  		.socket_id = socket_id,
> > > >  		.name = p->name
> > >
> > > This is just tricking the compiler into not complaining.
> > > I would really rather see the two hash functions made the same.
> >
> > (Adding Bruce as well to consolidate all conversations in a single thread.)
> >
> > What we want to do here is be able to use the librte_hash under the same
> API
> > as the several hash table flavors implemented in librte_table.
> >
> > Both of these libraries allow configuring the hash function per each hash
> > table instance. Problem is: hash function in librte_hash has only 3
> parameters
> > (no key mask), while hash function in librte_table has 4 parameters
> (includes
> > key mask). The key mask helps a lot for practical protocol implementations
> by
> > avoiding key copy & pre-process on lookup.
> >
> > So then: how to plug in librte_hash under the same API as the suite of
> hash
> > tables in librte_table? We don't want to re-implement cuckoo hash from
> > librte_hash, we simply want to invoke it as a low-level primitive, similarly
> > to how the LPM and ACL tables are plugged into librte_table.
> >
> > Solution is: as an exception, pass a 3-parameter hash function to cuckoo
> hash
> > flavor under the librte_table. Maybe this should be documented better.
> This
> > currently triggers a build warning with gcc 8, which is easy to fix, hence
> > this trivial patch.
> >
> > Ideally, for every 3-parameter hash function, I would like to generate the
> > corresponding 4-parameter hash function on-the-fly, but unfortunately this
> is
> > not what C language can do.
> >
> > Of course, IMO the best solution is to add key mask support to librte_hash.
> 
> 
> Looking at the previous discussion I see the following as a possible solution;
> 
> Given the current code looks broken it should be fixed in this release.

The code is not broken. This is not a bug; it is a limitation of that particular table type. The only gap that I see is a missing Doxygen comment in the API header file.

The user explicitly picks the hash table type it wants; when using this particular hash table type, that pointer needs to point to a 3-parameter function instead of a 4-parameter one. Given the limitation is clearly documented in Doxygen (a current gap that we can quickly address), I don't see any problem.

If people think that this function pointer conversion is not nice, it can be reworked in multiple ways at the expense of an API (but not ABI) change:
1. Define the hash function field in the table parameter structure as an opaque void * rather than the 4-parameter version (a rough sketch is shown below).
2. Create a separate parameter structure just for this hash table type.
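
For illustration only, here is what option 1 could look like, assuming the existing rte_table_hash_params fields (a sketch, not a concrete proposal):

struct rte_table_hash_params {
	...
	/* Opaque hash function pointer; each table flavor documents the
	 * expected prototype (3-parameter rte_hash_function for the cuckoo
	 * flavor, 4-parameter rte_table_hash_op_hash for the others). */
	void *f_hash;
	...
};

The cuckoo flavor would then convert it to rte_hash_function explicitly, which is essentially what the patch above already does via the intermediate void * variable.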

> Given the actual code fix is an API / ABI break (depending on solution) it
> cannot be merged official in this release.
> We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at
> compile time.

This is not new code introduced in this release cycle; this patch just silences the compiler warning. I fail to see how your ABI breakage concern applies here.

Maybe we should talk specifics over the code: where exactly would you like to place your NEXT_ABI macro?

> 
> With the above 3 points, I think the best solution is to correctly fix the
> problem that GCC 8 is identifying, and putting that new API inside the NEXT_
> macros.
> 
> In this case, we can preserve backwards (buggy) behavior if required, and
> provide correct (but API/ABI breaking) code as well. This is a tough decision -
> particularly for distros - what do they package?
> 
> Given the current code, I don't see a better solution - but I hope I'm wrong :)

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  2018-04-09 16:38  4%     ` Van Haaren, Harry
@ 2018-04-09 16:43  0%       ` Ferruh Yigit
  2018-04-09 17:05  0%         ` Dumitrescu, Cristian
  2018-04-09 17:02  4%       ` Dumitrescu, Cristian
  2018-04-10 11:43  0%       ` Neil Horman
  2 siblings, 1 reply; 200+ results
From: Ferruh Yigit @ 2018-04-09 16:43 UTC (permalink / raw)
  To: Van Haaren, Harry, Dumitrescu, Cristian, Stephen Hemminger,
	Singh, Jasvinder, Richardson, Bruce
  Cc: dev

On 4/9/2018 5:38 PM, Van Haaren, Harry wrote:
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu, Cristian
>> Sent: Monday, April 9, 2018 4:59 PM
>> To: Stephen Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
>> <jasvinder.singh@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
>> Cc: dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
>>
>>
>>
>>> -----Original Message-----
>>> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
>>> Sent: Monday, April 9, 2018 4:10 PM
>>> To: Singh, Jasvinder <jasvinder.singh@intel.com>
>>> Cc: dev@dpdk.org; Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
>>> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
>>>
>>> On Mon,  9 Apr 2018 13:49:48 +0100
>>> Jasvinder Singh <jasvinder.singh@intel.com> wrote:
>>>
>>>> Fix build error with gcc 8.0 due to cast between function types.
>>>> Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
>>>>
>>>> Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
>>>> ---
>>>>  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
>>>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
>>> b/lib/librte_table/rte_table_hash_cuckoo.c
>>>> index dcb4fe9..f7eae27 100644
>>>> --- a/lib/librte_table/rte_table_hash_cuckoo.c
>>>> +++ b/lib/librte_table/rte_table_hash_cuckoo.c
>>>> @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void *params,
>>>>  		return NULL;
>>>>  	}
>>>>
>>>> +	void *hash_func = p->f_hash;
>>>> +
>>>>  	/* Create cuckoo hash table */
>>>>  	struct rte_hash_parameters hash_cuckoo_params = {
>>>>  		.entries = p->n_keys,
>>>>  		.key_len = p->key_size,
>>>> -		.hash_func = (rte_hash_function)(p->f_hash),
>>>> +		.hash_func = (rte_hash_function) hash_func,
>>>>  		.hash_func_init_val = p->seed,
>>>>  		.socket_id = socket_id,
>>>>  		.name = p->name
>>>
>>> This is just tricking the compiler into not complaining.
>>> I would really rather see the two hash functions made the same.
>>
>> (Adding Bruce as well to consolidate all conversations in a single thread.)
>>
>> What we want to do here is be able to use the librte_hash under the same API
>> as the several hash table flavors implemented in librte_table.
>>
>> Both of these libraries allow configuring the hash function per each hash
>> table instance. Problem is: hash function in librte_hash has only 3 parameters
>> (no key mask), while hash function in librte_table has 4 parameters (includes
>> key mask). The key mask helps a lot for practical protocol implementations by
>> avoiding key copy & pre-process on lookup.
>>
>> So then: how to plug in librte_hash under the same API as the suite of hash
>> tables in librte_table? We don't want to re-implement cuckoo hash from
>> librte_hash, we simply want to invoke it as a low-level primitive, similarly
>> to how the LPM and ACL tables are plugged into librte_table.
>>
>> Solution is: as an exception, pass a 3-parameter hash function to cuckoo hash
>> flavor under the librte_table. Maybe this should be documented better. This
>> currently triggers a build warning with gcc 8, which is easy to fix, hence
>> this trivial patch.
>>
>> Ideally, for every 3-parameter hash function, I would like to generate the
>> corresponding 4-parameter hash function on-the-fly, but unfortunately this is
>> not what C language can do.
>>
>> Of course, IMO the best solution is to add key mask support to librte_hash.
> 
> 
> Looking at the previous discussion I see the following as a possible solution;
> 
> Given the current code looks broken it should be fixed in this release.
> Given the actual code fix is an API / ABI break (depending on solution) it cannot be merged official in this release.
> We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at compile time.
> 
> With the above 3 points, I think the best solution is to correctly fix the problem that GCC 8 is identifying, and putting that new API inside the NEXT_ macros.
> 
> In this case, we can preserve backwards (buggy) behavior if required, and provide correct (but API/ABI breaking) code as well. This is a tough decision - particularly for distros - what do they package?

+1 to using RTE_NEXT_ABI and delivering fixed code, and I agree this kind of pushes
the decision onto distros.
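
To illustrate the mechanism only (not a concrete proposal), RTE_NEXT_ABI would let the header carry both prototypes, e.g. for a hypothetical key-mask-aware rte_hash_function:

#ifdef RTE_NEXT_ABI
typedef uint32_t (*rte_hash_function)(const void *key, uint32_t key_len,
	const uint8_t *key_mask, uint32_t init_val);
#else
typedef uint32_t (*rte_hash_function)(const void *key, uint32_t key_len,
	uint32_t init_val);
#endif

Applications and distros then choose at build time which ABI they get.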

> 
> Given the current code, I don't see a better solution - but I hope I'm wrong :)
> 

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
  @ 2018-04-09 16:38  4%     ` Van Haaren, Harry
  2018-04-09 16:43  0%       ` Ferruh Yigit
                         ` (2 more replies)
  0 siblings, 3 replies; 200+ results
From: Van Haaren, Harry @ 2018-04-09 16:38 UTC (permalink / raw)
  To: Dumitrescu, Cristian, Stephen Hemminger, Singh, Jasvinder,
	Richardson, Bruce
  Cc: dev

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dumitrescu, Cristian
> Sent: Monday, April 9, 2018 4:59 PM
> To: Stephen Hemminger <stephen@networkplumber.org>; Singh, Jasvinder
> <jasvinder.singh@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> 
> 
> 
> > -----Original Message-----
> > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > Sent: Monday, April 9, 2018 4:10 PM
> > To: Singh, Jasvinder <jasvinder.singh@intel.com>
> > Cc: dev@dpdk.org; Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> > Subject: Re: [dpdk-dev] [PATCH] table: fix build error with gcc 8
> >
> > On Mon,  9 Apr 2018 13:49:48 +0100
> > Jasvinder Singh <jasvinder.singh@intel.com> wrote:
> >
> > > Fix build error with gcc 8.0 due to cast between function types.
> > > Fixes: 5a80bf0ae613 ("table: add cuckoo hash")
> > >
> > > Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > > ---
> > >  lib/librte_table/rte_table_hash_cuckoo.c | 4 +++-
> > >  1 file changed, 3 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/lib/librte_table/rte_table_hash_cuckoo.c
> > b/lib/librte_table/rte_table_hash_cuckoo.c
> > > index dcb4fe9..f7eae27 100644
> > > --- a/lib/librte_table/rte_table_hash_cuckoo.c
> > > +++ b/lib/librte_table/rte_table_hash_cuckoo.c
> > > @@ -103,11 +103,13 @@ rte_table_hash_cuckoo_create(void *params,
> > >  		return NULL;
> > >  	}
> > >
> > > +	void *hash_func = p->f_hash;
> > > +
> > >  	/* Create cuckoo hash table */
> > >  	struct rte_hash_parameters hash_cuckoo_params = {
> > >  		.entries = p->n_keys,
> > >  		.key_len = p->key_size,
> > > -		.hash_func = (rte_hash_function)(p->f_hash),
> > > +		.hash_func = (rte_hash_function) hash_func,
> > >  		.hash_func_init_val = p->seed,
> > >  		.socket_id = socket_id,
> > >  		.name = p->name
> >
> > This is just tricking the compiler into not complaining.
> > I would really rather see the two hash functions made the same.
> 
> (Adding Bruce as well to consolidate all conversations in a single thread.)
> 
> What we want to do here is be able to use the librte_hash under the same API
> as the several hash table flavors implemented in librte_table.
> 
> Both of these libraries allow configuring the hash function per each hash
> table instance. Problem is: hash function in librte_hash has only 3 parameters
> (no key mask), while hash function in librte_table has 4 parameters (includes
> key mask). The key mask helps a lot for practical protocol implementations by
> avoiding key copy & pre-process on lookup.
> 
> So then: how to plug in librte_hash under the same API as the suite of hash
> tables in librte_table? We don't want to re-implement cuckoo hash from
> librte_hash, we simply want to invoke it as a low-level primitive, similarly
> to how the LPM and ACL tables are plugged into librte_table.
> 
> Solution is: as an exception, pass a 3-parameter hash function to cuckoo hash
> flavor under the librte_table. Maybe this should be documented better. This
> currently triggers a build warning with gcc 8, which is easy to fix, hence
> this trivial patch.
> 
> Ideally, for every 3-parameter hash function, I would like to generate the
> corresponding 4-parameter hash function on-the-fly, but unfortunately this is
> not what C language can do.
> 
> Of course, IMO the best solution is to add key mask support to librte_hash.


Looking at the previous discussion I see the following as a possible solution:

Given that the current code looks broken, it should be fixed in this release.
Given that the actual code fix is an API / ABI break (depending on the solution), it cannot officially be merged in this release.
We have a NEXT_ABI macro - it allows us to break API/ABI conditionally at compile time.

With the above 3 points, I think the best solution is to correctly fix the problem that GCC 8 is identifying, and put that new API inside the NEXT_ macros.

In this case, we can preserve backwards (buggy) behavior if required, and provide correct (but API/ABI breaking) code as well. This is a tough decision - particularly for distros - what do they package?

Given the current code, I don't see a better solution - but I hope I'm wrong :)

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions
  2018-04-06 20:26  2%   ` Adrien Mazarguil
@ 2018-04-09 16:10  0%     ` Mohammad Abdul Awal
  2018-04-10 10:19  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Mohammad Abdul Awal @ 2018-04-09 16:10 UTC (permalink / raw)
  To: Adrien Mazarguil, Declan Doherty; +Cc: dev



On 06/04/2018 21:26, Adrien Mazarguil wrote:
> On Fri, Apr 06, 2018 at 01:24:01PM +0100, Declan Doherty wrote:
>> Add new flow action types and associated action data structures to
>> support the encapsulation and decapsulation of the virtual tunnel
>> endpoints.
>>
>> The RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP action will cause the matching
>> flow to be encapsulated in the virtual tunnel endpoint overlay
>> defined in the tunnel_encap action data.
>>
>> The RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP action will cause all virtual
>> tunnel endpoint overlays up to and including the first instance of
>> the flow item type defined in the tunnel_decap action data for the
>> matching flows.
>>
>> Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> This generic approach looks flexible enough to cover the use cases that
> immediately come to mind (VLAN, VXLAN), its design is sound.
>
> However, while I'm aware it's not a concern at this point, it won't be able
> to deal with stateful tunnel or encapsulation types (e.g. IPsec or TCP)
> which will require additional meta data or some run-time assistance from the
> application.
>
> Eventually for more complex use cases, dedicated encap/decap actions will
> have to appear, so the issue I wanted to raise before going further is this:
>
> Going generic inevitably trades some of the usability; flat structures
> dedicated to VXLAN encap/decap with only the needed info to get the job done
> would likely be easier to implement in PMDs and use in applications. Any
> number of such actions can be added to rte_flow without ABI impact.
>
> If VXLAN is the only use case at this point, my suggestion would be to go
> with simpler RTE_FLOW_ACTION_TYPE_VXLAN_(ENCAP|DECAP) actions, with fixed
> L2/L3/L4/L5 header definitions to prepend according to RFC 7348.
We can go this way, but it will keep adding new actions as more and more
tunneling protocols are added. The current proposal is already a generic
approach which specifies a tunnel for all the tunneling protocols.

> Now we can start with the generic approach, see how it fares and add
> dedicated encap/decap later as needed.
>
> More comments below.
>
>> ---
>>   doc/guides/prog_guide/rte_flow.rst | 77 ++++++++++++++++++++++++++++++++--
>>   lib/librte_ether/rte_flow.h        | 84 ++++++++++++++++++++++++++++++++++++--
>>   2 files changed, 155 insertions(+), 6 deletions(-)
>>
>> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
>> index fd33d19..106fb93 100644
>> --- a/doc/guides/prog_guide/rte_flow.rst
>> +++ b/doc/guides/prog_guide/rte_flow.rst
>> @@ -997,9 +997,11 @@ Actions
>>   
>>   Each possible action is represented by a type. Some have associated
>>   configuration structures. Several actions combined in a list can be assigned
>> -to a flow rule. That list is not ordered.
>> +to a flow rule. That list is not ordered, with the exception of  actions which
>> +modify the packet itself, these packet modification actions must be specified
>> +in the explicit order in which they are to be executed.
>>   
>> -They fall in three categories:
>> +They fall in four categories:
>>   
>>   - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
>>     processing matched packets by subsequent flow rules, unless overridden
>> @@ -1008,8 +1010,11 @@ They fall in three categories:
>>   - Non-terminating actions (PASSTHRU, DUP) that leave matched packets up for
>>     additional processing by subsequent flow rules.
>>   
>> +- Non-terminating meta actions that do not affect the fate of packets but result
>> +  in modification of the packet itself (SECURITY, TUNNEL_ENCAP, TUNNEL_DECAP).
>> +
>>   - Other non-terminating meta actions that do not affect the fate of packets
>> -  (END, VOID, MARK, FLAG, COUNT, SECURITY).
>> +  (END, VOID, MARK, FLAG, COUNT).
> The above changes are not necessary anymore [1][2].
>
> [1] "ethdev: clarify flow API pattern items and actions"
>      https://dpdk.org/ml/archives/dev/2018-April/095776.html
> [2] "ethdev: alter behavior of flow API actions"
>      https://dpdk.org/ml/archives/dev/2018-April/095779.html
OK, we can undo some changes here.
>
>>   When several actions are combined in a flow rule, they should all have
>>   different types (e.g. dropping a packet twice is not possible).
>> @@ -1486,6 +1491,72 @@ fields in the pattern items.
>>      | 1     | END      |
>>      +-------+----------+
>>   
>> +
> Nit: titles in this file are separated by a single empty line.
Fixed.
>
>> +Action: ``TUNNEL_ENCAP``
>> +^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +Performs an encapsulation action by encapsulating the flows matched by the
>> +pattern items according to the network overlay defined in the
>> +``rte_flow_action_tunnel_encap`` pattern items.
>> +
>> +This action modifies the payload of matched flows. The pattern items specified
>> +in the ``rte_flow_action_tunnel_encap`` action structure must defined a valid
>> +set of overlay headers, from the Ethernet header up to the overlay header. The
>> +pattern must be terminated with the RTE_FLOW_ITEM_TYPE_END item type.
> Regarding the use of a pattern list, if you consider PMDs are already
> iterating on a list of actions when encountering
> RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, it adds yet another inner loop.
We understand that this is an implementation detail. If we do not go for
another inner loop, all the bundling needs to be handled in the same
function, which seems clumsier to me. This also breaks the tunnel
endpoint concept.
>
> How about making each encountered RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP provide
> exactly one item instead (in encap, i.e. reverse order)?
Again, if we have a tunnel action, a security action and other actions, all
the processing and tracking needs to be done in one function. With that
approach we would also need ETH_ENCAP/DECAP, UDP_ENCAP/DECAP, NVGRE_ENCAP/DECAP, etc.
>
> In which case perhaps "GENERIC" would be a better fit than "TUNNEL".
>
>> +
>> +- Non-terminating by default.
> There's no such property anymore [2].
Removed.
>
>> +
>> +.. _table_rte_flow_action_tunnel_encap:
>> +
>> +.. table:: TUNNEL_ENCAP
>> +
>> +   +-------------+---------------------------------------------+
>> +   | Field       | Value                                       |
>> +   +=============+=============================================+
>> +   | ``pattern`` | Virtual tunnel end-point pattern definition |
>> +   +-------------+---------------------------------------------+
>> +
>> +
>> +.. _table_rte_flow_action_tunnel_encap_example:
>> +
>> +.. table:: IPv4 VxLAN flow pattern example.
> VxLAN => VXLAN
Fixed.
>
>> +
>> +   +-------+--------------------------+------------+
>> +   | Index | Flow Item Type           | Flow Item  |
>> +   +=======+==========================+============+
>> +   | 0     | RTE_FLOW_ITEM_TYPE_ETH   | eth item   |
>> +   +-------+--------------------------+------------+
>> +   | 1     | RTE_FLOW_ITEM_TYPE_IPV4  | ipv4 item  |
>> +   +-------+--------------------------+------------+
>> +   | 2     | RTE_FLOW_ITEM_TYPE_UDP   | udp item   |
>> +   +-------+--------------------------+------------+
>> +   | 3     | RTE_FLOW_ITEM_TYPE_VXLAN | vxlan item |
>> +   +-------+--------------------------+------------+
>> +   | 4     | RTE_FLOW_ITEM_TYPE_END   | NULL       |
>> +   +-------+--------------------------+------------+
> One possible issue is that it relies on objects normally found on the
> pattern side of flow rules. Those are supposed to match something, they are
> not intended for packet header generation. While their "spec" and "mask"
> fields might make sense in this context, the "last" field is odd.
>
> You must define them without leaving anything open for interpretation by
> PMDs and users alike. Defining things as "undefined" is fine as long as it's
> covered.
Please note that the "void *item" in the
"rte_flow_action_tunnel_encap.pattern" points to the data structure
defined for the corresponding rte_flow_item_type instead of to a
rte_flow_item structure. As an example, for an RTE_FLOW_ITEM_TYPE_ETH item,
the "void *item" will point to a "struct rte_flow_item_eth" instance.
That's why we have defined struct rte_flow_action_item inside struct
rte_flow_action_tunnel_encap, so the question of spec, mask and last no longer applies.
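
To make this concrete, here is a minimal sketch of how an application could fill the proposed action for the IPv4 VXLAN example above, based on the structure defined later in this patch (illustrative only, header values omitted):

	struct rte_flow_item_eth eth;     /* outer Ethernet header to prepend */
	struct rte_flow_item_ipv4 ipv4;   /* outer IPv4 header */
	struct rte_flow_item_udp udp;     /* outer UDP header (e.g. dst 4789) */
	struct rte_flow_item_vxlan vxlan; /* VNI */
	/* ...fill the above with the tunnel endpoint values... */

	struct rte_flow_action_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH, .item = &eth },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4, .item = &ipv4 },
		{ .type = RTE_FLOW_ITEM_TYPE_UDP, .item = &udp },
		{ .type = RTE_FLOW_ITEM_TYPE_VXLAN, .item = &vxlan },
		{ .type = RTE_FLOW_ITEM_TYPE_END, .item = NULL },
	};
	struct rte_flow_action_tunnel_encap encap = { .pattern = pattern };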

>
>> +
>> +
> Nit: only one empty line necessary here.
Fixed.
>
>> +Action: ``TUNNEL_DECAP``
>> +^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +Performs a decapsulation action by stripping all headers of the virtual tunnel
>> +end-point overlay up to the header defined by the flow item type of flows
>> +matched by the pattern items.
> Not necessarily, for instance if one guarantees that flowing traffic only
> consists of decap'able packets. You must avoid mandatory dependencies
> between patterns and actions since they are normally unrelated.
>
> What you can document on the other hand is that the behavior is undefined
> when processing traffic on which the action can't be applied. This is
> how RSS level is documented [3].
>
> [3] https://dpdk.org/ml/archives/dev/2018-April/095783.html
Fixed per your suggestion.
>
>> +
>> +This action modifies the payload of matched flows. The flow item type specified
>> +in the ``rte_flow_action_tunnel_decap`` action structure must defined a valid
>> +set of overlay header type.
>> +
>> +- Non-terminating by default.
> See [2].
Removed.
>
>> +
>> +.. _table_rte_flow_action_tunnel_decap:
>> +
>> +   +---------------+----------------------------------------------+
>> +   | Field         | Value                                        |
>> +   +===============+==============================================+
>> +   | ``item type`` | Item type of tunnel end-point to decapsulate |
>> +   +---------------+----------------------------------------------+
> "item type" should be the exact name used in the structure.
Fixed.
>
>> +
>>   Negative types
>>   ~~~~~~~~~~~~~~
>>   
>> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
>> index 7d1f89d..6d94423 100644
>> --- a/lib/librte_ether/rte_flow.h
>> +++ b/lib/librte_ether/rte_flow.h
>> @@ -854,14 +854,17 @@ struct rte_flow_item {
>>   	const void *mask; /**< Bit-mask applied to spec and last. */
>>   };
>>   
>> +
> Unnecessary empty line.
Fixed.
>
>>   /**
>>    * Action types.
>>    *
>>    * Each possible action is represented by a type. Some have associated
>>    * configuration structures. Several actions combined in a list can be
>> - * affected to a flow rule. That list is not ordered.
>> + * affected to a flow rule. That list is not ordered, with the exception of
>> + * actions which modify the packet itself, these packet modification actions
>> + * must be specified in the explicit order in which they are to be executed.
>>    *
>> - * They fall in three categories:
>> + * They fall in four categories:
>>    *
>>    * - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
>>    *   processing matched packets by subsequent flow rules, unless overridden
>> @@ -870,6 +873,10 @@ struct rte_flow_item {
>>    * - Non terminating actions (PASSTHRU, DUP) that leave matched packets up
>>    *   for additional processing by subsequent flow rules.
>>    *
>> + * - Non terminating meta actions that do not affect the fate of
>> + *   packets but result in modification of the packet itself (SECURITY,
>> + *   TUNNEL_ENCAP, TUNNEL_DECAP).
>> + *
> Same comment as above [1][2].
Will be revised and undone.
>
>>    * - Other non terminating meta actions that do not affect the fate of
>>    *   packets (END, VOID, MARK, FLAG, COUNT).
>>    *
>> @@ -1022,7 +1029,42 @@ enum rte_flow_action_type {
>>   	 *
>>   	 * See struct rte_flow_action_group_count.
>>   	 */
>> -	RTE_FLOW_ACTION_TYPE_GROUP_COUNT
>> +	RTE_FLOW_ACTION_TYPE_GROUP_COUNT,
> An empty line would have been needed here (if we agree about no more
> GROUP_COUNT.)
Fixed.
>
>> +	/**
>> +	 * Encapsulate flow with tunnel defined in
>> +	 * rte_flow_action_tunnel_encap structure.
>> +	 *
>> +	 * See struct rte_flow_action_tunnel_encap.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP,
>> +
>> +	/**
>> +	 * Decapsulate all the headers of the tunnel
>> +	 *
>> +	 * See struct rte_flow_action_tunnel_decap.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP,
>> +
>> +	/**
>> +	 * Redirects packets to the logical group of the current device.
>> +	 *
>> +	 * In a logical hierarchy of groups, which can be used to represent a
>> +	 * physical of logical chaining of flow tables, this action allows the
>> +	 * terminating action to be a logical group of the same device.
>> +	 *
>> +	 * See struct rte_flow_action_group.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_GROUP,
>> +
>> +	/**
>> +	 * [META]
>> +	 *
>> +	 * Set specific metadata field associated with packet which is then
>> +	 * available to further pipeline stages.
>> +	 *
>> +	 * See struct rte_flow_action_metadata.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_METADATA
> These two actions should be part of the next patch, I won't comment them
> here.
It was my mistake. I will fix them.
>
>>   };
>>   
>>   /**
>> @@ -1173,6 +1215,42 @@ struct rte_flow_action_group_count {
>>   };
>>   
>>   /**
>> + * RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP
>> + *
>> + * Virtual tunnel end-point encapsulation action data.
>> + *
>> + * Non-terminating action by default.
> See [2].
Fixed.
>
>> + */
>> +struct rte_flow_action_tunnel_encap {
>> +	struct rte_flow_action_item {
>> +		enum rte_flow_item_type type;
>> +		/**< Flow item type. */
>> +		const void *item;
>> +		/**< Flow item definition which points to the data of
>> +		 * corresponding rte_flow_item_type.
>> +		 */
> I see it's a new action type, albeit a bit confusing (there is no
> RTE_FLOW_ACTION_TYPE_ITEM).
>
> I suggest the standard pattern item type since you're going with enum
> rte_flow_item_type anyway. Keep in mind you need some kind of mask to tell
> what fields are relevant. An application might otherwise want to encap with
> unsupported properties (e.g. specific IPv4 ToS field and whatnot).
>
> How about a single "struct rte_flow_pattern_item item", neither const and
> neither a pointer. It's generic enough, enclosed spec/last/mask pointers
> take care of the specifics. You just need to define what's supposed to
> happen when "last" is set.
Please see the comment above regarding this field.

>
>> +	} *pattern;
>> +	/**<
>> +	 * Tunnel pattern specification (list terminated by the END pattern
>> +	 * item).
>> +	 */
> As previously suggested, how about a single item per encap?
Please see the comment above regarding this field.
>
>> +};
>> +
>> +/**
>> + * RTE_FLOW_ACTION_TYP_TUNNEL_DECAP
>> + *
>> + * Virtual tunnel end-point decapsulation action data.
>> + *
>> + * Non-terminating action by default.
>> + */
>> +struct rte_flow_action_tunnel_decap {
>> +	enum rte_flow_item_type type;
>> +	/**<
>> +	 * Flow item type of virtual tunnel end-point to be decapsulated
>> +	 */
>> +};
> Note that contrary to ENCAP, DECAP wouldn't necessarily need repeated
> actions to peel each layer off. The current definition is fine.
To clarify, for the decap it is up to the PMD to remove all the headers for a
specified type. For example, for

rte_flow_item_type type=RTE_FLOW_ITEM_TYPE_VXLAN, the PMD will peel off the (ETH, IPV4, UDP, VXLAN) headers altogether.
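
For example, a minimal sketch based on the structure in this patch (the queue action is only there to complete the list and is assumed to be defined elsewhere):

	/* Strip everything up to and including the VXLAN header. */
	struct rte_flow_action_tunnel_decap decap = {
		.type = RTE_FLOW_ITEM_TYPE_VXLAN,
	};
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP, .conf = &decap },
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};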

>
>> +
>> +/**
>>    * Definition of a single action.
>>    *
>>    * A list of actions is terminated by a END action.
>> -- 
>> 2.7.4
>>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 1/4] ethdev: add group counter support to rte_flow
  2018-04-09 14:22  0%     ` Mohammad Abdul Awal
@ 2018-04-09 15:23  0%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-09 15:23 UTC (permalink / raw)
  To: Mohammad Abdul Awal; +Cc: Declan Doherty, dev

Hi Mohammad,

On Mon, Apr 09, 2018 at 03:22:45PM +0100, Mohammad Abdul Awal wrote:
> Hi Adrien,
> 
> 
> On 06/04/2018 21:26, Adrien Mazarguil wrote:
> > On Fri, Apr 06, 2018 at 01:24:00PM +0100, Declan Doherty wrote:
> > > Add new RTE_FLOW_ACTION_TYPE_GROUP_COUNT action type to enable shared
> > > counters across multiple flows on a single port or across multiple
> > > flows on multiple ports within the same switch domain.
> > > 
> > > Introduce new API rte_flow_query_group_count to allow querying of group
> > > counters.
> > > 
> > > Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> > Both features are definitely needed, however I suggest to enhance the
> > existing action type and query function instead, given the rte_flow ABI
> > won't be maintained for the 18.05 release [1].
> > 
> > Counters and query support were defined as a kind of PoC in preparation for
> > future requirements back in DPDK 17.02 and so far few PMDs have implemented
> > the query callback (mlx5 and failsafe, and the latter isn't really a PMD).
> > 
> > Due to the behavior change of action lists [2], providing an action type as
> > a query parameter is not specific enough anymore, for instance if a list
> > contains multiple COUNT, the application should be able to tell which needs
> > to be queried.
> > 
> > Therefore I suggest to redefine the query function as follows:
> > 
> >   int
> >   rte_flow_query(uint16_t port_id,
> >                  struct rte_flow *flow,
> >                  const struct rte_flow_action *action,
> >                  void *data,
> >                  struct rte_flow_error *error);
> > 
> > Third argument is an action definition with the same configuration (if any)
> > as previously defined in the action list originally used to create the flow
> > rule (not necessarily the same pointer, only the contents matter).
> > 
> > It means two perfectly identical actions can't be distinguished, and that's
> > how group counters will work.
> > Instead of adding a new action type to distinguish groups, a configuration
> > structure is added to the existing RTE_FLOW_ACTION_TYPE_COUNT, with
> > non-shared counters as a default behavior:
> > 
> >   struct rte_flow_action_count {
> >           uint32_t shared:1; /**< Share counter ID with other flow rules. */
> >           uint32_t reserved:31; /**< Reserved, must be zero. */
> >           uint32_t id; /**< Counter ID. */
> >   };
> > 
> > Doing so will impact some existing code in mlx5 and librte_flow_classify,
> > but that shouldn't be much of an issue.
> > 
> > Keep in mind testpmd and its documentation must be updated as well.
> > 
> > Thoughts?
> Please correct me if I am wrong but I think we are talking two different
> things here.
> If I understood you correctly, you are proposing to pass a list of actions
> (instead if a action type) in the third parameter to perform multiple
> actions in the same query call. Lets take an example for 100 ingress flows.
> So, if we want to query the counter for all the flows, we can get them by a
> single query providing a list (of size 100) of action_count in the 3rd
> param.

Whoa no! I'm only suggesting a pointer to a single action as a replacement
for the basic action type, not a *list* of actions. I hope this addresses
all concerns :)

The fact that the action in question would refer to a shared counter (see struct
above) with a given ID would be enough to make all counters with the same ID
refer to the same internal PMD object.
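
For instance, a rough usage sketch with the structure and prototype proposed above (port_id, flow and error are assumed to be defined by the application):

	/* Any flow rule whose COUNT action uses ID 42 shares this counter. */
	struct rte_flow_action_count count = { .shared = 1, .id = 42 };
	struct rte_flow_action action = {
		.type = RTE_FLOW_ACTION_TYPE_COUNT,
		.conf = &count,
	};
	/* ...create one or more flow rules whose action lists contain it... */

	struct rte_flow_query_count stats = { .reset = 0 };
	rte_flow_query(port_id, flow, &action, &stats, &error);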

> On the other hand, we are saying that all the flows are belongs to same
> tunnel end-point (we are talking only 1 TEP here), then the PMD will be
> responsible to increment the counter of TEP for matching all the flows (100
> flows). So, using one group query by passing one action_count in 3rd param,
> we can get the count of the TEP.
> 
> This case is generic enough for sure for simple flows but may not be
> suitable for tunnel cases, as application needs to track the counters for
> all the flows, and needs to build the list of action each time the flows
> added/deleted.

I think we're on the same page. I'm only suggesting to define a
configuration structure for the COUNT action (which currently doesn't take
any) as a replacement for GROUP_COUNT, and tweak the query callback to be
able to tell a specific counter to query instead of adding a new query
callback and leaving the existing one broken by design.

> > A few nits below for the sake of commenting.
> > 
> > [1] "Flow API overhaul for switch offloads"
> >      http://dpdk.org/ml/archives/dev/2018-April/095774.html
> > [2] "ethdev: alter behavior of flow API actions"
> >      http://dpdk.org/ml/archives/dev/2018-April/095779.html

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 14/15] ethdev: add physical port action to flow API
  2018-04-07  9:51  0%     ` Andrew Rybchenko
@ 2018-04-09 15:00  0%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-09 15:00 UTC (permalink / raw)
  To: Andrew Rybchenko; +Cc: Thomas Monjalon, Ferruh Yigit, dev, Zhang, Qi Z

On Sat, Apr 07, 2018 at 12:51:40PM +0300, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> > This patch adds the missing action counterpart to the PHY_PORT pattern
> > item, that is, the ability to directly inject matching traffic into a
> > physical port of the underlying device.
> 
> Does it mean that if it is applied on ingress (incoming packet from network)
> it will simply send packets back to network (specified physical port)?

Precisely.

> And if it is applied on egress (outgoing from device to network) it will
> be directed to possibly different physical port and sent to network.

Right. Note it gives applications the ability to express that wish; whether
PMDs support it is another matter :)

In any case, this action is added for API completeness but should be rarely
necessary since we chose to go with port representors.

Port representors will expose valid DPDK port IDs, therefore applications
will simply have to create ingress/egress flow rules on the right DPDK port
targeting different port IDs through the PORT_ID action.
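
As an illustration, assuming the PORT_ID action configuration introduced earlier in this series exposes the target DPDK port in an "id" field, redirecting matched traffic to a peer port would look roughly like:

	struct rte_flow_action_port_id dest = { .id = peer_port_id };
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &dest },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};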

> > It breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_copy()
> > - rte_flow_create()
> > - rte_flow_query()
> > - rte_flow_validate()
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
> > ---
> >   app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
> >   app/test-pmd/config.c                       |  1 +
> >   doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
> >   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
> >   lib/librte_ether/rte_flow.c                 |  1 +
> >   lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
> >   6 files changed, 84 insertions(+)
> 
> <...>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 12/15] ethdev: update behavior of VF/PF in flow API
  2018-04-07  9:41  0%     ` Andrew Rybchenko
@ 2018-04-09 14:49  0%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-09 14:49 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Ajit Khaparde, Somnath Kotur,
	Beilei Xing, Qi Zhang

On Sat, Apr 07, 2018 at 12:41:17PM +0300, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> > Contrary to all other pattern items, these are inconsistently documented as
> > affecting traffic instead of simply matching its origin, without provision
> > for the latter.
> > 
> > This commit clarifies documentation and updates PMDs since the original
> > behavior now has to be explicitly requested using the new transfer
> > attribute.
> > 
> > It breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_create()
> > - rte_flow_validate()
> > 
> > Impacted PMDs are bnxt and i40e, for which the VF pattern item is now only
> > supported when a transfer attribute is also present.
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> > Cc: Somnath Kotur <somnath.kotur@broadcom.com>
> > Cc: Beilei Xing <beilei.xing@intel.com>
> > Cc: Qi Zhang <qi.z.zhang@intel.com>
> > ---
> >   app/test-pmd/cmdline_flow.c                 | 12 +++---
> >   doc/guides/prog_guide/rte_flow.rst          | 36 +++++++++---------
> >   doc/guides/testpmd_app_ug/testpmd_funcs.rst | 12 +++---
> >   drivers/net/bnxt/bnxt_filter.c              | 22 ++++++-----
> >   drivers/net/i40e/i40e_flow.c                | 23 +++++++-----
> >   lib/librte_ether/rte_flow.h                 | 47 ++++++++++--------------
> >   6 files changed, 77 insertions(+), 75 deletions(-)
> 
> <...>
> 
> > diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> > index 735ce6323..beedc713b 100644
> > --- a/doc/guides/prog_guide/rte_flow.rst
> > +++ b/doc/guides/prog_guide/rte_flow.rst
> > @@ -518,15 +518,12 @@ Usage example, matching non-TCPv4 packets only:
> >   Item: ``PF``
> >   ^^^^^^^^^^^^
> > -Matches packets addressed to the physical function of the device.
> > +Matches traffic originating from (ingress) or going to (egress) the physical
> > +function of the current device.
> 
> Not sure that I understand above. It looks like ingress and egress are
> misplaced.
> There many similar cases below.

In this API, "ingress" and "egress" are always defined as relative to the
application creating the flow rule. In that sense they are respectively
synonyms for "from" and "to".

I agree they are not properly defined in this document; in fact, ingress and
egress were clarified (with diagrams and all) in the RFC submitted prior to
this patch [1].

I will update the "Attribute: Traffic direction" section in my next update.

[1] See "Traffic direction" in
    http://dpdk.org/ml/archives/dev/2018-March/092513.html

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in flow API
  2018-04-07  9:05  0%     ` Andrew Rybchenko
@ 2018-04-09 14:42  0%       ` Adrien Mazarguil
  2018-04-11 13:21  0%         ` Andrew Rybchenko
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-09 14:42 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Xueming Li, Wenzhuo Lu,
	Jingjing Wu, Beilei Xing, Qi Zhang, Konstantin Ananyev,
	Nelio Laranjeiro, Yongseok Koh, Pascal Mazon, Radu Nicolau,
	Akhil Goyal, Ivan Malov

On Sat, Apr 07, 2018 at 12:05:51PM +0300, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> > Since its inception, the rte_flow RSS action has been relying in part on
> > external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
> > This structure lacks parameters such as the hash algorithm to use, and more
> > recently, a method to tell which layer RSS should be performed on [1].
> > 
> > Given struct rte_eth_rss_conf will never be flexible enough to represent a
> > complete RSS configuration (e.g. RETA table), this patch supersedes it by
> > extending the rte_flow RSS action directly.
> > 
> > A subsequent patch will add a field to use a non-default RSS hash
> > algorithm. To that end, a field named "types" replaces the field formerly
> > known as "rss_hf" and standing for "RSS hash functions" as it was
> > confusing. Actual RSS hash function types are defined by enum
> > rte_eth_hash_function.
> > This patch updates all PMDs and example applications accordingly.
> > 
> > It breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_copy()
> > - rte_flow_create()
> > - rte_flow_query()
> > - rte_flow_validate()
> > 
> > [1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
> >      configuration")
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: Xueming Li <xuemingl@mellanox.com>
> > Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> > Cc: Thomas Monjalon <thomas@monjalon.net>
> > Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > Cc: Jingjing Wu <jingjing.wu@intel.com>
> > Cc: Beilei Xing <beilei.xing@intel.com>
> > Cc: Qi Zhang <qi.z.zhang@intel.com>
> > Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > Cc: Yongseok Koh <yskoh@mellanox.com>
> > Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> > Cc: Pascal Mazon <pascal.mazon@6wind.com>
> > Cc: Radu Nicolau <radu.nicolau@intel.com>
> > Cc: Akhil Goyal <akhil.goyal@nxp.com>
> > ---
> >   app/test-pmd/cmdline_flow.c        |  59 +++++-----
> >   app/test-pmd/config.c              |  39 +++----
> >   doc/guides/prog_guide/rte_flow.rst |  22 ++--
> >   drivers/net/e1000/e1000_ethdev.h   |  13 ++-
> >   drivers/net/e1000/igb_ethdev.c     |   4 +-
> >   drivers/net/e1000/igb_flow.c       |  31 ++---
> >   drivers/net/e1000/igb_rxtx.c       |  51 +++++++--
> >   drivers/net/i40e/i40e_ethdev.c     |  53 +++++++--
> >   drivers/net/i40e/i40e_ethdev.h     |  15 ++-
> >   drivers/net/i40e/i40e_flow.c       |  47 ++++----
> >   drivers/net/ixgbe/ixgbe_ethdev.c   |   4 +-
> >   drivers/net/ixgbe/ixgbe_ethdev.h   |  13 ++-
> >   drivers/net/ixgbe/ixgbe_flow.c     |  30 ++---
> >   drivers/net/ixgbe/ixgbe_rxtx.c     |  51 +++++++--
> >   drivers/net/mlx4/mlx4.c            |   2 +-
> >   drivers/net/mlx4/mlx4_flow.c       |  61 +++++-----
> >   drivers/net/mlx4/mlx4_flow.h       |   2 +-
> >   drivers/net/mlx4/mlx4_rxq.c        |   2 +-
> >   drivers/net/mlx4/mlx4_rxtx.h       |   2 +-
> >   drivers/net/mlx5/mlx5_flow.c       | 193 +++++++++++++++-----------------
> >   drivers/net/mlx5/mlx5_rxq.c        |  22 ++--
> >   drivers/net/mlx5/mlx5_rxtx.h       |  26 +++--
> >   drivers/net/sfc/sfc_flow.c         |  21 ++--
> >   drivers/net/tap/tap_flow.c         |   8 +-
> >   examples/ipsec-secgw/ipsec.c       |  10 +-
> >   lib/librte_ether/rte_flow.c        |  39 +++----
> >   lib/librte_ether/rte_flow.h        |   6 +-
> >   27 files changed, 473 insertions(+), 353 deletions(-)
> 
> <...>
> 
> > diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> > index 056405515..1a2c0299c 100644
> > --- a/drivers/net/sfc/sfc_flow.c
> > +++ b/drivers/net/sfc/sfc_flow.c
> > @@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
> >   	struct sfc_rxq *rxq;
> >   	unsigned int rxq_hw_index_min;
> >   	unsigned int rxq_hw_index_max;
> > -	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
> > -	uint64_t rss_hf;
> > -	uint8_t *rss_key = NULL;
> > +	const uint8_t *rss_key;
> >   	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
> >   	unsigned int i;
> > -	if (rss->num == 0)
> > +	if (rss->queue_num == 0)
> >   		return -EINVAL;
> >   	rxq_sw_index = sa->rxq_count - 1;
> > @@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
> >   	rxq_hw_index_min = rxq->hw_index;
> >   	rxq_hw_index_max = 0;
> > -	for (i = 0; i < rss->num; ++i) {
> > +	for (i = 0; i < rss->queue_num; ++i) {
> >   		rxq_sw_index = rss->queue[i];
> >   		if (rxq_sw_index >= sa->rxq_count)
> > @@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
> >   			rxq_hw_index_max = rxq->hw_index;
> >   	}
> > -	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;
> 
> Here we had a fallback to default rss_hf (now types) if rss_conf is
> unspecified.

Thing is, rss_action->conf was never supposed to be NULL in the first
place. Crashing on a NULL configuration has always been fine, but until
recently prevented validation with testpmd's broken implementation. This
problem was addressed in a prior series [1][2][3].

Since a value is now always provided, no need for a fallback.

[1] "app/testpmd: fix lack of flow action configuration"
    http://dpdk.org/ml/archives/dev/2018-April/095280.html
[2] "app/testpmd: fix RSS flow action configuration"
    http://dpdk.org/ml/archives/dev/2018-April/095281.html
[3] "app/testpmd: fix missing RSS fields in flow action"
    http://dpdk.org/ml/archives/dev/2018-April/095282.html

> > -	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
> > +	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
> >   		return -EINVAL;
> > -	if (rss_conf != NULL) {
> > -		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
> > +	if (rss->key_len) {
> > +		if (rss->key_len != sizeof(sa->rss_key))
> >   			return -EINVAL;
> > -		rss_key = rss_conf->rss_key;
> > +		rss_key = rss->key;
> >   	} else {
> >   		rss_key = sa->rss_key;
> >   	}
> > @@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
> >   	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
> >   	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
> > -	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
> > +	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
> 
> Now types go directly to the mapping function and unspecified types (0)
> will result in 0 rss_hash_types. Of course, it is a question how to treat
> types==0. It is possible to say that it means no RSS, but that does not
> make sense. So, the real options are device defaults (regardless of what is
> configured at the device level) or the device config
> (rx_adv.conf.rss_conf.rss_hf). I would prefer the latter.
> Please document the intended behaviour in rte_flow.rst.

Granted the existing documentation doesn't say much on that topic, but a 0
value for rss_hf does actually mean "no RSS" [4]:

 "The *rss_hf* field of the *rss_conf* structure indicates the different
  types of IPv4/IPv6 packets to which the RSS hashing must be applied.
  Supplying an *rss_hf* equal to zero disables the RSS feature."

Now, since this action doesn't use struct rte_eth_rss_conf anymore, we could
define 0 as PMD-specific behavior, which could mean no RSS. It would make
the API easier to use for applications that don't care about the RSS
capabilities of each underlying adapter; 0 would simply work everywhere as a
safe default.

[4] https://dpdk.org/doc/api/structrte__eth__rss__conf.html
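
To illustrate (just a sketch, not part of the series, using the field names
from this series), an application that does not care about per-device RSS
capabilities could then statically provide something like:

  static const uint16_t queues[] = { 0, 1, 2, 3 };
  static const struct rte_flow_action_rss rss = {
          .types = 0,               /* 0 => PMD-specific safe default */
          .key_len = 0,             /* 0 => PMD default RSS key */
          .queue_num = RTE_DIM(queues),
          .queue = queues,          /* spread traffic over these queues */
  };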

> If the latter is chosen, the code above will have a bug since it falls back
> to a fixed default.
> Just use sa->rss_hash_types as the fallback. Something like:
> if (rss->types)
>     sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
> else
>     sfc_rss_conf->rss_hash_types = sa->rss_hash_types;

Looks like the previous code didn't provide a fallback when rss_hf was 0,
only when rss_conf itself was NULL. So this is not a new issue introduced by
this patch.

I will update the documentation to define 0 as described above for the
convenience of application writers and leave the existing code in place.
PMD maintainers will be free to enhance it as they wish later.
Just remember that testpmd now always provides a default value for it after
querying the device [2].

> >   	rte_memcpy(sfc_rss_conf->rss_key, rss_key, sizeof(sa->rss_key));
> >   	for (i = 0; i < RTE_DIM(sfc_rss_conf->rss_tbl); ++i) {
> > -		unsigned int rxq_sw_index = rss->queue[i % rss->num];
> > +		unsigned int rxq_sw_index = rss->queue[i % rss->queue_num];
> >   		struct sfc_rxq *rxq = sa->rxq_info[rxq_sw_index].rxq;
> >   		sfc_rss_conf->rss_tbl[i] = rxq->hw_index - rxq_hw_index_min;
> 
> <...>

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 10/15] ethdev: refine TPID handling in flow API
  2018-04-06 17:11  0%     ` Andrew Rybchenko
@ 2018-04-09 14:42  0%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-09 14:42 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Wenzhuo Lu, Jingjing Wu,
	Ajit Khaparde, Somnath Kotur, John Daley, Hyong Youb Kim,
	Beilei Xing, Qi Zhang, Konstantin Ananyev, Nelio Laranjeiro,
	Yongseok Koh, Tomasz Duszynski, Dmitri Epshtein,
	Natalie Samsonov, Jianbo Liu, Pascal Mazon

On Fri, Apr 06, 2018 at 08:11:38PM +0300, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> > TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
> > consistent with the normal stacking order of pattern items, which is
> > confusing to applications.
> > 
> > Problem is that when followed by one of these layers, the EtherType field
> > of the preceding layer keeps its "inner" definition, and the "outer" TPID
> > is provided by the subsequent layer, the reverse of how a packet looks like
> > on the wire:
> > 
> >   Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
> >   rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]
> > 
> > Worse, when QinQ is involved, the stacking order of VLAN layers is
> > unspecified. It is unclear whether it should be reversed (innermost to
> > outermost) as well given TPID applies to the previous layer:
> > 
> >   Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
> >   rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
> >   rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]
> > 
> > While specifying EtherType/TPID is hopefully rarely necessary, the stacking
> > order in case of QinQ and the lack of documentation remain an issue.
> > 
> > This patch replaces TPID in the VLAN pattern item with an inner
> > EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
> > clarifies documentation and updates all relevant code.
> > 
> > It breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_copy()
> > - rte_flow_create()
> > - rte_flow_query()
> > - rte_flow_validate()
> > 
> > Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
> > items:
> > 
> > - bnxt: EtherType matching is supported, and vlan->inner_type overrides
> >    eth->type if the latter has standard TPID value 0x8100, otherwise an
> >    error is triggered.
> > 
> > - e1000: EtherType matching is only supported with the ETHERTYPE filter,
> >    which does not support VLAN matching, therefore no impact.
> > 
> > - enic: same as bnxt.
> > 
> > - i40e: same as bnxt with a configurable TPID value for the FDIR filter,
> >    with existing limitations on allowed EtherType values. The remaining
> >    filter types (VXLAN, NVGRE, QINQ) do not support EtherType matching.
> > 
> > - ixgbe: same as e1000, with additional minor change to rely on the new
> >    E-Tag macro definition.
> > 
> > - mlx4: EtherType/TPID matching is not supported, no impact.
> > 
> > - mlx5: same as bnxt.
> > 
> > - mrvl: EtherType matching is supported but eth->type cannot be specified
> >    when a VLAN item is present. However vlan->inner_type is used if
> >    specified.
> > 
> > - sfc: same as bnxt with QinQ TPID value 0x88a8 additionally supported.
> > 
> > - tap: same as bnxt.
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> > Cc: Thomas Monjalon <thomas@monjalon.net>
> > Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > Cc: Jingjing Wu <jingjing.wu@intel.com>
> > Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> > Cc: Somnath Kotur <somnath.kotur@broadcom.com>
> > Cc: John Daley <johndale@cisco.com>
> > Cc: Hyong Youb Kim <hyonkim@cisco.com>
> > Cc: Beilei Xing <beilei.xing@intel.com>
> > Cc: Qi Zhang <qi.z.zhang@intel.com>
> > Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > Cc: Yongseok Koh <yskoh@mellanox.com>
> > Cc: Tomasz Duszynski <tdu@semihalf.com>
> > Cc: Dmitri Epshtein <dima@marvell.com>
> > Cc: Natalie Samsonov <nsamsono@marvell.com>
> > Cc: Jianbo Liu <jianbo.liu@arm.com>
> > Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> > Cc: Pascal Mazon <pascal.mazon@6wind.com>
> > 
> > ---
> > 
> > Hi PMD maintainers, while I'm pretty confident in these changes, I could
> > not validate them with all devices.
> > 
> > It would be great if you could apply this patch, run testpmd, create VLAN
> > flow rules with/without inner EtherType as described and send matching
> > traffic while making sure nothing was broken in the process.
> > 
> > Thanks!
> > ---
> >   app/test-pmd/cmdline_flow.c                 | 17 +++---
> >   doc/guides/nics/tap.rst                     |  2 +-
> >   doc/guides/prog_guide/rte_flow.rst          | 21 ++++++--
> >   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
> >   drivers/net/bnxt/bnxt_filter.c              | 39 +++++++++++---
> >   drivers/net/enic/enic_flow.c                | 22 +++++---
> >   drivers/net/i40e/i40e_flow.c                | 69 +++++++++++++++++++-----
> >   drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
> >   drivers/net/mlx5/mlx5_flow.c                | 16 +++++-
> >   drivers/net/mvpp2/mrvl_flow.c               | 27 +++++++---
> >   drivers/net/sfc/sfc_flow.c                  | 28 ++++++++++
> >   drivers/net/tap/tap_flow.c                  | 16 ++++--
> >   lib/librte_ether/rte_flow.h                 | 24 ++++++---
> >   lib/librte_net/rte_ether.h                  |  1 +
> >   14 files changed, 229 insertions(+), 60 deletions(-)
> 
> <...>
> 
> > diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> > index bc4974edf..f61d4ec92 100644
> > --- a/drivers/net/sfc/sfc_flow.c
> > +++ b/drivers/net/sfc/sfc_flow.c
> > @@ -7,6 +7,7 @@
> >    * for Solarflare) and Solarflare Communications, Inc.
> >    */
> > +#include <rte_byteorder.h>
> >   #include <rte_tailq.h>
> >   #include <rte_common.h>
> >   #include <rte_ethdev_driver.h>
> > @@ -351,6 +352,7 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
> >   	const struct rte_flow_item_vlan *mask = NULL;
> >   	const struct rte_flow_item_vlan supp_mask = {
> >   		.tci = rte_cpu_to_be_16(ETH_VLAN_ID_MAX),
> > +		.inner_type = RTE_BE16(0xffff),
> >   	};
> >   	rc = sfc_flow_parse_init(item,
> > @@ -393,6 +395,32 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
> >   		return -rte_errno;
> >   	}
> > +	/*
> > +	 * If an EtherType was already specified, make sure it is a valid
> > +	 * TPID for the current VLAN layer before overwriting it with the
> > +	 * specified inner type.
> > +	 */
> > +	if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE &&
> > +	    efx_spec->efs_ether_type != RTE_BE16(ETHER_TYPE_VLAN) &&
> > +	    efx_spec->efs_ether_type != RTE_BE16(ETHER_TYPE_QINQ)) {
> 
> 1. efs_ether_type is host-endian

Whoops, looks like I only half-fixed that endian issue in v2.

> 2. HW recognizes more TPIDs (0x9100, 0x9200, 0x9300) as VLAN.
> 3. However, if some TPID is specified, a user may expect that only VLAN
>     packets with the specified TPID match. That is a false expectation since
>     the information is not passed to HW to match (and there is no way to
>     match it). So, it is safer to deny TPID specification (i.e. keep the
>     first condition only). From the flexibility point of view it is possible
>     to allow any value, but it should then be documented that an exact match
>     is not checked in fact.

Thanks for pointing this out. I've decided to update all PMDs to disallow
TPID matching because many devices support multiple concurrent TPIDs and
there's no way to match a given one explicitly. This should make the patch
simpler as well.
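
In other words, the sfc hunk above would boil down to something like this
(a sketch only, keeping just the first condition as you suggest):

  /* Outer EtherType/TPID cannot be enforced by HW, reject it outright. */
  if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE) {
          rte_flow_error_set(error, EINVAL,
                             RTE_FLOW_ERROR_TYPE_ITEM, item,
                             "TPID matching is not supported");
          return -rte_errno;
  }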

> > +		rte_flow_error_set(error, EINVAL,
> > +				   RTE_FLOW_ERROR_TYPE_ITEM, item,
> > +				   "Unsupported outer TPID");
> > +		return -rte_errno;
> > +	}
> > +	if (!mask->inner_type) {
> > +		efx_spec->efs_match_flags &= ~EFX_FILTER_MATCH_ETHER_TYPE;
> > +		efx_spec->efs_ether_type = RTE_BE16(0x0000);
> 
> Nothing should be done here if above is done.
> 
> > +	} else if (mask->inner_type == supp_mask.inner_type) {
> > +		efx_spec->efs_match_flags |= EFX_FILTER_MATCH_ETHER_TYPE;
> > +		efx_spec->efs_ether_type = rte_bswap16(spec->inner_type);
> > +	} else {
> > +		rte_flow_error_set(error, EINVAL,
> > +				   RTE_FLOW_ERROR_TYPE_ITEM, item,
> > +				   "Bad mask for VLAN inner_type");
> > +		return -rte_errno;
> > +	}
> > +
> >   	return 0;
> >   }
> 
> <...>
> 
> > diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> > index fc7e6705d..b13b0e2e6 100644
> > --- a/lib/librte_ether/rte_flow.h
> > +++ b/lib/librte_ether/rte_flow.h
> > @@ -475,19 +481,20 @@ static const struct rte_flow_item_eth rte_flow_item_eth_mask = {
> >    *
> >    * Matches an 802.1Q/ad VLAN tag.
> >    *
> > - * This type normally follows either RTE_FLOW_ITEM_TYPE_ETH or
> > - * RTE_FLOW_ITEM_TYPE_VLAN.
> > + * The corresponding standard outer EtherType (TPID) values are
> > + * ETHER_TYPE_VLAN or ETHER_TYPE_QINQ. It can be overridden by the preceding
> > + * pattern item.
> >    */
> >   struct rte_flow_item_vlan {
> > -	rte_be16_t tpid; /**< Tag protocol identifier. */
> >   	rte_be16_t tci; /**< Tag control information. */
> > +	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
> >   };
> >   /** Default mask for RTE_FLOW_ITEM_TYPE_VLAN. */
> >   #ifndef __cplusplus
> >   static const struct rte_flow_item_vlan rte_flow_item_vlan_mask = {
> > -	.tpid = RTE_BE16(0x0000),
> > -	.tci = RTE_BE16(0xffff),
> > +	.tci = RTE_BE16(0x0fff),
> 
> It looks like an unrelated change.

Yep, it should have been in a separate patch. I'll split it in my next
update. Thanks for reviewing.

> 
> > +	.inner_type = RTE_BE16(0x0000),
> >   };
> >   #endif
> 
> <...>
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 08/15] ethdev: add hash function to RSS flow API action
  2018-04-06 15:41  0%     ` Andrew Rybchenko
@ 2018-04-09 14:41  0%       ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-09 14:41 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Thomas Monjalon, Ferruh Yigit, dev, Wenzhuo Lu, Jingjing Wu,
	Beilei Xing, Qi Zhang, Konstantin Ananyev, Nelio Laranjeiro,
	Yongseok Koh, Pascal Mazon

On Fri, Apr 06, 2018 at 06:41:35PM +0300, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> > By definition, RSS involves some kind of hash algorithm, usually Toeplitz.
> > 
> > Until now it could not be modified on a flow rule basis and PMDs had to
> > always assume RTE_ETH_HASH_FUNCTION_DEFAULT, which remains the default
> > behavior when unspecified (0).
> > 
> > This breaks ABI compatibility for the following public functions:
> > 
> > - rte_flow_copy()
> > - rte_flow_create()
> > - rte_flow_query()
> > - rte_flow_validate()
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> > Cc: Thomas Monjalon <thomas@monjalon.net>
> > Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > Cc: Jingjing Wu <jingjing.wu@intel.com>
> > Cc: Beilei Xing <beilei.xing@intel.com>
> > Cc: Qi Zhang <qi.z.zhang@intel.com>
> > Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > Cc: Yongseok Koh <yskoh@mellanox.com>
> > Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> > Cc: Pascal Mazon <pascal.mazon@6wind.com>
> > ---
> >   app/test-pmd/cmdline_flow.c                 | 72 ++++++++++++++++++++++++
> >   app/test-pmd/config.c                       |  1 +
> >   doc/guides/prog_guide/rte_flow.rst          |  2 +
> >   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  3 +
> >   drivers/net/e1000/igb_flow.c                |  4 ++
> >   drivers/net/e1000/igb_rxtx.c                |  4 +-
> >   drivers/net/i40e/i40e_ethdev.c              |  4 +-
> >   drivers/net/i40e/i40e_flow.c                |  4 ++
> >   drivers/net/ixgbe/ixgbe_flow.c              |  4 ++
> >   drivers/net/ixgbe/ixgbe_rxtx.c              |  4 +-
> >   drivers/net/mlx4/mlx4_flow.c                |  7 +++
> >   drivers/net/mlx5/mlx5_flow.c                | 13 +++++
> >   drivers/net/sfc/sfc_flow.c                  |  3 +
> >   drivers/net/tap/tap_flow.c                  |  6 ++
> >   lib/librte_ether/rte_flow.c                 |  1 +
> >   lib/librte_ether/rte_flow.h                 |  2 +
> >   16 files changed, 131 insertions(+), 3 deletions(-)
> 
> <...>
> 
> > diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> > index 1a2c0299c..dbe4c2baa 100644
> > --- a/drivers/net/sfc/sfc_flow.c
> > +++ b/drivers/net/sfc/sfc_flow.c
> > @@ -1261,6 +1261,9 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
> >   			rxq_hw_index_max = rxq->hw_index;
> >   	}
> > +	if (rss->func)
> 
> Maybe it is better to compare with RTE_ETH_HASH_FUNCTION_DEFAULT
> explicitly? I think it is more readable. If so, the same applies to all
> similar checks in the patch.

Good suggestion. Although RTE_ETH_HASH_FUNCTION_DEFAULT can't be anything
other than 0 for various reasons, I'll clarify (most of) the code in my next
update.
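
Concretely, checks like "if (rss->func)" would become explicit comparisons,
e.g. (sketch):

  if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
          return -EINVAL;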

> In the case of sfc, please, allow RTE_ETH_HASH_FUNCTION_TOEPLITZ as well.
> I'd suggest:
> switch (rss->func) {
> case RTE_ETH_HASH_FUNCTION_DEFAULT:
> case RTE_ETH_HASH_FUNCTION_TOEPLITZ:
>       break;
> default:
>       return -EINVAL;
> }

I'll add it, thanks.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 1/4] ethdev: add group counter support to rte_flow
  2018-04-06 20:26  3%   ` Adrien Mazarguil
@ 2018-04-09 14:22  0%     ` Mohammad Abdul Awal
  2018-04-09 15:23  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Mohammad Abdul Awal @ 2018-04-09 14:22 UTC (permalink / raw)
  To: Adrien Mazarguil, Declan Doherty; +Cc: dev

Hi Adrien,


On 06/04/2018 21:26, Adrien Mazarguil wrote:
> On Fri, Apr 06, 2018 at 01:24:00PM +0100, Declan Doherty wrote:
>> Add new RTE_FLOW_ACTION_TYPE_GROUP_COUNT action type to enable shared
>> counters across multiple flows on a single port or across multiple
>> flows on multiple ports within the same switch domain.
>>
>> Introduce new API rte_flow_query_group_count to allow querying of group
>> counters.
>>
>> Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> Both features are definitely needed, however I suggest to enhance the
> existing action type and query function instead, given the rte_flow ABI
> won't be maintained for the 18.05 release [1].
>
> Counters and query support were defined as a kind of PoC in preparation for
> future requirements back in DPDK 17.02 and so far few PMDs have implemented
> the query callback (mlx5 and failsafe, and the latter isn't really a PMD).
>
> Due to the behavior change of action lists [2], providing an action type as
> a query parameter is not specific enough anymore; for instance, if a list
> contains multiple COUNT actions, the application should be able to tell
> which one needs to be queried.
>
> Therefore I suggest to redefine the query function as follows:
>
>   int
>   rte_flow_query(uint16_t port_id,
>                  struct rte_flow *flow,
>                  const struct rte_flow_action *action,
>                  void *data,
>                  struct rte_flow_error *error);
>
> Third argument is an action definition with the same configuration (if any)
> as previously defined in the action list originally used to create the flow
> rule (not necessarily the same pointer, only the contents matter).
>
> It means two perfectly identical actions can't be distinguished, and that's
> how group counters will work.
> Instead of adding a new action type to distinguish groups, a configuration
> structure is added to the existing RTE_FLOW_ACTION_TYPE_COUNT, with
> non-shared counters as a default behavior:
>
>   struct rte_flow_action_count {
>           uint32_t shared:1; /**< Share counter ID with other flow rules. */
>           uint32_t reserved:31; /**< Reserved, must be zero. */
>           uint32_t id; /**< Counter ID. */
>   };
>
> Doing so will impact some existing code in mlx5 and librte_flow_classify,
> but that shouldn't be much of an issue.
>
> Keep in mind testpmd and its documentation must be updated as well.
>
> Thoughts?
Please correct me if I am wrong, but I think we are talking about two
different things here.
If I understood you correctly, you are proposing to pass a list of
actions (instead of an action type) in the third parameter to perform
multiple actions in the same query call. Let's take an example of 100
ingress flows: if we want to query the counter for all the flows, we can
get the values with a single query providing a list (of size 100) of
action_count in the 3rd param.

On the other hand, we are saying that all the flows belong to the same
tunnel end-point (we are talking about only 1 TEP here), so the PMD will
be responsible for incrementing the TEP counter when matching any of
those flows (100 flows). So, using one group query and passing a single
action_count in the 3rd param, we can get the count of the TEP.

This is certainly generic enough for simple flows, but it may not be
suitable for tunnel cases, as the application needs to track the
counters for all the flows and needs to rebuild the list of actions each
time flows are added/deleted.
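
For illustration (a sketch of how I understand your proposal, with TEP_ID,
attr, pattern[], queue_conf, flow[], i and err as placeholders), the TEP
case above would become:

  /* One shared counter attached to every flow rule of the TEP. */
  struct rte_flow_action_count tep_count = { .shared = 1, .id = TEP_ID };
  struct rte_flow_action actions[] = {
          { .type = RTE_FLOW_ACTION_TYPE_COUNT, .conf = &tep_count },
          { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue_conf },
          { .type = RTE_FLOW_ACTION_TYPE_END },
  };

  for (i = 0; i < 100; i++)
          flow[i] = rte_flow_create(port_id, &attr, pattern[i], actions, &err);

  /* A single query on any of the rules returns the aggregate TEP count. */
  struct rte_flow_query_count counts = { .hits_set = 1, .bytes_set = 1 };
  rte_flow_query(port_id, flow[0], &actions[0], &counts, &err);

Is that the intended usage?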

>
> A few nits below for the sake of commenting.
>
> [1] "Flow API overhaul for switch offloads"
>      http://dpdk.org/ml/archives/dev/2018-April/095774.html
> [2] "ethdev: alter behavior of flow API actions"
>      http://dpdk.org/ml/archives/dev/2018-April/095779.html
>
>> ---
>>   doc/guides/prog_guide/rte_flow.rst      | 35 +++++++++++++++++++++
>>   lib/librte_ether/rte_ethdev_version.map |  8 +++++
>>   lib/librte_ether/rte_flow.c             | 21 +++++++++++++
>>   lib/librte_ether/rte_flow.h             | 56 ++++++++++++++++++++++++++++++++-
>>   lib/librte_ether/rte_flow_driver.h      |  6 ++++
>>   5 files changed, 125 insertions(+), 1 deletion(-)
>>
>> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
>> index 961943d..fd33d19 100644
>> --- a/doc/guides/prog_guide/rte_flow.rst
>> +++ b/doc/guides/prog_guide/rte_flow.rst
>> @@ -1698,6 +1698,41 @@ Return values:
>>   
>>   - 0 on success, a negative errno value otherwise and ``rte_errno`` is set.
>>   
>> +
> Unnecessary empty line.
I will take into account all the comments.

Regards,
Awal.

>
>> +Group Count Query
>> +~~~~~~~~~~~~~~~~~
>> +
>> +Query group counter which can be associated with multiple flows on a specified
>> +port.
>> +
>> +This function allows retrieving of group counters. A group counter is a
>> +counter which can be shared among multiple flows on a single port or among
>> +multiple flows on multiple ports within the same switch domain. Data is
>> +gathered by special actions which must be present in the flow rule
>> +definition.
>> +
>> +.. code-block:: c
>> +
>> +   int
>> +   rte_flow_query_group_count(uint16_t port_id,
>> +			   uint32_t group_counter_id,
>> +			   struct rte_flow_query_count *count,
>> +               struct rte_flow_error *error);
>> +
>> +Arguments:
>> +
>> +- ``port_id``: port identifier of Ethernet device.
>> +- ``group_counter_id``: group counter identifier.
>> +- ``count``: group counter parameters.
>> +- ``error``: perform verbose error reporting if not NULL. PMDs initialize
>> +  this structure in case of error only.
>> +
>> +Return values:
>> +
>> +- 0 on success, a negative errno value otherwise and ``rte_errno`` is set.
>> +
>> +
>> +
> More unnecessary empty lines.
>
>>   Isolated mode
>>   -------------
>>   
>> diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
>> index 34df6c8..cff6807 100644
>> --- a/lib/librte_ether/rte_ethdev_version.map
>> +++ b/lib/librte_ether/rte_ethdev_version.map
>> @@ -229,3 +229,11 @@ EXPERIMENTAL {
>>   	rte_mtr_stats_update;
>>   
>>   } DPDK_17.11;
>> +
>> +
> One more.
>
>> +EXPERIMENTAL {
>> +	global:
>> +
>> +	rte_flow_query_group_count
>> +
>> +} DPDK_18.05;
>> diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
>> index 38f2d27..e10b1d0 100644
>> --- a/lib/librte_ether/rte_flow.c
>> +++ b/lib/librte_ether/rte_flow.c
>> @@ -418,3 +418,24 @@ rte_flow_copy(struct rte_flow_desc *desc, size_t len,
>>   	}
>>   	return 0;
>>   }
>> +
>> +int __rte_experimental
>> +rte_flow_query_group_count(uint16_t port_id,
>> +	uint32_t group_count_id,
>> +	struct rte_flow_query_count *count,
>> +	struct rte_flow_error *error)
> This function lacks a short documentation comment (see rte_flow_query()).
>
>> +{
>> +	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
>> +	const struct rte_flow_ops *ops = rte_flow_ops_get(port_id, error);
>> +
>> +	if (!ops)
>> +		return -rte_errno;
>> +	if (likely(!!ops->query_group_count))
>> +		return flow_err(port_id,
>> +				ops->query_group_count(dev, group_count_id,
>> +						       count, error),
>> +				error);
>> +	return rte_flow_error_set(error, ENOSYS,
>> +				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
>> +				  NULL, rte_strerror(ENOSYS));
>> +}
>> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
>> index 13e4202..7d1f89d 100644
>> --- a/lib/librte_ether/rte_flow.h
>> +++ b/lib/librte_ether/rte_flow.h
>> @@ -1010,7 +1010,19 @@ enum rte_flow_action_type {
>>   	 *
>>   	 * See struct rte_flow_action_security.
>>   	 */
>> -	RTE_FLOW_ACTION_TYPE_SECURITY
>> +	RTE_FLOW_ACTION_TYPE_SECURITY,
>> +
>> +	/**
>> +	 * Enable a shared flow group counter for flow. Group counters can be
>> +	 * associated with multiples flows on the same port or on port within
>> +	 * the same switch domain if supported by that device.
>> +	 *
>> +	 * Group counters can be retrieved and reset through
>> +	 * rte_flow_query_group_count()
>> +	 *
>> +	 * See struct rte_flow_action_group_count.
>> +	 */
>> +	RTE_FLOW_ACTION_TYPE_GROUP_COUNT
> Don't forget the trailing comma.
>
>>   };
>>   
>>   /**
>> @@ -1149,6 +1161,18 @@ struct rte_flow_action_security {
>>   };
>>   
>>   /**
>> + * RTE_FLOW_ACTION_TYPE_GROUP_COUNT
>> + *
>> + * A packet/byte counter which can be shared across a group of flows programmed
>> + * on the same port/switch domain.
>> + *
>> + * Non-terminating by default.
>> + */
>> +struct rte_flow_action_group_count {
>> +	uint32_t id;
>> +};
>> +
>> +/**
>>    * Definition of a single action.
>>    *
>>    * A list of actions is terminated by a END action.
>> @@ -1476,6 +1500,36 @@ rte_flow_copy(struct rte_flow_desc *fd, size_t len,
>>   	      const struct rte_flow_item *items,
>>   	      const struct rte_flow_action *actions);
>>   
>> +
> Caught another empty line.
>
>> +/**
>> + * Get hit/bytes count for group counter.
>> + *
>> + * A group counter is a counter which can be shared among multiple flows on a
>> + * single port or among multiple flows on multiple ports within the same
>> + * switch domain.
>> + *
>> + * In the case of ports within the same switch domain a global name space is
>> + * assumed for group_count_id value.
>> + *
>> + * @param[in]	port_id
>> + *   Port identifier of Ethernet device.
>> + * @param[in]	group_count_id
>> + *   Group counter identifier to query
>> + * @param[out]	count
>> + *   Group counter value
>> + * @param[out]	error
>> + *   Perform verbose error reporting if not NULL. PMDs initialize this
>> + *   structure in case of error only.
>> + *
>> + * @return
>> + *   Negative error code (errno value) and rte_errno is set.
>> + */
>> +int __rte_experimental
>> +rte_flow_query_group_count(uint16_t port_id,
>> +			   uint32_t group_count_id,
>> +			   struct rte_flow_query_count *count,
>> +			   struct rte_flow_error *error);
>> +
>>   #ifdef __cplusplus
>>   }
>>   #endif
>> diff --git a/lib/librte_ether/rte_flow_driver.h b/lib/librte_ether/rte_flow_driver.h
>> index 7778c8e..ef09465 100644
>> --- a/lib/librte_ether/rte_flow_driver.h
>> +++ b/lib/librte_ether/rte_flow_driver.h
>> @@ -96,6 +96,12 @@ struct rte_flow_ops {
>>   		(struct rte_eth_dev *,
>>   		 int,
>>   		 struct rte_flow_error *);
>> +	/** See rte_flow_query_group_count(). */
>> +	int (*query_group_count)
>> +		(struct rte_eth_dev *,
>> +		 uint32_t,
>> +		 struct rte_flow_query_count *,
>> +		 struct rte_flow_error *);
>>   };
>>   
>>   /**
>> -- 
>> 2.7.4
>>

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v1] doc:  add SPDX Licence to doc files
@ 2018-04-09 13:11  5% Marko Kovacevic
  0 siblings, 0 replies; 200+ results
From: Marko Kovacevic @ 2018-04-09 13:11 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, john.mcnamara, hemant.agrawal, Marko Kovacevic

Added SPDX headers to doc files to have them aligned with
the other doc files.

Signed-off-by: Marko Kovacevic <marko.kovacevic@intel.com>
---
 doc/guides/contributing/cheatsheet.rst           | 3 +++
 doc/guides/contributing/coding_style.rst         | 3 +++
 doc/guides/contributing/design.rst               | 3 +++
 doc/guides/contributing/documentation.rst        | 3 +++
 doc/guides/contributing/img/patch_cheatsheet.svg | 3 ++-
 doc/guides/contributing/index.rst                | 3 +++
 doc/guides/contributing/patches.rst              | 3 +++
 doc/guides/contributing/stable.rst               | 3 +++
 doc/guides/contributing/versioning.rst           | 3 +++
 doc/guides/linux_gsg/nic_perf_intel_platform.rst | 3 +++
 doc/guides/rel_notes/deprecation.rst             | 3 +++
 doc/guides/rel_notes/release_16_04.rst           | 3 +++
 doc/guides/rel_notes/release_16_07.rst           | 3 +++
 doc/guides/rel_notes/release_16_11.rst           | 3 +++
 doc/guides/rel_notes/release_17_02.rst           | 3 +++
 doc/guides/rel_notes/release_17_05.rst           | 3 +++
 doc/guides/rel_notes/release_17_08.rst           | 3 +++
 doc/guides/rel_notes/release_17_11.rst           | 3 +++
 doc/guides/rel_notes/release_18_02.rst           | 3 +++
 doc/guides/rel_notes/release_2_2.rst             | 3 +++
 20 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/doc/guides/contributing/cheatsheet.rst b/doc/guides/contributing/cheatsheet.rst
index 7bc0771..97512d7 100644
--- a/doc/guides/contributing/cheatsheet.rst
+++ b/doc/guides/contributing/cheatsheet.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 Patch Cheatsheet
 ================
 
diff --git a/doc/guides/contributing/coding_style.rst b/doc/guides/contributing/coding_style.rst
index b0f0adb..e285fc8 100644
--- a/doc/guides/contributing/coding_style.rst
+++ b/doc/guides/contributing/coding_style.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 .. _coding_style:
 
 DPDK Coding Style
diff --git a/doc/guides/contributing/design.rst b/doc/guides/contributing/design.rst
index 88d3a43..bbe219e 100644
--- a/doc/guides/contributing/design.rst
+++ b/doc/guides/contributing/design.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 Design
 ======
 
diff --git a/doc/guides/contributing/documentation.rst b/doc/guides/contributing/documentation.rst
index 82f2e1b..7ac1b41 100644
--- a/doc/guides/contributing/documentation.rst
+++ b/doc/guides/contributing/documentation.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 .. _doc_guidelines:
 
 DPDK Documentation Guidelines
diff --git a/doc/guides/contributing/img/patch_cheatsheet.svg b/doc/guides/contributing/img/patch_cheatsheet.svg
index 8522592..af2473d 100644
--- a/doc/guides/contributing/img/patch_cheatsheet.svg
+++ b/doc/guides/contributing/img/patch_cheatsheet.svg
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<!-- SPDX-License-Identifier: BSD-3-Clause -->
+<!-- Copyright(c) 2018 Intel Corporation -->
 
 <svg
    xmlns:dc="http://purl.org/dc/elements/1.1/"
diff --git a/doc/guides/contributing/index.rst b/doc/guides/contributing/index.rst
index 329b678..9fa0076 100644
--- a/doc/guides/contributing/index.rst
+++ b/doc/guides/contributing/index.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 Contributor's Guidelines
 ========================
 
diff --git a/doc/guides/contributing/patches.rst b/doc/guides/contributing/patches.rst
index 2287835..494cdf4 100644
--- a/doc/guides/contributing/patches.rst
+++ b/doc/guides/contributing/patches.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 .. submitting_patches:
 
 Contributing Code to DPDK
diff --git a/doc/guides/contributing/stable.rst b/doc/guides/contributing/stable.rst
index 0f2f1f3..16518b0 100644
--- a/doc/guides/contributing/stable.rst
+++ b/doc/guides/contributing/stable.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 .. stable_lts_releases:
 
 DPDK Stable Releases and Long Term Support
diff --git a/doc/guides/contributing/versioning.rst b/doc/guides/contributing/versioning.rst
index c495294d..6e2f073 100644
--- a/doc/guides/contributing/versioning.rst
+++ b/doc/guides/contributing/versioning.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 Managing ABI updates
 ====================
 
diff --git a/doc/guides/linux_gsg/nic_perf_intel_platform.rst b/doc/guides/linux_gsg/nic_perf_intel_platform.rst
index 987cd0a..7bd05b6 100644
--- a/doc/guides/linux_gsg/nic_perf_intel_platform.rst
+++ b/doc/guides/linux_gsg/nic_perf_intel_platform.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 How to get best performance with NICs on Intel platforms
 ========================================================
 
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ec70b5f..a8bd787 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 ABI and API Deprecation
 =======================
 
diff --git a/doc/guides/rel_notes/release_16_04.rst b/doc/guides/rel_notes/release_16_04.rst
index d0a09ef..072b294 100644
--- a/doc/guides/rel_notes/release_16_04.rst
+++ b/doc/guides/rel_notes/release_16_04.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 16.04
 ==================
 
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index a8a3fc1..e8aea7a 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 16.07
 ==================
 
diff --git a/doc/guides/rel_notes/release_16_11.rst b/doc/guides/rel_notes/release_16_11.rst
index 8c9ec65..551e1d5 100644
--- a/doc/guides/rel_notes/release_16_11.rst
+++ b/doc/guides/rel_notes/release_16_11.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 16.11
 ==================
 
diff --git a/doc/guides/rel_notes/release_17_02.rst b/doc/guides/rel_notes/release_17_02.rst
index 357965a..f6c1b91 100644
--- a/doc/guides/rel_notes/release_17_02.rst
+++ b/doc/guides/rel_notes/release_17_02.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 17.02
 ==================
 
diff --git a/doc/guides/rel_notes/release_17_05.rst b/doc/guides/rel_notes/release_17_05.rst
index 6892284..9550cd5 100644
--- a/doc/guides/rel_notes/release_17_05.rst
+++ b/doc/guides/rel_notes/release_17_05.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 17.05
 ==================
 
diff --git a/doc/guides/rel_notes/release_17_08.rst b/doc/guides/rel_notes/release_17_08.rst
index 0bcdfb7..f2cf434 100644
--- a/doc/guides/rel_notes/release_17_08.rst
+++ b/doc/guides/rel_notes/release_17_08.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 17.08
 ==================
 
diff --git a/doc/guides/rel_notes/release_17_11.rst b/doc/guides/rel_notes/release_17_11.rst
index c2e3fc3..d0e2c6f 100644
--- a/doc/guides/rel_notes/release_17_11.rst
+++ b/doc/guides/rel_notes/release_17_11.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 17.11
 ==================
 
diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 44b7de5..b2b207a 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 18.02
 ==================
 
diff --git a/doc/guides/rel_notes/release_2_2.rst b/doc/guides/rel_notes/release_2_2.rst
index bb7d15a..8baccfb 100644
--- a/doc/guides/rel_notes/release_2_2.rst
+++ b/doc/guides/rel_notes/release_2_2.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright(c) 2018 Intel Corporation
+
 DPDK Release 2.2
 ================
 
-- 
2.9.5

^ permalink raw reply	[relevance 5%]

* [dpdk-dev] [PATCH v6] ethdev: replace bus specific struct with generic dev
  2018-04-05 16:40  2%   ` [dpdk-dev] [PATCH v5] " Ferruh Yigit
@ 2018-04-09 12:09  2%     ` Ferruh Yigit
  0 siblings, 0 replies; 200+ results
From: Ferruh Yigit @ 2018-04-09 12:09 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, Ferruh Yigit, Shreyansh Jain, Allain Legacy,
	Tomasz Duszynski, Santosh Shukla, David Marchand

Public struct rte_eth_dev_info has a "struct rte_pci_device" field in it
even though this structure is common to ethdevs on all buses.

Replace the PCI-specific struct with the generic device struct and update
the places that used the PCI device so that they retrieve this information
from the generic device.

Signed-off-by: Ferruh Yigit <ferruh.yigit@intel.com>
Reviewed-by: David Marchand <david.marchand@6wind.com>
Acked-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
---
Cc: Shreyansh Jain <shreyansh.jain@nxp.com>
Cc: Allain Legacy <allain.legacy@windriver.com>
Cc: Tomasz Duszynski <tdu@semihalf.com>
Cc: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Cc: David Marchand <david.marchand@6wind.com>

v2:
- prevent possible crash while getting bus (Pablo)
- Remove unnecessary __rte_unused
- Some PMD info_dev->device was assigned to NULL, fixed them

v3:
- rebased on latest next-net

v4:
- Move dev_info->device assignment to ethdev layer

v5:
- Document API change in related section in release notes

v6:
- Rebase on latest next-net, ip_pipeline updated
- Update axgbe too
---
 app/test-pmd/config.c                   | 18 +++++++++++++++-
 app/test-pmd/testpmd.h                  | 38 +++++++++++++++++++++++++++------
 doc/guides/rel_notes/release_18_05.rst  |  3 +++
 drivers/net/ark/ark_ethdev.c            |  1 -
 drivers/net/avf/avf_ethdev.c            |  1 -
 drivers/net/avp/avp_ethdev.c            |  1 -
 drivers/net/axgbe/axgbe_ethdev.c        |  4 +---
 drivers/net/bnx2x/bnx2x_ethdev.c        |  1 -
 drivers/net/bnxt/bnxt_ethdev.c          |  2 --
 drivers/net/cxgbe/cxgbe_ethdev.c        |  2 --
 drivers/net/e1000/em_ethdev.c           |  1 -
 drivers/net/e1000/igb_ethdev.c          |  2 --
 drivers/net/ena/ena_ethdev.c            |  2 --
 drivers/net/enic/enic_ethdev.c          |  1 -
 drivers/net/fm10k/fm10k_ethdev.c        |  1 -
 drivers/net/i40e/i40e_ethdev.c          |  1 -
 drivers/net/i40e/i40e_ethdev_vf.c       |  1 -
 drivers/net/ixgbe/ixgbe_ethdev.c        |  2 --
 drivers/net/kni/rte_eth_kni.c           |  1 -
 drivers/net/liquidio/lio_ethdev.c       |  2 --
 drivers/net/mlx4/mlx4_ethdev.c          |  1 -
 drivers/net/mlx5/mlx5_ethdev.c          |  1 -
 drivers/net/nfp/nfp_net.c               |  1 -
 drivers/net/octeontx/octeontx_ethdev.c  |  1 -
 drivers/net/qede/qede_ethdev.c          |  1 -
 drivers/net/sfc/sfc_ethdev.c            |  1 -
 drivers/net/szedata2/rte_eth_szedata2.c |  1 -
 drivers/net/tap/rte_eth_tap.c           |  1 -
 drivers/net/thunderx/nicvf_ethdev.c     |  2 --
 drivers/net/virtio/virtio_ethdev.c      |  1 -
 drivers/net/vmxnet3/vmxnet3_ethdev.c    |  4 +---
 examples/ethtool/lib/rte_ethtool.c      | 16 ++++++++------
 examples/ip_pipeline/kni.c              | 11 ++++++++--
 examples/kni/main.c                     | 11 +++++++---
 lib/librte_ether/rte_ethdev.c           |  1 +
 lib/librte_ether/rte_ethdev.h           |  2 +-
 test/test/test_kni.c                    | 35 ++++++++++++++++++++++++------
 37 files changed, 112 insertions(+), 64 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 4bb255c62..dd051f5ca 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -754,6 +754,8 @@ vlan_id_is_invalid(uint16_t vlan_id)
 static int
 port_reg_off_is_invalid(portid_t port_id, uint32_t reg_off)
 {
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 	uint64_t pci_len;
 
 	if (reg_off & 0x3) {
@@ -762,7 +764,21 @@ port_reg_off_is_invalid(portid_t port_id, uint32_t reg_off)
 		       (unsigned)reg_off);
 		return 1;
 	}
-	pci_len = ports[port_id].dev_info.pci_dev->mem_resource[0].len;
+
+	if (!ports[port_id].dev_info.device) {
+		printf("Invalid device\n");
+		return 0;
+	}
+
+	bus = rte_bus_find_by_device(ports[port_id].dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(ports[port_id].dev_info.device);
+	} else {
+		printf("Not a PCI device\n");
+		return 1;
+	}
+
+	pci_len = pci_dev->mem_resource[0].len;
 	if (reg_off >= pci_len) {
 		printf("Port %d: register offset %u (0x%X) out of port PCI "
 		       "resource (length=%"PRIu64")\n",
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 153abea05..4d84e7b00 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -500,12 +500,25 @@ mbuf_pool_find(unsigned int sock_id)
 static inline uint32_t
 port_pci_reg_read(struct rte_port *port, uint32_t reg_off)
 {
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 	void *reg_addr;
 	uint32_t reg_v;
 
-	reg_addr = (void *)
-		((char *)port->dev_info.pci_dev->mem_resource[0].addr +
-			reg_off);
+	if (!port->dev_info.device) {
+		printf("Invalid device\n");
+		return 0;
+	}
+
+	bus = rte_bus_find_by_device(port->dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(port->dev_info.device);
+	} else {
+		printf("Not a PCI device\n");
+		return 0;
+	}
+
+	reg_addr = ((char *)pci_dev->mem_resource[0].addr + reg_off);
 	reg_v = *((volatile uint32_t *)reg_addr);
 	return rte_le_to_cpu_32(reg_v);
 }
@@ -516,11 +529,24 @@ port_pci_reg_read(struct rte_port *port, uint32_t reg_off)
 static inline void
 port_pci_reg_write(struct rte_port *port, uint32_t reg_off, uint32_t reg_v)
 {
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 	void *reg_addr;
 
-	reg_addr = (void *)
-		((char *)port->dev_info.pci_dev->mem_resource[0].addr +
-			reg_off);
+	if (!port->dev_info.device) {
+		printf("Invalid device\n");
+		return;
+	}
+
+	bus = rte_bus_find_by_device(port->dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(port->dev_info.device);
+	} else {
+		printf("Not a PCI device\n");
+		return;
+	}
+
+	reg_addr = ((char *)pci_dev->mem_resource[0].addr + reg_off);
 	*((volatile uint32_t *)reg_addr) = rte_cpu_to_le_32(reg_v);
 }
 
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 0f3d00972..4b6bc9bf5 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -88,6 +88,9 @@ API Changes
   memory footprint which helps in better cache utilization when large number
   of meter objects are used.
 
+* ethdev, in struct ``struct rte_eth_dev_info``, field ``rte_pci_device *pci_dev``
+  replaced with field ``struct rte_device *device``.
+
 
 ABI Changes
 -----------
diff --git a/drivers/net/ark/ark_ethdev.c b/drivers/net/ark/ark_ethdev.c
index ff87c20e2..c9d541921 100644
--- a/drivers/net/ark/ark_ethdev.c
+++ b/drivers/net/ark/ark_ethdev.c
@@ -771,7 +771,6 @@ eth_ark_dev_info_get(struct rte_eth_dev *dev,
 				ETH_LINK_SPEED_40G |
 				ETH_LINK_SPEED_50G |
 				ETH_LINK_SPEED_100G);
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 }
 
 static int
diff --git a/drivers/net/avf/avf_ethdev.c b/drivers/net/avf/avf_ethdev.c
index b59e3cf79..8e2a1b066 100644
--- a/drivers/net/avf/avf_ethdev.c
+++ b/drivers/net/avf/avf_ethdev.c
@@ -507,7 +507,6 @@ avf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct avf_info *vf = AVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
 
 	memset(dev_info, 0, sizeof(*dev_info));
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->max_tx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->min_rx_bufsize = AVF_BUF_SIZE_MIN;
diff --git a/drivers/net/avp/avp_ethdev.c b/drivers/net/avp/avp_ethdev.c
index a07a288ed..5b3c4cebf 100644
--- a/drivers/net/avp/avp_ethdev.c
+++ b/drivers/net/avp/avp_ethdev.c
@@ -2172,7 +2172,6 @@ avp_dev_info_get(struct rte_eth_dev *eth_dev,
 {
 	struct avp_dev *avp = AVP_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 	dev_info->max_rx_queues = avp->max_rx_queues;
 	dev_info->max_tx_queues = avp->max_tx_queues;
 	dev_info->min_rx_bufsize = AVP_MIN_RX_BUFSIZE;
diff --git a/drivers/net/axgbe/axgbe_ethdev.c b/drivers/net/axgbe/axgbe_ethdev.c
index 07c1337ac..2a68ccbf5 100644
--- a/drivers/net/axgbe/axgbe_ethdev.c
+++ b/drivers/net/axgbe/axgbe_ethdev.c
@@ -349,12 +349,10 @@ axgbe_dev_stats_reset(struct rte_eth_dev *dev)
 }
 
 static void
-axgbe_dev_info_get(struct rte_eth_dev *dev,
-		   struct rte_eth_dev_info *dev_info)
+axgbe_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct axgbe_port *pdata = dev->data->dev_private;
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = pdata->tx_ring_count;
 	dev_info->max_tx_queues = pdata->rx_ring_count;
 	dev_info->min_rx_bufsize = AXGBE_RX_MIN_BUF_SIZE;
diff --git a/drivers/net/bnx2x/bnx2x_ethdev.c b/drivers/net/bnx2x/bnx2x_ethdev.c
index 483d5a17c..8726b357a 100644
--- a/drivers/net/bnx2x/bnx2x_ethdev.c
+++ b/drivers/net/bnx2x/bnx2x_ethdev.c
@@ -447,7 +447,6 @@ static void
 bnx2x_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct bnx2x_softc *sc = dev->data->dev_private;
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues  = sc->max_rx_queues;
 	dev_info->max_tx_queues  = sc->max_tx_queues;
 	dev_info->min_rx_bufsize = BNX2X_MIN_RX_BUF_SIZE;
diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 7c007c8f9..c447cd727 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -379,8 +379,6 @@ static void bnxt_dev_info_get_op(struct rte_eth_dev *eth_dev,
 	uint16_t max_vnics, i, j, vpool, vrxq;
 	unsigned int max_rx_rings;
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
-
 	/* MAC Specifics */
 	dev_info->max_mac_addrs = bp->max_l2_ctx;
 	dev_info->max_hash_mac_addrs = 0;
diff --git a/drivers/net/cxgbe/cxgbe_ethdev.c b/drivers/net/cxgbe/cxgbe_ethdev.c
index 581a1f33a..24c9a9323 100644
--- a/drivers/net/cxgbe/cxgbe_ethdev.c
+++ b/drivers/net/cxgbe/cxgbe_ethdev.c
@@ -134,8 +134,6 @@ void cxgbe_dev_info_get(struct rte_eth_dev *eth_dev,
 		.nb_align = 1,
 	};
 
-	device_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
-
 	device_info->min_rx_bufsize = CXGBE_MIN_RX_BUFSIZE;
 	device_info->max_rx_pktlen = CXGBE_MAX_RX_PKTLEN;
 	device_info->max_rx_queues = max_queues;
diff --git a/drivers/net/e1000/em_ethdev.c b/drivers/net/e1000/em_ethdev.c
index 087c192d5..c6062468c 100644
--- a/drivers/net/e1000/em_ethdev.c
+++ b/drivers/net/e1000/em_ethdev.c
@@ -1070,7 +1070,6 @@ eth_em_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->min_rx_bufsize = 256; /* See BSIZE field of RCTL register. */
 	dev_info->max_rx_pktlen = em_get_max_pktlen(dev);
 	dev_info->max_mac_addrs = hw->mac.rar_entry_count;
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 8d4226676..872357146 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -2144,7 +2144,6 @@ eth_igb_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->min_rx_bufsize = 256; /* See BSIZE field of RCTL register. */
 	dev_info->max_rx_pktlen  = 0x3FFF; /* See RLPML register. */
 	dev_info->max_mac_addrs = hw->mac.rar_entry_count;
@@ -2269,7 +2268,6 @@ eth_igbvf_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->min_rx_bufsize = 256; /* See BSIZE field of RCTL register. */
 	dev_info->max_rx_pktlen  = 0x3FFF; /* See RLPML register. */
 	dev_info->max_mac_addrs = hw->mac.rar_entry_count;
diff --git a/drivers/net/ena/ena_ethdev.c b/drivers/net/ena/ena_ethdev.c
index 34b2a8d78..a15436c99 100644
--- a/drivers/net/ena/ena_ethdev.c
+++ b/drivers/net/ena/ena_ethdev.c
@@ -1527,8 +1527,6 @@ static void ena_infos_get(struct rte_eth_dev *dev,
 	ena_dev = &adapter->ena_dev;
 	ena_assert_msg(ena_dev != NULL, "Uninitialized device");
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
 	dev_info->speed_capa =
 			ETH_LINK_SPEED_1G   |
 			ETH_LINK_SPEED_2_5G |
diff --git a/drivers/net/enic/enic_ethdev.c b/drivers/net/enic/enic_ethdev.c
index 03f0c2547..801f4704c 100644
--- a/drivers/net/enic/enic_ethdev.c
+++ b/drivers/net/enic/enic_ethdev.c
@@ -471,7 +471,6 @@ static void enicpmd_dev_info_get(struct rte_eth_dev *eth_dev,
 	struct enic *enic = pmd_priv(eth_dev);
 
 	ENICPMD_FUNC_TRACE();
-	device_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 	/* Scattered Rx uses two receive queues per rx queue exposed to dpdk */
 	device_info->max_rx_queues = enic->conf_rq_count / 2;
 	device_info->max_tx_queues = enic->conf_wq_count;
diff --git a/drivers/net/fm10k/fm10k_ethdev.c b/drivers/net/fm10k/fm10k_ethdev.c
index 61de4d772..34affd1cc 100644
--- a/drivers/net/fm10k/fm10k_ethdev.c
+++ b/drivers/net/fm10k/fm10k_ethdev.c
@@ -1404,7 +1404,6 @@ fm10k_dev_infos_get(struct rte_eth_dev *dev,
 
 	PMD_INIT_FUNC_TRACE();
 
-	dev_info->pci_dev            = pdev;
 	dev_info->min_rx_bufsize     = FM10K_MIN_RX_BUF_SIZE;
 	dev_info->max_rx_pktlen      = FM10K_MAX_PKT_SIZE;
 	dev_info->max_rx_queues      = hw->mac.max_queues;
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 6e06f8a2b..6a8a2cd2a 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -3212,7 +3212,6 @@ i40e_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct i40e_vsi *vsi = pf->main_vsi;
 	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 
-	dev_info->pci_dev = pci_dev;
 	dev_info->max_rx_queues = vsi->nb_qps;
 	dev_info->max_tx_queues = vsi->nb_qps;
 	dev_info->min_rx_bufsize = I40E_BUF_SIZE_MIN;
diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c
index 2908c87e0..f6d7f40b1 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2183,7 +2183,6 @@ i40evf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
 
 	memset(dev_info, 0, sizeof(*dev_info));
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->max_tx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->min_rx_bufsize = I40E_BUF_SIZE_MIN;
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index fbc048f7d..bd1773978 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -3593,7 +3593,6 @@ ixgbe_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
 
-	dev_info->pci_dev = pci_dev;
 	dev_info->max_rx_queues = (uint16_t)hw->mac.max_rx_queues;
 	dev_info->max_tx_queues = (uint16_t)hw->mac.max_tx_queues;
 	if (RTE_ETH_DEV_SRIOV(dev).active == 0) {
@@ -3712,7 +3711,6 @@ ixgbevf_dev_info_get(struct rte_eth_dev *dev,
 	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = pci_dev;
 	dev_info->max_rx_queues = (uint16_t)hw->mac.max_rx_queues;
 	dev_info->max_tx_queues = (uint16_t)hw->mac.max_tx_queues;
 	dev_info->min_rx_bufsize = 1024; /* cf BSIZEPACKET in SRRCTL reg */
diff --git a/drivers/net/kni/rte_eth_kni.c b/drivers/net/kni/rte_eth_kni.c
index dc4e65f5d..c10e970c2 100644
--- a/drivers/net/kni/rte_eth_kni.c
+++ b/drivers/net/kni/rte_eth_kni.c
@@ -201,7 +201,6 @@ eth_kni_dev_info(struct rte_eth_dev *dev __rte_unused,
 	dev_info->max_rx_queues = KNI_MAX_QUEUE_PER_PORT;
 	dev_info->max_tx_queues = KNI_MAX_QUEUE_PER_PORT;
 	dev_info->min_rx_bufsize = 0;
-	dev_info->pci_dev = NULL;
 }
 
 static int
diff --git a/drivers/net/liquidio/lio_ethdev.c b/drivers/net/liquidio/lio_ethdev.c
index eeb8350e4..a13a566f9 100644
--- a/drivers/net/liquidio/lio_ethdev.c
+++ b/drivers/net/liquidio/lio_ethdev.c
@@ -373,8 +373,6 @@ lio_dev_info_get(struct rte_eth_dev *eth_dev,
 	struct lio_device *lio_dev = LIO_DEV(eth_dev);
 	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 
-	devinfo->pci_dev = pci_dev;
-
 	switch (pci_dev->id.subsystem_device_id) {
 	/* CN23xx 10G cards */
 	case PCI_SUBSYS_DEV_ID_CN2350_210:
diff --git a/drivers/net/mlx4/mlx4_ethdev.c b/drivers/net/mlx4/mlx4_ethdev.c
index 5f731e023..636100b23 100644
--- a/drivers/net/mlx4/mlx4_ethdev.c
+++ b/drivers/net/mlx4/mlx4_ethdev.c
@@ -556,7 +556,6 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	unsigned int max;
 	char ifname[IF_NAMESIZE];
 
-	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	/* FIXME: we should ask the device for these values. */
 	info->min_rx_bufsize = 32;
 	info->max_rx_pktlen = 65536;
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index cc85f76c0..44cdbb622 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -405,7 +405,6 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	unsigned int max;
 	char ifname[IF_NAMESIZE];
 
-	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	/* FIXME: we should ask the device for these values. */
 	info->min_rx_bufsize = 32;
 	info->max_rx_pktlen = 65536;
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index 606cd3dc2..e030bbf9f 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -1253,7 +1253,6 @@ nfp_net_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = (uint16_t)hw->max_rx_queues;
 	dev_info->max_tx_queues = (uint16_t)hw->max_tx_queues;
 	dev_info->min_rx_bufsize = ETHER_MIN_MTU;
diff --git a/drivers/net/octeontx/octeontx_ethdev.c b/drivers/net/octeontx/octeontx_ethdev.c
index 1406e4e19..f829e0ca9 100644
--- a/drivers/net/octeontx/octeontx_ethdev.c
+++ b/drivers/net/octeontx/octeontx_ethdev.c
@@ -616,7 +616,6 @@ octeontx_dev_info(struct rte_eth_dev *dev,
 	dev_info->max_rx_queues = 1;
 	dev_info->max_tx_queues = PKO_MAX_NUM_DQ;
 	dev_info->min_rx_bufsize = 0;
-	dev_info->pci_dev = NULL;
 
 	dev_info->default_rxconf = (struct rte_eth_rxconf) {
 		.rx_free_thresh = 0,
diff --git a/drivers/net/qede/qede_ethdev.c b/drivers/net/qede/qede_ethdev.c
index a4e9e753e..13c2a3b87 100644
--- a/drivers/net/qede/qede_ethdev.c
+++ b/drivers/net/qede/qede_ethdev.c
@@ -1549,7 +1549,6 @@ qede_dev_info_get(struct rte_eth_dev *eth_dev,
 
 	PMD_INIT_FUNC_TRACE(edev);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 	dev_info->min_rx_bufsize = (uint32_t)QEDE_MIN_RX_BUFF_SIZE;
 	dev_info->max_rx_pktlen = (uint32_t)ETH_TX_MAX_NON_LSO_PKT_LEN;
 	dev_info->rx_desc_lim = qede_rx_desc_lim;
diff --git a/drivers/net/sfc/sfc_ethdev.c b/drivers/net/sfc/sfc_ethdev.c
index 2af898e08..6631c5a7e 100644
--- a/drivers/net/sfc/sfc_ethdev.c
+++ b/drivers/net/sfc/sfc_ethdev.c
@@ -89,7 +89,6 @@ sfc_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	sfc_log_init(sa, "entry");
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_pktlen = EFX_MAC_PDU_MAX;
 
 	/* Autonegotiation may be disabled */
diff --git a/drivers/net/szedata2/rte_eth_szedata2.c b/drivers/net/szedata2/rte_eth_szedata2.c
index fb9aac04b..41a6fb427 100644
--- a/drivers/net/szedata2/rte_eth_szedata2.c
+++ b/drivers/net/szedata2/rte_eth_szedata2.c
@@ -1015,7 +1015,6 @@ eth_dev_info(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *internals = dev->data->dev_private;
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->if_index = 0;
 	dev_info->max_mac_addrs = 1;
 	dev_info->max_rx_pktlen = (uint32_t)-1;
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 61d646558..54c7c2b0f 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -760,7 +760,6 @@ tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
 	dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
 	dev_info->min_rx_bufsize = 0;
-	dev_info->pci_dev = NULL;
 	dev_info->speed_capa = tap_dev_speed_capa();
 	dev_info->rx_queue_offload_capa = tap_rx_offload_get_queue_capa();
 	dev_info->rx_offload_capa = tap_rx_offload_get_port_capa() |
diff --git a/drivers/net/thunderx/nicvf_ethdev.c b/drivers/net/thunderx/nicvf_ethdev.c
index 067f2243b..75e9d16c5 100644
--- a/drivers/net/thunderx/nicvf_ethdev.c
+++ b/drivers/net/thunderx/nicvf_ethdev.c
@@ -1400,8 +1400,6 @@ nicvf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	PMD_INIT_FUNC_TRACE();
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
 	/* Autonegotiation may be disabled */
 	dev_info->speed_capa = ETH_LINK_SPEED_FIXED;
 	dev_info->speed_capa |= ETH_LINK_SPEED_10M | ETH_LINK_SPEED_100M |
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 11f758929..d7c81747e 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -2064,7 +2064,6 @@ virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	dev_info->speed_capa = ETH_LINK_SPEED_10G; /* fake value */
 
-	dev_info->pci_dev = dev->device ? RTE_ETH_DEV_TO_PCI(dev) : NULL;
 	dev_info->max_rx_queues =
 		RTE_MIN(hw->max_queue_pairs, VIRTIO_MAX_RX_QUEUES);
 	dev_info->max_tx_queues =
diff --git a/drivers/net/vmxnet3/vmxnet3_ethdev.c b/drivers/net/vmxnet3/vmxnet3_ethdev.c
index 426008722..01b4802e0 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethdev.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethdev.c
@@ -1022,11 +1022,9 @@ vmxnet3_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 }
 
 static void
-vmxnet3_dev_info_get(struct rte_eth_dev *dev,
+vmxnet3_dev_info_get(struct rte_eth_dev *dev __rte_unused,
 		     struct rte_eth_dev_info *dev_info)
 {
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
 	dev_info->max_rx_queues = VMXNET3_MAX_RX_QUEUES;
 	dev_info->max_tx_queues = VMXNET3_MAX_TX_QUEUES;
 	dev_info->min_rx_bufsize = 1518 + RTE_PKTMBUF_HEADROOM;
diff --git a/examples/ethtool/lib/rte_ethtool.c b/examples/ethtool/lib/rte_ethtool.c
index 90dfbb739..d519a50db 100644
--- a/examples/ethtool/lib/rte_ethtool.c
+++ b/examples/ethtool/lib/rte_ethtool.c
@@ -22,6 +22,8 @@ rte_ethtool_get_drvinfo(uint16_t port_id, struct ethtool_drvinfo *drvinfo)
 {
 	struct rte_eth_dev_info dev_info;
 	struct rte_dev_reg_info reg_info;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus = NULL;
 	int n;
 	int ret;
 
@@ -46,15 +48,17 @@ rte_ethtool_get_drvinfo(uint16_t port_id, struct ethtool_drvinfo *drvinfo)
 	snprintf(drvinfo->version, sizeof(drvinfo->version), "%s",
 		rte_version());
 	/* TODO: replace bus_info by rte_devargs.name */
-	if (dev_info.pci_dev)
+	if (dev_info.device)
+		bus = rte_bus_find_by_device(dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
 		snprintf(drvinfo->bus_info, sizeof(drvinfo->bus_info),
 			"%04x:%02x:%02x.%x",
-			dev_info.pci_dev->addr.domain,
-			dev_info.pci_dev->addr.bus,
-			dev_info.pci_dev->addr.devid,
-			dev_info.pci_dev->addr.function);
-	else
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function);
+	} else {
 		snprintf(drvinfo->bus_info, sizeof(drvinfo->bus_info), "N/A");
+	}
 
 	memset(&reg_info, 0, sizeof(reg_info));
 	rte_eth_dev_get_reg_info(port_id, &reg_info);
diff --git a/examples/ip_pipeline/kni.c b/examples/ip_pipeline/kni.c
index ebc8c7904..712775338 100644
--- a/examples/ip_pipeline/kni.c
+++ b/examples/ip_pipeline/kni.c
@@ -106,6 +106,8 @@ kni_create(const char *name, struct kni_params *params)
 	struct mempool *mempool;
 	struct link *link;
 	struct rte_kni *k;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus = NULL;
 
 	/* Check input params */
 	if ((name == NULL) ||
@@ -128,8 +130,13 @@ kni_create(const char *name, struct kni_params *params)
 	kni_conf.core_id = params->thread_id;
 	kni_conf.group_id = link->port_id;
 	kni_conf.mbuf_size = mempool->buffer_size;
-	kni_conf.addr = dev_info.pci_dev->addr;
-	kni_conf.id = dev_info.pci_dev->id;
+	if (dev_info.device)
+		bus = rte_bus_find_by_device(dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
+		kni_conf.addr = pci_dev->addr;
+		kni_conf.id = pci_dev->id;
+	}
 
 	memset(&kni_ops, 0, sizeof(kni_ops));
 	kni_ops.port_id = link->port_id;
diff --git a/examples/kni/main.c b/examples/kni/main.c
index 0d9980ee1..aebfedd59 100644
--- a/examples/kni/main.c
+++ b/examples/kni/main.c
@@ -834,13 +834,18 @@ kni_alloc(uint16_t port_id)
 		if (i == 0) {
 			struct rte_kni_ops ops;
 			struct rte_eth_dev_info dev_info;
+			const struct rte_pci_device *pci_dev;
+			const struct rte_bus *bus = NULL;
 
 			memset(&dev_info, 0, sizeof(dev_info));
 			rte_eth_dev_info_get(port_id, &dev_info);
 
-			if (dev_info.pci_dev) {
-				conf.addr = dev_info.pci_dev->addr;
-				conf.id = dev_info.pci_dev->id;
+			if (dev_info.device)
+				bus = rte_bus_find_by_device(dev_info.device);
+			if (bus && !strcmp(bus->name, "pci")) {
+				pci_dev = RTE_DEV_TO_PCI(dev_info.device);
+				conf.addr = pci_dev->addr;
+				conf.id = pci_dev->id;
 			}
 			/* Get the interface default mac address */
 			rte_eth_macaddr_get(port_id,
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e04..90c47ad12 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -2395,6 +2395,7 @@ rte_eth_dev_info_get(uint16_t port_id, struct rte_eth_dev_info *dev_info)
 	memset(dev_info, 0, sizeof(struct rte_eth_dev_info));
 	dev_info->rx_desc_lim = lim;
 	dev_info->tx_desc_lim = lim;
+	dev_info->device = dev->device;
 
 	RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_infos_get);
 	(*dev->dev_ops->dev_infos_get)(dev, dev_info);
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 5e13dca6a..784c6faa4 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -992,7 +992,7 @@ struct rte_pci_device;
  * Ethernet device information
  */
 struct rte_eth_dev_info {
-	struct rte_pci_device *pci_dev; /**< Device PCI information. */
+	struct rte_device *device; /**< Generic device information. */
 	const char *driver_name; /**< Device Driver name. */
 	unsigned int if_index; /**< Index to bound host interface, or 0 if none.
 		Use if_indextoname() to translate into an interface name. */
diff --git a/test/test/test_kni.c b/test/test/test_kni.c
index e4839cdb7..3d1be56a9 100644
--- a/test/test/test_kni.c
+++ b/test/test/test_kni.c
@@ -357,6 +357,8 @@ test_kni_processing(uint16_t port_id, struct rte_mempool *mp)
 	struct rte_kni_conf conf;
 	struct rte_eth_dev_info info;
 	struct rte_kni_ops ops;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus = NULL;
 
 	if (!mp)
 		return -1;
@@ -366,8 +368,13 @@ test_kni_processing(uint16_t port_id, struct rte_mempool *mp)
 	memset(&ops, 0, sizeof(ops));
 
 	rte_eth_dev_info_get(port_id, &info);
-	conf.addr = info.pci_dev->addr;
-	conf.id = info.pci_dev->id;
+	if (info.device)
+		bus = rte_bus_find_by_device(info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(info.device);
+		conf.addr = pci_dev->addr;
+		conf.id = pci_dev->id;
+	}
 	snprintf(conf.name, sizeof(conf.name), TEST_KNI_PORT);
 
 	/* core id 1 configured for kernel thread */
@@ -465,6 +472,8 @@ test_kni(void)
 	struct rte_kni_conf conf;
 	struct rte_eth_dev_info info;
 	struct rte_kni_ops ops;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 
 	/* Initialize KNI subsytem */
 	rte_kni_init(KNI_TEST_MAX_PORTS);
@@ -523,8 +532,15 @@ test_kni(void)
 	memset(&conf, 0, sizeof(conf));
 	memset(&ops, 0, sizeof(ops));
 	rte_eth_dev_info_get(port_id, &info);
-	conf.addr = info.pci_dev->addr;
-	conf.id = info.pci_dev->id;
+	if (info.device)
+		bus = rte_bus_find_by_device(info.device);
+	else
+		bus = NULL;
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(info.device);
+		conf.addr = pci_dev->addr;
+		conf.id = pci_dev->id;
+	}
 	conf.group_id = port_id;
 	conf.mbuf_size = MAX_PACKET_SZ;
 
@@ -552,8 +568,15 @@ test_kni(void)
 	memset(&info, 0, sizeof(info));
 	memset(&ops, 0, sizeof(ops));
 	rte_eth_dev_info_get(port_id, &info);
-	conf.addr = info.pci_dev->addr;
-	conf.id = info.pci_dev->id;
+	if (info.device)
+		bus = rte_bus_find_by_device(info.device);
+	else
+		bus = NULL;
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(info.device);
+		conf.addr = pci_dev->addr;
+		conf.id = pci_dev->id;
+	}
 	conf.group_id = port_id;
 	conf.mbuf_size = MAX_PACKET_SZ;
 
-- 
2.14.3

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v2 14/15] ethdev: add physical port action to flow API
  2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 14/15] ethdev: add physical port action to " Adrien Mazarguil
@ 2018-04-07  9:51  0%     ` Andrew Rybchenko
  2018-04-09 15:00  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:51 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> This patch adds the missing action counterpart to the PHY_PORT pattern
> item, that is, the ability to directly inject matching traffic into a
> physical port of the underlying device.

Does it mean that if it is applied on ingress (incoming packets from the
network) it will simply send packets back to the network (to the specified
physical port)?
And if it is applied on egress (outgoing from the device to the network), it
will be directed to a possibly different physical port and sent to the
network.

> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
>   app/test-pmd/config.c                       |  1 +
>   doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
>   lib/librte_ether/rte_flow.c                 |  1 +
>   lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
>   6 files changed, 84 insertions(+)

<...>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 12/15] ethdev: update behavior of VF/PF in flow API
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 12/15] ethdev: update behavior of VF/PF in " Adrien Mazarguil
@ 2018-04-07  9:41  0%     ` Andrew Rybchenko
  2018-04-09 14:49  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:41 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Somnath Kotur, Beilei Xing, Qi Zhang

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> Contrary to all other pattern items, these are inconsistently documented as
> affecting traffic instead of simply matching its origin, without provision
> for the latter.
>
> This commit clarifies documentation and updates PMDs since the original
> behavior now has to be explicitly requested using the new transfer
> attribute.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_create()
> - rte_flow_validate()
>
> Impacted PMDs are bnxt and i40e, for which the VF pattern item is now only
> supported when a transfer attribute is also present.
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> Cc: Somnath Kotur <somnath.kotur@broadcom.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 12 +++---
>   doc/guides/prog_guide/rte_flow.rst          | 36 +++++++++---------
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst | 12 +++---
>   drivers/net/bnxt/bnxt_filter.c              | 22 ++++++-----
>   drivers/net/i40e/i40e_flow.c                | 23 +++++++-----
>   lib/librte_ether/rte_flow.h                 | 47 ++++++++++--------------
>   6 files changed, 77 insertions(+), 75 deletions(-)

<...>

> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> index 735ce6323..beedc713b 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -518,15 +518,12 @@ Usage example, matching non-TCPv4 packets only:
>   Item: ``PF``
>   ^^^^^^^^^^^^
>   
> -Matches packets addressed to the physical function of the device.
> +Matches traffic originating from (ingress) or going to (egress) the physical
> +function of the current device.

Not sure that I understand the above. It looks like ingress and egress are
misplaced.
There are many similar cases below.

<...>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 04/15] ethdev: remove DUP action from flow API
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 04/15] ethdev: remove DUP action from " Adrien Mazarguil
@ 2018-04-07  9:23  0%     ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:23 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> Upcoming changes in relation to the handling of actions list will make the
> DUP action redundant as specifying several QUEUE actions will achieve the
> same behavior. Besides, no PMD implements this action.
>
> By removing an entry from enum rte_flow_action_type, this patch breaks ABI
> compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 23 -----------------------
>   app/test-pmd/config.c                       |  1 -
>   doc/guides/prog_guide/rte_flow.rst          | 23 -----------------------
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  8 --------
>   lib/librte_ether/rte_ethdev_version.map     |  2 +-
>   lib/librte_ether/rte_flow.c                 |  1 -
>   lib/librte_ether/rte_flow.h                 | 24 ------------------------
>   7 files changed, 1 insertion(+), 81 deletions(-)

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API
  2018-04-07  9:15  0%     ` Andrew Rybchenko
@ 2018-04-07  9:18  0%       ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:18 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev

On 04/07/2018 12:15 PM, Andrew Rybchenko wrote:
> On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
>> These enable more precise reporting of objects responsible for errors.
>>
>> This breaks ABI compatibility for the following public functions:
>>
>> - rte_flow_create()
>> - rte_flow_destroy()
>> - rte_flow_error_set()
>> - rte_flow_flush()
>> - rte_flow_isolate()
>> - rte_flow_query()
>> - rte_flow_validate()
>>
>> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
>> ---
>>   app/test-pmd/config.c                   |  4 ++++
>>   lib/librte_ether/rte_ethdev_version.map | 20 +++++++++++++-------
>>   lib/librte_ether/rte_flow.h             |  4 ++++
>>   3 files changed, 21 insertions(+), 7 deletions(-)
>
> I think PMD maintainers with flow API support should be additionally
> notified and encouraged to refine error reporting.

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API
  2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API Adrien Mazarguil
@ 2018-04-07  9:15  0%     ` Andrew Rybchenko
  2018-04-07  9:18  0%       ` Andrew Rybchenko
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:15 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> These enable more precise reporting of objects responsible for errors.
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_create()
> - rte_flow_destroy()
> - rte_flow_error_set()
> - rte_flow_flush()
> - rte_flow_isolate()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>   app/test-pmd/config.c                   |  4 ++++
>   lib/librte_ether/rte_ethdev_version.map | 20 +++++++++++++-------
>   lib/librte_ether/rte_flow.h             |  4 ++++
>   3 files changed, 21 insertions(+), 7 deletions(-)

I think PMD maintainers with flow API support should be additionally
notified and encouraged to refine error reporting.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in flow API
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in " Adrien Mazarguil
@ 2018-04-07  9:05  0%     ` Andrew Rybchenko
  2018-04-09 14:42  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-07  9:05 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh, Pascal Mazon,
	Radu Nicolau, Akhil Goyal, Ivan Malov

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> Since its inception, the rte_flow RSS action has been relying in part on
> external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
> This structure lacks parameters such as the hash algorithm to use, and more
> recently, a method to tell which layer RSS should be performed on [1].
>
> Given struct rte_eth_rss_conf will never be flexible enough to represent a
> complete RSS configuration (e.g. RETA table), this patch supersedes it by
> extending the rte_flow RSS action directly.
>
> A subsequent patch will add a field to use a non-default RSS hash
> algorithm. To that end, a field named "types" replaces the field formerly
> known as "rss_hf" and standing for "RSS hash functions" as it was
> confusing. Actual RSS hash function types are defined by enum
> rte_eth_hash_function.
> This patch updates all PMDs and example applications accordingly.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> [1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
>      configuration")
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Xueming Li <xuemingl@mellanox.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
> Cc: Radu Nicolau <radu.nicolau@intel.com>
> Cc: Akhil Goyal <akhil.goyal@nxp.com>
> ---
>   app/test-pmd/cmdline_flow.c        |  59 +++++-----
>   app/test-pmd/config.c              |  39 +++----
>   doc/guides/prog_guide/rte_flow.rst |  22 ++--
>   drivers/net/e1000/e1000_ethdev.h   |  13 ++-
>   drivers/net/e1000/igb_ethdev.c     |   4 +-
>   drivers/net/e1000/igb_flow.c       |  31 ++---
>   drivers/net/e1000/igb_rxtx.c       |  51 +++++++--
>   drivers/net/i40e/i40e_ethdev.c     |  53 +++++++--
>   drivers/net/i40e/i40e_ethdev.h     |  15 ++-
>   drivers/net/i40e/i40e_flow.c       |  47 ++++----
>   drivers/net/ixgbe/ixgbe_ethdev.c   |   4 +-
>   drivers/net/ixgbe/ixgbe_ethdev.h   |  13 ++-
>   drivers/net/ixgbe/ixgbe_flow.c     |  30 ++---
>   drivers/net/ixgbe/ixgbe_rxtx.c     |  51 +++++++--
>   drivers/net/mlx4/mlx4.c            |   2 +-
>   drivers/net/mlx4/mlx4_flow.c       |  61 +++++-----
>   drivers/net/mlx4/mlx4_flow.h       |   2 +-
>   drivers/net/mlx4/mlx4_rxq.c        |   2 +-
>   drivers/net/mlx4/mlx4_rxtx.h       |   2 +-
>   drivers/net/mlx5/mlx5_flow.c       | 193 +++++++++++++++-----------------
>   drivers/net/mlx5/mlx5_rxq.c        |  22 ++--
>   drivers/net/mlx5/mlx5_rxtx.h       |  26 +++--
>   drivers/net/sfc/sfc_flow.c         |  21 ++--
>   drivers/net/tap/tap_flow.c         |   8 +-
>   examples/ipsec-secgw/ipsec.c       |  10 +-
>   lib/librte_ether/rte_flow.c        |  39 +++----
>   lib/librte_ether/rte_flow.h        |   6 +-
>   27 files changed, 473 insertions(+), 353 deletions(-)

<...>

> diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> index 056405515..1a2c0299c 100644
> --- a/drivers/net/sfc/sfc_flow.c
> +++ b/drivers/net/sfc/sfc_flow.c
> @@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>   	struct sfc_rxq *rxq;
>   	unsigned int rxq_hw_index_min;
>   	unsigned int rxq_hw_index_max;
> -	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
> -	uint64_t rss_hf;
> -	uint8_t *rss_key = NULL;
> +	const uint8_t *rss_key;
>   	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
>   	unsigned int i;
>   
> -	if (rss->num == 0)
> +	if (rss->queue_num == 0)
>   		return -EINVAL;
>   
>   	rxq_sw_index = sa->rxq_count - 1;
> @@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>   	rxq_hw_index_min = rxq->hw_index;
>   	rxq_hw_index_max = 0;
>   
> -	for (i = 0; i < rss->num; ++i) {
> +	for (i = 0; i < rss->queue_num; ++i) {
>   		rxq_sw_index = rss->queue[i];
>   
>   		if (rxq_sw_index >= sa->rxq_count)
> @@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>   			rxq_hw_index_max = rxq->hw_index;
>   	}
>   
> -	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;

Here we had a fallback to default rss_hf (now types) if rss_conf is 
unspecified.

> -	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
> +	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
>   		return -EINVAL;
>   
> -	if (rss_conf != NULL) {
> -		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
> +	if (rss->key_len) {
> +		if (rss->key_len != sizeof(sa->rss_key))
>   			return -EINVAL;
>   
> -		rss_key = rss_conf->rss_key;
> +		rss_key = rss->key;
>   	} else {
>   		rss_key = sa->rss_key;
>   	}
> @@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>   
>   	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
>   	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
> -	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
> +	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);

Now types go directly to the mapping function and unspecified types (0)
will result in 0 rss_hash_types. Of course, it is a question how to treat
types==0. It is possible to say that it means no RSS, but that does not make
sense. So, the real options are device defaults (regardless of what is
configured at the device level) or the device config
(rx_adv.conf.rss_conf.rss_hf). I would prefer the latter.
Please, document the intended behaviour in rte_flow.rst.

If the latter is chosen, the code above will have a bug since it falls back
to a fixed default.
Just use sa->rss_hash_types as fallback. Something like:
if (rss->types)
     sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
else
     sfc_rss_conf->rss_hash_types = sa->rss_hash_types;

>   	rte_memcpy(sfc_rss_conf->rss_key, rss_key, sizeof(sa->rss_key));
>   
>   	for (i = 0; i < RTE_DIM(sfc_rss_conf->rss_tbl); ++i) {
> -		unsigned int rxq_sw_index = rss->queue[i % rss->num];
> +		unsigned int rxq_sw_index = rss->queue[i % rss->queue_num];
>   		struct sfc_rxq *rxq = sa->rxq_info[rxq_sw_index].rxq;
>   
>   		sfc_rss_conf->rss_tbl[i] = rxq->hw_index - rxq_hw_index_min;

<...>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 09/15] ethdev: add encap level to RSS flow API action
  2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 09/15] ethdev: add encap level " Adrien Mazarguil
@ 2018-04-07  8:27  0%     ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-07  8:27 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh, Pascal Mazon

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> RSS hash types (ETH_RSS_* macros defined in rte_ethdev.h) describe the
> protocol header fields of a packet that must be taken into account while
> computing RSS.
>
> When facing encapsulated (e.g. tunneled) packets, there is an ambiguity as
> to whether these should apply to inner or outer packets. Applications need
> the ability to tell exactly "where" RSS must be performed.
>
> This is addressed by adding encapsulation level information to the RSS flow
> action. Its default value is 0 and stands for the usual unspecified
> behavior. Other values provide a specific encapsulation level.
>
> Contrary to the change announced by commit 676b605182a5 ("doc: announce
> ethdev API change for RSS configuration"), this patch does not affect
> struct rte_eth_rss_conf but struct rte_flow_action_rss as the former is not
> used anymore by the RSS flow action. ABI impact is therefore limited to
> rte_flow.
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Xueming Li <xuemingl@mellanox.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 13 ++++++++++++
>   app/test-pmd/config.c                       |  1 +
>   doc/guides/prog_guide/rte_flow.rst          | 24 ++++++++++++++++++++++
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 ++
>   drivers/net/e1000/igb_flow.c                |  4 ++++
>   drivers/net/e1000/igb_rxtx.c                |  2 ++
>   drivers/net/i40e/i40e_ethdev.c              |  2 ++
>   drivers/net/i40e/i40e_flow.c                |  4 ++++
>   drivers/net/ixgbe/ixgbe_flow.c              |  4 ++++
>   drivers/net/ixgbe/ixgbe_rxtx.c              |  2 ++
>   drivers/net/mlx4/mlx4_flow.c                |  6 ++++++
>   drivers/net/mlx5/mlx5_flow.c                | 11 ++++++++++
>   drivers/net/sfc/sfc_flow.c                  |  3 +++
>   drivers/net/tap/tap_flow.c                  |  6 +++++-
>   lib/librte_ether/rte_flow.c                 |  1 +
>   lib/librte_ether/rte_flow.h                 | 26 ++++++++++++++++++++++++
>   16 files changed, 110 insertions(+), 1 deletion(-)

Generic and sfc parts
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 3/4] ethdev: Add group action type to rte_flow
  @ 2018-04-06 20:26  3%   ` Adrien Mazarguil
  0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-06 20:26 UTC (permalink / raw)
  To: Declan Doherty; +Cc: dev

On Fri, Apr 06, 2018 at 01:24:02PM +0100, Declan Doherty wrote:
> Add group action type which defines a terminating action which
> allows a matched flow to be redirect to a group. This allows logical
> flow table hierarchies to be managed through rte_flow.
> 
> Signed-off-by: Declan Doherty <declan.doherty@intel.com>

OK, I'm wondering if perhaps with the addition of this action, we should
redefine groups as unlinked by default?

Currently traffic enters through the flow rule with the lowest priority of
the group with the lowest ID and iterates through subsequent flow rules and
groups until matched by a flow rule without PASSTHRU (according to latest
definition [1]).

This would make jumps between groups always explicit, not necessarily a bad
idea given no PMD implements groups as of yet. Thoughts?

Also, as a rather fundamental API addition, I suggest adding it after
RTE_FLOW_ACTION_TYPE_PASSTHRU. It's OK because the ABI is already broken. You
just need to mention it in the commit log [1].

Another suggestion would be to rename it "JUMP" (reasons below).
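
To make this concrete, an explicit jump could look roughly as follows from an
application standpoint (minimal sketch; struct rte_flow_action_jump and
RTE_FLOW_ACTION_TYPE_JUMP are only the naming suggested above, nothing of
this exists yet):

 #include <rte_flow.h>

 /* Hypothetical: redirect all ingress traffic of group 0 to group 1. */
 static struct rte_flow *
 jump_to_group_1(uint16_t port_id)
 {
         struct rte_flow_attr attr = { .group = 0, .ingress = 1 };
         struct rte_flow_item pattern[] = {
                 { .type = RTE_FLOW_ITEM_TYPE_ETH },
                 { .type = RTE_FLOW_ITEM_TYPE_END },
         };
         struct rte_flow_action_jump jump = { .group = 1 };
         struct rte_flow_action actions[] = {
                 { .type = RTE_FLOW_ACTION_TYPE_JUMP, .conf = &jump },
                 { .type = RTE_FLOW_ACTION_TYPE_END },
         };
         struct rte_flow_error error;

         return rte_flow_create(port_id, &attr, pattern, actions, &error);
 }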

[1] "ethdev: alter behavior of flow API actions"
    http://dpdk.org/ml/archives/dev/2018-April/095779.html

> ---
>  doc/guides/prog_guide/rte_flow.rst | 23 +++++++++++++++++++++++
>  lib/librte_ether/rte_flow.h        | 15 +++++++++++++++
>  2 files changed, 38 insertions(+)
> 
> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> index 106fb93..2f0a47a 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -1557,6 +1557,29 @@ set of overlay header type.
>     | ``item type`` | Item type of tunnel end-point to decapsulate |
>     +---------------+----------------------------------------------+
>  
> +

Unnecessary empty line.

> +Action: ``GROUP``
> +^^^^^^^^^^^^^^^^^
> +
> +Redirects packets to a group on the current device.
> +
> +In a hierarchy of groups, which can be used to represent physical or logical
> +flow tables on the device, this action allows the terminating action to be a
> +group on that device.
> +
> +- Terminating by default.

Keep in mind there's no such thing as a terminating action anymore [1].

> +
> +.. _table_rte_flow_action_group:
> +
> +.. table:: GROUP
> +
> +   +--------------+---------------------------------+
> +   | Field        | Value                           |
> +   +==============+=================================+
> +   | ``id``       | Group ID to redirect packets to |
> +   +--------------+---------------------------------+

"Field" column can be shrunk somewhat.

> +
> +
>  Negative types
>  ~~~~~~~~~~~~~~
>  
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 6d94423..968a23b 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -1251,6 +1251,21 @@ struct rte_flow_action_tunnel_decap {
>  };
>  
>  /**
> + * RTE_FLOW_ACTION_TYPE_GROUP

Its addition to enum rte_flow_action_type should be part of this commit.

> + *
> + * Redirects packets to a group on the current device.
> + *
> + * In a hierarchy of groups, which can be used to represent physical or logical
> + * flow tables on the device, this action allows the terminating action to be a
> + * group on that device.
> + *
> + * Terminating by default.

See [1].

> + */
> +struct rte_flow_action_group {
> +	uint32_t id;

Assuming this structure is named rte_flow_action_jump, naming this field
"group" would match the attribute of the same name.

> +};
> +
> +/**
>   * Definition of a single action.
>   *
>   * A list of actions is terminated by a END action.
> -- 
> 2.7.4
> 

Don't forget testpmd code and documentation update.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions
  @ 2018-04-06 20:26  2%   ` Adrien Mazarguil
  2018-04-09 16:10  0%     ` Mohammad Abdul Awal
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 20:26 UTC (permalink / raw)
  To: Declan Doherty; +Cc: dev

On Fri, Apr 06, 2018 at 01:24:01PM +0100, Declan Doherty wrote:
> Add new flow action types and associated action data structures to
> support the encapsulation and decapsulation of the virtual tunnel
> endpoints.
> 
> The RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP action will cause the matching
> flow to be encapsulated in the virtual tunnel endpoint overlay
> defined in the tunnel_encap action data.
> 
> The RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP action will cause all virtual
> tunnel endpoint overlays up to and including the first instance of
> the flow item type defined in the tunnel_decap action data for the
> matching flows.
> 
> Signed-off-by: Declan Doherty <declan.doherty@intel.com>

This generic approach looks flexible enough to cover the use cases that
immediately come to mind (VLAN, VXLAN); its design is sound.

However, while I'm aware it's not a concern at this point, it won't be able
to deal with stateful tunnel or encapsulation types (e.g. IPsec or TCP)
which will require additional meta data or some run-time assistance from the
application.

Eventually for more complex use cases, dedicated encap/decap actions will
have to appear, so the issue I wanted to raise before going further is this:

Going generic inevitably trades some of the usability; flat structures
dedicated to VXLAN encap/decap with only the needed info to get the job done
would likely be easier to implement in PMDs and use in applications. Any
number of such actions can be added to rte_flow without ABI impact.

If VXLAN is the only use case at this point, my suggestion would be to go
with simpler RTE_FLOW_ACTION_TYPE_VXLAN_(ENCAP|DECAP) actions, with fixed
L2/L3/L4/L5 header definitions to prepend according to RFC 7348.

Now we can start with the generic approach, see how it fares and add
dedicated encap/decap later as needed.
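
For what it's worth, such a dedicated action might look something like the
sketch below (purely illustrative, neither the structure nor its fields are a
formal proposal):

 #include <rte_byteorder.h>
 #include <rte_ether.h>

 /* Hypothetical flat action prepending an RFC 7348 IPv4/UDP/VXLAN overlay. */
 struct rte_flow_action_vxlan_encap {
         struct ether_addr eth_dst; /* outer destination MAC */
         struct ether_addr eth_src; /* outer source MAC */
         rte_be32_t ipv4_src; /* outer IPv4 source address */
         rte_be32_t ipv4_dst; /* outer IPv4 destination address */
         rte_be16_t udp_src; /* outer UDP source port */
         rte_be16_t udp_dst; /* outer UDP destination port, typically 4789 */
         rte_be32_t vni; /* VXLAN network identifier, 24 bits used */
 };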

More comments below.

> ---
>  doc/guides/prog_guide/rte_flow.rst | 77 ++++++++++++++++++++++++++++++++--
>  lib/librte_ether/rte_flow.h        | 84 ++++++++++++++++++++++++++++++++++++--
>  2 files changed, 155 insertions(+), 6 deletions(-)
> 
> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> index fd33d19..106fb93 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -997,9 +997,11 @@ Actions
>  
>  Each possible action is represented by a type. Some have associated
>  configuration structures. Several actions combined in a list can be assigned
> -to a flow rule. That list is not ordered.
> +to a flow rule. That list is not ordered, with the exception of  actions which
> +modify the packet itself, these packet modification actions must be specified
> +in the explicit order in which they are to be executed.
>  
> -They fall in three categories:
> +They fall in four categories:
>  
>  - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
>    processing matched packets by subsequent flow rules, unless overridden
> @@ -1008,8 +1010,11 @@ They fall in three categories:
>  - Non-terminating actions (PASSTHRU, DUP) that leave matched packets up for
>    additional processing by subsequent flow rules.
>  
> +- Non-terminating meta actions that do not affect the fate of packets but result
> +  in modification of the packet itself (SECURITY, TUNNEL_ENCAP, TUNNEL_DECAP).
> +
>  - Other non-terminating meta actions that do not affect the fate of packets
> -  (END, VOID, MARK, FLAG, COUNT, SECURITY).
> +  (END, VOID, MARK, FLAG, COUNT).

The above changes are not necessary anymore [1][2].

[1] "ethdev: clarify flow API pattern items and actions"
    https://dpdk.org/ml/archives/dev/2018-April/095776.html
[2] "ethdev: alter behavior of flow API actions"
    https://dpdk.org/ml/archives/dev/2018-April/095779.html

>  When several actions are combined in a flow rule, they should all have
>  different types (e.g. dropping a packet twice is not possible).
> @@ -1486,6 +1491,72 @@ fields in the pattern items.
>     | 1     | END      |
>     +-------+----------+
>  
> +

Nit: titles in this file are separated by a single empty line.

> +Action: ``TUNNEL_ENCAP``
> +^^^^^^^^^^^^^^^^^^^^^^
> +
> +Performs an encapsulation action by encapsulating the flows matched by the
> +pattern items according to the network overlay defined in the
> +``rte_flow_action_tunnel_encap`` pattern items.
> +
> +This action modifies the payload of matched flows. The pattern items specified
> +in the ``rte_flow_action_tunnel_encap`` action structure must defined a valid
> +set of overlay headers, from the Ethernet header up to the overlay header. The
> +pattern must be terminated with the RTE_FLOW_ITEM_TYPE_END item type.

Regarding the use of a pattern list, if you consider PMDs are already
iterating on a list of actions when encountering
RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, it adds yet another inner loop.

How about making each encountered RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP provide
exactly one item instead (in encap, i.e. reverse order)?

In which case perhaps "GENERIC" would be a better fit than "TUNNEL".
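
To illustrate (purely hypothetical; eth_encap, ipv4_encap, udp_encap,
vxlan_encap and queue stand for whatever per-header and fate configuration
ends up being retained):

 /* Each action prepends exactly one header, hence the reverse order. */
 struct rte_flow_action actions[] = {
         { .type = RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, .conf = &vxlan_encap },
         { .type = RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, .conf = &udp_encap },
         { .type = RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, .conf = &ipv4_encap },
         { .type = RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP, .conf = &eth_encap },
         { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
         { .type = RTE_FLOW_ACTION_TYPE_END },
 };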

> +
> +- Non-terminating by default.

There's no such property anymore [2].

> +
> +.. _table_rte_flow_action_tunnel_encap:
> +
> +.. table:: TUNNEL_ENCAP
> +
> +   +-------------+---------------------------------------------+
> +   | Field       | Value                                       |
> +   +=============+=============================================+
> +   | ``pattern`` | Virtual tunnel end-point pattern definition |
> +   +-------------+---------------------------------------------+
> +
> +
> +.. _table_rte_flow_action_tunnel_encap_example:
> +
> +.. table:: IPv4 VxLAN flow pattern example.

VxLAN => VXLAN

> +
> +   +-------+--------------------------+------------+
> +   | Index | Flow Item Type           | Flow Item  |
> +   +=======+==========================+============+
> +   | 0     | RTE_FLOW_ITEM_TYPE_ETH   | eth item   |
> +   +-------+--------------------------+------------+
> +   | 1     | RTE_FLOW_ITEM_TYPE_IPV4  | ipv4 item  |
> +   +-------+--------------------------+------------+
> +   | 2     | RTE_FLOW_ITEM_TYPE_UDP   | udp item   |
> +   +-------+--------------------------+------------+
> +   | 3     | RTE_FLOW_ITEM_TYPE_VXLAN | vxlan item |
> +   +-------+--------------------------+------------+
> +   | 4     | RTE_FLOW_ITEM_TYPE_END   | NULL       |
> +   +-------+--------------------------+------------+

One possible issue is that it relies on objects normally found on the
pattern side of flow rules. Those are supposed to match something; they are
not intended for packet header generation. While their "spec" and "mask"
fields might make sense in this context, the "last" field is odd.

You must define them without leaving anything open for interpretation by
PMDs and users alike. Defining things as "undefined" is fine as long as it's
covered.

> +
> +

Nit: only one empty line necessary here.

> +Action: ``TUNNEL_DECAP``
> +^^^^^^^^^^^^^^^^^^^^^^
> +
> +Performs a decapsulation action by stripping all headers of the virtual tunnel
> +end-point overlay up to the header defined by the flow item type of flows
> +matched by the pattern items.

Not necessarily, for instance if one guarantees that flowing traffic only
consists of decap'able packets. You must avoid mandatory dependencies
between patterns and actions since they are normally unrelated.

What you can document on the other hand is that the behavior is undefined
when processing traffic on which the action can't be applied. This is
how RSS level is documented [3].

[3] https://dpdk.org/ml/archives/dev/2018-April/095783.html

> +
> +This action modifies the payload of matched flows. The flow item type specified
> +in the ``rte_flow_action_tunnel_decap`` action structure must defined a valid
> +set of overlay header type.
> +
> +- Non-terminating by default.

See [2].

> +
> +.. _table_rte_flow_action_tunnel_decap:
> +
> +   +---------------+----------------------------------------------+
> +   | Field         | Value                                        |
> +   +===============+==============================================+
> +   | ``item type`` | Item type of tunnel end-point to decapsulate |
> +   +---------------+----------------------------------------------+

"item type" should be the exact name used in the structure.

> +
>  Negative types
>  ~~~~~~~~~~~~~~
>  
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 7d1f89d..6d94423 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -854,14 +854,17 @@ struct rte_flow_item {
>  	const void *mask; /**< Bit-mask applied to spec and last. */
>  };
>  
> +

Unnecessary empty line.

>  /**
>   * Action types.
>   *
>   * Each possible action is represented by a type. Some have associated
>   * configuration structures. Several actions combined in a list can be
> - * affected to a flow rule. That list is not ordered.
> + * affected to a flow rule. That list is not ordered, with the exception of
> + * actions which modify the packet itself, these packet modification actions
> + * must be specified in the explicit order in which they are to be executed.
>   *
> - * They fall in three categories:
> + * They fall in four categories:
>   *
>   * - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
>   *   processing matched packets by subsequent flow rules, unless overridden
> @@ -870,6 +873,10 @@ struct rte_flow_item {
>   * - Non terminating actions (PASSTHRU, DUP) that leave matched packets up
>   *   for additional processing by subsequent flow rules.
>   *
> + * - Non terminating meta actions that do not affect the fate of
> + *   packets but result in modification of the packet itself (SECURITY,
> + *   TUNNEL_ENCAP, TUNNEL_DECAP).
> + *

Same comment as above [1][2].

>   * - Other non terminating meta actions that do not affect the fate of
>   *   packets (END, VOID, MARK, FLAG, COUNT).
>   *
> @@ -1022,7 +1029,42 @@ enum rte_flow_action_type {
>  	 *
>  	 * See struct rte_flow_action_group_count.
>  	 */
> -	RTE_FLOW_ACTION_TYPE_GROUP_COUNT
> +	RTE_FLOW_ACTION_TYPE_GROUP_COUNT,

An empty line would have been needed here (if we agree about no more
GROUP_COUNT.)

> +	/**
> +	 * Encapsulate flow with tunnel defined in
> +	 * rte_flow_action_tunnel_encap structure.
> +	 *
> +	 * See struct rte_flow_action_tunnel_encap.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP,
> +
> +	/**
> +	 * Decapsulate all the headers of the tunnel
> +	 *
> +	 * See struct rte_flow_action_tunnel_decap.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_TUNNEL_DECAP,
> +
> +	/**
> +	 * Redirects packets to the logical group of the current device.
> +	 *
> +	 * In a logical hierarchy of groups, which can be used to represent a
> +	 * physical of logical chaining of flow tables, this action allows the
> +	 * terminating action to be a logical group of the same device.
> +	 *
> +	 * See struct rte_flow_action_group.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_GROUP,
> +
> +	/**
> +	 * [META]
> +	 *
> +	 * Set specific metadata field associated with packet which is then
> +	 * available to further pipeline stages.
> +	 *
> +	 * See struct rte_flow_action_metadata.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_METADATA

These two actions should be part of the next patch; I won't comment on them
here.

>  };
>  
>  /**
> @@ -1173,6 +1215,42 @@ struct rte_flow_action_group_count {
>  };
>  
>  /**
> + * RTE_FLOW_ACTION_TYPE_TUNNEL_ENCAP
> + *
> + * Virtual tunnel end-point encapsulation action data.
> + *
> + * Non-terminating action by default.

See [2].

> + */
> +struct rte_flow_action_tunnel_encap {
> +	struct rte_flow_action_item {
> +		enum rte_flow_item_type type;
> +		/**< Flow item type. */
> +		const void *item;
> +		/**< Flow item definition which points to the data of
> +		 * corresponding rte_flow_item_type.
> +		 */

I see it's a new action type, albeit a bit confusing (there is no
RTE_FLOW_ACTION_TYPE_ITEM).

I suggest the standard pattern item type since you're going with enum
rte_flow_item_type anyway. Keep in mind you need some kind of mask to tell
what fields are relevant. An application might otherwise want to encap with
unsupported properties (e.g. specific IPv4 ToS field and whatnot).

How about a single "struct rte_flow_pattern_item item", neither const nor
a pointer? It's generic enough; the enclosed spec/last/mask pointers
take care of the specifics. You just need to define what's supposed to
happen when "last" is set.
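
I.e. roughly (sketch only, reusing the existing struct rte_flow_item for the
"pattern item" mentioned above):

 struct rte_flow_action_tunnel_encap {
         struct rte_flow_item item; /* type + spec/last/mask, embedded */
 };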

> +	} *pattern;
> +	/**<
> +	 * Tunnel pattern specification (list terminated by the END pattern
> +	 * item).
> +	 */

As previously suggested, how about a single item per encap?

> +};
> +
> +/**
> + * RTE_FLOW_ACTION_TYP_TUNNEL_DECAP
> + *
> + * Virtual tunnel end-point decapsulation action data.
> + *
> + * Non-terminating action by default.
> + */
> +struct rte_flow_action_tunnel_decap {
> +	enum rte_flow_item_type type;
> +	/**<
> +	 * Flow item type of virtual tunnel end-point to be decapsulated
> +	 */
> +};

Note that contrary to ENCAP, DECAP wouldn't necessarily need repeated
actions to peel each layer off. The current definition is fine.

> +
> +/**
>   * Definition of a single action.
>   *
>   * A list of actions is terminated by a END action.
> -- 
> 2.7.4
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v3 1/4] ethdev: add group counter support to rte_flow
  @ 2018-04-06 20:26  3%   ` Adrien Mazarguil
  2018-04-09 14:22  0%     ` Mohammad Abdul Awal
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 20:26 UTC (permalink / raw)
  To: Declan Doherty; +Cc: dev

On Fri, Apr 06, 2018 at 01:24:00PM +0100, Declan Doherty wrote:
> Add new RTE_FLOW_ACTION_TYPE_GROUP_COUNT action type to enable shared
> counters across multiple flows on a single port or across multiple
> flows on multiple ports within the same switch domain.
> 
> Introduce new API rte_flow_query_group_count to allow querying of group
> counters.
> 
> Signed-off-by: Declan Doherty <declan.doherty@intel.com>

Both features are definitely needed; however, I suggest enhancing the
existing action type and query function instead, given the rte_flow ABI
won't be maintained for the 18.05 release [1].

Counters and query support were defined as a kind of PoC in preparation for
future requirements back in DPDK 17.02 and so far few PMDs have implemented
the query callback (mlx5 and failsafe, and the latter isn't really a PMD).

Due to the behavior change of action lists [2], providing an action type as
a query parameter is not specific enough anymore; for instance, if a list
contains multiple COUNT actions, the application should be able to tell
which one needs to be queried.

Therefore I suggest redefining the query function as follows:

 int
 rte_flow_query(uint16_t port_id,
                struct rte_flow *flow,
                const struct rte_flow_action *action,
                void *data,
                struct rte_flow_error *error);

Third argument is an action definition with the same configuration (if any)
as previously defined in the action list originally used to create the flow
rule (not necessarily the same pointer, only the contents matter).

It means two perfectly identical actions can't be distinguished, and that's
how group counters will work.

Instead of adding a new action type to distinguish groups, a configuration
structure is added to the existing RTE_FLOW_ACTION_TYPE_COUNT, with
non-shared counters as a default behavior:

 struct rte_flow_action_count {
         uint32_t shared:1; /**< Share counter ID with other flow rules. */
         uint32_t reserved:31; /**< Reserved, must be zero. */
         uint32_t id; /**< Counter ID. */
 };

Doing so will impact some existing code in mlx5 and librte_flow_classify,
but that shouldn't be much of an issue.
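
As a usage illustration (rough sketch only; the "shared" bit and the
redefined rte_flow_query() are just the proposal above and do not exist yet),
two rules sharing one counter and querying it could look like:

 struct rte_flow_action_count count = { .shared = 1, .id = 42 };
 struct rte_flow_action_queue queue = { .index = 0 };
 struct rte_flow_action actions[] = {
         { .type = RTE_FLOW_ACTION_TYPE_COUNT, .conf = &count },
         { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
         { .type = RTE_FLOW_ACTION_TYPE_END },
 };
 struct rte_flow_query_count stats = { .reset = 0 };
 struct rte_flow_error error;

 /* flow1 and flow2 are two rules previously created on port_id with the
  * actions above; both hit counter 42 because of the shared ID. */
 rte_flow_query(port_id, flow1, &actions[0], &stats, &error);
 rte_flow_query(port_id, flow2, &actions[0], &stats, &error);

Leaving "shared" at 0 would keep today's per-rule counter behavior.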

Keep in mind testpmd and its documentation must be updated as well.

Thoughts?

A few nits below for the sake of commenting.

[1] "Flow API overhaul for switch offloads"
    http://dpdk.org/ml/archives/dev/2018-April/095774.html
[2] "ethdev: alter behavior of flow API actions"
    http://dpdk.org/ml/archives/dev/2018-April/095779.html

> ---
>  doc/guides/prog_guide/rte_flow.rst      | 35 +++++++++++++++++++++
>  lib/librte_ether/rte_ethdev_version.map |  8 +++++
>  lib/librte_ether/rte_flow.c             | 21 +++++++++++++
>  lib/librte_ether/rte_flow.h             | 56 ++++++++++++++++++++++++++++++++-
>  lib/librte_ether/rte_flow_driver.h      |  6 ++++
>  5 files changed, 125 insertions(+), 1 deletion(-)
> 
> diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
> index 961943d..fd33d19 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -1698,6 +1698,41 @@ Return values:
>  
>  - 0 on success, a negative errno value otherwise and ``rte_errno`` is set.
>  
> +

Unnecessary empty line.

> +Group Count Query
> +~~~~~~~~~~~~~~~~~
> +
> +Query group counter which can be associated with multiple flows on a specified
> +port.
> +
> +This function allows retrieving of group counters. A group counter is a
> +counter which can be shared among multiple flows on a single port or among
> +multiple flows on multiple ports within the same switch domain. Data is
> +gathered by special actions which must be present in the flow rule
> +definition.
> +
> +.. code-block:: c
> +
> +   int
> +   rte_flow_query_group_count(uint16_t port_id,
> +			   uint32_t group_counter_id,
> +			   struct rte_flow_query_count *count,
> +               struct rte_flow_error *error);
> +
> +Arguments:
> +
> +- ``port_id``: port identifier of Ethernet device.
> +- ``group_counter_id``: group counter identifier.
> +- ``count``: group counter parameters.
> +- ``error``: perform verbose error reporting if not NULL. PMDs initialize
> +  this structure in case of error only.
> +
> +Return values:
> +
> +- 0 on success, a negative errno value otherwise and ``rte_errno`` is set.
> +
> +
> +

More unnecessary empty lines.

>  Isolated mode
>  -------------
>  
> diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
> index 34df6c8..cff6807 100644
> --- a/lib/librte_ether/rte_ethdev_version.map
> +++ b/lib/librte_ether/rte_ethdev_version.map
> @@ -229,3 +229,11 @@ EXPERIMENTAL {
>  	rte_mtr_stats_update;
>  
>  } DPDK_17.11;
> +
> +

One more.

> +EXPERIMENTAL {
> +	global:
> +
> +	rte_flow_query_group_count
> +
> +} DPDK_18.05;
> diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
> index 38f2d27..e10b1d0 100644
> --- a/lib/librte_ether/rte_flow.c
> +++ b/lib/librte_ether/rte_flow.c
> @@ -418,3 +418,24 @@ rte_flow_copy(struct rte_flow_desc *desc, size_t len,
>  	}
>  	return 0;
>  }
> +
> +int __rte_experimental
> +rte_flow_query_group_count(uint16_t port_id,
> +	uint32_t group_count_id,
> +	struct rte_flow_query_count *count,
> +	struct rte_flow_error *error)

This function lacks a short documentation comment (see rte_flow_query()).

> +{
> +	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> +	const struct rte_flow_ops *ops = rte_flow_ops_get(port_id, error);
> +
> +	if (!ops)
> +		return -rte_errno;
> +	if (likely(!!ops->query_group_count))
> +		return flow_err(port_id,
> +				ops->query_group_count(dev, group_count_id,
> +						       count, error),
> +				error);
> +	return rte_flow_error_set(error, ENOSYS,
> +				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +				  NULL, rte_strerror(ENOSYS));
> +}
> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index 13e4202..7d1f89d 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -1010,7 +1010,19 @@ enum rte_flow_action_type {
>  	 *
>  	 * See struct rte_flow_action_security.
>  	 */
> -	RTE_FLOW_ACTION_TYPE_SECURITY
> +	RTE_FLOW_ACTION_TYPE_SECURITY,
> +
> +	/**
> +	 * Enable a shared flow group counter for flow. Group counters can be
> +	 * associated with multiples flows on the same port or on port within
> +	 * the same switch domain if supported by that device.
> +	 *
> +	 * Group counters can be retrieved and reset through
> +	 * rte_flow_query_group_count()
> +	 *
> +	 * See struct rte_flow_action_group_count.
> +	 */
> +	RTE_FLOW_ACTION_TYPE_GROUP_COUNT

Don't forget the trailing comma.

>  };
>  
>  /**
> @@ -1149,6 +1161,18 @@ struct rte_flow_action_security {
>  };
>  
>  /**
> + * RTE_FLOW_ACTION_TYPE_GROUP_COUNT
> + *
> + * A packet/byte counter which can be shared across a group of flows programmed
> + * on the same port/switch domain.
> + *
> + * Non-terminating by default.
> + */
> +struct rte_flow_action_group_count {
> +	uint32_t id;
> +};
> +
> +/**
>   * Definition of a single action.
>   *
>   * A list of actions is terminated by a END action.
> @@ -1476,6 +1500,36 @@ rte_flow_copy(struct rte_flow_desc *fd, size_t len,
>  	      const struct rte_flow_item *items,
>  	      const struct rte_flow_action *actions);
>  
> +

Caught another empty line.

> +/**
> + * Get hit/bytes count for group counter.
> + *
> + * A group counter is a counter which can be shared among multiple flows on a
> + * single port or among multiple flows on multiple ports within the same
> + * switch domain.
> + *
> + * In the case of ports within the same switch domain a global name space is
> + * assumed for group_count_id value.
> + *
> + * @param[in]	port_id
> + *   Port identifier of Ethernet device.
> + * @param[in]	group_count_id
> + *   Group counter identifier to query
> + * @param[out]	count
> + *   Group counter value
> + * @param[out]	error
> + *   Perform verbose error reporting if not NULL. PMDs initialize this
> + *   structure in case of error only.
> + *
> + * @return
> + *   Negative error code (errno value) and rte_errno is set.
> + */
> +int __rte_experimental
> +rte_flow_query_group_count(uint16_t port_id,
> +			   uint32_t group_count_id,
> +			   struct rte_flow_query_count *count,
> +			   struct rte_flow_error *error);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_ether/rte_flow_driver.h b/lib/librte_ether/rte_flow_driver.h
> index 7778c8e..ef09465 100644
> --- a/lib/librte_ether/rte_flow_driver.h
> +++ b/lib/librte_ether/rte_flow_driver.h
> @@ -96,6 +96,12 @@ struct rte_flow_ops {
>  		(struct rte_eth_dev *,
>  		 int,
>  		 struct rte_flow_error *);
> +	/** See rte_flow_query_group_count(). */
> +	int (*query_group_count)
> +		(struct rte_eth_dev *,
> +		 uint32_t,
> +		 struct rte_flow_query_count *,
> +		 struct rte_flow_error *);
>  };
>  
>  /**
> -- 
> 2.7.4
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 02/10] bpf: add BPF loading and execution framework
  @ 2018-04-06 18:49  2% ` Konstantin Ananyev
  0 siblings, 0 replies; 200+ results
From: Konstantin Ananyev @ 2018-04-06 18:49 UTC (permalink / raw)
  To: dev; +Cc: Konstantin Ananyev

librte_bpf provides a framework to load and execute eBPF bytecode
inside user-space DPDK-based applications.
It supports a basic set of features from the eBPF spec
(https://www.kernel.org/doc/Documentation/networking/filter.txt).

Not currently supported features:
 - JIT
 - cBPF
 - tail-pointer call
 - eBPF MAP
 - skb

It also adds a dependency on libelf.
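
A rough usage sketch (illustrative only; apart from rte_bpf_destroy() shown
in bpf.c below, the load/exec names and parameters are assumptions to be
checked against rte_bpf.h):

 #include <rte_bpf.h>

 /* Load one ELF section holding eBPF bytecode, run it once, release it. */
 struct rte_bpf_prm prm = { 0 }; /* program argument/xsym description */
 struct rte_bpf *bpf = rte_bpf_elf_load(&prm, "filter.o", ".text");

 if (bpf != NULL) {
         uint64_t rc = rte_bpf_exec(bpf, ctx); /* ctx: program argument */
         /* rc holds the eBPF program's return value. */
         rte_bpf_destroy(bpf);
 }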

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 config/common_base                 |   5 +
 lib/Makefile                       |   2 +
 lib/librte_bpf/Makefile            |  30 +++
 lib/librte_bpf/bpf.c               |  59 +++++
 lib/librte_bpf/bpf_exec.c          | 452 +++++++++++++++++++++++++++++++++++++
 lib/librte_bpf/bpf_impl.h          |  41 ++++
 lib/librte_bpf/bpf_load.c          | 386 +++++++++++++++++++++++++++++++
 lib/librte_bpf/bpf_validate.c      |  55 +++++
 lib/librte_bpf/meson.build         |  18 ++
 lib/librte_bpf/rte_bpf.h           | 170 ++++++++++++++
 lib/librte_bpf/rte_bpf_version.map |  12 +
 lib/meson.build                    |   2 +-
 mk/rte.app.mk                      |   2 +
 13 files changed, 1233 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_bpf/Makefile
 create mode 100644 lib/librte_bpf/bpf.c
 create mode 100644 lib/librte_bpf/bpf_exec.c
 create mode 100644 lib/librte_bpf/bpf_impl.h
 create mode 100644 lib/librte_bpf/bpf_load.c
 create mode 100644 lib/librte_bpf/bpf_validate.c
 create mode 100644 lib/librte_bpf/meson.build
 create mode 100644 lib/librte_bpf/rte_bpf.h
 create mode 100644 lib/librte_bpf/rte_bpf_version.map

diff --git a/config/common_base b/config/common_base
index c09c7cf88..d68c2e211 100644
--- a/config/common_base
+++ b/config/common_base
@@ -821,3 +821,8 @@ CONFIG_RTE_APP_CRYPTO_PERF=y
 # Compile the eventdev application
 #
 CONFIG_RTE_APP_EVENTDEV=y
+
+#
+# Compile librte_bpf
+#
+CONFIG_RTE_LIBRTE_BPF=y
diff --git a/lib/Makefile b/lib/Makefile
index ec965a606..a4a2329f9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -97,6 +97,8 @@ DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ether
 DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
 DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ether librte_net
 DEPDIRS-librte_gso += librte_mempool
+DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf
+DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf librte_ether
 
 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
 DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_bpf/Makefile b/lib/librte_bpf/Makefile
new file mode 100644
index 000000000..e0f434e77
--- /dev/null
+++ b/lib/librte_bpf/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_bpf.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+LDLIBS += -lrte_net -lrte_eal
+LDLIBS += -lrte_mempool -lrte_ring
+LDLIBS += -lrte_mbuf -lrte_ethdev
+LDLIBS += -lelf
+
+EXPORT_MAP := rte_bpf_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_exec.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_load.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_validate.c
+
+# install header files
+SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += rte_bpf.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_bpf/bpf.c b/lib/librte_bpf/bpf.c
new file mode 100644
index 000000000..d7f68c017
--- /dev/null
+++ b/lib/librte_bpf/bpf.c
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#include "bpf_impl.h"
+
+int rte_bpf_logtype;
+
+__rte_experimental void
+rte_bpf_destroy(struct rte_bpf *bpf)
+{
+	if (bpf != NULL) {
+		if (bpf->jit.func != NULL)
+			munmap(bpf->jit.func, bpf->jit.sz);
+		munmap(bpf, bpf->sz);
+	}
+}
+
+__rte_experimental int
+rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit)
+{
+	if (bpf == NULL || jit == NULL)
+		return -EINVAL;
+
+	jit[0] = bpf->jit;
+	return 0;
+}
+
+int
+bpf_jit(struct rte_bpf *bpf)
+{
+	int32_t rc;
+
+	rc = -ENOTSUP;
+	if (rc != 0)
+		RTE_BPF_LOG(WARNING, "%s(%p) failed, error code: %d;\n",
+			__func__, bpf, rc);
+	return rc;
+}
+
+RTE_INIT(rte_bpf_init_log);
+
+static void
+rte_bpf_init_log(void)
+{
+	rte_bpf_logtype = rte_log_register("lib.bpf");
+	if (rte_bpf_logtype >= 0)
+		rte_log_set_level(rte_bpf_logtype, RTE_LOG_INFO);
+}
diff --git a/lib/librte_bpf/bpf_exec.c b/lib/librte_bpf/bpf_exec.c
new file mode 100644
index 000000000..0382ade98
--- /dev/null
+++ b/lib/librte_bpf/bpf_exec.c
@@ -0,0 +1,452 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_byteorder.h>
+
+#include "bpf_impl.h"
+
+#define BPF_JMP_UNC(ins)	((ins) += (ins)->off)
+
+#define BPF_JMP_CND_REG(reg, ins, op, type)	\
+	((ins) += \
+		((type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg]) ? \
+		(ins)->off : 0)
+
+#define BPF_JMP_CND_IMM(reg, ins, op, type)	\
+	((ins) += \
+		((type)(reg)[(ins)->dst_reg] op (type)(ins)->imm) ? \
+		(ins)->off : 0)
+
+#define BPF_NEG_ALU(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(-(reg)[(ins)->dst_reg]))
+
+#define BPF_MOV_ALU_REG(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(reg)[(ins)->src_reg])
+
+#define BPF_OP_ALU_REG(reg, ins, op, type)	\
+	((reg)[(ins)->dst_reg] = \
+		(type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg])
+
+#define BPF_MOV_ALU_IMM(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(ins)->imm)
+
+#define BPF_OP_ALU_IMM(reg, ins, op, type)	\
+	((reg)[(ins)->dst_reg] = \
+		(type)(reg)[(ins)->dst_reg] op (type)(ins)->imm)
+
+#define BPF_DIV_ZERO_CHECK(bpf, reg, ins, type) do { \
+	if ((type)(reg)[(ins)->src_reg] == 0) { \
+		RTE_BPF_LOG(ERR, \
+			"%s(%p): division by 0 at pc: %#zx;\n", \
+			__func__, bpf, \
+			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.ins); \
+		return 0; \
+	} \
+} while (0)
+
+#define BPF_LD_REG(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = \
+		*(type *)(uintptr_t)((reg)[(ins)->src_reg] + (ins)->off))
+
+#define BPF_ST_IMM(reg, ins, type)	\
+	(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \
+		(type)(ins)->imm)
+
+#define BPF_ST_REG(reg, ins, type)	\
+	(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \
+		(type)(reg)[(ins)->src_reg])
+
+#define BPF_ST_XADD_REG(reg, ins, tp)	\
+	(rte_atomic##tp##_add((rte_atomic##tp##_t *) \
+		(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off), \
+		reg[ins->src_reg]))
+
+static inline void
+bpf_alu_be(uint64_t reg[MAX_BPF_REG], const struct bpf_insn *ins)
+{
+	uint64_t *v;
+
+	v = reg + ins->dst_reg;
+	switch (ins->imm) {
+	case 16:
+		*v = rte_cpu_to_be_16(*v);
+		break;
+	case 32:
+		*v = rte_cpu_to_be_32(*v);
+		break;
+	case 64:
+		*v = rte_cpu_to_be_64(*v);
+		break;
+	}
+}
+
+static inline void
+bpf_alu_le(uint64_t reg[MAX_BPF_REG], const struct bpf_insn *ins)
+{
+	uint64_t *v;
+
+	v = reg + ins->dst_reg;
+	switch (ins->imm) {
+	case 16:
+		*v = rte_cpu_to_le_16(*v);
+		break;
+	case 32:
+		*v = rte_cpu_to_le_32(*v);
+		break;
+	case 64:
+		*v = rte_cpu_to_le_64(*v);
+		break;
+	}
+}
+
+static inline uint64_t
+bpf_exec(const struct rte_bpf *bpf, uint64_t reg[MAX_BPF_REG])
+{
+	const struct bpf_insn *ins;
+
+	for (ins = bpf->prm.ins; ; ins++) {
+		switch (ins->code) {
+		/* 32 bit ALU IMM operations */
+		case (BPF_ALU | BPF_ADD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, +, uint32_t);
+			break;
+		case (BPF_ALU | BPF_SUB | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, -, uint32_t);
+			break;
+		case (BPF_ALU | BPF_AND | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, &, uint32_t);
+			break;
+		case (BPF_ALU | BPF_OR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, |, uint32_t);
+			break;
+		case (BPF_ALU | BPF_LSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, <<, uint32_t);
+			break;
+		case (BPF_ALU | BPF_RSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, uint32_t);
+			break;
+		case (BPF_ALU | BPF_XOR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, ^, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MUL | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, *, uint32_t);
+			break;
+		case (BPF_ALU | BPF_DIV | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, /, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, %, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOV | BPF_K):
+			BPF_MOV_ALU_IMM(reg, ins, uint32_t);
+			break;
+		/* 32 bit ALU REG operations */
+		case (BPF_ALU | BPF_ADD | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, +, uint32_t);
+			break;
+		case (BPF_ALU | BPF_SUB | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, -, uint32_t);
+			break;
+		case (BPF_ALU | BPF_AND | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, &, uint32_t);
+			break;
+		case (BPF_ALU | BPF_OR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, |, uint32_t);
+			break;
+		case (BPF_ALU | BPF_LSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, <<, uint32_t);
+			break;
+		case (BPF_ALU | BPF_RSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, uint32_t);
+			break;
+		case (BPF_ALU | BPF_XOR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, ^, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MUL | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, *, uint32_t);
+			break;
+		case (BPF_ALU | BPF_DIV | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t);
+			BPF_OP_ALU_REG(reg, ins, /, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOD | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t);
+			BPF_OP_ALU_REG(reg, ins, %, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOV | BPF_X):
+			BPF_MOV_ALU_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_ALU | BPF_NEG):
+			BPF_NEG_ALU(reg, ins, uint32_t);
+			break;
+		case (BPF_ALU | BPF_END | BPF_TO_BE):
+			bpf_alu_be(reg, ins);
+			break;
+		case (BPF_ALU | BPF_END | BPF_TO_LE):
+			bpf_alu_le(reg, ins);
+			break;
+		/* 64 bit ALU IMM operations */
+		case (BPF_ALU64 | BPF_ADD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, +, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_SUB | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, -, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_AND | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, &, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_OR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, |, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_LSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, <<, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_RSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_ARSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, int64_t);
+			break;
+		case (BPF_ALU64 | BPF_XOR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, ^, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MUL | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, *, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_DIV | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, /, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, %, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOV | BPF_K):
+			BPF_MOV_ALU_IMM(reg, ins, uint64_t);
+			break;
+		/* 64 bit ALU REG operations */
+		case (BPF_ALU64 | BPF_ADD | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, +, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_SUB | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, -, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_AND | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, &, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_OR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, |, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_LSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, <<, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_RSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_ARSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, int64_t);
+			break;
+		case (BPF_ALU64 | BPF_XOR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, ^, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MUL | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, *, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_DIV | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t);
+			BPF_OP_ALU_REG(reg, ins, /, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOD | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t);
+			BPF_OP_ALU_REG(reg, ins, %, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOV | BPF_X):
+			BPF_MOV_ALU_REG(reg, ins, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_NEG):
+			BPF_NEG_ALU(reg, ins, uint64_t);
+			break;
+		/* load instructions */
+		case (BPF_LDX | BPF_MEM | BPF_B):
+			BPF_LD_REG(reg, ins, uint8_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_H):
+			BPF_LD_REG(reg, ins, uint16_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_W):
+			BPF_LD_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_DW):
+			BPF_LD_REG(reg, ins, uint64_t);
+			break;
+		/* load 64 bit immediate value */
+		case (BPF_LD | BPF_IMM | BPF_DW):
+			reg[ins->dst_reg] = (uint32_t)ins[0].imm |
+				(uint64_t)(uint32_t)ins[1].imm << 32;
+			ins++;
+			break;
+		/* store instructions */
+		case (BPF_STX | BPF_MEM | BPF_B):
+			BPF_ST_REG(reg, ins, uint8_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_H):
+			BPF_ST_REG(reg, ins, uint16_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_W):
+			BPF_ST_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_DW):
+			BPF_ST_REG(reg, ins, uint64_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_B):
+			BPF_ST_IMM(reg, ins, uint8_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_H):
+			BPF_ST_IMM(reg, ins, uint16_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_W):
+			BPF_ST_IMM(reg, ins, uint32_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_DW):
+			BPF_ST_IMM(reg, ins, uint64_t);
+			break;
+		/* atomic add instructions */
+		case (BPF_STX | BPF_XADD | BPF_W):
+			BPF_ST_XADD_REG(reg, ins, 32);
+			break;
+		case (BPF_STX | BPF_XADD | BPF_DW):
+			BPF_ST_XADD_REG(reg, ins, 64);
+			break;
+		/* jump instructions */
+		case (BPF_JMP | BPF_JA):
+			BPF_JMP_UNC(ins);
+			break;
+		/* jump IMM instructions */
+		case (BPF_JMP | BPF_JEQ | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, ==, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JNE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, !=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JSGT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSGE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSET | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, &, uint64_t);
+			break;
+		/* jump REG instructions */
+		case (BPF_JMP | BPF_JEQ | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, ==, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JNE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, !=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JSGT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSGE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSET | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, &, uint64_t);
+			break;
+		/* call instructions */
+		case (BPF_JMP | BPF_CALL):
+			reg[BPF_REG_0] = bpf->prm.xsym[ins->imm].func(
+				reg[BPF_REG_1], reg[BPF_REG_2], reg[BPF_REG_3],
+				reg[BPF_REG_4], reg[BPF_REG_5]);
+			break;
+		/* return instruction */
+		case (BPF_JMP | BPF_EXIT):
+			return reg[BPF_REG_0];
+		default:
+			RTE_BPF_LOG(ERR,
+				"%s(%p): invalid opcode %#x at pc: %#zx;\n",
+				__func__, bpf, ins->code,
+				(uintptr_t)ins - (uintptr_t)bpf->prm.ins);
+			return 0;
+		}
+	}
+
+	/* should never be reached */
+	RTE_VERIFY(0);
+	return 0;
+}
+
+__rte_experimental uint32_t
+rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
+	uint32_t num)
+{
+	uint32_t i;
+	uint64_t reg[MAX_BPF_REG];
+	uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)];
+
+	for (i = 0; i != num; i++) {
+
+		reg[BPF_REG_1] = (uintptr_t)ctx[i];
+		reg[BPF_REG_10] = (uintptr_t)(stack + RTE_DIM(stack));
+
+		rc[i] = bpf_exec(bpf, reg);
+	}
+
+	return i;
+}
+
+__rte_experimental uint64_t
+rte_bpf_exec(const struct rte_bpf *bpf, void *ctx)
+{
+	uint64_t rc;
+
+	rte_bpf_exec_burst(bpf, &ctx, &rc, 1);
+	return rc;
+}
diff --git a/lib/librte_bpf/bpf_impl.h b/lib/librte_bpf/bpf_impl.h
new file mode 100644
index 000000000..5d7e65c31
--- /dev/null
+++ b/lib/librte_bpf/bpf_impl.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _BPF_H_
+#define _BPF_H_
+
+#include <rte_bpf.h>
+#include <sys/mman.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_BPF_STACK_SIZE	0x200
+
+struct rte_bpf {
+	struct rte_bpf_prm prm;
+	struct rte_bpf_jit jit;
+	size_t sz;
+	uint32_t stack_sz;
+};
+
+extern int bpf_validate(struct rte_bpf *bpf);
+
+extern int bpf_jit(struct rte_bpf *bpf);
+
+#ifdef RTE_ARCH_X86_64
+extern int bpf_jit_x86(struct rte_bpf *);
+#endif
+
+extern int rte_bpf_logtype;
+
+#define	RTE_BPF_LOG(lvl, fmt, args...) \
+	rte_log(RTE_LOG_## lvl, rte_bpf_logtype, fmt, ##args)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BPF_H_ */
diff --git a/lib/librte_bpf/bpf_load.c b/lib/librte_bpf/bpf_load.c
new file mode 100644
index 000000000..3c7279a6c
--- /dev/null
+++ b/lib/librte_bpf/bpf_load.c
@@ -0,0 +1,386 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <fcntl.h>
+
+#include <libelf.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_byteorder.h>
+#include <rte_errno.h>
+
+#include "bpf_impl.h"
+
+/* To overcome compatibility issue */
+#ifndef EM_BPF
+#define	EM_BPF	247
+#endif
+
+static uint32_t
+bpf_find_xsym(const char *sn, enum rte_bpf_xtype type,
+	const struct rte_bpf_xsym fp[], uint32_t fn)
+{
+	uint32_t i;
+
+	if (sn == NULL || fp == NULL)
+		return UINT32_MAX;
+
+	for (i = 0; i != fn; i++) {
+		if (fp[i].type == type && strcmp(sn, fp[i].name) == 0)
+			break;
+	}
+
+	return (i != fn) ? i : UINT32_MAX;
+}
+
+/*
+ * update BPF code at offset *ofs* with a proper address(index) for external
+ * symbol *sn*
+ */
+static int
+resolve_xsym(const char *sn, size_t ofs, struct bpf_insn *ins, size_t ins_sz,
+	const struct rte_bpf_prm *prm)
+{
+	uint32_t idx, fidx;
+	enum rte_bpf_xtype type;
+
+	if (ofs % sizeof(ins[0]) != 0 || ofs >= ins_sz)
+		return -EINVAL;
+
+	idx = ofs / sizeof(ins[0]);
+	if (ins[idx].code == (BPF_JMP | BPF_CALL))
+		type = RTE_BPF_XTYPE_FUNC;
+	else if (ins[idx].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+			ofs < ins_sz - sizeof(ins[idx]))
+		type = RTE_BPF_XTYPE_VAR;
+	else
+		return -EINVAL;
+
+	fidx = bpf_find_xsym(sn, type, prm->xsym, prm->nb_xsym);
+	if (fidx == UINT32_MAX)
+		return -ENOENT;
+
+	/* for function we just need an index in our xsym table */
+	if (type == RTE_BPF_XTYPE_FUNC)
+		ins[idx].imm = fidx;
+	/* for variable we need to store its absolute address */
+	else {
+		ins[idx].imm = (uintptr_t)prm->xsym[fidx].var;
+		ins[idx + 1].imm =
+			(uint64_t)(uintptr_t)prm->xsym[fidx].var >> 32;
+	}
+
+	return 0;
+}
+
+static int
+check_elf_header(const Elf64_Ehdr * eh)
+{
+	const char *err;
+
+	err = NULL;
+
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	if (eh->e_ident[EI_DATA] != ELFDATA2LSB)
+#else
+	if (eh->e_ident[EI_DATA] != ELFDATA2MSB)
+#endif
+		err = "not native byte order";
+	else if (eh->e_ident[EI_OSABI] != ELFOSABI_NONE)
+		err = "unexpected OS ABI";
+	else if (eh->e_type != ET_REL)
+		err = "unexpected ELF type";
+	else if (eh->e_machine != EM_NONE && eh->e_machine != EM_BPF)
+		err = "unexpected machine type";
+
+	if (err != NULL) {
+		RTE_BPF_LOG(ERR, "%s(): %s\n", __func__, err);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * helper function, find executable section by name.
+ */
+static int
+find_elf_code(Elf *elf, const char *section, Elf_Data **psd, size_t *pidx)
+{
+	Elf_Scn *sc;
+	const Elf64_Ehdr *eh;
+	const Elf64_Shdr *sh;
+	Elf_Data *sd;
+	const char *sn;
+	int32_t rc;
+
+	eh = elf64_getehdr(elf);
+	if (eh == NULL) {
+		rc = elf_errno();
+		RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n",
+			__func__, elf, section, rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	if (check_elf_header(eh) != 0)
+		return -EINVAL;
+
+	/* find given section by name */
+	for (sc = elf_nextscn(elf, NULL); sc != NULL;
+			sc = elf_nextscn(elf, sc)) {
+		sh = elf64_getshdr(sc);
+		sn = elf_strptr(elf, eh->e_shstrndx, sh->sh_name);
+		if (sn != NULL && strcmp(section, sn) == 0 &&
+				sh->sh_type == SHT_PROGBITS &&
+				sh->sh_flags == (SHF_ALLOC | SHF_EXECINSTR))
+			break;
+	}
+
+	sd = elf_getdata(sc, NULL);
+	if (sd == NULL || sd->d_size == 0 ||
+			sd->d_size % sizeof(struct bpf_insn) != 0) {
+		rc = elf_errno();
+		RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n",
+			__func__, elf, section, rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	*psd = sd;
+	*pidx = elf_ndxscn(sc);
+	return 0;
+}
+
+/*
+ * helper function to process data from relocation table.
+ */
+static int
+process_reloc(Elf *elf, size_t sym_idx, Elf64_Rel *re, size_t re_sz,
+	struct bpf_insn *ins, size_t ins_sz, const struct rte_bpf_prm *prm)
+{
+	int32_t rc;
+	uint32_t i, n;
+	size_t ofs, sym;
+	const char *sn;
+	const Elf64_Ehdr *eh;
+	Elf_Scn *sc;
+	const Elf_Data *sd;
+	Elf64_Sym *sm;
+
+	eh = elf64_getehdr(elf);
+
+	/* get symtable by section index */
+	sc = elf_getscn(elf, sym_idx);
+	sd = elf_getdata(sc, NULL);
+	if (sd == NULL)
+		return -EINVAL;
+	sm = sd->d_buf;
+
+	n = re_sz / sizeof(re[0]);
+	for (i = 0; i != n; i++) {
+
+		ofs = re[i].r_offset;
+
+		/* retrieve index in the symtable */
+		sym = ELF64_R_SYM(re[i].r_info);
+		if (sym * sizeof(sm[0]) >= sd->d_size)
+			return -EINVAL;
+
+		sn = elf_strptr(elf, eh->e_shstrndx, sm[sym].st_name);
+
+		rc = resolve_xsym(sn, ofs, ins, ins_sz, prm);
+		if (rc != 0) {
+			RTE_BPF_LOG(ERR,
+				"resolve_xsym(%s, %zu) error code: %d\n",
+				sn, ofs, rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * helper function, find relocation information (if any)
+ * and update bpf code.
+ */
+static int
+elf_reloc_code(Elf *elf, Elf_Data *ed, size_t sidx,
+	const struct rte_bpf_prm *prm)
+{
+	Elf64_Rel *re;
+	Elf_Scn *sc;
+	const Elf64_Shdr *sh;
+	const Elf_Data *sd;
+	int32_t rc;
+
+	rc = 0;
+
+	/* walk through all sections */
+	for (sc = elf_nextscn(elf, NULL); sc != NULL && rc == 0;
+			sc = elf_nextscn(elf, sc)) {
+
+		sh = elf64_getshdr(sc);
+
+		/* relocation data for our code section */
+		if (sh->sh_type == SHT_REL && sh->sh_info == sidx) {
+			sd = elf_getdata(sc, NULL);
+			if (sd == NULL || sd->d_size == 0 ||
+					sd->d_size % sizeof(re[0]) != 0)
+				return -EINVAL;
+			rc = process_reloc(elf, sh->sh_link,
+				sd->d_buf, sd->d_size, ed->d_buf, ed->d_size,
+				prm);
+		}
+	}
+
+	return rc;
+}
+
+static struct rte_bpf *
+bpf_load(const struct rte_bpf_prm *prm)
+{
+	uint8_t *buf;
+	struct rte_bpf *bpf;
+	size_t sz, bsz, insz, xsz;
+
+	xsz =  prm->nb_xsym * sizeof(prm->xsym[0]);
+	insz = prm->nb_ins * sizeof(prm->ins[0]);
+	bsz = sizeof(bpf[0]);
+	sz = insz + xsz + bsz;
+
+	buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (buf == MAP_FAILED)
+		return NULL;
+
+	bpf = (void *)buf;
+	bpf->sz = sz;
+
+	memcpy(&bpf->prm, prm, sizeof(bpf->prm));
+
+	memcpy(buf + bsz, prm->xsym, xsz);
+	memcpy(buf + bsz + xsz, prm->ins, insz);
+
+	bpf->prm.xsym = (void *)(buf + bsz);
+	bpf->prm.ins = (void *)(buf + bsz + xsz);
+
+	return bpf;
+}
+
+__rte_experimental struct rte_bpf *
+rte_bpf_load(const struct rte_bpf_prm *prm)
+{
+	struct rte_bpf *bpf;
+	int32_t rc;
+
+	if (prm == NULL || prm->ins == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	bpf = bpf_load(prm);
+	if (bpf == NULL) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+
+	rc = bpf_validate(bpf);
+	if (rc == 0) {
+		bpf_jit(bpf);
+		if (mprotect(bpf, bpf->sz, PROT_READ) != 0)
+			rc = -ENOMEM;
+	}
+
+	if (rc != 0) {
+		rte_bpf_destroy(bpf);
+		rte_errno = -rc;
+		return NULL;
+	}
+
+	return bpf;
+}
+
+static struct rte_bpf *
+bpf_load_elf(const struct rte_bpf_prm *prm, int32_t fd, const char *section)
+{
+	Elf *elf;
+	Elf_Data *sd;
+	size_t sidx;
+	int32_t rc;
+	struct rte_bpf *bpf;
+	struct rte_bpf_prm np;
+
+	elf_version(EV_CURRENT);
+	elf = elf_begin(fd, ELF_C_READ, NULL);
+
+	rc = find_elf_code(elf, section, &sd, &sidx);
+	if (rc == 0)
+		rc = elf_reloc_code(elf, sd, sidx, prm);
+
+	if (rc == 0) {
+		np = prm[0];
+		np.ins = sd->d_buf;
+		np.nb_ins = sd->d_size / sizeof(struct bpf_insn);
+		bpf = rte_bpf_load(&np);
+	} else {
+		bpf = NULL;
+		rte_errno = -rc;
+	}
+
+	elf_end(elf);
+	return bpf;
+}
+
+__rte_experimental struct rte_bpf *
+rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
+	const char *sname)
+{
+	int32_t fd, rc;
+	struct rte_bpf *bpf;
+
+	if (prm == NULL || fname == NULL || sname == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	fd = open(fname, O_RDONLY);
+	if (fd < 0) {
+		rc = errno;
+		RTE_BPF_LOG(ERR, "%s(%s) error code: %d(%s)\n",
+			__func__, fname, rc, strerror(rc));
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	bpf = bpf_load_elf(prm, fd, sname);
+	close(fd);
+
+	if (bpf == NULL) {
+		RTE_BPF_LOG(ERR,
+			"%s(fname=\"%s\", sname=\"%s\") failed, "
+			"error code: %d\n",
+			__func__, fname, sname, rte_errno);
+		return NULL;
+	}
+
+	RTE_BPF_LOG(INFO, "%s(fname=\"%s\", sname=\"%s\") "
+		"successfully creates %p(jit={.func=%p,.sz=%zu});\n",
+		__func__, fname, sname, bpf, bpf->jit.func, bpf->jit.sz);
+	return bpf;
+}
diff --git a/lib/librte_bpf/bpf_validate.c b/lib/librte_bpf/bpf_validate.c
new file mode 100644
index 000000000..1911e1381
--- /dev/null
+++ b/lib/librte_bpf/bpf_validate.c
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#include "bpf_impl.h"
+
+/*
+ * dummy one for now, need more work.
+ */
+int
+bpf_validate(struct rte_bpf *bpf)
+{
+	int32_t rc, ofs, stack_sz;
+	uint32_t i, op, dr;
+	const struct bpf_insn *ins;
+
+	rc = 0;
+	stack_sz = 0;
+	for (i = 0; i != bpf->prm.nb_ins; i++) {
+
+		ins = bpf->prm.ins + i;
+		op = ins->code;
+		dr = ins->dst_reg;
+		ofs = ins->off;
+
+		if ((BPF_CLASS(op) == BPF_STX || BPF_CLASS(op) == BPF_ST) &&
+				dr == BPF_REG_10) {
+			ofs -= sizeof(uint64_t);
+			stack_sz = RTE_MIN(ofs, stack_sz);
+		}
+	}
+
+	if (stack_sz != 0) {
+		stack_sz = -stack_sz;
+		if (stack_sz > MAX_BPF_STACK_SIZE)
+			rc = -ERANGE;
+		else
+			bpf->stack_sz = stack_sz;
+	}
+
+	if (rc != 0)
+		RTE_BPF_LOG(ERR, "%s(%p) failed, error code: %d;\n",
+			__func__, bpf, rc);
+	return rc;
+}
diff --git a/lib/librte_bpf/meson.build b/lib/librte_bpf/meson.build
new file mode 100644
index 000000000..05c48c7ff
--- /dev/null
+++ b/lib/librte_bpf/meson.build
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+allow_experimental_apis = true
+sources = files('bpf.c',
+		'bpf_exec.c',
+		'bpf_load.c',
+		'bpf_validate.c')
+
+install_headers = files('rte_bpf.h')
+
+deps += ['mbuf', 'net']
+
+dep = dependency('libelf', required: false)
+if dep.found() == false
+	build = false
+endif
+ext_deps += dep
diff --git a/lib/librte_bpf/rte_bpf.h b/lib/librte_bpf/rte_bpf.h
new file mode 100644
index 000000000..825621404
--- /dev/null
+++ b/lib/librte_bpf/rte_bpf.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_BPF_H_
+#define _RTE_BPF_H_
+
+/**
+ * @file
+ *
+ * RTE BPF support.
+ * librte_bpf provides a framework to load and execute eBPF bytecode
+ * inside user-space DPDK-based applications.
+ * It supports a basic set of features from the eBPF spec
+ * (https://www.kernel.org/doc/Documentation/networking/filter.txt).
+ */
+
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <bpf_def.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Possible types for external symbols.
+ */
+enum rte_bpf_xtype {
+	RTE_BPF_XTYPE_FUNC, /**< function */
+	RTE_BPF_XTYPE_VAR, /**< variable */
+	RTE_BPF_XTYPE_NUM
+};
+
+/**
+ * Definition for external symbols available in the BPF program.
+ */
+struct rte_bpf_xsym {
+	const char *name;        /**< name */
+	enum rte_bpf_xtype type; /**< type */
+	union {
+		uint64_t (*func)(uint64_t, uint64_t, uint64_t,
+				uint64_t, uint64_t);
+		void *var;
+	}; /**< value */
+};
+
+/**
+ * Possible BPF program types.
+ * Use negative values for DPDK specific prog-types, to make sure they will
+ * not interfere with Linux related ones.
+ */
+enum rte_bpf_prog_type {
+	RTE_BPF_PROG_TYPE_UNSPEC = BPF_PROG_TYPE_UNSPEC,
+	/**< input is a pointer to raw data */
+	RTE_BPF_PROG_TYPE_MBUF = INT32_MIN,
+	/**< input is a pointer to rte_mbuf */
+};
+
+/**
+ * Input parameters for loading eBPF code.
+ */
+struct rte_bpf_prm {
+	const struct bpf_insn *ins; /**< array of eBPF instructions */
+	uint32_t nb_ins;            /**< number of instructions in ins */
+	const struct rte_bpf_xsym *xsym;
+	/**< array of external symbols that eBPF code is allowed to reference */
+	uint32_t nb_xsym; /**< number of elements in xsym */
+	enum rte_bpf_prog_type prog_type; /**< eBPF program type */
+};
+
+/**
+ * Information about eBPF code compiled into the native ISA.
+ */
+struct rte_bpf_jit {
+	uint64_t (*func)(void *); /**< JIT-ed native code */
+	size_t sz;                /**< size of JIT-ed code */
+};
+
+struct rte_bpf;
+
+/**
+ * De-allocate all memory used by this eBPF execution context.
+ *
+ * @param bpf
+ *   BPF handle to destroy.
+ */
+void rte_bpf_destroy(struct rte_bpf *bpf);
+
+/**
+ * Create a new eBPF execution context and load given BPF code into it.
+ *
+ * @param prm
+ *  Parameters used to create and initialise the BPF execution context.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENOMEM - can't reserve enough memory
+ */
+struct rte_bpf *rte_bpf_load(const struct rte_bpf_prm *prm);
+
+/**
+ * Create a new eBPF execution context and load BPF code from given ELF
+ * file into it.
+ *
+ * @param prm
+ *  Parameters used to create and initialise the BPF execution context.
+ * @param fname
+ *  Pathname for an ELF file.
+ * @param sname
+ *  Name of the executable section within the file to load.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENOMEM - can't reserve enough memory
+ */
+struct rte_bpf *rte_bpf_elf_load(const struct rte_bpf_prm *prm,
+	const char *fname, const char *sname);
+
+/**
+ * Execute given BPF bytecode.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   pointer to input context.
+ * @return
+ *   BPF execution return value.
+ */
+uint64_t rte_bpf_exec(const struct rte_bpf *bpf, void *ctx);
+
+/**
+ * Execute given BPF bytecode over a set of input contexts.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   array of pointers to the input contexts.
+ * @param rc
+ *   array of return values (one per input).
+ * @param num
+ *   number of elements in ctx[] (and rc[]).
+ * @return
+ *   number of successfully processed inputs.
+ */
+uint32_t rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[],
+	uint64_t rc[], uint32_t num);
+
+/**
+ * Provide information about natively compiled code for a given BPF handle.
+ *
+ * @param bpf
+ *   handle for the BPF code.
+ * @param jit
+ *   pointer to the rte_bpf_jit structure to be filled with related data.
+ * @return
+ *   - -EINVAL if the parameters are invalid.
+ *   - Zero if operation completed successfully.
+ */
+int rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_BPF_H_ */
diff --git a/lib/librte_bpf/rte_bpf_version.map b/lib/librte_bpf/rte_bpf_version.map
new file mode 100644
index 000000000..ff65144df
--- /dev/null
+++ b/lib/librte_bpf/rte_bpf_version.map
@@ -0,0 +1,12 @@
+EXPERIMENTAL {
+	global:
+
+	rte_bpf_destroy;
+	rte_bpf_elf_load;
+	rte_bpf_exec;
+	rte_bpf_exec_burst;
+	rte_bpf_get_jit;
+	rte_bpf_load;
+
+	local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index ef6159170..7ff7aaaa5 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -23,7 +23,7 @@ libraries = [ 'compat', # just a header, used for versioning
 	# add pkt framework libs which use other libs from above
 	'port', 'table', 'pipeline',
 	# flow_classify lib depends on pkt framework table lib
-	'flow_classify']
+	'flow_classify', 'bpf']
 
 foreach l:libraries
 	build = true
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 258590819..405a13147 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -83,6 +83,8 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_POWER)          += -lrte_power
 _LDLIBS-$(CONFIG_RTE_LIBRTE_TIMER)          += -lrte_timer
 _LDLIBS-$(CONFIG_RTE_LIBRTE_EFD)            += -lrte_efd
 
+_LDLIBS-$(CONFIG_RTE_LIBRTE_BPF)            += -lrte_bpf -lelf
+
 _LDLIBS-y += --whole-archive
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
-- 
2.13.6

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v2 10/15] ethdev: refine TPID handling in flow API
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 10/15] ethdev: refine TPID handling in flow API Adrien Mazarguil
@ 2018-04-06 17:11  0%     ` Andrew Rybchenko
  2018-04-09 14:42  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-06 17:11 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Ajit Khaparde, Somnath Kotur,
	John Daley, Hyong Youb Kim, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Tomasz Duszynski, Dmitri Epshtein, Natalie Samsonov, Jianbo Liu,
	Pascal Mazon

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
> consistent with the normal stacking order of pattern items, which is
> confusing to applications.
>
> Problem is that when followed by one of these layers, the EtherType field
> of the preceding layer keeps its "inner" definition, and the "outer" TPID
> is provided by the subsequent layer, the reverse of how a packet looks like
> on the wire:
>
>   Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
>   rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]
>
> Worse, when QinQ is involved, the stacking order of VLAN layers is
> unspecified. It is unclear whether it should be reversed (innermost to
> outermost) as well given TPID applies to the previous layer:
>
>   Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
>   rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
>   rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]
>
> While specifying EtherType/TPID is hopefully rarely necessary, the stacking
> order in case of QinQ and the lack of documentation remain an issue.
>
> This patch replaces TPID in the VLAN pattern item with an inner
> EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
> clarifies documentation and updates all relevant code.
>
> It breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
> items:
>
> - bnxt: EtherType matching is supported, and vlan->inner_type overrides
>    eth->type if the latter has standard TPID value 0x8100, otherwise an
>    error is triggered.
>
> - e1000: EtherType matching is only supported with the ETHERTYPE filter,
>    which does not support VLAN matching, therefore no impact.
>
> - enic: same as bnxt.
>
> - i40e: same as bnxt with a configurable TPID value for the FDIR filter,
>    with existing limitations on allowed EtherType values. The remaining
>    filter types (VXLAN, NVGRE, QINQ) do not support EtherType matching.
>
> - ixgbe: same as e1000, with additional minor change to rely on the new
>    E-Tag macro definition.
>
> - mlx4: EtherType/TPID matching is not supported, no impact.
>
> - mlx5: same as bnxt.
>
> - mrvl: EtherType matching is supported but eth->type cannot be specified
>    when a VLAN item is present. However vlan->inner_type is used if
>    specified.
>
> - sfc: same as bnxt with QinQ TPID value 0x88a8 additionally supported.
>
> - tap: same as bnxt.
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> Cc: Somnath Kotur <somnath.kotur@broadcom.com>
> Cc: John Daley <johndale@cisco.com>
> Cc: Hyong Youb Kim <hyonkim@cisco.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Tomasz Duszynski <tdu@semihalf.com>
> Cc: Dmitri Epshtein <dima@marvell.com>
> Cc: Natalie Samsonov <nsamsono@marvell.com>
> Cc: Jianbo Liu <jianbo.liu@arm.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
>
> ---
>
> Hi PMD maintainers, while I'm pretty confident in these changes, I could
> not validate them with all devices.
>
> It would be great if you could apply this patch, run testpmd, create VLAN
> flow rules with/without inner EtherType as described and send matching
> traffic while making sure nothing was broken in the process.
>
> Thanks!
> ---
>   app/test-pmd/cmdline_flow.c                 | 17 +++---
>   doc/guides/nics/tap.rst                     |  2 +-
>   doc/guides/prog_guide/rte_flow.rst          | 21 ++++++--
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
>   drivers/net/bnxt/bnxt_filter.c              | 39 +++++++++++---
>   drivers/net/enic/enic_flow.c                | 22 +++++---
>   drivers/net/i40e/i40e_flow.c                | 69 +++++++++++++++++++-----
>   drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
>   drivers/net/mlx5/mlx5_flow.c                | 16 +++++-
>   drivers/net/mvpp2/mrvl_flow.c               | 27 +++++++---
>   drivers/net/sfc/sfc_flow.c                  | 28 ++++++++++
>   drivers/net/tap/tap_flow.c                  | 16 ++++--
>   lib/librte_ether/rte_flow.h                 | 24 ++++++---
>   lib/librte_net/rte_ether.h                  |  1 +
>   14 files changed, 229 insertions(+), 60 deletions(-)

<...>

> diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> index bc4974edf..f61d4ec92 100644
> --- a/drivers/net/sfc/sfc_flow.c
> +++ b/drivers/net/sfc/sfc_flow.c
> @@ -7,6 +7,7 @@
>    * for Solarflare) and Solarflare Communications, Inc.
>    */
>   
> +#include <rte_byteorder.h>
>   #include <rte_tailq.h>
>   #include <rte_common.h>
>   #include <rte_ethdev_driver.h>
> @@ -351,6 +352,7 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
>   	const struct rte_flow_item_vlan *mask = NULL;
>   	const struct rte_flow_item_vlan supp_mask = {
>   		.tci = rte_cpu_to_be_16(ETH_VLAN_ID_MAX),
> +		.inner_type = RTE_BE16(0xffff),
>   	};
>   
>   	rc = sfc_flow_parse_init(item,
> @@ -393,6 +395,32 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
>   		return -rte_errno;
>   	}
>   
> +	/*
> +	 * If an EtherType was already specified, make sure it is a valid
> +	 * TPID for the current VLAN layer before overwriting it with the
> +	 * specified inner type.
> +	 */
> +	if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE &&
> +	    efx_spec->efs_ether_type != RTE_BE16(ETHER_TYPE_VLAN) &&
> +	    efx_spec->efs_ether_type != RTE_BE16(ETHER_TYPE_QINQ)) {

1. efs_ether_type is host-endian
2. HW recognizes more TPIDs (0x9100, 0x9200, 0x9300) as VLAN
3. However, if some TPID is specified, the user may expect that only VLAN
   packets with the specified TPID match. That is a false expectation, since
   the information is not passed to the HW to match on (and there is no way
   to match it).
   So, it is safer to deny TPID specification (i.e. keep the first
   condition only).
   From the flexibility point of view it is possible to allow any value, but
   it should be documented that an exact match is not actually checked.
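
To illustrate the stricter option (keeping only the first condition), a rough
and untested sketch of what I have in mind:

if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE) {
	rte_flow_error_set(error, EINVAL,
			   RTE_FLOW_ERROR_TYPE_ITEM, item,
			   "VLAN TPID matching is not supported");
	return -rte_errno;
}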

> +		rte_flow_error_set(error, EINVAL,
> +				   RTE_FLOW_ERROR_TYPE_ITEM, item,
> +				   "Unsupported outer TPID");
> +		return -rte_errno;
> +	}
> +	if (!mask->inner_type) {
> +		efx_spec->efs_match_flags &= ~EFX_FILTER_MATCH_ETHER_TYPE;
> +		efx_spec->efs_ether_type = RTE_BE16(0x0000);

Nothing should be done here if the above is done.

> +	} else if (mask->inner_type == supp_mask.inner_type) {
> +		efx_spec->efs_match_flags |= EFX_FILTER_MATCH_ETHER_TYPE;
> +		efx_spec->efs_ether_type = rte_bswap16(spec->inner_type);
> +	} else {
> +		rte_flow_error_set(error, EINVAL,
> +				   RTE_FLOW_ERROR_TYPE_ITEM, item,
> +				   "Bad mask for VLAN inner_type");
> +		return -rte_errno;
> +	}
> +
>   	return 0;
>   }

<...>

> diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
> index fc7e6705d..b13b0e2e6 100644
> --- a/lib/librte_ether/rte_flow.h
> +++ b/lib/librte_ether/rte_flow.h
> @@ -475,19 +481,20 @@ static const struct rte_flow_item_eth rte_flow_item_eth_mask = {
>    *
>    * Matches an 802.1Q/ad VLAN tag.
>    *
> - * This type normally follows either RTE_FLOW_ITEM_TYPE_ETH or
> - * RTE_FLOW_ITEM_TYPE_VLAN.
> + * The corresponding standard outer EtherType (TPID) values are
> + * ETHER_TYPE_VLAN or ETHER_TYPE_QINQ. It can be overridden by the preceding
> + * pattern item.
>    */
>   struct rte_flow_item_vlan {
> -	rte_be16_t tpid; /**< Tag protocol identifier. */
>   	rte_be16_t tci; /**< Tag control information. */
> +	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
>   };
>   
>   /** Default mask for RTE_FLOW_ITEM_TYPE_VLAN. */
>   #ifndef __cplusplus
>   static const struct rte_flow_item_vlan rte_flow_item_vlan_mask = {
> -	.tpid = RTE_BE16(0x0000),
> -	.tci = RTE_BE16(0xffff),
> +	.tci = RTE_BE16(0x0fff),

It looks like an unrelated change.

> +	.inner_type = RTE_BE16(0x0000),
>   };
>   #endif

<...>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters
  2018-04-06 14:49  4%   ` [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters Remy Horton
  2018-04-06 14:49  7%     ` [dpdk-dev] [PATCH v5 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
@ 2018-04-06 17:01  0%     ` Ferruh Yigit
  2018-04-10  9:43  4%     ` [dpdk-dev] [PATCH v6 " Remy Horton
  2 siblings, 0 replies; 200+ results
From: Ferruh Yigit @ 2018-04-06 17:01 UTC (permalink / raw)
  To: Remy Horton, dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

On 4/6/2018 3:49 PM, Remy Horton wrote:
> The optimal values of several transmission & reception related parameters,
> such as burst sizes, descriptor ring sizes, and number of queues, varies
> between different network interface devices. This patchset allows individual
> PMDs to specify their preferred parameter values, and if so indicated by an
> application, for them to be used automatically by the ethdev layer.
> 
> rte_eth_dev_configure() has been changed so that specifying zero for both
> nb_rx_q AND nb_tx_q causes it to use driver preferred values, and if these
> are not available, falls back to EAL defaults. Setting one (but not both)
> to zero does not cause the use of defaults, as having one of them zeroed is
> a valid setup.
> 
> This patchset includes per-PMD values for e1000 and i40e but it is expected
> that subsequent patchsets will cover other PMDs. A deprecation notice
> covering the API/ABI change is in place.
> 
> Changes in v5:
> * uint_16_t corrected to uint16_t
> 
> Changes in v4:
> * Added API/ABI change documentation
> * Rebased to 78f5a2e93d74
> 
> Changes in v3:
> * Changed formatting around new rte_eth_dev_info fields
> * Added Doxygen documentation to struct rte_eth_dev_portconf
> * Testpmd "port config all burst 0" and --burst=0 uses PMD 
>   Rx burst recommendations.
> * Added to release notes
> * Rebased to 8ea081f38161
> 
> Changes in v2:
> * Rebased to master
> * Removed fallback values from rte_eth_dev_info_get()
> * Added fallback values to rte_rte_[rt]x_queue_setup()
> * Added fallback values to rte_eth_dev_configure()
> * Corrected comment
> * Removed deprecation notice
> * Split RX and Tx into seperate structures
> * Changed parameter names
> 
> 
> Remy Horton (4):
>   ethdev: add support for PMD-tuned Tx/Rx parameters
>   net/e1000: add TxRx tuning parameters
>   net/i40e: add TxRx tuning parameters
>   testpmd: make use of per-PMD TxRx parameters

For series,
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 04/11] mempool: add op to calculate memory size to be allocated
    2018-04-04 15:08  0%     ` santosh
@ 2018-04-06 15:51  0%     ` Olivier Matz
  2018-04-12 15:22  0%     ` Burakov, Anatoly
  2 siblings, 0 replies; 200+ results
From: Olivier Matz @ 2018-04-06 15:51 UTC (permalink / raw)
  To: Andrew Rybchenko; +Cc: dev

On Mon, Mar 26, 2018 at 05:09:44PM +0100, Andrew Rybchenko wrote:
> Size of memory chunk required to populate mempool objects depends
> on how objects are stored in the memory. Different mempool drivers
> may have different requirements and a new operation allows to
> calculate memory size in accordance with driver requirements and
> advertise requirements on minimum memory chunk size and alignment
> in a generic way.
> 
> Bump ABI version since the patch breaks it.
> 
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>

Acked-by: Olivier Matz <olivier.matz@6wind.com>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 08/15] ethdev: add hash function to RSS flow API action
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 08/15] ethdev: add hash function to RSS flow API action Adrien Mazarguil
@ 2018-04-06 15:41  0%     ` Andrew Rybchenko
  2018-04-09 14:41  0%       ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Andrew Rybchenko @ 2018-04-06 15:41 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh, Pascal Mazon

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> By definition, RSS involves some kind of hash algorithm, usually Toeplitz.
>
> Until now it could not be modified on a flow rule basis and PMDs had to
> always assume RTE_ETH_HASH_FUNCTION_DEFAULT, which remains the default
> behavior when unspecified (0).
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_copy()
> - rte_flow_create()
> - rte_flow_query()
> - rte_flow_validate()
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Thomas Monjalon <thomas@monjalon.net>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: Jingjing Wu <jingjing.wu@intel.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Qi Zhang <qi.z.zhang@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
> ---
>   app/test-pmd/cmdline_flow.c                 | 72 ++++++++++++++++++++++++
>   app/test-pmd/config.c                       |  1 +
>   doc/guides/prog_guide/rte_flow.rst          |  2 +
>   doc/guides/testpmd_app_ug/testpmd_funcs.rst |  3 +
>   drivers/net/e1000/igb_flow.c                |  4 ++
>   drivers/net/e1000/igb_rxtx.c                |  4 +-
>   drivers/net/i40e/i40e_ethdev.c              |  4 +-
>   drivers/net/i40e/i40e_flow.c                |  4 ++
>   drivers/net/ixgbe/ixgbe_flow.c              |  4 ++
>   drivers/net/ixgbe/ixgbe_rxtx.c              |  4 +-
>   drivers/net/mlx4/mlx4_flow.c                |  7 +++
>   drivers/net/mlx5/mlx5_flow.c                | 13 +++++
>   drivers/net/sfc/sfc_flow.c                  |  3 +
>   drivers/net/tap/tap_flow.c                  |  6 ++
>   lib/librte_ether/rte_flow.c                 |  1 +
>   lib/librte_ether/rte_flow.h                 |  2 +
>   16 files changed, 131 insertions(+), 3 deletions(-)

<...>

> diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
> index 1a2c0299c..dbe4c2baa 100644
> --- a/drivers/net/sfc/sfc_flow.c
> +++ b/drivers/net/sfc/sfc_flow.c
> @@ -1261,6 +1261,9 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
>   			rxq_hw_index_max = rxq->hw_index;
>   	}
>   
> +	if (rss->func)

Maybe it is better to compare with RTE_ETH_HASH_FUNCTION_DEFAULT
explicitly? I think it is more readable. If so, it is applicable to all
similar checks in the patch.
In the case of sfc, please allow RTE_ETH_HASH_FUNCTION_TOEPLITZ as well.
I'd suggest:

switch (rss->func) {
case RTE_ETH_HASH_FUNCTION_DEFAULT:
case RTE_ETH_HASH_FUNCTION_TOEPLITZ:
	break;
default:
	return -EINVAL;
}

> +		return -EINVAL;
> +
>   	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
>   		return -EINVAL;
>   

<...>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v2 05/15] ethdev: alter behavior of flow API actions
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 05/15] ethdev: alter behavior of flow API actions Adrien Mazarguil
@ 2018-04-06 15:06  0%     ` Andrew Rybchenko
  0 siblings, 0 replies; 200+ results
From: Andrew Rybchenko @ 2018-04-06 15:06 UTC (permalink / raw)
  To: Adrien Mazarguil, Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Wenzhuo Lu, John Daley, Gaetan Rivet, Beilei Xing,
	Konstantin Ananyev, Nelio Laranjeiro, Pascal Mazon

On 04/06/2018 04:25 PM, Adrien Mazarguil wrote:
> This patch makes the following changes to flow rule actions:
>
> - List order now matters, they are redefined as performed first to last
>    instead of "all simultaneously".
>
> - Repeated actions are now supported (e.g. specifying QUEUE multiple times
>    now duplicates traffic among them). Previously only the last action of
>    any given kind was taken into account.
>
> - No more distinction between terminating/non-terminating/meta actions.
>    Flow rules themselves are now defined as always terminating unless a
>    PASSTHRU action is specified.
>
> These changes alter the behavior of flow rules in corner cases in order to
> prepare the flow API for actions that modify traffic contents or properties
> (e.g. encapsulation, compression) and for which order matter when combined.
>
> Previously one would have to so through multiple flow rules by combining
> PASSTRHU with priority levels, however this proved overly complex to
> implement at the PMD level, hence this simpler approach.
>
> This breaks ABI compatibility for the following public functions:
>
> - rte_flow_create()
> - rte_flow_validate()
>
> PMDs with rte_flow support are modified accordingly:
>
> - bnxt: no change, implementation already forbids multiple actions and does
>    not support PASSTHRU.
>
> - e1000: no change, same as bnxt.
>
> - enic: modified to forbid redundant actions, no support for default drop.
>
> - failsafe: no change needed.
>
> - i40e: no change, implementation already forbids multiple actions.
>
> - ixgbe: same as i40e.
>
> - mlx4: modified to forbid multiple fate-deciding actions and drop when
>    unspecified.
>
> - mlx5: same as mlx4, with other redundant actions also forbidden.
>
> - sfc: same as mlx4.
>
> - tap: implementation already complies with the new behavior except for
>    the default pass-through modified as a default drop.
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
> Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Cc: John Daley <johndale@cisco.com>
> Cc: Gaetan Rivet <gaetan.rivet@6wind.com>
> Cc: Beilei Xing <beilei.xing@intel.com>
> Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>
> Cc: Pascal Mazon <pascal.mazon@6wind.com>
> ---
>   doc/guides/prog_guide/rte_flow.rst | 67 +++++++++++++-------------------
>   drivers/net/enic/enic_flow.c       | 25 ++++++++++++
>   drivers/net/mlx4/mlx4_flow.c       | 21 +++++++---
>   drivers/net/mlx5/mlx5_flow.c       | 69 ++++++++++++++-------------------
>   drivers/net/sfc/sfc_flow.c         | 22 +++++++----
>   drivers/net/tap/tap_flow.c         | 11 ++++++
>   lib/librte_ether/rte_flow.h        | 54 +++++++-------------------
>   7 files changed, 138 insertions(+), 131 deletions(-)

sfc part
Reviewed-by: Andrew Rybchenko <arybchenko@oktetlabs.ru>

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v5 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters
  2018-04-06 14:49  4%   ` [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters Remy Horton
@ 2018-04-06 14:49  7%     ` Remy Horton
  2018-04-06 17:01  0%     ` [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters Ferruh Yigit
  2018-04-10  9:43  4%     ` [dpdk-dev] [PATCH v6 " Remy Horton
  2 siblings, 0 replies; 200+ results
From: Remy Horton @ 2018-04-06 14:49 UTC (permalink / raw)
  To: dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

The optimal values of several transmission & reception related
parameters, such as burst sizes, descriptor ring sizes, and number
of queues, vary between different network interface devices. This
patch allows individual PMDs to specify preferred parameter values.
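
For illustration, an application could opt into these driver-provided
values roughly as follows (a sketch only: port_id and mb_pool are assumed
to exist, and error checking is omitted):

	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf conf = { 0 };
	uint16_t burst;

	/* Zero Rx and Tx queue counts: use PMD-recommended numbers of queues. */
	rte_eth_dev_configure(port_id, 0, 0, &conf);

	/* Zero descriptors: use PMD-recommended (or fallback) ring sizes. */
	rte_eth_rx_queue_setup(port_id, 0, 0, rte_eth_dev_socket_id(port_id),
			       NULL, mb_pool);
	rte_eth_tx_queue_setup(port_id, 0, 0, rte_eth_dev_socket_id(port_id),
			       NULL);

	/* Prefer the PMD-advertised Rx burst size when one is provided. */
	rte_eth_dev_info_get(port_id, &dev_info);
	burst = dev_info.default_rxportconf.burst_size ?
		dev_info.default_rxportconf.burst_size : 32;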

Signed-off-by: Remy Horton <remy.horton@intel.com>
---
 doc/guides/rel_notes/deprecation.rst   | 13 ----------
 doc/guides/rel_notes/release_18_05.rst | 35 +++++++++++++++++++++++++++
 lib/librte_ether/rte_ethdev.c          | 44 +++++++++++++++++++++++++++-------
 lib/librte_ether/rte_ethdev.h          | 25 +++++++++++++++++++
 4 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ec70b5f..d13077d 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -112,19 +112,6 @@ Deprecation Notices
   The new API add rss_level field to ``rte_eth_rss_conf`` to enable a choice
   of RSS hash calculation on outer or inner header of tunneled packet.
 
-* ethdev:  Currently, if the  rte_eth_rx_burst() function returns a value less
-  than *nb_pkts*, the application will assume that no more packets are present.
-  Some of the hw queue based hardware can only support smaller burst for RX
-  and TX and thus break the expectation of the rx_burst API. Similar is the
-  case for TX burst as well as ring sizes. ``rte_eth_dev_info`` will be added
-  with following new parameters so as to support semantics for drivers to
-  define a preferred size for Rx/Tx burst and rings.
-
-  - Member ``struct preferred_size`` would be added to enclose all preferred
-    size to be fetched from driver/implementation.
-  - Members ``uint16_t rx_burst``,  ``uint16_t tx_burst``, ``uint16_t rx_ring``,
-    and ``uint16_t tx_ring`` would be added to ``struct preferred_size``.
-
 * ethdev: A work is being planned for 18.05 to expose VF port representors
   as a mean to perform control and data path operation on the different VFs.
   As VF representor is an ethdev port, new fields are needed in order to map
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index e5fac1c..2fe9c70 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -58,6 +58,11 @@ New Features
   * Added support for NVGRE, VXLAN and GENEVE filters in flow API.
   * Added support for DROP action in flow API.
 
+* **Added PMD-recommended Tx and Rx parameters**
+
+  Applications can now query drivers for device-tuned values of
+  ring sizes, burst sizes, and number of queues.
+
 
 API Changes
 -----------
@@ -72,6 +77,29 @@ API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Changes to semantics of rte_eth_dev_configure() parameters.**
+
+   If both the ``nb_rx_q`` and ``nb_tx_q`` parameters are zero,
+   ``rte_eth_dev_configure`` will now use PMD-recommended queue counts, or if
+   recommendations are not provided by the PMD, the function will use ethdev
+   fall-back values. Previously setting both of the parameters to zero would
+   have resulted in ``-EINVAL`` being returned.
+
+* **Changes to semantics of rte_eth_rx_queue_setup() parameters.**
+
+   If the ``nb_rx_desc`` parameter is zero, ``rte_eth_rx_queue_setup`` will
+   now use the PMD-recommended Rx ring size, or in the case where the PMD
+   does not provide a recommendation, will use an ethdev-provided
+   fall-back value. Previously, setting ``nb_rx_desc`` to zero would have
+   resulted in an error.
+
+* **Changes to semantics of rte_eth_tx_queue_setup() parameters.**
+
+   If the ``nb_tx_desc`` parameter is zero, ``rte_eth_tx_queue_setup`` will
+   now use the PMD-recommended Tx ring size, or in the case where the PMD
+   does not provide a recommendation, will use an ethdev-provided
+   fall-back value. Previously, setting ``nb_tx_desc`` to zero would have
+   resulted in an error.
 
 ABI Changes
 -----------
@@ -86,6 +114,13 @@ ABI Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Additional fields in rte_eth_dev_info.**
+
+  The ``rte_eth_dev_info`` structure has had two extra entries appended to the
+  end of it: ``default_rxportconf`` and ``default_txportconf``. Each of these
+  is in turn an ``rte_eth_dev_portconf`` structure containing three fields of
+  type ``uint16_t``: ``burst_size``, ``ring_size``, and ``nb_queues``. These
+  are parameter values recommended for use by the PMD.
 
 Removed Items
 -------------
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e..209796d 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -1061,6 +1061,26 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 
 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
 
+	dev = &rte_eth_devices[port_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
+	(*dev->dev_ops->dev_infos_get)(dev, &dev_info);
+
+	/* If number of queues specified by application for both Rx and Tx is
+	 * zero, use driver preferred values. This cannot be done individually
+	 * as it is valid for either Tx or Rx (but not both) to be zero.
+	 * If driver does not provide any preferred values, fall back on
+	 * EAL defaults.
+	 */
+	if (nb_rx_q == 0 && nb_tx_q == 0) {
+		nb_rx_q = dev_info.default_rxportconf.nb_queues;
+		if (nb_rx_q == 0)
+			nb_rx_q = RTE_ETH_DEV_FALLBACK_RX_NBQUEUES;
+		nb_tx_q = dev_info.default_txportconf.nb_queues;
+		if (nb_tx_q == 0)
+			nb_tx_q = RTE_ETH_DEV_FALLBACK_TX_NBQUEUES;
+	}
+
 	if (nb_rx_q > RTE_MAX_QUEUES_PER_PORT) {
 		RTE_PMD_DEBUG_TRACE(
 			"Number of RX queues requested (%u) is greater than max supported(%d)\n",
@@ -1075,8 +1095,6 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 		return -EINVAL;
 	}
 
-	dev = &rte_eth_devices[port_id];
-
 	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
 	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
 
@@ -1106,13 +1124,6 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 	 * than the maximum number of RX and TX queues supported by the
 	 * configured device.
 	 */
-	(*dev->dev_ops->dev_infos_get)(dev, &dev_info);
-
-	if (nb_rx_q == 0 && nb_tx_q == 0) {
-		RTE_PMD_DEBUG_TRACE("ethdev port_id=%d both rx and tx queue cannot be 0\n", port_id);
-		return -EINVAL;
-	}
-
 	if (nb_rx_q > dev_info.max_rx_queues) {
 		RTE_PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_queues=%d > %d\n",
 				port_id, nb_rx_q, dev_info.max_rx_queues);
@@ -1477,6 +1488,14 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
 		return -EINVAL;
 	}
 
+	/* Use default specified by driver, if nb_rx_desc is zero */
+	if (nb_rx_desc == 0) {
+		nb_rx_desc = dev_info.default_rxportconf.ring_size;
+		/* If driver default is also zero, fall back on EAL default */
+		if (nb_rx_desc == 0)
+			nb_rx_desc = RTE_ETH_DEV_FALLBACK_RX_RINGSIZE;
+	}
+
 	if (nb_rx_desc > dev_info.rx_desc_lim.nb_max ||
 			nb_rx_desc < dev_info.rx_desc_lim.nb_min ||
 			nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) {
@@ -1600,6 +1619,13 @@ rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
 
 	rte_eth_dev_info_get(port_id, &dev_info);
 
+	/* Use default specified by driver, if nb_tx_desc is zero */
+	if (nb_tx_desc == 0) {
+		nb_tx_desc = dev_info.default_txportconf.ring_size;
+		/* If driver default is zero, fall back on EAL default */
+		if (nb_tx_desc == 0)
+			nb_tx_desc = RTE_ETH_DEV_FALLBACK_TX_RINGSIZE;
+	}
 	if (nb_tx_desc > dev_info.tx_desc_lim.nb_max ||
 	    nb_tx_desc < dev_info.tx_desc_lim.nb_min ||
 	    nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) {
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 5e13dca..685145f 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -988,6 +988,27 @@ struct rte_eth_conf {
 
 struct rte_pci_device;
 
+/*
+ * Fallback default preferred Rx/Tx port parameters.
+ * These are used if an application requests default parameters
+ * but the PMD does not provide preferred values.
+ */
+#define RTE_ETH_DEV_FALLBACK_RX_RINGSIZE 512
+#define RTE_ETH_DEV_FALLBACK_TX_RINGSIZE 512
+#define RTE_ETH_DEV_FALLBACK_RX_NBQUEUES 1
+#define RTE_ETH_DEV_FALLBACK_TX_NBQUEUES 1
+
+/**
+ * Preferred Rx/Tx port parameters.
+ * There are separate instances of this structure for transmission
+ * and reception respectively.
+ */
+struct rte_eth_dev_portconf {
+	uint16_t burst_size; /**< Device-preferred burst size */
+	uint16_t ring_size; /**< Device-preferred size of queue rings */
+	uint16_t nb_queues; /**< Device-preferred number of queues */
+};
+
 /**
  * Ethernet device information
  */
@@ -1029,6 +1050,10 @@ struct rte_eth_dev_info {
 	/** Configured number of rx/tx queues */
 	uint16_t nb_rx_queues; /**< Number of RX queues. */
 	uint16_t nb_tx_queues; /**< Number of TX queues. */
+	/** Rx parameter recommendations */
+	struct rte_eth_dev_portconf default_rxportconf;
+	/** Tx parameter recommendations */
+	struct rte_eth_dev_portconf default_txportconf;
 };
 
 /**
-- 
2.9.5

^ permalink raw reply	[relevance 7%]

* [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parameters
  2018-04-04 17:17  3% ` [dpdk-dev] [PATCH v3 " Remy Horton
  @ 2018-04-06 14:49  4%   ` Remy Horton
  2018-04-06 14:49  7%     ` [dpdk-dev] [PATCH v5 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
                       ` (2 more replies)
  1 sibling, 3 replies; 200+ results
From: Remy Horton @ 2018-04-06 14:49 UTC (permalink / raw)
  To: dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

The optimal values of several transmission & reception related parameters,
such as burst sizes, descriptor ring sizes, and number of queues, vary
between different network interface devices. This patchset allows individual
PMDs to specify their preferred parameter values and, when an application so
requests, has the ethdev layer apply them automatically.

rte_eth_dev_configure() has been changed so that specifying zero for both
nb_rx_q AND nb_tx_q causes it to use driver-preferred values and, if these
are not available, to fall back to EAL defaults. Setting one (but not both)
to zero does not cause the use of defaults, as having one of them zeroed is
a valid setup.
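
For illustration only (not part of the patchset), an application opting in
to these defaults could look roughly like the sketch below; the helper name
is made up and error handling is reduced to early returns:

#include <string.h>
#include <rte_ethdev.h>
#include <rte_mempool.h>

/* Hypothetical helper: bring up a port using only PMD/ethdev defaults. */
static int
port_init_with_defaults(uint16_t port_id, struct rte_mempool *mb_pool)
{
	struct rte_eth_conf conf;
	int ret;

	memset(&conf, 0, sizeof(conf));

	/* nb_rx_q == nb_tx_q == 0: PMD-preferred queue counts, or the
	 * ethdev fallback of one Rx and one Tx queue. */
	ret = rte_eth_dev_configure(port_id, 0, 0, &conf);
	if (ret != 0)
		return ret;

	/* nb_rx_desc/nb_tx_desc == 0: PMD-preferred ring sizes, or the
	 * 512-descriptor ethdev fallback. */
	ret = rte_eth_rx_queue_setup(port_id, 0, 0,
				     rte_eth_dev_socket_id(port_id),
				     NULL, mb_pool);
	if (ret != 0)
		return ret;
	ret = rte_eth_tx_queue_setup(port_id, 0, 0,
				     rte_eth_dev_socket_id(port_id), NULL);
	if (ret != 0)
		return ret;

	return rte_eth_dev_start(port_id);
}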

This patchset includes per-PMD values for e1000 and i40e but it is expected
that subsequent patchsets will cover other PMDs. A deprecation notice
covering the API/ABI change is in place.

Changes in v5:
* uint_16_t corrected to uint16_t

Changes in v4:
* Added API/ABI change documentation
* Rebased to 78f5a2e93d74

Changes in v3:
* Changed formatting around new rte_eth_dev_info fields
* Added Doxygen documentation to struct rte_eth_dev_portconf
* Testpmd "port config all burst 0" and --burst=0 use PMD
  Rx burst recommendations.
* Added to release notes
* Rebased to 8ea081f38161

Changes in v2:
* Rebased to master
* Removed fallback values from rte_eth_dev_info_get()
* Added fallback values to rte_eth_[rt]x_queue_setup()
* Added fallback values to rte_eth_dev_configure()
* Corrected comment
* Removed deprecation notice
* Split Rx and Tx into separate structures
* Changed parameter names


Remy Horton (4):
  ethdev: add support for PMD-tuned Tx/Rx parameters
  net/e1000: add TxRx tuning parameters
  net/i40e: add TxRx tuning parameters
  testpmd: make use of per-PMD TxRx parameters

 app/test-pmd/cmdline.c                 | 31 +++++++++++++++++++++---
 app/test-pmd/parameters.c              | 38 +++++++++++++++++++++++++----
 app/test-pmd/testpmd.c                 |  5 ++--
 doc/guides/rel_notes/deprecation.rst   | 13 ----------
 doc/guides/rel_notes/release_18_05.rst | 35 +++++++++++++++++++++++++++
 drivers/net/e1000/em_ethdev.c          |  6 +++++
 drivers/net/i40e/i40e_ethdev.c         | 33 ++++++++++++++++++++++---
 lib/librte_ether/rte_ethdev.c          | 44 +++++++++++++++++++++++++++-------
 lib/librte_ether/rte_ethdev.h          | 25 +++++++++++++++++++
 9 files changed, 195 insertions(+), 35 deletions(-)

-- 
2.9.5

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v4 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters
  2018-04-06 13:54  4% [dpdk-dev] [PATCH v4 0/4] ethdev: add per-PMD tuning of RxTx parameters Remy Horton
@ 2018-04-06 13:54  7% ` Remy Horton
  0 siblings, 0 replies; 200+ results
From: Remy Horton @ 2018-04-06 13:54 UTC (permalink / raw)
  To: dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

The optimal values of several transmission & reception related
parameters, such as burst sizes, descriptor ring sizes, and number
of queues, vary between different network interface devices. This
patch allows individual PMDs to specify preferred parameter values.
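
As a rough sketch only — the driver name and numbers below are hypothetical
and not taken from the e1000/i40e patches later in this series — a PMD
advertises its preferences from its dev_infos_get callback:

static void
foo_dev_infos_get(struct rte_eth_dev *dev __rte_unused,
		  struct rte_eth_dev_info *dev_info)
{
	/* Real drivers also fill max_rx_queues, descriptor limits, etc. */

	/* Picked up by rte_eth_dev_configure() and
	 * rte_eth_[rt]x_queue_setup() when the application passes zero. */
	dev_info->default_rxportconf.burst_size = 32;
	dev_info->default_rxportconf.ring_size = 512;
	dev_info->default_rxportconf.nb_queues = 1;

	dev_info->default_txportconf.burst_size = 32;
	dev_info->default_txportconf.ring_size = 512;
	dev_info->default_txportconf.nb_queues = 1;
}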

Signed-off-by: Remy Horton <remy.horton@intel.com>
---
 doc/guides/rel_notes/deprecation.rst   | 13 ----------
 doc/guides/rel_notes/release_18_05.rst | 35 +++++++++++++++++++++++++++
 lib/librte_ether/rte_ethdev.c          | 44 +++++++++++++++++++++++++++-------
 lib/librte_ether/rte_ethdev.h          | 25 +++++++++++++++++++
 4 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ec70b5f..d13077d 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -112,19 +112,6 @@ Deprecation Notices
   The new API add rss_level field to ``rte_eth_rss_conf`` to enable a choice
   of RSS hash calculation on outer or inner header of tunneled packet.
 
-* ethdev:  Currently, if the  rte_eth_rx_burst() function returns a value less
-  than *nb_pkts*, the application will assume that no more packets are present.
-  Some of the hw queue based hardware can only support smaller burst for RX
-  and TX and thus break the expectation of the rx_burst API. Similar is the
-  case for TX burst as well as ring sizes. ``rte_eth_dev_info`` will be added
-  with following new parameters so as to support semantics for drivers to
-  define a preferred size for Rx/Tx burst and rings.
-
-  - Member ``struct preferred_size`` would be added to enclose all preferred
-    size to be fetched from driver/implementation.
-  - Members ``uint16_t rx_burst``,  ``uint16_t tx_burst``, ``uint16_t rx_ring``,
-    and ``uint16_t tx_ring`` would be added to ``struct preferred_size``.
-
 * ethdev: A work is being planned for 18.05 to expose VF port representors
   as a mean to perform control and data path operation on the different VFs.
   As VF representor is an ethdev port, new fields are needed in order to map
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index e5fac1c..2fe9c70 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -58,6 +58,11 @@ New Features
   * Added support for NVGRE, VXLAN and GENEVE filters in flow API.
   * Added support for DROP action in flow API.
 
+* **Added PMD-recommended Tx and Rx parameters**
+
+  Applications can now query drivers for device-tuned values of
+  ring sizes, burst sizes, and number of queues.
+
 
 API Changes
 -----------
@@ -72,6 +77,29 @@ API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Changes to semantics of rte_eth_dev_configure() parameters.**
+
+   If both the ``nb_rx_q`` and ``nb_tx_q`` parameters are zero,
+   ``rte_eth_dev_configure`` will now use PMD-recommended queue sizes, or if
+   recommendations are not provided by the PMD the function will use ethdev
+   fall-back values. Previously setting both of the parameters to zero would
+   have resulted in ``-EINVAL`` being returned.
+
+* **Changes to semantics of rte_eth_rx_queue_setup() parameters.**
+
+   If the ``nb_rx_desc`` parameter is zero, ``rte_eth_rx_queue_setup`` will
+   now use the PMD-recommended Rx ring size, or in the case where the PMD
+   does not provide a recommendation, will use an ethdev-provided
+   fall-back value. Previously, setting ``nb_rx_desc`` to zero would have
+   resulted in an error.
+
+* **Changes to semantics of rte_eth_tx_queue_setup() parameters.**
+
+   If the ``nb_tx_desc`` parameter is zero, ``rte_eth_tx_queue_setup`` will
+   now use the PMD-recommended Tx ring size, or in the case where the PMD
+   does not provide a recommendation, will use an ethdev-provided
+   fall-back value. Previously, setting ``nb_tx_desc`` to zero would have
+   resulted in an error.
 
 ABI Changes
 -----------
@@ -86,6 +114,13 @@ ABI Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Additional fields in rte_eth_dev_info.**
+
+  The ``rte_eth_dev_info`` structure has had two extra entries appended to the
+  end of it: ``default_rxportconf`` and ``default_txportconf``. Each of these
+  is in turn an ``rte_eth_dev_portconf`` structure containing three fields of
+  type ``uint_16_t``: ``burst_size``, ``ring_size``, and ``nb_queues``. These
+  are parameter values recommended for use by the PMD.
 
 Removed Items
 -------------
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e..209796d 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -1061,6 +1061,26 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 
 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
 
+	dev = &rte_eth_devices[port_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
+	(*dev->dev_ops->dev_infos_get)(dev, &dev_info);
+
+	/* If number of queues specified by application for both Rx and Tx is
+	 * zero, use driver preferred values. This cannot be done individually
+	 * as it is valid for either Tx or Rx (but not both) to be zero.
+	 * If driver does not provide any preferred values, fall back on
+	 * EAL defaults.
+	 */
+	if (nb_rx_q == 0 && nb_tx_q == 0) {
+		nb_rx_q = dev_info.default_rxportconf.nb_queues;
+		if (nb_rx_q == 0)
+			nb_rx_q = RTE_ETH_DEV_FALLBACK_RX_NBQUEUES;
+		nb_tx_q = dev_info.default_txportconf.nb_queues;
+		if (nb_tx_q == 0)
+			nb_tx_q = RTE_ETH_DEV_FALLBACK_TX_NBQUEUES;
+	}
+
 	if (nb_rx_q > RTE_MAX_QUEUES_PER_PORT) {
 		RTE_PMD_DEBUG_TRACE(
 			"Number of RX queues requested (%u) is greater than max supported(%d)\n",
@@ -1075,8 +1095,6 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 		return -EINVAL;
 	}
 
-	dev = &rte_eth_devices[port_id];
-
 	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
 	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
 
@@ -1106,13 +1124,6 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
 	 * than the maximum number of RX and TX queues supported by the
 	 * configured device.
 	 */
-	(*dev->dev_ops->dev_infos_get)(dev, &dev_info);
-
-	if (nb_rx_q == 0 && nb_tx_q == 0) {
-		RTE_PMD_DEBUG_TRACE("ethdev port_id=%d both rx and tx queue cannot be 0\n", port_id);
-		return -EINVAL;
-	}
-
 	if (nb_rx_q > dev_info.max_rx_queues) {
 		RTE_PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_queues=%d > %d\n",
 				port_id, nb_rx_q, dev_info.max_rx_queues);
@@ -1477,6 +1488,14 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
 		return -EINVAL;
 	}
 
+	/* Use default specified by driver, if nb_rx_desc is zero */
+	if (nb_rx_desc == 0) {
+		nb_rx_desc = dev_info.default_rxportconf.ring_size;
+		/* If driver default is also zero, fall back on EAL default */
+		if (nb_rx_desc == 0)
+			nb_rx_desc = RTE_ETH_DEV_FALLBACK_RX_RINGSIZE;
+	}
+
 	if (nb_rx_desc > dev_info.rx_desc_lim.nb_max ||
 			nb_rx_desc < dev_info.rx_desc_lim.nb_min ||
 			nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) {
@@ -1600,6 +1619,13 @@ rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
 
 	rte_eth_dev_info_get(port_id, &dev_info);
 
+	/* Use default specified by driver, if nb_tx_desc is zero */
+	if (nb_tx_desc == 0) {
+		nb_tx_desc = dev_info.default_txportconf.ring_size;
+		/* If driver default is zero, fall back on EAL default */
+		if (nb_tx_desc == 0)
+			nb_tx_desc = RTE_ETH_DEV_FALLBACK_TX_RINGSIZE;
+	}
 	if (nb_tx_desc > dev_info.tx_desc_lim.nb_max ||
 	    nb_tx_desc < dev_info.tx_desc_lim.nb_min ||
 	    nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) {
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 5e13dca..685145f 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -988,6 +988,27 @@ struct rte_eth_conf {
 
 struct rte_pci_device;
 
+/*
+ * Fallback default preferred Rx/Tx port parameters.
+ * These are used if an application requests default parameters
+ * but the PMD does not provide preferred values.
+ */
+#define RTE_ETH_DEV_FALLBACK_RX_RINGSIZE 512
+#define RTE_ETH_DEV_FALLBACK_TX_RINGSIZE 512
+#define RTE_ETH_DEV_FALLBACK_RX_NBQUEUES 1
+#define RTE_ETH_DEV_FALLBACK_TX_NBQUEUES 1
+
+/**
+ * Preferred Rx/Tx port parameters.
+ * There are separate instances of this structure for transmission
+ * and reception respectively.
+ */
+struct rte_eth_dev_portconf {
+	uint16_t burst_size; /**< Device-preferred burst size */
+	uint16_t ring_size; /**< Device-preferred size of queue rings */
+	uint16_t nb_queues; /**< Device-preferred number of queues */
+};
+
 /**
  * Ethernet device information
  */
@@ -1029,6 +1050,10 @@ struct rte_eth_dev_info {
 	/** Configured number of rx/tx queues */
 	uint16_t nb_rx_queues; /**< Number of RX queues. */
 	uint16_t nb_tx_queues; /**< Number of TX queues. */
+	/** Rx parameter recommendations */
+	struct rte_eth_dev_portconf default_rxportconf;
+	/** Tx parameter recommendations */
+	struct rte_eth_dev_portconf default_txportconf;
 };
 
 /**
-- 
2.9.5

^ permalink raw reply	[relevance 7%]

* [dpdk-dev] [PATCH v4 0/4] ethdev: add per-PMD tuning of RxTx parameters
@ 2018-04-06 13:54  4% Remy Horton
  2018-04-06 13:54  7% ` [dpdk-dev] [PATCH v4 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
  0 siblings, 1 reply; 200+ results
From: Remy Horton @ 2018-04-06 13:54 UTC (permalink / raw)
  To: dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

The optimal values of several transmission & reception related parameters,
such as burst sizes, descriptor ring sizes, and number of queues, vary
between different network interface devices. This patchset allows individual
PMDs to specify their preferred parameter values and, when an application so
requests, has the ethdev layer apply them automatically.

rte_eth_dev_configure() has been changed so that specifying zero for both
nb_rx_q AND nb_tx_q causes it to use driver-preferred values and, if these
are not available, to fall back to EAL defaults. Setting one (but not both)
to zero does not cause the use of defaults, as having one of them zeroed is
a valid setup.

This patchset includes per-PMD values for e1000 and i40e but it is expected
that subsequent patchsets will cover other PMDs. A deprecation notice
covering the API/ABI change is in place.

Changes in v4:
* Added API/ABI change documentation
* Rebased to 78f5a2e93d74

Changes in v3:
* Changed formatting around new rte_eth_dev_info fields
* Added Doxygen documentation to struct rte_eth_dev_portconf
* Testpmd "port config all burst 0" and --burst=0 use PMD
  Rx burst recommendations.
* Added to release notes
* Rebased to 8ea081f38161

Changes in v2:
* Rebased to master
* Removed fallback values from rte_eth_dev_info_get()
* Added fallback values to rte_eth_[rt]x_queue_setup()
* Added fallback values to rte_eth_dev_configure()
* Corrected comment
* Removed deprecation notice
* Split Rx and Tx into separate structures
* Changed parameter names


Remy Horton (4):
  ethdev: add support for PMD-tuned Tx/Rx parameters
  net/e1000: add TxRx tuning parameters
  net/i40e: add TxRx tuning parameters
  testpmd: make use of per-PMD TxRx parameters

 app/test-pmd/cmdline.c                 | 31 +++++++++++++++++++++---
 app/test-pmd/parameters.c              | 38 +++++++++++++++++++++++++----
 app/test-pmd/testpmd.c                 |  5 ++--
 doc/guides/rel_notes/deprecation.rst   | 13 ----------
 doc/guides/rel_notes/release_18_05.rst | 35 +++++++++++++++++++++++++++
 drivers/net/e1000/em_ethdev.c          |  6 +++++
 drivers/net/i40e/i40e_ethdev.c         | 33 ++++++++++++++++++++++---
 lib/librte_ether/rte_ethdev.c          | 44 +++++++++++++++++++++++++++-------
 lib/librte_ether/rte_ethdev.h          | 25 +++++++++++++++++++
 9 files changed, 195 insertions(+), 35 deletions(-)

-- 
2.9.5

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v2 15/15] ethdev: add port ID item and action to flow API
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (11 preceding siblings ...)
  2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 14/15] ethdev: add physical port action to " Adrien Mazarguil
@ 2018-04-06 13:25  2%   ` Adrien Mazarguil
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z, Declan Doherty

RTE_FLOW_ACTION_TYPE_PORT_ID brings the ability to inject matching traffic
into a different device, as identified by its DPDK port ID.

This is normally only supported when the target port ID has some kind of
relationship with the port ID the flow rule is created against, such as
being exposed by a common physical device (e.g. a different port of an
Ethernet switch).

The converse pattern item, RTE_FLOW_ITEM_TYPE_PORT_ID, makes the resulting
flow rule match traffic whose origin is the specified port ID. Note that
specifying a port ID that differs from the one the flow rule is created
against is normally meaningless (if even accepted), but can make sense if
combined with the transfer attribute.

These must not be confused with their PHY_PORT counterparts, which refer to
physical ports using device-specific indices, but unlike PORT_ID are not
necessarily tied to DPDK port IDs.
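
For illustration only (not part of this patch), a rule forwarding all
ingress traffic from one ethdev to another could be assembled along these
lines; the helper name is made up and, depending on the PMD, the transfer
attribute may also have to be set:

#include <stdint.h>
#include <rte_flow.h>

static struct rte_flow *
forward_all_to_port(uint16_t src_port, uint32_t dst_port,
		    struct rte_flow_error *error)
{
	const struct rte_flow_attr attr = { .ingress = 1 };
	const struct rte_flow_action_port_id dst = { .id = dst_port };
	const struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	const struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &dst },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	return rte_flow_create(src_port, &attr, pattern, actions, error);
}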

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Qi Zhang <qi.z.zhang@intel.com>
Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
Cc: Declan Doherty <declan.doherty@intel.com>

---

This patch provides the same functionality and supersedes Qi Zhang's
"ether: add flow action to redirect packet to a port" [1].

The main differences are:

- Action is named PORT_ID instead of PORT.
- Addition of a PORT_ID pattern item.
- More extensive documentation.
- Testpmd support.
- rte_flow_copy() support.

[1] http://dpdk.org/ml/archives/dev/2018-April/094648.html
---
 app/test-pmd/cmdline_flow.c                 | 57 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  2 +
 doc/guides/prog_guide/rte_flow.rst          | 48 ++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  9 ++++
 lib/librte_ether/rte_flow.c                 |  2 +
 lib/librte_ether/rte_flow.h                 | 56 +++++++++++++++++++++++
 6 files changed, 174 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index c77525ad9..f85c1c57f 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -89,6 +89,8 @@ enum index {
 	ITEM_VF_ID,
 	ITEM_PHY_PORT,
 	ITEM_PHY_PORT_INDEX,
+	ITEM_PORT_ID,
+	ITEM_PORT_ID_ID,
 	ITEM_RAW,
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -185,6 +187,9 @@ enum index {
 	ACTION_PHY_PORT,
 	ACTION_PHY_PORT_ORIGINAL,
 	ACTION_PHY_PORT_INDEX,
+	ACTION_PORT_ID,
+	ACTION_PORT_ID_ORIGINAL,
+	ACTION_PORT_ID_ID,
 	ACTION_METER,
 	ACTION_METER_ID,
 };
@@ -445,6 +450,7 @@ static const enum index next_item[] = {
 	ITEM_PF,
 	ITEM_VF,
 	ITEM_PHY_PORT,
+	ITEM_PORT_ID,
 	ITEM_RAW,
 	ITEM_ETH,
 	ITEM_VLAN,
@@ -491,6 +497,12 @@ static const enum index item_phy_port[] = {
 	ZERO,
 };
 
+static const enum index item_port_id[] = {
+	ITEM_PORT_ID_ID,
+	ITEM_NEXT,
+	ZERO,
+};
+
 static const enum index item_raw[] = {
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -627,6 +639,7 @@ static const enum index next_action[] = {
 	ACTION_PF,
 	ACTION_VF,
 	ACTION_PHY_PORT,
+	ACTION_PORT_ID,
 	ACTION_METER,
 	ZERO,
 };
@@ -668,6 +681,13 @@ static const enum index action_phy_port[] = {
 	ZERO,
 };
 
+static const enum index action_port_id[] = {
+	ACTION_PORT_ID_ORIGINAL,
+	ACTION_PORT_ID_ID,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static const enum index action_meter[] = {
 	ACTION_METER_ID,
 	ACTION_NEXT,
@@ -1084,6 +1104,20 @@ static const struct token token_list[] = {
 		.next = NEXT(item_phy_port, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_phy_port, index)),
 	},
+	[ITEM_PORT_ID] = {
+		.name = "port_id",
+		.help = "match traffic from/to a given DPDK port ID",
+		.priv = PRIV_ITEM(PORT_ID,
+				  sizeof(struct rte_flow_item_port_id)),
+		.next = NEXT(item_port_id),
+		.call = parse_vc,
+	},
+	[ITEM_PORT_ID_ID] = {
+		.name = "id",
+		.help = "DPDK port ID",
+		.next = NEXT(item_port_id, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_port_id, id)),
+	},
 	[ITEM_RAW] = {
 		.name = "raw",
 		.help = "match an arbitrary byte string",
@@ -1749,6 +1783,29 @@ static const struct token token_list[] = {
 					index)),
 		.call = parse_vc_conf,
 	},
+	[ACTION_PORT_ID] = {
+		.name = "port_id",
+		.help = "direct matching traffic to a given DPDK port ID",
+		.priv = PRIV_ACTION(PORT_ID,
+				    sizeof(struct rte_flow_action_port_id)),
+		.next = NEXT(action_port_id),
+		.call = parse_vc,
+	},
+	[ACTION_PORT_ID_ORIGINAL] = {
+		.name = "original",
+		.help = "use original DPDK port ID if possible",
+		.next = NEXT(action_port_id, NEXT_ENTRY(BOOLEAN)),
+		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_port_id,
+					   original, 1)),
+		.call = parse_vc_conf,
+	},
+	[ACTION_PORT_ID_ID] = {
+		.name = "id",
+		.help = "DPDK port ID",
+		.next = NEXT(action_port_id, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_port_id, id)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_METER] = {
 		.name = "meter",
 		.help = "meter the directed packets at given id",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index effb4ff81..4a273eff7 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -961,6 +961,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
+	MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
@@ -1059,6 +1060,7 @@ static const struct {
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
+	MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
 };
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index ec59d0f3c..91dbd61a0 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -607,6 +607,36 @@ associated with a port_id should be retrieved by other means.
    | ``mask`` | ``index`` | zeroed to match any port index |
    +----------+-----------+--------------------------------+
 
+Item: ``PORT_ID``
+^^^^^^^^^^^^^^^^^
+
+Matches traffic originating from (ingress) or going to (egress) a given DPDK
+port ID.
+
+Normally only supported if the port ID in question is known by the
+underlying PMD and related to the device the flow rule is created against.
+
+This must not be confused with `Item: PHY_PORT`_ which refers to the
+physical port of a device, whereas `Item: PORT_ID`_ refers to a ``struct
+rte_eth_dev`` object on the application side (also known as "port
+representor" depending on the kind of underlying device).
+
+- Default ``mask`` matches the specified DPDK port ID.
+
+.. _table_rte_flow_item_port_id:
+
+.. table:: PORT_ID
+
+   +----------+----------+-----------------------------+
+   | Field    | Subfield | Value                       |
+   +==========+==========+=============================+
+   | ``spec`` | ``id``   | DPDK port ID                |
+   +----------+----------+-----------------------------+
+   | ``last`` | ``id``   | upper range value           |
+   +----------+----------+-----------------------------+
+   | ``mask`` | ``id``   | zeroed to match any port ID |
+   +----------+----------+-----------------------------+
+
 Data matching item types
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1437,6 +1467,24 @@ See `Item: PHY_PORT`_.
    | ``index``    | physical port index                 |
    +--------------+-------------------------------------+
 
+Action: ``PORT_ID``
+^^^^^^^^^^^^^^^^^^^
+Directs matching traffic to a given DPDK port ID.
+
+See `Item: PORT_ID`_.
+
+.. _table_rte_flow_action_port_id:
+
+.. table:: PORT_ID
+
+   +--------------+---------------------------------------+
+   | Field        | Value                                 |
+   +==============+=======================================+
+   | ``original`` | use original DPDK port ID if possible |
+   +--------------+---------------------------------------+
+   | ``id``       | DPDK port ID                          |
+   +--------------+---------------------------------------+
+
 Action: ``METER``
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index ca23ba146..e78f26dce 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3212,6 +3212,10 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``index {unsigned}``: physical port index.
 
+- ``port_id``: match traffic from/to a given DPDK port ID.
+
+  - ``id {unsigned}``: DPDK port ID.
+
 - ``raw``: match an arbitrary byte string.
 
   - ``relative {boolean}``: look for pattern after the previous item.
@@ -3426,6 +3430,11 @@ This section lists supported actions and their attributes, if any.
   - ``original {boolean}``: use original port index if possible.
   - ``index {unsigned}``: physical port index.
 
+- ``port_id``: direct matching traffic to a given DPDK port ID.
+
+  - ``original {boolean}``: use original DPDK port ID if possible.
+  - ``id {unsigned}``: DPDK port ID.
+
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index e0fd78dd5..3d8116ebd 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,6 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
+	MK_FLOW_ITEM(PORT_ID, sizeof(struct rte_flow_item_port_id)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
@@ -77,6 +78,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
+	MK_FLOW_ACTION(PORT_ID, sizeof(struct rte_flow_action_port_id)),
 };
 
 static int
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 68a13ffa9..bed727df8 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -176,6 +176,16 @@ enum rte_flow_item_type {
 	RTE_FLOW_ITEM_TYPE_PHY_PORT,
 
 	/**
+	 * [META]
+	 *
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * given DPDK port ID.
+	 *
+	 * See struct rte_flow_item_port_id.
+	 */
+	RTE_FLOW_ITEM_TYPE_PORT_ID,
+
+	/**
 	 * Matches a byte string of a given length at a given offset.
 	 *
 	 * See struct rte_flow_item_raw.
@@ -410,6 +420,32 @@ static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = {
 #endif
 
 /**
+ * RTE_FLOW_ITEM_TYPE_PORT_ID
+ *
+ * Matches traffic originating from (ingress) or going to (egress) a given
+ * DPDK port ID.
+ *
+ * Normally only supported if the port ID in question is known by the
+ * underlying PMD and related to the device the flow rule is created
+ * against.
+ *
+ * This must not be confused with @p PHY_PORT which refers to the physical
+ * port of a device, whereas @p PORT_ID refers to a struct rte_eth_dev
+ * object on the application side (also known as "port representor"
+ * depending on the kind of underlying device).
+ */
+struct rte_flow_item_port_id {
+	uint32_t id; /**< DPDK port ID. */
+};
+
+/** Default mask for RTE_FLOW_ITEM_TYPE_PORT_ID. */
+#ifndef __cplusplus
+static const struct rte_flow_item_port_id rte_flow_item_port_id_mask = {
+	.id = 0xffffffff,
+};
+#endif
+
+/**
  * RTE_FLOW_ITEM_TYPE_RAW
  *
  * Matches a byte string of a given length at a given offset.
@@ -993,6 +1029,13 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_PHY_PORT,
 
 	/**
+	 * Directs matching traffic to a given DPDK port ID.
+	 *
+	 * See struct rte_flow_action_port_id.
+	 */
+	RTE_FLOW_ACTION_TYPE_PORT_ID,
+
+	/**
 	 * Traffic metering and policing (MTR).
 	 *
 	 * See struct rte_flow_action_meter.
@@ -1124,6 +1167,19 @@ struct rte_flow_action_phy_port {
 };
 
 /**
+ * RTE_FLOW_ACTION_TYPE_PORT_ID
+ *
+ * Directs matching traffic to a given DPDK port ID.
+ *
+ * @see RTE_FLOW_ITEM_TYPE_PORT_ID
+ */
+struct rte_flow_action_port_id {
+	uint32_t original:1; /**< Use original DPDK port ID if possible. */
+	uint32_t reserved:31; /**< Reserved, must be zero. */
+	uint32_t id; /**< DPDK port ID. */
+};
+
+/**
  * RTE_FLOW_ACTION_TYPE_METER
  *
  * Traffic metering and policing (MTR).
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v2 14/15] ethdev: add physical port action to flow API
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (10 preceding siblings ...)
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 13/15] ethdev: rename physical port item " Adrien Mazarguil
@ 2018-04-06 13:25  3%   ` Adrien Mazarguil
  2018-04-07  9:51  0%     ` Andrew Rybchenko
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 15/15] ethdev: add port ID item and " Adrien Mazarguil
  2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev; +Cc: Zhang, Qi Z

This patch adds the missing action counterpart to the PHY_PORT pattern
item, that is, the ability to directly inject matching traffic into a
physical port of the underlying device.
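
As an illustration (not part of the patch), the testpmd support added below
could be exercised as follows, assuming the device accepts the action;
depending on the PMD, the transfer attribute may also be required:

  testpmd> flow create 0 ingress pattern eth / end actions phy_port index 1 / end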

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: "Zhang, Qi Z" <qi.z.zhang@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 35 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 20 ++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  5 ++++
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 22 +++++++++++++++
 6 files changed, 84 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index bfe532f0a..c77525ad9 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -182,6 +182,9 @@ enum index {
 	ACTION_VF,
 	ACTION_VF_ORIGINAL,
 	ACTION_VF_ID,
+	ACTION_PHY_PORT,
+	ACTION_PHY_PORT_ORIGINAL,
+	ACTION_PHY_PORT_INDEX,
 	ACTION_METER,
 	ACTION_METER_ID,
 };
@@ -623,6 +626,7 @@ static const enum index next_action[] = {
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
+	ACTION_PHY_PORT,
 	ACTION_METER,
 	ZERO,
 };
@@ -657,6 +661,13 @@ static const enum index action_vf[] = {
 	ZERO,
 };
 
+static const enum index action_phy_port[] = {
+	ACTION_PHY_PORT_ORIGINAL,
+	ACTION_PHY_PORT_INDEX,
+	ACTION_NEXT,
+	ZERO,
+};
+
 static const enum index action_meter[] = {
 	ACTION_METER_ID,
 	ACTION_NEXT,
@@ -1714,6 +1725,30 @@ static const struct token token_list[] = {
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
 		.call = parse_vc_conf,
 	},
+	[ACTION_PHY_PORT] = {
+		.name = "phy_port",
+		.help = "direct packets to physical port index",
+		.priv = PRIV_ACTION(PHY_PORT,
+				    sizeof(struct rte_flow_action_phy_port)),
+		.next = NEXT(action_phy_port),
+		.call = parse_vc,
+	},
+	[ACTION_PHY_PORT_ORIGINAL] = {
+		.name = "original",
+		.help = "use original port index if possible",
+		.next = NEXT(action_phy_port, NEXT_ENTRY(BOOLEAN)),
+		.args = ARGS(ARGS_ENTRY_BF(struct rte_flow_action_phy_port,
+					   original, 1)),
+		.call = parse_vc_conf,
+	},
+	[ACTION_PHY_PORT_INDEX] = {
+		.name = "index",
+		.help = "physical port index",
+		.next = NEXT(action_phy_port, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_phy_port,
+					index)),
+		.call = parse_vc_conf,
+	},
 	[ACTION_METER] = {
 		.name = "meter",
 		.help = "meter the directed packets at given id",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 9f968919e..effb4ff81 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1058,6 +1058,7 @@ static const struct {
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
+	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
 };
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 656d4b5b7..ec59d0f3c 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1417,6 +1417,26 @@ See `Item: VF`_.
    | ``id``       | VF ID                          |
    +--------------+--------------------------------+
 
+Action: ``PHY_PORT``
+^^^^^^^^^^^^^^^^^^^^
+
+Directs matching traffic to a given physical port index of the underlying
+device.
+
+See `Item: PHY_PORT`_.
+
+.. _table_rte_flow_action_phy_port:
+
+.. table:: PHY_PORT
+
+   +--------------+-------------------------------------+
+   | Field        | Value                               |
+   +==============+=====================================+
+   | ``original`` | use original port index if possible |
+   +--------------+-------------------------------------+
+   | ``index``    | physical port index                 |
+   +--------------+-------------------------------------+
+
 Action: ``METER``
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 1d9ce6963..ca23ba146 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3421,6 +3421,11 @@ This section lists supported actions and their attributes, if any.
   - ``original {boolean}``: use original VF ID if possible.
   - ``id {unsigned}``: VF ID.
 
+- ``phy_port``: direct packets to physical port index.
+
+  - ``original {boolean}``: use original port index if possible.
+  - ``index {unsigned}``: physical port index.
+
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 6d4d7f5ed..e0fd78dd5 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -76,6 +76,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
+	MK_FLOW_ACTION(PHY_PORT, sizeof(struct rte_flow_action_phy_port)),
 };
 
 static int
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index cd4cde3fa..68a13ffa9 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -985,6 +985,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VF,
 
 	/**
+	 * Directs packets to a given physical port index of the underlying
+	 * device.
+	 *
+	 * See struct rte_flow_action_phy_port.
+	 */
+	RTE_FLOW_ACTION_TYPE_PHY_PORT,
+
+	/**
 	 * Traffic metering and policing (MTR).
 	 *
 	 * See struct rte_flow_action_meter.
@@ -1102,6 +1110,20 @@ struct rte_flow_action_vf {
 };
 
 /**
+ * RTE_FLOW_ACTION_TYPE_PHY_PORT
+ *
+ * Directs packets to a given physical port index of the underlying
+ * device.
+ *
+ * @see RTE_FLOW_ITEM_TYPE_PHY_PORT
+ */
+struct rte_flow_action_phy_port {
+	uint32_t original:1; /**< Use original port index if possible. */
+	uint32_t reserved:31; /**< Reserved, must be zero. */
+	uint32_t index; /**< Physical port index. */
+};
+
+/**
  * RTE_FLOW_ACTION_TYPE_METER
  *
  * Traffic metering and policing (MTR).
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v2 13/15] ethdev: rename physical port item in flow API
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (9 preceding siblings ...)
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 12/15] ethdev: update behavior of VF/PF in " Adrien Mazarguil
@ 2018-04-06 13:25  2%   ` Adrien Mazarguil
  2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 14/15] ethdev: add physical port action to " Adrien Mazarguil
                     ` (2 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

While RTE_FLOW_ITEM_TYPE_PORT refers to physical ports of the underlying
device using specific identifiers, these are often confused with DPDK port
IDs exposed to applications in the global name space.

Since this pattern item is seldom used, rename it
RTE_FLOW_ITEM_TYPE_PHY_PORT for better clarity.

No ABI impact.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 27 +++++++++++----------
 app/test-pmd/config.c                       |  2 +-
 doc/guides/prog_guide/rte_flow.rst          | 22 ++++++++---------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 +-
 lib/librte_ether/rte_flow.c                 |  2 +-
 lib/librte_ether/rte_flow.h                 | 31 ++++++++++--------------
 6 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 741d66b22..bfe532f0a 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -87,8 +87,8 @@ enum index {
 	ITEM_PF,
 	ITEM_VF,
 	ITEM_VF_ID,
-	ITEM_PORT,
-	ITEM_PORT_INDEX,
+	ITEM_PHY_PORT,
+	ITEM_PHY_PORT_INDEX,
 	ITEM_RAW,
 	ITEM_RAW_RELATIVE,
 	ITEM_RAW_SEARCH,
@@ -441,7 +441,7 @@ static const enum index next_item[] = {
 	ITEM_ANY,
 	ITEM_PF,
 	ITEM_VF,
-	ITEM_PORT,
+	ITEM_PHY_PORT,
 	ITEM_RAW,
 	ITEM_ETH,
 	ITEM_VLAN,
@@ -482,8 +482,8 @@ static const enum index item_vf[] = {
 	ZERO,
 };
 
-static const enum index item_port[] = {
-	ITEM_PORT_INDEX,
+static const enum index item_phy_port[] = {
+	ITEM_PHY_PORT_INDEX,
 	ITEM_NEXT,
 	ZERO,
 };
@@ -1059,18 +1059,19 @@ static const struct token token_list[] = {
 		.next = NEXT(item_vf, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_vf, id)),
 	},
-	[ITEM_PORT] = {
-		.name = "port",
-		.help = "device-specific physical port index to use",
-		.priv = PRIV_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-		.next = NEXT(item_port),
+	[ITEM_PHY_PORT] = {
+		.name = "phy_port",
+		.help = "match traffic from/to a specific physical port",
+		.priv = PRIV_ITEM(PHY_PORT,
+				  sizeof(struct rte_flow_item_phy_port)),
+		.next = NEXT(item_phy_port),
 		.call = parse_vc,
 	},
-	[ITEM_PORT_INDEX] = {
+	[ITEM_PHY_PORT_INDEX] = {
 		.name = "index",
 		.help = "physical port index",
-		.next = NEXT(item_port, NEXT_ENTRY(UNSIGNED), item_param),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_port, index)),
+		.next = NEXT(item_phy_port, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_phy_port, index)),
 	},
 	[ITEM_RAW] = {
 		.name = "raw",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 49ef87782..9f968919e 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -960,7 +960,7 @@ static const struct {
 	MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)),
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
-	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
+	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index beedc713b..656d4b5b7 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -189,8 +189,8 @@ When supported, this effectively enables an application to re-route traffic
 not necessarily intended for it (e.g. coming from or addressed to different
 physical ports, VFs or applications) at the device level.
 
-It complements the behavior of some pattern items such as `Item: PORT`_ and
-is meaningless without them.
+It complements the behavior of some pattern items such as `Item: PHY_PORT`_
+and is meaningless without them.
 
 Pattern item
 ~~~~~~~~~~~~
@@ -573,15 +573,15 @@ separate entities, should be addressed through their own DPDK port IDs.
    | ``mask`` | ``id``   | zeroed to match any VF ID |
    +----------+----------+---------------------------+
 
-Item: ``PORT``
-^^^^^^^^^^^^^^
+Item: ``PHY_PORT``
+^^^^^^^^^^^^^^^^^^
 
-Matches packets coming from the specified physical port of the underlying
-device.
+Matches traffic originating from (ingress) or going to (egress) a physical
+port of the underlying device.
 
-The first PORT item overrides the physical port normally associated with the
-specified DPDK input port (port_id). This item can be provided several times
-to match additional physical ports.
+The first PHY_PORT item overrides the physical port normally associated with
+the specified DPDK input port (port_id). This item can be provided several
+times to match additional physical ports.
 
 Note that physical ports are not necessarily tied to DPDK input ports
 (port_id) when those are not under DPDK control. Possible values are
@@ -593,9 +593,9 @@ associated with a port_id should be retrieved by other means.
 
 - Default ``mask`` matches any port index.
 
-.. _table_rte_flow_item_port:
+.. _table_rte_flow_item_phy_port:
 
-.. table:: PORT
+.. table:: PHY_PORT
 
    +----------+-----------+--------------------------------+
    | Field    | Subfield  | Value                          |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 2f1db9a29..1d9ce6963 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3208,7 +3208,7 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``id {unsigned}``: VF ID.
 
-- ``port``: device-specific physical port index to use.
+- ``phy_port``: match traffic from/to a specific physical port.
 
   - ``index {unsigned}``: physical port index.
 
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 1f247d656..6d4d7f5ed 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -38,7 +38,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(ANY, sizeof(struct rte_flow_item_any)),
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
-	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
+	MK_FLOW_ITEM(PHY_PORT, sizeof(struct rte_flow_item_phy_port)),
 	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index bb9d59833..cd4cde3fa 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -84,7 +84,7 @@ struct rte_flow_attr {
 	 * applications) at the device level.
 	 *
 	 * It complements the behavior of some pattern items such as
-	 * RTE_FLOW_ITEM_TYPE_PORT and is meaningless without them.
+	 * RTE_FLOW_ITEM_TYPE_PHY_PORT and is meaningless without them.
 	 */
 	uint32_t transfer:1;
 	uint32_t reserved:29; /**< Reserved, must be zero. */
@@ -168,17 +168,12 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets coming from the specified physical port of the
-	 * underlying device.
-	 *
-	 * The first PORT item overrides the physical port normally
-	 * associated with the specified DPDK input port (port_id). This
-	 * item can be provided several times to match additional physical
-	 * ports.
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * physical port of the underlying device.
 	 *
-	 * See struct rte_flow_item_port.
+	 * See struct rte_flow_item_phy_port.
 	 */
-	RTE_FLOW_ITEM_TYPE_PORT,
+	RTE_FLOW_ITEM_TYPE_PHY_PORT,
 
 	/**
 	 * Matches a byte string of a given length at a given offset.
@@ -384,13 +379,13 @@ static const struct rte_flow_item_vf rte_flow_item_vf_mask = {
 #endif
 
 /**
- * RTE_FLOW_ITEM_TYPE_PORT
+ * RTE_FLOW_ITEM_TYPE_PHY_PORT
  *
- * Matches packets coming from the specified physical port of the underlying
- * device.
+ * Matches traffic originating from (ingress) or going to (egress) a
+ * physical port of the underlying device.
  *
- * The first PORT item overrides the physical port normally associated with
- * the specified DPDK input port (port_id). This item can be provided
+ * The first PHY_PORT item overrides the physical port normally associated
+ * with the specified DPDK input port (port_id). This item can be provided
  * several times to match additional physical ports.
  *
  * Note that physical ports are not necessarily tied to DPDK input ports
@@ -403,13 +398,13 @@ static const struct rte_flow_item_vf rte_flow_item_vf_mask = {
  *
  * A zeroed mask can be used to match any port index.
  */
-struct rte_flow_item_port {
+struct rte_flow_item_phy_port {
 	uint32_t index; /**< Physical port index. */
 };
 
-/** Default mask for RTE_FLOW_ITEM_TYPE_PORT. */
+/** Default mask for RTE_FLOW_ITEM_TYPE_PHY_PORT. */
 #ifndef __cplusplus
-static const struct rte_flow_item_port rte_flow_item_port_mask = {
+static const struct rte_flow_item_phy_port rte_flow_item_phy_port_mask = {
 	.index = 0x00000000,
 };
 #endif
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v2 12/15] ethdev: update behavior of VF/PF in flow API
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (8 preceding siblings ...)
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 11/15] ethdev: add transfer attribute to " Adrien Mazarguil
@ 2018-04-06 13:25  2%   ` Adrien Mazarguil
  2018-04-07  9:41  0%     ` Andrew Rybchenko
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 13/15] ethdev: rename physical port item " Adrien Mazarguil
                     ` (3 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Somnath Kotur, Beilei Xing, Qi Zhang

Contrary to all other pattern items, these are inconsistently documented as
affecting traffic instead of simply matching its origin, without provision
for the latter.

This commit clarifies documentation and updates PMDs since the original
behavior now has to be explicitly requested using the new transfer
attribute.
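
As an illustrative sketch only (the VF ID and the DROP action are
arbitrary), matching traffic coming from a given VF at the device level now
involves the transfer attribute:

#include <stdint.h>
#include <rte_flow.h>

/* Hypothetical example: drop everything arriving from VF 2 of the device
 * behind DPDK port "port_id". */
static struct rte_flow *
drop_vf2_traffic(uint16_t port_id, struct rte_flow_error *error)
{
	const struct rte_flow_attr attr = {
		.ingress = 1,
		.transfer = 1, /* act on traffic re-routed at device level */
	};
	const struct rte_flow_item_vf vf_spec = { .id = 2 };
	const struct rte_flow_item_vf vf_mask = { .id = 0xffffffff };
	const struct rte_flow_item pattern[] = {
		{
			.type = RTE_FLOW_ITEM_TYPE_VF,
			.spec = &vf_spec,
			.mask = &vf_mask, /* default mask matches any VF ID */
		},
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	const struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_DROP },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	return rte_flow_create(port_id, &attr, pattern, actions, error);
}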

It breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

Impacted PMDs are bnxt and i40e, for which the VF pattern item is now only
supported when a transfer attribute is also present.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Somnath Kotur <somnath.kotur@broadcom.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/cmdline_flow.c                 | 12 +++---
 doc/guides/prog_guide/rte_flow.rst          | 36 +++++++++---------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 12 +++---
 drivers/net/bnxt/bnxt_filter.c              | 22 ++++++-----
 drivers/net/i40e/i40e_flow.c                | 23 +++++++-----
 lib/librte_ether/rte_flow.h                 | 47 ++++++++++--------------
 6 files changed, 77 insertions(+), 75 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 122e9d50b..741d66b22 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -1041,21 +1041,21 @@ static const struct token token_list[] = {
 	},
 	[ITEM_PF] = {
 		.name = "pf",
-		.help = "match packets addressed to the physical function",
+		.help = "match traffic from/to the physical function",
 		.priv = PRIV_ITEM(PF, 0),
 		.next = NEXT(NEXT_ENTRY(ITEM_NEXT)),
 		.call = parse_vc,
 	},
 	[ITEM_VF] = {
 		.name = "vf",
-		.help = "match packets addressed to a virtual function ID",
+		.help = "match traffic from/to a virtual function ID",
 		.priv = PRIV_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 		.next = NEXT(item_vf),
 		.call = parse_vc,
 	},
 	[ITEM_VF_ID] = {
 		.name = "id",
-		.help = "destination VF ID",
+		.help = "VF ID",
 		.next = NEXT(item_vf, NEXT_ENTRY(UNSIGNED), item_param),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_vf, id)),
 	},
@@ -1686,14 +1686,14 @@ static const struct token token_list[] = {
 	},
 	[ACTION_PF] = {
 		.name = "pf",
-		.help = "redirect packets to physical device function",
+		.help = "direct traffic to physical function",
 		.priv = PRIV_ACTION(PF, 0),
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
 	[ACTION_VF] = {
 		.name = "vf",
-		.help = "redirect packets to virtual device function",
+		.help = "direct traffic to a virtual function ID",
 		.priv = PRIV_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 		.next = NEXT(action_vf),
 		.call = parse_vc,
@@ -1708,7 +1708,7 @@ static const struct token token_list[] = {
 	},
 	[ACTION_VF_ID] = {
 		.name = "id",
-		.help = "VF ID to redirect packets to",
+		.help = "VF ID",
 		.next = NEXT(action_vf, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_vf, id)),
 		.call = parse_vc_conf,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 735ce6323..beedc713b 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -518,15 +518,12 @@ Usage example, matching non-TCPv4 packets only:
 Item: ``PF``
 ^^^^^^^^^^^^
 
-Matches packets addressed to the physical function of the device.
+Matches traffic originating from (ingress) or going to (egress) the physical
+function of the current device.
 
-If the underlying device function differs from the one that would normally
-receive the matched traffic, specifying this item prevents it from reaching
-that device unless the flow rule contains a `Action: PF`_. Packets are not
-duplicated between device instances by default.
+If supported, should work even if the physical function is not managed by
+the application and thus not associated with a DPDK port ID.
 
-- Likely to return an error or never match any traffic if applied to a VF
-  device.
 - Can be combined with any number of `Item: VF`_ to match both PF and VF
   traffic.
 - ``spec``, ``last`` and ``mask`` must not be set.
@@ -548,15 +545,15 @@ duplicated between device instances by default.
 Item: ``VF``
 ^^^^^^^^^^^^
 
-Matches packets addressed to a virtual function ID of the device.
+Matches traffic originating from (ingress) or going to (egress) a given
+virtual function of the current device.
 
-If the underlying device function differs from the one that would normally
-receive the matched traffic, specifying this item prevents it from reaching
-that device unless the flow rule contains a `Action: VF`_. Packets are not
-duplicated between device instances by default.
+If supported, should work even if the virtual function is not managed by the
+application and thus not associated with a DPDK port ID.
+
+Note this pattern item does not match VF representors traffic which, as
+separate entities, should be addressed through their own DPDK port IDs.
 
-- Likely to return an error or never match any traffic if this causes a VF
-  device to match traffic addressed to a different VF.
 - Can be specified multiple times to match traffic addressed to several VF
   IDs.
 - Can be combined with a PF item to match both PF and VF traffic.
@@ -1379,7 +1376,10 @@ only matching traffic goes through.
 Action: ``PF``
 ^^^^^^^^^^^^^^
 
-Redirects packets to the physical function (PF) of the current device.
+Directs matching traffic to the physical function (PF) of the current
+device.
+
+See `Item: PF`_.
 
 - No configurable properties.
 
@@ -1396,13 +1396,15 @@ Redirects packets to the physical function (PF) of the current device.
 Action: ``VF``
 ^^^^^^^^^^^^^^
 
-Redirects packets to a virtual function (VF) of the current device.
+Directs matching traffic to a given virtual function of the current device.
 
 Packets matched by a VF pattern item can be redirected to their original VF
 ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
+See `Item: VF`_.
+
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1412,7 +1414,7 @@ rule or if packets are not addressed to a VF in the first place.
    +==============+================================+
    | ``original`` | use original VF ID if possible |
    +--------------+--------------------------------+
-   | ``vf``       | VF ID to redirect packets to   |
+   | ``id``       | VF ID                          |
    +--------------+--------------------------------+
 
 Action: ``METER``
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index a87cd1542..2f1db9a29 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3202,11 +3202,11 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``num {unsigned}``: number of layers covered.
 
-- ``pf``: match packets addressed to the physical function.
+- ``pf``: match traffic from/to the physical function.
 
-- ``vf``: match packets addressed to a virtual function ID.
+- ``vf``: match traffic from/to a virtual function ID.
 
-  - ``id {unsigned}``: destination VF ID.
+  - ``id {unsigned}``: VF ID.
 
 - ``port``: device-specific physical port index to use.
 
@@ -3414,12 +3414,12 @@ This section lists supported actions and their attributes, if any.
 
   - ``queues [{unsigned} [...]] end``: queue indices to use.
 
-- ``pf``: redirect packets to physical device function.
+- ``pf``: direct traffic to physical function.
 
-- ``vf``: redirect packets to virtual device function.
+- ``vf``: direct traffic to a virtual function ID.
 
   - ``original {boolean}``: use original VF ID if possible.
-  - ``id {unsigned}``: VF ID to redirect packets to.
+  - ``id {unsigned}``: VF ID.
 
 Destroying flow rules
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 862e03188..f2ee250e8 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -276,6 +276,7 @@ bnxt_filter_type_check(const struct rte_flow_item pattern[],
 
 static int
 bnxt_validate_and_parse_flow_type(struct bnxt *bp,
+				  const struct rte_flow_attr *attr,
 				  const struct rte_flow_item pattern[],
 				  struct rte_flow_error *error,
 				  struct bnxt_filter_info *filter)
@@ -703,6 +704,16 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 				return -rte_errno;
 			}
 
+			if (!attr->transfer) {
+				rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ITEM,
+					   item,
+					   "Matching VF traffic without"
+					   " affecting it (transfer attribute)"
+					   " is unsupported");
+				return -rte_errno;
+			}
+
 			filter->mirror_vnic_id =
 			dflt_vnic = bnxt_hwrm_func_qcfg_vf_dflt_vnic_id(bp, vf);
 			if (dflt_vnic < 0) {
@@ -750,14 +761,6 @@ bnxt_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
-	if (attr->transfer) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
-				   attr, "No support for transfer.");
-		return -rte_errno;
-	}
-
-	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -837,7 +840,8 @@ bnxt_validate_and_parse_flow(struct rte_eth_dev *dev,
 		goto ret;
 	}
 
-	rc = bnxt_validate_and_parse_flow_type(bp, pattern, error, filter);
+	rc = bnxt_validate_and_parse_flow_type(bp, attr, pattern, error,
+					       filter);
 	if (rc != 0)
 		goto ret;
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 0a3f93fad..96a698a2c 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -54,6 +54,7 @@ static int i40e_flow_parse_ethertype_action(struct rte_eth_dev *dev,
 				    struct rte_flow_error *error,
 				    struct rte_eth_ethertype_filter *filter);
 static int i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
+					const struct rte_flow_attr *attr,
 					const struct rte_flow_item *pattern,
 					struct rte_flow_error *error,
 					struct i40e_fdir_filter_conf *filter);
@@ -1918,14 +1919,6 @@ i40e_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
-	if (attr->transfer) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
-				   attr, "No support for transfer.");
-		return -rte_errno;
-	}
-
-	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -2429,6 +2422,7 @@ i40e_flow_fdir_get_pctype_value(struct i40e_pf *pf,
  */
 static int
 i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
+			     const struct rte_flow_attr *attr,
 			     const struct rte_flow_item *pattern,
 			     struct rte_flow_error *error,
 			     struct i40e_fdir_filter_conf *filter)
@@ -2969,6 +2963,16 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 			break;
 		case RTE_FLOW_ITEM_TYPE_VF:
 			vf_spec = item->spec;
+			if (!attr->transfer) {
+				rte_flow_error_set(error, ENOTSUP,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "Matching VF traffic"
+						   " without affecting it"
+						   " (transfer attribute)"
+						   " is unsupported");
+				return -rte_errno;
+			}
 			filter->input.flow_ext.is_vf = 1;
 			filter->input.flow_ext.dst_id = vf_spec->id;
 			if (filter->input.flow_ext.is_vf &&
@@ -3131,7 +3135,8 @@ i40e_flow_parse_fdir_filter(struct rte_eth_dev *dev,
 		&filter->fdir_filter;
 	int ret;
 
-	ret = i40e_flow_parse_fdir_pattern(dev, pattern, error, fdir_filter);
+	ret = i40e_flow_parse_fdir_pattern(dev, attr, pattern, error,
+					   fdir_filter);
 	if (ret)
 		return ret;
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index b7bdc0469..bb9d59833 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -148,13 +148,8 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets addressed to the physical function of the device.
-	 *
-	 * If the underlying device function differs from the one that would
-	 * normally receive the matched traffic, specifying this item
-	 * prevents it from reaching that device unless the flow rule
-	 * contains a PF action. Packets are not duplicated between device
-	 * instances by default.
+	 * Matches traffic originating from (ingress) or going to (egress)
+	 * the physical function of the current device.
 	 *
 	 * No associated specification structure.
 	 */
@@ -163,13 +158,8 @@ enum rte_flow_item_type {
 	/**
 	 * [META]
 	 *
-	 * Matches packets addressed to a virtual function ID of the device.
-	 *
-	 * If the underlying device function differs from the one that would
-	 * normally receive the matched traffic, specifying this item
-	 * prevents it from reaching that device unless the flow rule
-	 * contains a VF action. Packets are not duplicated between device
-	 * instances by default.
+	 * Matches traffic originating from (ingress) or going to (egress) a
+	 * given virtual function of the current device.
 	 *
 	 * See struct rte_flow_item_vf.
 	 */
@@ -367,15 +357,15 @@ static const struct rte_flow_item_any rte_flow_item_any_mask = {
 /**
  * RTE_FLOW_ITEM_TYPE_VF
  *
- * Matches packets addressed to a virtual function ID of the device.
+ * Matches traffic originating from (ingress) or going to (egress) a given
+ * virtual function of the current device.
  *
- * If the underlying device function differs from the one that would
- * normally receive the matched traffic, specifying this item prevents it
- * from reaching that device unless the flow rule contains a VF
- * action. Packets are not duplicated between device instances by default.
+ * If supported, should work even if the virtual function is not managed by
+ * the application and thus not associated with a DPDK port ID.
+ *
+ * Note this pattern item does not match VF representors traffic which, as
+ * separate entities, should be addressed through their own DPDK port IDs.
  *
- * - Likely to return an error or never match any traffic if this causes a
- *   VF device to match traffic addressed to a different VF.
  * - Can be specified multiple times to match traffic addressed to several
  *   VF IDs.
  * - Can be combined with a PF item to match both PF and VF traffic.
@@ -383,7 +373,7 @@ static const struct rte_flow_item_any rte_flow_item_any_mask = {
  * A zeroed mask can be used to match any VF ID.
  */
 struct rte_flow_item_vf {
-	uint32_t id; /**< Destination VF ID. */
+	uint32_t id; /**< VF ID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_VF. */
@@ -984,16 +974,16 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_RSS,
 
 	/**
-	 * Redirects packets to the physical function (PF) of the current
-	 * device.
+	 * Directs matching traffic to the physical function (PF) of the
+	 * current device.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PF,
 
 	/**
-	 * Redirects packets to the virtual function (VF) of the current
-	 * device with the specified ID.
+	 * Directs matching traffic to a given virtual function of the
+	 * current device.
 	 *
 	 * See struct rte_flow_action_vf.
 	 */
@@ -1101,7 +1091,8 @@ struct rte_flow_action_rss {
 /**
  * RTE_FLOW_ACTION_TYPE_VF
  *
- * Redirects packets to a virtual function (VF) of the current device.
+ * Directs matching traffic to a given virtual function of the current
+ * device.
  *
  * Packets matched by a VF pattern item can be redirected to their original
  * VF ID instead of the specified one. This parameter may not be available
@@ -1112,7 +1103,7 @@ struct rte_flow_action_rss {
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
 	uint32_t reserved:31; /**< Reserved, must be zero. */
-	uint32_t id; /**< VF ID to redirect packets to. */
+	uint32_t id; /**< VF ID. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v2 11/15] ethdev: add transfer attribute to flow API
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (7 preceding siblings ...)
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 10/15] ethdev: refine TPID handling in flow API Adrien Mazarguil
@ 2018-04-06 13:25  2%   ` Adrien Mazarguil
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 12/15] ethdev: update behavior of VF/PF in " Adrien Mazarguil
                     ` (4 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

This new attribute enables applications to create flow rules that do not
simply match traffic whose origin is specified in the pattern (e.g. some
non-default physical port or VF), but actively affect it by applying the
flow rule at the lowest possible level in the underlying device.
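
For illustration, once this attribute exists, an application could ask for a
rule to be applied at the device level roughly as follows. This is only a
minimal sketch: the VF pattern item and PF action are hypothetical examples of
"endpoints found in the pattern", port_id is assumed valid and error handling
is omitted.

    #include <rte_flow.h>

    uint16_t port_id = 0; /* assumed */
    struct rte_flow_attr attr = {
        .ingress = 1,
        .transfer = 1, /* apply rule to endpoints found in the pattern */
    };
    struct rte_flow_item_vf vf = { .id = 1 }; /* hypothetical VF ID */
    struct rte_flow_item pattern[] = {
        { .type = RTE_FLOW_ITEM_TYPE_VF, .spec = &vf },
        { .type = RTE_FLOW_ITEM_TYPE_END },
    };
    struct rte_flow_action actions[] = {
        { .type = RTE_FLOW_ACTION_TYPE_PF },
        { .type = RTE_FLOW_ACTION_TYPE_END },
    };
    struct rte_flow_error error;
    /* PMDs without support report RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER */
    struct rte_flow *flow =
        rte_flow_create(port_id, &attr, pattern, actions, &error);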

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 11 +++++
 app/test-pmd/config.c                       |  6 ++-
 doc/guides/prog_guide/rte_flow.rst          | 14 +++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst | 11 ++---
 drivers/net/bnxt/bnxt_filter.c              |  8 ++++
 drivers/net/e1000/igb_flow.c                | 44 ++++++++++++++++++++
 drivers/net/enic/enic_flow.c                |  6 +++
 drivers/net/i40e/i40e_flow.c                |  8 ++++
 drivers/net/ixgbe/ixgbe_flow.c              | 53 ++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_flow.c                |  4 ++
 drivers/net/mlx5/mlx5_flow.c                |  7 ++++
 drivers/net/mvpp2/mrvl_flow.c               |  6 +++
 drivers/net/sfc/sfc_flow.c                  |  6 +++
 drivers/net/tap/tap_flow.c                  |  6 +++
 lib/librte_ether/rte_flow.h                 | 18 +++++++-
 15 files changed, 200 insertions(+), 8 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 3a486032d..122e9d50b 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -69,6 +69,7 @@ enum index {
 	PRIORITY,
 	INGRESS,
 	EGRESS,
+	TRANSFER,
 
 	/* Validate/create pattern. */
 	PATTERN,
@@ -407,6 +408,7 @@ static const enum index next_vc_attr[] = {
 	PRIORITY,
 	INGRESS,
 	EGRESS,
+	TRANSFER,
 	PATTERN,
 	ZERO,
 };
@@ -960,6 +962,12 @@ static const struct token token_list[] = {
 		.next = NEXT(next_vc_attr),
 		.call = parse_vc,
 	},
+	[TRANSFER] = {
+		.name = "transfer",
+		.help = "apply rule directly to endpoints found in pattern",
+		.next = NEXT(next_vc_attr),
+		.call = parse_vc,
+	},
 	/* Validate/create pattern. */
 	[PATTERN] = {
 		.name = "pattern",
@@ -1945,6 +1953,9 @@ parse_vc(struct context *ctx, const struct token *token,
 	case EGRESS:
 		out->args.vc.attr.egress = 1;
 		return len;
+	case TRANSFER:
+		out->args.vc.attr.transfer = 1;
+		return len;
 	case PATTERN:
 		out->args.vc.pattern =
 			(void *)RTE_ALIGN_CEIL((uintptr_t)(out + 1),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index c0fefe475..49ef87782 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1223,6 +1223,7 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
 		[RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
+		[RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
 		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
@@ -1488,12 +1489,13 @@ port_flow_list(portid_t port_id, uint32_t n, const uint32_t group[n])
 		const struct rte_flow_item *item = pf->pattern;
 		const struct rte_flow_action *action = pf->actions;
 
-		printf("%" PRIu32 "\t%" PRIu32 "\t%" PRIu32 "\t%c%c\t",
+		printf("%" PRIu32 "\t%" PRIu32 "\t%" PRIu32 "\t%c%c%c\t",
 		       pf->id,
 		       pf->attr.group,
 		       pf->attr.priority,
 		       pf->attr.ingress ? 'i' : '-',
-		       pf->attr.egress ? 'e' : '-');
+		       pf->attr.egress ? 'e' : '-',
+		       pf->attr.transfer ? 't' : '-');
 		while (item->type != RTE_FLOW_ITEM_TYPE_END) {
 			if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
 				printf("%s ", flow_item[item->type].name);
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index c6f16d444..735ce6323 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -178,6 +178,20 @@ directions. At least one direction must be specified.
 Specifying both directions at once for a given rule is not recommended but
 may be valid in a few cases (e.g. shared counters).
 
+Attribute: Transfer
+^^^^^^^^^^^^^^^^^^^
+
+Instead of simply matching the properties of traffic as it would appear on a
+given DPDK port ID, enabling this attribute transfers a flow rule to the
+lowest possible level of any device endpoints found in the pattern.
+
+When supported, this effectively enables an application to re-route traffic
+not necessarily intended for it (e.g. coming from or addressed to different
+physical ports, VFs or applications) at the device level.
+
+It complements the behavior of some pattern items such as `Item: PORT`_ and
+is meaningless without them.
+
 Pattern item
 ~~~~~~~~~~~~
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 25fac8430..a87cd1542 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -2970,14 +2970,14 @@ following sections.
 - Check whether a flow rule can be created::
 
    flow validate {port_id}
-       [group {group_id}] [priority {level}] [ingress] [egress]
+       [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
        pattern {item} [/ {item} [...]] / end
        actions {action} [/ {action} [...]] / end
 
 - Create a flow rule::
 
    flow create {port_id}
-       [group {group_id}] [priority {level}] [ingress] [egress]
+       [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
        pattern {item} [/ {item} [...]] / end
        actions {action} [/ {action} [...]] / end
 
@@ -3010,7 +3010,7 @@ underlying device in its current state but stops short of creating it. It is
 bound to ``rte_flow_validate()``::
 
    flow validate {port_id}
-      [group {group_id}] [priority {level}] [ingress] [egress]
+      [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
       pattern {item} [/ {item} [...]] / end
       actions {action} [/ {action} [...]] / end
 
@@ -3047,7 +3047,7 @@ Creating flow rules
 to ``rte_flow_create()``::
 
    flow create {port_id}
-      [group {group_id}] [priority {level}] [ingress] [egress]
+      [group {group_id}] [priority {level}] [ingress] [egress] [transfer]
       pattern {item} [/ {item} [...]] / end
       actions {action} [/ {action} [...]] / end
 
@@ -3061,7 +3061,7 @@ Otherwise it will show an error message of the form::
 
 Parameters describe in the following order:
 
-- Attributes (*group*, *priority*, *ingress*, *egress* tokens).
+- Attributes (*group*, *priority*, *ingress*, *egress*, *transfer* tokens).
 - A matching pattern, starting with the *pattern* token and terminated by an
   *end* pattern item.
 - Actions, starting with the *actions* token and terminated by an *end*
@@ -3089,6 +3089,7 @@ specified before the ``pattern`` token.
 - ``priority {level}``: priority level within group.
 - ``ingress``: rule applies to ingress traffic.
 - ``egress``: rule applies to egress traffic.
+- ``transfer``: apply rule directly to endpoints found in pattern.
 
 Each instance of an attribute specified several times overrides the previous
 value as shown below (group 4 is used)::
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 51e3e8de4..862e03188 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -750,6 +750,14 @@ bnxt_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 13f6f2a28..ac0d05bfa 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -379,6 +379,15 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -624,6 +633,14 @@ cons_parse_ethertype_filter(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -923,6 +940,15 @@ cons_parse_syn_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_syn_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	/* Support 2 priorities, the lowest or highest. */
 	if (!attr->priority) {
 		filter->hig_pri = 0;
@@ -1211,6 +1237,15 @@ cons_parse_flex_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_flex_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_flex_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -1361,6 +1396,15 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(rss_conf, 0, sizeof(struct igb_rte_flow_rss_conf));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(rss_conf, 0, sizeof(struct igb_rte_flow_rss_conf));
 		rte_flow_error_set(error, EINVAL,
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index d645dfb63..b575d0365 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -1301,6 +1301,12 @@ enic_flow_parse(struct rte_eth_dev *dev,
 					   NULL,
 					   "egress is not supported");
 			return -rte_errno;
+		} else if (attrs->transfer) {
+			rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+					   NULL,
+					   "transfer is not supported");
+			return -rte_errno;
 		} else if (!attrs->ingress) {
 			rte_flow_error_set(error, ENOTSUP,
 					   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index e9550b0ce..0a3f93fad 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -1918,6 +1918,14 @@ i40e_flow_parse_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 67d22b382..1f1fa9dc4 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -557,6 +557,15 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(filter, 0, sizeof(struct rte_eth_ntuple_filter));
 		rte_flow_error_set(error, EINVAL,
@@ -787,6 +796,14 @@ cons_parse_ethertype_filter(const struct rte_flow_attr *attr,
 	}
 
 	/* Not supported */
+	if (attr->transfer) {
+		rte_flow_error_set(error, EINVAL,
+				RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* Not supported */
 	if (attr->priority) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
@@ -1078,6 +1095,15 @@ cons_parse_syn_filter(const struct rte_flow_attr *attr,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_syn_filter));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	/* Support 2 priorities, the lowest or highest. */
 	if (!attr->priority) {
 		filter->hig_pri = 0;
@@ -1250,6 +1276,15 @@ cons_parse_l2_tn_filter(struct rte_eth_dev *dev,
 	}
 
 	/* not supported */
+	if (attr->transfer) {
+		memset(filter, 0, sizeof(struct rte_eth_l2_tunnel_conf));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* not supported */
 	if (attr->priority) {
 		memset(filter, 0, sizeof(struct rte_eth_l2_tunnel_conf));
 		rte_flow_error_set(error, EINVAL,
@@ -1354,6 +1389,15 @@ ixgbe_parse_fdir_act_attr(const struct rte_flow_attr *attr,
 	}
 
 	/* not supported */
+	if (attr->transfer) {
+		memset(rule, 0, sizeof(struct ixgbe_fdir_rule));
+		rte_flow_error_set(error, EINVAL,
+			RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
+	/* not supported */
 	if (attr->priority) {
 		memset(rule, 0, sizeof(struct ixgbe_fdir_rule));
 		rte_flow_error_set(error, EINVAL,
@@ -2829,6 +2873,15 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
+	/* not supported */
+	if (attr->transfer) {
+		memset(rss_conf, 0, sizeof(struct ixgbe_rte_flow_rss_conf));
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   attr, "No support for transfer.");
+		return -rte_errno;
+	}
+
 	if (attr->priority > 0xFFFF) {
 		memset(rss_conf, 0, sizeof(struct ixgbe_rte_flow_rss_conf));
 		rte_flow_error_set(error, EINVAL,
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 779641e11..480442f87 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -652,6 +652,10 @@ mlx4_flow_prepare(struct priv *priv,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
 			 NULL, "egress is not supported");
+	if (attr->transfer)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			 NULL, "transfer is not supported");
 	if (!attr->ingress)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 8863e451c..288610620 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -569,6 +569,13 @@ mlx5_flow_convert_attributes(const struct rte_flow_attr *attr,
 				   "egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+				   NULL,
+				   "transfer is not supported");
+		return -rte_errno;
+	}
 	if (!attr->ingress) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
diff --git a/drivers/net/mvpp2/mrvl_flow.c b/drivers/net/mvpp2/mrvl_flow.c
index 6604a411f..4848d5cf9 100644
--- a/drivers/net/mvpp2/mrvl_flow.c
+++ b/drivers/net/mvpp2/mrvl_flow.c
@@ -2188,6 +2188,12 @@ mrvl_flow_parse_attr(struct mrvl_priv *priv __rte_unused,
 				   "Egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, NULL,
+				   "Transfer is not supported");
+		return -rte_errno;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index f61d4ec92..b12a47281 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1126,6 +1126,12 @@ sfc_flow_parse_attr(const struct rte_flow_attr *attr,
 				   "Egress is not supported");
 		return -rte_errno;
 	}
+	if (attr->transfer != 0) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, attr,
+				   "Transfer is not supported");
+		return -rte_errno;
+	}
 	if (attr->ingress == 0) {
 		rte_flow_error_set(error, ENOTSUP,
 				   RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, attr,
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index e53eff6ce..597945ad2 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1041,6 +1041,12 @@ priv_flow_process(struct pmd_internals *pmd,
 	};
 	int action = 0; /* Only one action authorized for now */
 
+	if (attr->transfer) {
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+			NULL, "transfer is not supported");
+		return -rte_errno;
+	}
 	if (attr->group > MAX_GROUP) {
 		rte_flow_error_set(
 			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index b13b0e2e6..b7bdc0469 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -72,7 +72,22 @@ struct rte_flow_attr {
 	uint32_t priority; /**< Priority level within group. */
 	uint32_t ingress:1; /**< Rule applies to ingress traffic. */
 	uint32_t egress:1; /**< Rule applies to egress traffic. */
-	uint32_t reserved:30; /**< Reserved, must be zero. */
+	/**
+	 * Instead of simply matching the properties of traffic as it would
+	 * appear on a given DPDK port ID, enabling this attribute transfers
+	 * a flow rule to the lowest possible level of any device endpoints
+	 * found in the pattern.
+	 *
+	 * When supported, this effectively enables an application to
+	 * re-route traffic not necessarily intended for it (e.g. coming
+	 * from or addressed to different physical ports, VFs or
+	 * applications) at the device level.
+	 *
+	 * It complements the behavior of some pattern items such as
+	 * RTE_FLOW_ITEM_TYPE_PORT and is meaningless without them.
+	 */
+	uint32_t transfer:1;
+	uint32_t reserved:29; /**< Reserved, must be zero. */
 };
 
 /**
@@ -1175,6 +1190,7 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, /**< Priority field. */
 	RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, /**< Ingress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
+	RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, /**< Transfer field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
 	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v2 10/15] ethdev: refine TPID handling in flow API
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (6 preceding siblings ...)
  2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 09/15] ethdev: add encap level " Adrien Mazarguil
@ 2018-04-06 13:25  1%   ` Adrien Mazarguil
  2018-04-06 17:11  0%     ` Andrew Rybchenko
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 11/15] ethdev: add transfer attribute to " Adrien Mazarguil
                     ` (5 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Ajit Khaparde, Somnath Kotur,
	John Daley, Hyong Youb Kim, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Tomasz Duszynski, Dmitri Epshtein, Natalie Samsonov, Jianbo Liu,
	Andrew Rybchenko, Pascal Mazon

TPID handling in rte_flow VLAN and E_TAG pattern item definitions is not
consistent with the normal stacking order of pattern items, which is
confusing to applications.

The problem is that when followed by one of these layers, the EtherType field
of the preceding layer keeps its "inner" definition, and the "outer" TPID is
provided by the subsequent layer; this is the reverse of how a packet appears
on the wire:

 Wire:     [ ETH TPID = A | VLAN EtherType = B | B DATA ]
 rte_flow: [ ETH EtherType = B | VLAN TPID = A | B DATA ]

Worse, when QinQ is involved, the stacking order of VLAN layers is
unspecified. It is unclear whether it should be reversed (innermost to
outermost) as well, given that TPID applies to the previous layer:

 Wire:       [ ETH TPID = A | VLAN TPID = B | VLAN EtherType = C | C DATA ]
 rte_flow 1: [ ETH EtherType = C | VLAN TPID = B | VLAN TPID = A | C DATA ]
 rte_flow 2: [ ETH EtherType = C | VLAN TPID = A | VLAN TPID = B | C DATA ]

While specifying EtherType/TPID is hopefully rarely necessary, the stacking
order in the QinQ case and the lack of documentation remain issues.

This patch replaces TPID in the VLAN pattern item with an inner
EtherType/TPID as is usually done everywhere else (e.g. struct vlan_hdr),
clarifies documentation and updates all relevant code.
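
To make the new layout concrete, here is a minimal sketch of how a QinQ
packet could be described once this change is in place, with all
EtherType/TPID values in the same order as on the wire. Values are only
examples; masks are omitted for brevity, keeping in mind the default VLAN
mask now covers only the VID bits of TCI:

    #include <rte_byteorder.h>
    #include <rte_flow.h>

    /* Wire: [ ETH TPID 0x88a8 | VLAN TPID 0x8100 | VLAN EtherType 0x0800 | IPv4 ... ] */
    struct rte_flow_item_eth eth = { .type = RTE_BE16(0x88a8) };          /* outer TPID */
    struct rte_flow_item_vlan outer = { .inner_type = RTE_BE16(0x8100) }; /* inner TPID */
    struct rte_flow_item_vlan inner = { .inner_type = RTE_BE16(0x0800) }; /* EtherType */
    struct rte_flow_item pattern[] = {
        { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &eth },
        { .type = RTE_FLOW_ITEM_TYPE_VLAN, .spec = &outer },
        { .type = RTE_FLOW_ITEM_TYPE_VLAN, .spec = &inner },
        { .type = RTE_FLOW_ITEM_TYPE_IPV4 },
        { .type = RTE_FLOW_ITEM_TYPE_END },
    };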

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Summary of changes for PMDs that implement ETH, VLAN or E_TAG pattern
items:

- bnxt: EtherType matching is supported, and vlan->inner_type overrides
  eth->type if the latter carries the standard TPID value 0x8100; any other
  TPID value triggers an error.

- e1000: EtherType matching is only supported with the ETHERTYPE filter,
  which does not support VLAN matching, therefore no impact.

- enic: same as bnxt.

- i40e: same as bnxt with a configurable TPID value for the FDIR filter,
  with existing limitations on allowed EtherType values. The remaining
  filter types (VXLAN, NVGRE, QINQ) do not support EtherType matching.

- ixgbe: same as e1000, with additional minor change to rely on the new
  E-Tag macro definition.

- mlx4: EtherType/TPID matching is not supported, no impact.

- mlx5: same as bnxt.

- mrvl: EtherType matching is supported, but eth->type cannot be specified
  when a VLAN item is present. However, vlan->inner_type is used if
  specified.

- sfc: same as bnxt with QinQ TPID value 0x88a8 additionally supported.

- tap: same as bnxt.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Somnath Kotur <somnath.kotur@broadcom.com>
Cc: John Daley <johndale@cisco.com>
Cc: Hyong Youb Kim <hyonkim@cisco.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Tomasz Duszynski <tdu@semihalf.com>
Cc: Dmitri Epshtein <dima@marvell.com>
Cc: Natalie Samsonov <nsamsono@marvell.com>
Cc: Jianbo Liu <jianbo.liu@arm.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>

---

Hi PMD maintainers, while I'm pretty confident in these changes, I could
not validate them with all devices.

It would be great if you could apply this patch, run testpmd, create VLAN
flow rules with/without inner EtherType as described and send matching
traffic while making sure nothing was broken in the process.
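
For instance, assuming port 0 and keeping the rules deliberately simple,
testpmd commands along these lines should exercise both cases (the VID and
EtherType values and the drop action are only examples):

    testpmd> flow create 0 ingress pattern eth / vlan vid is 100 / end actions drop / end
    testpmd> flow create 0 ingress pattern eth / vlan vid is 100 inner_type is 0x0800 / end actions drop / end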

Thanks!
---
 app/test-pmd/cmdline_flow.c                 | 17 +++---
 doc/guides/nics/tap.rst                     |  2 +-
 doc/guides/prog_guide/rte_flow.rst          | 21 ++++++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  4 +-
 drivers/net/bnxt/bnxt_filter.c              | 39 +++++++++++---
 drivers/net/enic/enic_flow.c                | 22 +++++---
 drivers/net/i40e/i40e_flow.c                | 69 +++++++++++++++++++-----
 drivers/net/ixgbe/ixgbe_ethdev.c            |  3 +-
 drivers/net/mlx5/mlx5_flow.c                | 16 +++++-
 drivers/net/mvpp2/mrvl_flow.c               | 27 +++++++---
 drivers/net/sfc/sfc_flow.c                  | 28 ++++++++++
 drivers/net/tap/tap_flow.c                  | 16 ++++--
 lib/librte_ether/rte_flow.h                 | 24 ++++++---
 lib/librte_net/rte_ether.h                  |  1 +
 14 files changed, 229 insertions(+), 60 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 2fbd3d8ef..3a486032d 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -99,11 +99,11 @@ enum index {
 	ITEM_ETH_SRC,
 	ITEM_ETH_TYPE,
 	ITEM_VLAN,
-	ITEM_VLAN_TPID,
 	ITEM_VLAN_TCI,
 	ITEM_VLAN_PCP,
 	ITEM_VLAN_DEI,
 	ITEM_VLAN_VID,
+	ITEM_VLAN_INNER_TYPE,
 	ITEM_IPV4,
 	ITEM_IPV4_TOS,
 	ITEM_IPV4_TTL,
@@ -505,11 +505,11 @@ static const enum index item_eth[] = {
 };
 
 static const enum index item_vlan[] = {
-	ITEM_VLAN_TPID,
 	ITEM_VLAN_TCI,
 	ITEM_VLAN_PCP,
 	ITEM_VLAN_DEI,
 	ITEM_VLAN_VID,
+	ITEM_VLAN_INNER_TYPE,
 	ITEM_NEXT,
 	ZERO,
 };
@@ -1142,12 +1142,6 @@ static const struct token token_list[] = {
 		.next = NEXT(item_vlan),
 		.call = parse_vc,
 	},
-	[ITEM_VLAN_TPID] = {
-		.name = "tpid",
-		.help = "tag protocol identifier",
-		.next = NEXT(item_vlan, NEXT_ENTRY(UNSIGNED), item_param),
-		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vlan, tpid)),
-	},
 	[ITEM_VLAN_TCI] = {
 		.name = "tci",
 		.help = "tag control information",
@@ -1175,6 +1169,13 @@ static const struct token token_list[] = {
 		.args = ARGS(ARGS_ENTRY_MASK_HTON(struct rte_flow_item_vlan,
 						  tci, "\x0f\xff")),
 	},
+	[ITEM_VLAN_INNER_TYPE] = {
+		.name = "inner_type",
+		.help = "inner EtherType",
+		.next = NEXT(item_vlan, NEXT_ENTRY(UNSIGNED), item_param),
+		.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_vlan,
+					     inner_type)),
+	},
 	[ITEM_IPV4] = {
 		.name = "ipv4",
 		.help = "match IPv4 header",
diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index 76eb0bde4..bcf3efe9e 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -97,7 +97,7 @@ The kernel support can be checked with this command::
 Supported items:
 
 - eth: src and dst (with variable masks), and eth_type (0xffff mask).
-- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9)
+- vlan: vid, pcp, but not eid. (requires kernel 4.9)
 - ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask).
 - udp/tcp: src and dst port (0xffff) mask.
 
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 97d429ee5..c6f16d444 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -784,9 +784,15 @@ Item: ``ETH``
 
 Matches an Ethernet header.
 
+The ``type`` field either stands for "EtherType" or "TPID" when followed by
+so-called layer 2.5 pattern items such as ``RTE_FLOW_ITEM_TYPE_VLAN``. In
+the latter case, ``type`` refers to that of the outer header, with the inner
+EtherType/TPID provided by the subsequent pattern item. This is the same
+order as on the wire.
+
 - ``dst``: destination MAC.
 - ``src``: source MAC.
-- ``type``: EtherType.
+- ``type``: EtherType or TPID.
 - Default ``mask`` matches destination and source addresses only.
 
 Item: ``VLAN``
@@ -794,9 +800,13 @@ Item: ``VLAN``
 
 Matches an 802.1Q/ad VLAN tag.
 
-- ``tpid``: tag protocol identifier.
+The corresponding standard outer EtherType (TPID) values are
+``ETHER_TYPE_VLAN`` or ``ETHER_TYPE_QINQ``. It can be overridden by the
+preceding pattern item.
+
 - ``tci``: tag control information.
-- Default ``mask`` matches TCI only.
+- ``inner_type``: inner EtherType or TPID.
+- Default ``mask`` matches the VID part of TCI only (lower 12 bits).
 
 Item: ``IPV4``
 ^^^^^^^^^^^^^^
@@ -866,12 +876,15 @@ Item: ``E_TAG``
 
 Matches an IEEE 802.1BR E-Tag header.
 
-- ``tpid``: tag protocol identifier (0x893F)
+The corresponding standard outer EtherType (TPID) value is
+``ETHER_TYPE_ETAG``. It can be overridden by the preceding pattern item.
+
 - ``epcp_edei_in_ecid_b``: E-Tag control information (E-TCI), E-PCP (3b),
   E-DEI (1b), ingress E-CID base (12b).
 - ``rsvd_grp_ecid_b``: reserved (2b), GRP (2b), E-CID base (12b).
 - ``in_ecid_e``: ingress E-CID ext.
 - ``ecid_e``: E-CID ext.
+- ``inner_type``: inner EtherType or TPID.
 - Default ``mask`` simultaneously matches GRP and E-CID base.
 
 Item: ``NVGRE``
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index 738461f44..25fac8430 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3223,15 +3223,15 @@ This section lists supported pattern items and their attributes, if any.
 
   - ``dst {MAC-48}``: destination MAC.
   - ``src {MAC-48}``: source MAC.
-  - ``type {unsigned}``: EtherType.
+  - ``type {unsigned}``: EtherType or TPID.
 
 - ``vlan``: match 802.1Q/ad VLAN tag.
 
-  - ``tpid {unsigned}``: tag protocol identifier.
   - ``tci {unsigned}``: tag control information.
   - ``pcp {unsigned}``: priority code point.
   - ``dei {unsigned}``: drop eligible indicator.
   - ``vid {unsigned}``: VLAN identifier.
+  - ``inner_type {unsigned}``: inner EtherType or TPID.
 
 - ``ipv4``: match IPv4 header.
 
diff --git a/drivers/net/bnxt/bnxt_filter.c b/drivers/net/bnxt/bnxt_filter.c
index 0f9c1c9ae..51e3e8de4 100644
--- a/drivers/net/bnxt/bnxt_filter.c
+++ b/drivers/net/bnxt/bnxt_filter.c
@@ -6,6 +6,7 @@
 #include <sys/queue.h>
 
 #include <rte_byteorder.h>
+#include <rte_ether.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
 #include <rte_flow.h>
@@ -299,6 +300,7 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 	uint32_t vf = 0;
 	int use_ntuple;
 	uint32_t en = 0;
+	uint32_t en_ethertype;
 	int dflt_vnic;
 
 	use_ntuple = bnxt_filter_type_check(pattern, error);
@@ -308,6 +310,9 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 
 	filter->filter_type = use_ntuple ?
 		HWRM_CFA_NTUPLE_FILTER : HWRM_CFA_EM_FILTER;
+	en_ethertype = use_ntuple ?
+		NTUPLE_FLTR_ALLOC_INPUT_EN_ETHERTYPE :
+		EM_FLOW_ALLOC_INPUT_EN_ETHERTYPE;
 
 	while (item->type != RTE_FLOW_ITEM_TYPE_END) {
 		if (item->last) {
@@ -377,30 +382,52 @@ bnxt_validate_and_parse_flow_type(struct bnxt *bp,
 			if (eth_mask->type) {
 				filter->ethertype =
 					rte_be_to_cpu_16(eth_spec->type);
-				en |= use_ntuple ?
-					NTUPLE_FLTR_ALLOC_INPUT_EN_ETHERTYPE :
-					EM_FLOW_ALLOC_INPUT_EN_ETHERTYPE;
+				en |= en_ethertype;
 			}
 
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
+			if (en & en_ethertype &&
+			    filter->ethertype != RTE_BE16(ETHER_TYPE_VLAN)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "unsupported outer TPID");
+				return -rte_errno;
+			}
 			if (vlan_mask->tci &&
-			    vlan_mask->tci == RTE_BE16(0x0fff) &&
-			    !vlan_mask->tpid) {
+			    vlan_mask->tci == RTE_BE16(0x0fff)) {
 				/* Only the VLAN ID can be matched. */
 				filter->l2_ovlan =
 					rte_be_to_cpu_16(vlan_spec->tci &
 							 RTE_BE16(0x0fff));
 				en |= EM_FLOW_ALLOC_INPUT_EN_OVLAN_VID;
-			} else if (vlan_mask->tci || vlan_mask->tpid) {
+			} else if (vlan_mask->tci) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
 						   "VLAN mask is invalid");
 				return -rte_errno;
 			}
+			if (vlan_mask->inner_type &&
+			    vlan_mask->inner_type != RTE_BE16(0xffff)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ITEM,
+						   item,
+						   "inner ethertype mask not"
+						   " valid");
+				return -rte_errno;
+			}
+			if (vlan_mask->inner_type) {
+				filter->ethertype =
+					rte_be_to_cpu_16(vlan_spec->inner_type);
+				en |= en_ethertype;
+			} else {
+				filter->ethertype = RTE_BE16(0x0000);
+				en &= ~en_ethertype;
+			}
 
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV4:
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index c5c98b870..d645dfb63 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -4,6 +4,8 @@
 
 #include <errno.h>
 #include <stdint.h>
+#include <rte_byteorder.h>
+#include <rte_ether.h>
 #include <rte_log.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow_driver.h>
@@ -545,16 +547,22 @@ enic_copy_item_vlan_v2(const struct rte_flow_item *item,
 	if (!spec)
 		return 0;
 
-	/* Don't support filtering in tpid */
-	if (mask) {
-		if (mask->tpid != 0)
-			return ENOTSUP;
-	} else {
+	if (!mask)
 		mask = &rte_flow_item_vlan_mask;
-		RTE_ASSERT(mask->tpid == 0);
-	}
 
 	if (*inner_ofst == 0) {
+		struct ether_hdr *eth_mask =
+			(void *)gp->layer[FILTER_GENERIC_1_L2].mask;
+		struct ether_hdr *eth_val =
+			(void *)gp->layer[FILTER_GENERIC_1_L2].val;
+
+		/* Exactly one TPID value is allowed if specified */
+		if ((eth_val->ether_type & eth_mask->ether_type) !=
+		    (RTE_BE16(ETHER_TYPE_VLAN) & eth_mask->ether_type))
+			return ENOTSUP;
+		eth_mask->ether_type = mask->inner_type;
+		eth_val->ether_type = spec->inner_type;
+
 		/* Outer header. Use the vlan mask/val fields */
 		gp->mask_vlan = mask->tci;
 		gp->val_vlan = spec->tci;
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index c0a2bc4a6..e9550b0ce 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -10,6 +10,7 @@
 #include <unistd.h>
 #include <stdarg.h>
 
+#include <rte_debug.h>
 #include <rte_ether.h>
 #include <rte_ethdev_driver.h>
 #include <rte_log.h>
@@ -2491,24 +2492,36 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						      "Invalid MAC_addr mask.");
 					return -rte_errno;
 				}
+			}
+			if (eth_spec && eth_mask && eth_mask->type) {
+				enum rte_flow_item_type next = (item + 1)->type;
 
-				if ((eth_mask->type & UINT16_MAX) ==
-				    UINT16_MAX) {
-					input_set |= I40E_INSET_LAST_ETHER_TYPE;
-					filter->input.flow.l2_flow.ether_type =
-						eth_spec->type;
+				if (eth_mask->type != RTE_BE16(0xffff)) {
+					rte_flow_error_set(error, EINVAL,
+						      RTE_FLOW_ERROR_TYPE_ITEM,
+						      item,
+						      "Invalid type mask.");
+					return -rte_errno;
 				}
 
 				ether_type = rte_be_to_cpu_16(eth_spec->type);
-				if (ether_type == ETHER_TYPE_IPv4 ||
-				    ether_type == ETHER_TYPE_IPv6 ||
-				    ether_type == ETHER_TYPE_ARP ||
-				    ether_type == outer_tpid) {
+
+				if ((next == RTE_FLOW_ITEM_TYPE_VLAN &&
+				     ether_type != outer_tpid) ||
+				    (next != RTE_FLOW_ITEM_TYPE_VLAN &&
+				     (ether_type == ETHER_TYPE_IPv4 ||
+				      ether_type == ETHER_TYPE_IPv6 ||
+				      ether_type == ETHER_TYPE_ARP ||
+				      ether_type == outer_tpid))) {
 					rte_flow_error_set(error, EINVAL,
 						     RTE_FLOW_ERROR_TYPE_ITEM,
 						     item,
 						     "Unsupported ether_type.");
 					return -rte_errno;
+				} else if (next != RTE_FLOW_ITEM_TYPE_VLAN) {
+					input_set |= I40E_INSET_LAST_ETHER_TYPE;
+					filter->input.flow.l2_flow.ether_type =
+						eth_spec->type;
 				}
 			}
 
@@ -2519,6 +2532,8 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
+
+			RTE_ASSERT(!(input_set & I40E_INSET_LAST_ETHER_TYPE));
 			if (vlan_spec && vlan_mask) {
 				if (vlan_mask->tci ==
 				    rte_cpu_to_be_16(I40E_TCI_MASK)) {
@@ -2527,6 +2542,33 @@ i40e_flow_parse_fdir_pattern(struct rte_eth_dev *dev,
 						vlan_spec->tci;
 				}
 			}
+			if (vlan_spec && vlan_mask && vlan_mask->inner_type) {
+				if (vlan_mask->inner_type != RTE_BE16(0xffff)) {
+					rte_flow_error_set(error, EINVAL,
+						      RTE_FLOW_ERROR_TYPE_ITEM,
+						      item,
+						      "Invalid inner_type"
+						      " mask.");
+					return -rte_errno;
+				}
+
+				ether_type =
+					rte_be_to_cpu_16(vlan_spec->inner_type);
+
+				if (ether_type == ETHER_TYPE_IPv4 ||
+				    ether_type == ETHER_TYPE_IPv6 ||
+				    ether_type == ETHER_TYPE_ARP ||
+				    ether_type == outer_tpid) {
+					rte_flow_error_set(error, EINVAL,
+						     RTE_FLOW_ERROR_TYPE_ITEM,
+						     item,
+						     "Unsupported inner_type.");
+					return -rte_errno;
+				}
+				input_set |= I40E_INSET_LAST_ETHER_TYPE;
+				filter->input.flow.l2_flow.ether_type =
+					vlan_spec->inner_type;
+			}
 
 			pctype = I40E_FILTER_PCTYPE_L2_PAYLOAD;
 			layer_idx = I40E_FLXPLD_L2_IDX;
@@ -3285,7 +3327,8 @@ i40e_flow_parse_vxlan_pattern(__rte_unused struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
@@ -3515,7 +3558,8 @@ i40e_flow_parse_nvgre_pattern(__rte_unused struct rte_eth_dev *dev,
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ITEM,
 						   item,
@@ -4023,7 +4067,8 @@ i40e_flow_parse_qinq_pattern(__rte_unused struct rte_eth_dev *dev,
 			vlan_spec = item->spec;
 			vlan_mask = item->mask;
 
-			if (!(vlan_spec && vlan_mask)) {
+			if (!(vlan_spec && vlan_mask) ||
+			    vlan_mask->inner_type) {
 				rte_flow_error_set(error, EINVAL,
 					   RTE_FLOW_ERROR_TYPE_ITEM,
 					   item,
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 227f4c342..0d2726115 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -115,7 +115,6 @@
 
 #define IXGBE_VT_CTL_POOLING_MODE_MASK         0x00030000
 #define IXGBE_VT_CTL_POOLING_MODE_ETAG         0x00010000
-#define DEFAULT_ETAG_ETYPE                     0x893f
 #define IXGBE_ETAG_ETYPE                       0x00005084
 #define IXGBE_ETAG_ETYPE_MASK                  0x0000ffff
 #define IXGBE_ETAG_ETYPE_VALID                 0x80000000
@@ -1481,7 +1480,7 @@ static int ixgbe_l2_tn_filter_init(struct rte_eth_dev *eth_dev)
 	}
 	l2_tn_info->e_tag_en = FALSE;
 	l2_tn_info->e_tag_fwd_en = FALSE;
-	l2_tn_info->e_tag_ether_type = DEFAULT_ETAG_ETYPE;
+	l2_tn_info->e_tag_ether_type = ETHER_TYPE_ETAG;
 
 	return 0;
 }
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index bc1176819..8863e451c 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -17,7 +17,9 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
+#include <rte_byteorder.h>
 #include <rte_common.h>
+#include <rte_ether.h>
 #include <rte_eth_ctrl.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow.h>
@@ -306,6 +308,7 @@ static const struct mlx5_flow_items mlx5_flow_items[] = {
 		.actions = valid_actions,
 		.mask = &(const struct rte_flow_item_vlan){
 			.tci = -1,
+			.inner_type = -1,
 		},
 		.default_mask = &rte_flow_item_vlan_mask,
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
@@ -1285,6 +1288,7 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 	struct mlx5_flow_parse *parser = data->parser;
 	struct ibv_flow_spec_eth *eth;
 	const unsigned int eth_size = sizeof(struct ibv_flow_spec_eth);
+	const char *msg = "VLAN cannot be empty";
 
 	if (spec) {
 		unsigned int i;
@@ -1306,12 +1310,22 @@ mlx5_flow_create_vlan(const struct rte_flow_item *item,
 			 */
 			if (!eth->mask.vlan_tag)
 				goto error;
+			/* Exactly one TPID value is allowed if specified. */
+			if ((eth->val.ether_type & eth->mask.ether_type) !=
+			    (RTE_BE16(ETHER_TYPE_VLAN) &
+			     eth->mask.ether_type)) {
+				msg = "unsupported outer TPID";
+				goto error;
+			}
+			eth->val.ether_type = spec->inner_type;
+			eth->mask.ether_type = mask->inner_type;
+			eth->val.ether_type &= eth->mask.ether_type;
 		}
 		return 0;
 	}
 error:
 	return rte_flow_error_set(data->error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM,
-				  item, "VLAN cannot be empty");
+				  item, msg);
 }
 
 /**
diff --git a/drivers/net/mvpp2/mrvl_flow.c b/drivers/net/mvpp2/mrvl_flow.c
index 8fd4dbfb1..6604a411f 100644
--- a/drivers/net/mvpp2/mrvl_flow.c
+++ b/drivers/net/mvpp2/mrvl_flow.c
@@ -1091,12 +1091,6 @@ mrvl_parse_vlan(const struct rte_flow_item *item,
 	if (ret)
 		return ret;
 
-	if (mask->tpid) {
-		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM,
-				   NULL, "Not supported by classifier\n");
-		return -rte_errno;
-	}
-
 	m = rte_be_to_cpu_16(mask->tci);
 	if (m & MRVL_VLAN_ID_MASK) {
 		RTE_LOG(WARNING, PMD, "vlan id mask is ignored\n");
@@ -1112,6 +1106,27 @@ mrvl_parse_vlan(const struct rte_flow_item *item,
 			goto out;
 	}
 
+	if (flow->pattern & F_TYPE) {
+		rte_flow_error_set(error, ENOTSUP,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "outer TPID cannot be explicitly matched"
+				   " when VLAN item is also specified\n");
+		return -rte_errno;
+	}
+	if (mask->inner_type) {
+		struct rte_flow_item_eth spec_eth = {
+			.type = spec->inner_type,
+		};
+		struct rte_flow_item_eth mask_eth = {
+			.type = mask->inner_type,
+		};
+
+		RTE_LOG(WARNING, PMD, "inner eth type mask is ignored\n");
+		ret = mrvl_parse_type(spec_eth, mask_eth, flow);
+		if (ret)
+			goto out;
+	}
+
 	return 0;
 out:
 	rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index bc4974edf..f61d4ec92 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -7,6 +7,7 @@
  * for Solarflare) and Solarflare Communications, Inc.
  */
 
+#include <rte_byteorder.h>
 #include <rte_tailq.h>
 #include <rte_common.h>
 #include <rte_ethdev_driver.h>
@@ -351,6 +352,7 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
 	const struct rte_flow_item_vlan *mask = NULL;
 	const struct rte_flow_item_vlan supp_mask = {
 		.tci = rte_cpu_to_be_16(ETH_VLAN_ID_MAX),
+		.inner_type = RTE_BE16(0xffff),
 	};
 
 	rc = sfc_flow_parse_init(item,
@@ -393,6 +395,32 @@ sfc_flow_parse_vlan(const struct rte_flow_item *item,
 		return -rte_errno;
 	}
 
+	/*
+	 * If an EtherType was already specified, make sure it is a valid
+	 * TPID for the current VLAN layer before overwriting it with the
+	 * specified inner type.
+	 */
+	if (efx_spec->efs_match_flags & EFX_FILTER_MATCH_ETHER_TYPE &&
+	    efx_spec->efs_ether_type != RTE_BE16(ETHER_TYPE_VLAN) &&
+	    efx_spec->efs_ether_type != RTE_BE16(ETHER_TYPE_QINQ)) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "Unsupported outer TPID");
+		return -rte_errno;
+	}
+	if (!mask->inner_type) {
+		efx_spec->efs_match_flags &= ~EFX_FILTER_MATCH_ETHER_TYPE;
+		efx_spec->efs_ether_type = RTE_BE16(0x0000);
+	} else if (mask->inner_type == supp_mask.inner_type) {
+		efx_spec->efs_match_flags |= EFX_FILTER_MATCH_ETHER_TYPE;
+		efx_spec->efs_ether_type = rte_bswap16(spec->inner_type);
+	} else {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "Bad mask for VLAN inner_type");
+		return -rte_errno;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index e5eb50fc5..e53eff6ce 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -270,13 +270,13 @@ static const struct tap_flow_items tap_flow_items[] = {
 		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
 			       RTE_FLOW_ITEM_TYPE_IPV6),
 		.mask = &(const struct rte_flow_item_vlan){
-			.tpid = -1,
 			/* DEI matching is not supported */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 			.tci = 0xffef,
 #else
 			.tci = 0xefff,
 #endif
+			.inner_type = -1,
 		},
 		.mask_sz = sizeof(struct rte_flow_item_vlan),
 		.default_mask = &rte_flow_item_vlan_mask,
@@ -578,13 +578,21 @@ tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
 	/* use default mask if none provided */
 	if (!mask)
 		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
-	/* TC does not support tpid masking. Only accept if exact match. */
-	if (mask->tpid && mask->tpid != 0xffff)
+	/* check that previous eth type is compatible with VLAN */
+	if (info->eth_type && info->eth_type != RTE_BE16(ETH_P_8021Q))
 		return -1;
 	/* Double-tagging not supported. */
-	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
+	if (info->vlan)
 		return -1;
 	info->vlan = 1;
+	if (mask->inner_type) {
+		/* TC does not support partial eth_type masking */
+		if (mask->inner_type != RTE_BE16(0xffff))
+			return -1;
+		info->eth_type = spec->inner_type;
+	} else {
+		info->eth_type = 0;
+	}
 	if (!flow)
 		return 0;
 	msg = &flow->msg;
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index fc7e6705d..b13b0e2e6 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -454,11 +454,17 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
  * RTE_FLOW_ITEM_TYPE_ETH
  *
  * Matches an Ethernet header.
+ *
+ * The @p type field either stands for "EtherType" or "TPID" when followed
+ * by so-called layer 2.5 pattern items such as RTE_FLOW_ITEM_TYPE_VLAN. In
+ * the latter case, @p type refers to that of the outer header, with the
+ * inner EtherType/TPID provided by the subsequent pattern item. This is the
+ * same order as on the wire.
  */
 struct rte_flow_item_eth {
 	struct ether_addr dst; /**< Destination MAC. */
 	struct ether_addr src; /**< Source MAC. */
-	rte_be16_t type; /**< EtherType. */
+	rte_be16_t type; /**< EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_ETH. */
@@ -475,19 +481,20 @@ static const struct rte_flow_item_eth rte_flow_item_eth_mask = {
  *
  * Matches an 802.1Q/ad VLAN tag.
  *
- * This type normally follows either RTE_FLOW_ITEM_TYPE_ETH or
- * RTE_FLOW_ITEM_TYPE_VLAN.
+ * The corresponding standard outer EtherType (TPID) values are
+ * ETHER_TYPE_VLAN or ETHER_TYPE_QINQ. It can be overridden by the preceding
+ * pattern item.
  */
 struct rte_flow_item_vlan {
-	rte_be16_t tpid; /**< Tag protocol identifier. */
 	rte_be16_t tci; /**< Tag control information. */
+	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_VLAN. */
 #ifndef __cplusplus
 static const struct rte_flow_item_vlan rte_flow_item_vlan_mask = {
-	.tpid = RTE_BE16(0x0000),
-	.tci = RTE_BE16(0xffff),
+	.tci = RTE_BE16(0x0fff),
+	.inner_type = RTE_BE16(0x0000),
 };
 #endif
 
@@ -636,9 +643,11 @@ static const struct rte_flow_item_vxlan rte_flow_item_vxlan_mask = {
  * RTE_FLOW_ITEM_TYPE_E_TAG.
  *
  * Matches a E-tag header.
+ *
+ * The corresponding standard outer EtherType (TPID) value is
+ * ETHER_TYPE_ETAG. It can be overridden by the preceding pattern item.
  */
 struct rte_flow_item_e_tag {
-	rte_be16_t tpid; /**< Tag protocol identifier (0x893F). */
 	/**
 	 * E-Tag control information (E-TCI).
 	 * E-PCP (3b), E-DEI (1b), ingress E-CID base (12b).
@@ -648,6 +657,7 @@ struct rte_flow_item_e_tag {
 	rte_be16_t rsvd_grp_ecid_b;
 	uint8_t in_ecid_e; /**< Ingress E-CID ext. */
 	uint8_t ecid_e; /**< E-CID ext. */
+	rte_be16_t inner_type; /**< Inner EtherType or TPID. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_E_TAG. */
diff --git a/lib/librte_net/rte_ether.h b/lib/librte_net/rte_ether.h
index 45daa911a..a271d1c86 100644
--- a/lib/librte_net/rte_ether.h
+++ b/lib/librte_net/rte_ether.h
@@ -301,6 +301,7 @@ struct vxlan_hdr {
 #define ETHER_TYPE_RARP 0x8035 /**< Reverse Arp Protocol. */
 #define ETHER_TYPE_VLAN 0x8100 /**< IEEE 802.1Q VLAN tagging. */
 #define ETHER_TYPE_QINQ 0x88A8 /**< IEEE 802.1ad QinQ tagging. */
+#define ETHER_TYPE_ETAG 0x893F /**< IEEE 802.1BR E-Tag. */
 #define ETHER_TYPE_1588 0x88F7 /**< IEEE 802.1AS 1588 Precise Time Protocol. */
 #define ETHER_TYPE_SLOW 0x8809 /**< Slow protocols (LACP and Marker). */
 #define ETHER_TYPE_TEB  0x6558 /**< Transparent Ethernet Bridging. */
-- 
2.11.0
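
The VLAN and E-tag items above lose their "tpid" field: the outer TPID is
now matched through the EtherType of the preceding item, while the
encapsulated EtherType moves to the new "inner_type" field. As a minimal
sketch of hypothetical application code (not part of this patch), a
pattern matching 802.1Q frames with VLAN ID 42 carrying IPv4 could look
like this under the new layout:

#include <rte_byteorder.h>
#include <rte_ether.h>
#include <rte_flow.h>

/* ETH item: match any MAC, outer TPID 0x8100 (802.1Q). */
static const struct rte_flow_item_eth eth_spec = {
	.type = RTE_BE16(ETHER_TYPE_VLAN),
};
static const struct rte_flow_item_eth eth_mask = {
	.type = RTE_BE16(0xffff),
};

/* VLAN item: match VID 42 and the encapsulated EtherType (IPv4). */
static const struct rte_flow_item_vlan vlan_spec = {
	.tci = RTE_BE16(42),
	.inner_type = RTE_BE16(ETHER_TYPE_IPv4),
};
static const struct rte_flow_item_vlan vlan_mask = {
	.tci = RTE_BE16(0x0fff), /* VID bits only, PCP/DEI ignored */
	.inner_type = RTE_BE16(0xffff),
};

static const struct rte_flow_item pattern[] = {
	{ .type = RTE_FLOW_ITEM_TYPE_ETH,
	  .spec = &eth_spec, .mask = &eth_mask },
	{ .type = RTE_FLOW_ITEM_TYPE_VLAN,
	  .spec = &vlan_spec, .mask = &vlan_mask },
	{ .type = RTE_FLOW_ITEM_TYPE_IPV4 },
	{ .type = RTE_FLOW_ITEM_TYPE_END },
};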

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v2 09/15] ethdev: add encap level to RSS flow API action
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (5 preceding siblings ...)
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 08/15] ethdev: add hash function to RSS flow API action Adrien Mazarguil
@ 2018-04-06 13:25  3%   ` Adrien Mazarguil
  2018-04-07  8:27  0%     ` Andrew Rybchenko
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 10/15] ethdev: refine TPID handling in flow API Adrien Mazarguil
                     ` (6 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

RSS hash types (ETH_RSS_* macros defined in rte_ethdev.h) describe the
protocol header fields of a packet that must be taken into account while
computing RSS.

When facing encapsulated (e.g. tunneled) packets, it is ambiguous whether
these types should apply to inner or outer headers. Applications need the
ability to tell exactly "where" RSS must be performed.

This is addressed by adding encapsulation level information to the RSS flow
action. Its default value is 0 and stands for the usual unspecified
behavior. Other values provide a specific encapsulation level.
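
As a minimal sketch of hypothetical application code (not part of this
patch), a flow rule whose pattern matches a tunnel (e.g. VXLAN) could
request RSS on the inner frames by setting the new field to 2:

#include <rte_common.h>
#include <rte_ethdev.h>
#include <rte_flow.h>

static const uint16_t queue[] = { 0, 1, 2, 3 };

static const struct rte_flow_action_rss rss = {
	.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
	.level = 2, /* hash the inner (encapsulated) packet */
	.types = ETH_RSS_IP | ETH_RSS_UDP,
	.queue_num = RTE_DIM(queue),
	.queue = queue,
};

static const struct rte_flow_action actions[] = {
	{ .type = RTE_FLOW_ACTION_TYPE_RSS, .conf = &rss },
	{ .type = RTE_FLOW_ACTION_TYPE_END },
};

The testpmd flow command exposes the same field through the "level
{unsigned}" token added below.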

Contrary to the change announced by commit 676b605182a5 ("doc: announce
ethdev API change for RSS configuration"), this patch does not affect
struct rte_eth_rss_conf but struct rte_flow_action_rss as the former is not
used anymore by the RSS flow action. ABI impact is therefore limited to
rte_flow.

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 13 ++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 24 ++++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 ++
 drivers/net/e1000/igb_flow.c                |  4 ++++
 drivers/net/e1000/igb_rxtx.c                |  2 ++
 drivers/net/i40e/i40e_ethdev.c              |  2 ++
 drivers/net/i40e/i40e_flow.c                |  4 ++++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  2 ++
 drivers/net/mlx4/mlx4_flow.c                |  6 ++++++
 drivers/net/mlx5/mlx5_flow.c                | 11 ++++++++++
 drivers/net/sfc/sfc_flow.c                  |  3 +++
 drivers/net/tap/tap_flow.c                  |  6 +++++-
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 26 ++++++++++++++++++++++++
 16 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 23e10d623..2fbd3d8ef 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -167,6 +167,7 @@ enum index {
 	ACTION_COUNT,
 	ACTION_RSS,
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_FUNC_DEFAULT,
 	ACTION_RSS_FUNC_TOEPLITZ,
 	ACTION_RSS_FUNC_SIMPLE_XOR,
@@ -638,6 +639,7 @@ static const enum index action_queue[] = {
 
 static const enum index action_rss[] = {
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -1616,6 +1618,16 @@ static const struct token token_list[] = {
 		.help = "simple XOR hash function",
 		.call = parse_vc_action_rss_func,
 	},
+	[ACTION_RSS_LEVEL] = {
+		.name = "level",
+		.help = "encapsulation level for \"types\"",
+		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, level),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     level))),
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "RSS hash types",
@@ -2107,6 +2119,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
 			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+			.level = 0,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index b258c93e8..c0fefe475 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1085,6 +1085,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 7a97ced2a..97d429ee5 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1305,6 +1305,28 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
+Also, regarding packet encapsulation ``level``:
+
+- ``0`` requests the default behavior. Depending on the packet type, it can
+  mean outermost, innermost, anything in between or even no RSS.
+
+  It basically stands for the innermost encapsulation level RSS can be
+  performed on according to PMD and device capabilities.
+
+- ``1`` requests RSS to be performed on the outermost packet encapsulation
+  level.
+
+- ``2`` and subsequent values request RSS to be performed on the specified
+   inner packet encapsulation level, from outermost to innermost (lower to
+   higher values).
+
+Values other than ``0`` are not necessarily supported.
+
+Requesting a specific RSS level on unrecognized traffic results in undefined
+behavior. For predictable results, it is recommended to make the flow rule
+pattern match packet headers up to the requested encapsulation level so that
+only matching traffic goes through.
+
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1314,6 +1336,8 @@ field only, both can be requested simultaneously.
    +===============+====================================+
    | ``func``      | RSS hash function to apply         |
    +---------------+------------------------------------+
+   | ``level``     | encapsulation level for ``types``  |
+   +---------------+------------------------------------+
    | ``types``     | RSS hash types (see ``ETH_RSS_*``) |
    +---------------+------------------------------------+
    | ``key_len``   | hash key length in bytes           |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index d9d68ad9b..738461f44 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3401,6 +3401,8 @@ This section lists supported actions and their attributes, if any.
   - ``func {hash function}``: RSS hash function to apply, allowed tokens are
     the same as `set_hash_global_config`_.
 
+  - ``level {unsigned}``: encapsulation level for ``types``.
+
   - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
     are the same as `set_hash_input_set`_, an empty list means none (0).
 
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 747c524f5..13f6f2a28 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1314,6 +1314,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index d5c1cd3d3..a3776a0d7 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2906,6 +2906,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2921,6 +2922,7 @@ igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 5e313950c..b104b551c 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11975,6 +11975,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -11990,6 +11991,7 @@ i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 0a6ed0f2e..c0a2bc4a6 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4330,6 +4330,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 10056a0f7..67d22b382 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2783,6 +2783,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index e17f5a433..23af21712 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5683,6 +5683,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5698,6 +5699,7 @@ ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index dcaf8df44..779641e11 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -796,6 +796,11 @@ mlx4_flow_prepare(struct priv *priv,
 					" is Toeplitz";
 				goto exit_action_not_supported;
 			}
+			if (rss->level) {
+				msg = "a nonzero RSS encapsulation level is"
+					" not supported";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1290,6 +1295,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 0771ad339..bc1176819 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -644,6 +644,14 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " function is Toeplitz");
 				return -rte_errno;
 			}
+			if (rss->level) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "a nonzero RSS encapsulation"
+						   " level is not supported");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,6 +702,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
 				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+				.level = 0,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1927,6 +1936,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2442,6 +2452,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index dbe4c2baa..bc4974edf 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1264,6 +1264,9 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	if (rss->func)
 		return -EINVAL;
 
+	if (rss->level)
+		return -EINVAL;
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 3d91da216..e5eb50fc5 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,11 +2055,15 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
-	/* Check supported hash functions */
+	/* Check supported RSS features */
 	if (rss->func)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "a nonzero RSS encapsulation level is not supported");
 
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 0a2c0ac00..1f247d656 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -331,6 +331,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 66cadc74e..fc7e6705d 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1040,6 +1040,32 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
+	/**
+	 * Packet encapsulation level RSS hash @p types apply to.
+	 *
+	 * - @p 0 requests the default behavior. Depending on the packet
+	 *   type, it can mean outermost, innermost, anything in between or
+	 *   even no RSS.
+	 *
+	 *   It basically stands for the innermost encapsulation level RSS
+	 *   can be performed on according to PMD and device capabilities.
+	 *
+	 * - @p 1 requests RSS to be performed on the outermost packet
+	 *   encapsulation level.
+	 *
+	 * - @p 2 and subsequent values request RSS to be performed on the
+	 *   specified inner packet encapsulation level, from outermost to
+	 *   innermost (lower to higher values).
+	 *
+	 * Values other than @p 0 are not necessarily supported.
+	 *
+	 * Requesting a specific RSS level on unrecognized traffic results
+	 * in undefined behavior. For predictable results, it is recommended
+	 * to make the flow rule pattern match packet headers up to the
+	 * requested encapsulation level so that only matching traffic goes
+	 * through.
+	 */
+	uint32_t level;
 	uint64_t types; /**< RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v2 08/15] ethdev: add hash function to RSS flow API action
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (4 preceding siblings ...)
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in " Adrien Mazarguil
@ 2018-04-06 13:25  2%   ` Adrien Mazarguil
  2018-04-06 15:41  0%     ` Andrew Rybchenko
  2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 09/15] ethdev: add encap level " Adrien Mazarguil
                     ` (7 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

By definition, RSS involves some kind of hash algorithm, usually Toeplitz.

Until now it could not be modified on a per-flow-rule basis, and PMDs had
to always assume RTE_ETH_HASH_FUNCTION_DEFAULT, which remains the default
behavior when left unspecified (0).
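
As a rough sketch of hypothetical application code (not part of this
patch), a rule can now request Toeplitz explicitly instead of relying on
the PMD default, provided the PMD accepts it:

#include <rte_common.h>
#include <rte_ethdev.h>
#include <rte_flow.h>

static const uint16_t queue[] = { 0, 1, 2, 3 };

static const struct rte_flow_action_rss rss = {
	.func = RTE_ETH_HASH_FUNCTION_TOEPLITZ, /* instead of DEFAULT (0) */
	.types = ETH_RSS_NONFRAG_IPV4_TCP,
	.queue_num = RTE_DIM(queue),
	.queue = queue,
};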

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 72 ++++++++++++++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          |  2 +
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  3 +
 drivers/net/e1000/igb_flow.c                |  4 ++
 drivers/net/e1000/igb_rxtx.c                |  4 +-
 drivers/net/i40e/i40e_ethdev.c              |  4 +-
 drivers/net/i40e/i40e_flow.c                |  4 ++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  4 +-
 drivers/net/mlx4/mlx4_flow.c                |  7 +++
 drivers/net/mlx5/mlx5_flow.c                | 13 +++++
 drivers/net/sfc/sfc_flow.c                  |  3 +
 drivers/net/tap/tap_flow.c                  |  6 ++
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 |  2 +
 16 files changed, 131 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 0322f36c4..23e10d623 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -14,6 +14,7 @@
 #include <sys/socket.h>
 
 #include <rte_common.h>
+#include <rte_eth_ctrl.h>
 #include <rte_ethdev.h>
 #include <rte_byteorder.h>
 #include <cmdline_parse.h>
@@ -165,6 +166,10 @@ enum index {
 	ACTION_DROP,
 	ACTION_COUNT,
 	ACTION_RSS,
+	ACTION_RSS_FUNC,
+	ACTION_RSS_FUNC_DEFAULT,
+	ACTION_RSS_FUNC_TOEPLITZ,
+	ACTION_RSS_FUNC_SIMPLE_XOR,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
 	ACTION_RSS_KEY,
@@ -632,6 +637,7 @@ static const enum index action_queue[] = {
 };
 
 static const enum index action_rss[] = {
+	ACTION_RSS_FUNC,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -666,6 +672,9 @@ static int parse_vc_conf(struct context *, const struct token *,
 static int parse_vc_action_rss(struct context *, const struct token *,
 			       const char *, unsigned int, void *,
 			       unsigned int);
+static int parse_vc_action_rss_func(struct context *, const struct token *,
+				    const char *, unsigned int, void *,
+				    unsigned int);
 static int parse_vc_action_rss_type(struct context *, const struct token *,
 				    const char *, unsigned int, void *,
 				    unsigned int);
@@ -1584,6 +1593,29 @@ static const struct token token_list[] = {
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
+	[ACTION_RSS_FUNC] = {
+		.name = "func",
+		.help = "RSS hash function to apply",
+		.next = NEXT(action_rss,
+			     NEXT_ENTRY(ACTION_RSS_FUNC_DEFAULT,
+					ACTION_RSS_FUNC_TOEPLITZ,
+					ACTION_RSS_FUNC_SIMPLE_XOR)),
+	},
+	[ACTION_RSS_FUNC_DEFAULT] = {
+		.name = "default",
+		.help = "default hash function",
+		.call = parse_vc_action_rss_func,
+	},
+	[ACTION_RSS_FUNC_TOEPLITZ] = {
+		.name = "toeplitz",
+		.help = "Toeplitz hash function",
+		.call = parse_vc_action_rss_func,
+	},
+	[ACTION_RSS_FUNC_SIMPLE_XOR] = {
+		.name = "simple_xor",
+		.help = "simple XOR hash function",
+		.call = parse_vc_action_rss_func,
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "RSS hash types",
@@ -2074,6 +2106,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	action_rss_data = ctx->object;
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
+			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
@@ -2111,6 +2144,45 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 }
 
 /**
+ * Parse func field for RSS action.
+ *
+ * The RTE_ETH_HASH_FUNCTION_* value to assign is derived from the
+ * ACTION_RSS_FUNC_* index that called this function.
+ */
+static int
+parse_vc_action_rss_func(struct context *ctx, const struct token *token,
+			 const char *str, unsigned int len,
+			 void *buf, unsigned int size)
+{
+	struct action_rss_data *action_rss_data;
+	enum rte_eth_hash_function func;
+
+	(void)buf;
+	(void)size;
+	/* Token name must match. */
+	if (parse_default(ctx, token, str, len, NULL, 0) < 0)
+		return -1;
+	switch (ctx->curr) {
+	case ACTION_RSS_FUNC_DEFAULT:
+		func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+		break;
+	case ACTION_RSS_FUNC_TOEPLITZ:
+		func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+		break;
+	case ACTION_RSS_FUNC_SIMPLE_XOR:
+		func = RTE_ETH_HASH_FUNCTION_SIMPLE_XOR;
+		break;
+	default:
+		return -1;
+	}
+	if (!ctx->object)
+		return len;
+	action_rss_data = ctx->object;
+	action_rss_data->conf.func = func;
+	return len;
+}
+
+/**
  * Parse type field for RSS action.
  *
  * Valid tokens are type field names and the "end" token.
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 717f31774..b258c93e8 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1084,6 +1084,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
+				.func = src.rss->func,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 5ce041d91..7a97ced2a 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1312,6 +1312,8 @@ field only, both can be requested simultaneously.
    +---------------+------------------------------------+
    | Field         | Value                              |
    +===============+====================================+
+   | ``func``      | RSS hash function to apply         |
+   +---------------+------------------------------------+
    | ``types``     | RSS hash types (see ``ETH_RSS_*``) |
    +---------------+------------------------------------+
    | ``key_len``   | hash key length in bytes           |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index a015d02a4..d9d68ad9b 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3398,6 +3398,9 @@ This section lists supported actions and their attributes, if any.
 
 - ``rss``: spread packets among several queues.
 
+  - ``func {hash function}``: RSS hash function to apply, allowed tokens are
+    the same as `set_hash_global_config`_.
+
   - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
     are the same as `set_hash_input_set`_, an empty list means none (0).
 
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 8dc5f75f2..747c524f5 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1310,6 +1310,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 45bb3455c..d5c1cd3d3 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2905,6 +2905,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2919,7 +2920,8 @@ int
 igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 0242b5d59..5e313950c 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11974,6 +11974,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -11988,7 +11989,8 @@ int
 i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 1c09f8121..0a6ed0f2e 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4326,6 +4326,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 4e31c7c56..10056a0f7 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2779,6 +2779,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
+	if (rss->func)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "non-default RSS hash functions are not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 94ea7444d..e17f5a433 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5682,6 +5682,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 	    in->queue_num > RTE_DIM(out->queue))
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
+		.func = in->func,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5696,7 +5697,8 @@ int
 ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
-	return (comp->types == with->types &&
+	return (comp->func == with->func &&
+		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
 		!memcmp(comp->key, with->key, with->key_len) &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 4dbcaa39c..dcaf8df44 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -790,6 +790,12 @@ mlx4_flow_prepare(struct priv *priv,
 					" of the context size";
 				goto exit_action_not_supported;
 			}
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				msg = "the only supported RSS hash function"
+					" is Toeplitz";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1283,6 +1289,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 7798052f9..0771ad339 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@
 #endif
 
 #include <rte_common.h>
+#include <rte_eth_ctrl.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
@@ -634,6 +635,15 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			if (overlap & FATE)
 				goto exit_action_overlap;
 			overlap |= FATE;
+			if (rss->func &&
+			    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "the only supported RSS hash"
+						   " function is Toeplitz");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -683,6 +693,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				}
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
+				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1915,6 +1926,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	/* Copy configuration. */
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2429,6 +2441,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	};
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 1a2c0299c..dbe4c2baa 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1261,6 +1261,9 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 			rxq_hw_index_max = rxq->hw_index;
 	}
 
+	if (rss->func)
+		return -EINVAL;
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 78f20913f..3d91da216 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,6 +2055,12 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
+	/* Check supported hash functions */
+	if (rss->func)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "non-default RSS hash functions are not supported");
+
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
 	if (err < 0) {
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 2fabc9a29..0a2c0ac00 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -330,6 +330,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
+				.func = src.rss->func,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index e2eba9c26..66cadc74e 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -19,6 +19,7 @@
 
 #include <rte_arp.h>
 #include <rte_ether.h>
+#include <rte_eth_ctrl.h>
 #include <rte_icmp.h>
 #include <rte_ip.h>
 #include <rte_sctp.h>
@@ -1038,6 +1039,7 @@ struct rte_flow_query_count {
  * both can be requested simultaneously.
  */
 struct rte_flow_action_rss {
+	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
 	uint64_t types; /**< RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in flow API
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (3 preceding siblings ...)
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 06/15] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
@ 2018-04-06 13:25  1%   ` Adrien Mazarguil
  2018-04-07  9:05  0%     ` Andrew Rybchenko
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 08/15] ethdev: add hash function to RSS flow API action Adrien Mazarguil
                     ` (8 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon, Radu Nicolau, Akhil Goyal

Since its inception, the rte_flow RSS action has been relying in part on
external struct rte_eth_rss_conf for compatibility with the legacy RSS API.
This structure lacks parameters such as the hash algorithm to use, and more
recently, a method to tell which layer RSS should be performed on [1].

Given struct rte_eth_rss_conf will never be flexible enough to represent a
complete RSS configuration (e.g. RETA table), this patch supersedes it by
extending the rte_flow RSS action directly.

A subsequent patch will add a field to select a non-default RSS hash
algorithm. To that end, the field formerly known as "rss_hf", which stood
for "RSS hash functions" and was therefore confusing, is replaced by a
field named "types". Actual RSS hash function types are defined by enum
rte_eth_hash_function.

This patch updates all PMDs and example applications accordingly.
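
As a rough sketch of hypothetical application code (not part of this
patch), the hash types, key and queue list that previously went through a
separate struct rte_eth_rss_conf are now carried directly by the action:

#include <rte_common.h>
#include <rte_ethdev.h>
#include <rte_flow.h>

static const uint16_t queue[] = { 0, 1 };

static const struct rte_flow_action_rss rss = {
	.types = ETH_RSS_IP,          /* formerly rss_conf->rss_hf */
	.queue_num = RTE_DIM(queue),  /* formerly num */
	.queue = queue,
	/* key/key_len left 0/NULL: keep the PMD's default hash key */
};

static const struct rte_flow_action actions[] = {
	{ .type = RTE_FLOW_ACTION_TYPE_RSS, .conf = &rss },
	{ .type = RTE_FLOW_ACTION_TYPE_END },
};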

It breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

[1] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
Cc: Radu Nicolau <radu.nicolau@intel.com>
Cc: Akhil Goyal <akhil.goyal@nxp.com>
---
 app/test-pmd/cmdline_flow.c        |  59 +++++-----
 app/test-pmd/config.c              |  39 +++----
 doc/guides/prog_guide/rte_flow.rst |  22 ++--
 drivers/net/e1000/e1000_ethdev.h   |  13 ++-
 drivers/net/e1000/igb_ethdev.c     |   4 +-
 drivers/net/e1000/igb_flow.c       |  31 ++---
 drivers/net/e1000/igb_rxtx.c       |  51 +++++++--
 drivers/net/i40e/i40e_ethdev.c     |  53 +++++++--
 drivers/net/i40e/i40e_ethdev.h     |  15 ++-
 drivers/net/i40e/i40e_flow.c       |  47 ++++----
 drivers/net/ixgbe/ixgbe_ethdev.c   |   4 +-
 drivers/net/ixgbe/ixgbe_ethdev.h   |  13 ++-
 drivers/net/ixgbe/ixgbe_flow.c     |  30 ++---
 drivers/net/ixgbe/ixgbe_rxtx.c     |  51 +++++++--
 drivers/net/mlx4/mlx4.c            |   2 +-
 drivers/net/mlx4/mlx4_flow.c       |  61 +++++-----
 drivers/net/mlx4/mlx4_flow.h       |   2 +-
 drivers/net/mlx4/mlx4_rxq.c        |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h       |   2 +-
 drivers/net/mlx5/mlx5_flow.c       | 193 +++++++++++++++-----------------
 drivers/net/mlx5/mlx5_rxq.c        |  22 ++--
 drivers/net/mlx5/mlx5_rxtx.h       |  26 +++--
 drivers/net/sfc/sfc_flow.c         |  21 ++--
 drivers/net/tap/tap_flow.c         |   8 +-
 examples/ipsec-secgw/ipsec.c       |  10 +-
 lib/librte_ether/rte_flow.c        |  39 +++----
 lib/librte_ether/rte_flow.h        |   6 +-
 27 files changed, 473 insertions(+), 353 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 16227e752..0322f36c4 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -192,9 +192,8 @@ enum index {
 /** Storage for struct rte_flow_action_rss including external data. */
 struct action_rss_data {
 	struct rte_flow_action_rss conf;
+	uint8_t key[RSS_HASH_KEY_LENGTH];
 	uint16_t queue[ACTION_RSS_QUEUE_NUM];
-	struct rte_eth_rss_conf rss_conf;
-	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -1602,21 +1601,21 @@ static const struct token token_list[] = {
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
 		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
 			     ARGS_ENTRY_ARB
-			     (offsetof(struct action_rss_data, rss_conf) +
-			      offsetof(struct rte_eth_rss_conf, rss_key_len),
-			      sizeof(((struct rte_eth_rss_conf *)0)->
-				     rss_key_len)),
-			     ARGS_ENTRY(struct action_rss_data, rss_key)),
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, key_len),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     key_len)),
+			     ARGS_ENTRY(struct action_rss_data, key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (offsetof(struct action_rss_data, rss_conf) +
-			      offsetof(struct rte_eth_rss_conf, rss_key_len),
-			      sizeof(((struct rte_eth_rss_conf *)0)->
-				     rss_key_len),
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, key_len),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     key_len),
 			      0,
 			      RSS_HASH_KEY_LENGTH)),
 	},
@@ -2075,30 +2074,36 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	action_rss_data = ctx->object;
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->rss_conf,
-			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.types = rss_hf,
+			.key_len = sizeof(action_rss_data->key),
+			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.key = action_rss_data->key,
 			.queue = action_rss_data->queue,
 		},
+		.key = "testpmd's default RSS hash key",
 		.queue = { 0 },
-		.rss_conf = (struct rte_eth_rss_conf){
-			.rss_key = action_rss_data->rss_key,
-			.rss_key_len = sizeof(action_rss_data->rss_key),
-			.rss_hf = rss_hf,
-		},
-		.rss_key = "testpmd's default RSS hash key",
 	};
-	for (i = 0; i < action_rss_data->conf.num; ++i)
+	for (i = 0; i < action_rss_data->conf.queue_num; ++i)
 		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
-		if (rte_eth_dev_rss_hash_conf_get
-		    (ctx->port, &action_rss_data->rss_conf) < 0) {
+		struct rte_eth_rss_conf rss_conf = {
+			.rss_key = action_rss_data->key,
+			.rss_key_len = sizeof(action_rss_data->key),
+		};
+
+		if (rte_eth_dev_rss_hash_conf_get(ctx->port, &rss_conf) < 0) {
 			struct rte_eth_dev_info info;
 
 			rte_eth_dev_info_get(ctx->port, &info);
-			action_rss_data->rss_conf.rss_key_len =
-				RTE_MIN(sizeof(action_rss_data->rss_key),
+			action_rss_data->conf.key_len =
+				RTE_MIN(sizeof(action_rss_data->key),
 					info.hash_key_size);
+		} else {
+			action_rss_data->conf.types = rss_conf.rss_hf;
+			action_rss_data->conf.key_len =
+				RTE_MIN(sizeof(action_rss_data->key),
+					rss_conf.rss_key_len);
 		}
 	}
 	action->conf = &action_rss_data->conf;
@@ -2126,7 +2131,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->rss_conf.rss_hf = 0;
+		action_rss_data->conf.types = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2145,7 +2150,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->conf.types |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2195,7 +2200,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue_num = i;
 	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 052163357..717f31774 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1084,40 +1084,27 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
-				.num = src.rss->num,
+				.types = src.rss->types,
+				.key_len = src.rss->key_len,
+				.queue_num = src.rss->queue_num,
 			};
 		off += sizeof(*src.rss);
-		if (src.rss->num) {
+		if (src.rss->key_len) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->queue) * src.rss->num;
+			size = sizeof(*src.rss->key) * src.rss->key_len;
 			if (dst.rss)
-				dst.rss->queue = memcpy
+				dst.rss->key = memcpy
 					((void *)((uintptr_t)dst.rss + off),
-					 src.rss->queue, size);
+					 src.rss->key, size);
 			off += size;
 		}
-		off = RTE_ALIGN_CEIL(off, sizeof(double));
-		if (dst.rss) {
-			dst.rss->rss_conf = (void *)((uintptr_t)dst.rss + off);
-			*(struct rte_eth_rss_conf *)(uintptr_t)
-				dst.rss->rss_conf = (struct rte_eth_rss_conf){
-				.rss_key_len = src.rss->rss_conf->rss_key_len,
-				.rss_hf = src.rss->rss_conf->rss_hf,
-			};
-		}
-		off += sizeof(*src.rss->rss_conf);
-		if (src.rss->rss_conf->rss_key_len) {
+		if (src.rss->queue_num) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->rss_conf->rss_key) *
-				src.rss->rss_conf->rss_key_len;
-			if (dst.rss) {
-				((struct rte_eth_rss_conf *)(uintptr_t)
-				 dst.rss->rss_conf)->rss_key =
-					(void *)((uintptr_t)dst.rss + off);
-				memcpy(dst.rss->rss_conf->rss_key,
-				       src.rss->rss_conf->rss_key,
-				       size);
-			}
+			size = sizeof(*src.rss->queue) * src.rss->queue_num;
+			if (dst.rss)
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		size = off;
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index acbeaacbd..5ce041d91 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1309,15 +1309,19 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+--------------------------------+
-   | Field        | Value                          |
-   +==============+================================+
-   | ``rss_conf`` | RSS parameters                 |
-   +--------------+--------------------------------+
-   | ``num``      | number of entries in ``queue`` |
-   +--------------+--------------------------------+
-   | ``queue``    | queue indices to use           |
-   +--------------+--------------------------------+
+   +---------------+------------------------------------+
+   | Field         | Value                              |
+   +===============+====================================+
+   | ``types``     | RSS hash types (see ``ETH_RSS_*``) |
+   +---------------+------------------------------------+
+   | ``key_len``   | hash key length in bytes           |
+   +---------------+------------------------------------+
+   | ``queue_num`` | number of entries in ``queue``     |
+   +---------------+------------------------------------+
+   | ``key``       | hash key                           |
+   +---------------+------------------------------------+
+   | ``queue``     | queue indices to use               |
+   +---------------+------------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 6354b894a..902001f36 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -4,6 +4,10 @@
 
 #ifndef _E1000_ETHDEV_H_
 #define _E1000_ETHDEV_H_
+
+#include <stdint.h>
+
+#include <rte_flow.h>
 #include <rte_time.h>
 #include <rte_pci.h>
 
@@ -27,6 +31,7 @@
 #define E1000_CTRL_EXT_EXTEND_VLAN  (1<<26)    /* EXTENDED VLAN */
 #define IGB_VFTA_SIZE 128
 
+#define IGB_HKEY_MAX_INDEX             10
 #define IGB_MAX_RX_QUEUE_NUM           8
 #define IGB_MAX_RX_QUEUE_NUM_82576     16
 
@@ -229,8 +234,8 @@ struct igb_ethertype_filter {
 };
 
 struct igb_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
+	uint8_t key[IGB_HKEY_MAX_INDEX * sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[IGB_MAX_RX_QUEUE_NUM]; /**< Queues indices to use. */
 };
 
@@ -501,6 +506,10 @@ int eth_igb_syn_filter_set(struct rte_eth_dev *dev,
 int eth_igb_add_del_flex_filter(struct rte_eth_dev *dev,
 			struct rte_eth_flex_filter *filter,
 			bool add);
+int igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
+		      const struct rte_flow_action_rss *in);
+int igb_action_rss_same(const struct rte_flow_action_rss *comp,
+			const struct rte_flow_action_rss *with);
 int igb_config_rss_filter(struct rte_eth_dev *dev,
 			struct igb_rte_flow_rss_conf *conf,
 			bool add);
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 8d4226676..7a431ac33 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -41,8 +41,6 @@
 #define IGB_DEFAULT_TX_HTHRESH      1
 #define IGB_DEFAULT_TX_WTHRESH      ((hw->mac.type == e1000_82576) ? 1 : 16)
 
-#define IGB_HKEY_MAX_INDEX 10
-
 /* Bit shift and mask */
 #define IGB_4_BIT_WIDTH  (CHAR_BIT / 2)
 #define IGB_4_BIT_MASK   RTE_LEN2MASK(IGB_4_BIT_WIDTH, uint8_t)
@@ -5576,7 +5574,7 @@ igb_rss_filter_restore(struct rte_eth_dev *dev)
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		igb_config_rss_filter(dev, &filter_info->rss_info, TRUE);
 }
 
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index c0f5b5190..8dc5f75f2 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1292,7 +1292,7 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 
 	rss = (const struct rte_flow_action_rss *)act->conf;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -1300,7 +1300,7 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -1310,14 +1310,18 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		}
 	}
 
-	if (rss->rss_conf)
-		rss_conf->rss_conf = *rss->rss_conf;
-	else
-		rss_conf->rss_conf.rss_hf = IGB_RSS_OFFLOAD_ALL;
-
-	for (n = 0; n < rss->num; ++n)
-		rss_conf->queue[n] = rss->queue[n];
-	rss_conf->num = rss->num;
+	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key must be exactly 40 bytes");
+	if (rss->queue_num > RTE_DIM(rss_conf->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (igb_rss_conf_init(rss_conf, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
 	/* check if the next not void item is END */
 	index++;
@@ -1518,9 +1522,8 @@ igb_flow_create(struct rte_eth_dev *dev,
 				PMD_DRV_LOG(ERR, "failed to allocate memory");
 				goto out;
 			}
-			rte_memcpy(&rss_filter_ptr->filter_info,
-				&rss_conf,
-				sizeof(struct igb_rte_flow_rss_conf));
+			igb_rss_conf_init(&rss_filter_ptr->filter_info,
+					  &rss_conf.conf);
 			TAILQ_INSERT_TAIL(&igb_filter_rss_list,
 				rss_filter_ptr, entries);
 			flow->rule = rss_filter_ptr;
@@ -1757,7 +1760,7 @@ igb_clear_rss_filter(struct rte_eth_dev *dev)
 	struct e1000_filter_info *filter =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter->rss_info.num)
+	if (filter->rss_info.conf.queue_num)
 		igb_config_rss_filter(dev, &filter->rss_info, FALSE);
 }
 
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 323913f0d..45bb3455c 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2898,12 +2898,47 @@ igb_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 }
 
 int
+igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
+		  const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+igb_action_rss_same(const struct rte_flow_action_rss *comp,
+		    const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 igb_config_rss_filter(struct rte_eth_dev *dev,
 		struct igb_rte_flow_rss_conf *conf, bool add)
 {
 	uint32_t shift;
 	uint16_t i, j;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct e1000_filter_info *filter_info =
 		E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
@@ -2911,8 +2946,8 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
 	if (!add) {
-		if (memcmp(conf, &filter_info->rss_info,
-			sizeof(struct igb_rte_flow_rss_conf)) == 0) {
+		if (igb_action_rss_same(&filter_info->rss_info.conf,
+					&conf->conf)) {
 			igb_rss_disable(dev);
 			memset(&filter_info->rss_info, 0,
 				sizeof(struct igb_rte_flow_rss_conf));
@@ -2921,7 +2956,7 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		return -EINVAL;
 
 	/* Fill in redirection table. */
@@ -2933,9 +2968,9 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		} reta;
 		uint8_t q_idx;
 
-		if (j == conf->num)
+		if (j == conf->conf.queue_num)
 			j = 0;
-		q_idx = conf->queue[j];
+		q_idx = conf->conf.queue[j];
 		reta.bytes[i & 3] = (uint8_t)(q_idx << shift);
 		if ((i & 3) == 3)
 			E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
@@ -2952,8 +2987,8 @@ igb_config_rss_filter(struct rte_eth_dev *dev,
 		rss_conf.rss_key = rss_intel_key; /* Default hash key */
 	igb_hw_rss_hash_set(hw, &rss_conf);
 
-	rte_memcpy(&filter_info->rss_info,
-		conf, sizeof(struct igb_rte_flow_rss_conf));
+	if (igb_rss_conf_init(&filter_info->rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 6e06f8a2b..0242b5d59 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11,6 +11,7 @@
 #include <inttypes.h>
 #include <assert.h>
 
+#include <rte_common.h>
 #include <rte_eal.h>
 #include <rte_string_fns.h>
 #include <rte_pci.h>
@@ -11467,7 +11468,7 @@ i40e_rss_filter_restore(struct i40e_pf *pf)
 {
 	struct i40e_rte_flow_rss_conf *conf =
 					&pf->rss_info;
-	if (conf->num)
+	if (conf->conf.queue_num)
 		i40e_config_rss_filter(pf, conf, TRUE);
 }
 
@@ -11966,18 +11967,52 @@ i40e_cloud_filter_qinq_create(struct i40e_pf *pf)
 }
 
 int
+i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
+		   const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+i40e_action_rss_same(const struct rte_flow_action_rss *comp,
+		     const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 i40e_config_rss_filter(struct i40e_pf *pf,
 		struct i40e_rte_flow_rss_conf *conf, bool add)
 {
 	struct i40e_hw *hw = I40E_PF_TO_HW(pf);
 	uint32_t i, lut = 0;
 	uint16_t j, num;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct i40e_rte_flow_rss_conf *rss_info = &pf->rss_info;
 
 	if (!add) {
-		if (memcmp(conf, rss_info,
-			sizeof(struct i40e_rte_flow_rss_conf)) == 0) {
+		if (i40e_action_rss_same(&rss_info->conf, &conf->conf)) {
 			i40e_pf_disable_rss(pf);
 			memset(rss_info, 0,
 				sizeof(struct i40e_rte_flow_rss_conf));
@@ -11986,7 +12021,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 		return -EINVAL;
 	}
 
-	if (rss_info->num)
+	if (rss_info->conf.queue_num)
 		return -EINVAL;
 
 	/* If both VMDQ and RSS enabled, not all of PF queues are configured.
@@ -11997,7 +12032,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 	else
 		num = pf->dev_data->nb_rx_queues;
 
-	num = RTE_MIN(num, conf->num);
+	num = RTE_MIN(num, conf->conf.queue_num);
 	PMD_DRV_LOG(INFO, "Max of contiguous %u PF queues are configured",
 			num);
 
@@ -12010,7 +12045,7 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 	for (i = 0, j = 0; i < hw->func_caps.rss_table_size; i++, j++) {
 		if (j == num)
 			j = 0;
-		lut = (lut << 8) | (conf->queue[j] & ((0x1 <<
+		lut = (lut << 8) | (conf->conf.queue[j] & ((0x1 <<
 			hw->func_caps.rss_table_entry_width) - 1));
 		if ((i & 3) == 3)
 			I40E_WRITE_REG(hw, I40E_PFQF_HLUT(i >> 2), lut);
@@ -12035,8 +12070,8 @@ i40e_config_rss_filter(struct i40e_pf *pf,
 
 	i40e_hw_rss_hash_set(pf, &rss_conf);
 
-	rte_memcpy(rss_info,
-		conf, sizeof(struct i40e_rte_flow_rss_conf));
+	if (i40e_rss_conf_init(rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/i40e/i40e_ethdev.h b/drivers/net/i40e/i40e_ethdev.h
index 151ed1a8c..5c02b37a0 100644
--- a/drivers/net/i40e/i40e_ethdev.h
+++ b/drivers/net/i40e/i40e_ethdev.h
@@ -5,13 +5,18 @@
 #ifndef _I40E_ETHDEV_H_
 #define _I40E_ETHDEV_H_
 
+#include <stdint.h>
+
 #include <rte_eth_ctrl.h>
 #include <rte_time.h>
 #include <rte_kvargs.h>
 #include <rte_hash.h>
+#include <rte_flow.h>
 #include <rte_flow_driver.h>
 #include <rte_tm_driver.h>
 
+#include "base/i40e_register.h"
+
 #define I40E_VLAN_TAG_SIZE        4
 
 #define I40E_AQ_LEN               32
@@ -877,9 +882,11 @@ struct i40e_customized_pctype {
 };
 
 struct i40e_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
 	uint16_t queue_region_conf; /**< Queue region config flag */
-	uint16_t num; /**< Number of entries in queue[]. */
+	uint8_t key[((I40E_VFQF_HKEY_MAX_INDEX > I40E_PFQF_HKEY_MAX_INDEX ?
+		      I40E_VFQF_HKEY_MAX_INDEX : I40E_PFQF_HKEY_MAX_INDEX) + 1) *
+		    sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[I40E_MAX_Q_PER_TC]; /**< Queues indices to use. */
 };
 
@@ -1217,6 +1224,10 @@ void i40e_init_queue_region_conf(struct rte_eth_dev *dev);
 void i40e_flex_payload_reg_set_default(struct i40e_hw *hw);
 int i40e_set_rss_key(struct i40e_vsi *vsi, uint8_t *key, uint8_t key_len);
 int i40e_set_rss_lut(struct i40e_vsi *vsi, uint8_t *lut, uint16_t lut_size);
+int i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
+		       const struct rte_flow_action_rss *in);
+int i40e_action_rss_same(const struct rte_flow_action_rss *comp,
+			 const struct rte_flow_action_rss *with);
 int i40e_config_rss_filter(struct i40e_pf *pf,
 		struct i40e_rte_flow_rss_conf *conf, bool add);
 
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 79e05c2cc..1c09f8121 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4207,7 +4207,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 
 	if (action_flag) {
 		for (n = 0; n < 64; n++) {
-			if (rss->rss_conf->rss_hf & (hf_bit << n)) {
+			if (rss->types & (hf_bit << n)) {
 				conf_info->region[0].hw_flowtype[0] = n;
 				conf_info->region[0].flowtype_num = 1;
 				conf_info->queue_region_number = 1;
@@ -4219,8 +4219,8 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	for (n = 0; n < conf_info->queue_region_number; n++) {
 		if (conf_info->region[n].user_priority_num ||
 				conf_info->region[n].flowtype_num) {
-			if (!((rte_is_power_of_2(rss->num)) &&
-					rss->num <= 64)) {
+			if (!((rte_is_power_of_2(rss->queue_num)) &&
+					rss->queue_num <= 64)) {
 				PMD_DRV_LOG(ERR, "The region sizes should be any of the following values: 1, 2, 4, 8, 16, 32, 64 as long as the "
 				"total number of queues do not exceed the VSI allocation");
 				return -rte_errno;
@@ -4238,10 +4238,11 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 				return -rte_errno;
 			}
 
-			if (rss_info->num < rss->num ||
-				rss_info->queue[0] < rss->queue[0] ||
-				(rss->queue[0] + rss->num >
-					rss_info->num + rss_info->queue[0])) {
+			if (rss_info->conf.queue_num < rss->queue_num ||
+				rss_info->conf.queue[0] < rss->queue[0] ||
+				(rss->queue[0] + rss->queue_num >
+					rss_info->conf.queue_num +
+					rss_info->queue[0])) {
 				rte_flow_error_set(error, EINVAL,
 					RTE_FLOW_ERROR_TYPE_ACTION,
 					act,
@@ -4250,7 +4251,8 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			}
 
 			for (i = 0; i < info->queue_region_number; i++) {
-				if (info->region[i].queue_num == rss->num &&
+				if (info->region[i].queue_num ==
+				    rss->queue_num &&
 					info->region[i].queue_start_index ==
 						rss->queue[0])
 					break;
@@ -4263,7 +4265,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 				}
 
 				info->region[i].queue_num =
-					rss->num;
+					rss->queue_num;
 				info->region[i].queue_start_index =
 					rss->queue[0];
 				info->region[i].region_id =
@@ -4306,7 +4308,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 	if (rss_config->queue_region_conf)
 		return 0;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -4314,7 +4316,7 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -4323,15 +4325,20 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 			return -rte_errno;
 		}
 	}
-	if (rss->rss_conf)
-		rss_config->rss_conf = *rss->rss_conf;
-	else
-		rss_config->rss_conf.rss_hf =
-			pf->adapter->flow_types_mask;
 
-	for (n = 0; n < rss->num; ++n)
-		rss_config->queue[n] = rss->queue[n];
-	rss_config->num = rss->num;
+	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key too large");
+	if (rss->queue_num > RTE_DIM(rss_config->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (i40e_rss_conf_init(rss_config, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
+
 	index++;
 
 	/* check if the next not void action is END */
@@ -4851,7 +4858,7 @@ i40e_flow_flush_rss_filter(struct rte_eth_dev *dev)
 
 	ret = i40e_flush_queue_region_all_conf(dev, hw, pf, 0);
 
-	if (rss_info->num)
+	if (rss_info->conf.queue_num)
 		ret = i40e_config_rss_filter(pf, rss_info, FALSE);
 	return ret;
 }
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index fbc048f7d..227f4c342 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -100,8 +100,6 @@
 
 #define IXGBE_QUEUE_STAT_COUNTERS (sizeof(hw_stats->qprc) / sizeof(hw_stats->qprc[0]))
 
-#define IXGBE_HKEY_MAX_INDEX 10
-
 /* Additional timesync values. */
 #define NSEC_PER_SEC             1000000000L
 #define IXGBE_INCVAL_10GB        0x66666666
@@ -8272,7 +8270,7 @@ ixgbe_rss_filter_restore(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		ixgbe_config_rss_filter(dev,
 			&filter_info->rss_info, TRUE);
 }
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index 655077700..9491b03f4 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -4,6 +4,9 @@
 
 #ifndef _IXGBE_ETHDEV_H_
 #define _IXGBE_ETHDEV_H_
+
+#include <stdint.h>
+
 #include "base/ixgbe_type.h"
 #include "base/ixgbe_dcb.h"
 #include "base/ixgbe_dcb_82599.h"
@@ -12,6 +15,7 @@
 #ifdef RTE_LIBRTE_SECURITY
 #include "ixgbe_ipsec.h"
 #endif
+#include <rte_flow.h>
 #include <rte_time.h>
 #include <rte_hash.h>
 #include <rte_pci.h>
@@ -39,6 +43,7 @@
 #define IXGBE_EXTENDED_VLAN	  (uint32_t)(1 << 26) /* EXTENDED VLAN ENABLE */
 #define IXGBE_VFTA_SIZE 128
 #define IXGBE_VLAN_TAG_SIZE 4
+#define IXGBE_HKEY_MAX_INDEX 10
 #define IXGBE_MAX_RX_QUEUE_NUM	128
 #define IXGBE_MAX_INTR_QUEUE_NUM	15
 #define IXGBE_VMDQ_DCB_NB_QUEUES     IXGBE_MAX_RX_QUEUE_NUM
@@ -196,8 +201,8 @@ struct ixgbe_hw_fdir_info {
 };
 
 struct ixgbe_rte_flow_rss_conf {
-	struct rte_eth_rss_conf rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss conf; /**< RSS parameters. */
+	uint8_t key[IXGBE_HKEY_MAX_INDEX * sizeof(uint32_t)]; /* Hash key. */
 	uint16_t queue[IXGBE_MAX_RX_QUEUE_NUM]; /**< Queues indices to use. */
 };
 
@@ -696,6 +701,10 @@ void ixgbe_tm_conf_init(struct rte_eth_dev *dev);
 void ixgbe_tm_conf_uninit(struct rte_eth_dev *dev);
 int ixgbe_set_queue_rate_limit(struct rte_eth_dev *dev, uint16_t queue_idx,
 			       uint16_t tx_rate);
+int ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
+			const struct rte_flow_action_rss *in);
+int ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
+			  const struct rte_flow_action_rss *with);
 int ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		struct ixgbe_rte_flow_rss_conf *conf, bool add);
 
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index abdeac28b..4e31c7c56 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2761,7 +2761,7 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 
 	rss = (const struct rte_flow_action_rss *)act->conf;
 
-	if (!rss || !rss->num) {
+	if (!rss || !rss->queue_num) {
 		rte_flow_error_set(error, EINVAL,
 				RTE_FLOW_ERROR_TYPE_ACTION,
 				act,
@@ -2769,7 +2769,7 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return -rte_errno;
 	}
 
-	for (n = 0; n < rss->num; n++) {
+	for (n = 0; n < rss->queue_num; n++) {
 		if (rss->queue[n] >= dev->data->nb_rx_queues) {
 			rte_flow_error_set(error, EINVAL,
 				   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -2778,14 +2778,19 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 			return -rte_errno;
 		}
 	}
-	if (rss->rss_conf)
-		rss_conf->rss_conf = *rss->rss_conf;
-	else
-		rss_conf->rss_conf.rss_hf = IXGBE_RSS_OFFLOAD_ALL;
 
-	for (n = 0; n < rss->num; ++n)
-		rss_conf->queue[n] = rss->queue[n];
-	rss_conf->num = rss->num;
+	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS hash key must be exactly 40 bytes");
+	if (rss->queue_num > RTE_DIM(rss_conf->queue))
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "too many queues for RSS context");
+	if (ixgbe_rss_conf_init(rss_conf, rss))
+		return rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "RSS context initialization failure");
 
 	/* check if the next not void item is END */
 	act = next_no_void_action(actions, act);
@@ -2834,7 +2839,7 @@ ixgbe_clear_rss_filter(struct rte_eth_dev *dev)
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		ixgbe_config_rss_filter(dev, &filter_info->rss_info, FALSE);
 }
 
@@ -3153,9 +3158,8 @@ ixgbe_flow_create(struct rte_eth_dev *dev,
 				PMD_DRV_LOG(ERR, "failed to allocate memory");
 				goto out;
 			}
-			rte_memcpy(&rss_filter_ptr->filter_info,
-				&rss_conf,
-				sizeof(struct ixgbe_rte_flow_rss_conf));
+			ixgbe_rss_conf_init(&rss_filter_ptr->filter_info,
+					    &rss_conf.conf);
 			TAILQ_INSERT_TAIL(&filter_rss_list,
 				rss_filter_ptr, entries);
 			flow->rule = rss_filter_ptr;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 7511e183f..94ea7444d 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5675,6 +5675,36 @@ ixgbevf_dev_rxtx_start(struct rte_eth_dev *dev)
 }
 
 int
+ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
+		    const struct rte_flow_action_rss *in)
+{
+	if (in->key_len > RTE_DIM(out->key) ||
+	    in->queue_num > RTE_DIM(out->queue))
+		return -EINVAL;
+	out->conf = (struct rte_flow_action_rss){
+		.types = in->types,
+		.key_len = in->key_len,
+		.queue_num = in->queue_num,
+		.key = memcpy(out->key, in->key, in->key_len),
+		.queue = memcpy(out->queue, in->queue,
+				sizeof(*in->queue) * in->queue_num),
+	};
+	return 0;
+}
+
+int
+ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
+		      const struct rte_flow_action_rss *with)
+{
+	return (comp->types == with->types &&
+		comp->key_len == with->key_len &&
+		comp->queue_num == with->queue_num &&
+		!memcmp(comp->key, with->key, with->key_len) &&
+		!memcmp(comp->queue, with->queue,
+			sizeof(*with->queue) * with->queue_num));
+}
+
+int
 ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		struct ixgbe_rte_flow_rss_conf *conf, bool add)
 {
@@ -5684,7 +5714,12 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	uint16_t j;
 	uint16_t sp_reta_size;
 	uint32_t reta_reg;
-	struct rte_eth_rss_conf rss_conf = conf->rss_conf;
+	struct rte_eth_rss_conf rss_conf = {
+		.rss_key = conf->conf.key_len ?
+			(void *)(uintptr_t)conf->conf.key : NULL,
+		.rss_key_len = conf->conf.key_len,
+		.rss_hf = conf->conf.types,
+	};
 	struct ixgbe_filter_info *filter_info =
 		IXGBE_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
 
@@ -5694,8 +5729,8 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	sp_reta_size = ixgbe_reta_size_get(hw->mac.type);
 
 	if (!add) {
-		if (memcmp(conf, &filter_info->rss_info,
-			sizeof(struct ixgbe_rte_flow_rss_conf)) == 0) {
+		if (ixgbe_action_rss_same(&filter_info->rss_info.conf,
+					  &conf->conf)) {
 			ixgbe_rss_disable(dev);
 			memset(&filter_info->rss_info, 0,
 				sizeof(struct ixgbe_rte_flow_rss_conf));
@@ -5704,7 +5739,7 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-	if (filter_info->rss_info.num)
+	if (filter_info->rss_info.conf.queue_num)
 		return -EINVAL;
 	/* Fill in redirection table
 	 * The byte-swap is needed because NIC registers are in
@@ -5714,9 +5749,9 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 	for (i = 0, j = 0; i < sp_reta_size; i++, j++) {
 		reta_reg = ixgbe_reta_reg_get(hw->mac.type, i);
 
-		if (j == conf->num)
+		if (j == conf->conf.queue_num)
 			j = 0;
-		reta = (reta << 8) | conf->queue[j];
+		reta = (reta << 8) | conf->conf.queue[j];
 		if ((i & 3) == 3)
 			IXGBE_WRITE_REG(hw, reta_reg,
 					rte_bswap32(reta));
@@ -5733,8 +5768,8 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 		rss_conf.rss_key = rss_intel_key; /* Default hash key */
 	ixgbe_hw_rss_hash_set(hw, &rss_conf);
 
-	rte_memcpy(&filter_info->rss_info,
-		conf, sizeof(struct ixgbe_rte_flow_rss_conf));
+	if (ixgbe_rss_conf_init(&filter_info->rss_info, &conf->conf))
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index fb8a8b848..c7854bead 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -569,7 +569,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			     " for UDP RSS and inner VXLAN RSS");
 			/* Fake support for all possible RSS hash fields. */
 			priv->hw_rss_sup = ~UINT64_C(0);
-			priv->hw_rss_sup = mlx4_conv_rss_hf(priv, -1);
+			priv->hw_rss_sup = mlx4_conv_rss_types(priv, -1);
 			/* Filter out known unsupported fields. */
 			priv->hw_rss_sup &=
 				~(uint64_t)(IBV_RX_HASH_SRC_PORT_UDP |
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 5a1b7dedd..4dbcaa39c 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -76,22 +76,22 @@ struct mlx4_drop {
 };
 
 /**
- * Convert DPDK RSS hash fields to their Verbs equivalent.
+ * Convert DPDK RSS hash types to their Verbs equivalent.
  *
- * This function returns the supported (default) set when @p rss_hf has
+ * This function returns the supported (default) set when @p types has
  * special value (uint64_t)-1.
  *
  * @param priv
  *   Pointer to private structure.
- * @param rss_hf
- *   Hash fields in DPDK format (see struct rte_eth_rss_conf).
+ * @param types
+ *   Hash types in DPDK format (see struct rte_eth_rss_conf).
  *
  * @return
  *   A valid Verbs RSS hash fields mask for mlx4 on success, (uint64_t)-1
  *   otherwise and rte_errno is set.
  */
 uint64_t
-mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
+mlx4_conv_rss_types(struct priv *priv, uint64_t types)
 {
 	enum { IPV4, IPV6, TCP, UDP, };
 	const uint64_t in[] = {
@@ -126,17 +126,17 @@ mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf)
 	unsigned int i;
 
 	for (i = 0; i != RTE_DIM(in); ++i)
-		if (rss_hf & in[i]) {
-			seen |= rss_hf & in[i];
+		if (types & in[i]) {
+			seen |= types & in[i];
 			conv |= out[i];
 		}
 	if ((conv & priv->hw_rss_sup) == conv) {
-		if (rss_hf == (uint64_t)-1) {
+		if (types == (uint64_t)-1) {
 			/* Include inner RSS by default if supported. */
 			conv |= priv->hw_rss_sup & IBV_RX_HASH_INNER;
 			return conv;
 		}
-		if (!(rss_hf & ~seen))
+		if (!(types & ~seen))
 			return conv;
 	}
 	rte_errno = ENOTSUP;
@@ -717,7 +717,8 @@ mlx4_flow_prepare(struct priv *priv,
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
-			const struct rte_eth_rss_conf *rss_conf;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
 			uint64_t fields;
 			unsigned int i;
 
@@ -747,58 +748,56 @@ mlx4_flow_prepare(struct priv *priv,
 				break;
 			rss = action->conf;
 			/* Default RSS configuration if none is provided. */
-			rss_conf =
-				rss->rss_conf ?
-				rss->rss_conf :
-				&(struct rte_eth_rss_conf){
-					.rss_key = mlx4_rss_hash_key_default,
-					.rss_key_len = MLX4_RSS_HASH_KEY_SIZE,
-					.rss_hf = -1,
-				};
+			if (rss->key_len) {
+				rss_key = rss->key;
+				rss_key_len = rss->key_len;
+			} else {
+				rss_key = mlx4_rss_hash_key_default;
+				rss_key_len = MLX4_RSS_HASH_KEY_SIZE;
+			}
 			/* Sanity checks. */
-			for (i = 0; i < rss->num; ++i)
+			for (i = 0; i < rss->queue_num; ++i)
 				if (rss->queue[i] >=
 				    priv->dev->data->nb_rx_queues)
 					break;
-			if (i != rss->num) {
+			if (i != rss->queue_num) {
 				msg = "queue index target beyond number of"
 					" configured Rx queues";
 				goto exit_action_not_supported;
 			}
-			if (!rte_is_power_of_2(rss->num)) {
+			if (!rte_is_power_of_2(rss->queue_num)) {
 				msg = "for RSS, mlx4 requires the number of"
 					" queues to be a power of two";
 				goto exit_action_not_supported;
 			}
-			if (rss_conf->rss_key_len !=
-			    sizeof(flow->rss->key)) {
+			if (rss_key_len != sizeof(flow->rss->key)) {
 				msg = "mlx4 supports exactly one RSS hash key"
 					" length: "
 					MLX4_STR_EXPAND(MLX4_RSS_HASH_KEY_SIZE);
 				goto exit_action_not_supported;
 			}
-			for (i = 1; i < rss->num; ++i)
+			for (i = 1; i < rss->queue_num; ++i)
 				if (rss->queue[i] - rss->queue[i - 1] != 1)
 					break;
-			if (i != rss->num) {
+			if (i != rss->queue_num) {
 				msg = "mlx4 requires RSS contexts to use"
 					" consecutive queue indices only";
 				goto exit_action_not_supported;
 			}
-			if (rss->queue[0] % rss->num) {
+			if (rss->queue[0] % rss->queue_num) {
 				msg = "mlx4 requires the first queue of a RSS"
 					" context to be aligned on a multiple"
 					" of the context size";
 				goto exit_action_not_supported;
 			}
 			rte_errno = 0;
-			fields = mlx4_conv_rss_hf(priv, rss_conf->rss_hf);
+			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
 				msg = "unsupported RSS hash type requested";
 				goto exit_action_not_supported;
 			}
 			flow->rss = mlx4_rss_get
-				(priv, fields, rss_conf->rss_key, rss->num,
+				(priv, fields, rss_key, rss->queue_num,
 				 rss->queue);
 			if (!flow->rss) {
 				msg = "either invalid parameters or not enough"
@@ -1284,8 +1283,10 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
+		.types = -1,
+		.key_len = MLX4_RSS_HASH_KEY_SIZE,
+		.queue_num = queues,
+		.key = mlx4_rss_hash_key_default,
 		.queue = queue,
 	};
 	struct rte_flow_action actions[] = {
diff --git a/drivers/net/mlx4/mlx4_flow.h b/drivers/net/mlx4/mlx4_flow.h
index 00188a65c..f71078ecc 100644
--- a/drivers/net/mlx4/mlx4_flow.h
+++ b/drivers/net/mlx4/mlx4_flow.h
@@ -47,7 +47,7 @@ struct rte_flow {
 
 /* mlx4_flow.c */
 
-uint64_t mlx4_conv_rss_hf(struct priv *priv, uint64_t rss_hf);
+uint64_t mlx4_conv_rss_types(struct priv *priv, uint64_t rss_hf);
 int mlx4_flow_sync(struct priv *priv, struct rte_flow_error *error);
 void mlx4_flow_clean(struct priv *priv);
 int mlx4_filter_ctrl(struct rte_eth_dev *dev,
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 7a036ed83..474614e4d 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -88,7 +88,7 @@ mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE] = {
  */
 struct mlx4_rss *
 mlx4_rss_get(struct priv *priv, uint64_t fields,
-	     uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+	     const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 	     uint16_t queues, const uint16_t queue_id[])
 {
 	struct mlx4_rss *rss;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index dd46ac006..521267724 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -126,7 +126,7 @@ uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE];
 int mlx4_rss_init(struct priv *priv);
 void mlx4_rss_deinit(struct priv *priv);
 struct mlx4_rss *mlx4_rss_get(struct priv *priv, uint64_t fields,
-			      uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
+			      const uint8_t key[MLX4_RSS_HASH_KEY_SIZE],
 			      uint16_t queues, const uint16_t queue_id[]);
 void mlx4_rss_put(struct mlx4_rss *rss);
 int mlx4_rss_attach(struct mlx4_rss *rss);
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index a52dcf263..7798052f9 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -214,9 +214,8 @@ struct rte_flow {
 	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
 	uint32_t mark:1; /**< Set if the flow is marked. */
 	uint32_t drop:1; /**< Drop queue. */
-	uint16_t queues_n; /**< Number of entries in queue[]. */
+	struct rte_flow_action_rss rss_conf; /**< RSS configuration */
 	uint16_t (*queues)[]; /**< Queues indexes to use. */
-	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	struct ibv_counter_set *cs; /**< Holds the counters for the rule. */
 	struct mlx5_flow_counter_stats counter_stats;/**<The counter stats. */
@@ -406,9 +405,8 @@ struct mlx5_flow_parse {
 	uint32_t mark:1; /**< Mark is present in the flow. */
 	uint32_t count:1; /**< Count is present in the flow. */
 	uint32_t mark_id; /**< Mark identifier. */
+	struct rte_flow_action_rss rss_conf; /**< RSS configuration */
 	uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queues indexes to use. */
-	uint16_t queues_n; /**< Number of entries in queue[]. */
-	struct rte_eth_rss_conf rss_conf; /**< RSS configuration */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	enum hash_rxq_type layer; /**< Last pattern layer detected. */
 	struct ibv_counter_set *cs; /**< Holds the counter set for the rule */
@@ -532,47 +530,6 @@ mlx5_flow_item_validate(const struct rte_flow_item *item,
 }
 
 /**
- * Copy the RSS configuration from the user ones, of the rss_conf is null,
- * uses the driver one.
- *
- * @param parser
- *   Internal parser structure.
- * @param rss_conf
- *   User RSS configuration to save.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_flow_convert_rss_conf(struct mlx5_flow_parse *parser,
-			   const struct rte_eth_rss_conf *rss_conf)
-{
-	/*
-	 * This function is also called at the beginning of
-	 * mlx5_flow_convert_actions() to initialize the parser with the
-	 * device default RSS configuration.
-	 */
-	if (rss_conf) {
-		if (rss_conf->rss_hf & MLX5_RSS_HF_MASK) {
-			rte_errno = EINVAL;
-			return -rte_errno;
-		}
-		if (rss_conf->rss_key_len != 40) {
-			rte_errno = EINVAL;
-			return -rte_errno;
-		}
-		if (rss_conf->rss_key_len && rss_conf->rss_key) {
-			parser->rss_conf.rss_key_len = rss_conf->rss_key_len;
-			memcpy(parser->rss_key, rss_conf->rss_key,
-			       rss_conf->rss_key_len);
-			parser->rss_conf.rss_key = parser->rss_key;
-		}
-		parser->rss_conf.rss_hf = rss_conf->rss_hf;
-	}
-	return 0;
-}
-
-/**
  * Extract attribute to the parser.
  *
  * @param[in] attr
@@ -642,17 +599,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	enum { FATE = 1, MARK = 2, COUNT = 4, };
 	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
-	int ret;
 
-	/*
-	 * Add default RSS configuration necessary for Verbs to create QP even
-	 * if no RSS is necessary.
-	 */
-	ret = mlx5_flow_convert_rss_conf(parser,
-					 (const struct rte_eth_rss_conf *)
-					 &priv->rss_conf);
-	if (ret)
-		return ret;
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
@@ -671,25 +618,53 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			parser->queues_n = 1;
 			parser->queues[0] = queue->index;
+			parser->rss_conf = (struct rte_flow_action_rss){
+				.queue_num = 1,
+				.queue = parser->queues,
+			};
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
+			const uint8_t *rss_key;
+			uint32_t rss_key_len;
 			uint16_t n;
 
 			if (overlap & FATE)
 				goto exit_action_overlap;
 			overlap |= FATE;
-			if (!rss || !rss->num) {
+			if (rss->types & MLX5_RSS_HF_MASK) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "unsupported RSS type"
+						   " requested");
+				return -rte_errno;
+			}
+			if (rss->key_len) {
+				rss_key_len = rss->key_len;
+				rss_key = rss->key;
+			} else {
+				rss_key_len = rss_hash_default_key_len;
+				rss_key = rss_hash_default_key;
+			}
+			if (rss_key_len != RTE_DIM(parser->rss_key)) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "RSS hash key must be"
+						   " exactly 40 bytes long");
+				return -rte_errno;
+			}
+			if (!rss->queue_num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (rss->num > RTE_DIM(parser->queues)) {
+			if (rss->queue_num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
@@ -697,7 +672,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " context");
 				return -rte_errno;
 			}
-			for (n = 0; n < rss->num; ++n) {
+			for (n = 0; n < rss->queue_num; ++n) {
 				if (rss->queue[n] >= priv->rxqs_n) {
 					rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -707,16 +682,16 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 					return -rte_errno;
 				}
 			}
-			for (n = 0; n < rss->num; ++n)
-				parser->queues[n] = rss->queue[n];
-			parser->queues_n = rss->num;
-			if (mlx5_flow_convert_rss_conf(parser, rss->rss_conf)) {
-				rte_flow_error_set(error, EINVAL,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "wrong RSS configuration");
-				return -rte_errno;
-			}
+			parser->rss_conf = (struct rte_flow_action_rss){
+				.types = rss->types,
+				.key_len = rss_key_len,
+				.queue_num = rss->queue_num,
+				.key = memcpy(parser->rss_key, rss_key,
+					      sizeof(*rss_key) * rss_key_len),
+				.queue = memcpy(parser->queues, rss->queue,
+						sizeof(*rss->queue) *
+						rss->queue_num),
+			};
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_MARK) {
 			const struct rte_flow_action_mark *mark =
 				(const struct rte_flow_action_mark *)
@@ -761,7 +736,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
-	if (!parser->queues_n && !parser->drop) {
+	if (!parser->rss_conf.queue_num && !parser->drop) {
 		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "no valid action");
 		return -rte_errno;
@@ -941,7 +916,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 	unsigned int i;
 
 	/* Remove any other flow not matching the pattern. */
-	if (parser->queues_n == 1 && !parser->rss_conf.rss_hf) {
+	if (parser->rss_conf.queue_num == 1 && !parser->rss_conf.types) {
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			if (i == HASH_RXQ_ETH)
 				continue;
@@ -969,7 +944,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 	}
 	/* Remove impossible flow according to the RSS configuration. */
 	if (hash_rxq_init[parser->layer].dpdk_rss_hf &
-	    parser->rss_conf.rss_hf) {
+	    parser->rss_conf.types) {
 		/* Remove any other flow. */
 		for (i = hmin; i != (hmax + 1); ++i) {
 			if ((i == parser->layer) ||
@@ -980,7 +955,7 @@ mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 		}
 	} else  if (!parser->queue[ip].ibv_attr) {
 		/* no RSS possible with the current configuration. */
-		parser->queues_n = 1;
+		parser->rss_conf.queue_num = 1;
 		return;
 	}
 fill:
@@ -1109,7 +1084,7 @@ mlx5_flow_convert(struct rte_eth_dev *dev,
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			unsigned int offset;
 
-			if (!(parser->rss_conf.rss_hf &
+			if (!(parser->rss_conf.types &
 			      hash_rxq_init[i].dpdk_rss_hf) &&
 			    (i != HASH_RXQ_ETH))
 				continue;
@@ -1777,20 +1752,20 @@ mlx5_flow_create_action_queue_rss(struct rte_eth_dev *dev,
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_get(dev,
-				      parser->rss_conf.rss_key,
-				      parser->rss_conf.rss_key_len,
+				      parser->rss_conf.key,
+				      parser->rss_conf.key_len,
 				      hash_fields,
-				      parser->queues,
-				      parser->queues_n);
+				      parser->rss_conf.queue,
+				      parser->rss_conf.queue_num);
 		if (flow->frxq[i].hrxq)
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_new(dev,
-				      parser->rss_conf.rss_key,
-				      parser->rss_conf.rss_key_len,
+				      parser->rss_conf.key,
+				      parser->rss_conf.key_len,
 				      hash_fields,
-				      parser->queues,
-				      parser->queues_n);
+				      parser->rss_conf.queue,
+				      parser->rss_conf.queue_num);
 		if (!flow->frxq[i].hrxq) {
 			return rte_flow_error_set(error, ENOMEM,
 						  RTE_FLOW_ERROR_TYPE_HANDLE,
@@ -1861,9 +1836,9 @@ mlx5_flow_create_action_queue(struct rte_eth_dev *dev,
 				   NULL, "internal error in flow creation");
 		goto error;
 	}
-	for (i = 0; i != parser->queues_n; ++i) {
+	for (i = 0; i != parser->rss_conf.queue_num; ++i) {
 		struct mlx5_rxq_data *q =
-			(*priv->rxqs)[parser->queues[i]];
+			(*priv->rxqs)[parser->rss_conf.queue[i]];
 
 		q->mark |= parser->mark;
 	}
@@ -1927,7 +1902,8 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	if (ret)
 		goto exit;
 	flow = rte_calloc(__func__, 1,
-			  sizeof(*flow) + parser.queues_n * sizeof(uint16_t),
+			  sizeof(*flow) +
+			  parser.rss_conf.queue_num * sizeof(uint16_t),
 			  0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM,
@@ -1936,15 +1912,20 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 				   "cannot allocate flow memory");
 		return NULL;
 	}
-	/* Copy queues configuration. */
+	/* Copy configuration. */
 	flow->queues = (uint16_t (*)[])(flow + 1);
-	memcpy(flow->queues, parser.queues, parser.queues_n * sizeof(uint16_t));
-	flow->queues_n = parser.queues_n;
+	flow->rss_conf = (struct rte_flow_action_rss){
+		.types = parser.rss_conf.types,
+		.key_len = parser.rss_conf.key_len,
+		.queue_num = parser.rss_conf.queue_num,
+		.key = memcpy(flow->rss_key, parser.rss_conf.key,
+			      sizeof(*parser.rss_conf.key) *
+			      parser.rss_conf.key_len),
+		.queue = memcpy(flow->queues, parser.rss_conf.queue,
+				sizeof(*parser.rss_conf.queue) *
+				parser.rss_conf.queue_num),
+	};
 	flow->mark = parser.mark;
-	/* Copy RSS configuration. */
-	flow->rss_conf = parser.rss_conf;
-	flow->rss_conf.rss_key = flow->rss_key;
-	memcpy(flow->rss_key, parser.rss_key, parser.rss_conf.rss_key_len);
 	/* finalise the flow. */
 	if (parser.drop)
 		ret = mlx5_flow_create_action_queue_drop(dev, &parser, flow,
@@ -2024,7 +2005,7 @@ mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
 
 	if (flow->drop || !flow->mark)
 		goto free;
-	for (i = 0; i != flow->queues_n; ++i) {
+	for (i = 0; i != flow->rss_conf.queue_num; ++i) {
 		struct rte_flow *tmp;
 		int mark = 0;
 
@@ -2334,19 +2315,19 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 			if (!flow->frxq[i].ibv_attr)
 				continue;
 			flow->frxq[i].hrxq =
-				mlx5_hrxq_get(dev, flow->rss_conf.rss_key,
-					      flow->rss_conf.rss_key_len,
+				mlx5_hrxq_get(dev, flow->rss_conf.key,
+					      flow->rss_conf.key_len,
 					      hash_rxq_init[i].hash_fields,
-					      (*flow->queues),
-					      flow->queues_n);
+					      flow->rss_conf.queue,
+					      flow->rss_conf.queue_num);
 			if (flow->frxq[i].hrxq)
 				goto flow_create;
 			flow->frxq[i].hrxq =
-				mlx5_hrxq_new(dev, flow->rss_conf.rss_key,
-					      flow->rss_conf.rss_key_len,
+				mlx5_hrxq_new(dev, flow->rss_conf.key,
+					      flow->rss_conf.key_len,
 					      hash_rxq_init[i].hash_fields,
-					      (*flow->queues),
-					      flow->queues_n);
+					      flow->rss_conf.queue,
+					      flow->rss_conf.queue_num);
 			if (!flow->frxq[i].hrxq) {
 				DRV_LOG(DEBUG,
 					"port %u flow %p cannot be applied",
@@ -2370,8 +2351,8 @@ mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 		}
 		if (!flow->mark)
 			continue;
-		for (i = 0; i != flow->queues_n; ++i)
-			(*priv->rxqs)[(*flow->queues)[i]]->mark = 1;
+		for (i = 0; i != flow->rss_conf.queue_num; ++i)
+			(*priv->rxqs)[flow->rss_conf.queue[i]]->mark = 1;
 	}
 	return 0;
 }
@@ -2448,8 +2429,10 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	};
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
-		.rss_conf = &priv->rss_conf,
-		.num = priv->reta_idx_n,
+		.types = priv->rss_conf.rss_hf,
+		.key_len = priv->rss_conf.rss_key_len,
+		.queue_num = priv->reta_idx_n,
+		.key = priv->rss_conf.rss_key,
 		.queue = queue,
 	};
 	struct rte_flow_action actions[] = {
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 1b4570586..1e4354ab3 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1218,8 +1218,8 @@ mlx5_rxq_verify(struct rte_eth_dev *dev)
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
  */
 struct mlx5_ind_table_ibv *
-mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, uint16_t queues[],
-		       uint16_t queues_n)
+mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, const uint16_t *queues,
+		       uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_ind_table_ibv *ind_tbl;
@@ -1286,8 +1286,8 @@ mlx5_ind_table_ibv_new(struct rte_eth_dev *dev, uint16_t queues[],
  *   An indirection table if found.
  */
 struct mlx5_ind_table_ibv *
-mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, uint16_t queues[],
-		       uint16_t queues_n)
+mlx5_ind_table_ibv_get(struct rte_eth_dev *dev, const uint16_t *queues,
+		       uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_ind_table_ibv *ind_tbl;
@@ -1391,8 +1391,10 @@ mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev)
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
  */
 struct mlx5_hrxq *
-mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
-	      uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+mlx5_hrxq_new(struct rte_eth_dev *dev,
+	      const uint8_t *rss_key, uint32_t rss_key_len,
+	      uint64_t hash_fields,
+	      const uint16_t *queues, uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
@@ -1419,7 +1421,7 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
 			.rx_hash_conf = (struct ibv_rx_hash_conf){
 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
 				.rx_hash_key_len = rss_key_len,
-				.rx_hash_key = rss_key,
+				.rx_hash_key = (void *)(uintptr_t)rss_key,
 				.rx_hash_fields_mask = hash_fields,
 			},
 			.rwq_ind_tbl = ind_tbl->ind_table,
@@ -1469,8 +1471,10 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
  *   An hash Rx queue on success.
  */
 struct mlx5_hrxq *
-mlx5_hrxq_get(struct rte_eth_dev *dev, uint8_t *rss_key, uint8_t rss_key_len,
-	      uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
+mlx5_hrxq_get(struct rte_eth_dev *dev,
+	      const uint8_t *rss_key, uint32_t rss_key_len,
+	      uint64_t hash_fields,
+	      const uint16_t *queues, uint32_t queues_n)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index f5af43735..a702cb603 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -134,7 +134,7 @@ struct mlx5_ind_table_ibv {
 	LIST_ENTRY(mlx5_ind_table_ibv) next; /* Pointer to the next element. */
 	rte_atomic32_t refcnt; /* Reference counter. */
 	struct ibv_rwq_ind_table *ind_table; /**< Indirection table. */
-	uint16_t queues_n; /**< Number of queues in the list. */
+	uint32_t queues_n; /**< Number of queues in the list. */
 	uint16_t queues[]; /**< Queue list. */
 };
 
@@ -145,7 +145,7 @@ struct mlx5_hrxq {
 	struct mlx5_ind_table_ibv *ind_table; /* Indirection table. */
 	struct ibv_qp *qp; /* Verbs queue pair. */
 	uint64_t hash_fields; /* Verbs Hash fields. */
-	uint8_t rss_key_len; /* Hash key length in bytes. */
+	uint32_t rss_key_len; /* Hash key length in bytes. */
 	uint8_t rss_key[]; /* Hash key. */
 };
 
@@ -237,20 +237,22 @@ int mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx);
 int mlx5_rxq_verify(struct rte_eth_dev *dev);
 int rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl);
 struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_new(struct rte_eth_dev *dev,
-						  uint16_t queues[],
-						  uint16_t queues_n);
+						  const uint16_t *queues,
+						  uint32_t queues_n);
 struct mlx5_ind_table_ibv *mlx5_ind_table_ibv_get(struct rte_eth_dev *dev,
-						  uint16_t queues[],
-						  uint16_t queues_n);
+						  const uint16_t *queues,
+						  uint32_t queues_n);
 int mlx5_ind_table_ibv_release(struct rte_eth_dev *dev,
 			       struct mlx5_ind_table_ibv *ind_tbl);
 int mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev);
-struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev, uint8_t *rss_key,
-				uint8_t rss_key_len, uint64_t hash_fields,
-				uint16_t queues[], uint16_t queues_n);
-struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev, uint8_t *rss_key,
-				uint8_t rss_key_len, uint64_t hash_fields,
-				uint16_t queues[], uint16_t queues_n);
+struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev,
+				const uint8_t *rss_key, uint32_t rss_key_len,
+				uint64_t hash_fields,
+				const uint16_t *queues, uint32_t queues_n);
+struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev,
+				const uint8_t *rss_key, uint32_t rss_key_len,
+				uint64_t hash_fields,
+				const uint16_t *queues, uint32_t queues_n);
 int mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hxrq);
 int mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev);
 uint64_t mlx5_get_rx_port_offloads(void);
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index 056405515..1a2c0299c 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1234,13 +1234,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	struct sfc_rxq *rxq;
 	unsigned int rxq_hw_index_min;
 	unsigned int rxq_hw_index_max;
-	const struct rte_eth_rss_conf *rss_conf = rss->rss_conf;
-	uint64_t rss_hf;
-	uint8_t *rss_key = NULL;
+	const uint8_t *rss_key;
 	struct sfc_flow_rss *sfc_rss_conf = &flow->rss_conf;
 	unsigned int i;
 
-	if (rss->num == 0)
+	if (rss->queue_num == 0)
 		return -EINVAL;
 
 	rxq_sw_index = sa->rxq_count - 1;
@@ -1248,7 +1246,7 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	rxq_hw_index_min = rxq->hw_index;
 	rxq_hw_index_max = 0;
 
-	for (i = 0; i < rss->num; ++i) {
+	for (i = 0; i < rss->queue_num; ++i) {
 		rxq_sw_index = rss->queue[i];
 
 		if (rxq_sw_index >= sa->rxq_count)
@@ -1263,15 +1261,14 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 			rxq_hw_index_max = rxq->hw_index;
 	}
 
-	rss_hf = (rss_conf != NULL) ? rss_conf->rss_hf : SFC_RSS_OFFLOADS;
-	if ((rss_hf & ~SFC_RSS_OFFLOADS) != 0)
+	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
-	if (rss_conf != NULL) {
-		if (rss_conf->rss_key_len != sizeof(sa->rss_key))
+	if (rss->key_len) {
+		if (rss->key_len != sizeof(sa->rss_key))
 			return -EINVAL;
 
-		rss_key = rss_conf->rss_key;
+		rss_key = rss->key;
 	} else {
 		rss_key = sa->rss_key;
 	}
@@ -1280,11 +1277,11 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 
 	sfc_rss_conf->rxq_hw_index_min = rxq_hw_index_min;
 	sfc_rss_conf->rxq_hw_index_max = rxq_hw_index_max;
-	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss_hf);
+	sfc_rss_conf->rss_hash_types = sfc_rte_to_efx_hash_type(rss->types);
 	rte_memcpy(sfc_rss_conf->rss_key, rss_key, sizeof(sa->rss_key));
 
 	for (i = 0; i < RTE_DIM(sfc_rss_conf->rss_tbl); ++i) {
-		unsigned int rxq_sw_index = rss->queue[i % rss->num];
+		unsigned int rxq_sw_index = rss->queue[i % rss->queue_num];
 		struct sfc_rxq *rxq = sa->rxq_info[rxq_sw_index].rxq;
 
 		sfc_rss_conf->rss_tbl[i] = rxq->hw_index - rxq_hw_index_min;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index aea3462a6..78f20913f 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1215,7 +1215,7 @@ priv_flow_process(struct pmd_internals *pmd,
 				if (err)
 					goto exit_action_not_supported;
 			}
-			if (flow && rss)
+			if (flow)
 				err = rss_add_actions(flow, pmd, rss, error);
 		} else {
 			goto exit_action_not_supported;
@@ -2050,7 +2050,7 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 			   struct rte_flow_error *error)
 {
 	/* 4096 is the maximum number of instructions for a BPF program */
-	int i;
+	unsigned int i;
 	int err;
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
@@ -2066,8 +2066,8 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	}
 
 	/* Update RSS map entry with queues */
-	rss_entry.nb_queues = rss->num;
-	for (i = 0; i < rss->num; i++)
+	rss_entry.nb_queues = rss->queue_num;
+	for (i = 0; i < rss->queue_num; i++)
 		rss_entry.queues[i] = rss->queue[i];
 	rss_entry.hash_fields =
 		(1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 8b2047adb..3ce76c413 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -202,9 +202,13 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
 						queue[j++] = i;
-				action_rss.rss_conf = &rss_conf;
-				action_rss.num = j;
-				action_rss.queue = queue;
+				action_rss = (struct rte_flow_action_rss){
+					.types = rss_conf.rss_hf,
+					.key_len = rss_conf.rss_key_len,
+					.queue_num = j,
+					.key = rss_key,
+					.queue = queue,
+				};
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 550086411..2fabc9a29 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -330,40 +330,27 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		off = 0;
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
-				.num = src.rss->num,
+				.types = src.rss->types,
+				.key_len = src.rss->key_len,
+				.queue_num = src.rss->queue_num,
 			};
 		off += sizeof(*src.rss);
-		if (src.rss->num) {
+		if (src.rss->key_len) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->queue) * src.rss->num;
+			size = sizeof(*src.rss->key) * src.rss->key_len;
 			if (dst.rss)
-				dst.rss->queue = memcpy
+				dst.rss->key = memcpy
 					((void *)((uintptr_t)dst.rss + off),
-					 src.rss->queue, size);
+					 src.rss->key, size);
 			off += size;
 		}
-		off = RTE_ALIGN_CEIL(off, sizeof(double));
-		if (dst.rss) {
-			dst.rss->rss_conf = (void *)((uintptr_t)dst.rss + off);
-			*(struct rte_eth_rss_conf *)(uintptr_t)
-				dst.rss->rss_conf = (struct rte_eth_rss_conf){
-				.rss_key_len = src.rss->rss_conf->rss_key_len,
-				.rss_hf = src.rss->rss_conf->rss_hf,
-			};
-		}
-		off += sizeof(*src.rss->rss_conf);
-		if (src.rss->rss_conf->rss_key_len) {
+		if (src.rss->queue_num) {
 			off = RTE_ALIGN_CEIL(off, sizeof(double));
-			size = sizeof(*src.rss->rss_conf->rss_key) *
-				src.rss->rss_conf->rss_key_len;
-			if (dst.rss) {
-				((struct rte_eth_rss_conf *)(uintptr_t)
-				 dst.rss->rss_conf)->rss_key =
-					(void *)((uintptr_t)dst.rss + off);
-				memcpy(dst.rss->rss_conf->rss_key,
-				       src.rss->rss_conf->rss_key,
-				       size);
-			}
+			size = sizeof(*src.rss->queue) * src.rss->queue_num;
+			if (dst.rss)
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		size = off;
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 895feb1a3..e2eba9c26 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1038,8 +1038,10 @@ struct rte_flow_query_count {
  * both can be requested simultaneously.
  */
 struct rte_flow_action_rss {
-	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in @p queue. */
+	uint64_t types; /**< RSS hash types (see ETH_RSS_*). */
+	uint32_t key_len; /**< Hash key length in bytes. */
+	uint32_t queue_num; /**< Number of entries in @p queue. */
+	const uint8_t *key; /**< Hash key. */
 	const uint16_t *queue; /**< Queue indices to use. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v2 06/15] ethdev: remove C99 flexible arrays from flow API
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
                     ` (2 preceding siblings ...)
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 05/15] ethdev: alter behavior of flow API actions Adrien Mazarguil
@ 2018-04-06 13:25  1%   ` Adrien Mazarguil
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in " Adrien Mazarguil
                     ` (9 subsequent siblings)
  13 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

This patch replaces C99-style flexible arrays in struct rte_flow_action_rss
and struct rte_flow_item_raw with standard pointers to the same data.

They proved difficult to use in the field (e.g. no possibility of static
initialization) and unsuitable for C++ applications.

Affected PMDs and examples are updated accordingly.
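
For instance, with "key" and "queue" as plain pointers an RSS action can now
be built entirely from static initializers, which the former flexible arrays
did not allow. The sketch below is an illustration only, not part of this
patch; the queue indices, hash types and key contents are arbitrary
placeholders:

#include <rte_common.h>
#include <rte_ethdev.h>
#include <rte_flow.h>

/* Arbitrary queue list and hash key, for illustration only. */
static const uint16_t rss_queues[] = { 0, 1, 2, 3 };
static const uint8_t rss_key[40] = { 0 /* real Toeplitz key bytes here */ };

static const struct rte_flow_action_rss rss_action_conf = {
	.types = ETH_RSS_IP,
	.key_len = sizeof(rss_key),
	.queue_num = RTE_DIM(rss_queues),
	.key = rss_key,
	.queue = rss_queues,
};

static const struct rte_flow_action actions[] = {
	{ .type = RTE_FLOW_ACTION_TYPE_RSS, .conf = &rss_action_conf },
	{ .type = RTE_FLOW_ACTION_TYPE_END },
};

Leaving key_len at 0 with a NULL key keeps the PMD's default hash key, which
is how several of the updated PMDs handle an unspecified key.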

This breaks ABI compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c        | 119 +++++++++++++++++---------------
 app/test-pmd/config.c              |  25 ++++---
 doc/guides/prog_guide/rte_flow.rst |  18 ++---
 drivers/net/mlx4/mlx4_flow.c       |  22 +++---
 drivers/net/mlx5/mlx5_flow.c       |  20 +++---
 examples/ipsec-secgw/ipsec.c       |  17 ++---
 lib/librte_ether/rte_flow.c        |  25 ++++---
 lib/librte_ether/rte_flow.h        |   8 ++-
 8 files changed, 136 insertions(+), 118 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 9702b3ef3..16227e752 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -179,25 +179,22 @@ enum index {
 	ACTION_METER_ID,
 };
 
-/** Size of pattern[] field in struct rte_flow_item_raw. */
-#define ITEM_RAW_PATTERN_SIZE 36
+/** Maximum size for pattern in struct rte_flow_item_raw. */
+#define ITEM_RAW_PATTERN_SIZE 40
 
 /** Storage size for struct rte_flow_item_raw including pattern. */
 #define ITEM_RAW_SIZE \
-	(offsetof(struct rte_flow_item_raw, pattern) + ITEM_RAW_PATTERN_SIZE)
+	(sizeof(struct rte_flow_item_raw) + ITEM_RAW_PATTERN_SIZE)
 
 /** Maximum number of queue indices in struct rte_flow_action_rss. */
 #define ACTION_RSS_QUEUE_NUM 32
 
 /** Storage for struct rte_flow_action_rss including external data. */
-union action_rss_data {
+struct action_rss_data {
 	struct rte_flow_action_rss conf;
-	struct {
-		uint8_t conf_data[offsetof(struct rte_flow_action_rss, queue)];
-		uint16_t queue[ACTION_RSS_QUEUE_NUM];
-		struct rte_eth_rss_conf rss_conf;
-		uint8_t rss_key[RSS_HASH_KEY_LENGTH];
-	} s;
+	uint16_t queue[ACTION_RSS_QUEUE_NUM];
+	struct rte_eth_rss_conf rss_conf;
+	uint8_t rss_key[RSS_HASH_KEY_LENGTH];
 };
 
 /** Maximum number of subsequent tokens and arguments on the stack. */
@@ -320,13 +317,6 @@ struct token {
 		.size = sizeof(*((s *)0)->f), \
 	})
 
-/** Static initializer for ARGS() with arbitrary size. */
-#define ARGS_ENTRY_USZ(s, f, sz) \
-	(&(const struct arg){ \
-		.offset = offsetof(s, f), \
-		.size = (sz), \
-	})
-
 /** Static initializer for ARGS() with arbitrary offset and size. */
 #define ARGS_ENTRY_ARB(o, s) \
 	(&(const struct arg){ \
@@ -1105,9 +1095,9 @@ static const struct token token_list[] = {
 			     NEXT_ENTRY(ITEM_PARAM_IS,
 					ITEM_PARAM_SPEC,
 					ITEM_PARAM_MASK)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, length),
-			     ARGS_ENTRY_USZ(struct rte_flow_item_raw,
-					    pattern,
+		.args = ARGS(ARGS_ENTRY(struct rte_flow_item_raw, pattern),
+			     ARGS_ENTRY(struct rte_flow_item_raw, length),
+			     ARGS_ENTRY_ARB(sizeof(struct rte_flow_item_raw),
 					    ITEM_RAW_PATTERN_SIZE)),
 	},
 	[ITEM_ETH] = {
@@ -1591,7 +1581,7 @@ static const struct token token_list[] = {
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
-		.priv = PRIV_ACTION(RSS, sizeof(union action_rss_data)),
+		.priv = PRIV_ACTION(RSS, sizeof(struct action_rss_data)),
 		.next = NEXT(action_rss),
 		.call = parse_vc_action_rss,
 	},
@@ -1610,23 +1600,21 @@ static const struct token token_list[] = {
 		.name = "key",
 		.help = "RSS hash key",
 		.next = NEXT(action_rss, NEXT_ENTRY(STRING)),
-		.args = ARGS(ARGS_ENTRY_ARB
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+		.args = ARGS(ARGS_ENTRY_ARB(0, 0),
+			     ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len)),
-			     ARGS_ENTRY_ARB
-			     (((uintptr_t)((union action_rss_data *)0)->
-			       s.rss_key),
-			      RSS_HASH_KEY_LENGTH)),
+			     ARGS_ENTRY(struct action_rss_data, rss_key)),
 	},
 	[ACTION_RSS_KEY_LEN] = {
 		.name = "key_len",
 		.help = "RSS hash key length in bytes",
 		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
 		.args = ARGS(ARGS_ENTRY_ARB_BOUNDED
-			     (((uintptr_t)&((union action_rss_data *)0)->
-			       s.rss_conf.rss_key_len),
+			     (offsetof(struct action_rss_data, rss_conf) +
+			      offsetof(struct rte_eth_rss_conf, rss_key_len),
 			      sizeof(((struct rte_eth_rss_conf *)0)->
 				     rss_key_len),
 			      0,
@@ -2067,7 +2055,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 {
 	struct buffer *out = buf;
 	struct rte_flow_action *action;
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 	int ret;
 
@@ -2085,31 +2073,31 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	ctx->objmask = NULL;
 	/* Set up default configuration. */
 	action_rss_data = ctx->object;
-	*action_rss_data = (union action_rss_data){
+	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
-			.rss_conf = &action_rss_data->s.rss_conf,
+			.rss_conf = &action_rss_data->rss_conf,
 			.num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
+			.queue = action_rss_data->queue,
 		},
+		.queue = { 0 },
+		.rss_conf = (struct rte_eth_rss_conf){
+			.rss_key = action_rss_data->rss_key,
+			.rss_key_len = sizeof(action_rss_data->rss_key),
+			.rss_hf = rss_hf,
+		},
+		.rss_key = "testpmd's default RSS hash key",
 	};
-	action_rss_data->s.rss_conf = (struct rte_eth_rss_conf){
-		.rss_key = action_rss_data->s.rss_key,
-		.rss_key_len = sizeof(action_rss_data->s.rss_key),
-		.rss_hf = rss_hf,
-	};
-	strncpy((void *)action_rss_data->s.rss_key,
-		"testpmd's default RSS hash key",
-		sizeof(action_rss_data->s.rss_key));
 	for (i = 0; i < action_rss_data->conf.num; ++i)
-		action_rss_data->conf.queue[i] = i;
+		action_rss_data->queue[i] = i;
 	if (!port_id_is_invalid(ctx->port, DISABLED_WARN) &&
 	    ctx->port != (portid_t)RTE_PORT_ALL) {
 		if (rte_eth_dev_rss_hash_conf_get
-		    (ctx->port, &action_rss_data->s.rss_conf) < 0) {
+		    (ctx->port, &action_rss_data->rss_conf) < 0) {
 			struct rte_eth_dev_info info;
 
 			rte_eth_dev_info_get(ctx->port, &info);
-			action_rss_data->s.rss_conf.rss_key_len =
-				RTE_MIN(sizeof(action_rss_data->s.rss_key),
+			action_rss_data->rss_conf.rss_key_len =
+				RTE_MIN(sizeof(action_rss_data->rss_key),
 					info.hash_key_size);
 		}
 	}
@@ -2128,7 +2116,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_TYPE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	unsigned int i;
 
 	(void)token;
@@ -2138,7 +2126,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 		return -1;
 	if (!(ctx->objdata >> 16) && ctx->object) {
 		action_rss_data = ctx->object;
-		action_rss_data->s.rss_conf.rss_hf = 0;
+		action_rss_data->rss_conf.rss_hf = 0;
 	}
 	if (!strcmp_partial("end", str, len)) {
 		ctx->objdata &= 0xffff;
@@ -2157,7 +2145,7 @@ parse_vc_action_rss_type(struct context *ctx, const struct token *token,
 	if (!ctx->object)
 		return len;
 	action_rss_data = ctx->object;
-	action_rss_data->s.rss_conf.rss_hf |= rss_type_table[i].rss_type;
+	action_rss_data->rss_conf.rss_hf |= rss_type_table[i].rss_type;
 	return len;
 }
 
@@ -2172,7 +2160,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 			  void *buf, unsigned int size)
 {
 	static const enum index next[] = NEXT_ENTRY(ACTION_RSS_QUEUE);
-	union action_rss_data *action_rss_data;
+	struct action_rss_data *action_rss_data;
 	int ret;
 	int i;
 
@@ -2189,10 +2177,9 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 	if (i >= ACTION_RSS_QUEUE_NUM)
 		return -1;
 	if (push_args(ctx,
-		      ARGS_ENTRY_ARB(offsetof(struct rte_flow_action_rss,
-					      queue) +
-				     i * sizeof(action_rss_data->s.queue[i]),
-				     sizeof(action_rss_data->s.queue[i]))))
+		      ARGS_ENTRY_ARB(offsetof(struct action_rss_data, queue) +
+				     i * sizeof(action_rss_data->queue[i]),
+				     sizeof(action_rss_data->queue[i]))))
 		return -1;
 	ret = parse_int(ctx, token, str, len, NULL, 0);
 	if (ret < 0) {
@@ -2209,6 +2196,7 @@ parse_vc_action_rss_queue(struct context *ctx, const struct token *token,
 		return len;
 	action_rss_data = ctx->object;
 	action_rss_data->conf.num = i;
+	action_rss_data->conf.queue = i ? action_rss_data->queue : NULL;
 	return len;
 }
 
@@ -2486,8 +2474,8 @@ parse_int(struct context *ctx, const struct token *token,
 /**
  * Parse a string.
  *
- * Two arguments (ctx->args) are retrieved from the stack to store data and
- * its length (in that order).
+ * Three arguments (ctx->args) are retrieved from the stack to store data,
+ * its actual length and address (in that order).
  */
 static int
 parse_string(struct context *ctx, const struct token *token,
@@ -2496,6 +2484,7 @@ parse_string(struct context *ctx, const struct token *token,
 {
 	const struct arg *arg_data = pop_args(ctx);
 	const struct arg *arg_len = pop_args(ctx);
+	const struct arg *arg_addr = pop_args(ctx);
 	char tmp[16]; /* Ought to be enough. */
 	int ret;
 
@@ -2506,6 +2495,11 @@ parse_string(struct context *ctx, const struct token *token,
 		push_args(ctx, arg_data);
 		return -1;
 	}
+	if (!arg_addr) {
+		push_args(ctx, arg_len);
+		push_args(ctx, arg_data);
+		return -1;
+	}
 	size = arg_data->size;
 	/* Bit-mask fill is not supported. */
 	if (arg_data->mask || size < len)
@@ -2528,8 +2522,23 @@ parse_string(struct context *ctx, const struct token *token,
 	memset((uint8_t *)buf + len, 0x00, size - len);
 	if (ctx->objmask)
 		memset((uint8_t *)ctx->objmask + arg_data->offset, 0xff, len);
+	/* Save address if requested. */
+	if (arg_addr->size) {
+		memcpy((uint8_t *)ctx->object + arg_addr->offset,
+		       (void *[]){
+			(uint8_t *)ctx->object + arg_data->offset
+		       },
+		       arg_addr->size);
+		if (ctx->objmask)
+			memcpy((uint8_t *)ctx->objmask + arg_addr->offset,
+			       (void *[]){
+				(uint8_t *)ctx->objmask + arg_data->offset
+			       },
+			       arg_addr->size);
+	}
 	return len;
 error:
+	push_args(ctx, arg_addr);
 	push_args(ctx, arg_len);
 	push_args(ctx, arg_data);
 	return -1;
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 8d42ea9a9..052163357 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -961,7 +961,7 @@ static const struct {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -1010,14 +1010,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = flow_item[item->type].size;
@@ -1049,7 +1055,7 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 	MK_FLOW_ACTION(METER, sizeof(struct rte_flow_action_meter)),
@@ -1080,11 +1086,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 80360d068..acbeaacbd 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1309,15 +1309,15 @@ field only, both can be requested simultaneously.
 
 .. table:: RSS
 
-   +--------------+------------------------------+
-   | Field        | Value                        |
-   +==============+==============================+
-   | ``rss_conf`` | RSS parameters               |
-   +--------------+------------------------------+
-   | ``num``      | number of entries in queue[] |
-   +--------------+------------------------------+
-   | ``queue[]``  | queue indices to use         |
-   +--------------+------------------------------+
+   +--------------+--------------------------------+
+   | Field        | Value                          |
+   +==============+================================+
+   | ``rss_conf`` | RSS parameters                 |
+   +--------------+--------------------------------+
+   | ``num``      | number of entries in ``queue`` |
+   +--------------+--------------------------------+
+   | ``queue``    | queue indices to use           |
+   +--------------+--------------------------------+
 
 Action: ``PF``
 ^^^^^^^^^^^^^^
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 582483076..5a1b7dedd 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -1282,14 +1282,16 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	 */
 	uint32_t queues =
 		rte_align32pow2(priv->dev->data->nb_rx_queues + 1) >> 1;
-	alignas(struct rte_flow_action_rss) uint8_t rss_conf_data
-		[offsetof(struct rte_flow_action_rss, queue) +
-		 sizeof(((struct rte_flow_action_rss *)0)->queue[0]) * queues];
-	struct rte_flow_action_rss *rss_conf = (void *)rss_conf_data;
+	uint16_t queue[queues];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = NULL, /* Rely on default fallback settings. */
+		.num = queues,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
-			.conf = rss_conf,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -1311,12 +1313,8 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	if (!queues)
 		goto error;
 	/* Prepare default RSS configuration. */
-	*rss_conf = (struct rte_flow_action_rss){
-		.rss_conf = NULL, /* Rely on default fallback settings. */
-		.num = queues,
-	};
 	for (i = 0; i != queues; ++i)
-		rss_conf->queue[i] = i;
+		queue[i] = i;
 	/*
 	 * Set up VLAN item if filtering is enabled and at least one VLAN
 	 * filter is configured.
@@ -1375,7 +1373,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 			if (j != sizeof(mac->addr_bytes))
 				continue;
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				continue;
 			break;
@@ -1415,7 +1413,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 		if (flow && flow->internal) {
 			assert(flow->rss);
 			if (flow->rss->queues != queues ||
-			    memcmp(flow->rss->queue_id, rss_conf->queue,
+			    memcmp(flow->rss->queue_id, action_rss.queue,
 				   queues * sizeof(flow->rss->queue_id[0])))
 				flow = NULL;
 		}
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 84d6f9b92..a52dcf263 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -2446,9 +2446,16 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 			.type = RTE_FLOW_ITEM_TYPE_END,
 		},
 	};
+	uint16_t queue[priv->reta_idx_n];
+	struct rte_flow_action_rss action_rss = {
+		.rss_conf = &priv->rss_conf,
+		.num = priv->reta_idx_n,
+		.queue = queue,
+	};
 	struct rte_flow_action actions[] = {
 		{
 			.type = RTE_FLOW_ACTION_TYPE_RSS,
+			.conf = &action_rss,
 		},
 		{
 			.type = RTE_FLOW_ACTION_TYPE_END,
@@ -2457,24 +2464,13 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	struct rte_flow *flow;
 	struct rte_flow_error error;
 	unsigned int i;
-	union {
-		struct rte_flow_action_rss rss;
-		struct {
-			const struct rte_eth_rss_conf *rss_conf;
-			uint16_t num;
-			uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-		} local;
-	} action_rss;
 
 	if (!priv->reta_idx_n) {
 		rte_errno = EINVAL;
 		return -rte_errno;
 	}
 	for (i = 0; i != priv->reta_idx_n; ++i)
-		action_rss.local.queue[i] = (*priv->reta_idx)[i];
-	action_rss.local.rss_conf = &priv->rss_conf;
-	action_rss.local.num = priv->reta_idx_n;
-	actions[0].conf = (const void *)&action_rss.rss;
+		queue[i] = (*priv->reta_idx)[i];
 	flow = mlx5_flow_list_create(dev, &priv->ctrl_flows, &attr, items,
 				     actions, &error);
 	if (!flow)
diff --git a/examples/ipsec-secgw/ipsec.c b/examples/ipsec-secgw/ipsec.c
index 5fb5bc16e..8b2047adb 100644
--- a/examples/ipsec-secgw/ipsec.c
+++ b/examples/ipsec-secgw/ipsec.c
@@ -186,14 +186,8 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 					.rss_key_len = 40,
 				};
 				struct rte_eth_dev *eth_dev;
-				union {
-					struct rte_flow_action_rss rss;
-					struct {
-					const struct rte_eth_rss_conf *rss_conf;
-					uint16_t num;
-					uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
-					} local;
-				} action_rss;
+				uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
+				struct rte_flow_action_rss action_rss;
 				unsigned int i;
 				unsigned int j;
 
@@ -207,9 +201,10 @@ create_session(struct ipsec_ctx *ipsec_ctx, struct ipsec_sa *sa)
 				for (i = 0, j = 0;
 				     i < eth_dev->data->nb_rx_queues; ++i)
 					if (eth_dev->data->rx_queues[i])
-						action_rss.local.queue[j++] = i;
-				action_rss.local.num = j;
-				action_rss.local.rss_conf = &rss_conf;
+						queue[j++] = i;
+				action_rss.rss_conf = &rss_conf;
+				action_rss.num = j;
+				action_rss.queue = queue;
 				ret = rte_flow_validate(sa->portid, &sa->attr,
 							sa->pattern, sa->action,
 							&err);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index db04c4f94..550086411 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -39,7 +39,7 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 	MK_FLOW_ITEM(PF, 0),
 	MK_FLOW_ITEM(VF, sizeof(struct rte_flow_item_vf)),
 	MK_FLOW_ITEM(PORT, sizeof(struct rte_flow_item_port)),
-	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)), /* +pattern[] */
+	MK_FLOW_ITEM(RAW, sizeof(struct rte_flow_item_raw)),
 	MK_FLOW_ITEM(ETH, sizeof(struct rte_flow_item_eth)),
 	MK_FLOW_ITEM(VLAN, sizeof(struct rte_flow_item_vlan)),
 	MK_FLOW_ITEM(IPV4, sizeof(struct rte_flow_item_ipv4)),
@@ -73,7 +73,7 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
+	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)),
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
 };
@@ -282,14 +282,20 @@ flow_item_spec_copy(void *buf, const struct rte_flow_item *item,
 		union {
 			struct rte_flow_item_raw *raw;
 		} dst;
+		size_t off;
 
 	case RTE_FLOW_ITEM_TYPE_RAW:
 		src.raw = item_spec;
 		dst.raw = buf;
-		size = offsetof(struct rte_flow_item_raw, pattern) +
-			src.raw->length * sizeof(*src.raw->pattern);
-		if (dst.raw)
-			memcpy(dst.raw, src.raw, size);
+		off = RTE_ALIGN_CEIL(sizeof(struct rte_flow_item_raw),
+				     sizeof(*src.raw->pattern));
+		size = off + src.raw->length * sizeof(*src.raw->pattern);
+		if (dst.raw) {
+			memcpy(dst.raw, src.raw, sizeof(*src.raw));
+			dst.raw->pattern = memcpy((uint8_t *)dst.raw + off,
+						  src.raw->pattern,
+						  size - off);
+		}
 		break;
 	default:
 		size = rte_flow_desc_item[item->type].size;
@@ -326,11 +332,14 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 			*dst.rss = (struct rte_flow_action_rss){
 				.num = src.rss->num,
 			};
-		off += offsetof(struct rte_flow_action_rss, queue);
+		off += sizeof(*src.rss);
 		if (src.rss->num) {
+			off = RTE_ALIGN_CEIL(off, sizeof(double));
 			size = sizeof(*src.rss->queue) * src.rss->num;
 			if (dst.rss)
-				memcpy(dst.rss->queue, src.rss->queue, size);
+				dst.rss->queue = memcpy
+					((void *)((uintptr_t)dst.rss + off),
+					 src.rss->queue, size);
 			off += size;
 		}
 		off = RTE_ALIGN_CEIL(off, sizeof(double));
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index af9b14a4d..895feb1a3 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -14,6 +14,7 @@
  * associated actions in hardware through flow rules.
  */
 
+#include <stddef.h>
 #include <stdint.h>
 
 #include <rte_arp.h>
@@ -432,7 +433,7 @@ struct rte_flow_item_raw {
 	int32_t offset; /**< Absolute or relative offset for pattern. */
 	uint16_t limit; /**< Search area limit for start of pattern. */
 	uint16_t length; /**< Pattern length. */
-	uint8_t pattern[]; /**< Byte string to look for. */
+	const uint8_t *pattern; /**< Byte string to look for. */
 };
 
 /** Default mask for RTE_FLOW_ITEM_TYPE_RAW. */
@@ -444,6 +445,7 @@ static const struct rte_flow_item_raw rte_flow_item_raw_mask = {
 	.offset = 0xffffffff,
 	.limit = 0xffff,
 	.length = 0xffff,
+	.pattern = NULL,
 };
 #endif
 
@@ -1037,8 +1039,8 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
-	uint16_t num; /**< Number of entries in queue[]. */
-	uint16_t queue[]; /**< Queues indices to use. */
+	uint16_t num; /**< Number of entries in @p queue. */
+	const uint16_t *queue; /**< Queue indices to use. */
 };
 
 /**
-- 
2.11.0

^ permalink raw reply	[relevance 1%]
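
With the flexible arrays replaced by plain pointers, both structures can now
be initialized statically; a minimal sketch with hypothetical values (not
part of the patch itself):

  static const uint8_t raw_pattern[] = { 0xca, 0xfe };
  static const struct rte_flow_item_raw raw_spec = {
          .length = sizeof(raw_pattern),
          .pattern = raw_pattern,
  };
  static const uint16_t rss_queue[] = { 0, 1, 2, 3 };
  static const struct rte_flow_action_rss rss_action = {
          .rss_conf = NULL, /* rely on default RSS settings */
          .num = RTE_DIM(rss_queue),
          .queue = rss_queue,
  };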

* [dpdk-dev] [PATCH v2 05/15] ethdev: alter behavior of flow API actions
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API Adrien Mazarguil
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 04/15] ethdev: remove DUP action from " Adrien Mazarguil
@ 2018-04-06 13:25  1%   ` Adrien Mazarguil
  2018-04-06 15:06  0%     ` Andrew Rybchenko
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 06/15] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
                     ` (10 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Ajit Khaparde, Wenzhuo Lu, John Daley, Gaetan Rivet, Beilei Xing,
	Konstantin Ananyev, Nelio Laranjeiro, Andrew Rybchenko,
	Pascal Mazon

This patch makes the following changes to flow rule actions:

- List order now matters; actions are redefined as performed first to last
  instead of "all simultaneously".

- Repeated actions are now supported (e.g. specifying QUEUE multiple times
  now duplicates traffic among them). Previously only the last action of
  any given kind was taken into account.

- No more distinction between terminating/non-terminating/meta actions.
  Flow rules themselves are now defined as always terminating unless a
  PASSTHRU action is specified.

These changes alter the behavior of flow rules in corner cases in order to
prepare the flow API for actions that modify traffic contents or properties
(e.g. encapsulation, compression) and for which order matters when combined.

Previously, one would have to do so through multiple flow rules by
combining PASSTHRU with priority levels; however, this proved overly
complex to implement at the PMD level, hence this simpler approach.
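
As a minimal sketch of the new semantics (hypothetical queue index, attribute
and pattern setup omitted, not part of this patch), an application can now
rely on actions being applied in list order, e.g. counting matching packets
before assigning them to a queue:

  #include <rte_flow.h>

  /* Illustration only: COUNT is performed first, then matching packets are
   * assigned to queue 3; END terminates the list.
   */
  static struct rte_flow *
  count_then_queue(uint16_t port_id, const struct rte_flow_attr *attr,
                   const struct rte_flow_item pattern[],
                   struct rte_flow_error *error)
  {
          static const struct rte_flow_action_queue queue = { .index = 3 };
          const struct rte_flow_action actions[] = {
                  { .type = RTE_FLOW_ACTION_TYPE_COUNT },
                  { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
                  { .type = RTE_FLOW_ACTION_TYPE_END },
          };

          return rte_flow_create(port_id, attr, pattern, actions, error);
  }

Repeating QUEUE with other indices in such a list now duplicates traffic to
each of them instead of only honoring the last entry.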

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_validate()

PMDs with rte_flow support are modified accordingly:

- bnxt: no change, implementation already forbids multiple actions and does
  not support PASSTHRU.

- e1000: no change, same as bnxt.

- enic: modified to forbid redundant actions, no support for default drop.

- failsafe: no change needed.

- i40e: no change, implementation already forbids multiple actions.

- ixgbe: same as i40e.

- mlx4: modified to forbid multiple fate-deciding actions and drop when
  unspecified.

- mlx5: same as mlx4, with other redundant actions also forbidden.

- sfc: same as mlx4.

- tap: implementation already complies with the new behavior, except that
  the default pass-through is replaced with a default drop.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: John Daley <johndale@cisco.com>
Cc: Gaetan Rivet <gaetan.rivet@6wind.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 doc/guides/prog_guide/rte_flow.rst | 67 +++++++++++++-------------------
 drivers/net/enic/enic_flow.c       | 25 ++++++++++++
 drivers/net/mlx4/mlx4_flow.c       | 21 +++++++---
 drivers/net/mlx5/mlx5_flow.c       | 69 ++++++++++++++-------------------
 drivers/net/sfc/sfc_flow.c         | 22 +++++++----
 drivers/net/tap/tap_flow.c         | 11 ++++++
 lib/librte_ether/rte_flow.h        | 54 +++++++-------------------
 7 files changed, 138 insertions(+), 131 deletions(-)

diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index a237e4fd2..80360d068 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -995,28 +995,27 @@ Actions
 
 Each possible action is represented by a type. Some have associated
 configuration structures. Several actions combined in a list can be assigned
-to a flow rule. That list is not ordered.
+to a flow rule and are performed in order.
 
 They fall in three categories:
 
-- Terminating actions that prevent processing matched packets by subsequent
-  flow rules, unless overridden with PASSTHRU.
+- Actions that modify the fate of matching traffic, for instance by dropping
+  or assigning it a specific destination.
 
-- Non-terminating actions that leave matched packets up for additional
-  processing by subsequent flow rules.
+- Actions that modify matching traffic contents or its properties. This
+  includes adding/removing encapsulation, encryption, compression and marks.
 
-- Other non-terminating meta actions that do not affect the fate of packets.
+- Actions related to the flow rule itself, such as updating counters or
+  making it non-terminating.
 
-When several actions are combined in a flow rule, they should all have
-different types (e.g. dropping a packet twice is not possible).
+Flow rules being terminating by default, not specifying any action of the
+fate kind results in undefined behavior. This applies to both ingress and
+egress.
 
-Only the last action of a given type is taken into account. PMDs still
-perform error checking on the entire list.
+PASSTHRU, when supported, makes a flow rule non-terminating.
 
 Like matching patterns, action lists are terminated by END items.
 
-*Note that PASSTHRU is the only action able to override a terminating rule.*
-
 Example of action that redirects packets to queue index 10:
 
 .. _table_rte_flow_action_example:
@@ -1029,12 +1028,11 @@ Example of action that redirects packets to queue index 10:
    | ``index`` | 10    |
    +-----------+-------+
 
-Action lists examples, their order is not significant, applications must
-consider all actions to be performed simultaneously:
+Actions are performed in list order:
 
-.. _table_rte_flow_count_and_drop:
+.. _table_rte_flow_count_then_drop:
 
-.. table:: Count and drop
+.. table:: Count then drop
 
    +-------+--------+
    | Index | Action |
@@ -1050,7 +1048,7 @@ consider all actions to be performed simultaneously:
 
 .. _table_rte_flow_mark_count_redirect:
 
-.. table:: Mark, count and redirect
+.. table:: Mark, count then redirect
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1080,12 +1078,15 @@ consider all actions to be performed simultaneously:
    | 2     | END                        |
    +-------+----------------------------+
 
-In the above example, considering both actions are performed simultaneously,
-the end result is that only QUEUE has any effect.
+In the above example, while DROP and QUEUE must be performed in order, both
+have to happen before reaching END. Only QUEUE has a visible effect.
+
+Note that such a list may be thought as ambiguous and rejected on that
+basis.
 
-.. _table_rte_flow_redirect_queue_3:
+.. _table_rte_flow_redirect_queue_5_3:
 
-.. table:: Redirect to queue 3
+.. table:: Redirect to queues 5 and 3
 
    +-------+--------+-----------+-------+
    | Index | Action | Field     | Value |
@@ -1099,9 +1100,9 @@ the end result is that only QUEUE has any effect.
    | 3     | END                        |
    +-------+----------------------------+
 
-As previously described, only the last action of a given type found in the
-list is taken into account. The above example also shows that VOID is
-ignored.
+As previously described, all actions must be taken into account. This
+effectively duplicates traffic to both queues. The above example also shows
+that VOID is ignored.
 
 Action types
 ~~~~~~~~~~~~
@@ -1151,9 +1152,8 @@ PMDs.
 Action: ``PASSTHRU``
 ^^^^^^^^^^^^^^^^^^^^
 
-Leaves packets up for additional processing by subsequent flow rules. This
-is the default when a rule does not contain a terminating action, but can be
-specified to force a rule to become non-terminating.
+Leaves traffic up for additional processing by subsequent flow rules; makes
+a flow rule non-terminating.
 
 - No configurable properties.
 
@@ -1227,8 +1227,6 @@ Action: ``QUEUE``
 
 Assigns packets to a given queue index.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_queue:
 
 .. table:: QUEUE
@@ -1245,8 +1243,6 @@ Action: ``DROP``
 Drop packets.
 
 - No configurable properties.
-- Terminating by default.
-- PASSTHRU overrides this action if both are specified.
 
 .. _table_rte_flow_action_drop:
 
@@ -1309,8 +1305,6 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1331,7 +1325,6 @@ Action: ``PF``
 Redirects packets to the physical function (PF) of the current device.
 
 - No configurable properties.
-- Terminating by default.
 
 .. _table_rte_flow_action_pf:
 
@@ -1353,8 +1346,6 @@ ID instead of the specified one. This parameter may not be available and is
 not guaranteed to work properly if the VF part is matched by a prior flow
 rule or if packets are not addressed to a VF in the first place.
 
-- Terminating by default.
-
 .. _table_rte_flow_action_vf:
 
 .. table:: VF
@@ -1378,8 +1369,6 @@ action parameter. More than one flow can use the same MTR object through
 the meter action. The MTR object can be further updated or queried using
 the rte_mtr* API.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_meter:
 
 .. table:: METER
@@ -1415,8 +1404,6 @@ direction.
 
 Multiple flows can be configured to use the same security session.
 
-- Non-terminating by default.
-
 .. _table_rte_flow_action_security:
 
 .. table:: SECURITY
diff --git a/drivers/net/enic/enic_flow.c b/drivers/net/enic/enic_flow.c
index 28923b0e2..c5c98b870 100644
--- a/drivers/net/enic/enic_flow.c
+++ b/drivers/net/enic/enic_flow.c
@@ -3,6 +3,7 @@
  */
 
 #include <errno.h>
+#include <stdint.h>
 #include <rte_log.h>
 #include <rte_ethdev_driver.h>
 #include <rte_flow_driver.h>
@@ -952,6 +953,9 @@ static int
 enic_copy_action_v1(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -963,6 +967,10 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			break;
@@ -972,6 +980,8 @@ enic_copy_action_v1(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!overlap & FATE)
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_RQ_STEERING;
 	return 0;
 }
@@ -989,6 +999,9 @@ static int
 enic_copy_action_v2(const struct rte_flow_action actions[],
 		    struct filter_action_v2 *enic_action)
 {
+	enum { FATE = 1, MARK = 2, };
+	uint32_t overlap = 0;
+
 	FLOW_TRACE();
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -997,6 +1010,10 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
+
+			if (overlap & FATE)
+				return ENOTSUP;
+			overlap |= FATE;
 			enic_action->rq_idx =
 				enic_rte_rq_idx_to_sop_idx(queue->index);
 			enic_action->flags |= FILTER_ACTION_RQ_STEERING_FLAG;
@@ -1007,6 +1024,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			/* ENIC_MAGIC_FILTER_ID is reserved and is the highest
 			 * in the range of allows mark ids.
 			 */
@@ -1017,6 +1037,9 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 		case RTE_FLOW_ACTION_TYPE_FLAG: {
+			if (overlap & MARK)
+				return ENOTSUP;
+			overlap |= MARK;
 			enic_action->filter_id = ENIC_MAGIC_FILTER_ID;
 			enic_action->flags |= FILTER_ACTION_FILTER_ID_FLAG;
 			break;
@@ -1028,6 +1051,8 @@ enic_copy_action_v2(const struct rte_flow_action actions[],
 			break;
 		}
 	}
+	if (!overlap & FATE)
+		return ENOTSUP;
 	enic_action->type = FILTER_ACTION_V2;
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index 4d26df326..582483076 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -637,6 +637,7 @@ mlx4_flow_prepare(struct priv *priv,
 	struct rte_flow temp = { .ibv_attr_size = sizeof(*temp.ibv_attr) };
 	struct rte_flow *flow = &temp;
 	const char *msg = NULL;
+	int overlap;
 
 	if (attr->group)
 		return rte_flow_error_set
@@ -656,6 +657,7 @@ mlx4_flow_prepare(struct priv *priv,
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
 			 NULL, "only ingress is supported");
 fill:
+	overlap = 0;
 	proc = mlx4_flow_proc_item_list;
 	/* Go over pattern. */
 	for (item = pattern; item->type; ++item) {
@@ -702,6 +704,16 @@ mlx4_flow_prepare(struct priv *priv,
 	}
 	/* Go over actions list. */
 	for (action = actions; action->type; ++action) {
+		/* This one may appear anywhere multiple times. */
+		if (action->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (overlap) {
+			msg = "cannot combine several fate-deciding actions,"
+				" choose between DROP, QUEUE or RSS";
+			goto exit_action_not_supported;
+		}
+		overlap = 1;
 		switch (action->type) {
 			const struct rte_flow_action_queue *queue;
 			const struct rte_flow_action_rss *rss;
@@ -709,8 +721,6 @@ mlx4_flow_prepare(struct priv *priv,
 			uint64_t fields;
 			unsigned int i;
 
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			continue;
 		case RTE_FLOW_ACTION_TYPE_DROP:
 			flow->drop = 1;
 			break;
@@ -801,10 +811,9 @@ mlx4_flow_prepare(struct priv *priv,
 			goto exit_action_not_supported;
 		}
 	}
-	if (!flow->rss && !flow->drop)
-		return rte_flow_error_set
-			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-			 NULL, "no valid action");
+	/* When fate is unknown, drop traffic. */
+	if (!overlap)
+		flow->drop = 1;
 	/* Validation ends here. */
 	if (!addr) {
 		if (flow->rss)
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index f051fbef5..84d6f9b92 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -4,6 +4,7 @@
  */
 
 #include <sys/queue.h>
+#include <stdint.h>
 #include <string.h>
 
 /* Verbs header. */
@@ -638,6 +639,8 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			  struct rte_flow_error *error,
 			  struct mlx5_flow_parse *parser)
 {
+	enum { FATE = 1, MARK = 2, COUNT = 4, };
+	uint32_t overlap = 0;
 	struct priv *priv = dev->data->dev_private;
 	int ret;
 
@@ -654,39 +657,31 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
 			continue;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			parser->drop = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
 				actions->conf;
-			uint16_t n;
-			uint16_t found = 0;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!queue || (queue->index > (priv->rxqs_n - 1)))
 				goto exit_action_not_supported;
-			for (n = 0; n < parser->queues_n; ++n) {
-				if (parser->queues[n] == queue->index) {
-					found = 1;
-					break;
-				}
-			}
-			if (parser->queues_n > 1 && !found) {
-				rte_flow_error_set(error, ENOTSUP,
-					   RTE_FLOW_ERROR_TYPE_ACTION,
-					   actions,
-					   "queue action not in RSS queues");
-				return -rte_errno;
-			}
-			if (!found) {
-				parser->queues_n = 1;
-				parser->queues[0] = queue->index;
-			}
+			parser->queues_n = 1;
+			parser->queues[0] = queue->index;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
 			uint16_t n;
 
+			if (overlap & FATE)
+				goto exit_action_overlap;
+			overlap |= FATE;
 			if (!rss || !rss->num) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,26 +689,6 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   "no valid queues");
 				return -rte_errno;
 			}
-			if (parser->queues_n == 1) {
-				uint16_t found = 0;
-
-				assert(parser->queues_n);
-				for (n = 0; n < rss->num; ++n) {
-					if (parser->queues[0] ==
-					    rss->queue[n]) {
-						found = 1;
-						break;
-					}
-				}
-				if (!found) {
-					rte_flow_error_set(error, ENOTSUP,
-						   RTE_FLOW_ERROR_TYPE_ACTION,
-						   actions,
-						   "queue action not in RSS"
-						   " queues");
-					return -rte_errno;
-				}
-			}
 			if (rss->num > RTE_DIM(parser->queues)) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -747,6 +722,9 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 				(const struct rte_flow_action_mark *)
 				actions->conf;
 
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			if (!mark) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -764,14 +742,23 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			parser->mark = 1;
 			parser->mark_id = mark->id;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_FLAG) {
+			if (overlap & MARK)
+				goto exit_action_overlap;
+			overlap |= MARK;
 			parser->mark = 1;
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT &&
 			   priv->config.flow_counter_en) {
+			if (overlap & COUNT)
+				goto exit_action_overlap;
+			overlap |= COUNT;
 			parser->count = 1;
 		} else {
 			goto exit_action_not_supported;
 		}
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!overlap & FATE)
+		parser->drop = 1;
 	if (parser->drop && parser->mark)
 		parser->mark = 0;
 	if (!parser->queues_n && !parser->drop) {
@@ -784,6 +771,10 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
 			   actions, "action not supported");
 	return -rte_errno;
+exit_action_overlap:
+	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+			   actions, "overlapping actions are not supported");
+	return -rte_errno;
 }
 
 /**
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index fe4c0b0c5..056405515 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1467,10 +1467,19 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 	}
 
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		/* This one may appear anywhere multiple times. */
+		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID)
+			continue;
+		/* Fate-deciding actions may appear exactly once. */
+		if (is_specified) {
+			rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				 actions,
+				 "Cannot combine several fate-deciding actions,"
+				 "choose between QUEUE, RSS or DROP");
+			return -rte_errno;
+		}
 		switch (actions->type) {
-		case RTE_FLOW_ACTION_TYPE_VOID:
-			break;
-
 		case RTE_FLOW_ACTION_TYPE_QUEUE:
 			rc = sfc_flow_parse_queue(sa, actions->conf, flow);
 			if (rc != 0) {
@@ -1512,11 +1521,10 @@ sfc_flow_parse_actions(struct sfc_adapter *sa,
 		}
 	}
 
+	/* When fate is unknown, drop traffic. */
 	if (!is_specified) {
-		rte_flow_error_set(error, EINVAL,
-				   RTE_FLOW_ERROR_TYPE_ACTION_NUM, actions,
-				   "Action is unspecified");
-		return -rte_errno;
+		flow->spec.template.efs_dmaq_id =
+			EFX_FILTER_SPEC_RX_DMAQ_ID_DROP;
 	}
 
 	return 0;
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 551b2d83d..aea3462a6 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -1140,6 +1140,7 @@ priv_flow_process(struct pmd_internals *pmd,
 		else
 			goto end;
 	}
+actions:
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		int err = 0;
 
@@ -1222,6 +1223,16 @@ priv_flow_process(struct pmd_internals *pmd,
 		if (err)
 			goto exit_action_not_supported;
 	}
+	/* When fate is unknown, drop traffic. */
+	if (!action) {
+		static const struct rte_flow_action drop[] = {
+			{ .type = RTE_FLOW_ACTION_TYPE_DROP, },
+			{ .type = RTE_FLOW_ACTION_TYPE_END, },
+		};
+
+		actions = drop;
+		goto actions;
+	}
 end:
 	if (flow)
 		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index aab637a2c..af9b14a4d 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -859,32 +859,28 @@ struct rte_flow_item {
  *
  * Each possible action is represented by a type. Some have associated
  * configuration structures. Several actions combined in a list can be
- * affected to a flow rule. That list is not ordered.
+ * assigned to a flow rule and are performed in order.
  *
  * They fall in three categories:
  *
- * - Terminating actions that prevent processing matched packets by
- *   subsequent flow rules, unless overridden with PASSTHRU.
+ * - Actions that modify the fate of matching traffic, for instance by
+ *   dropping or assigning it a specific destination.
  *
- * - Non terminating actions that leave matched packets up for additional
- *   processing by subsequent flow rules.
+ * - Actions that modify matching traffic contents or its properties. This
+ *   includes adding/removing encapsulation, encryption, compression and
+ *   marks.
  *
- * - Other non terminating meta actions that do not affect the fate of
- *   packets.
+ * - Actions related to the flow rule itself, such as updating counters or
+ *   making it non-terminating.
  *
- * When several actions are combined in a flow rule, they should all have
- * different types (e.g. dropping a packet twice is not possible).
+ * Flow rules being terminating by default, not specifying any action of the
+ * fate kind results in undefined behavior. This applies to both ingress and
+ * egress.
  *
- * Only the last action of a given type is taken into account. PMDs still
- * perform error checking on the entire list.
- *
- * Note that PASSTHRU is the only action able to override a terminating
- * rule.
+ * PASSTHRU, when supported, makes a flow rule non-terminating.
  */
 enum rte_flow_action_type {
 	/**
-	 * [META]
-	 *
 	 * End marker for action lists. Prevents further processing of
 	 * actions, thereby ending the list.
 	 *
@@ -893,8 +889,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_END,
 
 	/**
-	 * [META]
-	 *
 	 * Used as a placeholder for convenience. It is ignored and simply
 	 * discarded by PMDs.
 	 *
@@ -903,18 +897,14 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_VOID,
 
 	/**
-	 * Leaves packets up for additional processing by subsequent flow
-	 * rules. This is the default when a rule does not contain a
-	 * terminating action, but can be specified to force a rule to
-	 * become non-terminating.
+	 * Leaves traffic up for additional processing by subsequent flow
+	 * rules; makes a flow rule non-terminating.
 	 *
 	 * No associated configuration structure.
 	 */
 	RTE_FLOW_ACTION_TYPE_PASSTHRU,
 
 	/**
-	 * [META]
-	 *
 	 * Attaches an integer value to packets and sets PKT_RX_FDIR and
 	 * PKT_RX_FDIR_ID mbuf flags.
 	 *
@@ -923,8 +913,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_MARK,
 
 	/**
-	 * [META]
-	 *
 	 * Flags packets. Similar to MARK without a specific value; only
 	 * sets the PKT_RX_FDIR mbuf flag.
 	 *
@@ -949,9 +937,7 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_DROP,
 
 	/**
-	 * [META]
-	 *
-	 * Enables counters for this rule.
+	 * Enables counters for this flow rule.
 	 *
 	 * These counters can be retrieved and reset through rte_flow_query(),
 	 * see struct rte_flow_query_count.
@@ -1020,8 +1006,6 @@ struct rte_flow_action_mark {
  * RTE_FLOW_ACTION_TYPE_QUEUE
  *
  * Assign packets to a given queue index.
- *
- * Terminating by default.
  */
 struct rte_flow_action_queue {
 	uint16_t index; /**< Queue index to use. */
@@ -1050,8 +1034,6 @@ struct rte_flow_query_count {
  * Note: RSS hash result is stored in the hash.rss mbuf field which overlaps
  * hash.fdir.lo. Since the MARK action sets the hash.fdir.hi field only,
  * both can be requested simultaneously.
- *
- * Terminating by default.
  */
 struct rte_flow_action_rss {
 	const struct rte_eth_rss_conf *rss_conf; /**< RSS parameters. */
@@ -1069,8 +1051,6 @@ struct rte_flow_action_rss {
  * and is not guaranteed to work properly if the VF part is matched by a
  * prior flow rule or if packets are not addressed to a VF in the first
  * place.
- *
- * Terminating by default.
  */
 struct rte_flow_action_vf {
 	uint32_t original:1; /**< Use original VF ID if possible. */
@@ -1085,8 +1065,6 @@ struct rte_flow_action_vf {
  *
  * Packets matched by items of this type can be either dropped or passed to the
  * next item with their color set by the MTR object.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_meter {
 	uint32_t mtr_id; /**< MTR object ID created with rte_mtr_create(). */
@@ -1116,8 +1094,6 @@ struct rte_flow_action_meter {
  * direction.
  *
  * Multiple flows can be configured to use the same security session.
- *
- * Non-terminating by default.
  */
 struct rte_flow_action_security {
 	void *security_session; /**< Pointer to security session structure. */
-- 
2.11.0

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v2 04/15] ethdev: remove DUP action from flow API
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API Adrien Mazarguil
@ 2018-04-06 13:25  2%   ` Adrien Mazarguil
  2018-04-07  9:23  0%     ` Andrew Rybchenko
  2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 05/15] ethdev: alter behavior of flow API actions Adrien Mazarguil
                     ` (11 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

Upcoming changes to the handling of action lists will make the DUP action
redundant, as specifying several QUEUE actions will achieve the same
behavior. Besides, no PMD implements this action.
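
For illustration, once the reworked action list semantics from this series
are in place, duplicating traffic to two queues, which previously required
combining QUEUE with DUP, can be expressed with two QUEUE actions; a hedged
sketch with hypothetical queue indices:

  /* Action list fragment, illustration only: send matching packets to both
   * queue 5 and queue 6.
   */
  static const struct rte_flow_action_queue q5 = { .index = 5 };
  static const struct rte_flow_action_queue q6 = { .index = 6 };
  const struct rte_flow_action actions[] = {
          { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q5 },
          { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q6 },
          { .type = RTE_FLOW_ACTION_TYPE_END },
  };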

By removing an entry from enum rte_flow_action_type, this patch breaks ABI
compatibility for the following public functions:

- rte_flow_copy()
- rte_flow_create()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 23 -----------------------
 app/test-pmd/config.c                       |  1 -
 doc/guides/prog_guide/rte_flow.rst          | 23 -----------------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  8 --------
 lib/librte_ether/rte_ethdev_version.map     |  2 +-
 lib/librte_ether/rte_flow.c                 |  1 -
 lib/librte_ether/rte_flow.h                 | 24 ------------------------
 7 files changed, 1 insertion(+), 81 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 30450f1a4..9702b3ef3 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -164,8 +164,6 @@ enum index {
 	ACTION_QUEUE_INDEX,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
-	ACTION_DUP_INDEX,
 	ACTION_RSS,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
@@ -625,7 +623,6 @@ static const enum index next_action[] = {
 	ACTION_QUEUE,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
@@ -645,12 +642,6 @@ static const enum index action_queue[] = {
 	ZERO,
 };
 
-static const enum index action_dup[] = {
-	ACTION_DUP_INDEX,
-	ACTION_NEXT,
-	ZERO,
-};
-
 static const enum index action_rss[] = {
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
@@ -1597,20 +1588,6 @@ static const struct token token_list[] = {
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
-	[ACTION_DUP] = {
-		.name = "dup",
-		.help = "duplicate packets to a given queue index",
-		.priv = PRIV_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
-		.next = NEXT(action_dup),
-		.call = parse_vc,
-	},
-	[ACTION_DUP_INDEX] = {
-		.name = "index",
-		.help = "queue index to duplicate packets to",
-		.next = NEXT(action_dup, NEXT_ENTRY(UNSIGNED)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_dup, index)),
-		.call = parse_vc_conf,
-	},
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 7ae0295f6..8d42ea9a9 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1049,7 +1049,6 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 51826d04c..a237e4fd2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1299,26 +1299,6 @@ Query structure to retrieve and reset flow rule counters:
    | ``bytes``     | out | number of bytes through this rule |
    +---------------+-----+-----------------------------------+
 
-Action: ``DUP``
-^^^^^^^^^^^^^^^
-
-Duplicates packets to a given queue index.
-
-This is normally combined with QUEUE, however when used alone, it is
-actually similar to QUEUE + PASSTHRU.
-
-- Non-terminating by default.
-
-.. _table_rte_flow_action_dup:
-
-.. table:: DUP
-
-   +-----------+------------------------------------+
-   | Field     | Value                              |
-   +===========+====================================+
-   | ``index`` | queue index to duplicate packet to |
-   +-----------+------------------------------------+
-
 Action: ``RSS``
 ^^^^^^^^^^^^^^^
 
@@ -2010,9 +1990,6 @@ Unsupported actions
   and tagging (`Action: MARK`_ or `Action: FLAG`_) may be implemented in
   software as long as the target queue is used by a single rule.
 
-- A rule specifying both `Action: DUP`_ + `Action: QUEUE`_ may be translated
-  to two hidden rules combining `Action: QUEUE`_ and `Action: PASSTHRU`_.
-
 - When a single target queue is provided, `Action: RSS`_ can also be
   implemented through `Action: QUEUE`_.
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index cb6f201e1..a015d02a4 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3363,10 +3363,6 @@ actions can sometimes be combined when the end result is unambiguous::
 
 ::
 
-   drop / dup index 6 / end # same as above
-
-::
-
    queue index 6 / rss queues 6 7 8 / end # queue has no effect
 
 ::
@@ -3400,10 +3396,6 @@ This section lists supported actions and their attributes, if any.
 
 - ``count``: enable counters for this rule.
 
-- ``dup``: duplicate packets to a given queue index.
-
-  - ``index {unsigned}``: queue index to duplicate packets to.
-
 - ``rss``: spread packets among several queues.
 
   - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index e915e7929..8f1ae5ed2 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -147,7 +147,6 @@ DPDK_17.08 {
 
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
-	rte_flow_copy;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -199,6 +198,7 @@ DPDK_18.02 {
 DPDK_18.05 {
 	global:
 
+	rte_flow_copy;
 	rte_flow_create;
 	rte_flow_destroy;
 	rte_flow_error_set;
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index ba6feddee..db04c4f94 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -73,7 +73,6 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 36fd38ffa..aab637a2c 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -961,16 +961,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_COUNT,
 
 	/**
-	 * Duplicates packets to a given queue index.
-	 *
-	 * This is normally combined with QUEUE, however when used alone, it
-	 * is actually similar to QUEUE + PASSTHRU.
-	 *
-	 * See struct rte_flow_action_dup.
-	 */
-	RTE_FLOW_ACTION_TYPE_DUP,
-
-	/**
 	 * Similar to QUEUE, except RSS is additionally performed on packets
 	 * to spread them among several queues according to the provided
 	 * parameters.
@@ -1052,20 +1042,6 @@ struct rte_flow_query_count {
 };
 
 /**
- * RTE_FLOW_ACTION_TYPE_DUP
- *
- * Duplicates packets to a given queue index.
- *
- * This is normally combined with QUEUE, however when used alone, it is
- * actually similar to QUEUE + PASSTHRU.
- *
- * Non-terminating by default.
- */
-struct rte_flow_action_dup {
-	uint16_t index; /**< Queue index to duplicate packets to. */
-};
-
-/**
  * RTE_FLOW_ACTION_TYPE_RSS
  *
  * Similar to QUEUE, except RSS is additionally performed on packets to
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
@ 2018-04-06 13:25  3%   ` Adrien Mazarguil
  2018-04-07  9:15  0%     ` Andrew Rybchenko
  2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 04/15] ethdev: remove DUP action from " Adrien Mazarguil
                     ` (12 subsequent siblings)
  13 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

These new error types enable more precise reporting of the objects
responsible for errors.
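
As a hedged sketch of how a PMD validation path might use the finer-grained
types (hypothetical helper and message, not taken from any driver):

  /* Point the application at the offending mask instead of the whole item. */
  if (!mask_is_supported(item->mask)) {
          rte_flow_error_set(error, ENOTSUP,
                             RTE_FLOW_ERROR_TYPE_ITEM_MASK, item->mask,
                             "unsupported bits set in item mask");
          return -rte_errno;
  }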

This breaks ABI compatibility for the following public functions:

- rte_flow_create()
- rte_flow_destroy()
- rte_flow_error_set()
- rte_flow_flush()
- rte_flow_isolate()
- rte_flow_query()
- rte_flow_validate()

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/config.c                   |  4 ++++
 lib/librte_ether/rte_ethdev_version.map | 20 +++++++++++++-------
 lib/librte_ether/rte_flow.h             |  4 ++++
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 2058e6ec8..7ae0295f6 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1228,8 +1228,12 @@ port_flow_complain(struct rte_flow_error *error)
 		[RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
 		[RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
 		[RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
+		[RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
+		[RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
+		[RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
 		[RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
 		[RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
+		[RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
 		[RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
 	};
 	const char *errstr;
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index 34df6c8b5..e915e7929 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -127,11 +127,6 @@ DPDK_17.02 {
 
 	_rte_eth_dev_reset;
 	rte_eth_dev_fw_version_get;
-	rte_flow_create;
-	rte_flow_destroy;
-	rte_flow_flush;
-	rte_flow_query;
-	rte_flow_validate;
 
 } DPDK_16.07;
 
@@ -153,7 +148,6 @@ DPDK_17.08 {
 	_rte_eth_dev_callback_process;
 	rte_eth_dev_adjust_nb_rx_tx_desc;
 	rte_flow_copy;
-	rte_flow_isolate;
 	rte_tm_capabilities_get;
 	rte_tm_hierarchy_commit;
 	rte_tm_level_capabilities_get;
@@ -192,7 +186,6 @@ DPDK_17.11 {
 	rte_eth_dev_get_sec_ctx;
 	rte_eth_dev_pool_ops_supported;
 	rte_eth_dev_reset;
-	rte_flow_error_set;
 
 } DPDK_17.08;
 
@@ -203,6 +196,19 @@ DPDK_18.02 {
 
 } DPDK_17.11;
 
+DPDK_18.05 {
+	global:
+
+	rte_flow_create;
+	rte_flow_destroy;
+	rte_flow_error_set;
+	rte_flow_flush;
+	rte_flow_isolate;
+	rte_flow_query;
+	rte_flow_validate;
+
+} DPDK_18.02;
+
 EXPERIMENTAL {
 	global:
 
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index cdaaa3a5b..95799fd9c 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1186,8 +1186,12 @@ enum rte_flow_error_type {
 	RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, /**< Egress field. */
 	RTE_FLOW_ERROR_TYPE_ATTR, /**< Attributes structure. */
 	RTE_FLOW_ERROR_TYPE_ITEM_NUM, /**< Pattern length. */
+	RTE_FLOW_ERROR_TYPE_ITEM_SPEC, /**< Item specification. */
+	RTE_FLOW_ERROR_TYPE_ITEM_LAST, /**< Item specification range. */
+	RTE_FLOW_ERROR_TYPE_ITEM_MASK, /**< Item specification mask. */
 	RTE_FLOW_ERROR_TYPE_ITEM, /**< Specific pattern item. */
 	RTE_FLOW_ERROR_TYPE_ACTION_NUM, /**< Number of actions. */
+	RTE_FLOW_ERROR_TYPE_ACTION_CONF, /**< Action configuration. */
 	RTE_FLOW_ERROR_TYPE_ACTION, /**< Specific action. */
 };
 
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads
  2018-04-04 15:56  4% [dpdk-dev] [PATCH v1 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
                   ` (2 preceding siblings ...)
  2018-04-04 15:56  2% ` [dpdk-dev] [PATCH v1 10/16] ethdev: add encap level to RSS flow API action Adrien Mazarguil
@ 2018-04-06 13:25  5% ` Adrien Mazarguil
  2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API Adrien Mazarguil
                     ` (13 more replies)
  3 siblings, 14 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:25 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

As summarized in a prior RFC [1], the flow API (rte_flow) was chosen as a
means to manage switch offloads supported by many devices (usually going by
names such as E-Switch or vSwitch) through user-specified flow rules.

Combined with the need to support encap/decap actions, this requires a
change in the way flow actions are processed (in order and possibly
repeated), which modifies the behavior of some of the existing actions, thus
warranting a major ABI breakage.
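
As an illustration of the new semantics (not part of this series), a
minimal sketch of an ordered action list using existing rte_flow types;
the mark id and queue index are arbitrary:

  #include <rte_flow.h>

  /* Actions now apply in the order they are listed (and may be
   * repeated): mark the packet first, then deliver it to Rx queue 1. */
  static const struct rte_flow_action_mark mark = { .id = 42 };
  static const struct rte_flow_action_queue queue = { .index = 1 };
  static const struct rte_flow_action actions[] = {
          { .type = RTE_FLOW_ACTION_TYPE_MARK, .conf = &mark },
          { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
          { .type = RTE_FLOW_ACTION_TYPE_END },
  };

Such a list is passed unchanged to rte_flow_validate() and
rte_flow_create() along with the flow attributes and pattern.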

Given this ABI breakage is also required by other work submitted for the
current release [2][3], this series addresses various longstanding issues
with the flow API and makes minor improvements in preparation for upcoming
features.

Changes summary:

- Additional error types.
- Clearer documentation.
- Improved C++ compatibility.
- Exhaustive RSS action.
- Consistent behavior of VLAN pattern item.
- New "transfer" attribute bringing consistency to VF/PF pattern items.
- Confusing "PORT" pattern item renamed "PHY_PORT", with new action
  counterpart.
- New "PORT_ID" pattern item and action to be used with port representors.

This series piggybacks on the major ABI update introduced by a prior
commit [4] for DPDK 18.05 and depends on several fixes [5] which must be
applied first.

[1] "[RFC] Switch device offload with DPDK"
    http://dpdk.org/ml/archives/dev/2018-March/092513.html

[2] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

[3] "[PATCH v1 00/21] MLX5 tunnel Rx offloading"
    http://dpdk.org/ml/archives/dev/2018-March/092264.html

[4] commit 653e038efc9b ("ethdev: remove versioning of filter control
    function")

[5] "[PATCH v3 00/11] Bunch of flow API-related fixes"
    http://dpdk.org/ml/archives/dev/2018-April/095762.html

v2 changes:

- Squashed "ethdev: update ABI for flow API functions" in subsequent
  patches.
- Emphasized ABI impact in relevant commit logs.
- Modified documentation in "ethdev: alter behavior of flow API actions" to
  describe how terminating flow rules without any action of the fate kind
  result in undefined behavior instead of dropping traffic.
- Fixed other minor documentation formatting issues.
- Modified "ethdev: refine TPID handling in flow API" as follows:
  - Using standard macro definitions for VLAN, QinQ and E-Tag EtherTypes.
  - Fixed endian conversion in sfc.
  - Replaced a condition in VLAN pattern item processing with an assertion
    check for i40e.

Adrien Mazarguil (15):
  ethdev: add error types to flow API
  ethdev: clarify flow API pattern items and actions
  doc: remove flow API migration section
  ethdev: remove DUP action from flow API
  ethdev: alter behavior of flow API actions
  ethdev: remove C99 flexible arrays from flow API
  ethdev: flatten RSS configuration in flow API
  ethdev: add hash function to RSS flow API action
  ethdev: add encap level to RSS flow API action
  ethdev: refine TPID handling in flow API
  ethdev: add transfer attribute to flow API
  ethdev: update behavior of VF/PF in flow API
  ethdev: rename physical port item in flow API
  ethdev: add physical port action to flow API
  ethdev: add port ID item and action to flow API

 app/test-pmd/cmdline_flow.c                 | 405 ++++++++++-----
 app/test-pmd/config.c                       |  78 +--
 doc/guides/nics/tap.rst                     |   2 +-
 doc/guides/prog_guide/rte_flow.rst          | 602 ++++++++---------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  54 +-
 drivers/net/bnxt/bnxt_filter.c              |  53 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  83 +++-
 drivers/net/e1000/igb_rxtx.c                |  55 ++-
 drivers/net/enic/enic_flow.c                |  53 +-
 drivers/net/i40e/i40e_ethdev.c              |  57 ++-
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                | 139 ++++--
 drivers/net/ixgbe/ixgbe_ethdev.c            |   7 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  91 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  55 ++-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                | 117 +++--
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 319 ++++++------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +-
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +-
 drivers/net/mvpp2/mrvl_flow.c               |  33 +-
 drivers/net/sfc/sfc_flow.c                  |  83 +++-
 drivers/net/tap/tap_flow.c                  |  51 +-
 examples/ipsec-secgw/ipsec.c                |  21 +-
 lib/librte_ether/rte_ethdev_version.map     |  22 +-
 lib/librte_ether/rte_flow.c                 |  68 +--
 lib/librte_ether/rte_flow.h                 | 329 ++++++++-----
 lib/librte_net/rte_ether.h                  |   1 +
 34 files changed, 1755 insertions(+), 1124 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 5%]

* [dpdk-dev] [PATCH v3 00/11] Bunch of flow API-related fixes
  2018-04-04 14:57  3% ` [dpdk-dev] [PATCH v2 00/13] " Adrien Mazarguil
  2018-04-04 14:58  4%   ` [dpdk-dev] [PATCH v2 12/13] ethdev: fix ABI version in meson build Adrien Mazarguil
@ 2018-04-06 13:22  3%   ` Adrien Mazarguil
  2018-04-10 16:34  3%     ` [dpdk-dev] [PATCH v4 " Adrien Mazarguil
  1 sibling, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-06 13:22 UTC (permalink / raw)
  To: dev

This series contains several fixes for rte_flow and its implementation in
PMDs and testpmd. Upcoming work on the flow API depends on it.

v3 changes:

- Rebased series.
- Dropped unnecessary "net/sfc: fix endian conversions in flow API".
- Dropped "ethdev: fix ABI version in meson build", handled by prior commit
  d9736a248785 ("ethdev: fix library version in meson build").

v2 changes:

- mlx5 fix (patch #3).
- bnxt fix (patch #4).
- sfc fix (patch #6).
- Missing include (patch #13).

Adrien Mazarguil (11):
  net/mlx4: fix RSS resource leak in case of error
  net/mlx4: fix ignored RSS hash types
  net/mlx5: fix RSS flow action bounds check
  net/bnxt: fix matching of flow API item masks
  app/testpmd: fix flow completion for RSS queues
  app/testpmd: fix lack of flow action configuration
  app/testpmd: fix RSS flow action configuration
  app/testpmd: fix missing RSS fields in flow action
  ethdev: fix shallow copy of flow API RSS action
  ethdev: fix missing boolean values in flow command
  ethdev: fix missing include in flow API

 app/test-pmd/cmdline_flow.c                 | 255 ++++++++++++++++++++---
 app/test-pmd/config.c                       | 160 +++++++++-----
 app/test-pmd/testpmd.h                      |  13 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   8 +
 drivers/net/bnxt/bnxt_filter.c              |  14 +-
 drivers/net/mlx4/mlx4_flow.c                |  17 +-
 drivers/net/mlx5/mlx5_flow.c                |   9 +
 lib/librte_ether/rte_flow.c                 | 145 +++++++++----
 lib/librte_ether/rte_flow.h                 |   2 +
 9 files changed, 495 insertions(+), 128 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH] eal/service: remove experimental tags
  2018-04-05 13:15  9% [dpdk-dev] [PATCH] eal/service: remove experimental tags Harry van Haaren
@ 2018-04-06  6:18  0% ` Jerin Jacob
  0 siblings, 0 replies; 200+ results
From: Jerin Jacob @ 2018-04-06  6:18 UTC (permalink / raw)
  To: Harry van Haaren; +Cc: dev

-----Original Message-----
> Date: Thu, 5 Apr 2018 14:15:46 +0100
> From: Harry van Haaren <harry.van.haaren@intel.com>
> To: dev@dpdk.org
> CC: Harry van Haaren <harry.van.haaren@intel.com>
> Subject: [dpdk-dev] [PATCH] eal/service: remove experimental tags
> X-Mailer: git-send-email 2.7.4
> 
> This commit removes the experimental tags from the
> service cores functions, they now become part of the
> main DPDK API/ABI.
> 
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

Acked-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>

> 

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH 3/3] net/netvsc: add hyper-v netvsc network device
  2018-04-05 21:07  3%     ` Thomas Monjalon
@ 2018-04-05 21:19  0%       ` Stephen Hemminger
  0 siblings, 0 replies; 200+ results
From: Stephen Hemminger @ 2018-04-05 21:19 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Stephen Hemminger, dev

On Thu, 05 Apr 2018 23:07:45 +0200
Thomas Monjalon <thomas@monjalon.net> wrote:

> 05/04/2018 22:59, Stephen Hemminger:
> > On Thu, 05 Apr 2018 22:52:31 +0200
> > Thomas Monjalon <thomas@monjalon.net> wrote:
> >   
> > > Hi Stephen,
> > > 
> > > Good to see there is a good progress.
> > > 
> > > This patch should add an entry in the release notes.
> > > But I guess it is not ready for 18.05?  
> 
> [...]
> > > > +The following prerequisites apply:
> > > > +
> > > > +*   Linux kernel uio_hv_generic driver that supports subchannels. This should be present in 4.17 or later.    
> > > 
> > > The DPDK policy is to wait for the prerequisite to be available before merging.  
> > 
> > Does linux-next count?  
> 
> I would say no, but I could be convinced of the contrary.
> Can we have ABI breakage from linux-next to mainline?
> What is the benefit of pushing the PMD early?

There are already people using earlier versions and sending feedback.

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH 3/3] net/netvsc: add hyper-v netvsc network device
  @ 2018-04-05 21:07  3%     ` Thomas Monjalon
  2018-04-05 21:19  0%       ` Stephen Hemminger
  0 siblings, 1 reply; 200+ results
From: Thomas Monjalon @ 2018-04-05 21:07 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Stephen Hemminger, dev

05/04/2018 22:59, Stephen Hemminger:
> On Thu, 05 Apr 2018 22:52:31 +0200
> Thomas Monjalon <thomas@monjalon.net> wrote:
> 
> > Hi Stephen,
> > 
> > Good to see there is a good progress.
> > 
> > This patch should add an entry in the release notes.
> > But I guess it is not ready for 18.05?

[...]
> > > +The following prerequisites apply:
> > > +
> > > +*   Linux kernel uio_hv_generic driver that supports subchannels. This should be present in 4.17 or later.  
> > 
> > The DPDK policy is to wait for the prerequisite to be available before merging.
> 
> Does linux-next count?

I would say no, but I could be convinced of the contrary.
Can we have ABI breakage from linux-next to mainline?
What is the benefit of pushing the PMD early?

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v5] ethdev: replace bus specific struct with generic dev
    @ 2018-04-05 16:40  2%   ` Ferruh Yigit
  2018-04-09 12:09  2%     ` [dpdk-dev] [PATCH v6] " Ferruh Yigit
  1 sibling, 1 reply; 200+ results
From: Ferruh Yigit @ 2018-04-05 16:40 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, Ferruh Yigit, Shreyansh Jain, Allain Legacy,
	Tomasz Duszynski, Santosh Shukla, David Marchand

Public struct rte_eth_dev_info has a "struct rte_pci_device" field in it
although the structure is common to ethdevs on all buses, not only PCI.

Replace the PCI-specific struct with the generic device struct and update
the places that were using the PCI device so that they get this information
from the generic device instead.
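
For illustration, a minimal sketch of how an application can still reach
PCI details after this change, using the same rte_bus_find_by_device() /
RTE_DEV_TO_PCI() pattern applied to testpmd and the examples below (the
helper name is made up):

  #include <stdio.h>
  #include <string.h>
  #include <rte_bus.h>
  #include <rte_bus_pci.h>
  #include <rte_ethdev.h>

  /* Print the PCI address of a port, if it is backed by a PCI device. */
  static void
  print_port_pci_addr(uint16_t port_id)
  {
          struct rte_eth_dev_info dev_info;
          const struct rte_bus *bus = NULL;

          rte_eth_dev_info_get(port_id, &dev_info);
          if (dev_info.device)
                  bus = rte_bus_find_by_device(dev_info.device);
          if (bus && !strcmp(bus->name, "pci")) {
                  const struct rte_pci_device *pci_dev =
                          RTE_DEV_TO_PCI(dev_info.device);

                  printf("port %u: %04x:%02x:%02x.%x\n", port_id,
                         pci_dev->addr.domain, pci_dev->addr.bus,
                         pci_dev->addr.devid, pci_dev->addr.function);
          } else {
                  printf("port %u: not a PCI device\n", port_id);
          }
  }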

Signed-off-by: Ferruh Yigit <ferruh.yigit@intel.com>
Reviewed-by: David Marchand <david.marchand@6wind.com>
Acked-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
---
Cc: Shreyansh Jain <shreyansh.jain@nxp.com>
Cc: Allain Legacy <allain.legacy@windriver.com>
Cc: Tomasz Duszynski <tdu@semihalf.com>
Cc: Santosh Shukla <santosh.shukla@caviumnetworks.com>
Cc: David Marchand <david.marchand@6wind.com>

v2:
- prevent possible crash while getting bus (Pablo)
- Remove unnecessary __rte_unused
- Some PMDs assigned dev_info->device to NULL, fixed them

v3:
- rebased on latest next-net

v4:
- Move dev_info->device assignment to ethdev layer

v5:
- Document API change in related section in release notes
---
 app/test-pmd/config.c                   | 18 +++++++++++++++-
 app/test-pmd/testpmd.h                  | 38 +++++++++++++++++++++++++++------
 doc/guides/rel_notes/release_18_05.rst  |  3 +++
 drivers/net/ark/ark_ethdev.c            |  1 -
 drivers/net/avf/avf_ethdev.c            |  1 -
 drivers/net/avp/avp_ethdev.c            |  1 -
 drivers/net/bnx2x/bnx2x_ethdev.c        |  1 -
 drivers/net/bnxt/bnxt_ethdev.c          |  2 --
 drivers/net/cxgbe/cxgbe_ethdev.c        |  2 --
 drivers/net/e1000/em_ethdev.c           |  1 -
 drivers/net/e1000/igb_ethdev.c          |  2 --
 drivers/net/ena/ena_ethdev.c            |  2 --
 drivers/net/enic/enic_ethdev.c          |  1 -
 drivers/net/fm10k/fm10k_ethdev.c        |  1 -
 drivers/net/i40e/i40e_ethdev.c          |  1 -
 drivers/net/i40e/i40e_ethdev_vf.c       |  1 -
 drivers/net/ixgbe/ixgbe_ethdev.c        |  2 --
 drivers/net/kni/rte_eth_kni.c           |  1 -
 drivers/net/liquidio/lio_ethdev.c       |  2 --
 drivers/net/mlx4/mlx4_ethdev.c          |  1 -
 drivers/net/mlx5/mlx5_ethdev.c          |  1 -
 drivers/net/nfp/nfp_net.c               |  1 -
 drivers/net/octeontx/octeontx_ethdev.c  |  1 -
 drivers/net/qede/qede_ethdev.c          |  1 -
 drivers/net/sfc/sfc_ethdev.c            |  1 -
 drivers/net/szedata2/rte_eth_szedata2.c |  1 -
 drivers/net/tap/rte_eth_tap.c           |  1 -
 drivers/net/thunderx/nicvf_ethdev.c     |  2 --
 drivers/net/virtio/virtio_ethdev.c      |  1 -
 drivers/net/vmxnet3/vmxnet3_ethdev.c    |  4 +---
 examples/ethtool/lib/rte_ethtool.c      | 16 ++++++++------
 examples/ip_pipeline/init.c             | 11 ++++++++--
 examples/kni/main.c                     | 11 +++++++---
 lib/librte_ether/rte_ethdev.c           |  1 +
 lib/librte_ether/rte_ethdev.h           |  2 +-
 test/test/test_kni.c                    | 35 ++++++++++++++++++++++++------
 36 files changed, 111 insertions(+), 61 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 4bb255c62..dd051f5ca 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -754,6 +754,8 @@ vlan_id_is_invalid(uint16_t vlan_id)
 static int
 port_reg_off_is_invalid(portid_t port_id, uint32_t reg_off)
 {
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 	uint64_t pci_len;
 
 	if (reg_off & 0x3) {
@@ -762,7 +764,21 @@ port_reg_off_is_invalid(portid_t port_id, uint32_t reg_off)
 		       (unsigned)reg_off);
 		return 1;
 	}
-	pci_len = ports[port_id].dev_info.pci_dev->mem_resource[0].len;
+
+	if (!ports[port_id].dev_info.device) {
+		printf("Invalid device\n");
+		return 0;
+	}
+
+	bus = rte_bus_find_by_device(ports[port_id].dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(ports[port_id].dev_info.device);
+	} else {
+		printf("Not a PCI device\n");
+		return 1;
+	}
+
+	pci_len = pci_dev->mem_resource[0].len;
 	if (reg_off >= pci_len) {
 		printf("Port %d: register offset %u (0x%X) out of port PCI "
 		       "resource (length=%"PRIu64")\n",
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 153abea05..4d84e7b00 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -500,12 +500,25 @@ mbuf_pool_find(unsigned int sock_id)
 static inline uint32_t
 port_pci_reg_read(struct rte_port *port, uint32_t reg_off)
 {
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 	void *reg_addr;
 	uint32_t reg_v;
 
-	reg_addr = (void *)
-		((char *)port->dev_info.pci_dev->mem_resource[0].addr +
-			reg_off);
+	if (!port->dev_info.device) {
+		printf("Invalid device\n");
+		return 0;
+	}
+
+	bus = rte_bus_find_by_device(port->dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(port->dev_info.device);
+	} else {
+		printf("Not a PCI device\n");
+		return 0;
+	}
+
+	reg_addr = ((char *)pci_dev->mem_resource[0].addr + reg_off);
 	reg_v = *((volatile uint32_t *)reg_addr);
 	return rte_le_to_cpu_32(reg_v);
 }
@@ -516,11 +529,24 @@ port_pci_reg_read(struct rte_port *port, uint32_t reg_off)
 static inline void
 port_pci_reg_write(struct rte_port *port, uint32_t reg_off, uint32_t reg_v)
 {
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 	void *reg_addr;
 
-	reg_addr = (void *)
-		((char *)port->dev_info.pci_dev->mem_resource[0].addr +
-			reg_off);
+	if (!port->dev_info.device) {
+		printf("Invalid device\n");
+		return;
+	}
+
+	bus = rte_bus_find_by_device(port->dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(port->dev_info.device);
+	} else {
+		printf("Not a PCI device\n");
+		return;
+	}
+
+	reg_addr = ((char *)pci_dev->mem_resource[0].addr + reg_off);
 	*((volatile uint32_t *)reg_addr) = rte_cpu_to_le_32(reg_v);
 }
 
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index e5fac1cd1..ffa5cae39 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -72,6 +72,9 @@ API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* ethdev, in struct ``struct rte_eth_dev_info``, field ``rte_pci_device *pci_dev``
+  replaced with field ``struct rte_device *device``.
+
 
 ABI Changes
 -----------
diff --git a/drivers/net/ark/ark_ethdev.c b/drivers/net/ark/ark_ethdev.c
index ff87c20e2..c9d541921 100644
--- a/drivers/net/ark/ark_ethdev.c
+++ b/drivers/net/ark/ark_ethdev.c
@@ -771,7 +771,6 @@ eth_ark_dev_info_get(struct rte_eth_dev *dev,
 				ETH_LINK_SPEED_40G |
 				ETH_LINK_SPEED_50G |
 				ETH_LINK_SPEED_100G);
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 }
 
 static int
diff --git a/drivers/net/avf/avf_ethdev.c b/drivers/net/avf/avf_ethdev.c
index b59e3cf79..8e2a1b066 100644
--- a/drivers/net/avf/avf_ethdev.c
+++ b/drivers/net/avf/avf_ethdev.c
@@ -507,7 +507,6 @@ avf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct avf_info *vf = AVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
 
 	memset(dev_info, 0, sizeof(*dev_info));
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->max_tx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->min_rx_bufsize = AVF_BUF_SIZE_MIN;
diff --git a/drivers/net/avp/avp_ethdev.c b/drivers/net/avp/avp_ethdev.c
index a07a288ed..5b3c4cebf 100644
--- a/drivers/net/avp/avp_ethdev.c
+++ b/drivers/net/avp/avp_ethdev.c
@@ -2172,7 +2172,6 @@ avp_dev_info_get(struct rte_eth_dev *eth_dev,
 {
 	struct avp_dev *avp = AVP_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 	dev_info->max_rx_queues = avp->max_rx_queues;
 	dev_info->max_tx_queues = avp->max_tx_queues;
 	dev_info->min_rx_bufsize = AVP_MIN_RX_BUFSIZE;
diff --git a/drivers/net/bnx2x/bnx2x_ethdev.c b/drivers/net/bnx2x/bnx2x_ethdev.c
index 483d5a17c..8726b357a 100644
--- a/drivers/net/bnx2x/bnx2x_ethdev.c
+++ b/drivers/net/bnx2x/bnx2x_ethdev.c
@@ -447,7 +447,6 @@ static void
 bnx2x_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct bnx2x_softc *sc = dev->data->dev_private;
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues  = sc->max_rx_queues;
 	dev_info->max_tx_queues  = sc->max_tx_queues;
 	dev_info->min_rx_bufsize = BNX2X_MIN_RX_BUF_SIZE;
diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 7c007c8f9..c447cd727 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -379,8 +379,6 @@ static void bnxt_dev_info_get_op(struct rte_eth_dev *eth_dev,
 	uint16_t max_vnics, i, j, vpool, vrxq;
 	unsigned int max_rx_rings;
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
-
 	/* MAC Specifics */
 	dev_info->max_mac_addrs = bp->max_l2_ctx;
 	dev_info->max_hash_mac_addrs = 0;
diff --git a/drivers/net/cxgbe/cxgbe_ethdev.c b/drivers/net/cxgbe/cxgbe_ethdev.c
index feae01d6a..50e1fcece 100644
--- a/drivers/net/cxgbe/cxgbe_ethdev.c
+++ b/drivers/net/cxgbe/cxgbe_ethdev.c
@@ -121,8 +121,6 @@ void cxgbe_dev_info_get(struct rte_eth_dev *eth_dev,
 		.nb_align = 1,
 	};
 
-	device_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
-
 	device_info->min_rx_bufsize = CXGBE_MIN_RX_BUFSIZE;
 	device_info->max_rx_pktlen = CXGBE_MAX_RX_PKTLEN;
 	device_info->max_rx_queues = max_queues;
diff --git a/drivers/net/e1000/em_ethdev.c b/drivers/net/e1000/em_ethdev.c
index 087c192d5..c6062468c 100644
--- a/drivers/net/e1000/em_ethdev.c
+++ b/drivers/net/e1000/em_ethdev.c
@@ -1070,7 +1070,6 @@ eth_em_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->min_rx_bufsize = 256; /* See BSIZE field of RCTL register. */
 	dev_info->max_rx_pktlen = em_get_max_pktlen(dev);
 	dev_info->max_mac_addrs = hw->mac.rar_entry_count;
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index 8d4226676..872357146 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -2144,7 +2144,6 @@ eth_igb_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->min_rx_bufsize = 256; /* See BSIZE field of RCTL register. */
 	dev_info->max_rx_pktlen  = 0x3FFF; /* See RLPML register. */
 	dev_info->max_mac_addrs = hw->mac.rar_entry_count;
@@ -2269,7 +2268,6 @@ eth_igbvf_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->min_rx_bufsize = 256; /* See BSIZE field of RCTL register. */
 	dev_info->max_rx_pktlen  = 0x3FFF; /* See RLPML register. */
 	dev_info->max_mac_addrs = hw->mac.rar_entry_count;
diff --git a/drivers/net/ena/ena_ethdev.c b/drivers/net/ena/ena_ethdev.c
index 34b2a8d78..a15436c99 100644
--- a/drivers/net/ena/ena_ethdev.c
+++ b/drivers/net/ena/ena_ethdev.c
@@ -1527,8 +1527,6 @@ static void ena_infos_get(struct rte_eth_dev *dev,
 	ena_dev = &adapter->ena_dev;
 	ena_assert_msg(ena_dev != NULL, "Uninitialized device");
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
 	dev_info->speed_capa =
 			ETH_LINK_SPEED_1G   |
 			ETH_LINK_SPEED_2_5G |
diff --git a/drivers/net/enic/enic_ethdev.c b/drivers/net/enic/enic_ethdev.c
index 03f0c2547..801f4704c 100644
--- a/drivers/net/enic/enic_ethdev.c
+++ b/drivers/net/enic/enic_ethdev.c
@@ -471,7 +471,6 @@ static void enicpmd_dev_info_get(struct rte_eth_dev *eth_dev,
 	struct enic *enic = pmd_priv(eth_dev);
 
 	ENICPMD_FUNC_TRACE();
-	device_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 	/* Scattered Rx uses two receive queues per rx queue exposed to dpdk */
 	device_info->max_rx_queues = enic->conf_rq_count / 2;
 	device_info->max_tx_queues = enic->conf_wq_count;
diff --git a/drivers/net/fm10k/fm10k_ethdev.c b/drivers/net/fm10k/fm10k_ethdev.c
index 61de4d772..34affd1cc 100644
--- a/drivers/net/fm10k/fm10k_ethdev.c
+++ b/drivers/net/fm10k/fm10k_ethdev.c
@@ -1404,7 +1404,6 @@ fm10k_dev_infos_get(struct rte_eth_dev *dev,
 
 	PMD_INIT_FUNC_TRACE();
 
-	dev_info->pci_dev            = pdev;
 	dev_info->min_rx_bufsize     = FM10K_MIN_RX_BUF_SIZE;
 	dev_info->max_rx_pktlen      = FM10K_MAX_PKT_SIZE;
 	dev_info->max_rx_queues      = hw->mac.max_queues;
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 6e06f8a2b..6a8a2cd2a 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -3212,7 +3212,6 @@ i40e_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct i40e_vsi *vsi = pf->main_vsi;
 	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 
-	dev_info->pci_dev = pci_dev;
 	dev_info->max_rx_queues = vsi->nb_qps;
 	dev_info->max_tx_queues = vsi->nb_qps;
 	dev_info->min_rx_bufsize = I40E_BUF_SIZE_MIN;
diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c
index 2908c87e0..f6d7f40b1 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -2183,7 +2183,6 @@ i40evf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct i40e_vf *vf = I40EVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
 
 	memset(dev_info, 0, sizeof(*dev_info));
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->max_tx_queues = vf->vsi_res->num_queue_pairs;
 	dev_info->min_rx_bufsize = I40E_BUF_SIZE_MIN;
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index fbc048f7d..bd1773978 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -3593,7 +3593,6 @@ ixgbe_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
 
-	dev_info->pci_dev = pci_dev;
 	dev_info->max_rx_queues = (uint16_t)hw->mac.max_rx_queues;
 	dev_info->max_tx_queues = (uint16_t)hw->mac.max_tx_queues;
 	if (RTE_ETH_DEV_SRIOV(dev).active == 0) {
@@ -3712,7 +3711,6 @@ ixgbevf_dev_info_get(struct rte_eth_dev *dev,
 	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = pci_dev;
 	dev_info->max_rx_queues = (uint16_t)hw->mac.max_rx_queues;
 	dev_info->max_tx_queues = (uint16_t)hw->mac.max_tx_queues;
 	dev_info->min_rx_bufsize = 1024; /* cf BSIZEPACKET in SRRCTL reg */
diff --git a/drivers/net/kni/rte_eth_kni.c b/drivers/net/kni/rte_eth_kni.c
index dc4e65f5d..c10e970c2 100644
--- a/drivers/net/kni/rte_eth_kni.c
+++ b/drivers/net/kni/rte_eth_kni.c
@@ -201,7 +201,6 @@ eth_kni_dev_info(struct rte_eth_dev *dev __rte_unused,
 	dev_info->max_rx_queues = KNI_MAX_QUEUE_PER_PORT;
 	dev_info->max_tx_queues = KNI_MAX_QUEUE_PER_PORT;
 	dev_info->min_rx_bufsize = 0;
-	dev_info->pci_dev = NULL;
 }
 
 static int
diff --git a/drivers/net/liquidio/lio_ethdev.c b/drivers/net/liquidio/lio_ethdev.c
index eeb8350e4..a13a566f9 100644
--- a/drivers/net/liquidio/lio_ethdev.c
+++ b/drivers/net/liquidio/lio_ethdev.c
@@ -373,8 +373,6 @@ lio_dev_info_get(struct rte_eth_dev *eth_dev,
 	struct lio_device *lio_dev = LIO_DEV(eth_dev);
 	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 
-	devinfo->pci_dev = pci_dev;
-
 	switch (pci_dev->id.subsystem_device_id) {
 	/* CN23xx 10G cards */
 	case PCI_SUBSYS_DEV_ID_CN2350_210:
diff --git a/drivers/net/mlx4/mlx4_ethdev.c b/drivers/net/mlx4/mlx4_ethdev.c
index 5f731e023..636100b23 100644
--- a/drivers/net/mlx4/mlx4_ethdev.c
+++ b/drivers/net/mlx4/mlx4_ethdev.c
@@ -556,7 +556,6 @@ mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	unsigned int max;
 	char ifname[IF_NAMESIZE];
 
-	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	/* FIXME: we should ask the device for these values. */
 	info->min_rx_bufsize = 32;
 	info->max_rx_pktlen = 65536;
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index cc85f76c0..44cdbb622 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -405,7 +405,6 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	unsigned int max;
 	char ifname[IF_NAMESIZE];
 
-	info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	/* FIXME: we should ask the device for these values. */
 	info->min_rx_bufsize = 32;
 	info->max_rx_pktlen = 65536;
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index 8591c7de0..d922b0259 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -1159,7 +1159,6 @@ nfp_net_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_queues = (uint16_t)hw->max_rx_queues;
 	dev_info->max_tx_queues = (uint16_t)hw->max_tx_queues;
 	dev_info->min_rx_bufsize = ETHER_MIN_MTU;
diff --git a/drivers/net/octeontx/octeontx_ethdev.c b/drivers/net/octeontx/octeontx_ethdev.c
index 4d7d1d7f4..1148652ea 100644
--- a/drivers/net/octeontx/octeontx_ethdev.c
+++ b/drivers/net/octeontx/octeontx_ethdev.c
@@ -617,7 +617,6 @@ octeontx_dev_info(struct rte_eth_dev *dev,
 	dev_info->max_rx_queues = 1;
 	dev_info->max_tx_queues = PKO_MAX_NUM_DQ;
 	dev_info->min_rx_bufsize = 0;
-	dev_info->pci_dev = NULL;
 
 	dev_info->default_rxconf = (struct rte_eth_rxconf) {
 		.rx_free_thresh = 0,
diff --git a/drivers/net/qede/qede_ethdev.c b/drivers/net/qede/qede_ethdev.c
index a4e9e753e..13c2a3b87 100644
--- a/drivers/net/qede/qede_ethdev.c
+++ b/drivers/net/qede/qede_ethdev.c
@@ -1549,7 +1549,6 @@ qede_dev_info_get(struct rte_eth_dev *eth_dev,
 
 	PMD_INIT_FUNC_TRACE(edev);
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
 	dev_info->min_rx_bufsize = (uint32_t)QEDE_MIN_RX_BUFF_SIZE;
 	dev_info->max_rx_pktlen = (uint32_t)ETH_TX_MAX_NON_LSO_PKT_LEN;
 	dev_info->rx_desc_lim = qede_rx_desc_lim;
diff --git a/drivers/net/sfc/sfc_ethdev.c b/drivers/net/sfc/sfc_ethdev.c
index 2af898e08..6631c5a7e 100644
--- a/drivers/net/sfc/sfc_ethdev.c
+++ b/drivers/net/sfc/sfc_ethdev.c
@@ -89,7 +89,6 @@ sfc_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	sfc_log_init(sa, "entry");
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->max_rx_pktlen = EFX_MAC_PDU_MAX;
 
 	/* Autonegotiation may be disabled */
diff --git a/drivers/net/szedata2/rte_eth_szedata2.c b/drivers/net/szedata2/rte_eth_szedata2.c
index 1d02aee6f..58c177398 100644
--- a/drivers/net/szedata2/rte_eth_szedata2.c
+++ b/drivers/net/szedata2/rte_eth_szedata2.c
@@ -1031,7 +1031,6 @@ eth_dev_info(struct rte_eth_dev *dev,
 		struct rte_eth_dev_info *dev_info)
 {
 	struct pmd_internals *internals = dev->data->dev_private;
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
 	dev_info->if_index = 0;
 	dev_info->max_mac_addrs = 1;
 	dev_info->max_rx_pktlen = (uint32_t)-1;
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index ed6d7380e..6a39398b8 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -737,7 +737,6 @@ tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
 	dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
 	dev_info->min_rx_bufsize = 0;
-	dev_info->pci_dev = NULL;
 	dev_info->speed_capa = tap_dev_speed_capa();
 	dev_info->rx_queue_offload_capa = tap_rx_offload_get_queue_capa();
 	dev_info->rx_offload_capa = tap_rx_offload_get_port_capa() |
diff --git a/drivers/net/thunderx/nicvf_ethdev.c b/drivers/net/thunderx/nicvf_ethdev.c
index 067f2243b..75e9d16c5 100644
--- a/drivers/net/thunderx/nicvf_ethdev.c
+++ b/drivers/net/thunderx/nicvf_ethdev.c
@@ -1400,8 +1400,6 @@ nicvf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	PMD_INIT_FUNC_TRACE();
 
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
 	/* Autonegotiation may be disabled */
 	dev_info->speed_capa = ETH_LINK_SPEED_FIXED;
 	dev_info->speed_capa |= ETH_LINK_SPEED_10M | ETH_LINK_SPEED_100M |
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 11f758929..d7c81747e 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -2064,7 +2064,6 @@ virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 
 	dev_info->speed_capa = ETH_LINK_SPEED_10G; /* fake value */
 
-	dev_info->pci_dev = dev->device ? RTE_ETH_DEV_TO_PCI(dev) : NULL;
 	dev_info->max_rx_queues =
 		RTE_MIN(hw->max_queue_pairs, VIRTIO_MAX_RX_QUEUES);
 	dev_info->max_tx_queues =
diff --git a/drivers/net/vmxnet3/vmxnet3_ethdev.c b/drivers/net/vmxnet3/vmxnet3_ethdev.c
index 426008722..01b4802e0 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethdev.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethdev.c
@@ -1022,11 +1022,9 @@ vmxnet3_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 }
 
 static void
-vmxnet3_dev_info_get(struct rte_eth_dev *dev,
+vmxnet3_dev_info_get(struct rte_eth_dev *dev __rte_unused,
 		     struct rte_eth_dev_info *dev_info)
 {
-	dev_info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
-
 	dev_info->max_rx_queues = VMXNET3_MAX_RX_QUEUES;
 	dev_info->max_tx_queues = VMXNET3_MAX_TX_QUEUES;
 	dev_info->min_rx_bufsize = 1518 + RTE_PKTMBUF_HEADROOM;
diff --git a/examples/ethtool/lib/rte_ethtool.c b/examples/ethtool/lib/rte_ethtool.c
index 90dfbb739..d519a50db 100644
--- a/examples/ethtool/lib/rte_ethtool.c
+++ b/examples/ethtool/lib/rte_ethtool.c
@@ -22,6 +22,8 @@ rte_ethtool_get_drvinfo(uint16_t port_id, struct ethtool_drvinfo *drvinfo)
 {
 	struct rte_eth_dev_info dev_info;
 	struct rte_dev_reg_info reg_info;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus = NULL;
 	int n;
 	int ret;
 
@@ -46,15 +48,17 @@ rte_ethtool_get_drvinfo(uint16_t port_id, struct ethtool_drvinfo *drvinfo)
 	snprintf(drvinfo->version, sizeof(drvinfo->version), "%s",
 		rte_version());
 	/* TODO: replace bus_info by rte_devargs.name */
-	if (dev_info.pci_dev)
+	if (dev_info.device)
+		bus = rte_bus_find_by_device(dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
 		snprintf(drvinfo->bus_info, sizeof(drvinfo->bus_info),
 			"%04x:%02x:%02x.%x",
-			dev_info.pci_dev->addr.domain,
-			dev_info.pci_dev->addr.bus,
-			dev_info.pci_dev->addr.devid,
-			dev_info.pci_dev->addr.function);
-	else
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function);
+	} else {
 		snprintf(drvinfo->bus_info, sizeof(drvinfo->bus_info), "N/A");
+	}
 
 	memset(&reg_info, 0, sizeof(reg_info));
 	rte_eth_dev_get_reg_info(port_id, &reg_info);
diff --git a/examples/ip_pipeline/init.c b/examples/ip_pipeline/init.c
index bb07efa13..87e998828 100644
--- a/examples/ip_pipeline/init.c
+++ b/examples/ip_pipeline/init.c
@@ -1266,6 +1266,8 @@ app_init_kni(struct app_params *app) {
 		struct rte_eth_dev_info dev_info;
 		struct app_mempool_params *mempool_params;
 		struct rte_mempool *mempool;
+		const struct rte_pci_device *pci_dev;
+		const struct rte_bus *bus = NULL;
 		struct rte_kni_conf conf;
 		struct rte_kni_ops ops;
 
@@ -1297,8 +1299,13 @@ app_init_kni(struct app_params *app) {
 		}
 		conf.group_id = p_link->pmd_id;
 		conf.mbuf_size = mempool_params->buffer_size;
-		conf.addr = dev_info.pci_dev->addr;
-		conf.id = dev_info.pci_dev->id;
+		if (dev_info.device)
+			bus = rte_bus_find_by_device(dev_info.device);
+		if (bus && !strcmp(bus->name, "pci")) {
+			pci_dev = RTE_DEV_TO_PCI(dev_info.device);
+			conf.addr = pci_dev->addr;
+			conf.id = pci_dev->id;
+		}
 
 		memset(&ops, 0, sizeof(ops));
 		ops.port_id = (uint8_t) p_link->pmd_id;
diff --git a/examples/kni/main.c b/examples/kni/main.c
index 0d9980ee1..aebfedd59 100644
--- a/examples/kni/main.c
+++ b/examples/kni/main.c
@@ -834,13 +834,18 @@ kni_alloc(uint16_t port_id)
 		if (i == 0) {
 			struct rte_kni_ops ops;
 			struct rte_eth_dev_info dev_info;
+			const struct rte_pci_device *pci_dev;
+			const struct rte_bus *bus = NULL;
 
 			memset(&dev_info, 0, sizeof(dev_info));
 			rte_eth_dev_info_get(port_id, &dev_info);
 
-			if (dev_info.pci_dev) {
-				conf.addr = dev_info.pci_dev->addr;
-				conf.id = dev_info.pci_dev->id;
+			if (dev_info.device)
+				bus = rte_bus_find_by_device(dev_info.device);
+			if (bus && !strcmp(bus->name, "pci")) {
+				pci_dev = RTE_DEV_TO_PCI(dev_info.device);
+				conf.addr = pci_dev->addr;
+				conf.id = pci_dev->id;
 			}
 			/* Get the interface default mac address */
 			rte_eth_macaddr_get(port_id,
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e04..90c47ad12 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -2395,6 +2395,7 @@ rte_eth_dev_info_get(uint16_t port_id, struct rte_eth_dev_info *dev_info)
 	memset(dev_info, 0, sizeof(struct rte_eth_dev_info));
 	dev_info->rx_desc_lim = lim;
 	dev_info->tx_desc_lim = lim;
+	dev_info->device = dev->device;
 
 	RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_infos_get);
 	(*dev->dev_ops->dev_infos_get)(dev, dev_info);
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 5e13dca6a..784c6faa4 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -992,7 +992,7 @@ struct rte_pci_device;
  * Ethernet device information
  */
 struct rte_eth_dev_info {
-	struct rte_pci_device *pci_dev; /**< Device PCI information. */
+	struct rte_device *device; /** Generic device information */
 	const char *driver_name; /**< Device Driver name. */
 	unsigned int if_index; /**< Index to bound host interface, or 0 if none.
 		Use if_indextoname() to translate into an interface name. */
diff --git a/test/test/test_kni.c b/test/test/test_kni.c
index e4839cdb7..3d1be56a9 100644
--- a/test/test/test_kni.c
+++ b/test/test/test_kni.c
@@ -357,6 +357,8 @@ test_kni_processing(uint16_t port_id, struct rte_mempool *mp)
 	struct rte_kni_conf conf;
 	struct rte_eth_dev_info info;
 	struct rte_kni_ops ops;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus = NULL;
 
 	if (!mp)
 		return -1;
@@ -366,8 +368,13 @@ test_kni_processing(uint16_t port_id, struct rte_mempool *mp)
 	memset(&ops, 0, sizeof(ops));
 
 	rte_eth_dev_info_get(port_id, &info);
-	conf.addr = info.pci_dev->addr;
-	conf.id = info.pci_dev->id;
+	if (info.device)
+		bus = rte_bus_find_by_device(info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(info.device);
+		conf.addr = pci_dev->addr;
+		conf.id = pci_dev->id;
+	}
 	snprintf(conf.name, sizeof(conf.name), TEST_KNI_PORT);
 
 	/* core id 1 configured for kernel thread */
@@ -465,6 +472,8 @@ test_kni(void)
 	struct rte_kni_conf conf;
 	struct rte_eth_dev_info info;
 	struct rte_kni_ops ops;
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus;
 
 	/* Initialize KNI subsytem */
 	rte_kni_init(KNI_TEST_MAX_PORTS);
@@ -523,8 +532,15 @@ test_kni(void)
 	memset(&conf, 0, sizeof(conf));
 	memset(&ops, 0, sizeof(ops));
 	rte_eth_dev_info_get(port_id, &info);
-	conf.addr = info.pci_dev->addr;
-	conf.id = info.pci_dev->id;
+	if (info.device)
+		bus = rte_bus_find_by_device(info.device);
+	else
+		bus = NULL;
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(info.device);
+		conf.addr = pci_dev->addr;
+		conf.id = pci_dev->id;
+	}
 	conf.group_id = port_id;
 	conf.mbuf_size = MAX_PACKET_SZ;
 
@@ -552,8 +568,15 @@ test_kni(void)
 	memset(&info, 0, sizeof(info));
 	memset(&ops, 0, sizeof(ops));
 	rte_eth_dev_info_get(port_id, &info);
-	conf.addr = info.pci_dev->addr;
-	conf.id = info.pci_dev->id;
+	if (info.device)
+		bus = rte_bus_find_by_device(info.device);
+	else
+		bus = NULL;
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(info.device);
+		conf.addr = pci_dev->addr;
+		conf.id = pci_dev->id;
+	}
 	conf.group_id = port_id;
 	conf.mbuf_size = MAX_PACKET_SZ;
 
-- 
2.14.3

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v2 3/3] ethdev: deprecate port count function
  @ 2018-04-05 15:33  1%   ` Thomas Monjalon
  0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2018-04-05 15:33 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit

Some DPDK applications wrongly assume these requirements:
    - no hotplug, i.e. ports are never detached
    - all allocated ports are available to the application

Such applications iterate over ports by their own means.
The most common pattern is to request the port count and
assume ports with index in the range [0..count[ can be used.

In order to fix this common mistake in all external applications,
the function rte_eth_dev_count is deprecated, and the new functions
rte_eth_dev_count_avail and rte_eth_dev_count_total are introduced.
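
For illustration, a hotplug-proof sketch built on the replacement API and
the existing RTE_ETH_FOREACH_DEV iterator (the helper name is arbitrary):

  #include <stdio.h>
  #include <rte_ethdev.h>

  /* Walk only the ports that are currently available instead of
   * assuming indexes 0..rte_eth_dev_count()-1 are all usable. */
  static void
  list_available_ports(void)
  {
          uint16_t port_id;

          printf("%u of %u allocated ports are available\n",
                 rte_eth_dev_count_avail(), rte_eth_dev_count_total());

          RTE_ETH_FOREACH_DEV(port_id)
                  printf("port %u can be used\n", port_id);
  }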

Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 app/proc-info/main.c                               |  2 +-
 app/test-eventdev/test_perf_atq.c                  |  2 +-
 app/test-eventdev/test_perf_common.c               |  2 +-
 app/test-eventdev/test_perf_queue.c                |  2 +-
 app/test-eventdev/test_pipeline_atq.c              |  4 ++--
 app/test-eventdev/test_pipeline_common.c           |  6 +++---
 app/test-eventdev/test_pipeline_queue.c            |  6 +++---
 app/test-pmd/cmdline.c                             |  2 +-
 app/test-pmd/testpmd.c                             |  6 +++---
 doc/guides/rel_notes/release_18_05.rst             |  7 ++++++
 doc/guides/sample_app_ug/quota_watermark.rst       |  2 +-
 examples/bond/main.c                               |  2 +-
 examples/distributor/main.c                        |  6 +++---
 examples/ethtool/ethtool-app/main.c                |  2 +-
 examples/eventdev_pipeline/main.c                  |  4 ++--
 examples/eventdev_pipeline/pipeline_worker_tx.c    |  4 ++--
 examples/exception_path/main.c                     |  2 +-
 examples/flow_classify/flow_classify.c             |  4 ++--
 examples/flow_filtering/Makefile                   |  3 +++
 examples/flow_filtering/main.c                     |  4 ++--
 examples/flow_filtering/meson.build                |  1 +
 examples/ip_fragmentation/main.c                   |  4 ++--
 examples/ip_reassembly/main.c                      |  2 +-
 examples/ipv4_multicast/main.c                     |  2 +-
 examples/kni/main.c                                |  2 +-
 examples/l2fwd-cat/l2fwd-cat.c                     |  2 +-
 examples/l2fwd-crypto/main.c                       |  2 +-
 examples/l2fwd-jobstats/main.c                     |  2 +-
 examples/l2fwd-keepalive/main.c                    |  2 +-
 examples/l2fwd/main.c                              |  2 +-
 examples/l3fwd-acl/main.c                          |  2 +-
 examples/l3fwd-power/main.c                        |  2 +-
 examples/l3fwd-vf/main.c                           |  2 +-
 examples/l3fwd/main.c                              |  2 +-
 examples/link_status_interrupt/Makefile            |  3 +++
 examples/link_status_interrupt/main.c              |  2 +-
 examples/link_status_interrupt/meson.build         |  1 +
 .../client_server_mp/mp_client/Makefile            |  1 +
 .../client_server_mp/mp_client/client.c            |  2 +-
 .../client_server_mp/mp_server/Makefile            |  1 +
 .../client_server_mp/mp_server/init.c              |  2 +-
 examples/multi_process/l2fwd_fork/main.c           |  2 +-
 examples/multi_process/symmetric_mp/main.c         |  2 +-
 examples/netmap_compat/bridge/Makefile             |  1 +
 examples/netmap_compat/bridge/bridge.c             |  2 +-
 examples/packet_ordering/main.c                    |  4 ++--
 examples/performance-thread/l3fwd-thread/main.c    |  2 +-
 examples/ptpclient/ptpclient.c                     |  2 +-
 examples/qos_sched/Makefile                        |  3 +++
 examples/qos_sched/init.c                          |  2 +-
 examples/qos_sched/meson.build                     |  1 +
 examples/quota_watermark/qw/Makefile               |  1 +
 examples/quota_watermark/qw/init.c                 |  2 +-
 examples/rxtx_callbacks/main.c                     |  2 +-
 examples/server_node_efd/node/Makefile             |  1 +
 examples/server_node_efd/node/node.c               |  2 +-
 examples/server_node_efd/server/Makefile           |  1 +
 examples/server_node_efd/server/init.c             |  2 +-
 examples/skeleton/basicfwd.c                       |  2 +-
 examples/skeleton/meson.build                      |  2 ++
 examples/tep_termination/main.c                    |  2 +-
 examples/vhost/main.c                              |  2 +-
 examples/vm_power_manager/main.c                   |  2 +-
 examples/vmdq/main.c                               |  2 +-
 examples/vmdq_dcb/main.c                           |  2 +-
 lib/librte_ether/rte_ethdev.c                      | 25 +++++++++++++++++++---
 lib/librte_ether/rte_ethdev.h                      | 23 ++++++++++++++++++++
 lib/librte_ether/rte_ethdev_version.map            |  4 +++-
 lib/librte_eventdev/rte_event_eth_rx_adapter.c     |  3 ++-
 test/test/test_event_eth_rx_adapter.c              |  4 ++--
 test/test/test_kni.c                               |  2 +-
 test/test/test_link_bonding_mode4.c                |  2 +-
 test/test/test_link_bonding_rssconf.c              |  2 +-
 test/test/test_pmd_perf.c                          |  2 +-
 test/test/test_pmd_ring.c                          |  2 +-
 75 files changed, 150 insertions(+), 78 deletions(-)

diff --git a/app/proc-info/main.c b/app/proc-info/main.c
index 115df9d96..539e13243 100644
--- a/app/proc-info/main.c
+++ b/app/proc-info/main.c
@@ -628,7 +628,7 @@ main(int argc, char **argv)
 		return 0;
 	}
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
 
diff --git a/app/test-eventdev/test_perf_atq.c b/app/test-eventdev/test_perf_atq.c
index b36b22a77..64ac7b982 100644
--- a/app/test-eventdev/test_perf_atq.c
+++ b/app/test-eventdev/test_perf_atq.c
@@ -11,7 +11,7 @@ atq_nb_event_queues(struct evt_options *opt)
 {
 	/* nb_queues = number of producers */
 	return opt->prod_type == EVT_PROD_TYPE_ETH_RX_ADPTR ?
-		rte_eth_dev_count() : evt_nr_active_lcores(opt->plcores);
+		rte_eth_dev_count_avail() : evt_nr_active_lcores(opt->plcores);
 }
 
 static inline __attribute__((always_inline)) void
diff --git a/app/test-eventdev/test_perf_common.c b/app/test-eventdev/test_perf_common.c
index 488cf0724..1736b6b52 100644
--- a/app/test-eventdev/test_perf_common.c
+++ b/app/test-eventdev/test_perf_common.c
@@ -491,7 +491,7 @@ perf_ethdev_setup(struct evt_test *test, struct evt_options *opt)
 	if (opt->prod_type == EVT_PROD_TYPE_SYNT)
 		return 0;
 
-	if (!rte_eth_dev_count()) {
+	if (!rte_eth_dev_count_avail()) {
 		evt_err("No ethernet ports found.");
 		return -ENODEV;
 	}
diff --git a/app/test-eventdev/test_perf_queue.c b/app/test-eventdev/test_perf_queue.c
index db8f2f3e5..e381ed1eb 100644
--- a/app/test-eventdev/test_perf_queue.c
+++ b/app/test-eventdev/test_perf_queue.c
@@ -11,7 +11,7 @@ perf_queue_nb_event_queues(struct evt_options *opt)
 {
 	/* nb_queues = number of producers * number of stages */
 	uint8_t nb_prod = opt->prod_type == EVT_PROD_TYPE_ETH_RX_ADPTR ?
-		rte_eth_dev_count() : evt_nr_active_lcores(opt->plcores);
+		rte_eth_dev_count_avail() : evt_nr_active_lcores(opt->plcores);
 	return nb_prod * opt->nb_stages;
 }
 
diff --git a/app/test-eventdev/test_pipeline_atq.c b/app/test-eventdev/test_pipeline_atq.c
index dd7189776..26dc79f90 100644
--- a/app/test-eventdev/test_pipeline_atq.c
+++ b/app/test-eventdev/test_pipeline_atq.c
@@ -12,7 +12,7 @@ pipeline_atq_nb_event_queues(struct evt_options *opt)
 {
 	RTE_SET_USED(opt);
 
-	return rte_eth_dev_count();
+	return rte_eth_dev_count_avail();
 }
 
 static int
@@ -324,7 +324,7 @@ pipeline_atq_eventdev_setup(struct evt_test *test, struct evt_options *opt)
 	uint8_t nb_worker_queues = 0;
 
 	nb_ports = evt_nr_active_lcores(opt->wlcores);
-	nb_queues = rte_eth_dev_count();
+	nb_queues = rte_eth_dev_count_avail();
 
 	/* One extra port and queueu for Tx service */
 	if (t->mt_unsafe) {
diff --git a/app/test-eventdev/test_pipeline_common.c b/app/test-eventdev/test_pipeline_common.c
index b4dbe3769..719518ff3 100644
--- a/app/test-eventdev/test_pipeline_common.c
+++ b/app/test-eventdev/test_pipeline_common.c
@@ -166,7 +166,7 @@ pipeline_opt_check(struct evt_options *opt, uint64_t nb_queues)
 	 */
 	lcores = 2;
 
-	if (!rte_eth_dev_count()) {
+	if (!rte_eth_dev_count_avail()) {
 		evt_err("test needs minimum 1 ethernet dev");
 		return -1;
 	}
@@ -234,7 +234,7 @@ pipeline_ethdev_setup(struct evt_test *test, struct evt_options *opt)
 	};
 
 	RTE_SET_USED(opt);
-	if (!rte_eth_dev_count()) {
+	if (!rte_eth_dev_count_avail()) {
 		evt_err("No ethernet ports found.\n");
 		return -ENODEV;
 	}
@@ -419,7 +419,7 @@ pipeline_event_tx_service_setup(struct evt_test *test, struct evt_options *opt,
 	tx->dev_id = opt->dev_id;
 	tx->queue_id = tx_queue_id;
 	tx->port_id = tx_port_id;
-	tx->nb_ethports = rte_eth_dev_count();
+	tx->nb_ethports = rte_eth_dev_count_avail();
 	tx->t = t;
 
 	/* Register Tx service */
diff --git a/app/test-eventdev/test_pipeline_queue.c b/app/test-eventdev/test_pipeline_queue.c
index 02fc27cf8..ca5f4578e 100644
--- a/app/test-eventdev/test_pipeline_queue.c
+++ b/app/test-eventdev/test_pipeline_queue.c
@@ -10,7 +10,7 @@
 static __rte_always_inline int
 pipeline_queue_nb_event_queues(struct evt_options *opt)
 {
-	uint16_t eth_count = rte_eth_dev_count();
+	uint16_t eth_count = rte_eth_dev_count_avail();
 
 	return (eth_count * opt->nb_stages) + eth_count;
 }
@@ -333,7 +333,7 @@ pipeline_queue_eventdev_setup(struct evt_test *test, struct evt_options *opt)
 	uint8_t nb_worker_queues = 0;
 
 	nb_ports = evt_nr_active_lcores(opt->wlcores);
-	nb_queues = rte_eth_dev_count() * (nb_stages);
+	nb_queues = rte_eth_dev_count_avail() * (nb_stages);
 
 	/* Extra port for Tx service. */
 	if (t->mt_unsafe) {
@@ -341,7 +341,7 @@ pipeline_queue_eventdev_setup(struct evt_test *test, struct evt_options *opt)
 		nb_ports++;
 		nb_queues++;
 	} else
-		nb_queues += rte_eth_dev_count();
+		nb_queues += rte_eth_dev_count_avail();
 
 	rte_event_dev_info_get(opt->dev_id, &info);
 
diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index f22fa935c..6f0419ce1 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -5402,7 +5402,7 @@ static void cmd_create_bonded_device_parsed(void *parsed_result,
 				port_id);
 
 		/* Update number of ports */
-		nb_ports = rte_eth_dev_count();
+		nb_ports = rte_eth_dev_count_avail();
 		reconfig(port_id, res->socket);
 		rte_eth_promiscuous_enable(port_id);
 	}
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 0708c32cb..5e0f24471 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1873,7 +1873,7 @@ attach_port(char *identifier)
 	reconfig(pi, socket_id);
 	rte_eth_promiscuous_enable(pi);
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	ports[pi].port_status = RTE_PORT_STOPPED;
 
@@ -1901,7 +1901,7 @@ detach_port(portid_t port_id)
 		return;
 	}
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	printf("Port '%s' is detached. Now total ports is %d\n",
 			name, nb_ports);
@@ -2496,7 +2496,7 @@ main(int argc, char** argv)
 	rte_pdump_init(NULL);
 #endif
 
-	nb_ports = (portid_t) rte_eth_dev_count();
+	nb_ports = (portid_t) rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		TESTPMD_LOG(WARNING, "No probed ethernet devices\n");
 
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index e5fac1cd1..0c801000a 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -72,6 +72,13 @@ API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* ethdev: The function ``rte_eth_dev_count``, often mis-used to iterate
+  over ports, is deprecated and replaced by ``rte_eth_dev_count_avail``.
+  There is also a new function ``rte_eth_dev_count_total`` to get the
+  total number of allocated ports, available or not.
+  The hotplug-proof applications should use ``RTE_ETH_FOREACH_DEV`` or
+  ``RTE_ETH_FOREACH_DEV_OWNED_BY`` as port iterators.
+
 
 ABI Changes
 -----------
diff --git a/doc/guides/sample_app_ug/quota_watermark.rst b/doc/guides/sample_app_ug/quota_watermark.rst
index 8baec4df8..67200e15d 100644
--- a/doc/guides/sample_app_ug/quota_watermark.rst
+++ b/doc/guides/sample_app_ug/quota_watermark.rst
@@ -163,7 +163,7 @@ Then, a call to init_dpdk(), defined in init.c, is made to initialize the poll m
         if (ret < 0)
             rte_exit(EXIT_FAILURE, "rte_pci_probe(): error %d\n", ret);
 
-        if (rte_eth_dev_count() < 2)
+        if (rte_eth_dev_count_avail() < 2)
             rte_exit(EXIT_FAILURE, "Not enough Ethernet port available\n");
     }
 
diff --git a/examples/bond/main.c b/examples/bond/main.c
index d4097d04d..d8edc642b 100644
--- a/examples/bond/main.c
+++ b/examples/bond/main.c
@@ -748,7 +748,7 @@ main(int argc, char *argv[])
 	argc -= ret;
 	argv += ret;
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "Give at least one port\n");
 	else if (nb_ports > MAX_PORTS)
diff --git a/examples/distributor/main.c b/examples/distributor/main.c
index 2e6b09d21..2c5936489 100644
--- a/examples/distributor/main.c
+++ b/examples/distributor/main.c
@@ -193,7 +193,7 @@ struct lcore_params {
 static int
 lcore_rx(struct lcore_params *p)
 {
-	const uint16_t nb_ports = rte_eth_dev_count();
+	const uint16_t nb_ports = rte_eth_dev_count_avail();
 	const int socket_id = rte_socket_id();
 	uint16_t port;
 	struct rte_mbuf *bufs[BURST_SIZE*2];
@@ -542,7 +542,7 @@ lcore_worker(struct lcore_params *p)
 	 * for single port, xor_val will be zero so we won't modify the output
 	 * port, otherwise we send traffic from 0 to 1, 2 to 3, and vice versa
 	 */
-	const unsigned xor_val = (rte_eth_dev_count() > 1);
+	const unsigned xor_val = (rte_eth_dev_count_avail() > 1);
 	struct rte_mbuf *buf[8] __rte_cache_aligned;
 
 	for (i = 0; i < 8; i++)
@@ -678,7 +678,7 @@ main(int argc, char *argv[])
 				"1 lcore for packet TX\n"
 				"and at least 1 lcore for worker threads\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "Error: no ethernet ports detected\n");
 	if (nb_ports != 1 && (nb_ports & 1))
diff --git a/examples/ethtool/ethtool-app/main.c b/examples/ethtool/ethtool-app/main.c
index 702feabe9..c1815bb94 100644
--- a/examples/ethtool/ethtool-app/main.c
+++ b/examples/ethtool/ethtool-app/main.c
@@ -251,7 +251,7 @@ int main(int argc, char **argv)
 	if (cnt_args_parsed < 0)
 		rte_exit(EXIT_FAILURE, "rte_eal_init(): Failed");
 
-	cnt_ports = rte_eth_dev_count();
+	cnt_ports = rte_eth_dev_count_avail();
 	printf("Number of NICs: %i\n", cnt_ports);
 	if (cnt_ports == 0)
 		rte_exit(EXIT_FAILURE, "No available NIC ports!\n");
diff --git a/examples/eventdev_pipeline/main.c b/examples/eventdev_pipeline/main.c
index 48358a7d2..b698e4ca2 100644
--- a/examples/eventdev_pipeline/main.c
+++ b/examples/eventdev_pipeline/main.c
@@ -429,7 +429,7 @@ int
 main(int argc, char **argv)
 {
 	struct worker_data *worker_data;
-	unsigned int num_ports;
+	uint16_t num_ports;
 	int lcore_id;
 	int err;
 
@@ -451,7 +451,7 @@ main(int argc, char **argv)
 	/* Parse cli options*/
 	parse_app_args(argc, argv);
 
-	num_ports = rte_eth_dev_count();
+	num_ports = rte_eth_dev_count_avail();
 	if (num_ports == 0)
 		rte_panic("No ethernet ports found\n");
 
diff --git a/examples/eventdev_pipeline/pipeline_worker_tx.c b/examples/eventdev_pipeline/pipeline_worker_tx.c
index fc98128ec..3dbde92df 100644
--- a/examples/eventdev_pipeline/pipeline_worker_tx.c
+++ b/examples/eventdev_pipeline/pipeline_worker_tx.c
@@ -422,7 +422,7 @@ setup_eventdev_worker_tx(struct cons_data *cons_data,
 	const uint8_t dev_id = 0;
 	const uint8_t nb_ports = cdata.num_workers;
 	uint8_t nb_slots = 0;
-	uint8_t nb_queues = rte_eth_dev_count();
+	uint8_t nb_queues = rte_eth_dev_count_avail();
 
 	/*
 	 * In case where all type queues are not enabled, use queues equal to
@@ -431,7 +431,7 @@ setup_eventdev_worker_tx(struct cons_data *cons_data,
 	 */
 	if (!atq) {
 		nb_queues *= cdata.num_stages;
-		nb_queues += rte_eth_dev_count();
+		nb_queues += rte_eth_dev_count_avail();
 	}
 
 	struct rte_event_dev_config config = {
diff --git a/examples/exception_path/main.c b/examples/exception_path/main.c
index 996f4939d..2b381a5d8 100644
--- a/examples/exception_path/main.c
+++ b/examples/exception_path/main.c
@@ -559,7 +559,7 @@ main(int argc, char** argv)
 	}
 
 	/* Get number of ports found in scan */
-	nb_sys_ports = rte_eth_dev_count();
+	nb_sys_ports = rte_eth_dev_count_avail();
 	if (nb_sys_ports == 0)
 		FATAL_ERROR("No supported Ethernet device found");
 	/* Find highest port set in portmask */
diff --git a/examples/flow_classify/flow_classify.c b/examples/flow_classify/flow_classify.c
index d0e537e7d..3b087ce76 100644
--- a/examples/flow_classify/flow_classify.c
+++ b/examples/flow_classify/flow_classify.c
@@ -752,7 +752,7 @@ int
 main(int argc, char *argv[])
 {
 	struct rte_mempool *mbuf_pool;
-	uint8_t nb_ports;
+	uint16_t nb_ports;
 	uint16_t portid;
 	int ret;
 	int socket_id;
@@ -776,7 +776,7 @@ main(int argc, char *argv[])
 		rte_exit(EXIT_FAILURE, "Invalid flow_classify parameters\n");
 
 	/* Check that there is an even number of ports to send/receive on. */
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports < 2 || (nb_ports & 1))
 		rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n");
 
diff --git a/examples/flow_filtering/Makefile b/examples/flow_filtering/Makefile
index 01bb4cd81..f41b368fa 100644
--- a/examples/flow_filtering/Makefile
+++ b/examples/flow_filtering/Makefile
@@ -50,6 +50,8 @@ CFLAGS += -O3 $(shell pkg-config --cflags libdpdk)
 LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk)
 LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk)
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
 	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
 
@@ -75,6 +77,7 @@ RTE_TARGET ?= x86_64-native-linuxapp-gcc
 
 include $(RTE_SDK)/mk/rte.vars.mk
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
 
diff --git a/examples/flow_filtering/main.c b/examples/flow_filtering/main.c
index 0bb81a8dd..db03c9c3a 100644
--- a/examples/flow_filtering/main.c
+++ b/examples/flow_filtering/main.c
@@ -232,7 +232,7 @@ int
 main(int argc, char **argv)
 {
 	int ret;
-	uint8_t nr_ports;
+	uint16_t nr_ports;
 	struct rte_flow_error error;
 
 	ret = rte_eal_init(argc, argv);
@@ -243,7 +243,7 @@ main(int argc, char **argv)
 	signal(SIGINT, signal_handler);
 	signal(SIGTERM, signal_handler);
 
-	nr_ports = rte_eth_dev_count();
+	nr_ports = rte_eth_dev_count_avail();
 	if (nr_ports == 0)
 		rte_exit(EXIT_FAILURE, ":: no Ethernet ports found\n");
 	port_id = 0;
diff --git a/examples/flow_filtering/meson.build b/examples/flow_filtering/meson.build
index 407795c42..949493300 100644
--- a/examples/flow_filtering/meson.build
+++ b/examples/flow_filtering/meson.build
@@ -9,3 +9,4 @@
 sources = files(
 	'main.c',
 )
+allow_experimental_apis = true
diff --git a/examples/ip_fragmentation/main.c b/examples/ip_fragmentation/main.c
index f525c3a9c..8952ea456 100644
--- a/examples/ip_fragmentation/main.c
+++ b/examples/ip_fragmentation/main.c
@@ -843,7 +843,7 @@ main(int argc, char **argv)
 	struct rte_eth_txconf *txconf;
 	struct rx_queue *rxq;
 	int socket, ret;
-	unsigned nb_ports;
+	uint16_t nb_ports;
 	uint16_t queueid = 0;
 	unsigned lcore_id = 0, rx_lcore_id = 0;
 	uint32_t n_tx_queue, nb_lcores;
@@ -861,7 +861,7 @@ main(int argc, char **argv)
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "Invalid arguments");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "No ports found!\n");
 
diff --git a/examples/ip_reassembly/main.c b/examples/ip_reassembly/main.c
index ddff35880..3e8e79c21 100644
--- a/examples/ip_reassembly/main.c
+++ b/examples/ip_reassembly/main.c
@@ -1008,7 +1008,7 @@ main(int argc, char **argv)
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "Invalid IP reassembly parameters\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "No ports found!\n");
 
diff --git a/examples/ipv4_multicast/main.c b/examples/ipv4_multicast/main.c
index 23b266bbe..ad2072f41 100644
--- a/examples/ipv4_multicast/main.c
+++ b/examples/ipv4_multicast/main.c
@@ -674,7 +674,7 @@ main(int argc, char **argv)
 	if (clone_pool == NULL)
 		rte_exit(EXIT_FAILURE, "Cannot init clone mbuf pool\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "No physical ports!\n");
 	if (nb_ports > MAX_PORTS)
diff --git a/examples/kni/main.c b/examples/kni/main.c
index 1855c64ab..40d74ae15 100644
--- a/examples/kni/main.c
+++ b/examples/kni/main.c
@@ -921,7 +921,7 @@ main(int argc, char** argv)
 	}
 
 	/* Get number of ports found in scan */
-	nb_sys_ports = rte_eth_dev_count();
+	nb_sys_ports = rte_eth_dev_count_avail();
 	if (nb_sys_ports == 0)
 		rte_exit(EXIT_FAILURE, "No supported Ethernet device found\n");
 
diff --git a/examples/l2fwd-cat/l2fwd-cat.c b/examples/l2fwd-cat/l2fwd-cat.c
index ed4878043..0e6078aad 100644
--- a/examples/l2fwd-cat/l2fwd-cat.c
+++ b/examples/l2fwd-cat/l2fwd-cat.c
@@ -173,7 +173,7 @@ main(int argc, char *argv[])
 	argv += ret;
 
 	/* Check that there is an even number of ports to send/receive on. */
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports < 2 || (nb_ports & 1))
 		rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n");
 
diff --git a/examples/l2fwd-crypto/main.c b/examples/l2fwd-crypto/main.c
index b1ad19284..422044898 100644
--- a/examples/l2fwd-crypto/main.c
+++ b/examples/l2fwd-crypto/main.c
@@ -2311,7 +2311,7 @@ initialize_ports(struct l2fwd_crypto_options *options)
 {
 	uint16_t last_portid = 0, portid;
 	unsigned enabled_portcount = 0;
-	unsigned nb_ports = rte_eth_dev_count();
+	unsigned nb_ports = rte_eth_dev_count_avail();
 
 	if (nb_ports == 0) {
 		printf("No Ethernet ports - bye\n");
diff --git a/examples/l2fwd-jobstats/main.c b/examples/l2fwd-jobstats/main.c
index 248224840..34553faa2 100644
--- a/examples/l2fwd-jobstats/main.c
+++ b/examples/l2fwd-jobstats/main.c
@@ -770,7 +770,7 @@ main(int argc, char **argv)
 	if (l2fwd_pktmbuf_pool == NULL)
 		rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
 
diff --git a/examples/l2fwd-keepalive/main.c b/examples/l2fwd-keepalive/main.c
index e9ad91a1f..a18b707cd 100644
--- a/examples/l2fwd-keepalive/main.c
+++ b/examples/l2fwd-keepalive/main.c
@@ -561,7 +561,7 @@ main(int argc, char **argv)
 	if (l2fwd_pktmbuf_pool == NULL)
 		rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
 
diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index f8ca29cf6..690843578 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -545,7 +545,7 @@ main(int argc, char **argv)
 	/* convert to number of cycles */
 	timer_period *= rte_get_timer_hz();
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
 
diff --git a/examples/l3fwd-acl/main.c b/examples/l3fwd-acl/main.c
index 2c891b494..33ad467d3 100644
--- a/examples/l3fwd-acl/main.c
+++ b/examples/l3fwd-acl/main.c
@@ -1891,7 +1891,7 @@ main(int argc, char **argv)
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	if (check_port_config() < 0)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index d6a092618..596d64548 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -1650,7 +1650,7 @@ main(int argc, char **argv)
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	if (check_port_config() < 0)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
diff --git a/examples/l3fwd-vf/main.c b/examples/l3fwd-vf/main.c
index dd0e057ef..aaafb7bc2 100644
--- a/examples/l3fwd-vf/main.c
+++ b/examples/l3fwd-vf/main.c
@@ -949,7 +949,7 @@ main(int argc, char **argv)
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	if (check_port_config() < 0)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index ec1da5c18..bf7dbd814 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -826,7 +826,7 @@ main(int argc, char **argv)
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	if (check_port_config() < 0)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
diff --git a/examples/link_status_interrupt/Makefile b/examples/link_status_interrupt/Makefile
index 160682123..d778fcbbf 100644
--- a/examples/link_status_interrupt/Makefile
+++ b/examples/link_status_interrupt/Makefile
@@ -23,6 +23,8 @@ CFLAGS += -O3 $(shell pkg-config --cflags libdpdk)
 LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk)
 LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk)
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
 	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
 
@@ -48,6 +50,7 @@ RTE_TARGET ?= x86_64-native-linuxapp-gcc
 
 include $(RTE_SDK)/mk/rte.vars.mk
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
 
diff --git a/examples/link_status_interrupt/main.c b/examples/link_status_interrupt/main.c
index ad0dd390d..f56895680 100644
--- a/examples/link_status_interrupt/main.c
+++ b/examples/link_status_interrupt/main.c
@@ -542,7 +542,7 @@ main(int argc, char **argv)
 	if (lsi_pktmbuf_pool == NULL)
 		rte_panic("Cannot init mbuf pool\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_panic("No Ethernet port - bye\n");
 
diff --git a/examples/link_status_interrupt/meson.build b/examples/link_status_interrupt/meson.build
index c34e11e36..2b0a25036 100644
--- a/examples/link_status_interrupt/meson.build
+++ b/examples/link_status_interrupt/meson.build
@@ -6,6 +6,7 @@
 # To build this example as a standalone application with an already-installed
 # DPDK instance, use 'make'
 
+allow_experimental_apis = true
 sources = files(
 	'main.c'
 )
diff --git a/examples/multi_process/client_server_mp/mp_client/Makefile b/examples/multi_process/client_server_mp/mp_client/Makefile
index 298e1b020..3bfcd75c5 100644
--- a/examples/multi_process/client_server_mp/mp_client/Makefile
+++ b/examples/multi_process/client_server_mp/mp_client/Makefile
@@ -14,6 +14,7 @@ APP = mp_client
 # all source are stored in SRCS-y
 SRCS-y := client.c
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += $(WERROR_FLAGS) -O3
 CFLAGS += -I$(SRCDIR)/../shared
 
diff --git a/examples/multi_process/client_server_mp/mp_client/client.c b/examples/multi_process/client_server_mp/mp_client/client.c
index 92955e974..c23dd3f37 100644
--- a/examples/multi_process/client_server_mp/mp_client/client.c
+++ b/examples/multi_process/client_server_mp/mp_client/client.c
@@ -220,7 +220,7 @@ main(int argc, char *argv[])
 	if (parse_app_args(argc, argv) < 0)
 		rte_exit(EXIT_FAILURE, "Invalid command-line arguments\n");
 
-	if (rte_eth_dev_count() == 0)
+	if (rte_eth_dev_count_avail() == 0)
 		rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
 
 	rx_ring = rte_ring_lookup(get_rx_queue_name(client_id));
diff --git a/examples/multi_process/client_server_mp/mp_server/Makefile b/examples/multi_process/client_server_mp/mp_server/Makefile
index 3e244e283..af7246e6b 100644
--- a/examples/multi_process/client_server_mp/mp_server/Makefile
+++ b/examples/multi_process/client_server_mp/mp_server/Makefile
@@ -23,6 +23,7 @@ SRCS-y := main.c init.c args.c
 
 INC := $(sort $(wildcard *.h))
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += $(WERROR_FLAGS) -O3
 CFLAGS += -I$(SRCDIR)/../shared
 
diff --git a/examples/multi_process/client_server_mp/mp_server/init.c b/examples/multi_process/client_server_mp/mp_server/init.c
index 1c465ccbc..30c8e44bc 100644
--- a/examples/multi_process/client_server_mp/mp_server/init.c
+++ b/examples/multi_process/client_server_mp/mp_server/init.c
@@ -243,7 +243,7 @@ init(int argc, char *argv[])
 	argv += retval;
 
 	/* get total number of ports */
-	total_ports = rte_eth_dev_count();
+	total_ports = rte_eth_dev_count_total();
 
 	/* set up array for port data */
 	mz = rte_memzone_reserve(MZ_PORT_INFO, sizeof(*ports),
diff --git a/examples/multi_process/l2fwd_fork/main.c b/examples/multi_process/l2fwd_fork/main.c
index 6b130f2f5..94318ab61 100644
--- a/examples/multi_process/l2fwd_fork/main.c
+++ b/examples/multi_process/l2fwd_fork/main.c
@@ -941,7 +941,7 @@ main(int argc, char **argv)
 	for (i = 0; i < RTE_MAX_LCORE; i++)
 		lcore_resource[i].lcore_id = i;
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
 
diff --git a/examples/multi_process/symmetric_mp/main.c b/examples/multi_process/symmetric_mp/main.c
index 75cad0cca..16f21a187 100644
--- a/examples/multi_process/symmetric_mp/main.c
+++ b/examples/multi_process/symmetric_mp/main.c
@@ -418,7 +418,7 @@ main(int argc, char **argv)
 	argv += ret;
 
 	/* determine the NIC devices available */
-	if (rte_eth_dev_count() == 0)
+	if (rte_eth_dev_count_avail() == 0)
 		rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
 
 	/* parse application arguments (those after the EAL ones) */
diff --git a/examples/netmap_compat/bridge/Makefile b/examples/netmap_compat/bridge/Makefile
index a7c9c14a8..071c09dd3 100644
--- a/examples/netmap_compat/bridge/Makefile
+++ b/examples/netmap_compat/bridge/Makefile
@@ -27,6 +27,7 @@ VPATH := $(SRCDIR)/../lib
 SRCS-y := bridge.c
 SRCS-y += compat_netmap.c
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -O3 -I$(SRCDIR)/../lib -I$(SRCDIR)/../netmap
 CFLAGS += $(WERROR_FLAGS)
 
diff --git a/examples/netmap_compat/bridge/bridge.c b/examples/netmap_compat/bridge/bridge.c
index 59c5e4361..cb1882e65 100644
--- a/examples/netmap_compat/bridge/bridge.c
+++ b/examples/netmap_compat/bridge/bridge.c
@@ -236,7 +236,7 @@ int main(int argc, char *argv[])
 	if (ports.num == 0)
 		rte_exit(EXIT_FAILURE, "no ports specified\n");
 
-	if (rte_eth_dev_count() < 1)
+	if (rte_eth_dev_count_avail() < 1)
 		rte_exit(EXIT_FAILURE, "Not enough ethernet ports available\n");
 
 	pool = rte_pktmbuf_pool_create("mbuf_pool", MBUF_PER_POOL, 32, 0,
diff --git a/examples/packet_ordering/main.c b/examples/packet_ordering/main.c
index 676cb6f71..7ace7d10e 100644
--- a/examples/packet_ordering/main.c
+++ b/examples/packet_ordering/main.c
@@ -430,7 +430,7 @@ rx_thread(struct rte_ring *ring_out)
 static int
 worker_thread(void *args_ptr)
 {
-	const uint8_t nb_ports = rte_eth_dev_count();
+	const uint16_t nb_ports = rte_eth_dev_count_avail();
 	uint16_t i, ret = 0;
 	uint16_t burst_size = 0;
 	struct worker_thread_args *args;
@@ -644,7 +644,7 @@ main(int argc, char **argv)
 				"1 lcore for packet TX\n"
 				"and at least 1 lcore for worker threads\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0)
 		rte_exit(EXIT_FAILURE, "Error: no ethernet ports detected\n");
 	if (nb_ports != 1 && (nb_ports & 1))
diff --git a/examples/performance-thread/l3fwd-thread/main.c b/examples/performance-thread/l3fwd-thread/main.c
index 699b99d00..f51e6b0fd 100644
--- a/examples/performance-thread/l3fwd-thread/main.c
+++ b/examples/performance-thread/l3fwd-thread/main.c
@@ -3514,7 +3514,7 @@ main(int argc, char **argv)
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "init_rx_rings failed\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	if (check_port_config() < 0)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
diff --git a/examples/ptpclient/ptpclient.c b/examples/ptpclient/ptpclient.c
index 55be3f7cd..c44013bc1 100644
--- a/examples/ptpclient/ptpclient.c
+++ b/examples/ptpclient/ptpclient.c
@@ -727,7 +727,7 @@ main(int argc, char *argv[])
 		rte_exit(EXIT_FAILURE, "Error with PTP initialization\n");
 
 	/* Check that there is an even number of ports to send/receive on. */
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	/* Creates a new mempool in memory to hold the mbufs. */
 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", NUM_MBUFS * nb_ports,
diff --git a/examples/qos_sched/Makefile b/examples/qos_sched/Makefile
index 0f0a31ff2..e6dfbef1f 100644
--- a/examples/qos_sched/Makefile
+++ b/examples/qos_sched/Makefile
@@ -23,6 +23,8 @@ CFLAGS += -O3 $(shell pkg-config --cflags libdpdk)
 LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk)
 LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk)
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
 	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
 
@@ -55,6 +57,7 @@ all:
 clean:
 else
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
 CFLAGS_args.o := -D_GNU_SOURCE
diff --git a/examples/qos_sched/init.c b/examples/qos_sched/init.c
index 8914f766f..c9e487975 100644
--- a/examples/qos_sched/init.c
+++ b/examples/qos_sched/init.c
@@ -298,7 +298,7 @@ int app_init(void)
 	char ring_name[MAX_NAME_LEN];
 	char pool_name[MAX_NAME_LEN];
 
-	if (rte_eth_dev_count() == 0)
+	if (rte_eth_dev_count_avail() == 0)
 		rte_exit(EXIT_FAILURE, "No Ethernet port - bye\n");
 
 	/* load configuration profile */
diff --git a/examples/qos_sched/meson.build b/examples/qos_sched/meson.build
index 289b81ce8..5101652af 100644
--- a/examples/qos_sched/meson.build
+++ b/examples/qos_sched/meson.build
@@ -7,6 +7,7 @@
 # DPDK instance, use 'make'
 
 deps += ['sched', 'cfgfile']
+allow_experimental_apis = true
 sources = files(
 	'app_thread.c', 'args.c', 'cfg_file.c', 'cmdline.c',
 	'init.c', 'main.c', 'stats.c'
diff --git a/examples/quota_watermark/qw/Makefile b/examples/quota_watermark/qw/Makefile
index 84299e594..d0a9b3cf4 100644
--- a/examples/quota_watermark/qw/Makefile
+++ b/examples/quota_watermark/qw/Makefile
@@ -16,6 +16,7 @@ APP = qw
 # all source are stored in SRCS-y
 SRCS-y := args.c init.c main.c
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -O3 -DQW_SOFTWARE_FC
 CFLAGS += $(WERROR_FLAGS)
 
diff --git a/examples/quota_watermark/qw/init.c b/examples/quota_watermark/qw/init.c
index d4a691839..00725bc95 100644
--- a/examples/quota_watermark/qw/init.c
+++ b/examples/quota_watermark/qw/init.c
@@ -112,7 +112,7 @@ void configure_eth_port(uint16_t port_id)
 void
 init_dpdk(void)
 {
-	if (rte_eth_dev_count() < 2)
+	if (rte_eth_dev_count_avail() < 2)
 		rte_exit(EXIT_FAILURE, "Not enough ethernet port available\n");
 }
 
diff --git a/examples/rxtx_callbacks/main.c b/examples/rxtx_callbacks/main.c
index 84b09cf05..e63ea288f 100644
--- a/examples/rxtx_callbacks/main.c
+++ b/examples/rxtx_callbacks/main.c
@@ -188,7 +188,7 @@ main(int argc, char *argv[])
 	argc -= ret;
 	argv += ret;
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports < 2 || (nb_ports & 1))
 		rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n");
 
diff --git a/examples/server_node_efd/node/Makefile b/examples/server_node_efd/node/Makefile
index fffbe3576..dc3191a5b 100644
--- a/examples/server_node_efd/node/Makefile
+++ b/examples/server_node_efd/node/Makefile
@@ -14,6 +14,7 @@ APP = node
 # all source are stored in SRCS-y
 SRCS-y := node.c
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += $(WERROR_FLAGS) -O3
 CFLAGS += -I$(SRCDIR)/../shared
 
diff --git a/examples/server_node_efd/node/node.c b/examples/server_node_efd/node/node.c
index 84f7bcffe..3b97fbd45 100644
--- a/examples/server_node_efd/node/node.c
+++ b/examples/server_node_efd/node/node.c
@@ -320,7 +320,7 @@ main(int argc, char *argv[])
 	if (parse_app_args(argc, argv) < 0)
 		rte_exit(EXIT_FAILURE, "Invalid command-line arguments\n");
 
-	if (rte_eth_dev_count() == 0)
+	if (rte_eth_dev_count_avail() == 0)
 		rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
 
 	rx_ring = rte_ring_lookup(get_rx_queue_name(node_id));
diff --git a/examples/server_node_efd/server/Makefile b/examples/server_node_efd/server/Makefile
index cbb91ebe8..d5456f920 100644
--- a/examples/server_node_efd/server/Makefile
+++ b/examples/server_node_efd/server/Makefile
@@ -23,6 +23,7 @@ SRCS-y := main.c init.c args.c
 
 INC := $(sort $(wildcard *.h))
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += $(WERROR_FLAGS) -O3
 CFLAGS += -I$(SRCDIR)/../shared
 
diff --git a/examples/server_node_efd/server/init.c b/examples/server_node_efd/server/init.c
index 07b6882f8..7dfe2fa23 100644
--- a/examples/server_node_efd/server/init.c
+++ b/examples/server_node_efd/server/init.c
@@ -310,7 +310,7 @@ init(int argc, char *argv[])
 	argv += retval;
 
 	/* get total number of ports */
-	total_ports = rte_eth_dev_count();
+	total_ports = rte_eth_dev_count_avail();
 
 	/* set up array for port data */
 	mz = rte_memzone_reserve(MZ_SHARED_INFO, sizeof(*info),
diff --git a/examples/skeleton/basicfwd.c b/examples/skeleton/basicfwd.c
index 5ac1dc820..03bc35856 100644
--- a/examples/skeleton/basicfwd.c
+++ b/examples/skeleton/basicfwd.c
@@ -173,7 +173,7 @@ main(int argc, char *argv[])
 	argv += ret;
 
 	/* Check that there is an even number of ports to send/receive on. */
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports < 2 || (nb_ports & 1))
 		rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n");
 
diff --git a/examples/skeleton/meson.build b/examples/skeleton/meson.build
index ef46b187e..89ddba2ea 100644
--- a/examples/skeleton/meson.build
+++ b/examples/skeleton/meson.build
@@ -10,3 +10,5 @@ allow_experimental_apis = true
 sources = files(
 	'basicfwd.c'
 )
+
+allow_experimental_apis = true
diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c
index e86854176..8543a9803 100644
--- a/examples/tep_termination/main.c
+++ b/examples/tep_termination/main.c
@@ -1156,7 +1156,7 @@ main(int argc, char *argv[])
 	nb_switching_cores = rte_lcore_count()-1;
 
 	/* Get the number of physical ports. */
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	/*
 	 * Update the global var NB_PORTS and global array PORTS
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 7cddac7d2..84e0d6366 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -1448,7 +1448,7 @@ main(int argc, char *argv[])
 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
 
 	/* Get the number of physical ports. */
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	/*
 	 * Update the global var NUM_PORTS and global array PORTS
diff --git a/examples/vm_power_manager/main.c b/examples/vm_power_manager/main.c
index db0ddb01d..c9805a461 100644
--- a/examples/vm_power_manager/main.c
+++ b/examples/vm_power_manager/main.c
@@ -278,7 +278,7 @@ main(int argc, char **argv)
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "Invalid arguments\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", NUM_MBUFS * nb_ports,
 		MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
diff --git a/examples/vmdq/main.c b/examples/vmdq/main.c
index 2f3eb74f5..52596dd5e 100644
--- a/examples/vmdq/main.c
+++ b/examples/vmdq/main.c
@@ -580,7 +580,7 @@ main(int argc, char *argv[])
 	if (rte_lcore_count() > RTE_MAX_LCORE)
 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	/*
 	 * Update the global var NUM_PORTS and global array PORTS
diff --git a/examples/vmdq_dcb/main.c b/examples/vmdq_dcb/main.c
index 9c68ab089..2626a2f19 100644
--- a/examples/vmdq_dcb/main.c
+++ b/examples/vmdq_dcb/main.c
@@ -642,7 +642,7 @@ main(int argc, char *argv[])
 				" number of cores(1-%d)\n\n", RTE_MAX_LCORE);
 	}
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 
 	/*
 	 * Update the global var NUM_PORTS and global array PORTS
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e04..4f9c6fc64 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -534,6 +534,12 @@ rte_eth_dev_get_sec_ctx(uint16_t port_id)
 
 uint16_t
 rte_eth_dev_count(void)
+{
+	return rte_eth_dev_count_avail();
+}
+
+uint16_t
+rte_eth_dev_count_avail(void)
 {
 	uint16_t p;
 	uint16_t count;
@@ -546,6 +552,18 @@ rte_eth_dev_count(void)
 	return count;
 }
 
+uint16_t
+rte_eth_dev_count_total(void)
+{
+	uint16_t port, count = 0;
+
+	for (port = 0; port < RTE_MAX_ETHPORTS; port++)
+		if (rte_eth_devices[port].state != RTE_ETH_DEV_UNUSED)
+			count++;
+
+	return count;
+}
+
 int
 rte_eth_dev_get_name_by_port(uint16_t port_id, char *name)
 {
@@ -601,7 +619,7 @@ int
 rte_eth_dev_attach(const char *devargs, uint16_t *port_id)
 {
 	int ret = -1;
-	int current = rte_eth_dev_count();
+	int current = rte_eth_dev_count_total();
 	char *name = NULL;
 	char *args = NULL;
 
@@ -619,7 +637,7 @@ rte_eth_dev_attach(const char *devargs, uint16_t *port_id)
 		goto err;
 
 	/* no point looking at the port count if no port exists */
-	if (!rte_eth_dev_count()) {
+	if (!rte_eth_dev_count_total()) {
 		ethdev_log(ERR, "No port found for device (%s)", name);
 		ret = -1;
 		goto err;
@@ -627,8 +645,9 @@ rte_eth_dev_attach(const char *devargs, uint16_t *port_id)
 
 	/* if nothing happened, there is a bug here, since some driver told us
 	 * it did attach a device, but did not create a port.
+	 * FIXME: race condition in case of plug-out of another device
 	 */
-	if (current == rte_eth_dev_count()) {
+	if (current == rte_eth_dev_count_total()) {
 		ret = -1;
 		goto err;
 	}
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 5e13dca6a..ffd8b5478 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -1360,8 +1360,31 @@ int __rte_experimental rte_eth_dev_owner_get(const uint16_t port_id,
  * @return
  *   - The total number of usable Ethernet devices.
  */
+__rte_deprecated
 uint16_t rte_eth_dev_count(void);
 
+/**
+ * Get the number of ports which are usable for the application.
+ *
+ * These devices must be iterated by using the macro
+ * ``RTE_ETH_FOREACH_DEV`` or ``RTE_ETH_FOREACH_DEV_OWNED_BY``
+ * to deal with non-contiguous ranges of devices.
+ *
+ * @return
+ *   The count of available Ethernet devices.
+ */
+uint16_t __rte_experimental rte_eth_dev_count_avail(void);
+
+/**
+ * Get the total number of ports which are allocated.
+ *
+ * Some devices may not be available for the application.
+ *
+ * @return
+ *   The total count of Ethernet devices.
+ */
+uint16_t __rte_experimental rte_eth_dev_count_total(void);
+
 /**
  * Attach a new Ethernet device specified by arguments.
  *
diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index 34df6c8b5..8fe07880f 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -206,6 +206,8 @@ DPDK_18.02 {
 EXPERIMENTAL {
 	global:
 
+	rte_eth_dev_count_avail;
+	rte_eth_dev_count_total;
 	rte_eth_dev_is_removed;
 	rte_eth_dev_owner_delete;
 	rte_eth_dev_owner_get;
@@ -228,4 +230,4 @@ EXPERIMENTAL {
 	rte_mtr_stats_read;
 	rte_mtr_stats_update;
 
-} DPDK_17.11;
+} DPDK_18.02;
diff --git a/lib/librte_eventdev/rte_event_eth_rx_adapter.c b/lib/librte_eventdev/rte_event_eth_rx_adapter.c
index 9297f4c4d..1b92fa33d 100644
--- a/lib/librte_eventdev/rte_event_eth_rx_adapter.c
+++ b/lib/librte_eventdev/rte_event_eth_rx_adapter.c
@@ -890,7 +890,8 @@ rte_event_eth_rx_adapter_create_ext(uint8_t id, uint8_t dev_id,
 	rx_adapter->conf_arg = conf_arg;
 	strcpy(rx_adapter->mem_name, mem_name);
 	rx_adapter->eth_devices = rte_zmalloc_socket(rx_adapter->mem_name,
-					rte_eth_dev_count() *
+					/* FIXME: incompatible with hotplug */
+					rte_eth_dev_count_total() *
 					sizeof(struct eth_device_info), 0,
 					socket_id);
 	rte_convert_rss_key((const uint32_t *)default_rss_key,
diff --git a/test/test/test_event_eth_rx_adapter.c b/test/test/test_event_eth_rx_adapter.c
index 2234df0eb..ab5539809 100644
--- a/test/test/test_event_eth_rx_adapter.c
+++ b/test/test/test_event_eth_rx_adapter.c
@@ -164,7 +164,7 @@ testsuite_setup(void)
 	 * so rte_eth_dev_start invokes rte_event_dev_start internally, so
 	 * call init_ports after rte_event_dev_configure
 	 */
-	err = init_ports(rte_eth_dev_count());
+	err = init_ports(rte_eth_dev_count_total());
 	TEST_ASSERT(err == 0, "Port initialization failed err %d\n", err);
 
 	err = rte_event_eth_rx_adapter_caps_get(TEST_DEV_ID, TEST_ETHDEV_ID,
@@ -273,7 +273,7 @@ adapter_queue_add_del(void)
 	queue_config.servicing_weight = 1;
 
 	err = rte_event_eth_rx_adapter_queue_add(TEST_INST_ID,
-						rte_eth_dev_count(),
+						rte_eth_dev_count_total(),
 						-1, &queue_config);
 	TEST_ASSERT(err == -EINVAL, "Expected -EINVAL got %d", err);
 
diff --git a/test/test/test_kni.c b/test/test/test_kni.c
index e4839cdb7..01c6530d9 100644
--- a/test/test/test_kni.c
+++ b/test/test/test_kni.c
@@ -480,7 +480,7 @@ test_kni(void)
 		return -1;
 	}
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports == 0) {
 		printf("no supported nic port found\n");
 		return -1;
diff --git a/test/test/test_link_bonding_mode4.c b/test/test/test_link_bonding_mode4.c
index 426877a2a..33c1fab01 100644
--- a/test/test/test_link_bonding_mode4.c
+++ b/test/test/test_link_bonding_mode4.c
@@ -425,7 +425,7 @@ test_setup(void)
 			TEST_ASSERT(retval >= 0,
 				"Failed to create ring ethdev '%s'\n", name);
 
-			port->port_id = rte_eth_dev_count() - 1;
+			port->port_id = rte_eth_dev_count_avail() - 1;
 		}
 
 		retval = configure_ethdev(port->port_id, 1);
diff --git a/test/test/test_link_bonding_rssconf.c b/test/test/test_link_bonding_rssconf.c
index 4cc08f5a2..6a1a28d23 100644
--- a/test/test/test_link_bonding_rssconf.c
+++ b/test/test/test_link_bonding_rssconf.c
@@ -521,7 +521,7 @@ test_setup(void)
 	FOR_EACH_PORT(n, port) {
 		port = &test_params.slave_ports[n];
 
-		port_id = rte_eth_dev_count();
+		port_id = rte_eth_dev_count_avail();
 		snprintf(name, sizeof(name), SLAVE_DEV_NAME_FMT, port_id);
 
 		retval = rte_vdev_init(name, "size=64,copy=0");
diff --git a/test/test/test_pmd_perf.c b/test/test/test_pmd_perf.c
index 0e64a581b..54bc4f6b0 100644
--- a/test/test/test_pmd_perf.c
+++ b/test/test/test_pmd_perf.c
@@ -676,7 +676,7 @@ test_pmd_perf(void)
 	signal(SIGUSR1, signal_handler);
 	signal(SIGUSR2, signal_handler);
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	if (nb_ports < NB_ETHPORTS_USED) {
 		printf("At least %u port(s) used for perf. test\n",
 		       NB_ETHPORTS_USED);
diff --git a/test/test/test_pmd_ring.c b/test/test/test_pmd_ring.c
index 0787e4ebf..219620125 100644
--- a/test/test/test_pmd_ring.c
+++ b/test/test/test_pmd_ring.c
@@ -399,7 +399,7 @@ test_pmd_ring(void)
 	int port, cmdl_port0 = -1;
 	uint8_t nb_ports;
 
-	nb_ports = rte_eth_dev_count();
+	nb_ports = rte_eth_dev_count_avail();
 	printf("nb_ports=%d\n", (int)nb_ports);
 
 	/*  create the rings and eth_rings in the test code.
-- 
2.16.2

^ permalink raw reply	[relevance 1%]
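
For applications migrating to this change, here is a minimal illustrative
sketch (not part of the patch) of the new counting and iteration API. It
assumes ALLOW_EXPERIMENTAL_API is defined at build time, as the example
Makefiles and meson files above now do; the functions and macro used are
the ones added or referenced by this patch.

	#include <stdio.h>
	#include <rte_ethdev.h>

	static void
	show_ports(void)
	{
		uint16_t port_id;
		/* Ports usable by the application; replaces rte_eth_dev_count() */
		uint16_t avail = rte_eth_dev_count_avail();
		/* All allocated ports, including ones not usable by this app */
		uint16_t total = rte_eth_dev_count_total();

		printf("available ports: %u, total ports: %u\n", avail, total);

		/* Port ids may be non-contiguous, so iterate with the macro
		 * rather than looping from 0 to the port count.
		 */
		RTE_ETH_FOREACH_DEV(port_id)
			printf("port %u is usable\n", port_id);
	}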

* [dpdk-dev] [PATCH v2 2/4] net/nfp: update PMD for using new CPP interface
    2018-04-05 14:42  1% ` [dpdk-dev] [PATCH v2 1/4] net/nfp: add NFP CPP support Alejandro Lucero
@ 2018-04-05 14:42  6% ` Alejandro Lucero
  1 sibling, 0 replies; 200+ results
From: Alejandro Lucero @ 2018-04-05 14:42 UTC (permalink / raw)
  To: dev

PF PMD support was based on the NSPU interface. This patch changes the
PMD to use the new CPP user space interface, which gives more
flexibility for adding new functionality.

This change only affects initialization, with the datapath remaining
the same as before.

Signed-off-by: Alejandro Lucero <alejandro.lucero@netronome.com>
---
 drivers/net/nfp/Makefile      |  17 ++-
 drivers/net/nfp/nfp_net.c     | 342 +++++++++++++++++++++++++++++-------------
 drivers/net/nfp/nfp_net_pmd.h |  16 +-
 3 files changed, 264 insertions(+), 111 deletions(-)

diff --git a/drivers/net/nfp/Makefile b/drivers/net/nfp/Makefile
index aa3b68a..ab4e0a7 100644
--- a/drivers/net/nfp/Makefile
+++ b/drivers/net/nfp/Makefile
@@ -20,11 +20,24 @@ EXPORT_MAP := rte_pmd_nfp_version.map
 
 LIBABIVER := 1
 
+VPATH += $(SRCDIR)/nfpcore
+
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_cppcore.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_cpp_pcie_ops.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_mutex.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_resource.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_crc.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_mip.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nffw.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_hwinfo.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_rtsym.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nsp.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nsp_cmds.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nsp_eth.c
+
 #
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_net.c
-SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nfpu.c
-SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nspu.c
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index 8591c7d..4eb032c 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2015 Netronome Systems, Inc.
+ * Copyright (c) 2014-2018 Netronome Systems, Inc.
  * All rights reserved.
  *
  * Small portions derived from code Copyright(c) 2010-2015 Intel Corporation.
@@ -55,7 +55,13 @@
 #include <rte_alarm.h>
 #include <rte_spinlock.h>
 
-#include "nfp_nfpu.h"
+#include "nfpcore/nfp_cpp.h"
+#include "nfpcore/nfp_nffw.h"
+#include "nfpcore/nfp_hwinfo.h"
+#include "nfpcore/nfp_mip.h"
+#include "nfpcore/nfp_rtsym.h"
+#include "nfpcore/nfp_nsp.h"
+
 #include "nfp_net_pmd.h"
 #include "nfp_net_logs.h"
 #include "nfp_net_ctrl.h"
@@ -104,12 +110,8 @@ static int nfp_net_rss_reta_write(struct rte_eth_dev *dev,
 static int nfp_net_rss_hash_write(struct rte_eth_dev *dev,
 			struct rte_eth_rss_conf *rss_conf);
 
-/*
- * The offset of the queue controller queues in the PCIe Target. These
- * happen to be at the same offset on the NFP6000 and the NFP3200 so
- * we use a single macro here.
- */
-#define NFP_PCIE_QUEUE(_q)	(0x800 * ((_q) & 0xff))
+/* The offset of the queue controller queues in the PCIe Target */
+#define NFP_PCIE_QUEUE(_q) (0x80000 + (NFP_QCP_QUEUE_ADDR_SZ * ((_q) & 0xff)))
 
 /* Maximum value which can be added to a queue with one transaction */
 #define NFP_QCP_MAX_ADD	0x7f
@@ -574,47 +576,29 @@ enum nfp_qcp_ptr {
 #define ETH_ADDR_LEN	6
 
 static void
-nfp_eth_copy_mac_reverse(uint8_t *dst, const uint8_t *src)
+nfp_eth_copy_mac(uint8_t *dst, const uint8_t *src)
 {
 	int i;
 
 	for (i = 0; i < ETH_ADDR_LEN; i++)
-		dst[ETH_ADDR_LEN - i - 1] = src[i];
+		dst[i] = src[i];
 }
 
 static int
 nfp_net_pf_read_mac(struct nfp_net_hw *hw, int port)
 {
-	union eth_table_entry *entry;
-	int idx, i;
-
-	idx = port;
-	entry = hw->eth_table;
-
-	/* Reading NFP ethernet table obtained before */
-	for (i = 0; i < NSP_ETH_MAX_COUNT; i++) {
-		if (!(entry->port & NSP_ETH_PORT_LANES_MASK)) {
-			/* port not in use */
-			entry++;
-			continue;
-		}
-		if (idx == 0)
-			break;
-		idx--;
-		entry++;
-	}
-
-	if (i == NSP_ETH_MAX_COUNT)
-		return -EINVAL;
+	struct nfp_eth_table *nfp_eth_table;
 
+	nfp_eth_table = nfp_eth_read_ports(hw->cpp);
 	/*
 	 * hw points to port0 private data. We need hw now pointing to
 	 * right port.
 	 */
 	hw += port;
-	nfp_eth_copy_mac_reverse((uint8_t *)&hw->mac_addr,
-				 (uint8_t *)&entry->mac_addr);
+	nfp_eth_copy_mac((uint8_t *)&hw->mac_addr,
+			 (uint8_t *)&nfp_eth_table->ports[port].mac_addr);
 
+	free(nfp_eth_table);
 	return 0;
 }
 
@@ -780,7 +764,7 @@ enum nfp_qcp_ptr {
 
 	if (hw->is_pf)
 		/* Configure the physical port up */
-		nfp_nsp_eth_config(hw->nspu_desc, hw->pf_port_idx, 1);
+		nfp_eth_set_configured(hw->cpp, hw->pf_port_idx, 1);
 
 	hw->ctrl = new_ctrl;
 
@@ -831,7 +815,7 @@ enum nfp_qcp_ptr {
 
 	if (hw->is_pf)
 		/* Configure the physical port down */
-		nfp_nsp_eth_config(hw->nspu_desc, hw->pf_port_idx, 0);
+		nfp_eth_set_configured(hw->cpp, hw->pf_port_idx, 0);
 }
 
 /* Reset and stop device. The device can not be restarted. */
@@ -2678,10 +2662,8 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	uint64_t tx_bar_off = 0, rx_bar_off = 0;
 	uint32_t start_q;
 	int stride = 4;
-
-	nspu_desc_t *nspu_desc = NULL;
-	uint64_t bar_offset;
 	int port = 0;
+	int err;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -2702,7 +2684,6 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 
 		/* This points to the specific port private data */
 		hw = &hwport0[port];
-		hw->pf_port_idx = port;
 	} else {
 		hw = NFP_NET_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
 		hwport0 = 0;
@@ -2736,19 +2717,14 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	}
 
 	if (hw->is_pf && port == 0) {
-		nspu_desc = hw->nspu_desc;
-
-		if (nfp_nsp_map_ctrl_bar(nspu_desc, &bar_offset) != 0) {
-			/*
-			 * A firmware should be there after PF probe so this
-			 * should not happen.
-			 */
-			RTE_LOG(ERR, PMD, "PF BAR symbol resolution failed\n");
-			return -ENODEV;
+		hw->ctrl_bar = nfp_rtsym_map(hw->sym_tbl, "_pf0_net_bar0",
+					     hw->total_ports * 32768,
+					     &hw->ctrl_area);
+		if (!hw->ctrl_bar) {
+			printf("nfp_rtsym_map fails for _pf0_net_ctrl_bar\n");
+			return -EIO;
 		}
 
-		/* vNIC PF control BAR is a subset of PF PCI device BAR */
-		hw->ctrl_bar += bar_offset;
 		PMD_INIT_LOG(DEBUG, "ctrl bar: %p\n", hw->ctrl_bar);
 	}
 
@@ -2772,13 +2748,14 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	case PCI_DEVICE_ID_NFP6000_PF_NIC:
 	case PCI_DEVICE_ID_NFP6000_VF_NIC:
 		start_q = nn_cfg_readl(hw, NFP_NET_CFG_START_TXQ);
-		tx_bar_off = NFP_PCIE_QUEUE(start_q);
+		tx_bar_off = start_q * NFP_QCP_QUEUE_ADDR_SZ;
 		start_q = nn_cfg_readl(hw, NFP_NET_CFG_START_RXQ);
-		rx_bar_off = NFP_PCIE_QUEUE(start_q);
+		rx_bar_off = start_q * NFP_QCP_QUEUE_ADDR_SZ;
 		break;
 	default:
 		RTE_LOG(ERR, PMD, "nfp_net: no device ID matching\n");
-		return -ENODEV;
+		err = -ENODEV;
+		goto dev_err_ctrl_map;
 	}
 
 	PMD_INIT_LOG(DEBUG, "tx_bar_off: 0x%" PRIx64 "\n", tx_bar_off);
@@ -2786,17 +2763,19 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 
 	if (hw->is_pf && port == 0) {
 		/* configure access to tx/rx vNIC BARs */
-		nfp_nsp_map_queues_bar(nspu_desc, &bar_offset);
-		PMD_INIT_LOG(DEBUG, "tx/rx bar_offset: %" PRIx64 "\n",
-				    bar_offset);
-		hwport0->hw_queues = (uint8_t *)pci_dev->mem_resource[0].addr;
-
-		/* vNIC PF tx/rx BARs are a subset of PF PCI device */
-		hwport0->hw_queues += bar_offset;
+		hwport0->hw_queues = nfp_cpp_map_area(hw->cpp, 0, 0,
+						      NFP_PCIE_QUEUE(0),
+						      NFP_QCP_QUEUE_AREA_SZ,
+						      &hw->hwqueues_area);
+
+		if (!hwport0->hw_queues) {
+			printf("nfp_rtsym_map fails for net.qc\n");
+			err = -EIO;
+			goto dev_err_ctrl_map;
+		}
 
-		/* Lets seize the chance to read eth table from hw */
-		if (nfp_nsp_eth_read_table(nspu_desc, &hw->eth_table))
-			return -ENODEV;
+		PMD_INIT_LOG(DEBUG, "tx/rx bar address: 0x%p\n",
+				    hwport0->hw_queues);
 	}
 
 	if (hw->is_pf) {
@@ -2856,7 +2835,8 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	eth_dev->data->mac_addrs = rte_zmalloc("mac_addr", ETHER_ADDR_LEN, 0);
 	if (eth_dev->data->mac_addrs == NULL) {
 		PMD_INIT_LOG(ERR, "Failed to space for MAC address");
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto dev_err_queues_map;
 	}
 
 	if (hw->is_pf) {
@@ -2867,6 +2847,8 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	}
 
 	if (!is_valid_assigned_ether_addr((struct ether_addr *)&hw->mac_addr)) {
+		PMD_INIT_LOG(INFO, "Using random mac address for port %d\n",
+				   port);
 		/* Using random mac addresses for VFs */
 		eth_random_addr(&hw->mac_addr[0]);
 		nfp_net_write_mac(hw, (uint8_t *)&hw->mac_addr);
@@ -2895,11 +2877,19 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	nfp_net_stats_reset(eth_dev);
 
 	return 0;
+
+dev_err_queues_map:
+		nfp_cpp_area_free(hw->hwqueues_area);
+dev_err_ctrl_map:
+		nfp_cpp_area_free(hw->ctrl_area);
+
+	return err;
 }
 
 static int
 nfp_pf_create_dev(struct rte_pci_device *dev, int port, int ports,
-		  nfpu_desc_t *nfpu_desc, void **priv)
+		  struct nfp_cpp *cpp, struct nfp_hwinfo *hwinfo,
+		  int phys_port, struct nfp_rtsym_table *sym_tbl, void **priv)
 {
 	struct rte_eth_dev *eth_dev;
 	struct nfp_net_hw *hw;
@@ -2937,12 +2927,16 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	 * Then dev_private is adjusted per port.
 	 */
 	hw = (struct nfp_net_hw *)(eth_dev->data->dev_private) + port;
-	hw->nspu_desc = nfpu_desc->nspu;
-	hw->nfpu_desc = nfpu_desc;
+	hw->cpp = cpp;
+	hw->hwinfo = hwinfo;
+	hw->sym_tbl = sym_tbl;
+	hw->pf_port_idx = phys_port;
 	hw->is_pf = 1;
 	if (ports > 1)
 		hw->pf_multiport_enabled = 1;
 
+	hw->total_ports = ports;
+
 	eth_dev->device = &dev->device;
 	rte_eth_copy_pci_info(eth_dev, dev);
 
@@ -2956,55 +2950,191 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	return ret;
 }
 
+#define DEFAULT_FW_PATH       "/lib/firmware/netronome"
+
+static int
+nfp_fw_upload(struct rte_pci_device *dev, struct nfp_nsp *nsp, char *card)
+{
+	struct nfp_cpp *cpp = nsp->cpp;
+	int fw_f;
+	char *fw_buf;
+	char fw_name[100];
+	char serial[100];
+	struct stat file_stat;
+	off_t fsize, bytes;
+
+	/* Looking for firmware file in order of priority */
+
+	/* First try to find a firmware image specific for this device */
+	sprintf(serial, "serial-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x",
+		cpp->serial[0], cpp->serial[1], cpp->serial[2], cpp->serial[3],
+		cpp->serial[4], cpp->serial[5], cpp->interface >> 8,
+		cpp->interface & 0xff);
+
+	sprintf(fw_name, "%s/%s.nffw", DEFAULT_FW_PATH, serial);
+
+	RTE_LOG(DEBUG, PMD, "Trying with fw file: %s\n", fw_name);
+	fw_f = open(fw_name, O_RDONLY);
+	if (fw_f > 0)
+		goto read_fw;
+
+	/* Then try the PCI name */
+	sprintf(fw_name, "%s/pci-%s.nffw", DEFAULT_FW_PATH, dev->device.name);
+
+	RTE_LOG(DEBUG, PMD, "Trying with fw file: %s\n", fw_name);
+	fw_f = open(fw_name, O_RDONLY);
+	if (fw_f > 0)
+		goto read_fw;
+
+	/* Finally try the card type and media */
+	sprintf(fw_name, "%s/%s", DEFAULT_FW_PATH, card);
+	RTE_LOG(DEBUG, PMD, "Trying with fw file: %s\n", fw_name);
+	fw_f = open(fw_name, O_RDONLY);
+	if (fw_f < 0) {
+		RTE_LOG(INFO, PMD, "Firmware file %s not found.", fw_name);
+		return -ENOENT;
+	}
+
+read_fw:
+	if (fstat(fw_f, &file_stat) < 0) {
+		RTE_LOG(INFO, PMD, "Firmware file %s size is unknown", fw_name);
+		close(fw_f);
+		return -ENOENT;
+	}
+
+	fsize = file_stat.st_size;
+	RTE_LOG(INFO, PMD, "Firmware file found at %s with size: %" PRIu64 "\n",
+			    fw_name, (uint64_t)fsize);
+
+	fw_buf = malloc((size_t)fsize);
+	if (!fw_buf) {
+		RTE_LOG(INFO, PMD, "malloc failed for fw buffer");
+		close(fw_f);
+		return -ENOMEM;
+	}
+	memset(fw_buf, 0, fsize);
+
+	bytes = read(fw_f, fw_buf, fsize);
+	if (bytes != fsize) {
+		RTE_LOG(INFO, PMD, "Reading fw to buffer failed.\n"
+				   "Just %" PRIu64 " of %" PRIu64 " bytes read",
+				   (uint64_t)bytes, (uint64_t)fsize);
+		free(fw_buf);
+		close(fw_f);
+		return -EIO;
+	}
+
+	RTE_LOG(INFO, PMD, "Uploading the firmware ...");
+	nfp_nsp_load_fw(nsp, fw_buf, bytes);
+	RTE_LOG(INFO, PMD, "Done");
+
+	free(fw_buf);
+	close(fw_f);
+
+	return 0;
+}
+
+static int
+nfp_fw_setup(struct rte_pci_device *dev, struct nfp_cpp *cpp,
+	     struct nfp_eth_table *nfp_eth_table, struct nfp_hwinfo *hwinfo)
+{
+	struct nfp_nsp *nsp;
+	const char *nfp_fw_model;
+	char card_desc[100];
+	int err = 0;
+
+	nfp_fw_model = nfp_hwinfo_lookup(hwinfo, "assembly.partno");
+
+	if (nfp_fw_model) {
+		RTE_LOG(INFO, PMD, "firmware model found: %s\n", nfp_fw_model);
+	} else {
+		RTE_LOG(ERR, PMD, "firmware model NOT found\n");
+		return -EIO;
+	}
+
+	if (nfp_eth_table->count == 0 || nfp_eth_table->count > 8) {
+		RTE_LOG(ERR, PMD, "NFP ethernet table reports wrong ports: %u\n",
+		       nfp_eth_table->count);
+		return -EIO;
+	}
+
+	RTE_LOG(INFO, PMD, "NFP ethernet port table reports %u ports\n",
+			   nfp_eth_table->count);
+
+	RTE_LOG(INFO, PMD, "Port speed: %u\n", nfp_eth_table->ports[0].speed);
+
+	sprintf(card_desc, "nic_%s_%dx%d.nffw", nfp_fw_model,
+		nfp_eth_table->count, nfp_eth_table->ports[0].speed / 1000);
+
+	nsp = nfp_nsp_open(cpp);
+	if (!nsp) {
+		RTE_LOG(ERR, PMD, "NFP error when obtaining NSP handle\n");
+		return -EIO;
+	}
+
+	nfp_nsp_device_soft_reset(nsp);
+	err = nfp_fw_upload(dev, nsp, card_desc);
+
+	nfp_nsp_close(nsp);
+	return err;
+}
+
 static int nfp_pf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			    struct rte_pci_device *dev)
 {
-	nfpu_desc_t *nfpu_desc;
-	nspu_desc_t *nspu_desc;
-	uint64_t offset_symbol;
-	uint8_t *bar_offset;
-	int major, minor;
+	struct nfp_cpp *cpp;
+	struct nfp_hwinfo *hwinfo;
+	struct nfp_rtsym_table *sym_tbl;
+	struct nfp_eth_table *nfp_eth_table = NULL;
 	int total_ports;
 	void *priv = 0;
 	int ret = -ENODEV;
+	int err;
 	int i;
 
 	if (!dev)
 		return ret;
 
-	nfpu_desc = rte_malloc("nfp nfpu", sizeof(nfpu_desc_t), 0);
-	if (!nfpu_desc)
-		return -ENOMEM;
-
-	if (nfpu_open(dev, nfpu_desc, 0) < 0) {
-		RTE_LOG(ERR, PMD,
-			"nfpu_open failed\n");
-		goto nfpu_error;
+	cpp = nfp_cpp_from_device_name(dev->device.name);
+	if (!cpp) {
+		RTE_LOG(ERR, PMD, "A CPP handle can not be obtained");
+		ret = -EIO;
+		goto error;
 	}
 
-	nspu_desc = nfpu_desc->nspu;
+	hwinfo = nfp_hwinfo_read(cpp);
+	if (!hwinfo) {
+		RTE_LOG(ERR, PMD, "Error reading hwinfo table");
+		return -EIO;
+	}
 
+	nfp_eth_table = nfp_eth_read_ports(cpp);
+	if (!nfp_eth_table) {
+		RTE_LOG(ERR, PMD, "Error reading NFP ethernet table\n");
+		return -EIO;
+	}
 
-	/* Check NSP ABI version */
-	if (nfp_nsp_get_abi_version(nspu_desc, &major, &minor) < 0) {
-		RTE_LOG(INFO, PMD, "NFP NSP not present\n");
+	if (nfp_fw_setup(dev, cpp, nfp_eth_table, hwinfo)) {
+		RTE_LOG(INFO, PMD, "Error when uploading firmware\n");
+		ret = -EIO;
 		goto error;
 	}
-	PMD_INIT_LOG(INFO, "nspu ABI version: %d.%d\n", major, minor);
 
-	if ((major == 0) && (minor < 20)) {
-		RTE_LOG(INFO, PMD, "NFP NSP ABI version too old. Required 0.20 or higher\n");
+	/* Now the symbol table should be there */
+	sym_tbl = nfp_rtsym_table_read(cpp);
+	if (!sym_tbl) {
+		RTE_LOG(ERR, PMD, "Something is wrong with the firmware"
+				" symbol table");
+		ret = -EIO;
 		goto error;
 	}
 
-	ret = nfp_nsp_fw_setup(nspu_desc, "nfd_cfg_pf0_num_ports",
-			       &offset_symbol);
-	if (ret)
+	total_ports = nfp_rtsym_read_le(sym_tbl, "nfd_cfg_pf0_num_ports", &err);
+	if (total_ports != (int)nfp_eth_table->count) {
+		RTE_LOG(ERR, PMD, "Inconsistent number of ports\n");
+		ret = -EIO;
 		goto error;
-
-	bar_offset = (uint8_t *)dev->mem_resource[0].addr;
-	bar_offset += offset_symbol;
-	total_ports = (uint32_t)*bar_offset;
+	}
 	PMD_INIT_LOG(INFO, "Total pf ports: %d\n", total_ports);
 
 	if (total_ports <= 0 || total_ports > 8) {
@@ -3014,18 +3144,15 @@ static int nfp_pf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	}
 
 	for (i = 0; i < total_ports; i++) {
-		ret = nfp_pf_create_dev(dev, i, total_ports, nfpu_desc, &priv);
+		ret = nfp_pf_create_dev(dev, i, total_ports, cpp, hwinfo,
+					nfp_eth_table->ports[i].index,
+					sym_tbl, &priv);
 		if (ret)
-			goto error;
+			break;
 	}
 
-	return 0;
-
 error:
-	nfpu_close(nfpu_desc);
-nfpu_error:
-	rte_free(nfpu_desc);
-
+	free(nfp_eth_table);
 	return ret;
 }
 
@@ -3073,8 +3200,19 @@ static int eth_nfp_pci_remove(struct rte_pci_device *pci_dev)
 	if ((pci_dev->id.device_id == PCI_DEVICE_ID_NFP4000_PF_NIC) ||
 	    (pci_dev->id.device_id == PCI_DEVICE_ID_NFP6000_PF_NIC)) {
 		port = get_pf_port_number(eth_dev->data->name);
+		/*
+		 * hotplug is not possible with multiport PF although freeing
+		 * data structures can be done for first port.
+		 */
+		if (port != 0)
+			return -ENOTSUP;
 		hwport0 = NFP_NET_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
 		hw = &hwport0[port];
+		nfp_cpp_area_free(hw->ctrl_area);
+		nfp_cpp_area_free(hw->hwqueues_area);
+		free(hw->hwinfo);
+		free(hw->sym_tbl);
+		nfp_cpp_free(hw->cpp);
 	} else {
 		hw = NFP_NET_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
 	}
diff --git a/drivers/net/nfp/nfp_net_pmd.h b/drivers/net/nfp/nfp_net_pmd.h
index 1ae0ea6..097c871 100644
--- a/drivers/net/nfp/nfp_net_pmd.h
+++ b/drivers/net/nfp/nfp_net_pmd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2015 Netronome Systems, Inc.
+ * Copyright (c) 2014-2018 Netronome Systems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -63,6 +63,7 @@
 #define NFP_NET_CRTL_BAR        0
 #define NFP_NET_TX_BAR          2
 #define NFP_NET_RX_BAR          2
+#define NFP_QCP_QUEUE_AREA_SZ			0x80000
 
 /* Macros for accessing the Queue Controller Peripheral 'CSRs' */
 #define NFP_QCP_QUEUE_OFF(_x)                 ((_x) * 0x800)
@@ -430,20 +431,21 @@ struct nfp_net_hw {
 	/* Records starting point for counters */
 	struct rte_eth_stats eth_stats_base;
 
-#ifdef NFP_NET_LIBNFP
 	struct nfp_cpp *cpp;
 	struct nfp_cpp_area *ctrl_area;
-	struct nfp_cpp_area *tx_area;
-	struct nfp_cpp_area *rx_area;
+	struct nfp_cpp_area *hwqueues_area;
 	struct nfp_cpp_area *msix_area;
-#endif
+
 	uint8_t *hw_queues;
 	uint8_t is_pf;
 	uint8_t pf_port_idx;
 	uint8_t pf_multiport_enabled;
+	uint8_t total_ports;
+
 	union eth_table_entry *eth_table;
-	nspu_desc_t *nspu_desc;
-	nfpu_desc_t *nfpu_desc;
+
+	struct nfp_hwinfo *hwinfo;
+	struct nfp_rtsym_table *sym_tbl;
 };
 
 struct nfp_net_adapter {
-- 
1.9.1

^ permalink raw reply	[relevance 6%]
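
As an aside for readers of this series, the block below is a condensed,
illustrative sketch (not part of the patch) of the CPP-based PF probe flow
this patch moves to. It only uses nfpcore calls that appear in the diff
above, with error cleanup and the firmware file search omitted, so treat it
as a summary of the sequence rather than the actual implementation.

	#include <errno.h>
	#include <rte_bus_pci.h>

	#include "nfpcore/nfp_cpp.h"
	#include "nfpcore/nfp_hwinfo.h"
	#include "nfpcore/nfp_nsp.h"
	#include "nfpcore/nfp_rtsym.h"

	static int
	nfp_pf_probe_sketch(struct rte_pci_device *dev)
	{
		struct nfp_cpp *cpp;
		struct nfp_hwinfo *hwinfo;
		struct nfp_eth_table *eth_tbl;
		struct nfp_rtsym_table *sym_tbl;
		struct nfp_cpp_area *ctrl_area;
		struct nfp_nsp *nsp;
		const char *model;
		uint8_t *ctrl_bar;
		int ports, err;

		/* One CPP handle per PF device; everything else hangs off it */
		cpp = nfp_cpp_from_device_name(dev->device.name);
		if (!cpp)
			return -EIO;

		/* Board information comes from the hwinfo table over CPP */
		hwinfo = nfp_hwinfo_read(cpp);
		model = nfp_hwinfo_lookup(hwinfo, "assembly.partno");
		if (!model)
			return -EIO;

		/* Physical port table, previously obtained through NSPU */
		eth_tbl = nfp_eth_read_ports(cpp);
		if (!eth_tbl || eth_tbl->count == 0)
			return -EIO;

		/* Firmware upload goes through the NSP, itself reached via CPP;
		 * the firmware file lookup done by the patch is omitted here.
		 */
		nsp = nfp_nsp_open(cpp);
		nfp_nsp_device_soft_reset(nsp);
		/* nfp_nsp_load_fw(nsp, fw_buf, fw_size) would upload an image */
		nfp_nsp_close(nsp);

		/* With firmware running, its run-time symbols become available */
		sym_tbl = nfp_rtsym_table_read(cpp);
		ports = nfp_rtsym_read_le(sym_tbl, "nfd_cfg_pf0_num_ports", &err);

		/* The vNIC control BAR is now mapped via a firmware symbol
		 * instead of the old NSPU BAR offsets.
		 */
		ctrl_bar = nfp_rtsym_map(sym_tbl, "_pf0_net_bar0",
					 ports * 32768, &ctrl_area);
		return ctrl_bar == NULL ? -EIO : 0;
	}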

* [dpdk-dev] [PATCH v2 1/4] net/nfp: add NFP CPP support
  @ 2018-04-05 14:42  1% ` Alejandro Lucero
  2018-04-05 14:42  6% ` [dpdk-dev] [PATCH v2 2/4] net/nfp: update PMD for using new CPP interface Alejandro Lucero
  1 sibling, 0 replies; 200+ results
From: Alejandro Lucero @ 2018-04-05 14:42 UTC (permalink / raw)
  To: dev

CPP refers to the internal NFP Command Push Pull bus. This patch allows
CPP commands to be created from user space, giving access to any single
part of the chip.

This CPP interface is the base for other functionalities such as
mutexes for accessing specific chip components, chip resource management,
firmware upload, and use of the NSP, an embedded ARM processor which can
perform tasks on demand.

NSP was previously the only way for the PMD to operate on the chip,
with an NSPU interface used for commands like firmware upload or
port link configuration. The CPP interface supersedes NSPU, but it is
still possible to use the NSP through CPP.

The CPP interface adds great flexibility for things like extended
stats or firmware debugging.

Signed-off-by: Alejandro Lucero <alejandro.lucero@netronome.com>
---
 drivers/net/nfp/nfpcore/nfp-common/nfp_cppat.h    | 722 +++++++++++++++++
 drivers/net/nfp/nfpcore/nfp-common/nfp_platform.h |  36 +
 drivers/net/nfp/nfpcore/nfp-common/nfp_resid.h    | 592 ++++++++++++++
 drivers/net/nfp/nfpcore/nfp6000/nfp6000.h         |  40 +
 drivers/net/nfp/nfpcore/nfp6000/nfp_xpb.h         |  26 +
 drivers/net/nfp/nfpcore/nfp_cpp.h                 | 776 ++++++++++++++++++
 drivers/net/nfp/nfpcore/nfp_cpp_pcie_ops.c        | 936 ++++++++++++++++++++++
 drivers/net/nfp/nfpcore/nfp_cppcore.c             | 856 ++++++++++++++++++++
 drivers/net/nfp/nfpcore/nfp_crc.c                 |  49 ++
 drivers/net/nfp/nfpcore/nfp_crc.h                 |  19 +
 drivers/net/nfp/nfpcore/nfp_hwinfo.c              | 199 +++++
 drivers/net/nfp/nfpcore/nfp_hwinfo.h              |  85 ++
 drivers/net/nfp/nfpcore/nfp_mip.c                 | 154 ++++
 drivers/net/nfp/nfpcore/nfp_mip.h                 |  21 +
 drivers/net/nfp/nfpcore/nfp_mutex.c               | 424 ++++++++++
 drivers/net/nfp/nfpcore/nfp_nffw.c                | 235 ++++++
 drivers/net/nfp/nfpcore/nfp_nffw.h                |  86 ++
 drivers/net/nfp/nfpcore/nfp_nsp.c                 | 427 ++++++++++
 drivers/net/nfp/nfpcore/nfp_nsp.h                 | 304 +++++++
 drivers/net/nfp/nfpcore/nfp_nsp_cmds.c            | 109 +++
 drivers/net/nfp/nfpcore/nfp_nsp_eth.c             | 665 +++++++++++++++
 drivers/net/nfp/nfpcore/nfp_resource.c            | 264 ++++++
 drivers/net/nfp/nfpcore/nfp_resource.h            |  52 ++
 drivers/net/nfp/nfpcore/nfp_rtsym.c               | 327 ++++++++
 drivers/net/nfp/nfpcore/nfp_rtsym.h               |  61 ++
 drivers/net/nfp/nfpcore/nfp_target.h              | 579 +++++++++++++
 26 files changed, 8044 insertions(+)
 create mode 100644 drivers/net/nfp/nfpcore/nfp-common/nfp_cppat.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp-common/nfp_platform.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp-common/nfp_resid.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp6000/nfp6000.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp6000/nfp_xpb.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_cpp.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_cpp_pcie_ops.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_cppcore.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_crc.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_crc.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_hwinfo.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_hwinfo.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_mip.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_mip.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_mutex.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nffw.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nffw.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nsp.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nsp.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nsp_cmds.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nsp_eth.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_resource.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_resource.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_rtsym.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_rtsym.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_target.h

diff --git a/drivers/net/nfp/nfpcore/nfp-common/nfp_cppat.h b/drivers/net/nfp/nfpcore/nfp-common/nfp_cppat.h
new file mode 100644
index 0000000..6e380cc
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp-common/nfp_cppat.h
@@ -0,0 +1,722 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_CPPAT_H__
+#define __NFP_CPPAT_H__
+
+#include "nfp_platform.h"
+#include "nfp_resid.h"
+
+/* This file contains helpers for creating CPP commands
+ *
+ * All magic NFP-6xxx IMB 'mode' numbers here are from:
+ * Databook (1 August 2013)
+ * - System Overview and Connectivity
+ * -- Internal Connectivity
+ * --- Distributed Switch Fabric - Command Push/Pull (DSF-CPP) Bus
+ * ---- CPP addressing
+ * ----- Table 3.6. CPP Address Translation Mode Commands
+ */
+
+#define _NIC_NFP6000_MU_LOCALITY_DIRECT 2
+
+static inline int
+_nfp6000_decode_basic(uint64_t addr, int *dest_island, int cpp_tgt, int mode,
+		      int addr40, int isld1, int isld0);
+
+static uint64_t
+_nic_mask64(int msb, int lsb, int at0)
+{
+	uint64_t v;
+	int w = msb - lsb + 1;
+
+	if (w == 64)
+		return ~(uint64_t)0;
+
+	if ((lsb + w) > 64)
+		return 0;
+
+	v = (UINT64_C(1) << w) - 1;
+
+	if (at0)
+		return v;
+
+	return v << lsb;
+}
+
+/* For VQDR, we may not modify the Channel bits, which might overlap
+ * with the Index bit. When it does, we need to ensure that isld0 == isld1.
+ */
+static inline int
+_nfp6000_encode_basic(uint64_t *addr, int dest_island, int cpp_tgt, int mode,
+		      int addr40, int isld1, int isld0)
+{
+	uint64_t _u64;
+	int iid_lsb, idx_lsb;
+	int i, v = 0;
+	int isld[2];
+
+	isld[0] = isld0;
+	isld[1] = isld1;
+
+	switch (cpp_tgt) {
+	case NFP6000_CPPTGT_MU:
+		/* This function doesn't handle MU */
+		return NFP_ERRNO(EINVAL);
+	case NFP6000_CPPTGT_CTXPB:
+		/* This function doesn't handle CTXPB */
+		return NFP_ERRNO(EINVAL);
+	default:
+		break;
+	}
+
+	switch (mode) {
+	case 0:
+		if (cpp_tgt == NFP6000_CPPTGT_VQDR && !addr40) {
+			/*
+			 * In this specific mode we'd rather not modify the
+			 * address but we can verify if the existing contents
+			 * will point to a valid island.
+			 */
+			i = _nfp6000_decode_basic(*addr, &v, cpp_tgt, mode,
+						  addr40, isld1,
+						  isld0);
+			if (i != 0)
+				/* Full Island ID and channel bits overlap */
+				return i;
+
+			/*
+			 * If dest_island is invalid, the current address won't
+			 * go where expected.
+			 */
+			if (dest_island != -1 && dest_island != v)
+				return NFP_ERRNO(EINVAL);
+
+			/* If dest_island was -1, we don't care */
+			return 0;
+		}
+
+		iid_lsb = (addr40) ? 34 : 26;
+
+		/* <39:34> or <31:26> */
+		_u64 = _nic_mask64((iid_lsb + 5), iid_lsb, 0);
+		*addr &= ~_u64;
+		*addr |= (((uint64_t)dest_island) << iid_lsb) & _u64;
+		return 0;
+	case 1:
+		if (cpp_tgt == NFP6000_CPPTGT_VQDR && !addr40) {
+			i = _nfp6000_decode_basic(*addr, &v, cpp_tgt, mode,
+						  addr40, isld1, isld0);
+			if (i != 0)
+				/* Full Island ID and channel bits overlap */
+				return i;
+
+			/*
+			 * If dest_island is invalid, the current address won't
+			 * go where expected.
+			 */
+			if (dest_island != -1 && dest_island != v)
+				return NFP_ERRNO(EINVAL);
+
+			/* If dest_island was -1, we don't care */
+			return 0;
+		}
+
+		idx_lsb = (addr40) ? 39 : 31;
+		if (dest_island == isld0) {
+			/* Only need to clear the Index bit */
+			*addr &= ~_nic_mask64(idx_lsb, idx_lsb, 0);
+			return 0;
+		}
+
+		if (dest_island == isld1) {
+			/* Only need to set the Index bit */
+			*addr |= (UINT64_C(1) << idx_lsb);
+			return 0;
+		}
+
+		return NFP_ERRNO(ENODEV);
+	case 2:
+		if (cpp_tgt == NFP6000_CPPTGT_VQDR && !addr40) {
+			/* iid<0> = addr<30> = channel<0> */
+			/* channel<1> = addr<31> = Index */
+
+			/*
+			 * Special case where we allow channel bits to be set
+			 * before hand and with them select an island.
+			 * So we need to confirm that it's at least plausible.
+			 */
+			i = _nfp6000_decode_basic(*addr, &v, cpp_tgt, mode,
+						  addr40, isld1, isld0);
+			if (i != 0)
+				/* Full Island ID and channel bits overlap */
+				return i;
+
+			/*
+			 * If dest_island is invalid, the current address won't
+			 * go where expected.
+			 */
+			if (dest_island != -1 && dest_island != v)
+				return NFP_ERRNO(EINVAL);
+
+			/* If dest_island was -1, we don't care */
+			return 0;
+		}
+
+		/*
+		 * Make sure we compare against isldN values by clearing the
+		 * LSB. This is what the silicon does.
+		 */
+		isld[0] &= ~1;
+		isld[1] &= ~1;
+
+		idx_lsb = (addr40) ? 39 : 31;
+		iid_lsb = idx_lsb - 1;
+
+		/*
+		 * Try each option, take first one that fits. Not sure if we
+		 * would want to do some smarter searching and prefer 0 or non-0
+		 * island IDs.
+		 */
+
+		for (i = 0; i < 2; i++) {
+			for (v = 0; v < 2; v++) {
+				if (dest_island != (isld[i] | v))
+					continue;
+				*addr &= ~_nic_mask64(idx_lsb, iid_lsb, 0);
+				*addr |= (((uint64_t)i) << idx_lsb);
+				*addr |= (((uint64_t)v) << iid_lsb);
+				return 0;
+			}
+		}
+
+		return NFP_ERRNO(ENODEV);
+	case 3:
+		if (cpp_tgt == NFP6000_CPPTGT_VQDR && !addr40) {
+			/*
+			 * iid<0> = addr<29> = data
+			 * iid<1> = addr<30> = channel<0>
+			 * channel<1> = addr<31> = Index
+			 */
+			i = _nfp6000_decode_basic(*addr, &v, cpp_tgt, mode,
+						  addr40, isld1, isld0);
+			if (i != 0)
+				/* Full Island ID and channel bits overlap */
+				return i;
+
+			if (dest_island != -1 && dest_island != v)
+				return NFP_ERRNO(EINVAL);
+
+			/* If dest_island was -1, we don't care */
+			return 0;
+		}
+
+		isld[0] &= ~3;
+		isld[1] &= ~3;
+
+		idx_lsb = (addr40) ? 39 : 31;
+		iid_lsb = idx_lsb - 2;
+
+		for (i = 0; i < 2; i++) {
+			for (v = 0; v < 4; v++) {
+				if (dest_island != (isld[i] | v))
+					continue;
+				*addr &= ~_nic_mask64(idx_lsb, iid_lsb, 0);
+				*addr |= (((uint64_t)i) << idx_lsb);
+				*addr |= (((uint64_t)v) << iid_lsb);
+				return 0;
+			}
+		}
+		return NFP_ERRNO(ENODEV);
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_decode_basic(uint64_t addr, int *dest_island, int cpp_tgt, int mode,
+		      int addr40, int isld1, int isld0)
+{
+	int iid_lsb, idx_lsb;
+
+	switch (cpp_tgt) {
+	case NFP6000_CPPTGT_MU:
+		/* This function doesn't handle MU */
+		return NFP_ERRNO(EINVAL);
+	case NFP6000_CPPTGT_CTXPB:
+		/* This function doesn't handle CTXPB */
+		return NFP_ERRNO(EINVAL);
+	default:
+		break;
+	}
+
+	switch (mode) {
+	case 0:
+		/*
+		 * For VQDR, in this mode for 32-bit addressing it would be
+		 * islands 0, 16, 32 and 48 depending on channel and upper
+		 * address bits. Since those are not all valid islands, most
+		 * decode cases would result in bad island IDs, but we do them
+		 * anyway since this is decoding an address that is already
+		 * assumed to be used as-is to get to sram.
+		 */
+		iid_lsb = (addr40) ? 34 : 26;
+		*dest_island = (int)(addr >> iid_lsb) & 0x3F;
+		return 0;
+	case 1:
+		/*
+		 * For VQDR 32-bit, this would decode as:
+		 *	Channel 0: island#0
+		 *	Channel 1: island#0
+		 *	Channel 2: island#1
+		 *	Channel 3: island#1
+		 *
+		 * That would be valid as long as both islands have VQDR.
+		 * Let's allow this.
+		 */
+
+		idx_lsb = (addr40) ? 39 : 31;
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1;
+		else
+			*dest_island = isld0;
+
+		return 0;
+	case 2:
+		/*
+		 * For VQDR 32-bit:
+		 *	Channel 0: (island#0 | 0)
+		 *	Channel 1: (island#0 | 1)
+		 *	Channel 2: (island#1 | 0)
+		 *	Channel 3: (island#1 | 1)
+		 *
+		 * Make sure we compare against isldN values by clearing the
+		 * LSB. This is what the silicon does.
+		 */
+		isld0 &= ~1;
+		isld1 &= ~1;
+
+		idx_lsb = (addr40) ? 39 : 31;
+		iid_lsb = idx_lsb - 1;
+
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1 | (int)((addr >> iid_lsb) & 1);
+		else
+			*dest_island = isld0 | (int)((addr >> iid_lsb) & 1);
+
+		return 0;
+	case 3:
+		/*
+		 * In this mode the data address starts to affect the island ID
+		 * so we'd rather not allow it. In some really specific case one
+		 * could use this to send the upper half of the VQDR channel to
+		 * another MU, but this is getting very specific. However, as
+		 * above for mode 0, this is the decoder and the caller should
+		 * validate the resulting IID. This blindly does what the
+		 * silicon would do.
+		 */
+
+		isld0 &= ~3;
+		isld1 &= ~3;
+
+		idx_lsb = (addr40) ? 39 : 31;
+		iid_lsb = idx_lsb - 2;
+
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1 | (int)((addr >> iid_lsb) & 3);
+		else
+			*dest_island = isld0 | (int)((addr >> iid_lsb) & 3);
+
+		return 0;
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_cppat_mu_locality_lsb(int mode, int addr40)
+{
+	switch (mode) {
+	case 0:
+	case 1:
+	case 2:
+	case 3:
+		return (addr40) ? 38 : 30;
+	default:
+		break;
+	}
+	return NFP_ERRNO(EINVAL);
+}
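+
+/*
+ * Note for readers: the two bits starting at the locality LSB returned above
+ * (<39:38> for 40-bit addressing, <31:30> otherwise) select the MU locality,
+ * and the value _NIC_NFP6000_MU_LOCALITY_DIRECT (2) selects direct access,
+ * which is what the MU encode/decode helpers below test for.
+ */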
+
+static inline int
+_nfp6000_encode_mu(uint64_t *addr, int dest_island, int mode, int addr40,
+		   int isld1, int isld0)
+{
+	uint64_t _u64;
+	int iid_lsb, idx_lsb, locality_lsb;
+	int i, v;
+	int isld[2];
+	int da;
+
+	isld[0] = isld0;
+	isld[1] = isld1;
+	locality_lsb = _nfp6000_cppat_mu_locality_lsb(mode, addr40);
+
+	if (((*addr >> locality_lsb) & 3) == _NIC_NFP6000_MU_LOCALITY_DIRECT)
+		da = 1;
+	else
+		da = 0;
+
+	switch (mode) {
+	case 0:
+		iid_lsb = (addr40) ? 32 : 24;
+		_u64 = _nic_mask64((iid_lsb + 5), iid_lsb, 0);
+		*addr &= ~_u64;
+		*addr |= (((uint64_t)dest_island) << iid_lsb) & _u64;
+		return 0;
+	case 1:
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			_u64 = _nic_mask64((iid_lsb + 5), iid_lsb, 0);
+			*addr &= ~_u64;
+			*addr |= (((uint64_t)dest_island) << iid_lsb) & _u64;
+			return 0;
+		}
+
+		idx_lsb = (addr40) ? 37 : 29;
+		if (dest_island == isld0) {
+			*addr &= ~_nic_mask64(idx_lsb, idx_lsb, 0);
+			return 0;
+		}
+
+		if (dest_island == isld1) {
+			*addr |= (UINT64_C(1) << idx_lsb);
+			return 0;
+		}
+
+		return NFP_ERRNO(ENODEV);
+	case 2:
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			_u64 = _nic_mask64((iid_lsb + 5), iid_lsb, 0);
+			*addr &= ~_u64;
+			*addr |= (((uint64_t)dest_island) << iid_lsb) & _u64;
+			return 0;
+		}
+
+		/*
+		 * Make sure we compare against isldN values by clearing the
+		 * LSB. This is what the silicon does.
+		 */
+		isld[0] &= ~1;
+		isld[1] &= ~1;
+
+		idx_lsb = (addr40) ? 37 : 29;
+		iid_lsb = idx_lsb - 1;
+
+		/*
+		 * Try each option, take first one that fits. Not sure if we
+		 * would want to do some smarter searching and prefer 0 or
+		 * non-0 island IDs.
+		 */
+
+		for (i = 0; i < 2; i++) {
+			for (v = 0; v < 2; v++) {
+				if (dest_island != (isld[i] | v))
+					continue;
+				*addr &= ~_nic_mask64(idx_lsb, iid_lsb, 0);
+				*addr |= (((uint64_t)i) << idx_lsb);
+				*addr |= (((uint64_t)v) << iid_lsb);
+				return 0;
+			}
+		}
+		return NFP_ERRNO(ENODEV);
+	case 3:
+		/*
+		 * Only the EMU will use 40 bit addressing. Silently set the
+		 * direct locality bit for everyone else. The SDK toolchain
+		 * uses dest_island <= 0 to test for atypical address encodings
+		 * to support access to local-island CTM with a 32-bit address
+		 * (high-locality is effectively ignored and just used for
+		 * routing to island #0).
+		 */
+		if (dest_island > 0 &&
+		    (dest_island < 24 || dest_island > 26)) {
+			*addr |= ((uint64_t)_NIC_NFP6000_MU_LOCALITY_DIRECT)
+				 << locality_lsb;
+			da = 1;
+		}
+
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			_u64 = _nic_mask64((iid_lsb + 5), iid_lsb, 0);
+			*addr &= ~_u64;
+			*addr |= (((uint64_t)dest_island) << iid_lsb) & _u64;
+			return 0;
+		}
+
+		isld[0] &= ~3;
+		isld[1] &= ~3;
+
+		idx_lsb = (addr40) ? 37 : 29;
+		iid_lsb = idx_lsb - 2;
+
+		for (i = 0; i < 2; i++) {
+			for (v = 0; v < 4; v++) {
+				if (dest_island != (isld[i] | v))
+					continue;
+				*addr &= ~_nic_mask64(idx_lsb, iid_lsb, 0);
+				*addr |= (((uint64_t)i) << idx_lsb);
+				*addr |= (((uint64_t)v) << iid_lsb);
+				return 0;
+			}
+		}
+
+		return NFP_ERRNO(ENODEV);
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_decode_mu(uint64_t addr, int *dest_island, int mode, int addr40,
+		   int isld1, int isld0)
+{
+	int iid_lsb, idx_lsb, locality_lsb;
+	int da;
+
+	locality_lsb = _nfp6000_cppat_mu_locality_lsb(mode, addr40);
+
+	if (((addr >> locality_lsb) & 3) == _NIC_NFP6000_MU_LOCALITY_DIRECT)
+		da = 1;
+	else
+		da = 0;
+
+	switch (mode) {
+	case 0:
+		iid_lsb = (addr40) ? 32 : 24;
+		*dest_island = (int)(addr >> iid_lsb) & 0x3F;
+		return 0;
+	case 1:
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			*dest_island = (int)(addr >> iid_lsb) & 0x3F;
+			return 0;
+		}
+
+		idx_lsb = (addr40) ? 37 : 29;
+
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1;
+		else
+			*dest_island = isld0;
+
+		return 0;
+	case 2:
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			*dest_island = (int)(addr >> iid_lsb) & 0x3F;
+			return 0;
+		}
+		/*
+		 * Make sure we compare against isldN values by clearing the
+		 * LSB. This is what the silicon does.
+		 */
+		isld0 &= ~1;
+		isld1 &= ~1;
+
+		idx_lsb = (addr40) ? 37 : 29;
+		iid_lsb = idx_lsb - 1;
+
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1 | (int)((addr >> iid_lsb) & 1);
+		else
+			*dest_island = isld0 | (int)((addr >> iid_lsb) & 1);
+
+		return 0;
+	case 3:
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			*dest_island = (int)(addr >> iid_lsb) & 0x3F;
+			return 0;
+		}
+
+		isld0 &= ~3;
+		isld1 &= ~3;
+
+		idx_lsb = (addr40) ? 37 : 29;
+		iid_lsb = idx_lsb - 2;
+
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1 | (int)((addr >> iid_lsb) & 3);
+		else
+			*dest_island = isld0 | (int)((addr >> iid_lsb) & 3);
+
+		return 0;
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_cppat_addr_encode(uint64_t *addr, int dest_island, int cpp_tgt,
+			   int mode, int addr40, int isld1, int isld0)
+{
+	switch (cpp_tgt) {
+	case NFP6000_CPPTGT_NBI:
+	case NFP6000_CPPTGT_VQDR:
+	case NFP6000_CPPTGT_ILA:
+	case NFP6000_CPPTGT_PCIE:
+	case NFP6000_CPPTGT_ARM:
+	case NFP6000_CPPTGT_CRYPTO:
+	case NFP6000_CPPTGT_CLS:
+		return _nfp6000_encode_basic(addr, dest_island, cpp_tgt, mode,
+					     addr40, isld1, isld0);
+
+	case NFP6000_CPPTGT_MU:
+		return _nfp6000_encode_mu(addr, dest_island, mode, addr40,
+					  isld1, isld0);
+
+	case NFP6000_CPPTGT_CTXPB:
+		if (mode != 1 || addr40 != 0)
+			return NFP_ERRNO(EINVAL);
+
+		*addr &= ~_nic_mask64(29, 24, 0);
+		*addr |= (((uint64_t)dest_island) << 24) &
+			  _nic_mask64(29, 24, 0);
+		return 0;
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_cppat_addr_decode(uint64_t addr, int *dest_island, int cpp_tgt,
+			   int mode, int addr40, int isld1, int isld0)
+{
+	switch (cpp_tgt) {
+	case NFP6000_CPPTGT_NBI:
+	case NFP6000_CPPTGT_VQDR:
+	case NFP6000_CPPTGT_ILA:
+	case NFP6000_CPPTGT_PCIE:
+	case NFP6000_CPPTGT_ARM:
+	case NFP6000_CPPTGT_CRYPTO:
+	case NFP6000_CPPTGT_CLS:
+		return _nfp6000_decode_basic(addr, dest_island, cpp_tgt, mode,
+					     addr40, isld1, isld0);
+
+	case NFP6000_CPPTGT_MU:
+		return _nfp6000_decode_mu(addr, dest_island, mode, addr40,
+					  isld1, isld0);
+
+	case NFP6000_CPPTGT_CTXPB:
+		if (mode != 1 || addr40 != 0)
+			return -EINVAL;
+		*dest_island = (int)(addr >> 24) & 0x3F;
+		return 0;
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
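+
+/*
+ * Worked example (values for illustration only): for NFP6000_CPPTGT_CTXPB
+ * with mode 1 and 32-bit addressing, encoding island 32 into address 0 sets
+ * bits <29:24> and yields 0x20000000; decoding 0x20000000 recovers island 32.
+ */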
+
+static inline int
+_nfp6000_cppat_addr_iid_clear(uint64_t *addr, int cpp_tgt, int mode, int addr40)
+{
+	int iid_lsb, locality_lsb, da;
+
+	switch (cpp_tgt) {
+	case NFP6000_CPPTGT_NBI:
+	case NFP6000_CPPTGT_VQDR:
+	case NFP6000_CPPTGT_ILA:
+	case NFP6000_CPPTGT_PCIE:
+	case NFP6000_CPPTGT_ARM:
+	case NFP6000_CPPTGT_CRYPTO:
+	case NFP6000_CPPTGT_CLS:
+		switch (mode) {
+		case 0:
+			iid_lsb = (addr40) ? 34 : 26;
+			*addr &= ~(UINT64_C(0x3F) << iid_lsb);
+			return 0;
+		case 1:
+			iid_lsb = (addr40) ? 39 : 31;
+			*addr &= ~_nic_mask64(iid_lsb, iid_lsb, 0);
+			return 0;
+		case 2:
+			iid_lsb = (addr40) ? 38 : 30;
+			*addr &= ~_nic_mask64(iid_lsb + 1, iid_lsb, 0);
+			return 0;
+		case 3:
+			iid_lsb = (addr40) ? 37 : 29;
+			*addr &= ~_nic_mask64(iid_lsb + 2, iid_lsb, 0);
+			return 0;
+		default:
+			break;
+		}
+	case NFP6000_CPPTGT_MU:
+		locality_lsb = _nfp6000_cppat_mu_locality_lsb(mode, addr40);
+		da = (((*addr >> locality_lsb) & 3) ==
+		      _NIC_NFP6000_MU_LOCALITY_DIRECT);
+		switch (mode) {
+		case 0:
+			iid_lsb = (addr40) ? 32 : 24;
+			*addr &= ~(UINT64_C(0x3F) << iid_lsb);
+			return 0;
+		case 1:
+			if (da) {
+				iid_lsb = (addr40) ? 32 : 24;
+				*addr &= ~(UINT64_C(0x3F) << iid_lsb);
+				return 0;
+			}
+			iid_lsb = (addr40) ? 37 : 29;
+			*addr &= ~_nic_mask64(iid_lsb, iid_lsb, 0);
+			return 0;
+		case 2:
+			if (da) {
+				iid_lsb = (addr40) ? 32 : 24;
+				*addr &= ~(UINT64_C(0x3F) << iid_lsb);
+				return 0;
+			}
+
+			iid_lsb = (addr40) ? 36 : 28;
+			*addr &= ~_nic_mask64(iid_lsb + 1, iid_lsb, 0);
+			return 0;
+		case 3:
+			if (da) {
+				iid_lsb = (addr40) ? 32 : 24;
+				*addr &= ~(UINT64_C(0x3F) << iid_lsb);
+				return 0;
+			}
+
+			iid_lsb = (addr40) ? 35 : 27;
+			*addr &= ~_nic_mask64(iid_lsb + 2, iid_lsb, 0);
+			return 0;
+		default:
+			break;
+		}
+	case NFP6000_CPPTGT_CTXPB:
+		if (mode != 1 || addr40 != 0)
+			return 0;
+		*addr &= ~(UINT64_C(0x3F) << 24);
+		return 0;
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+#endif /* __NFP_CPPAT_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp-common/nfp_platform.h b/drivers/net/nfp/nfpcore/nfp-common/nfp_platform.h
new file mode 100644
index 0000000..b8541c5
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp-common/nfp_platform.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_PLATFORM_H__
+#define __NFP_PLATFORM_H__
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <inttypes.h>
+#include <sys/cdefs.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <errno.h>
+
+#ifndef BIT_ULL
+#define BIT(x) (1 << (x))
+#define BIT_ULL(x) (1ULL << (x))
+#endif
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+#define NFP_ERRNO(err) (errno = (err), -1)
+#define NFP_ERRNO_RET(err, ret) (errno = (err), (ret))
+#define NFP_NOERR(errv) (errno)
+#define NFP_ERRPTR(err) (errno = (err), NULL)
+#define NFP_PTRERR(errv) (errno)
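+
+/*
+ * Usage note (illustrative): NFP_ERRNO() both records the error code and
+ * evaluates to -1, so helpers can simply write
+ *	return NFP_ERRNO(EINVAL);
+ * to report failure, while NFP_ERRPTR() does the same for pointer-returning
+ * functions by evaluating to NULL.
+ */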
+
+#endif /* __NFP_PLATFORM_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp-common/nfp_resid.h b/drivers/net/nfp/nfpcore/nfp-common/nfp_resid.h
new file mode 100644
index 0000000..0e03948
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp-common/nfp_resid.h
@@ -0,0 +1,592 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RESID_H__
+#define __NFP_RESID_H__
+
+#if (!defined(_NFP_RESID_NO_C_FUNC) && \
+	(defined(__NFP_TOOL_NFCC) || defined(__NFP_TOOL_NFAS)))
+#define _NFP_RESID_NO_C_FUNC
+#endif
+
+#ifndef _NFP_RESID_NO_C_FUNC
+#include "nfp_platform.h"
+#endif
+
+/*
+ * NFP Chip Architectures
+ *
+ * These are semi-arbitrary values to indicate an NFP architecture.
+ * They serve as a software view of a group of chip families, not necessarily a
+ * direct mapping to actual hardware design.
+ */
+#define NFP_CHIP_ARCH_YD	1
+#define NFP_CHIP_ARCH_TH	2
+
+/*
+ * NFP Chip Families.
+ *
+ * These are not enums, because they need to be microcode compatible.
+ * They are also not maskable.
+ *
+ * Note: The NFP-4xxx family is handled as NFP-6xxx in most software
+ * components.
+ *
+ */
+#define NFP_CHIP_FAMILY_NFP6000 0x6000	/* ARCH_TH */
+
+/* NFP Microengine/Flow Processing Core Versions */
+#define NFP_CHIP_ME_VERSION_2_7 0x0207
+#define NFP_CHIP_ME_VERSION_2_8 0x0208
+#define NFP_CHIP_ME_VERSION_2_9 0x0209
+
+/* NFP Chip Base Revisions. Minor stepping can just be added to these */
+#define NFP_CHIP_REVISION_A0 0x00
+#define NFP_CHIP_REVISION_B0 0x10
+#define NFP_CHIP_REVISION_C0 0x20
+#define NFP_CHIP_REVISION_PF 0xff /* Maximum possible revision */
+
+/* CPP Targets for each chip architecture */
+#define NFP6000_CPPTGT_NBI 1
+#define NFP6000_CPPTGT_VQDR 2
+#define NFP6000_CPPTGT_ILA 6
+#define NFP6000_CPPTGT_MU 7
+#define NFP6000_CPPTGT_PCIE 9
+#define NFP6000_CPPTGT_ARM 10
+#define NFP6000_CPPTGT_CRYPTO 12
+#define NFP6000_CPPTGT_CTXPB 14
+#define NFP6000_CPPTGT_CLS 15
+
+/*
+ * Wildcard indicating a CPP read or write action
+ *
+ * The action used will be either read or write depending on whether a read or
+ * write instruction/call is performed on the NFP_CPP_ID. It is recommended
+ * that the RW action be used even if all actions to be performed on an
+ * NFP_CPP_ID are known to be only reads or writes. Doing so will in many
+ * cases save NFP CPP internal software resources.
+ */
+#define NFP_CPP_ACTION_RW 32
+
+#define NFP_CPP_TARGET_ID_MASK 0x1f
+
+/*
+ *  NFP_CPP_ID - pack target, token, and action into a CPP ID.
+ *
+ * Create a 32-bit CPP identifier representing the access to be made.
+ * These identifiers are used as parameters to other NFP CPP functions. Some
+ * CPP devices may allow wildcard identifiers to be specified.
+ *
+ * @param[in]	target	NFP CPP target id
+ * @param[in]	action	NFP CPP action id
+ * @param[in]	token	NFP CPP token id
+ * @return		NFP CPP ID
+ */
+#define NFP_CPP_ID(target, action, token)                   \
+	((((target) & 0x7f) << 24) | (((token) & 0xff) << 16) | \
+	 (((action) & 0xff) << 8))
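+
+/*
+ * Worked example (values for illustration only): with the definitions above,
+ * NFP_CPP_ID(NFP6000_CPPTGT_MU, NFP_CPP_ACTION_RW, 0) packs target 7,
+ * action 32 and token 0 into 0x07002000.
+ */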
+
+#define NFP_CPP_ISLAND_ID(target, action, token, island)    \
+	((((target) & 0x7f) << 24) | (((token) & 0xff) << 16) | \
+	 (((action) & 0xff) << 8) | (((island) & 0xff) << 0))
+
+#ifndef _NFP_RESID_NO_C_FUNC
+
+/**
+ * Return the NFP CPP target of a NFP CPP ID
+ * @param[in]	id	NFP CPP ID
+ * @return	NFP CPP target
+ */
+static inline uint8_t
+NFP_CPP_ID_TARGET_of(uint32_t id)
+{
+	return (id >> 24) & NFP_CPP_TARGET_ID_MASK;
+}
+
+/*
+ * Return the NFP CPP token of a NFP CPP ID
+ * @param[in]	id	NFP CPP ID
+ * @return	NFP CPP token
+ */
+static inline uint8_t
+NFP_CPP_ID_TOKEN_of(uint32_t id)
+{
+	return (id >> 16) & 0xff;
+}
+
+/*
+ * Return the NFP CPP action of a NFP CPP ID
+ * @param[in]	id	NFP CPP ID
+ * @return	NFP CPP action
+ */
+static inline uint8_t
+NFP_CPP_ID_ACTION_of(uint32_t id)
+{
+	return (id >> 8) & 0xff;
+}
+
+/*
+ * Return the NFP CPP action of a NFP CPP ID
+ * @param[in]   id      NFP CPP ID
+ * @return      NFP CPP action
+ */
+static inline uint8_t
+NFP_CPP_ID_ISLAND_of(uint32_t id)
+{
+	return (id) & 0xff;
+}
+
+#endif /* _NFP_RESID_NO_C_FUNC */
+
+/*
+ *  Check if @p chip_family is an ARCH_TH chip.
+ * @param chip_family One of NFP_CHIP_FAMILY_*
+ */
+#define NFP_FAMILY_IS_ARCH_TH(chip_family) \
+	((int)(chip_family) == (int)NFP_CHIP_FAMILY_NFP6000)
+
+/*
+ *  Get the NFP_CHIP_ARCH_* of @p chip_family.
+ * @param chip_family One of NFP_CHIP_FAMILY_*
+ */
+#define NFP_FAMILY_ARCH(x) \
+	(__extension__ ({ \
+		typeof(x) _x = (x); \
+		(NFP_FAMILY_IS_ARCH_TH(_x) ? NFP_CHIP_ARCH_TH : \
+		NFP_FAMILY_IS_ARCH_YD(_x) ? NFP_CHIP_ARCH_YD : -1) \
+	}))
+
+/*
+ *  Check if @p chip_family is an NFP-6xxx chip.
+ * @param chip_family One of NFP_CHIP_FAMILY_*
+ */
+#define NFP_FAMILY_IS_NFP6000(chip_family) \
+	((int)(chip_family) == (int)NFP_CHIP_FAMILY_NFP6000)
+
+/*
+ *  Make microengine ID for NFP-6xxx.
+ * @param island_id   Island ID.
+ * @param menum       ME number, 0 based, within island.
+ *
+ * NOTE: menum should really be unsigned - the MSC compiler throws an error
+ * (not a warning) if a clause is always true, i.e. menum >= 0 when menum is
+ * of unsigned type, hence the cast of menum to an int in that particular
+ * clause.
+ */
+#define NFP6000_MEID(a, b)                       \
+	(__extension__ ({ \
+		typeof(a) _a = (a); \
+		typeof(b) _b = (b); \
+		(((((int)(_a) & 0x3F) == (int)(_a)) &&   \
+		(((int)(_b) >= 0) && ((int)(_b) < 12))) ?    \
+		(int)(((_a) << 4) | ((_b) + 4)) : -1) \
+	}))
+
+/*
+ *  Do a general sanity check on the ME ID.
+ * The check is on the highest possible island ID for the chip family and the
+ * microengine number must be a master ID.
+ * @param meid      ME ID as created by NFP6000_MEID
+ */
+#define NFP6000_MEID_IS_VALID(meid) \
+	(__extension__ ({ \
+		typeof(meid) _a = (meid); \
+		((((_a) >> 4) < 64) && (((_a) >> 4) >= 0) && \
+		 (((_a) & 0xF) >= 4)) \
+	}))
+
+/*
+ *  Extract island ID from ME ID.
+ * @param meid   ME ID as created by NFP6000_MEID
+ */
+#define NFP6000_MEID_ISLAND_of(meid) (((meid) >> 4) & 0x3F)
+
+/*
+ * Extract microengine number (0 based) from ME ID.
+ * @param meid   ME ID as created by NFP6000_MEID
+ */
+#define NFP6000_MEID_MENUM_of(meid) (((meid) & 0xF) - 4)
+
+/*
+ * Extract microengine group number (0 based) from ME ID.
+ * The group is two code-sharing microengines, so group 0 refers to MEs 0,1,
+ * group 1 refers to MEs 2,3 etc.
+ * @param meid   ME ID as created by NFP6000_MEID
+ */
+#define NFP6000_MEID_MEGRP_of(meid) (NFP6000_MEID_MENUM_of(meid) >> 1)
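+
+/*
+ * Worked example (values for illustration only): NFP6000_MEID(32, 0) packs
+ * island 32 and ME number 0 into 0x204; NFP6000_MEID_ISLAND_of(0x204) gives
+ * back 32, NFP6000_MEID_MENUM_of(0x204) gives 0 and
+ * NFP6000_MEID_MEGRP_of(0x204) gives 0.
+ */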
+
+#ifndef _NFP_RESID_NO_C_FUNC
+
+/*
+ *  Convert a string to an ME ID.
+ *
+ * @param s       A string of format iX.meY
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the ME ID part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return     ME ID on success, -1 on error.
+ */
+int nfp6000_idstr2meid(const char *s, const char **endptr);
+
+/*
+ *  Extract island ID from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp6000_idstr2island("i32.me5", &c);
+ * // val == 32, c == "me5"
+ * val = nfp6000_idstr2island("i32", &c);
+ * // val == 32, c == ""
+ *
+ * @param s       A string of format "iX.anything" or "iX"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the island part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the island ID, -1 on error.
+ */
+int nfp6000_idstr2island(const char *s, const char **endptr);
+
+/*
+ *  Extract microengine number from string.
+ *
+ * Example:
+ * char *c;
+ * int menum = nfp6000_idstr2menum("me5.anything", &c);
+ * // menum == 5, c == "anything"
+ * menum = nfp6000_idstr2menum("me5", &c);
+ * // menum == 5, c == ""
+ *
+ * @param s       A string of format "meX.anything" or "meX"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the ME number part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the ME number, -1 on error.
+ */
+int nfp6000_idstr2menum(const char *s, const char **endptr);
+
+/*
+ * Extract context number from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp6000_idstr2ctxnum("ctx5.anything", &c);
+ * // val == 5, c == "anything"
+ * val = nfp6000_idstr2ctxnum("ctx5", &c);
+ * // val == 5, c == ""
+ *
+ * @param s       A string of format "ctxN.anything" or "ctxN"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the context number part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the context number, -1 on error.
+ */
+int nfp6000_idstr2ctxnum(const char *s, const char **endptr);
+
+/*
+ * Extract microengine group number from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp6000_idstr2megrp("tg2.anything", &c);
+ * // val == 2, c == "anything"
+ * val = nfp6000_idstr2megrp("tg5", &c);
+ * // val == 5, c == ""
+ *
+ * @param s       A string of format "tgX.anything" or "tgX"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the ME group part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the ME group number, -1 on error.
+ */
+int nfp6000_idstr2megrp(const char *s, const char **endptr);
+
+/*
+ * Create ME ID string of format "iX[.meY]".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param meid   Microengine ID.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_meid2str(char *s, int meid);
+
+/*
+ * Create ME ID string of format "name[.meY]" or "iX[.meY]".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param meid   Microengine ID.
+ * @return       Pointer to "s" on success, NULL on error.
+ *
+ * Similar to nfp6000_meid2str() except use an alias instead of "iX"
+ * if one exists for the island.
+ */
+const char *nfp6000_meid2altstr(char *s, int meid);
+
+/*
+ * Create string of format "iX".
+ *
+ * @param s         Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                  The resulting string is output here.
+ * @param island_id Island ID.
+ * @return          Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_island2str(char *s, int island_id);
+
+/*
+ * Create string of format "name", an island alias.
+ *
+ * @param s         Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                  The resulting string is output here.
+ * @param island_id Island ID.
+ * @return          Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_island2altstr(char *s, int island_id);
+
+/*
+ * Create string of format "meY".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param menum  Microengine number within island.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_menum2str(char *s, int menum);
+
+/*
+ * Create string of format "ctxY".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param ctxnum Context number within microengine.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_ctxnum2str(char *s, int ctxnum);
+
+/*
+ * Create string of format "tgY".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param megrp  Microengine group number within cluster.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_megrp2str(char *s, int megrp);
+
+/*
+ * Convert a string to an ME ID.
+ *
+ * @param chip_family Chip family ID
+ * @param s           A string of format iX.meY (or clX.meY)
+ * @param endptr      If non-NULL, *endptr will point to the trailing
+ *                    string after the ME ID part of the string, which
+ *                    is either an empty string or the first character
+ *                    after the separating period.
+ * @return            ME ID on success, -1 on error.
+ */
+int nfp_idstr2meid(int chip_family, const char *s, const char **endptr);
+
+/*
+ * Extract island ID from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp_idstr2island(chip, "i32.me5", &c);
+ * // val == 32, c == "me5"
+ * val = nfp_idstr2island(chip, "i32", &c);
+ * // val == 32, c == ""
+ *
+ * @param chip_family Chip family ID
+ * @param s           A string of format "iX.anything" or "iX"
+ * @param endptr      If non-NULL, *endptr will point to the trailing
+ *                    string after the island part of the string, which
+ *                    is either an empty string or the first character
+ *                    after the separating period.
+ * @return            The island ID on success, -1 on error.
+ */
+int nfp_idstr2island(int chip_family, const char *s, const char **endptr);
+
+/*
+ * Extract microengine number from string.
+ *
+ * Example:
+ * char *c;
+ * int menum = nfp_idstr2menum("me5.anything", &c);
+ * // menum == 5, c == "anything"
+ * menum = nfp_idstr2menum("me5", &c);
+ * // menum == 5, c == ""
+ *
+ * @param chip_family Chip family ID
+ * @param s           A string of format "meX.anything" or "meX"
+ * @param endptr      If non-NULL, *endptr will point to the trailing
+ *                    string after the ME number part of the string, which
+ *                    is either an empty string or the first character
+ *                    after the separating period.
+ * @return            The ME number on success, -1 on error.
+ */
+int nfp_idstr2menum(int chip_family, const char *s, const char **endptr);
+
+/*
+ * Extract context number from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp_idstr2ctxnum("ctx5.anything", &c);
+ * // val == 5, c == "anything"
+ * val = nfp_idstr2ctxnum("ctx5", &c);
+ * // val == 5, c == ""
+ *
+ * @param s       A string of format "ctxN.anything" or "ctxN"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the context number part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the context number, -1 on error.
+ */
+int nfp_idstr2ctxnum(int chip_family, const char *s, const char **endptr);
+
+/*
+ * Extract microengine group number from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp_idstr2megrp("tg2.anything", &c);
+ * // val == 2, c == "anything"
+ * val = nfp_idstr2megrp("tg5", &c);
+ * // val == 5, c == ""
+ *
+ * @param s       A string of format "tgX.anything" or "tgX"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the ME group part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the ME group number, -1 on error.
+ */
+int nfp_idstr2megrp(int chip_family, const char *s, const char **endptr);
+
+/*
+ * Create ME ID string of format "iX[.meY]".
+ *
+ * @param chip_family Chip family ID
+ * @param s           Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                    The resulting string is output here.
+ * @param meid        Microengine ID.
+ * @return            Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_meid2str(int chip_family, char *s, int meid);
+
+/*
+ * Create ME ID string of format "name[.meY]" or "iX[.meY]".
+ *
+ * @param chip_family Chip family ID
+ * @param s           Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                    The resulting string is output here.
+ * @param meid        Microengine ID.
+ * @return            Pointer to "s" on success, NULL on error.
+ *
+ * Similar to nfp_meid2str() except use an alias instead of "iX"
+ * if one exists for the island.
+ */
+const char *nfp_meid2altstr(int chip_family, char *s, int meid);
+
+/*
+ * Create string of format "iX".
+ *
+ * @param chip_family Chip family ID
+ * @param s           Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                    The resulting string is output here.
+ * @param island_id   Island ID.
+ * @return            Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_island2str(int chip_family, char *s, int island_id);
+
+/*
+ * Create string of format "name", an island alias.
+ *
+ * @param chip_family Chip family ID
+ * @param s           Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                    The resulting string is output here.
+ * @param island_id   Island ID.
+ * @return            Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_island2altstr(int chip_family, char *s, int island_id);
+
+/*
+ * Create string of format "meY".
+ *
+ * @param chip_family Chip family ID
+ * @param s           Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                    The resulting string is output here.
+ * @param menum       Microengine number within island.
+ * @return            Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_menum2str(int chip_family, char *s, int menum);
+
+/*
+ * Create string of format "ctxY".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param ctxnum Context number within microengine.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_ctxnum2str(int chip_family, char *s, int ctxnum);
+
+/*
+ * Create string of format "tgY".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param megrp  Microengine group number within cluster.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_megrp2str(int chip_family, char *s, int megrp);
+
+/*
+ * Convert a two character string to revision number.
+ *
+ * Revision integer is 0x00 for A0, 0x11 for B1 etc.
+ *
+ * @param s     Two character string.
+ * @return      Revision number, -1 on error
+ */
+int nfp_idstr2rev(const char *s);
+
+/*
+ * Create string from revision number.
+ *
+ * String will be upper case.
+ *
+ * @param s     Pointer to char buffer with size of at least 3
+ *              for 2 characters and string terminator.
+ * @param rev   Revision number.
+ * @return      Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_rev2str(char *s, int rev);
+
+/*
+ * Get the NFP CPP address from a string
+ *
+ * String is in the format [island@]target[:[action:[token:]]address]
+ *
+ * @param chip_family Chip family ID
+ * @param tid           Pointer to string to parse
+ * @param cpp_idp       Pointer to CPP ID
+ * @param cpp_addrp     Pointer to CPP address
+ * @return              0 on success, or -1 and errno
+ */
+int nfp_str2cpp(int chip_family,
+		const char *tid,
+		uint32_t *cpp_idp,
+		uint64_t *cpp_addrp);
+
+#endif /* _NFP_RESID_NO_C_FUNC */
+
+#endif /* __NFP_RESID_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp6000/nfp6000.h b/drivers/net/nfp/nfpcore/nfp6000/nfp6000.h
new file mode 100644
index 0000000..47e1dda
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp6000/nfp6000.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFP6000_H__
+#define __NFP_NFP6000_H__
+
+/* CPP Target IDs */
+#define NFP_CPP_TARGET_INVALID          0
+#define NFP_CPP_TARGET_NBI              1
+#define NFP_CPP_TARGET_QDR              2
+#define NFP_CPP_TARGET_ILA              6
+#define NFP_CPP_TARGET_MU               7
+#define NFP_CPP_TARGET_PCIE             9
+#define NFP_CPP_TARGET_ARM              10
+#define NFP_CPP_TARGET_CRYPTO           12
+#define NFP_CPP_TARGET_ISLAND_XPB       14	/* Shared with CAP */
+#define NFP_CPP_TARGET_ISLAND_CAP       14	/* Shared with XPB */
+#define NFP_CPP_TARGET_CT_XPB           14
+#define NFP_CPP_TARGET_LOCAL_SCRATCH    15
+#define NFP_CPP_TARGET_CLS              NFP_CPP_TARGET_LOCAL_SCRATCH
+
+#define NFP_ISL_EMEM0                   24
+
+#define NFP_MU_ADDR_ACCESS_TYPE_MASK    3ULL
+#define NFP_MU_ADDR_ACCESS_TYPE_DIRECT  2ULL
+
+static inline int
+nfp_cppat_mu_locality_lsb(int mode, int addr40)
+{
+	switch (mode) {
+	case 0 ... 3:
+		return addr40 ? 38 : 30;
+	default:
+		return -EINVAL;
+	}
+}
+
+#endif /* __NFP_NFP6000_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp6000/nfp_xpb.h b/drivers/net/nfp/nfpcore/nfp6000/nfp_xpb.h
new file mode 100644
index 0000000..7ada1bb
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp6000/nfp_xpb.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_XPB_H__
+#define __NFP_XPB_H__
+
+/*
+ * For use with NFP6000 Databook "XPB Addressing" section
+ */
+#define NFP_XPB_OVERLAY(island)  (((island) & 0x3f) << 24)
+
+#define NFP_XPB_ISLAND(island)   (NFP_XPB_OVERLAY(island) + 0x60000)
+
+#define NFP_XPB_ISLAND_of(offset) (((offset) >> 24) & 0x3F)
+
+/*
+ * For use with NFP6000 Databook "XPB Island and Device IDs" chapter
+ */
+#define NFP_XPB_DEVICE(island, slave, device) \
+				(NFP_XPB_OVERLAY(island) | \
+				 (((slave) & 3) << 22) | \
+				 (((device) & 0x3f) << 16))
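+
+/*
+ * Worked example (values for illustration only): NFP_XPB_ISLAND(1) resolves
+ * to 0x01060000 and NFP_XPB_ISLAND_of(0x01060000) gives back island 1;
+ * NFP_XPB_DEVICE(1, 0, 4) resolves to 0x01040000.
+ */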
+
+#endif /* __NFP_XPB_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp_cpp.h b/drivers/net/nfp/nfpcore/nfp_cpp.h
new file mode 100644
index 0000000..7e86214
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_cpp.h
@@ -0,0 +1,776 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_CPP_H__
+#define __NFP_CPP_H__
+
+#include "nfp-common/nfp_platform.h"
+#include "nfp-common/nfp_resid.h"
+
+struct nfp_cpp_mutex;
+
+/*
+ * NFP CPP handle
+ */
+struct nfp_cpp {
+	uint32_t model;
+	uint32_t interface;
+	uint8_t *serial;
+	int serial_len;
+	void *priv;
+
+	/* Mutex cache */
+	struct nfp_cpp_mutex *mutex_cache;
+	const struct nfp_cpp_operations *op;
+
+	/*
+	 * NFP-6xxx originating island IMB CPP Address Translation. CPP Target
+	 * ID is index into array. Values are obtained at runtime from local
+	 * island XPB CSRs.
+	 */
+	uint32_t imb_cat_table[16];
+};
+
+/*
+ * NFP CPP device area handle
+ */
+struct nfp_cpp_area {
+	struct nfp_cpp *cpp;
+	char *name;
+	unsigned long long offset;
+	unsigned long size;
+	/* Here follows the 'priv' part of nfp_cpp_area. */
+};
+
+/*
+ * NFP CPP operations structure
+ */
+struct nfp_cpp_operations {
+	/* Size of priv area in struct nfp_cpp_area */
+	size_t area_priv_size;
+
+	/* Instantiate an NFP CPP */
+	int (*init)(struct nfp_cpp *cpp, const char *devname);
+
+	/*
+	 * Free the bus.
+	 * Called only once, during nfp_cpp_unregister()
+	 */
+	void (*free)(struct nfp_cpp *cpp);
+
+	/*
+	 * Initialize a new NFP CPP area
+	 * NOTE: This is _not_ serialized
+	 */
+	int (*area_init)(struct nfp_cpp_area *area,
+			 uint32_t dest,
+			 unsigned long long address,
+			 unsigned long size);
+	/*
+	 * Clean up a NFP CPP area before it is freed
+	 * NOTE: This is _not_ serialized
+	 */
+	void (*area_cleanup)(struct nfp_cpp_area *area);
+
+	/*
+	 * Acquire resources for a NFP CPP area
+	 * Serialized
+	 */
+	int (*area_acquire)(struct nfp_cpp_area *area);
+	/*
+	 * Release resources for a NFP CPP area
+	 * Serialized
+	 */
+	void (*area_release)(struct nfp_cpp_area *area);
+	/*
+	 * Return a void IO pointer to a NFP CPP area
+	 * NOTE: This is _not_ serialized
+	 */
+	void *(*area_iomem)(struct nfp_cpp_area *area);
+
+	void *(*area_mapped)(struct nfp_cpp_area *area);
+	/*
+	 * Perform a read from a NFP CPP area
+	 * Serialized
+	 */
+	int (*area_read)(struct nfp_cpp_area *area,
+			 void *kernel_vaddr,
+			 unsigned long offset,
+			 unsigned int length);
+	/*
+	 * Perform a write to a NFP CPP area
+	 * Serialized
+	 */
+	int (*area_write)(struct nfp_cpp_area *area,
+			  const void *kernel_vaddr,
+			  unsigned long offset,
+			  unsigned int length);
+};
+
+/*
+ * This should be the only external function the transport
+ * module supplies
+ */
+const struct nfp_cpp_operations *nfp_cpp_transport_operations(void);
+
+/*
+ * Set the model id
+ *
+ * @param   cpp     NFP CPP operations structure
+ * @param   model   Model ID
+ */
+void nfp_cpp_model_set(struct nfp_cpp *cpp, uint32_t model);
+
+/*
+ * Set the interface ID of a nfp_cpp struct
+ *
+ * @param   cpp     NFP CPP operations structure
+ * @param   interface Interface ID
+ */
+void nfp_cpp_interface_set(struct nfp_cpp *cpp, uint32_t interface);
+
+/*
+ * Set the serial number of a nfp_cpp struct
+ *
+ * @param   cpp     NFP CPP operations structure
+ * @param   serial  NFP serial byte array
+ * @param   serial_len  Length of the serial byte array
+ */
+int nfp_cpp_serial_set(struct nfp_cpp *cpp, const uint8_t *serial,
+		       size_t serial_len);
+
+/*
+ * Set the private data of the nfp_cpp instance
+ *
+ * @param   cpp NFP CPP operations structure
+ * @param   priv Opaque device pointer
+ */
+void nfp_cpp_priv_set(struct nfp_cpp *cpp, void *priv);
+
+/*
+ * Return the private data of the nfp_cpp instance
+ *
+ * @param   cpp NFP CPP operations structure
+ * @return      Opaque device pointer
+ */
+void *nfp_cpp_priv(struct nfp_cpp *cpp);
+
+/*
+ * Get the privately allocated portion of a NFP CPP area handle
+ *
+ * @param   cpp_area    NFP CPP area handle
+ * @return          Pointer to the private area, or NULL on failure
+ */
+void *nfp_cpp_area_priv(struct nfp_cpp_area *cpp_area);
+
+uint32_t __nfp_cpp_model_autodetect(struct nfp_cpp *cpp);
+
+/*
+ * NFP CPP core interface for CPP clients.
+ */
+
+/*
+ * Open a NFP CPP handle to a CPP device
+ *
+ * @param[in]	devname	Device name of the CPP device to open
+ *
+ * @return NFP CPP handle, or NULL on failure (and set errno accordingly).
+ */
+struct nfp_cpp *nfp_cpp_from_device_name(const char *devname);
+
+/*
+ * Free a NFP CPP handle
+ *
+ * @param[in]	cpp	NFP CPP handle
+ */
+void nfp_cpp_free(struct nfp_cpp *cpp);
+
+#define NFP_CPP_MODEL_INVALID   0xffffffff
+
+/*
+ * NFP_CPP_MODEL_CHIP_of - retrieve the chip ID from the model ID
+ *
+ * The chip ID is a 16-bit BCD+A-F encoding for the chip type.
+ *
+ * @param[in]   model   NFP CPP model id
+ * @return      NFP CPP chip id
+ */
+#define NFP_CPP_MODEL_CHIP_of(model)        (((model) >> 16) & 0xffff)
+
+/*
+ * NFP_CPP_MODEL_IS_6000 - Check for the NFP6000 family of devices
+ *
+ * NOTE: The NFP4000 series is considered an NFP6000 series variant.
+ *
+ * @param[in]	model	NFP CPP model id
+ * @return		true if model is in the NFP6000 family, false otherwise.
+ */
+#define NFP_CPP_MODEL_IS_6000(model)		     \
+		((NFP_CPP_MODEL_CHIP_of(model) >= 0x4000) && \
+		(NFP_CPP_MODEL_CHIP_of(model) < 0x7000))
+
+/*
+ * nfp_cpp_model - Retrieve the Model ID of the NFP
+ *
+ * @param[in]	cpp	NFP CPP handle
+ * @return		NFP CPP Model ID
+ */
+uint32_t nfp_cpp_model(struct nfp_cpp *cpp);
+
+/*
+ * NFP Interface types - logical interface for this CPP connection 4 bits are
+ * reserved for interface type.
+ */
+#define NFP_CPP_INTERFACE_TYPE_INVALID		0x0
+#define NFP_CPP_INTERFACE_TYPE_PCI		0x1
+#define NFP_CPP_INTERFACE_TYPE_ARM		0x2
+#define NFP_CPP_INTERFACE_TYPE_RPC		0x3
+#define NFP_CPP_INTERFACE_TYPE_ILA		0x4
+
+/*
+ * Construct a 16-bit NFP Interface ID
+ *
+ * Interface IDs consist of 4 bits of interface type, 4 bits of unit
+ * identifier, and 8 bits of channel identifier.
+ *
+ * The NFP Interface ID is used in the implementation of NFP CPP API mutexes,
+ * which use the MU Atomic CompareAndWrite operation - hence the limit to 16
+ * bits to be able to use the NFP Interface ID as a lock owner.
+ *
+ * @param[in]	type	NFP Interface Type
+ * @param[in]	unit	Unit identifier for the interface type
+ * @param[in]	channel	Channel identifier for the interface unit
+ * @return		Interface ID
+ */
+#define NFP_CPP_INTERFACE(type, unit, channel)	\
+	((((type) & 0xf) << 12) | \
+	 (((unit) & 0xf) <<  8) | \
+	 (((channel) & 0xff) << 0))
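+
+/*
+ * Worked example (values for illustration only):
+ * NFP_CPP_INTERFACE(NFP_CPP_INTERFACE_TYPE_PCI, 0, 0) yields 0x1000, from
+ * which NFP_CPP_INTERFACE_TYPE_of() recovers 0x1 while the unit and channel
+ * accessors recover 0.
+ */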
+
+/*
+ * Get the interface type of a NFP Interface ID
+ * @param[in]	interface	NFP Interface ID
+ * @return			NFP Interface ID's type
+ */
+#define NFP_CPP_INTERFACE_TYPE_of(interface)	(((interface) >> 12) & 0xf)
+
+/*
+ * Get the interface unit of a NFP Interface ID
+ * @param[in]	interface	NFP Interface ID
+ * @return			NFP Interface ID's unit
+ */
+#define NFP_CPP_INTERFACE_UNIT_of(interface)	(((interface) >>  8) & 0xf)
+
+/*
+ * Get the interface channel of a NFP Interface ID
+ * @param[in]	interface	NFP Interface ID
+ * @return			NFP Interface ID's channel
+ */
+#define NFP_CPP_INTERFACE_CHANNEL_of(interface)	(((interface) >>  0) & 0xff)
+
+/*
+ * Retrieve the Interface ID of the NFP
+ * @param[in]	cpp	NFP CPP handle
+ * @return		NFP CPP Interface ID
+ */
+uint16_t nfp_cpp_interface(struct nfp_cpp *cpp);
+
+/*
+ * Retrieve the NFP Serial Number (unique per NFP)
+ * @param[in]	cpp	NFP CPP handle
+ * @param[out]	serial	Pointer to reference the serial number array
+ *
+ * @return	size of the NFP6000 serial number, in bytes
+ */
+int nfp_cpp_serial(struct nfp_cpp *cpp, const uint8_t **serial);
+
+/*
+ * Allocate a NFP CPP area handle, as an offset into a CPP ID
+ * @param[in]	cpp	NFP CPP handle
+ * @param[in]	cpp_id	NFP CPP ID
+ * @param[in]	address	Offset into the NFP CPP ID address space
+ * @param[in]	size	Size of the area to reserve
+ *
+ * @return NFP CPP handle, or NULL on failure (and set errno accordingly).
+ */
+struct nfp_cpp_area *nfp_cpp_area_alloc(struct nfp_cpp *cpp, uint32_t cpp_id,
+					unsigned long long address,
+					unsigned long size);
+
+/*
+ * Allocate a NFP CPP area handle, as an offset into a CPP ID, by a named owner
+ * @param[in]	cpp	NFP CPP handle
+ * @param[in]	cpp_id	NFP CPP ID
+ * @param[in]	name	Name of owner of the area
+ * @param[in]	address	Offset into the NFP CPP ID address space
+ * @param[in]	size	Size of the area to reserve
+ *
+ * @return NFP CPP handle, or NULL on failure (and set errno accordingly).
+ */
+struct nfp_cpp_area *nfp_cpp_area_alloc_with_name(struct nfp_cpp *cpp,
+						  uint32_t cpp_id,
+						  const char *name,
+						  unsigned long long address,
+						  unsigned long size);
+
+/*
+ * Free an allocated NFP CPP area handle
+ * @param[in]	area	NFP CPP area handle
+ */
+void nfp_cpp_area_free(struct nfp_cpp_area *area);
+
+/*
+ * Acquire the resources needed to access the NFP CPP area handle
+ *
+ * @param[in]	area	NFP CPP area handle
+ *
+ * @return 0 on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_area_acquire(struct nfp_cpp_area *area);
+
+/*
+ * Release the resources needed to access the NFP CPP area handle
+ *
+ * @param[in]	area	NFP CPP area handle
+ */
+void nfp_cpp_area_release(struct nfp_cpp_area *area);
+
+/*
+ * Allocate, then acquire the resources needed to access the NFP CPP area handle
+ * @param[in]	cpp	NFP CPP handle
+ * @param[in]	cpp_id	NFP CPP ID
+ * @param[in]	address	Offset into the NFP CPP ID address space
+ * @param[in]	size	Size of the area to reserve
+ *
+ * @return NFP CPP handle, or NULL on failure (and set errno accordingly).
+ */
+struct nfp_cpp_area *nfp_cpp_area_alloc_acquire(struct nfp_cpp *cpp,
+						uint32_t cpp_id,
+						unsigned long long address,
+						unsigned long size);
+
+/*
+ * Release the resources, then free the NFP CPP area handle
+ * @param[in]	area	NFP CPP area handle
+ */
+void nfp_cpp_area_release_free(struct nfp_cpp_area *area);
+
+uint8_t *nfp_cpp_map_area(struct nfp_cpp *cpp, int domain, int target,
+			   uint64_t addr, unsigned long size,
+			   struct nfp_cpp_area **area);
+/*
+ * Return an IO pointer to the beginning of the NFP CPP area handle. The area
+ * must be acquired with 'nfp_cpp_area_acquire()' before calling this operation.
+ *
+ * @param[in]	area	NFP CPP area handle
+ *
+ * @return Pointer to IO memory, or NULL on failure (and set errno accordingly).
+ */
+void *nfp_cpp_area_mapped(struct nfp_cpp_area *area);
+
+/*
+ * Read from a NFP CPP area handle into a buffer. The area must be acquired with
+ * 'nfp_cpp_area_acquire()' before calling this operation.
+ *
+ * @param[in]	area	NFP CPP area handle
+ * @param[in]	offset	Offset into the area
+ * @param[in]	buffer	Location of buffer to receive the data
+ * @param[in]	length	Length of the data to read
+ *
+ * @return bytes read on success, -1 on failure (and set errno accordingly).
+ *
+ */
+int nfp_cpp_area_read(struct nfp_cpp_area *area, unsigned long offset,
+		      void *buffer, size_t length);
+
+/*
+ * Write to a NFP CPP area handle from a buffer. The area must be acquired with
+ * 'nfp_cpp_area_acquire()' before calling this operation.
+ *
+ * @param[in]	area	NFP CPP area handle
+ * @param[in]	offset	Offset into the area
+ * @param[in]	buffer	Location of buffer that holds the data
+ * @param[in]	length	Length of the data to write
+ *
+ * @return bytes written on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_area_write(struct nfp_cpp_area *area, unsigned long offset,
+		       const void *buffer, size_t length);
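+
+/*
+ * Typical usage sketch of the area API declared above (illustrative only,
+ * error handling omitted; cpp_id and addr are placeholders):
+ *
+ *	struct nfp_cpp_area *area;
+ *	uint32_t buf[4];
+ *
+ *	area = nfp_cpp_area_alloc_acquire(cpp, cpp_id, addr, sizeof(buf));
+ *	nfp_cpp_area_read(area, 0, buf, sizeof(buf));
+ *	nfp_cpp_area_release_free(area);
+ */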
+
+/*
+ * nfp_cpp_area_iomem() - get IOMEM region for CPP area
+ * @area:       CPP area handle
+ *
+ * Returns an iomem pointer for use with readl()/writel() style operations.
+ *
+ * NOTE: Area must have been locked down with an 'acquire'.
+ *
+ * Return: pointer to the area, or NULL
+ */
+void *nfp_cpp_area_iomem(struct nfp_cpp_area *area);
+
+/*
+ * Verify that IO can be performed on an offset in an area
+ *
+ * @param[in]	area	NFP CPP area handle
+ * @param[in]	offset	Offset into the area
+ * @param[in]	size	Size of region to validate
+ *
+ * @return 0 on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_area_check_range(struct nfp_cpp_area *area,
+			     unsigned long long offset, unsigned long size);
+
+/*
+ * Get the NFP CPP handle that is the parent of a NFP CPP area handle
+ *
+ * @param	cpp_area	NFP CPP area handle
+ * @return			NFP CPP handle
+ */
+struct nfp_cpp *nfp_cpp_area_cpp(struct nfp_cpp_area *cpp_area);
+
+/*
+ * Get the name passed during allocation of the NFP CPP area handle
+ *
+ * @param	cpp_area	NFP CPP area handle
+ * @return			Pointer to the area's name
+ */
+const char *nfp_cpp_area_name(struct nfp_cpp_area *cpp_area);
+
+/*
+ * Read a block of data from a NFP CPP ID
+ *
+ * @param[in]	cpp	NFP CPP handle
+ * @param[in]	cpp_id	NFP CPP ID
+ * @param[in]	address	Offset into the NFP CPP ID address space
+ * @param[in]	kernel_vaddr	Buffer to copy read data to
+ * @param[in]	length	Length of the data to read
+ *
+ * @return bytes read on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_read(struct nfp_cpp *cpp, uint32_t cpp_id,
+		 unsigned long long address, void *kernel_vaddr, size_t length);
+
+/*
+ * Write a block of data to a NFP CPP ID
+ *
+ * @param[in]	cpp	NFP CPP handle
+ * @param[in]	cpp_id	NFP CPP ID
+ * @param[in]	address	Offset into the NFP CPP ID address space
+ * @param[in]	kernel_vaddr	Buffer to copy write data from
+ * @param[in]	length	Length of the data to write
+ *
+ * @return bytes written on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_write(struct nfp_cpp *cpp, uint32_t cpp_id,
+		  unsigned long long address, const void *kernel_vaddr,
+		  size_t length);
+
+/*
+ * Fill a NFP CPP area handle and offset with a value
+ *
+ * @param[in]	area	NFP CPP area handle
+ * @param[in]	offset	Offset into the NFP CPP ID address space
+ * @param[in]	value	32-bit value to fill area with
+ * @param[in]	length	Size of the area to reserve
+ *
+ * @return bytes written on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_area_fill(struct nfp_cpp_area *area, unsigned long offset,
+		      uint32_t value, size_t length);
+
+/*
+ * Read a single 32-bit value from a NFP CPP area handle
+ *
+ * @param area		NFP CPP area handle
+ * @param offset	offset into NFP CPP area handle
+ * @param value		output value
+ *
+ * The area must be acquired with 'nfp_cpp_area_acquire()' before calling this
+ * operation.
+ *
+ * NOTE: offset must be 32-bit aligned.
+ *
+ * @return 0 on success, or -1 on error (and set errno accordingly).
+ */
+int nfp_cpp_area_readl(struct nfp_cpp_area *area, unsigned long offset,
+		       uint32_t *value);
+
+/*
+ * Write a single 32-bit value to a NFP CPP area handle
+ *
+ * @param area		NFP CPP area handle
+ * @param offset	offset into NFP CPP area handle
+ * @param value		value to write
+ *
+ * The area must be acquired with 'nfp_cpp_area_acquire()' before calling this
+ * operation.
+ *
+ * NOTE: offset must be 32-bit aligned.
+ *
+ * @return 0 on success, or -1 on error (and set errno accordingly).
+ */
+int nfp_cpp_area_writel(struct nfp_cpp_area *area, unsigned long offset,
+			uint32_t value);
+
+/*
+ * Read a single 64-bit value from a NFP CPP area handle
+ *
+ * @param area		NFP CPP area handle
+ * @param offset	offset into NFP CPP area handle
+ * @param value		output value
+ *
+ * The area must be acquired with 'nfp_cpp_area_acquire()' before calling this
+ * operation.
+ *
+ * NOTE: offset must be 64-bit aligned.
+ *
+ * @return 0 on success, or -1 on error (and set errno accordingly).
+ */
+int nfp_cpp_area_readq(struct nfp_cpp_area *area, unsigned long offset,
+		       uint64_t *value);
+
+/*
+ * Write a single 64-bit value to a NFP CPP area handle
+ *
+ * @param area		NFP CPP area handle
+ * @param offset	offset into NFP CPP area handle
+ * @param value		value to write
+ *
+ * The area must be acquired with 'nfp_cpp_area_acquire()' before calling this
+ * operation.
+ *
+ * NOTE: offset must be 64-bit aligned.
+ *
+ * @return 0 on success, or -1 on error (and set errno accordingly).
+ */
+int nfp_cpp_area_writeq(struct nfp_cpp_area *area, unsigned long offset,
+			uint64_t value);
+
+/*
+ * Write a single 32-bit value on the XPB bus
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt	XPB target and address
+ * @param value         value to write
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_xpb_writel(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t value);
+
+/*
+ * Read a single 32-bit value from the XPB bus
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt	XPB target and address
+ * @param value         output value
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_xpb_readl(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t *value);
+
+/*
+ * Modify bits of a 32-bit value from the XPB bus
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt       XPB target and address
+ * @param mask          mask of bits to alter
+ * @param value         value to modify
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_xpb_writelm(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t mask,
+		    uint32_t value);
+
+/*
+ * Wait for masked bits of a 32-bit value on the XPB bus to match a value
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt       XPB target and address
+ * @param mask          mask of bits to alter
+ * @param value         value to monitor for
+ * @param timeout_us    maximum number of us to wait (-1 for forever)
+ *
+ * @return >= 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_xpb_waitlm(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t mask,
+		   uint32_t value, int timeout_us);
+
+/*
+ * Read a 32-bit word from a NFP CPP ID
+ *
+ * @param cpp           NFP CPP handle
+ * @param cpp_id        NFP CPP ID
+ * @param address       offset into the NFP CPP ID address space
+ * @param value         output value
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_readl(struct nfp_cpp *cpp, uint32_t cpp_id,
+		  unsigned long long address, uint32_t *value);
+
+/*
+ * Write a 32-bit value to a NFP CPP ID
+ *
+ * @param cpp           NFP CPP handle
+ * @param cpp_id        NFP CPP ID
+ * @param address       offset into the NFP CPP ID address space
+ * @param value         value to write
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ *
+ */
+int nfp_cpp_writel(struct nfp_cpp *cpp, uint32_t cpp_id,
+		   unsigned long long address, uint32_t value);
+
+/*
+ * Read a 64-bit word from a NFP CPP ID
+ *
+ * @param cpp           NFP CPP handle
+ * @param cpp_id        NFP CPP ID
+ * @param address       offset into the NFP CPP ID address space
+ * @param value         output value
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_readq(struct nfp_cpp *cpp, uint32_t cpp_id,
+		  unsigned long long address, uint64_t *value);
+
+/*
+ * Write a 64-bit value to a NFP CPP ID
+ *
+ * @param cpp           NFP CPP handle
+ * @param cpp_id        NFP CPP ID
+ * @param address       offset into the NFP CPP ID address space
+ * @param value         value to write
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_writeq(struct nfp_cpp *cpp, uint32_t cpp_id,
+		   unsigned long long address, uint64_t value);
+
+/*
+ * Initialize a mutex location
+ *
+ * The CPP target:address must point to a 64-bit aligned location, and will
+ * initialize 64 bits of data at the location.
+ *
+ * This creates the initial mutex state, as locked by this nfp_cpp_interface().
+ *
+ * This function should only be called when setting up the initial lock state
+ * upon boot-up of the system.
+ *
+ * @param cpp		NFP CPP handle
+ * @param target	NFP CPP target ID
+ * @param address	Offset into the address space of the NFP CPP target ID
+ * @param key_id	Unique 32-bit value for this mutex
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_mutex_init(struct nfp_cpp *cpp, int target,
+		       unsigned long long address, uint32_t key_id);
+
+/*
+ * Create a mutex handle from an address controlled by a MU Atomic engine
+ *
+ * The CPP target:address must point to a 64-bit aligned location, and reserve
+ * 64 bits of data at the location for use by the handle.
+ *
+ * Only target/address pairs that point to entities that support the MU Atomic
+ * Engine's CmpAndSwap32 command are supported.
+ *
+ * @param cpp		NFP CPP handle
+ * @param target	NFP CPP target ID
+ * @param address	Offset into the address space of the NFP CPP target ID
+ * @param key_id	32-bit unique key (must match the key at this location)
+ *
+ * @return		A non-NULL struct nfp_cpp_mutex * on success, NULL on
+ *                      failure.
+ */
+struct nfp_cpp_mutex *nfp_cpp_mutex_alloc(struct nfp_cpp *cpp, int target,
+					  unsigned long long address,
+					  uint32_t key_id);
+
+/*
+ * Get the NFP CPP handle the mutex was created with
+ *
+ * @param   mutex   NFP mutex handle
+ * @return          NFP CPP handle
+ */
+struct nfp_cpp *nfp_cpp_mutex_cpp(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Get the mutex key
+ *
+ * @param   mutex   NFP mutex handle
+ * @return          Mutex key
+ */
+uint32_t nfp_cpp_mutex_key(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Get the mutex owner
+ *
+ * @param   mutex   NFP mutex handle
+ * @return          Interface ID of the mutex owner
+ *
+ * NOTE: This is for debug purposes ONLY - the owner may change at any time,
+ * unless it has been locked by this NFP CPP handle.
+ */
+uint16_t nfp_cpp_mutex_owner(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Get the mutex target
+ *
+ * @param   mutex   NFP mutex handle
+ * @return          Mutex CPP target (i.e. NFP_CPP_TARGET_MU)
+ */
+int nfp_cpp_mutex_target(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Get the mutex address
+ *
+ * @param   mutex   NFP mutex handle
+ * @return          Mutex CPP address
+ */
+uint64_t nfp_cpp_mutex_address(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Free a mutex handle - does not alter the lock state
+ *
+ * @param mutex		NFP CPP Mutex handle
+ */
+void nfp_cpp_mutex_free(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Lock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * @param mutex		NFP CPP Mutex handle
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_mutex_lock(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Unlock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * @param mutex		NFP CPP Mutex handle
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_mutex_unlock(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Attempt to lock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * @param mutex		NFP CPP Mutex handle
+ * @return		0 if the lock succeeded, -1 on failure (and errno set
+ *			appropriately).
+ */
+int nfp_cpp_mutex_trylock(struct nfp_cpp_mutex *mutex);
+
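+/*
+ * Illustrative usage sketch (documentation only, not part of the API
+ * contract): serializing access to a shared resource with the mutex
+ * functions declared above.  The address and key below are placeholders,
+ * and NFP_CPP_TARGET_MU comes from nfp6000/nfp6000.h.
+ *
+ *	struct nfp_cpp_mutex *mutex;
+ *
+ *	mutex = nfp_cpp_mutex_alloc(cpp, NFP_CPP_TARGET_MU, addr, key_id);
+ *	if (!mutex)
+ *		return -1;
+ *	if (nfp_cpp_mutex_lock(mutex) == 0) {
+ *		... access the shared resource ...
+ *		nfp_cpp_mutex_unlock(mutex);
+ *	}
+ *	nfp_cpp_mutex_free(mutex);
+ */
+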
+#endif /* !__NFP_CPP_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp_cpp_pcie_ops.c b/drivers/net/nfp/nfpcore/nfp_cpp_pcie_ops.c
new file mode 100644
index 0000000..ad6ce72
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_cpp_pcie_ops.c
@@ -0,0 +1,936 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+/*
+ * nfp_cpp_pcie_ops.c
+ * Authors: Vinayak Tammineedi <vinayak.tammineedi@netronome.com>
+ *
+ * Multiplexes the NFP BARs between NFP internal resources and
+ * implements the PCIe specific interface for generic CPP bus access.
+ *
+ * The BARs are managed and allocated if they are available.
+ * The generic CPP bus abstraction builds upon this BAR interface.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <execinfo.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <dirent.h>
+#include <libgen.h>
+
+#include <sys/mman.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+
+#include "nfp_cpp.h"
+#include "nfp_target.h"
+#include "nfp6000/nfp6000.h"
+
+#define NFP_PCIE_BAR(_pf)	(0x30000 + ((_pf) & 7) * 0xc0)
+
+#define NFP_PCIE_BAR_PCIE2CPP_ACTION_BASEADDRESS(_x)  (((_x) & 0x1f) << 16)
+#define NFP_PCIE_BAR_PCIE2CPP_BASEADDRESS(_x)         (((_x) & 0xffff) << 0)
+#define NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT(_x)        (((_x) & 0x3) << 27)
+#define NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_32BIT    0
+#define NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_64BIT    1
+#define NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_0BYTE    3
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE(_x)             (((_x) & 0x7) << 29)
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_OF(_x)          (((_x) >> 29) & 0x7)
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_FIXED         0
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_BULK          1
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_TARGET        2
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_GENERAL       3
+#define NFP_PCIE_BAR_PCIE2CPP_TARGET_BASEADDRESS(_x)  (((_x) & 0xf) << 23)
+#define NFP_PCIE_BAR_PCIE2CPP_TOKEN_BASEADDRESS(_x)   (((_x) & 0x3) << 21)
+
+/*
+ * Minimal size of the PCIe cfg memory we depend on being mapped,
+ * queue controller and DMA controller don't have to be covered.
+ */
+#define NFP_PCI_MIN_MAP_SIZE				0x080000
+
+#define NFP_PCIE_P2C_FIXED_SIZE(bar)               (1 << (bar)->bitsize)
+#define NFP_PCIE_P2C_BULK_SIZE(bar)                (1 << (bar)->bitsize)
+#define NFP_PCIE_P2C_GENERAL_TARGET_OFFSET(bar, x) ((x) << ((bar)->bitsize - 2))
+#define NFP_PCIE_P2C_GENERAL_TOKEN_OFFSET(bar, x) ((x) << ((bar)->bitsize - 4))
+#define NFP_PCIE_P2C_GENERAL_SIZE(bar)             (1 << ((bar)->bitsize - 4))
+
+#define NFP_PCIE_CFG_BAR_PCIETOCPPEXPBAR(bar, slot) \
+	(NFP_PCIE_BAR(0) + ((bar) * 8 + (slot)) * 4)
+
+#define NFP_PCIE_CPP_BAR_PCIETOCPPEXPBAR(bar, slot) \
+	(((bar) * 8 + (slot)) * 4)
+
+/*
+ * Define DEBUG to enable more verbose debug output.
+ */
+struct nfp_pcie_user;
+struct nfp6000_area_priv;
+
+/*
+ * struct nfp_bar - describes BAR configuration and usage
+ * @nfp:	backlink to owner
+ * @barcfg:	cached contents of BAR config CSR
+ * @base:	the BAR's base CPP offset
+ * @mask:       mask for the BAR aperture (read only)
+ * @bitsize:	bitsize of BAR aperture (read only)
+ * @index:	index of the BAR
+ * @lock:	flag indicating whether the BAR is in use
+ * @csr:	mapped address of the BAR's expansion BAR config register
+ * @iomem:	mapped IO memory
+ */
+#define NFP_BAR_MAX 7
+struct nfp_bar {
+	struct nfp_pcie_user *nfp;
+	uint32_t barcfg;
+	uint64_t base;		/* CPP address base */
+	uint64_t mask;		/* Bit mask of the bar */
+	uint32_t bitsize;	/* Bit size of the bar */
+	int index;
+	int lock;
+
+	char *csr;
+	char *iomem;
+};
+
+#define BUSDEV_SZ	13
+struct nfp_pcie_user {
+	struct nfp_bar bar[NFP_BAR_MAX];
+
+	int device;
+	int lock;
+	char busdev[BUSDEV_SZ];
+	int barsz;
+	char *cfg;
+};
+
+static uint32_t
+nfp_bar_maptype(struct nfp_bar *bar)
+{
+	return NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_OF(bar->barcfg);
+}
+
+#define TARGET_WIDTH_32    4
+#define TARGET_WIDTH_64    8
+
+static int
+nfp_compute_bar(const struct nfp_bar *bar, uint32_t *bar_config,
+		uint64_t *bar_base, int tgt, int act, int tok,
+		uint64_t offset, size_t size, int width)
+{
+	uint32_t bitsize;
+	uint32_t newcfg;
+	uint64_t mask;
+
+	if (tgt >= 16)
+		return -EINVAL;
+
+	switch (width) {
+	case 8:
+		newcfg =
+		    NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT
+		    (NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_64BIT);
+		break;
+	case 4:
+		newcfg =
+		    NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT
+		    (NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_32BIT);
+		break;
+	case 0:
+		newcfg =
+		    NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT
+		    (NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_0BYTE);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (act != NFP_CPP_ACTION_RW && act != 0) {
+		/* Fixed CPP mapping with specific action */
+		mask = ~(NFP_PCIE_P2C_FIXED_SIZE(bar) - 1);
+
+		newcfg |=
+		    NFP_PCIE_BAR_PCIE2CPP_MAPTYPE
+		    (NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_FIXED);
+		newcfg |= NFP_PCIE_BAR_PCIE2CPP_TARGET_BASEADDRESS(tgt);
+		newcfg |= NFP_PCIE_BAR_PCIE2CPP_ACTION_BASEADDRESS(act);
+		newcfg |= NFP_PCIE_BAR_PCIE2CPP_TOKEN_BASEADDRESS(tok);
+
+		if ((offset & mask) != ((offset + size - 1) & mask)) {
+			printf("BAR%d: Won't use for Fixed mapping\n",
+				bar->index);
+			printf("\t<%#llx,%#llx>, action=%d\n",
+				(unsigned long long)offset,
+				(unsigned long long)(offset + size), act);
+			printf("\tBAR too small (0x%llx).\n",
+				(unsigned long long)mask);
+			return -EINVAL;
+		}
+		offset &= mask;
+
+#ifdef DEBUG
+		printf("BAR%d: Created Fixed mapping\n", bar->index);
+		printf("\t%d:%d:%d:0x%#llx-0x%#llx>\n", tgt, act, tok,
+			(unsigned long long)offset,
+			(unsigned long long)(offset + mask));
+#endif
+
+		bitsize = 40 - 16;
+	} else {
+		mask = ~(NFP_PCIE_P2C_BULK_SIZE(bar) - 1);
+
+		/* Bulk mapping */
+		newcfg |=
+		    NFP_PCIE_BAR_PCIE2CPP_MAPTYPE
+		    (NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_BULK);
+
+		newcfg |= NFP_PCIE_BAR_PCIE2CPP_TARGET_BASEADDRESS(tgt);
+		newcfg |= NFP_PCIE_BAR_PCIE2CPP_TOKEN_BASEADDRESS(tok);
+
+		if ((offset & mask) != ((offset + size - 1) & mask)) {
+			printf("BAR%d: Won't use for bulk mapping\n",
+				bar->index);
+			printf("\t<%#llx,%#llx>\n", (unsigned long long)offset,
+				(unsigned long long)(offset + size));
+			printf("\ttarget=%d, token=%d\n", tgt, tok);
+			printf("\tBAR too small (%#llx) - (%#llx != %#llx).\n",
+				(unsigned long long)mask,
+				(unsigned long long)(offset & mask),
+				(unsigned long long)(offset + size - 1) & mask);
+
+			return -EINVAL;
+		}
+
+		offset &= mask;
+
+#ifdef DEBUG
+		printf("BAR%d: Created bulk mapping %d:x:%d:%#llx-%#llx\n",
+			bar->index, tgt, tok, (unsigned long long)offset,
+			(unsigned long long)(offset + ~mask));
+#endif
+
+		bitsize = 40 - 21;
+	}
+
+	if (bar->bitsize < bitsize) {
+		printf("BAR%d: Too small for %d:%d:%d\n", bar->index, tgt, tok,
+			act);
+		return -EINVAL;
+	}
+
+	newcfg |= offset >> bitsize;
+
+	if (bar_base)
+		*bar_base = offset;
+
+	if (bar_config)
+		*bar_config = newcfg;
+
+	return 0;
+}
+
+static int
+nfp_bar_write(struct nfp_pcie_user *nfp, struct nfp_bar *bar,
+		  uint32_t newcfg)
+{
+	int base, slot;
+
+	base = bar->index >> 3;
+	slot = bar->index & 7;
+
+	if (!nfp->cfg)
+		return (-ENOMEM);
+
+	bar->csr = nfp->cfg +
+		   NFP_PCIE_CFG_BAR_PCIETOCPPEXPBAR(base, slot);
+
+	*(uint32_t *)(bar->csr) = newcfg;
+
+	bar->barcfg = newcfg;
+#ifdef DEBUG
+	printf("BAR%d: updated to 0x%08x\n", bar->index, newcfg);
+#endif
+
+	return 0;
+}
+
+static int
+nfp_reconfigure_bar(struct nfp_pcie_user *nfp, struct nfp_bar *bar, int tgt,
+		int act, int tok, uint64_t offset, size_t size, int width)
+{
+	uint64_t newbase;
+	uint32_t newcfg;
+	int err;
+
+	err = nfp_compute_bar(bar, &newcfg, &newbase, tgt, act, tok, offset,
+			      size, width);
+	if (err)
+		return err;
+
+	bar->base = newbase;
+
+	return nfp_bar_write(nfp, bar, newcfg);
+}
+
+/*
+ * Map all PCI bars. We assume that the BAR with the PCIe config block is
+ * already mapped.
+ *
+ * BAR0.0: Reserved for General Mapping (for MSI-X access to PCIe SRAM)
+ */
+static int
+nfp_enable_bars(struct nfp_pcie_user *nfp)
+{
+	struct nfp_bar *bar;
+	int x;
+
+	for (x = ARRAY_SIZE(nfp->bar); x > 0; x--) {
+		bar = &nfp->bar[x - 1];
+		bar->barcfg = 0;
+		bar->nfp = nfp;
+		bar->index = x;
+		bar->mask = (1 << (nfp->barsz - 3)) - 1;
+		bar->bitsize = nfp->barsz - 3;
+		bar->base = 0;
+		bar->iomem = NULL;
+		bar->lock = 0;
+		bar->csr = nfp->cfg +
+			   NFP_PCIE_CFG_BAR_PCIETOCPPEXPBAR(bar->index >> 3,
+							   bar->index & 7);
+		bar->iomem =
+		    (char *)mmap(0, 1 << bar->bitsize, PROT_READ | PROT_WRITE,
+				 MAP_SHARED, nfp->device,
+				 bar->index << bar->bitsize);
+
+		if (bar->iomem == MAP_FAILED)
+			return (-ENOMEM);
+	}
+	return 0;
+}
+
+static struct nfp_bar *
+nfp_alloc_bar(struct nfp_pcie_user *nfp)
+{
+	struct nfp_bar *bar;
+	int x;
+
+	for (x = ARRAY_SIZE(nfp->bar); x > 0; x--) {
+		bar = &nfp->bar[x - 1];
+		if (!bar->lock) {
+			bar->lock = 1;
+			return bar;
+		}
+	}
+	return NULL;
+}
+
+static void
+nfp_disable_bars(struct nfp_pcie_user *nfp)
+{
+	struct nfp_bar *bar;
+	int x;
+
+	for (x = ARRAY_SIZE(nfp->bar); x > 0; x--) {
+		bar = &nfp->bar[x - 1];
+		if (bar->iomem) {
+			munmap(bar->iomem, 1 << (nfp->barsz - 3));
+			bar->iomem = NULL;
+			bar->lock = 0;
+		}
+	}
+}
+
+/*
+ * Generic CPP bus access interface.
+ */
+
+struct nfp6000_area_priv {
+	struct nfp_bar *bar;
+	uint32_t bar_offset;
+
+	uint32_t target;
+	uint32_t action;
+	uint32_t token;
+	uint64_t offset;
+	struct {
+		int read;
+		int write;
+		int bar;
+	} width;
+	size_t size;
+	char *iomem;
+};
+
+static int
+nfp6000_area_init(struct nfp_cpp_area *area, uint32_t dest,
+		  unsigned long long address, unsigned long size)
+{
+	struct nfp_pcie_user *nfp = nfp_cpp_priv(nfp_cpp_area_cpp(area));
+	struct nfp6000_area_priv *priv = nfp_cpp_area_priv(area);
+	uint32_t target = NFP_CPP_ID_TARGET_of(dest);
+	uint32_t action = NFP_CPP_ID_ACTION_of(dest);
+	uint32_t token = NFP_CPP_ID_TOKEN_of(dest);
+	int pp, ret = 0;
+
+	pp = nfp6000_target_pushpull(NFP_CPP_ID(target, action, token),
+				     address);
+	if (pp < 0)
+		return pp;
+
+	priv->width.read = PUSH_WIDTH(pp);
+	priv->width.write = PULL_WIDTH(pp);
+
+	if (priv->width.read > 0 &&
+	    priv->width.write > 0 && priv->width.read != priv->width.write)
+		return -EINVAL;
+
+	if (priv->width.read > 0)
+		priv->width.bar = priv->width.read;
+	else
+		priv->width.bar = priv->width.write;
+
+	priv->bar = nfp_alloc_bar(nfp);
+	if (priv->bar == NULL)
+		return -ENOMEM;
+
+	priv->target = target;
+	priv->action = action;
+	priv->token = token;
+	priv->offset = address;
+	priv->size = size;
+
+	ret = nfp_reconfigure_bar(nfp, priv->bar, priv->target, priv->action,
+				  priv->token, priv->offset, priv->size,
+				  priv->width.bar);
+
+	return ret;
+}
+
+static int
+nfp6000_area_acquire(struct nfp_cpp_area *area)
+{
+	struct nfp6000_area_priv *priv = nfp_cpp_area_priv(area);
+
+	/* Calculate offset into BAR. */
+	if (nfp_bar_maptype(priv->bar) ==
+	    NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_GENERAL) {
+		priv->bar_offset = priv->offset &
+			(NFP_PCIE_P2C_GENERAL_SIZE(priv->bar) - 1);
+		priv->bar_offset +=
+			NFP_PCIE_P2C_GENERAL_TARGET_OFFSET(priv->bar,
+							   priv->target);
+		priv->bar_offset +=
+		    NFP_PCIE_P2C_GENERAL_TOKEN_OFFSET(priv->bar, priv->token);
+	} else {
+		priv->bar_offset = priv->offset & priv->bar->mask;
+	}
+
+	/* The BAR must be mapped for the area to be accessible. */
+	if (!priv->bar->iomem)
+		return (-ENOMEM);
+
+	priv->iomem = priv->bar->iomem + priv->bar_offset;
+
+	return 0;
+}
+
+static void *
+nfp6000_area_mapped(struct nfp_cpp_area *area)
+{
+	struct nfp6000_area_priv *area_priv = nfp_cpp_area_priv(area);
+
+	if (!area_priv->iomem)
+		return NULL;
+
+	return area_priv->iomem;
+}
+
+static void
+nfp6000_area_release(struct nfp_cpp_area *area)
+{
+	struct nfp6000_area_priv *priv = nfp_cpp_area_priv(area);
+	priv->bar->lock = 0;
+	priv->bar = NULL;
+	priv->iomem = NULL;
+}
+
+static void *
+nfp6000_area_iomem(struct nfp_cpp_area *area)
+{
+	struct nfp6000_area_priv *priv = nfp_cpp_area_priv(area);
+	return priv->iomem;
+}
+
+static int
+nfp6000_area_read(struct nfp_cpp_area *area, void *kernel_vaddr,
+		  unsigned long offset, unsigned int length)
+{
+	uint64_t *wrptr64 = kernel_vaddr;
+	const volatile uint64_t *rdptr64;
+	struct nfp6000_area_priv *priv;
+	uint32_t *wrptr32 = kernel_vaddr;
+	const volatile uint32_t *rdptr32;
+	int width;
+	unsigned int n;
+	bool is_64;
+
+	priv = nfp_cpp_area_priv(area);
+	rdptr64 = (uint64_t *)(priv->iomem + offset);
+	rdptr32 = (uint32_t *)(priv->iomem + offset);
+
+	if (offset + length > priv->size)
+		return -EFAULT;
+
+	width = priv->width.read;
+
+	if (width <= 0)
+		return -EINVAL;
+
+	/* Unaligned? Translate to an explicit access */
+	if ((priv->offset + offset) & (width - 1)) {
+		printf("aread_read unaligned!!!\n");
+		return -EINVAL;
+	}
+
+	is_64 = width == TARGET_WIDTH_64;
+
+	/* MU reads via a PCIe2CPP BAR support 32-bit (and other) lengths */
+	if (priv->target == (NFP_CPP_TARGET_ID_MASK & NFP_CPP_TARGET_MU) &&
+	    priv->action == NFP_CPP_ACTION_RW) {
+		is_64 = false;
+	}
+
+	if (is_64) {
+		if (offset % sizeof(uint64_t) != 0 ||
+		    length % sizeof(uint64_t) != 0)
+			return -EINVAL;
+	} else {
+		if (offset % sizeof(uint32_t) != 0 ||
+		    length % sizeof(uint32_t) != 0)
+			return -EINVAL;
+	}
+
+	if (!priv->bar)
+		return -EFAULT;
+
+	if (is_64)
+		for (n = 0; n < length; n += sizeof(uint64_t)) {
+			*wrptr64 = *rdptr64;
+			wrptr64++;
+			rdptr64++;
+		}
+	else
+		for (n = 0; n < length; n += sizeof(uint32_t)) {
+			*wrptr32 = *rdptr32;
+			wrptr32++;
+			rdptr32++;
+		}
+
+	return n;
+}
+
+static int
+nfp6000_area_write(struct nfp_cpp_area *area, const void *kernel_vaddr,
+		   unsigned long offset, unsigned int length)
+{
+	const uint64_t *rdptr64 = kernel_vaddr;
+	uint64_t *wrptr64;
+	const uint32_t *rdptr32 = kernel_vaddr;
+	struct nfp6000_area_priv *priv;
+	uint32_t *wrptr32;
+	int width;
+	unsigned int n;
+	bool is_64;
+
+	priv = nfp_cpp_area_priv(area);
+	wrptr64 = (uint64_t *)(priv->iomem + offset);
+	wrptr32 = (uint32_t *)(priv->iomem + offset);
+
+	if (offset + length > priv->size)
+		return -EFAULT;
+
+	width = priv->width.write;
+
+	if (width <= 0)
+		return -EINVAL;
+
+	/* Unaligned? Translate to an explicit access */
+	if ((priv->offset + offset) & (width - 1))
+		return -EINVAL;
+
+	is_64 = width == TARGET_WIDTH_64;
+
+	/* MU writes via a PCIe2CPP BAR support 32-bit (and other) lengths */
+	if (priv->target == (NFP_CPP_TARGET_ID_MASK & NFP_CPP_TARGET_MU) &&
+	    priv->action == NFP_CPP_ACTION_RW)
+		is_64 = false;
+
+	if (is_64) {
+		if (offset % sizeof(uint64_t) != 0 ||
+		    length % sizeof(uint64_t) != 0)
+			return -EINVAL;
+	} else {
+		if (offset % sizeof(uint32_t) != 0 ||
+		    length % sizeof(uint32_t) != 0)
+			return -EINVAL;
+	}
+
+	if (!priv->bar)
+		return -EFAULT;
+
+	if (is_64)
+		for (n = 0; n < length; n += sizeof(uint64_t)) {
+			*wrptr64 = *rdptr64;
+			wrptr64++;
+			rdptr64++;
+		}
+	else
+		for (n = 0; n < length; n += sizeof(uint32_t)) {
+			*wrptr32 = *rdptr32;
+			wrptr32++;
+			rdptr32++;
+		}
+
+	return n;
+}
+
+#define PCI_DEVICES "/sys/bus/pci/devices"
+
+static int
+nfp_acquire_process_lock(struct nfp_pcie_user *desc)
+{
+	int rc;
+	struct flock lock;
+	char lockname[30];
+
+	memset(&lock, 0, sizeof(lock));
+
+	snprintf(lockname, sizeof(lockname), "/var/lock/nfp_%s", desc->busdev);
+	desc->lock = open(lockname, O_RDWR | O_CREAT, 0666);
+	if (desc->lock < 0)
+		return desc->lock;
+
+	lock.l_type = F_WRLCK;
+	lock.l_whence = SEEK_SET;
+	rc = -1;
+	while (rc != 0) {
+		rc = fcntl(desc->lock, F_SETLKW, &lock);
+		if (rc < 0) {
+			if (errno != EAGAIN && errno != EACCES) {
+				close(desc->lock);
+				return rc;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int
+nfp6000_set_model(struct nfp_pcie_user *desc, struct nfp_cpp *cpp)
+{
+	char tmp_str[80];
+	uint32_t tmp;
+	int fp;
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/config", PCI_DEVICES,
+		 desc->busdev);
+
+	fp = open(tmp_str, O_RDONLY);
+	if (fp < 0)
+		return -1;
+
+	lseek(fp, 0x2e, SEEK_SET);
+
+	if (read(fp, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+		printf("Error reading config file for model\n");
+		return -1;
+	}
+
+	tmp = tmp << 16;
+
+	if (close(fp) == -1)
+		return -1;
+
+	nfp_cpp_model_set(cpp, tmp);
+
+	return 0;
+}
+
+static int
+nfp6000_set_interface(struct nfp_pcie_user *desc, struct nfp_cpp *cpp)
+{
+	char tmp_str[80];
+	uint16_t tmp;
+	int fp;
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/config", PCI_DEVICES,
+		 desc->busdev);
+
+	fp = open(tmp_str, O_RDONLY);
+	if (fp < 0)
+		return -1;
+
+	lseek(fp, 0x154, SEEK_SET);
+
+	if (read(fp, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+		printf("error reading config file for interface\n");
+		return -1;
+	}
+
+	if (close(fp) == -1)
+		return -1;
+
+	nfp_cpp_interface_set(cpp, tmp);
+
+	return 0;
+}
+
+#define PCI_CFG_SPACE_SIZE	256
+#define PCI_CFG_SPACE_EXP_SIZE	4096
+#define PCI_EXT_CAP_ID(header)		(int)(header & 0x0000ffff)
+#define PCI_EXT_CAP_NEXT(header)	((header >> 20) & 0xffc)
+#define PCI_EXT_CAP_ID_DSN	0x03
+static int
+nfp_pci_find_next_ext_capability(int fp, int cap)
+{
+	uint32_t header;
+	int ttl;
+	int pos = PCI_CFG_SPACE_SIZE;
+
+	/* minimum 8 bytes per capability */
+	ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8;
+
+	lseek(fp, pos, SEEK_SET);
+	if (read(fp, &header, sizeof(header)) != sizeof(header)) {
+		printf("error reading config file for serial\n");
+		return -1;
+	}
+
+	/*
+	 * If we have no capabilities, this is indicated by cap ID,
+	 * cap version and next pointer all being 0.
+	 */
+	if (header == 0)
+		return 0;
+
+	while (ttl-- > 0) {
+		if (PCI_EXT_CAP_ID(header) == cap)
+			return pos;
+
+		pos = PCI_EXT_CAP_NEXT(header);
+		if (pos < PCI_CFG_SPACE_SIZE)
+			break;
+
+		lseek(fp, pos, SEEK_SET);
+		if (read(fp, &header, sizeof(header)) != sizeof(header)) {
+			printf("error reading config file for serial\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int
+nfp6000_set_serial(struct nfp_pcie_user *desc, struct nfp_cpp *cpp)
+{
+	char tmp_str[80];
+	uint16_t tmp;
+	uint8_t serial[6];
+	int serial_len = 6;
+	int fp, pos;
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/config", PCI_DEVICES,
+		 desc->busdev);
+
+	fp = open(tmp_str, O_RDONLY);
+	if (fp < 0)
+		return -1;
+
+	pos = nfp_pci_find_next_ext_capability(fp, PCI_EXT_CAP_ID_DSN);
+	if (pos <= 0) {
+		printf("PCI_EXT_CAP_ID_DSN not found. Using default offset\n");
+		lseek(fp, 0x156, SEEK_SET);
+	} else {
+		lseek(fp, pos + 6, SEEK_SET);
+	}
+
+	if (read(fp, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+		printf("error reading config file for serial\n");
+		return -1;
+	}
+
+	serial[4] = (uint8_t)((tmp >> 8) & 0xff);
+	serial[5] = (uint8_t)(tmp & 0xff);
+
+	if (read(fp, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+		printf("error reading config file for serial\n");
+		return -1;
+	}
+
+	serial[2] = (uint8_t)((tmp >> 8) & 0xff);
+	serial[3] = (uint8_t)(tmp & 0xff);
+
+	if (read(fp, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+		printf("error reading config file for serial\n");
+		return -1;
+	}
+
+	serial[0] = (uint8_t)((tmp >> 8) & 0xff);
+	serial[1] = (uint8_t)(tmp & 0xff);
+
+	if (close(fp) == -1)
+		return -1;
+
+	nfp_cpp_serial_set(cpp, serial, serial_len);
+
+	return 0;
+}
+
+static int
+nfp6000_set_barsz(struct nfp_pcie_user *desc)
+{
+	char tmp_str[80];
+	unsigned long start, end, flags, tmp;
+	int i;
+	FILE *fp;
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/resource", PCI_DEVICES,
+		 desc->busdev);
+
+	fp = fopen(tmp_str, "r");
+	if (!fp)
+		return -1;
+
+	if (fscanf(fp, "0x%lx 0x%lx 0x%lx", &start, &end, &flags) == 0) {
+		printf("error reading resource file for bar size\n");
+		return -1;
+	}
+
+	if (fclose(fp) == -1)
+		return -1;
+
+	tmp = (end - start) + 1;
+	i = 0;
+	while (tmp >>= 1)
+		i++;
+	desc->barsz = i;
+	return 0;
+}
+
+static int
+nfp6000_init(struct nfp_cpp *cpp, const char *devname)
+{
+	char link[120];
+	char tmp_str[80];
+	ssize_t size;
+	int ret = 0;
+	uint32_t model;
+	struct nfp_pcie_user *desc;
+
+	desc = malloc(sizeof(*desc));
+	if (!desc)
+		return -1;
+
+	memset(desc->busdev, 0, BUSDEV_SZ);
+	strncpy(desc->busdev, devname, strlen(devname));
+
+	ret = nfp_acquire_process_lock(desc);
+	if (ret)
+		return -1;
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/driver", PCI_DEVICES,
+		 desc->busdev);
+
+	size = readlink(tmp_str, link, sizeof(link));
+
+	if (size == -1)
+		tmp_str[0] = '\0';
+
+	if (size == sizeof(link))
+		tmp_str[0] = '\0';
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/resource0", PCI_DEVICES,
+		 desc->busdev);
+
+	desc->device = open(tmp_str, O_RDWR);
+	if (desc->device == -1)
+		return -1;
+
+	if (nfp6000_set_model(desc, cpp) < 0)
+		return -1;
+	if (nfp6000_set_interface(desc, cpp) < 0)
+		return -1;
+	if (nfp6000_set_serial(desc, cpp) < 0)
+		return -1;
+	if (nfp6000_set_barsz(desc) < 0)
+		return -1;
+
+	desc->cfg = (char *)mmap(0, 1 << (desc->barsz - 3),
+				 PROT_READ | PROT_WRITE,
+				 MAP_SHARED, desc->device, 0);
+
+	if (desc->cfg == MAP_FAILED)
+		return -1;
+
+	nfp_enable_bars(desc);
+
+	nfp_cpp_priv_set(cpp, desc);
+
+	model = __nfp_cpp_model_autodetect(cpp);
+	nfp_cpp_model_set(cpp, model);
+
+	return ret;
+}
+
+static void
+nfp6000_free(struct nfp_cpp *cpp)
+{
+	struct nfp_pcie_user *desc = nfp_cpp_priv(cpp);
+	int x;
+
+	/* Unmapping may cause errors if there are any pending transactions */
+	nfp_disable_bars(desc);
+	munmap(desc->cfg, 1 << (desc->barsz - 3));
+
+	for (x = ARRAY_SIZE(desc->bar); x > 0; x--) {
+		if (desc->bar[x - 1].iomem)
+			munmap(desc->bar[x - 1].iomem, 1 << (desc->barsz - 3));
+	}
+	close(desc->lock);
+	close(desc->device);
+	free(desc);
+}
+
+static const struct nfp_cpp_operations nfp6000_pcie_ops = {
+	.init = nfp6000_init,
+	.free = nfp6000_free,
+
+	.area_priv_size = sizeof(struct nfp6000_area_priv),
+	.area_init = nfp6000_area_init,
+	.area_acquire = nfp6000_area_acquire,
+	.area_release = nfp6000_area_release,
+	.area_mapped = nfp6000_area_mapped,
+	.area_read = nfp6000_area_read,
+	.area_write = nfp6000_area_write,
+	.area_iomem = nfp6000_area_iomem,
+};
+
+const struct nfp_cpp_operations *
+nfp_cpp_transport_operations(void)
+{
+	return &nfp6000_pcie_ops;
+}
diff --git a/drivers/net/nfp/nfpcore/nfp_cppcore.c b/drivers/net/nfp/nfpcore/nfp_cppcore.c
new file mode 100644
index 0000000..94d4a0b
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_cppcore.c
@@ -0,0 +1,856 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+
+#include <rte_byteorder.h>
+
+#include "nfp_cpp.h"
+#include "nfp_target.h"
+#include "nfp6000/nfp6000.h"
+#include "nfp6000/nfp_xpb.h"
+#include "nfp_nffw.h"
+
+#define NFP_PL_DEVICE_ID                        0x00000004
+#define NFP_PL_DEVICE_ID_MASK                   0xff
+
+#define NFP6000_ARM_GCSR_SOFTMODEL0             0x00400144
+
+void
+nfp_cpp_priv_set(struct nfp_cpp *cpp, void *priv)
+{
+	cpp->priv = priv;
+}
+
+void *
+nfp_cpp_priv(struct nfp_cpp *cpp)
+{
+	return cpp->priv;
+}
+
+void
+nfp_cpp_model_set(struct nfp_cpp *cpp, uint32_t model)
+{
+	cpp->model = model;
+}
+
+uint32_t
+nfp_cpp_model(struct nfp_cpp *cpp)
+{
+	if (!cpp)
+		return NFP_CPP_MODEL_INVALID;
+
+	if (cpp->model == 0)
+		cpp->model = __nfp_cpp_model_autodetect(cpp);
+
+	return cpp->model;
+}
+
+void
+nfp_cpp_interface_set(struct nfp_cpp *cpp, uint32_t interface)
+{
+	cpp->interface = interface;
+}
+
+int
+nfp_cpp_serial(struct nfp_cpp *cpp, const uint8_t **serial)
+{
+	*serial = cpp->serial;
+	return cpp->serial_len;
+}
+
+int
+nfp_cpp_serial_set(struct nfp_cpp *cpp, const uint8_t *serial,
+		   size_t serial_len)
+{
+	if (cpp->serial_len)
+		free(cpp->serial);
+
+	cpp->serial = malloc(serial_len);
+	if (!cpp->serial)
+		return -1;
+
+	memcpy(cpp->serial, serial, serial_len);
+	cpp->serial_len = serial_len;
+
+	return 0;
+}
+
+uint16_t
+nfp_cpp_interface(struct nfp_cpp *cpp)
+{
+	if (!cpp)
+		return NFP_CPP_INTERFACE(NFP_CPP_INTERFACE_TYPE_INVALID, 0, 0);
+
+	return cpp->interface;
+}
+
+void *
+nfp_cpp_area_priv(struct nfp_cpp_area *cpp_area)
+{
+	return &cpp_area[1];
+}
+
+struct nfp_cpp *
+nfp_cpp_area_cpp(struct nfp_cpp_area *cpp_area)
+{
+	return cpp_area->cpp;
+}
+
+const char *
+nfp_cpp_area_name(struct nfp_cpp_area *cpp_area)
+{
+	return cpp_area->name;
+}
+
+/*
+ * nfp_cpp_area_alloc_with_name - allocate a new CPP area
+ * @cpp:    CPP handle
+ * @dest:   CPP id
+ * @name:   name of the area, or NULL for an anonymous area
+ * @address:    start address on CPP target
+ * @size:   size of area in bytes
+ *
+ * Allocate and initialize a CPP area structure.  The area must later
+ * be locked down with an 'acquire' before it can be safely accessed.
+ *
+ * NOTE: @address and @size must be 32-bit aligned values.
+ */
+struct nfp_cpp_area *
+nfp_cpp_area_alloc_with_name(struct nfp_cpp *cpp, uint32_t dest,
+			      const char *name, unsigned long long address,
+			      unsigned long size)
+{
+	struct nfp_cpp_area *area;
+	uint64_t tmp64 = (uint64_t)address;
+	int tmp, err;
+
+	if (!cpp)
+		return NULL;
+
+	/* CPP bus uses only a 40-bit address */
+	if ((address + size) > (1ULL << 40))
+		return NFP_ERRPTR(EFAULT);
+
+	/* Remap from cpp_island to cpp_target */
+	err = nfp_target_cpp(dest, tmp64, &dest, &tmp64, cpp->imb_cat_table);
+	if (err < 0)
+		return NULL;
+
+	address = (unsigned long long)tmp64;
+
+	if (!name)
+		name = "";
+
+	area = calloc(1, sizeof(*area) + cpp->op->area_priv_size +
+		      strlen(name) + 1);
+	if (!area)
+		return NULL;
+
+	area->cpp = cpp;
+	area->name = ((char *)area) + sizeof(*area) + cpp->op->area_priv_size;
+	memcpy(area->name, name, strlen(name) + 1);
+
+	/*
+	 * Preserve errno around the call to area_init, since most
+	 * implementations will blindly call nfp_target_action_width() for
+	 * both read and write modes, and that will set errno to EINVAL.
+	 */
+	tmp = errno;
+
+	err = cpp->op->area_init(area, dest, address, size);
+	if (err < 0) {
+		free(area);
+		return NULL;
+	}
+
+	/* Restore errno */
+	errno = tmp;
+
+	area->offset = address;
+	area->size = size;
+
+	return area;
+}
+
+struct nfp_cpp_area *
+nfp_cpp_area_alloc(struct nfp_cpp *cpp, uint32_t dest,
+		    unsigned long long address, unsigned long size)
+{
+	return nfp_cpp_area_alloc_with_name(cpp, dest, NULL, address, size);
+}
+
+/*
+ * nfp_cpp_area_alloc_acquire - allocate a new CPP area and lock it down
+ *
+ * @cpp:    CPP handle
+ * @dest:   CPP id
+ * @address:    start address on CPP target
+ * @size:   size of area
+ *
+ * Allocate and initialize a CPP area structure, and lock it down so
+ * that it can be accessed directly.
+ *
+ * NOTE: @address and @size must be 32-bit aligned values.
+ *
+ * NOTE: The area must also be 'released' when the structure is freed.
+ */
+struct nfp_cpp_area *
+nfp_cpp_area_alloc_acquire(struct nfp_cpp *cpp, uint32_t destination,
+			    unsigned long long address, unsigned long size)
+{
+	struct nfp_cpp_area *area;
+
+	area = nfp_cpp_area_alloc(cpp, destination, address, size);
+	if (!area)
+		return NULL;
+
+	if (nfp_cpp_area_acquire(area)) {
+		nfp_cpp_area_free(area);
+		return NULL;
+	}
+
+	return area;
+}
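+
+/*
+ * Illustrative usage sketch (documentation only): the typical lifecycle of
+ * a CPP area as seen by a caller.  The CPP id, address and buffer are
+ * placeholders for this example.
+ *
+ *	struct nfp_cpp_area *area;
+ *	uint32_t buf[16];
+ *
+ *	area = nfp_cpp_area_alloc_acquire(cpp, cpp_id, addr, sizeof(buf));
+ *	if (!area)
+ *		return -1;
+ *	if (nfp_cpp_area_read(area, 0, buf, sizeof(buf)) < 0) {
+ *		nfp_cpp_area_release_free(area);
+ *		return -1;
+ *	}
+ *	nfp_cpp_area_release_free(area);
+ */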
+
+/*
+ * nfp_cpp_area_free - free up the CPP area
+ * @area:   CPP area handle
+ *
+ * Frees up memory resources held by the CPP area.
+ */
+void
+nfp_cpp_area_free(struct nfp_cpp_area *area)
+{
+	if (area->cpp->op->area_cleanup)
+		area->cpp->op->area_cleanup(area);
+	free(area);
+}
+
+/*
+ * nfp_cpp_area_release_free - release CPP area and free it
+ * @area:   CPP area handle
+ *
+ * Releases the CPP area and frees up memory resources held by it.
+ */
+void
+nfp_cpp_area_release_free(struct nfp_cpp_area *area)
+{
+	nfp_cpp_area_release(area);
+	nfp_cpp_area_free(area);
+}
+
+/*
+ * nfp_cpp_area_acquire - lock down a CPP area for access
+ * @area:   CPP area handle
+ *
+ * Locks down the CPP area for a potential long term activity.  Area
+ * must always be locked down before being accessed.
+ */
+int
+nfp_cpp_area_acquire(struct nfp_cpp_area *area)
+{
+	if (area->cpp->op->area_acquire) {
+		int err = area->cpp->op->area_acquire(area);
+
+		if (err < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * nfp_cpp_area_release - release a locked down CPP area
+ * @area:   CPP area handle
+ *
+ * Releases a previously locked down CPP area.
+ */
+void
+nfp_cpp_area_release(struct nfp_cpp_area *area)
+{
+	if (area->cpp->op->area_release)
+		area->cpp->op->area_release(area);
+}
+
+/*
+ * nfp_cpp_area_iomem() - get IOMEM region for CPP area
+ *
+ * @area:       CPP area handle
+ *
+ * Returns an iomem pointer for use with readl()/writel() style operations.
+ *
+ * NOTE: Area must have been locked down with an 'acquire'.
+ *
+ * Return: pointer to the area, or NULL
+ */
+void *
+nfp_cpp_area_iomem(struct nfp_cpp_area *area)
+{
+	void *iomem = NULL;
+
+	if (area->cpp->op->area_iomem)
+		iomem = area->cpp->op->area_iomem(area);
+
+	return iomem;
+}
+
+/*
+ * nfp_cpp_area_read - read data from CPP area
+ *
+ * @area:       CPP area handle
+ * @offset:     offset into CPP area
+ * @kernel_vaddr:   kernel address to put data into
+ * @length:     number of bytes to read
+ *
+ * Read data from indicated CPP region.
+ *
+ * NOTE: @offset and @length must be 32-bit aligned values.
+ *
+ * NOTE: Area must have been locked down with an 'acquire'.
+ */
+int
+nfp_cpp_area_read(struct nfp_cpp_area *area, unsigned long offset,
+		  void *kernel_vaddr, size_t length)
+{
+	if ((offset + length) > area->size)
+		return NFP_ERRNO(EFAULT);
+
+	return area->cpp->op->area_read(area, kernel_vaddr, offset, length);
+}
+
+/*
+ * nfp_cpp_area_write - write data to CPP area
+ *
+ * @area:       CPP area handle
+ * @offset:     offset into CPP area
+ * @kernel_vaddr:   kernel address to read data from
+ * @length:     number of bytes to write
+ *
+ * Write data to indicated CPP region.
+ *
+ * NOTE: @offset and @length must be 32-bit aligned values.
+ *
+ * NOTE: Area must have been locked down with an 'acquire'.
+ */
+int
+nfp_cpp_area_write(struct nfp_cpp_area *area, unsigned long offset,
+		   const void *kernel_vaddr, size_t length)
+{
+	if ((offset + length) > area->size)
+		return NFP_ERRNO(EFAULT);
+
+	return area->cpp->op->area_write(area, kernel_vaddr, offset, length);
+}
+
+void *
+nfp_cpp_area_mapped(struct nfp_cpp_area *area)
+{
+	if (area->cpp->op->area_mapped)
+		return area->cpp->op->area_mapped(area);
+	return NULL;
+}
+
+/*
+ * nfp_cpp_area_check_range - check if address range fits in CPP area
+ *
+ * @area:   CPP area handle
+ * @offset: offset into CPP area
+ * @length: size of address range in bytes
+ *
+ * Check if address range fits within CPP area.  Return 0 if area fits
+ * or -1 on error.
+ */
+int
+nfp_cpp_area_check_range(struct nfp_cpp_area *area, unsigned long long offset,
+			 unsigned long length)
+{
+	if (((offset + length) > area->size))
+		return NFP_ERRNO(EFAULT);
+
+	return 0;
+}
+
+/*
+ * Return the correct CPP address, and fixup xpb_addr as needed,
+ * based upon NFP model.
+ */
+static uint32_t
+nfp_xpb_to_cpp(struct nfp_cpp *cpp, uint32_t *xpb_addr)
+{
+	uint32_t xpb;
+	int island;
+
+	if (!NFP_CPP_MODEL_IS_6000(cpp->model))
+		return 0;
+
+	xpb = NFP_CPP_ID(14, NFP_CPP_ACTION_RW, 0);
+
+	/*
+	 * Ensure that non-local XPB accesses go out through the
+	 * global XPBM bus.
+	 */
+	island = ((*xpb_addr) >> 24) & 0x3f;
+
+	if (!island)
+		return xpb;
+
+	if (island == 1) {
+		/*
+		 * Accesses to the ARM Island overlay uses Island 0
+		 * Global Bit
+		 */
+		(*xpb_addr) &= ~0x7f000000;
+		if (*xpb_addr < 0x60000)
+			*xpb_addr |= (1 << 30);
+		else
+			/* And only non-ARM interfaces use island id = 1 */
+			if (NFP_CPP_INTERFACE_TYPE_of(nfp_cpp_interface(cpp)) !=
+			    NFP_CPP_INTERFACE_TYPE_ARM)
+				*xpb_addr |= (1 << 24);
+	} else {
+		(*xpb_addr) |= (1 << 30);
+	}
+
+	return xpb;
+}
+
+int
+nfp_cpp_area_readl(struct nfp_cpp_area *area, unsigned long offset,
+		   uint32_t *value)
+{
+	int sz;
+	uint32_t tmp = 0;
+
+	sz = nfp_cpp_area_read(area, offset, &tmp, sizeof(tmp));
+	*value = rte_le_to_cpu_32(tmp);
+
+	return (sz == sizeof(*value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_area_writel(struct nfp_cpp_area *area, unsigned long offset,
+		    uint32_t value)
+{
+	int sz;
+
+	value = rte_cpu_to_le_32(value);
+	sz = nfp_cpp_area_write(area, offset, &value, sizeof(value));
+	return (sz == sizeof(value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_area_readq(struct nfp_cpp_area *area, unsigned long offset,
+		   uint64_t *value)
+{
+	int sz;
+	uint64_t tmp = 0;
+
+	sz = nfp_cpp_area_read(area, offset, &tmp, sizeof(tmp));
+	*value = rte_le_to_cpu_64(tmp);
+
+	return (sz == sizeof(*value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_area_writeq(struct nfp_cpp_area *area, unsigned long offset,
+		    uint64_t value)
+{
+	int sz;
+
+	value = rte_cpu_to_le_64(value);
+	sz = nfp_cpp_area_write(area, offset, &value, sizeof(value));
+
+	return (sz == sizeof(value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_readl(struct nfp_cpp *cpp, uint32_t cpp_id, unsigned long long address,
+	      uint32_t *value)
+{
+	int sz;
+	uint32_t tmp;
+
+	sz = nfp_cpp_read(cpp, cpp_id, address, &tmp, sizeof(tmp));
+	*value = rte_le_to_cpu_32(tmp);
+
+	return (sz == sizeof(*value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_writel(struct nfp_cpp *cpp, uint32_t cpp_id, unsigned long long address,
+	       uint32_t value)
+{
+	int sz;
+
+	value = rte_cpu_to_le_32(value);
+	sz = nfp_cpp_write(cpp, cpp_id, address, &value, sizeof(value));
+
+	return (sz == sizeof(value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_readq(struct nfp_cpp *cpp, uint32_t cpp_id, unsigned long long address,
+	      uint64_t *value)
+{
+	int sz;
+	uint64_t tmp;
+
+	sz = nfp_cpp_read(cpp, cpp_id, address, &tmp, sizeof(tmp));
+	*value = rte_le_to_cpu_64(tmp);
+
+	return (sz == sizeof(*value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_writeq(struct nfp_cpp *cpp, uint32_t cpp_id, unsigned long long address,
+	       uint64_t value)
+{
+	int sz;
+
+	value = rte_cpu_to_le_64(value);
+	sz = nfp_cpp_write(cpp, cpp_id, address, &value, sizeof(value));
+
+	return (sz == sizeof(value)) ? 0 : -1;
+}
+
+int
+nfp_xpb_writel(struct nfp_cpp *cpp, uint32_t xpb_addr, uint32_t value)
+{
+	uint32_t cpp_dest;
+
+	cpp_dest = nfp_xpb_to_cpp(cpp, &xpb_addr);
+
+	return nfp_cpp_writel(cpp, cpp_dest, xpb_addr, value);
+}
+
+int
+nfp_xpb_readl(struct nfp_cpp *cpp, uint32_t xpb_addr, uint32_t *value)
+{
+	uint32_t cpp_dest;
+
+	cpp_dest = nfp_xpb_to_cpp(cpp, &xpb_addr);
+
+	return nfp_cpp_readl(cpp, cpp_dest, xpb_addr, value);
+}
+
+static struct nfp_cpp *
+nfp_cpp_alloc(const char *devname)
+{
+	const struct nfp_cpp_operations *ops;
+	struct nfp_cpp *cpp;
+	int err;
+
+	ops = nfp_cpp_transport_operations();
+
+	if (!ops || !ops->init)
+		return NFP_ERRPTR(EINVAL);
+
+	cpp = calloc(1, sizeof(*cpp));
+	if (!cpp)
+		return NULL;
+
+	cpp->op = ops;
+
+	if (cpp->op->init) {
+		err = cpp->op->init(cpp, devname);
+		if (err < 0) {
+			free(cpp);
+			return NULL;
+		}
+	}
+
+	if (NFP_CPP_MODEL_IS_6000(nfp_cpp_model(cpp))) {
+		uint32_t xpbaddr;
+		size_t tgt;
+
+		for (tgt = 0; tgt < ARRAY_SIZE(cpp->imb_cat_table); tgt++) {
+			/* Hardcoded XPB IMB Base, island 0 */
+			xpbaddr = 0x000a0000 + (tgt * 4);
+			err = nfp_xpb_readl(cpp, xpbaddr,
+				(uint32_t *)&cpp->imb_cat_table[tgt]);
+			if (err < 0) {
+				free(cpp);
+				return NULL;
+			}
+		}
+	}
+
+	return cpp;
+}
+
+/*
+ * nfp_cpp_free - free the CPP handle
+ * @cpp:    CPP handle
+ */
+void
+nfp_cpp_free(struct nfp_cpp *cpp)
+{
+	if (cpp->op && cpp->op->free)
+		cpp->op->free(cpp);
+
+	if (cpp->serial_len)
+		free(cpp->serial);
+
+	free(cpp);
+}
+
+struct nfp_cpp *
+nfp_cpp_from_device_name(const char *devname)
+{
+	return nfp_cpp_alloc(devname);
+}
+
+/*
+ * Modify bits of a 32-bit value from the XPB bus
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt       XPB target and address
+ * @param mask          mask of bits to alter
+ * @param value         value to modify
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int
+nfp_xpb_writelm(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t mask,
+		uint32_t value)
+{
+	int err;
+	uint32_t tmp;
+
+	err = nfp_xpb_readl(cpp, xpb_tgt, &tmp);
+	if (err < 0)
+		return err;
+
+	tmp &= ~mask;
+	tmp |= (mask & value);
+	return nfp_xpb_writel(cpp, xpb_tgt, tmp);
+}
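+
+/*
+ * Illustrative usage sketch (documentation only): updating a single field
+ * of an XPB register without disturbing the other bits.  The register
+ * address, field mask and field value are placeholders for this example.
+ *
+ *	if (nfp_xpb_writelm(cpp, ctrl_xpb_addr, field_mask, field_value) < 0)
+ *		return -1;
+ */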
+
+/*
+ * Wait for bits of a 32-bit value on the XPB bus to match a given value
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt       XPB target and address
+ * @param mask          mask of bits to alter
+ * @param value         value to monitor for
+ * @param timeout_us    maximum number of us to wait (-1 for forever)
+ *
+ * @return >= 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int
+nfp_xpb_waitlm(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t mask,
+	       uint32_t value, int timeout_us)
+{
+	uint32_t tmp;
+	int err;
+
+	do {
+		err = nfp_xpb_readl(cpp, xpb_tgt, &tmp);
+		if (err < 0)
+			goto exit;
+
+		if ((tmp & mask) == (value & mask)) {
+			if (timeout_us < 0)
+				timeout_us = 0;
+			break;
+		}
+
+		if (timeout_us < 0)
+			continue;
+
+		timeout_us -= 100;
+		usleep(100);
+	} while (timeout_us >= 0);
+
+	if (timeout_us < 0)
+		err = NFP_ERRNO(ETIMEDOUT);
+	else
+		err = timeout_us;
+
+exit:
+	return err;
+}
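+
+/*
+ * Illustrative usage sketch (documentation only): polling an XPB register
+ * until a status bit is set, with a 1 ms timeout.  The register address and
+ * bit mask are placeholders for this example.  A negative return indicates
+ * a timeout or a read error (errno is set accordingly).
+ *
+ *	if (nfp_xpb_waitlm(cpp, status_xpb_addr, ready_mask, ready_mask,
+ *			   1000) < 0)
+ *		return -1;
+ */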
+
+/*
+ * nfp_cpp_read - read from CPP target
+ * @cpp:        CPP handle
+ * @destination:    CPP id
+ * @address:        offset into CPP target
+ * @kernel_vaddr:   kernel buffer for result
+ * @length:     number of bytes to read
+ */
+int
+nfp_cpp_read(struct nfp_cpp *cpp, uint32_t destination,
+	     unsigned long long address, void *kernel_vaddr, size_t length)
+{
+	struct nfp_cpp_area *area;
+	int err;
+
+	area = nfp_cpp_area_alloc_acquire(cpp, destination, address, length);
+	if (!area) {
+		printf("Area allocation/acquire failed\n");
+		return -1;
+	}
+
+	err = nfp_cpp_area_read(area, 0, kernel_vaddr, length);
+
+	nfp_cpp_area_release_free(area);
+	return err;
+}
+
+/*
+ * nfp_cpp_write - write to CPP target
+ * @cpp:        CPP handle
+ * @destination:    CPP id
+ * @address:        offset into CPP target
+ * @kernel_vaddr:   kernel buffer to read from
+ * @length:     number of bytes to write
+ */
+int
+nfp_cpp_write(struct nfp_cpp *cpp, uint32_t destination,
+	      unsigned long long address, const void *kernel_vaddr,
+	      size_t length)
+{
+	struct nfp_cpp_area *area;
+	int err;
+
+	area = nfp_cpp_area_alloc_acquire(cpp, destination, address, length);
+	if (!area)
+		return -1;
+
+	err = nfp_cpp_area_write(area, 0, kernel_vaddr, length);
+
+	nfp_cpp_area_release_free(area);
+	return err;
+}
+
+/*
+ * nfp_cpp_area_fill - fill a CPP area with a value
+ * @area:       CPP area
+ * @offset:     offset into CPP area
+ * @value:      value to fill with
+ * @length:     length of area to fill
+ */
+int
+nfp_cpp_area_fill(struct nfp_cpp_area *area, unsigned long offset,
+		  uint32_t value, size_t length)
+{
+	int err;
+	size_t i;
+	uint64_t value64;
+
+	value = rte_cpu_to_le_32(value);
+	value64 = ((uint64_t)value << 32) | value;
+
+	if ((offset + length) > area->size)
+		return NFP_ERRNO(EINVAL);
+
+	if ((area->offset + offset) & 3)
+		return NFP_ERRNO(EINVAL);
+
+	if (((area->offset + offset) & 7) == 4 && length >= 4) {
+		err = nfp_cpp_area_write(area, offset, &value, sizeof(value));
+		if (err < 0)
+			return err;
+		if (err != sizeof(value))
+			return NFP_ERRNO(ENOSPC);
+		offset += sizeof(value);
+		length -= sizeof(value);
+	}
+
+	for (i = 0; (i + sizeof(value)) < length; i += sizeof(value64)) {
+		err =
+		    nfp_cpp_area_write(area, offset + i, &value64,
+				       sizeof(value64));
+		if (err < 0)
+			return err;
+		if (err != sizeof(value64))
+			return NFP_ERRNO(ENOSPC);
+	}
+
+	if ((i + sizeof(value)) <= length) {
+		err =
+		    nfp_cpp_area_write(area, offset + i, &value, sizeof(value));
+		if (err < 0)
+			return err;
+		if (err != sizeof(value))
+			return NFP_ERRNO(ENOSPC);
+		i += sizeof(value);
+	}
+
+	return (int)i;
+}
+
+/*
+ * NOTE: This code should not use nfp_xpb_* functions,
+ * as those are model-specific
+ */
+uint32_t
+__nfp_cpp_model_autodetect(struct nfp_cpp *cpp)
+{
+	uint32_t arm_id = NFP_CPP_ID(NFP_CPP_TARGET_ARM, 0, 0);
+	uint32_t model = 0;
+
+	nfp_cpp_readl(cpp, arm_id, NFP6000_ARM_GCSR_SOFTMODEL0, &model);
+
+	if (NFP_CPP_MODEL_IS_6000(model)) {
+		uint32_t tmp;
+
+		nfp_cpp_model_set(cpp, model);
+
+		/* The PL's PluDeviceID revision code is authoritative */
+		model &= ~0xff;
+		nfp_xpb_readl(cpp, NFP_XPB_DEVICE(1, 1, 16) +
+				   NFP_PL_DEVICE_ID, &tmp);
+		model |= (NFP_PL_DEVICE_ID_MASK & tmp) - 0x10;
+	}
+
+	return model;
+}
+
+/*
+ * nfp_cpp_map_area() - Helper function to map an area
+ * @cpp:    NFP CPP handler
+ * @domain: CPP domain
+ * @target: CPP target
+ * @addr:   CPP address
+ * @size:   Size of the area
+ * @area:   Area handle (output)
+ *
+ * Map an area of IOMEM access.  To undo the effect of this function call
+ * @nfp_cpp_area_release_free(*area).
+ *
+ * Return: Pointer to memory mapped area or ERR_PTR
+ */
+uint8_t *
+nfp_cpp_map_area(struct nfp_cpp *cpp, int domain, int target, uint64_t addr,
+		 unsigned long size, struct nfp_cpp_area **area)
+{
+	uint8_t *res;
+	uint32_t dest;
+
+	dest = NFP_CPP_ISLAND_ID(target, NFP_CPP_ACTION_RW, 0, domain);
+
+	*area = nfp_cpp_area_alloc_acquire(cpp, dest, addr, size);
+	if (!*area)
+		goto err_eio;
+
+	res = nfp_cpp_area_iomem(*area);
+	if (!res)
+		goto err_release_free;
+
+	return res;
+
+err_release_free:
+	nfp_cpp_area_release_free(*area);
+err_eio:
+	return NULL;
+}
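+
+/*
+ * Illustrative usage sketch (documentation only): mapping a window of NFP
+ * memory for direct access and releasing it afterwards.  The domain,
+ * target, address and size are placeholders for this example.
+ *
+ *	struct nfp_cpp_area *area;
+ *	uint8_t *base;
+ *
+ *	base = nfp_cpp_map_area(cpp, domain, target, addr, size, &area);
+ *	if (!base)
+ *		return -1;
+ *	... read or write through base[0 .. size - 1] ...
+ *	nfp_cpp_area_release_free(area);
+ */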
diff --git a/drivers/net/nfp/nfpcore/nfp_crc.c b/drivers/net/nfp/nfpcore/nfp_crc.c
new file mode 100644
index 0000000..20431bf
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_crc.c
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "nfp_crc.h"
+
+static inline uint32_t
+nfp_crc32_be_generic(uint32_t crc, unsigned char const *p, size_t len,
+		 uint32_t polynomial)
+{
+	int i;
+	while (len--) {
+		crc ^= *p++ << 24;
+		for (i = 0; i < 8; i++)
+			crc = (crc << 1) ^ ((crc & 0x80000000) ? polynomial :
+					  0);
+	}
+	return crc;
+}
+
+static inline uint32_t
+nfp_crc32_be(uint32_t crc, unsigned char const *p, size_t len)
+{
+	return nfp_crc32_be_generic(crc, p, len, CRCPOLY_BE);
+}
+
+static uint32_t
+nfp_crc32_posix_end(uint32_t crc, size_t total_len)
+{
+	/* Extend with the length of the string. */
+	while (total_len != 0) {
+		uint8_t c = total_len & 0xff;
+
+		crc = nfp_crc32_be(crc, &c, 1);
+		total_len >>= 8;
+	}
+
+	return ~crc;
+}
+
+uint32_t
+nfp_crc32_posix(const void *buff, size_t len)
+{
+	return nfp_crc32_posix_end(nfp_crc32_be(0, buff, len), len);
+}
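+
+/*
+ * Illustrative usage sketch (documentation only): nfp_crc32_posix()
+ * computes the POSIX cksum-style CRC-32 used by the HWInfo table, e.g.
+ * nfp_crc32_posix("a", 1) yields 0x48C279FE (see nfp_hwinfo.h).  The db,
+ * size and stored_crc variables are placeholders, mirroring the check in
+ * nfp_hwinfo.c.
+ *
+ *	uint32_t crc = nfp_crc32_posix(db, size);
+ *
+ *	if (crc != stored_crc)
+ *		return -EINVAL;
+ */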
diff --git a/drivers/net/nfp/nfpcore/nfp_crc.h b/drivers/net/nfp/nfpcore/nfp_crc.h
new file mode 100644
index 0000000..f99c89f
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_crc.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_CRC_H__
+#define __NFP_CRC_H__
+
+/*
+ * There are multiple 16-bit CRC polynomials in common use, but this is
+ * *the* standard CRC-32 polynomial, first popularized by Ethernet.
+ * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0
+ */
+#define CRCPOLY_LE 0xedb88320
+#define CRCPOLY_BE 0x04c11db7
+
+uint32_t nfp_crc32_posix(const void *buff, size_t len);
+
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_hwinfo.c b/drivers/net/nfp/nfpcore/nfp_hwinfo.c
new file mode 100644
index 0000000..c0516bf
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_hwinfo.c
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+/*
+ * Parse the hwinfo table that the ARM firmware builds in the ARM scratch SRAM
+ * after chip reset.
+ *
+ * Examples of the fields:
+ *   me.count = 40
+ *   me.mask = 0x7f_ffff_ffff
+ *
+ *   me.count is the total number of MEs on the system.
+ *   me.mask is the bitmask of MEs that are available for application usage.
+ *
+ *   (i.e., in this example, ME 39 has been reserved by boardconfig.)
+ */
+
+#include <stdio.h>
+#include <time.h>
+
+#include "nfp_cpp.h"
+#include "nfp6000/nfp6000.h"
+#include "nfp_resource.h"
+#include "nfp_hwinfo.h"
+#include "nfp_crc.h"
+
+static int
+nfp_hwinfo_is_updating(struct nfp_hwinfo *hwinfo)
+{
+	return hwinfo->version & NFP_HWINFO_VERSION_UPDATING;
+}
+
+static int
+nfp_hwinfo_db_walk(struct nfp_hwinfo *hwinfo, uint32_t size)
+{
+	const char *key, *val, *end = hwinfo->data + size;
+
+	for (key = hwinfo->data; *key && key < end;
+	     key = val + strlen(val) + 1) {
+		val = key + strlen(key) + 1;
+		if (val >= end) {
+			printf("Bad HWINFO - overflowing key\n");
+			return -EINVAL;
+		}
+
+		if (val + strlen(val) + 1 > end) {
+			printf("Bad HWINFO - overflowing value\n");
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static int
+nfp_hwinfo_db_validate(struct nfp_hwinfo *db, uint32_t len)
+{
+	uint32_t size, new_crc, *crc;
+
+	size = db->size;
+	if (size > len) {
+		printf("Unsupported hwinfo size %u > %u\n", size, len);
+		return -EINVAL;
+	}
+
+	size -= sizeof(uint32_t);
+	new_crc = nfp_crc32_posix((char *)db, size);
+	crc = (uint32_t *)(db->start + size);
+	if (new_crc != *crc) {
+		printf("Corrupt hwinfo table (CRC mismatch)\n");
+		printf("\tcalculated 0x%x, expected 0x%x\n", new_crc, *crc);
+		return -EINVAL;
+	}
+
+	return nfp_hwinfo_db_walk(db, size);
+}
+
+static struct nfp_hwinfo *
+nfp_hwinfo_try_fetch(struct nfp_cpp *cpp, size_t *cpp_size)
+{
+	struct nfp_hwinfo *header;
+	void *res;
+	uint64_t cpp_addr;
+	uint32_t cpp_id;
+	int err;
+	uint8_t *db;
+
+	res = nfp_resource_acquire(cpp, NFP_RESOURCE_NFP_HWINFO);
+	if (res) {
+		cpp_id = nfp_resource_cpp_id(res);
+		cpp_addr = nfp_resource_address(res);
+		*cpp_size = nfp_resource_size(res);
+
+		nfp_resource_release(res);
+
+		if (*cpp_size < HWINFO_SIZE_MIN)
+			return NULL;
+	} else {
+		return NULL;
+	}
+
+	db = malloc(*cpp_size + 1);
+	if (!db)
+		return NULL;
+
+	err = nfp_cpp_read(cpp, cpp_id, cpp_addr, db, *cpp_size);
+	if (err != (int)*cpp_size)
+		goto exit_free;
+
+	header = (void *)db;
+	printf("NFP HWINFO header: %08x\n", *(uint32_t *)header);
+	if (nfp_hwinfo_is_updating(header))
+		goto exit_free;
+
+	if (header->version != NFP_HWINFO_VERSION_2) {
+		printf("Unknown HWInfo version: 0x%08x\n",
+			header->version);
+		goto exit_free;
+	}
+
+	/* NULL-terminate for safety */
+	db[*cpp_size] = '\0';
+
+	return (void *)db;
+exit_free:
+	free(db);
+	return NULL;
+}
+
+static struct nfp_hwinfo *
+nfp_hwinfo_fetch(struct nfp_cpp *cpp, size_t *hwdb_size)
+{
+	struct timespec wait;
+	struct nfp_hwinfo *db;
+	int count;
+
+	wait.tv_sec = 0;
+	wait.tv_nsec = 10000000;
+	count = 0;
+
+	for (;;) {
+		db = nfp_hwinfo_try_fetch(cpp, hwdb_size);
+		if (db)
+			return db;
+
+		nanosleep(&wait, NULL);
+		if (count++ > 200) {
+			printf("NFP access error\n");
+			return NULL;
+		}
+	}
+}
+
+struct nfp_hwinfo *
+nfp_hwinfo_read(struct nfp_cpp *cpp)
+{
+	struct nfp_hwinfo *db;
+	size_t hwdb_size = 0;
+	int err;
+
+	db = nfp_hwinfo_fetch(cpp, &hwdb_size);
+	if (!db)
+		return NULL;
+
+	err = nfp_hwinfo_db_validate(db, hwdb_size);
+	if (err) {
+		free(db);
+		return NULL;
+	}
+	return db;
+}
+
+/*
+ * nfp_hwinfo_lookup() - Find a value in the HWInfo table by name
+ * @hwinfo:	NFP HWinfo table
+ * @lookup:	HWInfo name to search for
+ *
+ * Return: Value of the HWInfo name, or NULL
+ */
+const char *
+nfp_hwinfo_lookup(struct nfp_hwinfo *hwinfo, const char *lookup)
+{
+	const char *key, *val, *end;
+
+	if (!hwinfo || !lookup)
+		return NULL;
+
+	end = hwinfo->data + hwinfo->size - sizeof(uint32_t);
+
+	for (key = hwinfo->data; *key && key < end;
+	     key = val + strlen(val) + 1) {
+		val = key + strlen(key) + 1;
+
+		if (strcmp(key, lookup) == 0)
+			return val;
+	}
+
+	return NULL;
+}
diff --git a/drivers/net/nfp/nfpcore/nfp_hwinfo.h b/drivers/net/nfp/nfpcore/nfp_hwinfo.h
new file mode 100644
index 0000000..ccc6163
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_hwinfo.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_HWINFO_H__
+#define __NFP_HWINFO_H__
+
+#include <inttypes.h>
+
+#define HWINFO_SIZE_MIN	0x100
+
+/*
+ * The Hardware Info Table defines the properties of the system.
+ *
+ * HWInfo v1 Table (fixed size)
+ *
+ * 0x0000: uint32_t version	        Hardware Info Table version (1.0)
+ * 0x0004: uint32_t size	        Total size of the table, including the
+ *					CRC32 (IEEE 802.3)
+ * 0x0008: uint32_t jumptab	        Offset of key/value table
+ * 0x000c: uint32_t keys	        Total number of keys in the key/value
+ *					table
+ * NNNNNN:				Key/value jump table and string data
+ * (size - 4): uint32_t crc32	CRC32 (same as IEEE 802.3, POSIX csum, etc)
+ *				CRC32("",0) = ~0, CRC32("a",1) = 0x48C279FE
+ *
+ * HWInfo v2 Table (variable size)
+ *
+ * 0x0000: uint32_t version	        Hardware Info Table version (2.0)
+ * 0x0004: uint32_t size	        Current size of the data area, excluding
+ *					CRC32
+ * 0x0008: uint32_t limit	        Maximum size of the table
+ * 0x000c: uint32_t reserved	        Unused, set to zero
+ * NNNNNN:			Key/value data
+ * (size - 4): uint32_t crc32	CRC32 (same as IEEE 802.3, POSIX csum, etc)
+ *				CRC32("",0) = ~0, CRC32("a",1) = 0x48C279FE
+ *
+ * If the HWInfo table is in the process of being updated, the low bit of
+ * version will be set.
+ *
+ * HWInfo v1 Key/Value Table
+ * -------------------------
+ *
+ *  The key/value table is a set of offsets to ASCIIZ strings which have
+ *  been strcmp(3) sorted (yes, please use bsearch(3) on the table).
+ *
+ *  All keys are guaranteed to be unique.
+ *
+ * N+0:	uint32_t key_1		Offset to the first key
+ * N+4:	uint32_t val_1		Offset to the first value
+ * N+8: uint32_t key_2		Offset to the second key
+ * N+c: uint32_t val_2		Offset to the second value
+ * ...
+ *
+ * HWInfo v2 Key/Value Table
+ * -------------------------
+ *
+ * Packed UTF8Z strings, i.e. 'key1\000value1\000key2\000value2\000'
+ *
+ * Unsorted.
+ */
+
+#define NFP_HWINFO_VERSION_1 ('H' << 24 | 'I' << 16 | 1 << 8 | 0 << 1 | 0)
+#define NFP_HWINFO_VERSION_2 ('H' << 24 | 'I' << 16 | 2 << 8 | 0 << 1 | 0)
+#define NFP_HWINFO_VERSION_UPDATING	BIT(0)
+
+struct nfp_hwinfo {
+	uint8_t start[0];
+
+	uint32_t version;
+	uint32_t size;
+
+	/* v2 specific fields */
+	uint32_t limit;
+	uint32_t resv;
+
+	char data[];
+};
+
+struct nfp_hwinfo *nfp_hwinfo_read(struct nfp_cpp *cpp);
+
+const char *nfp_hwinfo_lookup(struct nfp_hwinfo *hwinfo, const char *lookup);
+
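+/*
+ * Illustrative usage sketch (documentation only): reading the table and
+ * looking up a key such as "me.count" (see nfp_hwinfo.c).  Releasing the
+ * table with free() is an assumption of this example.
+ *
+ *	struct nfp_hwinfo *hwinfo = nfp_hwinfo_read(cpp);
+ *	const char *me_count;
+ *
+ *	if (!hwinfo)
+ *		return -1;
+ *	me_count = nfp_hwinfo_lookup(hwinfo, "me.count");
+ *	if (me_count)
+ *		printf("ME count: %s\n", me_count);
+ *	free(hwinfo);
+ */
+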
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_mip.c b/drivers/net/nfp/nfpcore/nfp_mip.c
new file mode 100644
index 0000000..c86966d
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_mip.c
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <rte_byteorder.h>
+
+#include "nfp_cpp.h"
+#include "nfp_mip.h"
+#include "nfp_nffw.h"
+
+#define NFP_MIP_SIGNATURE	rte_cpu_to_le_32(0x0050494d)  /* "MIP\0" */
+#define NFP_MIP_VERSION		rte_cpu_to_le_32(1)
+#define NFP_MIP_MAX_OFFSET	(256 * 1024)
+
+struct nfp_mip {
+	uint32_t signature;
+	uint32_t mip_version;
+	uint32_t mip_size;
+	uint32_t first_entry;
+
+	uint32_t version;
+	uint32_t buildnum;
+	uint32_t buildtime;
+	uint32_t loadtime;
+
+	uint32_t symtab_addr;
+	uint32_t symtab_size;
+	uint32_t strtab_addr;
+	uint32_t strtab_size;
+
+	char name[16];
+	char toolchain[32];
+};
+
+/* Read memory and check if it could be a valid MIP */
+static int
+nfp_mip_try_read(struct nfp_cpp *cpp, uint32_t cpp_id, uint64_t addr,
+		 struct nfp_mip *mip)
+{
+	int ret;
+
+	ret = nfp_cpp_read(cpp, cpp_id, addr, mip, sizeof(*mip));
+	if (ret != sizeof(*mip)) {
+		printf("Failed to read MIP data (%d, %zu)\n",
+			ret, sizeof(*mip));
+		return -EIO;
+	}
+	if (mip->signature != NFP_MIP_SIGNATURE) {
+		printf("Incorrect MIP signature (0x%08x)\n",
+			 rte_le_to_cpu_32(mip->signature));
+		return -EINVAL;
+	}
+	if (mip->mip_version != NFP_MIP_VERSION) {
+		printf("Unsupported MIP version (%d)\n",
+			 rte_le_to_cpu_32(mip->mip_version));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Try to locate MIP using the resource table */
+static int
+nfp_mip_read_resource(struct nfp_cpp *cpp, struct nfp_mip *mip)
+{
+	struct nfp_nffw_info *nffw_info;
+	uint32_t cpp_id;
+	uint64_t addr;
+	int err;
+
+	nffw_info = nfp_nffw_info_open(cpp);
+	if (!nffw_info)
+		return -ENODEV;
+
+	err = nfp_nffw_info_mip_first(nffw_info, &cpp_id, &addr);
+	if (err)
+		goto exit_close_nffw;
+
+	err = nfp_mip_try_read(cpp, cpp_id, addr, mip);
+exit_close_nffw:
+	nfp_nffw_info_close(nffw_info);
+	return err;
+}
+
+/*
+ * nfp_mip_open() - Get device MIP structure
+ * @cpp:	NFP CPP Handle
+ *
+ * Copy MIP structure from NFP device and return it.  The returned
+ * structure is handled internally by the library and should be
+ * freed by calling nfp_mip_close().
+ *
+ * Return: pointer to mip, NULL on failure.
+ */
+struct nfp_mip *
+nfp_mip_open(struct nfp_cpp *cpp)
+{
+	struct nfp_mip *mip;
+	int err;
+
+	mip = malloc(sizeof(*mip));
+	if (!mip)
+		return NULL;
+
+	err = nfp_mip_read_resource(cpp, mip);
+	if (err) {
+		free(mip);
+		return NULL;
+	}
+
+	mip->name[sizeof(mip->name) - 1] = 0;
+
+	return mip;
+}
+
+void
+nfp_mip_close(struct nfp_mip *mip)
+{
+	free(mip);
+}
+
+const char *
+nfp_mip_name(const struct nfp_mip *mip)
+{
+	return mip->name;
+}
+
+/*
+ * nfp_mip_symtab() - Get the address and size of the MIP symbol table
+ * @mip:	MIP handle
+ * @addr:	Location for NFP DDR address of MIP symbol table
+ * @size:	Location for size of MIP symbol table
+ */
+void
+nfp_mip_symtab(const struct nfp_mip *mip, uint32_t *addr, uint32_t *size)
+{
+	*addr = rte_le_to_cpu_32(mip->symtab_addr);
+	*size = rte_le_to_cpu_32(mip->symtab_size);
+}
+
+/*
+ * nfp_mip_strtab() - Get the address and size of the MIP symbol name table
+ * @mip:	MIP handle
+ * @addr:	Location for NFP DDR address of MIP symbol name table
+ * @size:	Location for size of MIP symbol name table
+ */
+void
+nfp_mip_strtab(const struct nfp_mip *mip, uint32_t *addr, uint32_t *size)
+{
+	*addr = rte_le_to_cpu_32(mip->strtab_addr);
+	*size = rte_le_to_cpu_32(mip->strtab_size);
+}
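
As a usage illustration (not part of the patch), the MIP helpers above combine as follows; a valid struct nfp_cpp handle is assumed to come from the probe path:

#include <stdio.h>

#include "nfp_cpp.h"
#include "nfp_mip.h"

/* Open the MIP, print the firmware name and its symbol table location. */
static void
example_mip_dump(struct nfp_cpp *cpp)
{
	struct nfp_mip *mip;
	uint32_t addr, size;

	mip = nfp_mip_open(cpp);
	if (!mip)
		return;

	nfp_mip_symtab(mip, &addr, &size);
	printf("fw '%s': symtab at 0x%08x, %u bytes\n",
	       nfp_mip_name(mip), addr, size);

	nfp_mip_close(mip);
}
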
diff --git a/drivers/net/nfp/nfpcore/nfp_mip.h b/drivers/net/nfp/nfpcore/nfp_mip.h
new file mode 100644
index 0000000..d0919b5
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_mip.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_MIP_H__
+#define __NFP_MIP_H__
+
+#include "nfp_nffw.h"
+
+struct nfp_mip;
+
+struct nfp_mip *nfp_mip_open(struct nfp_cpp *cpp);
+void nfp_mip_close(struct nfp_mip *mip);
+
+const char *nfp_mip_name(const struct nfp_mip *mip);
+void nfp_mip_symtab(const struct nfp_mip *mip, uint32_t *addr, uint32_t *size);
+void nfp_mip_strtab(const struct nfp_mip *mip, uint32_t *addr, uint32_t *size);
+int nfp_nffw_info_mip_first(struct nfp_nffw_info *state, uint32_t *cpp_id,
+			    uint64_t *off);
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_mutex.c b/drivers/net/nfp/nfpcore/nfp_mutex.c
new file mode 100644
index 0000000..318c580
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_mutex.c
@@ -0,0 +1,424 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <errno.h>
+
+#include <malloc.h>
+#include <time.h>
+#include <sched.h>
+
+#include "nfp_cpp.h"
+#include "nfp6000/nfp6000.h"
+
+#define MUTEX_LOCKED(interface)  ((((uint32_t)(interface)) << 16) | 0x000f)
+#define MUTEX_UNLOCK(interface)  (0                               | 0x0000)
+
+#define MUTEX_IS_LOCKED(value)   (((value) & 0xffff) == 0x000f)
+#define MUTEX_IS_UNLOCKED(value) (((value) & 0xffff) == 0x0000)
+#define MUTEX_INTERFACE(value)   (((value) >> 16) & 0xffff)
+
+/*
+ * If you need more than 65536 recursive locks, please
+ * rethink your code.
+ */
+#define MUTEX_DEPTH_MAX         0xffff
+
+struct nfp_cpp_mutex {
+	struct nfp_cpp *cpp;
+	uint8_t target;
+	uint16_t depth;
+	unsigned long long address;
+	uint32_t key;
+	unsigned int usage;
+	struct nfp_cpp_mutex *prev, *next;
+};
+
+static int
+_nfp_cpp_mutex_validate(uint32_t model, int *target, unsigned long long address)
+{
+	/* Address must be 64-bit aligned */
+	if (address & 7)
+		return NFP_ERRNO(EINVAL);
+
+	if (NFP_CPP_MODEL_IS_6000(model)) {
+		if (*target != NFP_CPP_TARGET_MU)
+			return NFP_ERRNO(EINVAL);
+	} else {
+		return NFP_ERRNO(EINVAL);
+	}
+
+	return 0;
+}
+
+/*
+ * Initialize a mutex location
+ *
+ * The CPP target:address must point to a 64-bit aligned location, and
+ * will initialize 64 bits of data at the location.
+ *
+ * This creates the initial mutex state, as locked by this
+ * nfp_cpp_interface().
+ *
+ * This function should only be called when setting up
+ * the initial lock state upon boot-up of the system.
+ *
+ * @param mutex     NFP CPP Mutex handle
+ * @param target    NFP CPP target ID (ie NFP_CPP_TARGET_CLS or
+ *		    NFP_CPP_TARGET_MU)
+ * @param address   Offset into the address space of the NFP CPP target ID
+ * @param key       Unique 32-bit value for this mutex
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int
+nfp_cpp_mutex_init(struct nfp_cpp *cpp, int target, unsigned long long address,
+		   uint32_t key)
+{
+	uint32_t model = nfp_cpp_model(cpp);
+	uint32_t muw = NFP_CPP_ID(target, 4, 0);	/* atomic_write */
+	int err;
+
+	err = _nfp_cpp_mutex_validate(model, &target, address);
+	if (err < 0)
+		return err;
+
+	err = nfp_cpp_writel(cpp, muw, address + 4, key);
+	if (err < 0)
+		return err;
+
+	err = nfp_cpp_writel(cpp, muw, address + 0,
+			     MUTEX_LOCKED(nfp_cpp_interface(cpp)));

+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+/*
+ * Create a mutex handle from an address controlled by a MU Atomic engine
+ *
+ * The CPP target:address must point to a 64-bit aligned location, and
+ * reserve 64 bits of data at the location for use by the handle.
+ *
+ * Only target/address pairs that point to entities that support the
+ * MU Atomic Engine are supported.
+ *
+ * @param cpp       NFP CPP handle
+ * @param target    NFP CPP target ID (ie NFP_CPP_TARGET_CLS or
+ *		    NFP_CPP_TARGET_MU)
+ * @param address   Offset into the address space of the NFP CPP target ID
+ * @param key       32-bit unique key (must match the key at this location)
+ *
+ * @return      A non-NULL struct nfp_cpp_mutex * on success, NULL on failure.
+ */
+struct nfp_cpp_mutex *
+nfp_cpp_mutex_alloc(struct nfp_cpp *cpp, int target,
+		     unsigned long long address, uint32_t key)
+{
+	uint32_t model = nfp_cpp_model(cpp);
+	struct nfp_cpp_mutex *mutex;
+	uint32_t mur = NFP_CPP_ID(target, 3, 0);	/* atomic_read */
+	int err;
+	uint32_t tmp;
+
+	/* Look for cached mutex */
+	for (mutex = cpp->mutex_cache; mutex; mutex = mutex->next) {
+		if (mutex->target == target && mutex->address == address)
+			break;
+	}
+
+	if (mutex) {
+		if (mutex->key == key) {
+			mutex->usage++;
+			return mutex;
+		}
+
+		/* If the key doesn't match... */
+		return NFP_ERRPTR(EEXIST);
+	}
+
+	err = _nfp_cpp_mutex_validate(model, &target, address);
+	if (err < 0)
+		return NULL;
+
+	err = nfp_cpp_readl(cpp, mur, address + 4, &tmp);
+	if (err < 0)
+		return NULL;
+
+	if (tmp != key)
+		return NFP_ERRPTR(EEXIST);
+
+	mutex = calloc(sizeof(*mutex), 1);
+	if (!mutex)
+		return NFP_ERRPTR(ENOMEM);
+
+	mutex->cpp = cpp;
+	mutex->target = target;
+	mutex->address = address;
+	mutex->key = key;
+	mutex->depth = 0;
+	mutex->usage = 1;
+
+	/* Add mutex to the cache */
+	if (cpp->mutex_cache) {
+		cpp->mutex_cache->prev = mutex;
+		mutex->next = cpp->mutex_cache;
+		cpp->mutex_cache = mutex;
+	} else {
+		cpp->mutex_cache = mutex;
+	}
+
+	return mutex;
+}
+
+struct nfp_cpp *
+nfp_cpp_mutex_cpp(struct nfp_cpp_mutex *mutex)
+{
+	return mutex->cpp;
+}
+
+uint32_t
+nfp_cpp_mutex_key(struct nfp_cpp_mutex *mutex)
+{
+	return mutex->key;
+}
+
+uint16_t
+nfp_cpp_mutex_owner(struct nfp_cpp_mutex *mutex)
+{
+	uint32_t mur = NFP_CPP_ID(mutex->target, 3, 0);	/* atomic_read */
+	uint32_t value, key;
+	int err;
+
+	err = nfp_cpp_readl(mutex->cpp, mur, mutex->address, &value);
+	if (err < 0)
+		return err;
+
+	err = nfp_cpp_readl(mutex->cpp, mur, mutex->address + 4, &key);
+	if (err < 0)
+		return err;
+
+	if (key != mutex->key)
+		return NFP_ERRNO(EPERM);
+
+	if (!MUTEX_IS_LOCKED(value))
+		return 0;
+
+	return MUTEX_INTERFACE(value);
+}
+
+int
+nfp_cpp_mutex_target(struct nfp_cpp_mutex *mutex)
+{
+	return mutex->target;
+}
+
+uint64_t
+nfp_cpp_mutex_address(struct nfp_cpp_mutex *mutex)
+{
+	return mutex->address;
+}
+
+/*
+ * Free a mutex handle - does not alter the lock state
+ *
+ * @param mutex     NFP CPP Mutex handle
+ */
+void
+nfp_cpp_mutex_free(struct nfp_cpp_mutex *mutex)
+{
+	mutex->usage--;
+	if (mutex->usage > 0)
+		return;
+
+	/* Remove mutex from the cache */
+	if (mutex->next)
+		mutex->next->prev = mutex->prev;
+	if (mutex->prev)
+		mutex->prev->next = mutex->next;
+
+	/* If mutex->cpp == NULL, something broke */
+	if (mutex->cpp && mutex == mutex->cpp->mutex_cache)
+		mutex->cpp->mutex_cache = mutex->next;
+
+	free(mutex);
+}
+
+/*
+ * Lock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * @param mutex     NFP CPP Mutex handle
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int
+nfp_cpp_mutex_lock(struct nfp_cpp_mutex *mutex)
+{
+	int err;
+	time_t warn_at = time(NULL) + 15;
+
+	while ((err = nfp_cpp_mutex_trylock(mutex)) != 0) {
+		/* If errno != EBUSY, then the lock was damaged */
+		if (err < 0 && errno != EBUSY)
+			return err;
+		if (time(NULL) >= warn_at) {
+			printf("Warning: waiting for NFP mutex\n");
+			printf("\tusage:%u\n", mutex->usage);
+			printf("\tdepth:%hu\n", mutex->depth);
+			printf("\ttarget:%d\n", mutex->target);
+			printf("\taddr:%llx\n", mutex->address);
+			printf("\tkey:%08x\n", mutex->key);
+			warn_at = time(NULL) + 60;
+		}
+		sched_yield();
+	}
+	return 0;
+}
+
+/*
+ * Unlock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * @param mutex     NFP CPP Mutex handle
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int
+nfp_cpp_mutex_unlock(struct nfp_cpp_mutex *mutex)
+{
+	uint32_t muw = NFP_CPP_ID(mutex->target, 4, 0);	/* atomic_write */
+	uint32_t mur = NFP_CPP_ID(mutex->target, 3, 0);	/* atomic_read */
+	struct nfp_cpp *cpp = mutex->cpp;
+	uint32_t key, value;
+	uint16_t interface = nfp_cpp_interface(cpp);
+	int err;
+
+	if (mutex->depth > 1) {
+		mutex->depth--;
+		return 0;
+	}
+
+	err = nfp_cpp_readl(mutex->cpp, mur, mutex->address, &value);
+	if (err < 0)
+		goto exit;
+
+	err = nfp_cpp_readl(mutex->cpp, mur, mutex->address + 4, &key);
+	if (err < 0)
+		goto exit;
+
+	if (key != mutex->key) {
+		err = NFP_ERRNO(EPERM);
+		goto exit;
+	}
+
+	if (value != MUTEX_LOCKED(interface)) {
+		err = NFP_ERRNO(EACCES);
+		goto exit;
+	}
+
+	err = nfp_cpp_writel(cpp, muw, mutex->address, MUTEX_UNLOCK(interface));
+	if (err < 0)
+		goto exit;
+
+	mutex->depth = 0;
+
+exit:
+	return err;
+}
+
+/*
+ * Attempt to lock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * Valid lock states:
+ *
+ *      0x....0000      - Unlocked
+ *      0x....000f      - Locked
+ *
+ * @param mutex     NFP CPP Mutex handle
+ * @return      0 if the lock succeeded, -1 on failure (and errno set
+ *		appropriately).
+ */
+int
+nfp_cpp_mutex_trylock(struct nfp_cpp_mutex *mutex)
+{
+	uint32_t mur = NFP_CPP_ID(mutex->target, 3, 0);	/* atomic_read */
+	uint32_t muw = NFP_CPP_ID(mutex->target, 4, 0);	/* atomic_write */
+	uint32_t mus = NFP_CPP_ID(mutex->target, 5, 3);	/* test_set_imm */
+	uint32_t key, value, tmp;
+	struct nfp_cpp *cpp = mutex->cpp;
+	int err;
+
+	if (mutex->depth > 0) {
+		if (mutex->depth == MUTEX_DEPTH_MAX)
+			return NFP_ERRNO(E2BIG);
+
+		mutex->depth++;
+		return 0;
+	}
+
+	/* Verify that the lock marker is not damaged */
+	err = nfp_cpp_readl(cpp, mur, mutex->address + 4, &key);
+	if (err < 0)
+		goto exit;
+
+	if (key != mutex->key) {
+		err = NFP_ERRNO(EPERM);
+		goto exit;
+	}
+
+	/*
+	 * Compare against the unlocked state, and if true,
+	 * write the interface id into the top 16 bits, and
+	 * mark as locked.
+	 */
+	value = MUTEX_LOCKED(nfp_cpp_interface(cpp));
+
+	/*
+	 * We use test_set_imm here, as it implies a read
+	 * of the current state, and sets the bits in the
+	 * bytemask of the command to 1s. Since the mutex
+	 * is guaranteed to be 64-bit aligned, the bytemask
+	 * of this 32-bit command is ensured to be 8'b00001111,
+	 * which implies that the lower 4 bits will be set to
+	 * ones regardless of the initial state.
+	 *
+	 * Since this is a 'Readback' operation, with no Pull
+	 * data, we can treat this as a normal Push (read)
+	 * atomic, which returns the original value.
+	 */
+	err = nfp_cpp_readl(cpp, mus, mutex->address, &tmp);
+	if (err < 0)
+		goto exit;
+
+	/* Was it unlocked? */
+	if (MUTEX_IS_UNLOCKED(tmp)) {
+		/*
+		 * The read value can only be 0x....0000 in the unlocked state.
+		 * If there was another contending for this lock, then
+		 * the lock state would be 0x....000f
+		 *
+		 * Write our owner ID into the lock
+		 * While not strictly necessary, this helps with
+		 * debug and bookkeeping.
+		 */
+		err = nfp_cpp_writel(cpp, muw, mutex->address, value);
+		if (err < 0)
+			goto exit;
+
+		mutex->depth = 1;
+		goto exit;
+	}
+
+	/* Already locked by us? Success! */
+	if (tmp == value) {
+		mutex->depth = 1;
+		goto exit;
+	}
+
+	err = NFP_ERRNO(MUTEX_IS_LOCKED(tmp) ? EBUSY : EINVAL);
+
+exit:
+	return err;
+}
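
A hedged usage sketch for the mutex API above (illustrative, not part of the patch). It assumes the target location was initialised earlier with nfp_cpp_mutex_init(), and the address and key parameters are caller-supplied placeholders:

#include "nfp_cpp.h"
#include "nfp6000/nfp6000.h"

/* Take and release an MU-backed lock around a critical section. */
static int
example_with_mutex(struct nfp_cpp *cpp, unsigned long long addr, uint32_t key)
{
	struct nfp_cpp_mutex *mutex;
	int err;

	/* addr must be 64-bit aligned; key must match the value stored at
	 * addr + 4 by nfp_cpp_mutex_init().
	 */
	mutex = nfp_cpp_mutex_alloc(cpp, NFP_CPP_TARGET_MU, addr, key);
	if (!mutex)
		return -1;

	err = nfp_cpp_mutex_lock(mutex);	/* yields until the lock is owned */
	if (!err) {
		/* ... critical section ... */
		err = nfp_cpp_mutex_unlock(mutex);
	}

	nfp_cpp_mutex_free(mutex);	/* drops the handle, not the lock state */
	return err;
}
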
diff --git a/drivers/net/nfp/nfpcore/nfp_nffw.c b/drivers/net/nfp/nfpcore/nfp_nffw.c
new file mode 100644
index 0000000..8bec0e3
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nffw.c
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_cpp.h"
+#include "nfp_nffw.h"
+#include "nfp_mip.h"
+#include "nfp6000/nfp6000.h"
+#include "nfp_resource.h"
+
+/*
+ * flg_info_version = flags[0]<27:16>
+ * This is a small version counter intended only to detect if the current
+ * implementation can read the current struct. Struct changes should be very
+ * rare and as such a 12-bit counter should cover large spans of time. By the
+ * time it wraps around, we don't expect to have 4096 versions of this struct
+ * to be in use at the same time.
+ */
+static uint32_t
+nffw_res_info_version_get(const struct nfp_nffw_info_data *res)
+{
+	return (res->flags[0] >> 16) & 0xfff;
+}
+
+/* flg_init = flags[0]<0> */
+static uint32_t
+nffw_res_flg_init_get(const struct nfp_nffw_info_data *res)
+{
+	return (res->flags[0] >> 0) & 1;
+}
+
+/* loaded = loaded__mu_da__mip_off_hi<31:31> */
+static uint32_t
+nffw_fwinfo_loaded_get(const struct nffw_fwinfo *fi)
+{
+	return (fi->loaded__mu_da__mip_off_hi >> 31) & 1;
+}
+
+/* mip_cppid = mip_cppid */
+static uint32_t
+nffw_fwinfo_mip_cppid_get(const struct nffw_fwinfo *fi)
+{
+	return fi->mip_cppid;
+}
+
+/* mip_mu_da = loaded__mu_da__mip_off_hi<8:8> */
+static uint32_t
+nffw_fwinfo_mip_mu_da_get(const struct nffw_fwinfo *fi)
+{
+	return (fi->loaded__mu_da__mip_off_hi >> 8) & 1;
+}
+
+/* mip_offset = (loaded__mu_da__mip_off_hi<7:0> << 32) | mip_offset_lo */
+static uint64_t
+nffw_fwinfo_mip_offset_get(const struct nffw_fwinfo *fi)
+{
+	uint64_t mip_off_hi = fi->loaded__mu_da__mip_off_hi;
+
+	return (mip_off_hi & 0xFF) << 32 | fi->mip_offset_lo;
+}
+
+#define NFP_IMB_TGTADDRESSMODECFG_MODE_of(_x)		(((_x) >> 13) & 0x7)
+#define NFP_IMB_TGTADDRESSMODECFG_ADDRMODE		BIT(12)
+#define   NFP_IMB_TGTADDRESSMODECFG_ADDRMODE_32_BIT	0
+#define   NFP_IMB_TGTADDRESSMODECFG_ADDRMODE_40_BIT	BIT(12)
+
+static int
+nfp_mip_mu_locality_lsb(struct nfp_cpp *cpp)
+{
+	unsigned int mode, addr40;
+	uint32_t xpbaddr, imbcppat;
+	int err;
+
+	/* Hardcoded XPB IMB Base, island 0 */
+	xpbaddr = 0x000a0000 + NFP_CPP_TARGET_MU * 4;
+	err = nfp_xpb_readl(cpp, xpbaddr, &imbcppat);
+	if (err < 0)
+		return err;
+
+	mode = NFP_IMB_TGTADDRESSMODECFG_MODE_of(imbcppat);
+	addr40 = !!(imbcppat & NFP_IMB_TGTADDRESSMODECFG_ADDRMODE);
+
+	return nfp_cppat_mu_locality_lsb(mode, addr40);
+}
+
+static unsigned int
+nffw_res_fwinfos(struct nfp_nffw_info_data *fwinf, struct nffw_fwinfo **arr)
+{
+	/*
+	 * For this code, version 0 is most likely to actually be version 1.
+	 * Since the kernel driver does not take responsibility for
+	 * initialising the nfp.nffw resource, any previous code (CA firmware
+	 * or userspace) that left the version at 0 but did set the init flag
+	 * is going to be version 1.
+	 */
+	switch (nffw_res_info_version_get(fwinf)) {
+	case 0:
+	case 1:
+		*arr = &fwinf->info.v1.fwinfo[0];
+		return NFFW_FWINFO_CNT_V1;
+	case 2:
+		*arr = &fwinf->info.v2.fwinfo[0];
+		return NFFW_FWINFO_CNT_V2;
+	default:
+		*arr = NULL;
+		return 0;
+	}
+}
+
+/*
+ * nfp_nffw_info_open() - Acquire the lock on the NFFW table
+ * @cpp:	NFP CPP handle
+ *
+ * Return: NFFW info state pointer, or NULL on failure
+ */
+struct nfp_nffw_info *
+nfp_nffw_info_open(struct nfp_cpp *cpp)
+{
+	struct nfp_nffw_info_data *fwinf;
+	struct nfp_nffw_info *state;
+	uint32_t info_ver;
+	int err;
+
+	state = malloc(sizeof(*state));
+	if (!state)
+		return NULL;
+
+	memset(state, 0, sizeof(*state));
+
+	state->res = nfp_resource_acquire(cpp, NFP_RESOURCE_NFP_NFFW);
+	if (!state->res)
+		goto err_free;
+
+	fwinf = &state->fwinf;
+
+	if (sizeof(*fwinf) > nfp_resource_size(state->res))
+		goto err_release;
+
+	err = nfp_cpp_read(cpp, nfp_resource_cpp_id(state->res),
+			   nfp_resource_address(state->res),
+			   fwinf, sizeof(*fwinf));
+	if (err < (int)sizeof(*fwinf))
+		goto err_release;
+
+	if (!nffw_res_flg_init_get(fwinf))
+		goto err_release;
+
+	info_ver = nffw_res_info_version_get(fwinf);
+	if (info_ver > NFFW_INFO_VERSION_CURRENT)
+		goto err_release;
+
+	state->cpp = cpp;
+	return state;
+
+err_release:
+	nfp_resource_release(state->res);
+err_free:
+	free(state);
+	return NULL;
+}
+
+/*
+ * nfp_nffw_info_close() - Release the lock on the NFFW table
+ * @state:	NFP FW info state
+ */
+void
+nfp_nffw_info_close(struct nfp_nffw_info *state)
+{
+	nfp_resource_release(state->res);
+	free(state);
+}
+
+/*
+ * nfp_nffw_info_fwid_first() - Return the first firmware ID in the NFFW
+ * @state:	NFP FW info state
+ *
+ * Return: First NFFW firmware info, NULL on failure
+ */
+static struct nffw_fwinfo *
+nfp_nffw_info_fwid_first(struct nfp_nffw_info *state)
+{
+	struct nffw_fwinfo *fwinfo;
+	unsigned int cnt, i;
+
+	cnt = nffw_res_fwinfos(&state->fwinf, &fwinfo);
+	if (!cnt)
+		return NULL;
+
+	for (i = 0; i < cnt; i++)
+		if (nffw_fwinfo_loaded_get(&fwinfo[i]))
+			return &fwinfo[i];
+
+	return NULL;
+}
+
+/*
+ * nfp_nffw_info_mip_first() - Retrieve the location of the first FW's MIP
+ * @state:	NFP FW info state
+ * @cpp_id:	Pointer to the CPP ID of the MIP
+ * @off:	Pointer to the CPP Address of the MIP
+ *
+ * Return: 0, or -ERRNO
+ */
+int
+nfp_nffw_info_mip_first(struct nfp_nffw_info *state, uint32_t *cpp_id,
+			uint64_t *off)
+{
+	struct nffw_fwinfo *fwinfo;
+
+	fwinfo = nfp_nffw_info_fwid_first(state);
+	if (!fwinfo)
+		return -EINVAL;
+
+	*cpp_id = nffw_fwinfo_mip_cppid_get(fwinfo);
+	*off = nffw_fwinfo_mip_offset_get(fwinfo);
+
+	if (nffw_fwinfo_mip_mu_da_get(fwinfo)) {
+		int locality_off;
+
+		if (NFP_CPP_ID_TARGET_of(*cpp_id) != NFP_CPP_TARGET_MU)
+			return 0;
+
+		locality_off = nfp_mip_mu_locality_lsb(state->cpp);
+		if (locality_off < 0)
+			return locality_off;
+
+		*off &= ~(NFP_MU_ADDR_ACCESS_TYPE_MASK << locality_off);
+		*off |= NFP_MU_ADDR_ACCESS_TYPE_DIRECT << locality_off;
+	}
+
+	return 0;
+}
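
For illustration (not part of the patch), locating the MIP through the NFFW info mirrors the nfp_mip_read_resource() flow earlier in this series; a valid struct nfp_cpp handle is assumed:

#include <errno.h>

#include "nfp_cpp.h"
#include "nfp_nffw.h"
#include "nfp_mip.h"

/* Acquire the nfp.nffw resource, fetch the first MIP location, release. */
static int
example_find_mip(struct nfp_cpp *cpp, uint32_t *cpp_id, uint64_t *off)
{
	struct nfp_nffw_info *nffw;
	int err;

	nffw = nfp_nffw_info_open(cpp);
	if (!nffw)
		return -ENODEV;

	err = nfp_nffw_info_mip_first(nffw, cpp_id, off);

	nfp_nffw_info_close(nffw);	/* always release the resource lock */
	return err;
}
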
diff --git a/drivers/net/nfp/nfpcore/nfp_nffw.h b/drivers/net/nfp/nfpcore/nfp_nffw.h
new file mode 100644
index 0000000..3bbdf1c
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nffw.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFFW_H__
+#define __NFP_NFFW_H__
+
+#include "nfp-common/nfp_platform.h"
+#include "nfp_cpp.h"
+
+/*
+ * Init-CSR owner IDs for firmware map to firmware IDs which start at 4.
+ * Lower IDs are reserved for target and loader IDs.
+ */
+#define NFFW_FWID_EXT   3	/* For active MEs that we didn't load. */
+#define NFFW_FWID_BASE  4
+
+#define NFFW_FWID_ALL   255
+
+/**
+ * NFFW_INFO_VERSION history:
+ * 0: This was never actually used (before versioning), but it refers to
+ *    the previous struct which had FWINFO_CNT = MEINFO_CNT = 120 that later
+ *    changed to 200.
+ * 1: First versioned struct, with
+ *     FWINFO_CNT = 120
+ *     MEINFO_CNT = 120
+ * 2:  FWINFO_CNT = 200
+ *     MEINFO_CNT = 200
+ */
+#define NFFW_INFO_VERSION_CURRENT 2
+
+/* Enough for all current chip families */
+#define NFFW_MEINFO_CNT_V1 120
+#define NFFW_FWINFO_CNT_V1 120
+#define NFFW_MEINFO_CNT_V2 200
+#define NFFW_FWINFO_CNT_V2 200
+
+struct nffw_meinfo {
+	uint32_t ctxmask__fwid__meid;
+};
+
+struct nffw_fwinfo {
+	uint32_t loaded__mu_da__mip_off_hi;
+	uint32_t mip_cppid; /* 0 means no MIP */
+	uint32_t mip_offset_lo;
+};
+
+struct nfp_nffw_info_v1 {
+	struct nffw_meinfo meinfo[NFFW_MEINFO_CNT_V1];
+	struct nffw_fwinfo fwinfo[NFFW_FWINFO_CNT_V1];
+};
+
+struct nfp_nffw_info_v2 {
+	struct nffw_meinfo meinfo[NFFW_MEINFO_CNT_V2];
+	struct nffw_fwinfo fwinfo[NFFW_FWINFO_CNT_V2];
+};
+
+struct nfp_nffw_info_data {
+	uint32_t flags[2];
+	union {
+		struct nfp_nffw_info_v1 v1;
+		struct nfp_nffw_info_v2 v2;
+	} info;
+};
+
+struct nfp_nffw_info {
+	struct nfp_cpp *cpp;
+	struct nfp_resource *res;
+
+	struct nfp_nffw_info_data fwinf;
+};
+
+struct nfp_nffw_info *nfp_nffw_info_open(struct nfp_cpp *cpp);
+void nfp_nffw_info_close(struct nfp_nffw_info *state);
+
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_nsp.c b/drivers/net/nfp/nfpcore/nfp_nsp.c
new file mode 100644
index 0000000..876a401
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nsp.c
@@ -0,0 +1,427 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#define NFP_SUBSYS "nfp_nsp"
+
+#include <stdio.h>
+#include <time.h>
+
+#include <rte_common.h>
+
+#include "nfp_cpp.h"
+#include "nfp_nsp.h"
+#include "nfp_resource.h"
+
+int
+nfp_nsp_config_modified(struct nfp_nsp *state)
+{
+	return state->modified;
+}
+
+void
+nfp_nsp_config_set_modified(struct nfp_nsp *state, int modified)
+{
+	state->modified = modified;
+}
+
+void *
+nfp_nsp_config_entries(struct nfp_nsp *state)
+{
+	return state->entries;
+}
+
+unsigned int
+nfp_nsp_config_idx(struct nfp_nsp *state)
+{
+	return state->idx;
+}
+
+void
+nfp_nsp_config_set_state(struct nfp_nsp *state, void *entries, unsigned int idx)
+{
+	state->entries = entries;
+	state->idx = idx;
+}
+
+void
+nfp_nsp_config_clear_state(struct nfp_nsp *state)
+{
+	state->entries = NULL;
+	state->idx = 0;
+}
+
+static void
+nfp_nsp_print_extended_error(uint32_t ret_val)
+{
+	int i;
+
+	if (!ret_val)
+		return;
+
+	for (i = 0; i < (int)ARRAY_SIZE(nsp_errors); i++)
+		if (ret_val == (uint32_t)nsp_errors[i].code)
+			printf("err msg: %s\n", nsp_errors[i].msg);
+}
+
+static int
+nfp_nsp_check(struct nfp_nsp *state)
+{
+	struct nfp_cpp *cpp = state->cpp;
+	uint64_t nsp_status, reg;
+	uint32_t nsp_cpp;
+	int err;
+
+	nsp_cpp = nfp_resource_cpp_id(state->res);
+	nsp_status = nfp_resource_address(state->res) + NSP_STATUS;
+
+	err = nfp_cpp_readq(cpp, nsp_cpp, nsp_status, &reg);
+	if (err < 0)
+		return err;
+
+	if (FIELD_GET(NSP_STATUS_MAGIC, reg) != NSP_MAGIC) {
+		printf("Cannot detect NFP Service Processor\n");
+		return -ENODEV;
+	}
+
+	state->ver.major = FIELD_GET(NSP_STATUS_MAJOR, reg);
+	state->ver.minor = FIELD_GET(NSP_STATUS_MINOR, reg);
+
+	if (state->ver.major != NSP_MAJOR || state->ver.minor < NSP_MINOR) {
+		printf("Unsupported ABI %hu.%hu\n", state->ver.major,
+						    state->ver.minor);
+		return -EINVAL;
+	}
+
+	if (reg & NSP_STATUS_BUSY) {
+		printf("Service processor busy!\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/*
+ * nfp_nsp_open() - Prepare for communication and lock the NSP resource.
+ * @cpp:	NFP CPP Handle
+ */
+struct nfp_nsp *
+nfp_nsp_open(struct nfp_cpp *cpp)
+{
+	struct nfp_resource *res;
+	struct nfp_nsp *state;
+	int err;
+
+	res = nfp_resource_acquire(cpp, NFP_RESOURCE_NSP);
+	if (!res)
+		return NULL;
+
+	state = malloc(sizeof(*state));
+	if (!state) {
+		nfp_resource_release(res);
+		return NULL;
+	}
+	memset(state, 0, sizeof(*state));
+	state->cpp = cpp;
+	state->res = res;
+
+	err = nfp_nsp_check(state);
+	if (err) {
+		nfp_nsp_close(state);
+		return NULL;
+	}
+
+	return state;
+}
+
+/*
+ * nfp_nsp_close() - Clean up and unlock the NSP resource.
+ * @state:	NFP SP state
+ */
+void
+nfp_nsp_close(struct nfp_nsp *state)
+{
+	nfp_resource_release(state->res);
+	free(state);
+}
+
+uint16_t
+nfp_nsp_get_abi_ver_major(struct nfp_nsp *state)
+{
+	return state->ver.major;
+}
+
+uint16_t
+nfp_nsp_get_abi_ver_minor(struct nfp_nsp *state)
+{
+	return state->ver.minor;
+}
+
+static int
+nfp_nsp_wait_reg(struct nfp_cpp *cpp, uint64_t *reg, uint32_t nsp_cpp,
+		 uint64_t addr, uint64_t mask, uint64_t val)
+{
+	struct timespec wait;
+	int count;
+	int err;
+
+	wait.tv_sec = 0;
+	wait.tv_nsec = 25000000;
+	count = 0;
+
+	for (;;) {
+		err = nfp_cpp_readq(cpp, nsp_cpp, addr, reg);
+		if (err < 0)
+			return err;
+
+		if ((*reg & mask) == val)
+			return 0;
+
+		nanosleep(&wait, 0);
+		if (count++ > 1000)
+			return -ETIMEDOUT;
+	}
+}
+
+/*
+ * nfp_nsp_command() - Execute a command on the NFP Service Processor
+ * @state:	NFP SP state
+ * @code:	NFP SP Command Code
+ * @option:	NFP SP Command Argument
+ * @buff_cpp:	NFP SP Buffer CPP Address info
+ * @buff_addr:	NFP SP Buffer Host address
+ *
+ * Return: 0 for success with no result
+ *
+ *	 positive value for NSP completion with a result code
+ *
+ *	-EAGAIN if the NSP is not yet present
+ *	-ENODEV if the NSP is not a supported model
+ *	-EBUSY if the NSP is stuck
+ *	-EINTR if interrupted while waiting for completion
+ *	-ETIMEDOUT if the NSP took longer than 30 seconds to complete
+ */
+static int
+nfp_nsp_command(struct nfp_nsp *state, uint16_t code, uint32_t option,
+		uint32_t buff_cpp, uint64_t buff_addr)
+{
+	uint64_t reg, ret_val, nsp_base, nsp_buffer, nsp_status, nsp_command;
+	struct nfp_cpp *cpp = state->cpp;
+	uint32_t nsp_cpp;
+	int err;
+
+	nsp_cpp = nfp_resource_cpp_id(state->res);
+	nsp_base = nfp_resource_address(state->res);
+	nsp_status = nsp_base + NSP_STATUS;
+	nsp_command = nsp_base + NSP_COMMAND;
+	nsp_buffer = nsp_base + NSP_BUFFER;
+
+	err = nfp_nsp_check(state);
+	if (err)
+		return err;
+
+	if (!FIELD_FIT(NSP_BUFFER_CPP, buff_cpp >> 8) ||
+	    !FIELD_FIT(NSP_BUFFER_ADDRESS, buff_addr)) {
+		printf("Host buffer out of reach %08x %" PRIx64 "\n",
+			buff_cpp, buff_addr);
+		return -EINVAL;
+	}
+
+	err = nfp_cpp_writeq(cpp, nsp_cpp, nsp_buffer,
+			     FIELD_PREP(NSP_BUFFER_CPP, buff_cpp >> 8) |
+			     FIELD_PREP(NSP_BUFFER_ADDRESS, buff_addr));
+	if (err < 0)
+		return err;
+
+	err = nfp_cpp_writeq(cpp, nsp_cpp, nsp_command,
+			     FIELD_PREP(NSP_COMMAND_OPTION, option) |
+			     FIELD_PREP(NSP_COMMAND_CODE, code) |
+			     FIELD_PREP(NSP_COMMAND_START, 1));
+	if (err < 0)
+		return err;
+
+	/* Wait for NSP_COMMAND_START to go to 0 */
+	err = nfp_nsp_wait_reg(cpp, &reg, nsp_cpp, nsp_command,
+			       NSP_COMMAND_START, 0);
+	if (err) {
+		printf("Error %d waiting for code 0x%04x to start\n",
+			err, code);
+		return err;
+	}
+
+	/* Wait for NSP_STATUS_BUSY to go to 0 */
+	err = nfp_nsp_wait_reg(cpp, &reg, nsp_cpp, nsp_status, NSP_STATUS_BUSY,
+			       0);
+	if (err) {
+		printf("Error %d waiting for code 0x%04x to complete\n",
+			err, code);
+		return err;
+	}
+
+	err = nfp_cpp_readq(cpp, nsp_cpp, nsp_command, &ret_val);
+	if (err < 0)
+		return err;
+	ret_val = FIELD_GET(NSP_COMMAND_OPTION, ret_val);
+
+	err = FIELD_GET(NSP_STATUS_RESULT, reg);
+	if (err) {
+		printf("Result (error) code set: %d (%d) command: %d\n",
+			 -err, (int)ret_val, code);
+		nfp_nsp_print_extended_error(ret_val);
+		return -err;
+	}
+
+	return ret_val;
+}
+
+#define SZ_1M 0x00100000
+
+static int
+nfp_nsp_command_buf(struct nfp_nsp *nsp, uint16_t code, uint32_t option,
+		    const void *in_buf, unsigned int in_size, void *out_buf,
+		    unsigned int out_size)
+{
+	struct nfp_cpp *cpp = nsp->cpp;
+	unsigned int max_size;
+	uint64_t reg, cpp_buf;
+	int ret, err;
+	uint32_t cpp_id;
+
+	if (nsp->ver.minor < 13) {
+		printf("NSP: Code 0x%04x with buffer not supported\n", code);
+		printf("\t(ABI %hu.%hu)\n", nsp->ver.major, nsp->ver.minor);
+		return -EOPNOTSUPP;
+	}
+
+	err = nfp_cpp_readq(cpp, nfp_resource_cpp_id(nsp->res),
+			    nfp_resource_address(nsp->res) +
+			    NSP_DFLT_BUFFER_CONFIG,
+			    &reg);
+	if (err < 0)
+		return err;
+
+	max_size = RTE_MAX(in_size, out_size);
+	if (FIELD_GET(NSP_DFLT_BUFFER_SIZE_MB, reg) * SZ_1M < max_size) {
+		printf("NSP: default buffer too small for command 0x%04x\n",
+		       code);
+		printf("\t(%llu < %u)\n",
+		       FIELD_GET(NSP_DFLT_BUFFER_SIZE_MB, reg) * SZ_1M,
+		       max_size);
+		return -EINVAL;
+	}
+
+	err = nfp_cpp_readq(cpp, nfp_resource_cpp_id(nsp->res),
+			    nfp_resource_address(nsp->res) +
+			    NSP_DFLT_BUFFER,
+			    &reg);
+	if (err < 0)
+		return err;
+
+	cpp_id = FIELD_GET(NSP_BUFFER_CPP, reg) << 8;
+	cpp_buf = FIELD_GET(NSP_BUFFER_ADDRESS, reg);
+
+	if (in_buf && in_size) {
+		err = nfp_cpp_write(cpp, cpp_id, cpp_buf, in_buf, in_size);
+		if (err < 0)
+			return err;
+	}
+	/* Zero out remaining part of the buffer */
+	if (out_buf && out_size && out_size > in_size) {
+		memset(out_buf, 0, out_size - in_size);
+		err = nfp_cpp_write(cpp, cpp_id, cpp_buf + in_size, out_buf,
+				    out_size - in_size);
+		if (err < 0)
+			return err;
+	}
+
+	ret = nfp_nsp_command(nsp, code, option, cpp_id, cpp_buf);
+	if (ret < 0)
+		return ret;
+
+	if (out_buf && out_size) {
+		err = nfp_cpp_read(cpp, cpp_id, cpp_buf, out_buf, out_size);
+		if (err < 0)
+			return err;
+	}
+
+	return ret;
+}
+
+int
+nfp_nsp_wait(struct nfp_nsp *state)
+{
+	struct timespec wait;
+	int count;
+	int err;
+
+	wait.tv_sec = 0;
+	wait.tv_nsec = 25000000;
+	count = 0;
+
+	for (;;) {
+		err = nfp_nsp_command(state, SPCODE_NOOP, 0, 0, 0);
+		if (err != -EAGAIN)
+			break;
+
+		nanosleep(&wait, 0);
+
+		if (count++ > 1000) {
+			err = -ETIMEDOUT;
+			break;
+		}
+	}
+	if (err)
+		printf("NSP failed to respond %d\n", err);
+
+	return err;
+}
+
+int
+nfp_nsp_device_soft_reset(struct nfp_nsp *state)
+{
+	return nfp_nsp_command(state, SPCODE_SOFT_RESET, 0, 0, 0);
+}
+
+int
+nfp_nsp_mac_reinit(struct nfp_nsp *state)
+{
+	return nfp_nsp_command(state, SPCODE_MAC_INIT, 0, 0, 0);
+}
+
+int
+nfp_nsp_load_fw(struct nfp_nsp *state, void *buf, unsigned int size)
+{
+	return nfp_nsp_command_buf(state, SPCODE_FW_LOAD, size, buf, size,
+				   NULL, 0);
+}
+
+int
+nfp_nsp_read_eth_table(struct nfp_nsp *state, void *buf, unsigned int size)
+{
+	return nfp_nsp_command_buf(state, SPCODE_ETH_RESCAN, size, NULL, 0,
+				   buf, size);
+}
+
+int
+nfp_nsp_write_eth_table(struct nfp_nsp *state, const void *buf,
+			unsigned int size)
+{
+	return nfp_nsp_command_buf(state, SPCODE_ETH_CONTROL, size, buf, size,
+				   NULL, 0);
+}
+
+int
+nfp_nsp_read_identify(struct nfp_nsp *state, void *buf, unsigned int size)
+{
+	return nfp_nsp_command_buf(state, SPCODE_NSP_IDENTIFY, size, NULL, 0,
+				   buf, size);
+}
+
+int
+nfp_nsp_read_sensors(struct nfp_nsp *state, unsigned int sensor_mask, void *buf,
+		     unsigned int size)
+{
+	return nfp_nsp_command_buf(state, SPCODE_NSP_SENSORS, sensor_mask, NULL,
+				   0, buf, size);
+}
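
As a hedged sketch of how the NSP command wrappers above chain together for a firmware load (the exact sequence used by the PMD probe path is outside this hunk); cpp, fw_buf and fw_size are caller-supplied:

#include <errno.h>

#include "nfp_cpp.h"
#include "nfp_nsp.h"

/* Reset the device and push a firmware image through the NSP. */
static int
example_load_fw(struct nfp_cpp *cpp, void *fw_buf, unsigned int fw_size)
{
	struct nfp_nsp *nsp;
	int err;

	nsp = nfp_nsp_open(cpp);	/* locks the NSP resource */
	if (!nsp)
		return -EIO;

	err = nfp_nsp_wait(nsp);	/* SPCODE_NOOP until the NSP answers */
	if (!err)
		err = nfp_nsp_device_soft_reset(nsp);
	if (!err)
		err = nfp_nsp_load_fw(nsp, fw_buf, fw_size);

	nfp_nsp_close(nsp);	/* unlocks the NSP resource */
	return err;
}
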
diff --git a/drivers/net/nfp/nfpcore/nfp_nsp.h b/drivers/net/nfp/nfpcore/nfp_nsp.h
new file mode 100644
index 0000000..c9c7b0d
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nsp.h
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef NSP_NSP_H
+#define NSP_NSP_H 1
+
+#include "nfp_cpp.h"
+#include "nfp_nsp.h"
+
+#define GENMASK_ULL(h, l) \
+	(((~0ULL) - (1ULL << (l)) + 1) & \
+	 (~0ULL >> (64 - 1 - (h))))
+
+#define __bf_shf(x) (__builtin_ffsll(x) - 1)
+
+#define FIELD_GET(_mask, _reg)	\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		(typeof(_x))(((_reg) & (_x)) >> __bf_shf(_x));	\
+	}))
+
+#define FIELD_FIT(_mask, _val)						\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		!((((typeof(_x))_val) << __bf_shf(_x)) & ~(_x)); \
+	}))
+
+#define FIELD_PREP(_mask, _val)						\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		((typeof(_x))(_val) << __bf_shf(_x)) & (_x);	\
+	}))
+
+/* Offsets relative to the CSR base */
+#define NSP_STATUS		0x00
+#define   NSP_STATUS_MAGIC	GENMASK_ULL(63, 48)
+#define   NSP_STATUS_MAJOR	GENMASK_ULL(47, 44)
+#define   NSP_STATUS_MINOR	GENMASK_ULL(43, 32)
+#define   NSP_STATUS_CODE	GENMASK_ULL(31, 16)
+#define   NSP_STATUS_RESULT	GENMASK_ULL(15, 8)
+#define   NSP_STATUS_BUSY	BIT_ULL(0)
+
+#define NSP_COMMAND		0x08
+#define   NSP_COMMAND_OPTION	GENMASK_ULL(63, 32)
+#define   NSP_COMMAND_CODE	GENMASK_ULL(31, 16)
+#define   NSP_COMMAND_START	BIT_ULL(0)
+
+/* CPP address to retrieve the data from */
+#define NSP_BUFFER		0x10
+#define   NSP_BUFFER_CPP	GENMASK_ULL(63, 40)
+#define   NSP_BUFFER_PCIE	GENMASK_ULL(39, 38)
+#define   NSP_BUFFER_ADDRESS	GENMASK_ULL(37, 0)
+
+#define NSP_DFLT_BUFFER		0x18
+
+#define NSP_DFLT_BUFFER_CONFIG	0x20
+#define   NSP_DFLT_BUFFER_SIZE_MB	GENMASK_ULL(7, 0)
+
+#define NSP_MAGIC		0xab10
+#define NSP_MAJOR		0
+#define NSP_MINOR		8
+
+#define NSP_CODE_MAJOR		GENMASK(15, 12)
+#define NSP_CODE_MINOR		GENMASK(11, 0)
+
+enum nfp_nsp_cmd {
+	SPCODE_NOOP		= 0, /* No operation */
+	SPCODE_SOFT_RESET	= 1, /* Soft reset the NFP */
+	SPCODE_FW_DEFAULT	= 2, /* Load default (UNDI) FW */
+	SPCODE_PHY_INIT		= 3, /* Initialize the PHY */
+	SPCODE_MAC_INIT		= 4, /* Initialize the MAC */
+	SPCODE_PHY_RXADAPT	= 5, /* Re-run PHY RX Adaptation */
+	SPCODE_FW_LOAD		= 6, /* Load fw from buffer, len in option */
+	SPCODE_ETH_RESCAN	= 7, /* Rescan ETHs, write ETH_TABLE to buf */
+	SPCODE_ETH_CONTROL	= 8, /* Update media config from buffer */
+	SPCODE_NSP_SENSORS	= 12, /* Read NSP sensor(s) */
+	SPCODE_NSP_IDENTIFY	= 13, /* Read NSP version */
+};
+
+static const struct {
+	int code;
+	const char *msg;
+} nsp_errors[] = {
+	{ 6010, "could not map to phy for port" },
+	{ 6011, "not an allowed rate/lanes for port" },
+	{ 6012, "not an allowed rate/lanes for port" },
+	{ 6013, "high/low error, change other port first" },
+	{ 6014, "config not found in flash" },
+};
+
+struct nfp_nsp {
+	struct nfp_cpp *cpp;
+	struct nfp_resource *res;
+	struct {
+		uint16_t major;
+		uint16_t minor;
+	} ver;
+
+	/* Eth table config state */
+	int modified;
+	unsigned int idx;
+	void *entries;
+};
+
+struct nfp_nsp *nfp_nsp_open(struct nfp_cpp *cpp);
+void nfp_nsp_close(struct nfp_nsp *state);
+uint16_t nfp_nsp_get_abi_ver_major(struct nfp_nsp *state);
+uint16_t nfp_nsp_get_abi_ver_minor(struct nfp_nsp *state);
+int nfp_nsp_wait(struct nfp_nsp *state);
+int nfp_nsp_device_soft_reset(struct nfp_nsp *state);
+int nfp_nsp_load_fw(struct nfp_nsp *state, void *buf, unsigned int size);
+int nfp_nsp_mac_reinit(struct nfp_nsp *state);
+int nfp_nsp_read_identify(struct nfp_nsp *state, void *buf, unsigned int size);
+int nfp_nsp_read_sensors(struct nfp_nsp *state, unsigned int sensor_mask,
+			 void *buf, unsigned int size);
+
+static inline int nfp_nsp_has_mac_reinit(struct nfp_nsp *state)
+{
+	return nfp_nsp_get_abi_ver_minor(state) > 20;
+}
+
+enum nfp_eth_interface {
+	NFP_INTERFACE_NONE	= 0,
+	NFP_INTERFACE_SFP	= 1,
+	NFP_INTERFACE_SFPP	= 10,
+	NFP_INTERFACE_SFP28	= 28,
+	NFP_INTERFACE_QSFP	= 40,
+	NFP_INTERFACE_CXP	= 100,
+	NFP_INTERFACE_QSFP28	= 112,
+};
+
+enum nfp_eth_media {
+	NFP_MEDIA_DAC_PASSIVE = 0,
+	NFP_MEDIA_DAC_ACTIVE,
+	NFP_MEDIA_FIBRE,
+};
+
+enum nfp_eth_aneg {
+	NFP_ANEG_AUTO = 0,
+	NFP_ANEG_SEARCH,
+	NFP_ANEG_25G_CONSORTIUM,
+	NFP_ANEG_25G_IEEE,
+	NFP_ANEG_DISABLED,
+};
+
+enum nfp_eth_fec {
+	NFP_FEC_AUTO_BIT = 0,
+	NFP_FEC_BASER_BIT,
+	NFP_FEC_REED_SOLOMON_BIT,
+	NFP_FEC_DISABLED_BIT,
+};
+
+#define NFP_FEC_AUTO		BIT(NFP_FEC_AUTO_BIT)
+#define NFP_FEC_BASER		BIT(NFP_FEC_BASER_BIT)
+#define NFP_FEC_REED_SOLOMON	BIT(NFP_FEC_REED_SOLOMON_BIT)
+#define NFP_FEC_DISABLED	BIT(NFP_FEC_DISABLED_BIT)
+
+#define ETH_ALEN	6
+
+/**
+ * struct nfp_eth_table - ETH table information
+ * @count:	number of table entries
+ * @max_index:	max of @index fields of all @ports
+ * @ports:	table of ports
+ *
+ * @eth_index:	port index according to legacy ethX numbering
+ * @index:	chip-wide first channel index
+ * @nbi:	NBI index
+ * @base:	first channel index (within NBI)
+ * @lanes:	number of channels
+ * @speed:	interface speed (in Mbps)
+ * @interface:	interface (module) plugged in
+ * @media:	media type of the @interface
+ * @fec:	forward error correction mode
+ * @aneg:	auto negotiation mode
+ * @mac_addr:	interface MAC address
+ * @label_port:	port id
+ * @label_subport:  id of interface within port (for split ports)
+ * @enabled:	is enabled?
+ * @tx_enabled:	is TX enabled?
+ * @rx_enabled:	is RX enabled?
+ * @override_changed: is media reconfig pending?
+ *
+ * @port_type:	one of %PORT_* defines for ethtool
+ * @port_lanes:	total number of lanes on the port (sum of lanes of all subports)
+ * @is_split:	is interface part of a split port
+ * @fec_modes_supported:	bitmap of FEC modes supported
+ */
+struct nfp_eth_table {
+	unsigned int count;
+	unsigned int max_index;
+	struct nfp_eth_table_port {
+		unsigned int eth_index;
+		unsigned int index;
+		unsigned int nbi;
+		unsigned int base;
+		unsigned int lanes;
+		unsigned int speed;
+
+		unsigned int interface;
+		enum nfp_eth_media media;
+
+		enum nfp_eth_fec fec;
+		enum nfp_eth_aneg aneg;
+
+		uint8_t mac_addr[ETH_ALEN];
+
+		uint8_t label_port;
+		uint8_t label_subport;
+
+		int enabled;
+		int tx_enabled;
+		int rx_enabled;
+
+		int override_changed;
+
+		/* Computed fields */
+		uint8_t port_type;
+
+		unsigned int port_lanes;
+
+		int is_split;
+
+		unsigned int fec_modes_supported;
+	} ports[0];
+};
+
+struct nfp_eth_table *nfp_eth_read_ports(struct nfp_cpp *cpp);
+
+int nfp_eth_set_mod_enable(struct nfp_cpp *cpp, unsigned int idx, int enable);
+int nfp_eth_set_configured(struct nfp_cpp *cpp, unsigned int idx,
+			   int configed);
+int
+nfp_eth_set_fec(struct nfp_cpp *cpp, unsigned int idx, enum nfp_eth_fec mode);
+
+int nfp_nsp_read_eth_table(struct nfp_nsp *state, void *buf, unsigned int size);
+int nfp_nsp_write_eth_table(struct nfp_nsp *state, const void *buf,
+			    unsigned int size);
+void nfp_nsp_config_set_state(struct nfp_nsp *state, void *entries,
+			      unsigned int idx);
+void nfp_nsp_config_clear_state(struct nfp_nsp *state);
+void nfp_nsp_config_set_modified(struct nfp_nsp *state, int modified);
+void *nfp_nsp_config_entries(struct nfp_nsp *state);
+int nfp_nsp_config_modified(struct nfp_nsp *state);
+unsigned int nfp_nsp_config_idx(struct nfp_nsp *state);
+
+static inline int nfp_eth_can_support_fec(struct nfp_eth_table_port *eth_port)
+{
+	return !!eth_port->fec_modes_supported;
+}
+
+static inline unsigned int
+nfp_eth_supported_fec_modes(struct nfp_eth_table_port *eth_port)
+{
+	return eth_port->fec_modes_supported;
+}
+
+struct nfp_nsp *nfp_eth_config_start(struct nfp_cpp *cpp, unsigned int idx);
+int nfp_eth_config_commit_end(struct nfp_nsp *nsp);
+void nfp_eth_config_cleanup_end(struct nfp_nsp *nsp);
+
+int __nfp_eth_set_aneg(struct nfp_nsp *nsp, enum nfp_eth_aneg mode);
+int __nfp_eth_set_speed(struct nfp_nsp *nsp, unsigned int speed);
+int __nfp_eth_set_split(struct nfp_nsp *nsp, unsigned int lanes);
+
+/**
+ * struct nfp_nsp_identify - NSP static information
+ * @version:      opaque version string
+ * @flags:        version flags
+ * @br_primary:   branch id of primary bootloader
+ * @br_secondary: branch id of secondary bootloader
+ * @br_nsp:       branch id of NSP
+ * @primary:      version of primary bootloader
+ * @secondary:    version id of secondary bootloader
+ * @nsp:          version id of NSP
+ * @sensor_mask:  mask of present sensors available on NIC
+ */
+struct nfp_nsp_identify {
+	char version[40];
+	uint8_t flags;
+	uint8_t br_primary;
+	uint8_t br_secondary;
+	uint8_t br_nsp;
+	uint16_t primary;
+	uint16_t secondary;
+	uint16_t nsp;
+	uint64_t sensor_mask;
+};
+
+struct nfp_nsp_identify *__nfp_nsp_identify(struct nfp_nsp *nsp);
+
+enum nfp_nsp_sensor_id {
+	NFP_SENSOR_CHIP_TEMPERATURE,
+	NFP_SENSOR_ASSEMBLY_POWER,
+	NFP_SENSOR_ASSEMBLY_12V_POWER,
+	NFP_SENSOR_ASSEMBLY_3V3_POWER,
+};
+
+int nfp_hwmon_read_sensor(struct nfp_cpp *cpp, enum nfp_nsp_sensor_id id,
+			  long *val);
+
+#endif
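
A small, self-contained sketch of the FIELD_PREP()/FIELD_GET() helpers defined above, encoding an NSP command word the same way nfp_nsp_command() does; the values are placeholders:

#include <stdio.h>

#include "nfp_nsp.h"

/* Pack a command register, then unpack the command code and start bit. */
static void
example_field_macros(void)
{
	uint64_t reg;

	reg = FIELD_PREP(NSP_COMMAND_OPTION, 0) |
	      FIELD_PREP(NSP_COMMAND_CODE, SPCODE_NOOP) |
	      FIELD_PREP(NSP_COMMAND_START, 1);

	printf("code=%llu start=%llu\n",
	       (unsigned long long)FIELD_GET(NSP_COMMAND_CODE, reg),
	       (unsigned long long)FIELD_GET(NSP_COMMAND_START, reg));
}
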
diff --git a/drivers/net/nfp/nfpcore/nfp_nsp_cmds.c b/drivers/net/nfp/nfpcore/nfp_nsp_cmds.c
new file mode 100644
index 0000000..bfd1edd
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nsp_cmds.c
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <rte_byteorder.h>
+#include "nfp_cpp.h"
+#include "nfp_nsp.h"
+#include "nfp_nffw.h"
+
+struct nsp_identify {
+	uint8_t version[40];
+	uint8_t flags;
+	uint8_t br_primary;
+	uint8_t br_secondary;
+	uint8_t br_nsp;
+	uint16_t primary;
+	uint16_t secondary;
+	uint16_t nsp;
+	uint8_t reserved[6];
+	uint64_t sensor_mask;
+};
+
+struct nfp_nsp_identify *
+__nfp_nsp_identify(struct nfp_nsp *nsp)
+{
+	struct nfp_nsp_identify *nspi = NULL;
+	struct nsp_identify *ni;
+	int ret;
+
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 15)
+		return NULL;
+
+	ni = malloc(sizeof(*ni));
+	if (!ni)
+		return NULL;
+
+	memset(ni, 0, sizeof(*ni));
+	ret = nfp_nsp_read_identify(nsp, ni, sizeof(*ni));
+	if (ret < 0) {
+		printf("reading bsp version failed %d\n",
+			ret);
+		goto exit_free;
+	}
+
+	nspi = malloc(sizeof(*nspi));
+	if (!nspi)
+		goto exit_free;
+
+	memset(nspi, 0, sizeof(*nspi));
+	memcpy(nspi->version, ni->version, sizeof(nspi->version));
+	nspi->version[sizeof(nspi->version) - 1] = '\0';
+	nspi->flags = ni->flags;
+	nspi->br_primary = ni->br_primary;
+	nspi->br_secondary = ni->br_secondary;
+	nspi->br_nsp = ni->br_nsp;
+	nspi->primary = rte_le_to_cpu_16(ni->primary);
+	nspi->secondary = rte_le_to_cpu_16(ni->secondary);
+	nspi->nsp = rte_le_to_cpu_16(ni->nsp);
+	nspi->sensor_mask = rte_le_to_cpu_64(ni->sensor_mask);
+
+exit_free:
+	free(ni);
+	return nspi;
+}
+
+struct nfp_sensors {
+	uint32_t chip_temp;
+	uint32_t assembly_power;
+	uint32_t assembly_12v_power;
+	uint32_t assembly_3v3_power;
+};
+
+int
+nfp_hwmon_read_sensor(struct nfp_cpp *cpp, enum nfp_nsp_sensor_id id, long *val)
+{
+	struct nfp_sensors s;
+	struct nfp_nsp *nsp;
+	int ret;
+
+	nsp = nfp_nsp_open(cpp);
+	if (!nsp)
+		return -EIO;
+
+	ret = nfp_nsp_read_sensors(nsp, BIT(id), &s, sizeof(s));
+	nfp_nsp_close(nsp);
+
+	if (ret < 0)
+		return ret;
+
+	switch (id) {
+	case NFP_SENSOR_CHIP_TEMPERATURE:
+		*val = rte_le_to_cpu_32(s.chip_temp);
+		break;
+	case NFP_SENSOR_ASSEMBLY_POWER:
+		*val = rte_le_to_cpu_32(s.assembly_power);
+		break;
+	case NFP_SENSOR_ASSEMBLY_12V_POWER:
+		*val = rte_le_to_cpu_32(s.assembly_12v_power);
+		break;
+	case NFP_SENSOR_ASSEMBLY_3V3_POWER:
+		*val = rte_le_to_cpu_32(s.assembly_3v3_power);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
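
For reference, reading one sensor through the helper above is a single call; which sensors are actually present is advertised by the sensor_mask returned from __nfp_nsp_identify(). A valid struct nfp_cpp handle is assumed:

#include <stdio.h>

#include "nfp_cpp.h"
#include "nfp_nsp.h"

/* Query the chip temperature sensor via the NSP. */
static void
example_read_chip_temp(struct nfp_cpp *cpp)
{
	long val;

	if (nfp_hwmon_read_sensor(cpp, NFP_SENSOR_CHIP_TEMPERATURE, &val) == 0)
		printf("chip temperature: %ld\n", val);
}
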
diff --git a/drivers/net/nfp/nfpcore/nfp_nsp_eth.c b/drivers/net/nfp/nfpcore/nfp_nsp_eth.c
new file mode 100644
index 0000000..6794689
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nsp_eth.c
@@ -0,0 +1,665 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include "nfp_cpp.h"
+#include "nfp_nsp.h"
+#include "nfp6000/nfp6000.h"
+
+#define GENMASK_ULL(h, l) \
+	(((~0ULL) - (1ULL << (l)) + 1) & \
+	 (~0ULL >> (64 - 1 - (h))))
+
+#define __bf_shf(x) (__builtin_ffsll(x) - 1)
+
+#define FIELD_GET(_mask, _reg)						\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		(typeof(_x))(((_reg) & (_x)) >> __bf_shf(_x));	\
+	}))
+
+#define FIELD_FIT(_mask, _val)						\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		!((((typeof(_x))_val) << __bf_shf(_x)) & ~(_x)); \
+	}))
+
+#define FIELD_PREP(_mask, _val)						\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		((typeof(_x))(_val) << __bf_shf(_x)) & (_x);	\
+	}))
+
+#define NSP_ETH_NBI_PORT_COUNT		24
+#define NSP_ETH_MAX_COUNT		(2 * NSP_ETH_NBI_PORT_COUNT)
+#define NSP_ETH_TABLE_SIZE		(NSP_ETH_MAX_COUNT *		\
+					 sizeof(union eth_table_entry))
+
+#define NSP_ETH_PORT_LANES		GENMASK_ULL(3, 0)
+#define NSP_ETH_PORT_INDEX		GENMASK_ULL(15, 8)
+#define NSP_ETH_PORT_LABEL		GENMASK_ULL(53, 48)
+#define NSP_ETH_PORT_PHYLABEL		GENMASK_ULL(59, 54)
+#define NSP_ETH_PORT_FEC_SUPP_BASER	BIT_ULL(60)
+#define NSP_ETH_PORT_FEC_SUPP_RS	BIT_ULL(61)
+
+#define NSP_ETH_PORT_LANES_MASK		rte_cpu_to_le_64(NSP_ETH_PORT_LANES)
+
+#define NSP_ETH_STATE_CONFIGURED	BIT_ULL(0)
+#define NSP_ETH_STATE_ENABLED		BIT_ULL(1)
+#define NSP_ETH_STATE_TX_ENABLED	BIT_ULL(2)
+#define NSP_ETH_STATE_RX_ENABLED	BIT_ULL(3)
+#define NSP_ETH_STATE_RATE		GENMASK_ULL(11, 8)
+#define NSP_ETH_STATE_INTERFACE		GENMASK_ULL(19, 12)
+#define NSP_ETH_STATE_MEDIA		GENMASK_ULL(21, 20)
+#define NSP_ETH_STATE_OVRD_CHNG		BIT_ULL(22)
+#define NSP_ETH_STATE_ANEG		GENMASK_ULL(25, 23)
+#define NSP_ETH_STATE_FEC		GENMASK_ULL(27, 26)
+
+#define NSP_ETH_CTRL_CONFIGURED		BIT_ULL(0)
+#define NSP_ETH_CTRL_ENABLED		BIT_ULL(1)
+#define NSP_ETH_CTRL_TX_ENABLED		BIT_ULL(2)
+#define NSP_ETH_CTRL_RX_ENABLED		BIT_ULL(3)
+#define NSP_ETH_CTRL_SET_RATE		BIT_ULL(4)
+#define NSP_ETH_CTRL_SET_LANES		BIT_ULL(5)
+#define NSP_ETH_CTRL_SET_ANEG		BIT_ULL(6)
+#define NSP_ETH_CTRL_SET_FEC		BIT_ULL(7)
+
+/* Which connector port. */
+#define PORT_TP			0x00
+#define PORT_AUI		0x01
+#define PORT_MII		0x02
+#define PORT_FIBRE		0x03
+#define PORT_BNC		0x04
+#define PORT_DA			0x05
+#define PORT_NONE		0xef
+#define PORT_OTHER		0xff
+
+#define SPEED_10		10
+#define SPEED_100		100
+#define SPEED_1000		1000
+#define SPEED_2500		2500
+#define SPEED_5000		5000
+#define SPEED_10000		10000
+#define SPEED_14000		14000
+#define SPEED_20000		20000
+#define SPEED_25000		25000
+#define SPEED_40000		40000
+#define SPEED_50000		50000
+#define SPEED_56000		56000
+#define SPEED_100000		100000
+
+enum nfp_eth_raw {
+	NSP_ETH_RAW_PORT = 0,
+	NSP_ETH_RAW_STATE,
+	NSP_ETH_RAW_MAC,
+	NSP_ETH_RAW_CONTROL,
+
+	NSP_ETH_NUM_RAW
+};
+
+enum nfp_eth_rate {
+	RATE_INVALID = 0,
+	RATE_10M,
+	RATE_100M,
+	RATE_1G,
+	RATE_10G,
+	RATE_25G,
+};
+
+union eth_table_entry {
+	struct {
+		uint64_t port;
+		uint64_t state;
+		uint8_t mac_addr[6];
+		uint8_t resv[2];
+		uint64_t control;
+	};
+	uint64_t raw[NSP_ETH_NUM_RAW];
+};
+
+static const struct {
+	enum nfp_eth_rate rate;
+	unsigned int speed;
+} nsp_eth_rate_tbl[] = {
+	{ RATE_INVALID,	0, },
+	{ RATE_10M,	SPEED_10, },
+	{ RATE_100M,	SPEED_100, },
+	{ RATE_1G,	SPEED_1000, },
+	{ RATE_10G,	SPEED_10000, },
+	{ RATE_25G,	SPEED_25000, },
+};
+
+static unsigned int
+nfp_eth_rate2speed(enum nfp_eth_rate rate)
+{
+	int i;
+
+	for (i = 0; i < (int)ARRAY_SIZE(nsp_eth_rate_tbl); i++)
+		if (nsp_eth_rate_tbl[i].rate == rate)
+			return nsp_eth_rate_tbl[i].speed;
+
+	return 0;
+}
+
+static unsigned int
+nfp_eth_speed2rate(unsigned int speed)
+{
+	int i;
+
+	for (i = 0; i < (int)ARRAY_SIZE(nsp_eth_rate_tbl); i++)
+		if (nsp_eth_rate_tbl[i].speed == speed)
+			return nsp_eth_rate_tbl[i].rate;
+
+	return RATE_INVALID;
+}
+
+static void
+nfp_eth_copy_mac_reverse(uint8_t *dst, const uint8_t *src)
+{
+	int i;
+
+	for (i = 0; i < (int)ETH_ALEN; i++)
+		dst[ETH_ALEN - i - 1] = src[i];
+}
+
+static void
+nfp_eth_port_translate(struct nfp_nsp *nsp, const union eth_table_entry *src,
+		       unsigned int index, struct nfp_eth_table_port *dst)
+{
+	unsigned int rate;
+	unsigned int fec;
+	uint64_t port, state;
+
+	port = rte_le_to_cpu_64(src->port);
+	state = rte_le_to_cpu_64(src->state);
+
+	dst->eth_index = FIELD_GET(NSP_ETH_PORT_INDEX, port);
+	dst->index = index;
+	dst->nbi = index / NSP_ETH_NBI_PORT_COUNT;
+	dst->base = index % NSP_ETH_NBI_PORT_COUNT;
+	dst->lanes = FIELD_GET(NSP_ETH_PORT_LANES, port);
+
+	dst->enabled = FIELD_GET(NSP_ETH_STATE_ENABLED, state);
+	dst->tx_enabled = FIELD_GET(NSP_ETH_STATE_TX_ENABLED, state);
+	dst->rx_enabled = FIELD_GET(NSP_ETH_STATE_RX_ENABLED, state);
+
+	rate = nfp_eth_rate2speed(FIELD_GET(NSP_ETH_STATE_RATE, state));
+	dst->speed = dst->lanes * rate;
+
+	dst->interface = FIELD_GET(NSP_ETH_STATE_INTERFACE, state);
+	dst->media = FIELD_GET(NSP_ETH_STATE_MEDIA, state);
+
+	nfp_eth_copy_mac_reverse(dst->mac_addr, src->mac_addr);
+
+	dst->label_port = FIELD_GET(NSP_ETH_PORT_PHYLABEL, port);
+	dst->label_subport = FIELD_GET(NSP_ETH_PORT_LABEL, port);
+
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 17)
+		return;
+
+	dst->override_changed = FIELD_GET(NSP_ETH_STATE_OVRD_CHNG, state);
+	dst->aneg = FIELD_GET(NSP_ETH_STATE_ANEG, state);
+
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 22)
+		return;
+
+	fec = FIELD_GET(NSP_ETH_PORT_FEC_SUPP_BASER, port);
+	dst->fec_modes_supported |= fec << NFP_FEC_BASER_BIT;
+	fec = FIELD_GET(NSP_ETH_PORT_FEC_SUPP_RS, port);
+	dst->fec_modes_supported |= fec << NFP_FEC_REED_SOLOMON_BIT;
+	if (dst->fec_modes_supported)
+		dst->fec_modes_supported |= NFP_FEC_AUTO | NFP_FEC_DISABLED;
+
+	dst->fec = 1 << FIELD_GET(NSP_ETH_STATE_FEC, state);
+}
+
+static void
+nfp_eth_calc_port_geometry(struct nfp_eth_table *table)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < table->count; i++) {
+		table->max_index = RTE_MAX(table->max_index,
+					   table->ports[i].index);
+
+		for (j = 0; j < table->count; j++) {
+			if (table->ports[i].label_port !=
+			    table->ports[j].label_port)
+				continue;
+			table->ports[i].port_lanes += table->ports[j].lanes;
+
+			if (i == j)
+				continue;
+			if (table->ports[i].label_subport ==
+			    table->ports[j].label_subport)
+				printf("Port %d subport %d is a duplicate\n",
+					 table->ports[i].label_port,
+					 table->ports[i].label_subport);
+
+			table->ports[i].is_split = 1;
+		}
+	}
+}
+
+static void
+nfp_eth_calc_port_type(struct nfp_eth_table_port *entry)
+{
+	if (entry->interface == NFP_INTERFACE_NONE) {
+		entry->port_type = PORT_NONE;
+		return;
+	}
+
+	if (entry->media == NFP_MEDIA_FIBRE)
+		entry->port_type = PORT_FIBRE;
+	else
+		entry->port_type = PORT_DA;
+}
+
+static struct nfp_eth_table *
+__nfp_eth_read_ports(struct nfp_nsp *nsp)
+{
+	union eth_table_entry *entries;
+	struct nfp_eth_table *table;
+	uint32_t table_sz;
+	int i, j, ret, cnt = 0;
+
+	entries = malloc(NSP_ETH_TABLE_SIZE);
+	if (!entries)
+		return NULL;
+
+	memset(entries, 0, NSP_ETH_TABLE_SIZE);
+	ret = nfp_nsp_read_eth_table(nsp, entries, NSP_ETH_TABLE_SIZE);
+	if (ret < 0) {
+		printf("reading port table failed %d\n", ret);
+		goto err;
+	}
+
+	for (i = 0; i < NSP_ETH_MAX_COUNT; i++)
+		if (entries[i].port & NSP_ETH_PORT_LANES_MASK)
+			cnt++;
+
+	/* Some versions of flash will give us 0 instead of port count. For
+	 * those that give a port count, verify it against the value calculated
+	 * above.
+	 */
+	if (ret && ret != cnt) {
+		printf("table entry count (%d) does not match entries present (%d)\n",
+		       ret, cnt);
+		goto err;
+	}
+
+	table_sz = sizeof(*table) + sizeof(struct nfp_eth_table_port) * cnt;
+	table = malloc(table_sz);
+	if (!table)
+		goto err;
+
+	memset(table, 0, table_sz);
+	table->count = cnt;
+	for (i = 0, j = 0; i < NSP_ETH_MAX_COUNT; i++)
+		if (entries[i].port & NSP_ETH_PORT_LANES_MASK)
+			nfp_eth_port_translate(nsp, &entries[i], i,
+					       &table->ports[j++]);
+
+	nfp_eth_calc_port_geometry(table);
+	for (i = 0; i < (int)table->count; i++)
+		nfp_eth_calc_port_type(&table->ports[i]);
+
+	free(entries);
+
+	return table;
+
+err:
+	free(entries);
+	return NULL;
+}
+
+/*
+ * nfp_eth_read_ports() - retrieve port information
+ * @cpp:	NFP CPP handle
+ *
+ * Read the port information from the device.  Returned structure should
+ * be freed with free() once no longer needed.
+ *
+ * Return: populated ETH table or NULL on error.
+ */
+struct nfp_eth_table *
+nfp_eth_read_ports(struct nfp_cpp *cpp)
+{
+	struct nfp_eth_table *ret;
+	struct nfp_nsp *nsp;
+
+	nsp = nfp_nsp_open(cpp);
+	if (!nsp)
+		return NULL;
+
+	ret = __nfp_eth_read_ports(nsp);
+	nfp_nsp_close(nsp);
+
+	return ret;
+}
+
+struct nfp_nsp *
+nfp_eth_config_start(struct nfp_cpp *cpp, unsigned int idx)
+{
+	union eth_table_entry *entries;
+	struct nfp_nsp *nsp;
+	int ret;
+
+	entries = malloc(NSP_ETH_TABLE_SIZE);
+	if (!entries)
+		return NULL;
+
+	memset(entries, 0, NSP_ETH_TABLE_SIZE);
+	nsp = nfp_nsp_open(cpp);
+	if (!nsp) {
+		free(entries);
+		return nsp;
+	}
+
+	ret = nfp_nsp_read_eth_table(nsp, entries, NSP_ETH_TABLE_SIZE);
+	if (ret < 0) {
+		printf("reading port table failed %d\n", ret);
+		goto err;
+	}
+
+	if (!(entries[idx].port & NSP_ETH_PORT_LANES_MASK)) {
+		printf("trying to set port state on disabled port %d\n", idx);
+		goto err;
+	}
+
+	nfp_nsp_config_set_state(nsp, entries, idx);
+	return nsp;
+
+err:
+	nfp_nsp_close(nsp);
+	free(entries);
+	return NULL;
+}
+
+void
+nfp_eth_config_cleanup_end(struct nfp_nsp *nsp)
+{
+	union eth_table_entry *entries = nfp_nsp_config_entries(nsp);
+
+	nfp_nsp_config_set_modified(nsp, 0);
+	nfp_nsp_config_clear_state(nsp);
+	nfp_nsp_close(nsp);
+	free(entries);
+}
+
+/*
+ * nfp_eth_config_commit_end() - perform recorded configuration changes
+ * @nsp:	NFP NSP handle returned from nfp_eth_config_start()
+ *
+ * Perform the configuration which was requested with __nfp_eth_set_*()
+ * helpers and recorded in @nsp state.  If device was already configured
+ * as requested or no __nfp_eth_set_*() operations were made no NSP command
+ * will be performed.
+ *
+ * Return:
+ * 0 - configuration successful;
+ * 1 - no changes were needed;
+ * -ERRNO - configuration failed.
+ */
+int
+nfp_eth_config_commit_end(struct nfp_nsp *nsp)
+{
+	union eth_table_entry *entries = nfp_nsp_config_entries(nsp);
+	int ret = 1;
+
+	if (nfp_nsp_config_modified(nsp)) {
+		ret = nfp_nsp_write_eth_table(nsp, entries, NSP_ETH_TABLE_SIZE);
+		ret = ret < 0 ? ret : 0;
+	}
+
+	nfp_eth_config_cleanup_end(nsp);
+
+	return ret;
+}
+
+/*
+ * nfp_eth_set_mod_enable() - set PHY module enable control bit
+ * @cpp:	NFP CPP handle
+ * @idx:	NFP chip-wide port index
+ * @enable:	Desired state
+ *
+ * Enable or disable PHY module (this usually means setting the TX lanes
+ * disable bits).
+ *
+ * Return:
+ * 0 - configuration successful;
+ * 1 - no changes were needed;
+ * -ERRNO - configuration failed.
+ */
+int
+nfp_eth_set_mod_enable(struct nfp_cpp *cpp, unsigned int idx, int enable)
+{
+	union eth_table_entry *entries;
+	struct nfp_nsp *nsp;
+	uint64_t reg;
+
+	nsp = nfp_eth_config_start(cpp, idx);
+	if (!nsp)
+		return -1;
+
+	entries = nfp_nsp_config_entries(nsp);
+
+	/* Check if we are already in requested state */
+	reg = rte_le_to_cpu_64(entries[idx].state);
+	if (enable != (int)FIELD_GET(NSP_ETH_STATE_ENABLED, reg)) {
+		reg = rte_le_to_cpu_64(entries[idx].control);
+		reg &= ~NSP_ETH_CTRL_ENABLED;
+		reg |= FIELD_PREP(NSP_ETH_CTRL_ENABLED, enable);
+		entries[idx].control = rte_cpu_to_le_64(reg);
+
+		nfp_nsp_config_set_modified(nsp, 1);
+	}
+
+	return nfp_eth_config_commit_end(nsp);
+}
+
+/*
+ * nfp_eth_set_configured() - set PHY module configured control bit
+ * @cpp:	NFP CPP handle
+ * @idx:	NFP chip-wide port index
+ * @configed:	Desired state
+ *
+ * Set the ifup/ifdown state on the PHY.
+ *
+ * Return:
+ * 0 - configuration successful;
+ * 1 - no changes were needed;
+ * -ERRNO - configuration failed.
+ */
+int
+nfp_eth_set_configured(struct nfp_cpp *cpp, unsigned int idx, int configed)
+{
+	union eth_table_entry *entries;
+	struct nfp_nsp *nsp;
+	uint64_t reg;
+
+	nsp = nfp_eth_config_start(cpp, idx);
+	if (!nsp)
+		return -EIO;
+
+	/*
+	 * Older ABI versions did support this feature, however this has only
+	 * been reliable since ABI 20.
+	 */
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 20) {
+		nfp_eth_config_cleanup_end(nsp);
+		return -EOPNOTSUPP;
+	}
+
+	entries = nfp_nsp_config_entries(nsp);
+
+	/* Check if we are already in requested state */
+	reg = rte_le_to_cpu_64(entries[idx].state);
+	if (configed != (int)FIELD_GET(NSP_ETH_STATE_CONFIGURED, reg)) {
+		reg = rte_le_to_cpu_64(entries[idx].control);
+		reg &= ~NSP_ETH_CTRL_CONFIGURED;
+		reg |= FIELD_PREP(NSP_ETH_CTRL_CONFIGURED, configed);
+		entries[idx].control = rte_cpu_to_le_64(reg);
+
+		nfp_nsp_config_set_modified(nsp, 1);
+	}
+
+	return nfp_eth_config_commit_end(nsp);
+}
+
+static int
+nfp_eth_set_bit_config(struct nfp_nsp *nsp, unsigned int raw_idx,
+		       const uint64_t mask, const unsigned int shift,
+		       unsigned int val, const uint64_t ctrl_bit)
+{
+	union eth_table_entry *entries = nfp_nsp_config_entries(nsp);
+	unsigned int idx = nfp_nsp_config_idx(nsp);
+	uint64_t reg;
+
+	/*
+	 * Note: set features were added in ABI 0.14 but the error
+	 *	 codes were initially not populated correctly.
+	 */
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 17) {
+		printf("set operations not supported, please update flash\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* Check if we are already in requested state */
+	reg = rte_le_to_cpu_64(entries[idx].raw[raw_idx]);
+	if (val == (reg & mask) >> shift)
+		return 0;
+
+	reg &= ~mask;
+	reg |= (val << shift) & mask;
+	entries[idx].raw[raw_idx] = rte_cpu_to_le_64(reg);
+
+	entries[idx].control |= rte_cpu_to_le_64(ctrl_bit);
+
+	nfp_nsp_config_set_modified(nsp, 1);
+
+	return 0;
+}
+
+#define NFP_ETH_SET_BIT_CONFIG(nsp, raw_idx, mask, val, ctrl_bit)	\
+	(__extension__ ({ \
+		typeof(mask) _x = (mask); \
+		nfp_eth_set_bit_config(nsp, raw_idx, _x, __bf_shf(_x), \
+				       val, ctrl_bit);			\
+	}))
+
+/*
+ * __nfp_eth_set_aneg() - set PHY autonegotiation control bit
+ * @nsp:	NFP NSP handle returned from nfp_eth_config_start()
+ * @mode:	Desired autonegotiation mode
+ *
+ * Allow/disallow PHY module to advertise/perform autonegotiation.
+ * Will write to hwinfo overrides in the flash (persistent config).
+ *
+ * Return: 0 or -ERRNO.
+ */
+int
+__nfp_eth_set_aneg(struct nfp_nsp *nsp, enum nfp_eth_aneg mode)
+{
+	return NFP_ETH_SET_BIT_CONFIG(nsp, NSP_ETH_RAW_STATE,
+				      NSP_ETH_STATE_ANEG, mode,
+				      NSP_ETH_CTRL_SET_ANEG);
+}
+
+/*
+ * __nfp_eth_set_fec() - set PHY forward error correction control bit
+ * @nsp:	NFP NSP handle returned from nfp_eth_config_start()
+ * @mode:	Desired fec mode
+ *
+ * Set the PHY module forward error correction mode.
+ * Will write to hwinfo overrides in the flash (persistent config).
+ *
+ * Return: 0 or -ERRNO.
+ */
+static int
+__nfp_eth_set_fec(struct nfp_nsp *nsp, enum nfp_eth_fec mode)
+{
+	return NFP_ETH_SET_BIT_CONFIG(nsp, NSP_ETH_RAW_STATE,
+				      NSP_ETH_STATE_FEC, mode,
+				      NSP_ETH_CTRL_SET_FEC);
+}
+
+/*
+ * nfp_eth_set_fec() - set PHY forward error correction control mode
+ * @cpp:	NFP CPP handle
+ * @idx:	NFP chip-wide port index
+ * @mode:	Desired fec mode
+ *
+ * Return:
+ * 0 - configuration successful;
+ * 1 - no changes were needed;
+ * -ERRNO - configuration failed.
+ */
+int
+nfp_eth_set_fec(struct nfp_cpp *cpp, unsigned int idx, enum nfp_eth_fec mode)
+{
+	struct nfp_nsp *nsp;
+	int err;
+
+	nsp = nfp_eth_config_start(cpp, idx);
+	if (!nsp)
+		return -EIO;
+
+	err = __nfp_eth_set_fec(nsp, mode);
+	if (err) {
+		nfp_eth_config_cleanup_end(nsp);
+		return err;
+	}
+
+	return nfp_eth_config_commit_end(nsp);
+}
+
+/*
+ * __nfp_eth_set_speed() - set interface speed/rate
+ * @nsp:	NFP NSP handle returned from nfp_eth_config_start()
+ * @speed:	Desired speed (per lane)
+ *
+ * Set lane speed.  The provided @speed value should be the subport speed
+ * divided by the number of lanes this subport spans (e.g. 10000 for 40G,
+ * 25000 for 50G, etc.).
+ * Will write to hwinfo overrides in the flash (persistent config).
+ *
+ * Return: 0 or -ERRNO.
+ */
+int
+__nfp_eth_set_speed(struct nfp_nsp *nsp, unsigned int speed)
+{
+	enum nfp_eth_rate rate;
+
+	rate = nfp_eth_speed2rate(speed);
+	if (rate == RATE_INVALID) {
+		printf("could not find matching lane rate for speed %u\n",
+			 speed);
+		return -EINVAL;
+	}
+
+	return NFP_ETH_SET_BIT_CONFIG(nsp, NSP_ETH_RAW_STATE,
+				      NSP_ETH_STATE_RATE, rate,
+				      NSP_ETH_CTRL_SET_RATE);
+}
+
+/*
+ * __nfp_eth_set_split() - set interface lane split
+ * @nsp:	NFP NSP handle returned from nfp_eth_config_start()
+ * @lanes:	Desired lanes per port
+ *
+ * Set number of lanes in the port.
+ * Will write to hwinfo overrides in the flash (persistent config).
+ *
+ * Return: 0 or -ERRNO.
+ */
+int
+__nfp_eth_set_split(struct nfp_nsp *nsp, unsigned int lanes)
+{
+	return NFP_ETH_SET_BIT_CONFIG(nsp, NSP_ETH_RAW_PORT, NSP_ETH_PORT_LANES,
+				      lanes, NSP_ETH_CTRL_SET_LANES);
+}
diff --git a/drivers/net/nfp/nfpcore/nfp_resource.c b/drivers/net/nfp/nfpcore/nfp_resource.c
new file mode 100644
index 0000000..e1df2b2
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_resource.c
@@ -0,0 +1,264 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include <endian.h>
+
+#include "nfp_cpp.h"
+#include "nfp6000/nfp6000.h"
+#include "nfp_resource.h"
+#include "nfp_crc.h"
+
+#define NFP_RESOURCE_TBL_TARGET		NFP_CPP_TARGET_MU
+#define NFP_RESOURCE_TBL_BASE		0x8100000000ULL
+
+/* NFP Resource Table self-identifier */
+#define NFP_RESOURCE_TBL_NAME		"nfp.res"
+#define NFP_RESOURCE_TBL_KEY		0x00000000 /* Special key for entry 0 */
+
+#define NFP_RESOURCE_ENTRY_NAME_SZ	8
+
+/*
+ * struct nfp_resource_entry - Resource table entry
+ * @owner:		NFP CPP Lock, interface owner
+ * @key:		NFP CPP Lock, posix_crc32(name, 8)
+ * @region:		Memory region descriptor
+ * @name:		ASCII, zero padded name
+ * @reserved
+ * @cpp_action:		CPP Action
+ * @cpp_token:		CPP Token
+ * @cpp_target:		CPP Target ID
+ * @page_offset:	256-byte page offset into target's CPP address
+ * @page_size:		size, in 256-byte pages
+ */
+struct nfp_resource_entry {
+	struct nfp_resource_entry_mutex {
+		uint32_t owner;
+		uint32_t key;
+	} mutex;
+	struct nfp_resource_entry_region {
+		uint8_t  name[NFP_RESOURCE_ENTRY_NAME_SZ];
+		uint8_t  reserved[5];
+		uint8_t  cpp_action;
+		uint8_t  cpp_token;
+		uint8_t  cpp_target;
+		uint32_t page_offset;
+		uint32_t page_size;
+	} region;
+};
+
+#define NFP_RESOURCE_TBL_SIZE		4096
+#define NFP_RESOURCE_TBL_ENTRIES	(int)(NFP_RESOURCE_TBL_SIZE /	\
+					 sizeof(struct nfp_resource_entry))
+
+struct nfp_resource {
+	char name[NFP_RESOURCE_ENTRY_NAME_SZ + 1];
+	uint32_t cpp_id;
+	uint64_t addr;
+	uint64_t size;
+	struct nfp_cpp_mutex *mutex;
+};
+
+static int
+nfp_cpp_resource_find(struct nfp_cpp *cpp, struct nfp_resource *res)
+{
+	char name_pad[NFP_RESOURCE_ENTRY_NAME_SZ] = {};
+	struct nfp_resource_entry entry;
+	uint32_t cpp_id, key;
+	int ret, i;
+
+	cpp_id = NFP_CPP_ID(NFP_RESOURCE_TBL_TARGET, 3, 0);  /* Atomic read */
+
+	memset(name_pad, 0, NFP_RESOURCE_ENTRY_NAME_SZ);
+	strncpy(name_pad, res->name, sizeof(name_pad));
+
+	/* Search for a matching entry */
+	if (!memcmp(name_pad, NFP_RESOURCE_TBL_NAME "\0\0\0\0\0\0\0\0", 8)) {
+		printf("Grabbing device lock not supported\n");
+		return -EOPNOTSUPP;
+	}
+	key = nfp_crc32_posix(name_pad, sizeof(name_pad));
+
+	for (i = 0; i < NFP_RESOURCE_TBL_ENTRIES; i++) {
+		uint64_t addr = NFP_RESOURCE_TBL_BASE +
+			sizeof(struct nfp_resource_entry) * i;
+
+		ret = nfp_cpp_read(cpp, cpp_id, addr, &entry, sizeof(entry));
+		if (ret != sizeof(entry))
+			return -EIO;
+
+		if (entry.mutex.key != key)
+			continue;
+
+		/* Found key! */
+		res->mutex =
+			nfp_cpp_mutex_alloc(cpp,
+					    NFP_RESOURCE_TBL_TARGET, addr, key);
+		res->cpp_id = NFP_CPP_ID(entry.region.cpp_target,
+					 entry.region.cpp_action,
+					 entry.region.cpp_token);
+		res->addr = ((uint64_t)entry.region.page_offset) << 8;
+		res->size = (uint64_t)entry.region.page_size << 8;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static int
+nfp_resource_try_acquire(struct nfp_cpp *cpp, struct nfp_resource *res,
+			 struct nfp_cpp_mutex *dev_mutex)
+{
+	int err;
+
+	if (nfp_cpp_mutex_lock(dev_mutex))
+		return -EINVAL;
+
+	err = nfp_cpp_resource_find(cpp, res);
+	if (err)
+		goto err_unlock_dev;
+
+	err = nfp_cpp_mutex_trylock(res->mutex);
+	if (err)
+		goto err_res_mutex_free;
+
+	nfp_cpp_mutex_unlock(dev_mutex);
+
+	return 0;
+
+err_res_mutex_free:
+	nfp_cpp_mutex_free(res->mutex);
+err_unlock_dev:
+	nfp_cpp_mutex_unlock(dev_mutex);
+
+	return err;
+}
+
+/*
+ * nfp_resource_acquire() - Acquire a resource handle
+ * @cpp:	NFP CPP handle
+ * @name:	Name of the resource
+ *
+ * NOTE: This function locks the acquired resource
+ *
+ * Return: NFP Resource handle, or NULL on failure
+ */
+struct nfp_resource *
+nfp_resource_acquire(struct nfp_cpp *cpp, const char *name)
+{
+	struct nfp_cpp_mutex *dev_mutex;
+	struct nfp_resource *res;
+	int err;
+	struct timespec wait;
+	int count;
+
+	res = malloc(sizeof(*res));
+	if (!res)
+		return NULL;
+
+	memset(res, 0, sizeof(*res));
+
+	strncpy(res->name, name, NFP_RESOURCE_ENTRY_NAME_SZ);
+
+	dev_mutex = nfp_cpp_mutex_alloc(cpp, NFP_RESOURCE_TBL_TARGET,
+					NFP_RESOURCE_TBL_BASE,
+					NFP_RESOURCE_TBL_KEY);
+	if (!dev_mutex) {
+		free(res);
+		return NULL;
+	}
+
+	wait.tv_sec = 0;
+	wait.tv_nsec = 1000000;
+	count = 0;
+
+	for (;;) {
+		err = nfp_resource_try_acquire(cpp, res, dev_mutex);
+		if (!err)
+			break;
+		if (err != -EBUSY)
+			goto err_free;
+
+		if (count++ > 1000) {
+			printf("Error: resource %s timed out\n", name);
+			err = -EBUSY;
+			goto err_free;
+		}
+
+		nanosleep(&wait, NULL);
+	}
+
+	nfp_cpp_mutex_free(dev_mutex);
+
+	return res;
+
+err_free:
+	nfp_cpp_mutex_free(dev_mutex);
+	free(res);
+	return NULL;
+}
+
+/*
+ * nfp_resource_release() - Release a NFP Resource handle
+ * @res:	NFP Resource handle
+ *
+ * NOTE: This function implicitly unlocks the resource handle
+ */
+void
+nfp_resource_release(struct nfp_resource *res)
+{
+	nfp_cpp_mutex_unlock(res->mutex);
+	nfp_cpp_mutex_free(res->mutex);
+	free(res);
+}
+
+/*
+ * nfp_resource_cpp_id() - Return the cpp_id of a resource handle
+ * @res:        NFP Resource handle
+ *
+ * Return: NFP CPP ID
+ */
+uint32_t
+nfp_resource_cpp_id(const struct nfp_resource *res)
+{
+	return res->cpp_id;
+}
+
+/*
+ * nfp_resource_name() - Return the name of a resource handle
+ * @res:        NFP Resource handle
+ *
+ * Return: const char pointer to the name of the resource
+ */
+const char *
+nfp_resource_name(const struct nfp_resource *res)
+{
+	return res->name;
+}
+
+/*
+ * nfp_resource_address() - Return the address of a resource handle
+ * @res:        NFP Resource handle
+ *
+ * Return: Address of the resource
+ */
+uint64_t
+nfp_resource_address(const struct nfp_resource *res)
+{
+	return res->addr;
+}
+
+/*
+ * nfp_resource_size() - Return the size in bytes of a resource handle
+ * @res:        NFP Resource handle
+ *
+ * Return: Size of the resource in bytes
+ */
+uint64_t
+nfp_resource_size(const struct nfp_resource *res)
+{
+	return res->size;
+}
diff --git a/drivers/net/nfp/nfpcore/nfp_resource.h b/drivers/net/nfp/nfpcore/nfp_resource.h
new file mode 100644
index 0000000..06cc6f7
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_resource.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef NFP_RESOURCE_H
+#define NFP_RESOURCE_H
+
+#include "nfp_cpp.h"
+
+#define NFP_RESOURCE_NFP_NFFW           "nfp.nffw"
+#define NFP_RESOURCE_NFP_HWINFO         "nfp.info"
+#define NFP_RESOURCE_NSP		"nfp.sp"
+
+/**
+ * Opaque handle to a NFP Resource
+ */
+struct nfp_resource;
+
+struct nfp_resource *nfp_resource_acquire(struct nfp_cpp *cpp,
+					  const char *name);
+
+/**
+ * Release a NFP Resource, and free the handle
+ * @param[in]   res     NFP Resource handle
+ */
+void nfp_resource_release(struct nfp_resource *res);
+
+/**
+ * Return the CPP ID of a NFP Resource
+ * @param[in]   res     NFP Resource handle
+ * @return      CPP ID of the NFP Resource
+ */
+uint32_t nfp_resource_cpp_id(const struct nfp_resource *res);
+
+/**
+ * Return the name of a NFP Resource
+ * @param[in]   res     NFP Resource handle
+ * @return      Name of the NFP Resource
+ */
+const char *nfp_resource_name(const struct nfp_resource *res);
+
+/**
+ * Return the target address of a NFP Resource
+ * @param[in]   res     NFP Resource handle
+ * @return      Address of the NFP Resource
+ */
+uint64_t nfp_resource_address(const struct nfp_resource *res);
+
+uint64_t nfp_resource_size(const struct nfp_resource *res);
+
+#endif /* NFP_RESOURCE_H */
diff --git a/drivers/net/nfp/nfpcore/nfp_rtsym.c b/drivers/net/nfp/nfpcore/nfp_rtsym.c
new file mode 100644
index 0000000..cb7d83d
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_rtsym.c
@@ -0,0 +1,327 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+/*
+ * nfp_rtsym.c
+ * Interface for accessing run-time symbol table
+ */
+
+#include <stdio.h>
+#include <rte_byteorder.h>
+#include "nfp_cpp.h"
+#include "nfp_mip.h"
+#include "nfp_rtsym.h"
+#include "nfp6000/nfp6000.h"
+
+/* These need to match the linker */
+#define SYM_TGT_LMEM		0
+#define SYM_TGT_EMU_CACHE	0x17
+
+struct nfp_rtsym_entry {
+	uint8_t	type;
+	uint8_t	target;
+	uint8_t	island;
+	uint8_t	addr_hi;
+	uint32_t addr_lo;
+	uint16_t name;
+	uint8_t	menum;
+	uint8_t	size_hi;
+	uint32_t size_lo;
+};
+
+struct nfp_rtsym_table {
+	struct nfp_cpp *cpp;
+	int num;
+	char *strtab;
+	struct nfp_rtsym symtab[];
+};
+
+static int
+nfp_meid(uint8_t island_id, uint8_t menum)
+{
+	return (island_id & 0x3F) == island_id && menum < 12 ?
+		(island_id << 4) | (menum + 4) : -1;
+}
+
+static void
+nfp_rtsym_sw_entry_init(struct nfp_rtsym_table *cache, uint32_t strtab_size,
+			struct nfp_rtsym *sw, struct nfp_rtsym_entry *fw)
+{
+	sw->type = fw->type;
+	sw->name = cache->strtab + rte_le_to_cpu_16(fw->name) % strtab_size;
+	sw->addr = ((uint64_t)fw->addr_hi << 32) |
+		   rte_le_to_cpu_32(fw->addr_lo);
+	sw->size = ((uint64_t)fw->size_hi << 32) |
+		   rte_le_to_cpu_32(fw->size_lo);
+
+#ifdef DEBUG
+	printf("rtsym_entry_init\n");
+	printf("\tname=%s, addr=%" PRIx64 ", size=%" PRIu64 ",target=%d\n",
+		sw->name, sw->addr, sw->size, sw->target);
+#endif
+	switch (fw->target) {
+	case SYM_TGT_LMEM:
+		sw->target = NFP_RTSYM_TARGET_LMEM;
+		break;
+	case SYM_TGT_EMU_CACHE:
+		sw->target = NFP_RTSYM_TARGET_EMU_CACHE;
+		break;
+	default:
+		sw->target = fw->target;
+		break;
+	}
+
+	if (fw->menum != 0xff)
+		sw->domain = nfp_meid(fw->island, fw->menum);
+	else if (fw->island != 0xff)
+		sw->domain = fw->island;
+	else
+		sw->domain = -1;
+}
+
+struct nfp_rtsym_table *
+nfp_rtsym_table_read(struct nfp_cpp *cpp)
+{
+	struct nfp_rtsym_table *rtbl;
+	struct nfp_mip *mip;
+
+	mip = nfp_mip_open(cpp);
+	rtbl = __nfp_rtsym_table_read(cpp, mip);
+	nfp_mip_close(mip);
+
+	return rtbl;
+}
+
+/*
+ * This looks more complex than it should be. But we need to get the type for
+ * the ~ right in round_down (it needs to be as wide as the result!), and we
+ * want to evaluate the macro arguments just once each.
+ */
+#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
+
+#define round_up(x, y) \
+	(__extension__ ({ \
+		typeof(x) _x = (x); \
+		((((_x) - 1) | __round_mask(_x, y)) + 1); \
+	}))
+
+#define round_down(x, y) \
+	(__extension__ ({ \
+		typeof(x) _x = (x); \
+		((_x) & ~__round_mask(_x, y)); \
+	}))
+
+struct nfp_rtsym_table *
+__nfp_rtsym_table_read(struct nfp_cpp *cpp, const struct nfp_mip *mip)
+{
+	uint32_t strtab_addr, symtab_addr, strtab_size, symtab_size;
+	struct nfp_rtsym_entry *rtsymtab;
+	struct nfp_rtsym_table *cache;
+	const uint32_t dram =
+		NFP_CPP_ID(NFP_CPP_TARGET_MU, NFP_CPP_ACTION_RW, 0) |
+		NFP_ISL_EMEM0;
+	int err, n, size;
+
+	if (!mip)
+		return NULL;
+
+	nfp_mip_strtab(mip, &strtab_addr, &strtab_size);
+	nfp_mip_symtab(mip, &symtab_addr, &symtab_size);
+
+	if (!symtab_size || !strtab_size || symtab_size % sizeof(*rtsymtab))
+		return NULL;
+
+	/* Align to 64 bits */
+	symtab_size = round_up(symtab_size, 8);
+	strtab_size = round_up(strtab_size, 8);
+
+	rtsymtab = malloc(symtab_size);
+	if (!rtsymtab)
+		return NULL;
+
+	size = sizeof(*cache);
+	size += symtab_size / sizeof(*rtsymtab) * sizeof(struct nfp_rtsym);
+	size +=	strtab_size + 1;
+	cache = malloc(size);
+	if (!cache)
+		goto exit_free_rtsym_raw;
+
+	cache->cpp = cpp;
+	cache->num = symtab_size / sizeof(*rtsymtab);
+	cache->strtab = (void *)&cache->symtab[cache->num];
+
+	err = nfp_cpp_read(cpp, dram, symtab_addr, rtsymtab, symtab_size);
+	if (err != (int)symtab_size)
+		goto exit_free_cache;
+
+	err = nfp_cpp_read(cpp, dram, strtab_addr, cache->strtab, strtab_size);
+	if (err != (int)strtab_size)
+		goto exit_free_cache;
+	cache->strtab[strtab_size] = '\0';
+
+	for (n = 0; n < cache->num; n++)
+		nfp_rtsym_sw_entry_init(cache, strtab_size,
+					&cache->symtab[n], &rtsymtab[n]);
+
+	free(rtsymtab);
+
+	return cache;
+
+exit_free_cache:
+	free(cache);
+exit_free_rtsym_raw:
+	free(rtsymtab);
+	return NULL;
+}
+
+/*
+ * nfp_rtsym_count() - Get the number of RTSYM descriptors
+ * @rtbl:	NFP RTsym table
+ *
+ * Return: Number of RTSYM descriptors
+ */
+int
+nfp_rtsym_count(struct nfp_rtsym_table *rtbl)
+{
+	if (!rtbl)
+		return -EINVAL;
+
+	return rtbl->num;
+}
+
+/*
+ * nfp_rtsym_get() - Get the Nth RTSYM descriptor
+ * @rtbl:	NFP RTsym table
+ * @idx:	Index (0-based) of the RTSYM descriptor
+ *
+ * Return: const pointer to a struct nfp_rtsym descriptor, or NULL
+ */
+const struct nfp_rtsym *
+nfp_rtsym_get(struct nfp_rtsym_table *rtbl, int idx)
+{
+	if (!rtbl)
+		return NULL;
+
+	if (idx >= rtbl->num)
+		return NULL;
+
+	return &rtbl->symtab[idx];
+}
+
+/*
+ * nfp_rtsym_lookup() - Return the RTSYM descriptor for a symbol name
+ * @rtbl:	NFP RTsym table
+ * @name:	Symbol name
+ *
+ * Return: const pointer to a struct nfp_rtsym descriptor, or NULL
+ */
+const struct nfp_rtsym *
+nfp_rtsym_lookup(struct nfp_rtsym_table *rtbl, const char *name)
+{
+	int n;
+
+	if (!rtbl)
+		return NULL;
+
+	for (n = 0; n < rtbl->num; n++)
+		if (strcmp(name, rtbl->symtab[n].name) == 0)
+			return &rtbl->symtab[n];
+
+	return NULL;
+}
+
+/*
+ * nfp_rtsym_read_le() - Read a simple unsigned scalar value from symbol
+ * @rtbl:	NFP RTsym table
+ * @name:	Symbol name
+ * @error:	Pointer to error code (optional)
+ *
+ * Look up a symbol, map it, read it and return its value. The value of the
+ * symbol will be interpreted as a simple little-endian unsigned value. The
+ * symbol can be 4 or 8 bytes in size.
+ *
+ * Return: value read; on error, sets the error code and returns ~0ULL.
+ */
+uint64_t
+nfp_rtsym_read_le(struct nfp_rtsym_table *rtbl, const char *name, int *error)
+{
+	const struct nfp_rtsym *sym;
+	uint32_t val32, id;
+	uint64_t val;
+	int err;
+
+	sym = nfp_rtsym_lookup(rtbl, name);
+	if (!sym) {
+		err = -ENOENT;
+		goto exit;
+	}
+
+	id = NFP_CPP_ISLAND_ID(sym->target, NFP_CPP_ACTION_RW, 0, sym->domain);
+
+#ifdef DEBUG
+	printf("Reading symbol %s with size %" PRIu64 " at %" PRIx64 "\n",
+		name, sym->size, sym->addr);
+#endif
+	switch (sym->size) {
+	case 4:
+		err = nfp_cpp_readl(rtbl->cpp, id, sym->addr, &val32);
+		val = val32;
+		break;
+	case 8:
+		err = nfp_cpp_readq(rtbl->cpp, id, sym->addr, &val);
+		break;
+	default:
+		printf("rtsym '%s' unsupported size: %" PRId64 "\n",
+			name, sym->size);
+		err = -EINVAL;
+		break;
+	}
+
+	if (err)
+		err = -EIO;
+exit:
+	if (error)
+		*error = err;
+
+	if (err)
+		return ~0ULL;
+
+	return val;
+}
+
+uint8_t *
+nfp_rtsym_map(struct nfp_rtsym_table *rtbl, const char *name,
+	      unsigned int min_size, struct nfp_cpp_area **area)
+{
+	const struct nfp_rtsym *sym;
+	uint8_t *mem;
+
+#ifdef DEBUG
+	printf("mapping symbol %s\n", name);
+#endif
+	sym = nfp_rtsym_lookup(rtbl, name);
+	if (!sym) {
+		printf("symbol lookup fails for %s\n", name);
+		return NULL;
+	}
+
+	if (sym->size < min_size) {
+		printf("Symbol %s too small (%" PRIu64 " < %u)\n", name,
+			sym->size, min_size);
+		return NULL;
+	}
+
+	mem = nfp_cpp_map_area(rtbl->cpp, sym->domain, sym->target, sym->addr,
+			       sym->size, area);
+	if (!mem) {
+		printf("Failed to map symbol %s\n", name);
+		return NULL;
+	}
+#ifdef DEBUG
+	printf("symbol %s with address %p\n", name, mem);
+#endif
+
+	return mem;
+}
diff --git a/drivers/net/nfp/nfpcore/nfp_rtsym.h b/drivers/net/nfp/nfpcore/nfp_rtsym.h
new file mode 100644
index 0000000..8b49421
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_rtsym.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RTSYM_H__
+#define __NFP_RTSYM_H__
+
+#define NFP_RTSYM_TYPE_NONE             0
+#define NFP_RTSYM_TYPE_OBJECT           1
+#define NFP_RTSYM_TYPE_FUNCTION         2
+#define NFP_RTSYM_TYPE_ABS              3
+
+#define NFP_RTSYM_TARGET_NONE           0
+#define NFP_RTSYM_TARGET_LMEM           -1
+#define NFP_RTSYM_TARGET_EMU_CACHE      -7
+
+/*
+ * Structure describing a run-time NFP symbol.
+ *
+ * The memory target of the symbol is generally the CPP target number and can be
+ * used directly by the nfp_cpp API calls.  However, in some cases (e.g., for
+ * local memory or control store) the target is encoded using a negative number.
+ *
+ * When the target type cannot be used to fully describe the location of a
+ * symbol, the domain field is used to further specify the location (e.g., the
+ * specific ME or island number).
+ *
+ * For ME target resources, 'domain' is an MEID.
+ * For Island target resources, 'domain' is an island ID, with the one exception
+ * of "sram" symbols for backward compatibility, which are viewed as global.
+ */
+struct nfp_rtsym {
+	const char *name;
+	uint64_t addr;
+	uint64_t size;
+	int type;
+	int target;
+	int domain;
+};
+
+struct nfp_rtsym_table;
+
+struct nfp_rtsym_table *nfp_rtsym_table_read(struct nfp_cpp *cpp);
+
+struct nfp_rtsym_table *
+__nfp_rtsym_table_read(struct nfp_cpp *cpp, const struct nfp_mip *mip);
+
+int nfp_rtsym_count(struct nfp_rtsym_table *rtbl);
+
+const struct nfp_rtsym *nfp_rtsym_get(struct nfp_rtsym_table *rtbl, int idx);
+
+const struct nfp_rtsym *
+nfp_rtsym_lookup(struct nfp_rtsym_table *rtbl, const char *name);
+
+uint64_t nfp_rtsym_read_le(struct nfp_rtsym_table *rtbl, const char *name,
+			   int *error);
+uint8_t *
+nfp_rtsym_map(struct nfp_rtsym_table *rtbl, const char *name,
+	      unsigned int min_size, struct nfp_cpp_area **area);
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_target.h b/drivers/net/nfp/nfpcore/nfp_target.h
new file mode 100644
index 0000000..2884a00
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_target.h
@@ -0,0 +1,579 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef NFP_TARGET_H
+#define NFP_TARGET_H
+
+#include "nfp-common/nfp_resid.h"
+#include "nfp-common/nfp_cppat.h"
+#include "nfp-common/nfp_platform.h"
+#include "nfp_cpp.h"
+
+#define P32 1
+#define P64 2
+
+#define PUSHPULL(_pull, _push) (((_pull) << 4) | ((_push) << 0))
+
+#ifndef NFP_ERRNO
+#include <errno.h>
+#define NFP_ERRNO(x)    (errno = (x), -1)
+#endif
+
+static inline int
+pushpull_width(int pp)
+{
+	pp &= 0xf;
+
+	if (pp == 0)
+		return NFP_ERRNO(EINVAL);
+	return (2 << pp);
+}
+
+#define PUSH_WIDTH(_pushpull)      pushpull_width((_pushpull) >> 0)
+#define PULL_WIDTH(_pushpull)      pushpull_width((_pushpull) >> 4)
+
+static inline int
+target_rw(uint32_t cpp_id, int pp, int start, int len)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < start || island > (start + len)))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 0):
+		return PUSHPULL(0, pp);
+	case NFP_CPP_ID(0, 1, 0):
+		return PUSHPULL(pp, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0):
+		return PUSHPULL(pp, pp);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_nbi_dma(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 0): /* ReadNbiDma */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 1, 0): /* WriteNbiDma */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0):
+		return PUSHPULL(P64, P64);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_nbi_stats(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 0): /* ReadNbiStats */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 1, 0): /* WriteNbiStats */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0):
+		return PUSHPULL(P64, P64);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_nbi_tm(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 0): /* ReadNbiTM */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 1, 0):  /* WriteNbiTM */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0):
+		return PUSHPULL(P64, P64);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_nbi_ppc(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 0): /* ReadNbiPreclassifier */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 1, 0): /* WriteNbiPreclassifier */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0):
+		return PUSHPULL(P64, P64);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_nbi(uint32_t cpp_id, uint64_t address)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+	uint64_t rel_addr = address & 0x3fFFFF;
+
+	if (island && (island < 8 || island > 9))
+		return NFP_ERRNO(EINVAL);
+
+	if (rel_addr < (1 << 20))
+		return nfp6000_nbi_dma(cpp_id);
+	if (rel_addr < (2 << 20))
+		return nfp6000_nbi_stats(cpp_id);
+	if (rel_addr < (3 << 20))
+		return nfp6000_nbi_tm(cpp_id);
+	return nfp6000_nbi_ppc(cpp_id);
+}
+
+/*
+ * This function ONLY handles accesses that can be done with a read or write of
+ * 32-bit or 64-bit words. All others are not listed.
+ */
+static inline int
+nfp6000_mu_common(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0): /* read_be/write_be */
+		return PUSHPULL(P64, P64);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 1): /* read_le/write_le */
+		return PUSHPULL(P64, P64);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 2): /* {read/write}_swap_be */
+		return PUSHPULL(P64, P64);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 3): /* {read/write}_swap_le */
+		return PUSHPULL(P64, P64);
+	case NFP_CPP_ID(0, 0, 0): /* read_be */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 0, 1): /* read_le */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 0, 2): /* read_swap_be */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 0, 3): /* read_swap_le */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 1, 0): /* write_be */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, 1, 1): /* write_le */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, 1, 2): /* write_swap_be */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, 1, 3): /* write_swap_le */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, 3, 0): /* atomic_read */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 3, 2): /* mask_compare_write */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 4, 0): /* atomic_write */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 4, 2): /* atomic_write_imm */
+		return PUSHPULL(0, 0);
+	case NFP_CPP_ID(0, 4, 3): /* swap_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 5, 0): /* set */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 5, 3): /* test_set_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 6, 0): /* clr */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 6, 3): /* test_clr_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 7, 0): /* add */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 7, 3): /* test_add_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 8, 0): /* addsat */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 8, 3): /* test_subsat_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 9, 0): /* sub */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 9, 3): /* test_sub_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 10, 0): /* subsat */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 10, 3): /* test_subsat_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 13, 0): /* microq128_get */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 13, 1): /* microq128_pop */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 13, 2): /* microq128_put */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 15, 0): /* xor */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 15, 3): /* test_xor_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 28, 0): /* read32_be */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 28, 1): /* read32_le */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 28, 2): /* read32_swap_be */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 28, 3): /* read32_swap_le */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 31, 0): /* write32_be */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 31, 1): /* write32_le */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 31, 2): /* write32_swap_be */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 31, 3): /* write32_swap_le */
+		return PUSHPULL(P32, 0);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_mu_ctm(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 16, 1): /* packet_read_packet_status */
+		return PUSHPULL(0, P32);
+	default:
+		return nfp6000_mu_common(cpp_id);
+	}
+}
+
+static inline int
+nfp6000_mu_emu(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 18, 0): /* read_queue */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 18, 1): /* read_queue_ring */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 18, 2): /* write_queue */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 18, 3): /* write_queue_ring */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 20, 2): /* journal */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 21, 0): /* get */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 21, 1): /* get_eop */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 21, 2): /* get_freely */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 22, 0): /* pop */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 22, 1): /* pop_eop */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 22, 2): /* pop_freely */
+		return PUSHPULL(0, P32);
+	default:
+		return nfp6000_mu_common(cpp_id);
+	}
+}
+
+static inline int
+nfp6000_mu_imu(uint32_t cpp_id)
+{
+	return nfp6000_mu_common(cpp_id);
+}
+
+static inline int
+nfp6000_mu(uint32_t cpp_id, uint64_t address)
+{
+	int pp;
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island == 0) {
+		if (address < 0x2000000000ULL)
+			pp = nfp6000_mu_ctm(cpp_id);
+		else if (address < 0x8000000000ULL)
+			pp = nfp6000_mu_emu(cpp_id);
+		else if (address < 0x9800000000ULL)
+			pp = nfp6000_mu_ctm(cpp_id);
+		else if (address < 0x9C00000000ULL)
+			pp = nfp6000_mu_emu(cpp_id);
+		else if (address < 0xA000000000ULL)
+			pp = nfp6000_mu_imu(cpp_id);
+		else
+			pp = nfp6000_mu_ctm(cpp_id);
+	} else if (island >= 24 && island <= 27) {
+		pp = nfp6000_mu_emu(cpp_id);
+	} else if (island >= 28 && island <= 31) {
+		pp = nfp6000_mu_imu(cpp_id);
+	} else if (island == 1 ||
+		   (island >= 4 && island <= 7) ||
+		   (island >= 12 && island <= 13) ||
+		   (island >= 32 && island <= 47) ||
+		   (island >= 48 && island <= 51)) {
+		pp = nfp6000_mu_ctm(cpp_id);
+	} else {
+		pp = NFP_ERRNO(EINVAL);
+	}
+
+	return pp;
+}
+
+static inline int
+nfp6000_ila(uint32_t cpp_id)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < 48 || island > 51))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 1): /* read_check_error */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 2, 0): /* read_int */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 3, 0): /* write_int */
+		return PUSHPULL(P32, 0);
+	default:
+		return target_rw(cpp_id, P32, 48, 4);
+	}
+}
+
+static inline int
+nfp6000_pci(uint32_t cpp_id)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < 4 || island > 7))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 2, 0):
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 3, 0):
+		return PUSHPULL(P32, 0);
+	default:
+		return target_rw(cpp_id, P32, 4, 4);
+	}
+}
+
+static inline int
+nfp6000_crypto(uint32_t cpp_id)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < 12 || island > 15))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 2, 0):
+		return PUSHPULL(P64, 0);
+	default:
+		return target_rw(cpp_id, P64, 12, 4);
+	}
+}
+
+static inline int
+nfp6000_cap_xpb(uint32_t cpp_id)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < 1 || island > 63))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 1): /* RingGet */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 0, 2): /* Interthread Signal */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 1, 1): /* RingPut */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 1, 2): /* CTNNWr */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 2, 0): /* ReflectRd, signal none */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 2, 1): /* ReflectRd, signal self */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 2, 2): /* ReflectRd, signal remote */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 2, 3): /* ReflectRd, signal both */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 3, 0): /* ReflectWr, signal none */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 3, 1): /* ReflectWr, signal self */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 3, 2): /* ReflectWr, signal remote */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 3, 3): /* ReflectWr, signal both */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 1):
+		return PUSHPULL(P32, P32);
+	default:
+		return target_rw(cpp_id, P32, 1, 63);
+	}
+}
+
+static inline int
+nfp6000_cls(uint32_t cpp_id)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < 1 || island > 63))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 3): /* xor */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 2, 0): /* set */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 2, 1): /* clr */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 4, 0): /* add */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 4, 1): /* add64 */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 6, 0): /* sub */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 6, 1): /* sub64 */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 6, 2): /* subsat */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 8, 2): /* hash_mask */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 8, 3): /* hash_clear */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 9, 0): /* ring_get */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 9, 1): /* ring_pop */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 9, 2): /* ring_get_freely */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 9, 3): /* ring_pop_freely */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 10, 0): /* ring_put */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 10, 2): /* ring_journal */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 14, 0): /* reflect_write_sig_local */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 15, 1):  /* reflect_read_sig_local */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 17, 2): /* statistic */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 24, 0): /* ring_read */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 24, 1): /* ring_write */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 25, 0): /* ring_workq_add_thread */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 25, 1): /* ring_workq_add_work */
+		return PUSHPULL(P32, 0);
+	default:
+		return target_rw(cpp_id, P32, 0, 64);
+	}
+}
+
+static inline int
+nfp6000_target_pushpull(uint32_t cpp_id, uint64_t address)
+{
+	switch (NFP_CPP_ID_TARGET_of(cpp_id)) {
+	case NFP6000_CPPTGT_NBI:
+		return nfp6000_nbi(cpp_id, address);
+	case NFP6000_CPPTGT_VQDR:
+		return target_rw(cpp_id, P32, 24, 4);
+	case NFP6000_CPPTGT_ILA:
+		return nfp6000_ila(cpp_id);
+	case NFP6000_CPPTGT_MU:
+		return nfp6000_mu(cpp_id, address);
+	case NFP6000_CPPTGT_PCIE:
+		return nfp6000_pci(cpp_id);
+	case NFP6000_CPPTGT_ARM:
+		if (address < 0x10000)
+			return target_rw(cpp_id, P64, 1, 1);
+		else
+			return target_rw(cpp_id, P32, 1, 1);
+	case NFP6000_CPPTGT_CRYPTO:
+		return nfp6000_crypto(cpp_id);
+	case NFP6000_CPPTGT_CTXPB:
+		return nfp6000_cap_xpb(cpp_id);
+	case NFP6000_CPPTGT_CLS:
+		return nfp6000_cls(cpp_id);
+	case 0:
+		return target_rw(cpp_id, P32, 4, 4);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp_target_pushpull_width(int pp, int write_not_read)
+{
+	if (pp < 0)
+		return pp;
+
+	if (write_not_read)
+		return PULL_WIDTH(pp);
+	else
+		return PUSH_WIDTH(pp);
+}
+
+static inline int
+nfp6000_target_action_width(uint32_t cpp_id, uint64_t address,
+			    int write_not_read)
+{
+	int pp;
+
+	pp = nfp6000_target_pushpull(cpp_id, address);
+
+	return nfp_target_pushpull_width(pp, write_not_read);
+}
+
+static inline int
+nfp_target_action_width(uint32_t model, uint32_t cpp_id, uint64_t address,
+			int write_not_read)
+{
+	if (NFP_CPP_MODEL_IS_6000(model)) {
+		return nfp6000_target_action_width(cpp_id, address,
+						   write_not_read);
+	} else {
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp_target_cpp(uint32_t cpp_island_id, uint64_t cpp_island_address,
+	       uint32_t *cpp_target_id, uint64_t *cpp_target_address,
+	       const uint32_t *imb_table)
+{
+	int err;
+	int island = NFP_CPP_ID_ISLAND_of(cpp_island_id);
+	int target = NFP_CPP_ID_TARGET_of(cpp_island_id);
+	uint32_t imb;
+
+	if (target < 0 || target >= 16)
+		return NFP_ERRNO(EINVAL);
+
+	if (island == 0) {
+		/* Already translated */
+		*cpp_target_id = cpp_island_id;
+		*cpp_target_address = cpp_island_address;
+		return 0;
+	}
+
+	if (!imb_table) {
+		/* CPP + Island only allowed on systems with IMB tables */
+		return NFP_ERRNO(EINVAL);
+	}
+
+	imb = imb_table[target];
+
+	*cpp_target_address = cpp_island_address;
+	err = _nfp6000_cppat_addr_encode(cpp_target_address, island, target,
+					 ((imb >> 13) & 7),
+					 ((imb >> 12) & 1),
+					 ((imb >> 6) & 0x3f),
+					 ((imb >> 0) & 0x3f));
+	if (err == 0) {
+		*cpp_target_id =
+		    NFP_CPP_ID(target, NFP_CPP_ID_ACTION_of(cpp_island_id),
+			       NFP_CPP_ID_TOKEN_of(cpp_island_id));
+	}
+
+	return err;
+}
+
+#endif /* NFP_TARGET_H */
-- 
1.9.1

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH 2/4] net/nfp: update PMD for using new CPP interface
    2018-04-05 14:28  1% ` [dpdk-dev] [PATCH 1/4] net/nfp: add NFP CPP support Alejandro Lucero
@ 2018-04-05 14:28  6% ` Alejandro Lucero
  1 sibling, 0 replies; 200+ results
From: Alejandro Lucero @ 2018-04-05 14:28 UTC (permalink / raw)
  To: dev

PF PMD support was based on the NSPU interface. This patch changes the
PMD to use the new CPP user space interface, which gives more
flexibility for adding new functionality.

This change only affects initialization; the datapath remains the
same as before.
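
As a rough sketch (illustrative only, not code from this patch), the new
PF probe path follows this shape; the helper names are the ones used in
the diff below, 'dev' stands for the rte_pci_device being probed, and
error handling plus the per-port loop are omitted:

    /* Simplified sketch of the new PF probe flow */
    struct nfp_cpp *cpp = nfp_cpp_from_device_name(dev->device.name);
    struct nfp_hwinfo *hwinfo = nfp_hwinfo_read(cpp);
    struct nfp_eth_table *nfp_eth_table = nfp_eth_read_ports(cpp);

    /* firmware upload through the NSP */
    nfp_fw_setup(dev, cpp, nfp_eth_table, hwinfo);

    struct nfp_rtsym_table *sym_tbl = nfp_rtsym_table_read(cpp);
    int err;
    int total_ports = nfp_rtsym_read_le(sym_tbl, "nfd_cfg_pf0_num_ports",
                                        &err);

    /* per-port init then maps the vNIC control BAR and the queue area:
     *   nfp_rtsym_map(sym_tbl, "_pf0_net_bar0", ...);
     *   nfp_cpp_map_area(cpp, 0, 0, NFP_PCIE_QUEUE(0),
     *                    NFP_QCP_QUEUE_AREA_SZ, ...);
     */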

Signed-off-by: Alejandro Lucero <alejandro.lucero@netronome.com>
---
 drivers/net/nfp/Makefile      |  17 ++-
 drivers/net/nfp/nfp_net.c     | 342 +++++++++++++++++++++++++++++-------------
 drivers/net/nfp/nfp_net_pmd.h |  16 +-
 3 files changed, 264 insertions(+), 111 deletions(-)

diff --git a/drivers/net/nfp/Makefile b/drivers/net/nfp/Makefile
index aa3b68a..ab4e0a7 100644
--- a/drivers/net/nfp/Makefile
+++ b/drivers/net/nfp/Makefile
@@ -20,11 +20,24 @@ EXPORT_MAP := rte_pmd_nfp_version.map
 
 LIBABIVER := 1
 
+VPATH += $(SRCDIR)/nfpcore
+
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_cppcore.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_cpp_pcie_ops.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_mutex.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_resource.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_crc.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_mip.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nffw.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_hwinfo.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_rtsym.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nsp.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nsp_cmds.c
+SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nsp_eth.c
+
 #
 # all source are stored in SRCS-y
 #
 SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_net.c
-SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nfpu.c
-SRCS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += nfp_nspu.c
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index 8591c7d..4eb032c 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2015 Netronome Systems, Inc.
+ * Copyright (c) 2014-2018 Netronome Systems, Inc.
  * All rights reserved.
  *
  * Small portions derived from code Copyright(c) 2010-2015 Intel Corporation.
@@ -55,7 +55,13 @@
 #include <rte_alarm.h>
 #include <rte_spinlock.h>
 
-#include "nfp_nfpu.h"
+#include "nfpcore/nfp_cpp.h"
+#include "nfpcore/nfp_nffw.h"
+#include "nfpcore/nfp_hwinfo.h"
+#include "nfpcore/nfp_mip.h"
+#include "nfpcore/nfp_rtsym.h"
+#include "nfpcore/nfp_nsp.h"
+
 #include "nfp_net_pmd.h"
 #include "nfp_net_logs.h"
 #include "nfp_net_ctrl.h"
@@ -104,12 +110,8 @@ static int nfp_net_rss_reta_write(struct rte_eth_dev *dev,
 static int nfp_net_rss_hash_write(struct rte_eth_dev *dev,
 			struct rte_eth_rss_conf *rss_conf);
 
-/*
- * The offset of the queue controller queues in the PCIe Target. These
- * happen to be at the same offset on the NFP6000 and the NFP3200 so
- * we use a single macro here.
- */
-#define NFP_PCIE_QUEUE(_q)	(0x800 * ((_q) & 0xff))
+/* The offset of the queue controller queues in the PCIe Target */
+#define NFP_PCIE_QUEUE(_q) (0x80000 + (NFP_QCP_QUEUE_ADDR_SZ * ((_q) & 0xff)))
 
 /* Maximum value which can be added to a queue with one transaction */
 #define NFP_QCP_MAX_ADD	0x7f
@@ -574,47 +576,29 @@ enum nfp_qcp_ptr {
 #define ETH_ADDR_LEN	6
 
 static void
-nfp_eth_copy_mac_reverse(uint8_t *dst, const uint8_t *src)
+nfp_eth_copy_mac(uint8_t *dst, const uint8_t *src)
 {
 	int i;
 
 	for (i = 0; i < ETH_ADDR_LEN; i++)
-		dst[ETH_ADDR_LEN - i - 1] = src[i];
+		dst[i] = src[i];
 }
 
 static int
 nfp_net_pf_read_mac(struct nfp_net_hw *hw, int port)
 {
-	union eth_table_entry *entry;
-	int idx, i;
-
-	idx = port;
-	entry = hw->eth_table;
-
-	/* Reading NFP ethernet table obtained before */
-	for (i = 0; i < NSP_ETH_MAX_COUNT; i++) {
-		if (!(entry->port & NSP_ETH_PORT_LANES_MASK)) {
-			/* port not in use */
-			entry++;
-			continue;
-		}
-		if (idx == 0)
-			break;
-		idx--;
-		entry++;
-	}
-
-	if (i == NSP_ETH_MAX_COUNT)
-		return -EINVAL;
+	struct nfp_eth_table *nfp_eth_table;
 
+	nfp_eth_table = nfp_eth_read_ports(hw->cpp);
 	/*
 	 * hw points to port0 private data. We need hw now pointing to
 	 * right port.
 	 */
 	hw += port;
-	nfp_eth_copy_mac_reverse((uint8_t *)&hw->mac_addr,
-				 (uint8_t *)&entry->mac_addr);
+	nfp_eth_copy_mac((uint8_t *)&hw->mac_addr,
+			 (uint8_t *)&nfp_eth_table->ports[port].mac_addr);
 
+	free(nfp_eth_table);
 	return 0;
 }
 
@@ -780,7 +764,7 @@ enum nfp_qcp_ptr {
 
 	if (hw->is_pf)
 		/* Configure the physical port up */
-		nfp_nsp_eth_config(hw->nspu_desc, hw->pf_port_idx, 1);
+		nfp_eth_set_configured(hw->cpp, hw->pf_port_idx, 1);
 
 	hw->ctrl = new_ctrl;
 
@@ -831,7 +815,7 @@ enum nfp_qcp_ptr {
 
 	if (hw->is_pf)
 		/* Configure the physical port down */
-		nfp_nsp_eth_config(hw->nspu_desc, hw->pf_port_idx, 0);
+		nfp_eth_set_configured(hw->cpp, hw->pf_port_idx, 0);
 }
 
 /* Reset and stop device. The device can not be restarted. */
@@ -2678,10 +2662,8 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	uint64_t tx_bar_off = 0, rx_bar_off = 0;
 	uint32_t start_q;
 	int stride = 4;
-
-	nspu_desc_t *nspu_desc = NULL;
-	uint64_t bar_offset;
 	int port = 0;
+	int err;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -2702,7 +2684,6 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 
 		/* This points to the specific port private data */
 		hw = &hwport0[port];
-		hw->pf_port_idx = port;
 	} else {
 		hw = NFP_NET_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
 		hwport0 = 0;
@@ -2736,19 +2717,14 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	}
 
 	if (hw->is_pf && port == 0) {
-		nspu_desc = hw->nspu_desc;
-
-		if (nfp_nsp_map_ctrl_bar(nspu_desc, &bar_offset) != 0) {
-			/*
-			 * A firmware should be there after PF probe so this
-			 * should not happen.
-			 */
-			RTE_LOG(ERR, PMD, "PF BAR symbol resolution failed\n");
-			return -ENODEV;
+		hw->ctrl_bar = nfp_rtsym_map(hw->sym_tbl, "_pf0_net_bar0",
+					     hw->total_ports * 32768,
+					     &hw->ctrl_area);
+		if (!hw->ctrl_bar) {
+			printf("nfp_rtsym_map fails for _pf0_net_bar0\n");
+			return -EIO;
 		}
 
-		/* vNIC PF control BAR is a subset of PF PCI device BAR */
-		hw->ctrl_bar += bar_offset;
 		PMD_INIT_LOG(DEBUG, "ctrl bar: %p\n", hw->ctrl_bar);
 	}
 
@@ -2772,13 +2748,14 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	case PCI_DEVICE_ID_NFP6000_PF_NIC:
 	case PCI_DEVICE_ID_NFP6000_VF_NIC:
 		start_q = nn_cfg_readl(hw, NFP_NET_CFG_START_TXQ);
-		tx_bar_off = NFP_PCIE_QUEUE(start_q);
+		tx_bar_off = start_q * NFP_QCP_QUEUE_ADDR_SZ;
 		start_q = nn_cfg_readl(hw, NFP_NET_CFG_START_RXQ);
-		rx_bar_off = NFP_PCIE_QUEUE(start_q);
+		rx_bar_off = start_q * NFP_QCP_QUEUE_ADDR_SZ;
 		break;
 	default:
 		RTE_LOG(ERR, PMD, "nfp_net: no device ID matching\n");
-		return -ENODEV;
+		err = -ENODEV;
+		goto dev_err_ctrl_map;
 	}
 
 	PMD_INIT_LOG(DEBUG, "tx_bar_off: 0x%" PRIx64 "\n", tx_bar_off);
@@ -2786,17 +2763,19 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 
 	if (hw->is_pf && port == 0) {
 		/* configure access to tx/rx vNIC BARs */
-		nfp_nsp_map_queues_bar(nspu_desc, &bar_offset);
-		PMD_INIT_LOG(DEBUG, "tx/rx bar_offset: %" PRIx64 "\n",
-				    bar_offset);
-		hwport0->hw_queues = (uint8_t *)pci_dev->mem_resource[0].addr;
-
-		/* vNIC PF tx/rx BARs are a subset of PF PCI device */
-		hwport0->hw_queues += bar_offset;
+		hwport0->hw_queues = nfp_cpp_map_area(hw->cpp, 0, 0,
+						      NFP_PCIE_QUEUE(0),
+						      NFP_QCP_QUEUE_AREA_SZ,
+						      &hw->hwqueues_area);
+
+		if (!hwport0->hw_queues) {
+			printf("nfp_rtsym_map fails for net.qc\n");
+			err = -EIO;
+			goto dev_err_ctrl_map;
+		}
 
-		/* Lets seize the chance to read eth table from hw */
-		if (nfp_nsp_eth_read_table(nspu_desc, &hw->eth_table))
-			return -ENODEV;
+		PMD_INIT_LOG(DEBUG, "tx/rx bar address: 0x%p\n",
+				    hwport0->hw_queues);
 	}
 
 	if (hw->is_pf) {
@@ -2856,7 +2835,8 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	eth_dev->data->mac_addrs = rte_zmalloc("mac_addr", ETHER_ADDR_LEN, 0);
 	if (eth_dev->data->mac_addrs == NULL) {
 		PMD_INIT_LOG(ERR, "Failed to space for MAC address");
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto dev_err_queues_map;
 	}
 
 	if (hw->is_pf) {
@@ -2867,6 +2847,8 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	}
 
 	if (!is_valid_assigned_ether_addr((struct ether_addr *)&hw->mac_addr)) {
+		PMD_INIT_LOG(INFO, "Using random mac address for port %d\n",
+				   port);
 		/* Using random mac addresses for VFs */
 		eth_random_addr(&hw->mac_addr[0]);
 		nfp_net_write_mac(hw, (uint8_t *)&hw->mac_addr);
@@ -2895,11 +2877,19 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	nfp_net_stats_reset(eth_dev);
 
 	return 0;
+
+dev_err_queues_map:
+		nfp_cpp_area_free(hw->hwqueues_area);
+dev_err_ctrl_map:
+		nfp_cpp_area_free(hw->ctrl_area);
+
+	return err;
 }
 
 static int
 nfp_pf_create_dev(struct rte_pci_device *dev, int port, int ports,
-		  nfpu_desc_t *nfpu_desc, void **priv)
+		  struct nfp_cpp *cpp, struct nfp_hwinfo *hwinfo,
+		  int phys_port, struct nfp_rtsym_table *sym_tbl, void **priv)
 {
 	struct rte_eth_dev *eth_dev;
 	struct nfp_net_hw *hw;
@@ -2937,12 +2927,16 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	 * Then dev_private is adjusted per port.
 	 */
 	hw = (struct nfp_net_hw *)(eth_dev->data->dev_private) + port;
-	hw->nspu_desc = nfpu_desc->nspu;
-	hw->nfpu_desc = nfpu_desc;
+	hw->cpp = cpp;
+	hw->hwinfo = hwinfo;
+	hw->sym_tbl = sym_tbl;
+	hw->pf_port_idx = phys_port;
 	hw->is_pf = 1;
 	if (ports > 1)
 		hw->pf_multiport_enabled = 1;
 
+	hw->total_ports = ports;
+
 	eth_dev->device = &dev->device;
 	rte_eth_copy_pci_info(eth_dev, dev);
 
@@ -2956,55 +2950,191 @@ uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 	return ret;
 }
 
+#define DEFAULT_FW_PATH       "/lib/firmware/netronome"
+
+static int
+nfp_fw_upload(struct rte_pci_device *dev, struct nfp_nsp *nsp, char *card)
+{
+	struct nfp_cpp *cpp = nsp->cpp;
+	int fw_f;
+	char *fw_buf;
+	char fw_name[100];
+	char serial[100];
+	struct stat file_stat;
+	off_t fsize, bytes;
+
+	/* Looking for firmware file in order of priority */
+
+	/* First try to find a firmware image specific for this device */
+	sprintf(serial, "serial-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x",
+		cpp->serial[0], cpp->serial[1], cpp->serial[2], cpp->serial[3],
+		cpp->serial[4], cpp->serial[5], cpp->interface >> 8,
+		cpp->interface & 0xff);
+
+	sprintf(fw_name, "%s/%s.nffw", DEFAULT_FW_PATH, serial);
+
+	RTE_LOG(DEBUG, PMD, "Trying with fw file: %s\n", fw_name);
+	fw_f = open(fw_name, O_RDONLY);
+	if (fw_f > 0)
+		goto read_fw;
+
+	/* Then try the PCI name */
+	sprintf(fw_name, "%s/pci-%s.nffw", DEFAULT_FW_PATH, dev->device.name);
+
+	RTE_LOG(DEBUG, PMD, "Trying with fw file: %s\n", fw_name);
+	fw_f = open(fw_name, O_RDONLY);
+	if (fw_f > 0)
+		goto read_fw;
+
+	/* Finally try the card type and media */
+	sprintf(fw_name, "%s/%s", DEFAULT_FW_PATH, card);
+	RTE_LOG(DEBUG, PMD, "Trying with fw file: %s\n", fw_name);
+	fw_f = open(fw_name, O_RDONLY);
+	if (fw_f < 0) {
+		RTE_LOG(INFO, PMD, "Firmware file %s not found.", fw_name);
+		return -ENOENT;
+	}
+
+read_fw:
+	if (fstat(fw_f, &file_stat) < 0) {
+		RTE_LOG(INFO, PMD, "Firmware file %s size is unknown", fw_name);
+		close(fw_f);
+		return -ENOENT;
+	}
+
+	fsize = file_stat.st_size;
+	RTE_LOG(INFO, PMD, "Firmware file found at %s with size: %" PRIu64 "\n",
+			    fw_name, (uint64_t)fsize);
+
+	fw_buf = malloc((size_t)fsize);
+	if (!fw_buf) {
+		RTE_LOG(INFO, PMD, "malloc failed for fw buffer");
+		close(fw_f);
+		return -ENOMEM;
+	}
+	memset(fw_buf, 0, fsize);
+
+	bytes = read(fw_f, fw_buf, fsize);
+	if (bytes != fsize) {
+		RTE_LOG(INFO, PMD, "Reading fw to buffer failed.\n"
+				   "Just %" PRIu64 " of %" PRIu64 " bytes read",
+				   (uint64_t)bytes, (uint64_t)fsize);
+		free(fw_buf);
+		close(fw_f);
+		return -EIO;
+	}
+
+	RTE_LOG(INFO, PMD, "Uploading the firmware ...");
+	nfp_nsp_load_fw(nsp, fw_buf, bytes);
+	RTE_LOG(INFO, PMD, "Done");
+
+	free(fw_buf);
+	close(fw_f);
+
+	return 0;
+}
+
+static int
+nfp_fw_setup(struct rte_pci_device *dev, struct nfp_cpp *cpp,
+	     struct nfp_eth_table *nfp_eth_table, struct nfp_hwinfo *hwinfo)
+{
+	struct nfp_nsp *nsp;
+	const char *nfp_fw_model;
+	char card_desc[100];
+	int err = 0;
+
+	nfp_fw_model = nfp_hwinfo_lookup(hwinfo, "assembly.partno");
+
+	if (nfp_fw_model) {
+		RTE_LOG(INFO, PMD, "firmware model found: %s\n", nfp_fw_model);
+	} else {
+		RTE_LOG(ERR, PMD, "firmware model NOT found\n");
+		return -EIO;
+	}
+
+	if (nfp_eth_table->count == 0 || nfp_eth_table->count > 8) {
+		RTE_LOG(ERR, PMD, "NFP ethernet table reports wrong ports: %u\n",
+		       nfp_eth_table->count);
+		return -EIO;
+	}
+
+	RTE_LOG(INFO, PMD, "NFP ethernet port table reports %u ports\n",
+			   nfp_eth_table->count);
+
+	RTE_LOG(INFO, PMD, "Port speed: %u\n", nfp_eth_table->ports[0].speed);
+
+	sprintf(card_desc, "nic_%s_%dx%d.nffw", nfp_fw_model,
+		nfp_eth_table->count, nfp_eth_table->ports[0].speed / 1000);
+
+	nsp = nfp_nsp_open(cpp);
+	if (!nsp) {
+		RTE_LOG(ERR, PMD, "NFP error when obtaining NSP handle\n");
+		return -EIO;
+	}
+
+	nfp_nsp_device_soft_reset(nsp);
+	err = nfp_fw_upload(dev, nsp, card_desc);
+
+	nfp_nsp_close(nsp);
+	return err;
+}
+
 static int nfp_pf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			    struct rte_pci_device *dev)
 {
-	nfpu_desc_t *nfpu_desc;
-	nspu_desc_t *nspu_desc;
-	uint64_t offset_symbol;
-	uint8_t *bar_offset;
-	int major, minor;
+	struct nfp_cpp *cpp;
+	struct nfp_hwinfo *hwinfo;
+	struct nfp_rtsym_table *sym_tbl;
+	struct nfp_eth_table *nfp_eth_table = NULL;
 	int total_ports;
 	void *priv = 0;
 	int ret = -ENODEV;
+	int err;
 	int i;
 
 	if (!dev)
 		return ret;
 
-	nfpu_desc = rte_malloc("nfp nfpu", sizeof(nfpu_desc_t), 0);
-	if (!nfpu_desc)
-		return -ENOMEM;
-
-	if (nfpu_open(dev, nfpu_desc, 0) < 0) {
-		RTE_LOG(ERR, PMD,
-			"nfpu_open failed\n");
-		goto nfpu_error;
+	cpp = nfp_cpp_from_device_name(dev->device.name);
+	if (!cpp) {
+		RTE_LOG(ERR, PMD, "A CPP handle can not be obtained");
+		ret = -EIO;
+		goto error;
 	}
 
-	nspu_desc = nfpu_desc->nspu;
+	hwinfo = nfp_hwinfo_read(cpp);
+	if (!hwinfo) {
+		RTE_LOG(ERR, PMD, "Error reading hwinfo table");
+		return -EIO;
+	}
 
+	nfp_eth_table = nfp_eth_read_ports(cpp);
+	if (!nfp_eth_table) {
+		RTE_LOG(ERR, PMD, "Error reading NFP ethernet table\n");
+		return -EIO;
+	}
 
-	/* Check NSP ABI version */
-	if (nfp_nsp_get_abi_version(nspu_desc, &major, &minor) < 0) {
-		RTE_LOG(INFO, PMD, "NFP NSP not present\n");
+	if (nfp_fw_setup(dev, cpp, nfp_eth_table, hwinfo)) {
+		RTE_LOG(INFO, PMD, "Error when uploading firmware\n");
+		ret = -EIO;
 		goto error;
 	}
-	PMD_INIT_LOG(INFO, "nspu ABI version: %d.%d\n", major, minor);
 
-	if ((major == 0) && (minor < 20)) {
-		RTE_LOG(INFO, PMD, "NFP NSP ABI version too old. Required 0.20 or higher\n");
+	/* Now the symbol table should be there */
+	sym_tbl = nfp_rtsym_table_read(cpp);
+	if (!sym_tbl) {
+		RTE_LOG(ERR, PMD, "Something is wrong with the firmware"
+				" symbol table");
+		ret = -EIO;
 		goto error;
 	}
 
-	ret = nfp_nsp_fw_setup(nspu_desc, "nfd_cfg_pf0_num_ports",
-			       &offset_symbol);
-	if (ret)
+	total_ports = nfp_rtsym_read_le(sym_tbl, "nfd_cfg_pf0_num_ports", &err);
+	if (total_ports != (int)nfp_eth_table->count) {
+		RTE_LOG(ERR, PMD, "Inconsistent number of ports\n");
+		ret = -EIO;
 		goto error;
-
-	bar_offset = (uint8_t *)dev->mem_resource[0].addr;
-	bar_offset += offset_symbol;
-	total_ports = (uint32_t)*bar_offset;
+	}
 	PMD_INIT_LOG(INFO, "Total pf ports: %d\n", total_ports);
 
 	if (total_ports <= 0 || total_ports > 8) {
@@ -3014,18 +3144,15 @@ static int nfp_pf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	}
 
 	for (i = 0; i < total_ports; i++) {
-		ret = nfp_pf_create_dev(dev, i, total_ports, nfpu_desc, &priv);
+		ret = nfp_pf_create_dev(dev, i, total_ports, cpp, hwinfo,
+					nfp_eth_table->ports[i].index,
+					sym_tbl, &priv);
 		if (ret)
-			goto error;
+			break;
 	}
 
-	return 0;
-
 error:
-	nfpu_close(nfpu_desc);
-nfpu_error:
-	rte_free(nfpu_desc);
-
+	free(nfp_eth_table);
 	return ret;
 }
 
@@ -3073,8 +3200,19 @@ static int eth_nfp_pci_remove(struct rte_pci_device *pci_dev)
 	if ((pci_dev->id.device_id == PCI_DEVICE_ID_NFP4000_PF_NIC) ||
 	    (pci_dev->id.device_id == PCI_DEVICE_ID_NFP6000_PF_NIC)) {
 		port = get_pf_port_number(eth_dev->data->name);
+		/*
+		 * hotplug is not possible with multiport PF although freeing
+		 * data structures can be done for first port.
+		 */
+		if (port != 0)
+			return -ENOTSUP;
 		hwport0 = NFP_NET_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
 		hw = &hwport0[port];
+		nfp_cpp_area_free(hw->ctrl_area);
+		nfp_cpp_area_free(hw->hwqueues_area);
+		free(hw->hwinfo);
+		free(hw->sym_tbl);
+		nfp_cpp_free(hw->cpp);
 	} else {
 		hw = NFP_NET_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
 	}
diff --git a/drivers/net/nfp/nfp_net_pmd.h b/drivers/net/nfp/nfp_net_pmd.h
index 1ae0ea6..097c871 100644
--- a/drivers/net/nfp/nfp_net_pmd.h
+++ b/drivers/net/nfp/nfp_net_pmd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2015 Netronome Systems, Inc.
+ * Copyright (c) 2014-2018 Netronome Systems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -63,6 +63,7 @@
 #define NFP_NET_CRTL_BAR        0
 #define NFP_NET_TX_BAR          2
 #define NFP_NET_RX_BAR          2
+#define NFP_QCP_QUEUE_AREA_SZ			0x80000
 
 /* Macros for accessing the Queue Controller Peripheral 'CSRs' */
 #define NFP_QCP_QUEUE_OFF(_x)                 ((_x) * 0x800)
@@ -430,20 +431,21 @@ struct nfp_net_hw {
 	/* Records starting point for counters */
 	struct rte_eth_stats eth_stats_base;
 
-#ifdef NFP_NET_LIBNFP
 	struct nfp_cpp *cpp;
 	struct nfp_cpp_area *ctrl_area;
-	struct nfp_cpp_area *tx_area;
-	struct nfp_cpp_area *rx_area;
+	struct nfp_cpp_area *hwqueues_area;
 	struct nfp_cpp_area *msix_area;
-#endif
+
 	uint8_t *hw_queues;
 	uint8_t is_pf;
 	uint8_t pf_port_idx;
 	uint8_t pf_multiport_enabled;
+	uint8_t total_ports;
+
 	union eth_table_entry *eth_table;
-	nspu_desc_t *nspu_desc;
-	nfpu_desc_t *nfpu_desc;
+
+	struct nfp_hwinfo *hwinfo;
+	struct nfp_rtsym_table *sym_tbl;
 };
 
 struct nfp_net_adapter {
-- 
1.9.1

^ permalink raw reply	[relevance 6%]

* [dpdk-dev] [PATCH 1/4] net/nfp: add NFP CPP support
  @ 2018-04-05 14:28  1% ` Alejandro Lucero
  2018-04-05 14:28  6% ` [dpdk-dev] [PATCH 2/4] net/nfp: update PMD for using new CPP interface Alejandro Lucero
  1 sibling, 0 replies; 200+ results
From: Alejandro Lucero @ 2018-04-05 14:28 UTC (permalink / raw)
  To: dev

CPP refers to the internal NFP Command Push Pull bus. This patch allows
CPP commands to be created from user space, giving access to any single
part of the chip.

This CPP interface is the base for other functionality such as
mutexes for accessing specific chip components, chip resource management,
firmware upload, and use of the NSP, an embedded ARM processor which can
perform tasks on demand.

NSP was previously the only way for the PMD to interact with the chip,
using an NSPU interface for commands like firmware upload or
port link configuration. The CPP interface supersedes NSPU, but it is
still possible to use the NSP through CPP.

The CPP interface adds great flexibility for features such as extended
stats or firmware debugging.

Signed-off-by: Alejandro Lucero <alejandro.lucero@netronome.com>
---
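For illustration only (not part of this patch set), below is a minimal
sketch of how a CPP client could use this interface to read a 64-bit
word from the chip memory unit. It relies solely on functions and macros
added by this patch (nfp_cpp_from_device_name(), NFP_CPP_ID(),
nfp_cpp_read(), nfp_cpp_free()); the device name, the address and the
include paths are placeholder assumptions.

/*
 * Illustrative sketch, not from this patch: assumes the nfpcore/
 * directory is on the include path and that "devname" identifies a
 * valid NFP CPP device.
 */
#include <stdio.h>
#include <inttypes.h>
#include "nfp_cpp.h"
#include "nfp6000/nfp6000.h"

static int
read_mu_word(const char *devname, unsigned long long addr)
{
	struct nfp_cpp *cpp;
	uint32_t cpp_id;
	uint64_t value;
	int ret;

	/* Open a CPP handle on the given device */
	cpp = nfp_cpp_from_device_name(devname);
	if (!cpp)
		return -1;

	/* Target the memory unit with the read/write wildcard action */
	cpp_id = NFP_CPP_ID(NFP_CPP_TARGET_MU, NFP_CPP_ACTION_RW, 0);

	/* nfp_cpp_read() returns the number of bytes read, or -1 */
	ret = nfp_cpp_read(cpp, cpp_id, addr, &value, sizeof(value));
	if (ret == (int)sizeof(value))
		printf("0x%" PRIx64 "\n", value);

	nfp_cpp_free(cpp);
	return ret == (int)sizeof(value) ? 0 : -1;
}

A short read is treated as an error above, since nfp_cpp_read() reports
the number of bytes actually transferred.
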
 drivers/net/nfp/nfpcore/nfp-common/nfp_cppat.h    | 722 +++++++++++++++++
 drivers/net/nfp/nfpcore/nfp-common/nfp_platform.h |  36 +
 drivers/net/nfp/nfpcore/nfp-common/nfp_resid.h    | 592 ++++++++++++++
 drivers/net/nfp/nfpcore/nfp6000/nfp6000.h         |  40 +
 drivers/net/nfp/nfpcore/nfp6000/nfp_xpb.h         |  26 +
 drivers/net/nfp/nfpcore/nfp_cpp.h                 | 776 ++++++++++++++++++
 drivers/net/nfp/nfpcore/nfp_cpp_pcie_ops.c        | 936 ++++++++++++++++++++++
 drivers/net/nfp/nfpcore/nfp_cppcore.c             | 856 ++++++++++++++++++++
 drivers/net/nfp/nfpcore/nfp_crc.c                 |  49 ++
 drivers/net/nfp/nfpcore/nfp_crc.h                 |  19 +
 drivers/net/nfp/nfpcore/nfp_hwinfo.c              | 199 +++++
 drivers/net/nfp/nfpcore/nfp_hwinfo.h              |  85 ++
 drivers/net/nfp/nfpcore/nfp_mip.c                 | 154 ++++
 drivers/net/nfp/nfpcore/nfp_mip.h                 |  21 +
 drivers/net/nfp/nfpcore/nfp_mutex.c               | 424 ++++++++++
 drivers/net/nfp/nfpcore/nfp_nffw.c                | 235 ++++++
 drivers/net/nfp/nfpcore/nfp_nffw.h                |  86 ++
 drivers/net/nfp/nfpcore/nfp_nsp.c                 | 427 ++++++++++
 drivers/net/nfp/nfpcore/nfp_nsp.h                 | 304 +++++++
 drivers/net/nfp/nfpcore/nfp_nsp_cmds.c            | 109 +++
 drivers/net/nfp/nfpcore/nfp_nsp_eth.c             | 665 +++++++++++++++
 drivers/net/nfp/nfpcore/nfp_resource.c            | 264 ++++++
 drivers/net/nfp/nfpcore/nfp_resource.h            |  52 ++
 drivers/net/nfp/nfpcore/nfp_rtsym.c               | 327 ++++++++
 drivers/net/nfp/nfpcore/nfp_rtsym.h               |  61 ++
 drivers/net/nfp/nfpcore/nfp_target.h              | 579 +++++++++++++
 26 files changed, 8044 insertions(+)
 create mode 100644 drivers/net/nfp/nfpcore/nfp-common/nfp_cppat.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp-common/nfp_platform.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp-common/nfp_resid.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp6000/nfp6000.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp6000/nfp_xpb.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_cpp.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_cpp_pcie_ops.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_cppcore.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_crc.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_crc.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_hwinfo.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_hwinfo.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_mip.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_mip.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_mutex.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nffw.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nffw.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nsp.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nsp.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nsp_cmds.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_nsp_eth.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_resource.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_resource.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_rtsym.c
 create mode 100644 drivers/net/nfp/nfpcore/nfp_rtsym.h
 create mode 100644 drivers/net/nfp/nfpcore/nfp_target.h

diff --git a/drivers/net/nfp/nfpcore/nfp-common/nfp_cppat.h b/drivers/net/nfp/nfpcore/nfp-common/nfp_cppat.h
new file mode 100644
index 0000000..6e380cc
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp-common/nfp_cppat.h
@@ -0,0 +1,722 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_CPPAT_H__
+#define __NFP_CPPAT_H__
+
+#include "nfp_platform.h"
+#include "nfp_resid.h"
+
+/* This file contains helpers for creating CPP commands
+ *
+ * All magic NFP-6xxx IMB 'mode' numbers here are from:
+ * Databook (1 August 2013)
+ * - System Overview and Connectivity
+ * -- Internal Connectivity
+ * --- Distributed Switch Fabric - Command Push/Pull (DSF-CPP) Bus
+ * ---- CPP addressing
+ * ----- Table 3.6. CPP Address Translation Mode Commands
+ */
+
+#define _NIC_NFP6000_MU_LOCALITY_DIRECT 2
+
+static inline int
+_nfp6000_decode_basic(uint64_t addr, int *dest_island, int cpp_tgt, int mode,
+		      int addr40, int isld1, int isld0);
+
+static uint64_t
+_nic_mask64(int msb, int lsb, int at0)
+{
+	uint64_t v;
+	int w = msb - lsb + 1;
+
+	if (w == 64)
+		return ~(uint64_t)0;
+
+	if ((lsb + w) > 64)
+		return 0;
+
+	v = (UINT64_C(1) << w) - 1;
+
+	if (at0)
+		return v;
+
+	return v << lsb;
+}
+
+/* For VQDR, we may not modify the Channel bits, which might overlap
+ * with the Index bit. When it does, we need to ensure that isld0 == isld1.
+ */
+static inline int
+_nfp6000_encode_basic(uint64_t *addr, int dest_island, int cpp_tgt, int mode,
+		      int addr40, int isld1, int isld0)
+{
+	uint64_t _u64;
+	int iid_lsb, idx_lsb;
+	int i, v = 0;
+	int isld[2];
+
+	isld[0] = isld0;
+	isld[1] = isld1;
+
+	switch (cpp_tgt) {
+	case NFP6000_CPPTGT_MU:
+		/* This function doesn't handle MU */
+		return NFP_ERRNO(EINVAL);
+	case NFP6000_CPPTGT_CTXPB:
+		/* This function doesn't handle CTXPB */
+		return NFP_ERRNO(EINVAL);
+	default:
+		break;
+	}
+
+	switch (mode) {
+	case 0:
+		if (cpp_tgt == NFP6000_CPPTGT_VQDR && !addr40) {
+			/*
+			 * In this specific mode we'd rather not modify the
+			 * address but we can verify if the existing contents
+			 * will point to a valid island.
+			 */
+			i = _nfp6000_decode_basic(*addr, &v, cpp_tgt, mode,
+						  addr40, isld1,
+						  isld0);
+			if (i != 0)
+				/* Full Island ID and channel bits overlap */
+				return i;
+
+			/*
+			 * If dest_island is invalid, the current address won't
+			 * go where expected.
+			 */
+			if (dest_island != -1 && dest_island != v)
+				return NFP_ERRNO(EINVAL);
+
+			/* If dest_island was -1, we don't care */
+			return 0;
+		}
+
+		iid_lsb = (addr40) ? 34 : 26;
+
+		/* <39:34> or <31:26> */
+		_u64 = _nic_mask64((iid_lsb + 5), iid_lsb, 0);
+		*addr &= ~_u64;
+		*addr |= (((uint64_t)dest_island) << iid_lsb) & _u64;
+		return 0;
+	case 1:
+		if (cpp_tgt == NFP6000_CPPTGT_VQDR && !addr40) {
+			i = _nfp6000_decode_basic(*addr, &v, cpp_tgt, mode,
+						  addr40, isld1, isld0);
+			if (i != 0)
+				/* Full Island ID and channel bits overlap */
+				return i;
+
+			/*
+			 * If dest_island is invalid, the current address won't
+			 * go where expected.
+			 */
+			if (dest_island != -1 && dest_island != v)
+				return NFP_ERRNO(EINVAL);
+
+			/* If dest_island was -1, we don't care */
+			return 0;
+		}
+
+		idx_lsb = (addr40) ? 39 : 31;
+		if (dest_island == isld0) {
+			/* Only need to clear the Index bit */
+			*addr &= ~_nic_mask64(idx_lsb, idx_lsb, 0);
+			return 0;
+		}
+
+		if (dest_island == isld1) {
+			/* Only need to set the Index bit */
+			*addr |= (UINT64_C(1) << idx_lsb);
+			return 0;
+		}
+
+		return NFP_ERRNO(ENODEV);
+	case 2:
+		if (cpp_tgt == NFP6000_CPPTGT_VQDR && !addr40) {
+			/* iid<0> = addr<30> = channel<0> */
+			/* channel<1> = addr<31> = Index */
+
+			/*
+			 * Special case where we allow channel bits to be set
+			 * before hand and with them select an island.
+			 * So we need to confirm that it's at least plausible.
+			 */
+			i = _nfp6000_decode_basic(*addr, &v, cpp_tgt, mode,
+						  addr40, isld1, isld0);
+			if (i != 0)
+				/* Full Island ID and channel bits overlap */
+				return i;
+
+			/*
+			 * If dest_island is invalid, the current address won't
+			 * go where expected.
+			 */
+			if (dest_island != -1 && dest_island != v)
+				return NFP_ERRNO(EINVAL);
+
+			/* If dest_island was -1, we don't care */
+			return 0;
+		}
+
+		/*
+		 * Make sure we compare against isldN values by clearing the
+		 * LSB. This is what the silicon does.
+		 */
+		isld[0] &= ~1;
+		isld[1] &= ~1;
+
+		idx_lsb = (addr40) ? 39 : 31;
+		iid_lsb = idx_lsb - 1;
+
+		/*
+		 * Try each option, take first one that fits. Not sure if we
+		 * would want to do some smarter searching and prefer 0 or non-0
+		 * island IDs.
+		 */
+
+		for (i = 0; i < 2; i++) {
+			for (v = 0; v < 2; v++) {
+				if (dest_island != (isld[i] | v))
+					continue;
+				*addr &= ~_nic_mask64(idx_lsb, iid_lsb, 0);
+				*addr |= (((uint64_t)i) << idx_lsb);
+				*addr |= (((uint64_t)v) << iid_lsb);
+				return 0;
+			}
+		}
+
+		return NFP_ERRNO(ENODEV);
+	case 3:
+		if (cpp_tgt == NFP6000_CPPTGT_VQDR && !addr40) {
+			/*
+			 * iid<0> = addr<29> = data
+			 * iid<1> = addr<30> = channel<0>
+			 * channel<1> = addr<31> = Index
+			 */
+			i = _nfp6000_decode_basic(*addr, &v, cpp_tgt, mode,
+						  addr40, isld1, isld0);
+			if (i != 0)
+				/* Full Island ID and channel bits overlap */
+				return i;
+
+			if (dest_island != -1 && dest_island != v)
+				return NFP_ERRNO(EINVAL);
+
+			/* If dest_island was -1, we don't care */
+			return 0;
+		}
+
+		isld[0] &= ~3;
+		isld[1] &= ~3;
+
+		idx_lsb = (addr40) ? 39 : 31;
+		iid_lsb = idx_lsb - 2;
+
+		for (i = 0; i < 2; i++) {
+			for (v = 0; v < 4; v++) {
+				if (dest_island != (isld[i] | v))
+					continue;
+				*addr &= ~_nic_mask64(idx_lsb, iid_lsb, 0);
+				*addr |= (((uint64_t)i) << idx_lsb);
+				*addr |= (((uint64_t)v) << iid_lsb);
+				return 0;
+			}
+		}
+		return NFP_ERRNO(ENODEV);
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_decode_basic(uint64_t addr, int *dest_island, int cpp_tgt, int mode,
+		      int addr40, int isld1, int isld0)
+{
+	int iid_lsb, idx_lsb;
+
+	switch (cpp_tgt) {
+	case NFP6000_CPPTGT_MU:
+		/* This function doesn't handle MU */
+		return NFP_ERRNO(EINVAL);
+	case NFP6000_CPPTGT_CTXPB:
+		/* This function doesn't handle CTXPB */
+		return NFP_ERRNO(EINVAL);
+	default:
+		break;
+	}
+
+	switch (mode) {
+	case 0:
+		/*
+		 * For VQDR, in this mode for 32-bit addressing it would be
+		 * islands 0, 16, 32 and 48 depending on channel and upper
+		 * address bits. Since those are not all valid islands, most
+		 * decode cases would result in bad island IDs, but we do them
+		 * anyway since this is decoding an address that is already
+		 * assumed to be used as-is to get to sram.
+		 */
+		iid_lsb = (addr40) ? 34 : 26;
+		*dest_island = (int)(addr >> iid_lsb) & 0x3F;
+		return 0;
+	case 1:
+		/*
+		 * For VQDR 32-bit, this would decode as:
+		 *	Channel 0: island#0
+		 *	Channel 1: island#0
+		 *	Channel 2: island#1
+		 *	Channel 3: island#1
+		 *
+		 * That would be valid as long as both islands have VQDR.
+		 * Let's allow this.
+		 */
+
+		idx_lsb = (addr40) ? 39 : 31;
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1;
+		else
+			*dest_island = isld0;
+
+		return 0;
+	case 2:
+		/*
+		 * For VQDR 32-bit:
+		 *	Channel 0: (island#0 | 0)
+		 *	Channel 1: (island#0 | 1)
+		 *	Channel 2: (island#1 | 0)
+		 *	Channel 3: (island#1 | 1)
+		 *
+		 * Make sure we compare against isldN values by clearing the
+		 * LSB. This is what the silicon does.
+		 */
+		isld0 &= ~1;
+		isld1 &= ~1;
+
+		idx_lsb = (addr40) ? 39 : 31;
+		iid_lsb = idx_lsb - 1;
+
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1 | (int)((addr >> iid_lsb) & 1);
+		else
+			*dest_island = isld0 | (int)((addr >> iid_lsb) & 1);
+
+		return 0;
+	case 3:
+		/*
+		 * In this mode the data address starts to affect the island ID
+		 * so rather not allow it. In some really specific case one
+		 * could use this to send the upper half of the VQDR channel to
+		 * another MU, but this is getting very specific. However, as
+		 * above for mode 0, this is the decoder and the caller should
+		 * validate the resulting IID. This blindly does what the
+		 * silicon would do.
+		 */
+
+		isld0 &= ~3;
+		isld1 &= ~3;
+
+		idx_lsb = (addr40) ? 39 : 31;
+		iid_lsb = idx_lsb - 2;
+
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1 | (int)((addr >> iid_lsb) & 3);
+		else
+			*dest_island = isld0 | (int)((addr >> iid_lsb) & 3);
+
+		return 0;
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_cppat_mu_locality_lsb(int mode, int addr40)
+{
+	switch (mode) {
+	case 0:
+	case 1:
+	case 2:
+	case 3:
+		return (addr40) ? 38 : 30;
+	default:
+		break;
+	}
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_encode_mu(uint64_t *addr, int dest_island, int mode, int addr40,
+		   int isld1, int isld0)
+{
+	uint64_t _u64;
+	int iid_lsb, idx_lsb, locality_lsb;
+	int i, v;
+	int isld[2];
+	int da;
+
+	isld[0] = isld0;
+	isld[1] = isld1;
+	locality_lsb = _nfp6000_cppat_mu_locality_lsb(mode, addr40);
+
+	if (((*addr >> locality_lsb) & 3) == _NIC_NFP6000_MU_LOCALITY_DIRECT)
+		da = 1;
+	else
+		da = 0;
+
+	switch (mode) {
+	case 0:
+		iid_lsb = (addr40) ? 32 : 24;
+		_u64 = _nic_mask64((iid_lsb + 5), iid_lsb, 0);
+		*addr &= ~_u64;
+		*addr |= (((uint64_t)dest_island) << iid_lsb) & _u64;
+		return 0;
+	case 1:
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			_u64 = _nic_mask64((iid_lsb + 5), iid_lsb, 0);
+			*addr &= ~_u64;
+			*addr |= (((uint64_t)dest_island) << iid_lsb) & _u64;
+			return 0;
+		}
+
+		idx_lsb = (addr40) ? 37 : 29;
+		if (dest_island == isld0) {
+			*addr &= ~_nic_mask64(idx_lsb, idx_lsb, 0);
+			return 0;
+		}
+
+		if (dest_island == isld1) {
+			*addr |= (UINT64_C(1) << idx_lsb);
+			return 0;
+		}
+
+		return NFP_ERRNO(ENODEV);
+	case 2:
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			_u64 = _nic_mask64((iid_lsb + 5), iid_lsb, 0);
+			*addr &= ~_u64;
+			*addr |= (((uint64_t)dest_island) << iid_lsb) & _u64;
+			return 0;
+		}
+
+		/*
+		 * Make sure we compare against isldN values by clearing the
+		 * LSB. This is what the silicon does.
+		 */
+		isld[0] &= ~1;
+		isld[1] &= ~1;
+
+		idx_lsb = (addr40) ? 37 : 29;
+		iid_lsb = idx_lsb - 1;
+
+		/*
+		 * Try each option, take first one that fits. Not sure if we
+		 * would want to do some smarter searching and prefer 0 or
+		 * non-0 island IDs.
+		 */
+
+		for (i = 0; i < 2; i++) {
+			for (v = 0; v < 2; v++) {
+				if (dest_island != (isld[i] | v))
+					continue;
+				*addr &= ~_nic_mask64(idx_lsb, iid_lsb, 0);
+				*addr |= (((uint64_t)i) << idx_lsb);
+				*addr |= (((uint64_t)v) << iid_lsb);
+				return 0;
+			}
+		}
+		return NFP_ERRNO(ENODEV);
+	case 3:
+		/*
+		 * Only the EMU will use 40 bit addressing. Silently set the
+		 * direct locality bit for everyone else. The SDK toolchain
+		 * uses dest_island <= 0 to test for atypical address encodings
+		 * to support access to local-island CTM with a 32-bit address
+		 * (high-locality is effectively ignored and just used for
+		 * routing to island #0).
+		 */
+		if (dest_island > 0 &&
+		    (dest_island < 24 || dest_island > 26)) {
+			*addr |= ((uint64_t)_NIC_NFP6000_MU_LOCALITY_DIRECT)
+				 << locality_lsb;
+			da = 1;
+		}
+
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			_u64 = _nic_mask64((iid_lsb + 5), iid_lsb, 0);
+			*addr &= ~_u64;
+			*addr |= (((uint64_t)dest_island) << iid_lsb) & _u64;
+			return 0;
+		}
+
+		isld[0] &= ~3;
+		isld[1] &= ~3;
+
+		idx_lsb = (addr40) ? 37 : 29;
+		iid_lsb = idx_lsb - 2;
+
+		for (i = 0; i < 2; i++) {
+			for (v = 0; v < 4; v++) {
+				if (dest_island != (isld[i] | v))
+					continue;
+				*addr &= ~_nic_mask64(idx_lsb, iid_lsb, 0);
+				*addr |= (((uint64_t)i) << idx_lsb);
+				*addr |= (((uint64_t)v) << iid_lsb);
+				return 0;
+			}
+		}
+
+		return NFP_ERRNO(ENODEV);
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_decode_mu(uint64_t addr, int *dest_island, int mode, int addr40,
+		   int isld1, int isld0)
+{
+	int iid_lsb, idx_lsb, locality_lsb;
+	int da;
+
+	locality_lsb = _nfp6000_cppat_mu_locality_lsb(mode, addr40);
+
+	if (((addr >> locality_lsb) & 3) == _NIC_NFP6000_MU_LOCALITY_DIRECT)
+		da = 1;
+	else
+		da = 0;
+
+	switch (mode) {
+	case 0:
+		iid_lsb = (addr40) ? 32 : 24;
+		*dest_island = (int)(addr >> iid_lsb) & 0x3F;
+		return 0;
+	case 1:
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			*dest_island = (int)(addr >> iid_lsb) & 0x3F;
+			return 0;
+		}
+
+		idx_lsb = (addr40) ? 37 : 29;
+
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1;
+		else
+			*dest_island = isld0;
+
+		return 0;
+	case 2:
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			*dest_island = (int)(addr >> iid_lsb) & 0x3F;
+			return 0;
+		}
+		/*
+		 * Make sure we compare against isldN values by clearing the
+		 * LSB. This is what the silicon does.
+		 */
+		isld0 &= ~1;
+		isld1 &= ~1;
+
+		idx_lsb = (addr40) ? 37 : 29;
+		iid_lsb = idx_lsb - 1;
+
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1 | (int)((addr >> iid_lsb) & 1);
+		else
+			*dest_island = isld0 | (int)((addr >> iid_lsb) & 1);
+
+		return 0;
+	case 3:
+		if (da) {
+			iid_lsb = (addr40) ? 32 : 24;
+			*dest_island = (int)(addr >> iid_lsb) & 0x3F;
+			return 0;
+		}
+
+		isld0 &= ~3;
+		isld1 &= ~3;
+
+		idx_lsb = (addr40) ? 37 : 29;
+		iid_lsb = idx_lsb - 2;
+
+		if (addr & _nic_mask64(idx_lsb, idx_lsb, 0))
+			*dest_island = isld1 | (int)((addr >> iid_lsb) & 3);
+		else
+			*dest_island = isld0 | (int)((addr >> iid_lsb) & 3);
+
+		return 0;
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_cppat_addr_encode(uint64_t *addr, int dest_island, int cpp_tgt,
+			   int mode, int addr40, int isld1, int isld0)
+{
+	switch (cpp_tgt) {
+	case NFP6000_CPPTGT_NBI:
+	case NFP6000_CPPTGT_VQDR:
+	case NFP6000_CPPTGT_ILA:
+	case NFP6000_CPPTGT_PCIE:
+	case NFP6000_CPPTGT_ARM:
+	case NFP6000_CPPTGT_CRYPTO:
+	case NFP6000_CPPTGT_CLS:
+		return _nfp6000_encode_basic(addr, dest_island, cpp_tgt, mode,
+					     addr40, isld1, isld0);
+
+	case NFP6000_CPPTGT_MU:
+		return _nfp6000_encode_mu(addr, dest_island, mode, addr40,
+					  isld1, isld0);
+
+	case NFP6000_CPPTGT_CTXPB:
+		if (mode != 1 || addr40 != 0)
+			return NFP_ERRNO(EINVAL);
+
+		*addr &= ~_nic_mask64(29, 24, 0);
+		*addr |= (((uint64_t)dest_island) << 24) &
+			  _nic_mask64(29, 24, 0);
+		return 0;
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+static inline int
+_nfp6000_cppat_addr_decode(uint64_t addr, int *dest_island, int cpp_tgt,
+			   int mode, int addr40, int isld1, int isld0)
+{
+	switch (cpp_tgt) {
+	case NFP6000_CPPTGT_NBI:
+	case NFP6000_CPPTGT_VQDR:
+	case NFP6000_CPPTGT_ILA:
+	case NFP6000_CPPTGT_PCIE:
+	case NFP6000_CPPTGT_ARM:
+	case NFP6000_CPPTGT_CRYPTO:
+	case NFP6000_CPPTGT_CLS:
+		return _nfp6000_decode_basic(addr, dest_island, cpp_tgt, mode,
+					     addr40, isld1, isld0);
+
+	case NFP6000_CPPTGT_MU:
+		return _nfp6000_decode_mu(addr, dest_island, mode, addr40,
+					  isld1, isld0);
+
+	case NFP6000_CPPTGT_CTXPB:
+		if (mode != 1 || addr40 != 0)
+			return -EINVAL;
+		*dest_island = (int)(addr >> 24) & 0x3F;
+		return 0;
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+static inline int
+_nfp6000_cppat_addr_iid_clear(uint64_t *addr, int cpp_tgt, int mode, int addr40)
+{
+	int iid_lsb, locality_lsb, da;
+
+	switch (cpp_tgt) {
+	case NFP6000_CPPTGT_NBI:
+	case NFP6000_CPPTGT_VQDR:
+	case NFP6000_CPPTGT_ILA:
+	case NFP6000_CPPTGT_PCIE:
+	case NFP6000_CPPTGT_ARM:
+	case NFP6000_CPPTGT_CRYPTO:
+	case NFP6000_CPPTGT_CLS:
+		switch (mode) {
+		case 0:
+			iid_lsb = (addr40) ? 34 : 26;
+			*addr &= ~(UINT64_C(0x3F) << iid_lsb);
+			return 0;
+		case 1:
+			iid_lsb = (addr40) ? 39 : 31;
+			*addr &= ~_nic_mask64(iid_lsb, iid_lsb, 0);
+			return 0;
+		case 2:
+			iid_lsb = (addr40) ? 38 : 30;
+			*addr &= ~_nic_mask64(iid_lsb + 1, iid_lsb, 0);
+			return 0;
+		case 3:
+			iid_lsb = (addr40) ? 37 : 29;
+			*addr &= ~_nic_mask64(iid_lsb + 2, iid_lsb, 0);
+			return 0;
+		default:
+			break;
+		}
+	case NFP6000_CPPTGT_MU:
+		locality_lsb = _nfp6000_cppat_mu_locality_lsb(mode, addr40);
+		da = (((*addr >> locality_lsb) & 3) ==
+		      _NIC_NFP6000_MU_LOCALITY_DIRECT);
+		switch (mode) {
+		case 0:
+			iid_lsb = (addr40) ? 32 : 24;
+			*addr &= ~(UINT64_C(0x3F) << iid_lsb);
+			return 0;
+		case 1:
+			if (da) {
+				iid_lsb = (addr40) ? 32 : 24;
+				*addr &= ~(UINT64_C(0x3F) << iid_lsb);
+				return 0;
+			}
+			iid_lsb = (addr40) ? 37 : 29;
+			*addr &= ~_nic_mask64(iid_lsb, iid_lsb, 0);
+			return 0;
+		case 2:
+			if (da) {
+				iid_lsb = (addr40) ? 32 : 24;
+				*addr &= ~(UINT64_C(0x3F) << iid_lsb);
+				return 0;
+			}
+
+			iid_lsb = (addr40) ? 36 : 28;
+			*addr &= ~_nic_mask64(iid_lsb + 1, iid_lsb, 0);
+			return 0;
+		case 3:
+			if (da) {
+				iid_lsb = (addr40) ? 32 : 24;
+				*addr &= ~(UINT64_C(0x3F) << iid_lsb);
+				return 0;
+			}
+
+			iid_lsb = (addr40) ? 35 : 27;
+			*addr &= ~_nic_mask64(iid_lsb + 2, iid_lsb, 0);
+			return 0;
+		default:
+			break;
+		}
+	case NFP6000_CPPTGT_CTXPB:
+		if (mode != 1 || addr40 != 0)
+			return 0;
+		*addr &= ~(UINT64_C(0x3F) << 24);
+		return 0;
+	default:
+		break;
+	}
+
+	return NFP_ERRNO(EINVAL);
+}
+
+#endif /* __NFP_CPPAT_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp-common/nfp_platform.h b/drivers/net/nfp/nfpcore/nfp-common/nfp_platform.h
new file mode 100644
index 0000000..b8541c5
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp-common/nfp_platform.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_PLATFORM_H__
+#define __NFP_PLATFORM_H__
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <inttypes.h>
+#include <sys/cdefs.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <errno.h>
+
+#ifndef BIT_ULL
+#define BIT(x) (1 << (x))
+#define BIT_ULL(x) (1ULL << (x))
+#endif
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+#define NFP_ERRNO(err) (errno = (err), -1)
+#define NFP_ERRNO_RET(err, ret) (errno = (err), (ret))
+#define NFP_NOERR(errv) (errno)
+#define NFP_ERRPTR(err) (errno = (err), NULL)
+#define NFP_PTRERR(errv) (errno)
+
+#endif /* __NFP_PLATFORM_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp-common/nfp_resid.h b/drivers/net/nfp/nfpcore/nfp-common/nfp_resid.h
new file mode 100644
index 0000000..0e03948
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp-common/nfp_resid.h
@@ -0,0 +1,592 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RESID_H__
+#define __NFP_RESID_H__
+
+#if (!defined(_NFP_RESID_NO_C_FUNC) && \
+	(defined(__NFP_TOOL_NFCC) || defined(__NFP_TOOL_NFAS)))
+#define _NFP_RESID_NO_C_FUNC
+#endif
+
+#ifndef _NFP_RESID_NO_C_FUNC
+#include "nfp_platform.h"
+#endif
+
+/*
+ * NFP Chip Architectures
+ *
+ * These are semi-arbitrary values to indicate an NFP architecture.
+ * They serve as a software view of a group of chip families, not necessarily a
+ * direct mapping to actual hardware design.
+ */
+#define NFP_CHIP_ARCH_YD	1
+#define NFP_CHIP_ARCH_TH	2
+
+/*
+ * NFP Chip Families.
+ *
+ * These are not enums, because they need to be microcode compatible.
+ * They are also not maskable.
+ *
+ * Note: The NFP-4xxx family is handled as NFP-6xxx in most software
+ * components.
+ *
+ */
+#define NFP_CHIP_FAMILY_NFP6000 0x6000	/* ARCH_TH */
+
+/* NFP Microengine/Flow Processing Core Versions */
+#define NFP_CHIP_ME_VERSION_2_7 0x0207
+#define NFP_CHIP_ME_VERSION_2_8 0x0208
+#define NFP_CHIP_ME_VERSION_2_9 0x0209
+
+/* NFP Chip Base Revisions. Minor stepping can just be added to these */
+#define NFP_CHIP_REVISION_A0 0x00
+#define NFP_CHIP_REVISION_B0 0x10
+#define NFP_CHIP_REVISION_C0 0x20
+#define NFP_CHIP_REVISION_PF 0xff /* Maximum possible revision */
+
+/* CPP Targets for each chip architecture */
+#define NFP6000_CPPTGT_NBI 1
+#define NFP6000_CPPTGT_VQDR 2
+#define NFP6000_CPPTGT_ILA 6
+#define NFP6000_CPPTGT_MU 7
+#define NFP6000_CPPTGT_PCIE 9
+#define NFP6000_CPPTGT_ARM 10
+#define NFP6000_CPPTGT_CRYPTO 12
+#define NFP6000_CPPTGT_CTXPB 14
+#define NFP6000_CPPTGT_CLS 15
+
+/*
+ * Wildcard indicating a CPP read or write action
+ *
+ * The action used will be either read or write depending on whether a read or
+ * write instruction/call is performed on the NFP_CPP_ID. It is recommended that
+ * the RW action is used even if all actions to be performed on a NFP_CPP_ID are
+ * known to be only reads or writes. Doing so will in many cases save NFP CPP
+ * internal software resources.
+ */
+#define NFP_CPP_ACTION_RW 32
+
+#define NFP_CPP_TARGET_ID_MASK 0x1f
+
+/*
+ *  NFP_CPP_ID - pack target, token, and action into a CPP ID.
+ *
+ * Create a 32-bit CPP identifier representing the access to be made.
+ * These identifiers are used as parameters to other NFP CPP functions. Some
+ * CPP devices may allow wildcard identifiers to be specified.
+ *
+ * @param[in]	target	NFP CPP target id
+ * @param[in]	action	NFP CPP action id
+ * @param[in]	token	NFP CPP token id
+ * @return		NFP CPP ID
+ */
+#define NFP_CPP_ID(target, action, token)                   \
+	((((target) & 0x7f) << 24) | (((token) & 0xff) << 16) | \
+	 (((action) & 0xff) << 8))
+
+#define NFP_CPP_ISLAND_ID(target, action, token, island)    \
+	((((target) & 0x7f) << 24) | (((token) & 0xff) << 16) | \
+	 (((action) & 0xff) << 8) | (((island) & 0xff) << 0))
+
+#ifndef _NFP_RESID_NO_C_FUNC
+
+/**
+ * Return the NFP CPP target of a NFP CPP ID
+ * @param[in]	id	NFP CPP ID
+ * @return	NFP CPP target
+ */
+static inline uint8_t
+NFP_CPP_ID_TARGET_of(uint32_t id)
+{
+	return (id >> 24) & NFP_CPP_TARGET_ID_MASK;
+}
+
+/*
+ * Return the NFP CPP token of a NFP CPP ID
+ * @param[in]	id	NFP CPP ID
+ * @return	NFP CPP token
+ */
+static inline uint8_t
+NFP_CPP_ID_TOKEN_of(uint32_t id)
+{
+	return (id >> 16) & 0xff;
+}
+
+/*
+ * Return the NFP CPP action of a NFP CPP ID
+ * @param[in]	id	NFP CPP ID
+ * @return	NFP CPP action
+ */
+static inline uint8_t
+NFP_CPP_ID_ACTION_of(uint32_t id)
+{
+	return (id >> 8) & 0xff;
+}
+
+/*
+ * Return the NFP CPP island of a NFP CPP ID
+ * @param[in]   id      NFP CPP ID
+ * @return      NFP CPP island
+ */
+static inline uint8_t
+NFP_CPP_ID_ISLAND_of(uint32_t id)
+{
+	return (id) & 0xff;
+}
+
+#endif /* _NFP_RESID_NO_C_FUNC */
+
+/*
+ *  Check if @p chip_family is an ARCH_TH chip.
+ * @param chip_family One of NFP_CHIP_FAMILY_*
+ */
+#define NFP_FAMILY_IS_ARCH_TH(chip_family) \
+	((int)(chip_family) == (int)NFP_CHIP_FAMILY_NFP6000)
+
+/*
+ *  Get the NFP_CHIP_ARCH_* of @p chip_family.
+ * @param chip_family One of NFP_CHIP_FAMILY_*
+ */
+#define NFP_FAMILY_ARCH(x) \
+	(__extension__ ({ \
+		typeof(x) _x = (x); \
+		(NFP_FAMILY_IS_ARCH_TH(_x) ? NFP_CHIP_ARCH_TH : \
+		NFP_FAMILY_IS_ARCH_YD(_x) ? NFP_CHIP_ARCH_YD : -1) \
+	}))
+
+/*
+ *  Check if @p chip_family is an NFP-6xxx chip.
+ * @param chip_family One of NFP_CHIP_FAMILY_*
+ */
+#define NFP_FAMILY_IS_NFP6000(chip_family) \
+	((int)(chip_family) == (int)NFP_CHIP_FAMILY_NFP6000)
+
+/*
+ *  Make microengine ID for NFP-6xxx.
+ * @param island_id   Island ID.
+ * @param menum       ME number, 0 based, within island.
+ *
+ * NOTE: menum should really be unsigned - the MSC compiler throws an error
+ * (not a warning) if a clause is always true, i.e. menum >= 0 when menum is
+ * of an unsigned type, hence the cast of menum to an int in that clause.
+ */
+#define NFP6000_MEID(a, b)                       \
+	(__extension__ ({ \
+		typeof(a) _a = (a); \
+		typeof(b) _b = (b); \
+		(((((int)(_a) & 0x3F) == (int)(_a)) &&   \
+		(((int)(_b) >= 0) && ((int)(_b) < 12))) ?    \
+		(int)(((_a) << 4) | ((_b) + 4)) : -1) \
+	}))
+
+/*
+ *  Do a general sanity check on the ME ID.
+ * The check is on the highest possible island ID for the chip family and the
+ * microengine number must be a master ID.
+ * @param meid      ME ID as created by NFP6000_MEID
+ */
+#define NFP6000_MEID_IS_VALID(meid) \
+	(__extension__ ({ \
+		typeof(meid) _a = (meid); \
+		((((_a) >> 4) < 64) && (((_a) >> 4) >= 0) && \
+		 (((_a) & 0xF) >= 4)) \
+	}))
+
+/*
+ *  Extract island ID from ME ID.
+ * @param meid   ME ID as created by NFP6000_MEID
+ */
+#define NFP6000_MEID_ISLAND_of(meid) (((meid) >> 4) & 0x3F)
+
+/*
+ * Extract microengine number (0 based) from ME ID.
+ * @param meid   ME ID as created by NFP6000_MEID
+ */
+#define NFP6000_MEID_MENUM_of(meid) (((meid) & 0xF) - 4)
+
+/*
+ * Extract microengine group number (0 based) from ME ID.
+ * The group is two code-sharing microengines, so group 0 refers to MEs 0,1,
+ * group 1 refers to MEs 2,3 etc.
+ * @param meid   ME ID as created by NFP6000_MEID
+ */
+#define NFP6000_MEID_MEGRP_of(meid) (NFP6000_MEID_MENUM_of(meid) >> 1)
+
+#ifndef _NFP_RESID_NO_C_FUNC
+
+/*
+ *  Convert a string to an ME ID.
+ *
+ * @param s       A string of format iX.meY
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the ME ID part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return     ME ID on success, -1 on error.
+ */
+int nfp6000_idstr2meid(const char *s, const char **endptr);
+
+/*
+ *  Extract island ID from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp6000_idstr2island("i32.me5", &c);
+ * // val == 32, c == "me5"
+ * val = nfp6000_idstr2island("i32", &c);
+ * // val == 32, c == ""
+ *
+ * @param s       A string of format "iX.anything" or "iX"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the island part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the island ID, -1 on error.
+ */
+int nfp6000_idstr2island(const char *s, const char **endptr);
+
+/*
+ *  Extract microengine number from string.
+ *
+ * Example:
+ * char *c;
+ * int menum = nfp6000_idstr2menum("me5.anything", &c);
+ * // menum == 5, c == "anything"
+ * menum = nfp6000_idstr2menum("me5", &c);
+ * // menum == 5, c == ""
+ *
+ * @param s       A string of format "meX.anything" or "meX"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the ME number part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the ME number, -1 on error.
+ */
+int nfp6000_idstr2menum(const char *s, const char **endptr);
+
+/*
+ * Extract context number from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp6000_idstr2ctxnum("ctx5.anything", &c);
+ * // val == 5, c == "anything"
+ * val = nfp6000_idstr2ctxnum("ctx5", &c);
+ * // val == 5, c == ""
+ *
+ * @param s       A string of format "ctxN.anything" or "ctxN"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the context number part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the context number, -1 on error.
+ */
+int nfp6000_idstr2ctxnum(const char *s, const char **endptr);
+
+/*
+ * Extract microengine group number from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp6000_idstr2megrp("tg2.anything", &c);
+ * // val == 2, c == "anything"
+ * val = nfp6000_idstr2megrp("tg5", &c);
+ * // val == 5, c == ""
+ *
+ * @param s       A string of format "tgX.anything" or "tgX"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the ME group part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the ME group number, -1 on error.
+ */
+int nfp6000_idstr2megrp(const char *s, const char **endptr);
+
+/*
+ * Create ME ID string of format "iX[.meY]".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param meid   Microengine ID.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_meid2str(char *s, int meid);
+
+/*
+ * Create ME ID string of format "name[.meY]" or "iX[.meY]".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param meid   Microengine ID.
+ * @return       Pointer to "s" on success, NULL on error.
+ *
+ * Similar to nfp6000_meid2str() except use an alias instead of "iX"
+ * if one exists for the island.
+ */
+const char *nfp6000_meid2altstr(char *s, int meid);
+
+/*
+ * Create string of format "iX".
+ *
+ * @param s         Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                  The resulting string is output here.
+ * @param island_id Island ID.
+ * @return          Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_island2str(char *s, int island_id);
+
+/*
+ * Create string of format "name", an island alias.
+ *
+ * @param s         Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                  The resulting string is output here.
+ * @param island_id Island ID.
+ * @return          Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_island2altstr(char *s, int island_id);
+
+/*
+ * Create string of format "meY".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param menum  Microengine number within island.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_menum2str(char *s, int menum);
+
+/*
+ * Create string of format "ctxY".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param ctxnum Context number within microengine.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_ctxnum2str(char *s, int ctxnum);
+
+/*
+ * Create string of format "tgY".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param megrp  Microengine group number within cluster.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp6000_megrp2str(char *s, int megrp);
+
+/*
+ * Convert a string to an ME ID.
+ *
+ * @param chip_family Chip family ID
+ * @param s           A string of format iX.meY (or clX.meY)
+ * @param endptr      If non-NULL, *endptr will point to the trailing
+ *                    string after the ME ID part of the string, which
+ *                    is either an empty string or the first character
+ *                    after the separating period.
+ * @return            ME ID on success, -1 on error.
+ */
+int nfp_idstr2meid(int chip_family, const char *s, const char **endptr);
+
+/*
+ * Extract island ID from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp_idstr2island(chip, "i32.me5", &c);
+ * // val == 32, c == "me5"
+ * val = nfp_idstr2island(chip, "i32", &c);
+ * // val == 32, c == ""
+ *
+ * @param chip_family Chip family ID
+ * @param s           A string of format "iX.anything" or "iX"
+ * @param endptr      If non-NULL, *endptr will point to the trailing
+ *                    string after the island part of the string, which
+ *                    is either an empty string or the first character
+ *                    after the separating period.
+ * @return            The island ID on success, -1 on error.
+ */
+int nfp_idstr2island(int chip_family, const char *s, const char **endptr);
+
+/*
+ * Extract microengine number from string.
+ *
+ * Example:
+ * char *c;
+ * int menum = nfp_idstr2menum("me5.anything", &c);
+ * // menum == 5, c == "anything"
+ * menum = nfp_idstr2menum("me5", &c);
+ * // menum == 5, c == ""
+ *
+ * @param chip_family Chip family ID
+ * @param s           A string of format "meX.anything" or "meX"
+ * @param endptr      If non-NULL, *endptr will point to the trailing
+ *                    string after the ME number part of the string, which
+ *                    is either an empty string or the first character
+ *                    after the separating period.
+ * @return            The ME number on success, -1 on error.
+ */
+int nfp_idstr2menum(int chip_family, const char *s, const char **endptr);
+
+/*
+ * Extract context number from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp_idstr2ctxnum("ctx5.anything", &c);
+ * // val == 5, c == "anything"
+ * val = nfp_idstr2ctxnum("ctx5", &c);
+ * // val == 5, c == ""
+ *
+ * @param s       A string of format "ctxN.anything" or "ctxN"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the context number part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the context number, -1 on error.
+ */
+int nfp_idstr2ctxnum(int chip_family, const char *s, const char **endptr);
+
+/*
+ * Extract microengine group number from string.
+ *
+ * Example:
+ * char *c;
+ * int val = nfp_idstr2megrp("tg2.anything", &c);
+ * // val == 2, c == "anything"
+ * val = nfp_idstr2megrp("tg5", &c);
+ * // val == 5, c == ""
+ *
+ * @param s       A string of format "tgX.anything" or "tgX"
+ * @param endptr  If non-NULL, *endptr will point to the trailing string
+ *                after the ME group part of the string, which is either
+ *                an empty string or the first character after the separating
+ *                period.
+ * @return        If successful, the ME group number, -1 on error.
+ */
+int nfp_idstr2megrp(int chip_family, const char *s, const char **endptr);
+
+/*
+ * Create ME ID string of format "iX[.meY]".
+ *
+ * @param chip_family Chip family ID
+ * @param s           Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                    The resulting string is output here.
+ * @param meid        Microengine ID.
+ * @return            Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_meid2str(int chip_family, char *s, int meid);
+
+/*
+ * Create ME ID string of format "name[.meY]" or "iX[.meY]".
+ *
+ * @param chip_family Chip family ID
+ * @param s           Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                    The resulting string is output here.
+ * @param meid        Microengine ID.
+ * @return            Pointer to "s" on success, NULL on error.
+ *
+ * Similar to nfp_meid2str() except use an alias instead of "iX"
+ * if one exists for the island.
+ */
+const char *nfp_meid2altstr(int chip_family, char *s, int meid);
+
+/*
+ * Create string of format "iX".
+ *
+ * @param chip_family Chip family ID
+ * @param s           Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                    The resulting string is output here.
+ * @param island_id   Island ID.
+ * @return            Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_island2str(int chip_family, char *s, int island_id);
+
+/*
+ * Create string of format "name", an island alias.
+ *
+ * @param chip_family Chip family ID
+ * @param s           Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                    The resulting string is output here.
+ * @param island_id   Island ID.
+ * @return            Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_island2altstr(int chip_family, char *s, int island_id);
+
+/*
+ * Create string of format "meY".
+ *
+ * @param chip_family Chip family ID
+ * @param s           Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *                    The resulting string is output here.
+ * @param menum       Microengine number within island.
+ * @return            Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_menum2str(int chip_family, char *s, int menum);
+
+/*
+ * Create string of format "ctxY".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param ctxnum Context number within microengine.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_ctxnum2str(int chip_family, char *s, int ctxnum);
+
+/*
+ * Create string of format "tgY".
+ *
+ * @param s      Pointer to char buffer of size NFP_MEID_STR_SZ.
+ *               The resulting string is output here.
+ * @param megrp  Microengine group number within cluster.
+ * @return       Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_megrp2str(int chip_family, char *s, int megrp);
+
+/*
+ * Convert a two character string to revision number.
+ *
+ * Revision integer is 0x00 for A0, 0x11 for B1 etc.
+ *
+ * @param s     Two character string.
+ * @return      Revision number, -1 on error
+ */
+int nfp_idstr2rev(const char *s);
+
+/*
+ * Create string from revision number.
+ *
+ * String will be upper case.
+ *
+ * @param s     Pointer to char buffer with size of at least 3
+ *              for 2 characters and string terminator.
+ * @param rev   Revision number.
+ * @return      Pointer to "s" on success, NULL on error.
+ */
+const char *nfp_rev2str(char *s, int rev);
+
+/*
+ * Get the NFP CPP address from a string
+ *
+ * String is in the format [island@]target[:[action:[token:]]address]
+ *
+ * @param chip_family Chip family ID
+ * @param tid           Pointer to string to parse
+ * @param cpp_idp       Pointer to CPP ID
+ * @param cpp_addrp     Pointer to CPP address
+ * @return              0 on success, or -1 and errno
+ */
+int nfp_str2cpp(int chip_family,
+		const char *tid,
+		uint32_t *cpp_idp,
+		uint64_t *cpp_addrp);
+
+
+#endif /* _NFP_RESID_NO_C_FUNC */
+
+#endif /* __NFP_RESID_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp6000/nfp6000.h b/drivers/net/nfp/nfpcore/nfp6000/nfp6000.h
new file mode 100644
index 0000000..47e1dda
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp6000/nfp6000.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFP6000_H__
+#define __NFP_NFP6000_H__
+
+/* CPP Target IDs */
+#define NFP_CPP_TARGET_INVALID          0
+#define NFP_CPP_TARGET_NBI              1
+#define NFP_CPP_TARGET_QDR              2
+#define NFP_CPP_TARGET_ILA              6
+#define NFP_CPP_TARGET_MU               7
+#define NFP_CPP_TARGET_PCIE             9
+#define NFP_CPP_TARGET_ARM              10
+#define NFP_CPP_TARGET_CRYPTO           12
+#define NFP_CPP_TARGET_ISLAND_XPB       14	/* Shared with CAP */
+#define NFP_CPP_TARGET_ISLAND_CAP       14	/* Shared with XPB */
+#define NFP_CPP_TARGET_CT_XPB           14
+#define NFP_CPP_TARGET_LOCAL_SCRATCH    15
+#define NFP_CPP_TARGET_CLS              NFP_CPP_TARGET_LOCAL_SCRATCH
+
+#define NFP_ISL_EMEM0                   24
+
+#define NFP_MU_ADDR_ACCESS_TYPE_MASK    3ULL
+#define NFP_MU_ADDR_ACCESS_TYPE_DIRECT  2ULL
+
+static inline int
+nfp_cppat_mu_locality_lsb(int mode, int addr40)
+{
+	switch (mode) {
+	case 0 ... 3:
+		return addr40 ? 38 : 30;
+	default:
+		return -EINVAL;
+	}
+}
+
+#endif /* __NFP_NFP6000_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp6000/nfp_xpb.h b/drivers/net/nfp/nfpcore/nfp6000/nfp_xpb.h
new file mode 100644
index 0000000..7ada1bb
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp6000/nfp_xpb.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_XPB_H__
+#define __NFP_XPB_H__
+
+/*
+ * For use with NFP6000 Databook "XPB Addressing" section
+ */
+#define NFP_XPB_OVERLAY(island)  (((island) & 0x3f) << 24)
+
+#define NFP_XPB_ISLAND(island)   (NFP_XPB_OVERLAY(island) + 0x60000)
+
+#define NFP_XPB_ISLAND_of(offset) (((offset) >> 24) & 0x3F)
+
+/*
+ * For use with NFP6000 Databook "XPB Island and Device IDs" chapter
+ */
+#define NFP_XPB_DEVICE(island, slave, device) \
+				(NFP_XPB_OVERLAY(island) | \
+				 (((slave) & 3) << 22) | \
+				 (((device) & 0x3f) << 16))
+
+#endif /* __NFP_XPB_H__ */
diff --git a/drivers/net/nfp/nfpcore/nfp_cpp.h b/drivers/net/nfp/nfpcore/nfp_cpp.h
new file mode 100644
index 0000000..7e86214
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_cpp.h
@@ -0,0 +1,776 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_CPP_H__
+#define __NFP_CPP_H__
+
+#include "nfp-common/nfp_platform.h"
+#include "nfp-common/nfp_resid.h"
+
+struct nfp_cpp_mutex;
+
+/*
+ * NFP CPP handle
+ */
+struct nfp_cpp {
+	uint32_t model;
+	uint32_t interface;
+	uint8_t *serial;
+	int serial_len;
+	void *priv;
+
+	/* Mutex cache */
+	struct nfp_cpp_mutex *mutex_cache;
+	const struct nfp_cpp_operations *op;
+
+	/*
+	 * NFP-6xxx originating island IMB CPP Address Translation. CPP Target
+	 * ID is index into array. Values are obtained at runtime from local
+	 * island XPB CSRs.
+	 */
+	uint32_t imb_cat_table[16];
+};
+
+/*
+ * NFP CPP device area handle
+ */
+struct nfp_cpp_area {
+	struct nfp_cpp *cpp;
+	char *name;
+	unsigned long long offset;
+	unsigned long size;
+	/* Here follows the 'priv' part of nfp_cpp_area. */
+};
+
+/*
+ * NFP CPP operations structure
+ */
+struct nfp_cpp_operations {
+	/* Size of priv area in struct nfp_cpp_area */
+	size_t area_priv_size;
+
+	/* Instance an NFP CPP */
+	int (*init)(struct nfp_cpp *cpp, const char *devname);
+
+	/*
+	 * Free the bus.
+	 * Called only once, during nfp_cpp_unregister()
+	 */
+	void (*free)(struct nfp_cpp *cpp);
+
+	/*
+	 * Initialize a new NFP CPP area
+	 * NOTE: This is _not_ serialized
+	 */
+	int (*area_init)(struct nfp_cpp_area *area,
+			 uint32_t dest,
+			 unsigned long long address,
+			 unsigned long size);
+	/*
+	 * Clean up a NFP CPP area before it is freed
+	 * NOTE: This is _not_ serialized
+	 */
+	void (*area_cleanup)(struct nfp_cpp_area *area);
+
+	/*
+	 * Acquire resources for a NFP CPP area
+	 * Serialized
+	 */
+	int (*area_acquire)(struct nfp_cpp_area *area);
+	/*
+	 * Release resources for a NFP CPP area
+	 * Serialized
+	 */
+	void (*area_release)(struct nfp_cpp_area *area);
+	/*
+	 * Return a void IO pointer to a NFP CPP area
+	 * NOTE: This is _not_ serialized
+	 */
+
+	void *(*area_iomem)(struct nfp_cpp_area *area);
+
+	void *(*area_mapped)(struct nfp_cpp_area *area);
+	/*
+	 * Perform a read from a NFP CPP area
+	 * Serialized
+	 */
+	int (*area_read)(struct nfp_cpp_area *area,
+			 void *kernel_vaddr,
+			 unsigned long offset,
+			 unsigned int length);
+	/*
+	 * Perform a write to a NFP CPP area
+	 * Serialized
+	 */
+	int (*area_write)(struct nfp_cpp_area *area,
+			  const void *kernel_vaddr,
+			  unsigned long offset,
+			  unsigned int length);
+};
+
+/*
+ * This should be the only external function the transport
+ * module supplies
+ */
+const struct nfp_cpp_operations *nfp_cpp_transport_operations(void);
+
+/*
+ * Set the model id
+ *
+ * @param   cpp     NFP CPP operations structure
+ * @param   model   Model ID
+ */
+void nfp_cpp_model_set(struct nfp_cpp *cpp, uint32_t model);
+
+/*
+ * Set the private instance owned data of a nfp_cpp struct
+ *
+ * @param   cpp     NFP CPP operations structure
+ * @param   interface Interface ID
+ */
+void nfp_cpp_interface_set(struct nfp_cpp *cpp, uint32_t interface);
+
+/*
+ * Set the private instance owned data of a nfp_cpp struct
+ *
+ * @param   cpp     NFP CPP operations structure
+ * @param   serial  NFP serial byte array
+ * @param   len     Length of the serial byte array
+ */
+int nfp_cpp_serial_set(struct nfp_cpp *cpp, const uint8_t *serial,
+		       size_t serial_len);
+
+/*
+ * Set the private data of the nfp_cpp instance
+ *
+ * @param   cpp NFP CPP operations structure
+ * @return      Opaque device pointer
+ */
+void nfp_cpp_priv_set(struct nfp_cpp *cpp, void *priv);
+
+/*
+ * Return the private data of the nfp_cpp instance
+ *
+ * @param   cpp NFP CPP operations structure
+ * @return      Opaque device pointer
+ */
+void *nfp_cpp_priv(struct nfp_cpp *cpp);
+
+/*
+ * Get the privately allocated portion of a NFP CPP area handle
+ *
+ * @param   cpp_area    NFP CPP area handle
+ * @return          Pointer to the private area, or NULL on failure
+ */
+void *nfp_cpp_area_priv(struct nfp_cpp_area *cpp_area);
+
+uint32_t __nfp_cpp_model_autodetect(struct nfp_cpp *cpp);
+
+/*
+ * NFP CPP core interface for CPP clients.
+ */
+
+/*
+ * Open a NFP CPP handle to a CPP device
+ *
+ * @param[in]	devname	Name of the CPP device to open
+ *
+ * @return NFP CPP handle, or NULL on failure (and set errno accordingly).
+ */
+struct nfp_cpp *nfp_cpp_from_device_name(const char *devname);
+
+/*
+ * Free a NFP CPP handle
+ *
+ * @param[in]	cpp	NFP CPP handle
+ */
+void nfp_cpp_free(struct nfp_cpp *cpp);
+
+#define NFP_CPP_MODEL_INVALID   0xffffffff
+
+/*
+ * NFP_CPP_MODEL_CHIP_of - retrieve the chip ID from the model ID
+ *
+ * The chip ID is a 16-bit BCD+A-F encoding for the chip type.
+ *
+ * @param[in]   model   NFP CPP model id
+ * @return      NFP CPP chip id
+ */
+#define NFP_CPP_MODEL_CHIP_of(model)        (((model) >> 16) & 0xffff)
+
+/*
+ * NFP_CPP_MODEL_IS_6000 - Check for the NFP6000 family of devices
+ *
+ * NOTE: The NFP4000 series is considered a NFP6000 series variant.
+ *
+ * @param[in]	model	NFP CPP model id
+ * @return		true if model is in the NFP6000 family, false otherwise.
+ */
+#define NFP_CPP_MODEL_IS_6000(model)		     \
+		((NFP_CPP_MODEL_CHIP_of(model) >= 0x4000) && \
+		(NFP_CPP_MODEL_CHIP_of(model) < 0x7000))
+
+/*
+ * nfp_cpp_model - Retrieve the Model ID of the NFP
+ *
+ * @param[in]	cpp	NFP CPP handle
+ * @return		NFP CPP Model ID
+ */
+uint32_t nfp_cpp_model(struct nfp_cpp *cpp);
+
+/*
+ * NFP Interface types - logical interface for this CPP connection 4 bits are
+ * reserved for interface type.
+ */
+#define NFP_CPP_INTERFACE_TYPE_INVALID		0x0
+#define NFP_CPP_INTERFACE_TYPE_PCI		0x1
+#define NFP_CPP_INTERFACE_TYPE_ARM		0x2
+#define NFP_CPP_INTERFACE_TYPE_RPC		0x3
+#define NFP_CPP_INTERFACE_TYPE_ILA		0x4
+
+/*
+ * Construct a 16-bit NFP Interface ID
+ *
+ * Interface IDs consists of 4 bits of interface type, 4 bits of unit
+ * identifier, and 8 bits of channel identifier.
+ *
+ * The NFP Interface ID is used in the implementation of NFP CPP API mutexes,
+ * which use the MU Atomic CompareAndWrite operation - hence the limit to 16
+ * bits to be able to use the NFP Interface ID as a lock owner.
+ *
+ * @param[in]	type	NFP Interface Type
+ * @param[in]	unit	Unit identifier for the interface type
+ * @param[in]	channel	Channel identifier for the interface unit
+ * @return		Interface ID
+ */
+#define NFP_CPP_INTERFACE(type, unit, channel)	\
+	((((type) & 0xf) << 12) | \
+	 (((unit) & 0xf) <<  8) | \
+	 (((channel) & 0xff) << 0))
+
+/*
+ * Get the interface type of a NFP Interface ID
+ * @param[in]	interface	NFP Interface ID
+ * @return			NFP Interface ID's type
+ */
+#define NFP_CPP_INTERFACE_TYPE_of(interface)	(((interface) >> 12) & 0xf)
+
+/*
+ * Get the interface unit of a NFP Interface ID
+ * @param[in]	interface	NFP Interface ID
+ * @return			NFP Interface ID's unit
+ */
+#define NFP_CPP_INTERFACE_UNIT_of(interface)	(((interface) >>  8) & 0xf)
+
+/*
+ * Get the interface channel of a NFP Interface ID
+ * @param[in]	interface	NFP Interface ID
+ * @return			NFP Interface ID's channel
+ */
+#define NFP_CPP_INTERFACE_CHANNEL_of(interface)	(((interface) >>  0) & 0xff)
+
+/*
+ * Retrieve the Interface ID of the NFP
+ * @param[in]	cpp	NFP CPP handle
+ * @return		NFP CPP Interface ID
+ */
+uint16_t nfp_cpp_interface(struct nfp_cpp *cpp);
+
+/*
+ * Retrieve the NFP Serial Number (unique per NFP)
+ * @param[in]	cpp	NFP CPP handle
+ * @param[out]	serial	Pointer to reference the serial number array
+ *
+ * @return	size of the NFP6000 serial number, in bytes
+ */
+int nfp_cpp_serial(struct nfp_cpp *cpp, const uint8_t **serial);
+
+/*
+ * Allocate a NFP CPP area handle, as an offset into a CPP ID
+ * @param[in]	cpp	NFP CPP handle
+ * @param[in]	cpp_id	NFP CPP ID
+ * @param[in]	address	Offset into the NFP CPP ID address space
+ * @param[in]	size	Size of the area to reserve
+ *
+ * @return NFP CPP area handle, or NULL on failure (and set errno accordingly).
+ */
+struct nfp_cpp_area *nfp_cpp_area_alloc(struct nfp_cpp *cpp, uint32_t cpp_id,
+					unsigned long long address,
+					unsigned long size);
+
+/*
+ * Allocate a NFP CPP area handle, as an offset into a CPP ID, by a named owner
+ * @param[in]	cpp	NFP CPP handle
+ * @param[in]	cpp_id	NFP CPP ID
+ * @param[in]	name	Name of owner of the area
+ * @param[in]	address	Offset into the NFP CPP ID address space
+ * @param[in]	size	Size of the area to reserve
+ *
+ * @return NFP CPP area handle, or NULL on failure (and set errno accordingly).
+ */
+struct nfp_cpp_area *nfp_cpp_area_alloc_with_name(struct nfp_cpp *cpp,
+						  uint32_t cpp_id,
+						  const char *name,
+						  unsigned long long address,
+						  unsigned long size);
+
+/*
+ * Free an allocated NFP CPP area handle
+ * @param[in]	area	NFP CPP area handle
+ */
+void nfp_cpp_area_free(struct nfp_cpp_area *area);
+
+/*
+ * Acquire the resources needed to access the NFP CPP area handle
+ *
+ * @param[in]	area	NFP CPP area handle
+ *
+ * @return 0 on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_area_acquire(struct nfp_cpp_area *area);
+
+/*
+ * Release the resources needed to access the NFP CPP area handle
+ *
+ * @param[in]	area	NFP CPP area handle
+ */
+void nfp_cpp_area_release(struct nfp_cpp_area *area);
+
+/*
+ * Allocate, then acquire the resources needed to access the NFP CPP area handle
+ * @param[in]	cpp	NFP CPP handle
+ * @param[in]	cpp_id	NFP CPP ID
+ * @param[in]	address	Offset into the NFP CPP ID address space
+ * @param[in]	size	Size of the area to reserve
+ *
+ * @return NFP CPP area handle, or NULL on failure (and set errno accordingly).
+ */
+struct nfp_cpp_area *nfp_cpp_area_alloc_acquire(struct nfp_cpp *cpp,
+						uint32_t cpp_id,
+						unsigned long long address,
+						unsigned long size);
+
+/*
+ * Release the resources, then free the NFP CPP area handle
+ * @param[in]	area	NFP CPP area handle
+ */
+void nfp_cpp_area_release_free(struct nfp_cpp_area *area);
+
+uint8_t *nfp_cpp_map_area(struct nfp_cpp *cpp, int domain, int target,
+			   uint64_t addr, unsigned long size,
+			   struct nfp_cpp_area **area);
+/*
+ * Return an IO pointer to the beginning of the NFP CPP area handle. The area
+ * must be acquired with 'nfp_cpp_area_acquire()' before calling this operation.
+ *
+ * @param[in]	area	NFP CPP area handle
+ *
+ * @return Pointer to IO memory, or NULL on failure (and set errno accordingly).
+ */
+void *nfp_cpp_area_mapped(struct nfp_cpp_area *area);
+
+/*
+ * Read from a NFP CPP area handle into a buffer. The area must be acquired with
+ * 'nfp_cpp_area_acquire()' before calling this operation.
+ *
+ * @param[in]	area	NFP CPP area handle
+ * @param[in]	offset	Offset into the area
+ * @param[in]	buffer	Location of buffer to receive the data
+ * @param[in]	length	Length of the data to read
+ *
+ * @return bytes read on success, -1 on failure (and set errno accordingly).
+ *
+ */
+int nfp_cpp_area_read(struct nfp_cpp_area *area, unsigned long offset,
+		      void *buffer, size_t length);
+
+/*
+ * Write to a NFP CPP area handle from a buffer. The area must be acquired with
+ * 'nfp_cpp_area_acquire()' before calling this operation.
+ *
+ * @param[in]	area	NFP CPP area handle
+ * @param[in]	offset	Offset into the area
+ * @param[in]	buffer	Location of buffer that holds the data
+ * @param[in]	length	Length of the data to write
+ *
+ * @return bytes written on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_area_write(struct nfp_cpp_area *area, unsigned long offset,
+		       const void *buffer, size_t length);
+
+/*
+ * nfp_cpp_area_iomem() - get IOMEM region for CPP area
+ * @area:       CPP area handle
+ *
+ * Returns an iomem pointer for use with readl()/writel() style operations.
+ *
+ * NOTE: Area must have been locked down with an 'acquire'.
+ *
+ * Return: pointer to the area, or NULL
+ */
+void *nfp_cpp_area_iomem(struct nfp_cpp_area *area);
+
+/*
+ * Verify that IO can be performed on an offset in an area
+ *
+ * @param[in]	area	NFP CPP area handle
+ * @param[in]	offset	Offset into the area
+ * @param[in]	size	Size of region to validate
+ *
+ * @return 0 on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_area_check_range(struct nfp_cpp_area *area,
+			     unsigned long long offset, unsigned long size);
+
+/*
+ * Get the NFP CPP handle that is the parent of a NFP CPP area handle
+ *
+ * @param	cpp_area	NFP CPP area handle
+ * @return			NFP CPP handle
+ */
+struct nfp_cpp *nfp_cpp_area_cpp(struct nfp_cpp_area *cpp_area);
+
+/*
+ * Get the name passed during allocation of the NFP CPP area handle
+ *
+ * @param	cpp_area	NFP CPP area handle
+ * @return			Pointer to the area's name
+ */
+const char *nfp_cpp_area_name(struct nfp_cpp_area *cpp_area);
+
+/*
+ * Read a block of data from a NFP CPP ID
+ *
+ * @param[in]	cpp	NFP CPP handle
+ * @param[in]	cpp_id	NFP CPP ID
+ * @param[in]	address	Offset into the NFP CPP ID address space
+ * @param[in]	kernel_vaddr	Buffer to copy read data to
+ * @param[in]	length	Length of the data to read
+ *
+ * @return bytes read on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_read(struct nfp_cpp *cpp, uint32_t cpp_id,
+		 unsigned long long address, void *kernel_vaddr, size_t length);
+
+/*
+ * Write a block of data to a NFP CPP ID
+ *
+ * @param[in]	cpp	NFP CPP handle
+ * @param[in]	cpp_id	NFP CPP ID
+ * @param[in]	address	Offset into the NFP CPP ID address space
+ * @param[in]	kernel_vaddr	Buffer to copy write data from
+ * @param[in]	length	Length of the data to write
+ *
+ * @return bytes written on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_write(struct nfp_cpp *cpp, uint32_t cpp_id,
+		  unsigned long long address, const void *kernel_vaddr,
+		  size_t length);
+
+/*
+ * Fill a NFP CPP area handle and offset with a value
+ *
+ * @param[in]	area	NFP CPP area handle
+ * @param[in]	offset	Offset into the NFP CPP ID address space
+ * @param[in]	value	32-bit value to fill area with
+ * @param[in]	length	Length of the area to fill
+ *
+ * @return bytes written on success, -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_area_fill(struct nfp_cpp_area *area, unsigned long offset,
+		      uint32_t value, size_t length);
+
+/*
+ * Read a single 32-bit value from a NFP CPP area handle
+ *
+ * @param area		NFP CPP area handle
+ * @param offset	offset into NFP CPP area handle
+ * @param value		output value
+ *
+ * The area must be acquired with 'nfp_cpp_area_acquire()' before calling this
+ * operation.
+ *
+ * NOTE: offset must be 32-bit aligned.
+ *
+ * @return 0 on success, or -1 on error (and set errno accordingly).
+ */
+int nfp_cpp_area_readl(struct nfp_cpp_area *area, unsigned long offset,
+		       uint32_t *value);
+
+/*
+ * Write a single 32-bit value to a NFP CPP area handle
+ *
+ * @param area		NFP CPP area handle
+ * @param offset	offset into NFP CPP area handle
+ * @param value		value to write
+ *
+ * The area must be acquired with 'nfp_cpp_area_acquire()' before calling this
+ * operation.
+ *
+ * NOTE: offset must be 32-bit aligned.
+ *
+ * @return 0 on success, or -1 on error (and set errno accordingly).
+ */
+int nfp_cpp_area_writel(struct nfp_cpp_area *area, unsigned long offset,
+			uint32_t value);
+
+/*
+ * Read a single 64-bit value from a NFP CPP area handle
+ *
+ * @param area		NFP CPP area handle
+ * @param offset	offset into NFP CPP area handle
+ * @param value		output value
+ *
+ * The area must be acquired with 'nfp_cpp_area_acquire()' before calling this
+ * operation.
+ *
+ * NOTE: offset must be 64-bit aligned.
+ *
+ * @return 0 on success, or -1 on error (and set errno accordingly).
+ */
+int nfp_cpp_area_readq(struct nfp_cpp_area *area, unsigned long offset,
+		       uint64_t *value);
+
+/*
+ * Write a single 64-bit value to a NFP CPP area handle
+ *
+ * @param area		NFP CPP area handle
+ * @param offset	offset into NFP CPP area handle
+ * @param value		value to write
+ *
+ * The area must be acquired with 'nfp_cpp_area_acquire()' before calling this
+ * operation.
+ *
+ * NOTE: offset must be 64-bit aligned.
+ *
+ * @return 0 on success, or -1 on error (and set errno accordingly).
+ */
+int nfp_cpp_area_writeq(struct nfp_cpp_area *area, unsigned long offset,
+			uint64_t value);
+
+/*
+ * Write a single 32-bit value on the XPB bus
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt	XPB target and address
+ * @param value         value to write
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_xpb_writel(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t value);
+
+/*
+ * Read a single 32-bit value from the XPB bus
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt	XPB target and address
+ * @param value         output value
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_xpb_readl(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t *value);
+
+/*
+ * Modify bits of a 32-bit value from the XPB bus
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt       XPB target and address
+ * @param mask          mask of bits to alter
+ * @param value         value to modify
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_xpb_writelm(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t mask,
+		    uint32_t value);
+
+/*
+ * Wait for a masked 32-bit value from the XPB bus to match a given value
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt       XPB target and address
+ * @param mask          mask of bits to monitor
+ * @param value         value to monitor for
+ * @param timeout_us    maximum number of us to wait (-1 for forever)
+ *
+ * @return >= 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_xpb_waitlm(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t mask,
+		   uint32_t value, int timeout_us);
+
+/*
+ * Read a 32-bit word from a NFP CPP ID
+ *
+ * @param cpp           NFP CPP handle
+ * @param cpp_id        NFP CPP ID
+ * @param address       offset into the NFP CPP ID address space
+ * @param value         output value
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_readl(struct nfp_cpp *cpp, uint32_t cpp_id,
+		  unsigned long long address, uint32_t *value);
+
+/*
+ * Write a 32-bit value to a NFP CPP ID
+ *
+ * @param cpp           NFP CPP handle
+ * @param cpp_id        NFP CPP ID
+ * @param address       offset into the NFP CPP ID address space
+ * @param value         value to write
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_writel(struct nfp_cpp *cpp, uint32_t cpp_id,
+		   unsigned long long address, uint32_t value);
+
+/*
+ * Read a 64-bit word from a NFP CPP ID
+ *
+ * @param cpp           NFP CPP handle
+ * @param cpp_id        NFP CPP ID
+ * @param address       offset into the NFP CPP ID address space
+ * @param value         output value
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_readq(struct nfp_cpp *cpp, uint32_t cpp_id,
+		  unsigned long long address, uint64_t *value);
+
+/*
+ * Write a 64-bit value to a NFP CPP ID
+ *
+ * @param cpp           NFP CPP handle
+ * @param cpp_id        NFP CPP ID
+ * @param address       offset into the NFP CPP ID address space
+ * @param value         value to write
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_writeq(struct nfp_cpp *cpp, uint32_t cpp_id,
+		   unsigned long long address, uint64_t value);
+
+/*
+ * Initialize a mutex location
+ *
+ * The CPP target:address must point to a 64-bit aligned location; this call
+ * initializes 64 bits of data at that location.
+ *
+ * This creates the initial mutex state, as locked by this nfp_cpp_interface().
+ *
+ * This function should only be called when setting up the initial lock state
+ * upon boot-up of the system.
+ *
+ * @param cpp		NFP CPP handle
+ * @param target	NFP CPP target ID
+ * @param address	Offset into the address space of the NFP CPP target ID
+ * @param key_id	Unique 32-bit value for this mutex
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_mutex_init(struct nfp_cpp *cpp, int target,
+		       unsigned long long address, uint32_t key_id);
+
+/*
+ * Create a mutex handle from an address controlled by a MU Atomic engine
+ *
+ * The CPP target:address must point to a 64-bit aligned location, and reserve
+ * 64 bits of data at the location for use by the handle.
+ *
+ * Only target/address pairs that point to entities that support the MU Atomic
+ * Engine's CmpAndSwap32 command are supported.
+ *
+ * @param cpp		NFP CPP handle
+ * @param target	NFP CPP target ID
+ * @param address	Offset into the address space of the NFP CPP target ID
+ * @param key_id	32-bit unique key (must match the key at this location)
+ *
+ * @return		A non-NULL struct nfp_cpp_mutex * on success, NULL on
+ *                      failure.
+ */
+struct nfp_cpp_mutex *nfp_cpp_mutex_alloc(struct nfp_cpp *cpp, int target,
+					  unsigned long long address,
+					  uint32_t key_id);
+
+/*
+ * Get the NFP CPP handle the mutex was created with
+ *
+ * @param   mutex   NFP mutex handle
+ * @return          NFP CPP handle
+ */
+struct nfp_cpp *nfp_cpp_mutex_cpp(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Get the mutex key
+ *
+ * @param   mutex   NFP mutex handle
+ * @return          Mutex key
+ */
+uint32_t nfp_cpp_mutex_key(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Get the mutex owner
+ *
+ * @param   mutex   NFP mutex handle
+ * @return          Interface ID of the mutex owner
+ *
+ * NOTE: This is for debug purposes ONLY - the owner may change at any time,
+ * unless it has been locked by this NFP CPP handle.
+ */
+uint16_t nfp_cpp_mutex_owner(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Get the mutex target
+ *
+ * @param   mutex   NFP mutex handle
+ * @return          Mutex CPP target (ie NFP_CPP_TARGET_MU)
+ */
+int nfp_cpp_mutex_target(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Get the mutex address
+ *
+ * @param   mutex   NFP mutex handle
+ * @return          Mutex CPP address
+ */
+uint64_t nfp_cpp_mutex_address(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Free a mutex handle - does not alter the lock state
+ *
+ * @param mutex		NFP CPP Mutex handle
+ */
+void nfp_cpp_mutex_free(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Lock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * @param mutex		NFP CPP Mutex handle
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_mutex_lock(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Unlock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * @param mutex		NFP CPP Mutex handle
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int nfp_cpp_mutex_unlock(struct nfp_cpp_mutex *mutex);
+
+/*
+ * Attempt to lock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * @param mutex		NFP CPP Mutex handle
+ * @return		0 if the lock succeeded, -1 on failure (and errno set
+ *			appropriately).
+ */
+int nfp_cpp_mutex_trylock(struct nfp_cpp_mutex *mutex);
+
+#endif /* !__NFP_CPP_H__ */
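
As an illustration of how the area calls declared above fit together, given an
open struct nfp_cpp *cpp handle, a caller would typically reserve a window,
perform an aligned access and release it again. The CPP ID and offsets below
are hypothetical values chosen only for illustration:

	struct nfp_cpp_area *area;
	uint32_t value;
	/* target/action/token packed into a CPP ID; example values only */
	uint32_t cpp_id = NFP_CPP_ID(NFP_CPP_TARGET_ARM, NFP_CPP_ACTION_RW, 0);

	/* allocate and lock down a 4 KB window at a hypothetical offset */
	area = nfp_cpp_area_alloc_acquire(cpp, cpp_id, 0x400000, 0x1000);
	if (area == NULL)
		return -1;

	/* 32-bit aligned read relative to the start of the area */
	if (nfp_cpp_area_readl(area, 0x0, &value) < 0) {
		nfp_cpp_area_release_free(area);
		return -1;
	}

	nfp_cpp_area_release_free(area);
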
diff --git a/drivers/net/nfp/nfpcore/nfp_cpp_pcie_ops.c b/drivers/net/nfp/nfpcore/nfp_cpp_pcie_ops.c
new file mode 100644
index 0000000..ad6ce72
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_cpp_pcie_ops.c
@@ -0,0 +1,936 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+/*
+ * nfp_cpp_pcie_ops.c
+ * Authors: Vinayak Tammineedi <vinayak.tammineedi@netronome.com>
+ *
+ * Multiplexes the NFP BARs between NFP internal resources and
+ * implements the PCIe specific interface for generic CPP bus access.
+ *
+ * The BARs are managed and allocated if they are available.
+ * The generic CPP bus abstraction builds upon this BAR interface.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <execinfo.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <dirent.h>
+#include <libgen.h>
+
+#include <sys/mman.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+
+#include "nfp_cpp.h"
+#include "nfp_target.h"
+#include "nfp6000/nfp6000.h"
+
+#define NFP_PCIE_BAR(_pf)	(0x30000 + ((_pf) & 7) * 0xc0)
+
+#define NFP_PCIE_BAR_PCIE2CPP_ACTION_BASEADDRESS(_x)  (((_x) & 0x1f) << 16)
+#define NFP_PCIE_BAR_PCIE2CPP_BASEADDRESS(_x)         (((_x) & 0xffff) << 0)
+#define NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT(_x)        (((_x) & 0x3) << 27)
+#define NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_32BIT    0
+#define NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_64BIT    1
+#define NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_0BYTE    3
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE(_x)             (((_x) & 0x7) << 29)
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_OF(_x)          (((_x) >> 29) & 0x7)
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_FIXED         0
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_BULK          1
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_TARGET        2
+#define NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_GENERAL       3
+#define NFP_PCIE_BAR_PCIE2CPP_TARGET_BASEADDRESS(_x)  (((_x) & 0xf) << 23)
+#define NFP_PCIE_BAR_PCIE2CPP_TOKEN_BASEADDRESS(_x)   (((_x) & 0x3) << 21)
+
+/*
+ * Minimal size of the PCIe cfg memory we depend on being mapped;
+ * the queue controller and DMA controller do not have to be covered.
+ */
+#define NFP_PCI_MIN_MAP_SIZE				0x080000
+
+#define NFP_PCIE_P2C_FIXED_SIZE(bar)               (1 << (bar)->bitsize)
+#define NFP_PCIE_P2C_BULK_SIZE(bar)                (1 << (bar)->bitsize)
+#define NFP_PCIE_P2C_GENERAL_TARGET_OFFSET(bar, x) ((x) << ((bar)->bitsize - 2))
+#define NFP_PCIE_P2C_GENERAL_TOKEN_OFFSET(bar, x) ((x) << ((bar)->bitsize - 4))
+#define NFP_PCIE_P2C_GENERAL_SIZE(bar)             (1 << ((bar)->bitsize - 4))
+
+#define NFP_PCIE_CFG_BAR_PCIETOCPPEXPBAR(bar, slot) \
+	(NFP_PCIE_BAR(0) + ((bar) * 8 + (slot)) * 4)
+
+#define NFP_PCIE_CPP_BAR_PCIETOCPPEXPBAR(bar, slot) \
+	(((bar) * 8 + (slot)) * 4)
+
+/* Define DEBUG to enable a bit more verbose debug output. */
+struct nfp_pcie_user;
+struct nfp6000_area_priv;
+
+/*
+ * struct nfp_bar - describes BAR configuration and usage
+ * @nfp:	backlink to owner
+ * @barcfg:	cached contents of BAR config CSR
+ * @base:	the BAR's base CPP offset
+ * @mask:       mask for the BAR aperture (read only)
+ * @bitsize:	bitsize of BAR aperture (read only)
+ * @index:	index of the BAR
+ * @lock:	lock to specify if bar is in use
+ * @refcnt:	number of current users
+ * @iomem:	mapped IO memory
+ */
+#define NFP_BAR_MAX 7
+struct nfp_bar {
+	struct nfp_pcie_user *nfp;
+	uint32_t barcfg;
+	uint64_t base;		/* CPP address base */
+	uint64_t mask;		/* Bit mask of the bar */
+	uint32_t bitsize;	/* Bit size of the bar */
+	int index;
+	int lock;
+
+	char *csr;
+	char *iomem;
+};
+
+#define BUSDEV_SZ	13
+struct nfp_pcie_user {
+	struct nfp_bar bar[NFP_BAR_MAX];
+
+	int device;
+	int lock;
+	char busdev[BUSDEV_SZ];
+	int barsz;
+	char *cfg;
+};
+
+static uint32_t
+nfp_bar_maptype(struct nfp_bar *bar)
+{
+	return NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_OF(bar->barcfg);
+}
+
+#define TARGET_WIDTH_32    4
+#define TARGET_WIDTH_64    8
+
+static int
+nfp_compute_bar(const struct nfp_bar *bar, uint32_t *bar_config,
+		uint64_t *bar_base, int tgt, int act, int tok,
+		uint64_t offset, size_t size, int width)
+{
+	uint32_t bitsize;
+	uint32_t newcfg;
+	uint64_t mask;
+
+	if (tgt >= 16)
+		return -EINVAL;
+
+	switch (width) {
+	case 8:
+		newcfg =
+		    NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT
+		    (NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_64BIT);
+		break;
+	case 4:
+		newcfg =
+		    NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT
+		    (NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_32BIT);
+		break;
+	case 0:
+		newcfg =
+		    NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT
+		    (NFP_PCIE_BAR_PCIE2CPP_LENGTHSELECT_0BYTE);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (act != NFP_CPP_ACTION_RW && act != 0) {
+		/* Fixed CPP mapping with specific action */
+		mask = ~(NFP_PCIE_P2C_FIXED_SIZE(bar) - 1);
+
+		newcfg |=
+		    NFP_PCIE_BAR_PCIE2CPP_MAPTYPE
+		    (NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_FIXED);
+		newcfg |= NFP_PCIE_BAR_PCIE2CPP_TARGET_BASEADDRESS(tgt);
+		newcfg |= NFP_PCIE_BAR_PCIE2CPP_ACTION_BASEADDRESS(act);
+		newcfg |= NFP_PCIE_BAR_PCIE2CPP_TOKEN_BASEADDRESS(tok);
+
+		if ((offset & mask) != ((offset + size - 1) & mask)) {
+			printf("BAR%d: Won't use for Fixed mapping\n",
+				bar->index);
+			printf("\t<%#llx,%#llx>, action=%d\n",
+				(unsigned long long)offset,
+				(unsigned long long)(offset + size), act);
+			printf("\tBAR too small (0x%llx).\n",
+				(unsigned long long)mask);
+			return -EINVAL;
+		}
+		offset &= mask;
+
+#ifdef DEBUG
+		printf("BAR%d: Created Fixed mapping\n", bar->index);
+		printf("\t%d:%d:%d:0x%#llx-0x%#llx>\n", tgt, act, tok,
+			(unsigned long long)offset,
+			(unsigned long long)(offset + mask));
+#endif
+
+		bitsize = 40 - 16;
+	} else {
+		mask = ~(NFP_PCIE_P2C_BULK_SIZE(bar) - 1);
+
+		/* Bulk mapping */
+		newcfg |=
+		    NFP_PCIE_BAR_PCIE2CPP_MAPTYPE
+		    (NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_BULK);
+
+		newcfg |= NFP_PCIE_BAR_PCIE2CPP_TARGET_BASEADDRESS(tgt);
+		newcfg |= NFP_PCIE_BAR_PCIE2CPP_TOKEN_BASEADDRESS(tok);
+
+		if ((offset & mask) != ((offset + size - 1) & mask)) {
+			printf("BAR%d: Won't use for bulk mapping\n",
+				bar->index);
+			printf("\t<%#llx,%#llx>\n", (unsigned long long)offset,
+				(unsigned long long)(offset + size));
+			printf("\ttarget=%d, token=%d\n", tgt, tok);
+			printf("\tBAR too small (%#llx) - (%#llx != %#llx).\n",
+				(unsigned long long)mask,
+				(unsigned long long)(offset & mask),
+				(unsigned long long)(offset + size - 1) & mask);
+
+			return -EINVAL;
+		}
+
+		offset &= mask;
+
+#ifdef DEBUG
+		printf("BAR%d: Created bulk mapping %d:x:%d:%#llx-%#llx\n",
+			bar->index, tgt, tok, (unsigned long long)offset,
+			(unsigned long long)(offset + ~mask));
+#endif
+
+		bitsize = 40 - 21;
+	}
+
+	if (bar->bitsize < bitsize) {
+		printf("BAR%d: Too small for %d:%d:%d\n", bar->index, tgt, tok,
+			act);
+		return -EINVAL;
+	}
+
+	newcfg |= offset >> bitsize;
+
+	if (bar_base)
+		*bar_base = offset;
+
+	if (bar_config)
+		*bar_config = newcfg;
+
+	return 0;
+}
+
+static int
+nfp_bar_write(struct nfp_pcie_user *nfp, struct nfp_bar *bar,
+		  uint32_t newcfg)
+{
+	int base, slot;
+
+	base = bar->index >> 3;
+	slot = bar->index & 7;
+
+	if (!nfp->cfg)
+		return (-ENOMEM);
+
+	bar->csr = nfp->cfg +
+		   NFP_PCIE_CFG_BAR_PCIETOCPPEXPBAR(base, slot);
+
+	*(uint32_t *)(bar->csr) = newcfg;
+
+	bar->barcfg = newcfg;
+#ifdef DEBUG
+	printf("BAR%d: updated to 0x%08x\n", bar->index, newcfg);
+#endif
+
+	return 0;
+}
+
+static int
+nfp_reconfigure_bar(struct nfp_pcie_user *nfp, struct nfp_bar *bar, int tgt,
+		int act, int tok, uint64_t offset, size_t size, int width)
+{
+	uint64_t newbase;
+	uint32_t newcfg;
+	int err;
+
+	err = nfp_compute_bar(bar, &newcfg, &newbase, tgt, act, tok, offset,
+			      size, width);
+	if (err)
+		return err;
+
+	bar->base = newbase;
+
+	return nfp_bar_write(nfp, bar, newcfg);
+}
+
+/*
+ * Map all PCI bars. We assume that the BAR with the PCIe config block is
+ * already mapped.
+ *
+ * BAR0.0: Reserved for General Mapping (for MSI-X access to PCIe SRAM)
+ */
+static int
+nfp_enable_bars(struct nfp_pcie_user *nfp)
+{
+	struct nfp_bar *bar;
+	int x;
+
+	for (x = ARRAY_SIZE(nfp->bar); x > 0; x--) {
+		bar = &nfp->bar[x - 1];
+		bar->barcfg = 0;
+		bar->nfp = nfp;
+		bar->index = x;
+		bar->mask = (1 << (nfp->barsz - 3)) - 1;
+		bar->bitsize = nfp->barsz - 3;
+		bar->base = 0;
+		bar->iomem = NULL;
+		bar->lock = 0;
+		bar->csr = nfp->cfg +
+			   NFP_PCIE_CFG_BAR_PCIETOCPPEXPBAR(bar->index >> 3,
+							   bar->index & 7);
+		bar->iomem =
+		    (char *)mmap(0, 1 << bar->bitsize, PROT_READ | PROT_WRITE,
+				 MAP_SHARED, nfp->device,
+				 bar->index << bar->bitsize);
+
+		if (bar->iomem == MAP_FAILED)
+			return (-ENOMEM);
+	}
+	return 0;
+}
+
+static struct nfp_bar *
+nfp_alloc_bar(struct nfp_pcie_user *nfp)
+{
+	struct nfp_bar *bar;
+	int x;
+
+	for (x = ARRAY_SIZE(nfp->bar); x > 0; x--) {
+		bar = &nfp->bar[x - 1];
+		if (!bar->lock) {
+			bar->lock = 1;
+			return bar;
+		}
+	}
+	return NULL;
+}
+
+static void
+nfp_disable_bars(struct nfp_pcie_user *nfp)
+{
+	struct nfp_bar *bar;
+	int x;
+
+	for (x = ARRAY_SIZE(nfp->bar); x > 0; x--) {
+		bar = &nfp->bar[x - 1];
+		if (bar->iomem) {
+			munmap(bar->iomem, 1 << (nfp->barsz - 3));
+			bar->iomem = NULL;
+			bar->lock = 0;
+		}
+	}
+}
+
+/*
+ * Generic CPP bus access interface.
+ */
+
+struct nfp6000_area_priv {
+	struct nfp_bar *bar;
+	uint32_t bar_offset;
+
+	uint32_t target;
+	uint32_t action;
+	uint32_t token;
+	uint64_t offset;
+	struct {
+		int read;
+		int write;
+		int bar;
+	} width;
+	size_t size;
+	char *iomem;
+};
+
+static int
+nfp6000_area_init(struct nfp_cpp_area *area, uint32_t dest,
+		  unsigned long long address, unsigned long size)
+{
+	struct nfp_pcie_user *nfp = nfp_cpp_priv(nfp_cpp_area_cpp(area));
+	struct nfp6000_area_priv *priv = nfp_cpp_area_priv(area);
+	uint32_t target = NFP_CPP_ID_TARGET_of(dest);
+	uint32_t action = NFP_CPP_ID_ACTION_of(dest);
+	uint32_t token = NFP_CPP_ID_TOKEN_of(dest);
+	int pp, ret = 0;
+
+	pp = nfp6000_target_pushpull(NFP_CPP_ID(target, action, token),
+				     address);
+	if (pp < 0)
+		return pp;
+
+	priv->width.read = PUSH_WIDTH(pp);
+	priv->width.write = PULL_WIDTH(pp);
+
+	if (priv->width.read > 0 &&
+	    priv->width.write > 0 && priv->width.read != priv->width.write)
+		return -EINVAL;
+
+	if (priv->width.read > 0)
+		priv->width.bar = priv->width.read;
+	else
+		priv->width.bar = priv->width.write;
+
+	priv->bar = nfp_alloc_bar(nfp);
+	if (priv->bar == NULL)
+		return -ENOMEM;
+
+	priv->target = target;
+	priv->action = action;
+	priv->token = token;
+	priv->offset = address;
+	priv->size = size;
+
+	ret = nfp_reconfigure_bar(nfp, priv->bar, priv->target, priv->action,
+				  priv->token, priv->offset, priv->size,
+				  priv->width.bar);
+
+	return ret;
+}
+
+static int
+nfp6000_area_acquire(struct nfp_cpp_area *area)
+{
+	struct nfp6000_area_priv *priv = nfp_cpp_area_priv(area);
+
+	/* Calculate offset into BAR. */
+	if (nfp_bar_maptype(priv->bar) ==
+	    NFP_PCIE_BAR_PCIE2CPP_MAPTYPE_GENERAL) {
+		priv->bar_offset = priv->offset &
+			(NFP_PCIE_P2C_GENERAL_SIZE(priv->bar) - 1);
+		priv->bar_offset +=
+			NFP_PCIE_P2C_GENERAL_TARGET_OFFSET(priv->bar,
+							   priv->target);
+		priv->bar_offset +=
+		    NFP_PCIE_P2C_GENERAL_TOKEN_OFFSET(priv->bar, priv->token);
+	} else {
+		priv->bar_offset = priv->offset & priv->bar->mask;
+	}
+
+	/* Must have been too big. Sub-allocate. */
+	if (!priv->bar->iomem)
+		return (-ENOMEM);
+
+	priv->iomem = priv->bar->iomem + priv->bar_offset;
+
+	return 0;
+}
+
+static void *
+nfp6000_area_mapped(struct nfp_cpp_area *area)
+{
+	struct nfp6000_area_priv *area_priv = nfp_cpp_area_priv(area);
+
+	if (!area_priv->iomem)
+		return NULL;
+
+	return area_priv->iomem;
+}
+
+static void
+nfp6000_area_release(struct nfp_cpp_area *area)
+{
+	struct nfp6000_area_priv *priv = nfp_cpp_area_priv(area);
+	priv->bar->lock = 0;
+	priv->bar = NULL;
+	priv->iomem = NULL;
+}
+
+static void *
+nfp6000_area_iomem(struct nfp_cpp_area *area)
+{
+	struct nfp6000_area_priv *priv = nfp_cpp_area_priv(area);
+	return priv->iomem;
+}
+
+static int
+nfp6000_area_read(struct nfp_cpp_area *area, void *kernel_vaddr,
+		  unsigned long offset, unsigned int length)
+{
+	uint64_t *wrptr64 = kernel_vaddr;
+	const volatile uint64_t *rdptr64;
+	struct nfp6000_area_priv *priv;
+	uint32_t *wrptr32 = kernel_vaddr;
+	const volatile uint32_t *rdptr32;
+	int width;
+	unsigned int n;
+	bool is_64;
+
+	priv = nfp_cpp_area_priv(area);
+	rdptr64 = (uint64_t *)(priv->iomem + offset);
+	rdptr32 = (uint32_t *)(priv->iomem + offset);
+
+	if (offset + length > priv->size)
+		return -EFAULT;
+
+	width = priv->width.read;
+
+	if (width <= 0)
+		return -EINVAL;
+
+	/* Unaligned? Translate to an explicit access */
+	if ((priv->offset + offset) & (width - 1)) {
+		printf("area_read unaligned!!!\n");
+		return -EINVAL;
+	}
+
+	is_64 = width == TARGET_WIDTH_64;
+
+	/* MU reads via a PCIe2CPP BAR support 32bit (and other) lengths */
+	if (priv->target == (NFP_CPP_TARGET_ID_MASK & NFP_CPP_TARGET_MU) &&
+	    priv->action == NFP_CPP_ACTION_RW) {
+		is_64 = false;
+	}
+
+	if (is_64) {
+		if (offset % sizeof(uint64_t) != 0 ||
+		    length % sizeof(uint64_t) != 0)
+			return -EINVAL;
+	} else {
+		if (offset % sizeof(uint32_t) != 0 ||
+		    length % sizeof(uint32_t) != 0)
+			return -EINVAL;
+	}
+
+	if (!priv->bar)
+		return -EFAULT;
+
+	if (is_64)
+		for (n = 0; n < length; n += sizeof(uint64_t)) {
+			*wrptr64 = *rdptr64;
+			wrptr64++;
+			rdptr64++;
+		}
+	else
+		for (n = 0; n < length; n += sizeof(uint32_t)) {
+			*wrptr32 = *rdptr32;
+			wrptr32++;
+			rdptr32++;
+		}
+
+	return n;
+}
+
+static int
+nfp6000_area_write(struct nfp_cpp_area *area, const void *kernel_vaddr,
+		   unsigned long offset, unsigned int length)
+{
+	const uint64_t *rdptr64 = kernel_vaddr;
+	uint64_t *wrptr64;
+	const uint32_t *rdptr32 = kernel_vaddr;
+	struct nfp6000_area_priv *priv;
+	uint32_t *wrptr32;
+	int width;
+	unsigned int n;
+	bool is_64;
+
+	priv = nfp_cpp_area_priv(area);
+	wrptr64 = (uint64_t *)(priv->iomem + offset);
+	wrptr32 = (uint32_t *)(priv->iomem + offset);
+
+	if (offset + length > priv->size)
+		return -EFAULT;
+
+	width = priv->width.write;
+
+	if (width <= 0)
+		return -EINVAL;
+
+	/* Unaligned? Translate to an explicit access */
+	if ((priv->offset + offset) & (width - 1))
+		return -EINVAL;
+
+	is_64 = width == TARGET_WIDTH_64;
+
+	/* MU writes via a PCIe2CPP BAR support 32bit (and other) lengths */
+	if (priv->target == (NFP_CPP_TARGET_ID_MASK & NFP_CPP_TARGET_MU) &&
+	    priv->action == NFP_CPP_ACTION_RW)
+		is_64 = false;
+
+	if (is_64) {
+		if (offset % sizeof(uint64_t) != 0 ||
+		    length % sizeof(uint64_t) != 0)
+			return -EINVAL;
+	} else {
+		if (offset % sizeof(uint32_t) != 0 ||
+		    length % sizeof(uint32_t) != 0)
+			return -EINVAL;
+	}
+
+	if (!priv->bar)
+		return -EFAULT;
+
+	if (is_64)
+		for (n = 0; n < length; n += sizeof(uint64_t)) {
+			*wrptr64 = *rdptr64;
+			wrptr64++;
+			rdptr64++;
+		}
+	else
+		for (n = 0; n < length; n += sizeof(uint32_t)) {
+			*wrptr32 = *rdptr32;
+			wrptr32++;
+			rdptr32++;
+		}
+
+	return n;
+}
+
+#define PCI_DEVICES "/sys/bus/pci/devices"
+
+static int
+nfp_acquire_process_lock(struct nfp_pcie_user *desc)
+{
+	int rc;
+	struct flock lock;
+	char lockname[30];
+
+	memset(&lock, 0, sizeof(lock));
+
+	snprintf(lockname, sizeof(lockname), "/var/lock/nfp_%s", desc->busdev);
+	desc->lock = open(lockname, O_RDWR | O_CREAT, 0666);
+	if (desc->lock < 0)
+		return desc->lock;
+
+	lock.l_type = F_WRLCK;
+	lock.l_whence = SEEK_SET;
+	rc = -1;
+	while (rc != 0) {
+		rc = fcntl(desc->lock, F_SETLKW, &lock);
+		if (rc < 0) {
+			if (errno != EAGAIN && errno != EACCES) {
+				close(desc->lock);
+				return rc;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int
+nfp6000_set_model(struct nfp_pcie_user *desc, struct nfp_cpp *cpp)
+{
+	char tmp_str[80];
+	uint32_t tmp;
+	int fp;
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/config", PCI_DEVICES,
+		 desc->busdev);
+
+	fp = open(tmp_str, O_RDONLY);
+	if (!fp)
+		return -1;
+
+	lseek(fp, 0x2e, SEEK_SET);
+
+	if (read(fp, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+		printf("Error reading config file for model\n");
+		return -1;
+	}
+
+	tmp = tmp << 16;
+
+	if (close(fp) == -1)
+		return -1;
+
+	nfp_cpp_model_set(cpp, tmp);
+
+	return 0;
+}
+
+static int
+nfp6000_set_interface(struct nfp_pcie_user *desc, struct nfp_cpp *cpp)
+{
+	char tmp_str[80];
+	uint16_t tmp;
+	int fp;
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/config", PCI_DEVICES,
+		 desc->busdev);
+
+	fp = open(tmp_str, O_RDONLY);
+	if (!fp)
+		return -1;
+
+	lseek(fp, 0x154, SEEK_SET);
+
+	if (read(fp, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+		printf("error reading config file for interface\n");
+		return -1;
+	}
+
+	if (close(fp) == -1)
+		return -1;
+
+	nfp_cpp_interface_set(cpp, tmp);
+
+	return 0;
+}
+
+#define PCI_CFG_SPACE_SIZE	256
+#define PCI_CFG_SPACE_EXP_SIZE	4096
+#define PCI_EXT_CAP_ID(header)		(int)(header & 0x0000ffff)
+#define PCI_EXT_CAP_NEXT(header)	((header >> 20) & 0xffc)
+#define PCI_EXT_CAP_ID_DSN	0x03
+static int
+nfp_pci_find_next_ext_capability(int fp, int cap)
+{
+	uint32_t header;
+	int ttl;
+	int pos = PCI_CFG_SPACE_SIZE;
+
+	/* minimum 8 bytes per capability */
+	ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8;
+
+	lseek(fp, pos, SEEK_SET);
+	if (read(fp, &header, sizeof(header)) != sizeof(header)) {
+		printf("error reading config file for serial\n");
+		return -1;
+	}
+
+	/*
+	 * If we have no capabilities, this is indicated by cap ID,
+	 * cap version and next pointer all being 0.
+	 */
+	if (header == 0)
+		return 0;
+
+	while (ttl-- > 0) {
+		if (PCI_EXT_CAP_ID(header) == cap)
+			return pos;
+
+		pos = PCI_EXT_CAP_NEXT(header);
+		if (pos < PCI_CFG_SPACE_SIZE)
+			break;
+
+		lseek(fp, pos, SEEK_SET);
+		if (read(fp, &header, sizeof(header)) != sizeof(header)) {
+			printf("error reading config file for serial\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int
+nfp6000_set_serial(struct nfp_pcie_user *desc, struct nfp_cpp *cpp)
+{
+	char tmp_str[80];
+	uint16_t tmp;
+	uint8_t serial[6];
+	int serial_len = 6;
+	int fp, pos;
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/config", PCI_DEVICES,
+		 desc->busdev);
+
+	fp = open(tmp_str, O_RDONLY);
+	if (!fp)
+		return -1;
+
+	pos = nfp_pci_find_next_ext_capability(fp, PCI_EXT_CAP_ID_DSN);
+	if (pos <= 0) {
+		printf("PCI_EXT_CAP_ID_DSN not found. Using default offset\n");
+		lseek(fp, 0x156, SEEK_SET);
+	} else {
+		lseek(fp, pos + 6, SEEK_SET);
+	}
+
+	if (read(fp, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+		printf("error reading config file for serial\n");
+		return -1;
+	}
+
+	serial[4] = (uint8_t)((tmp >> 8) & 0xff);
+	serial[5] = (uint8_t)(tmp & 0xff);
+
+	if (read(fp, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+		printf("error reading config file for serial\n");
+		return -1;
+	}
+
+	serial[2] = (uint8_t)((tmp >> 8) & 0xff);
+	serial[3] = (uint8_t)(tmp & 0xff);
+
+	if (read(fp, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+		printf("error reading config file for serial\n");
+		return -1;
+	}
+
+	serial[0] = (uint8_t)((tmp >> 8) & 0xff);
+	serial[1] = (uint8_t)(tmp & 0xff);
+
+	if (close(fp) == -1)
+		return -1;
+
+	nfp_cpp_serial_set(cpp, serial, serial_len);
+
+	return 0;
+}
+
+static int
+nfp6000_set_barsz(struct nfp_pcie_user *desc)
+{
+	char tmp_str[80];
+	unsigned long start, end, flags, tmp;
+	int i;
+	FILE *fp;
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/resource", PCI_DEVICES,
+		 desc->busdev);
+
+	fp = fopen(tmp_str, "r");
+	if (!fp)
+		return -1;
+
+	if (fscanf(fp, "0x%lx 0x%lx 0x%lx", &start, &end, &flags) == 0) {
+		printf("error reading resource file for bar size\n");
+		return -1;
+	}
+
+	if (fclose(fp) == -1)
+		return -1;
+
+	tmp = (end - start) + 1;
+	i = 0;
+	while (tmp >>= 1)
+		i++;
+	desc->barsz = i;
+	return 0;
+}
+
+static int
+nfp6000_init(struct nfp_cpp *cpp, const char *devname)
+{
+	char link[120];
+	char tmp_str[80];
+	ssize_t size;
+	int ret = 0;
+	uint32_t model;
+	struct nfp_pcie_user *desc;
+
+	desc = malloc(sizeof(*desc));
+	if (!desc)
+		return -1;
+
+	memset(desc->busdev, 0, BUSDEV_SZ);
+	strncpy(desc->busdev, devname, strlen(devname));
+
+	ret = nfp_acquire_process_lock(desc);
+	if (ret)
+		return -1;
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/driver", PCI_DEVICES,
+		 desc->busdev);
+
+	size = readlink(tmp_str, link, sizeof(link));
+
+	if (size == -1)
+		tmp_str[0] = '\0';
+
+	if (size == sizeof(link))
+		tmp_str[0] = '\0';
+
+	snprintf(tmp_str, sizeof(tmp_str), "%s/%s/resource0", PCI_DEVICES,
+		 desc->busdev);
+
+	desc->device = open(tmp_str, O_RDWR);
+	if (desc->device == -1)
+		return -1;
+
+	if (nfp6000_set_model(desc, cpp) < 0)
+		return -1;
+	if (nfp6000_set_interface(desc, cpp) < 0)
+		return -1;
+	if (nfp6000_set_serial(desc, cpp) < 0)
+		return -1;
+	if (nfp6000_set_barsz(desc) < 0)
+		return -1;
+
+	desc->cfg = (char *)mmap(0, 1 << (desc->barsz - 3),
+				 PROT_READ | PROT_WRITE,
+				 MAP_SHARED, desc->device, 0);
+
+	if (desc->cfg == MAP_FAILED)
+		return -1;
+
+	nfp_enable_bars(desc);
+
+	nfp_cpp_priv_set(cpp, desc);
+
+	model = __nfp_cpp_model_autodetect(cpp);
+	nfp_cpp_model_set(cpp, model);
+
+	return ret;
+}
+
+static void
+nfp6000_free(struct nfp_cpp *cpp)
+{
+	struct nfp_pcie_user *desc = nfp_cpp_priv(cpp);
+	int x;
+
+	/* Unmap may cause problems if there are any pending transactions */
+	nfp_disable_bars(desc);
+	munmap(desc->cfg, 1 << (desc->barsz - 3));
+
+	for (x = ARRAY_SIZE(desc->bar); x > 0; x--) {
+		if (desc->bar[x - 1].iomem)
+			munmap(desc->bar[x - 1].iomem, 1 << (desc->barsz - 3));
+	}
+	close(desc->lock);
+	close(desc->device);
+	free(desc);
+}
+
+static const struct nfp_cpp_operations nfp6000_pcie_ops = {
+	.init = nfp6000_init,
+	.free = nfp6000_free,
+
+	.area_priv_size = sizeof(struct nfp6000_area_priv),
+	.area_init = nfp6000_area_init,
+	.area_acquire = nfp6000_area_acquire,
+	.area_release = nfp6000_area_release,
+	.area_mapped = nfp6000_area_mapped,
+	.area_read = nfp6000_area_read,
+	.area_write = nfp6000_area_write,
+	.area_iomem = nfp6000_area_iomem,
+};
+
+const struct
+nfp_cpp_operations *nfp_cpp_transport_operations(void)
+{
+	return &nfp6000_pcie_ops;
+}
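
As a worked example of the BAR carving implemented above (the BAR size is
hypothetical): with a 128 MB PCIe BAR, nfp6000_set_barsz() computes barsz = 27,
so every expansion window spans 1 << (27 - 3) = 16 MB. nfp_enable_bars() then
mmap()s NFP_BAR_MAX (7) such windows at offsets index << 24 into resource0,
while offset 0 is covered by the separate configuration mapping (desc->cfg)
set up in nfp6000_init(), through which nfp_bar_write() reprograms the
PCIe-to-CPP expansion BAR CSRs.
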
diff --git a/drivers/net/nfp/nfpcore/nfp_cppcore.c b/drivers/net/nfp/nfpcore/nfp_cppcore.c
new file mode 100644
index 0000000..94d4a0b
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_cppcore.c
@@ -0,0 +1,856 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+
+#include <rte_byteorder.h>
+
+#include "nfp_cpp.h"
+#include "nfp_target.h"
+#include "nfp6000/nfp6000.h"
+#include "nfp6000/nfp_xpb.h"
+#include "nfp_nffw.h"
+
+#define NFP_PL_DEVICE_ID                        0x00000004
+#define NFP_PL_DEVICE_ID_MASK                   0xff
+
+#define NFP6000_ARM_GCSR_SOFTMODEL0             0x00400144
+
+void
+nfp_cpp_priv_set(struct nfp_cpp *cpp, void *priv)
+{
+	cpp->priv = priv;
+}
+
+void *
+nfp_cpp_priv(struct nfp_cpp *cpp)
+{
+	return cpp->priv;
+}
+
+void
+nfp_cpp_model_set(struct nfp_cpp *cpp, uint32_t model)
+{
+	cpp->model = model;
+}
+
+uint32_t
+nfp_cpp_model(struct nfp_cpp *cpp)
+{
+	if (!cpp)
+		return NFP_CPP_MODEL_INVALID;
+
+	if (cpp->model == 0)
+		cpp->model = __nfp_cpp_model_autodetect(cpp);
+
+	return cpp->model;
+}
+
+void
+nfp_cpp_interface_set(struct nfp_cpp *cpp, uint32_t interface)
+{
+	cpp->interface = interface;
+}
+
+int
+nfp_cpp_serial(struct nfp_cpp *cpp, const uint8_t **serial)
+{
+	*serial = cpp->serial;
+	return cpp->serial_len;
+}
+
+int
+nfp_cpp_serial_set(struct nfp_cpp *cpp, const uint8_t *serial,
+		   size_t serial_len)
+{
+	if (cpp->serial_len)
+		free(cpp->serial);
+
+	cpp->serial = malloc(serial_len);
+	if (!cpp->serial)
+		return -1;
+
+	memcpy(cpp->serial, serial, serial_len);
+	cpp->serial_len = serial_len;
+
+	return 0;
+}
+
+uint16_t
+nfp_cpp_interface(struct nfp_cpp *cpp)
+{
+	if (!cpp)
+		return NFP_CPP_INTERFACE(NFP_CPP_INTERFACE_TYPE_INVALID, 0, 0);
+
+	return cpp->interface;
+}
+
+void *
+nfp_cpp_area_priv(struct nfp_cpp_area *cpp_area)
+{
+	return &cpp_area[1];
+}
+
+struct nfp_cpp *
+nfp_cpp_area_cpp(struct nfp_cpp_area *cpp_area)
+{
+	return cpp_area->cpp;
+}
+
+const char *
+nfp_cpp_area_name(struct nfp_cpp_area *cpp_area)
+{
+	return cpp_area->name;
+}
+
+/*
+ * nfp_cpp_area_alloc - allocate a new CPP area
+ * @cpp:    CPP handle
+ * @dest:   CPP id
+ * @address:    start address on CPP target
+ * @size:   size of area in bytes
+ *
+ * Allocate and initialize a CPP area structure.  The area must later
+ * be locked down with an 'acquire' before it can be safely accessed.
+ *
+ * NOTE: @address and @size must be 32-bit aligned values.
+ */
+struct nfp_cpp_area *
+nfp_cpp_area_alloc_with_name(struct nfp_cpp *cpp, uint32_t dest,
+			      const char *name, unsigned long long address,
+			      unsigned long size)
+{
+	struct nfp_cpp_area *area;
+	uint64_t tmp64 = (uint64_t)address;
+	int tmp, err;
+
+	if (!cpp)
+		return NULL;
+
+	/* CPP bus uses only a 40-bit address */
+	if ((address + size) > (1ULL << 40))
+		return NFP_ERRPTR(EFAULT);
+
+	/* Remap from cpp_island to cpp_target */
+	err = nfp_target_cpp(dest, tmp64, &dest, &tmp64, cpp->imb_cat_table);
+	if (err < 0)
+		return NULL;
+
+	address = (unsigned long long)tmp64;
+
+	if (!name)
+		name = "";
+
+	area = calloc(1, sizeof(*area) + cpp->op->area_priv_size +
+		      strlen(name) + 1);
+	if (!area)
+		return NULL;
+
+	area->cpp = cpp;
+	area->name = ((char *)area) + sizeof(*area) + cpp->op->area_priv_size;
+	memcpy(area->name, name, strlen(name) + 1);
+
+	/*
+	 * Preserve errno around the call to area_init, since most
+	 * implementations will blindly call nfp_target_action_width() for
+	 * both read and write modes, and that will set errno to EINVAL.
+	 */
+	tmp = errno;
+
+	err = cpp->op->area_init(area, dest, address, size);
+	if (err < 0) {
+		free(area);
+		return NULL;
+	}
+
+	/* Restore errno */
+	errno = tmp;
+
+	area->offset = address;
+	area->size = size;
+
+	return area;
+}
+
+struct nfp_cpp_area *
+nfp_cpp_area_alloc(struct nfp_cpp *cpp, uint32_t dest,
+		    unsigned long long address, unsigned long size)
+{
+	return nfp_cpp_area_alloc_with_name(cpp, dest, NULL, address, size);
+}
+
+/*
+ * nfp_cpp_area_alloc_acquire - allocate a new CPP area and lock it down
+ *
+ * @cpp:    CPP handle
+ * @dest:   CPP id
+ * @address:    start address on CPP target
+ * @size:   size of area
+ *
+ * Allocate and initialize a CPP area structure, and lock it down so
+ * that it can be accessed directly.
+ *
+ * NOTE: @address and @size must be 32-bit aligned values.
+ *
+ * NOTE: The area must also be 'released' when the structure is freed.
+ */
+struct nfp_cpp_area *
+nfp_cpp_area_alloc_acquire(struct nfp_cpp *cpp, uint32_t destination,
+			    unsigned long long address, unsigned long size)
+{
+	struct nfp_cpp_area *area;
+
+	area = nfp_cpp_area_alloc(cpp, destination, address, size);
+	if (!area)
+		return NULL;
+
+	if (nfp_cpp_area_acquire(area)) {
+		nfp_cpp_area_free(area);
+		return NULL;
+	}
+
+	return area;
+}
+
+/*
+ * nfp_cpp_area_free - free up the CPP area
+ * area:    CPP area handle
+ *
+ * Frees up memory resources held by the CPP area.
+ */
+void
+nfp_cpp_area_free(struct nfp_cpp_area *area)
+{
+	if (area->cpp->op->area_cleanup)
+		area->cpp->op->area_cleanup(area);
+	free(area);
+}
+
+/*
+ * nfp_cpp_area_release_free - release CPP area and free it
+ * area:    CPP area handle
+ *
+ * Releases CPP area and frees up memory resources held by it.
+ */
+void
+nfp_cpp_area_release_free(struct nfp_cpp_area *area)
+{
+	nfp_cpp_area_release(area);
+	nfp_cpp_area_free(area);
+}
+
+/*
+ * nfp_cpp_area_acquire - lock down a CPP area for access
+ * @area:   CPP area handle
+ *
+ * Locks down the CPP area for a potential long term activity.  Area
+ * must always be locked down before being accessed.
+ */
+int
+nfp_cpp_area_acquire(struct nfp_cpp_area *area)
+{
+	if (area->cpp->op->area_acquire) {
+		int err = area->cpp->op->area_acquire(area);
+
+		if (err < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * nfp_cpp_area_release - release a locked down CPP area
+ * @area:   CPP area handle
+ *
+ * Releases a previously locked down CPP area.
+ */
+void
+nfp_cpp_area_release(struct nfp_cpp_area *area)
+{
+	if (area->cpp->op->area_release)
+		area->cpp->op->area_release(area);
+}
+
+/*
+ * nfp_cpp_area_iomem() - get IOMEM region for CPP area
+ *
+ * @area:       CPP area handle
+ *
+ * Returns an iomem pointer for use with readl()/writel() style operations.
+ *
+ * NOTE: Area must have been locked down with an 'acquire'.
+ *
+ * Return: pointer to the area, or NULL
+ */
+void *
+nfp_cpp_area_iomem(struct nfp_cpp_area *area)
+{
+	void *iomem = NULL;
+
+	if (area->cpp->op->area_iomem)
+		iomem = area->cpp->op->area_iomem(area);
+
+	return iomem;
+}
+
+/*
+ * nfp_cpp_area_read - read data from CPP area
+ *
+ * @area:       CPP area handle
+ * @offset:     offset into CPP area
+ * @kernel_vaddr:   kernel address to put data into
+ * @length:     number of bytes to read
+ *
+ * Read data from indicated CPP region.
+ *
+ * NOTE: @offset and @length must be 32-bit aligned values.
+ *
+ * NOTE: Area must have been locked down with an 'acquire'.
+ */
+int
+nfp_cpp_area_read(struct nfp_cpp_area *area, unsigned long offset,
+		  void *kernel_vaddr, size_t length)
+{
+	if ((offset + length) > area->size)
+		return NFP_ERRNO(EFAULT);
+
+	return area->cpp->op->area_read(area, kernel_vaddr, offset, length);
+}
+
+/*
+ * nfp_cpp_area_write - write data to CPP area
+ *
+ * @area:       CPP area handle
+ * @offset:     offset into CPP area
+ * @kernel_vaddr:   kernel address to read data from
+ * @length:     number of bytes to write
+ *
+ * Write data to indicated CPP region.
+ *
+ * NOTE: @offset and @length must be 32-bit aligned values.
+ *
+ * NOTE: Area must have been locked down with an 'acquire'.
+ */
+int
+nfp_cpp_area_write(struct nfp_cpp_area *area, unsigned long offset,
+		   const void *kernel_vaddr, size_t length)
+{
+	if ((offset + length) > area->size)
+		return NFP_ERRNO(EFAULT);
+
+	return area->cpp->op->area_write(area, kernel_vaddr, offset, length);
+}
+
+void *
+nfp_cpp_area_mapped(struct nfp_cpp_area *area)
+{
+	if (area->cpp->op->area_mapped)
+		return area->cpp->op->area_mapped(area);
+	return NULL;
+}
+
+/*
+ * nfp_cpp_area_check_range - check if address range fits in CPP area
+ *
+ * @area:   CPP area handle
+ * @offset: offset into CPP area
+ * @length: size of address range in bytes
+ *
+ * Check if address range fits within CPP area.  Return 0 if area fits
+ * or -1 on error.
+ */
+int
+nfp_cpp_area_check_range(struct nfp_cpp_area *area, unsigned long long offset,
+			 unsigned long length)
+{
+	if (((offset + length) > area->size))
+		return NFP_ERRNO(EFAULT);
+
+	return 0;
+}
+
+/*
+ * Return the correct CPP address, and fixup xpb_addr as needed,
+ * based upon NFP model.
+ */
+static uint32_t
+nfp_xpb_to_cpp(struct nfp_cpp *cpp, uint32_t *xpb_addr)
+{
+	uint32_t xpb;
+	int island;
+
+	if (!NFP_CPP_MODEL_IS_6000(cpp->model))
+		return 0;
+
+	xpb = NFP_CPP_ID(14, NFP_CPP_ACTION_RW, 0);
+
+	/*
+	 * Ensure that non-local XPB accesses go out through the
+	 * global XPBM bus.
+	 */
+	island = ((*xpb_addr) >> 24) & 0x3f;
+
+	if (!island)
+		return xpb;
+
+	if (island == 1) {
+		/*
+		 * Accesses to the ARM Island overlay use the Island 0
+		 * Global Bit
+		 */
+		(*xpb_addr) &= ~0x7f000000;
+		if (*xpb_addr < 0x60000)
+			*xpb_addr |= (1 << 30);
+		else
+			/* And only non-ARM interfaces use island id = 1 */
+			if (NFP_CPP_INTERFACE_TYPE_of(nfp_cpp_interface(cpp)) !=
+			    NFP_CPP_INTERFACE_TYPE_ARM)
+				*xpb_addr |= (1 << 24);
+	} else {
+		(*xpb_addr) |= (1 << 30);
+	}
+
+	return xpb;
+}
+
+int
+nfp_cpp_area_readl(struct nfp_cpp_area *area, unsigned long offset,
+		   uint32_t *value)
+{
+	int sz;
+	uint32_t tmp = 0;
+
+	sz = nfp_cpp_area_read(area, offset, &tmp, sizeof(tmp));
+	*value = rte_le_to_cpu_32(tmp);
+
+	return (sz == sizeof(*value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_area_writel(struct nfp_cpp_area *area, unsigned long offset,
+		    uint32_t value)
+{
+	int sz;
+
+	value = rte_cpu_to_le_32(value);
+	sz = nfp_cpp_area_write(area, offset, &value, sizeof(value));
+	return (sz == sizeof(value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_area_readq(struct nfp_cpp_area *area, unsigned long offset,
+		   uint64_t *value)
+{
+	int sz;
+	uint64_t tmp = 0;
+
+	sz = nfp_cpp_area_read(area, offset, &tmp, sizeof(tmp));
+	*value = rte_le_to_cpu_64(tmp);
+
+	return (sz == sizeof(*value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_area_writeq(struct nfp_cpp_area *area, unsigned long offset,
+		    uint64_t value)
+{
+	int sz;
+
+	value = rte_cpu_to_le_64(value);
+	sz = nfp_cpp_area_write(area, offset, &value, sizeof(value));
+
+	return (sz == sizeof(value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_readl(struct nfp_cpp *cpp, uint32_t cpp_id, unsigned long long address,
+	      uint32_t *value)
+{
+	int sz;
+	uint32_t tmp;
+
+	sz = nfp_cpp_read(cpp, cpp_id, address, &tmp, sizeof(tmp));
+	*value = rte_le_to_cpu_32(tmp);
+
+	return (sz == sizeof(*value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_writel(struct nfp_cpp *cpp, uint32_t cpp_id, unsigned long long address,
+	       uint32_t value)
+{
+	int sz;
+
+	value = rte_cpu_to_le_32(value);
+	sz = nfp_cpp_write(cpp, cpp_id, address, &value, sizeof(value));
+
+	return (sz == sizeof(value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_readq(struct nfp_cpp *cpp, uint32_t cpp_id, unsigned long long address,
+	      uint64_t *value)
+{
+	int sz;
+	uint64_t tmp;
+
+	sz = nfp_cpp_read(cpp, cpp_id, address, &tmp, sizeof(tmp));
+	*value = rte_le_to_cpu_64(tmp);
+
+	return (sz == sizeof(*value)) ? 0 : -1;
+}
+
+int
+nfp_cpp_writeq(struct nfp_cpp *cpp, uint32_t cpp_id, unsigned long long address,
+	       uint64_t value)
+{
+	int sz;
+
+	value = rte_cpu_to_le_64(value);
+	sz = nfp_cpp_write(cpp, cpp_id, address, &value, sizeof(value));
+
+	return (sz == sizeof(value)) ? 0 : -1;
+}
+
+int
+nfp_xpb_writel(struct nfp_cpp *cpp, uint32_t xpb_addr, uint32_t value)
+{
+	uint32_t cpp_dest;
+
+	cpp_dest = nfp_xpb_to_cpp(cpp, &xpb_addr);
+
+	return nfp_cpp_writel(cpp, cpp_dest, xpb_addr, value);
+}
+
+int
+nfp_xpb_readl(struct nfp_cpp *cpp, uint32_t xpb_addr, uint32_t *value)
+{
+	uint32_t cpp_dest;
+
+	cpp_dest = nfp_xpb_to_cpp(cpp, &xpb_addr);
+
+	return nfp_cpp_readl(cpp, cpp_dest, xpb_addr, value);
+}
+
+static struct nfp_cpp *
+nfp_cpp_alloc(const char *devname)
+{
+	const struct nfp_cpp_operations *ops;
+	struct nfp_cpp *cpp;
+	int err;
+
+	ops = nfp_cpp_transport_operations();
+
+	if (!ops || !ops->init)
+		return NFP_ERRPTR(EINVAL);
+
+	cpp = calloc(1, sizeof(*cpp));
+	if (!cpp)
+		return NULL;
+
+	cpp->op = ops;
+
+	if (cpp->op->init) {
+		err = cpp->op->init(cpp, devname);
+		if (err < 0) {
+			free(cpp);
+			return NULL;
+		}
+	}
+
+	if (NFP_CPP_MODEL_IS_6000(nfp_cpp_model(cpp))) {
+		uint32_t xpbaddr;
+		size_t tgt;
+
+		for (tgt = 0; tgt < ARRAY_SIZE(cpp->imb_cat_table); tgt++) {
+			/* Hardcoded XPB IMB Base, island 0 */
+			xpbaddr = 0x000a0000 + (tgt * 4);
+			err = nfp_xpb_readl(cpp, xpbaddr,
+				(uint32_t *)&cpp->imb_cat_table[tgt]);
+			if (err < 0) {
+				free(cpp);
+				return NULL;
+			}
+		}
+	}
+
+	return cpp;
+}
+
+/*
+ * nfp_cpp_free - free the CPP handle
+ * @cpp:    CPP handle
+ */
+void
+nfp_cpp_free(struct nfp_cpp *cpp)
+{
+	if (cpp->op && cpp->op->free)
+		cpp->op->free(cpp);
+
+	if (cpp->serial_len)
+		free(cpp->serial);
+
+	free(cpp);
+}
+
+struct nfp_cpp *
+nfp_cpp_from_device_name(const char *devname)
+{
+	return nfp_cpp_alloc(devname);
+}
+
+/*
+ * Modify bits of a 32-bit value from the XPB bus
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt       XPB target and address
+ * @param mask          mask of bits to alter
+ * @param value         value to modify
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int
+nfp_xpb_writelm(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t mask,
+		uint32_t value)
+{
+	int err;
+	uint32_t tmp;
+
+	err = nfp_xpb_readl(cpp, xpb_tgt, &tmp);
+	if (err < 0)
+		return err;
+
+	tmp &= ~mask;
+	tmp |= (mask & value);
+	return nfp_xpb_writel(cpp, xpb_tgt, tmp);
+}
+
+/*
+ * Wait for a masked 32-bit value from the XPB bus to match a given value
+ *
+ * @param cpp           NFP CPP device handle
+ * @param xpb_tgt       XPB target and address
+ * @param mask          mask of bits to monitor
+ * @param value         value to monitor for
+ * @param timeout_us    maximum number of us to wait (-1 for forever)
+ *
+ * @return >= 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int
+nfp_xpb_waitlm(struct nfp_cpp *cpp, uint32_t xpb_tgt, uint32_t mask,
+	       uint32_t value, int timeout_us)
+{
+	uint32_t tmp;
+	int err;
+
+	do {
+		err = nfp_xpb_readl(cpp, xpb_tgt, &tmp);
+		if (err < 0)
+			goto exit;
+
+		if ((tmp & mask) == (value & mask)) {
+			if (timeout_us < 0)
+				timeout_us = 0;
+			break;
+		}
+
+		if (timeout_us < 0)
+			continue;
+
+		timeout_us -= 100;
+		usleep(100);
+	} while (timeout_us >= 0);
+
+	if (timeout_us < 0)
+		err = NFP_ERRNO(ETIMEDOUT);
+	else
+		err = timeout_us;
+
+exit:
+	return err;
+}
+
+/*
+ * nfp_cpp_read - read from CPP target
+ * @cpp:        CPP handle
+ * @destination:    CPP id
+ * @address:        offset into CPP target
+ * @kernel_vaddr:   kernel buffer for result
+ * @length:     number of bytes to read
+ */
+int
+nfp_cpp_read(struct nfp_cpp *cpp, uint32_t destination,
+	     unsigned long long address, void *kernel_vaddr, size_t length)
+{
+	struct nfp_cpp_area *area;
+	int err;
+
+	area = nfp_cpp_area_alloc_acquire(cpp, destination, address, length);
+	if (!area) {
+		printf("Area allocation/acquire failed\n");
+		return -1;
+	}
+
+	err = nfp_cpp_area_read(area, 0, kernel_vaddr, length);
+
+	nfp_cpp_area_release_free(area);
+	return err;
+}
+
+/*
+ * nfp_cpp_write - write to CPP target
+ * @cpp:        CPP handle
+ * @destination:    CPP id
+ * @address:        offset into CPP target
+ * @kernel_vaddr:   kernel buffer to read from
+ * @length:     number of bytes to write
+ */
+int
+nfp_cpp_write(struct nfp_cpp *cpp, uint32_t destination,
+	      unsigned long long address, const void *kernel_vaddr,
+	      size_t length)
+{
+	struct nfp_cpp_area *area;
+	int err;
+
+	area = nfp_cpp_area_alloc_acquire(cpp, destination, address, length);
+	if (!area)
+		return -1;
+
+	err = nfp_cpp_area_write(area, 0, kernel_vaddr, length);
+
+	nfp_cpp_area_release_free(area);
+	return err;
+}
+
+/*
+ * nfp_cpp_area_fill - fill a CPP area with a value
+ * @area:       CPP area
+ * @offset:     offset into CPP area
+ * @value:      value to fill with
+ * @length:     length of area to fill
+ */
+int
+nfp_cpp_area_fill(struct nfp_cpp_area *area, unsigned long offset,
+		  uint32_t value, size_t length)
+{
+	int err;
+	size_t i;
+	uint64_t value64;
+
+	value = rte_cpu_to_le_32(value);
+	value64 = ((uint64_t)value << 32) | value;
+
+	if ((offset + length) > area->size)
+		return NFP_ERRNO(EINVAL);
+
+	if ((area->offset + offset) & 3)
+		return NFP_ERRNO(EINVAL);
+
+	if (((area->offset + offset) & 7) == 4 && length >= 4) {
+		err = nfp_cpp_area_write(area, offset, &value, sizeof(value));
+		if (err < 0)
+			return err;
+		if (err != sizeof(value))
+			return NFP_ERRNO(ENOSPC);
+		offset += sizeof(value);
+		length -= sizeof(value);
+	}
+
+	for (i = 0; (i + sizeof(value)) < length; i += sizeof(value64)) {
+		err =
+		    nfp_cpp_area_write(area, offset + i, &value64,
+				       sizeof(value64));
+		if (err < 0)
+			return err;
+		if (err != sizeof(value64))
+			return NFP_ERRNO(ENOSPC);
+	}
+
+	if ((i + sizeof(value)) <= length) {
+		err =
+		    nfp_cpp_area_write(area, offset + i, &value, sizeof(value));
+		if (err < 0)
+			return err;
+		if (err != sizeof(value))
+			return NFP_ERRNO(ENOSPC);
+		i += sizeof(value);
+	}
+
+	return (int)i;
+}
+
+/*
+ * NOTE: This code should not use nfp_xpb_* functions,
+ * as those are model-specific
+ */
+uint32_t
+__nfp_cpp_model_autodetect(struct nfp_cpp *cpp)
+{
+	uint32_t arm_id = NFP_CPP_ID(NFP_CPP_TARGET_ARM, 0, 0);
+	uint32_t model = 0;
+
+	nfp_cpp_readl(cpp, arm_id, NFP6000_ARM_GCSR_SOFTMODEL0, &model);
+
+	if (NFP_CPP_MODEL_IS_6000(model)) {
+		uint32_t tmp;
+
+		nfp_cpp_model_set(cpp, model);
+
+		/* The PL's PluDeviceID revision code is authoritative */
+		model &= ~0xff;
+		nfp_xpb_readl(cpp, NFP_XPB_DEVICE(1, 1, 16) +
+				   NFP_PL_DEVICE_ID, &tmp);
+		model |= (NFP_PL_DEVICE_ID_MASK & tmp) - 0x10;
+	}
+
+	return model;
+}
+
+/*
+ * nfp_cpp_map_area() - Helper function to map an area
+ * @cpp:    NFP CPP handler
+ * @domain: CPP domain
+ * @target: CPP target
+ * @addr:   CPP address
+ * @size:   Size of the area
+ * @area:   Area handle (output)
+ *
+ * Map an area of IOMEM access.  To undo the effect of this function call
+ * @nfp_cpp_area_release_free(*area).
+ *
+ * Return: Pointer to memory mapped area or NULL on failure
+ */
+uint8_t *
+nfp_cpp_map_area(struct nfp_cpp *cpp, int domain, int target, uint64_t addr,
+		 unsigned long size, struct nfp_cpp_area **area)
+{
+	uint8_t *res;
+	uint32_t dest;
+
+	dest = NFP_CPP_ISLAND_ID(target, NFP_CPP_ACTION_RW, 0, domain);
+
+	*area = nfp_cpp_area_alloc_acquire(cpp, dest, addr, size);
+	if (!*area)
+		goto err_eio;
+
+	res = nfp_cpp_area_iomem(*area);
+	if (!res)
+		goto err_release_free;
+
+	return res;
+
+err_release_free:
+	nfp_cpp_area_release_free(*area);
+err_eio:
+	return NULL;
+}
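
A minimal sketch of the CPP handle lifecycle built from the functions above;
the PCI address string is a hypothetical example (it must fit the BUSDEV_SZ
buffer used by the PCIe transport):

	struct nfp_cpp *cpp;
	uint32_t model;

	cpp = nfp_cpp_from_device_name("0000:01:00.0");
	if (cpp == NULL)
		return -1;

	/* model is autodetected on first use if the transport left it unset */
	model = nfp_cpp_model(cpp);
	printf("NFP model: 0x%08x\n", model);

	nfp_cpp_free(cpp);
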
diff --git a/drivers/net/nfp/nfpcore/nfp_crc.c b/drivers/net/nfp/nfpcore/nfp_crc.c
new file mode 100644
index 0000000..20431bf
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_crc.c
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "nfp_crc.h"
+
+static inline uint32_t
+nfp_crc32_be_generic(uint32_t crc, unsigned char const *p, size_t len,
+		 uint32_t polynomial)
+{
+	int i;
+	while (len--) {
+		crc ^= *p++ << 24;
+		for (i = 0; i < 8; i++)
+			crc = (crc << 1) ^ ((crc & 0x80000000) ? polynomial :
+					  0);
+	}
+	return crc;
+}
+
+static inline uint32_t
+nfp_crc32_be(uint32_t crc, unsigned char const *p, size_t len)
+{
+	return nfp_crc32_be_generic(crc, p, len, CRCPOLY_BE);
+}
+
+static uint32_t
+nfp_crc32_posix_end(uint32_t crc, size_t total_len)
+{
+	/* Extend with the length of the string. */
+	while (total_len != 0) {
+		uint8_t c = total_len & 0xff;
+
+		crc = nfp_crc32_be(crc, &c, 1);
+		total_len >>= 8;
+	}
+
+	return ~crc;
+}
+
+uint32_t
+nfp_crc32_posix(const void *buff, size_t len)
+{
+	return nfp_crc32_posix_end(nfp_crc32_be(0, buff, len), len);
+}
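
This matches the convention used by the hwinfo consumer later in this series:
the producer stores the CRC in the last 32-bit word of a table, and validation
recomputes it over the preceding bytes. A sketch, for a table of size bytes
starting at buf (uint8_t *):

	uint32_t calc = nfp_crc32_posix(buf, size - sizeof(uint32_t));
	uint32_t stored = *(uint32_t *)(buf + size - sizeof(uint32_t));

	if (calc != stored)
		return -EINVAL;	/* corrupt table */
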
diff --git a/drivers/net/nfp/nfpcore/nfp_crc.h b/drivers/net/nfp/nfpcore/nfp_crc.h
new file mode 100644
index 0000000..f99c89f
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_crc.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_CRC_H__
+#define __NFP_CRC_H__
+
+/*
+ * There are multiple 32-bit CRC polynomials in common use, but this is
+ * *the* standard CRC-32 polynomial, first popularized by Ethernet.
+ * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0
+ */
+#define CRCPOLY_LE 0xedb88320
+#define CRCPOLY_BE 0x04c11db7
+
+uint32_t nfp_crc32_posix(const void *buff, size_t len);
+
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_hwinfo.c b/drivers/net/nfp/nfpcore/nfp_hwinfo.c
new file mode 100644
index 0000000..c0516bf
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_hwinfo.c
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+/* Parse the hwinfo table that the ARM firmware builds in the ARM scratch SRAM
+ * after chip reset.
+ *
+ * Examples of the fields:
+ *   me.count = 40
+ *   me.mask = 0x7f_ffff_ffff
+ *
+ *   me.count is the total number of MEs on the system.
+ *   me.mask is the bitmask of MEs that are available for application usage.
+ *
+ *   (ie, in this example, ME 39 has been reserved by boardconfig.)
+ */
+
+#include <stdio.h>
+#include <time.h>
+
+#include "nfp_cpp.h"
+#include "nfp6000/nfp6000.h"
+#include "nfp_resource.h"
+#include "nfp_hwinfo.h"
+#include "nfp_crc.h"
+
+static int
+nfp_hwinfo_is_updating(struct nfp_hwinfo *hwinfo)
+{
+	return hwinfo->version & NFP_HWINFO_VERSION_UPDATING;
+}
+
+static int
+nfp_hwinfo_db_walk(struct nfp_hwinfo *hwinfo, uint32_t size)
+{
+	const char *key, *val, *end = hwinfo->data + size;
+
+	for (key = hwinfo->data; *key && key < end;
+	     key = val + strlen(val) + 1) {
+		val = key + strlen(key) + 1;
+		if (val >= end) {
+			printf("Bad HWINFO - overflowing key\n");
+			return -EINVAL;
+		}
+
+		if (val + strlen(val) + 1 > end) {
+			printf("Bad HWINFO - overflowing value\n");
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static int
+nfp_hwinfo_db_validate(struct nfp_hwinfo *db, uint32_t len)
+{
+	uint32_t size, new_crc, *crc;
+
+	size = db->size;
+	if (size > len) {
+		printf("Unsupported hwinfo size %u > %u\n", size, len);
+		return -EINVAL;
+	}
+
+	size -= sizeof(uint32_t);
+	new_crc = nfp_crc32_posix((char *)db, size);
+	crc = (uint32_t *)(db->start + size);
+	if (new_crc != *crc) {
+		printf("Corrupt hwinfo table (CRC mismatch)\n");
+		printf("\tcalculated 0x%x, expected 0x%x\n", new_crc, *crc);
+		return -EINVAL;
+	}
+
+	return nfp_hwinfo_db_walk(db, size);
+}
+
+static struct nfp_hwinfo *
+nfp_hwinfo_try_fetch(struct nfp_cpp *cpp, size_t *cpp_size)
+{
+	struct nfp_hwinfo *header;
+	void *res;
+	uint64_t cpp_addr;
+	uint32_t cpp_id;
+	int err;
+	uint8_t *db;
+
+	res = nfp_resource_acquire(cpp, NFP_RESOURCE_NFP_HWINFO);
+	if (res) {
+		cpp_id = nfp_resource_cpp_id(res);
+		cpp_addr = nfp_resource_address(res);
+		*cpp_size = nfp_resource_size(res);
+
+		nfp_resource_release(res);
+
+		if (*cpp_size < HWINFO_SIZE_MIN)
+			return NULL;
+	} else {
+		return NULL;
+	}
+
+	db = malloc(*cpp_size + 1);
+	if (!db)
+		return NULL;
+
+	err = nfp_cpp_read(cpp, cpp_id, cpp_addr, db, *cpp_size);
+	if (err != (int)*cpp_size)
+		goto exit_free;
+
+	header = (void *)db;
+	printf("NFP HWINFO header: %08x\n", *(uint32_t *)header);
+	if (nfp_hwinfo_is_updating(header))
+		goto exit_free;
+
+	if (header->version != NFP_HWINFO_VERSION_2) {
+		printf("Unknown HWInfo version: 0x%08x\n",
+			header->version);
+		goto exit_free;
+	}
+
+	/* NULL-terminate for safety */
+	db[*cpp_size] = '\0';
+
+	return (void *)db;
+exit_free:
+	free(db);
+	return NULL;
+}
+
+static struct nfp_hwinfo *
+nfp_hwinfo_fetch(struct nfp_cpp *cpp, size_t *hwdb_size)
+{
+	struct timespec wait;
+	struct nfp_hwinfo *db;
+	int count;
+
+	wait.tv_sec = 0;
+	wait.tv_nsec = 10000000;
+	count = 0;
+
+	for (;;) {
+		db = nfp_hwinfo_try_fetch(cpp, hwdb_size);
+		if (db)
+			return db;
+
+		nanosleep(&wait, NULL);
+		if (count++ > 200) {
+			printf("NFP access error\n");
+			return NULL;
+		}
+	}
+}
+
+struct nfp_hwinfo *
+nfp_hwinfo_read(struct nfp_cpp *cpp)
+{
+	struct nfp_hwinfo *db;
+	size_t hwdb_size = 0;
+	int err;
+
+	db = nfp_hwinfo_fetch(cpp, &hwdb_size);
+	if (!db)
+		return NULL;
+
+	err = nfp_hwinfo_db_validate(db, hwdb_size);
+	if (err) {
+		free(db);
+		return NULL;
+	}
+	return db;
+}
+
+/*
+ * nfp_hwinfo_lookup() - Find a value in the HWInfo table by name
+ * @hwinfo:	NFP HWinfo table
+ * @lookup:	HWInfo name to search for
+ *
+ * Return: Value of the HWInfo name, or NULL
+ */
+const char *
+nfp_hwinfo_lookup(struct nfp_hwinfo *hwinfo, const char *lookup)
+{
+	const char *key, *val, *end;
+
+	if (!hwinfo || !lookup)
+		return NULL;
+
+	end = hwinfo->data + hwinfo->size - sizeof(uint32_t);
+
+	for (key = hwinfo->data; *key && key < end;
+	     key = val + strlen(val) + 1) {
+		val = key + strlen(key) + 1;
+
+		if (strcmp(key, lookup) == 0)
+			return val;
+	}
+
+	return NULL;
+}
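+
+/*
+ * Usage sketch (illustrative only, helper name made up): fetching a single
+ * key such as the "me.count" field mentioned at the top of this file. The
+ * table returned by nfp_hwinfo_read() is heap allocated and is freed by the
+ * caller once the looked-up value is no longer needed.
+ */
+static inline int
+nfp_hwinfo_usage_sketch(struct nfp_cpp *cpp)
+{
+	struct nfp_hwinfo *hwinfo;
+	const char *value;
+
+	hwinfo = nfp_hwinfo_read(cpp);
+	if (!hwinfo)
+		return -ENODEV;
+
+	value = nfp_hwinfo_lookup(hwinfo, "me.count");
+	if (value)
+		printf("me.count = %s\n", value);
+
+	free(hwinfo);
+	return 0;
+}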
diff --git a/drivers/net/nfp/nfpcore/nfp_hwinfo.h b/drivers/net/nfp/nfpcore/nfp_hwinfo.h
new file mode 100644
index 0000000..ccc6163
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_hwinfo.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_HWINFO_H__
+#define __NFP_HWINFO_H__
+
+#include <inttypes.h>
+
+#define HWINFO_SIZE_MIN	0x100
+
+/*
+ * The Hardware Info Table defines the properties of the system.
+ *
+ * HWInfo v1 Table (fixed size)
+ *
+ * 0x0000: uint32_t version	        Hardware Info Table version (1.0)
+ * 0x0004: uint32_t size	        Total size of the table, including the
+ *					CRC32 (IEEE 802.3)
+ * 0x0008: uint32_t jumptab	        Offset of key/value table
+ * 0x000c: uint32_t keys	        Total number of keys in the key/value
+ *					table
+ * NNNNNN:				Key/value jump table and string data
+ * (size - 4): uint32_t crc32	CRC32 (same as IEEE 802.3, POSIX csum, etc)
+ *				CRC32("",0) = ~0, CRC32("a",1) = 0x48C279FE
+ *
+ * HWInfo v2 Table (variable size)
+ *
+ * 0x0000: uint32_t version	        Hardware Info Table version (2.0)
+ * 0x0004: uint32_t size	        Current size of the data area, excluding
+ *					CRC32
+ * 0x0008: uint32_t limit	        Maximum size of the table
+ * 0x000c: uint32_t reserved	        Unused, set to zero
+ * NNNNNN:			Key/value data
+ * (size - 4): uint32_t crc32	CRC32 (same as IEEE 802.3, POSIX csum, etc)
+ *				CRC32("",0) = ~0, CRC32("a",1) = 0x48C279FE
+ *
+ * If the HWInfo table is in the process of being updated, the low bit of
+ * version will be set.
+ *
+ * HWInfo v1 Key/Value Table
+ * -------------------------
+ *
+ *  The key/value table is a set of offsets to ASCIIZ strings which have
+ *  been strcmp(3) sorted (yes, please use bsearch(3) on the table).
+ *
+ *  All keys are guaranteed to be unique.
+ *
+ * N+0:	uint32_t key_1		Offset to the first key
+ * N+4:	uint32_t val_1		Offset to the first value
+ * N+8: uint32_t key_2		Offset to the second key
+ * N+c: uint32_t val_2		Offset to the second value
+ * ...
+ *
+ * HWInfo v2 Key/Value Table
+ * -------------------------
+ *
+ * Packed UTF8Z strings, ie 'key1\000value1\000key2\000value2\000'
+ *
+ * Unsorted.
+ */
+
+#define NFP_HWINFO_VERSION_1 ('H' << 24 | 'I' << 16 | 1 << 8 | 0 << 1 | 0)
+#define NFP_HWINFO_VERSION_2 ('H' << 24 | 'I' << 16 | 2 << 8 | 0 << 1 | 0)
+#define NFP_HWINFO_VERSION_UPDATING	BIT(0)
+
+struct nfp_hwinfo {
+	uint8_t start[0];
+
+	uint32_t version;
+	uint32_t size;
+
+	/* v2 specific fields */
+	uint32_t limit;
+	uint32_t resv;
+
+	char data[];
+};
+
+struct nfp_hwinfo *nfp_hwinfo_read(struct nfp_cpp *cpp);
+
+const char *nfp_hwinfo_lookup(struct nfp_hwinfo *hwinfo, const char *lookup);
+
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_mip.c b/drivers/net/nfp/nfpcore/nfp_mip.c
new file mode 100644
index 0000000..c86966d
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_mip.c
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <rte_byteorder.h>
+
+#include "nfp_cpp.h"
+#include "nfp_mip.h"
+#include "nfp_nffw.h"
+
+#define NFP_MIP_SIGNATURE	rte_cpu_to_le_32(0x0050494d)  /* "MIP\0" */
+#define NFP_MIP_VERSION		rte_cpu_to_le_32(1)
+#define NFP_MIP_MAX_OFFSET	(256 * 1024)
+
+struct nfp_mip {
+	uint32_t signature;
+	uint32_t mip_version;
+	uint32_t mip_size;
+	uint32_t first_entry;
+
+	uint32_t version;
+	uint32_t buildnum;
+	uint32_t buildtime;
+	uint32_t loadtime;
+
+	uint32_t symtab_addr;
+	uint32_t symtab_size;
+	uint32_t strtab_addr;
+	uint32_t strtab_size;
+
+	char name[16];
+	char toolchain[32];
+};
+
+/* Read memory and check if it could be a valid MIP */
+static int
+nfp_mip_try_read(struct nfp_cpp *cpp, uint32_t cpp_id, uint64_t addr,
+		 struct nfp_mip *mip)
+{
+	int ret;
+
+	ret = nfp_cpp_read(cpp, cpp_id, addr, mip, sizeof(*mip));
+	if (ret != sizeof(*mip)) {
+		printf("Failed to read MIP data (%d, %zu)\n",
+			ret, sizeof(*mip));
+		return -EIO;
+	}
+	if (mip->signature != NFP_MIP_SIGNATURE) {
+		printf("Incorrect MIP signature (0x%08x)\n",
+			 rte_le_to_cpu_32(mip->signature));
+		return -EINVAL;
+	}
+	if (mip->mip_version != NFP_MIP_VERSION) {
+		printf("Unsupported MIP version (%d)\n",
+			 rte_le_to_cpu_32(mip->mip_version));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Try to locate MIP using the resource table */
+static int
+nfp_mip_read_resource(struct nfp_cpp *cpp, struct nfp_mip *mip)
+{
+	struct nfp_nffw_info *nffw_info;
+	uint32_t cpp_id;
+	uint64_t addr;
+	int err;
+
+	nffw_info = nfp_nffw_info_open(cpp);
+	if (!nffw_info)
+		return -ENODEV;
+
+	err = nfp_nffw_info_mip_first(nffw_info, &cpp_id, &addr);
+	if (err)
+		goto exit_close_nffw;
+
+	err = nfp_mip_try_read(cpp, cpp_id, addr, mip);
+exit_close_nffw:
+	nfp_nffw_info_close(nffw_info);
+	return err;
+}
+
+/*
+ * nfp_mip_open() - Get device MIP structure
+ * @cpp:	NFP CPP Handle
+ *
+ * Copy MIP structure from NFP device and return it.  The returned
+ * structure is handled internally by the library and should be
+ * freed by calling nfp_mip_close().
+ *
+ * Return: pointer to mip, NULL on failure.
+ */
+struct nfp_mip *
+nfp_mip_open(struct nfp_cpp *cpp)
+{
+	struct nfp_mip *mip;
+	int err;
+
+	mip = malloc(sizeof(*mip));
+	if (!mip)
+		return NULL;
+
+	err = nfp_mip_read_resource(cpp, mip);
+	if (err) {
+		free(mip);
+		return NULL;
+	}
+
+	mip->name[sizeof(mip->name) - 1] = 0;
+
+	return mip;
+}
+
+void
+nfp_mip_close(struct nfp_mip *mip)
+{
+	free(mip);
+}
+
+const char *
+nfp_mip_name(const struct nfp_mip *mip)
+{
+	return mip->name;
+}
+
+/*
+ * nfp_mip_symtab() - Get the address and size of the MIP symbol table
+ * @mip:	MIP handle
+ * @addr:	Location for NFP DDR address of MIP symbol table
+ * @size:	Location for size of MIP symbol table
+ */
+void
+nfp_mip_symtab(const struct nfp_mip *mip, uint32_t *addr, uint32_t *size)
+{
+	*addr = rte_le_to_cpu_32(mip->symtab_addr);
+	*size = rte_le_to_cpu_32(mip->symtab_size);
+}
+
+/*
+ * nfp_mip_strtab() - Get the address and size of the MIP symbol name table
+ * @mip:	MIP handle
+ * @addr:	Location for NFP DDR address of MIP symbol name table
+ * @size:	Location for size of MIP symbol name table
+ */
+void
+nfp_mip_strtab(const struct nfp_mip *mip, uint32_t *addr, uint32_t *size)
+{
+	*addr = rte_le_to_cpu_32(mip->strtab_addr);
+	*size = rte_le_to_cpu_32(mip->strtab_size);
+}
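+
+/*
+ * Usage sketch (illustrative only, helper name made up): a caller typically
+ * opens the MIP, reads the symbol table location and closes it again.
+ */
+static inline int
+nfp_mip_usage_sketch(struct nfp_cpp *cpp, uint32_t *symtab_addr,
+		     uint32_t *symtab_size)
+{
+	struct nfp_mip *mip;
+
+	mip = nfp_mip_open(cpp);
+	if (!mip)
+		return -ENODEV;
+
+	printf("Loaded firmware: %s\n", nfp_mip_name(mip));
+	nfp_mip_symtab(mip, symtab_addr, symtab_size);
+	nfp_mip_close(mip);
+
+	return 0;
+}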
diff --git a/drivers/net/nfp/nfpcore/nfp_mip.h b/drivers/net/nfp/nfpcore/nfp_mip.h
new file mode 100644
index 0000000..d0919b5
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_mip.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_MIP_H__
+#define __NFP_MIP_H__
+
+#include "nfp_nffw.h"
+
+struct nfp_mip;
+
+struct nfp_mip *nfp_mip_open(struct nfp_cpp *cpp);
+void nfp_mip_close(struct nfp_mip *mip);
+
+const char *nfp_mip_name(const struct nfp_mip *mip);
+void nfp_mip_symtab(const struct nfp_mip *mip, uint32_t *addr, uint32_t *size);
+void nfp_mip_strtab(const struct nfp_mip *mip, uint32_t *addr, uint32_t *size);
+int nfp_nffw_info_mip_first(struct nfp_nffw_info *state, uint32_t *cpp_id,
+			    uint64_t *off);
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_mutex.c b/drivers/net/nfp/nfpcore/nfp_mutex.c
new file mode 100644
index 0000000..318c580
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_mutex.c
@@ -0,0 +1,424 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <errno.h>
+
+#include <malloc.h>
+#include <time.h>
+#include <sched.h>
+
+#include "nfp_cpp.h"
+#include "nfp6000/nfp6000.h"
+
+#define MUTEX_LOCKED(interface)  ((((uint32_t)(interface)) << 16) | 0x000f)
+#define MUTEX_UNLOCK(interface)  (0                               | 0x0000)
+
+#define MUTEX_IS_LOCKED(value)   (((value) & 0xffff) == 0x000f)
+#define MUTEX_IS_UNLOCKED(value) (((value) & 0xffff) == 0x0000)
+#define MUTEX_INTERFACE(value)   (((value) >> 16) & 0xffff)
+
+/*
+ * If you need more than 65536 recursive locks, please
+ * rethink your code.
+ */
+#define MUTEX_DEPTH_MAX         0xffff
+
+struct nfp_cpp_mutex {
+	struct nfp_cpp *cpp;
+	uint8_t target;
+	uint16_t depth;
+	unsigned long long address;
+	uint32_t key;
+	unsigned int usage;
+	struct nfp_cpp_mutex *prev, *next;
+};
+
+static int
+_nfp_cpp_mutex_validate(uint32_t model, int *target, unsigned long long address)
+{
+	/* Address must be 64-bit aligned */
+	if (address & 7)
+		return NFP_ERRNO(EINVAL);
+
+	if (NFP_CPP_MODEL_IS_6000(model)) {
+		if (*target != NFP_CPP_TARGET_MU)
+			return NFP_ERRNO(EINVAL);
+	} else {
+		return NFP_ERRNO(EINVAL);
+	}
+
+	return 0;
+}
+
+/*
+ * Initialize a mutex location
+ *
+ * The CPP target:address must point to a 64-bit aligned location, and
+ * will initialize 64 bits of data at the location.
+ *
+ * This creates the initial mutex state, as locked by this
+ * nfp_cpp_interface().
+ *
+ * This function should only be called when setting up
+ * the initial lock state upon boot-up of the system.
+ *
+ * @param cpp       NFP CPP handle
+ * @param target    NFP CPP target ID (ie NFP_CPP_TARGET_CLS or
+ *		    NFP_CPP_TARGET_MU)
+ * @param address   Offset into the address space of the NFP CPP target ID
+ * @param key       Unique 32-bit value for this mutex
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int
+nfp_cpp_mutex_init(struct nfp_cpp *cpp, int target, unsigned long long address,
+		   uint32_t key)
+{
+	uint32_t model = nfp_cpp_model(cpp);
+	uint32_t muw = NFP_CPP_ID(target, 4, 0);	/* atomic_write */
+	int err;
+
+	err = _nfp_cpp_mutex_validate(model, &target, address);
+	if (err < 0)
+		return err;
+
+	err = nfp_cpp_writel(cpp, muw, address + 4, key);
+	if (err < 0)
+		return err;
+
+	err = nfp_cpp_writel(cpp, muw, address + 0,
+			     MUTEX_LOCKED(nfp_cpp_interface(cpp)));
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+/*
+ * Create a mutex handle from an address controlled by a MU Atomic engine
+ *
+ * The CPP target:address must point to a 64-bit aligned location, and
+ * reserve 64 bits of data at the location for use by the handle.
+ *
+ * Only target/address pairs that point to entities that support the
+ * MU Atomic Engine are supported.
+ *
+ * @param cpp       NFP CPP handle
+ * @param target    NFP CPP target ID (ie NFP_CPP_TARGET_CLS or
+ *		    NFP_CPP_TARGET_MU)
+ * @param address   Offset into the address space of the NFP CPP target ID
+ * @param key       32-bit unique key (must match the key at this location)
+ *
+ * @return      A non-NULL struct nfp_cpp_mutex * on success, NULL on failure.
+ */
+struct nfp_cpp_mutex *
+nfp_cpp_mutex_alloc(struct nfp_cpp *cpp, int target,
+		     unsigned long long address, uint32_t key)
+{
+	uint32_t model = nfp_cpp_model(cpp);
+	struct nfp_cpp_mutex *mutex;
+	uint32_t mur = NFP_CPP_ID(target, 3, 0);	/* atomic_read */
+	int err;
+	uint32_t tmp;
+
+	/* Look for cached mutex */
+	for (mutex = cpp->mutex_cache; mutex; mutex = mutex->next) {
+		if (mutex->target == target && mutex->address == address)
+			break;
+	}
+
+	if (mutex) {
+		if (mutex->key == key) {
+			mutex->usage++;
+			return mutex;
+		}
+
+		/* If the key doesn't match... */
+		return NFP_ERRPTR(EEXIST);
+	}
+
+	err = _nfp_cpp_mutex_validate(model, &target, address);
+	if (err < 0)
+		return NULL;
+
+	err = nfp_cpp_readl(cpp, mur, address + 4, &tmp);
+	if (err < 0)
+		return NULL;
+
+	if (tmp != key)
+		return NFP_ERRPTR(EEXIST);
+
+	mutex = calloc(1, sizeof(*mutex));
+	if (!mutex)
+		return NFP_ERRPTR(ENOMEM);
+
+	mutex->cpp = cpp;
+	mutex->target = target;
+	mutex->address = address;
+	mutex->key = key;
+	mutex->depth = 0;
+	mutex->usage = 1;
+
+	/* Add mutex to the cache */
+	if (cpp->mutex_cache) {
+		cpp->mutex_cache->prev = mutex;
+		mutex->next = cpp->mutex_cache;
+		cpp->mutex_cache = mutex;
+	} else {
+		cpp->mutex_cache = mutex;
+	}
+
+	return mutex;
+}
+
+struct nfp_cpp *
+nfp_cpp_mutex_cpp(struct nfp_cpp_mutex *mutex)
+{
+	return mutex->cpp;
+}
+
+uint32_t
+nfp_cpp_mutex_key(struct nfp_cpp_mutex *mutex)
+{
+	return mutex->key;
+}
+
+uint16_t
+nfp_cpp_mutex_owner(struct nfp_cpp_mutex *mutex)
+{
+	uint32_t mur = NFP_CPP_ID(mutex->target, 3, 0);	/* atomic_read */
+	uint32_t value, key;
+	int err;
+
+	err = nfp_cpp_readl(mutex->cpp, mur, mutex->address, &value);
+	if (err < 0)
+		return err;
+
+	err = nfp_cpp_readl(mutex->cpp, mur, mutex->address + 4, &key);
+	if (err < 0)
+		return err;
+
+	if (key != mutex->key)
+		return NFP_ERRNO(EPERM);
+
+	if (!MUTEX_IS_LOCKED(value))
+		return 0;
+
+	return MUTEX_INTERFACE(value);
+}
+
+int
+nfp_cpp_mutex_target(struct nfp_cpp_mutex *mutex)
+{
+	return mutex->target;
+}
+
+uint64_t
+nfp_cpp_mutex_address(struct nfp_cpp_mutex *mutex)
+{
+	return mutex->address;
+}
+
+/*
+ * Free a mutex handle - does not alter the lock state
+ *
+ * @param mutex     NFP CPP Mutex handle
+ */
+void
+nfp_cpp_mutex_free(struct nfp_cpp_mutex *mutex)
+{
+	mutex->usage--;
+	if (mutex->usage > 0)
+		return;
+
+	/* Remove mutex from the cache */
+	if (mutex->next)
+		mutex->next->prev = mutex->prev;
+	if (mutex->prev)
+		mutex->prev->next = mutex->next;
+
+	/* If mutex->cpp == NULL, something broke */
+	if (mutex->cpp && mutex == mutex->cpp->mutex_cache)
+		mutex->cpp->mutex_cache = mutex->next;
+
+	free(mutex);
+}
+
+/*
+ * Lock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * @param mutex     NFP CPP Mutex handle
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int
+nfp_cpp_mutex_lock(struct nfp_cpp_mutex *mutex)
+{
+	int err;
+	time_t warn_at = time(NULL) + 15;
+
+	while ((err = nfp_cpp_mutex_trylock(mutex)) != 0) {
+		/* If errno != EBUSY, then the lock was damaged */
+		if (err < 0 && errno != EBUSY)
+			return err;
+		if (time(NULL) >= warn_at) {
+			printf("Warning: waiting for NFP mutex\n");
+			printf("\tusage:%u\n", mutex->usage);
+			printf("\tdepth:%hd]\n", mutex->depth);
+			printf("\ttarget:%d\n", mutex->target);
+			printf("\taddr:%llx\n", mutex->address);
+			printf("\tkey:%08x]\n", mutex->key);
+			warn_at = time(NULL) + 60;
+		}
+		sched_yield();
+	}
+	return 0;
+}
+
+/*
+ * Unlock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * @param mutex     NFP CPP Mutex handle
+ *
+ * @return 0 on success, or -1 on failure (and set errno accordingly).
+ */
+int
+nfp_cpp_mutex_unlock(struct nfp_cpp_mutex *mutex)
+{
+	uint32_t muw = NFP_CPP_ID(mutex->target, 4, 0);	/* atomic_write */
+	uint32_t mur = NFP_CPP_ID(mutex->target, 3, 0);	/* atomic_read */
+	struct nfp_cpp *cpp = mutex->cpp;
+	uint32_t key, value;
+	uint16_t interface = nfp_cpp_interface(cpp);
+	int err;
+
+	if (mutex->depth > 1) {
+		mutex->depth--;
+		return 0;
+	}
+
+	err = nfp_cpp_readl(mutex->cpp, mur, mutex->address, &value);
+	if (err < 0)
+		goto exit;
+
+	err = nfp_cpp_readl(mutex->cpp, mur, mutex->address + 4, &key);
+	if (err < 0)
+		goto exit;
+
+	if (key != mutex->key) {
+		err = NFP_ERRNO(EPERM);
+		goto exit;
+	}
+
+	if (value != MUTEX_LOCKED(interface)) {
+		err = NFP_ERRNO(EACCES);
+		goto exit;
+	}
+
+	err = nfp_cpp_writel(cpp, muw, mutex->address, MUTEX_UNLOCK(interface));
+	if (err < 0)
+		goto exit;
+
+	mutex->depth = 0;
+
+exit:
+	return err;
+}
+
+/*
+ * Attempt to lock a mutex handle, using the NFP MU Atomic Engine
+ *
+ * Valid lock states:
+ *
+ *      0x....0000      - Unlocked
+ *      0x....000f      - Locked
+ *
+ * @param mutex     NFP CPP Mutex handle
+ * @return      0 if the lock succeeded, -1 on failure (and errno set
+ *		appropriately).
+ */
+int
+nfp_cpp_mutex_trylock(struct nfp_cpp_mutex *mutex)
+{
+	uint32_t mur = NFP_CPP_ID(mutex->target, 3, 0);	/* atomic_read */
+	uint32_t muw = NFP_CPP_ID(mutex->target, 4, 0);	/* atomic_write */
+	uint32_t mus = NFP_CPP_ID(mutex->target, 5, 3);	/* test_set_imm */
+	uint32_t key, value, tmp;
+	struct nfp_cpp *cpp = mutex->cpp;
+	int err;
+
+	if (mutex->depth > 0) {
+		if (mutex->depth == MUTEX_DEPTH_MAX)
+			return NFP_ERRNO(E2BIG);
+
+		mutex->depth++;
+		return 0;
+	}
+
+	/* Verify that the lock marker is not damaged */
+	err = nfp_cpp_readl(cpp, mur, mutex->address + 4, &key);
+	if (err < 0)
+		goto exit;
+
+	if (key != mutex->key) {
+		err = NFP_ERRNO(EPERM);
+		goto exit;
+	}
+
+	/*
+	 * Compare against the unlocked state, and if true,
+	 * write the interface id into the top 16 bits, and
+	 * mark as locked.
+	 */
+	value = MUTEX_LOCKED(nfp_cpp_interface(cpp));
+
+	/*
+	 * We use test_set_imm here, as it implies a read
+	 * of the current state, and sets the bits in the
+	 * bytemask of the command to 1s. Since the mutex
+	 * is guaranteed to be 64-bit aligned, the bytemask
+	 * of this 32-bit command is ensured to be 8'b00001111,
+	 * which implies that the lower 4 bits will be set to
+	 * ones regardless of the initial state.
+	 *
+	 * Since this is a 'Readback' operation, with no Pull
+	 * data, we can treat this as a normal Push (read)
+	 * atomic, which returns the original value.
+	 */
+	err = nfp_cpp_readl(cpp, mus, mutex->address, &tmp);
+	if (err < 0)
+		goto exit;
+
+	/* Was it unlocked? */
+	if (MUTEX_IS_UNLOCKED(tmp)) {
+		/*
+		 * The read value can only be 0x....0000 in the unlocked state.
+		 * If there was another contending for this lock, then
+		 * the lock state would be 0x....000f
+		 *
+		 * Write our owner ID into the lock
+		 * While not strictly necessary, this helps with
+		 * debug and bookkeeping.
+		 */
+		err = nfp_cpp_writel(cpp, muw, mutex->address, value);
+		if (err < 0)
+			goto exit;
+
+		mutex->depth = 1;
+		goto exit;
+	}
+
+	/* Already locked by us? Success! */
+	if (tmp == value) {
+		mutex->depth = 1;
+		goto exit;
+	}
+
+	err = NFP_ERRNO(MUTEX_IS_LOCKED(tmp) ? EBUSY : EINVAL);
+
+exit:
+	return err;
+}
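+
+/*
+ * Usage sketch (illustrative only, helper name made up): typical lifetime of
+ * a host-side mutex handle built on the MU Atomic Engine. The target,
+ * address and key are whatever nfp_cpp_mutex_init() was called with when the
+ * lock was first set up.
+ */
+static inline int
+nfp_cpp_mutex_usage_sketch(struct nfp_cpp *cpp, int target,
+			   unsigned long long address, uint32_t key)
+{
+	struct nfp_cpp_mutex *mutex;
+	int err;
+
+	mutex = nfp_cpp_mutex_alloc(cpp, target, address, key);
+	if (!mutex)
+		return -1;
+
+	err = nfp_cpp_mutex_lock(mutex);
+	if (!err) {
+		/* ... critical section ... */
+		err = nfp_cpp_mutex_unlock(mutex);
+	}
+
+	nfp_cpp_mutex_free(mutex);
+	return err;
+}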
diff --git a/drivers/net/nfp/nfpcore/nfp_nffw.c b/drivers/net/nfp/nfpcore/nfp_nffw.c
new file mode 100644
index 0000000..8bec0e3
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nffw.c
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include "nfp_cpp.h"
+#include "nfp_nffw.h"
+#include "nfp_mip.h"
+#include "nfp6000/nfp6000.h"
+#include "nfp_resource.h"
+
+/*
+ * flg_info_version = flags[0]<27:16>
+ * This is a small version counter intended only to detect if the current
+ * implementation can read the current struct. Struct changes should be very
+ * rare and as such a 12-bit counter should cover large spans of time. By the
+ * time it wraps around, we don't expect to have 4096 versions of this struct
+ * to be in use at the same time.
+ */
+static uint32_t
+nffw_res_info_version_get(const struct nfp_nffw_info_data *res)
+{
+	return (res->flags[0] >> 16) & 0xfff;
+}
+
+/* flg_init = flags[0]<0> */
+static uint32_t
+nffw_res_flg_init_get(const struct nfp_nffw_info_data *res)
+{
+	return (res->flags[0] >> 0) & 1;
+}
+
+/* loaded = loaded__mu_da__mip_off_hi<31:31> */
+static uint32_t
+nffw_fwinfo_loaded_get(const struct nffw_fwinfo *fi)
+{
+	return (fi->loaded__mu_da__mip_off_hi >> 31) & 1;
+}
+
+/* mip_cppid = mip_cppid */
+static uint32_t
+nffw_fwinfo_mip_cppid_get(const struct nffw_fwinfo *fi)
+{
+	return fi->mip_cppid;
+}
+
+/* mip_mu_da = loaded__mu_da__mip_off_hi<8:8> */
+static uint32_t
+nffw_fwinfo_mip_mu_da_get(const struct nffw_fwinfo *fi)
+{
+	return (fi->loaded__mu_da__mip_off_hi >> 8) & 1;
+}
+
+/* mip_offset = (loaded__mu_da__mip_off_hi<7:0> << 32) | mip_offset_lo */
+static uint64_t
+nffw_fwinfo_mip_offset_get(const struct nffw_fwinfo *fi)
+{
+	uint64_t mip_off_hi = fi->loaded__mu_da__mip_off_hi;
+
+	return (mip_off_hi & 0xFF) << 32 | fi->mip_offset_lo;
+}
+
+#define NFP_IMB_TGTADDRESSMODECFG_MODE_of(_x)		(((_x) >> 13) & 0x7)
+#define NFP_IMB_TGTADDRESSMODECFG_ADDRMODE		BIT(12)
+#define   NFP_IMB_TGTADDRESSMODECFG_ADDRMODE_32_BIT	0
+#define   NFP_IMB_TGTADDRESSMODECFG_ADDRMODE_40_BIT	BIT(12)
+
+static int
+nfp_mip_mu_locality_lsb(struct nfp_cpp *cpp)
+{
+	unsigned int mode, addr40;
+	uint32_t xpbaddr, imbcppat;
+	int err;
+
+	/* Hardcoded XPB IMB Base, island 0 */
+	xpbaddr = 0x000a0000 + NFP_CPP_TARGET_MU * 4;
+	err = nfp_xpb_readl(cpp, xpbaddr, &imbcppat);
+	if (err < 0)
+		return err;
+
+	mode = NFP_IMB_TGTADDRESSMODECFG_MODE_of(imbcppat);
+	addr40 = !!(imbcppat & NFP_IMB_TGTADDRESSMODECFG_ADDRMODE);
+
+	return nfp_cppat_mu_locality_lsb(mode, addr40);
+}
+
+static unsigned int
+nffw_res_fwinfos(struct nfp_nffw_info_data *fwinf, struct nffw_fwinfo **arr)
+{
+	/*
+	 * For this code, version 0 is most likely to actually be version 1.
+	 * Since the kernel driver does not take responsibility for
+	 * initialising the nfp.nffw resource, any previous code (CA firmware
+	 * or userspace) that left the version at 0 but did set the init flag
+	 * is effectively version 1.
+	 */
+	switch (nffw_res_info_version_get(fwinf)) {
+	case 0:
+	case 1:
+		*arr = &fwinf->info.v1.fwinfo[0];
+		return NFFW_FWINFO_CNT_V1;
+	case 2:
+		*arr = &fwinf->info.v2.fwinfo[0];
+		return NFFW_FWINFO_CNT_V2;
+	default:
+		*arr = NULL;
+		return 0;
+	}
+}
+
+/*
+ * nfp_nffw_info_open() - Acquire the lock on the NFFW table
+ * @cpp:	NFP CPP handle
+ *
+ * Return: acquired NFFW info handle, or NULL on failure
+ */
+struct nfp_nffw_info *
+nfp_nffw_info_open(struct nfp_cpp *cpp)
+{
+	struct nfp_nffw_info_data *fwinf;
+	struct nfp_nffw_info *state;
+	uint32_t info_ver;
+	int err;
+
+	state = malloc(sizeof(*state));
+	if (!state)
+		return NULL;
+
+	memset(state, 0, sizeof(*state));
+
+	state->res = nfp_resource_acquire(cpp, NFP_RESOURCE_NFP_NFFW);
+	if (!state->res)
+		goto err_free;
+
+	fwinf = &state->fwinf;
+
+	if (sizeof(*fwinf) > nfp_resource_size(state->res))
+		goto err_release;
+
+	err = nfp_cpp_read(cpp, nfp_resource_cpp_id(state->res),
+			   nfp_resource_address(state->res),
+			   fwinf, sizeof(*fwinf));
+	if (err < (int)sizeof(*fwinf))
+		goto err_release;
+
+	if (!nffw_res_flg_init_get(fwinf))
+		goto err_release;
+
+	info_ver = nffw_res_info_version_get(fwinf);
+	if (info_ver > NFFW_INFO_VERSION_CURRENT)
+		goto err_release;
+
+	state->cpp = cpp;
+	return state;
+
+err_release:
+	nfp_resource_release(state->res);
+err_free:
+	free(state);
+	return NULL;
+}
+
+/*
+ * nfp_nffw_info_close() - Release the lock on the NFFW table and free state
+ * @state:	NFP FW info state
+ */
+void
+nfp_nffw_info_close(struct nfp_nffw_info *state)
+{
+	nfp_resource_release(state->res);
+	free(state);
+}
+
+/*
+ * nfp_nffw_info_fwid_first() - Return the first firmware ID in the NFFW
+ * @state:	NFP FW info state
+ *
+ * Return: First NFFW firmware info, NULL on failure
+ */
+static struct nffw_fwinfo *
+nfp_nffw_info_fwid_first(struct nfp_nffw_info *state)
+{
+	struct nffw_fwinfo *fwinfo;
+	unsigned int cnt, i;
+
+	cnt = nffw_res_fwinfos(&state->fwinf, &fwinfo);
+	if (!cnt)
+		return NULL;
+
+	for (i = 0; i < cnt; i++)
+		if (nffw_fwinfo_loaded_get(&fwinfo[i]))
+			return &fwinfo[i];
+
+	return NULL;
+}
+
+/*
+ * nfp_nffw_info_mip_first() - Retrieve the location of the first FW's MIP
+ * @state:	NFP FW info state
+ * @cpp_id:	Pointer to the CPP ID of the MIP
+ * @off:	Pointer to the CPP Address of the MIP
+ *
+ * Return: 0, or -ERRNO
+ */
+int
+nfp_nffw_info_mip_first(struct nfp_nffw_info *state, uint32_t *cpp_id,
+			uint64_t *off)
+{
+	struct nffw_fwinfo *fwinfo;
+
+	fwinfo = nfp_nffw_info_fwid_first(state);
+	if (!fwinfo)
+		return -EINVAL;
+
+	*cpp_id = nffw_fwinfo_mip_cppid_get(fwinfo);
+	*off = nffw_fwinfo_mip_offset_get(fwinfo);
+
+	if (nffw_fwinfo_mip_mu_da_get(fwinfo)) {
+		int locality_off;
+
+		if (NFP_CPP_ID_TARGET_of(*cpp_id) != NFP_CPP_TARGET_MU)
+			return 0;
+
+		locality_off = nfp_mip_mu_locality_lsb(state->cpp);
+		if (locality_off < 0)
+			return locality_off;
+
+		*off &= ~(NFP_MU_ADDR_ACCESS_TYPE_MASK << locality_off);
+		*off |= NFP_MU_ADDR_ACCESS_TYPE_DIRECT << locality_off;
+	}
+
+	return 0;
+}
diff --git a/drivers/net/nfp/nfpcore/nfp_nffw.h b/drivers/net/nfp/nfpcore/nfp_nffw.h
new file mode 100644
index 0000000..3bbdf1c
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nffw.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_NFFW_H__
+#define __NFP_NFFW_H__
+
+#include "nfp-common/nfp_platform.h"
+#include "nfp_cpp.h"
+
+/*
+ * Init-CSR owner IDs for firmware map to firmware IDs which start at 4.
+ * Lower IDs are reserved for target and loader IDs.
+ */
+#define NFFW_FWID_EXT   3	/* For active MEs that we didn't load. */
+#define NFFW_FWID_BASE  4
+
+#define NFFW_FWID_ALL   255
+
+/**
+ * NFFW_INFO_VERSION history:
+ * 0: This was never actually used (before versioning), but it refers to
+ *    the previous struct which had FWINFO_CNT = MEINFO_CNT = 120 that later
+ *    changed to 200.
+ * 1: First versioned struct, with
+ *     FWINFO_CNT = 120
+ *     MEINFO_CNT = 120
+ * 2:  FWINFO_CNT = 200
+ *     MEINFO_CNT = 200
+ */
+#define NFFW_INFO_VERSION_CURRENT 2
+
+/* Enough for all current chip families */
+#define NFFW_MEINFO_CNT_V1 120
+#define NFFW_FWINFO_CNT_V1 120
+#define NFFW_MEINFO_CNT_V2 200
+#define NFFW_FWINFO_CNT_V2 200
+
+struct nffw_meinfo {
+	uint32_t ctxmask__fwid__meid;
+};
+
+struct nffw_fwinfo {
+	uint32_t loaded__mu_da__mip_off_hi;
+	uint32_t mip_cppid; /* 0 means no MIP */
+	uint32_t mip_offset_lo;
+};
+
+struct nfp_nffw_info_v1 {
+	struct nffw_meinfo meinfo[NFFW_MEINFO_CNT_V1];
+	struct nffw_fwinfo fwinfo[NFFW_FWINFO_CNT_V1];
+};
+
+struct nfp_nffw_info_v2 {
+	struct nffw_meinfo meinfo[NFFW_MEINFO_CNT_V2];
+	struct nffw_fwinfo fwinfo[NFFW_FWINFO_CNT_V2];
+};
+
+struct nfp_nffw_info_data {
+	uint32_t flags[2];
+	union {
+		struct nfp_nffw_info_v1 v1;
+		struct nfp_nffw_info_v2 v2;
+	} info;
+};
+
+struct nfp_nffw_info {
+	struct nfp_cpp *cpp;
+	struct nfp_resource *res;
+
+	struct nfp_nffw_info_data fwinf;
+};
+
+struct nfp_nffw_info *nfp_nffw_info_open(struct nfp_cpp *cpp);
+void nfp_nffw_info_close(struct nfp_nffw_info *state);
+
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_nsp.c b/drivers/net/nfp/nfpcore/nfp_nsp.c
new file mode 100644
index 0000000..876a401
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nsp.c
@@ -0,0 +1,427 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#define NFP_SUBSYS "nfp_nsp"
+
+#include <stdio.h>
+#include <time.h>
+
+#include <rte_common.h>
+
+#include "nfp_cpp.h"
+#include "nfp_nsp.h"
+#include "nfp_resource.h"
+
+int
+nfp_nsp_config_modified(struct nfp_nsp *state)
+{
+	return state->modified;
+}
+
+void
+nfp_nsp_config_set_modified(struct nfp_nsp *state, int modified)
+{
+	state->modified = modified;
+}
+
+void *
+nfp_nsp_config_entries(struct nfp_nsp *state)
+{
+	return state->entries;
+}
+
+unsigned int
+nfp_nsp_config_idx(struct nfp_nsp *state)
+{
+	return state->idx;
+}
+
+void
+nfp_nsp_config_set_state(struct nfp_nsp *state, void *entries, unsigned int idx)
+{
+	state->entries = entries;
+	state->idx = idx;
+}
+
+void
+nfp_nsp_config_clear_state(struct nfp_nsp *state)
+{
+	state->entries = NULL;
+	state->idx = 0;
+}
+
+static void
+nfp_nsp_print_extended_error(uint32_t ret_val)
+{
+	int i;
+
+	if (!ret_val)
+		return;
+
+	for (i = 0; i < (int)ARRAY_SIZE(nsp_errors); i++)
+		if (ret_val == (uint32_t)nsp_errors[i].code)
+			printf("err msg: %s\n", nsp_errors[i].msg);
+}
+
+static int
+nfp_nsp_check(struct nfp_nsp *state)
+{
+	struct nfp_cpp *cpp = state->cpp;
+	uint64_t nsp_status, reg;
+	uint32_t nsp_cpp;
+	int err;
+
+	nsp_cpp = nfp_resource_cpp_id(state->res);
+	nsp_status = nfp_resource_address(state->res) + NSP_STATUS;
+
+	err = nfp_cpp_readq(cpp, nsp_cpp, nsp_status, &reg);
+	if (err < 0)
+		return err;
+
+	if (FIELD_GET(NSP_STATUS_MAGIC, reg) != NSP_MAGIC) {
+		printf("Cannot detect NFP Service Processor\n");
+		return -ENODEV;
+	}
+
+	state->ver.major = FIELD_GET(NSP_STATUS_MAJOR, reg);
+	state->ver.minor = FIELD_GET(NSP_STATUS_MINOR, reg);
+
+	if (state->ver.major != NSP_MAJOR || state->ver.minor < NSP_MINOR) {
+		printf("Unsupported ABI %hu.%hu\n", state->ver.major,
+						    state->ver.minor);
+		return -EINVAL;
+	}
+
+	if (reg & NSP_STATUS_BUSY) {
+		printf("Service processor busy!\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/*
+ * nfp_nsp_open() - Prepare for communication and lock the NSP resource.
+ * @cpp:	NFP CPP Handle
+ */
+struct nfp_nsp *
+nfp_nsp_open(struct nfp_cpp *cpp)
+{
+	struct nfp_resource *res;
+	struct nfp_nsp *state;
+	int err;
+
+	res = nfp_resource_acquire(cpp, NFP_RESOURCE_NSP);
+	if (!res)
+		return NULL;
+
+	state = malloc(sizeof(*state));
+	if (!state) {
+		nfp_resource_release(res);
+		return NULL;
+	}
+	memset(state, 0, sizeof(*state));
+	state->cpp = cpp;
+	state->res = res;
+
+	err = nfp_nsp_check(state);
+	if (err) {
+		nfp_nsp_close(state);
+		return NULL;
+	}
+
+	return state;
+}
+
+/*
+ * nfp_nsp_close() - Clean up and unlock the NSP resource.
+ * @state:	NFP SP state
+ */
+void
+nfp_nsp_close(struct nfp_nsp *state)
+{
+	nfp_resource_release(state->res);
+	free(state);
+}
+
+uint16_t
+nfp_nsp_get_abi_ver_major(struct nfp_nsp *state)
+{
+	return state->ver.major;
+}
+
+uint16_t
+nfp_nsp_get_abi_ver_minor(struct nfp_nsp *state)
+{
+	return state->ver.minor;
+}
+
+static int
+nfp_nsp_wait_reg(struct nfp_cpp *cpp, uint64_t *reg, uint32_t nsp_cpp,
+		 uint64_t addr, uint64_t mask, uint64_t val)
+{
+	struct timespec wait;
+	int count;
+	int err;
+
+	wait.tv_sec = 0;
+	wait.tv_nsec = 25000000;
+	count = 0;
+
+	for (;;) {
+		err = nfp_cpp_readq(cpp, nsp_cpp, addr, reg);
+		if (err < 0)
+			return err;
+
+		if ((*reg & mask) == val)
+			return 0;
+
+		nanosleep(&wait, 0);
+		if (count++ > 1000)
+			return -ETIMEDOUT;
+	}
+}
+
+/*
+ * nfp_nsp_command() - Execute a command on the NFP Service Processor
+ * @state:	NFP SP state
+ * @code:	NFP SP Command Code
+ * @option:	NFP SP Command Argument
+ * @buff_cpp:	NFP SP Buffer CPP Address info
+ * @buff_addr:	NFP SP Buffer Host address
+ *
+ * Return: 0 for success with no result
+ *
+ *	 positive value for NSP completion with a result code
+ *
+ *	-EAGAIN if the NSP is not yet present
+ *	-ENODEV if the NSP is not a supported model
+ *	-EBUSY if the NSP is stuck
+ *	-EINTR if interrupted while waiting for completion
+ *	-ETIMEDOUT if the NSP took longer than 30 seconds to complete
+ */
+static int
+nfp_nsp_command(struct nfp_nsp *state, uint16_t code, uint32_t option,
+		uint32_t buff_cpp, uint64_t buff_addr)
+{
+	uint64_t reg, ret_val, nsp_base, nsp_buffer, nsp_status, nsp_command;
+	struct nfp_cpp *cpp = state->cpp;
+	uint32_t nsp_cpp;
+	int err;
+
+	nsp_cpp = nfp_resource_cpp_id(state->res);
+	nsp_base = nfp_resource_address(state->res);
+	nsp_status = nsp_base + NSP_STATUS;
+	nsp_command = nsp_base + NSP_COMMAND;
+	nsp_buffer = nsp_base + NSP_BUFFER;
+
+	err = nfp_nsp_check(state);
+	if (err)
+		return err;
+
+	if (!FIELD_FIT(NSP_BUFFER_CPP, buff_cpp >> 8) ||
+	    !FIELD_FIT(NSP_BUFFER_ADDRESS, buff_addr)) {
+		printf("Host buffer out of reach %08x %" PRIx64 "\n",
+			buff_cpp, buff_addr);
+		return -EINVAL;
+	}
+
+	err = nfp_cpp_writeq(cpp, nsp_cpp, nsp_buffer,
+			     FIELD_PREP(NSP_BUFFER_CPP, buff_cpp >> 8) |
+			     FIELD_PREP(NSP_BUFFER_ADDRESS, buff_addr));
+	if (err < 0)
+		return err;
+
+	err = nfp_cpp_writeq(cpp, nsp_cpp, nsp_command,
+			     FIELD_PREP(NSP_COMMAND_OPTION, option) |
+			     FIELD_PREP(NSP_COMMAND_CODE, code) |
+			     FIELD_PREP(NSP_COMMAND_START, 1));
+	if (err < 0)
+		return err;
+
+	/* Wait for NSP_COMMAND_START to go to 0 */
+	err = nfp_nsp_wait_reg(cpp, &reg, nsp_cpp, nsp_command,
+			       NSP_COMMAND_START, 0);
+	if (err) {
+		printf("Error %d waiting for code 0x%04x to start\n",
+			err, code);
+		return err;
+	}
+
+	/* Wait for NSP_STATUS_BUSY to go to 0 */
+	err = nfp_nsp_wait_reg(cpp, &reg, nsp_cpp, nsp_status, NSP_STATUS_BUSY,
+			       0);
+	if (err) {
+		printf("Error %d waiting for code 0x%04x to complete\n",
+			err, code);
+		return err;
+	}
+
+	err = nfp_cpp_readq(cpp, nsp_cpp, nsp_command, &ret_val);
+	if (err < 0)
+		return err;
+	ret_val = FIELD_GET(NSP_COMMAND_OPTION, ret_val);
+
+	err = FIELD_GET(NSP_STATUS_RESULT, reg);
+	if (err) {
+		printf("Result (error) code set: %d (%d) command: %d\n",
+			 -err, (int)ret_val, code);
+		nfp_nsp_print_extended_error(ret_val);
+		return -err;
+	}
+
+	return ret_val;
+}
+
+#define SZ_1M 0x00100000
+
+static int
+nfp_nsp_command_buf(struct nfp_nsp *nsp, uint16_t code, uint32_t option,
+		    const void *in_buf, unsigned int in_size, void *out_buf,
+		    unsigned int out_size)
+{
+	struct nfp_cpp *cpp = nsp->cpp;
+	unsigned int max_size;
+	uint64_t reg, cpp_buf;
+	int ret, err;
+	uint32_t cpp_id;
+
+	if (nsp->ver.minor < 13) {
+		printf("NSP: Code 0x%04x with buffer not supported\n", code);
+		printf("\t(ABI %hu.%hu)\n", nsp->ver.major, nsp->ver.minor);
+		return -EOPNOTSUPP;
+	}
+
+	err = nfp_cpp_readq(cpp, nfp_resource_cpp_id(nsp->res),
+			    nfp_resource_address(nsp->res) +
+			    NSP_DFLT_BUFFER_CONFIG,
+			    &reg);
+	if (err < 0)
+		return err;
+
+	max_size = RTE_MAX(in_size, out_size);
+	if (FIELD_GET(NSP_DFLT_BUFFER_SIZE_MB, reg) * SZ_1M < max_size) {
+		printf("NSP: default buffer too small for command 0x%04x\n",
+		       code);
+		printf("\t(%llu < %u)\n",
+		       FIELD_GET(NSP_DFLT_BUFFER_SIZE_MB, reg) * SZ_1M,
+		       max_size);
+		return -EINVAL;
+	}
+
+	err = nfp_cpp_readq(cpp, nfp_resource_cpp_id(nsp->res),
+			    nfp_resource_address(nsp->res) +
+			    NSP_DFLT_BUFFER,
+			    &reg);
+	if (err < 0)
+		return err;
+
+	cpp_id = FIELD_GET(NSP_BUFFER_CPP, reg) << 8;
+	cpp_buf = FIELD_GET(NSP_BUFFER_ADDRESS, reg);
+
+	if (in_buf && in_size) {
+		err = nfp_cpp_write(cpp, cpp_id, cpp_buf, in_buf, in_size);
+		if (err < 0)
+			return err;
+	}
+	/* Zero out remaining part of the buffer */
+	if (out_buf && out_size && out_size > in_size) {
+		memset(out_buf, 0, out_size - in_size);
+		err = nfp_cpp_write(cpp, cpp_id, cpp_buf + in_size, out_buf,
+				    out_size - in_size);
+		if (err < 0)
+			return err;
+	}
+
+	ret = nfp_nsp_command(nsp, code, option, cpp_id, cpp_buf);
+	if (ret < 0)
+		return ret;
+
+	if (out_buf && out_size) {
+		err = nfp_cpp_read(cpp, cpp_id, cpp_buf, out_buf, out_size);
+		if (err < 0)
+			return err;
+	}
+
+	return ret;
+}
+
+int
+nfp_nsp_wait(struct nfp_nsp *state)
+{
+	struct timespec wait;
+	int count;
+	int err;
+
+	wait.tv_sec = 0;
+	wait.tv_nsec = 25000000;
+	count = 0;
+
+	for (;;) {
+		err = nfp_nsp_command(state, SPCODE_NOOP, 0, 0, 0);
+		if (err != -EAGAIN)
+			break;
+
+		nanosleep(&wait, 0);
+
+		if (count++ > 1000) {
+			err = -ETIMEDOUT;
+			break;
+		}
+	}
+	if (err)
+		printf("NSP failed to respond %d\n", err);
+
+	return err;
+}
+
+int
+nfp_nsp_device_soft_reset(struct nfp_nsp *state)
+{
+	return nfp_nsp_command(state, SPCODE_SOFT_RESET, 0, 0, 0);
+}
+
+int
+nfp_nsp_mac_reinit(struct nfp_nsp *state)
+{
+	return nfp_nsp_command(state, SPCODE_MAC_INIT, 0, 0, 0);
+}
+
+int
+nfp_nsp_load_fw(struct nfp_nsp *state, void *buf, unsigned int size)
+{
+	return nfp_nsp_command_buf(state, SPCODE_FW_LOAD, size, buf, size,
+				   NULL, 0);
+}
+
+int
+nfp_nsp_read_eth_table(struct nfp_nsp *state, void *buf, unsigned int size)
+{
+	return nfp_nsp_command_buf(state, SPCODE_ETH_RESCAN, size, NULL, 0,
+				   buf, size);
+}
+
+int
+nfp_nsp_write_eth_table(struct nfp_nsp *state, const void *buf,
+			unsigned int size)
+{
+	return nfp_nsp_command_buf(state, SPCODE_ETH_CONTROL, size, buf, size,
+				   NULL, 0);
+}
+
+int
+nfp_nsp_read_identify(struct nfp_nsp *state, void *buf, unsigned int size)
+{
+	return nfp_nsp_command_buf(state, SPCODE_NSP_IDENTIFY, size, NULL, 0,
+				   buf, size);
+}
+
+int
+nfp_nsp_read_sensors(struct nfp_nsp *state, unsigned int sensor_mask, void *buf,
+		     unsigned int size)
+{
+	return nfp_nsp_command_buf(state, SPCODE_NSP_SENSORS, sensor_mask, NULL,
+				   0, buf, size);
+}
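+
+/*
+ * Usage sketch (illustrative only, helper name made up): issuing a simple
+ * NSP command. Callers open the NSP resource, optionally wait for it to
+ * become responsive, run one or more commands and release it again.
+ */
+static inline int
+nfp_nsp_soft_reset_sketch(struct nfp_cpp *cpp)
+{
+	struct nfp_nsp *nsp;
+	int err;
+
+	nsp = nfp_nsp_open(cpp);
+	if (!nsp)
+		return -EIO;
+
+	err = nfp_nsp_wait(nsp);
+	if (!err)
+		err = nfp_nsp_device_soft_reset(nsp);
+
+	nfp_nsp_close(nsp);
+	return err;
+}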
diff --git a/drivers/net/nfp/nfpcore/nfp_nsp.h b/drivers/net/nfp/nfpcore/nfp_nsp.h
new file mode 100644
index 0000000..c9c7b0d
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nsp.h
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef NSP_NSP_H
+#define NSP_NSP_H 1
+
+#include "nfp_cpp.h"
+#include "nfp_nsp.h"
+
+#define GENMASK_ULL(h, l) \
+	(((~0ULL) - (1ULL << (l)) + 1) & \
+	 (~0ULL >> (64 - 1 - (h))))
+
+#define __bf_shf(x) (__builtin_ffsll(x) - 1)
+
+#define FIELD_GET(_mask, _reg)	\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		(typeof(_x))(((_reg) & (_x)) >> __bf_shf(_x));	\
+	}))
+
+#define FIELD_FIT(_mask, _val)						\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		!((((typeof(_x))_val) << __bf_shf(_x)) & ~(_x)); \
+	}))
+
+#define FIELD_PREP(_mask, _val)						\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		((typeof(_x))(_val) << __bf_shf(_x)) & (_x);	\
+	}))
+
+/* Offsets relative to the CSR base */
+#define NSP_STATUS		0x00
+#define   NSP_STATUS_MAGIC	GENMASK_ULL(63, 48)
+#define   NSP_STATUS_MAJOR	GENMASK_ULL(47, 44)
+#define   NSP_STATUS_MINOR	GENMASK_ULL(43, 32)
+#define   NSP_STATUS_CODE	GENMASK_ULL(31, 16)
+#define   NSP_STATUS_RESULT	GENMASK_ULL(15, 8)
+#define   NSP_STATUS_BUSY	BIT_ULL(0)
+
+#define NSP_COMMAND		0x08
+#define   NSP_COMMAND_OPTION	GENMASK_ULL(63, 32)
+#define   NSP_COMMAND_CODE	GENMASK_ULL(31, 16)
+#define   NSP_COMMAND_START	BIT_ULL(0)
+
+/* CPP address to retrieve the data from */
+#define NSP_BUFFER		0x10
+#define   NSP_BUFFER_CPP	GENMASK_ULL(63, 40)
+#define   NSP_BUFFER_PCIE	GENMASK_ULL(39, 38)
+#define   NSP_BUFFER_ADDRESS	GENMASK_ULL(37, 0)
+
+#define NSP_DFLT_BUFFER		0x18
+
+#define NSP_DFLT_BUFFER_CONFIG	0x20
+#define   NSP_DFLT_BUFFER_SIZE_MB	GENMASK_ULL(7, 0)
+
+#define NSP_MAGIC		0xab10
+#define NSP_MAJOR		0
+#define NSP_MINOR		8
+
+#define NSP_CODE_MAJOR		GENMASK(15, 12)
+#define NSP_CODE_MINOR		GENMASK(11, 0)
+
+enum nfp_nsp_cmd {
+	SPCODE_NOOP		= 0, /* No operation */
+	SPCODE_SOFT_RESET	= 1, /* Soft reset the NFP */
+	SPCODE_FW_DEFAULT	= 2, /* Load default (UNDI) FW */
+	SPCODE_PHY_INIT		= 3, /* Initialize the PHY */
+	SPCODE_MAC_INIT		= 4, /* Initialize the MAC */
+	SPCODE_PHY_RXADAPT	= 5, /* Re-run PHY RX Adaptation */
+	SPCODE_FW_LOAD		= 6, /* Load fw from buffer, len in option */
+	SPCODE_ETH_RESCAN	= 7, /* Rescan ETHs, write ETH_TABLE to buf */
+	SPCODE_ETH_CONTROL	= 8, /* Update media config from buffer */
+	SPCODE_NSP_SENSORS	= 12, /* Read NSP sensor(s) */
+	SPCODE_NSP_IDENTIFY	= 13, /* Read NSP version */
+};
+
+static const struct {
+	int code;
+	const char *msg;
+} nsp_errors[] = {
+	{ 6010, "could not map to phy for port" },
+	{ 6011, "not an allowed rate/lanes for port" },
+	{ 6012, "not an allowed rate/lanes for port" },
+	{ 6013, "high/low error, change other port first" },
+	{ 6014, "config not found in flash" },
+};
+
+struct nfp_nsp {
+	struct nfp_cpp *cpp;
+	struct nfp_resource *res;
+	struct {
+		uint16_t major;
+		uint16_t minor;
+	} ver;
+
+	/* Eth table config state */
+	int modified;
+	unsigned int idx;
+	void *entries;
+};
+
+struct nfp_nsp *nfp_nsp_open(struct nfp_cpp *cpp);
+void nfp_nsp_close(struct nfp_nsp *state);
+uint16_t nfp_nsp_get_abi_ver_major(struct nfp_nsp *state);
+uint16_t nfp_nsp_get_abi_ver_minor(struct nfp_nsp *state);
+int nfp_nsp_wait(struct nfp_nsp *state);
+int nfp_nsp_device_soft_reset(struct nfp_nsp *state);
+int nfp_nsp_load_fw(struct nfp_nsp *state, void *buf, unsigned int size);
+int nfp_nsp_mac_reinit(struct nfp_nsp *state);
+int nfp_nsp_read_identify(struct nfp_nsp *state, void *buf, unsigned int size);
+int nfp_nsp_read_sensors(struct nfp_nsp *state, unsigned int sensor_mask,
+			 void *buf, unsigned int size);
+
+static inline int nfp_nsp_has_mac_reinit(struct nfp_nsp *state)
+{
+	return nfp_nsp_get_abi_ver_minor(state) > 20;
+}
+
+enum nfp_eth_interface {
+	NFP_INTERFACE_NONE	= 0,
+	NFP_INTERFACE_SFP	= 1,
+	NFP_INTERFACE_SFPP	= 10,
+	NFP_INTERFACE_SFP28	= 28,
+	NFP_INTERFACE_QSFP	= 40,
+	NFP_INTERFACE_CXP	= 100,
+	NFP_INTERFACE_QSFP28	= 112,
+};
+
+enum nfp_eth_media {
+	NFP_MEDIA_DAC_PASSIVE = 0,
+	NFP_MEDIA_DAC_ACTIVE,
+	NFP_MEDIA_FIBRE,
+};
+
+enum nfp_eth_aneg {
+	NFP_ANEG_AUTO = 0,
+	NFP_ANEG_SEARCH,
+	NFP_ANEG_25G_CONSORTIUM,
+	NFP_ANEG_25G_IEEE,
+	NFP_ANEG_DISABLED,
+};
+
+enum nfp_eth_fec {
+	NFP_FEC_AUTO_BIT = 0,
+	NFP_FEC_BASER_BIT,
+	NFP_FEC_REED_SOLOMON_BIT,
+	NFP_FEC_DISABLED_BIT,
+};
+
+#define NFP_FEC_AUTO		BIT(NFP_FEC_AUTO_BIT)
+#define NFP_FEC_BASER		BIT(NFP_FEC_BASER_BIT)
+#define NFP_FEC_REED_SOLOMON	BIT(NFP_FEC_REED_SOLOMON_BIT)
+#define NFP_FEC_DISABLED	BIT(NFP_FEC_DISABLED_BIT)
+
+#define ETH_ALEN	6
+
+/**
+ * struct nfp_eth_table - ETH table information
+ * @count:	number of table entries
+ * @max_index:	max of @index fields of all @ports
+ * @ports:	table of ports
+ *
+ * @eth_index:	port index according to legacy ethX numbering
+ * @index:	chip-wide first channel index
+ * @nbi:	NBI index
+ * @base:	first channel index (within NBI)
+ * @lanes:	number of channels
+ * @speed:	interface speed (in Mbps)
+ * @interface:	interface (module) plugged in
+ * @media:	media type of the @interface
+ * @fec:	forward error correction mode
+ * @aneg:	auto negotiation mode
+ * @mac_addr:	interface MAC address
+ * @label_port:	port id
+ * @label_subport:  id of interface within port (for split ports)
+ * @enabled:	is enabled?
+ * @tx_enabled:	is TX enabled?
+ * @rx_enabled:	is RX enabled?
+ * @override_changed: is media reconfig pending?
+ *
+ * @port_type:	one of %PORT_* defines for ethtool
+ * @port_lanes:	total number of lanes on the port (sum of lanes of all subports)
+ * @is_split:	is interface part of a split port
+ * @fec_modes_supported:	bitmap of FEC modes supported
+ */
+struct nfp_eth_table {
+	unsigned int count;
+	unsigned int max_index;
+	struct nfp_eth_table_port {
+		unsigned int eth_index;
+		unsigned int index;
+		unsigned int nbi;
+		unsigned int base;
+		unsigned int lanes;
+		unsigned int speed;
+
+		unsigned int interface;
+		enum nfp_eth_media media;
+
+		enum nfp_eth_fec fec;
+		enum nfp_eth_aneg aneg;
+
+		uint8_t mac_addr[ETH_ALEN];
+
+		uint8_t label_port;
+		uint8_t label_subport;
+
+		int enabled;
+		int tx_enabled;
+		int rx_enabled;
+
+		int override_changed;
+
+		/* Computed fields */
+		uint8_t port_type;
+
+		unsigned int port_lanes;
+
+		int is_split;
+
+		unsigned int fec_modes_supported;
+	} ports[0];
+};
+
+struct nfp_eth_table *nfp_eth_read_ports(struct nfp_cpp *cpp);
+
+int nfp_eth_set_mod_enable(struct nfp_cpp *cpp, unsigned int idx, int enable);
+int nfp_eth_set_configured(struct nfp_cpp *cpp, unsigned int idx,
+			   int configed);
+int
+nfp_eth_set_fec(struct nfp_cpp *cpp, unsigned int idx, enum nfp_eth_fec mode);
+
+int nfp_nsp_read_eth_table(struct nfp_nsp *state, void *buf, unsigned int size);
+int nfp_nsp_write_eth_table(struct nfp_nsp *state, const void *buf,
+			    unsigned int size);
+void nfp_nsp_config_set_state(struct nfp_nsp *state, void *entries,
+			      unsigned int idx);
+void nfp_nsp_config_clear_state(struct nfp_nsp *state);
+void nfp_nsp_config_set_modified(struct nfp_nsp *state, int modified);
+void *nfp_nsp_config_entries(struct nfp_nsp *state);
+int nfp_nsp_config_modified(struct nfp_nsp *state);
+unsigned int nfp_nsp_config_idx(struct nfp_nsp *state);
+
+static inline int nfp_eth_can_support_fec(struct nfp_eth_table_port *eth_port)
+{
+	return !!eth_port->fec_modes_supported;
+}
+
+static inline unsigned int
+nfp_eth_supported_fec_modes(struct nfp_eth_table_port *eth_port)
+{
+	return eth_port->fec_modes_supported;
+}
+
+struct nfp_nsp *nfp_eth_config_start(struct nfp_cpp *cpp, unsigned int idx);
+int nfp_eth_config_commit_end(struct nfp_nsp *nsp);
+void nfp_eth_config_cleanup_end(struct nfp_nsp *nsp);
+
+int __nfp_eth_set_aneg(struct nfp_nsp *nsp, enum nfp_eth_aneg mode);
+int __nfp_eth_set_speed(struct nfp_nsp *nsp, unsigned int speed);
+int __nfp_eth_set_split(struct nfp_nsp *nsp, unsigned int lanes);
+
+/**
+ * struct nfp_nsp_identify - NSP static information
+ * @version:      opaque version string
+ * @flags:        version flags
+ * @br_primary:   branch id of primary bootloader
+ * @br_secondary: branch id of secondary bootloader
+ * @br_nsp:       branch id of NSP
+ * @primary:      version of primary bootloader
+ * @secondary:    version id of secondary bootloader
+ * @nsp:          version id of NSP
+ * @sensor_mask:  mask of present sensors available on NIC
+ */
+struct nfp_nsp_identify {
+	char version[40];
+	uint8_t flags;
+	uint8_t br_primary;
+	uint8_t br_secondary;
+	uint8_t br_nsp;
+	uint16_t primary;
+	uint16_t secondary;
+	uint16_t nsp;
+	uint64_t sensor_mask;
+};
+
+struct nfp_nsp_identify *__nfp_nsp_identify(struct nfp_nsp *nsp);
+
+enum nfp_nsp_sensor_id {
+	NFP_SENSOR_CHIP_TEMPERATURE,
+	NFP_SENSOR_ASSEMBLY_POWER,
+	NFP_SENSOR_ASSEMBLY_12V_POWER,
+	NFP_SENSOR_ASSEMBLY_3V3_POWER,
+};
+
+int nfp_hwmon_read_sensor(struct nfp_cpp *cpp, enum nfp_nsp_sensor_id id,
+			  long *val);
+
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_nsp_cmds.c b/drivers/net/nfp/nfpcore/nfp_nsp_cmds.c
new file mode 100644
index 0000000..bfd1edd
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nsp_cmds.c
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <rte_byteorder.h>
+#include "nfp_cpp.h"
+#include "nfp_nsp.h"
+#include "nfp_nffw.h"
+
+struct nsp_identify {
+	uint8_t version[40];
+	uint8_t flags;
+	uint8_t br_primary;
+	uint8_t br_secondary;
+	uint8_t br_nsp;
+	uint16_t primary;
+	uint16_t secondary;
+	uint16_t nsp;
+	uint8_t reserved[6];
+	uint64_t sensor_mask;
+};
+
+struct nfp_nsp_identify *
+__nfp_nsp_identify(struct nfp_nsp *nsp)
+{
+	struct nfp_nsp_identify *nspi = NULL;
+	struct nsp_identify *ni;
+	int ret;
+
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 15)
+		return NULL;
+
+	ni = malloc(sizeof(*ni));
+	if (!ni)
+		return NULL;
+
+	memset(ni, 0, sizeof(*ni));
+	ret = nfp_nsp_read_identify(nsp, ni, sizeof(*ni));
+	if (ret < 0) {
+		printf("reading bsp version failed %d\n",
+			ret);
+		goto exit_free;
+	}
+
+	nspi = malloc(sizeof(*nspi));
+	if (!nspi)
+		goto exit_free;
+
+	memset(nspi, 0, sizeof(*nspi));
+	memcpy(nspi->version, ni->version, sizeof(nspi->version));
+	nspi->version[sizeof(nspi->version) - 1] = '\0';
+	nspi->flags = ni->flags;
+	nspi->br_primary = ni->br_primary;
+	nspi->br_secondary = ni->br_secondary;
+	nspi->br_nsp = ni->br_nsp;
+	nspi->primary = rte_le_to_cpu_16(ni->primary);
+	nspi->secondary = rte_le_to_cpu_16(ni->secondary);
+	nspi->nsp = rte_le_to_cpu_16(ni->nsp);
+	nspi->sensor_mask = rte_le_to_cpu_64(ni->sensor_mask);
+
+exit_free:
+	free(ni);
+	return nspi;
+}
+
+struct nfp_sensors {
+	uint32_t chip_temp;
+	uint32_t assembly_power;
+	uint32_t assembly_12v_power;
+	uint32_t assembly_3v3_power;
+};
+
+int
+nfp_hwmon_read_sensor(struct nfp_cpp *cpp, enum nfp_nsp_sensor_id id, long *val)
+{
+	struct nfp_sensors s;
+	struct nfp_nsp *nsp;
+	int ret;
+
+	nsp = nfp_nsp_open(cpp);
+	if (!nsp)
+		return -EIO;
+
+	ret = nfp_nsp_read_sensors(nsp, BIT(id), &s, sizeof(s));
+	nfp_nsp_close(nsp);
+
+	if (ret < 0)
+		return ret;
+
+	switch (id) {
+	case NFP_SENSOR_CHIP_TEMPERATURE:
+		*val = rte_le_to_cpu_32(s.chip_temp);
+		break;
+	case NFP_SENSOR_ASSEMBLY_POWER:
+		*val = rte_le_to_cpu_32(s.assembly_power);
+		break;
+	case NFP_SENSOR_ASSEMBLY_12V_POWER:
+		*val = rte_le_to_cpu_32(s.assembly_12v_power);
+		break;
+	case NFP_SENSOR_ASSEMBLY_3V3_POWER:
+		*val = rte_le_to_cpu_32(s.assembly_3v3_power);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
diff --git a/drivers/net/nfp/nfpcore/nfp_nsp_eth.c b/drivers/net/nfp/nfpcore/nfp_nsp_eth.c
new file mode 100644
index 0000000..6794689
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_nsp_eth.c
@@ -0,0 +1,665 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include "nfp_cpp.h"
+#include "nfp_nsp.h"
+#include "nfp6000/nfp6000.h"
+
+#define GENMASK_ULL(h, l) \
+	(((~0ULL) - (1ULL << (l)) + 1) & \
+	 (~0ULL >> (64 - 1 - (h))))
+
+#define __bf_shf(x) (__builtin_ffsll(x) - 1)
+
+#define FIELD_GET(_mask, _reg)						\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		(typeof(_x))(((_reg) & (_x)) >> __bf_shf(_x));	\
+	}))
+
+#define FIELD_FIT(_mask, _val)						\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		!((((typeof(_x))_val) << __bf_shf(_x)) & ~(_x)); \
+	}))
+
+#define FIELD_PREP(_mask, _val)						\
+	(__extension__ ({ \
+		typeof(_mask) _x = (_mask); \
+		((typeof(_x))(_val) << __bf_shf(_x)) & (_x);	\
+	}))
+
+#define NSP_ETH_NBI_PORT_COUNT		24
+#define NSP_ETH_MAX_COUNT		(2 * NSP_ETH_NBI_PORT_COUNT)
+#define NSP_ETH_TABLE_SIZE		(NSP_ETH_MAX_COUNT *		\
+					 sizeof(union eth_table_entry))
+
+#define NSP_ETH_PORT_LANES		GENMASK_ULL(3, 0)
+#define NSP_ETH_PORT_INDEX		GENMASK_ULL(15, 8)
+#define NSP_ETH_PORT_LABEL		GENMASK_ULL(53, 48)
+#define NSP_ETH_PORT_PHYLABEL		GENMASK_ULL(59, 54)
+#define NSP_ETH_PORT_FEC_SUPP_BASER	BIT_ULL(60)
+#define NSP_ETH_PORT_FEC_SUPP_RS	BIT_ULL(61)
+
+#define NSP_ETH_PORT_LANES_MASK		rte_cpu_to_le_64(NSP_ETH_PORT_LANES)
+
+#define NSP_ETH_STATE_CONFIGURED	BIT_ULL(0)
+#define NSP_ETH_STATE_ENABLED		BIT_ULL(1)
+#define NSP_ETH_STATE_TX_ENABLED	BIT_ULL(2)
+#define NSP_ETH_STATE_RX_ENABLED	BIT_ULL(3)
+#define NSP_ETH_STATE_RATE		GENMASK_ULL(11, 8)
+#define NSP_ETH_STATE_INTERFACE		GENMASK_ULL(19, 12)
+#define NSP_ETH_STATE_MEDIA		GENMASK_ULL(21, 20)
+#define NSP_ETH_STATE_OVRD_CHNG		BIT_ULL(22)
+#define NSP_ETH_STATE_ANEG		GENMASK_ULL(25, 23)
+#define NSP_ETH_STATE_FEC		GENMASK_ULL(27, 26)
+
+#define NSP_ETH_CTRL_CONFIGURED		BIT_ULL(0)
+#define NSP_ETH_CTRL_ENABLED		BIT_ULL(1)
+#define NSP_ETH_CTRL_TX_ENABLED		BIT_ULL(2)
+#define NSP_ETH_CTRL_RX_ENABLED		BIT_ULL(3)
+#define NSP_ETH_CTRL_SET_RATE		BIT_ULL(4)
+#define NSP_ETH_CTRL_SET_LANES		BIT_ULL(5)
+#define NSP_ETH_CTRL_SET_ANEG		BIT_ULL(6)
+#define NSP_ETH_CTRL_SET_FEC		BIT_ULL(7)
+
+/* Which connector port. */
+#define PORT_TP			0x00
+#define PORT_AUI		0x01
+#define PORT_MII		0x02
+#define PORT_FIBRE		0x03
+#define PORT_BNC		0x04
+#define PORT_DA			0x05
+#define PORT_NONE		0xef
+#define PORT_OTHER		0xff
+
+#define SPEED_10		10
+#define SPEED_100		100
+#define SPEED_1000		1000
+#define SPEED_2500		2500
+#define SPEED_5000		5000
+#define SPEED_10000		10000
+#define SPEED_14000		14000
+#define SPEED_20000		20000
+#define SPEED_25000		25000
+#define SPEED_40000		40000
+#define SPEED_50000		50000
+#define SPEED_56000		56000
+#define SPEED_100000		100000
+
+enum nfp_eth_raw {
+	NSP_ETH_RAW_PORT = 0,
+	NSP_ETH_RAW_STATE,
+	NSP_ETH_RAW_MAC,
+	NSP_ETH_RAW_CONTROL,
+
+	NSP_ETH_NUM_RAW
+};
+
+enum nfp_eth_rate {
+	RATE_INVALID = 0,
+	RATE_10M,
+	RATE_100M,
+	RATE_1G,
+	RATE_10G,
+	RATE_25G,
+};
+
+union eth_table_entry {
+	struct {
+		uint64_t port;
+		uint64_t state;
+		uint8_t mac_addr[6];
+		uint8_t resv[2];
+		uint64_t control;
+	};
+	uint64_t raw[NSP_ETH_NUM_RAW];
+};
+
+static const struct {
+	enum nfp_eth_rate rate;
+	unsigned int speed;
+} nsp_eth_rate_tbl[] = {
+	{ RATE_INVALID,	0, },
+	{ RATE_10M,	SPEED_10, },
+	{ RATE_100M,	SPEED_100, },
+	{ RATE_1G,	SPEED_1000, },
+	{ RATE_10G,	SPEED_10000, },
+	{ RATE_25G,	SPEED_25000, },
+};
+
+static unsigned int
+nfp_eth_rate2speed(enum nfp_eth_rate rate)
+{
+	int i;
+
+	for (i = 0; i < (int)ARRAY_SIZE(nsp_eth_rate_tbl); i++)
+		if (nsp_eth_rate_tbl[i].rate == rate)
+			return nsp_eth_rate_tbl[i].speed;
+
+	return 0;
+}
+
+static unsigned int
+nfp_eth_speed2rate(unsigned int speed)
+{
+	int i;
+
+	for (i = 0; i < (int)ARRAY_SIZE(nsp_eth_rate_tbl); i++)
+		if (nsp_eth_rate_tbl[i].speed == speed)
+			return nsp_eth_rate_tbl[i].rate;
+
+	return RATE_INVALID;
+}
+
+static void
+nfp_eth_copy_mac_reverse(uint8_t *dst, const uint8_t *src)
+{
+	int i;
+
+	for (i = 0; i < (int)ETH_ALEN; i++)
+		dst[ETH_ALEN - i - 1] = src[i];
+}
+
+static void
+nfp_eth_port_translate(struct nfp_nsp *nsp, const union eth_table_entry *src,
+		       unsigned int index, struct nfp_eth_table_port *dst)
+{
+	unsigned int rate;
+	unsigned int fec;
+	uint64_t port, state;
+
+	port = rte_le_to_cpu_64(src->port);
+	state = rte_le_to_cpu_64(src->state);
+
+	dst->eth_index = FIELD_GET(NSP_ETH_PORT_INDEX, port);
+	dst->index = index;
+	dst->nbi = index / NSP_ETH_NBI_PORT_COUNT;
+	dst->base = index % NSP_ETH_NBI_PORT_COUNT;
+	dst->lanes = FIELD_GET(NSP_ETH_PORT_LANES, port);
+
+	dst->enabled = FIELD_GET(NSP_ETH_STATE_ENABLED, state);
+	dst->tx_enabled = FIELD_GET(NSP_ETH_STATE_TX_ENABLED, state);
+	dst->rx_enabled = FIELD_GET(NSP_ETH_STATE_RX_ENABLED, state);
+
+	rate = nfp_eth_rate2speed(FIELD_GET(NSP_ETH_STATE_RATE, state));
+	dst->speed = dst->lanes * rate;
+
+	dst->interface = FIELD_GET(NSP_ETH_STATE_INTERFACE, state);
+	dst->media = FIELD_GET(NSP_ETH_STATE_MEDIA, state);
+
+	nfp_eth_copy_mac_reverse(dst->mac_addr, src->mac_addr);
+
+	dst->label_port = FIELD_GET(NSP_ETH_PORT_PHYLABEL, port);
+	dst->label_subport = FIELD_GET(NSP_ETH_PORT_LABEL, port);
+
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 17)
+		return;
+
+	dst->override_changed = FIELD_GET(NSP_ETH_STATE_OVRD_CHNG, state);
+	dst->aneg = FIELD_GET(NSP_ETH_STATE_ANEG, state);
+
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 22)
+		return;
+
+	fec = FIELD_GET(NSP_ETH_PORT_FEC_SUPP_BASER, port);
+	dst->fec_modes_supported |= fec << NFP_FEC_BASER_BIT;
+	fec = FIELD_GET(NSP_ETH_PORT_FEC_SUPP_RS, port);
+	dst->fec_modes_supported |= fec << NFP_FEC_REED_SOLOMON_BIT;
+	if (dst->fec_modes_supported)
+		dst->fec_modes_supported |= NFP_FEC_AUTO | NFP_FEC_DISABLED;
+
+	dst->fec = 1 << FIELD_GET(NSP_ETH_STATE_FEC, state);
+}
+
+static void
+nfp_eth_calc_port_geometry(struct nfp_eth_table *table)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < table->count; i++) {
+		table->max_index = RTE_MAX(table->max_index,
+					   table->ports[i].index);
+
+		for (j = 0; j < table->count; j++) {
+			if (table->ports[i].label_port !=
+			    table->ports[j].label_port)
+				continue;
+			table->ports[i].port_lanes += table->ports[j].lanes;
+
+			if (i == j)
+				continue;
+			if (table->ports[i].label_subport ==
+			    table->ports[j].label_subport)
+				printf("Port %d subport %d is a duplicate\n",
+					 table->ports[i].label_port,
+					 table->ports[i].label_subport);
+
+			table->ports[i].is_split = 1;
+		}
+	}
+}
+
+static void
+nfp_eth_calc_port_type(struct nfp_eth_table_port *entry)
+{
+	if (entry->interface == NFP_INTERFACE_NONE) {
+		entry->port_type = PORT_NONE;
+		return;
+	}
+
+	if (entry->media == NFP_MEDIA_FIBRE)
+		entry->port_type = PORT_FIBRE;
+	else
+		entry->port_type = PORT_DA;
+}
+
+static struct nfp_eth_table *
+__nfp_eth_read_ports(struct nfp_nsp *nsp)
+{
+	union eth_table_entry *entries;
+	struct nfp_eth_table *table;
+	uint32_t table_sz;
+	int i, j, ret, cnt = 0;
+
+	entries = malloc(NSP_ETH_TABLE_SIZE);
+	if (!entries)
+		return NULL;
+
+	memset(entries, 0, NSP_ETH_TABLE_SIZE);
+	ret = nfp_nsp_read_eth_table(nsp, entries, NSP_ETH_TABLE_SIZE);
+	if (ret < 0) {
+		printf("reading port table failed %d\n", ret);
+		goto err;
+	}
+
+	for (i = 0; i < NSP_ETH_MAX_COUNT; i++)
+		if (entries[i].port & NSP_ETH_PORT_LANES_MASK)
+			cnt++;
+
+	/* Some versions of flash will give us 0 instead of port count. For
+	 * those that give a port count, verify it against the value calculated
+	 * above.
+	 */
+	if (ret && ret != cnt) {
+		printf("table entry count (%d) unmatch entries present (%d)\n",
+		       ret, cnt);
+		goto err;
+	}
+
+	table_sz = sizeof(*table) + sizeof(struct nfp_eth_table_port) * cnt;
+	table = malloc(table_sz);
+	if (!table)
+		goto err;
+
+	memset(table, 0, table_sz);
+	table->count = cnt;
+	for (i = 0, j = 0; i < NSP_ETH_MAX_COUNT; i++)
+		if (entries[i].port & NSP_ETH_PORT_LANES_MASK)
+			nfp_eth_port_translate(nsp, &entries[i], i,
+					       &table->ports[j++]);
+
+	nfp_eth_calc_port_geometry(table);
+	for (i = 0; i < (int)table->count; i++)
+		nfp_eth_calc_port_type(&table->ports[i]);
+
+	free(entries);
+
+	return table;
+
+err:
+	free(entries);
+	return NULL;
+}
+
+/*
+ * nfp_eth_read_ports() - retrieve port information
+ * @cpp:	NFP CPP handle
+ *
+ * Read the port information from the device.  Returned structure should
+ * be freed with free() once no longer needed.
+ *
+ * Return: populated ETH table or NULL on error.
+ */
+struct nfp_eth_table *
+nfp_eth_read_ports(struct nfp_cpp *cpp)
+{
+	struct nfp_eth_table *ret;
+	struct nfp_nsp *nsp;
+
+	nsp = nfp_nsp_open(cpp);
+	if (!nsp)
+		return NULL;
+
+	ret = __nfp_eth_read_ports(nsp);
+	nfp_nsp_close(nsp);
+
+	return ret;
+}
+
+struct nfp_nsp *
+nfp_eth_config_start(struct nfp_cpp *cpp, unsigned int idx)
+{
+	union eth_table_entry *entries;
+	struct nfp_nsp *nsp;
+	int ret;
+
+	entries = malloc(NSP_ETH_TABLE_SIZE);
+	if (!entries)
+		return NULL;
+
+	memset(entries, 0, NSP_ETH_TABLE_SIZE);
+	nsp = nfp_nsp_open(cpp);
+	if (!nsp) {
+		free(entries);
+		return nsp;
+	}
+
+	ret = nfp_nsp_read_eth_table(nsp, entries, NSP_ETH_TABLE_SIZE);
+	if (ret < 0) {
+		printf("reading port table failed %d\n", ret);
+		goto err;
+	}
+
+	if (!(entries[idx].port & NSP_ETH_PORT_LANES_MASK)) {
+		printf("trying to set port state on disabled port %d\n", idx);
+		goto err;
+	}
+
+	nfp_nsp_config_set_state(nsp, entries, idx);
+	return nsp;
+
+err:
+	nfp_nsp_close(nsp);
+	free(entries);
+	return NULL;
+}
+
+void
+nfp_eth_config_cleanup_end(struct nfp_nsp *nsp)
+{
+	union eth_table_entry *entries = nfp_nsp_config_entries(nsp);
+
+	nfp_nsp_config_set_modified(nsp, 0);
+	nfp_nsp_config_clear_state(nsp);
+	nfp_nsp_close(nsp);
+	free(entries);
+}
+
+/*
+ * nfp_eth_config_commit_end() - perform recorded configuration changes
+ * @nsp:	NFP NSP handle returned from nfp_eth_config_start()
+ *
+ * Perform the configuration which was requested with __nfp_eth_set_*()
+ * helpers and recorded in @nsp state.  If device was already configured
+ * as requested or no __nfp_eth_set_*() operations were made, no NSP command
+ * will be performed.
+ *
+ * Return:
+ * 0 - configuration successful;
+ * 1 - no changes were needed;
+ * -ERRNO - configuration failed.
+ */
+int
+nfp_eth_config_commit_end(struct nfp_nsp *nsp)
+{
+	union eth_table_entry *entries = nfp_nsp_config_entries(nsp);
+	int ret = 1;
+
+	if (nfp_nsp_config_modified(nsp)) {
+		ret = nfp_nsp_write_eth_table(nsp, entries, NSP_ETH_TABLE_SIZE);
+		ret = ret < 0 ? ret : 0;
+	}
+
+	nfp_eth_config_cleanup_end(nsp);
+
+	return ret;
+}
+
+/*
+ * nfp_eth_set_mod_enable() - set PHY module enable control bit
+ * @cpp:	NFP CPP handle
+ * @idx:	NFP chip-wide port index
+ * @enable:	Desired state
+ *
+ * Enable or disable PHY module (this usually means setting the TX lanes
+ * disable bits).
+ *
+ * Return:
+ * 0 - configuration successful;
+ * 1 - no changes were needed;
+ * -ERRNO - configuration failed.
+ */
+int
+nfp_eth_set_mod_enable(struct nfp_cpp *cpp, unsigned int idx, int enable)
+{
+	union eth_table_entry *entries;
+	struct nfp_nsp *nsp;
+	uint64_t reg;
+
+	nsp = nfp_eth_config_start(cpp, idx);
+	if (!nsp)
+		return -1;
+
+	entries = nfp_nsp_config_entries(nsp);
+
+	/* Check if we are already in requested state */
+	reg = rte_le_to_cpu_64(entries[idx].state);
+	if (enable != (int)FIELD_GET(NSP_ETH_CTRL_ENABLED, reg)) {
+		reg = rte_le_to_cpu_64(entries[idx].control);
+		reg &= ~NSP_ETH_CTRL_ENABLED;
+		reg |= FIELD_PREP(NSP_ETH_CTRL_ENABLED, enable);
+		entries[idx].control = rte_cpu_to_le_64(reg);
+
+		nfp_nsp_config_set_modified(nsp, 1);
+	}
+
+	return nfp_eth_config_commit_end(nsp);
+}
+
+/*
+ * nfp_eth_set_configured() - set PHY module configured control bit
+ * @cpp:	NFP CPP handle
+ * @idx:	NFP chip-wide port index
+ * @configed:	Desired state
+ *
+ * Set the ifup/ifdown state on the PHY.
+ *
+ * Return:
+ * 0 - configuration successful;
+ * 1 - no changes were needed;
+ * -ERRNO - configuration failed.
+ */
+int
+nfp_eth_set_configured(struct nfp_cpp *cpp, unsigned int idx, int configed)
+{
+	union eth_table_entry *entries;
+	struct nfp_nsp *nsp;
+	uint64_t reg;
+
+	nsp = nfp_eth_config_start(cpp, idx);
+	if (!nsp)
+		return -EIO;
+
+	/*
+	 * Older ABI versions did support this feature; however, it has only
+	 * been reliable since ABI 20.
+	 */
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 20) {
+		nfp_eth_config_cleanup_end(nsp);
+		return -EOPNOTSUPP;
+	}
+
+	entries = nfp_nsp_config_entries(nsp);
+
+	/* Check if we are already in requested state */
+	reg = rte_le_to_cpu_64(entries[idx].state);
+	if (configed != (int)FIELD_GET(NSP_ETH_STATE_CONFIGURED, reg)) {
+		reg = rte_le_to_cpu_64(entries[idx].control);
+		reg &= ~NSP_ETH_CTRL_CONFIGURED;
+		reg |= FIELD_PREP(NSP_ETH_CTRL_CONFIGURED, configed);
+		entries[idx].control = rte_cpu_to_le_64(reg);
+
+		nfp_nsp_config_set_modified(nsp, 1);
+	}
+
+	return nfp_eth_config_commit_end(nsp);
+}
+
+static int
+nfp_eth_set_bit_config(struct nfp_nsp *nsp, unsigned int raw_idx,
+		       const uint64_t mask, const unsigned int shift,
+		       unsigned int val, const uint64_t ctrl_bit)
+{
+	union eth_table_entry *entries = nfp_nsp_config_entries(nsp);
+	unsigned int idx = nfp_nsp_config_idx(nsp);
+	uint64_t reg;
+
+	/*
+	 * Note: set features were added in ABI 0.14 but the error
+	 *	 codes were initially not populated correctly.
+	 */
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 17) {
+		printf("set operations not supported, please update flash\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* Check if we are already in requested state */
+	reg = rte_le_to_cpu_64(entries[idx].raw[raw_idx]);
+	if (val == (reg & mask) >> shift)
+		return 0;
+
+	reg &= ~mask;
+	reg |= (val << shift) & mask;
+	entries[idx].raw[raw_idx] = rte_cpu_to_le_64(reg);
+
+	entries[idx].control |= rte_cpu_to_le_64(ctrl_bit);
+
+	nfp_nsp_config_set_modified(nsp, 1);
+
+	return 0;
+}
+
+#define NFP_ETH_SET_BIT_CONFIG(nsp, raw_idx, mask, val, ctrl_bit)	\
+	(__extension__ ({ \
+		typeof(mask) _x = (mask); \
+		nfp_eth_set_bit_config(nsp, raw_idx, _x, __bf_shf(_x), \
+				       val, ctrl_bit);			\
+	}))
+
+/*
+ * __nfp_eth_set_aneg() - set PHY autonegotiation control bit
+ * @nsp:	NFP NSP handle returned from nfp_eth_config_start()
+ * @mode:	Desired autonegotiation mode
+ *
+ * Allow/disallow PHY module to advertise/perform autonegotiation.
+ * Will write to hwinfo overrides in the flash (persistent config).
+ *
+ * Return: 0 or -ERRNO.
+ */
+int
+__nfp_eth_set_aneg(struct nfp_nsp *nsp, enum nfp_eth_aneg mode)
+{
+	return NFP_ETH_SET_BIT_CONFIG(nsp, NSP_ETH_RAW_STATE,
+				      NSP_ETH_STATE_ANEG, mode,
+				      NSP_ETH_CTRL_SET_ANEG);
+}
+
+/*
+ * __nfp_eth_set_fec() - set PHY forward error correction control bit
+ * @nsp:	NFP NSP handle returned from nfp_eth_config_start()
+ * @mode:	Desired fec mode
+ *
+ * Set the PHY module forward error correction mode.
+ * Will write to hwinfo overrides in the flash (persistent config).
+ *
+ * Return: 0 or -ERRNO.
+ */
+static int
+__nfp_eth_set_fec(struct nfp_nsp *nsp, enum nfp_eth_fec mode)
+{
+	return NFP_ETH_SET_BIT_CONFIG(nsp, NSP_ETH_RAW_STATE,
+				      NSP_ETH_STATE_FEC, mode,
+				      NSP_ETH_CTRL_SET_FEC);
+}
+
+/*
+ * nfp_eth_set_fec() - set PHY forward error correction control mode
+ * @cpp:	NFP CPP handle
+ * @idx:	NFP chip-wide port index
+ * @mode:	Desired fec mode
+ *
+ * Return:
+ * 0 - configuration successful;
+ * 1 - no changes were needed;
+ * -ERRNO - configuration failed.
+ */
+int
+nfp_eth_set_fec(struct nfp_cpp *cpp, unsigned int idx, enum nfp_eth_fec mode)
+{
+	struct nfp_nsp *nsp;
+	int err;
+
+	nsp = nfp_eth_config_start(cpp, idx);
+	if (!nsp)
+		return -EIO;
+
+	err = __nfp_eth_set_fec(nsp, mode);
+	if (err) {
+		nfp_eth_config_cleanup_end(nsp);
+		return err;
+	}
+
+	return nfp_eth_config_commit_end(nsp);
+}
+
+/*
+ * __nfp_eth_set_speed() - set interface speed/rate
+ * @nsp:	NFP NSP handle returned from nfp_eth_config_start()
+ * @speed:	Desired speed (per lane)
+ *
+ * Set lane speed.  Provided @speed value should be subport speed divided
+ * by number of lanes this subport is spanning (i.e. 10000 for 40G, 25000 for
+ * 50G, etc.)
+ * Will write to hwinfo overrides in the flash (persistent config).
+ *
+ * Return: 0 or -ERRNO.
+ */
+int
+__nfp_eth_set_speed(struct nfp_nsp *nsp, unsigned int speed)
+{
+	enum nfp_eth_rate rate;
+
+	rate = nfp_eth_speed2rate(speed);
+	if (rate == RATE_INVALID) {
+		printf("could not find matching lane rate for speed %u\n",
+			 speed);
+		return -EINVAL;
+	}
+
+	return NFP_ETH_SET_BIT_CONFIG(nsp, NSP_ETH_RAW_STATE,
+				      NSP_ETH_STATE_RATE, rate,
+				      NSP_ETH_CTRL_SET_RATE);
+}
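+
+/*
+ * Illustrative sketch (editor's example, not part of the original patch):
+ * a 40G subport spanning four lanes takes 40000 / 4 = 10000 as the per-lane
+ * speed, following the start/set/commit pattern used by the helpers above:
+ *
+ *	nsp = nfp_eth_config_start(cpp, idx);
+ *	if (!nsp)
+ *		return -EIO;
+ *	err = __nfp_eth_set_speed(nsp, 10000);
+ *	if (err) {
+ *		nfp_eth_config_cleanup_end(nsp);
+ *		return err;
+ *	}
+ *	return nfp_eth_config_commit_end(nsp);
+ */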
+
+/*
+ * __nfp_eth_set_split() - set interface lane split
+ * @nsp:	NFP NSP handle returned from nfp_eth_config_start()
+ * @lanes:	Desired lanes per port
+ *
+ * Set number of lanes in the port.
+ * Will write to hwinfo overrides in the flash (persistent config).
+ *
+ * Return: 0 or -ERRNO.
+ */
+int
+__nfp_eth_set_split(struct nfp_nsp *nsp, unsigned int lanes)
+{
+	return NFP_ETH_SET_BIT_CONFIG(nsp, NSP_ETH_RAW_PORT, NSP_ETH_PORT_LANES,
+				      lanes, NSP_ETH_CTRL_SET_LANES);
+}
diff --git a/drivers/net/nfp/nfpcore/nfp_resource.c b/drivers/net/nfp/nfpcore/nfp_resource.c
new file mode 100644
index 0000000..e1df2b2
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_resource.c
@@ -0,0 +1,264 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include <endian.h>
+
+#include "nfp_cpp.h"
+#include "nfp6000/nfp6000.h"
+#include "nfp_resource.h"
+#include "nfp_crc.h"
+
+#define NFP_RESOURCE_TBL_TARGET		NFP_CPP_TARGET_MU
+#define NFP_RESOURCE_TBL_BASE		0x8100000000ULL
+
+/* NFP Resource Table self-identifier */
+#define NFP_RESOURCE_TBL_NAME		"nfp.res"
+#define NFP_RESOURCE_TBL_KEY		0x00000000 /* Special key for entry 0 */
+
+#define NFP_RESOURCE_ENTRY_NAME_SZ	8
+
+/*
+ * struct nfp_resource_entry - Resource table entry
+ * @owner:		NFP CPP Lock, interface owner
+ * @key:		NFP CPP Lock, posix_crc32(name, 8)
+ * @region:		Memory region descriptor
+ * @name:		ASCII, zero padded name
+ * @reserved
+ * @cpp_action:		CPP Action
+ * @cpp_token:		CPP Token
+ * @cpp_target:		CPP Target ID
+ * @page_offset:	256-byte page offset into target's CPP address
+ * @page_size:		size, in 256-byte pages
+ */
+struct nfp_resource_entry {
+	struct nfp_resource_entry_mutex {
+		uint32_t owner;
+		uint32_t key;
+	} mutex;
+	struct nfp_resource_entry_region {
+		uint8_t  name[NFP_RESOURCE_ENTRY_NAME_SZ];
+		uint8_t  reserved[5];
+		uint8_t  cpp_action;
+		uint8_t  cpp_token;
+		uint8_t  cpp_target;
+		uint32_t page_offset;
+		uint32_t page_size;
+	} region;
+};
+
+#define NFP_RESOURCE_TBL_SIZE		4096
+#define NFP_RESOURCE_TBL_ENTRIES	(int)(NFP_RESOURCE_TBL_SIZE /	\
+					 sizeof(struct nfp_resource_entry))
+
+struct nfp_resource {
+	char name[NFP_RESOURCE_ENTRY_NAME_SZ + 1];
+	uint32_t cpp_id;
+	uint64_t addr;
+	uint64_t size;
+	struct nfp_cpp_mutex *mutex;
+};
+
+static int
+nfp_cpp_resource_find(struct nfp_cpp *cpp, struct nfp_resource *res)
+{
+	char name_pad[NFP_RESOURCE_ENTRY_NAME_SZ] = {};
+	struct nfp_resource_entry entry;
+	uint32_t cpp_id, key;
+	int ret, i;
+
+	cpp_id = NFP_CPP_ID(NFP_RESOURCE_TBL_TARGET, 3, 0);  /* Atomic read */
+
+	memset(name_pad, 0, NFP_RESOURCE_ENTRY_NAME_SZ);
+	strncpy(name_pad, res->name, sizeof(name_pad));
+
+	/* Search for a matching entry */
+	if (!memcmp(name_pad, NFP_RESOURCE_TBL_NAME "\0\0\0\0\0\0\0\0", 8)) {
+		printf("Grabbing device lock not supported\n");
+		return -EOPNOTSUPP;
+	}
+	key = nfp_crc32_posix(name_pad, sizeof(name_pad));
+
+	for (i = 0; i < NFP_RESOURCE_TBL_ENTRIES; i++) {
+		uint64_t addr = NFP_RESOURCE_TBL_BASE +
+			sizeof(struct nfp_resource_entry) * i;
+
+		ret = nfp_cpp_read(cpp, cpp_id, addr, &entry, sizeof(entry));
+		if (ret != sizeof(entry))
+			return -EIO;
+
+		if (entry.mutex.key != key)
+			continue;
+
+		/* Found key! */
+		res->mutex =
+			nfp_cpp_mutex_alloc(cpp,
+					    NFP_RESOURCE_TBL_TARGET, addr, key);
+		res->cpp_id = NFP_CPP_ID(entry.region.cpp_target,
+					 entry.region.cpp_action,
+					 entry.region.cpp_token);
+		res->addr = ((uint64_t)entry.region.page_offset) << 8;
+		res->size = (uint64_t)entry.region.page_size << 8;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static int
+nfp_resource_try_acquire(struct nfp_cpp *cpp, struct nfp_resource *res,
+			 struct nfp_cpp_mutex *dev_mutex)
+{
+	int err;
+
+	if (nfp_cpp_mutex_lock(dev_mutex))
+		return -EINVAL;
+
+	err = nfp_cpp_resource_find(cpp, res);
+	if (err)
+		goto err_unlock_dev;
+
+	err = nfp_cpp_mutex_trylock(res->mutex);
+	if (err)
+		goto err_res_mutex_free;
+
+	nfp_cpp_mutex_unlock(dev_mutex);
+
+	return 0;
+
+err_res_mutex_free:
+	nfp_cpp_mutex_free(res->mutex);
+err_unlock_dev:
+	nfp_cpp_mutex_unlock(dev_mutex);
+
+	return err;
+}
+
+/*
+ * nfp_resource_acquire() - Acquire a resource handle
+ * @cpp:	NFP CPP handle
+ * @name:	Name of the resource
+ *
+ * NOTE: This function locks the acquired resource
+ *
+ * Return: NFP Resource handle, or ERR_PTR()
+ */
+struct nfp_resource *
+nfp_resource_acquire(struct nfp_cpp *cpp, const char *name)
+{
+	struct nfp_cpp_mutex *dev_mutex;
+	struct nfp_resource *res;
+	int err;
+	struct timespec wait;
+	int count;
+
+	res = malloc(sizeof(*res));
+	if (!res)
+		return NULL;
+
+	memset(res, 0, sizeof(*res));
+
+	strncpy(res->name, name, NFP_RESOURCE_ENTRY_NAME_SZ);
+
+	dev_mutex = nfp_cpp_mutex_alloc(cpp, NFP_RESOURCE_TBL_TARGET,
+					NFP_RESOURCE_TBL_BASE,
+					NFP_RESOURCE_TBL_KEY);
+	if (!dev_mutex) {
+		free(res);
+		return NULL;
+	}
+
+	wait.tv_sec = 0;
+	wait.tv_nsec = 1000000;
+	count = 0;
+
+	for (;;) {
+		err = nfp_resource_try_acquire(cpp, res, dev_mutex);
+		if (!err)
+			break;
+		if (err != -EBUSY)
+			goto err_free;
+
+		if (count++ > 1000) {
+			printf("Error: resource %s timed out\n", name);
+			err = -EBUSY;
+			goto err_free;
+		}
+
+		nanosleep(&wait, NULL);
+	}
+
+	nfp_cpp_mutex_free(dev_mutex);
+
+	return res;
+
+err_free:
+	nfp_cpp_mutex_free(dev_mutex);
+	free(res);
+	return NULL;
+}
+
+/*
+ * nfp_resource_release() - Release a NFP Resource handle
+ * @res:	NFP Resource handle
+ *
+ * NOTE: This function implicitly unlocks the resource handle
+ */
+void
+nfp_resource_release(struct nfp_resource *res)
+{
+	nfp_cpp_mutex_unlock(res->mutex);
+	nfp_cpp_mutex_free(res->mutex);
+	free(res);
+}
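+
+/*
+ * Illustrative sketch (editor's example, not part of the original patch):
+ * acquire the NSP resource by name, query its location, then release it.
+ *
+ *	struct nfp_resource *res;
+ *
+ *	res = nfp_resource_acquire(cpp, NFP_RESOURCE_NSP);
+ *	if (res) {
+ *		printf("cpp_id %x addr %" PRIx64 "\n",
+ *		       nfp_resource_cpp_id(res), nfp_resource_address(res));
+ *		nfp_resource_release(res);
+ *	}
+ */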
+
+/*
+ * nfp_resource_cpp_id() - Return the cpp_id of a resource handle
+ * @res:        NFP Resource handle
+ *
+ * Return: NFP CPP ID
+ */
+uint32_t
+nfp_resource_cpp_id(const struct nfp_resource *res)
+{
+	return res->cpp_id;
+}
+
+/*
+ * nfp_resource_name() - Return the name of a resource handle
+ * @res:        NFP Resource handle
+ *
+ * Return: const char pointer to the name of the resource
+ */
+const char
+*nfp_resource_name(const struct nfp_resource *res)
+{
+	return res->name;
+}
+
+/*
+ * nfp_resource_address() - Return the address of a resource handle
+ * @res:        NFP Resource handle
+ *
+ * Return: Address of the resource
+ */
+uint64_t
+nfp_resource_address(const struct nfp_resource *res)
+{
+	return res->addr;
+}
+
+/*
+ * nfp_resource_size() - Return the size in bytes of a resource handle
+ * @res:        NFP Resource handle
+ *
+ * Return: Size of the resource in bytes
+ */
+uint64_t
+nfp_resource_size(const struct nfp_resource *res)
+{
+	return res->size;
+}
diff --git a/drivers/net/nfp/nfpcore/nfp_resource.h b/drivers/net/nfp/nfpcore/nfp_resource.h
new file mode 100644
index 0000000..06cc6f7
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_resource.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef NFP_RESOURCE_H
+#define NFP_RESOURCE_H
+
+#include "nfp_cpp.h"
+
+#define NFP_RESOURCE_NFP_NFFW           "nfp.nffw"
+#define NFP_RESOURCE_NFP_HWINFO         "nfp.info"
+#define NFP_RESOURCE_NSP		"nfp.sp"
+
+/**
+ * Opaque handle to a NFP Resource
+ */
+struct nfp_resource;
+
+struct nfp_resource *nfp_resource_acquire(struct nfp_cpp *cpp,
+					  const char *name);
+
+/**
+ * Release a NFP Resource, and free the handle
+ * @param[in]   res     NFP Resource handle
+ */
+void nfp_resource_release(struct nfp_resource *res);
+
+/**
+ * Return the CPP ID of a NFP Resource
+ * @param[in]   res     NFP Resource handle
+ * @return      CPP ID of the NFP Resource
+ */
+uint32_t nfp_resource_cpp_id(const struct nfp_resource *res);
+
+/**
+ * Return the name of a NFP Resource
+ * @param[in]   res     NFP Resource handle
+ * @return      Name of the NFP Resource
+ */
+const char *nfp_resource_name(const struct nfp_resource *res);
+
+/**
+ * Return the target address of a NFP Resource
+ * @param[in]   res     NFP Resource handle
+ * @return      Address of the NFP Resource
+ */
+uint64_t nfp_resource_address(const struct nfp_resource *res);
+
+uint64_t nfp_resource_size(const struct nfp_resource *res);
+
+#endif /* NFP_RESOURCE_H */
diff --git a/drivers/net/nfp/nfpcore/nfp_rtsym.c b/drivers/net/nfp/nfpcore/nfp_rtsym.c
new file mode 100644
index 0000000..cb7d83d
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_rtsym.c
@@ -0,0 +1,327 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+/*
+ * nfp_rtsym.c
+ * Interface for accessing run-time symbol table
+ */
+
+#include <stdio.h>
+#include <rte_byteorder.h>
+#include "nfp_cpp.h"
+#include "nfp_mip.h"
+#include "nfp_rtsym.h"
+#include "nfp6000/nfp6000.h"
+
+/* These need to match the linker */
+#define SYM_TGT_LMEM		0
+#define SYM_TGT_EMU_CACHE	0x17
+
+struct nfp_rtsym_entry {
+	uint8_t	type;
+	uint8_t	target;
+	uint8_t	island;
+	uint8_t	addr_hi;
+	uint32_t addr_lo;
+	uint16_t name;
+	uint8_t	menum;
+	uint8_t	size_hi;
+	uint32_t size_lo;
+};
+
+struct nfp_rtsym_table {
+	struct nfp_cpp *cpp;
+	int num;
+	char *strtab;
+	struct nfp_rtsym symtab[];
+};
+
+static int
+nfp_meid(uint8_t island_id, uint8_t menum)
+{
+	return (island_id & 0x3F) == island_id && menum < 12 ?
+		(island_id << 4) | (menum + 4) : -1;
+}
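+
+/*
+ * Editor's note (not part of the original patch): nfp_meid() packs the
+ * island ID into the upper bits and the ME number (offset by 4) into the
+ * low nibble, e.g. island 32, menum 0 yields (32 << 4) | 4 = 0x204;
+ * out-of-range inputs return -1.
+ */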
+
+static void
+nfp_rtsym_sw_entry_init(struct nfp_rtsym_table *cache, uint32_t strtab_size,
+			struct nfp_rtsym *sw, struct nfp_rtsym_entry *fw)
+{
+	sw->type = fw->type;
+	sw->name = cache->strtab + rte_le_to_cpu_16(fw->name) % strtab_size;
+	sw->addr = ((uint64_t)fw->addr_hi << 32) |
+		   rte_le_to_cpu_32(fw->addr_lo);
+	sw->size = ((uint64_t)fw->size_hi << 32) |
+		   rte_le_to_cpu_32(fw->size_lo);
+
+#ifdef DEBUG
+	printf("rtsym_entry_init\n");
+	printf("\tname=%s, addr=%" PRIx64 ", size=%" PRIu64 ",target=%d\n",
+		sw->name, sw->addr, sw->size, sw->target);
+#endif
+	switch (fw->target) {
+	case SYM_TGT_LMEM:
+		sw->target = NFP_RTSYM_TARGET_LMEM;
+		break;
+	case SYM_TGT_EMU_CACHE:
+		sw->target = NFP_RTSYM_TARGET_EMU_CACHE;
+		break;
+	default:
+		sw->target = fw->target;
+		break;
+	}
+
+	if (fw->menum != 0xff)
+		sw->domain = nfp_meid(fw->island, fw->menum);
+	else if (fw->island != 0xff)
+		sw->domain = fw->island;
+	else
+		sw->domain = -1;
+}
+
+struct nfp_rtsym_table *
+nfp_rtsym_table_read(struct nfp_cpp *cpp)
+{
+	struct nfp_rtsym_table *rtbl;
+	struct nfp_mip *mip;
+
+	mip = nfp_mip_open(cpp);
+	rtbl = __nfp_rtsym_table_read(cpp, mip);
+	nfp_mip_close(mip);
+
+	return rtbl;
+}
+
+/*
+ * This looks more complex than it should be. But we need to get the type for
+ * the ~ right in round_down (it needs to be as wide as the result!), and we
+ * want to evaluate the macro arguments just once each.
+ */
+#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
+
+#define round_up(x, y) \
+	(__extension__ ({ \
+		typeof(x) _x = (x); \
+		((((_x) - 1) | __round_mask(_x, y)) + 1); \
+	}))
+
+#define round_down(x, y) \
+	(__extension__ ({ \
+		typeof(x) _x = (x); \
+		((_x) & ~__round_mask(_x, y)); \
+	}))
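+
+/*
+ * Editor's example (not part of the original patch): with a power-of-two
+ * alignment these behave as expected, e.g. round_up(10, 8) == 16 and
+ * round_down(10, 8) == 8; they are used below to pad the symbol and string
+ * table sizes to 64-bit multiples.
+ */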
+
+struct nfp_rtsym_table *
+__nfp_rtsym_table_read(struct nfp_cpp *cpp, const struct nfp_mip *mip)
+{
+	uint32_t strtab_addr, symtab_addr, strtab_size, symtab_size;
+	struct nfp_rtsym_entry *rtsymtab;
+	struct nfp_rtsym_table *cache;
+	const uint32_t dram =
+		NFP_CPP_ID(NFP_CPP_TARGET_MU, NFP_CPP_ACTION_RW, 0) |
+		NFP_ISL_EMEM0;
+	int err, n, size;
+
+	if (!mip)
+		return NULL;
+
+	nfp_mip_strtab(mip, &strtab_addr, &strtab_size);
+	nfp_mip_symtab(mip, &symtab_addr, &symtab_size);
+
+	if (!symtab_size || !strtab_size || symtab_size % sizeof(*rtsymtab))
+		return NULL;
+
+	/* Align to 64 bits */
+	symtab_size = round_up(symtab_size, 8);
+	strtab_size = round_up(strtab_size, 8);
+
+	rtsymtab = malloc(symtab_size);
+	if (!rtsymtab)
+		return NULL;
+
+	size = sizeof(*cache);
+	size += symtab_size / sizeof(*rtsymtab) * sizeof(struct nfp_rtsym);
+	size += strtab_size + 1;
+	cache = malloc(size);
+	if (!cache)
+		goto exit_free_rtsym_raw;
+
+	cache->cpp = cpp;
+	cache->num = symtab_size / sizeof(*rtsymtab);
+	cache->strtab = (void *)&cache->symtab[cache->num];
+
+	err = nfp_cpp_read(cpp, dram, symtab_addr, rtsymtab, symtab_size);
+	if (err != (int)symtab_size)
+		goto exit_free_cache;
+
+	err = nfp_cpp_read(cpp, dram, strtab_addr, cache->strtab, strtab_size);
+	if (err != (int)strtab_size)
+		goto exit_free_cache;
+	cache->strtab[strtab_size] = '\0';
+
+	for (n = 0; n < cache->num; n++)
+		nfp_rtsym_sw_entry_init(cache, strtab_size,
+					&cache->symtab[n], &rtsymtab[n]);
+
+	free(rtsymtab);
+
+	return cache;
+
+exit_free_cache:
+	free(cache);
+exit_free_rtsym_raw:
+	free(rtsymtab);
+	return NULL;
+}
+
+/*
+ * nfp_rtsym_count() - Get the number of RTSYM descriptors
+ * @rtbl:	NFP RTsym table
+ *
+ * Return: Number of RTSYM descriptors
+ */
+int
+nfp_rtsym_count(struct nfp_rtsym_table *rtbl)
+{
+	if (!rtbl)
+		return -EINVAL;
+
+	return rtbl->num;
+}
+
+/*
+ * nfp_rtsym_get() - Get the Nth RTSYM descriptor
+ * @rtbl:	NFP RTsym table
+ * @idx:	Index (0-based) of the RTSYM descriptor
+ *
+ * Return: const pointer to a struct nfp_rtsym descriptor, or NULL
+ */
+const struct nfp_rtsym *
+nfp_rtsym_get(struct nfp_rtsym_table *rtbl, int idx)
+{
+	if (!rtbl)
+		return NULL;
+
+	if (idx >= rtbl->num)
+		return NULL;
+
+	return &rtbl->symtab[idx];
+}
+
+/*
+ * nfp_rtsym_lookup() - Return the RTSYM descriptor for a symbol name
+ * @rtbl:	NFP RTsym table
+ * @name:	Symbol name
+ *
+ * Return: const pointer to a struct nfp_rtsym descriptor, or NULL
+ */
+const struct nfp_rtsym *
+nfp_rtsym_lookup(struct nfp_rtsym_table *rtbl, const char *name)
+{
+	int n;
+
+	if (!rtbl)
+		return NULL;
+
+	for (n = 0; n < rtbl->num; n++)
+		if (strcmp(name, rtbl->symtab[n].name) == 0)
+			return &rtbl->symtab[n];
+
+	return NULL;
+}
+
+/*
+ * nfp_rtsym_read_le() - Read a simple unsigned scalar value from symbol
+ * @rtbl:	NFP RTsym table
+ * @name:	Symbol name
+ * @error:	Pointer to error code (optional)
+ *
+ * Look up a symbol, map it, read it and return its value. Value of the symbol
+ * will be interpreted as a simple little-endian unsigned value. Symbol can
+ * be 4 or 8 bytes in size.
+ *
+ * Return: value read, on error sets the error and returns ~0ULL.
+ */
+uint64_t
+nfp_rtsym_read_le(struct nfp_rtsym_table *rtbl, const char *name, int *error)
+{
+	const struct nfp_rtsym *sym;
+	uint32_t val32, id;
+	uint64_t val;
+	int err;
+
+	sym = nfp_rtsym_lookup(rtbl, name);
+	if (!sym) {
+		err = -ENOENT;
+		goto exit;
+	}
+
+	id = NFP_CPP_ISLAND_ID(sym->target, NFP_CPP_ACTION_RW, 0, sym->domain);
+
+#ifdef DEBUG
+	printf("Reading symbol %s with size %" PRIu64 " at %" PRIx64 "\n",
+		name, sym->size, sym->addr);
+#endif
+	switch (sym->size) {
+	case 4:
+		err = nfp_cpp_readl(rtbl->cpp, id, sym->addr, &val32);
+		val = val32;
+		break;
+	case 8:
+		err = nfp_cpp_readq(rtbl->cpp, id, sym->addr, &val);
+		break;
+	default:
+		printf("rtsym '%s' unsupported size: %" PRId64 "\n",
+			name, sym->size);
+		err = -EINVAL;
+		break;
+	}
+
+	if (err)
+		err = -EIO;
+exit:
+	if (error)
+		*error = err;
+
+	if (err)
+		return ~0ULL;
+
+	return val;
+}
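+
+/*
+ * Illustrative sketch (editor's example, not part of the original patch;
+ * the symbol name below is hypothetical):
+ *
+ *	int err;
+ *	uint64_t val = nfp_rtsym_read_le(rtbl, "_example_counter", &err);
+ *
+ *	if (err)
+ *		printf("reading symbol failed: %d\n", err);
+ */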
+
+uint8_t *
+nfp_rtsym_map(struct nfp_rtsym_table *rtbl, const char *name,
+	      unsigned int min_size, struct nfp_cpp_area **area)
+{
+	const struct nfp_rtsym *sym;
+	uint8_t *mem;
+
+#ifdef DEBUG
+	printf("mapping symbol %s\n", name);
+#endif
+	sym = nfp_rtsym_lookup(rtbl, name);
+	if (!sym) {
+		printf("symbol lookup fails for %s\n", name);
+		return NULL;
+	}
+
+	if (sym->size < min_size) {
+		printf("Symbol %s too small (%" PRIu64 " < %u)\n", name,
+			sym->size, min_size);
+		return NULL;
+	}
+
+	mem = nfp_cpp_map_area(rtbl->cpp, sym->domain, sym->target, sym->addr,
+			       sym->size, area);
+	if (!mem) {
+		printf("Failed to map symbol %s\n", name);
+		return NULL;
+	}
+#ifdef DEBUG
+	printf("symbol %s with address %p\n", name, mem);
+#endif
+
+	return mem;
+}
diff --git a/drivers/net/nfp/nfpcore/nfp_rtsym.h b/drivers/net/nfp/nfpcore/nfp_rtsym.h
new file mode 100644
index 0000000..8b49421
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_rtsym.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __NFP_RTSYM_H__
+#define __NFP_RTSYM_H__
+
+#define NFP_RTSYM_TYPE_NONE             0
+#define NFP_RTSYM_TYPE_OBJECT           1
+#define NFP_RTSYM_TYPE_FUNCTION         2
+#define NFP_RTSYM_TYPE_ABS              3
+
+#define NFP_RTSYM_TARGET_NONE           0
+#define NFP_RTSYM_TARGET_LMEM           -1
+#define NFP_RTSYM_TARGET_EMU_CACHE      -7
+
+/*
+ * Structure describing a run-time NFP symbol.
+ *
+ * The memory target of the symbol is generally the CPP target number and can be
+ * used directly by the nfp_cpp API calls.  However, in some cases (i.e., for
+ * local memory or control store) the target is encoded using a negative number.
+ *
+ * When the target type cannot be used to fully describe the location of a
+ * symbol, the domain field is used to further specify the location (i.e., the
+ * specific ME or island number).
+ *
+ * For ME target resources, 'domain' is an MEID.
+ * For Island target resources, 'domain' is an island ID, with the one exception
+ * of "sram" symbols for backward compatibility, which are viewed as global.
+ */
+struct nfp_rtsym {
+	const char *name;
+	uint64_t addr;
+	uint64_t size;
+	int type;
+	int target;
+	int domain;
+};
+
+struct nfp_rtsym_table;
+
+struct nfp_rtsym_table *nfp_rtsym_table_read(struct nfp_cpp *cpp);
+
+struct nfp_rtsym_table *
+__nfp_rtsym_table_read(struct nfp_cpp *cpp, const struct nfp_mip *mip);
+
+int nfp_rtsym_count(struct nfp_rtsym_table *rtbl);
+
+const struct nfp_rtsym *nfp_rtsym_get(struct nfp_rtsym_table *rtbl, int idx);
+
+const struct nfp_rtsym *
+nfp_rtsym_lookup(struct nfp_rtsym_table *rtbl, const char *name);
+
+uint64_t nfp_rtsym_read_le(struct nfp_rtsym_table *rtbl, const char *name,
+			   int *error);
+uint8_t *
+nfp_rtsym_map(struct nfp_rtsym_table *rtbl, const char *name,
+	      unsigned int min_size, struct nfp_cpp_area **area);
+#endif
diff --git a/drivers/net/nfp/nfpcore/nfp_target.h b/drivers/net/nfp/nfpcore/nfp_target.h
new file mode 100644
index 0000000..2884a00
--- /dev/null
+++ b/drivers/net/nfp/nfpcore/nfp_target.h
@@ -0,0 +1,579 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Netronome Systems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef NFP_TARGET_H
+#define NFP_TARGET_H
+
+#include "nfp-common/nfp_resid.h"
+#include "nfp-common/nfp_cppat.h"
+#include "nfp-common/nfp_platform.h"
+#include "nfp_cpp.h"
+
+#define P32 1
+#define P64 2
+
+#define PUSHPULL(_pull, _push) (((_pull) << 4) | ((_push) << 0))
+
+#ifndef NFP_ERRNO
+#include <errno.h>
+#define NFP_ERRNO(x)    (errno = (x), -1)
+#endif
+
+static inline int
+pushpull_width(int pp)
+{
+	pp &= 0xf;
+
+	if (pp == 0)
+		return NFP_ERRNO(EINVAL);
+	return (2 << pp);
+}
+
+#define PUSH_WIDTH(_pushpull)      pushpull_width((_pushpull) >> 0)
+#define PULL_WIDTH(_pushpull)      pushpull_width((_pushpull) >> 4)
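+
+/*
+ * Editor's example (not part of the original patch): PUSHPULL(P64, P32)
+ * stores the pull width in the high nibble and the push width in the low
+ * nibble, so PULL_WIDTH() gives 2 << 2 = 8 bytes and PUSH_WIDTH() gives
+ * 2 << 1 = 4 bytes for that encoding.
+ */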
+
+static inline int
+target_rw(uint32_t cpp_id, int pp, int start, int len)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < start || island > (start + len)))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 0):
+		return PUSHPULL(0, pp);
+	case NFP_CPP_ID(0, 1, 0):
+		return PUSHPULL(pp, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0):
+		return PUSHPULL(pp, pp);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_nbi_dma(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 0): /* ReadNbiDma */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 1, 0): /* WriteNbiDma */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0):
+		return PUSHPULL(P64, P64);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_nbi_stats(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 0): /* ReadNbiStats */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 1, 0): /* WriteNbiStats */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0):
+		return PUSHPULL(P64, P64);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_nbi_tm(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 0): /* ReadNbiTM */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 1, 0):  /* WriteNbiTM */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0):
+		return PUSHPULL(P64, P64);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_nbi_ppc(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 0): /* ReadNbiPreclassifier */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 1, 0): /* WriteNbiPreclassifier */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0):
+		return PUSHPULL(P64, P64);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_nbi(uint32_t cpp_id, uint64_t address)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+	uint64_t rel_addr = address & 0x3fFFFF;
+
+	if (island && (island < 8 || island > 9))
+		return NFP_ERRNO(EINVAL);
+
+	if (rel_addr < (1 << 20))
+		return nfp6000_nbi_dma(cpp_id);
+	if (rel_addr < (2 << 20))
+		return nfp6000_nbi_stats(cpp_id);
+	if (rel_addr < (3 << 20))
+		return nfp6000_nbi_tm(cpp_id);
+	return nfp6000_nbi_ppc(cpp_id);
+}
+
+/*
+ * This function ONLY includes items that can be done with a read or write of
+ * 32-bit or 64-bit words. All others are not listed.
+ */
+static inline int
+nfp6000_mu_common(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 0): /* read_be/write_be */
+		return PUSHPULL(P64, P64);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 1): /* read_le/write_le */
+		return PUSHPULL(P64, P64);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 2): /* {read/write}_swap_be */
+		return PUSHPULL(P64, P64);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 3): /* {read/write}_swap_le */
+		return PUSHPULL(P64, P64);
+	case NFP_CPP_ID(0, 0, 0): /* read_be */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 0, 1): /* read_le */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 0, 2): /* read_swap_be */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 0, 3): /* read_swap_le */
+		return PUSHPULL(0, P64);
+	case NFP_CPP_ID(0, 1, 0): /* write_be */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, 1, 1): /* write_le */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, 1, 2): /* write_swap_be */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, 1, 3): /* write_swap_le */
+		return PUSHPULL(P64, 0);
+	case NFP_CPP_ID(0, 3, 0): /* atomic_read */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 3, 2): /* mask_compare_write */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 4, 0): /* atomic_write */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 4, 2): /* atomic_write_imm */
+		return PUSHPULL(0, 0);
+	case NFP_CPP_ID(0, 4, 3): /* swap_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 5, 0): /* set */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 5, 3): /* test_set_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 6, 0): /* clr */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 6, 3): /* test_clr_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 7, 0): /* add */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 7, 3): /* test_add_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 8, 0): /* addsat */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 8, 3): /* test_subsat_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 9, 0): /* sub */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 9, 3): /* test_sub_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 10, 0): /* subsat */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 10, 3): /* test_subsat_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 13, 0): /* microq128_get */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 13, 1): /* microq128_pop */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 13, 2): /* microq128_put */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 15, 0): /* xor */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 15, 3): /* test_xor_imm */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 28, 0): /* read32_be */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 28, 1): /* read32_le */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 28, 2): /* read32_swap_be */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 28, 3): /* read32_swap_le */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 31, 0): /* write32_be */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 31, 1): /* write32_le */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 31, 2): /* write32_swap_be */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 31, 3): /* write32_swap_le */
+		return PUSHPULL(P32, 0);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp6000_mu_ctm(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 16, 1): /* packet_read_packet_status */
+		return PUSHPULL(0, P32);
+	default:
+		return nfp6000_mu_common(cpp_id);
+	}
+}
+
+static inline int
+nfp6000_mu_emu(uint32_t cpp_id)
+{
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 18, 0): /* read_queue */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 18, 1): /* read_queue_ring */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 18, 2): /* write_queue */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 18, 3): /* write_queue_ring */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 20, 2): /* journal */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 21, 0): /* get */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 21, 1): /* get_eop */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 21, 2): /* get_freely */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 22, 0): /* pop */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 22, 1): /* pop_eop */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 22, 2): /* pop_freely */
+		return PUSHPULL(0, P32);
+	default:
+		return nfp6000_mu_common(cpp_id);
+	}
+}
+
+static inline int
+nfp6000_mu_imu(uint32_t cpp_id)
+{
+	return nfp6000_mu_common(cpp_id);
+}
+
+static inline int
+nfp6000_mu(uint32_t cpp_id, uint64_t address)
+{
+	int pp;
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island == 0) {
+		if (address < 0x2000000000ULL)
+			pp = nfp6000_mu_ctm(cpp_id);
+		else if (address < 0x8000000000ULL)
+			pp = nfp6000_mu_emu(cpp_id);
+		else if (address < 0x9800000000ULL)
+			pp = nfp6000_mu_ctm(cpp_id);
+		else if (address < 0x9C00000000ULL)
+			pp = nfp6000_mu_emu(cpp_id);
+		else if (address < 0xA000000000ULL)
+			pp = nfp6000_mu_imu(cpp_id);
+		else
+			pp = nfp6000_mu_ctm(cpp_id);
+	} else if (island >= 24 && island <= 27) {
+		pp = nfp6000_mu_emu(cpp_id);
+	} else if (island >= 28 && island <= 31) {
+		pp = nfp6000_mu_imu(cpp_id);
+	} else if (island == 1 ||
+		   (island >= 4 && island <= 7) ||
+		   (island >= 12 && island <= 13) ||
+		   (island >= 32 && island <= 47) ||
+		   (island >= 48 && island <= 51)) {
+		pp = nfp6000_mu_ctm(cpp_id);
+	} else {
+		pp = NFP_ERRNO(EINVAL);
+	}
+
+	return pp;
+}
+
+static inline int
+nfp6000_ila(uint32_t cpp_id)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < 48 || island > 51))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 1): /* read_check_error */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 2, 0): /* read_int */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 3, 0): /* write_int */
+		return PUSHPULL(P32, 0);
+	default:
+		return target_rw(cpp_id, P32, 48, 4);
+	}
+}
+
+static inline int
+nfp6000_pci(uint32_t cpp_id)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < 4 || island > 7))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 2, 0):
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 3, 0):
+		return PUSHPULL(P32, 0);
+	default:
+		return target_rw(cpp_id, P32, 4, 4);
+	}
+}
+
+static inline int
+nfp6000_crypto(uint32_t cpp_id)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < 12 || island > 15))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 2, 0):
+		return PUSHPULL(P64, 0);
+	default:
+		return target_rw(cpp_id, P64, 12, 4);
+	}
+}
+
+static inline int
+nfp6000_cap_xpb(uint32_t cpp_id)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < 1 || island > 63))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 1): /* RingGet */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 0, 2): /* Interthread Signal */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 1, 1): /* RingPut */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 1, 2): /* CTNNWr */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 2, 0): /* ReflectRd, signal none */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 2, 1): /* ReflectRd, signal self */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 2, 2): /* ReflectRd, signal remote */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 2, 3): /* ReflectRd, signal both */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 3, 0): /* ReflectWr, signal none */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 3, 1): /* ReflectWr, signal self */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 3, 2): /* ReflectWr, signal remote */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 3, 3): /* ReflectWr, signal both */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, NFP_CPP_ACTION_RW, 1):
+		return PUSHPULL(P32, P32);
+	default:
+		return target_rw(cpp_id, P32, 1, 63);
+	}
+}
+
+static inline int
+nfp6000_cls(uint32_t cpp_id)
+{
+	int island = NFP_CPP_ID_ISLAND_of(cpp_id);
+
+	if (island && (island < 1 || island > 63))
+		return NFP_ERRNO(EINVAL);
+
+	switch (cpp_id & NFP_CPP_ID(0, ~0, ~0)) {
+	case NFP_CPP_ID(0, 0, 3): /* xor */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 2, 0): /* set */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 2, 1): /* clr */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 4, 0): /* add */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 4, 1): /* add64 */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 6, 0): /* sub */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 6, 1): /* sub64 */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 6, 2): /* subsat */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 8, 2): /* hash_mask */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 8, 3): /* hash_clear */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 9, 0): /* ring_get */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 9, 1): /* ring_pop */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 9, 2): /* ring_get_freely */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 9, 3): /* ring_pop_freely */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 10, 0): /* ring_put */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 10, 2): /* ring_journal */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 14, 0): /* reflect_write_sig_local */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 15, 1):  /* reflect_read_sig_local */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 17, 2): /* statistic */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 24, 0): /* ring_read */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 24, 1): /* ring_write */
+		return PUSHPULL(P32, 0);
+	case NFP_CPP_ID(0, 25, 0): /* ring_workq_add_thread */
+		return PUSHPULL(0, P32);
+	case NFP_CPP_ID(0, 25, 1): /* ring_workq_add_work */
+		return PUSHPULL(P32, 0);
+	default:
+		return target_rw(cpp_id, P32, 0, 64);
+	}
+}
+
+static inline int
+nfp6000_target_pushpull(uint32_t cpp_id, uint64_t address)
+{
+	switch (NFP_CPP_ID_TARGET_of(cpp_id)) {
+	case NFP6000_CPPTGT_NBI:
+		return nfp6000_nbi(cpp_id, address);
+	case NFP6000_CPPTGT_VQDR:
+		return target_rw(cpp_id, P32, 24, 4);
+	case NFP6000_CPPTGT_ILA:
+		return nfp6000_ila(cpp_id);
+	case NFP6000_CPPTGT_MU:
+		return nfp6000_mu(cpp_id, address);
+	case NFP6000_CPPTGT_PCIE:
+		return nfp6000_pci(cpp_id);
+	case NFP6000_CPPTGT_ARM:
+		if (address < 0x10000)
+			return target_rw(cpp_id, P64, 1, 1);
+		else
+			return target_rw(cpp_id, P32, 1, 1);
+	case NFP6000_CPPTGT_CRYPTO:
+		return nfp6000_crypto(cpp_id);
+	case NFP6000_CPPTGT_CTXPB:
+		return nfp6000_cap_xpb(cpp_id);
+	case NFP6000_CPPTGT_CLS:
+		return nfp6000_cls(cpp_id);
+	case 0:
+		return target_rw(cpp_id, P32, 4, 4);
+	default:
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp_target_pushpull_width(int pp, int write_not_read)
+{
+	if (pp < 0)
+		return pp;
+
+	if (write_not_read)
+		return PULL_WIDTH(pp);
+	else
+		return PUSH_WIDTH(pp);
+}
+
+static inline int
+nfp6000_target_action_width(uint32_t cpp_id, uint64_t address,
+			    int write_not_read)
+{
+	int pp;
+
+	pp = nfp6000_target_pushpull(cpp_id, address);
+
+	return nfp_target_pushpull_width(pp, write_not_read);
+}
+
+static inline int
+nfp_target_action_width(uint32_t model, uint32_t cpp_id, uint64_t address,
+			int write_not_read)
+{
+	if (NFP_CPP_MODEL_IS_6000(model)) {
+		return nfp6000_target_action_width(cpp_id, address,
+						   write_not_read);
+	} else {
+		return NFP_ERRNO(EINVAL);
+	}
+}
+
+static inline int
+nfp_target_cpp(uint32_t cpp_island_id, uint64_t cpp_island_address,
+	       uint32_t *cpp_target_id, uint64_t *cpp_target_address,
+	       const uint32_t *imb_table)
+{
+	int err;
+	int island = NFP_CPP_ID_ISLAND_of(cpp_island_id);
+	int target = NFP_CPP_ID_TARGET_of(cpp_island_id);
+	uint32_t imb;
+
+	if (target < 0 || target >= 16)
+		return NFP_ERRNO(EINVAL);
+
+	if (island == 0) {
+		/* Already translated */
+		*cpp_target_id = cpp_island_id;
+		*cpp_target_address = cpp_island_address;
+		return 0;
+	}
+
+	if (!imb_table) {
+		/* CPP + Island only allowed on systems with IMB tables */
+		return NFP_ERRNO(EINVAL);
+	}
+
+	imb = imb_table[target];
+
+	*cpp_target_address = cpp_island_address;
+	err = _nfp6000_cppat_addr_encode(cpp_target_address, island, target,
+					 ((imb >> 13) & 7),
+					 ((imb >> 12) & 1),
+					 ((imb >> 6) & 0x3f),
+					 ((imb >> 0) & 0x3f));
+	if (err == 0) {
+		*cpp_target_id =
+		    NFP_CPP_ID(target, NFP_CPP_ID_ACTION_of(cpp_island_id),
+			       NFP_CPP_ID_TOKEN_of(cpp_island_id));
+	}
+
+	return err;
+}
+
+#endif /* NFP_TARGET_H */
-- 
1.9.1

^ permalink raw reply	[relevance 1%]

* Re: [dpdk-dev] [PATCH v1 01/16] ethdev: update ABI for flow API functions
  2018-04-05 12:44  9%     ` Adrien Mazarguil
@ 2018-04-05 13:36  7%       ` Thomas Monjalon
  0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2018-04-05 13:36 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ferruh Yigit

05/04/2018 14:44, Adrien Mazarguil:
> On Thu, Apr 05, 2018 at 12:06:10PM +0200, Thomas Monjalon wrote:
> > 04/04/2018 17:56, Adrien Mazarguil:
> > > Subsequent patches will modify existing types and slightly alter the
> > > behavior of the flow API. This warrants a major ABI breakage.
> > > 
> > > While it is already taken care of for 18.05 (LIBABIVER was updated to
> > > version 9 by a prior commit), this patch explicitly adds the affected flow
> > > API functions as a safety measure.
> > 
> > I don't understand this patch.
> > 
> > If the API is broken, you must move the function from old block to
> > the new one.
> 
> Missed that part, I'll update it.
> 
> > And it must be done in the patch modifying the function.
> 
> About that, almost every patch in this series breaks the ABI in its own
> way. This left me with two options: either updating these functions once and
> for all and explaining why in a dedicated patch, or updating them in the
> first patch with an ABI impact, with subsequent patches piggybacking on that
> change.
> 
> Unless there's a way to update the map file for each patch that breaks ABI,
> I think the former is more consistent, but I don't mind if you prefer the
> latter. What do you suggest?

The ABI information must be updated when breaking (2nd solution).

^ permalink raw reply	[relevance 7%]

* [dpdk-dev] [PATCH] eal/service: remove experimental tags
@ 2018-04-05 13:15  9% Harry van Haaren
  2018-04-06  6:18  0% ` Jerin Jacob
  0 siblings, 1 reply; 200+ results
From: Harry van Haaren @ 2018-04-05 13:15 UTC (permalink / raw)
  To: dev; +Cc: Harry van Haaren

This commit removes the experimental tags from the
service cores functions, they now become part of the
main DPDK API/ABI.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

 MAINTAINERS                                        |   2 +-
 doc/guides/rel_notes/release_18_05.rst             |   7 ++
 examples/service_cores/Makefile                    |   3 -
 examples/service_cores/meson.build                 |   1 -
 lib/librte_eal/common/include/rte_service.h        | 117 ++++-----------------
 .../common/include/rte_service_component.h         |  38 ++-----
 lib/librte_eal/common/rte_service.c                |  55 +++++-----
 lib/librte_eal/rte_eal_version.map                 |  38 ++++---
 8 files changed, 87 insertions(+), 174 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index ed3251d..d10c27d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -156,7 +156,7 @@ F: test/test/test_mp_secondary.c
 F: examples/multi_process/
 F: doc/guides/sample_app_ug/multi_process.rst
 
-Service Cores - EXPERIMENTAL
+Service Cores
 M: Harry van Haaren <harry.van.haaren@intel.com>
 F: lib/librte_eal/common/include/rte_service.h
 F: lib/librte_eal/common/include/rte_service_component.h
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index e5fac1c..940a308 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -72,6 +72,13 @@ API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Service Cores is no longer marked as experimental.**
+
+  The service cores functions are no longer marked as experimental, and have
+  become part of the normal DPDK API and ABI. Any future ABI changes will be
+  announced at least one release before the ABI change is made. There are no
+  ABI breaking changes planned.
+
 
 ABI Changes
 -----------
diff --git a/examples/service_cores/Makefile b/examples/service_cores/Makefile
index 3156e35..a4d6b7b 100644
--- a/examples/service_cores/Makefile
+++ b/examples/service_cores/Makefile
@@ -23,8 +23,6 @@ CFLAGS += -O3 $(shell pkg-config --cflags libdpdk)
 LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk)
 LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk)
 
-CFLAGS += -DALLOW_EXPERIMENTAL_API
-
 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
 	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
 
@@ -50,7 +48,6 @@ RTE_TARGET ?= x86_64-native-linuxapp-gcc
 
 include $(RTE_SDK)/mk/rte.vars.mk
 
-CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += $(WERROR_FLAGS)
 
 # workaround for a gcc bug with noreturn attribute
diff --git a/examples/service_cores/meson.build b/examples/service_cores/meson.build
index 2b0a250..c34e11e 100644
--- a/examples/service_cores/meson.build
+++ b/examples/service_cores/meson.build
@@ -6,7 +6,6 @@
 # To build this example as a standalone application with an already-installed
 # DPDK instance, use 'make'
 
-allow_experimental_apis = true
 sources = files(
 	'main.c'
 )
diff --git a/lib/librte_eal/common/include/rte_service.h b/lib/librte_eal/common/include/rte_service.h
index 211eb37..aea4d91 100644
--- a/lib/librte_eal/common/include/rte_service.h
+++ b/lib/librte_eal/common/include/rte_service.h
@@ -47,9 +47,6 @@ extern "C" {
 #define RTE_SERVICE_CAP_MT_SAFE (1 << 0)
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  *  Return the number of services registered.
  *
  * The number of services registered can be passed to *rte_service_get_by_id*,
@@ -57,12 +54,9 @@ extern "C" {
  *
  * @return The number of services registered.
  */
-uint32_t __rte_experimental rte_service_get_count(void);
+uint32_t rte_service_get_count(void);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Return the id of a service by name.
  *
  * This function provides the id of the service using the service name as
@@ -84,24 +78,17 @@ uint32_t __rte_experimental rte_service_get_count(void);
  * @retval -EINVAL Null *service_id* pointer provided
  * @retval -ENODEV No such service registered
  */
-int32_t __rte_experimental rte_service_get_by_name(const char *name,
-					       uint32_t *service_id);
+int32_t rte_service_get_by_name(const char *name, uint32_t *service_id);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Return the name of the service.
  *
  * @return A pointer to the name of the service. The returned pointer remains
  *         in ownership of the service, and the application must not free it.
  */
-const char __rte_experimental *rte_service_get_name(uint32_t id);
+const char *rte_service_get_name(uint32_t id);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Check if a service has a specific capability.
  *
  * This function returns whether *service* implements *capability*.
@@ -109,13 +96,9 @@ const char __rte_experimental *rte_service_get_name(uint32_t id);
  * @retval 1 Capability supported by this service instance
  * @retval 0 Capability not supported by this service instance
  */
-int32_t __rte_experimental rte_service_probe_capability(uint32_t id,
-						    uint32_t capability);
+int32_t rte_service_probe_capability(uint32_t id, uint32_t capability);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Map or unmap a lcore to a service.
  *
  * Each core can be added or removed from running a specific service. This
@@ -134,13 +117,10 @@ int32_t __rte_experimental rte_service_probe_capability(uint32_t id,
  * @retval 0 lcore map updated successfully
  * @retval -EINVAL An invalid service or lcore was provided.
  */
-int32_t __rte_experimental rte_service_map_lcore_set(uint32_t service_id,
-				  uint32_t lcore, uint32_t enable);
+int32_t rte_service_map_lcore_set(uint32_t service_id, uint32_t lcore,
+		uint32_t enable);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Retrieve the mapping of an lcore to a service.
  *
  * @param service_id the service to apply the lcore to
@@ -150,13 +130,9 @@ int32_t __rte_experimental rte_service_map_lcore_set(uint32_t service_id,
  * @retval 0 lcore is not mapped to service
  * @retval -EINVAL An invalid service or lcore was provided.
  */
-int32_t __rte_experimental rte_service_map_lcore_get(uint32_t service_id,
-						 uint32_t lcore);
+int32_t rte_service_map_lcore_get(uint32_t service_id, uint32_t lcore);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Set the runstate of the service.
  *
  * Each service is either running or stopped. Setting a non-zero runstate
@@ -168,12 +144,9 @@ int32_t __rte_experimental rte_service_map_lcore_get(uint32_t service_id,
  * @retval 0 The service was successfully started
  * @retval -EINVAL Invalid service id
  */
-int32_t __rte_experimental rte_service_runstate_set(uint32_t id, uint32_t runstate);
+int32_t rte_service_runstate_set(uint32_t id, uint32_t runstate);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Get the runstate for the service with *id*. See *rte_service_runstate_set*
  * for details of runstates. A service can call this function to ensure that
  * the application has indicated that it will receive CPU cycles. Either a
@@ -186,12 +159,9 @@ int32_t __rte_experimental rte_service_runstate_set(uint32_t id, uint32_t runsta
  * @retval 0 Service is stopped
  * @retval -EINVAL Invalid service id
  */
-int32_t __rte_experimental rte_service_runstate_get(uint32_t id);
+int32_t rte_service_runstate_get(uint32_t id);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Enable or disable the check for a service-core being mapped to the service.
  * An application can disable the check when it takes responsibility to run a
  * service itself using *rte_service_run_iter_on_app_lcore*.
@@ -202,13 +172,9 @@ int32_t __rte_experimental rte_service_runstate_get(uint32_t id);
  * @retval 0 Success
  * @retval -EINVAL Invalid service ID
  */
-int32_t __rte_experimental rte_service_set_runstate_mapped_check(uint32_t id,
-							     int32_t enable);
+int32_t rte_service_set_runstate_mapped_check(uint32_t id, int32_t enable);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * This function runs a service callback from a non-service lcore.
  *
  * This function is designed to enable gradual porting to service cores, and
@@ -241,13 +207,10 @@ int32_t __rte_experimental rte_service_set_runstate_mapped_check(uint32_t id,
  * @retval -ENOEXEC Service is not in a run-able state
  * @retval -EINVAL Invalid service id
  */
-int32_t __rte_experimental rte_service_run_iter_on_app_lcore(uint32_t id,
+int32_t rte_service_run_iter_on_app_lcore(uint32_t id,
 		uint32_t serialize_multithread_unsafe);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Start a service core.
  *
  * Starting a core makes the core begin polling. Any services assigned to it
@@ -259,12 +222,9 @@ int32_t __rte_experimental rte_service_run_iter_on_app_lcore(uint32_t id,
  * @retval -EINVAL Failed to start core. The *lcore_id* passed in is not
  *          currently assigned to be a service core.
  */
-int32_t __rte_experimental rte_service_lcore_start(uint32_t lcore_id);
+int32_t rte_service_lcore_start(uint32_t lcore_id);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Stop a service core.
  *
  * Stopping a core makes the core become idle, but it remains assigned as a
@@ -278,12 +238,9 @@ int32_t __rte_experimental rte_service_lcore_start(uint32_t lcore_id);
  *          The application must stop the service first, and then stop the
  *          lcore.
  */
-int32_t __rte_experimental rte_service_lcore_stop(uint32_t lcore_id);
+int32_t rte_service_lcore_stop(uint32_t lcore_id);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Adds lcore to the list of service cores.
  *
  * This function can be used at runtime in order to modify the service core
@@ -294,12 +251,9 @@ int32_t __rte_experimental rte_service_lcore_stop(uint32_t lcore_id);
  * @retval -EALREADY lcore is already added to the service core list
  * @retval -EINVAL Invalid lcore provided
  */
-int32_t __rte_experimental rte_service_lcore_add(uint32_t lcore);
+int32_t rte_service_lcore_add(uint32_t lcore);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Removes lcore from the list of service cores.
  *
  * This can fail if the core is not stopped, see *rte_service_core_stop*.
@@ -308,12 +262,9 @@ int32_t __rte_experimental rte_service_lcore_add(uint32_t lcore);
  * @retval -EBUSY Lcore is not stopped, stop service core before removing.
  * @retval -EINVAL failed to add lcore to service core mask.
  */
-int32_t __rte_experimental rte_service_lcore_del(uint32_t lcore);
+int32_t rte_service_lcore_del(uint32_t lcore);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Retrieve the number of service cores currently available.
  *
  * This function returns the integer count of service cores available. The
@@ -325,24 +276,18 @@ int32_t __rte_experimental rte_service_lcore_del(uint32_t lcore);
  *
  * @return The number of service cores currently configured.
  */
-int32_t __rte_experimental rte_service_lcore_count(void);
+int32_t rte_service_lcore_count(void);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Resets all service core mappings. This does not remove the service cores
  * from duty, just unmaps all services / cores, and stops() the service cores.
  * The runstate of services is not modified.
  *
  * @retval 0 Success
  */
-int32_t __rte_experimental rte_service_lcore_reset_all(void);
+int32_t rte_service_lcore_reset_all(void);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Enable or disable statistics collection for *service*.
  *
  * This function enables per core, per-service cycle count collection.
@@ -351,13 +296,9 @@ int32_t __rte_experimental rte_service_lcore_reset_all(void);
  * @retval 0 Success
  * @retval -EINVAL Invalid service pointer passed
  */
-int32_t __rte_experimental rte_service_set_stats_enable(uint32_t id,
-						    int32_t enable);
+int32_t rte_service_set_stats_enable(uint32_t id, int32_t enable);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Retrieve the list of currently enabled service cores.
  *
  * This function fills in an application supplied array, with each element
@@ -373,12 +314,9 @@ int32_t __rte_experimental rte_service_set_stats_enable(uint32_t id,
  *          service core list. No items have been populated, call this function
  *          with a size of at least *rte_service_core_count* items.
  */
-int32_t __rte_experimental rte_service_lcore_list(uint32_t array[], uint32_t n);
+int32_t rte_service_lcore_list(uint32_t array[], uint32_t n);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
 * Get the number of services running on the supplied lcore.
  *
  * @param lcore Id of the service core.
@@ -386,19 +324,16 @@ int32_t __rte_experimental rte_service_lcore_list(uint32_t array[], uint32_t n);
  * @retval -EINVAL Invalid lcore provided
  * @retval -ENOTSUP The provided lcore is not a service core.
  */
-int32_t __rte_experimental rte_service_lcore_count_services(uint32_t lcore);
+int32_t rte_service_lcore_count_services(uint32_t lcore);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Dumps any information available about the service. When id is UINT32_MAX,
  * this function dumps info for all services.
  *
  * @retval 0 Statistics have been successfully dumped
  * @retval -EINVAL Invalid service id provided
  */
-int32_t __rte_experimental rte_service_dump(FILE *f, uint32_t id);
+int32_t rte_service_dump(FILE *f, uint32_t id);
 
 /**
  * Returns the number of cycles that this service has consumed
@@ -411,28 +346,22 @@ int32_t __rte_experimental rte_service_dump(FILE *f, uint32_t id);
 #define RTE_SERVICE_ATTR_CALL_COUNT 1
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Get an attribute from a service.
  *
  * @retval 0 Success, the attribute value has been written to *attr_value*.
  *         -EINVAL Invalid id, attr_id or attr_value was NULL.
  */
-int32_t __rte_experimental rte_service_attr_get(uint32_t id, uint32_t attr_id,
+int32_t rte_service_attr_get(uint32_t id, uint32_t attr_id,
 		uint32_t *attr_value);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Reset all attribute values of a service.
  *
  * @param id The service to reset all statistics of
  * @retval 0 Successfully reset attributes
  *         -EINVAL Invalid service id provided
  */
-int32_t __rte_experimental rte_service_attr_reset_all(uint32_t id);
+int32_t rte_service_attr_reset_all(uint32_t id);
 
 #ifdef __cplusplus
 }
diff --git a/lib/librte_eal/common/include/rte_service_component.h b/lib/librte_eal/common/include/rte_service_component.h
index 9ba4aa2..c12adbc 100644
--- a/lib/librte_eal/common/include/rte_service_component.h
+++ b/lib/librte_eal/common/include/rte_service_component.h
@@ -13,17 +13,11 @@
 #include <rte_service.h>
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Signature of callback function to run a service.
  */
 typedef int32_t (*rte_service_func)(void *args);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * The specification of a service.
  *
  * This struct contains metadata about the service itself, the callback
@@ -47,9 +41,6 @@ struct rte_service_spec {
 };
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Register a new service.
  *
 * A service represents a component that requires CPU time periodically to
@@ -73,14 +64,10 @@ struct rte_service_spec {
  *         -EINVAL Attempted to register an invalid service (eg, no callback
  *         set)
  */
-int32_t __rte_experimental
-rte_service_component_register(const struct rte_service_spec *spec,
-			       uint32_t *service_id);
+int32_t rte_service_component_register(const struct rte_service_spec *spec,
+		uint32_t *service_id);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Unregister a service component.
  *
  * The service being removed must be stopped before calling this function.
@@ -89,12 +76,9 @@ rte_service_component_register(const struct rte_service_spec *spec,
  * @retval -EBUSY The service is currently running, stop the service before
  *          calling unregister. No action has been taken.
  */
-int32_t __rte_experimental rte_service_component_unregister(uint32_t id);
+int32_t rte_service_component_unregister(uint32_t id);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
 * Private function to allow EAL to initialize default mappings.
 *
 * This function iterates over all the services, and maps them to the available
@@ -107,12 +91,9 @@ int32_t __rte_experimental rte_service_component_unregister(uint32_t id);
  * @retval -ENODEV Error in enabling service lcore on a service
  * @retval -ENOEXEC Error when starting services
  */
-int32_t __rte_experimental rte_service_start_with_defaults(void);
+int32_t rte_service_start_with_defaults(void);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Set the backend runstate of a component.
  *
  * This function allows services to be registered at startup, but not yet
@@ -124,13 +105,9 @@ int32_t __rte_experimental rte_service_start_with_defaults(void);
  *
  * @retval 0 Success
  */
-int32_t __rte_experimental rte_service_component_runstate_set(uint32_t id,
-							  uint32_t runstate);
+int32_t rte_service_component_runstate_set(uint32_t id, uint32_t runstate);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * Initialize the service library.
  *
  * In order to use the service library, it must be initialized. EAL initializes
@@ -142,14 +119,11 @@ int32_t __rte_experimental rte_service_component_runstate_set(uint32_t id,
 int32_t rte_service_init(void);
 
 /**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
  * @internal Free up the memory that has been initialized.
  * This routine is to be invoked prior to process termination.
  *
  * @retval None
  */
-void __rte_experimental rte_service_finalize(void);
+void rte_service_finalize(void);
 
 #endif /* _RTE_SERVICE_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/rte_service.c b/lib/librte_eal/common/rte_service.c
index be9b5e6..73507aa 100644
--- a/lib/librte_eal/common/rte_service.c
+++ b/lib/librte_eal/common/rte_service.c
@@ -115,7 +115,7 @@ int32_t rte_service_init(void)
 	return -ENOMEM;
 }
 
-void __rte_experimental
+void
 rte_service_finalize(void)
 {
 	if (!rte_service_library_initialized)
@@ -161,7 +161,7 @@ service_mt_safe(struct rte_service_spec_impl *s)
 	return !!(s->spec.capabilities & RTE_SERVICE_CAP_MT_SAFE);
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_set_stats_enable(uint32_t id, int32_t enabled)
 {
 	struct rte_service_spec_impl *s;
@@ -175,7 +175,7 @@ rte_service_set_stats_enable(uint32_t id, int32_t enabled)
 	return 0;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_set_runstate_mapped_check(uint32_t id, int32_t enabled)
 {
 	struct rte_service_spec_impl *s;
@@ -189,13 +189,13 @@ rte_service_set_runstate_mapped_check(uint32_t id, int32_t enabled)
 	return 0;
 }
 
-uint32_t __rte_experimental
+uint32_t
 rte_service_get_count(void)
 {
 	return rte_service_count;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_get_by_name(const char *name, uint32_t *service_id)
 {
 	if (!service_id)
@@ -213,7 +213,7 @@ rte_service_get_by_name(const char *name, uint32_t *service_id)
 	return -ENODEV;
 }
 
-const char * __rte_experimental
+const char *
 rte_service_get_name(uint32_t id)
 {
 	struct rte_service_spec_impl *s;
@@ -221,7 +221,7 @@ rte_service_get_name(uint32_t id)
 	return s->spec.name;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_probe_capability(uint32_t id, uint32_t capability)
 {
 	struct rte_service_spec_impl *s;
@@ -229,7 +229,7 @@ rte_service_probe_capability(uint32_t id, uint32_t capability)
 	return !!(s->spec.capabilities & capability);
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_component_register(const struct rte_service_spec *spec,
 			       uint32_t *id_ptr)
 {
@@ -262,7 +262,7 @@ rte_service_component_register(const struct rte_service_spec *spec,
 	return 0;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_component_unregister(uint32_t id)
 {
 	uint32_t i;
@@ -283,7 +283,7 @@ rte_service_component_unregister(uint32_t id)
 	return 0;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_component_runstate_set(uint32_t id, uint32_t runstate)
 {
 	struct rte_service_spec_impl *s;
@@ -298,7 +298,7 @@ rte_service_component_runstate_set(uint32_t id, uint32_t runstate)
 	return 0;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_runstate_set(uint32_t id, uint32_t runstate)
 {
 	struct rte_service_spec_impl *s;
@@ -313,7 +313,7 @@ rte_service_runstate_set(uint32_t id, uint32_t runstate)
 	return 0;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_runstate_get(uint32_t id)
 {
 	struct rte_service_spec_impl *s;
@@ -374,7 +374,7 @@ service_run(uint32_t i, struct core_state *cs, uint64_t service_mask)
 	return 0;
 }
 
-int32_t __rte_experimental rte_service_run_iter_on_app_lcore(uint32_t id,
+int32_t rte_service_run_iter_on_app_lcore(uint32_t id,
 		uint32_t serialize_mt_unsafe)
 {
 	/* run service on calling core, using all-ones as the service mask */
@@ -430,7 +430,7 @@ rte_service_runner_func(void *arg)
 	return 0;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_lcore_count(void)
 {
 	int32_t count = 0;
@@ -440,7 +440,7 @@ rte_service_lcore_count(void)
 	return count;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_lcore_list(uint32_t array[], uint32_t n)
 {
 	uint32_t count = rte_service_lcore_count();
@@ -463,7 +463,7 @@ rte_service_lcore_list(uint32_t array[], uint32_t n)
 	return count;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_lcore_count_services(uint32_t lcore)
 {
 	if (lcore >= RTE_MAX_LCORE)
@@ -476,7 +476,7 @@ rte_service_lcore_count_services(uint32_t lcore)
 	return __builtin_popcountll(cs->service_mask);
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_start_with_defaults(void)
 {
 	/* create a default mapping from cores to services, then start the
@@ -562,7 +562,7 @@ service_update(struct rte_service_spec *service, uint32_t lcore,
 	return 0;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_map_lcore_set(uint32_t id, uint32_t lcore, uint32_t enabled)
 {
 	struct rte_service_spec_impl *s;
@@ -571,7 +571,7 @@ rte_service_map_lcore_set(uint32_t id, uint32_t lcore, uint32_t enabled)
 	return service_update(&s->spec, lcore, &on, 0);
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_map_lcore_get(uint32_t id, uint32_t lcore)
 {
 	struct rte_service_spec_impl *s;
@@ -597,7 +597,7 @@ set_lcore_state(uint32_t lcore, int32_t state)
 	lcore_states[lcore].is_service_core = (state == ROLE_SERVICE);
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_lcore_reset_all(void)
 {
 	/* loop over cores, reset all to mask 0 */
@@ -617,7 +617,7 @@ rte_service_lcore_reset_all(void)
 	return 0;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_lcore_add(uint32_t lcore)
 {
 	if (lcore >= RTE_MAX_LCORE)
@@ -636,7 +636,7 @@ rte_service_lcore_add(uint32_t lcore)
 	return rte_eal_wait_lcore(lcore);
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_lcore_del(uint32_t lcore)
 {
 	if (lcore >= RTE_MAX_LCORE)
@@ -655,7 +655,7 @@ rte_service_lcore_del(uint32_t lcore)
 	return 0;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_lcore_start(uint32_t lcore)
 {
 	if (lcore >= RTE_MAX_LCORE)
@@ -678,7 +678,7 @@ rte_service_lcore_start(uint32_t lcore)
 	return ret;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_lcore_stop(uint32_t lcore)
 {
 	if (lcore >= RTE_MAX_LCORE)
@@ -708,7 +708,7 @@ rte_service_lcore_stop(uint32_t lcore)
 	return 0;
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_attr_get(uint32_t id, uint32_t attr_id, uint32_t *attr_value)
 {
 	struct rte_service_spec_impl *s;
@@ -753,7 +753,7 @@ rte_service_dump_one(FILE *f, struct rte_service_spec_impl *s,
 			s->cycles_spent, s->cycles_spent / calls);
 }
 
-int32_t __rte_experimental
+int32_t
 rte_service_attr_reset_all(uint32_t id)
 {
 	struct rte_service_spec_impl *s;
@@ -781,7 +781,8 @@ service_dump_calls_per_lcore(FILE *f, uint32_t lcore, uint32_t reset)
 	fprintf(f, "\n");
 }
 
-int32_t __rte_experimental rte_service_dump(FILE *f, uint32_t id)
+int32_t
+rte_service_dump(FILE *f, uint32_t id)
 {
 	uint32_t i;
 	int print_one = (id != UINT32_MAX);
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index dd38783..5fdbb56 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -211,28 +211,14 @@ DPDK_18.02 {
 
 }  DPDK_17.11;
 
-EXPERIMENTAL {
+DPDK_18.05 {
 	global:
 
-	rte_eal_cleanup;
-	rte_eal_devargs_insert;
-	rte_eal_devargs_parse;
-	rte_eal_devargs_remove;
-	rte_eal_hotplug_add;
-	rte_eal_hotplug_remove;
-	rte_eal_mbuf_user_pool_ops;
-	rte_log_register_type_and_pick_level;
-	rte_mp_action_register;
-	rte_mp_action_unregister;
-	rte_mp_reply;
-	rte_mp_request_sync;
-	rte_mp_request_async;
-	rte_mp_sendmsg;
 	rte_service_attr_get;
 	rte_service_attr_reset_all;
 	rte_service_component_register;
-	rte_service_component_unregister;
 	rte_service_component_runstate_set;
+	rte_service_component_unregister;
 	rte_service_dump;
 	rte_service_finalize;
 	rte_service_get_by_id;
@@ -256,6 +242,26 @@ EXPERIMENTAL {
 	rte_service_set_runstate_mapped_check;
 	rte_service_set_stats_enable;
 	rte_service_start_with_defaults;
+
+}  DPDK_18.02;
+
+EXPERIMENTAL {
+	global:
+
+	rte_eal_cleanup;
+	rte_eal_devargs_insert;
+	rte_eal_devargs_parse;
+	rte_eal_devargs_remove;
+	rte_eal_hotplug_add;
+	rte_eal_hotplug_remove;
+	rte_eal_mbuf_user_pool_ops;
+	rte_log_register_type_and_pick_level;
+	rte_mp_action_register;
+	rte_mp_action_unregister;
+	rte_mp_reply;
+	rte_mp_request_sync;
+	rte_mp_request_async;
+	rte_mp_sendmsg;
 	rte_socket_count;
 	rte_socket_id_by_idx;
 
-- 
2.7.4
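
For illustration only, here is a minimal sketch of how an application might
drive the service API being promoted above; the service name and lcore id
are assumptions for the example, not taken from the patch:

#include <rte_service.h>

static int
launch_service_on_lcore(void)
{
	uint32_t service_id;
	uint32_t lcore_id = 2; /* assumed to be an otherwise unused lcore */

	/* Look up a service registered elsewhere; the name is hypothetical. */
	if (rte_service_get_by_name("dummy_service", &service_id) < 0)
		return -1;

	/* Turn the lcore into a service core and map the service to it. */
	if (rte_service_lcore_add(lcore_id) < 0)
		return -1;
	if (rte_service_map_lcore_set(service_id, lcore_id, 1) < 0)
		return -1;

	/* Allow the service to run, then start the core polling it. */
	if (rte_service_runstate_set(service_id, 1) < 0)
		return -1;
	return rte_service_lcore_start(lcore_id);
}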

^ permalink raw reply	[relevance 9%]

* Re: [dpdk-dev] [PATCH v1 01/16] ethdev: update ABI for flow API functions
  2018-04-05 10:06  4%   ` Thomas Monjalon
@ 2018-04-05 12:44  9%     ` Adrien Mazarguil
  2018-04-05 13:36  7%       ` Thomas Monjalon
  0 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-05 12:44 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, Ferruh Yigit

On Thu, Apr 05, 2018 at 12:06:10PM +0200, Thomas Monjalon wrote:
> 04/04/2018 17:56, Adrien Mazarguil:
> > Subsequent patches will modify existing types and slightly alter the
> > behavior of the flow API. This warrants a major ABI breakage.
> > 
> > While it is already taken care of for 18.05 (LIBABIVER was updated to
> > version 9 by a prior commit), this patch explicitly adds the affected flow
> > API functions as a safety measure.
> 
> I don't understand this patch.
> 
> If the API is broken, you must move the function from old block to
> the new one.

Missed that part, I'll update it.

> And it must be done in the patch modifying the function.

About that, almost every patch in this series breaks the ABI in its own
way. This left me with two options: either updating these functions once and
for all and explaining why in a dedicated patch, or updating them in the
first patch with an ABI impact, with subsequent patches piggybacking on that
change.

Unless there's a way to update the map file for each patch that breaks ABI,
I think the former is more consistent, but I don't mind if you prefer the
latter. What do you suggest?

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[relevance 9%]

* Re: [dpdk-dev] [PATCH] doc: add meter API change to release notes
  2018-04-05 11:49  4% [dpdk-dev] [PATCH] doc: add meter API change to release notes Jasvinder Singh
@ 2018-04-05 12:03  0% ` Dumitrescu, Cristian
  0 siblings, 0 replies; 200+ results
From: Dumitrescu, Cristian @ 2018-04-05 12:03 UTC (permalink / raw)
  To: Singh, Jasvinder, dev



> -----Original Message-----
> From: Singh, Jasvinder
> Sent: Thursday, April 5, 2018 12:50 PM
> To: dev@dpdk.org
> Cc: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> Subject: [PATCH] doc: add meter API change to release notes
> 
> Update the release notes with meter api change to support configuration
> profiles.
> 
> Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> ---
>  doc/guides/rel_notes/release_18_05.rst | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/doc/guides/rel_notes/release_18_05.rst
> b/doc/guides/rel_notes/release_18_05.rst
> index e5fac1c..34222cd 100644
> --- a/doc/guides/rel_notes/release_18_05.rst
> +++ b/doc/guides/rel_notes/release_18_05.rst
> @@ -72,6 +72,16 @@ API Changes
>     Also, make sure to start the actual text at the margin.
>     =========================================================
> 
> +* **Meter API updated to accommodate configuration profiles.**
> +
> +  The meter API is changed to support meter configuration profiles. The
> +  configuration profile represents the set of configuration parameters
> +  for a given meter object, such as the rates and sizes for the token
> +  buckets. These configuration parameters were previously the part of meter
> +  object internal data structure. The separation of the configuration
> +  parameters from meter object data structure results in reducing its
> +  memory footprint which helps in better cache utilization when large number
> +  of meter objects are used.
> 
>  ABI Changes
>  -----------
> --
> 2.9.3

Acked-by: Cristian Dumitrescu <cristian.dumitrescu@intel.com>

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH] doc: add meter API change to release notes
@ 2018-04-05 11:49  4% Jasvinder Singh
  2018-04-05 12:03  0% ` Dumitrescu, Cristian
  0 siblings, 1 reply; 200+ results
From: Jasvinder Singh @ 2018-04-05 11:49 UTC (permalink / raw)
  To: dev; +Cc: cristian.dumitrescu

Update the release notes with the meter API change to support configuration
profiles.

Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
---
 doc/guides/rel_notes/release_18_05.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index e5fac1c..34222cd 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -72,6 +72,16 @@ API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Meter API updated to accommodate configuration profiles.**
+
+  The meter API is changed to support meter configuration profiles. The
+  configuration profile represents the set of configuration parameters
+  for a given meter object, such as the rates and sizes for the token
+  buckets. These configuration parameters were previously the part of meter
+  object internal data structure. The separation of the configuration
+  parameters from meter object data structure results in reducing its
+  memory footprint which helps in better cache utilization when large number
+  of meter objects are used.
 
 ABI Changes
 -----------
-- 
2.9.3

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v4] lib/librte_meter: add meter configuration profile
  2018-04-05 10:12  0%     ` Thomas Monjalon
@ 2018-04-05 11:00  0%       ` Dumitrescu, Cristian
  0 siblings, 0 replies; 200+ results
From: Dumitrescu, Cristian @ 2018-04-05 11:00 UTC (permalink / raw)
  To: Thomas Monjalon, Singh, Jasvinder; +Cc: dev



> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Thursday, April 5, 2018 11:12 AM
> To: Singh, Jasvinder <jasvinder.singh@intel.com>; Dumitrescu, Cristian
> <cristian.dumitrescu@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v4] lib/librte_meter: add meter
> configuration profile
> 
> 19/02/2018 22:12, Thomas Monjalon:
> > 08/01/2018 16:43, Jasvinder Singh:
> > > Signed-off-by: Cristian Dumitrescu <cristian.dumitrescu@intel.com>
> > > Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> >
> > Applied for 18.05 (was postponed to preserve 18.02 ABI), thanks.
> 
> We forgot to update the release notes about the API change.
> Please, could you send a patch to add it in the appropriate section?
> Thanks
> 

Will send a quick patch later today, thanks!

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v3 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters
  2018-04-04 18:56  3%     ` De Lara Guarch, Pablo
@ 2018-04-05 10:16  0%       ` Thomas Monjalon
  0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2018-04-05 10:16 UTC (permalink / raw)
  To: Horton, Remy
  Cc: dev, De Lara Guarch, Pablo, Mcnamara, John, Lu, Wenzhuo, Wu,
	Jingjing, Zhang, Qi Z, Xing, Beilei, Shreyansh Jain

04/04/2018 20:56, De Lara Guarch, Pablo:
> 
> API and ABI changes should be documented in release notes.

When sending a v4 for the API change, you can add my ack:

Acked-by: Thomas Monjalon <thomas@monjalon.net>

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4] lib/librte_meter: add meter configuration profile
  @ 2018-04-05 10:12  0%     ` Thomas Monjalon
  2018-04-05 11:00  0%       ` Dumitrescu, Cristian
  0 siblings, 1 reply; 200+ results
From: Thomas Monjalon @ 2018-04-05 10:12 UTC (permalink / raw)
  To: Jasvinder Singh, cristian.dumitrescu; +Cc: dev

19/02/2018 22:12, Thomas Monjalon:
> 08/01/2018 16:43, Jasvinder Singh:
> > Signed-off-by: Cristian Dumitrescu <cristian.dumitrescu@intel.com>
> > Signed-off-by: Jasvinder Singh <jasvinder.singh@intel.com>
> 
> Applied for 18.05 (was postponed to preserve 18.02 ABI), thanks.

We forgot to update the release notes about the API change.
Please, could you send a patch to add it in the appropriate section?
Thanks

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v1 01/16] ethdev: update ABI for flow API functions
  2018-04-04 15:56  7% ` [dpdk-dev] [PATCH v1 01/16] ethdev: update ABI for flow API functions Adrien Mazarguil
@ 2018-04-05 10:06  4%   ` Thomas Monjalon
  2018-04-05 12:44  9%     ` Adrien Mazarguil
  0 siblings, 1 reply; 200+ results
From: Thomas Monjalon @ 2018-04-05 10:06 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: dev, Ferruh Yigit

04/04/2018 17:56, Adrien Mazarguil:
> Subsequent patches will modify existing types and slightly alter the
> behavior of the flow API. This warrants a major ABI breakage.
> 
> While it is already taken care of for 18.05 (LIBABIVER was updated to
> version 9 by a prior commit), this patch explicitly adds the affected flow
> API functions as a safety measure.

I don't understand this patch.

If the API is broken, you must move the function from old block to
the new one. And it must be done in the patch modifying the function.


> --- a/lib/librte_ether/rte_ethdev_version.map
> +++ b/lib/librte_ether/rte_ethdev_version.map
> +DPDK_18.05 {
> +	global:
> +
> +	rte_flow_validate;
> +	rte_flow_create;
> +	rte_flow_query;
> +	rte_flow_copy;
> +
> +} DPDK_18.02;

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v4] ethdev: replace bus specific struct with generic dev
  2018-04-04 17:57  3%           ` De Lara Guarch, Pablo
@ 2018-04-05  9:19  0%             ` Ferruh Yigit
  0 siblings, 0 replies; 200+ results
From: Ferruh Yigit @ 2018-04-05  9:19 UTC (permalink / raw)
  To: De Lara Guarch, Pablo, David Marchand, santosh
  Cc: dev, Shreyansh Jain, Legacy, Allain (Wind River),
	Tomasz Duszynski, Thomas Monjalon

On 4/4/2018 6:57 PM, De Lara Guarch, Pablo wrote:
> 
> 
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ferruh Yigit
>> Sent: Tuesday, April 3, 2018 10:50 AM
>> To: David Marchand <david.marchand@6wind.com>; santosh
>> <santosh.shukla@caviumnetworks.com>
>> Cc: dev@dpdk.org; Shreyansh Jain <shreyansh.jain@nxp.com>; Legacy, Allain
>> (Wind River) <allain.legacy@windriver.com>; Tomasz Duszynski
>> <tdu@semihalf.com>; Thomas Monjalon <thomas@monjalon.net>
>> Subject: Re: [dpdk-dev] [PATCH v4] ethdev: replace bus specific struct with
>> generic dev
>>
>> On 4/3/2018 10:06 AM, David Marchand wrote:
>>> On Mon, Apr 2, 2018 at 6:13 PM, santosh
>>> <santosh.shukla@caviumnetworks.com> wrote:
>>>> On Friday 30 March 2018 08:59 PM, David Marchand wrote:
>>>>> I can see we enforce the driver name by putting it after the call to
>>>>> .dev_infos_get.
>>>>> http://dpdk.org/browse/dpdk/tree/lib/librte_ether/rte_ethdev.c#n2399
>>>>>
>>>>> octeontx pmd seems to try to do something about it:
>>>>> http://dpdk.org/browse/dpdk/tree/drivers/net/octeontx/octeontx_ethde
>>>>> v.c#n622
>>>>>
>>>>> Not sure it does something, might be a thing to cleanup.
>>>>>
>>>>>
>>>> In case, if your referring to driver_name update then indeed its a
>>>> cleanup [1].
>>>>
>>>> Otherwise, I don't see any issue with v4 Or may be /I /misunderstood
>>>> your comment.
>>>
>>> I agree there is no fundamental issue.
>>>
>>>     dev_info->device = dev->device;
>>>
>>>     RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_infos_get);
>>>     (*dev->dev_ops->dev_infos_get)(dev, dev_info);
>>>     dev_info->driver_name = dev->device->driver->name;
>>>
>>> If somebody (I mean some pmd out there) has a usecase with
>>> dev_info->device != dev->device, why not.
>>
>> Intentional let drivers update this variable although I don't also see any use case
>> of it.
>>
>> This variable was set by PMDs before this patch, so I don't see any reason to be
>> so strict here.
>>
>> If driver does anything ethdev will set dev_info->device for it, if it want to
>> overwrite, for any reason, it will have the capability.
> 
> Looks good to me. Will do the same for cryptodev and bbdev.
> The only thing that I am missing here is an update in documentation,
> adding the ABI Change in release notes.

Right, I forget about it, will send a new version.

Thanks,
ferruh

> 
> Apart from it:
> 
> Acked-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> 
>>
>>>
>>> Thomas ?
>>>
>>>
> 

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
  2018-04-03 16:42  3%           ` Jerin Jacob
@ 2018-04-04 23:38  0%             ` Ananyev, Konstantin
    0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2018-04-04 23:38 UTC (permalink / raw)
  To: Jerin Jacob, Olivier Matz; +Cc: dev, Richardson, Bruce

Hi lads,

> -----Original Message-----
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Tuesday, April 3, 2018 5:43 PM
> To: Olivier Matz <olivier.matz@6wind.com>
> Cc: dev@dpdk.org; Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>
> Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
> 
> -----Original Message-----
> > Date: Tue, 3 Apr 2018 17:56:01 +0200
> > From: Olivier Matz <olivier.matz@6wind.com>
> > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> >  structure
> > User-Agent: NeoMutt/20170113 (1.7.2)
> >
> > On Tue, Apr 03, 2018 at 09:07:04PM +0530, Jerin Jacob wrote:
> > > -----Original Message-----
> > > > Date: Tue, 3 Apr 2018 17:25:17 +0200
> > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > >  structure
> > > > User-Agent: NeoMutt/20170113 (1.7.2)
> > > >
> > > > On Tue, Apr 03, 2018 at 08:37:23PM +0530, Jerin Jacob wrote:
> > > > > -----Original Message-----
> > > > > > Date: Tue, 3 Apr 2018 15:26:44 +0200
> > > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > > To: dev@dpdk.org
> > > > > > Subject: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > > >  structure
> > > > > > X-Mailer: git-send-email 2.11.0
> > > > > >
> > > > > > The initial objective of
> > > > > > commit d9f0d3a1ffd4 ("ring: remove split cacheline build setting")
> > > > > > was to add an empty cache line betwee, the producer and consumer
> > > > > > data (on platform with cache line size = 64B), preventing from
> > > > > > having them on adjacent cache lines.
> > > > > >
> > > > > > Following discussion on the mailing list, it appears that this
> > > > > > also imposes an alignment constraint that is not required.
> > > > > >
> > > > > > This patch removes the extra alignment constraint and adds the
> > > > > > empty cache lines using padding fields in the structure. The
> > > > > > size of rte_ring structure and the offset of the fields remain
> > > > > > the same on platforms with cache line size = 64B:
> > > > > >
> > > > > >   rte_ring = 384
> > > > > >   rte_ring.name = 0
> > > > > >   rte_ring.flags = 32
> > > > > >   rte_ring.memzone = 40
> > > > > >   rte_ring.size = 48
> > > > > >   rte_ring.mask = 52
> > > > > >   rte_ring.prod = 128
> > > > > >   rte_ring.cons = 256
> > > > > >
> > > > > > But it has an impact on platform where cache line size is 128B:
> > > > > >
> > > > > >   rte_ring = 384        -> 768
> > > > > >   rte_ring.name = 0
> > > > > >   rte_ring.flags = 32
> > > > > >   rte_ring.memzone = 40
> > > > > >   rte_ring.size = 48
> > > > > >   rte_ring.mask = 52
> > > > > >   rte_ring.prod = 128   -> 256
> > > > > >   rte_ring.cons = 256   -> 512
> > > > >
> > > > > Are we leaving TWO cacheline to make sure, HW prefetch don't load
> > > > > the adjust cacheline(consumer)?
> > > > >
> > > > > If so, Will it have impact on those machine where it is 128B Cache line
> > > > > and the HW prefetcher is not loading the next caching explicitly. Right?
> > > >
> > > > The impact on machines that have a 128B cache line is that an unused
> > > > cache line will be added between the producer and consumer data. I
> > > > expect that the impact is positive in case there is a hw prefetcher, and
> > > > null in case there is no such prefetcher.
> > >
> > > It is not NULL, Right? You are loosing 256B for each ring.
> >
> > Is it really that important?
> 
> In pipeline or eventdev SW cases there could be more rings in the system.
> I don't see any downside of having a config option which is enabled by
> default.
> 
> In my view, such config options are good, as in embedded use cases customers
> can really fine-tune the target for their needs. In server use cases, let the
> option be enabled by default, no harm.

But that would mean we have to maintain two layouts for the rte_ring structure.
I agree with Olivier here; saving 256B per ring is probably not worth such hassle.
Konstantin

> 
> >
> >
> > > > On machines with 64B cache line, this was already the case. It just
> > > > reduces the alignment constraint.
> > >
> > > Not all the 64B CL machines will have HW prefetch.
> > >
> > > I would recommend to add conditional compilation flags to express HW
> > > prefetch enabled or not? based on that we can decide to reserve
> > > the additional space. By default, in common config, HW prefetch can
> > > be enabled so that it works for almost all cases.
> >
> > The hw prefetcher can be enabled at runtime, so a compilation flag
> > does not seem to be a good idea. Moreover, changing this compilation
> 
> On those hardwares HW prefetch can be disabled at runtime, it is fine
> with the default config. I was talking about some low-end ARM hardware
> where HW prefetch is not present at all.
> 
> > flag would change the ABI.
> 
> ABI is broken anyway, Right? due to size of the structure change.
> 
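
To make the layout trade-off discussed above concrete, here is a simplified
sketch (not the actual rte_ring definition; field names, sizes and offsets
are illustrative only) of using explicit padding fields instead of a
2-cache-line alignment constraint:

#include <stdint.h>

#define CACHE_LINE 64 /* assumed cache line size */

struct headtail {
	volatile uint32_t head;
	volatile uint32_t tail;
};

struct ring_sketch {
	char name[32];
	uint32_t size;
	uint32_t mask;
	uint8_t pad0[24]; /* pad the header out to one full cache line */
	struct headtail prod; /* producer data starts on its own cache line */
	uint8_t pad1[CACHE_LINE - sizeof(struct headtail)];
	/* One empty cache line between producer and consumer, useful when
	 * the hardware prefetches adjacent cache lines. */
	uint8_t pad2[CACHE_LINE];
	struct headtail cons;
};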

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v7] eal: provide API for querying valid socket id's
  2018-03-31 17:08  5% ` [dpdk-dev] [PATCH v7] " Anatoly Burakov
@ 2018-04-04 22:31  3%   ` Thomas Monjalon
  0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2018-04-04 22:31 UTC (permalink / raw)
  To: Anatoly Burakov
  Cc: dev, Neil Horman, John McNamara, Marko Kovacevic,
	Bruce Richardson, chaozhu, gowrishankar.m

31/03/2018 19:08, Anatoly Burakov:
> During lcore scan, find all socket ID's and store them, and
> provide public API to query valid socket id's. This will break
> the ABI, so bump ABI version.
> 
> Also, remove deprecation notice corresponding to this change.
> 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> Acked-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
> ---
> 
> Notes:
>     v7:
>     - Renamed rte_num_socket_ids() to rte_socket_count()
>     - Removed deprecation notice associated with this change
>     - Addressed review comments

You forgot the release notes for the ABI version (from my previous review).

Applied and fixed.
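
A small usage sketch of the new query follows; the signatures are assumed
from the patch and error handling is omitted:

#include <stdio.h>
#include <rte_lcore.h>

static void
print_sockets(void)
{
	unsigned int i;

	/* Iterate over the physical sockets detected during the lcore scan. */
	for (i = 0; i < rte_socket_count(); i++)
		printf("socket index %u -> socket id %d\n",
		       i, rte_socket_id_by_idx(i));
}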

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v2 00/13] eal: replace calls to rte_panic and refrain from new instances
@ 2018-04-04 22:01  3% Arnon Warshavsky
  2018-04-13  9:16  0% ` Burakov, Anatoly
  0 siblings, 1 reply; 200+ results
From: Arnon Warshavsky @ 2018-04-04 22:01 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon


The purpose of this patch series is to clean up the library code
by removing paths that end up aborting the process,
and move to checking error values, in order to allow the running process
to perform an orderly teardown or other mitigation of the event.

This patch modifies the majority of rte_panic calls
under lib and drivers, and replaces them with a log message
and an error return code according to context
that can be propagated up the call stack.

- Focus was given to the DPDK initialization path
- Some of the panic calls within drivers were left in place where
  the call is made from within an interrupt handler or on the data
  path, where there is no simple application-level route to propagate
  the error to termination.
  These should be handled by the driver maintainers.
- In order to avoid breaking ABI where panic was called from public
  void functions, a panic state variable was introduced so that
  it can be queried after calling these void functions.
  This took place for a single function call.
- Local void functions with no API were changed to return a value
  where needed
- No change took place in example and test files
- No change took place for debug assertions calling panic
- A new function was added to devtools/checkpatches.sh
  in order to prevent new additions of calls to rte_panic
  under lib and drivers.

Keep calm and don't panic

---

v2:
- reformat error messages so that literal strings are on the same line
- fix typo in commit message
- add new return code to the doxygen of rte_memzone_free()

Arnon Warshavsky (13):
  crypto: replace rte_panic instances in crypto driver
  bond: replace rte_panic instances in bonding driver
  e1000: replace rte_panic instances in e1000 driver
  ixgbe: replace rte_panic instances in ixgbe driver
  eal: replace rte_panic instances in eventdev
  kni: replace rte_panic instances in kni
  e1000: replace rte_panic instances in e1000 driver
  eal: replace rte_panic instances in hugepage_info
  eal: replace rte_panic instances in common_memzone
  eal: replace rte_panic instances in interrupts thread
  eal: replace rte_panic instances in ethdev
  eal: replace rte_panic instances in init sequence
  devtools: prevent new instances of rte_panic and rte_exit

 devtools/checkpatches.sh                          |  94 ++++++++++++++++-
 drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c       |   8 +-
 drivers/crypto/dpaa_sec/dpaa_sec.c                |   8 +-
 drivers/net/bonding/rte_eth_bond_8023ad.c         |  30 ++++--
 drivers/net/bonding/rte_eth_bond_8023ad_private.h |   2 +-
 drivers/net/bonding/rte_eth_bond_api.c            |  20 ++--
 drivers/net/bonding/rte_eth_bond_pmd.c            |  10 +-
 drivers/net/bonding/rte_eth_bond_private.h        |   2 +-
 drivers/net/e1000/e1000_ethdev.h                  |   2 +-
 drivers/net/e1000/igb_ethdev.c                    |   3 +-
 drivers/net/e1000/igb_pf.c                        |  15 +--
 drivers/net/ixgbe/ixgbe_ethdev.c                  |   3 +-
 drivers/net/ixgbe/ixgbe_ethdev.h                  |   2 +-
 drivers/net/ixgbe/ixgbe_pf.c                      |  13 ++-
 lib/librte_eal/bsdapp/eal/eal.c                   |  87 +++++++++++-----
 lib/librte_eal/bsdapp/eal/eal_thread.c            |  65 +++++++++---
 lib/librte_eal/common/eal_common_launch.c         |  21 ++++
 lib/librte_eal/common/eal_common_memzone.c        |   3 +-
 lib/librte_eal/common/include/rte_debug.h         |  12 +++
 lib/librte_eal/common/include/rte_memzone.h       |   1 +
 lib/librte_eal/common/rte_malloc.c                |   7 +-
 lib/librte_eal/linuxapp/eal/eal.c                 | 121 +++++++++++++++-------
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c   |  21 ++--
 lib/librte_eal/linuxapp/eal/eal_interrupts.c      |  27 +++--
 lib/librte_eal/linuxapp/eal/eal_thread.c          |  65 +++++++++---
 lib/librte_ether/rte_ethdev.c                     |  36 +++++--
 lib/librte_eventdev/rte_eventdev_pmd_pci.h        |   8 +-
 lib/librte_eventdev/rte_eventdev_pmd_vdev.h       |   8 +-
 lib/librte_kni/rte_kni.c                          |  18 ++--
 lib/librte_kni/rte_kni_fifo.h                     |  11 +-
 30 files changed, 540 insertions(+), 183 deletions(-)

-- 
1.8.3.1
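
For illustration, the general before/after pattern applied across the series
looks roughly like the sketch below; the function is a made-up example, not
one of the converted call sites:

#include <stdlib.h>
#include <errno.h>
#include <rte_log.h>
#include <rte_debug.h>

/* Before: a failure aborts the whole process. */
static void
setup_buffer_panic(size_t len)
{
	void *buf = malloc(len);

	if (buf == NULL)
		rte_panic("cannot allocate buffer\n");
	free(buf);
}

/* After: the failure is logged and reported to the caller, which can
 * perform an orderly teardown instead of dying mid-flight. */
static int
setup_buffer(size_t len)
{
	void *buf = malloc(len);

	if (buf == NULL) {
		RTE_LOG(ERR, EAL, "cannot allocate buffer\n");
		return -ENOMEM;
	}
	free(buf);
	return 0;
}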

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v11 3/9] eventtimer: add common code
  @ 2018-04-04 21:51  3%       ` Erik Gabriel Carrillo
  0 siblings, 0 replies; 200+ results
From: Erik Gabriel Carrillo @ 2018-04-04 21:51 UTC (permalink / raw)
  To: pbhagavatula, jerin.jacob; +Cc: dev

This commit adds the logic that is shared by all event timer adapter
drivers; the common code handles instance allocation and some
initialization.

Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
Acked-by: Pavan Nikhilesh <pbhagavatula@caviumnetworks.com>
---
 config/common_base                                |   1 +
 drivers/event/sw/sw_evdev.c                       |  18 +
 lib/librte_eventdev/Makefile                      |   2 +
 lib/librte_eventdev/rte_event_timer_adapter.c     | 387 ++++++++++++++++++++++
 lib/librte_eventdev/rte_event_timer_adapter_pmd.h | 114 +++++++
 lib/librte_eventdev/rte_eventdev.c                |  22 ++
 lib/librte_eventdev/rte_eventdev.h                |  20 ++
 lib/librte_eventdev/rte_eventdev_pmd.h            |  35 ++
 lib/librte_eventdev/rte_eventdev_version.map      |  20 +-
 9 files changed, 618 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_eventdev/rte_event_timer_adapter.c
 create mode 100644 lib/librte_eventdev/rte_event_timer_adapter_pmd.h
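
For context, a minimal sketch of how an application might use the adapter API
added below; the configuration field names are assumptions based on the
adapter header introduced earlier in this series and may not match exactly:

#include <rte_event_timer_adapter.h>

static struct rte_event_timer_adapter *
create_timer_adapter(uint8_t evdev_id)
{
	struct rte_event_timer_adapter_conf conf = {
		.event_dev_id = evdev_id,
		.timer_adapter_id = 0,
		.socket_id = 0,
		.timer_tick_ns = 100 * 1000,     /* 100 us resolution (assumed field) */
		.max_tmo_ns = 100 * 1000 * 1000, /* 100 ms maximum timeout (assumed field) */
		.nb_timers = 1024,               /* (assumed field) */
	};
	struct rte_event_timer_adapter *adapter;

	adapter = rte_event_timer_adapter_create(&conf);
	if (adapter == NULL)
		return NULL;
	if (rte_event_timer_adapter_start(adapter) < 0) {
		rte_event_timer_adapter_free(adapter);
		return NULL;
	}
	return adapter;
}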

diff --git a/config/common_base b/config/common_base
index 7abf7c6..9354c66 100644
--- a/config/common_base
+++ b/config/common_base
@@ -550,6 +550,7 @@ CONFIG_RTE_LIBRTE_EVENTDEV=y
 CONFIG_RTE_LIBRTE_EVENTDEV_DEBUG=n
 CONFIG_RTE_EVENT_MAX_DEVS=16
 CONFIG_RTE_EVENT_MAX_QUEUES_PER_DEV=64
+CONFIG_RTE_EVENT_TIMER_ADAPTER_NUM_MAX=32
 
 #
 # Compile PMD for skeleton event device
diff --git a/drivers/event/sw/sw_evdev.c b/drivers/event/sw/sw_evdev.c
index 0e89f11..dcb6551 100644
--- a/drivers/event/sw/sw_evdev.c
+++ b/drivers/event/sw/sw_evdev.c
@@ -464,6 +464,22 @@ sw_eth_rx_adapter_caps_get(const struct rte_eventdev *dev,
 	return 0;
 }
 
+static int
+sw_timer_adapter_caps_get(const struct rte_eventdev *dev,
+			  uint64_t flags,
+			  uint32_t *caps,
+			  const struct rte_event_timer_adapter_ops **ops)
+{
+	RTE_SET_USED(dev);
+	RTE_SET_USED(flags);
+	*caps = 0;
+
+	/* Use default SW ops */
+	*ops = NULL;
+
+	return 0;
+}
+
 static void
 sw_info_get(struct rte_eventdev *dev, struct rte_event_dev_info *info)
 {
@@ -791,6 +807,8 @@ sw_probe(struct rte_vdev_device *vdev)
 
 			.eth_rx_adapter_caps_get = sw_eth_rx_adapter_caps_get,
 
+			.timer_adapter_caps_get = sw_timer_adapter_caps_get,
+
 			.xstats_get = sw_xstats_get,
 			.xstats_get_names = sw_xstats_get_names,
 			.xstats_get_by_name = sw_xstats_get_by_name,
diff --git a/lib/librte_eventdev/Makefile b/lib/librte_eventdev/Makefile
index 549b182..8b16e3f 100644
--- a/lib/librte_eventdev/Makefile
+++ b/lib/librte_eventdev/Makefile
@@ -20,6 +20,7 @@ LDLIBS += -lrte_eal -lrte_ring -lrte_ethdev -lrte_hash
 SRCS-y += rte_eventdev.c
 SRCS-y += rte_event_ring.c
 SRCS-y += rte_event_eth_rx_adapter.c
+SRCS-y += rte_event_timer_adapter.c
 
 # export include files
 SYMLINK-y-include += rte_eventdev.h
@@ -29,6 +30,7 @@ SYMLINK-y-include += rte_eventdev_pmd_vdev.h
 SYMLINK-y-include += rte_event_ring.h
 SYMLINK-y-include += rte_event_eth_rx_adapter.h
 SYMLINK-y-include += rte_event_timer_adapter.h
+SYMLINK-y-include += rte_event_timer_adapter_pmd.h
 
 # versioning export map
 EXPORT_MAP := rte_eventdev_version.map
diff --git a/lib/librte_eventdev/rte_event_timer_adapter.c b/lib/librte_eventdev/rte_event_timer_adapter.c
new file mode 100644
index 0000000..75a14ac
--- /dev/null
+++ b/lib/librte_eventdev/rte_event_timer_adapter.c
@@ -0,0 +1,387 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation.
+ * All rights reserved.
+ */
+
+#include <string.h>
+#include <inttypes.h>
+
+#include <rte_memzone.h>
+#include <rte_memory.h>
+#include <rte_dev.h>
+#include <rte_errno.h>
+
+#include "rte_eventdev.h"
+#include "rte_eventdev_pmd.h"
+#include "rte_event_timer_adapter.h"
+#include "rte_event_timer_adapter_pmd.h"
+
+#define DATA_MZ_NAME_MAX_LEN 64
+#define DATA_MZ_NAME_FORMAT "rte_event_timer_adapter_data_%d"
+
+static int evtim_logtype;
+
+static struct rte_event_timer_adapter adapters[RTE_EVENT_TIMER_ADAPTER_NUM_MAX];
+
+#define EVTIM_LOG(level, logtype, ...) \
+	rte_log(RTE_LOG_ ## level, logtype, \
+		RTE_FMT("EVTIMER: %s() line %u: " RTE_FMT_HEAD(__VA_ARGS__,) \
+			"\n", __func__, __LINE__, RTE_FMT_TAIL(__VA_ARGS__,)))
+
+#define EVTIM_LOG_ERR(...) EVTIM_LOG(ERR, evtim_logtype, __VA_ARGS__)
+
+#ifdef RTE_LIBRTE_EVENTDEV_DEBUG
+#define EVTIM_LOG_DBG(...) \
+	EVTIM_LOG(DEBUG, evtim_logtype, __VA_ARGS__)
+#else
+#define EVTIM_LOG_DBG(...) (void)0
+#endif
+
+static int
+default_port_conf_cb(uint16_t id, uint8_t event_dev_id, uint8_t *event_port_id,
+		     void *conf_arg)
+{
+	struct rte_event_timer_adapter *adapter;
+	struct rte_eventdev *dev;
+	struct rte_event_dev_config dev_conf;
+	struct rte_event_port_conf *port_conf, def_port_conf = {0};
+	int started;
+	uint8_t port_id;
+	uint8_t dev_id;
+	int ret;
+
+	RTE_SET_USED(event_dev_id);
+
+	adapter = &adapters[id];
+	dev = &rte_eventdevs[adapter->data->event_dev_id];
+	dev_id = dev->data->dev_id;
+	dev_conf = dev->data->dev_conf;
+
+	started = dev->data->dev_started;
+	if (started)
+		rte_event_dev_stop(dev_id);
+
+	port_id = dev_conf.nb_event_ports;
+	dev_conf.nb_event_ports += 1;
+	ret = rte_event_dev_configure(dev_id, &dev_conf);
+	if (ret < 0) {
+		EVTIM_LOG_ERR("failed to configure event dev %u\n", dev_id);
+		if (started)
+			if (rte_event_dev_start(dev_id))
+				return -EIO;
+
+		return ret;
+	}
+
+	if (conf_arg != NULL)
+		port_conf = conf_arg;
+	else {
+		port_conf = &def_port_conf;
+		ret = rte_event_port_default_conf_get(dev_id, port_id,
+						      port_conf);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = rte_event_port_setup(dev_id, port_id, port_conf);
+	if (ret < 0) {
+		EVTIM_LOG_ERR("failed to setup event port %u on event dev %u\n",
+			      port_id, dev_id);
+		return ret;
+	}
+
+	*event_port_id = port_id;
+
+	if (started)
+		ret = rte_event_dev_start(dev_id);
+
+	return ret;
+}
+
+struct rte_event_timer_adapter * __rte_experimental
+rte_event_timer_adapter_create(const struct rte_event_timer_adapter_conf *conf)
+{
+	return rte_event_timer_adapter_create_ext(conf, default_port_conf_cb,
+						  NULL);
+}
+
+struct rte_event_timer_adapter * __rte_experimental
+rte_event_timer_adapter_create_ext(
+		const struct rte_event_timer_adapter_conf *conf,
+		rte_event_timer_adapter_port_conf_cb_t conf_cb,
+		void *conf_arg)
+{
+	uint16_t adapter_id;
+	struct rte_event_timer_adapter *adapter;
+	const struct rte_memzone *mz;
+	char mz_name[DATA_MZ_NAME_MAX_LEN];
+	int n, ret;
+	struct rte_eventdev *dev;
+
+	if (conf == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	/* Check eventdev ID */
+	if (!rte_event_pmd_is_valid_dev(conf->event_dev_id)) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	dev = &rte_eventdevs[conf->event_dev_id];
+
+	adapter_id = conf->timer_adapter_id;
+
+	/* Check that adapter_id is in range */
+	if (adapter_id >= RTE_EVENT_TIMER_ADAPTER_NUM_MAX) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	/* Check adapter ID not already allocated */
+	adapter = &adapters[adapter_id];
+	if (adapter->allocated) {
+		rte_errno = EEXIST;
+		return NULL;
+	}
+
+	/* Create shared data area. */
+	n = snprintf(mz_name, sizeof(mz_name), DATA_MZ_NAME_FORMAT, adapter_id);
+	if (n >= (int)sizeof(mz_name)) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	mz = rte_memzone_reserve(mz_name,
+				 sizeof(struct rte_event_timer_adapter_data),
+				 conf->socket_id, 0);
+	if (mz == NULL)
+		/* rte_errno set by rte_memzone_reserve */
+		return NULL;
+
+	adapter->data = mz->addr;
+	memset(adapter->data, 0, sizeof(struct rte_event_timer_adapter_data));
+
+	adapter->data->mz = mz;
+	adapter->data->event_dev_id = conf->event_dev_id;
+	adapter->data->id = adapter_id;
+	adapter->data->socket_id = conf->socket_id;
+	adapter->data->conf = *conf;  /* copy conf structure */
+
+	/* Query eventdev PMD for timer adapter capabilities and ops */
+	ret = dev->dev_ops->timer_adapter_caps_get(dev,
+						   adapter->data->conf.flags,
+						   &adapter->data->caps,
+						   &adapter->ops);
+	if (ret < 0) {
+		rte_errno = ret;
+		goto free_memzone;
+	}
+
+	if (!(adapter->data->caps &
+	      RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT)) {
+		FUNC_PTR_OR_NULL_RET_WITH_ERRNO(conf_cb, -EINVAL);
+		ret = conf_cb(adapter->data->id, adapter->data->event_dev_id,
+			      &adapter->data->event_port_id, conf_arg);
+		if (ret < 0) {
+			rte_errno = ret;
+			goto free_memzone;
+		}
+	}
+
+	/* Allow driver to do some setup */
+	FUNC_PTR_OR_NULL_RET_WITH_ERRNO(adapter->ops->init, -ENOTSUP);
+	ret = adapter->ops->init(adapter);
+	if (ret < 0) {
+		rte_errno = ret;
+		goto free_memzone;
+	}
+
+	/* Set fast-path function pointers */
+	adapter->arm_burst = adapter->ops->arm_burst;
+	adapter->arm_tmo_tick_burst = adapter->ops->arm_tmo_tick_burst;
+	adapter->cancel_burst = adapter->ops->cancel_burst;
+
+	adapter->allocated = 1;
+
+	return adapter;
+
+free_memzone:
+	rte_memzone_free(adapter->data->mz);
+	return NULL;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_get_info(const struct rte_event_timer_adapter *adapter,
+		struct rte_event_timer_adapter_info *adapter_info)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+
+	if (adapter->ops->get_info)
+		/* let driver set values it knows */
+		adapter->ops->get_info(adapter, adapter_info);
+
+	/* Set common values */
+	adapter_info->conf = adapter->data->conf;
+	adapter_info->event_dev_port_id = adapter->data->event_port_id;
+	adapter_info->caps = adapter->data->caps;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_start(const struct rte_event_timer_adapter *adapter)
+{
+	int ret;
+
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->start, -EINVAL);
+
+	ret = adapter->ops->start(adapter);
+	if (ret < 0)
+		return ret;
+
+	adapter->data->started = 1;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_stop(const struct rte_event_timer_adapter *adapter)
+{
+	int ret;
+
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->stop, -EINVAL);
+
+	if (adapter->data->started == 0) {
+		EVTIM_LOG_ERR("event timer adapter %"PRIu8" already stopped",
+			      adapter->data->id);
+		return 0;
+	}
+
+	ret = adapter->ops->stop(adapter);
+	if (ret < 0)
+		return ret;
+
+	adapter->data->started = 0;
+
+	return 0;
+}
+
+struct rte_event_timer_adapter * __rte_experimental
+rte_event_timer_adapter_lookup(uint16_t adapter_id)
+{
+	char name[DATA_MZ_NAME_MAX_LEN];
+	const struct rte_memzone *mz;
+	struct rte_event_timer_adapter_data *data;
+	struct rte_event_timer_adapter *adapter;
+	int ret;
+	struct rte_eventdev *dev;
+
+	if (adapters[adapter_id].allocated)
+		return &adapters[adapter_id]; /* Adapter is already loaded */
+
+	snprintf(name, DATA_MZ_NAME_MAX_LEN, DATA_MZ_NAME_FORMAT, adapter_id);
+	mz = rte_memzone_lookup(name);
+	if (mz == NULL) {
+		rte_errno = ENOENT;
+		return NULL;
+	}
+
+	data = mz->addr;
+
+	adapter = &adapters[data->id];
+	adapter->data = data;
+
+	dev = &rte_eventdevs[adapter->data->event_dev_id];
+
+	/* Query eventdev PMD for timer adapter capabilities and ops */
+	ret = dev->dev_ops->timer_adapter_caps_get(dev,
+						   adapter->data->conf.flags,
+						   &adapter->data->caps,
+						   &adapter->ops);
+	if (ret < 0) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	/* Set fast-path function pointers */
+	adapter->arm_burst = adapter->ops->arm_burst;
+	adapter->arm_tmo_tick_burst = adapter->ops->arm_tmo_tick_burst;
+	adapter->cancel_burst = adapter->ops->cancel_burst;
+
+	adapter->allocated = 1;
+
+	return adapter;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_free(struct rte_event_timer_adapter *adapter)
+{
+	int ret;
+
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->uninit, -EINVAL);
+
+	if (adapter->data->started == 1) {
+		EVTIM_LOG_ERR("event timer adapter %"PRIu8" must be stopped "
+			      "before freeing", adapter->data->id);
+		return -EBUSY;
+	}
+
+	/* free impl priv data */
+	ret = adapter->ops->uninit(adapter);
+	if (ret < 0)
+		return ret;
+
+	/* free shared data area */
+	ret = rte_memzone_free(adapter->data->mz);
+	if (ret < 0)
+		return ret;
+
+	adapter->data = NULL;
+	adapter->allocated = 0;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_service_id_get(struct rte_event_timer_adapter *adapter,
+				       uint32_t *service_id)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+
+	if (adapter->data->service_inited && service_id != NULL)
+		*service_id = adapter->data->service_id;
+
+	return adapter->data->service_inited ? 0 : -ESRCH;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_stats_get(struct rte_event_timer_adapter *adapter,
+				  struct rte_event_timer_adapter_stats *stats)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->stats_get, -EINVAL);
+	if (stats == NULL)
+		return -EINVAL;
+
+	return adapter->ops->stats_get(adapter, stats);
+}
+
+int __rte_experimental
+rte_event_timer_adapter_stats_reset(struct rte_event_timer_adapter *adapter)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->stats_reset, -EINVAL);
+	return adapter->ops->stats_reset(adapter);
+}
+
+RTE_INIT(event_timer_adapter_init_log);
+static void
+event_timer_adapter_init_log(void)
+{
+	evtim_logtype = rte_log_register("lib.eventdev.adapter.timer");
+	if (evtim_logtype >= 0)
+		rte_log_set_level(evtim_logtype, RTE_LOG_NOTICE);
+}
diff --git a/lib/librte_eventdev/rte_event_timer_adapter_pmd.h b/lib/librte_eventdev/rte_event_timer_adapter_pmd.h
new file mode 100644
index 0000000..cf3509d
--- /dev/null
+++ b/lib/librte_eventdev/rte_event_timer_adapter_pmd.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation.
+ * All rights reserved.
+ */
+
+#ifndef __RTE_EVENT_TIMER_ADAPTER_PMD_H__
+#define __RTE_EVENT_TIMER_ADAPTER_PMD_H__
+
+/**
+ * @file
+ * RTE Event Timer Adapter API (PMD Side)
+ *
+ * @note
+ * This file provides implementation helpers for internal use by PMDs.  They
+ * are not intended to be exposed to applications and are not subject to ABI
+ * versioning.
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "rte_event_timer_adapter.h"
+
+/*
+ * Definitions of functions exported by an event timer adapter implementation
+ * through *rte_event_timer_adapter_ops* structure supplied in the
+ * *rte_event_timer_adapter* structure associated with an event timer adapter.
+ */
+
+typedef int (*rte_event_timer_adapter_init_t)(
+		struct rte_event_timer_adapter *adapter);
+/**< @internal Event timer adapter implementation setup */
+typedef int (*rte_event_timer_adapter_uninit_t)(
+		struct rte_event_timer_adapter *adapter);
+/**< @internal Event timer adapter implementation teardown */
+typedef int (*rte_event_timer_adapter_start_t)(
+		const struct rte_event_timer_adapter *adapter);
+/**< @internal Start running event timer adapter */
+typedef int (*rte_event_timer_adapter_stop_t)(
+		const struct rte_event_timer_adapter *adapter);
+/**< @internal Stop running event timer adapter */
+typedef void (*rte_event_timer_adapter_get_info_t)(
+		const struct rte_event_timer_adapter *adapter,
+		struct rte_event_timer_adapter_info *adapter_info);
+/**< @internal Get contextual information for event timer adapter */
+typedef int (*rte_event_timer_adapter_stats_get_t)(
+		const struct rte_event_timer_adapter *adapter,
+		struct rte_event_timer_adapter_stats *stats);
+/**< @internal Get statistics for event timer adapter */
+typedef int (*rte_event_timer_adapter_stats_reset_t)(
+		const struct rte_event_timer_adapter *adapter);
+/**< @internal Reset statistics for event timer adapter */
+
+/**
+ * @internal Structure containing the functions exported by an event timer
+ * adapter implementation.
+ */
+struct rte_event_timer_adapter_ops {
+	rte_event_timer_adapter_init_t		init;  /**< Set up adapter */
+	rte_event_timer_adapter_uninit_t	uninit;/**< Tear down adapter */
+	rte_event_timer_adapter_start_t		start; /**< Start adapter */
+	rte_event_timer_adapter_stop_t		stop;  /**< Stop adapter */
+	rte_event_timer_adapter_get_info_t	get_info;
+	/**< Get info from driver */
+	rte_event_timer_adapter_stats_get_t	stats_get;
+	/**< Get adapter statistics */
+	rte_event_timer_adapter_stats_reset_t	stats_reset;
+	/**< Reset adapter statistics */
+	rte_event_timer_arm_burst_t		arm_burst;
+	/**< Arm one or more event timers */
+	rte_event_timer_arm_tmo_tick_burst_t	arm_tmo_tick_burst;
+	/**< Arm event timers with same expiration time */
+	rte_event_timer_cancel_burst_t		cancel_burst;
+	/**< Cancel one or more event timers */
+};
+
+/**
+ * @internal Adapter data; structure to be placed in shared memory to be
+ * accessible by various processes in a multi-process configuration.
+ */
+struct rte_event_timer_adapter_data {
+	uint8_t id;
+	/**< Event timer adapter ID */
+	uint8_t event_dev_id;
+	/**< Event device ID */
+	uint32_t socket_id;
+	/**< Socket ID where memory is allocated */
+	uint8_t event_port_id;
+	/**< Optional: event port ID used when the inbuilt port is absent */
+	const struct rte_memzone *mz;
+	/**< Event timer adapter memzone pointer */
+	struct rte_event_timer_adapter_conf conf;
+	/**< Configuration used to configure the adapter. */
+	uint32_t caps;
+	/**< Adapter capabilities */
+	void *adapter_priv;
+	/**< Timer adapter private data*/
+	uint8_t service_inited;
+	/**< Service initialization state */
+	uint32_t service_id;
+	/**< Service ID*/
+
+	RTE_STD_C11
+	uint8_t started : 1;
+	/**< Flag to indicate adapter started. */
+} __rte_cache_aligned;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __RTE_EVENT_TIMER_ADAPTER_PMD_H__ */
diff --git a/lib/librte_eventdev/rte_eventdev.c b/lib/librte_eventdev/rte_eventdev.c
index 2de8d9a..3f016f4 100644
--- a/lib/librte_eventdev/rte_eventdev.c
+++ b/lib/librte_eventdev/rte_eventdev.c
@@ -123,6 +123,28 @@ rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id,
 				: 0;
 }
 
+int __rte_experimental
+rte_event_timer_adapter_caps_get(uint8_t dev_id, uint32_t *caps)
+{
+	struct rte_eventdev *dev;
+	const struct rte_event_timer_adapter_ops *ops;
+
+	RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_eventdevs[dev_id];
+
+	if (caps == NULL)
+		return -EINVAL;
+	*caps = 0;
+
+	return dev->dev_ops->timer_adapter_caps_get ?
+				(*dev->dev_ops->timer_adapter_caps_get)(dev,
+									0,
+									caps,
+									&ops)
+				: 0;
+}
+
 static inline int
 rte_event_dev_queue_config(struct rte_eventdev *dev, uint8_t nb_queues)
 {
diff --git a/lib/librte_eventdev/rte_eventdev.h b/lib/librte_eventdev/rte_eventdev.h
index 86df4be..6fcbe94 100644
--- a/lib/librte_eventdev/rte_eventdev.h
+++ b/lib/librte_eventdev/rte_eventdev.h
@@ -215,6 +215,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_memory.h>
 #include <rte_errno.h>
+#include <rte_compat.h>
 
 struct rte_mbuf; /* we just use mbuf pointers; no need to include rte_mbuf.h */
 struct rte_event;
@@ -1115,6 +1116,25 @@ int
 rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id,
 				uint32_t *caps);
 
+#define RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT (1ULL << 0)
+/**< This flag is set when the timer mechanism is in HW. */
+
+/**
+ * Retrieve the event device's timer adapter capabilities.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @param[out] caps
+ *   A pointer to memory to be filled with event timer adapter capabilities.
+ *
+ * @return
+ *   - 0: Success, driver provided event timer adapter capabilities.
+ *   - <0: Error code returned by the driver function.
+ */
+int __rte_experimental
+rte_event_timer_adapter_caps_get(uint8_t dev_id, uint32_t *caps);
+
 struct rte_eventdev_driver;
 struct rte_eventdev_ops;
 struct rte_eventdev;
diff --git a/lib/librte_eventdev/rte_eventdev_pmd.h b/lib/librte_eventdev/rte_eventdev_pmd.h
index 3a8ddd7..2dcb528 100644
--- a/lib/librte_eventdev/rte_eventdev_pmd.h
+++ b/lib/librte_eventdev/rte_eventdev_pmd.h
@@ -26,6 +26,7 @@ extern "C" {
 #include <rte_malloc.h>
 
 #include "rte_eventdev.h"
+#include "rte_event_timer_adapter_pmd.h"
 
 /* Logging Macros */
 #define RTE_EDEV_LOG_ERR(...) \
@@ -449,6 +450,37 @@ typedef int (*eventdev_eth_rx_adapter_caps_get_t)
 struct rte_event_eth_rx_adapter_queue_conf *queue_conf;
 
 /**
+ * Retrieve the event device's timer adapter capabilities, as well as the ops
+ * structure that an event timer adapter should call through to enter the
+ * driver.
+ *
+ * @param dev
+ *   Event device pointer
+ *
+ * @param flags
+ *   Flags that can be used to determine how to select an event timer
+ *   adapter ops structure
+ *
+ * @param[out] caps
+ *   A pointer to memory filled with event timer adapter capabilities.
+ *
+ * @param[out] ops
+ *   A pointer to the ops pointer to set with the address of the desired ops
+ *   structure
+ *
+ * @return
+ *   - 0: Success, driver provides event timer adapter capabilities for
+ *	the event device.
+ *   - <0: Error code returned by the driver function.
+ *
+ */
+typedef int (*eventdev_timer_adapter_caps_get_t)(
+				const struct rte_eventdev *dev,
+				uint64_t flags,
+				uint32_t *caps,
+				const struct rte_event_timer_adapter_ops **ops);
+
+/**
  * Add ethernet Rx queues to event device. This callback is invoked if
  * the caps returned from rte_eventdev_eth_rx_adapter_caps_get(, eth_port_id)
  * has RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT set.
@@ -640,6 +672,9 @@ struct rte_eventdev_ops {
 	eventdev_eth_rx_adapter_stats_reset eth_rx_adapter_stats_reset;
 	/**< Reset ethernet Rx stats */
 
+	eventdev_timer_adapter_caps_get_t timer_adapter_caps_get;
+	/**< Get timer adapter capabilities */
+
 	eventdev_selftest dev_selftest;
 	/**< Start eventdev Selftest */
 
diff --git a/lib/librte_eventdev/rte_eventdev_version.map b/lib/librte_eventdev/rte_eventdev_version.map
index 4396536..3ee28f7 100644
--- a/lib/librte_eventdev/rte_eventdev_version.map
+++ b/lib/librte_eventdev/rte_eventdev_version.map
@@ -66,7 +66,6 @@ DPDK_17.11 {
 	rte_event_eth_rx_adapter_stats_get;
 	rte_event_eth_rx_adapter_stats_reset;
 	rte_event_eth_rx_adapter_stop;
-
 } DPDK_17.08;
 
 DPDK_18.02 {
@@ -80,3 +79,22 @@ DPDK_18.05 {
 
 	rte_event_dev_stop_flush_callback_register;
 } DPDK_18.02;
+
+EXPERIMENTAL {
+	global:
+
+	rte_event_timer_adapter_caps_get;
+	rte_event_timer_adapter_create;
+	rte_event_timer_adapter_create_ext;
+	rte_event_timer_adapter_free;
+	rte_event_timer_adapter_get_info;
+	rte_event_timer_adapter_lookup;
+	rte_event_timer_adapter_service_id_get;
+	rte_event_timer_adapter_start;
+	rte_event_timer_adapter_stats_get;
+	rte_event_timer_adapter_stats_reset;
+	rte_event_timer_adapter_stop;
+	rte_event_timer_arm_burst;
+	rte_event_timer_arm_tmo_tick_burst;
+	rte_event_timer_cancel_burst;
+} DPDK_18.05;
-- 
2.6.4

^ permalink raw reply	[relevance 3%]
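
To make the plumbing added above more concrete, here is a rough sketch of how an
eventdev PMD might hook into this infrastructure. It is illustrative only; all
dummy_* names are hypothetical and not taken from any real driver. The driver
exposes an rte_event_timer_adapter_ops instance and reports it, together with its
capabilities, through the new timer_adapter_caps_get callback.

#include <rte_eventdev_pmd.h>

/* Hypothetical adapter setup callback; a real PMD would allocate and
 * initialize its private timer state here.
 */
static int
dummy_timer_adapter_init(struct rte_event_timer_adapter *adapter)
{
	RTE_SET_USED(adapter);
	return 0;
}

static const struct rte_event_timer_adapter_ops dummy_timer_adapter_ops = {
	.init = dummy_timer_adapter_init,
	/* .uninit, .start, .stop, .arm_burst, ... would be filled in too */
};

/* Wired into rte_eventdev_ops.timer_adapter_caps_get: report capabilities
 * and hand back the ops structure the adapter layer should call through.
 */
static int
dummy_timer_adapter_caps_get(const struct rte_eventdev *dev, uint64_t flags,
			     uint32_t *caps,
			     const struct rte_event_timer_adapter_ops **ops)
{
	RTE_SET_USED(dev);
	RTE_SET_USED(flags);

	*caps = RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT;
	*ops = &dummy_timer_adapter_ops;

	return 0;
}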

* Re: [dpdk-dev] [PATCH v3 1/2] eal: add API to align integer to previous power of 2
  2018-04-04 18:36  3%                 ` Pavan Nikhilesh
@ 2018-04-04 19:41  3%                   ` Matan Azrad
  0 siblings, 0 replies; 200+ results
From: Matan Azrad @ 2018-04-04 19:41 UTC (permalink / raw)
  To: Pavan Nikhilesh, jerin.jacob, keith.wiles, Thomas Monjalon; +Cc: dev

Hi Pavan

From: Pavan Nikhilesh, Wednesday, April 4, 2018 9:36 PM
> On Wed, Apr 04, 2018 at 06:23:19PM +0000, Matan Azrad wrote:
> > Hi Pavan
> >
> > From: Pavan Nikhilesh, Wednesday, April 4, 2018 9:16 PM
> > > Hi Matan,
> > >
> > > >
> > > > Got you.
> > > > Looks like you found an issue here...
> > > > The experimental tag probably should be in a root .h file.
> > > > Probably, we need a fix patch to move it to a different/new .h file.
> > > >
> > > > What do you think?
> > > >
> > >
> > > Actually that's just the start of the rabbit hole: if we succeed in
> > > tagging an inline function in rte_common.h as experimental, every
> > > lib/driver that uses rte_common.h (almost everything) needs to have
> > > CFLAGS set to -DALLOW_EXPERIMENTAL_API.
> > >
> >
> > Isn't it relevant only for the libs which are using the new tagged APIs?
> 
> Static inline functions in .h files will be added to each and every .c file;
> for example, the preprocessor output for rte_pci.c, which includes rte_common.h:
> 
> # 231 "/home/pavan/Work/clean/dpdk/build/include/rte_common.h"
> extern int RTE_BUILD_BUG_ON_detected_error;
> # 249 "/home/pavan/Work/clean/dpdk/build/include/rte_common.h"
> static inline uint32_t __attribute__((deprecated("Symbol is not yet part of stable ABI"), section(".text.experimental")))
> rte_combine32ms1b(register uint32_t x)
> {
>  x |= x >> 1;
>  x |= x >> 2;
>  x |= x >> 4;
>  x |= x >> 8;
>  x |= x >> 16;
> 
>  return x;
> }
> # 271 "/home/pavan/Work/clean/dpdk/build/include/rte_common.h"
> static inline uint64_t
> rte_combine64ms1b(register uint64_t v)
> {
>  v |= v >> 1;
>  v |= v >> 2;
>  v |= v >> 4;
>  v |= v >> 8;
>  v |= v >> 16;
>  v |= v >> 32;
> 
>  return v;
> }
> 
> Which causes the compiler to throw an error, as -DALLOW_EXPERIMENTAL_API is
> not added to CFLAGS.
> 

Are you sure?

I added the following code and the compilation passed:
static inline uint32_t
__attribute__((deprecated("Symbol is not yet part of stable ABI"), \
section(".text.experimental")))
rte_combine32ms1b(register uint32_t x)
{
	x |= x >> 1;
	x |= x >> 2;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;

	return x;
}

Actually, the combine functions should not be experimental (they are already used in the existing code).
That also saves us from adding the cflag in every lib which uses the old align functions.
Only the new align functions should be tagged.
Then you need to add the cflag only in the places which use these functions.

Am I missing something?
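
A minimal sketch of the approach described above, assuming the __rte_experimental
tag from rte_compat.h (the new function name below is illustrative, not
necessarily the one used in the patch under discussion): the existing combine
helper stays untagged, and only the new align API is marked, so only its callers
need ALLOW_EXPERIMENTAL_API.

#include <rte_compat.h>

/* Existing helper: already used by stable code paths, left untagged. */
static inline uint32_t
rte_combine32ms1b(register uint32_t x)
{
	x |= x >> 1;
	x |= x >> 2;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;

	return x;
}

/* New API: tagged experimental, built on top of the untagged helper. */
static inline uint32_t __rte_experimental
rte_align32prevpow2(uint32_t x)
{
	x = rte_combine32ms1b(x);

	return x - (x >> 1);
}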

> >
> > > Regards,
> > > Pavan.

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v3 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters
  @ 2018-04-04 18:56  3%     ` De Lara Guarch, Pablo
  2018-04-05 10:16  0%       ` Thomas Monjalon
  0 siblings, 1 reply; 200+ results
From: De Lara Guarch, Pablo @ 2018-04-04 18:56 UTC (permalink / raw)
  To: Horton, Remy, dev
  Cc: Mcnamara, John, Lu, Wenzhuo, Wu, Jingjing, Zhang, Qi Z, Xing,
	Beilei, Shreyansh Jain, Thomas Monjalon

Hi Remy,

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Remy Horton
> Sent: Wednesday, April 4, 2018 6:18 PM
> To: dev@dpdk.org
> Cc: Mcnamara, John <john.mcnamara@intel.com>; Lu, Wenzhuo
> <wenzhuo.lu@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Shreyansh Jain
> <shreyansh.jain@nxp.com>; Thomas Monjalon <thomas@monjalon.net>
> Subject: [dpdk-dev] [PATCH v3 1/4] ethdev: add support for PMD-tuned Tx/Rx
> parameters
> 
> The optimal values of several transmission & reception related parameters, such
> as burst sizes, descriptor ring sizes, and number of queues, vary between
> different network interface devices. This patch allows individual PMDs to specify
> preferred parameter values.
> 
> Signed-off-by: Remy Horton <remy.horton@intel.com>
> ---
>  doc/guides/rel_notes/deprecation.rst   | 13 ----------
>  doc/guides/rel_notes/release_18_05.rst |  5 ++++
>  lib/librte_ether/rte_ethdev.c          | 44 +++++++++++++++++++++++++++-------
>  lib/librte_ether/rte_ethdev.h          | 25 +++++++++++++++++++
>  4 files changed, 65 insertions(+), 22 deletions(-)
> 
> diff --git a/doc/guides/rel_notes/deprecation.rst
> b/doc/guides/rel_notes/deprecation.rst
> index 0c696f7..920df6b 100644
> --- a/doc/guides/rel_notes/deprecation.rst
> +++ b/doc/guides/rel_notes/deprecation.rst
> @@ -115,19 +115,6 @@ Deprecation Notices
>    The new API add rss_level field to ``rte_eth_rss_conf`` to enable a choice
>    of RSS hash calculation on outer or inner header of tunneled packet.
> 
> -* ethdev:  Currently, if the  rte_eth_rx_burst() function returns a value less
> -  than *nb_pkts*, the application will assume that no more packets are present.
> -  Some of the hw queue based hardware can only support smaller burst for RX
> -  and TX and thus break the expectation of the rx_burst API. Similar is the
> -  case for TX burst as well as ring sizes. ``rte_eth_dev_info`` will be added
> -  with following new parameters so as to support semantics for drivers to
> -  define a preferred size for Rx/Tx burst and rings.
> -
> -  - Member ``struct preferred_size`` would be added to enclose all preferred
> -    size to be fetched from driver/implementation.
> -  - Members ``uint16_t rx_burst``,  ``uint16_t tx_burst``, ``uint16_t rx_ring``,
> -    and ``uint16_t tx_ring`` would be added to ``struct preferred_size``.
> -
>  * ethdev: A work is being planned for 18.05 to expose VF port representors
>    as a mean to perform control and data path operation on the different VFs.
>    As VF representor is an ethdev port, new fields are needed in order to map diff

API and ABI changes should be documented in release notes.

Thanks,
Pablo

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v3 1/2] eal: add API to align integer to previous power of 2
  @ 2018-04-04 18:36  3%                 ` Pavan Nikhilesh
  2018-04-04 19:41  3%                   ` Matan Azrad
  0 siblings, 1 reply; 200+ results
From: Pavan Nikhilesh @ 2018-04-04 18:36 UTC (permalink / raw)
  To: Matan Azrad, jerin.jacob, keith.wiles, Thomas Monjalon; +Cc: dev

On Wed, Apr 04, 2018 at 06:23:19PM +0000, Matan Azrad wrote:
> Hi Pavan
>
> From: Pavan Nikhilesh, Wednesday, April 4, 2018 9:16 PM
> > Hi Matan,
> >
> > >
> > > Got you.
> > > Looks like you found an issue here...
> > > The experimental tag probably should be in a root .h file.
> > > Probably, we need a fix patch to move it to a different/new .h file.
> > >
> > > What do you think?
> > >
> >
> > Actually that's just the start of the rabbit hole: if we succeed in tagging
> > an inline function in rte_common.h as experimental, every lib/driver that
> > uses rte_common.h (almost everything) needs to have CFLAGS set to
> > -DALLOW_EXPERIMENTAL_API.
> >
>
> Isn't it relevant only for the libs which are using the new tagged APIs?

Static inline functions in .h files will be added to each and every .c file;
for example, the preprocessor output for rte_pci.c, which includes rte_common.h:

# 231 "/home/pavan/Work/clean/dpdk/build/include/rte_common.h"
extern int RTE_BUILD_BUG_ON_detected_error;
# 249 "/home/pavan/Work/clean/dpdk/build/include/rte_common.h"
static inline uint32_t __attribute__((deprecated("Symbol is not yet part of stable ABI"), section(".text.experimental")))
rte_combine32ms1b(register uint32_t x)
{
 x |= x >> 1;
 x |= x >> 2;
 x |= x >> 4;
 x |= x >> 8;
 x |= x >> 16;

 return x;
}
# 271 "/home/pavan/Work/clean/dpdk/build/include/rte_common.h"
static inline uint64_t
rte_combine64ms1b(register uint64_t v)
{
 v |= v >> 1;
 v |= v >> 2;
 v |= v >> 4;
 v |= v >> 8;
 v |= v >> 16;
 v |= v >> 32;

 return v;
}

Which causes the compiler to throw an error, as -DALLOW_EXPERIMENTAL_API is not
added to CFLAGS.

>
> > Regards,
> > Pavan.

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4] ethdev: replace bus specific struct with generic dev
  @ 2018-04-04 17:57  3%           ` De Lara Guarch, Pablo
  2018-04-05  9:19  0%             ` Ferruh Yigit
  0 siblings, 1 reply; 200+ results
From: De Lara Guarch, Pablo @ 2018-04-04 17:57 UTC (permalink / raw)
  To: Yigit, Ferruh, David Marchand, santosh
  Cc: dev, Shreyansh Jain, Legacy, Allain (Wind River),
	Tomasz Duszynski, Thomas Monjalon



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ferruh Yigit
> Sent: Tuesday, April 3, 2018 10:50 AM
> To: David Marchand <david.marchand@6wind.com>; santosh
> <santosh.shukla@caviumnetworks.com>
> Cc: dev@dpdk.org; Shreyansh Jain <shreyansh.jain@nxp.com>; Legacy, Allain
> (Wind River) <allain.legacy@windriver.com>; Tomasz Duszynski
> <tdu@semihalf.com>; Thomas Monjalon <thomas@monjalon.net>
> Subject: Re: [dpdk-dev] [PATCH v4] ethdev: replace bus specific struct with
> generic dev
> 
> On 4/3/2018 10:06 AM, David Marchand wrote:
> > On Mon, Apr 2, 2018 at 6:13 PM, santosh
> > <santosh.shukla@caviumnetworks.com> wrote:
> >> On Friday 30 March 2018 08:59 PM, David Marchand wrote:
> >>> I can see we enforce the driver name by putting it after the call to
> >>> .dev_infos_get.
> >>> http://dpdk.org/browse/dpdk/tree/lib/librte_ether/rte_ethdev.c#n2399
> >>>
> >>> octeontx pmd seems to try to do something about it:
> >>> http://dpdk.org/browse/dpdk/tree/drivers/net/octeontx/octeontx_ethde
> >>> v.c#n622
> >>>
> >>> Not sure it does something, might be a thing to cleanup.
> >>>
> >>>
> >> In case, if your referring to driver_name update then indeed its a
> >> cleanup [1].
> >>
> >> Otherwise, I don't see any issue with v4 Or may be /I /misunderstood
> >> your comment.
> >
> > I agree there is no fundamental issue.
> >
> >     dev_info->device = dev->device;
> >
> >     RTE_FUNC_PTR_OR_RET(*dev->dev_ops->dev_infos_get);
> >     (*dev->dev_ops->dev_infos_get)(dev, dev_info);
> >     dev_info->driver_name = dev->device->driver->name;
> >
> > If somebody (I mean some pmd out there) has a usecase with
> > dev_info->device != dev->device, why not.
> 
> Intentionally let drivers update this variable, although I don't see any use
> case for it either.
> 
> This variable was set by PMDs before this patch, so I don't see any reason to be
> so strict here.
> 
> Whatever the driver does, ethdev will set dev_info->device for it; if it wants
> to overwrite it, for any reason, it will have the capability.

Looks good to me. Will do the same for cryptodev and bbdev.
The only thing that I am missing here is an update to the documentation,
adding the ABI change to the release notes.

Apart from it:

Acked-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>

> 
> >
> > Thomas ?
> >
> >


^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 0/4] ethdev: add per-PMD tuning of RxTx parmeters
    @ 2018-04-04 17:17  3% ` Remy Horton
    2018-04-06 14:49  4%   ` [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters Remy Horton
  1 sibling, 2 replies; 200+ results
From: Remy Horton @ 2018-04-04 17:17 UTC (permalink / raw)
  To: dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

The optimal values of several transmission & reception related parameters,
such as burst sizes, descriptor ring sizes, and number of queues, vary
between different network interface devices. This patchset allows individual
PMDs to specify their preferred parameter values and, if so indicated by an
application, has them used automatically by the ethdev layer.

rte_eth_dev_configure() has been changed so that specifying zero for both
nb_rx_q AND nb_tx_q causes it to use driver preferred values, and if these
are not available, falls back to EAL defaults. Setting one (but not both)
to zero does not cause the use of defaults, as having one of them zeroed is
a valid setup.
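
As a rough usage sketch (not part of this patchset, and assuming the final API
keeps this zero-queue convention), an application opting in to the
driver-preferred values would simply pass zero for both queue counts:

#include <string.h>
#include <rte_ethdev.h>

/* Sketch only: zero nb_rx_q and nb_tx_q request the PMD-preferred queue
 * counts, falling back to EAL defaults when the driver has no preference.
 */
static int
configure_with_pmd_defaults(uint16_t port_id)
{
	struct rte_eth_conf port_conf;

	memset(&port_conf, 0, sizeof(port_conf));

	return rte_eth_dev_configure(port_id, 0, 0, &port_conf);
}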

This patchset includes per-PMD values for e1000 and i40e but it is expected
that subsequent patchsets will cover other PMDs. A deprecation notice
covering the API/ABI change is in place.

Changes in v3:
* Changed formatting around new rte_eth_dev_info fields
* Added Doxygen documentation to struct rte_eth_dev_portconf
* Testpmd "port config all burst 0" and --burst=0 use PMD
  Rx burst recommendations.
* Added to release notes
* Rebased to 8ea081f38161

Changes in v2:
* Rebased to master
* Removed fallback values from rte_eth_dev_info_get()
* Added fallback values to rte_eth_[rt]x_queue_setup()
* Added fallback values to rte_eth_dev_configure()
* Corrected comment
* Removed deprecation notice
* Split Rx and Tx into separate structures
* Changed parameter names


Remy Horton (4):
  ethdev: add support for PMD-tuned Tx/Rx parameters
  net/e1000: add TxRx tuning parameters
  net/i40e: add TxRx tuning parameters
  testpmd: make use of per-PMD TxRx parameters

 app/test-pmd/cmdline.c                 | 31 +++++++++++++++++++++---
 app/test-pmd/parameters.c              | 38 +++++++++++++++++++++++++----
 app/test-pmd/testpmd.c                 |  5 ++--
 doc/guides/rel_notes/deprecation.rst   | 13 ----------
 doc/guides/rel_notes/release_18_05.rst |  5 ++++
 drivers/net/e1000/em_ethdev.c          |  6 +++++
 drivers/net/i40e/i40e_ethdev.c         | 33 ++++++++++++++++++++++---
 lib/librte_ether/rte_ethdev.c          | 44 +++++++++++++++++++++++++++-------
 lib/librte_ether/rte_ethdev.h          | 25 +++++++++++++++++++
 9 files changed, 165 insertions(+), 35 deletions(-)

-- 
2.9.5

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v7 02/10] crypto/virtio: support virtio device init
  @ 2018-04-04 17:03  1%   ` Jay Zhou
  0 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-04-04 17:03 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

This patch implements the initialization of the virtio crypto device.
The virtio crypto device conforms to virtio-1.0, so this patch only
supports modern mode operation.
The cryptodev is created at the virtio crypto PCI device probing stage.
The virtio_crypto_pkt_tx_burst() function is used to transmit packets in
bursts and virtio_crypto_pkt_rx_burst() is used to receive packets in bursts.
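
As a rough usage sketch (not part of this patch), applications reach these PMD
burst callbacks through the generic cryptodev burst API once the device has been
probed and its queue pairs configured; dev_id, qp_id and the ops array below are
assumed to have been set up beforehand.

#include <rte_cryptodev.h>

/* Sketch only: enqueue a burst of crypto operations and collect whatever
 * has completed so far from the same queue pair.
 */
static uint16_t
submit_and_collect(uint8_t dev_id, uint16_t qp_id,
		   struct rte_crypto_op **ops, uint16_t nb_ops)
{
	uint16_t nb_enq, nb_deq;

	nb_enq = rte_cryptodev_enqueue_burst(dev_id, qp_id, ops, nb_ops);
	nb_deq = rte_cryptodev_dequeue_burst(dev_id, qp_id, ops, nb_enq);

	return nb_deq;
}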

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Fan Zhang <roy.fan.zhang@intel.com>
---
 drivers/crypto/virtio/Makefile           |   3 +
 drivers/crypto/virtio/virtio_cryptodev.c | 247 ++++++++++++++++-
 drivers/crypto/virtio/virtio_cryptodev.h |  13 +
 drivers/crypto/virtio/virtio_logs.h      |  91 ++++++
 drivers/crypto/virtio/virtio_pci.c       | 460 +++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h       | 253 +++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h      | 137 +++++++++
 drivers/crypto/virtio/virtio_rxtx.c      |  26 ++
 drivers/crypto/virtio/virtqueue.c        |  43 +++
 drivers/crypto/virtio/virtqueue.h        | 172 ++++++++++++
 10 files changed, 1442 insertions(+), 3 deletions(-)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtio_rxtx.c
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/drivers/crypto/virtio/Makefile b/drivers/crypto/virtio/Makefile
index a3b44e9..c4727ea 100644
--- a/drivers/crypto/virtio/Makefile
+++ b/drivers/crypto/virtio/Makefile
@@ -18,6 +18,9 @@ LIBABIVER := 1
 #
 # all source are stored in SRCS-y
 #
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtqueue.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_pci.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_cryptodev.c
 
 # this lib depends upon:
diff --git a/drivers/crypto/virtio/virtio_cryptodev.c b/drivers/crypto/virtio/virtio_cryptodev.c
index 84aff58..4550834 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.c
+++ b/drivers/crypto/virtio/virtio_cryptodev.c
@@ -3,25 +3,238 @@
  */
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
 #include <rte_cryptodev_pmd.h>
+#include <rte_eal.h>
 #include "virtio_cryptodev.h"
+#include "virtqueue.h"
+
+int virtio_crypto_logtype_init;
+int virtio_crypto_logtype_session;
+int virtio_crypto_logtype_rx;
+int virtio_crypto_logtype_tx;
+int virtio_crypto_logtype_driver;
+
+/*
+ * The set of PCI devices this driver supports
+ */
+static const struct rte_pci_id pci_id_virtio_crypto_map[] = {
+	{ RTE_PCI_DEVICE(VIRTIO_CRYPTO_PCI_VENDORID,
+				VIRTIO_CRYPTO_PCI_DEVICEID) },
+	{ .vendor_id = 0, /* sentinel */ },
+};
 
 uint8_t cryptodev_virtio_driver_id;
 
+/*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+static struct rte_cryptodev_ops virtio_crypto_dev_ops = {
+	/* Device related operations */
+	.dev_configure			 = NULL,
+	.dev_start			 = NULL,
+	.dev_stop			 = NULL,
+	.dev_close			 = NULL,
+	.dev_infos_get			 = NULL,
+
+	.stats_get			 = NULL,
+	.stats_reset			 = NULL,
+
+	.queue_pair_setup                = NULL,
+	.queue_pair_release              = NULL,
+	.queue_pair_start                = NULL,
+	.queue_pair_stop                 = NULL,
+	.queue_pair_count                = NULL,
+
+	/* Crypto related operations */
+	.session_get_size	= NULL,
+	.session_configure	= NULL,
+	.session_clear		= NULL,
+	.qp_attach_session = NULL,
+	.qp_detach_session = NULL
+};
+
+static int
+virtio_negotiate_features(struct virtio_crypto_hw *hw, uint64_t req_features)
+{
+	uint64_t host_features;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Prepare guest_features: feature that driver wants to support */
+	VIRTIO_CRYPTO_INIT_LOG_DBG("guest_features before negotiate = %" PRIx64,
+		req_features);
+
+	/* Read device(host) feature bits */
+	host_features = VTPCI_OPS(hw)->get_features(hw);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("host_features before negotiate = %" PRIx64,
+		host_features);
+
+	/*
+	 * Negotiate features: Subset of device feature bits are written back
+	 * guest feature bits.
+	 */
+	hw->guest_features = req_features;
+	hw->guest_features = vtpci_cryptodev_negotiate_features(hw,
+							host_features);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("features after negotiate = %" PRIx64,
+		hw->guest_features);
+
+	if (hw->modern) {
+		if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"VIRTIO_F_VERSION_1 features is not enabled.");
+			return -1;
+		}
+		vtpci_cryptodev_set_status(hw,
+			VIRTIO_CONFIG_STATUS_FEATURES_OK);
+		if (!(vtpci_cryptodev_get_status(hw) &
+			VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR("failed to set FEATURES_OK "
+						"status!");
+			return -1;
+		}
+	}
+
+	hw->req_guest_features = req_features;
+
+	return 0;
+}
+
+/* reset device and renegotiate features if needed */
+static int
+virtio_crypto_init_device(struct rte_cryptodev *cryptodev,
+	uint64_t req_features)
+{
+	struct virtio_crypto_hw *hw = cryptodev->data->dev_private;
+	struct virtio_crypto_config local_config;
+	struct virtio_crypto_config *config = &local_config;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Reset the device although not necessary at startup */
+	vtpci_cryptodev_reset(hw);
+
+	/* Tell the host we've noticed this device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
+
+	/* Tell the host we've known how to drive the device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
+	if (virtio_negotiate_features(hw, req_features) < 0)
+		return -1;
+
+	/* Get status of the device */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, status),
+		&config->status, sizeof(config->status));
+	if (config->status != VIRTIO_CRYPTO_S_HW_READY) {
+		VIRTIO_CRYPTO_DRV_LOG_ERR("accelerator hardware is "
+				"not ready");
+		return -1;
+	}
+
+	/* Get number of data queues */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, max_dataqueues),
+		&config->max_dataqueues,
+		sizeof(config->max_dataqueues));
+	hw->max_dataqueues = config->max_dataqueues;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("hw->max_dataqueues=%d",
+		hw->max_dataqueues);
+
+	return 0;
+}
+
+/*
+ * This function is based on probe() function
+ * It returns 0 on success.
+ */
+static int
+crypto_virtio_create(const char *name, struct rte_pci_device *pci_dev,
+		struct rte_cryptodev_pmd_init_params *init_params)
+{
+	struct rte_cryptodev *cryptodev;
+	struct virtio_crypto_hw *hw;
+
+	PMD_INIT_FUNC_TRACE();
+
+	cryptodev = rte_cryptodev_pmd_create(name, &pci_dev->device,
+					init_params);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
+	cryptodev->driver_id = cryptodev_virtio_driver_id;
+	cryptodev->dev_ops = &virtio_crypto_dev_ops;
+
+	cryptodev->enqueue_burst = virtio_crypto_pkt_tx_burst;
+	cryptodev->dequeue_burst = virtio_crypto_pkt_rx_burst;
+
+	cryptodev->feature_flags = RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO |
+		RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING;
+
+	hw = cryptodev->data->dev_private;
+	hw->dev_id = cryptodev->data->dev_id;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("dev %d vendorID=0x%x deviceID=0x%x",
+		cryptodev->data->dev_id, pci_dev->id.vendor_id,
+		pci_dev->id.device_id);
+
+	/* pci device init */
+	if (vtpci_cryptodev_init(pci_dev, hw))
+		return -1;
+
+	if (virtio_crypto_init_device(cryptodev,
+			VIRTIO_CRYPTO_PMD_GUEST_FEATURES) < 0)
+		return -1;
+
+	return 0;
+}
+
 static int crypto_virtio_pci_probe(
 	struct rte_pci_driver *pci_drv __rte_unused,
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
-	return 0;
+	struct rte_cryptodev_pmd_init_params init_params = {
+		.name = "",
+		.socket_id = rte_socket_id(),
+		.private_data_size = sizeof(struct virtio_crypto_hw),
+		.max_nb_sessions = RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS
+	};
+	char name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	VIRTIO_CRYPTO_DRV_LOG_DBG("Found Crypto device at %02x:%02x.%x",
+			pci_dev->addr.bus,
+			pci_dev->addr.devid,
+			pci_dev->addr.function);
+
+	rte_pci_device_name(&pci_dev->addr, name, sizeof(name));
+
+	return crypto_virtio_create(name, pci_dev, &init_params);
 }
 
 static int crypto_virtio_pci_remove(
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
+	struct rte_cryptodev *cryptodev;
+	char cryptodev_name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	rte_pci_device_name(&pci_dev->addr, cryptodev_name,
+			sizeof(cryptodev_name));
+
+	cryptodev = rte_cryptodev_pmd_get_named_dev(cryptodev_name);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
 	return 0;
 }
 
 static struct rte_pci_driver rte_virtio_crypto_driver = {
+	.id_table = pci_id_virtio_crypto_map,
+	.drv_flags = 0,
 	.probe = crypto_virtio_pci_probe,
 	.remove = crypto_virtio_pci_remove
 };
@@ -32,3 +245,31 @@ static int crypto_virtio_pci_remove(
 RTE_PMD_REGISTER_CRYPTO_DRIVER(virtio_crypto_drv,
 	rte_virtio_crypto_driver.driver,
 	cryptodev_virtio_driver_id);
+
+RTE_INIT(virtio_crypto_init_log);
+static void
+virtio_crypto_init_log(void)
+{
+	virtio_crypto_logtype_init = rte_log_register("pmd.crypto.virtio.init");
+	if (virtio_crypto_logtype_init >= 0)
+		rte_log_set_level(virtio_crypto_logtype_init, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_session =
+		rte_log_register("pmd.crypto.virtio.session");
+	if (virtio_crypto_logtype_session >= 0)
+		rte_log_set_level(virtio_crypto_logtype_session,
+				RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_rx = rte_log_register("pmd.crypto.virtio.rx");
+	if (virtio_crypto_logtype_rx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_rx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_tx = rte_log_register("pmd.crypto.virtio.tx");
+	if (virtio_crypto_logtype_tx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_tx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_driver =
+		rte_log_register("pmd.crypto.virtio.driver");
+	if (virtio_crypto_logtype_driver >= 0)
+		rte_log_set_level(virtio_crypto_logtype_driver, RTE_LOG_NOTICE);
+}
diff --git a/drivers/crypto/virtio/virtio_cryptodev.h b/drivers/crypto/virtio/virtio_cryptodev.h
index 44517b8..392db4a 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.h
+++ b/drivers/crypto/virtio/virtio_cryptodev.h
@@ -5,6 +5,19 @@
 #ifndef _VIRTIO_CRYPTODEV_H_
 #define _VIRTIO_CRYPTODEV_H_
 
+#include <rte_cryptodev.h>
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_CRYPTO_PMD_GUEST_FEATURES (1ULL << VIRTIO_F_VERSION_1)
+
 #define CRYPTODEV_NAME_VIRTIO_PMD crypto_virtio
 
+uint16_t virtio_crypto_pkt_tx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
+uint16_t virtio_crypto_pkt_rx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* _VIRTIO_CRYPTODEV_H_ */
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..43ec1a4
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,460 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * Following macros are derived from linux/pci_regs.h, however,
+ * we can't simply include that header here, as there is no such
+ * file for non-Linux platform.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PFN register is 32bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t aval_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability,
+			 * that's why we also check if msix is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is error mapping with VFIO/UIO.
+ *   if port map error when driver type is KDRV_NONE.
+ *   if whitelisted but driver type is KDRV_UNKNOWN.
+ * Return 1 if kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Try to read the virtio pci caps, which exist only on modern
+	 * pci devices. If that fails, the device cannot be driven, as
+	 * virtio crypto does not support legacy mode.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..cd316a6
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <linux/virtio_crypto.h>
+
+#include <stdint.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure stores
+ * some infos that may vary in the multiple process model locally.
+ * For example, the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other side, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
diff --git a/drivers/crypto/virtio/virtio_rxtx.c b/drivers/crypto/virtio/virtio_rxtx.c
new file mode 100644
index 0000000..51f6e09
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_rxtx.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+#include "virtio_cryptodev.h"
+
+uint16_t
+virtio_crypto_pkt_rx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **rx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_rx = 0;
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_crypto_pkt_tx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **tx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_tx = 0;
+
+	return nb_tx;
+}
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint host
+	 * not to interrupt when it consumes packets.
+	 * Note: this is only considered a hint to the host
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..0a9bddb
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <linux/virtio_crypto.h>
+
+#include <stdint.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring*/
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notification is through an io port operation
+	 * which is a serialization instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
-- 
1.8.3.1

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v1 10/16] ethdev: add encap level to RSS flow API action
  2018-04-04 15:56  4% [dpdk-dev] [PATCH v1 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-04 15:56  7% ` [dpdk-dev] [PATCH v1 01/16] ethdev: update ABI for flow API functions Adrien Mazarguil
  2018-04-04 15:56  3% ` [dpdk-dev] [PATCH v1 05/16] ethdev: remove DUP action from flow API Adrien Mazarguil
@ 2018-04-04 15:56  2% ` Adrien Mazarguil
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
  3 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-04 15:56 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev
  Cc: Xueming Li, Wenzhuo Lu, Jingjing Wu, Beilei Xing, Qi Zhang,
	Konstantin Ananyev, Nelio Laranjeiro, Yongseok Koh,
	Andrew Rybchenko, Pascal Mazon

RSS hash types (ETH_RSS_* macros defined in rte_ethdev.h) describe the
protocol header fields of a packet that must be taken into account while
computing RSS.

When facing encapsulated (e.g. tunneled) packets, there is an ambiguity as
to whether these should apply to inner or outer packets. Applications need
the ability to tell exactly "where" RSS must be performed.

This is addressed by adding encapsulation level information to the RSS flow
action. Its default value is 0 and stands for the usual unspecified
behavior. Other values provide a specific encapsulation level.
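
For illustration only (not part of this patch), an application could then
request RSS on the inner headers of tunneled traffic along these lines,
using the rte_flow.h definitions touched by this series; the hash key and
queue list are assumed to be filled in as usual:

	struct rte_flow_action_rss rss = {
		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
		.level = 2, /* hash the inner (encapsulated) headers */
		.types = ETH_RSS_IP | ETH_RSS_UDP,
		/* key/key_len and queue/queue_num set up elsewhere */
	};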

Contrary to the change announced by commit 676b605182a5 ("doc: announce
ethdev API change for RSS configuration"), this patch does not affect
struct rte_eth_rss_conf but struct rte_flow_action_rss as the former is not
used anymore by the RSS flow action. ABI impact is therefore limited to
rte_flow.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Cc: Xueming Li <xuemingl@mellanox.com>
Cc: Ferruh Yigit <ferruh.yigit@intel.com>
Cc: Thomas Monjalon <thomas@monjalon.net>
Cc: Wenzhuo Lu <wenzhuo.lu@intel.com>
Cc: Jingjing Wu <jingjing.wu@intel.com>
Cc: Beilei Xing <beilei.xing@intel.com>
Cc: Qi Zhang <qi.z.zhang@intel.com>
Cc: Konstantin Ananyev <konstantin.ananyev@intel.com>
Cc: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
Cc: Andrew Rybchenko <arybchenko@solarflare.com>
Cc: Pascal Mazon <pascal.mazon@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 13 ++++++++++++
 app/test-pmd/config.c                       |  1 +
 doc/guides/prog_guide/rte_flow.rst          | 24 ++++++++++++++++++++++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 ++
 drivers/net/e1000/igb_flow.c                |  4 ++++
 drivers/net/e1000/igb_rxtx.c                |  2 ++
 drivers/net/i40e/i40e_ethdev.c              |  2 ++
 drivers/net/i40e/i40e_flow.c                |  4 ++++
 drivers/net/ixgbe/ixgbe_flow.c              |  4 ++++
 drivers/net/ixgbe/ixgbe_rxtx.c              |  2 ++
 drivers/net/mlx4/mlx4_flow.c                |  6 ++++++
 drivers/net/mlx5/mlx5_flow.c                | 11 ++++++++++
 drivers/net/sfc/sfc_flow.c                  |  3 +++
 drivers/net/tap/tap_flow.c                  |  6 +++++-
 lib/librte_ether/rte_flow.c                 |  1 +
 lib/librte_ether/rte_flow.h                 | 26 ++++++++++++++++++++++++
 16 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 23e10d623..2fbd3d8ef 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -167,6 +167,7 @@ enum index {
 	ACTION_COUNT,
 	ACTION_RSS,
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_FUNC_DEFAULT,
 	ACTION_RSS_FUNC_TOEPLITZ,
 	ACTION_RSS_FUNC_SIMPLE_XOR,
@@ -638,6 +639,7 @@ static const enum index action_queue[] = {
 
 static const enum index action_rss[] = {
 	ACTION_RSS_FUNC,
+	ACTION_RSS_LEVEL,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
 	ACTION_RSS_KEY_LEN,
@@ -1616,6 +1618,16 @@ static const struct token token_list[] = {
 		.help = "simple XOR hash function",
 		.call = parse_vc_action_rss_func,
 	},
+	[ACTION_RSS_LEVEL] = {
+		.name = "level",
+		.help = "encapsulation level for \"types\"",
+		.next = NEXT(action_rss, NEXT_ENTRY(UNSIGNED)),
+		.args = ARGS(ARGS_ENTRY_ARB
+			     (offsetof(struct action_rss_data, conf) +
+			      offsetof(struct rte_flow_action_rss, level),
+			      sizeof(((struct rte_flow_action_rss *)0)->
+				     level))),
+	},
 	[ACTION_RSS_TYPES] = {
 		.name = "types",
 		.help = "RSS hash types",
@@ -2107,6 +2119,7 @@ parse_vc_action_rss(struct context *ctx, const struct token *token,
 	*action_rss_data = (struct action_rss_data){
 		.conf = (struct rte_flow_action_rss){
 			.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+			.level = 0,
 			.types = rss_hf,
 			.key_len = sizeof(action_rss_data->key),
 			.queue_num = RTE_MIN(nb_rxq, ACTION_RSS_QUEUE_NUM),
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index b258c93e8..c0fefe475 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1085,6 +1085,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 6261233bc..c893d737a 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1304,6 +1304,28 @@ Note: RSS hash result is stored in the ``hash.rss`` mbuf field which
 overlaps ``hash.fdir.lo``. Since `Action: MARK`_ sets the ``hash.fdir.hi``
 field only, both can be requested simultaneously.
 
+Also, regarding packet encapsulation ``level``:
+
+- ``0`` requests the default behavior. Depending on the packet type, it can
+  mean outermost, innermost, anything in between or even no RSS.
+
+  It basically stands for the innermost encapsulation level RSS can be
+  performed on according to PMD and device capabilities.
+
+- ``1`` requests RSS to be performed on the outermost packet encapsulation
+  level.
+
+- ``2`` and subsequent values request RSS to be performed on the specified
+   inner packet encapsulation level, from outermost to innermost (lower to
+   higher values).
+
+Values other than ``0`` are not necessarily supported.
+
+Requesting a specific RSS level on unrecognized traffic results in undefined
+behavior. For predictable results, it is recommended to make the flow rule
+pattern match packet headers up to the requested encapsulation level so that
+only matching traffic goes through.
+
 .. _table_rte_flow_action_rss:
 
 .. table:: RSS
@@ -1313,6 +1335,8 @@ field only, both can be requested simultaneously.
    +===============+====================================+
    | ``func``      | RSS hash function to apply         |
    +---------------+------------------------------------+
+   | ``level``     | encapsulation level for ``types``  |
+   +---------------+------------------------------------+
    | ``types``     | RSS hash types (see ``ETH_RSS_*``) |
    +---------------+------------------------------------+
    | ``key_len``   | hash key length in bytes           |
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index d9d68ad9b..738461f44 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3401,6 +3401,8 @@ This section lists supported actions and their attributes, if any.
   - ``func {hash function}``: RSS hash function to apply, allowed tokens are
     the same as `set_hash_global_config`_.
 
+  - ``level {unsigned}``: encapsulation level for ``types``.
+
   - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
     are the same as `set_hash_input_set`_, an empty list means none (0).
 
diff --git a/drivers/net/e1000/igb_flow.c b/drivers/net/e1000/igb_flow.c
index 747c524f5..13f6f2a28 100644
--- a/drivers/net/e1000/igb_flow.c
+++ b/drivers/net/e1000/igb_flow.c
@@ -1314,6 +1314,10 @@ igb_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 18367f443..80407e6bb 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -2767,6 +2767,7 @@ igb_rss_conf_init(struct igb_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -2782,6 +2783,7 @@ igb_action_rss_same(const struct rte_flow_action_rss *comp,
 		    const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index c503d7de2..8f47039a8 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -11963,6 +11963,7 @@ i40e_rss_conf_init(struct i40e_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -11978,6 +11979,7 @@ i40e_action_rss_same(const struct rte_flow_action_rss *comp,
 		     const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index 65ee27917..1b336df74 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -4328,6 +4328,10 @@ i40e_flow_parse_rss_action(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len > RTE_DIM(rss_config->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c
index 10056a0f7..67d22b382 100644
--- a/drivers/net/ixgbe/ixgbe_flow.c
+++ b/drivers/net/ixgbe/ixgbe_flow.c
@@ -2783,6 +2783,10 @@ ixgbe_parse_rss_filter(struct rte_eth_dev *dev,
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
+			 "a nonzero RSS encapsulation level is not supported");
 	if (rss->key_len && rss->key_len != RTE_DIM(rss_conf->key))
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, act,
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 4f46eeb2b..4697ff0c0 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5530,6 +5530,7 @@ ixgbe_rss_conf_init(struct ixgbe_rte_flow_rss_conf *out,
 		return -EINVAL;
 	out->conf = (struct rte_flow_action_rss){
 		.func = in->func,
+		.level = in->level,
 		.types = in->types,
 		.key_len = in->key_len,
 		.queue_num = in->queue_num,
@@ -5545,6 +5546,7 @@ ixgbe_action_rss_same(const struct rte_flow_action_rss *comp,
 		      const struct rte_flow_action_rss *with)
 {
 	return (comp->func == with->func &&
+		comp->level == with->level &&
 		comp->types == with->types &&
 		comp->key_len == with->key_len &&
 		comp->queue_num == with->queue_num &&
diff --git a/drivers/net/mlx4/mlx4_flow.c b/drivers/net/mlx4/mlx4_flow.c
index dcaf8df44..779641e11 100644
--- a/drivers/net/mlx4/mlx4_flow.c
+++ b/drivers/net/mlx4/mlx4_flow.c
@@ -796,6 +796,11 @@ mlx4_flow_prepare(struct priv *priv,
 					" is Toeplitz";
 				goto exit_action_not_supported;
 			}
+			if (rss->level) {
+				msg = "a nonzero RSS encapsulation level is"
+					" not supported";
+				goto exit_action_not_supported;
+			}
 			rte_errno = 0;
 			fields = mlx4_conv_rss_types(priv, rss->types);
 			if (fields == (uint64_t)-1 && rte_errno) {
@@ -1290,6 +1295,7 @@ mlx4_flow_internal(struct priv *priv, struct rte_flow_error *error)
 	uint16_t queue[queues];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = -1,
 		.key_len = MLX4_RSS_HASH_KEY_SIZE,
 		.queue_num = queues,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 0771ad339..bc1176819 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -644,6 +644,14 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " function is Toeplitz");
 				return -rte_errno;
 			}
+			if (rss->level) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "a nonzero RSS encapsulation"
+						   " level is not supported");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -694,6 +702,7 @@ mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
 				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+				.level = 0,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -1927,6 +1936,7 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
 	flow->queues = (uint16_t (*)[])(flow + 1);
 	flow->rss_conf = (struct rte_flow_action_rss){
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = parser.rss_conf.types,
 		.key_len = parser.rss_conf.key_len,
 		.queue_num = parser.rss_conf.queue_num,
@@ -2442,6 +2452,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	uint16_t queue[priv->reta_idx_n];
 	struct rte_flow_action_rss action_rss = {
 		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
 		.types = priv->rss_conf.rss_hf,
 		.key_len = priv->rss_conf.rss_key_len,
 		.queue_num = priv->reta_idx_n,
diff --git a/drivers/net/sfc/sfc_flow.c b/drivers/net/sfc/sfc_flow.c
index d08ba326c..bf9609735 100644
--- a/drivers/net/sfc/sfc_flow.c
+++ b/drivers/net/sfc/sfc_flow.c
@@ -1265,6 +1265,9 @@ sfc_flow_parse_rss(struct sfc_adapter *sa,
 	if (rss->func)
 		return -EINVAL;
 
+	if (rss->level)
+		return -EINVAL;
+
 	if ((rss->types & ~SFC_RSS_OFFLOADS) != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 3d91da216..e5eb50fc5 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -2055,11 +2055,15 @@ static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
 	struct rss_key rss_entry = { .hash_fields = 0,
 				     .key_size = 0 };
 
-	/* Check supported hash functions */
+	/* Check supported RSS features */
 	if (rss->func)
 		return rte_flow_error_set
 			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 			 "non-default RSS hash functions are not supported");
+	if (rss->level)
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "a nonzero RSS encapsulation level is not supported");
 
 	/* Get a new map key for a new RSS rule */
 	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index 0a2c0ac00..1f247d656 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -331,6 +331,7 @@ flow_action_conf_copy(void *buf, const struct rte_flow_action *action)
 		if (dst.rss)
 			*dst.rss = (struct rte_flow_action_rss){
 				.func = src.rss->func,
+				.level = src.rss->level,
 				.types = src.rss->types,
 				.key_len = src.rss->key_len,
 				.queue_num = src.rss->queue_num,
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 1fc1df9c3..1b222ba60 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -1039,6 +1039,32 @@ struct rte_flow_query_count {
  */
 struct rte_flow_action_rss {
 	enum rte_eth_hash_function func; /**< RSS hash function to apply. */
+	/**
+	 * Packet encapsulation level RSS hash @p types apply to.
+	 *
+	 * - @p 0 requests the default behavior. Depending on the packet
+	 *   type, it can mean outermost, innermost, anything in between or
+	 *   even no RSS.
+	 *
+	 *   It basically stands for the innermost encapsulation level RSS
+	 *   can be performed on according to PMD and device capabilities.
+	 *
+	 * - @p 1 requests RSS to be performed on the outermost packet
+	 *   encapsulation level.
+	 *
+	 * - @p 2 and subsequent values request RSS to be performed on the
+	 *   specified inner packet encapsulation level, from outermost to
+	 *   innermost (lower to higher values).
+	 *
+	 * Values other than @p 0 are not necessarily supported.
+	 *
+	 * Requesting a specific RSS level on unrecognized traffic results
+	 * in undefined behavior. For predictable results, it is recommended
+	 * to make the flow rule pattern match packet headers up to the
+	 * requested encapsulation level so that only matching traffic goes
+	 * through.
+	 */
+	uint32_t level;
 	uint64_t types; /**< RSS hash types (see ETH_RSS_*). */
 	uint32_t key_len; /**< Hash key length in bytes. */
 	uint32_t queue_num; /**< Number of entries in @p queue. */
-- 
2.11.0

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH v1 05/16] ethdev: remove DUP action from flow API
  2018-04-04 15:56  4% [dpdk-dev] [PATCH v1 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
  2018-04-04 15:56  7% ` [dpdk-dev] [PATCH v1 01/16] ethdev: update ABI for flow API functions Adrien Mazarguil
@ 2018-04-04 15:56  3% ` Adrien Mazarguil
  2018-04-04 15:56  2% ` [dpdk-dev] [PATCH v1 10/16] ethdev: add encap level to RSS flow API action Adrien Mazarguil
  2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
  3 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-04 15:56 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

Upcoming changes in relation to the handling of actions list will make the
DUP action redundant as specifying several QUEUE actions will achieve the
same behavior. Besides, no PMD implements this action.
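
As an illustration (not part of this patch), once actions are processed in
order and may be repeated, the effect formerly achieved with DUP can be
requested by listing several QUEUE actions, e.g.:

	struct rte_flow_action_queue q6 = { .index = 6 };
	struct rte_flow_action_queue q7 = { .index = 7 };
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q6 },
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &q7 },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};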

By removing an entry from enum rte_flow_action_type, this patch triggers a
major ABI breakage.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 app/test-pmd/cmdline_flow.c                 | 23 -----------------------
 app/test-pmd/config.c                       |  1 -
 doc/guides/prog_guide/rte_flow.rst          | 23 -----------------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  8 --------
 lib/librte_ether/rte_flow.c                 |  1 -
 lib/librte_ether/rte_flow.h                 | 24 ------------------------
 6 files changed, 80 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 30450f1a4..9702b3ef3 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -164,8 +164,6 @@ enum index {
 	ACTION_QUEUE_INDEX,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
-	ACTION_DUP_INDEX,
 	ACTION_RSS,
 	ACTION_RSS_TYPES,
 	ACTION_RSS_TYPE,
@@ -625,7 +623,6 @@ static const enum index next_action[] = {
 	ACTION_QUEUE,
 	ACTION_DROP,
 	ACTION_COUNT,
-	ACTION_DUP,
 	ACTION_RSS,
 	ACTION_PF,
 	ACTION_VF,
@@ -645,12 +642,6 @@ static const enum index action_queue[] = {
 	ZERO,
 };
 
-static const enum index action_dup[] = {
-	ACTION_DUP_INDEX,
-	ACTION_NEXT,
-	ZERO,
-};
-
 static const enum index action_rss[] = {
 	ACTION_RSS_TYPES,
 	ACTION_RSS_KEY,
@@ -1597,20 +1588,6 @@ static const struct token token_list[] = {
 		.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
 		.call = parse_vc,
 	},
-	[ACTION_DUP] = {
-		.name = "dup",
-		.help = "duplicate packets to a given queue index",
-		.priv = PRIV_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
-		.next = NEXT(action_dup),
-		.call = parse_vc,
-	},
-	[ACTION_DUP_INDEX] = {
-		.name = "index",
-		.help = "queue index to duplicate packets to",
-		.next = NEXT(action_dup, NEXT_ENTRY(UNSIGNED)),
-		.args = ARGS(ARGS_ENTRY(struct rte_flow_action_dup, index)),
-		.call = parse_vc_conf,
-	},
 	[ACTION_RSS] = {
 		.name = "rss",
 		.help = "spread packets among several queues",
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 7ae0295f6..8d42ea9a9 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1049,7 +1049,6 @@ static const struct {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/doc/guides/prog_guide/rte_flow.rst b/doc/guides/prog_guide/rte_flow.rst
index 51826d04c..a237e4fd2 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -1299,26 +1299,6 @@ Query structure to retrieve and reset flow rule counters:
    | ``bytes``     | out | number of bytes through this rule |
    +---------------+-----+-----------------------------------+
 
-Action: ``DUP``
-^^^^^^^^^^^^^^^
-
-Duplicates packets to a given queue index.
-
-This is normally combined with QUEUE, however when used alone, it is
-actually similar to QUEUE + PASSTHRU.
-
-- Non-terminating by default.
-
-.. _table_rte_flow_action_dup:
-
-.. table:: DUP
-
-   +-----------+------------------------------------+
-   | Field     | Value                              |
-   +===========+====================================+
-   | ``index`` | queue index to duplicate packet to |
-   +-----------+------------------------------------+
-
 Action: ``RSS``
 ^^^^^^^^^^^^^^^
 
@@ -2010,9 +1990,6 @@ Unsupported actions
   and tagging (`Action: MARK`_ or `Action: FLAG`_) may be implemented in
   software as long as the target queue is used by a single rule.
 
-- A rule specifying both `Action: DUP`_ + `Action: QUEUE`_ may be translated
-  to two hidden rules combining `Action: QUEUE`_ and `Action: PASSTHRU`_.
-
 - When a single target queue is provided, `Action: RSS`_ can also be
   implemented through `Action: QUEUE`_.
 
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index cb6f201e1..a015d02a4 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3363,10 +3363,6 @@ actions can sometimes be combined when the end result is unambiguous::
 
 ::
 
-   drop / dup index 6 / end # same as above
-
-::
-
    queue index 6 / rss queues 6 7 8 / end # queue has no effect
 
 ::
@@ -3400,10 +3396,6 @@ This section lists supported actions and their attributes, if any.
 
 - ``count``: enable counters for this rule.
 
-- ``dup``: duplicate packets to a given queue index.
-
-  - ``index {unsigned}``: queue index to duplicate packets to.
-
 - ``rss``: spread packets among several queues.
 
   - ``types [{RSS hash type} [...]] end``: RSS hash types, allowed tokens
diff --git a/lib/librte_ether/rte_flow.c b/lib/librte_ether/rte_flow.c
index ba6feddee..db04c4f94 100644
--- a/lib/librte_ether/rte_flow.c
+++ b/lib/librte_ether/rte_flow.c
@@ -73,7 +73,6 @@ static const struct rte_flow_desc_data rte_flow_desc_action[] = {
 	MK_FLOW_ACTION(QUEUE, sizeof(struct rte_flow_action_queue)),
 	MK_FLOW_ACTION(DROP, 0),
 	MK_FLOW_ACTION(COUNT, 0),
-	MK_FLOW_ACTION(DUP, sizeof(struct rte_flow_action_dup)),
 	MK_FLOW_ACTION(RSS, sizeof(struct rte_flow_action_rss)), /* +queue[] */
 	MK_FLOW_ACTION(PF, 0),
 	MK_FLOW_ACTION(VF, sizeof(struct rte_flow_action_vf)),
diff --git a/lib/librte_ether/rte_flow.h b/lib/librte_ether/rte_flow.h
index 36fd38ffa..aab637a2c 100644
--- a/lib/librte_ether/rte_flow.h
+++ b/lib/librte_ether/rte_flow.h
@@ -961,16 +961,6 @@ enum rte_flow_action_type {
 	RTE_FLOW_ACTION_TYPE_COUNT,
 
 	/**
-	 * Duplicates packets to a given queue index.
-	 *
-	 * This is normally combined with QUEUE, however when used alone, it
-	 * is actually similar to QUEUE + PASSTHRU.
-	 *
-	 * See struct rte_flow_action_dup.
-	 */
-	RTE_FLOW_ACTION_TYPE_DUP,
-
-	/**
 	 * Similar to QUEUE, except RSS is additionally performed on packets
 	 * to spread them among several queues according to the provided
 	 * parameters.
@@ -1052,20 +1042,6 @@ struct rte_flow_query_count {
 };
 
 /**
- * RTE_FLOW_ACTION_TYPE_DUP
- *
- * Duplicates packets to a given queue index.
- *
- * This is normally combined with QUEUE, however when used alone, it is
- * actually similar to QUEUE + PASSTHRU.
- *
- * Non-terminating by default.
- */
-struct rte_flow_action_dup {
-	uint16_t index; /**< Queue index to duplicate packets to. */
-};
-
-/**
  * RTE_FLOW_ACTION_TYPE_RSS
  *
  * Similar to QUEUE, except RSS is additionally performed on packets to
-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v1 01/16] ethdev: update ABI for flow API functions
  2018-04-04 15:56  4% [dpdk-dev] [PATCH v1 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
@ 2018-04-04 15:56  7% ` Adrien Mazarguil
  2018-04-05 10:06  4%   ` Thomas Monjalon
  2018-04-04 15:56  3% ` [dpdk-dev] [PATCH v1 05/16] ethdev: remove DUP action from flow API Adrien Mazarguil
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 200+ results
From: Adrien Mazarguil @ 2018-04-04 15:56 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

Subsequent patches will modify existing types and slightly alter the
behavior of the flow API. This warrants a major ABI breakage.

While it is already taken care of for 18.05 (LIBABIVER was updated to
version 9 by a prior commit), this patch explicitly adds the affected flow
API functions as a safety measure.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 lib/librte_ether/rte_ethdev_version.map | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/librte_ether/rte_ethdev_version.map b/lib/librte_ether/rte_ethdev_version.map
index 34df6c8b5..78a6f5afb 100644
--- a/lib/librte_ether/rte_ethdev_version.map
+++ b/lib/librte_ether/rte_ethdev_version.map
@@ -203,6 +203,16 @@ DPDK_18.02 {
 
 } DPDK_17.11;
 
+DPDK_18.05 {
+	global:
+
+	rte_flow_validate;
+	rte_flow_create;
+	rte_flow_query;
+	rte_flow_copy;
+
+} DPDK_18.02;
+
 EXPERIMENTAL {
 	global:
 
-- 
2.11.0

^ permalink raw reply	[relevance 7%]

* [dpdk-dev] [PATCH v1 00/16] Flow API overhaul for switch offloads
@ 2018-04-04 15:56  4% Adrien Mazarguil
  2018-04-04 15:56  7% ` [dpdk-dev] [PATCH v1 01/16] ethdev: update ABI for flow API functions Adrien Mazarguil
                   ` (3 more replies)
  0 siblings, 4 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-04 15:56 UTC (permalink / raw)
  To: Thomas Monjalon, Ferruh Yigit, dev

As summarized in a prior RFC [1], the flow API (rte_flow) was chosen as a
means to manage switch offloads supported by many devices (usually going by
names such as E-Switch or vSwitch) through user-specified flow rules.

Combined with the need to support encap/decap actions, this requires a
change in the way flow actions are processed (in order and possibly
repeated) which modifies the behavior of some of the existing actions, thus
warranting a major ABI breakage.

Given this ABI breakage is also required by other work submitted for the
current release [2][3], this series addresses various longstanding issues
with the flow API and makes minor improvements in preparation for upcoming
features.

Changes summary:

- Additional error types.
- Clearer documentation.
- Improved C++ compatibility.
- Exhaustive RSS action.
- Consistent behavior of VLAN pattern item.
- New "transfer" attribute bringing consistency to VF/PF pattern items.
- Confusing "PORT" pattern item renamed "PHY_PORT", with new action
  counterpart.
- New "PORT_ID" pattern item and action to be used with port representors.

This series piggybacks on the major ABI update introduced by a prior
commit [4] for DPDK 18.05 and depends on several fixes [5] which must be
applied first.

[1] "[RFC] Switch device offload with DPDK"
    http://dpdk.org/ml/archives/dev/2018-March/092513.html

[2] commit 676b605182a5 ("doc: announce ethdev API change for RSS
    configuration")

[3] "[PATCH v1 00/21] MLX5 tunnel Rx offloading"
    http://dpdk.org/ml/archives/dev/2018-March/092264.html

[4] commit 653e038efc9b ("ethdev: remove versioning of filter control
    function")

[5] "[PATCH v2 00/13] Bunch of flow API-related fixes"
    http://dpdk.org/ml/archives/dev/2018-April/095273.html

Adrien Mazarguil (16):
  ethdev: update ABI for flow API functions
  ethdev: add error types to flow API
  ethdev: clarify flow API pattern items and actions
  doc: remove flow API migration section
  ethdev: remove DUP action from flow API
  ethdev: alter behavior of flow API actions
  ethdev: remove C99 flexible arrays from flow API
  ethdev: flatten RSS configuration in flow API
  ethdev: add hash function to RSS flow API action
  ethdev: add encap level to RSS flow API action
  ethdev: refine TPID handling in flow API
  ethdev: add transfer attribute to flow API
  ethdev: update behavior of VF/PF in flow API
  ethdev: rename physical port item in flow API
  ethdev: add physical port action to flow API
  ethdev: add port ID item and action to flow API

 app/test-pmd/cmdline_flow.c                 | 405 ++++++++++-----
 app/test-pmd/config.c                       |  78 +--
 doc/guides/nics/tap.rst                     |   2 +-
 doc/guides/prog_guide/rte_flow.rst          | 601 ++++++++---------------
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |  54 +-
 drivers/net/bnxt/bnxt_filter.c              |  52 +-
 drivers/net/e1000/e1000_ethdev.h            |  13 +-
 drivers/net/e1000/igb_ethdev.c              |   4 +-
 drivers/net/e1000/igb_flow.c                |  83 +++-
 drivers/net/e1000/igb_rxtx.c                |  55 ++-
 drivers/net/enic/enic_flow.c                |  52 +-
 drivers/net/i40e/i40e_ethdev.c              |  57 ++-
 drivers/net/i40e/i40e_ethdev.h              |  15 +-
 drivers/net/i40e/i40e_flow.c                | 144 ++++--
 drivers/net/ixgbe/ixgbe_ethdev.c            |   4 +-
 drivers/net/ixgbe/ixgbe_ethdev.h            |  13 +-
 drivers/net/ixgbe/ixgbe_flow.c              |  91 +++-
 drivers/net/ixgbe/ixgbe_rxtx.c              |  55 ++-
 drivers/net/mlx4/mlx4.c                     |   2 +-
 drivers/net/mlx4/mlx4_flow.c                | 117 +++--
 drivers/net/mlx4/mlx4_flow.h                |   2 +-
 drivers/net/mlx4/mlx4_rxq.c                 |   2 +-
 drivers/net/mlx4/mlx4_rxtx.h                |   2 +-
 drivers/net/mlx5/mlx5_flow.c                | 317 ++++++------
 drivers/net/mlx5/mlx5_rxq.c                 |  22 +-
 drivers/net/mlx5/mlx5_rxtx.h                |  26 +-
 drivers/net/mvpp2/mrvl_flow.c               |  33 +-
 drivers/net/sfc/sfc_flow.c                  |  82 +++-
 drivers/net/tap/tap_flow.c                  |  51 +-
 examples/ipsec-secgw/ipsec.c                |  21 +-
 lib/librte_ether/rte_ethdev_version.map     |  10 +
 lib/librte_ether/rte_flow.c                 |  68 +--
 lib/librte_ether/rte_flow.h                 | 328 ++++++++-----
 33 files changed, 1747 insertions(+), 1114 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [PATCH v3 04/11] mempool: add op to calculate memory size to be allocated
  @ 2018-04-04 15:08  0%     ` santosh
  2018-04-06 15:51  0%     ` Olivier Matz
  2018-04-12 15:22  0%     ` Burakov, Anatoly
  2 siblings, 0 replies; 200+ results
From: santosh @ 2018-04-04 15:08 UTC (permalink / raw)
  To: Andrew Rybchenko, dev; +Cc: Olivier MATZ


On Monday 26 March 2018 09:39 PM, Andrew Rybchenko wrote:
> Size of memory chunk required to populate mempool objects depends
> on how objects are stored in the memory. Different mempool drivers
> may have different requirements and a new operation allows to
> calculate memory size in accordance with driver requirements and
> advertise requirements on minimum memory chunk size and alignment
> in a generic way.
>
> Bump ABI version since the patch breaks it.
>
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
> ---

Acked-by: Santosh Shukla <Santosh.Shukla@caviumnetworks.com>

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v2 12/13] ethdev: fix ABI version in meson build
  2018-04-04 14:57  3% ` [dpdk-dev] [PATCH v2 00/13] " Adrien Mazarguil
@ 2018-04-04 14:58  4%   ` Adrien Mazarguil
  2018-04-06 13:22  3%   ` [dpdk-dev] [PATCH v3 00/11] Bunch of flow API-related fixes Adrien Mazarguil
  1 sibling, 0 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-04 14:58 UTC (permalink / raw)
  To: dev; +Cc: Kirill Rybalchenko

Must remain synchronized with its Makefile counterpart.
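
For reference, the Makefile counterpart is the "LIBABIVER := 9" assignment
in lib/librte_ether/Makefile, already updated by the commit below.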

Fixes: 653e038efc9b ("ethdev: remove versioning of filter control function")
Cc: Kirill Rybalchenko <kirill.rybalchenko@intel.com>

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 lib/librte_ether/meson.build | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_ether/meson.build b/lib/librte_ether/meson.build
index 7fed86056..12bdb6b61 100644
--- a/lib/librte_ether/meson.build
+++ b/lib/librte_ether/meson.build
@@ -2,7 +2,7 @@
 # Copyright(c) 2017 Intel Corporation
 
 name = 'ethdev'
-version = 8
+version = 9
 allow_experimental_apis = true
 sources = files('ethdev_profile.c',
 	'rte_ethdev.c',
-- 
2.11.0

^ permalink raw reply	[relevance 4%]

* [dpdk-dev] [PATCH v2 00/13] Bunch of flow API-related fixes
  @ 2018-04-04 14:57  3% ` Adrien Mazarguil
  2018-04-04 14:58  4%   ` [dpdk-dev] [PATCH v2 12/13] ethdev: fix ABI version in meson build Adrien Mazarguil
  2018-04-06 13:22  3%   ` [dpdk-dev] [PATCH v3 00/11] Bunch of flow API-related fixes Adrien Mazarguil
  0 siblings, 2 replies; 200+ results
From: Adrien Mazarguil @ 2018-04-04 14:57 UTC (permalink / raw)
  To: dev

This series contains several fixes for rte_flow and its implementation in
PMDs and testpmd. Upcoming work on the flow API depends on it.

v2 changes:

- mlx5 fix (patch #3).
- bnxt fix (patch #4).
- sfc fix (patch #6).
- Missing include (patch #13).

Adrien Mazarguil (13):
  net/mlx4: fix RSS resource leak in case of error
  net/mlx4: fix ignored RSS hash types
  net/mlx5: fix RSS flow action bounds check
  net/bnxt: fix matching of flow API item masks
  net/sfc: fix endian conversions in flow API
  app/testpmd: fix flow completion for RSS queues
  app/testpmd: fix lack of flow action configuration
  app/testpmd: fix RSS flow action configuration
  app/testpmd: fix missing RSS fields in flow action
  ethdev: fix shallow copy of flow API RSS action
  ethdev: fix missing boolean values in flow command
  ethdev: fix ABI version in meson build
  ethdev: fix missing include in flow API

 app/test-pmd/cmdline_flow.c                 | 255 ++++++++++++++++++++---
 app/test-pmd/config.c                       | 160 +++++++++-----
 app/test-pmd/testpmd.h                      |  13 ++
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   8 +
 drivers/net/bnxt/bnxt_filter.c              |  14 +-
 drivers/net/mlx4/mlx4_flow.c                |  17 +-
 drivers/net/mlx5/mlx5_flow.c                |   9 +
 drivers/net/sfc/sfc_flow.c                  |  13 +-
 lib/librte_ether/meson.build                |   2 +-
 lib/librte_ether/rte_flow.c                 | 145 +++++++++----
 lib/librte_ether/rte_flow.h                 |   2 +
 11 files changed, 503 insertions(+), 135 deletions(-)

-- 
2.11.0

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH 12/13] eal: replace rte_panic instances in init sequence
  2018-04-04 11:27  3% [dpdk-dev] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (4 preceding siblings ...)
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 11/13] eal: replace rte_panic instances in ethdev Arnon Warshavsky
@ 2018-04-04 11:27  2% ` Arnon Warshavsky
  5 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-04 11:27 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Functions local to this file that change from void to int do not break
the ABI. For the single function whose return type cannot change from
void to int for ABI reasons, and which is called from only one place,
a state variable is added and checked right after the call to that
function.
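
A minimal sketch of that pattern (rte_move_to_panic_state() and
rte_get_panic_state() are introduced by this patch; the function names
below are hypothetical):

	/* inside the void function whose signature cannot change */
	if (init_failed)
		rte_move_to_panic_state();

	/* at its only call site, immediately after the call */
	legacy_void_init();
	if (rte_get_panic_state())
		return -1;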

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_eal/bsdapp/eal/eal.c           |  87 ++++++++++++++-------
 lib/librte_eal/bsdapp/eal/eal_thread.c    |  65 +++++++++++-----
 lib/librte_eal/common/eal_common_launch.c |  21 ++++++
 lib/librte_eal/common/include/rte_debug.h |  12 +++
 lib/librte_eal/linuxapp/eal/eal.c         | 121 ++++++++++++++++++++----------
 lib/librte_eal/linuxapp/eal/eal_thread.c  |  65 +++++++++++-----
 6 files changed, 272 insertions(+), 99 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 4eafcb5..f6aa3b2 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -150,7 +150,7 @@ enum rte_iova_mode
  * We also don't lock the whole file, so that in future we can use read-locks
  * on other parts, e.g. memzones, to detect if there are running secondary
  * processes. */
-static void
+static int
 rte_eal_config_create(void)
 {
 	void *rte_mem_cfg_addr;
@@ -159,60 +159,79 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	if (mem_cfg_fd < 0){
 		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+					__func__, pathname);
+			return -1;
+		}
 	}
 
 	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
 	if (retval < 0){
 		close(mem_cfg_fd);
-		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
 	if (retval < 0){
 		close(mem_cfg_fd);
-		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
-				"process running?\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'."
+				" Is another primary process running?\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 
 	if (rte_mem_cfg_addr == MAP_FAILED){
-		rte_panic("Cannot mmap memory for rte_config\n");
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
+				__func__);
+		return -1;
 	}
 	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
 	rte_config.mem_config = rte_mem_cfg_addr;
+
+	return 0;
 }
 
 /* attach to an existing shared memory config */
-static void
+static int
 rte_eal_config_attach(void)
 {
 	void *rte_mem_cfg_addr;
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	if (mem_cfg_fd < 0){
 		mem_cfg_fd = open(pathname, O_RDWR);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+					__func__, pathname);
+			return -1;
+		}
 	}
 
 	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 	close(mem_cfg_fd);
-	if (rte_mem_cfg_addr == MAP_FAILED)
-		rte_panic("Cannot mmap memory for rte_config\n");
+	if (rte_mem_cfg_addr == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
+				__func__);
+		return -1;
+	}
 
 	rte_config.mem_config = rte_mem_cfg_addr;
+
+	return 0;
 }
 
 /* Detect if we are a primary or a secondary process */
@@ -236,23 +255,28 @@ enum rte_proc_type_t
 }
 
 /* Sets up rte_config structure with the pointer to shared memory config.*/
-static void
+static int
 rte_config_init(void)
 {
 	rte_config.process_type = internal_config.process_type;
 
 	switch (rte_config.process_type){
 	case RTE_PROC_PRIMARY:
-		rte_eal_config_create();
+		if (rte_eal_config_create())
+			return -1;
 		break;
 	case RTE_PROC_SECONDARY:
-		rte_eal_config_attach();
+		if (rte_eal_config_attach())
+			return -1;
 		rte_eal_mcfg_wait_complete(rte_config.mem_config);
 		break;
 	case RTE_PROC_AUTO:
 	case RTE_PROC_INVALID:
-		rte_panic("Invalid process type\n");
+		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
+				__func__, rte_config.process_type);
+		return -1;
 	}
+	return 0;
 }
 
 /* display usage */
@@ -583,7 +607,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	rte_srand(rte_rdtsc());
 
-	rte_config_init();
+	if (rte_config_init() != 0)
+		return -1;
 
 	if (rte_mp_channel_init() < 0) {
 		rte_eal_init_alert("failed to init mp channel\n");
@@ -630,7 +655,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	eal_check_mem_on_local_socket();
 
-	eal_thread_init_master(rte_config.master_lcore);
+	if (eal_thread_init_master(rte_config.master_lcore) != 0)
+		return -1;
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -644,18 +670,27 @@ static void rte_eal_init_alert(const char *msg)
 		 * create communication pipes between master thread
 		 * and children
 		 */
-		if (pipe(lcore_config[i].pipe_master2slave) < 0)
-			rte_panic("Cannot create pipe\n");
-		if (pipe(lcore_config[i].pipe_slave2master) < 0)
-			rte_panic("Cannot create pipe\n");
+		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
+		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
 
 		lcore_config[i].state = WAIT;
 
 		/* create a thread for each lcore */
 		ret = pthread_create(&lcore_config[i].thread_id, NULL,
 				     eal_thread_loop, NULL);
-		if (ret != 0)
-			rte_panic("Cannot create thread\n");
+		if (ret != 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
+					__func__);
+			return -1;
+		}
 
 		/* Set thread_name for aid in debugging. */
 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
diff --git a/lib/librte_eal/bsdapp/eal/eal_thread.c b/lib/librte_eal/bsdapp/eal/eal_thread.c
index d602daf..5c3947c 100644
--- a/lib/librte_eal/bsdapp/eal/eal_thread.c
+++ b/lib/librte_eal/bsdapp/eal/eal_thread.c
@@ -51,16 +51,22 @@
 	n = 0;
 	while (n == 0 || (n < 0 && errno == EINTR))
 		n = write(m2s, &c, 1);
-	if (n < 0)
-		rte_panic("cannot write on configuration pipe\n");
+	if (n < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	/* wait ack */
 	do {
 		n = read(s2m, &c, 1);
 	} while (n < 0 && errno == EINTR);
 
-	if (n <= 0)
-		rte_panic("cannot read on configuration pipe\n");
+	if (n <= 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	return 0;
 }
@@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		rte_move_to_panic_state();
+	}
+}
+
+/* move to panic state and do not return */
+static __attribute__((noreturn)) void
+defunct_and_remain_in_endless_loop(void)
+{
+	rte_move_to_panic_state();
+	while (1)
+		sleep(1);
 }
 
 /* main loop of threads */
@@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
 		if (thread_id == lcore_config[lcore_id].thread_id)
 			break;
 	}
-	if (lcore_id == RTE_MAX_LCORE)
-		rte_panic("cannot retrieve lcore id\n");
+	if (lcore_id == RTE_MAX_LCORE) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
+				__func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	m2s = lcore_config[lcore_id].pipe_master2slave[0];
 	s2m = lcore_config[lcore_id].pipe_slave2master[1];
@@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
 			n = read(m2s, &c, 1);
 		} while (n < 0 && errno == EINTR);
 
-		if (n <= 0)
-			rte_panic("cannot read on configuration pipe\n");
+		if (n <= 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		lcore_config[lcore_id].state = RUNNING;
 
@@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
 		n = 0;
 		while (n == 0 || (n < 0 && errno == EINTR))
 			n = write(s2m, &c, 1);
-		if (n < 0)
-			rte_panic("cannot write on configuration pipe\n");
-
-		if (lcore_config[lcore_id].f == NULL)
-			rte_panic("NULL function pointer\n");
+		if (n < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
+
+		if (lcore_config[lcore_id].f == NULL) {
+			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		/* call the function and store the return value */
 		fct_arg = lcore_config[lcore_id].arg;
diff --git a/lib/librte_eal/common/eal_common_launch.c b/lib/librte_eal/common/eal_common_launch.c
index fe0ba3f..6f8bd46 100644
--- a/lib/librte_eal/common/eal_common_launch.c
+++ b/lib/librte_eal/common/eal_common_launch.c
@@ -14,6 +14,7 @@
 #include <rte_pause.h>
 #include <rte_per_lcore.h>
 #include <rte_lcore.h>
+#include <rte_debug.h>
 
 /*
  * Wait until a lcore finished its job.
@@ -88,3 +89,23 @@ enum rte_lcore_state_t
 		rte_eal_wait_lcore(lcore_id);
 	}
 }
+
+/* panic state */
+static int _panic_state;
+
+/**
+ * Check if the system is in panic state
+ * @return int
+ */
+int rte_get_panic_state(void)
+{
+	return _panic_state;
+}
+
+/**
+ * Move the system to be in panic state
+ */
+void rte_move_to_panic_state(void)
+{
+	_panic_state = 1;
+}
diff --git a/lib/librte_eal/common/include/rte_debug.h b/lib/librte_eal/common/include/rte_debug.h
index 272df49..b421d33 100644
--- a/lib/librte_eal/common/include/rte_debug.h
+++ b/lib/librte_eal/common/include/rte_debug.h
@@ -79,4 +79,16 @@ void __rte_panic(const char *funcname , const char *format, ...)
 }
 #endif
 
+/**
+ * Check if the system is in panic state
+ * @return int
+ */
+int rte_get_panic_state(void);
+
+/**
+ * Move the system to be in panic state
+ */
+void rte_move_to_panic_state(void);
+
+
 #endif /* _RTE_DEBUG_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 2ecd07b..b7b950a 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -160,7 +160,7 @@ enum rte_iova_mode
  * We also don't lock the whole file, so that in future we can use read-locks
  * on other parts, e.g. memzones, to detect if there are running secondary
  * processes. */
-static void
+static int
 rte_eal_config_create(void)
 {
 	void *rte_mem_cfg_addr;
@@ -169,7 +169,7 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	/* map the config before hugepage address so that we don't waste a page */
 	if (internal_config.base_virtaddr != 0)
@@ -179,30 +179,39 @@ enum rte_iova_mode
 	else
 		rte_mem_cfg_addr = NULL;
 
-	if (mem_cfg_fd < 0){
+	if (mem_cfg_fd < 0) {
 		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for "
+					"rte_mem_config\n", __func__, pathname);
+			return -1;
+		}
 	}
 
 	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
-	if (retval < 0){
+	if (retval < 0) {
 		close(mem_cfg_fd);
-		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
-	if (retval < 0){
+	if (retval < 0) {
 		close(mem_cfg_fd);
-		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
-				"process running?\n", pathname);
+		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'."
+				" Is another primary process running?\n",
+				__func__, pathname);
+		return -1;
 	}
 
 	rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
 				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
 
-	if (rte_mem_cfg_addr == MAP_FAILED){
-		rte_panic("Cannot mmap memory for rte_config\n");
+	if (rte_mem_cfg_addr == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+				"rte_config\n", __func__);
+		return -1;
 	}
 	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
 	rte_config.mem_config = rte_mem_cfg_addr;
@@ -211,10 +220,11 @@ enum rte_iova_mode
 	 * processes could later map the config into this exact location */
 	rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
 
+	return 0;
 }
 
 /* attach to an existing shared memory config */
-static void
+static int
 rte_eal_config_attach(void)
 {
 	struct rte_mem_config *mem_config;
@@ -222,33 +232,41 @@ enum rte_iova_mode
 	const char *pathname = eal_runtime_config_path();
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
-	if (mem_cfg_fd < 0){
+	if (mem_cfg_fd < 0) {
 		mem_cfg_fd = open(pathname, O_RDWR);
-		if (mem_cfg_fd < 0)
-			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+		if (mem_cfg_fd < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
+						__func__, pathname);
+			return -1;
+		}
 	}
 
 	/* map it as read-only first */
 	mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
 			PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
-	if (mem_config == MAP_FAILED)
-		rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
-			  errno, strerror(errno));
+	if (mem_config == MAP_FAILED) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+				"rte_config! error %i (%s)\n",
+				__func__, errno, strerror(errno));
+		return -1;
+	}
 
 	rte_config.mem_config = mem_config;
+
+	return 0;
 }
 
 /* reattach the shared config at exact memory location primary process has it */
-static void
+static int
 rte_eal_config_reattach(void)
 {
 	struct rte_mem_config *mem_config;
 	void *rte_mem_cfg_addr;
 
 	if (internal_config.no_shconf)
-		return;
+		return 0;
 
 	/* save the address primary process has mapped shared config to */
 	rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
@@ -263,16 +281,21 @@ enum rte_iova_mode
 	if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
 		if (mem_config != MAP_FAILED)
 			/* errno is stale, don't use */
-			rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]"
-				  " - please use '--base-virtaddr' option\n",
-				  rte_mem_cfg_addr, mem_config);
+			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+					"rte_config at [%p], got [%p] - please use "
+					"'--base-virtaddr' option\n",
+					__func__, rte_mem_cfg_addr, mem_config);
 		else
-			rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
-				  errno, strerror(errno));
+			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
+					"rte_config! error %i (%s)\n",
+					__func__, errno, strerror(errno));
+		return -1;
 	}
 	close(mem_cfg_fd);
 
 	rte_config.mem_config = mem_config;
+
+	return 0;
 }
 
 /* Detect if we are a primary or a secondary process */
@@ -296,24 +319,31 @@ enum rte_proc_type_t
 }
 
 /* Sets up rte_config structure with the pointer to shared memory config.*/
-static void
+static int
 rte_config_init(void)
 {
 	rte_config.process_type = internal_config.process_type;
 
 	switch (rte_config.process_type){
 	case RTE_PROC_PRIMARY:
-		rte_eal_config_create();
+		if (rte_eal_config_create() != 0)
+			return -1;
 		break;
 	case RTE_PROC_SECONDARY:
-		rte_eal_config_attach();
+		if (rte_eal_config_attach() != 0)
+			return -1;
 		rte_eal_mcfg_wait_complete(rte_config.mem_config);
-		rte_eal_config_reattach();
+		if (rte_eal_config_reattach() != 0)
+			return -1;
 		break;
 	case RTE_PROC_AUTO:
 	case RTE_PROC_INVALID:
-		rte_panic("Invalid process type\n");
+		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
+				__func__, rte_config.process_type);
+		return -1;
 	}
+
+	return 0;
 }
 
 /* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
@@ -827,7 +857,8 @@ static void rte_eal_init_alert(const char *msg)
 
 	rte_srand(rte_rdtsc());
 
-	rte_config_init();
+	if (rte_config_init() != 0)
+		return -1;
 
 	if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
 		rte_eal_init_alert("Cannot init logging.");
@@ -890,6 +921,9 @@ static void rte_eal_init_alert(const char *msg)
 
 	eal_thread_init_master(rte_config.master_lcore);
 
+	if (rte_get_panic_state())
+		return -1;
+
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
 	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
@@ -907,18 +941,27 @@ static void rte_eal_init_alert(const char *msg)
 		 * create communication pipes between master thread
 		 * and children
 		 */
-		if (pipe(lcore_config[i].pipe_master2slave) < 0)
-			rte_panic("Cannot create pipe\n");
-		if (pipe(lcore_config[i].pipe_slave2master) < 0)
-			rte_panic("Cannot create pipe\n");
+		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
+		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
+					__func__);
+			return -1;
+		}
 
 		lcore_config[i].state = WAIT;
 
 		/* create a thread for each lcore */
 		ret = pthread_create(&lcore_config[i].thread_id, NULL,
 				     eal_thread_loop, NULL);
-		if (ret != 0)
-			rte_panic("Cannot create thread\n");
+		if (ret != 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
+					__func__);
+			return -1;
+		}
 
 		/* Set thread_name for aid in debugging. */
 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
index 08e150b..3afcee5 100644
--- a/lib/librte_eal/linuxapp/eal/eal_thread.c
+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c
@@ -51,16 +51,22 @@
 	n = 0;
 	while (n == 0 || (n < 0 && errno == EINTR))
 		n = write(m2s, &c, 1);
-	if (n < 0)
-		rte_panic("cannot write on configuration pipe\n");
+	if (n < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	/* wait ack */
 	do {
 		n = read(s2m, &c, 1);
 	} while (n < 0 && errno == EINTR);
 
-	if (n <= 0)
-		rte_panic("cannot read on configuration pipe\n");
+	if (n <= 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+				__func__);
+		return -1;
+	}
 
 	return 0;
 }
@@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		rte_move_to_panic_state();
+	}
+}
+
+/* move to panic state and do not return */
+static __attribute__((noreturn)) void
+defunct_and_remain_in_endless_loop(void)
+{
+	rte_move_to_panic_state();
+	while (1)
+		sleep(1);
 }
 
 /* main loop of threads */
@@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
 		if (thread_id == lcore_config[lcore_id].thread_id)
 			break;
 	}
-	if (lcore_id == RTE_MAX_LCORE)
-		rte_panic("cannot retrieve lcore id\n");
+	if (lcore_id == RTE_MAX_LCORE) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
+				__func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	m2s = lcore_config[lcore_id].pipe_master2slave[0];
 	s2m = lcore_config[lcore_id].pipe_slave2master[1];
@@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
 	RTE_PER_LCORE(_lcore_id) = lcore_id;
 
 	/* set CPU affinity */
-	if (eal_thread_set_affinity() < 0)
-		rte_panic("cannot set affinity\n");
+	if (eal_thread_set_affinity() < 0) {
+		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
+		defunct_and_remain_in_endless_loop();
+	}
 
 	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
 
@@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
 			n = read(m2s, &c, 1);
 		} while (n < 0 && errno == EINTR);
 
-		if (n <= 0)
-			rte_panic("cannot read on configuration pipe\n");
+		if (n <= 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		lcore_config[lcore_id].state = RUNNING;
 
@@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
 		n = 0;
 		while (n == 0 || (n < 0 && errno == EINTR))
 			n = write(s2m, &c, 1);
-		if (n < 0)
-			rte_panic("cannot write on configuration pipe\n");
-
-		if (lcore_config[lcore_id].f == NULL)
-			rte_panic("NULL function pointer\n");
+		if (n < 0) {
+			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
+
+		if (lcore_config[lcore_id].f == NULL) {
+			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
+					__func__);
+			defunct_and_remain_in_endless_loop();
+		}
 
 		/* call the function and store the return value */
 		fct_arg = lcore_config[lcore_id].arg;
-- 
1.8.3.1

^ permalink raw reply	[relevance 2%]

* [dpdk-dev] [PATCH 11/13] eal: replace rte_panic instances in ethdev
  2018-04-04 11:27  3% [dpdk-dev] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (3 preceding siblings ...)
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 06/13] kni: replace rte_panic instances in kni Arnon Warshavsky
@ 2018-04-04 11:27  3% ` Arnon Warshavsky
  2018-04-04 11:27  2% ` [dpdk-dev] [PATCH 12/13] eal: replace rte_panic instances in init sequence Arnon Warshavsky
  5 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-04 11:27 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

The modified function is local to this file,
so changing it from void to int does not break the ABI.

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_ether/rte_ethdev.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 2c74f7e..57e1e6b 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -194,7 +194,7 @@ enum {
 	return port_id;
 }
 
-static void
+static int
 rte_eth_dev_shared_data_prepare(void)
 {
 	const unsigned flags = 0;
@@ -210,8 +210,12 @@ enum {
 					rte_socket_id(), flags);
 		} else
 			mz = rte_memzone_lookup(MZ_RTE_ETH_DEV_DATA);
-		if (mz == NULL)
-			rte_panic("Cannot allocate ethdev shared data\n");
+		if (mz == NULL) {
+			rte_spinlock_unlock(&rte_eth_shared_data_lock);
+			RTE_LOG(CRIT, EAL, "%s(): Cannot allocate ethdev shared data\n",
+					__func__);
+			return -1;
+		}
 
 		rte_eth_dev_shared_data = mz->addr;
 		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
@@ -224,6 +228,8 @@ enum {
 	}
 
 	rte_spinlock_unlock(&rte_eth_shared_data_lock);
+
+	return 0;
 }
 
 struct rte_eth_dev *
@@ -274,7 +280,8 @@ struct rte_eth_dev *
 	uint16_t port_id;
 	struct rte_eth_dev *eth_dev = NULL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return NULL;
 
 	/* Synchronize port creation between primary and secondary threads. */
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
@@ -317,7 +324,8 @@ struct rte_eth_dev *
 	uint16_t i;
 	struct rte_eth_dev *eth_dev = NULL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return NULL;
 
 	/* Synchronize port attachment to primary port creation and release. */
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
@@ -345,7 +353,8 @@ struct rte_eth_dev *
 	if (eth_dev == NULL)
 		return -EINVAL;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -399,7 +408,8 @@ struct rte_eth_dev *
 int __rte_experimental
 rte_eth_dev_owner_new(uint64_t *owner_id)
 {
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -450,7 +460,8 @@ struct rte_eth_dev *
 {
 	int ret;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -467,7 +478,8 @@ struct rte_eth_dev *
 			{.id = RTE_ETH_DEV_NO_OWNER, .name = ""};
 	int ret;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -482,7 +494,8 @@ struct rte_eth_dev *
 {
 	uint16_t port_id;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
@@ -502,7 +515,8 @@ struct rte_eth_dev *
 {
 	int ret = 0;
 
-	rte_eth_dev_shared_data_prepare();
+	if (rte_eth_dev_shared_data_prepare() != 0)
+		return -1;
 
 	rte_spinlock_lock(&rte_eth_dev_shared_data->ownership_lock);
 
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH 06/13] kni: replace rte_panic instances in kni
  2018-04-04 11:27  3% [dpdk-dev] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
                   ` (2 preceding siblings ...)
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 04/13] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
@ 2018-04-04 11:27  3% ` Arnon Warshavsky
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 11/13] eal: replace rte_panic instances in ethdev Arnon Warshavsky
  2018-04-04 11:27  2% ` [dpdk-dev] [PATCH 12/13] eal: replace rte_panic instances in init sequence Arnon Warshavsky
  5 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-04 11:27 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Replace panic calls with a log message and a return value.
The modified function is local to this file,
so changing it from void to int does not break the ABI.

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 lib/librte_kni/rte_kni.c      | 18 ++++++++++++------
 lib/librte_kni/rte_kni_fifo.h | 11 ++++++++---
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 2867411..54050c8 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -353,37 +353,43 @@ struct rte_kni *
 	/* TX RING */
 	mz = slot->m_tx_q;
 	ctx->tx_q = mz->addr;
-	kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.tx_phys = mz->phys_addr;
 
 	/* RX RING */
 	mz = slot->m_rx_q;
 	ctx->rx_q = mz->addr;
-	kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.rx_phys = mz->phys_addr;
 
 	/* ALLOC RING */
 	mz = slot->m_alloc_q;
 	ctx->alloc_q = mz->addr;
-	kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->alloc_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.alloc_phys = mz->phys_addr;
 
 	/* FREE RING */
 	mz = slot->m_free_q;
 	ctx->free_q = mz->addr;
-	kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->free_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.free_phys = mz->phys_addr;
 
 	/* Request RING */
 	mz = slot->m_req_q;
 	ctx->req_q = mz->addr;
-	kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->req_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.req_phys = mz->phys_addr;
 
 	/* Response RING */
 	mz = slot->m_resp_q;
 	ctx->resp_q = mz->addr;
-	kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX);
+	if (kni_fifo_init(ctx->resp_q, KNI_FIFO_COUNT_MAX))
+		return NULL;
 	dev_info.resp_phys = mz->phys_addr;
 
 	/* Req/Resp sync mem area */
diff --git a/lib/librte_kni/rte_kni_fifo.h b/lib/librte_kni/rte_kni_fifo.h
index ac26a8c..5052015 100644
--- a/lib/librte_kni/rte_kni_fifo.h
+++ b/lib/librte_kni/rte_kni_fifo.h
@@ -7,17 +7,22 @@
 /**
  * Initializes the kni fifo structure
  */
-static void
+static int
 kni_fifo_init(struct rte_kni_fifo *fifo, unsigned size)
 {
 	/* Ensure size is power of 2 */
-	if (size & (size - 1))
-		rte_panic("KNI fifo size must be power of 2\n");
+	if (size & (size - 1)) {
+		RTE_LOG(CRIT, EAL, "%s(): KNI fifo size must be power of 2\n",
+				__func__);
+		return -1;
+	}
 
 	fifo->write = 0;
 	fifo->read = 0;
 	fifo->len = size;
 	fifo->elem_size = sizeof(void *);
+
+	return 0;
 }
 
 /**
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH 04/13] ixgbe: replace rte_panic instances in ixgbe driver
  2018-04-04 11:27  3% [dpdk-dev] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 03/13] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
@ 2018-04-04 11:27  3% ` Arnon Warshavsky
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 06/13] kni: replace rte_panic instances in kni Arnon Warshavsky
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-04 11:27 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Replace panic calls with a log message and a return value.
The modified function is local to this file,
so changing it from void to int does not break the ABI.

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/ixgbe/ixgbe_ethdev.c |  3 ++-
 drivers/net/ixgbe/ixgbe_ethdev.h |  2 +-
 drivers/net/ixgbe/ixgbe_pf.c     | 13 +++++++++----
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 4df5c75..96188dc 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -1224,7 +1224,8 @@ struct rte_ixgbe_xstats_name_off {
 	memset(hwstrip, 0, sizeof(*hwstrip));
 
 	/* initialize PF if max_vfs not zero */
-	ixgbe_pf_host_init(eth_dev);
+	if (ixgbe_pf_host_init(eth_dev) != 0)
+		return -1;
 
 	ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT);
 	/* let hardware know driver is loaded */
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h b/drivers/net/ixgbe/ixgbe_ethdev.h
index c56d652..82d7fd2 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.h
+++ b/drivers/net/ixgbe/ixgbe_ethdev.h
@@ -663,7 +663,7 @@ int ixgbe_fdir_filter_program(struct rte_eth_dev *dev,
 
 void ixgbe_vlan_hw_strip_disable_all(struct rte_eth_dev *dev);
 
-void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
+int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev);
 
 void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev);
 
diff --git a/drivers/net/ixgbe/ixgbe_pf.c b/drivers/net/ixgbe/ixgbe_pf.c
index ea99737..5c25de0 100644
--- a/drivers/net/ixgbe/ixgbe_pf.c
+++ b/drivers/net/ixgbe/ixgbe_pf.c
@@ -66,7 +66,7 @@ int ixgbe_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
 	return 0;
 }
 
-void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
+int ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 {
 	struct ixgbe_vf_info **vfinfo =
 		IXGBE_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
@@ -84,11 +84,14 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
 	vf_num = dev_num_vf(eth_dev);
 	if (vf_num == 0)
-		return;
+		return 0;
 
 	*vfinfo = rte_zmalloc("vf_info", sizeof(struct ixgbe_vf_info) * vf_num, 0);
-	if (*vfinfo == NULL)
-		rte_panic("Cannot allocate memory for private VF data\n");
+	if (*vfinfo == NULL) {
+		RTE_LOG(ERR, PMD, "%s() Cannot allocate memory for private VF data\n",
+				__func__);
+		return -1;
+	}
 
 	memset(mirror_info, 0, sizeof(struct ixgbe_mirror_info));
 	memset(uta_info, 0, sizeof(struct ixgbe_uta_info));
@@ -116,6 +119,8 @@ void ixgbe_pf_host_init(struct rte_eth_dev *eth_dev)
 
 	/* set mb interrupt mask */
 	ixgbe_mb_intr_setup(eth_dev);
+
+	return 0;
 }
 
 void ixgbe_pf_host_uninit(struct rte_eth_dev *eth_dev)
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH 03/13] e1000: replace rte_panic instances in e1000 driver
  2018-04-04 11:27  3% [dpdk-dev] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
@ 2018-04-04 11:27  3% ` Arnon Warshavsky
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 04/13] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-04 11:27 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Replace panic calls with a log message and a return value.
The modified function is local to this file,
so changing it from void to int does not break the ABI.

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/e1000/e1000_ethdev.h |  2 +-
 drivers/net/e1000/igb_ethdev.c   |  3 ++-
 drivers/net/e1000/igb_pf.c       | 15 +++++++++------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 23b089c..a66ff42 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -405,7 +405,7 @@ int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
 /*
  * misc function prototypes
  */
-void igb_pf_host_init(struct rte_eth_dev *eth_dev);
+int igb_pf_host_init(struct rte_eth_dev *eth_dev);
 
 void igb_pf_mbx_process(struct rte_eth_dev *eth_dev);
 
diff --git a/drivers/net/e1000/igb_ethdev.c b/drivers/net/e1000/igb_ethdev.c
index d7eef9a..994bb5a 100644
--- a/drivers/net/e1000/igb_ethdev.c
+++ b/drivers/net/e1000/igb_ethdev.c
@@ -833,7 +833,8 @@ static int igb_flex_filter_uninit(struct rte_eth_dev *eth_dev)
 	}
 
 	/* initialize PF if max_vfs not zero */
-	igb_pf_host_init(eth_dev);
+	if (igb_pf_host_init(eth_dev) != 0)
+		goto err_late;
 
 	ctrl_ext = E1000_READ_REG(hw, E1000_CTRL_EXT);
 	/* Set PF Reset Done bit so PF/VF Mail Ops can work */
diff --git a/drivers/net/e1000/igb_pf.c b/drivers/net/e1000/igb_pf.c
index b9f2e53..dfa63c9 100644
--- a/drivers/net/e1000/igb_pf.c
+++ b/drivers/net/e1000/igb_pf.c
@@ -63,7 +63,7 @@ int igb_vf_perm_addr_gen(struct rte_eth_dev *dev, uint16_t vf_num)
 	return 0;
 }
 
-void igb_pf_host_init(struct rte_eth_dev *eth_dev)
+int igb_pf_host_init(struct rte_eth_dev *eth_dev)
 {
 	struct e1000_vf_info **vfinfo =
 		E1000_DEV_PRIVATE_TO_P_VFDATA(eth_dev->data->dev_private);
@@ -74,7 +74,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 
 	RTE_ETH_DEV_SRIOV(eth_dev).active = 0;
 	if (0 == (vf_num = dev_num_vf(eth_dev)))
-		return;
+		return 0;
 
 	if (hw->mac.type == e1000_i350)
 		nb_queue = 1;
@@ -82,11 +82,14 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 		/* per datasheet, it should be 2, but 1 seems correct */
 		nb_queue = 1;
 	else
-		return;
+		return 0;
 
 	*vfinfo = rte_zmalloc("vf_info", sizeof(struct e1000_vf_info) * vf_num, 0);
-	if (*vfinfo == NULL)
-		rte_panic("Cannot allocate memory for private VF data\n");
+	if (*vfinfo == NULL) {
+		RTE_LOG(CRIT, PMD, "%s(): Cannot allocate memory for private "
+				"VF data\n", __func__);
+		return -1;
+	}
 
 	RTE_ETH_DEV_SRIOV(eth_dev).active = ETH_8_POOLS;
 	RTE_ETH_DEV_SRIOV(eth_dev).nb_q_per_pool = nb_queue;
@@ -98,7 +101,7 @@ void igb_pf_host_init(struct rte_eth_dev *eth_dev)
 	/* set mb interrupt mask */
 	igb_mb_intr_setup(eth_dev);
 
-	return;
+	return 0;
 }
 
 void igb_pf_host_uninit(struct rte_eth_dev *dev)
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH 02/13] bond: replace rte_panic instances in bonding driver
  2018-04-04 11:27  3% [dpdk-dev] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
@ 2018-04-04 11:27  3% ` Arnon Warshavsky
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 03/13] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-04 11:27 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon

Replace panic calls with a log message and a return value.
The modified functions are local to this file,
so changing them from void to int does not break the ABI.

Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
---
 drivers/net/bonding/rte_eth_bond_8023ad.c         | 30 +++++++++++++++--------
 drivers/net/bonding/rte_eth_bond_8023ad_private.h |  2 +-
 drivers/net/bonding/rte_eth_bond_api.c            | 20 ++++++++++-----
 drivers/net/bonding/rte_eth_bond_pmd.c            | 10 +++++---
 drivers/net/bonding/rte_eth_bond_private.h        |  2 +-
 5 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c b/drivers/net/bonding/rte_eth_bond_8023ad.c
index c452318..310118c 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad.c
+++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
@@ -893,7 +893,7 @@
 			bond_mode_8023ad_periodic_cb, arg);
 }
 
-void
+int
 bond_mode_8023ad_activate_slave(struct rte_eth_dev *bond_dev,
 				uint16_t slave_id)
 {
@@ -939,7 +939,7 @@
 	timer_cancel(&port->warning_timer);
 
 	if (port->mbuf_pool != NULL)
-		return;
+		return 0;
 
 	RTE_ASSERT(port->rx_ring == NULL);
 	RTE_ASSERT(port->tx_ring == NULL);
@@ -968,8 +968,10 @@
 	/* Any memory allocation failure in initialization is critical because
 	 * resources can't be free, so reinitialization is impossible. */
 	if (port->mbuf_pool == NULL) {
-		rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
-			slave_id, mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create memory"
+				" pool '%s': %s\n", __func__,
+				slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
 
 	snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_rx", slave_id);
@@ -977,8 +979,9 @@
 			rte_align32pow2(BOND_MODE_8023AX_SLAVE_RX_PKTS), socket_id, 0);
 
 	if (port->rx_ring == NULL) {
-		rte_panic("Slave %u: Failed to create rx ring '%s': %s\n", slave_id,
-			mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create rx ring '%s': %s\n",
+			__func__, slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
 
 	/* TX ring is at least one pkt longer to make room for marker packet. */
@@ -987,9 +990,13 @@
 			rte_align32pow2(BOND_MODE_8023AX_SLAVE_TX_PKTS + 1), socket_id, 0);
 
 	if (port->tx_ring == NULL) {
-		rte_panic("Slave %u: Failed to create tx ring '%s': %s\n", slave_id,
-			mem_name, rte_strerror(rte_errno));
+		RTE_LOG(ERR, PMD, "%s() Slave %u: Fail to create tx ring "
+				"'%s': %s\n", __func__,
+				slave_id, mem_name, rte_strerror(rte_errno));
+		return -1;
 	}
+
+	return 0;
 }
 
 int
@@ -1143,9 +1150,12 @@
 	struct bond_dev_private *internals = bond_dev->data->dev_private;
 	uint8_t i;
 
-	for (i = 0; i < internals->active_slave_count; i++)
-		bond_mode_8023ad_activate_slave(bond_dev,
+	for (i = 0; i < internals->active_slave_count; i++) {
+		int rc = bond_mode_8023ad_activate_slave(bond_dev,
 				internals->active_slaves[i]);
+		if (rc != 0)
+			return rc;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/bonding/rte_eth_bond_8023ad_private.h b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
index 0f490a5..96a42f2 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad_private.h
+++ b/drivers/net/bonding/rte_eth_bond_8023ad_private.h
@@ -263,7 +263,7 @@ struct mode8023ad_private {
  * @return
  *  0 on success, negative value otherwise.
  */
-void
+int
 bond_mode_8023ad_activate_slave(struct rte_eth_dev *dev, uint16_t port_id);
 
 /**
diff --git a/drivers/net/bonding/rte_eth_bond_api.c b/drivers/net/bonding/rte_eth_bond_api.c
index f854b73..6bc5887 100644
--- a/drivers/net/bonding/rte_eth_bond_api.c
+++ b/drivers/net/bonding/rte_eth_bond_api.c
@@ -69,14 +69,15 @@
 	return 0;
 }
 
-void
+int
 activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id)
 {
 	struct bond_dev_private *internals = eth_dev->data->dev_private;
 	uint8_t active_count = internals->active_slave_count;
 
 	if (internals->mode == BONDING_MODE_8023AD)
-		bond_mode_8023ad_activate_slave(eth_dev, port_id);
+		if (bond_mode_8023ad_activate_slave(eth_dev, port_id) != 0)
+			return -1;
 
 	if (internals->mode == BONDING_MODE_TLB
 			|| internals->mode == BONDING_MODE_ALB) {
@@ -349,10 +350,17 @@
 				bond_ethdev_primary_set(internals,
 							slave_port_id);
 
-			if (find_slave_by_id(internals->active_slaves,
-					     internals->active_slave_count,
-					     slave_port_id) == internals->active_slave_count)
-				activate_slave(bonded_eth_dev, slave_port_id);
+			int rc =
+				find_slave_by_id(internals->active_slaves,
+					internals->active_slave_count,
+					slave_port_id);
+
+			if (rc == internals->active_slave_count) {
+				int rc = activate_slave(bonded_eth_dev,
+							slave_port_id);
+				if (rc != 0)
+					return -1;
+			}
 		}
 	}
 
diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c b/drivers/net/bonding/rte_eth_bond_pmd.c
index b59ba9f..96f8b1a 100644
--- a/drivers/net/bonding/rte_eth_bond_pmd.c
+++ b/drivers/net/bonding/rte_eth_bond_pmd.c
@@ -1740,8 +1740,11 @@ struct bwg_slave {
 		/* Any memory allocation failure in initialization is critical because
 		 * resources can't be free, so reinitialization is impossible. */
 		if (port->slow_pool == NULL) {
-			rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
-				slave_id, mem_name, rte_strerror(rte_errno));
+			RTE_LOG(ERR, PMD, "%s() Slave %u: Failed to create"
+					" memory pool '%s': %s\n",
+					__func__, slave_id, mem_name,
+					rte_strerror(rte_errno));
+			return -1;
 		}
 	}
 
@@ -2660,7 +2663,8 @@ struct bwg_slave {
 			mac_address_slaves_update(bonded_eth_dev);
 		}
 
-		activate_slave(bonded_eth_dev, port_id);
+		if (activate_slave(bonded_eth_dev, port_id) != 0)
+			return -1;
 
 		/* If user has defined the primary port then default to using it */
 		if (internals->user_defined_primary_port &&
diff --git a/drivers/net/bonding/rte_eth_bond_private.h b/drivers/net/bonding/rte_eth_bond_private.h
index 92e15f8..65453aa 100644
--- a/drivers/net/bonding/rte_eth_bond_private.h
+++ b/drivers/net/bonding/rte_eth_bond_private.h
@@ -185,7 +185,7 @@ struct bond_dev_private {
 void
 deactivate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
 
-void
+int
 activate_slave(struct rte_eth_dev *eth_dev, uint16_t port_id);
 
 void
-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] eal: replace calls to rte_panic and refrain from new instances
@ 2018-04-04 11:27  3% Arnon Warshavsky
  2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
                   ` (5 more replies)
  0 siblings, 6 replies; 200+ results
From: Arnon Warshavsky @ 2018-04-04 11:27 UTC (permalink / raw)
  To: thomas, anatoly.burakov, wenzhuo.lu, declan.doherty, jerin.jacob,
	bruce.richardson, ferruh.yigit
  Cc: dev, arnon



The purpose of this patch series is to clean up the library code
by removing paths that end up aborting the process,
and to move to checking error values, in order to allow the running
process to perform an orderly teardown or other mitigation of the event.

This patch series modifies the majority of rte_panic calls
under lib and drivers, replacing them with a log message
and an error return code appropriate to the context,
which can be propagated up the call stack.

- Focus was given to the DPDK initialization path
- Some of the panic calls within drivers were left in place where
  the call is made from within an interrupt handler or on the data
  path, where there is no simple application-level route to propagate
  the error towards termination.
  These should be handled by the driver maintainers.
- In order to avoid breaking the ABI where panic was called from public
  void functions, a panic state variable was introduced so that
  it can be queried after calling these void functions (see the
  sketch after this list).
  This took place for a single function call.
- Local void functions that are not part of the public API were changed
  to return a value where needed
- No change took place in example and test files
- No change took place for debug assertions calling panic
- A new function was added to devtools/checkpatches.sh
  in order to prevent new additions of calls to rte_panic
  under lib and drivers.
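
To make the conversion concrete, below is a minimal sketch of the pattern
applied throughout the series. It is an illustration only and is not taken
verbatim from any single patch: foo_init() and foo_resource_alloc() are
made-up names (the two versions of foo_init() are before/after variants of
the same function), while rte_panic(), RTE_LOG(), eal_thread_init_master()
and rte_get_panic_state() are the real calls involved.

/* Before: a failure aborts the whole process. */
static void
foo_init(void)
{
        if (foo_resource_alloc() < 0)
                rte_panic("cannot allocate foo resource\n");
}

/* After: the failure is logged and propagated to the caller. */
static int
foo_init(void)
{
        if (foo_resource_alloc() < 0) {
                RTE_LOG(CRIT, EAL, "%s(): cannot allocate foo resource\n",
                                __func__);
                return -1;
        }
        return 0;
}

/* Public void functions that cannot change their signature set the new
 * panic state instead, and the caller queries it afterwards, e.g. in
 * rte_eal_init():
 */
eal_thread_init_master(rte_config.master_lcore);
if (rte_get_panic_state())
        return -1;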

Keep calm and don't panic.


Arnon Warshavsky (13):
  crypto: replace rte_panic instances in crypto driver
  bond: replace rte_panic instances in bonding driver
  e1000: replace rte_panic instances in e1000 driver
  ixgbe: replace rte_panic instances in ixgbe driver
  eal: replace rte_panic instances in eventdev
  kni: replace rte_panic instances in kni
  e1000: replace rte_panic instances in e1000 driver
  eal: replace rte_panic instances in hugepage_info
  eal: replace rte_panic instances in common_memzone
  eal: replace rte_panic instances in interrupts thread
  eal: replace rte_panic instances in ethdev
  eal: replace rte_panic instances in init sequence
  devtools: prevent new instances of rte_panic and rte_exit

 devtools/checkpatches.sh                          |  94 ++++++++++++++++-
 drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c       |   8 +-
 drivers/crypto/dpaa_sec/dpaa_sec.c                |   8 +-
 drivers/net/bonding/rte_eth_bond_8023ad.c         |  30 ++++--
 drivers/net/bonding/rte_eth_bond_8023ad_private.h |   2 +-
 drivers/net/bonding/rte_eth_bond_api.c            |  20 ++--
 drivers/net/bonding/rte_eth_bond_pmd.c            |  10 +-
 drivers/net/bonding/rte_eth_bond_private.h        |   2 +-
 drivers/net/e1000/e1000_ethdev.h                  |   2 +-
 drivers/net/e1000/igb_ethdev.c                    |   3 +-
 drivers/net/e1000/igb_pf.c                        |  15 +--
 drivers/net/ixgbe/ixgbe_ethdev.c                  |   3 +-
 drivers/net/ixgbe/ixgbe_ethdev.h                  |   2 +-
 drivers/net/ixgbe/ixgbe_pf.c                      |  13 ++-
 lib/librte_eal/bsdapp/eal/eal.c                   |  87 +++++++++++-----
 lib/librte_eal/bsdapp/eal/eal_thread.c            |  65 +++++++++---
 lib/librte_eal/common/eal_common_launch.c         |  21 ++++
 lib/librte_eal/common/eal_common_memzone.c        |   5 +-
 lib/librte_eal/common/include/rte_debug.h         |  12 +++
 lib/librte_eal/common/rte_malloc.c                |   7 +-
 lib/librte_eal/linuxapp/eal/eal.c                 | 121 +++++++++++++++-------
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c   |  21 ++--
 lib/librte_eal/linuxapp/eal/eal_interrupts.c      |  27 +++--
 lib/librte_eal/linuxapp/eal/eal_thread.c          |  65 +++++++++---
 lib/librte_ether/rte_ethdev.c                     |  36 +++++--
 lib/librte_eventdev/rte_eventdev_pmd_pci.h        |   8 +-
 lib/librte_eventdev/rte_eventdev_pmd_vdev.h       |   8 +-
 lib/librte_kni/rte_kni.c                          |  18 ++--
 lib/librte_kni/rte_kni_fifo.h                     |  11 +-
 29 files changed, 540 insertions(+), 184 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v3 24/68] mempool: add support for the new allocation methods
  @ 2018-04-03 23:21  3%   ` Anatoly Burakov
  0 siblings, 0 replies; 200+ results
From: Anatoly Burakov @ 2018-04-03 23:21 UTC (permalink / raw)
  To: dev
  Cc: Olivier Matz, keith.wiles, jianfeng.tan, andras.kovacs,
	laszlo.vadkeri, benjamin.walker, bruce.richardson, thomas,
	konstantin.ananyev, kuralamudhan.ramakrishnan, louise.m.daly,
	nelio.laranjeiro, yskoh, pepperjo, jerin.jacob, hemant.agrawal,
	shreyansh.jain, gowrishankar.m

If a user has specified that the zone should have contiguous memory,
use the new _contig allocation APIs instead of the normal ones.
Otherwise, account for the fact that unless we're in IOVA_AS_VA
mode, we cannot guarantee that the pages would be physically
contiguous, so we calculate the memzone size and alignments as if
we were getting the smallest page size available.

The existing mempool size calculation function also doesn't give us
the expected results, because it returns memzone sizes aligned to
the page size (e.g. a 1MB mempool will reserve an entire 1GB page if
all the user has are 1GB pages), so add a new one that gives us
results more in line with what we would expect.
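
As a rough illustration of the waste the old calculation could cause (an
aside, not code from this patch; the numbers are made up and only
RTE_ALIGN_CEIL is a real DPDK macro): sizing a mempool that needs roughly
1MB of object storage against a 1GB page rounds the whole reservation up
to a gigabyte, while sizing it against the smallest available page keeps
the overhead down to at most one extra small page.

size_t need = 1 * 1024 * 1024;                      /* ~1MB of objects */
size_t rsv_1g = RTE_ALIGN_CEIL(need, 1ULL << 30);   /* 1GB reserved */
size_t rsv_2m = RTE_ALIGN_CEIL(need, 1ULL << 21);   /* 2MB reserved */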

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v3:
    - Fixed mempool size calculation
    - Fixed handling of contiguous memzones
    - Moved earlier in the patchset

 lib/librte_mempool/Makefile      |   3 +
 lib/librte_mempool/meson.build   |   3 +
 lib/librte_mempool/rte_mempool.c | 137 ++++++++++++++++++++++++++++++++-------
 3 files changed, 121 insertions(+), 22 deletions(-)

diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 24e735a..cfc69b4 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -13,6 +13,9 @@ EXPORT_MAP := rte_mempool_version.map
 
 LIBABIVER := 3
 
+# uses new contiguous memzone allocation that isn't yet in stable ABI
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) +=  rte_mempool.c
 SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) +=  rte_mempool_ops.c
diff --git a/lib/librte_mempool/meson.build b/lib/librte_mempool/meson.build
index 712720f..5916a0f 100644
--- a/lib/librte_mempool/meson.build
+++ b/lib/librte_mempool/meson.build
@@ -5,3 +5,6 @@ version = 3
 sources = files('rte_mempool.c', 'rte_mempool_ops.c')
 headers = files('rte_mempool.h')
 deps += ['ring']
+
+# contig memzone allocation is not yet part of stable API
+allow_experimental_apis = true
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 54f7f4b..e147180 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -3,6 +3,7 @@
  * Copyright(c) 2016 6WIND S.A.
  */
 
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include <stdint.h>
@@ -98,6 +99,27 @@ static unsigned optimize_object_size(unsigned obj_size)
 	return new_obj_size * RTE_MEMPOOL_ALIGN;
 }
 
+static size_t
+get_min_page_size(void)
+{
+	const struct rte_mem_config *mcfg =
+			rte_eal_get_configuration()->mem_config;
+	int i;
+	size_t min_pagesz = SIZE_MAX;
+
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		const struct rte_memseg *ms = &mcfg->memseg[i];
+
+		if (ms->addr == NULL)
+			continue;
+
+		if (ms->hugepage_sz < min_pagesz)
+			min_pagesz = ms->hugepage_sz;
+	}
+
+	return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
+}
+
 static void
 mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
 {
@@ -204,7 +226,6 @@ rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 	return sz->total_size;
 }
 
-
 /*
  * Calculate maximum amount of memory required to store given number of objects.
  */
@@ -367,16 +388,6 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	/* update mempool capabilities */
 	mp->flags |= mp_capa_flags;
 
-	/* Detect pool area has sufficient space for elements */
-	if (mp_capa_flags & MEMPOOL_F_CAPA_PHYS_CONTIG) {
-		if (len < total_elt_sz * mp->size) {
-			RTE_LOG(ERR, MEMPOOL,
-				"pool area %" PRIx64 " not enough\n",
-				(uint64_t)len);
-			return -ENOSPC;
-		}
-	}
-
 	memhdr = rte_zmalloc("MEMPOOL_MEMHDR", sizeof(*memhdr), 0);
 	if (memhdr == NULL)
 		return -ENOMEM;
@@ -549,6 +560,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	unsigned mz_id, n;
 	unsigned int mp_flags;
 	int ret;
+	bool force_contig, no_contig, try_contig, no_pageshift;
 
 	/* mempool must not be populated */
 	if (mp->nb_mem_chunks != 0)
@@ -563,9 +575,62 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	/* update mempool capabilities */
 	mp->flags |= mp_flags;
 
-	if (rte_eal_has_hugepages()) {
-		pg_shift = 0; /* not needed, zone is physically contiguous */
+	no_contig = mp->flags & MEMPOOL_F_NO_PHYS_CONTIG;
+	force_contig = mp->flags & MEMPOOL_F_CAPA_PHYS_CONTIG;
+
+	/*
+	 * the following section calculates page shift and page size values.
+	 *
+	 * these values impact the result of rte_mempool_xmem_size(), which
+	 * returns the amount of memory that should be allocated to store the
+	 * desired number of objects. when not zero, it allocates more memory
+	 * for the padding between objects, to ensure that an object does not
+	 * cross a page boundary. in other words, page size/shift are to be set
+	 * to zero if mempool elements won't care about page boundaries.
+	 * there are several considerations for page size and page shift here.
+	 *
+	 * if we don't need our mempools to have physically contiguous objects,
+	 * then just set page shift and page size to 0, because the user has
+	 * indicated that there's no need to care about anything.
+	 *
+	 * if we do need contiguous objects, there is also an option to reserve
+	 * the entire mempool memory as one contiguous block of memory, in
+	 * which case the page shift and alignment wouldn't matter as well.
+	 *
+	 * if we require contiguous objects, but not necessarily the entire
+	 * mempool reserved space to be contiguous, then there are two options.
+	 *
+	 * if our IO addresses are virtual, not actual physical (IOVA as VA
+	 * case), then no page shift needed - our memory allocation will give us
+	 * contiguous physical memory as far as the hardware is concerned, so
+	 * act as if we're getting contiguous memory.
+	 *
+	 * if our IO addresses are physical, we may get memory from bigger
+	 * pages, or we might get memory from smaller pages, and how much of it
+	 * we require depends on whether we want bigger or smaller pages.
+	 * However, requesting each and every memory size is too much work, so
+	 * what we'll do instead is walk through the page sizes available, pick
+	 * the smallest one and set up page shift to match that one. We will be
+	 * wasting some space this way, but it's much nicer than looping around
+	 * trying to reserve each and every page size.
+	 *
+	 * However, since size calculation will produce page-aligned sizes, it
+	 * makes sense to first try and see if we can reserve the entire memzone
+	 * in one contiguous chunk as well (otherwise we might end up wasting a
+	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
+	 * memory, then we'll go and reserve space page-by-page.
+	 */
+	no_pageshift = no_contig || force_contig ||
+			rte_eal_iova_mode() == RTE_IOVA_VA;
+	try_contig = !no_contig && !no_pageshift && rte_eal_has_hugepages();
+
+	if (no_pageshift) {
 		pg_sz = 0;
+		pg_shift = 0;
+		align = RTE_CACHE_LINE_SIZE;
+	} else if (try_contig) {
+		pg_sz = get_min_page_size();
+		pg_shift = rte_bsf32(pg_sz);
 		align = RTE_CACHE_LINE_SIZE;
 	} else {
 		pg_sz = getpagesize();
@@ -575,8 +640,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
-		size = rte_mempool_xmem_size(n, total_elt_sz, pg_shift,
-						mp->flags);
+		if (try_contig || no_pageshift)
+			size = rte_mempool_xmem_size(n, total_elt_sz, 0,
+				mp->flags);
+		else
+			size = rte_mempool_xmem_size(n, total_elt_sz, pg_shift,
+				mp->flags);
 
 		ret = snprintf(mz_name, sizeof(mz_name),
 			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
@@ -585,23 +654,47 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			goto fail;
 		}
 
-		mz = rte_memzone_reserve_aligned(mz_name, size,
-			mp->socket_id, mz_flags, align);
-		/* not enough memory, retry with the biggest zone we have */
-		if (mz == NULL)
-			mz = rte_memzone_reserve_aligned(mz_name, 0,
+		mz = NULL;
+		if (force_contig || try_contig) {
+			/* if contiguous memory for entire mempool memory was
+			 * requested, don't try reserving again if we fail...
+			 */
+			mz = rte_memzone_reserve_aligned_contig(mz_name, size,
+				mp->socket_id, mz_flags, align);
+
+			/* ...unless we are doing best effort allocation, in
+			 * which case recalculate size and try again */
+			if (try_contig && mz == NULL) {
+				try_contig = false;
+				align = pg_sz;
+				size = rte_mempool_xmem_size(n, total_elt_sz,
+					pg_shift, mp->flags);
+			}
+		}
+		/* only try this if we're not trying to reserve contiguous
+		 * memory.
+		 */
+		if (!force_contig && mz == NULL) {
+			mz = rte_memzone_reserve_aligned(mz_name, size,
 				mp->socket_id, mz_flags, align);
+			/* not enough memory, retry with the biggest zone we
+			 * have
+			 */
+			if (mz == NULL)
+				mz = rte_memzone_reserve_aligned(mz_name, 0,
+					mp->socket_id, mz_flags, align);
+		}
 		if (mz == NULL) {
 			ret = -rte_errno;
 			goto fail;
 		}
 
-		if (mp->flags & MEMPOOL_F_NO_PHYS_CONTIG)
+		if (no_contig)
 			iova = RTE_BAD_IOVA;
 		else
 			iova = mz->iova;
 
-		if (rte_eal_has_hugepages())
+		if (no_pageshift || try_contig)
 			ret = rte_mempool_populate_iova(mp, mz->addr,
 				iova, mz->len,
 				rte_mempool_memchunk_mz_free,
-- 
2.7.4

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v10 3/9] eventtimer: add common code
  @ 2018-04-03 21:44  3%     ` Erik Gabriel Carrillo
    1 sibling, 0 replies; 200+ results
From: Erik Gabriel Carrillo @ 2018-04-03 21:44 UTC (permalink / raw)
  To: pbhagavatula, jerin.jacob; +Cc: dev, hemant.agrawal

This commit adds the logic that is shared by all event timer adapter
drivers; the common code handles instance allocation and some
initialization.
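
For context, a minimal usage sketch of the common API added below. This is
an illustration only and not part of the patch: setup_timer_adapter() is a
made-up wrapper, and the conf fields shown are just the ones this file
reads (the real structure has more fields than the ones shown here).

/* dev_id is assumed to be an already configured event device. */
static int
setup_timer_adapter(uint8_t dev_id)
{
        struct rte_event_timer_adapter_conf conf = {
                .event_dev_id = dev_id,
                .timer_adapter_id = 0,
                .socket_id = rte_socket_id(),
                /* remaining fields omitted in this sketch */
        };
        struct rte_event_timer_adapter *adapter;

        adapter = rte_event_timer_adapter_create(&conf);
        if (adapter == NULL)
                return -rte_errno;

        if (rte_event_timer_adapter_start(adapter) < 0) {
                rte_event_timer_adapter_free(adapter);
                return -1;
        }

        /* event timers can now be armed against this adapter */
        return 0;
}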

Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
---
 config/common_base                                |   1 +
 drivers/event/sw/sw_evdev.c                       |  18 +
 lib/librte_eventdev/Makefile                      |   2 +
 lib/librte_eventdev/rte_event_timer_adapter.c     | 387 ++++++++++++++++++++++
 lib/librte_eventdev/rte_event_timer_adapter_pmd.h | 114 +++++++
 lib/librte_eventdev/rte_eventdev.c                |  22 ++
 lib/librte_eventdev/rte_eventdev.h                |  20 ++
 lib/librte_eventdev/rte_eventdev_pmd.h            |  35 ++
 lib/librte_eventdev/rte_eventdev_version.map      |  20 +-
 9 files changed, 618 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_eventdev/rte_event_timer_adapter.c
 create mode 100644 lib/librte_eventdev/rte_event_timer_adapter_pmd.h

diff --git a/config/common_base b/config/common_base
index 7abf7c6..9354c66 100644
--- a/config/common_base
+++ b/config/common_base
@@ -550,6 +550,7 @@ CONFIG_RTE_LIBRTE_EVENTDEV=y
 CONFIG_RTE_LIBRTE_EVENTDEV_DEBUG=n
 CONFIG_RTE_EVENT_MAX_DEVS=16
 CONFIG_RTE_EVENT_MAX_QUEUES_PER_DEV=64
+CONFIG_RTE_EVENT_TIMER_ADAPTER_NUM_MAX=32
 
 #
 # Compile PMD for skeleton event device
diff --git a/drivers/event/sw/sw_evdev.c b/drivers/event/sw/sw_evdev.c
index 0e89f11..dcb6551 100644
--- a/drivers/event/sw/sw_evdev.c
+++ b/drivers/event/sw/sw_evdev.c
@@ -464,6 +464,22 @@ sw_eth_rx_adapter_caps_get(const struct rte_eventdev *dev,
 	return 0;
 }
 
+static int
+sw_timer_adapter_caps_get(const struct rte_eventdev *dev,
+			  uint64_t flags,
+			  uint32_t *caps,
+			  const struct rte_event_timer_adapter_ops **ops)
+{
+	RTE_SET_USED(dev);
+	RTE_SET_USED(flags);
+	*caps = 0;
+
+	/* Use default SW ops */
+	*ops = NULL;
+
+	return 0;
+}
+
 static void
 sw_info_get(struct rte_eventdev *dev, struct rte_event_dev_info *info)
 {
@@ -791,6 +807,8 @@ sw_probe(struct rte_vdev_device *vdev)
 
 			.eth_rx_adapter_caps_get = sw_eth_rx_adapter_caps_get,
 
+			.timer_adapter_caps_get = sw_timer_adapter_caps_get,
+
 			.xstats_get = sw_xstats_get,
 			.xstats_get_names = sw_xstats_get_names,
 			.xstats_get_by_name = sw_xstats_get_by_name,
diff --git a/lib/librte_eventdev/Makefile b/lib/librte_eventdev/Makefile
index 549b182..8b16e3f 100644
--- a/lib/librte_eventdev/Makefile
+++ b/lib/librte_eventdev/Makefile
@@ -20,6 +20,7 @@ LDLIBS += -lrte_eal -lrte_ring -lrte_ethdev -lrte_hash
 SRCS-y += rte_eventdev.c
 SRCS-y += rte_event_ring.c
 SRCS-y += rte_event_eth_rx_adapter.c
+SRCS-y += rte_event_timer_adapter.c
 
 # export include files
 SYMLINK-y-include += rte_eventdev.h
@@ -29,6 +30,7 @@ SYMLINK-y-include += rte_eventdev_pmd_vdev.h
 SYMLINK-y-include += rte_event_ring.h
 SYMLINK-y-include += rte_event_eth_rx_adapter.h
 SYMLINK-y-include += rte_event_timer_adapter.h
+SYMLINK-y-include += rte_event_timer_adapter_pmd.h
 
 # versioning export map
 EXPORT_MAP := rte_eventdev_version.map
diff --git a/lib/librte_eventdev/rte_event_timer_adapter.c b/lib/librte_eventdev/rte_event_timer_adapter.c
new file mode 100644
index 0000000..75a14ac
--- /dev/null
+++ b/lib/librte_eventdev/rte_event_timer_adapter.c
@@ -0,0 +1,387 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation.
+ * All rights reserved.
+ */
+
+#include <string.h>
+#include <inttypes.h>
+
+#include <rte_memzone.h>
+#include <rte_memory.h>
+#include <rte_dev.h>
+#include <rte_errno.h>
+
+#include "rte_eventdev.h"
+#include "rte_eventdev_pmd.h"
+#include "rte_event_timer_adapter.h"
+#include "rte_event_timer_adapter_pmd.h"
+
+#define DATA_MZ_NAME_MAX_LEN 64
+#define DATA_MZ_NAME_FORMAT "rte_event_timer_adapter_data_%d"
+
+static int evtim_logtype;
+
+static struct rte_event_timer_adapter adapters[RTE_EVENT_TIMER_ADAPTER_NUM_MAX];
+
+#define EVTIM_LOG(level, logtype, ...) \
+	rte_log(RTE_LOG_ ## level, logtype, \
+		RTE_FMT("EVTIMER: %s() line %u: " RTE_FMT_HEAD(__VA_ARGS__,) \
+			"\n", __func__, __LINE__, RTE_FMT_TAIL(__VA_ARGS__,)))
+
+#define EVTIM_LOG_ERR(...) EVTIM_LOG(ERR, evtim_logtype, __VA_ARGS__)
+
+#ifdef RTE_LIBRTE_EVENTDEV_DEBUG
+#define EVTIM_LOG_DBG(...) \
+	EVTIM_LOG(DEBUG, evtim_logtype, __VA_ARGS__)
+#else
+#define EVTIM_LOG_DBG(...) (void)0
+#endif
+
+static int
+default_port_conf_cb(uint16_t id, uint8_t event_dev_id, uint8_t *event_port_id,
+		     void *conf_arg)
+{
+	struct rte_event_timer_adapter *adapter;
+	struct rte_eventdev *dev;
+	struct rte_event_dev_config dev_conf;
+	struct rte_event_port_conf *port_conf, def_port_conf = {0};
+	int started;
+	uint8_t port_id;
+	uint8_t dev_id;
+	int ret;
+
+	RTE_SET_USED(event_dev_id);
+
+	adapter = &adapters[id];
+	dev = &rte_eventdevs[adapter->data->event_dev_id];
+	dev_id = dev->data->dev_id;
+	dev_conf = dev->data->dev_conf;
+
+	started = dev->data->dev_started;
+	if (started)
+		rte_event_dev_stop(dev_id);
+
+	port_id = dev_conf.nb_event_ports;
+	dev_conf.nb_event_ports += 1;
+	ret = rte_event_dev_configure(dev_id, &dev_conf);
+	if (ret < 0) {
+		EVTIM_LOG_ERR("failed to configure event dev %u\n", dev_id);
+		if (started)
+			if (rte_event_dev_start(dev_id))
+				return -EIO;
+
+		return ret;
+	}
+
+	if (conf_arg != NULL)
+		port_conf = conf_arg;
+	else {
+		port_conf = &def_port_conf;
+		ret = rte_event_port_default_conf_get(dev_id, port_id,
+						      port_conf);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = rte_event_port_setup(dev_id, port_id, port_conf);
+	if (ret < 0) {
+		EVTIM_LOG_ERR("failed to setup event port %u on event dev %u\n",
+			      port_id, dev_id);
+		return ret;
+	}
+
+	*event_port_id = port_id;
+
+	if (started)
+		ret = rte_event_dev_start(dev_id);
+
+	return ret;
+}
+
+struct rte_event_timer_adapter * __rte_experimental
+rte_event_timer_adapter_create(const struct rte_event_timer_adapter_conf *conf)
+{
+	return rte_event_timer_adapter_create_ext(conf, default_port_conf_cb,
+						  NULL);
+}
+
+struct rte_event_timer_adapter * __rte_experimental
+rte_event_timer_adapter_create_ext(
+		const struct rte_event_timer_adapter_conf *conf,
+		rte_event_timer_adapter_port_conf_cb_t conf_cb,
+		void *conf_arg)
+{
+	uint16_t adapter_id;
+	struct rte_event_timer_adapter *adapter;
+	const struct rte_memzone *mz;
+	char mz_name[DATA_MZ_NAME_MAX_LEN];
+	int n, ret;
+	struct rte_eventdev *dev;
+
+	if (conf == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	/* Check eventdev ID */
+	if (!rte_event_pmd_is_valid_dev(conf->event_dev_id)) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	dev = &rte_eventdevs[conf->event_dev_id];
+
+	adapter_id = conf->timer_adapter_id;
+
+	/* Check that adapter_id is in range */
+	if (adapter_id >= RTE_EVENT_TIMER_ADAPTER_NUM_MAX) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	/* Check adapter ID not already allocated */
+	adapter = &adapters[adapter_id];
+	if (adapter->allocated) {
+		rte_errno = EEXIST;
+		return NULL;
+	}
+
+	/* Create shared data area. */
+	n = snprintf(mz_name, sizeof(mz_name), DATA_MZ_NAME_FORMAT, adapter_id);
+	if (n >= (int)sizeof(mz_name)) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	mz = rte_memzone_reserve(mz_name,
+				 sizeof(struct rte_event_timer_adapter_data),
+				 conf->socket_id, 0);
+	if (mz == NULL)
+		/* rte_errno set by rte_memzone_reserve */
+		return NULL;
+
+	adapter->data = mz->addr;
+	memset(adapter->data, 0, sizeof(struct rte_event_timer_adapter_data));
+
+	adapter->data->mz = mz;
+	adapter->data->event_dev_id = conf->event_dev_id;
+	adapter->data->id = adapter_id;
+	adapter->data->socket_id = conf->socket_id;
+	adapter->data->conf = *conf;  /* copy conf structure */
+
+	/* Query eventdev PMD for timer adapter capabilities and ops */
+	ret = dev->dev_ops->timer_adapter_caps_get(dev,
+						   adapter->data->conf.flags,
+						   &adapter->data->caps,
+						   &adapter->ops);
+	if (ret < 0) {
+		rte_errno = ret;
+		goto free_memzone;
+	}
+
+	if (!(adapter->data->caps &
+	      RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT)) {
+		FUNC_PTR_OR_NULL_RET_WITH_ERRNO(conf_cb, -EINVAL);
+		ret = conf_cb(adapter->data->id, adapter->data->event_dev_id,
+			      &adapter->data->event_port_id, conf_arg);
+		if (ret < 0) {
+			rte_errno = ret;
+			goto free_memzone;
+		}
+	}
+
+	/* Allow driver to do some setup */
+	FUNC_PTR_OR_NULL_RET_WITH_ERRNO(adapter->ops->init, -ENOTSUP);
+	ret = adapter->ops->init(adapter);
+	if (ret < 0) {
+		rte_errno = ret;
+		goto free_memzone;
+	}
+
+	/* Set fast-path function pointers */
+	adapter->arm_burst = adapter->ops->arm_burst;
+	adapter->arm_tmo_tick_burst = adapter->ops->arm_tmo_tick_burst;
+	adapter->cancel_burst = adapter->ops->cancel_burst;
+
+	adapter->allocated = 1;
+
+	return adapter;
+
+free_memzone:
+	rte_memzone_free(adapter->data->mz);
+	return NULL;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_get_info(const struct rte_event_timer_adapter *adapter,
+		struct rte_event_timer_adapter_info *adapter_info)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+
+	if (adapter->ops->get_info)
+		/* let driver set values it knows */
+		adapter->ops->get_info(adapter, adapter_info);
+
+	/* Set common values */
+	adapter_info->conf = adapter->data->conf;
+	adapter_info->event_dev_port_id = adapter->data->event_port_id;
+	adapter_info->caps = adapter->data->caps;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_start(const struct rte_event_timer_adapter *adapter)
+{
+	int ret;
+
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->start, -EINVAL);
+
+	ret = adapter->ops->start(adapter);
+	if (ret < 0)
+		return ret;
+
+	adapter->data->started = 1;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_stop(const struct rte_event_timer_adapter *adapter)
+{
+	int ret;
+
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->stop, -EINVAL);
+
+	if (adapter->data->started == 0) {
+		EVTIM_LOG_ERR("event timer adapter %"PRIu8" already stopped",
+			      adapter->data->id);
+		return 0;
+	}
+
+	ret = adapter->ops->stop(adapter);
+	if (ret < 0)
+		return ret;
+
+	adapter->data->started = 0;
+
+	return 0;
+}
+
+struct rte_event_timer_adapter * __rte_experimental
+rte_event_timer_adapter_lookup(uint16_t adapter_id)
+{
+	char name[DATA_MZ_NAME_MAX_LEN];
+	const struct rte_memzone *mz;
+	struct rte_event_timer_adapter_data *data;
+	struct rte_event_timer_adapter *adapter;
+	int ret;
+	struct rte_eventdev *dev;
+
+	if (adapters[adapter_id].allocated)
+		return &adapters[adapter_id]; /* Adapter is already loaded */
+
+	snprintf(name, DATA_MZ_NAME_MAX_LEN, DATA_MZ_NAME_FORMAT, adapter_id);
+	mz = rte_memzone_lookup(name);
+	if (mz == NULL) {
+		rte_errno = ENOENT;
+		return NULL;
+	}
+
+	data = mz->addr;
+
+	adapter = &adapters[data->id];
+	adapter->data = data;
+
+	dev = &rte_eventdevs[adapter->data->event_dev_id];
+
+	/* Query eventdev PMD for timer adapter capabilities and ops */
+	ret = dev->dev_ops->timer_adapter_caps_get(dev,
+						   adapter->data->conf.flags,
+						   &adapter->data->caps,
+						   &adapter->ops);
+	if (ret < 0) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	/* Set fast-path function pointers */
+	adapter->arm_burst = adapter->ops->arm_burst;
+	adapter->arm_tmo_tick_burst = adapter->ops->arm_tmo_tick_burst;
+	adapter->cancel_burst = adapter->ops->cancel_burst;
+
+	adapter->allocated = 1;
+
+	return adapter;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_free(struct rte_event_timer_adapter *adapter)
+{
+	int ret;
+
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->uninit, -EINVAL);
+
+	if (adapter->data->started == 1) {
+		EVTIM_LOG_ERR("event timer adapter %"PRIu8" must be stopped "
+			      "before freeing", adapter->data->id);
+		return -EBUSY;
+	}
+
+	/* free impl priv data */
+	ret = adapter->ops->uninit(adapter);
+	if (ret < 0)
+		return ret;
+
+	/* free shared data area */
+	ret = rte_memzone_free(adapter->data->mz);
+	if (ret < 0)
+		return ret;
+
+	adapter->data = NULL;
+	adapter->allocated = 0;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_service_id_get(struct rte_event_timer_adapter *adapter,
+				       uint32_t *service_id)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+
+	if (adapter->data->service_inited && service_id != NULL)
+		*service_id = adapter->data->service_id;
+
+	return adapter->data->service_inited ? 0 : -ESRCH;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_stats_get(struct rte_event_timer_adapter *adapter,
+				  struct rte_event_timer_adapter_stats *stats)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->stats_get, -EINVAL);
+	if (stats == NULL)
+		return -EINVAL;
+
+	return adapter->ops->stats_get(adapter, stats);
+}
+
+int __rte_experimental
+rte_event_timer_adapter_stats_reset(struct rte_event_timer_adapter *adapter)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->stats_reset, -EINVAL);
+	return adapter->ops->stats_reset(adapter);
+}
+
+RTE_INIT(event_timer_adapter_init_log);
+static void
+event_timer_adapter_init_log(void)
+{
+	evtim_logtype = rte_log_register("lib.eventdev.adapter.timer");
+	if (evtim_logtype >= 0)
+		rte_log_set_level(evtim_logtype, RTE_LOG_NOTICE);
+}
diff --git a/lib/librte_eventdev/rte_event_timer_adapter_pmd.h b/lib/librte_eventdev/rte_event_timer_adapter_pmd.h
new file mode 100644
index 0000000..cf3509d
--- /dev/null
+++ b/lib/librte_eventdev/rte_event_timer_adapter_pmd.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation.
+ * All rights reserved.
+ */
+
+#ifndef __RTE_EVENT_TIMER_ADAPTER_PMD_H__
+#define __RTE_EVENT_TIMER_ADAPTER_PMD_H__
+
+/**
+ * @file
+ * RTE Event Timer Adapter API (PMD Side)
+ *
+ * @note
+ * This file provides implementation helpers for internal use by PMDs.  They
+ * are not intended to be exposed to applications and are not subject to ABI
+ * versioning.
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "rte_event_timer_adapter.h"
+
+/*
+ * Definitions of functions exported by an event timer adapter implementation
+ * through *rte_event_timer_adapter_ops* structure supplied in the
+ * *rte_event_timer_adapter* structure associated with an event timer adapter.
+ */
+
+typedef int (*rte_event_timer_adapter_init_t)(
+		struct rte_event_timer_adapter *adapter);
+/**< @internal Event timer adapter implementation setup */
+typedef int (*rte_event_timer_adapter_uninit_t)(
+		struct rte_event_timer_adapter *adapter);
+/**< @internal Event timer adapter implementation teardown */
+typedef int (*rte_event_timer_adapter_start_t)(
+		const struct rte_event_timer_adapter *adapter);
+/**< @internal Start running event timer adapter */
+typedef int (*rte_event_timer_adapter_stop_t)(
+		const struct rte_event_timer_adapter *adapter);
+/**< @internal Stop running event timer adapter */
+typedef void (*rte_event_timer_adapter_get_info_t)(
+		const struct rte_event_timer_adapter *adapter,
+		struct rte_event_timer_adapter_info *adapter_info);
+/**< @internal Get contextual information for event timer adapter */
+typedef int (*rte_event_timer_adapter_stats_get_t)(
+		const struct rte_event_timer_adapter *adapter,
+		struct rte_event_timer_adapter_stats *stats);
+/**< @internal Get statistics for event timer adapter */
+typedef int (*rte_event_timer_adapter_stats_reset_t)(
+		const struct rte_event_timer_adapter *adapter);
+/**< @internal Reset statistics for event timer adapter */
+
+/**
+ * @internal Structure containing the functions exported by an event timer
+ * adapter implementation.
+ */
+struct rte_event_timer_adapter_ops {
+	rte_event_timer_adapter_init_t		init;  /**< Set up adapter */
+	rte_event_timer_adapter_uninit_t	uninit;/**< Tear down adapter */
+	rte_event_timer_adapter_start_t		start; /**< Start adapter */
+	rte_event_timer_adapter_stop_t		stop;  /**< Stop adapter */
+	rte_event_timer_adapter_get_info_t	get_info;
+	/**< Get info from driver */
+	rte_event_timer_adapter_stats_get_t	stats_get;
+	/**< Get adapter statistics */
+	rte_event_timer_adapter_stats_reset_t	stats_reset;
+	/**< Reset adapter statistics */
+	rte_event_timer_arm_burst_t		arm_burst;
+	/**< Arm one or more event timers */
+	rte_event_timer_arm_tmo_tick_burst_t	arm_tmo_tick_burst;
+	/**< Arm event timers with same expiration time */
+	rte_event_timer_cancel_burst_t		cancel_burst;
+	/**< Cancel one or more event timers */
+};
+
+/**
+ * @internal Adapter data; structure to be placed in shared memory to be
+ * accessible by various processes in a multi-process configuration.
+ */
+struct rte_event_timer_adapter_data {
+	uint8_t id;
+	/**< Event timer adapter ID */
+	uint8_t event_dev_id;
+	/**< Event device ID */
+	uint32_t socket_id;
+	/**< Socket ID where memory is allocated */
+	uint8_t event_port_id;
+	/**< Optional: event port ID used when the inbuilt port is absent */
+	const struct rte_memzone *mz;
+	/**< Event timer adapter memzone pointer */
+	struct rte_event_timer_adapter_conf conf;
+	/**< Configuration used to configure the adapter. */
+	uint32_t caps;
+	/**< Adapter capabilities */
+	void *adapter_priv;
+	/**< Timer adapter private data*/
+	uint8_t service_inited;
+	/**< Service initialization state */
+	uint32_t service_id;
+	/**< Service ID*/
+
+	RTE_STD_C11
+	uint8_t started : 1;
+	/**< Flag to indicate adapter started. */
+} __rte_cache_aligned;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __RTE_EVENT_TIMER_ADAPTER_PMD_H__ */
diff --git a/lib/librte_eventdev/rte_eventdev.c b/lib/librte_eventdev/rte_eventdev.c
index 2de8d9a..3f016f4 100644
--- a/lib/librte_eventdev/rte_eventdev.c
+++ b/lib/librte_eventdev/rte_eventdev.c
@@ -123,6 +123,28 @@ rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id,
 				: 0;
 }
 
+int __rte_experimental
+rte_event_timer_adapter_caps_get(uint8_t dev_id, uint32_t *caps)
+{
+	struct rte_eventdev *dev;
+	const struct rte_event_timer_adapter_ops *ops;
+
+	RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_eventdevs[dev_id];
+
+	if (caps == NULL)
+		return -EINVAL;
+	*caps = 0;
+
+	return dev->dev_ops->timer_adapter_caps_get ?
+				(*dev->dev_ops->timer_adapter_caps_get)(dev,
+									0,
+									caps,
+									&ops)
+				: 0;
+}
+
 static inline int
 rte_event_dev_queue_config(struct rte_eventdev *dev, uint8_t nb_queues)
 {
diff --git a/lib/librte_eventdev/rte_eventdev.h b/lib/librte_eventdev/rte_eventdev.h
index 86df4be..6fcbe94 100644
--- a/lib/librte_eventdev/rte_eventdev.h
+++ b/lib/librte_eventdev/rte_eventdev.h
@@ -215,6 +215,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_memory.h>
 #include <rte_errno.h>
+#include <rte_compat.h>
 
 struct rte_mbuf; /* we just use mbuf pointers; no need to include rte_mbuf.h */
 struct rte_event;
@@ -1115,6 +1116,25 @@ int
 rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id,
 				uint32_t *caps);
 
+#define RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT (1ULL << 0)
+/**< This flag is set when the timer mechanism is in HW. */
+
+/**
+ * Retrieve the event device's timer adapter capabilities.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @param[out] caps
+ *   A pointer to memory to be filled with event timer adapter capabilities.
+ *
+ * @return
+ *   - 0: Success, driver provided event timer adapter capabilities.
+ *   - <0: Error code returned by the driver function.
+ */
+int __rte_experimental
+rte_event_timer_adapter_caps_get(uint8_t dev_id, uint32_t *caps);
+
 struct rte_eventdev_driver;
 struct rte_eventdev_ops;
 struct rte_eventdev;
diff --git a/lib/librte_eventdev/rte_eventdev_pmd.h b/lib/librte_eventdev/rte_eventdev_pmd.h
index 3a8ddd7..2dcb528 100644
--- a/lib/librte_eventdev/rte_eventdev_pmd.h
+++ b/lib/librte_eventdev/rte_eventdev_pmd.h
@@ -26,6 +26,7 @@ extern "C" {
 #include <rte_malloc.h>
 
 #include "rte_eventdev.h"
+#include "rte_event_timer_adapter_pmd.h"
 
 /* Logging Macros */
 #define RTE_EDEV_LOG_ERR(...) \
@@ -449,6 +450,37 @@ typedef int (*eventdev_eth_rx_adapter_caps_get_t)
 struct rte_event_eth_rx_adapter_queue_conf *queue_conf;
 
 /**
+ * Retrieve the event device's timer adapter capabilities, as well as the ops
+ * structure that an event timer adapter should call through to enter the
+ * driver
+ *
+ * @param dev
+ *   Event device pointer
+ *
+ * @param flags
+ *   Flags that can be used to determine how to select an event timer
+ *   adapter ops structure
+ *
+ * @param[out] caps
+ *   A pointer to memory filled with event timer adapter capabilities.
+ *
+ * @param[out] ops
+ *   A pointer to the ops pointer to set with the address of the desired ops
+ *   structure
+ *
+ * @return
+ *   - 0: Success, driver provides event timer adapter capabilities for the
+ *	event device.
+ *   - <0: Error code returned by the driver function.
+ *
+ */
+typedef int (*eventdev_timer_adapter_caps_get_t)(
+				const struct rte_eventdev *dev,
+				uint64_t flags,
+				uint32_t *caps,
+				const struct rte_event_timer_adapter_ops **ops);
+
+/**
  * Add ethernet Rx queues to event device. This callback is invoked if
  * the caps returned from rte_eventdev_eth_rx_adapter_caps_get(, eth_port_id)
  * has RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT set.
@@ -640,6 +672,9 @@ struct rte_eventdev_ops {
 	eventdev_eth_rx_adapter_stats_reset eth_rx_adapter_stats_reset;
 	/**< Reset ethernet Rx stats */
 
+	eventdev_timer_adapter_caps_get_t timer_adapter_caps_get;
+	/**< Get timer adapter capabilities */
+
 	eventdev_selftest dev_selftest;
 	/**< Start eventdev Selftest */
 
diff --git a/lib/librte_eventdev/rte_eventdev_version.map b/lib/librte_eventdev/rte_eventdev_version.map
index 4396536..6979577 100644
--- a/lib/librte_eventdev/rte_eventdev_version.map
+++ b/lib/librte_eventdev/rte_eventdev_version.map
@@ -66,7 +66,6 @@ DPDK_17.11 {
 	rte_event_eth_rx_adapter_stats_get;
 	rte_event_eth_rx_adapter_stats_reset;
 	rte_event_eth_rx_adapter_stop;
-
 } DPDK_17.08;
 
 DPDK_18.02 {
@@ -80,3 +79,22 @@ DPDK_18.05 {
 
 	rte_event_dev_stop_flush_callback_register;
 } DPDK_18.02;
+
+EXPERIMENTAL {
+	global:
+
+	rte_event_timer_adapter_caps_get;
+	rte_event_timer_adapter_create;
+	rte_event_timer_adapter_create_ext;
+	rte_event_timer_adapter_free;
+	rte_event_timer_adapter_get_info;
+	rte_event_timer_adapter_lookup;
+	rte_event_timer_adapter_service_id_get;
+	rte_event_timer_adapter_start;
+	rte_event_timer_adapter_stats_get;
+	rte_event_timer_adapter_stats_reset;
+	rte_event_timer_adapter_stop;
+	rte_event_timer_arm_burst;
+	rte_event_timer_arm_tmo_tick_burst;
+	rte_event_timer_cancel_burst;
+} DPDK_18.05;
-- 
2.6.4
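
As a quick orientation, a minimal usage sketch of the API added above
(illustrative only, not part of the patch: error handling is trimmed, the
device and adapter IDs are hard-coded, and only the conf fields referenced
in this patch are filled in):

#include <rte_eventdev.h>
#include <rte_event_timer_adapter.h>
#include <rte_lcore.h>

static struct rte_event_timer_adapter *
example_timer_adapter_setup(void)
{
	struct rte_event_timer_adapter *adapter;
	struct rte_event_timer_adapter_conf conf = {
		.event_dev_id = 0,
		.timer_adapter_id = 0,
		.socket_id = rte_socket_id(),
		/* tick resolution, timer count, flags, ... omitted here */
	};
	uint32_t caps = 0;

	/* Without RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT, a service core
	 * drives the adapter; see rte_event_timer_adapter_service_id_get(). */
	if (rte_event_timer_adapter_caps_get(conf.event_dev_id, &caps) < 0)
		return NULL;

	/* default_port_conf_cb sets up an event port when no internal port
	 * exists; rte_event_timer_adapter_create_ext() takes a custom one. */
	adapter = rte_event_timer_adapter_create(&conf);
	if (adapter == NULL)
		return NULL;		/* rte_errno holds the cause */

	if (rte_event_timer_adapter_start(adapter) < 0) {
		rte_event_timer_adapter_free(adapter);
		return NULL;
	}

	/* A secondary process can attach to the same adapter later with
	 * rte_event_timer_adapter_lookup(conf.timer_adapter_id). */
	return adapter;
}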

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
  2018-04-03 15:56  3%         ` Olivier Matz
@ 2018-04-03 16:42  3%           ` Jerin Jacob
  2018-04-04 23:38  0%             ` Ananyev, Konstantin
  0 siblings, 1 reply; 200+ results
From: Jerin Jacob @ 2018-04-03 16:42 UTC (permalink / raw)
  To: Olivier Matz; +Cc: dev, konstantin.ananyev, bruce.richardson

-----Original Message-----
> Date: Tue, 3 Apr 2018 17:56:01 +0200
> From: Olivier Matz <olivier.matz@6wind.com>
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
>  structure
> User-Agent: NeoMutt/20170113 (1.7.2)
> 
> On Tue, Apr 03, 2018 at 09:07:04PM +0530, Jerin Jacob wrote:
> > -----Original Message-----
> > > Date: Tue, 3 Apr 2018 17:25:17 +0200
> > > From: Olivier Matz <olivier.matz@6wind.com>
> > > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > >  structure
> > > User-Agent: NeoMutt/20170113 (1.7.2)
> > > 
> > > On Tue, Apr 03, 2018 at 08:37:23PM +0530, Jerin Jacob wrote:
> > > > -----Original Message-----
> > > > > Date: Tue, 3 Apr 2018 15:26:44 +0200
> > > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > > To: dev@dpdk.org
> > > > > Subject: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > > >  structure
> > > > > X-Mailer: git-send-email 2.11.0
> > > > > 
> > > > > The initial objective of
> > > > > commit d9f0d3a1ffd4 ("ring: remove split cacheline build setting")
> > > > > was to add an empty cache line between the producer and consumer
> > > > > data (on platform with cache line size = 64B), preventing from
> > > > > having them on adjacent cache lines.
> > > > > 
> > > > > Following discussion on the mailing list, it appears that this
> > > > > also imposes an alignment constraint that is not required.
> > > > > 
> > > > > This patch removes the extra alignment constraint and adds the
> > > > > empty cache lines using padding fields in the structure. The
> > > > > size of rte_ring structure and the offset of the fields remain
> > > > > the same on platforms with cache line size = 64B:
> > > > > 
> > > > >   rte_ring = 384
> > > > >   rte_ring.name = 0
> > > > >   rte_ring.flags = 32
> > > > >   rte_ring.memzone = 40
> > > > >   rte_ring.size = 48
> > > > >   rte_ring.mask = 52
> > > > >   rte_ring.prod = 128
> > > > >   rte_ring.cons = 256
> > > > > 
> > > > > But it has an impact on platform where cache line size is 128B:
> > > > > 
> > > > >   rte_ring = 384        -> 768
> > > > >   rte_ring.name = 0
> > > > >   rte_ring.flags = 32
> > > > >   rte_ring.memzone = 40
> > > > >   rte_ring.size = 48
> > > > >   rte_ring.mask = 52
> > > > >   rte_ring.prod = 128   -> 256
> > > > >   rte_ring.cons = 256   -> 512
> > > > 
> > > > Are we leaving TWO cache lines to make sure the HW prefetcher doesn't load
> > > > the adjacent cache line (the consumer's)?
> > > > 
> > > > If so, will it have an impact on those machines where the cache line is 128B
> > > > and the HW prefetcher is not loading the next cache line explicitly, right?
> > > 
> > > The impact on machines that have a 128B cache line is that an unused
> > > cache line will be added between the producer and consumer data. I
> > > expect that the impact is positive in case there is a hw prefetcher, and
> > > null in case there is no such prefetcher.
> > 
> > It is not null, right? You are losing 256B for each ring.
> 
> Is it really that important?

In pipeline or eventdev SW cases there could be more rings in the system.
I don't see any downside to having a config option which is enabled by
default.

In my view, such config options are good: in embedded use cases, customers
can really fine-tune the target for their needs. In server use cases, let the
option be enabled by default; no harm done.

> 
> 
> > > On machines with 64B cache line, this was already the case. It just
> > > reduces the alignment constraint.
> > 
> > Not all the 64B CL machines will have HW prefetch.
> > 
> > I would recommend to add conditional compilation flags to express HW
> > prefetch enabled or not? based on that we can decide to reserve
> > the additional space. By default, in common config, HW prefetch can
> > be enabled so that it works for almost all cases.
> 
> The hw prefetcher can be enabled at runtime, so a compilation flag
> does not seem to be a good idea. Moreover, changing this compilation

On that hardware, HW prefetch can be disabled at runtime, so it is fine
with the default config. I was talking about some low-end ARM hardware
where HW prefetch is not present at all.

> flag would change the ABI.

The ABI is broken anyway due to the structure size change, right?
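
To make the trade-off concrete, an illustrative sketch (simplified
structures, not the real rte_ring definition) of the two layouts being
discussed:

#include <stdint.h>
#include <rte_memory.h>		/* __rte_cache_aligned */

struct headtail {
	volatile uint32_t head;
	volatile uint32_t tail;
};

/* prod and cons on adjacent cache lines: a "next-line" HW prefetcher that
 * touches the producer line may also pull in the consumer line. */
struct ring_adjacent {
	struct headtail prod __rte_cache_aligned;
	struct headtail cons __rte_cache_aligned;
};

/* One unused cache line in between, as the patch does with its pad fields:
 * it costs RTE_CACHE_LINE_SIZE bytes per ring but isolates the two sides
 * whether or not a HW prefetcher is present. */
struct ring_padded {
	struct headtail prod __rte_cache_aligned;
	char pad __rte_cache_aligned;		/* empty cache line */
	struct headtail cons __rte_cache_aligned;
};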

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
  @ 2018-04-03 15:56  3%         ` Olivier Matz
  2018-04-03 16:42  3%           ` Jerin Jacob
  0 siblings, 1 reply; 200+ results
From: Olivier Matz @ 2018-04-03 15:56 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev, konstantin.ananyev, bruce.richardson

On Tue, Apr 03, 2018 at 09:07:04PM +0530, Jerin Jacob wrote:
> -----Original Message-----
> > Date: Tue, 3 Apr 2018 17:25:17 +0200
> > From: Olivier Matz <olivier.matz@6wind.com>
> > To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > CC: dev@dpdk.org, konstantin.ananyev@intel.com, bruce.richardson@intel.com
> > Subject: Re: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> >  structure
> > User-Agent: NeoMutt/20170113 (1.7.2)
> > 
> > On Tue, Apr 03, 2018 at 08:37:23PM +0530, Jerin Jacob wrote:
> > > -----Original Message-----
> > > > Date: Tue, 3 Apr 2018 15:26:44 +0200
> > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > To: dev@dpdk.org
> > > > Subject: [dpdk-dev] [PATCH] ring: relax alignment constraint on ring
> > > >  structure
> > > > X-Mailer: git-send-email 2.11.0
> > > > 
> > > > The initial objective of
> > > > commit d9f0d3a1ffd4 ("ring: remove split cacheline build setting")
> > > > was to add an empty cache line between the producer and consumer
> > > > data (on platform with cache line size = 64B), preventing from
> > > > having them on adjacent cache lines.
> > > > 
> > > > Following discussion on the mailing list, it appears that this
> > > > also imposes an alignment constraint that is not required.
> > > > 
> > > > This patch removes the extra alignment constraint and adds the
> > > > empty cache lines using padding fields in the structure. The
> > > > size of rte_ring structure and the offset of the fields remain
> > > > the same on platforms with cache line size = 64B:
> > > > 
> > > >   rte_ring = 384
> > > >   rte_ring.name = 0
> > > >   rte_ring.flags = 32
> > > >   rte_ring.memzone = 40
> > > >   rte_ring.size = 48
> > > >   rte_ring.mask = 52
> > > >   rte_ring.prod = 128
> > > >   rte_ring.cons = 256
> > > > 
> > > > But it has an impact on platform where cache line size is 128B:
> > > > 
> > > >   rte_ring = 384        -> 768
> > > >   rte_ring.name = 0
> > > >   rte_ring.flags = 32
> > > >   rte_ring.memzone = 40
> > > >   rte_ring.size = 48
> > > >   rte_ring.mask = 52
> > > >   rte_ring.prod = 128   -> 256
> > > >   rte_ring.cons = 256   -> 512
> > > 
> > > Are we leaving TWO cache lines to make sure the HW prefetcher doesn't load
> > > the adjacent cache line (the consumer's)?
> > > 
> > > If so, will it have an impact on those machines where the cache line is 128B
> > > and the HW prefetcher is not loading the next cache line explicitly, right?
> > 
> > The impact on machines that have a 128B cache line is that an unused
> > cache line will be added between the producer and consumer data. I
> > expect that the impact is positive in case there is a hw prefetcher, and
> > null in case there is no such prefetcher.
> 
> It is not null, right? You are losing 256B for each ring.

Is it really that important?


> > On machines with 64B cache line, this was already the case. It just
> > reduces the alignment constraint.
> 
> Not all the 64B CL machines will have HW prefetch.
> 
> I would recommend to add conditional compilation flags to express HW
> prefetch enabled or not? based on that we can decide to reserve
> the additional space. By default, in common config, HW prefetch can
> be enabled so that it works for almost all cases.

The hw prefetcher can be enabled at runtime, so a compilation flag
does not seem to be a good idea. Moreover, changing this compilation
flag would change the ABI.

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v3 0/2] gcc-8 build fixes
  @ 2018-04-03 15:10  3%   ` Stephen Hemminger
  0 siblings, 0 replies; 200+ results
From: Stephen Hemminger @ 2018-04-03 15:10 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev

On Tue, 3 Apr 2018 10:23:43 +0100
Ferruh Yigit <ferruh.yigit@intel.com> wrote:

> On 3/29/2018 6:05 PM, Stephen Hemminger wrote:
> > This fixes some of the obvious warnings found building DPDK
> > with gcc-8. There still are some deeper issues in the rte_hash_table
> > code; leave the fix for that up to the maintainer.
> > 
> > Stephen Hemminger (2):
> >   rte_mbuf: fix strncpy warnings
> >   rte_metrics: fix strncpy truncation warning
> > 
> > v3
> >   missing SOB on 1st patch
> > 
> > v2
> >   fix issues with wrong length in mbuf pool_ops
> >   don't need memset in metrics names
> > 
> > Stephen Hemminger (2):
> >   rte_mbuf: fix strncpy warnings
> >   rte_metrics: fix strncpy truncation warning  
> 
> I tried with gcc-8 [1] and am getting a few more build errors similar to these
> ones. Are these two files the only build errors you get?
> 
> 
> [1]
> gcc (GCC) 8.0.1 20180401 (experimental)
> 

This fixes the easy ones. The harder one is in cuckoo hash.

  CC rte_table_hash_cuckoo.o
lib/librte_table/rte_table_hash_cuckoo.c: In function ‘rte_table_hash_cuckoo_create’:
lib/librte_table/rte_table_hash_cuckoo.c:110:16: error: cast between incompatible function types from ‘rte_table_hash_op_hash’ {aka ‘long unsigned int (*)(void *, void *, unsigned int,  long unsigned int)’} to ‘uint32_t (*)(const void *, uint32_t,  uint32_t)’ {aka ‘unsigned int (*)(const void *, unsigned int,  unsigned int)’} [-Werror=cast-function-type]
   .hash_func = (rte_hash_function)(p->f_hash),
                ^
cc1: all warnings being treated as errors

I'm not sure what the right way to fix this one is. The hash table library
should not be defining its own special hash function prototype. Changing to
a common definition is non-trivial and breaks ABI. Casting seems wrong,
error-prone, and a bad precedent in this case.
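
One possible direction, sketched with purely illustrative names (this is
not the fix that was eventually merged): keep the two prototypes and add a
small shim with the signature rte_hash expects, so the only cast left is on
the key pointer, not between function-pointer types. The per-table context
is modelled with file-scope variables just for brevity; carrying it per
cuckoo table is exactly what makes a real fix non-trivial.

#include <stdint.h>

/* Prototypes as reported by gcc-8 above (typedef names are made up). */
typedef uint64_t (*table_hash_op_t)(void *key, void *mask,
				    uint32_t key_size, uint64_t seed);
typedef uint32_t (*hash_function_t)(const void *key, uint32_t key_len,
				    uint32_t init_val);

static table_hash_op_t g_f_hash;	/* would be set at table creation */
static void *g_hash_mask;
static uint64_t g_hash_seed;

static uint32_t
hash_op_shim(const void *key, uint32_t key_len, uint32_t init_val)
{
	(void)init_val;
	/* const is cast away on the key; no function-type cast is needed */
	return (uint32_t)g_f_hash((void *)(uintptr_t)key, g_hash_mask,
				  key_len, g_hash_seed);
}

/* .hash_func = hash_op_shim then assigns without the cast warning firing */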

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external backend support
  2018-04-03 14:42  0%           ` Tan, Jianfeng
@ 2018-04-03 14:48  0%             ` Wodkowski, PawelX
  0 siblings, 0 replies; 200+ results
From: Wodkowski, PawelX @ 2018-04-03 14:48 UTC (permalink / raw)
  To: Tan, Jianfeng, Maxime Coquelin, Zhang, Roy Fan, dev; +Cc: jianjay.zhou

> -----Original Message-----
> From: Tan, Jianfeng
> Sent: Tuesday, April 3, 2018 4:43 PM
> To: Maxime Coquelin <maxime.coquelin@redhat.com>; Zhang, Roy Fan
> <roy.fan.zhang@intel.com>; Wodkowski, PawelX
> <pawelx.wodkowski@intel.com>; dev@dpdk.org
> Cc: jianjay.zhou@huawei.com
> Subject: Re: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
> backend support
> 
> 
> 
> On 4/3/2018 9:44 PM, Maxime Coquelin wrote:
> > Hi Pawel, Fan,
> >
> > On 04/01/2018 09:53 PM, Zhang, Roy Fan wrote:
> >> Hi Pawel,
> >>
> >>> -----Original Message-----
> >>> From: Wodkowski, PawelX
> >>> Sent: Thursday, March 29, 2018 2:48 PM
> >>> To: Zhang, Roy Fan <roy.fan.zhang@intel.com>; dev@dpdk.org
> >>> Cc: maxime.coquelin@redhat.com; jianjay.zhou@huawei.com; Tan,
> Jianfeng
> >>> <jianfeng.tan@intel.com>
> >>> Subject: RE: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
> >>> backend support
> >>>
> >>>> -----Original Message-----
> >>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Fan Zhang
> >>>> Sent: Thursday, March 29, 2018 2:53 PM
> >>>> To: dev@dpdk.org
> >>>> Cc: maxime.coquelin@redhat.com; jianjay.zhou@huawei.com; Tan,
> >>> Jianfeng
> >>>> <jianfeng.tan@intel.com>
> >>>> Subject: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
> >>>> backend support
> >>>>
> >>>> This patch adds external backend support to vhost library. The patch
> >>>> provides new APIs for the external backend to register pre and post
> >>>> vhost-user message handlers.
> >>>>
> >>>> Signed-off-by: Fan Zhang <roy.fan.zhang@intel.com>
> >>>> ---
> >>>>   lib/librte_vhost/rte_vhost.h           | 64
> >>>> +++++++++++++++++++++++++++++++++-
> >>>>   lib/librte_vhost/rte_vhost_version.map |  6 ++++
> >>>>   lib/librte_vhost/vhost.c               | 17 ++++++++-
> >>>>   lib/librte_vhost/vhost.h               |  8 +++--
> >>>>   lib/librte_vhost/vhost_user.c          | 33 +++++++++++++++++-
> >>>>   5 files changed, 123 insertions(+), 5 deletions(-)
> >>>>
> >>>> diff --git a/lib/librte_vhost/rte_vhost.h
> >>>> b/lib/librte_vhost/rte_vhost.h index d332069..b902c44 100644
> >>>> --- a/lib/librte_vhost/rte_vhost.h
> >>>> +++ b/lib/librte_vhost/rte_vhost.h
> >>>> @@ -1,5 +1,5 @@
> >
> > <snip/>
> >
> >>
> >>>> + * @param require_reply
> >>>> + *  If the handler requires sending a reply, this variable shall be
> >>>> +written 1,
> >>>> + *  otherwise 0.
> >>>> + * @return
> >>>> + *  0 on success, -1 on failure
> >>>> + */
> >>>> +typedef int (*rte_vhost_msg_post_handle)(int vid, void *msg,
> >>>> +        uint32_t *require_reply);
> >>>> +
> >>>
> >>> What does 'Message pointer' mean? Is this const for us? Is this payload?
> >>> Making
> >>> msg 'void *' is not a way to go here. Those pre and post handlers
> >>> need to see
> >>> exactly the same structures like vhost_user.c file. Otherwise we can
> >>> get into
> >>> troubles when ABI changes.
> >>
> >> It is the pointer to the vhost_user message. It cannot be const as
> >> the backend
> >> may change the payload.
> >>
> >>>
> >>> Also you can easily merge pre and post handlers into one handler
> >>> with one
> >>> Parameter describing what phase of message processing we are now.
> >>>
> >>
> >> No I don't think so. To do so it will be quite unclear in the future
> >> as we are
> >> using one function to do two totally different things.
> >
> > Time is running out for v18.05 integration deadline (April 6th), and
> > we haven't reached a consensus.
> >
> > Except this API point, I think vhost-crypto is at the right level.
> > Since vhost-crypto lives in librte_vhost, I propose Fan cooks an
> > intermediate solution that does not need API change.
> >
> > Doing this, we postpone the API change to v18.08, so we have time to
> > discuss what the right API should be. Once agreed, vhost-crypto moves to
> > the new API.
> >
> > Pawel, Jianfeng, Fan, is it fine for you?
> 
> +1. This can avoid blocking this patch set, and give more time for
> discussing new APIs and external structs.
> 
> Thanks,
> Jianfeng
> 
> >
> > Thanks,
> > Maxime

Fine for me too.
Pawel

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external backend support
  2018-04-03 13:44  0%         ` Maxime Coquelin
  2018-04-03 13:55  0%           ` Zhang, Roy Fan
@ 2018-04-03 14:42  0%           ` Tan, Jianfeng
  2018-04-03 14:48  0%             ` Wodkowski, PawelX
  1 sibling, 1 reply; 200+ results
From: Tan, Jianfeng @ 2018-04-03 14:42 UTC (permalink / raw)
  To: Maxime Coquelin, Zhang, Roy Fan, Wodkowski, PawelX, dev; +Cc: jianjay.zhou



On 4/3/2018 9:44 PM, Maxime Coquelin wrote:
> Hi Pawel, Fan,
>
> On 04/01/2018 09:53 PM, Zhang, Roy Fan wrote:
>> Hi Pawel,
>>
>>> -----Original Message-----
>>> From: Wodkowski, PawelX
>>> Sent: Thursday, March 29, 2018 2:48 PM
>>> To: Zhang, Roy Fan <roy.fan.zhang@intel.com>; dev@dpdk.org
>>> Cc: maxime.coquelin@redhat.com; jianjay.zhou@huawei.com; Tan, Jianfeng
>>> <jianfeng.tan@intel.com>
>>> Subject: RE: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
>>> backend support
>>>
>>>> -----Original Message-----
>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Fan Zhang
>>>> Sent: Thursday, March 29, 2018 2:53 PM
>>>> To: dev@dpdk.org
>>>> Cc: maxime.coquelin@redhat.com; jianjay.zhou@huawei.com; Tan,
>>> Jianfeng
>>>> <jianfeng.tan@intel.com>
>>>> Subject: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
>>>> backend support
>>>>
>>>> This patch adds external backend support to vhost library. The patch
>>>> provides new APIs for the external backend to register pre and post
>>>> vhost-user message handlers.
>>>>
>>>> Signed-off-by: Fan Zhang <roy.fan.zhang@intel.com>
>>>> ---
>>>>   lib/librte_vhost/rte_vhost.h           | 64
>>>> +++++++++++++++++++++++++++++++++-
>>>>   lib/librte_vhost/rte_vhost_version.map |  6 ++++
>>>>   lib/librte_vhost/vhost.c               | 17 ++++++++-
>>>>   lib/librte_vhost/vhost.h               |  8 +++--
>>>>   lib/librte_vhost/vhost_user.c          | 33 +++++++++++++++++-
>>>>   5 files changed, 123 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/lib/librte_vhost/rte_vhost.h
>>>> b/lib/librte_vhost/rte_vhost.h index d332069..b902c44 100644
>>>> --- a/lib/librte_vhost/rte_vhost.h
>>>> +++ b/lib/librte_vhost/rte_vhost.h
>>>> @@ -1,5 +1,5 @@
>
> <snip/>
>
>>
>>>> + * @param require_reply
>>>> + *  If the handler requires sending a reply, this variable shall be
>>>> +written 1,
>>>> + *  otherwise 0.
>>>> + * @return
>>>> + *  0 on success, -1 on failure
>>>> + */
>>>> +typedef int (*rte_vhost_msg_post_handle)(int vid, void *msg,
>>>> +        uint32_t *require_reply);
>>>> +
>>>
>>> What does 'Message pointer' mean? Is this const for us? Is this payload?
>>> Making
>>> msg 'void *' is not a way to go here. Those pre and post handlers 
>>> need to see
>>> exactly the same structures like vhost_user.c file. Otherwise we can 
>>> get into
>>> troubles when ABI changes.
>>
>> It is the pointer to the vhost_user message. It cannot be const as 
>> the backend
>> may change the payload.
>>
>>>
>>> Also you can easily merge pre and post handlers into one handler 
>>> with one
>>> Parameter describing what phase of message processing we are now.
>>>
>>
>> No I don't think so. To do so it will be quite unclear in the future 
>> as we are
>> using one function to do two totally different things.
>
> Time is running out for v18.05 integration deadline (April 6th), and 
> we haven't reached a consensus.
>
> Except this API point, I think vhost-crypto is at the right level.
> Since vhost-crypto lives in librte_vhost, I propose Fan cooks an
> intermediate solution that does not need API change.
>
> Doing this, we postpone the API change to v18.08, so we have time to
> discuss what the right API should be. Once agreed, vhost-crypto moves to
> the new API.
>
> Pawel, Jianfeng, Fan, is it fine for you?

+1. This can avoid blocking this patch set, and give more time for 
discussing new APIs and external structs.

Thanks,
Jianfeng

>
> Thanks,
> Maxime

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external backend support
  2018-04-03 13:44  0%         ` Maxime Coquelin
@ 2018-04-03 13:55  0%           ` Zhang, Roy Fan
  2018-04-03 14:42  0%           ` Tan, Jianfeng
  1 sibling, 0 replies; 200+ results
From: Zhang, Roy Fan @ 2018-04-03 13:55 UTC (permalink / raw)
  To: Maxime Coquelin, Wodkowski, PawelX, dev; +Cc: jianjay.zhou, Tan, Jianfeng

Hi Maxime,

No problem. I will work on that.
Pawel, Jianfeng, if you guys have other concerns or suggestions, please give me a shout.

Thanks a lot guys, for the review and help!

Regards,
Fan

> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Tuesday, April 3, 2018 2:45 PM
> To: Zhang, Roy Fan <roy.fan.zhang@intel.com>; Wodkowski, PawelX
> <pawelx.wodkowski@intel.com>; dev@dpdk.org
> Cc: jianjay.zhou@huawei.com; Tan, Jianfeng <jianfeng.tan@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
> backend support
> 
> Hi Pawel, Fan,
> 
> On 04/01/2018 09:53 PM, Zhang, Roy Fan wrote:
> > Hi Pawel,
> >
> >> -----Original Message-----
> >> From: Wodkowski, PawelX
> >> Sent: Thursday, March 29, 2018 2:48 PM
> >> To: Zhang, Roy Fan <roy.fan.zhang@intel.com>; dev@dpdk.org
> >> Cc: maxime.coquelin@redhat.com; jianjay.zhou@huawei.com; Tan,
> >> Jianfeng <jianfeng.tan@intel.com>
> >> Subject: RE: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
> >> backend support
> >>
> >>> -----Original Message-----
> >>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Fan Zhang
> >>> Sent: Thursday, March 29, 2018 2:53 PM
> >>> To: dev@dpdk.org
> >>> Cc: maxime.coquelin@redhat.com; jianjay.zhou@huawei.com; Tan,
> >> Jianfeng
> >>> <jianfeng.tan@intel.com>
> >>> Subject: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
> >>> backend support
> >>>
> >>> This patch adds external backend support to vhost library. The patch
> >>> provides new APIs for the external backend to register pre and post
> >>> vhost-user message handlers.
> >>>
> >>> Signed-off-by: Fan Zhang <roy.fan.zhang@intel.com>
> >>> ---
> >>>   lib/librte_vhost/rte_vhost.h           | 64
> >>> +++++++++++++++++++++++++++++++++-
> >>>   lib/librte_vhost/rte_vhost_version.map |  6 ++++
> >>>   lib/librte_vhost/vhost.c               | 17 ++++++++-
> >>>   lib/librte_vhost/vhost.h               |  8 +++--
> >>>   lib/librte_vhost/vhost_user.c          | 33 +++++++++++++++++-
> >>>   5 files changed, 123 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/lib/librte_vhost/rte_vhost.h
> >>> b/lib/librte_vhost/rte_vhost.h index d332069..b902c44 100644
> >>> --- a/lib/librte_vhost/rte_vhost.h
> >>> +++ b/lib/librte_vhost/rte_vhost.h
> >>> @@ -1,5 +1,5 @@
> 
> <snip/>
> 
> >
> >>> + * @param require_reply
> >>> + *  If the handler requires sending a reply, this variable shall be
> >>> +written 1,
> >>> + *  otherwise 0.
> >>> + * @return
> >>> + *  0 on success, -1 on failure
> >>> + */
> >>> +typedef int (*rte_vhost_msg_post_handle)(int vid, void *msg,
> >>> +		uint32_t *require_reply);
> >>> +
> >>
> >> What does 'Message pointer' mean? Is this const for us? Is this payload?
> >> Making msg 'void *' is not a way to go here. Those pre and post
> >> handlers need to see exactly the same structures like vhost_user.c
> >> file. Otherwise we can get into troubles when ABI changes.
> >
> > It is the pointer to the vhost_user message. It cannot be const as the
> > backend may change the payload.
> >
> >>
> >> Also you can easily merge pre and post handlers into one handler with
> >> one Parameter describing what phase of message processing we are now.
> >>
> >
> > No I don't think so. To do so it will be quite unclear in the future
> > as we are using one function to do two totally different things.
> 
> Time is running out for v18.05 integration deadline (April 6th), and we haven't
> reached a consensus.
> 
> Except this API point, I think vhost-crypto is at the right level.
> Since vhost-crypto lives in librte_vhost, I propose Fan cooks an intermediate
> solution that does not need API change.
> 
> Doing this, we postpone the API change to v18.08, so we have time to discuss
> what the right API should be. Once agreed, vhost-crypto moves to the new
> API.
> 
> Pawel, Jianfeng, Fan, is it fine for you?
> 
> Thanks,
> Maxime

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external backend support
  2018-04-01 19:53  0%       ` Zhang, Roy Fan
@ 2018-04-03 13:44  0%         ` Maxime Coquelin
  2018-04-03 13:55  0%           ` Zhang, Roy Fan
  2018-04-03 14:42  0%           ` Tan, Jianfeng
  0 siblings, 2 replies; 200+ results
From: Maxime Coquelin @ 2018-04-03 13:44 UTC (permalink / raw)
  To: Zhang, Roy Fan, Wodkowski, PawelX, dev; +Cc: jianjay.zhou, Tan, Jianfeng

Hi Pawel, Fan,

On 04/01/2018 09:53 PM, Zhang, Roy Fan wrote:
> Hi Pawel,
> 
>> -----Original Message-----
>> From: Wodkowski, PawelX
>> Sent: Thursday, March 29, 2018 2:48 PM
>> To: Zhang, Roy Fan <roy.fan.zhang@intel.com>; dev@dpdk.org
>> Cc: maxime.coquelin@redhat.com; jianjay.zhou@huawei.com; Tan, Jianfeng
>> <jianfeng.tan@intel.com>
>> Subject: RE: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
>> backend support
>>
>>> -----Original Message-----
>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Fan Zhang
>>> Sent: Thursday, March 29, 2018 2:53 PM
>>> To: dev@dpdk.org
>>> Cc: maxime.coquelin@redhat.com; jianjay.zhou@huawei.com; Tan,
>> Jianfeng
>>> <jianfeng.tan@intel.com>
>>> Subject: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
>>> backend support
>>>
>>> This patch adds external backend support to vhost library. The patch
>>> provides new APIs for the external backend to register pre and post
>>> vhost-user message handlers.
>>>
>>> Signed-off-by: Fan Zhang <roy.fan.zhang@intel.com>
>>> ---
>>>   lib/librte_vhost/rte_vhost.h           | 64
>>> +++++++++++++++++++++++++++++++++-
>>>   lib/librte_vhost/rte_vhost_version.map |  6 ++++
>>>   lib/librte_vhost/vhost.c               | 17 ++++++++-
>>>   lib/librte_vhost/vhost.h               |  8 +++--
>>>   lib/librte_vhost/vhost_user.c          | 33 +++++++++++++++++-
>>>   5 files changed, 123 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/lib/librte_vhost/rte_vhost.h
>>> b/lib/librte_vhost/rte_vhost.h index d332069..b902c44 100644
>>> --- a/lib/librte_vhost/rte_vhost.h
>>> +++ b/lib/librte_vhost/rte_vhost.h
>>> @@ -1,5 +1,5 @@

<snip/>

> 
>>> + * @param require_reply
>>> + *  If the handler requires sending a reply, this variable shall be
>>> +written 1,
>>> + *  otherwise 0.
>>> + * @return
>>> + *  0 on success, -1 on failure
>>> + */
>>> +typedef int (*rte_vhost_msg_post_handle)(int vid, void *msg,
>>> +		uint32_t *require_reply);
>>> +
>>
>> What does 'Message pointer' mean? Is this const for us? Is this payload? Making
>> msg 'void *' is not a way to go here. Those pre and post handlers need to see
>> exactly the same structures like vhost_user.c file. Otherwise we can get into
>> troubles when ABI changes.
> 
> It is the pointer to the vhost_user message. It cannot be const as the backend
> may change the payload.
> 
>>
>> Also you can easily merge pre and post handlers into one handler with one
>> Parameter describing what phase of message processing we are now.
>>
> 
> No I don't think so. To do so it will be quite unclear in the future as we are
> using one function to do two totally different things.

Time is running out for the v18.05 integration deadline (April 6th), and we
haven't reached a consensus.

Apart from this API point, I think vhost-crypto is at the right level.
Since vhost-crypto lives in librte_vhost, I propose that Fan cooks up an
intermediate solution that does not need an API change.

Doing this, we postpone the API change to v18.08, so we have time to
discuss what the right API should be. Once agreed, vhost-crypto moves to
the new API.

Pawel, Jianfeng, Fan, is it fine for you?

Thanks,
Maxime
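
For reference, a sketch of what an external backend handler looks like
against the v4 prototype quoted above (illustrative only; the callback and
its registration were still under discussion at this point, and the opaque
void *msg is precisely the ABI concern raised earlier in the thread):

#include <stdint.h>

/* Matches:
 *   typedef int (*rte_vhost_msg_post_handle)(int vid, void *msg,
 *                                            uint32_t *require_reply);
 */
static int
example_post_msg_handler(int vid, void *msg, uint32_t *require_reply)
{
	/* The backend has to interpret msg with the exact vhost-user
	 * message layout used internally by lib/librte_vhost, which is
	 * what breaks silently if that structure ever changes. */
	(void)vid;
	(void)msg;

	*require_reply = 0;	/* write 1 only if a reply must be sent */
	return 0;		/* 0 on success, -1 on failure */
}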

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH] mbuf: remove control mbuf
  @ 2018-04-03 13:39  3% ` Olivier Matz
  0 siblings, 0 replies; 200+ results
From: Olivier Matz @ 2018-04-03 13:39 UTC (permalink / raw)
  To: dev

The rte_ctrlmbuf structure is not used by any example application
in dpdk. Remove it, as announced on the mailing list.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 doc/guides/prog_guide/glossary.rst     |  3 --
 doc/guides/prog_guide/mbuf_lib.rst     | 11 ++---
 doc/guides/prog_guide/overview.rst     |  4 +-
 doc/guides/prog_guide/source_org.rst   |  2 +-
 doc/guides/rel_notes/deprecation.rst   | 13 -----
 doc/guides/rel_notes/release_18_05.rst | 15 +++++-
 lib/librte_mbuf/Makefile               |  2 +-
 lib/librte_mbuf/rte_mbuf.c             | 15 ------
 lib/librte_mbuf/rte_mbuf.h             | 86 ----------------------------------
 lib/librte_mbuf/rte_mbuf_version.map   |  1 -
 10 files changed, 23 insertions(+), 129 deletions(-)

diff --git a/doc/guides/prog_guide/glossary.rst b/doc/guides/prog_guide/glossary.rst
index e101bc022..dda45bd18 100644
--- a/doc/guides/prog_guide/glossary.rst
+++ b/doc/guides/prog_guide/glossary.rst
@@ -41,9 +41,6 @@ CPU
 CRC
    Cyclic Redundancy Check
 
-ctrlmbuf
-   An *mbuf* carrying control data.
-
 Data Plane
    In contrast to the control plane, the data plane in a network architecture
    are the layers involved when forwarding packets.  These layers must be
diff --git a/doc/guides/prog_guide/mbuf_lib.rst b/doc/guides/prog_guide/mbuf_lib.rst
index 210a9af9f..0d3223b08 100644
--- a/doc/guides/prog_guide/mbuf_lib.rst
+++ b/doc/guides/prog_guide/mbuf_lib.rst
@@ -10,9 +10,8 @@ The mbuf library provides the ability to allocate and free buffers (mbufs)
 that may be used by the DPDK application to store message buffers.
 The message buffers are stored in a mempool, using the :ref:`Mempool Library <Mempool_Library>`.
 
-A rte_mbuf struct can carry network packet buffers
-or generic control buffers (indicated by the CTRL_MBUF_FLAG).
-This can be extended to other types.
+An rte_mbuf struct generally carries network packet buffers, but it can
+actually carry any data (control data, events, ...).
 The rte_mbuf header structure is kept as small as possible and currently uses
 just two cache lines, with the most frequently used fields being on the first
 of the two cache lines.
@@ -68,13 +67,13 @@ Buffers Stored in Memory Pools
 The Buffer Manager uses the :ref:`Mempool Library <Mempool_Library>` to allocate buffers.
 Therefore, it ensures that the packet header is interleaved optimally across the channels and ranks for L3 processing.
 An mbuf contains a field indicating the pool that it originated from.
-When calling rte_ctrlmbuf_free(m) or rte_pktmbuf_free(m), the mbuf returns to its original pool.
+When calling rte_pktmbuf_free(m), the mbuf returns to its original pool.
 
 Constructors
 ------------
 
-Packet and control mbuf constructors are provided by the API.
-The rte_pktmbuf_init() and rte_ctrlmbuf_init() functions initialize some fields in the mbuf structure that
+Packet mbuf constructors are provided by the API.
+The rte_pktmbuf_init() function initializes some fields in the mbuf structure that
 are not modified by the user once created (mbuf type, origin pool, buffer start address, and so on).
 This function is given as a callback function to the rte_mempool_create() function at pool creation time.
 
diff --git a/doc/guides/prog_guide/overview.rst b/doc/guides/prog_guide/overview.rst
index 2663fe0e8..c01f37e3c 100644
--- a/doc/guides/prog_guide/overview.rst
+++ b/doc/guides/prog_guide/overview.rst
@@ -130,8 +130,8 @@ The mbuf library provides the facility to create and destroy buffers
 that may be used by the DPDK application to store message buffers.
 The message buffers are created at startup time and stored in a mempool, using the DPDK mempool library.
 
-This library provides an API to allocate/free mbufs, manipulate control message buffers (ctrlmbuf) which are generic message buffers,
-and packet buffers (pktmbuf) which are used to carry network packets.
+This library provides an API to allocate/free mbufs, manipulate
+packet buffers which are used to carry network packets.
 
 Network Packet Buffer Management is described in :ref:`Mbuf Library <Mbuf_Library>`.
 
diff --git a/doc/guides/prog_guide/source_org.rst b/doc/guides/prog_guide/source_org.rst
index a8f5832bc..b640b0111 100644
--- a/doc/guides/prog_guide/source_org.rst
+++ b/doc/guides/prog_guide/source_org.rst
@@ -46,7 +46,7 @@ The lib directory contains::
     +-- librte_kni          # Kernel NIC interface
     +-- librte_kvargs       # Argument parsing library
     +-- librte_lpm          # Longest prefix match library
-    +-- librte_mbuf         # Packet and control mbuf manipulation
+    +-- librte_mbuf         # Packet buffer manipulation
     +-- librte_mempool      # Memory pool manager (fixed sized objects)
     +-- librte_meter        # QoS metering library
     +-- librte_net          # Various IP-related headers
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 84e153461..61b8ac705 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -76,19 +76,6 @@ Deprecation Notices
     customize objects population and allocate contiguous
     block of objects if underlying driver supports it.
 
-* mbuf: The control mbuf API will be removed in v18.05. The impacted
-  functions and macros are:
-
-  - ``rte_ctrlmbuf_init()``
-  - ``rte_ctrlmbuf_alloc()``
-  - ``rte_ctrlmbuf_free()``
-  - ``rte_ctrlmbuf_data()``
-  - ``rte_ctrlmbuf_len()``
-  - ``rte_is_ctrlmbuf()``
-  - ``CTRL_MBUF_FLAG``
-
-  The packet mbuf API should be used as a replacement.
-
 * mbuf: The opaque ``mbuf->hash.sched`` field will be updated to support generic
   definition in line with the ethdev TM and MTR APIs. Currently, this field
   is defined in librte_sched in a non-generic way. The new generic format
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 4d0276f1d..9b9a74885 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -72,6 +72,19 @@ API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* mbuf: The control mbuf API has been removed in v18.05. The impacted
+  functions and macros are:
+
+  - ``rte_ctrlmbuf_init()``
+  - ``rte_ctrlmbuf_alloc()``
+  - ``rte_ctrlmbuf_free()``
+  - ``rte_ctrlmbuf_data()``
+  - ``rte_ctrlmbuf_len()``
+  - ``rte_is_ctrlmbuf()``
+  - ``CTRL_MBUF_FLAG``
+
+  The packet mbuf API should be used as a replacement.
+
 
 ABI Changes
 -----------
@@ -163,7 +176,7 @@ The libraries prepended with a plus sign were incremented in this version.
      librte_kvargs.so.1
      librte_latencystats.so.1
      librte_lpm.so.2
-     librte_mbuf.so.3
+   + librte_mbuf.so.4
      librte_mempool.so.3
    + librte_meter.so.2
      librte_metrics.so.1
diff --git a/lib/librte_mbuf/Makefile b/lib/librte_mbuf/Makefile
index 367568ae3..8749a00fe 100644
--- a/lib/librte_mbuf/Makefile
+++ b/lib/librte_mbuf/Makefile
@@ -12,7 +12,7 @@ LDLIBS += -lrte_eal -lrte_mempool
 
 EXPORT_MAP := rte_mbuf_version.map
 
-LIBABIVER := 3
+LIBABIVER := 4
 
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_MBUF) := rte_mbuf.c rte_mbuf_ptype.c rte_mbuf_pool_ops.c
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index 091d388d3..3f4c83305 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -33,21 +33,6 @@
 #include <rte_memcpy.h>
 
 /*
- * ctrlmbuf constructor, given as a callback function to
- * rte_mempool_obj_iter() or rte_mempool_create()
- */
-void
-rte_ctrlmbuf_init(struct rte_mempool *mp,
-		__attribute__((unused)) void *opaque_arg,
-		void *_m,
-		__attribute__((unused)) unsigned i)
-{
-	struct rte_mbuf *m = _m;
-	rte_pktmbuf_init(mp, opaque_arg, _m, i);
-	m->ol_flags |= CTRL_MBUF_FLAG;
-}
-
-/*
  * pktmbuf pool constructor, given as a callback function to
  * rte_mempool_create(), or called directly if using
  * rte_mempool_create_empty()/rte_mempool_populate()
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 62740254d..06eceba37 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -330,9 +330,6 @@ extern "C" {
 
 #define IND_ATTACHED_MBUF    (1ULL << 62) /**< Indirect attached mbuf */
 
-/* Use final bit of flags to indicate a control mbuf */
-#define CTRL_MBUF_FLAG       (1ULL << 63) /**< Mbuf contains control data */
-
 /** Alignment constraint of mbuf private area. */
 #define RTE_MBUF_PRIV_ALIGN 8
 
@@ -915,89 +912,6 @@ __rte_mbuf_raw_free(struct rte_mbuf *m)
 	rte_mbuf_raw_free(m);
 }
 
-/* Operations on ctrl mbuf */
-
-/**
- * The control mbuf constructor.
- *
- * This function initializes some fields in an mbuf structure that are
- * not modified by the user once created (mbuf type, origin pool, buffer
- * start address, and so on). This function is given as a callback function
- * to rte_mempool_obj_iter() or rte_mempool_create() at pool creation time.
- *
- * @param mp
- *   The mempool from which the mbuf is allocated.
- * @param opaque_arg
- *   A pointer that can be used by the user to retrieve useful information
- *   for mbuf initialization. This pointer is the opaque argument passed to
- *   rte_mempool_obj_iter() or rte_mempool_create().
- * @param m
- *   The mbuf to initialize.
- * @param i
- *   The index of the mbuf in the pool table.
- */
-void rte_ctrlmbuf_init(struct rte_mempool *mp, void *opaque_arg,
-		void *m, unsigned i);
-
-/**
- * Allocate a new mbuf (type is ctrl) from mempool *mp*.
- *
- * This new mbuf is initialized with data pointing to the beginning of
- * buffer, and with a length of zero.
- *
- * @param mp
- *   The mempool from which the mbuf is allocated.
- * @return
- *   - The pointer to the new mbuf on success.
- *   - NULL if allocation failed.
- */
-#define rte_ctrlmbuf_alloc(mp) rte_pktmbuf_alloc(mp)
-
-/**
- * Free a control mbuf back into its original mempool.
- *
- * @param m
- *   The control mbuf to be freed.
- */
-#define rte_ctrlmbuf_free(m) rte_pktmbuf_free(m)
-
-/**
- * A macro that returns the pointer to the carried data.
- *
- * The value that can be read or assigned.
- *
- * @param m
- *   The control mbuf.
- */
-#define rte_ctrlmbuf_data(m) ((char *)((m)->buf_addr) + (m)->data_off)
-
-/**
- * A macro that returns the length of the carried data.
- *
- * The value that can be read or assigned.
- *
- * @param m
- *   The control mbuf.
- */
-#define rte_ctrlmbuf_len(m) rte_pktmbuf_data_len(m)
-
-/**
- * Tests if an mbuf is a control mbuf
- *
- * @param m
- *   The mbuf to be tested
- * @return
- *   - True (1) if the mbuf is a control mbuf
- *   - False(0) otherwise
- */
-static inline int
-rte_is_ctrlmbuf(struct rte_mbuf *m)
-{
-	return !!(m->ol_flags & CTRL_MBUF_FLAG);
-}
-
-/* Operations on pkt mbuf */
-
 /**
  * The packet mbuf constructor.
  *
diff --git a/lib/librte_mbuf/rte_mbuf_version.map b/lib/librte_mbuf/rte_mbuf_version.map
index d418dcb82..2e056d994 100644
--- a/lib/librte_mbuf/rte_mbuf_version.map
+++ b/lib/librte_mbuf/rte_mbuf_version.map
@@ -1,7 +1,6 @@
 DPDK_2.0 {
 	global:
 
-	rte_ctrlmbuf_init;
 	rte_get_rx_ol_flag_name;
 	rte_get_tx_ol_flag_name;
 	rte_mbuf_sanity_check;
-- 
2.11.0
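
As a rough sketch of the replacement path the commit message refers to (not
taken from the patch itself): data that used to be carried in a control
mbuf simply goes into a packet mbuf allocated from a pktmbuf pool, e.g. one
created with rte_pktmbuf_pool_create().

#include <string.h>
#include <rte_mbuf.h>

static struct rte_mbuf *
alloc_ctrl_msg(struct rte_mempool *pool, const void *data, uint16_t len)
{
	struct rte_mbuf *m = rte_pktmbuf_alloc(pool); /* was rte_ctrlmbuf_alloc() */
	char *dst;

	if (m == NULL)
		return NULL;

	dst = rte_pktmbuf_append(m, len);	/* reserve len bytes of data */
	if (dst == NULL) {
		rte_pktmbuf_free(m);		/* was rte_ctrlmbuf_free() */
		return NULL;
	}
	memcpy(dst, data, len);	/* read back later via rte_pktmbuf_mtod() */
	return m;
}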

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH] ring: relax alignment constraint on ring structure
  @ 2018-04-03 13:26  9% ` Olivier Matz
    0 siblings, 1 reply; 200+ results
From: Olivier Matz @ 2018-04-03 13:26 UTC (permalink / raw)
  To: dev

The initial objective of
commit d9f0d3a1ffd4 ("ring: remove split cacheline build setting")
was to add an empty cache line between the producer and consumer
data (on platforms with cache line size = 64B), preventing them from
sitting on adjacent cache lines.

Following discussion on the mailing list, it appears that this
also imposes an alignment constraint that is not required.

This patch removes the extra alignment constraint and adds the
empty cache lines using padding fields in the structure. The
size of rte_ring structure and the offset of the fields remain
the same on platforms with cache line size = 64B:

  rte_ring = 384
  rte_ring.name = 0
  rte_ring.flags = 32
  rte_ring.memzone = 40
  rte_ring.size = 48
  rte_ring.mask = 52
  rte_ring.prod = 128
  rte_ring.cons = 256

But it has an impact on platform where cache line size is 128B:

  rte_ring = 384        -> 768
  rte_ring.name = 0
  rte_ring.flags = 32
  rte_ring.memzone = 40
  rte_ring.size = 48
  rte_ring.mask = 52
  rte_ring.prod = 128   -> 256
  rte_ring.cons = 256   -> 512

Link: http://dpdk.org/dev/patchwork/patch/25039/
Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 doc/guides/rel_notes/deprecation.rst   |  6 ------
 doc/guides/rel_notes/release_18_05.rst |  8 +++++++-
 lib/librte_ring/Makefile               |  2 +-
 lib/librte_ring/rte_ring.h             | 16 ++++++----------
 4 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 40448961a..84e153461 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -139,9 +139,3 @@ Deprecation Notices
   required the previous behavior can be configured using existing flow
   director APIs. There is no ABI/API break. This change will just remove a
   global configuration setting and require explicit configuration.
-
-* ring: The alignment constraints on the ring structure will be relaxed
-  to one cache line instead of two, and an empty cache line padding will
-  be added between the producer and consumer structures. The size of the
-  structure and the offset of the fields will remain the same on
-  platforms with 64B cache line, but will change on other platforms.
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 9cc77f893..4d0276f1d 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -86,6 +86,12 @@ ABI Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* ring: the alignment constraint on the ring structure has been relaxed
+  to one cache line instead of two, and an empty cache line of padding is
+  added between the producer and consumer structures. The size of the
+  structure and the offsets of the fields remain the same on platforms
+  with a 64B cache line, but change on other platforms.
+
 
 Removed Items
 -------------
@@ -176,7 +182,7 @@ The libraries prepended with a plus sign were incremented in this version.
      librte_power.so.1
      librte_rawdev.so.1
      librte_reorder.so.1
-     librte_ring.so.1
+   + librte_ring.so.2
      librte_sched.so.1
      librte_security.so.1
      librte_table.so.3
diff --git a/lib/librte_ring/Makefile b/lib/librte_ring/Makefile
index bde8907d6..21a36770d 100644
--- a/lib/librte_ring/Makefile
+++ b/lib/librte_ring/Makefile
@@ -11,7 +11,7 @@ LDLIBS += -lrte_eal
 
 EXPORT_MAP := rte_ring_version.map
 
-LIBABIVER := 1
+LIBABIVER := 2
 
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_RING) := rte_ring.c
diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
index 253cdc96a..d3d3f7f97 100644
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -62,14 +62,6 @@ enum rte_ring_queue_behavior {
 
 struct rte_memzone; /* forward declaration, so as not to require memzone.h */
 
-#if RTE_CACHE_LINE_SIZE < 128
-#define PROD_ALIGN (RTE_CACHE_LINE_SIZE * 2)
-#define CONS_ALIGN (RTE_CACHE_LINE_SIZE * 2)
-#else
-#define PROD_ALIGN RTE_CACHE_LINE_SIZE
-#define CONS_ALIGN RTE_CACHE_LINE_SIZE
-#endif
-
 /* structure to hold a pair of head/tail values and other metadata */
 struct rte_ring_headtail {
 	volatile uint32_t head;  /**< Prod/consumer head. */
@@ -101,11 +93,15 @@ struct rte_ring {
 	uint32_t mask;           /**< Mask (size-1) of ring. */
 	uint32_t capacity;       /**< Usable size of ring */
 
+	char pad0 __rte_cache_aligned; /**< empty cache line */
+
 	/** Ring producer status. */
-	struct rte_ring_headtail prod __rte_aligned(PROD_ALIGN);
+	struct rte_ring_headtail prod __rte_cache_aligned;
+	char pad1 __rte_cache_aligned; /**< empty cache line */
 
 	/** Ring consumer status. */
-	struct rte_ring_headtail cons __rte_aligned(CONS_ALIGN);
+	struct rte_ring_headtail cons __rte_cache_aligned;
+	char pad2 __rte_cache_aligned; /**< empty cache line */
 };
 
 #define RING_F_SP_ENQ 0x0001 /**< The default enqueue is "single-producer". */
-- 
2.11.0

^ permalink raw reply	[relevance 9%]

* Re: [dpdk-dev] [PATCH v3 1/2] doc: add vfio api support
  2018-04-03  8:28  4% ` [dpdk-dev] [PATCH v3 1/2] doc: add vfio api support Hemant Agrawal
@ 2018-04-03 10:16  0%   ` Thomas Monjalon
  0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2018-04-03 10:16 UTC (permalink / raw)
  To: Hemant Agrawal; +Cc: dev, anatoly.burakov

03/04/2018 10:28, Hemant Agrawal:
> --- a/doc/api/doxy-api-index.md
> +++ b/doc/api/doxy-api-index.md
> @@ -179,4 +179,5 @@ The public API headers are grouped by topics:
>    [EAL config]         (@ref rte_eal.h),
>    [common]             (@ref rte_common.h),
>    [ABI compat]         (@ref rte_compat.h),
> -  [version]            (@ref rte_version.h)
> +  [version]            (@ref rte_version.h),
> +  [vfio]               (@ref rte_vfio.h)

It would be more appropriate after rte_pci.h, in the "device" section.
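
As an illustration (the label and neighbours of the PCI entry below are
assumptions, not quoted from the current file), the suggested placement
would look roughly like:

    [PCI]                (@ref rte_pci.h),
    [vfio]               (@ref rte_vfio.h),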

> diff --git a/doc/api/doxy-api.conf b/doc/api/doxy-api.conf
> index cda52fd..166612f 100644
> --- a/doc/api/doxy-api.conf
> +++ b/doc/api/doxy-api.conf
> @@ -82,6 +82,7 @@ INPUT                   = doc/api/doxy-api-index.md \
>  FILE_PATTERNS           = rte_*.h \
>                            cmdline.h
>  PREDEFINED              = __DOXYGEN__ \
> +			  VFIO_PRESENT \
>                            __attribute__(x)=

The indentation is not the same as on the other lines.
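
For illustration, with the continuation line aligned using spaces like the
existing entries, the block would look like:

    PREDEFINED              = __DOXYGEN__ \
                              VFIO_PRESENT \
                              __attribute__(x)=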

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v6 02/10] crypto/virtio: support virtio device init
  @ 2018-04-03  9:43  1% ` Jay Zhou
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-04-03  9:43 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
---
 drivers/crypto/virtio/Makefile           |   3 +
 drivers/crypto/virtio/virtio_cryptodev.c | 247 ++++++++++++++++-
 drivers/crypto/virtio/virtio_cryptodev.h |  13 +
 drivers/crypto/virtio/virtio_logs.h      |  91 ++++++
 drivers/crypto/virtio/virtio_pci.c       | 460 +++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h       | 253 +++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h      | 137 +++++++++
 drivers/crypto/virtio/virtio_rxtx.c      |  26 ++
 drivers/crypto/virtio/virtqueue.c        |  43 +++
 drivers/crypto/virtio/virtqueue.h        | 172 ++++++++++++
 10 files changed, 1442 insertions(+), 3 deletions(-)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtio_rxtx.c
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/drivers/crypto/virtio/Makefile b/drivers/crypto/virtio/Makefile
index a3b44e9..c4727ea 100644
--- a/drivers/crypto/virtio/Makefile
+++ b/drivers/crypto/virtio/Makefile
@@ -18,6 +18,9 @@ LIBABIVER := 1
 #
 # all source are stored in SRCS-y
 #
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtqueue.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_pci.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO) += virtio_cryptodev.c
 
 # this lib depends upon:
diff --git a/drivers/crypto/virtio/virtio_cryptodev.c b/drivers/crypto/virtio/virtio_cryptodev.c
index 84aff58..4550834 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.c
+++ b/drivers/crypto/virtio/virtio_cryptodev.c
@@ -3,25 +3,238 @@
  */
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
 #include <rte_cryptodev_pmd.h>
+#include <rte_eal.h>
 #include "virtio_cryptodev.h"
+#include "virtqueue.h"
+
+int virtio_crypto_logtype_init;
+int virtio_crypto_logtype_session;
+int virtio_crypto_logtype_rx;
+int virtio_crypto_logtype_tx;
+int virtio_crypto_logtype_driver;
+
+/*
+ * The set of PCI devices this driver supports
+ */
+static const struct rte_pci_id pci_id_virtio_crypto_map[] = {
+	{ RTE_PCI_DEVICE(VIRTIO_CRYPTO_PCI_VENDORID,
+				VIRTIO_CRYPTO_PCI_DEVICEID) },
+	{ .vendor_id = 0, /* sentinel */ },
+};
 
 uint8_t cryptodev_virtio_driver_id;
 
+/*
+ * dev_ops for virtio, bare necessities for basic operation
+ */
+static struct rte_cryptodev_ops virtio_crypto_dev_ops = {
+	/* Device related operations */
+	.dev_configure			 = NULL,
+	.dev_start			 = NULL,
+	.dev_stop			 = NULL,
+	.dev_close			 = NULL,
+	.dev_infos_get			 = NULL,
+
+	.stats_get			 = NULL,
+	.stats_reset			 = NULL,
+
+	.queue_pair_setup                = NULL,
+	.queue_pair_release              = NULL,
+	.queue_pair_start                = NULL,
+	.queue_pair_stop                 = NULL,
+	.queue_pair_count                = NULL,
+
+	/* Crypto related operations */
+	.session_get_size	= NULL,
+	.session_configure	= NULL,
+	.session_clear		= NULL,
+	.qp_attach_session = NULL,
+	.qp_detach_session = NULL
+};
+
+static int
+virtio_negotiate_features(struct virtio_crypto_hw *hw, uint64_t req_features)
+{
+	uint64_t host_features;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Prepare guest_features: features that the driver wants to support */
+	VIRTIO_CRYPTO_INIT_LOG_DBG("guest_features before negotiate = %" PRIx64,
+		req_features);
+
+	/* Read device(host) feature bits */
+	host_features = VTPCI_OPS(hw)->get_features(hw);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("host_features before negotiate = %" PRIx64,
+		host_features);
+
+	/*
+	 * Negotiate features: a subset of the device feature bits is written
+	 * back to the guest feature bits.
+	 */
+	hw->guest_features = req_features;
+	hw->guest_features = vtpci_cryptodev_negotiate_features(hw,
+							host_features);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("features after negotiate = %" PRIx64,
+		hw->guest_features);
+
+	if (hw->modern) {
+		if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"VIRTIO_F_VERSION_1 feature is not enabled.");
+			return -1;
+		}
+		vtpci_cryptodev_set_status(hw,
+			VIRTIO_CONFIG_STATUS_FEATURES_OK);
+		if (!(vtpci_cryptodev_get_status(hw) &
+			VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR("failed to set FEATURES_OK "
+						"status!");
+			return -1;
+		}
+	}
+
+	hw->req_guest_features = req_features;
+
+	return 0;
+}
+
+/* reset device and renegotiate features if needed */
+static int
+virtio_crypto_init_device(struct rte_cryptodev *cryptodev,
+	uint64_t req_features)
+{
+	struct virtio_crypto_hw *hw = cryptodev->data->dev_private;
+	struct virtio_crypto_config local_config;
+	struct virtio_crypto_config *config = &local_config;
+
+	PMD_INIT_FUNC_TRACE();
+
+	/* Reset the device although not necessary at startup */
+	vtpci_cryptodev_reset(hw);
+
+	/* Tell the host we've noticed this device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
+
+	/* Tell the host we know how to drive the device. */
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
+	if (virtio_negotiate_features(hw, req_features) < 0)
+		return -1;
+
+	/* Get status of the device */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, status),
+		&config->status, sizeof(config->status));
+	if (config->status != VIRTIO_CRYPTO_S_HW_READY) {
+		VIRTIO_CRYPTO_DRV_LOG_ERR("accelerator hardware is "
+				"not ready");
+		return -1;
+	}
+
+	/* Get number of data queues */
+	vtpci_read_cryptodev_config(hw,
+		offsetof(struct virtio_crypto_config, max_dataqueues),
+		&config->max_dataqueues,
+		sizeof(config->max_dataqueues));
+	hw->max_dataqueues = config->max_dataqueues;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("hw->max_dataqueues=%d",
+		hw->max_dataqueues);
+
+	return 0;
+}
+
+/*
+ * This function is based on the probe() function.
+ * It returns 0 on success.
+ */
+static int
+crypto_virtio_create(const char *name, struct rte_pci_device *pci_dev,
+		struct rte_cryptodev_pmd_init_params *init_params)
+{
+	struct rte_cryptodev *cryptodev;
+	struct virtio_crypto_hw *hw;
+
+	PMD_INIT_FUNC_TRACE();
+
+	cryptodev = rte_cryptodev_pmd_create(name, &pci_dev->device,
+					init_params);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
+	cryptodev->driver_id = cryptodev_virtio_driver_id;
+	cryptodev->dev_ops = &virtio_crypto_dev_ops;
+
+	cryptodev->enqueue_burst = virtio_crypto_pkt_tx_burst;
+	cryptodev->dequeue_burst = virtio_crypto_pkt_rx_burst;
+
+	cryptodev->feature_flags = RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO |
+		RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING;
+
+	hw = cryptodev->data->dev_private;
+	hw->dev_id = cryptodev->data->dev_id;
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("dev %d vendorID=0x%x deviceID=0x%x",
+		cryptodev->data->dev_id, pci_dev->id.vendor_id,
+		pci_dev->id.device_id);
+
+	/* pci device init */
+	if (vtpci_cryptodev_init(pci_dev, hw))
+		return -1;
+
+	if (virtio_crypto_init_device(cryptodev,
+			VIRTIO_CRYPTO_PMD_GUEST_FEATURES) < 0)
+		return -1;
+
+	return 0;
+}
+
 static int crypto_virtio_pci_probe(
 	struct rte_pci_driver *pci_drv __rte_unused,
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
-	return 0;
+	struct rte_cryptodev_pmd_init_params init_params = {
+		.name = "",
+		.socket_id = rte_socket_id(),
+		.private_data_size = sizeof(struct virtio_crypto_hw),
+		.max_nb_sessions = RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS
+	};
+	char name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	VIRTIO_CRYPTO_DRV_LOG_DBG("Found Crypto device at %02x:%02x.%x",
+			pci_dev->addr.bus,
+			pci_dev->addr.devid,
+			pci_dev->addr.function);
+
+	rte_pci_device_name(&pci_dev->addr, name, sizeof(name));
+
+	return crypto_virtio_create(name, pci_dev, &init_params);
 }
 
 static int crypto_virtio_pci_remove(
-	struct rte_pci_device *pci_dev __rte_unused)
+	struct rte_pci_device *pci_dev)
 {
+	struct rte_cryptodev *cryptodev;
+	char cryptodev_name[RTE_CRYPTODEV_NAME_MAX_LEN];
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	rte_pci_device_name(&pci_dev->addr, cryptodev_name,
+			sizeof(cryptodev_name));
+
+	cryptodev = rte_cryptodev_pmd_get_named_dev(cryptodev_name);
+	if (cryptodev == NULL)
+		return -ENODEV;
+
 	return 0;
 }
 
 static struct rte_pci_driver rte_virtio_crypto_driver = {
+	.id_table = pci_id_virtio_crypto_map,
+	.drv_flags = 0,
 	.probe = crypto_virtio_pci_probe,
 	.remove = crypto_virtio_pci_remove
 };
@@ -32,3 +245,31 @@ static int crypto_virtio_pci_remove(
 RTE_PMD_REGISTER_CRYPTO_DRIVER(virtio_crypto_drv,
 	rte_virtio_crypto_driver.driver,
 	cryptodev_virtio_driver_id);
+
+RTE_INIT(virtio_crypto_init_log);
+static void
+virtio_crypto_init_log(void)
+{
+	virtio_crypto_logtype_init = rte_log_register("pmd.crypto.virtio.init");
+	if (virtio_crypto_logtype_init >= 0)
+		rte_log_set_level(virtio_crypto_logtype_init, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_session =
+		rte_log_register("pmd.crypto.virtio.session");
+	if (virtio_crypto_logtype_session >= 0)
+		rte_log_set_level(virtio_crypto_logtype_session,
+				RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_rx = rte_log_register("pmd.crypto.virtio.rx");
+	if (virtio_crypto_logtype_rx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_rx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_tx = rte_log_register("pmd.crypto.virtio.tx");
+	if (virtio_crypto_logtype_tx >= 0)
+		rte_log_set_level(virtio_crypto_logtype_tx, RTE_LOG_NOTICE);
+
+	virtio_crypto_logtype_driver =
+		rte_log_register("pmd.crypto.virtio.driver");
+	if (virtio_crypto_logtype_driver >= 0)
+		rte_log_set_level(virtio_crypto_logtype_driver, RTE_LOG_NOTICE);
+}
diff --git a/drivers/crypto/virtio/virtio_cryptodev.h b/drivers/crypto/virtio/virtio_cryptodev.h
index 44517b8..392db4a 100644
--- a/drivers/crypto/virtio/virtio_cryptodev.h
+++ b/drivers/crypto/virtio/virtio_cryptodev.h
@@ -5,6 +5,19 @@
 #ifndef _VIRTIO_CRYPTODEV_H_
 #define _VIRTIO_CRYPTODEV_H_
 
+#include <rte_cryptodev.h>
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_CRYPTO_PMD_GUEST_FEATURES (1ULL << VIRTIO_F_VERSION_1)
+
 #define CRYPTODEV_NAME_VIRTIO_PMD crypto_virtio
 
+uint16_t virtio_crypto_pkt_tx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
+uint16_t virtio_crypto_pkt_rx_burst(void *tx_queue,
+		struct rte_crypto_op **tx_pkts,
+		uint16_t nb_pkts);
+
 #endif /* _VIRTIO_CRYPTODEV_H_ */
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..43ec1a4
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,460 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * Following macros are derived from linux/pci_regs.h, however,
+ * we can't simply include that header here, as there is no such
+ * file for non-Linux platform.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PFN register is 32 bit,
+	 * and only accepts a 32-bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t avail_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability,
+			 * that's why we also check if msix is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is an error mapping with VFIO/UIO.
+ *   if there is a port map error when the driver type is KDRV_NONE.
+ *   if the device is whitelisted but the driver type is KDRV_UNKNOWN.
+ * Return 1 if kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Check whether we can read the virtio pci caps, which exist
+	 * only on modern pci devices. If that fails, we fall back to
+	 * legacy virtio handling.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..cd316a6
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <linux/virtio_crypto.h>
+
+#include <stdint.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure stores
+ * some information that may vary locally in the multi-process model.
+ * For example, the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a contiguous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other side, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
diff --git a/drivers/crypto/virtio/virtio_rxtx.c b/drivers/crypto/virtio/virtio_rxtx.c
new file mode 100644
index 0000000..51f6e09
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_rxtx.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+#include "virtio_cryptodev.h"
+
+uint16_t
+virtio_crypto_pkt_rx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **rx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_rx = 0;
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_crypto_pkt_tx_burst(
+	void *tx_queue __rte_unused,
+	struct rte_crypto_op **tx_pkts __rte_unused,
+	uint16_t nb_pkts __rte_unused)
+{
+	uint16_t nb_tx = 0;
+
+	return nb_tx;
+}
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint host
+	 * not to interrupt when it consumes packets
+	 * Note: this is only considered a hint to the host
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..0a9bddb
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <linux/virtio_crypto.h>
+
+#include <stdint.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring*/
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notification is through an io port operation,
+	 * which is a serializing instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
-- 
1.8.3.1

^ permalink raw reply	[relevance 1%]

* [dpdk-dev] [PATCH v3 1/2] doc: add vfio api support
  @ 2018-04-03  8:28  4% ` Hemant Agrawal
  2018-04-03 10:16  0%   ` Thomas Monjalon
  0 siblings, 1 reply; 200+ results
From: Hemant Agrawal @ 2018-04-03  8:28 UTC (permalink / raw)
  To: dev; +Cc: anatoly.burakov, thomas

Signed-off-by: Hemant Agrawal <hemant.agrawal@nxp.com>
---
 doc/api/doxy-api-index.md                | 3 ++-
 doc/api/doxy-api.conf                    | 1 +
 lib/librte_eal/common/include/rte_vfio.h | 5 +++++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index d77f205..12c1ebe 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -179,4 +179,5 @@ The public API headers are grouped by topics:
   [EAL config]         (@ref rte_eal.h),
   [common]             (@ref rte_common.h),
   [ABI compat]         (@ref rte_compat.h),
-  [version]            (@ref rte_version.h)
+  [version]            (@ref rte_version.h),
+  [vfio]               (@ref rte_vfio.h)
diff --git a/doc/api/doxy-api.conf b/doc/api/doxy-api.conf
index cda52fd..166612f 100644
--- a/doc/api/doxy-api.conf
+++ b/doc/api/doxy-api.conf
@@ -82,6 +82,7 @@ INPUT                   = doc/api/doxy-api-index.md \
 FILE_PATTERNS           = rte_*.h \
                           cmdline.h
 PREDEFINED              = __DOXYGEN__ \
+			  VFIO_PRESENT \
                           __attribute__(x)=
 
 OPTIMIZE_OUTPUT_FOR_C   = YES
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 249095e..9b7b983 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -5,6 +5,11 @@
 #ifndef _RTE_VFIO_H_
 #define _RTE_VFIO_H_
 
+/**
+ * @file
+ * RTE VFIO. This library provides various VFIO related utility functions.
+ */
+
 /*
  * determine if VFIO is present on the system
  */
-- 
2.7.4

^ permalink raw reply	[relevance 4%]

* Re: [dpdk-dev] [RFC PATCH 5/5] test: add few eBPF samples
  @ 2018-04-02 22:26  3%         ` Jerin Jacob
  0 siblings, 0 replies; 200+ results
From: Jerin Jacob @ 2018-04-02 22:26 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: 'dev@dpdk.org'

-----Original Message-----
> Date: Fri, 30 Mar 2018 17:42:22 +0000
> From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> To: 'Jerin Jacob' <jerin.jacob@caviumnetworks.com>
> CC: "'dev@dpdk.org'" <dev@dpdk.org>
> Subject: RE: [dpdk-dev] [RFC PATCH 5/5] test: add few eBPF samples
> 
> Hi Jerin,
> > > > Add few simple eBPF programs as an example.
> > > >
> > > > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > > > diff --git a/test/bpf/mbuf.h b/test/bpf/mbuf.h
> > > > new file mode 100644
> > > > index 000000000..aeef6339d
> > > > --- /dev/null
> > > > +++ b/test/bpf/mbuf.h
> > > > @@ -0,0 +1,556 @@
> > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > + * Copyright(c) 2010-2014 Intel Corporation.
> > > > + * Copyright 2014 6WIND S.A.
> > > > + */
> > > > +
> > > > +/*
> > > > + * Snippet from dpdk.org rte_mbuf.h.
> > > > + * used to provide BPF programs information about rte_mbuf layout.
> > > > + */
> > > > +
> > > > +#ifndef _MBUF_H_
> > > > +#define _MBUF_H_
> > > > +
> > > > +#include <stdint.h>
> > > > +#include <rte_common.h>
> > > > +#include <rte_memory.h>
> > >
> > > Is it worth keeping a copy of mbuf for standalone purposes?
> > > Since clang is already supported, I think if someone needs mbuf then
> > > they can include the DPDK headers. Just thinking from a maintainability
> > > perspective.
> > 
> > That would be ideal.
> > I made a snippet just to avoid compiler errors for bpf target.
> > Will try to address it in next version.
> > 
> 
> I looked at it a bit more and it seems that it wouldn't be as straightforward as I thought.
> There are things not supported by the bpf target (thread-local storage and SIMD-related definitions)
> inside the include chain.
> So to fix it, some changes in our core include files might be needed.
> The simplest way would probably be to move struct rte_mbuf and the related macro definitions into a separate
> file (rte_mbuf_common.h or so).

I think rte_mbuf_common.h should be the way to go. IMO, KNI would also benefit from that.

I guess there is NO ABI change if we move the generic stuff to rte_mbuf_common.h.
But if you think it is quite a controversial change then we could
postpone it to the next release. (My only worry is that, once it is
postponed, it may not happen.) I am fine with either way.
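
Purely as an illustration of the idea (hypothetical file and content, not an
agreed design), the split being discussed could look roughly like:

    /* rte_mbuf_common.h (hypothetical): layout-only definitions with no
     * TLS/SIMD dependencies, so BPF and KNI snippets can include it */
    #ifndef _RTE_MBUF_COMMON_H_
    #define _RTE_MBUF_COMMON_H_

    #include <stdint.h>

    /* struct rte_mbuf and the ol_flags macros would move here */

    #endif /* _RTE_MBUF_COMMON_H_ */

    /* rte_mbuf.h would then start with: */
    #include <rte_mbuf_common.h>
    /* ...plus the existing inline helpers that need the EAL headers */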

> Though it is quite a controversial change and I think it is better to postpone it to a separate patch and
> probably the next release.
> So for now I left the snippet test/bpf/mbuf.h in place.
> Konstantin

^ permalink raw reply	[relevance 3%]

* [dpdk-dev] [PATCH v9 3/9] eventtimer: add common code
  @ 2018-04-02 19:39  3%   ` Erik Gabriel Carrillo
    1 sibling, 0 replies; 200+ results
From: Erik Gabriel Carrillo @ 2018-04-02 19:39 UTC (permalink / raw)
  To: pbhagavatula; +Cc: dev, jerin.jacob, hemant.agrawal

This commit adds the logic that is shared by all event timer adapter
drivers; the common code handles instance allocation and some
initialization.
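
As a usage sketch only (error handling trimmed; dev_id is a placeholder for
an already-configured eventdev, and the conf fields beyond the two shown
here are defined elsewhere in this series):

    struct rte_event_timer_adapter_conf conf = {
            .event_dev_id = dev_id,
            .timer_adapter_id = 0,
            /* clock source, resolution, nb timers, flags: see later patches */
    };
    struct rte_event_timer_adapter *adapter;

    /* uses the default port configuration callback added by this patch */
    adapter = rte_event_timer_adapter_create(&conf);
    if (adapter == NULL)
            printf("adapter create failed, rte_errno %d\n", rte_errno);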

Signed-off-by: Erik Gabriel Carrillo <erik.g.carrillo@intel.com>
---
 config/common_base                                |   1 +
 drivers/event/sw/sw_evdev.c                       |  18 +
 lib/librte_eventdev/Makefile                      |   2 +
 lib/librte_eventdev/rte_event_timer_adapter.c     | 387 ++++++++++++++++++++++
 lib/librte_eventdev/rte_event_timer_adapter_pmd.h | 114 +++++++
 lib/librte_eventdev/rte_eventdev.c                |  22 ++
 lib/librte_eventdev/rte_eventdev.h                |  20 ++
 lib/librte_eventdev/rte_eventdev_pmd.h            |  35 ++
 lib/librte_eventdev/rte_eventdev_version.map      |  21 +-
 9 files changed, 619 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_eventdev/rte_event_timer_adapter.c
 create mode 100644 lib/librte_eventdev/rte_event_timer_adapter_pmd.h

diff --git a/config/common_base b/config/common_base
index ee10b44..accc6f5 100644
--- a/config/common_base
+++ b/config/common_base
@@ -550,6 +550,7 @@ CONFIG_RTE_LIBRTE_EVENTDEV=y
 CONFIG_RTE_LIBRTE_EVENTDEV_DEBUG=n
 CONFIG_RTE_EVENT_MAX_DEVS=16
 CONFIG_RTE_EVENT_MAX_QUEUES_PER_DEV=64
+CONFIG_RTE_EVENT_TIMER_ADAPTER_NUM_MAX=32
 
 #
 # Compile PMD for skeleton event device
diff --git a/drivers/event/sw/sw_evdev.c b/drivers/event/sw/sw_evdev.c
index 6672fd8..0847547 100644
--- a/drivers/event/sw/sw_evdev.c
+++ b/drivers/event/sw/sw_evdev.c
@@ -464,6 +464,22 @@ sw_eth_rx_adapter_caps_get(const struct rte_eventdev *dev,
 	return 0;
 }
 
+static int
+sw_timer_adapter_caps_get(const struct rte_eventdev *dev,
+			  uint64_t flags,
+			  uint32_t *caps,
+			  const struct rte_event_timer_adapter_ops **ops)
+{
+	RTE_SET_USED(dev);
+	RTE_SET_USED(flags);
+	*caps = 0;
+
+	/* Use default SW ops */
+	*ops = NULL;
+
+	return 0;
+}
+
 static void
 sw_info_get(struct rte_eventdev *dev, struct rte_event_dev_info *info)
 {
@@ -791,6 +807,8 @@ sw_probe(struct rte_vdev_device *vdev)
 
 			.eth_rx_adapter_caps_get = sw_eth_rx_adapter_caps_get,
 
+			.timer_adapter_caps_get = sw_timer_adapter_caps_get,
+
 			.xstats_get = sw_xstats_get,
 			.xstats_get_names = sw_xstats_get_names,
 			.xstats_get_by_name = sw_xstats_get_by_name,
diff --git a/lib/librte_eventdev/Makefile b/lib/librte_eventdev/Makefile
index 549b182..8b16e3f 100644
--- a/lib/librte_eventdev/Makefile
+++ b/lib/librte_eventdev/Makefile
@@ -20,6 +20,7 @@ LDLIBS += -lrte_eal -lrte_ring -lrte_ethdev -lrte_hash
 SRCS-y += rte_eventdev.c
 SRCS-y += rte_event_ring.c
 SRCS-y += rte_event_eth_rx_adapter.c
+SRCS-y += rte_event_timer_adapter.c
 
 # export include files
 SYMLINK-y-include += rte_eventdev.h
@@ -29,6 +30,7 @@ SYMLINK-y-include += rte_eventdev_pmd_vdev.h
 SYMLINK-y-include += rte_event_ring.h
 SYMLINK-y-include += rte_event_eth_rx_adapter.h
 SYMLINK-y-include += rte_event_timer_adapter.h
+SYMLINK-y-include += rte_event_timer_adapter_pmd.h
 
 # versioning export map
 EXPORT_MAP := rte_eventdev_version.map
diff --git a/lib/librte_eventdev/rte_event_timer_adapter.c b/lib/librte_eventdev/rte_event_timer_adapter.c
new file mode 100644
index 0000000..75a14ac
--- /dev/null
+++ b/lib/librte_eventdev/rte_event_timer_adapter.c
@@ -0,0 +1,387 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation.
+ * All rights reserved.
+ */
+
+#include <string.h>
+#include <inttypes.h>
+
+#include <rte_memzone.h>
+#include <rte_memory.h>
+#include <rte_dev.h>
+#include <rte_errno.h>
+
+#include "rte_eventdev.h"
+#include "rte_eventdev_pmd.h"
+#include "rte_event_timer_adapter.h"
+#include "rte_event_timer_adapter_pmd.h"
+
+#define DATA_MZ_NAME_MAX_LEN 64
+#define DATA_MZ_NAME_FORMAT "rte_event_timer_adapter_data_%d"
+
+static int evtim_logtype;
+
+static struct rte_event_timer_adapter adapters[RTE_EVENT_TIMER_ADAPTER_NUM_MAX];
+
+#define EVTIM_LOG(level, logtype, ...) \
+	rte_log(RTE_LOG_ ## level, logtype, \
+		RTE_FMT("EVTIMER: %s() line %u: " RTE_FMT_HEAD(__VA_ARGS__,) \
+			"\n", __func__, __LINE__, RTE_FMT_TAIL(__VA_ARGS__,)))
+
+#define EVTIM_LOG_ERR(...) EVTIM_LOG(ERR, evtim_logtype, __VA_ARGS__)
+
+#ifdef RTE_LIBRTE_EVENTDEV_DEBUG
+#define EVTIM_LOG_DBG(...) \
+	EVTIM_LOG(DEBUG, evtim_logtype, __VA_ARGS__)
+#else
+#define EVTIM_LOG_DBG(...) (void)0
+#endif
+
+static int
+default_port_conf_cb(uint16_t id, uint8_t event_dev_id, uint8_t *event_port_id,
+		     void *conf_arg)
+{
+	struct rte_event_timer_adapter *adapter;
+	struct rte_eventdev *dev;
+	struct rte_event_dev_config dev_conf;
+	struct rte_event_port_conf *port_conf, def_port_conf = {0};
+	int started;
+	uint8_t port_id;
+	uint8_t dev_id;
+	int ret;
+
+	RTE_SET_USED(event_dev_id);
+
+	adapter = &adapters[id];
+	dev = &rte_eventdevs[adapter->data->event_dev_id];
+	dev_id = dev->data->dev_id;
+	dev_conf = dev->data->dev_conf;
+
+	started = dev->data->dev_started;
+	if (started)
+		rte_event_dev_stop(dev_id);
+
+	port_id = dev_conf.nb_event_ports;
+	dev_conf.nb_event_ports += 1;
+	ret = rte_event_dev_configure(dev_id, &dev_conf);
+	if (ret < 0) {
+		EVTIM_LOG_ERR("failed to configure event dev %u\n", dev_id);
+		if (started)
+			if (rte_event_dev_start(dev_id))
+				return -EIO;
+
+		return ret;
+	}
+
+	if (conf_arg != NULL)
+		port_conf = conf_arg;
+	else {
+		port_conf = &def_port_conf;
+		ret = rte_event_port_default_conf_get(dev_id, port_id,
+						      port_conf);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = rte_event_port_setup(dev_id, port_id, port_conf);
+	if (ret < 0) {
+		EVTIM_LOG_ERR("failed to setup event port %u on event dev %u\n",
+			      port_id, dev_id);
+		return ret;
+	}
+
+	*event_port_id = port_id;
+
+	if (started)
+		ret = rte_event_dev_start(dev_id);
+
+	return ret;
+}
+
+struct rte_event_timer_adapter * __rte_experimental
+rte_event_timer_adapter_create(const struct rte_event_timer_adapter_conf *conf)
+{
+	return rte_event_timer_adapter_create_ext(conf, default_port_conf_cb,
+						  NULL);
+}
+
+struct rte_event_timer_adapter * __rte_experimental
+rte_event_timer_adapter_create_ext(
+		const struct rte_event_timer_adapter_conf *conf,
+		rte_event_timer_adapter_port_conf_cb_t conf_cb,
+		void *conf_arg)
+{
+	uint16_t adapter_id;
+	struct rte_event_timer_adapter *adapter;
+	const struct rte_memzone *mz;
+	char mz_name[DATA_MZ_NAME_MAX_LEN];
+	int n, ret;
+	struct rte_eventdev *dev;
+
+	if (conf == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	/* Check eventdev ID */
+	if (!rte_event_pmd_is_valid_dev(conf->event_dev_id)) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	dev = &rte_eventdevs[conf->event_dev_id];
+
+	adapter_id = conf->timer_adapter_id;
+
+	/* Check that adapter_id is in range */
+	if (adapter_id >= RTE_EVENT_TIMER_ADAPTER_NUM_MAX) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	/* Check adapter ID not already allocated */
+	adapter = &adapters[adapter_id];
+	if (adapter->allocated) {
+		rte_errno = EEXIST;
+		return NULL;
+	}
+
+	/* Create shared data area. */
+	n = snprintf(mz_name, sizeof(mz_name), DATA_MZ_NAME_FORMAT, adapter_id);
+	if (n >= (int)sizeof(mz_name)) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	mz = rte_memzone_reserve(mz_name,
+				 sizeof(struct rte_event_timer_adapter_data),
+				 conf->socket_id, 0);
+	if (mz == NULL)
+		/* rte_errno set by rte_memzone_reserve */
+		return NULL;
+
+	adapter->data = mz->addr;
+	memset(adapter->data, 0, sizeof(struct rte_event_timer_adapter_data));
+
+	adapter->data->mz = mz;
+	adapter->data->event_dev_id = conf->event_dev_id;
+	adapter->data->id = adapter_id;
+	adapter->data->socket_id = conf->socket_id;
+	adapter->data->conf = *conf;  /* copy conf structure */
+
+	/* Query eventdev PMD for timer adapter capabilities and ops */
+	ret = dev->dev_ops->timer_adapter_caps_get(dev,
+						   adapter->data->conf.flags,
+						   &adapter->data->caps,
+						   &adapter->ops);
+	if (ret < 0) {
+		rte_errno = ret;
+		goto free_memzone;
+	}
+
+	if (!(adapter->data->caps &
+	      RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT)) {
+		FUNC_PTR_OR_NULL_RET_WITH_ERRNO(conf_cb, -EINVAL);
+		ret = conf_cb(adapter->data->id, adapter->data->event_dev_id,
+			      &adapter->data->event_port_id, conf_arg);
+		if (ret < 0) {
+			rte_errno = ret;
+			goto free_memzone;
+		}
+	}
+
+	/* Allow driver to do some setup */
+	FUNC_PTR_OR_NULL_RET_WITH_ERRNO(adapter->ops->init, -ENOTSUP);
+	ret = adapter->ops->init(adapter);
+	if (ret < 0) {
+		rte_errno = ret;
+		goto free_memzone;
+	}
+
+	/* Set fast-path function pointers */
+	adapter->arm_burst = adapter->ops->arm_burst;
+	adapter->arm_tmo_tick_burst = adapter->ops->arm_tmo_tick_burst;
+	adapter->cancel_burst = adapter->ops->cancel_burst;
+
+	adapter->allocated = 1;
+
+	return adapter;
+
+free_memzone:
+	rte_memzone_free(adapter->data->mz);
+	return NULL;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_get_info(const struct rte_event_timer_adapter *adapter,
+		struct rte_event_timer_adapter_info *adapter_info)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+
+	if (adapter->ops->get_info)
+		/* let driver set values it knows */
+		adapter->ops->get_info(adapter, adapter_info);
+
+	/* Set common values */
+	adapter_info->conf = adapter->data->conf;
+	adapter_info->event_dev_port_id = adapter->data->event_port_id;
+	adapter_info->caps = adapter->data->caps;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_start(const struct rte_event_timer_adapter *adapter)
+{
+	int ret;
+
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->start, -EINVAL);
+
+	ret = adapter->ops->start(adapter);
+	if (ret < 0)
+		return ret;
+
+	adapter->data->started = 1;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_stop(const struct rte_event_timer_adapter *adapter)
+{
+	int ret;
+
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->stop, -EINVAL);
+
+	if (adapter->data->started == 0) {
+		EVTIM_LOG_ERR("event timer adapter %"PRIu8" already stopped",
+			      adapter->data->id);
+		return 0;
+	}
+
+	ret = adapter->ops->stop(adapter);
+	if (ret < 0)
+		return ret;
+
+	adapter->data->started = 0;
+
+	return 0;
+}
+
+struct rte_event_timer_adapter * __rte_experimental
+rte_event_timer_adapter_lookup(uint16_t adapter_id)
+{
+	char name[DATA_MZ_NAME_MAX_LEN];
+	const struct rte_memzone *mz;
+	struct rte_event_timer_adapter_data *data;
+	struct rte_event_timer_adapter *adapter;
+	int ret;
+	struct rte_eventdev *dev;
+
+	if (adapters[adapter_id].allocated)
+		return &adapters[adapter_id]; /* Adapter is already loaded */
+
+	snprintf(name, DATA_MZ_NAME_MAX_LEN, DATA_MZ_NAME_FORMAT, adapter_id);
+	mz = rte_memzone_lookup(name);
+	if (mz == NULL) {
+		rte_errno = ENOENT;
+		return NULL;
+	}
+
+	data = mz->addr;
+
+	adapter = &adapters[data->id];
+	adapter->data = data;
+
+	dev = &rte_eventdevs[adapter->data->event_dev_id];
+
+	/* Query eventdev PMD for timer adapter capabilities and ops */
+	ret = dev->dev_ops->timer_adapter_caps_get(dev,
+						   adapter->data->conf.flags,
+						   &adapter->data->caps,
+						   &adapter->ops);
+	if (ret < 0) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	/* Set fast-path function pointers */
+	adapter->arm_burst = adapter->ops->arm_burst;
+	adapter->arm_tmo_tick_burst = adapter->ops->arm_tmo_tick_burst;
+	adapter->cancel_burst = adapter->ops->cancel_burst;
+
+	adapter->allocated = 1;
+
+	return adapter;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_free(struct rte_event_timer_adapter *adapter)
+{
+	int ret;
+
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->uninit, -EINVAL);
+
+	if (adapter->data->started == 1) {
+		EVTIM_LOG_ERR("event timer adapter %"PRIu8" must be stopped "
+			      "before freeing", adapter->data->id);
+		return -EBUSY;
+	}
+
+	/* free impl priv data */
+	ret = adapter->ops->uninit(adapter);
+	if (ret < 0)
+		return ret;
+
+	/* free shared data area */
+	ret = rte_memzone_free(adapter->data->mz);
+	if (ret < 0)
+		return ret;
+
+	adapter->data = NULL;
+	adapter->allocated = 0;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_service_id_get(struct rte_event_timer_adapter *adapter,
+				       uint32_t *service_id)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+
+	if (adapter->data->service_inited && service_id != NULL)
+		*service_id = adapter->data->service_id;
+
+	return adapter->data->service_inited ? 0 : -ESRCH;
+}
+
+int __rte_experimental
+rte_event_timer_adapter_stats_get(struct rte_event_timer_adapter *adapter,
+				  struct rte_event_timer_adapter_stats *stats)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->stats_get, -EINVAL);
+	if (stats == NULL)
+		return -EINVAL;
+
+	return adapter->ops->stats_get(adapter, stats);
+}
+
+int __rte_experimental
+rte_event_timer_adapter_stats_reset(struct rte_event_timer_adapter *adapter)
+{
+	ADAPTER_VALID_OR_ERR_RET(adapter, -EINVAL);
+	FUNC_PTR_OR_ERR_RET(adapter->ops->stats_reset, -EINVAL);
+	return adapter->ops->stats_reset(adapter);
+}
+
+RTE_INIT(event_timer_adapter_init_log);
+static void
+event_timer_adapter_init_log(void)
+{
+	evtim_logtype = rte_log_register("lib.eventdev.adapter.timer");
+	if (evtim_logtype >= 0)
+		rte_log_set_level(evtim_logtype, RTE_LOG_NOTICE);
+}
diff --git a/lib/librte_eventdev/rte_event_timer_adapter_pmd.h b/lib/librte_eventdev/rte_event_timer_adapter_pmd.h
new file mode 100644
index 0000000..cf3509d
--- /dev/null
+++ b/lib/librte_eventdev/rte_event_timer_adapter_pmd.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation.
+ * All rights reserved.
+ */
+
+#ifndef __RTE_EVENT_TIMER_ADAPTER_PMD_H__
+#define __RTE_EVENT_TIMER_ADAPTER_PMD_H__
+
+/**
+ * @file
+ * RTE Event Timer Adapter API (PMD Side)
+ *
+ * @note
+ * This file provides implementation helpers for internal use by PMDs.  They
+ * are not intended to be exposed to applications and are not subject to ABI
+ * versioning.
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "rte_event_timer_adapter.h"
+
+/*
+ * Definitions of functions exported by an event timer adapter implementation
+ * through *rte_event_timer_adapter_ops* structure supplied in the
+ * *rte_event_timer_adapter* structure associated with an event timer adapter.
+ */
+
+typedef int (*rte_event_timer_adapter_init_t)(
+		struct rte_event_timer_adapter *adapter);
+/**< @internal Event timer adapter implementation setup */
+typedef int (*rte_event_timer_adapter_uninit_t)(
+		struct rte_event_timer_adapter *adapter);
+/**< @internal Event timer adapter implementation teardown */
+typedef int (*rte_event_timer_adapter_start_t)(
+		const struct rte_event_timer_adapter *adapter);
+/**< @internal Start running event timer adapter */
+typedef int (*rte_event_timer_adapter_stop_t)(
+		const struct rte_event_timer_adapter *adapter);
+/**< @internal Stop running event timer adapter */
+typedef void (*rte_event_timer_adapter_get_info_t)(
+		const struct rte_event_timer_adapter *adapter,
+		struct rte_event_timer_adapter_info *adapter_info);
+/**< @internal Get contextual information for event timer adapter */
+typedef int (*rte_event_timer_adapter_stats_get_t)(
+		const struct rte_event_timer_adapter *adapter,
+		struct rte_event_timer_adapter_stats *stats);
+/**< @internal Get statistics for event timer adapter */
+typedef int (*rte_event_timer_adapter_stats_reset_t)(
+		const struct rte_event_timer_adapter *adapter);
+/**< @internal Reset statistics for event timer adapter */
+
+/**
+ * @internal Structure containing the functions exported by an event timer
+ * adapter implementation.
+ */
+struct rte_event_timer_adapter_ops {
+	rte_event_timer_adapter_init_t		init;  /**< Set up adapter */
+	rte_event_timer_adapter_uninit_t	uninit;/**< Tear down adapter */
+	rte_event_timer_adapter_start_t		start; /**< Start adapter */
+	rte_event_timer_adapter_stop_t		stop;  /**< Stop adapter */
+	rte_event_timer_adapter_get_info_t	get_info;
+	/**< Get info from driver */
+	rte_event_timer_adapter_stats_get_t	stats_get;
+	/**< Get adapter statistics */
+	rte_event_timer_adapter_stats_reset_t	stats_reset;
+	/**< Reset adapter statistics */
+	rte_event_timer_arm_burst_t		arm_burst;
+	/**< Arm one or more event timers */
+	rte_event_timer_arm_tmo_tick_burst_t	arm_tmo_tick_burst;
+	/**< Arm event timers with same expiration time */
+	rte_event_timer_cancel_burst_t		cancel_burst;
+	/**< Cancel one or more event timers */
+};
+
+/**
+ * @internal Adapter data; structure to be placed in shared memory to be
+ * accessible by various processes in a multi-process configuration.
+ */
+struct rte_event_timer_adapter_data {
+	uint8_t id;
+	/**< Event timer adapter ID */
+	uint8_t event_dev_id;
+	/**< Event device ID */
+	uint32_t socket_id;
+	/**< Socket ID where memory is allocated */
+	uint8_t event_port_id;
+	/**< Optional: event port ID used when the inbuilt port is absent */
+	const struct rte_memzone *mz;
+	/**< Event timer adapter memzone pointer */
+	struct rte_event_timer_adapter_conf conf;
+	/**< Configuration used to configure the adapter. */
+	uint32_t caps;
+	/**< Adapter capabilities */
+	void *adapter_priv;
+	/**< Timer adapter private data*/
+	uint8_t service_inited;
+	/**< Service initialization state */
+	uint32_t service_id;
+	/**< Service ID*/
+
+	RTE_STD_C11
+	uint8_t started : 1;
+	/**< Flag to indicate adapter started. */
+} __rte_cache_aligned;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __RTE_EVENT_TIMER_ADAPTER_PMD_H__ */
diff --git a/lib/librte_eventdev/rte_eventdev.c b/lib/librte_eventdev/rte_eventdev.c
index 851a119..eb3c601 100644
--- a/lib/librte_eventdev/rte_eventdev.c
+++ b/lib/librte_eventdev/rte_eventdev.c
@@ -123,6 +123,28 @@ rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id,
 				: 0;
 }
 
+int __rte_experimental
+rte_event_timer_adapter_caps_get(uint8_t dev_id, uint32_t *caps)
+{
+	struct rte_eventdev *dev;
+	const struct rte_event_timer_adapter_ops *ops;
+
+	RTE_EVENTDEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_eventdevs[dev_id];
+
+	if (caps == NULL)
+		return -EINVAL;
+	*caps = 0;
+
+	return dev->dev_ops->timer_adapter_caps_get ?
+				(*dev->dev_ops->timer_adapter_caps_get)(dev,
+									0,
+									caps,
+									&ops)
+				: 0;
+}
+
 static inline int
 rte_event_dev_queue_config(struct rte_eventdev *dev, uint8_t nb_queues)
 {
diff --git a/lib/librte_eventdev/rte_eventdev.h b/lib/librte_eventdev/rte_eventdev.h
index 297a93d..5c4032c 100644
--- a/lib/librte_eventdev/rte_eventdev.h
+++ b/lib/librte_eventdev/rte_eventdev.h
@@ -215,6 +215,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_memory.h>
 #include <rte_errno.h>
+#include <rte_compat.h>
 
 struct rte_mbuf; /* we just use mbuf pointers; no need to include rte_mbuf.h */
 
@@ -1069,6 +1070,25 @@ int
 rte_event_eth_rx_adapter_caps_get(uint8_t dev_id, uint8_t eth_port_id,
 				uint32_t *caps);
 
+#define RTE_EVENT_TIMER_ADAPTER_CAP_INTERNAL_PORT (1ULL << 0)
+/**< This flag is set when the timer mechanism is in HW. */
+
+/**
+ * Retrieve the event device's timer adapter capabilities.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @param[out] caps
+ *   A pointer to memory to be filled with event timer adapter capabilities.
+ *
+ * @return
+ *   - 0: Success, driver provided event timer adapter capabilities.
+ *   - <0: Error code returned by the driver function.
+ */
+int __rte_experimental
+rte_event_timer_adapter_caps_get(uint8_t dev_id, uint32_t *caps);
+
 struct rte_eventdev_driver;
 struct rte_eventdev_ops;
 struct rte_eventdev;
diff --git a/lib/librte_eventdev/rte_eventdev_pmd.h b/lib/librte_eventdev/rte_eventdev_pmd.h
index 31343b5..0e37f1c 100644
--- a/lib/librte_eventdev/rte_eventdev_pmd.h
+++ b/lib/librte_eventdev/rte_eventdev_pmd.h
@@ -26,6 +26,7 @@ extern "C" {
 #include <rte_malloc.h>
 
 #include "rte_eventdev.h"
+#include "rte_event_timer_adapter_pmd.h"
 
 /* Logging Macros */
 #define RTE_EDEV_LOG_ERR(...) \
@@ -449,6 +450,37 @@ typedef int (*eventdev_eth_rx_adapter_caps_get_t)
 struct rte_event_eth_rx_adapter_queue_conf *queue_conf;
 
 /**
+ * Retrieve the event device's timer adapter capabilities, as well as the ops
+ * structure that an event timer adapter should call through to enter the
+ * driver
+ *
+ * @param dev
+ *   Event device pointer
+ *
+ * @param flags
+ *   Flags that can be used to determine how to select an event timer
+ *   adapter ops structure
+ *
+ * @param[out] caps
+ *   A pointer to memory filled with event timer adapter capabilities.
+ *
+ * @param[out] ops
+ *   A pointer to the ops pointer to set with the address of the desired ops
+ *   structure
+ *
+ * @return
+ *   - 0: Success, driver provides event timer adapter capabilities for the
+ *	event device.
+ *   - <0: Error code returned by the driver function.
+ *
+ */
+typedef int (*eventdev_timer_adapter_caps_get_t)(
+				const struct rte_eventdev *dev,
+				uint64_t flags,
+				uint32_t *caps,
+				const struct rte_event_timer_adapter_ops **ops);
+
+/**
  * Add ethernet Rx queues to event device. This callback is invoked if
  * the caps returned from rte_eventdev_eth_rx_adapter_caps_get(, eth_port_id)
  * has RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT set.
@@ -640,6 +672,9 @@ struct rte_eventdev_ops {
 	eventdev_eth_rx_adapter_stats_reset eth_rx_adapter_stats_reset;
 	/**< Reset ethernet Rx stats */
 
+	eventdev_timer_adapter_caps_get_t timer_adapter_caps_get;
+	/**< Get timer adapter capabilities */
+
 	eventdev_selftest dev_selftest;
 	/**< Start eventdev Selftest */
 };
diff --git a/lib/librte_eventdev/rte_eventdev_version.map b/lib/librte_eventdev/rte_eventdev_version.map
index 2aef470..537afb8 100644
--- a/lib/librte_eventdev/rte_eventdev_version.map
+++ b/lib/librte_eventdev/rte_eventdev_version.map
@@ -66,7 +66,6 @@ DPDK_17.11 {
 	rte_event_eth_rx_adapter_stats_get;
 	rte_event_eth_rx_adapter_stats_reset;
 	rte_event_eth_rx_adapter_stop;
-
 } DPDK_17.08;
 
 DPDK_18.02 {
@@ -74,3 +73,23 @@ DPDK_18.02 {
 
 	rte_event_dev_selftest;
 } DPDK_17.11;
+
+EXPERIMENTAL {
+	global:
+
+	rte_event_timer_adapter_caps_get;
+	rte_event_timer_adapter_create;
+	rte_event_timer_adapter_create_ext;
+	rte_event_timer_adapter_free;
+	rte_event_timer_adapter_get_info;
+	rte_event_timer_adapter_lookup;
+	rte_event_timer_adapter_service_id_get;
+	rte_event_timer_adapter_start;
+	rte_event_timer_adapter_stats_get;
+	rte_event_timer_adapter_stats_reset;
+	rte_event_timer_adapter_stop;
+	rte_event_timer_init;
+	rte_event_timer_arm_burst;
+	rte_event_timer_arm_tmo_tick_burst;
+	rte_event_timer_cancel_burst;
+} DPDK_18.02;
-- 
2.6.4

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v4 2/5] vhost: support selective datapath
  2018-03-31  6:10  3%     ` Maxime Coquelin
@ 2018-04-02  1:58  0%       ` Wang, Zhihong
  0 siblings, 0 replies; 200+ results
From: Wang, Zhihong @ 2018-04-02  1:58 UTC (permalink / raw)
  To: Maxime Coquelin, dev
  Cc: Tan, Jianfeng, Bie, Tiwei, yliu, Liang, Cunming, Wang, Xiao W, Daly, Dan



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Saturday, March 31, 2018 2:10 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> Cc: Tan, Jianfeng <jianfeng.tan@intel.com>; Bie, Tiwei
> <tiwei.bie@intel.com>; yliu@fridaylinux.org; Liang, Cunming
> <cunming.liang@intel.com>; Wang, Xiao W <xiao.w.wang@intel.com>; Daly,
> Dan <dan.daly@intel.com>
> Subject: Re: [PATCH v4 2/5] vhost: support selective datapath
> 
> 
> 
> On 03/10/2018 11:01 AM, Zhihong Wang wrote:
> > This patch set introduces support for selective datapath in DPDK vhost-user
> > lib. vDPA stands for vhost Data Path Acceleration. The idea is to support
> > virtio ring compatible devices to serve virtio driver directly to enable
> > datapath acceleration.
> >
> > A set of device ops is defined for device specific operations:
> >
> >       a. queue_num_get: Called to get supported queue number of the
> device.
> >
> >       b. feature_get: Called to get supported features of the device.
> >
> >       c. protocol_feature_get: Called to get supported protocol features of
> >          the device.
> >
> >       d. dev_conf: Called to configure the actual device when the virtio
> >          device becomes ready.
> >
> >       e. dev_close: Called to close the actual device when the virtio device
> >          is stopped.
> >
> >       f. vring_state_set: Called to change the state of the vring in the
> >          actual device when vring state changes.
> >
> >       g. feature_set: Called to set the negotiated features to device.
> >
> >       h. migration_done: Called to allow the device to response to RARP
> >          sending.
> >
> >       i. get_vfio_group_fd: Called to get the VFIO group fd of the device.
> >
> >       j. get_vfio_device_fd: Called to get the VFIO device fd of the device.
> >
> >       k. get_notify_area: Called to get the notify area info of the queue.
> >
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > ---
> > Changes in v4:
> >
> >   1. Remove the "engine" concept in the lib.
> >
> > ---
> > Changes in v2:
> >
> >   1. Add VFIO related vDPA device ops.
> >
> >   lib/librte_vhost/Makefile              |  4 +-
> >   lib/librte_vhost/rte_vdpa.h            | 94
> +++++++++++++++++++++++++++++++++
> >   lib/librte_vhost/rte_vhost_version.map |  6 +++
> >   lib/librte_vhost/vdpa.c                | 96
> ++++++++++++++++++++++++++++++++++
> >   4 files changed, 198 insertions(+), 2 deletions(-)
> >   create mode 100644 lib/librte_vhost/rte_vdpa.h
> >   create mode 100644 lib/librte_vhost/vdpa.c
> >
> > diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> > index 5d6c6abae..37044ac03 100644
> > --- a/lib/librte_vhost/Makefile
> > +++ b/lib/librte_vhost/Makefile
> > @@ -22,9 +22,9 @@ LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -
> lrte_ethdev -lrte_net
> >
> >   # all source are stored in SRCS-y
> >   SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c
> \
> > -					vhost_user.c virtio_net.c
> > +					vhost_user.c virtio_net.c vdpa.c
> >
> >   # install includes
> > -SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h
> > +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h
> rte_vdpa.h
> >
> >   include $(RTE_SDK)/mk/rte.lib.mk
> > diff --git a/lib/librte_vhost/rte_vdpa.h b/lib/librte_vhost/rte_vdpa.h
> > new file mode 100644
> > index 000000000..a4bbbd93d
> > --- /dev/null
> > +++ b/lib/librte_vhost/rte_vdpa.h
> > @@ -0,0 +1,94 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2018 Intel Corporation
> > + */
> > +
> > +#ifndef _RTE_VDPA_H_
> > +#define _RTE_VDPA_H_
> > +
> > +/**
> > + * @file
> > + *
> > + * Device specific vhost lib
> > + */
> > +
> > +#include <rte_pci.h>
> > +#include "rte_vhost.h"
> > +
> > +#define MAX_VDPA_NAME_LEN 128
> > +
> > +enum vdpa_addr_type {
> > +	PCI_ADDR,
> > +	VDPA_ADDR_MAX
> > +};
> > +
> > +struct rte_vdpa_dev_addr {
> > +	enum vdpa_addr_type type;
> > +	union {
> > +		uint8_t __dummy[64];
> > +		struct rte_pci_addr pci_addr;
> > +	};
> > +};
> > +
> > +/* Get capabilities of this device */
> > +typedef int (*vdpa_dev_queue_num_get_t)(int did, uint32_t
> *queue_num);
> > +typedef int (*vdpa_dev_feature_get_t)(int did, uint64_t *features);
> > +
> > +/* Driver configure/close the device */
> > +typedef int (*vdpa_dev_conf_t)(int vid);
> > +typedef int (*vdpa_dev_close_t)(int vid);
> > +
> > +/* Enable/disable this vring */
> > +typedef int (*vdpa_vring_state_set_t)(int vid, int vring, int state);
> > +
> > +/* Set features when changed */
> > +typedef int (*vdpa_feature_set_t)(int vid);
> > +
> > +/* Destination operations when migration done */
> > +typedef int (*vdpa_migration_done_t)(int vid);
> > +
> > +/* Get the vfio group fd */
> > +typedef int (*vdpa_get_vfio_group_fd_t)(int vid);
> > +
> > +/* Get the vfio device fd */
> > +typedef int (*vdpa_get_vfio_device_fd_t)(int vid);
> > +
> > +/* Get the notify area info of the queue */
> > +typedef int (*vdpa_get_notify_area_t)(int vid, int qid, uint64_t *offset,
> > +		uint64_t *size);
> > +/* Device ops */
> > +struct rte_vdpa_dev_ops {
> > +	vdpa_dev_queue_num_get_t  queue_num_get;
> > +	vdpa_dev_feature_get_t    feature_get;
> > +	vdpa_dev_feature_get_t    protocol_feature_get;
> > +	vdpa_dev_conf_t           dev_conf;
> > +	vdpa_dev_close_t          dev_close;
> > +	vdpa_vring_state_set_t    vring_state_set;
> > +	vdpa_feature_set_t        feature_set;
> > +	vdpa_migration_done_t     migration_done;
> > +	vdpa_get_vfio_group_fd_t  get_vfio_group_fd;
> > +	vdpa_get_vfio_device_fd_t get_vfio_device_fd;
> > +	vdpa_get_notify_area_t    get_notify_area;
> 
> Maybe you could reserve some room here to avoid breaking the ABI in the
> future if we need to add some optional ops.

Good suggestion.

> 
> > +};
> > +
> > +struct rte_vdpa_device {
> > +	struct rte_vdpa_dev_addr addr;
> > +	struct rte_vdpa_dev_ops *ops;
> > +} __rte_cache_aligned;
> > +
> > +extern struct rte_vdpa_device *vdpa_devices[];
> > +extern uint32_t vdpa_device_num;
> > +
> > +/* Register a vdpa device, return did if successful, -1 on failure */
> > +int __rte_experimental
> > +rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr,
> > +		struct rte_vdpa_dev_ops *ops);
> > +
> > +/* Unregister a vdpa device, return -1 on failure */
> > +int __rte_experimental
> > +rte_vdpa_unregister_device(int did);
> > +
> > +/* Find did of a vdpa device, return -1 on failure */
> > +int __rte_experimental
> > +rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr);
> > +
> > +#endif /* _RTE_VDPA_H_ */
> > diff --git a/lib/librte_vhost/rte_vhost_version.map
> b/lib/librte_vhost/rte_vhost_version.map
> > index df0103129..7bcffb490 100644
> > --- a/lib/librte_vhost/rte_vhost_version.map
> > +++ b/lib/librte_vhost/rte_vhost_version.map
> > @@ -59,3 +59,9 @@ DPDK_18.02 {
> >   	rte_vhost_vring_call;
> >
> >   } DPDK_17.08;
> > +
> > +EXPERIMENTAL {
> > +	rte_vdpa_register_device;
> > +	rte_vdpa_unregister_device;
> > +	rte_vdpa_find_device_id;
> 
> I think you need also to declare the new structs here,
> not only the new functions.

Ok.

> 
> > +} DPDK_18.02;
> > diff --git a/lib/librte_vhost/vdpa.c b/lib/librte_vhost/vdpa.c
> > new file mode 100644
> > index 000000000..0c950d45f
> > --- /dev/null
> > +++ b/lib/librte_vhost/vdpa.c
> > @@ -0,0 +1,96 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2018 Intel Corporation
> > + */
> > +
> > +/**
> > + * @file
> > + *
> > + * Device specific vhost lib
> > + */
> > +
> > +#include <stdbool.h>
> > +
> > +#include <rte_malloc.h>
> > +#include "rte_vdpa.h"
> > +#include "vhost.h"
> > +
> > +struct rte_vdpa_device *vdpa_devices[MAX_VHOST_DEVICE];
> > +uint32_t vdpa_device_num;
> > +
> > +static int is_same_vdpa_dev_addr(struct rte_vdpa_dev_addr *a,
> > +		struct rte_vdpa_dev_addr *b)
> > +{
> 
> Given the boolean nature of the function name, I would return 1 if same
> device, 0 if different.

Ok, will use bool.

> 
> > +	int ret = 0;
> > +
> > +	if (a->type != b->type)
> > +		return -1;
> > +
> > +	switch (a->type) {
> > +	case PCI_ADDR:
> > +		if (a->pci_addr.domain != b->pci_addr.domain ||
> > +				a->pci_addr.bus != b->pci_addr.bus ||
> > +				a->pci_addr.devid != b->pci_addr.devid ||
> > +				a->pci_addr.function != b->pci_addr.function)
> > +			ret = -1;
> > +		break;
> > +	default:
> > +		break;
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +int rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr,
> > +		struct rte_vdpa_dev_ops *ops)
> > +{
> > +	struct rte_vdpa_device *dev;
> > +	char device_name[MAX_VDPA_NAME_LEN];
> > +	int i;
> > +
> > +	if (vdpa_device_num >= MAX_VHOST_DEVICE)
> > +		return -1;
> > +
> > +	for (i = 0; i < MAX_VHOST_DEVICE; i++) {
> > +		if (vdpa_devices[i] == NULL)
> > +			break;
> You might want to check that the same device isn't being registered a second
> time, and return an error in that case.

Will do.

Thanks
-Zhihong

> 
> This is not a blocker though, and can be done in a dedicated patch.
> 
> > +	}
> > +
> > +	sprintf(device_name, "vdpa-dev-%d", i);
> > +	dev = rte_zmalloc(device_name, sizeof(struct rte_vdpa_device),
> > +			RTE_CACHE_LINE_SIZE);
> > +	if (!dev)
> > +		return -1;
> > +
> > +	memcpy(&dev->addr, addr, sizeof(struct rte_vdpa_dev_addr));
> > +	dev->ops = ops;
> > +	vdpa_devices[i] = dev;
> > +	vdpa_device_num++;
> > +
> > +	return i;
> > +}
> > +
> > +int rte_vdpa_unregister_device(int did)
> > +{
> > +	if (did < 0 || did >= MAX_VHOST_DEVICE || vdpa_devices[did] ==
> NULL)
> > +		return -1;
> > +
> > +	rte_free(vdpa_devices[did]);
> > +	vdpa_devices[did] = NULL;
> > +	vdpa_device_num--;
> > +
> > +	return did;
> > +}
> > +
> > +int rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr)
> > +{
> > +	struct rte_vdpa_device *dev;
> > +	int i;
> > +
> > +	for (i = 0; i < MAX_VHOST_DEVICE; ++i) {
> > +		dev = vdpa_devices[i];
> > +		if (dev && is_same_vdpa_dev_addr(&dev->addr, addr) == 0)
> > +			return i;
> > +	}
> > +
> > +	return -1;
> > +}
> >

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external backend support
  @ 2018-04-01 19:53  0%       ` Zhang, Roy Fan
  2018-04-03 13:44  0%         ` Maxime Coquelin
  0 siblings, 1 reply; 200+ results
From: Zhang, Roy Fan @ 2018-04-01 19:53 UTC (permalink / raw)
  To: Wodkowski, PawelX, dev; +Cc: maxime.coquelin, jianjay.zhou, Tan, Jianfeng

Hi Pawel,

> -----Original Message-----
> From: Wodkowski, PawelX
> Sent: Thursday, March 29, 2018 2:48 PM
> To: Zhang, Roy Fan <roy.fan.zhang@intel.com>; dev@dpdk.org
> Cc: maxime.coquelin@redhat.com; jianjay.zhou@huawei.com; Tan, Jianfeng
> <jianfeng.tan@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
> backend support
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Fan Zhang
> > Sent: Thursday, March 29, 2018 2:53 PM
> > To: dev@dpdk.org
> > Cc: maxime.coquelin@redhat.com; jianjay.zhou@huawei.com; Tan,
> Jianfeng
> > <jianfeng.tan@intel.com>
> > Subject: [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external
> > backend support
> >
> > This patch adds external backend support to vhost library. The patch
> > provides new APIs for the external backend to register pre and post
> > vhost-user message handlers.
> >
> > Signed-off-by: Fan Zhang <roy.fan.zhang@intel.com>
> > ---
> >  lib/librte_vhost/rte_vhost.h           | 64
> > +++++++++++++++++++++++++++++++++-
> >  lib/librte_vhost/rte_vhost_version.map |  6 ++++
> >  lib/librte_vhost/vhost.c               | 17 ++++++++-
> >  lib/librte_vhost/vhost.h               |  8 +++--
> >  lib/librte_vhost/vhost_user.c          | 33 +++++++++++++++++-
> >  5 files changed, 123 insertions(+), 5 deletions(-)
> >
> > diff --git a/lib/librte_vhost/rte_vhost.h
> > b/lib/librte_vhost/rte_vhost.h index d332069..b902c44 100644
> > --- a/lib/librte_vhost/rte_vhost.h
> > +++ b/lib/librte_vhost/rte_vhost.h
> > @@ -1,5 +1,5 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> > - * Copyright(c) 2010-2017 Intel Corporation
> > + * Copyright(c) 2010-2018 Intel Corporation
> >   */
> >
> >  #ifndef _RTE_VHOST_H_
> > @@ -88,6 +88,55 @@ struct vhost_device_ops {  };
> >
> >  /**
> > + * function prototype for the vhost backend to handler specific vhost
> > + user
> > + * messages prior to the master message handling
> > + *
> > + * @param vid
> > + *  vhost device id
> > + * @param msg
> > + *  Message pointer.
> > + * @param payload
> > + *  Message payload.
> 
> No payload parameter.
Sorry about that. I will fix the comment.

> 
> > + * @param require_reply
> > + *  If the handler requires sending a reply, this varaible shall be
> > + written 1,
> > + *  otherwise 0.
> > + * @param skip_master
> > + *  If the handler requires skipping the master message handling,
> > + this
> > variable
> > + *  shall be written 1, otherwise 0.
> > + * @return
> > + *  0 on success, -1 on failure
> > + */
> > +typedef int (*rte_vhost_msg_pre_handle)(int vid, void *msg,
> > +		uint32_t *require_reply, uint32_t *skip_master);
> > +
> > +/**
> > + * function prototype for the vhost backend to handler specific vhost
> > +user
> > + * messages after the master message handling is done
> > + *
> > + * @param vid
> > + *  vhost device id
> > + * @param msg
> > + *  Message pointer.
> > + * @param payload
> > + *  Message payload.
> 
> No payload parameter :)
> 

Same here

> > + * @param require_reply
> > + *  If the handler requires sending a reply, this varaible shall be
> > +written 1,
> > + *  otherwise 0.
> > + * @return
> > + *  0 on success, -1 on failure
> > + */
> > +typedef int (*rte_vhost_msg_post_handle)(int vid, void *msg,
> > +		uint32_t *require_reply);
> > +
> 
> What does 'Message pointer' mean? Is this const for us? Is this the payload?
> Making msg 'void *' is not the way to go here. Those pre and post handlers need
> to see exactly the same structures as the vhost_user.c file. Otherwise we can
> get into trouble when the ABI changes.

It is the pointer to the vhost_user message. It cannot be const as the backend
may change the payload. 

> 
> Also, you can easily merge the pre and post handlers into one handler, with one
> parameter describing which phase of message processing we are in.
> 

No, I don't think so. Doing so would be quite unclear in the future, as we would be
using one function to do two totally different things.

> > +/**
> > + * pre and post vhost user message handlers  */ struct
> > +vhost_user_extern_ops {
> > +	rte_vhost_msg_pre_handle pre_msg_handle;
> > +	rte_vhost_msg_post_handle post_msg_handle; };
> > +
> > +/**
> >   * Convert guest physical address to host virtual address
> >   *
> >   * @param mem
> > @@ -434,6 +483,19 @@ int rte_vhost_vring_call(int vid, uint16_t vring_idx);
> >   */
> >  uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
> >
> > +/**
> > + * register external vhost backend
> > + *
> > + * @param vid
> > + *  vhost device ID
> > + * @param ops
> > + *  ops that process external vhost user messages
> > + * @return
> > + *  0 on success, -1 on failure
> > + */
> > +int
> > +rte_vhost_user_register_extern_ops(int vid, struct
> > vhost_user_extern_ops *ops);
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/librte_vhost/rte_vhost_version.map
> > b/lib/librte_vhost/rte_vhost_version.map
> > index df01031..91bf9f0 100644
> > --- a/lib/librte_vhost/rte_vhost_version.map
> > +++ b/lib/librte_vhost/rte_vhost_version.map
> > @@ -59,3 +59,9 @@ DPDK_18.02 {
> >  	rte_vhost_vring_call;
> >
> >  } DPDK_17.08;
> > +
> > +DPDK_18.05 {
> > +	global:
> > +
> > +	rte_vhost_user_register_extern_ops;
> > +} DPDK_18.02;
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index
> > a407067..80af341 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -1,5 +1,5 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> > - * Copyright(c) 2010-2016 Intel Corporation
> > + * Copyright(c) 2010-2018 Intel Corporation
> >   */
> >
> >  #include <linux/vhost.h>
> > @@ -627,3 +627,18 @@ rte_vhost_rx_queue_count(int vid, uint16_t qid)
> >
> >  	return *((volatile uint16_t *)&vq->avail->idx) - vq->last_avail_idx;
> > }
> > +
> > +int
> > +rte_vhost_user_register_extern_ops(int vid, struct
> > vhost_user_extern_ops *ops)
> > +{
> > +	struct virtio_net *dev;
> > +
> > +	dev = get_device(vid);
> > +	if (dev == NULL)
> > +		return -1;
> > +
> > +	if (ops)
> > +		rte_memcpy(&dev->extern_ops, ops, sizeof(*ops));
> > +
> > +	return 0;
> > +}
> 
> Why do we need this new "register" API? Why can't you use the (struct
> vhost_device_ops).reserved[0] field to put this callback there?
> I think this is the right time to utilize this field.
> 

The patch here is a more generic and intuitive way for an external backend to
register handlers for the vhost-user messages that only it recognizes.
Please read Maxime's comments on the v2 version of this patch:
http://dpdk.org/ml/archives/dev/2018-March/093408.html.
As we discussed, we need two different handlers for an external vhost-user device
to handle device-specific vhost-user messages, so a public API is needed.
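
For illustration (editor's sketch, using only the types and the
rte_vhost_user_register_extern_ops() API shown in this patch; the handler body
and call site are placeholders), an external backend would plug in roughly
like this:

	/* Hypothetical pre handler: inspect the message before master handling. */
	static int
	my_pre_msg_handler(int vid, void *msg,
			uint32_t *require_reply, uint32_t *skip_master)
	{
		RTE_SET_USED(vid);
		RTE_SET_USED(msg);
		*require_reply = 0;
		*skip_master = 0;	/* let the master handling run as usual */
		return 0;
	}

	static struct vhost_user_extern_ops my_extern_ops = {
		.pre_msg_handle = my_pre_msg_handler,
		.post_msg_handle = NULL,
	};

	/* Registered once the vid is known, e.g. from the new_device() callback. */
	rte_vhost_user_register_extern_ops(vid, &my_extern_ops);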

> Can you do something similar to
> http://dpdk.org/ml/archives/dev/2018-March/094213.html ?

The approach here causes the least damage to the existing library. Besides,
the patch you mentioned won't help with the pre and post handler problem -
or it would consume both of the remaining reserved fields in the vhost_user_ops
structure, one for the pre handler and one for the post handler.

> 
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index
> > d947bc9..2072b88 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -1,5 +1,5 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> > - * Copyright(c) 2010-2014 Intel Corporation
> > + * Copyright(c) 2010-2018 Intel Corporation
> >   */
> >
> >  #ifndef _VHOST_NET_CDEV_H_
> > @@ -241,8 +241,12 @@ struct virtio_net {
> >  	struct guest_page       *guest_pages;
> >
> >  	int			slave_req_fd;
> > -} __rte_cache_aligned;
> >
> > +	/* private data for external virtio device */
> > +	void			*extern_data;
> > +	/* pre and post vhost user message handlers for externel backend */
> > +	struct vhost_user_extern_ops extern_ops; } __rte_cache_aligned;
> >
> >  #define VHOST_LOG_PAGE	4096
> >
> > diff --git a/lib/librte_vhost/vhost_user.c
> > b/lib/librte_vhost/vhost_user.c index 90ed211..ede8a5e 100644
> > --- a/lib/librte_vhost/vhost_user.c
> > +++ b/lib/librte_vhost/vhost_user.c
> > @@ -1,5 +1,5 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> > - * Copyright(c) 2010-2016 Intel Corporation
> > + * Copyright(c) 2010-2018 Intel Corporation
> >   */
> >
> >  #include <stdint.h>
> > @@ -50,6 +50,8 @@ static const char
> > *vhost_message_str[VHOST_USER_MAX] = {
> >  	[VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
> >  	[VHOST_USER_SET_SLAVE_REQ_FD]  =
> > "VHOST_USER_SET_SLAVE_REQ_FD",
> >  	[VHOST_USER_IOTLB_MSG]  = "VHOST_USER_IOTLB_MSG",
> > +	[VHOST_USER_CRYPTO_CREATE_SESS] =
> > "VHOST_USER_CRYPTO_CREATE_SESS",
> > +	[VHOST_USER_CRYPTO_CLOSE_SESS] =
> > "VHOST_USER_CRYPTO_CLOSE_SESS",
> >  };
> >
> >  static uint64_t
> > @@ -1302,6 +1304,7 @@ vhost_user_msg_handler(int vid, int fd)
> >  	struct VhostUserMsg msg;
> >  	int ret;
> >  	int unlock_required = 0;
> > +	uint32_t skip_master = 0;
> >
> >  	dev = get_device(vid);
> >  	if (dev == NULL)
> > @@ -1379,6 +1382,21 @@ vhost_user_msg_handler(int vid, int fd)
> >
> >  	}
> >
> > +	if (dev->extern_ops.pre_msg_handle) {
> > +		uint32_t need_reply;
> > +
> > +		ret = (*dev->extern_ops.pre_msg_handle)(dev->vid,
> > +				(void *)&msg, &need_reply, &skip_master);
> > +		if (ret < 0)
> > +			goto skip_to_reply;
> > +
> > +		if (need_reply)
> > +			send_vhost_reply(fd, &msg);
> > +	}
> > +
> > +	if (skip_master)
> > +		goto skip_to_post_handle;
> 
> This can be moved inside the above if () { }

Yes, you are right.

> 
> > +
> >  	switch (msg.request.master) {
> >  	case VHOST_USER_GET_FEATURES:
> >  		msg.payload.u64 = vhost_user_get_features(dev); @@ -
> 1479,9 +1497,22
> > @@ vhost_user_msg_handler(int vid, int fd)
> >  	default:
> >  		ret = -1;
> >  		break;
> > +	}
> > +
> > +skip_to_post_handle:
> > +	if (dev->extern_ops.post_msg_handle) {
> > +		uint32_t need_reply;
> > +
> > +		ret = (*dev->extern_ops.post_msg_handle)(
> > +				dev->vid, (void *)&msg, &need_reply);
> > +		if (ret < 0)
> > +			goto skip_to_reply;
> >
> > +		if (need_reply)
> > +			send_vhost_reply(fd, &msg);
> >  	}
> >
> > +skip_to_reply:
> >  	if (unlock_required)
> >  		vhost_user_unlock_all_queue_pairs(dev);
> >
> > --
> > 2.7.4
> 
> Overall, I think this is the direction we need to go in.
> 
> Pawel

Regards,
Fan

^ permalink raw reply	[relevance 0%]

* Re: [dpdk-dev] [PATCH v6 4/8] ethdev: Add port representor device flag
  @ 2018-04-01  6:14  0%       ` Shahaf Shuler
  0 siblings, 0 replies; 200+ results
From: Shahaf Shuler @ 2018-04-01  6:14 UTC (permalink / raw)
  To: Doherty, Declan, dev
  Cc: Alex Rosenbaum, Ferruh Yigit, Thomas Monjalon, Qi Zhang,
	Alejandro Lucero, Andrew Rybchenko, Mohammad Abdul Awal,
	Remy Horton, John McNamara, Rony Efraim, Wu, Jingjing, Lu,
	Wenzhuo, Vincent JArdin, Yuanhan Liu, Richardson, Bruce, Ananyev,
	Konstantin, Wang, Zhihong

Thursday, March 29, 2018 5:53 PM, Doherty, Declan:
> On 29/03/2018 7:13 AM, Shahaf Shuler wrote:
> > Wednesday, March 28, 2018 4:54 PM, Declan Doherty:
> >> Subject: [dpdk-dev][PATCH v6 4/8] ethdev: Add port representor device
> >> flag
> >>
> >> Add new device flag to specify that ethdev port is a port representor.
> >> Extend rte_eth_dev_info structure to expose device flags to user
> >> which enable applications to discover if a port is a representor port.
> >>
> >> Signed-off-by: Declan Doherty <declan.doherty@intel.com>
> >> ---
> >>   lib/librte_ether/rte_ethdev.c             | 1 +
> >>   lib/librte_ether/rte_ethdev.h             | 9 ++++++---
> >>   lib/librte_ether/rte_ethdev_representor.h | 3 +++
> >>   3 files changed, 10 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/lib/librte_ether/rte_ethdev.c
> >> b/lib/librte_ether/rte_ethdev.c index c719f84a3..163246433 100644
> >> --- a/lib/librte_ether/rte_ethdev.c
> >> +++ b/lib/librte_ether/rte_ethdev.c
> >> @@ -2399,6 +2399,7 @@ rte_eth_dev_info_get(uint16_t port_id, struct
> >> rte_eth_dev_info *dev_info)
> >>   	dev_info->nb_rx_queues = dev->data->nb_rx_queues;
> >>   	dev_info->nb_tx_queues = dev->data->nb_tx_queues;
> >>   	dev_info->switch_id = dev->data->switch_id;
> >> +	dev_info->dev_flags = dev->data->dev_flags;
> >>   }
> >>
> >>   int
> >> diff --git a/lib/librte_ether/rte_ethdev.h
> >> b/lib/librte_ether/rte_ethdev.h index dced4fc41..226acc8b1 100644
> >> --- a/lib/librte_ether/rte_ethdev.h
> >> +++ b/lib/librte_ether/rte_ethdev.h
> >> @@ -996,6 +996,7 @@ struct rte_eth_dev_info {
> >>   	const char *driver_name; /**< Device Driver name. */
> >>   	unsigned int if_index; /**< Index to bound host interface, or 0 if
> >> none.
> >>   		Use if_indextoname() to translate into an interface name. */
> >> +	uint32_t dev_flags; /**< Device flags */
> >>   	uint32_t min_rx_bufsize; /**< Minimum size of RX buffer. */
> >>   	uint32_t max_rx_pktlen; /**< Maximum configurable length of RX
> >> pkt. */
> >>   	uint16_t max_rx_queues; /**< Maximum number of RX queues. */
> @@
> >> -1229,11 +1230,13 @@ struct rte_eth_dev_owner {  };
> >>
> >>   /** Device supports link state interrupt */
> >> -#define RTE_ETH_DEV_INTR_LSC     0x0002
> >> +#define RTE_ETH_DEV_INTR_LSC		0x0002
> >>   /** Device is a bonded slave */
> >> -#define RTE_ETH_DEV_BONDED_SLAVE 0x0004
> >> +#define RTE_ETH_DEV_BONDED_SLAVE	0x0004
> >>   /** Device supports device removal interrupt */
> >> -#define RTE_ETH_DEV_INTR_RMV     0x0008
> >> +#define RTE_ETH_DEV_INTR_RMV		0x0008
> >> +/** Device is port representor */
> >> +#define RTE_ETH_DEV_REPRESENTOR		0x0010
> >
> > Maybe it is a good time to bring some order here.
> > I understand the decision to use flags instead of a bit-field. It is better.
> >
> > However there is a mix here of device capabilities like:
> > RTE_ETH_DEV_INTR_LSC and RTE_ETH_DEV_INTR_RMV,
> > and device attributes like: RTE_ETH_DEV_BONDED_SLAVE and
> > RTE_ETH_DEV_REPRESENTOR.
> > I don't think they belong together under the generic name of dev_flags.
> >
> > Moreover, I am not sure the fact that a device is a bonded slave should be exposed
> > to the application. It should be internal to ethdev and its port iterators.
> 
> That's a good point on the bonded slave flag, I'll look at fixing that for the
> next release. I don't think changing it should affect the ABI, but I'll need to
> have a closer look.
>
> Do you think that we should have a separate device attributes field in which
> the representor flag is contained?
> 
> >
> > Finally, I think a representor port may need more info now (and in the
> > future), for example the associated VF id.
> > For that, I think it is better for it to be exposed as a dedicated struct in the
> > device info.
> 
> I think a switch port id should suffice for that, for SR-IOV devices it would
> map to the vf_id.

I think we need both switch_domain and vf_id,
because for representors the application should know which VFs can be reached from this representor and which VF it represents.
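
For illustration (editor's sketch, using only the dev_flags field and the
RTE_ETH_DEV_REPRESENTOR flag added by this patch), an application would detect
a representor port like this; the per-VF details discussed above are exactly
what this does not yet expose:

	uint16_t port_id = 0;	/* placeholder port id */
	struct rte_eth_dev_info dev_info;

	rte_eth_dev_info_get(port_id, &dev_info);
	if (dev_info.dev_flags & RTE_ETH_DEV_REPRESENTOR)
		printf("port %u is a representor port\n", port_id);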

> 
> >
> >>
> >>   /**
> >>    * @warning
> >> diff --git a/lib/librte_ether/rte_ethdev_representor.h
> >> b/lib/librte_ether/rte_ethdev_representor.h
> >> index cbc1f2855..f3726d0ba 100644
> >> --- a/lib/librte_ether/rte_ethdev_representor.h
> >> +++ b/lib/librte_ether/rte_ethdev_representor.h
> >> @@ -22,6 +22,9 @@ eth_dev_representor_port_init(struct rte_eth_dev
> >> *ethdev, void *init_params)
> >>   	/** representor inherits the switch id of it's base device */
> >>   	ethdev->data->switch_id = base_ethdev->data->switch_id;
> >>
> >> +	/** Set device flags to specify that device is a representor port */
> >> +	ethdev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
> >
> > Should be set in the PMD, not in ethdev layer
> 
> As in the previous patch, this is just a generic port bus init function which
> meets the simplest use case of a representor port with a single switch domain.
> A PMD doesn't need to use it, but having it here saves duplicating the same
> code across multiple PMDs which only support the basic mode.
> 
> >
> >> +
> >>   	return 0;
> >>   }
> >>
> >> --
> >> 2.14.3
> >


^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v7] eal: provide API for querying valid socket id's
  @ 2018-03-31 17:08  5% ` Anatoly Burakov
  2018-04-04 22:31  3%   ` Thomas Monjalon
  0 siblings, 1 reply; 200+ results
From: Anatoly Burakov @ 2018-03-31 17:08 UTC (permalink / raw)
  To: dev
  Cc: Neil Horman, John McNamara, Marko Kovacevic, Bruce Richardson,
	thomas, chaozhu, gowrishankar.m

During lcore scan, find all socket IDs and store them, and
provide a public API to query the valid socket IDs. This will break
the ABI, so bump the ABI version.
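
A minimal usage sketch (editor's illustration, using only the two functions
added by this patch) for walking the detected sockets:

	unsigned int i;

	for (i = 0; i < rte_socket_count(); i++)
		printf("physical socket %d detected\n", rte_socket_id_by_idx(i));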

Also, remove deprecation notice corresponding to this change.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
---

Notes:
    v7:
    - Renamed rte_num_socket_ids() to rte_socket_count()
    - Removed deprecation notice associated with this change
    - Addressed review comments
    
    v6:
    - Fixed meson ABI version header
    
    v5:
    - Move API to experimental
    - Store list of valid socket id's instead of simply
      recording the biggest one
    
    v4:
    - Remove backwards ABI compatibility, bump ABI instead
    
    v3:
    - Added ABI compatibility
    
    v2:
    - checkpatch changes
    - check socket before deciding if the core is not to be used

 doc/guides/rel_notes/deprecation.rst      |  3 --
 lib/librte_eal/bsdapp/eal/Makefile        |  2 +-
 lib/librte_eal/common/eal_common_lcore.c  | 75 ++++++++++++++++++++++++++-----
 lib/librte_eal/common/include/rte_eal.h   |  2 +
 lib/librte_eal/common/include/rte_lcore.h | 30 +++++++++++++
 lib/librte_eal/linuxapp/eal/Makefile      |  2 +-
 lib/librte_eal/meson.build                |  2 +-
 lib/librte_eal/rte_eal_version.map        |  2 +
 8 files changed, 100 insertions(+), 18 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 74c18ed..80472f5 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -38,9 +38,6 @@ Deprecation Notices
   success and failure, respectively.  This will change to 1 and 0 for true and
   false, respectively, to make use of the function more intuitive.
 
-* eal: new ``numa_node_count`` member will be added to ``rte_config`` structure
-  in v18.05.
-
 * eal: due to internal data layout reorganization, there will be changes to
   several structures and functions as a result of coming changes to support
   memory hotplug in v18.05.
diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index dd455e6..ed1d17b 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -21,7 +21,7 @@ LDLIBS += -lgcc_s
 
 EXPORT_MAP := ../../rte_eal_version.map
 
-LIBABIVER := 6
+LIBABIVER := 7
 
 # specific to bsdapp exec-env
 SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) := eal.c
diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
index 7724fa4..3167e9d 100644
--- a/lib/librte_eal/common/eal_common_lcore.c
+++ b/lib/librte_eal/common/eal_common_lcore.c
@@ -7,6 +7,7 @@
 #include <string.h>
 #include <dirent.h>
 
+#include <rte_errno.h>
 #include <rte_log.h>
 #include <rte_eal.h>
 #include <rte_lcore.h>
@@ -16,6 +17,19 @@
 #include "eal_private.h"
 #include "eal_thread.h"
 
+static int
+socket_id_cmp(const void *a, const void *b)
+{
+	const int *lcore_id_a = a;
+	const int *lcore_id_b = b;
+
+	if (*lcore_id_a < *lcore_id_b)
+		return -1;
+	if (*lcore_id_a > *lcore_id_b)
+		return 1;
+	return 0;
+}
+
 /*
  * Parse /sys/devices/system/cpu to get the number of physical and logical
  * processors on the machine. The function will fill the cpu_info
@@ -28,6 +42,8 @@ rte_eal_cpu_init(void)
 	struct rte_config *config = rte_eal_get_configuration();
 	unsigned lcore_id;
 	unsigned count = 0;
+	unsigned int socket_id, prev_socket_id;
+	int lcore_to_socket_id[RTE_MAX_LCORE];
 
 	/*
 	 * Parse the maximum set of logical cores, detect the subset of running
@@ -39,6 +55,19 @@ rte_eal_cpu_init(void)
 		/* init cpuset for per lcore config */
 		CPU_ZERO(&lcore_config[lcore_id].cpuset);
 
+		/* find socket first */
+		socket_id = eal_cpu_socket_id(lcore_id);
+		if (socket_id >= RTE_MAX_NUMA_NODES) {
+#ifdef RTE_EAL_ALLOW_INV_SOCKET_ID
+			socket_id = 0;
+#else
+			RTE_LOG(ERR, EAL, "Socket ID (%u) is greater than RTE_MAX_NUMA_NODES (%d)\n",
+					socket_id, RTE_MAX_NUMA_NODES);
+			return -1;
+#endif
+		}
+		lcore_to_socket_id[lcore_id] = socket_id;
+
 		/* in 1:1 mapping, record related cpu detected state */
 		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
 		if (lcore_config[lcore_id].detected == 0) {
@@ -54,18 +83,7 @@ rte_eal_cpu_init(void)
 		config->lcore_role[lcore_id] = ROLE_RTE;
 		lcore_config[lcore_id].core_role = ROLE_RTE;
 		lcore_config[lcore_id].core_id = eal_cpu_core_id(lcore_id);
-		lcore_config[lcore_id].socket_id = eal_cpu_socket_id(lcore_id);
-		if (lcore_config[lcore_id].socket_id >= RTE_MAX_NUMA_NODES) {
-#ifdef RTE_EAL_ALLOW_INV_SOCKET_ID
-			lcore_config[lcore_id].socket_id = 0;
-#else
-			RTE_LOG(ERR, EAL, "Socket ID (%u) is greater than "
-				"RTE_MAX_NUMA_NODES (%d)\n",
-				lcore_config[lcore_id].socket_id,
-				RTE_MAX_NUMA_NODES);
-			return -1;
-#endif
-		}
+		lcore_config[lcore_id].socket_id = socket_id;
 		RTE_LOG(DEBUG, EAL, "Detected lcore %u as "
 				"core %u on socket %u\n",
 				lcore_id, lcore_config[lcore_id].core_id,
@@ -79,5 +97,38 @@ rte_eal_cpu_init(void)
 		RTE_MAX_LCORE);
 	RTE_LOG(INFO, EAL, "Detected %u lcore(s)\n", config->lcore_count);
 
+	/* sort all socket id's in ascending order */
+	qsort(lcore_to_socket_id, RTE_DIM(lcore_to_socket_id),
+			sizeof(lcore_to_socket_id[0]), socket_id_cmp);
+
+	prev_socket_id = -1;
+	config->numa_node_count = 0;
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+		socket_id = lcore_to_socket_id[lcore_id];
+		if (socket_id != prev_socket_id)
+			config->numa_nodes[config->numa_node_count++] =
+					socket_id;
+		prev_socket_id = socket_id;
+	}
+	RTE_LOG(INFO, EAL, "Detected %u NUMA nodes\n", config->numa_node_count);
+
 	return 0;
 }
+
+unsigned int __rte_experimental
+rte_socket_count(void)
+{
+	const struct rte_config *config = rte_eal_get_configuration();
+	return config->numa_node_count;
+}
+
+int __rte_experimental
+rte_socket_id_by_idx(unsigned int idx)
+{
+	const struct rte_config *config = rte_eal_get_configuration();
+	if (idx >= config->numa_node_count) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+	return config->numa_nodes[idx];
+}
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 93ca4cc..991cbe0 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -57,6 +57,8 @@ enum rte_proc_type_t {
 struct rte_config {
 	uint32_t master_lcore;       /**< Id of the master lcore */
 	uint32_t lcore_count;        /**< Number of available logical cores. */
+	uint32_t numa_node_count;    /**< Number of detected NUMA nodes. */
+	uint32_t numa_nodes[RTE_MAX_NUMA_NODES]; /**< List of detected numa nodes. */
 	uint32_t service_lcore_count;/**< Number of available service cores. */
 	enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; /**< State of cores. */
 
diff --git a/lib/librte_eal/common/include/rte_lcore.h b/lib/librte_eal/common/include/rte_lcore.h
index 0472220..7312975 100644
--- a/lib/librte_eal/common/include/rte_lcore.h
+++ b/lib/librte_eal/common/include/rte_lcore.h
@@ -132,6 +132,36 @@ rte_lcore_index(int lcore_id)
 unsigned rte_socket_id(void);
 
 /**
+ * Return number of physical sockets detected on the system.
+ *
+ * Note that the number of nodes may not correspond to their physical id's:
+ * for example, a system may report two socket id's, but the actual socket id's
+ * may be 0 and 8.
+ *
+ * @return
+ *   the number of physical sockets as recognized by EAL
+ */
+unsigned int __rte_experimental
+rte_socket_count(void);
+
+/**
+ * Return socket id with a particular index.
+ *
+ * This will return socket id at a particular position in list of all detected
+ * physical socket id's. For example, on a machine with sockets [0, 8], passing
+ * 1 as a parameter will return 8.
+ *
+ * @param idx
+ *   index of physical socket id to return
+ *
+ * @return
+ *   - physical socket id as recognized by EAL
+ *   - -1 on error, with errno set to EINVAL
+ */
+int __rte_experimental
+rte_socket_id_by_idx(unsigned int idx);
+
+/**
  * Get the ID of the physical socket of the specified lcore
  *
  * @param lcore_id
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 7e5bbe8..b9c7727 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -10,7 +10,7 @@ ARCH_DIR ?= $(RTE_ARCH)
 EXPORT_MAP := ../../rte_eal_version.map
 VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR)
 
-LIBABIVER := 6
+LIBABIVER := 7
 
 VPATH += $(RTE_SDK)/lib/librte_eal/common
 
diff --git a/lib/librte_eal/meson.build b/lib/librte_eal/meson.build
index 15d1c6a..4aa63e3 100644
--- a/lib/librte_eal/meson.build
+++ b/lib/librte_eal/meson.build
@@ -21,7 +21,7 @@ else
 	error('unsupported system type @0@'.format(hostmachine.system()))
 endif
 
-version = 6  # the version of the EAL API
+version = 7  # the version of the EAL API
 allow_experimental_apis = true
 deps += 'compat'
 cflags += '-D_GNU_SOURCE'
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 1d88437..30ec1fc 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -257,5 +257,7 @@ EXPERIMENTAL {
 	rte_service_set_runstate_mapped_check;
 	rte_service_set_stats_enable;
 	rte_service_start_with_defaults;
+	rte_socket_count;
+	rte_socket_id_by_idx;
 
 } DPDK_18.02;
-- 
2.7.4

^ permalink raw reply	[relevance 5%]

* [dpdk-dev] [PATCH v5 1/7] crypto/virtio: add virtio related fundamental functions
  @ 2018-03-31  9:18  2% ` Jay Zhou
  0 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-03-31  9:18 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

Since there is no common virtio library, we have to put
these files here. They are basically the same as the virtio net related files,
with some minor changes.

Meanwhile, add the virtio crypto PMD related release note for 18.05.
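
A quick build note (editor's illustration, based on the config hunk below): the
PMD defaults to disabled, so trying it out would mean flipping the new option
in the build configuration:

	CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO=y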

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Fan Zhang <roy.fan.zhang@intel.com>
---
 config/common_base                     |  14 +
 doc/guides/rel_notes/release_18_05.rst |   6 +
 drivers/crypto/virtio/virtio_logs.h    |  91 +++++++
 drivers/crypto/virtio/virtio_pci.c     | 460 +++++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h     | 253 ++++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h    | 137 ++++++++++
 drivers/crypto/virtio/virtqueue.c      |  43 +++
 drivers/crypto/virtio/virtqueue.h      | 172 ++++++++++++
 8 files changed, 1176 insertions(+)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/config/common_base b/config/common_base
index a842478..bf6bbc7 100644
--- a/config/common_base
+++ b/config/common_base
@@ -486,6 +486,20 @@ CONFIG_RTE_LIBRTE_PMD_QAT_DEBUG_DRIVER=n
 CONFIG_RTE_QAT_PMD_MAX_NB_SESSIONS=2048
 
 #
+# Compile PMD for virtio crypto devices
+#
+CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO=n
+#
+# Number of maximum virtio crypto devices
+#
+CONFIG_RTE_MAX_VIRTIO_CRYPTO=32
+#
+# Number of sessions to create in the session memory pool
+# on a single virtio crypto device.
+#
+CONFIG_RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS=1024
+
+#
 # Compile PMD for AESNI backed device
 #
 CONFIG_RTE_LIBRTE_PMD_AESNI_MB=n
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 0eeabf5..a90c25e 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -53,6 +53,12 @@ New Features
   :doc:`../cryptodevs/ccp` crypto driver guide for more details on
   this new driver.
 
+* **Added the virtio crypto PMD.**
+
+  Added a new virtio crypto PMD, which provides AES-CBC ciphering and
+  AES-CBC with HMAC-SHA1 algorithm-chaining. See the
+  :doc:`../cryptodevs/virtio` crypto driver guide for more details on
+  this new driver.
 
 API Changes
 -----------
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..43ec1a4
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,460 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * The following macros are derived from linux/pci_regs.h; however,
+ * we can't simply include that header here, as there is no such
+ * file on non-Linux platforms.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PFN register is 32bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t aval_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability;
+			 * that's why we also check whether MSI-X is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is an error mapping with VFIO/UIO,
+ *   if there is a port map error when the driver type is KDRV_NONE,
+ *   if the device is whitelisted but the driver type is KDRV_UNKNOWN.
+ * Return 1 if a kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Try to read the virtio PCI caps, which exist only on modern
+	 * PCI devices. If that fails, we would fall back to legacy
+	 * virtio handling.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..cd316a6
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <linux/virtio_crypto.h>
+
+#include <stdint.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure locally
+ * stores some information that may vary between processes in the
+ * multi-process model, for example the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a contiguous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other side, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint the host
+	 * not to interrupt when it consumes packets.
+	 * Note: this is only considered a hint to the host.
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..0a9bddb
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <linux/virtio_crypto.h>
+
+#include <stdint.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring*/
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notification is through an io port operation,
+	 * which is a serializing instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
-- 
1.8.3.1

^ permalink raw reply	[relevance 2%]
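
To make the VIRTIO_RING_F_EVENT_IDX logic in vring_need_event() above more
concrete, here is a small standalone sketch (illustrative only, not part of
the patch) that reproduces the wrap-safe 16-bit comparison and checks a few
cases, including a wrap of the ring indices:

#include <assert.h>
#include <stdint.h>

/* Same predicate as vring_need_event() in virtio_ring.h above: notify the
 * other side only if event_idx lies in [old, new_idx), computed with
 * wrap-safe 16-bit arithmetic.
 */
static int
need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
{
	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
}

int
main(void)
{
	/* Index advanced from 10 to 12; the other side asked to be woken
	 * at 10 or 11, both of which were just crossed.
	 */
	assert(need_event(10, 12, 10));
	assert(need_event(11, 12, 10));
	/* event_idx 12 has not been crossed yet: no notification. */
	assert(!need_event(12, 12, 10));
	/* The comparison also works across the 16-bit index wrap. */
	assert(need_event(0xffff, 0x0001, 0xfffe));
	return 0;
}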

* [dpdk-dev] [PATCH v4 1/7] crypto/virtio: add virtio related fundamental functions
  @ 2018-03-31  7:49  2% ` Jay Zhou
  0 siblings, 0 replies; 200+ results
From: Jay Zhou @ 2018-03-31  7:49 UTC (permalink / raw)
  To: dev
  Cc: pablo.de.lara.guarch, roy.fan.zhang, thomas, arei.gonglei,
	xin.zeng, weidong.huang, wangxinxin.wang, longpeng2,
	jianjay.zhou

Since there is no common virtio library, we have to put these files here.
They are basically the same as the virtio net related files, with some
minor changes.

Meanwhile, add a virtio crypto PMD related release note for 18.05.

Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Fan Zhang <roy.fan.zhang@intel.com>
---
 config/common_base                     |  14 +
 doc/guides/rel_notes/release_18_05.rst |   4 +
 drivers/crypto/virtio/virtio_logs.h    |  91 +++++++
 drivers/crypto/virtio/virtio_pci.c     | 460 +++++++++++++++++++++++++++++++++
 drivers/crypto/virtio/virtio_pci.h     | 253 ++++++++++++++++++
 drivers/crypto/virtio/virtio_ring.h    | 137 ++++++++++
 drivers/crypto/virtio/virtqueue.c      |  43 +++
 drivers/crypto/virtio/virtqueue.h      | 172 ++++++++++++
 8 files changed, 1174 insertions(+)
 create mode 100644 drivers/crypto/virtio/virtio_logs.h
 create mode 100644 drivers/crypto/virtio/virtio_pci.c
 create mode 100644 drivers/crypto/virtio/virtio_pci.h
 create mode 100644 drivers/crypto/virtio/virtio_ring.h
 create mode 100644 drivers/crypto/virtio/virtqueue.c
 create mode 100644 drivers/crypto/virtio/virtqueue.h

diff --git a/config/common_base b/config/common_base
index ee10b44..91d3102 100644
--- a/config/common_base
+++ b/config/common_base
@@ -486,6 +486,20 @@ CONFIG_RTE_LIBRTE_PMD_QAT_DEBUG_DRIVER=n
 CONFIG_RTE_QAT_PMD_MAX_NB_SESSIONS=2048
 
 #
+# Compile PMD for virtio crypto devices
+#
+CONFIG_RTE_LIBRTE_PMD_VIRTIO_CRYPTO=n
+#
+# Number of maximum virtio crypto devices
+#
+CONFIG_RTE_MAX_VIRTIO_CRYPTO=32
+#
+# Number of sessions to create in the session memory pool
+# on a single virtio crypto device.
+#
+CONFIG_RTE_VIRTIO_CRYPTO_PMD_MAX_NB_SESSIONS=1024
+
+#
 # Compile PMD for AESNI backed device
 #
 CONFIG_RTE_LIBRTE_PMD_AESNI_MB=n
diff --git a/doc/guides/rel_notes/release_18_05.rst b/doc/guides/rel_notes/release_18_05.rst
index 3923dc2..32c39d5 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -41,6 +41,10 @@ New Features
      Also, make sure to start the actual text at the margin.
      =========================================================
 
+* **Added Virtio Crypto PMD.**
+
+  Added a new Virtio Crypto PMD, which provides AES-CBC ciphering and AES-CBC
+  with HMAC-SHA1 algorithm-chaining.
 
 API Changes
 -----------
diff --git a/drivers/crypto/virtio/virtio_logs.h b/drivers/crypto/virtio/virtio_logs.h
new file mode 100644
index 0000000..26a286c
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_logs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, RTE_LOGTYPE_PMD, \
+		"PMD: %s(): " fmt "\n", __func__, ##args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+extern int virtio_crypto_logtype_init;
+
+#define VIRTIO_CRYPTO_INIT_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_init, \
+		"INIT: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_INIT_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_INIT_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_session;
+
+#define VIRTIO_CRYPTO_SESSION_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_session, \
+		"SESSION: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_SESSION_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_SESSION_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_rx;
+
+#define VIRTIO_CRYPTO_RX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_rx, \
+		"RX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_RX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_RX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_RX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_tx;
+
+#define VIRTIO_CRYPTO_TX_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_tx, \
+		"TX: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_TX_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_TX_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_TX_LOG_IMPL(ERR, fmt, ## args)
+
+extern int virtio_crypto_logtype_driver;
+
+#define VIRTIO_CRYPTO_DRV_LOG_IMPL(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, virtio_crypto_logtype_driver, \
+		"DRIVER: %s(): " fmt "\n", __func__, ##args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_INFO(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(INFO, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_DBG(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(DEBUG, fmt, ## args)
+
+#define VIRTIO_CRYPTO_DRV_LOG_ERR(fmt, args...) \
+	VIRTIO_CRYPTO_DRV_LOG_IMPL(ERR, fmt, ## args)
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/drivers/crypto/virtio/virtio_pci.c b/drivers/crypto/virtio/virtio_pci.c
new file mode 100644
index 0000000..43ec1a4
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.c
@@ -0,0 +1,460 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * The following macros are derived from linux/pci_regs.h; however,
+ * we can't simply include that header here, as there is no such
+ * file on non-Linux platforms.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PFN register is 32bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("vring address shouldn't be above 16TB!");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_crypto_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_crypto_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_crypto_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_crypto_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_crypto_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_crypto_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_crypto_hw *hw, struct virtqueue *vq,
+		uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_crypto_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("queue %u addresses:", vq->vq_queue_index);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t desc_addr: %" PRIx64, desc_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t aval_addr: %" PRIx64, avail_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t used_addr: %" PRIx64, used_addr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("\t notify addr: %p (notify offset: %u)",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_crypto_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_crypto_hw *hw __rte_unused,
+		struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_crypto_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+void
+vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+		const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+		uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+vtpci_cryptodev_reset(struct virtio_crypto_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw)
+{
+	vtpci_cryptodev_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+vtpci_cryptodev_isr(struct virtio_crypto_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("invalid bar: %u", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("offset(%u) + length(%u) overflows",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR(
+			"invalid cap: overflows bar space: %u > %" PRIu64,
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_ERR("bar %u base addr is NULL", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to map pci device!");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		VIRTIO_CRYPTO_INIT_LOG_DBG("failed to read pci capability list");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			VIRTIO_CRYPTO_INIT_LOG_ERR(
+				"failed to read pci cap at pos: %x", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability;
+			 * that's why we also check whether MSI-X is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			VIRTIO_CRYPTO_INIT_LOG_DBG(
+				"[%2x] skipping non VNDR cap id: %02x",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		VIRTIO_CRYPTO_INIT_LOG_DBG(
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("no modern virtio pci device found.");
+		return -1;
+	}
+
+	VIRTIO_CRYPTO_INIT_LOG_INFO("found modern virtio pci device.");
+
+	VIRTIO_CRYPTO_INIT_LOG_DBG("common cfg mapped at: %p", hw->common_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("device cfg mapped at: %p", hw->dev_cfg);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("isr cfg mapped at: %p", hw->isr);
+	VIRTIO_CRYPTO_INIT_LOG_DBG("notify base: %p, notify off multiplier: %u",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+/*
+ * Return -1:
+ *   if there is an error mapping with VFIO/UIO,
+ *   if there is a port map error when the driver type is KDRV_NONE,
+ *   if the device is whitelisted but the driver type is KDRV_UNKNOWN.
+ * Return 1 if a kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+vtpci_cryptodev_init(struct rte_pci_device *dev, struct virtio_crypto_hw *hw)
+{
+	/*
+	 * Try to read the virtio PCI caps, which exist only on modern
+	 * PCI devices. If that fails, we would fall back to legacy
+	 * virtio handling.
+	 */
+	if (virtio_read_caps(dev, hw) == 0) {
+		VIRTIO_CRYPTO_INIT_LOG_INFO("modern virtio pci detected.");
+		virtio_hw_internal[hw->dev_id].vtpci_ops =
+					&virtio_crypto_modern_ops;
+		hw->modern = 1;
+		return 0;
+	}
+
+	/*
+	 * virtio crypto conforms to virtio 1.0 and doesn't support
+	 * legacy mode
+	 */
+	return -1;
+}
diff --git a/drivers/crypto/virtio/virtio_pci.h b/drivers/crypto/virtio/virtio_pci.h
new file mode 100644
index 0000000..cd316a6
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_pci.h
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <linux/virtio_crypto.h>
+
+#include <stdint.h>
+
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_cryptodev.h>
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_CRYPTO_PCI_VENDORID 0x1AF4
+#define VIRTIO_CRYPTO_PCI_DEVICEID 0x1054
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR            19 /* interrupt status register, reading
+				      * also clears the register (8, RO)
+				      */
+/* Only if MSIX is enabled: */
+
+/* configuration change vector (16, RW) */
+#define VIRTIO_MSI_CONFIG_VECTOR  20
+/* vector for selected VQ notifications */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them?
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;	/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;	/* Generic PCI field: next ptr. */
+	uint8_t cap_len;	/* Generic PCI field: capability length */
+	uint8_t cfg_type;	/* Identifies the structure. */
+	uint8_t bar;		/* Where to find it. */
+	uint8_t padding[3];	/* Pad to full dword. */
+	uint32_t offset;	/* Offset within bar. */
+	uint32_t length;	/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_crypto_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_crypto_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_crypto_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_crypto_hw *hw);
+	void (*set_status)(struct virtio_crypto_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_crypto_hw *hw);
+	void (*set_features)(struct virtio_crypto_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_crypto_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_crypto_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_crypto_hw *hw,
+			struct virtqueue *vq, uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_crypto_hw *hw,
+			uint16_t queue_id);
+	int (*setup_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_crypto_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_crypto_hw {
+	/* control queue */
+	struct virtqueue *cvq;
+	uint16_t    dev_id;
+	uint16_t    max_dataqueues;
+	uint64_t    req_guest_features;
+	uint64_t    guest_features;
+	uint8_t	    use_msix;
+	uint8_t     modern;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	struct virtio_crypto_config *dev_cfg;
+	const struct rte_cryptodev_capabilities *virtio_dev_capabilities;
+};
+
+/*
+ * While virtio_crypto_hw is stored in shared memory, this structure holds
+ * process-local information that may differ in the multi-process model,
+ * for example the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+	struct rte_pci_ioport io;
+};
+
+#define VTPCI_OPS(hw)	(virtio_hw_internal[(hw)->dev_id].vtpci_ops)
+#define VTPCI_IO(hw)	(&virtio_hw_internal[(hw)->dev_id].io)
+
+extern struct virtio_hw_internal virtio_hw_internal[RTE_MAX_VIRTIO_CRYPTO];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+vtpci_with_feature(struct virtio_crypto_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int vtpci_cryptodev_init(struct rte_pci_device *dev,
+	struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_reset(struct virtio_crypto_hw *hw);
+
+void vtpci_cryptodev_reinit_complete(struct virtio_crypto_hw *hw);
+
+uint8_t vtpci_cryptodev_get_status(struct virtio_crypto_hw *hw);
+void vtpci_cryptodev_set_status(struct virtio_crypto_hw *hw, uint8_t status);
+
+uint64_t vtpci_cryptodev_negotiate_features(struct virtio_crypto_hw *hw,
+	uint64_t host_features);
+
+void vtpci_write_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	const void *src, int length);
+
+void vtpci_read_cryptodev_config(struct virtio_crypto_hw *hw, size_t offset,
+	void *dst, int length);
+
+uint8_t vtpci_cryptodev_isr(struct virtio_crypto_hw *hw);
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/crypto/virtio/virtio_ring.h b/drivers/crypto/virtio/virtio_ring.h
new file mode 100644
index 0000000..ee30674
--- /dev/null
+++ b/drivers/crypto/virtio/virtio_ring.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTIO_RING_H_
+#define _VIRTIO_RING_H_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT       1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE      2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT   4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer.  It's unreliable, so it's simply an
+ * optimization.  Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY  1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer.  It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT  1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	uint64_t addr;  /*  Address (guest-physical). */
+	uint32_t len;   /* Length. */
+	uint16_t flags; /* The flags as indicated above. */
+	uint16_t next;  /* We chain unused descriptors via this. */
+};
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[0];
+};
+
+/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+};
+
+struct vring_used {
+	uint16_t flags;
+	volatile uint16_t idx;
+	struct vring_used_elem ring[0];
+};
+
+struct vring {
+	unsigned int num;
+	struct vring_desc  *desc;
+	struct vring_avail *avail;
+	struct vring_used  *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this.  We assume num is a power of 2.
+ *
+ * struct vring {
+ *      // The actual descriptors (16 bytes each)
+ *      struct vring_desc desc[num];
+ *
+ *      // A ring of available descriptor heads with free-running index.
+ *      __u16 avail_flags;
+ *      __u16 avail_idx;
+ *      __u16 available[num];
+ *      __u16 used_event_idx;
+ *
+ *      // Padding to the next align boundary.
+ *      char pad[];
+ *
+ *      // A ring of used descriptor heads with free-running index.
+ *      __u16 used_flags;
+ *      __u16 used_idx;
+ *      struct vring_used_elem used[num];
+ *      __u16 avail_event_idx;
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
+/*
+ * We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility.
+ */
+#define vring_used_event(vr)  ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num])
+
+static inline size_t
+vring_size(unsigned int num, unsigned long align)
+{
+	size_t size;
+
+	size = num * sizeof(struct vring_desc);
+	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+	size = RTE_ALIGN_CEIL(size, align);
+	size += sizeof(struct vring_used) +
+		(num * sizeof(struct vring_used_elem));
+	return size;
+}
+
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+	unsigned long align)
+{
+	vr->num = num;
+	vr->desc = (struct vring_desc *) p;
+	vr->avail = (struct vring_avail *) (p +
+		num * sizeof(struct vring_desc));
+	vr->used = (void *)
+		RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
+}
+
+/*
+ * The following is used with VIRTIO_RING_F_EVENT_IDX.
+ * Assuming a given event_idx value from the other side, if we have
+ * just incremented index from old to new_idx, should we trigger an
+ * event?
+ */
+static inline int
+vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
+}
+
+#endif /* _VIRTIO_RING_H_ */
diff --git a/drivers/crypto/virtio/virtqueue.c b/drivers/crypto/virtio/virtqueue.c
new file mode 100644
index 0000000..fd8be58
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_crypto.h>
+#include <rte_malloc.h>
+
+#include "virtqueue.h"
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	/*
+	 * Set VRING_AVAIL_F_NO_INTERRUPT to hint the host
+	 * not to interrupt when it consumes packets.
+	 * Note: this is only considered a hint by the host.
+	 */
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+void
+virtqueue_detatch_unused(struct virtqueue *vq)
+{
+	struct rte_crypto_op *cop = NULL;
+
+	int idx;
+
+	if (vq != NULL)
+		for (idx = 0; idx < vq->vq_nentries; idx++) {
+			cop = vq->vq_descx[idx].crypto_op;
+			if (cop) {
+				if (cop->sym->m_src)
+					rte_pktmbuf_free(cop->sym->m_src);
+				if (cop->sym->m_dst)
+					rte_pktmbuf_free(cop->sym->m_dst);
+				rte_crypto_op_free(cop);
+				vq->vq_descx[idx].crypto_op = NULL;
+			}
+		}
+}
diff --git a/drivers/crypto/virtio/virtqueue.h b/drivers/crypto/virtio/virtqueue.h
new file mode 100644
index 0000000..0a9bddb
--- /dev/null
+++ b/drivers/crypto/virtio/virtqueue.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 HUAWEI TECHNOLOGIES CO., LTD.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <linux/virtio_crypto.h>
+
+#include <stdint.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+#include "virtio_ring.h"
+#include "virtio_logs.h"
+
+struct rte_mbuf;
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+enum { VTCRYPTO_DATAQ = 0, VTCRYPTO_CTRLQ = 1 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void     *crypto_op;
+	void     *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	/**< virtio_crypto_hw structure pointer. */
+	struct virtio_crypto_hw *hw;
+	/**< mem zone to populate RX ring. */
+	const struct rte_memzone *mz;
+	/**< memzone to populate hdr and request. */
+	struct rte_mempool *mpool;
+	uint8_t     dev_id;              /**< Device identifier. */
+	uint16_t    vq_queue_index;       /**< PCI queue index */
+
+	void        *vq_ring_virt_mem;    /**< linear address of vring*/
+	unsigned int vq_ring_size;
+	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+
+	struct vring vq_ring;    /**< vring keeping desc, used and avail */
+	uint16_t    vq_free_cnt; /**< num of desc available */
+	uint16_t    vq_nentries; /**< vring desc numbers */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_avail_idx;
+
+	/* Statistics */
+	uint64_t	packets_sent_total;
+	uint64_t	packets_sent_failed;
+	uint64_t	packets_received_total;
+	uint64_t	packets_received_failed;
+
+	uint16_t  *notify_addr;
+
+	struct vq_desc_extra vq_descx[0];
+};
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+void virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) \
+	((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notification is through an I/O port operation,
+	 * which is itself a serializing instruction.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+/**
+ * Dump virtqueue internal structures, for debug purpose only.
+ */
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	VIRTIO_CRYPTO_INIT_LOG_DBG(\
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+
+#endif /* _VIRTQUEUE_H_ */
-- 
1.8.3.1

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v4 2/5] vhost: support selective datapath
  @ 2018-03-31  6:10  3%     ` Maxime Coquelin
  2018-04-02  1:58  0%       ` Wang, Zhihong
  0 siblings, 1 reply; 200+ results
From: Maxime Coquelin @ 2018-03-31  6:10 UTC (permalink / raw)
  To: Zhihong Wang, dev
  Cc: jianfeng.tan, tiwei.bie, yliu, cunming.liang, xiao.w.wang, dan.daly



On 03/10/2018 11:01 AM, Zhihong Wang wrote:
> This patch set introduces support for selective datapath in DPDK vhost-user
> lib. vDPA stands for vhost Data Path Acceleration. The idea is to support
> virtio ring compatible devices to serve virtio driver directly to enable
> datapath acceleration.
> 
> A set of device ops is defined for device specific operations:
> 
>       a. queue_num_get: Called to get supported queue number of the device.
> 
>       b. feature_get: Called to get supported features of the device.
> 
>       c. protocol_feature_get: Called to get supported protocol features of
>          the device.
> 
>       d. dev_conf: Called to configure the actual device when the virtio
>          device becomes ready.
> 
>       e. dev_close: Called to close the actual device when the virtio device
>          is stopped.
> 
>       f. vring_state_set: Called to change the state of the vring in the
>          actual device when vring state changes.
> 
>       g. feature_set: Called to set the negotiated features to device.
> 
>       h. migration_done: Called to allow the device to respond to RARP
>          sending.
> 
>       i. get_vfio_group_fd: Called to get the VFIO group fd of the device.
> 
>       j. get_vfio_device_fd: Called to get the VFIO device fd of the device.
> 
>       k. get_notify_area: Called to get the notify area info of the queue.
> 
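For illustration, a vDPA backend driver could hook into the ops listed
above roughly as follows (sketch only; the my_* names are placeholders
and only one callback body is shown):

#include <rte_common.h>
#include <rte_vdpa.h>

static int
my_queue_num_get(int did, uint32_t *queue_num)
{
	RTE_SET_USED(did);
	*queue_num = 1;	/* report the hardware's queue capability here */
	return 0;
}

static struct rte_vdpa_dev_ops my_vdpa_ops = {
	.queue_num_get = my_queue_num_get,
	/* .feature_get, .dev_conf, .dev_close, ... filled in likewise */
};

static int
my_register(const struct rte_pci_addr *pci_addr)
{
	struct rte_vdpa_dev_addr addr = {
		.type = PCI_ADDR,
		.pci_addr = *pci_addr,
	};

	/* returns the device id (did) used by the vhost library, or -1 */
	return rte_vdpa_register_device(&addr, &my_vdpa_ops);
}
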
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---
> Changes in v4:
> 
>   1. Remove the "engine" concept in the lib.
> 
> ---
> Changes in v2:
> 
>   1. Add VFIO related vDPA device ops.
> 
>   lib/librte_vhost/Makefile              |  4 +-
>   lib/librte_vhost/rte_vdpa.h            | 94 +++++++++++++++++++++++++++++++++
>   lib/librte_vhost/rte_vhost_version.map |  6 +++
>   lib/librte_vhost/vdpa.c                | 96 ++++++++++++++++++++++++++++++++++
>   4 files changed, 198 insertions(+), 2 deletions(-)
>   create mode 100644 lib/librte_vhost/rte_vdpa.h
>   create mode 100644 lib/librte_vhost/vdpa.c
> 
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index 5d6c6abae..37044ac03 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -22,9 +22,9 @@ LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net
>   
>   # all source are stored in SRCS-y
>   SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
> -					vhost_user.c virtio_net.c
> +					vhost_user.c virtio_net.c vdpa.c
>   
>   # install includes
> -SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h
> +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
>   
>   include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/lib/librte_vhost/rte_vdpa.h b/lib/librte_vhost/rte_vdpa.h
> new file mode 100644
> index 000000000..a4bbbd93d
> --- /dev/null
> +++ b/lib/librte_vhost/rte_vdpa.h
> @@ -0,0 +1,94 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2018 Intel Corporation
> + */
> +
> +#ifndef _RTE_VDPA_H_
> +#define _RTE_VDPA_H_
> +
> +/**
> + * @file
> + *
> + * Device specific vhost lib
> + */
> +
> +#include <rte_pci.h>
> +#include "rte_vhost.h"
> +
> +#define MAX_VDPA_NAME_LEN 128
> +
> +enum vdpa_addr_type {
> +	PCI_ADDR,
> +	VDPA_ADDR_MAX
> +};
> +
> +struct rte_vdpa_dev_addr {
> +	enum vdpa_addr_type type;
> +	union {
> +		uint8_t __dummy[64];
> +		struct rte_pci_addr pci_addr;
> +	};
> +};
> +
> +/* Get capabilities of this device */
> +typedef int (*vdpa_dev_queue_num_get_t)(int did, uint32_t *queue_num);
> +typedef int (*vdpa_dev_feature_get_t)(int did, uint64_t *features);
> +
> +/* Driver configure/close the device */
> +typedef int (*vdpa_dev_conf_t)(int vid);
> +typedef int (*vdpa_dev_close_t)(int vid);
> +
> +/* Enable/disable this vring */
> +typedef int (*vdpa_vring_state_set_t)(int vid, int vring, int state);
> +
> +/* Set features when changed */
> +typedef int (*vdpa_feature_set_t)(int vid);
> +
> +/* Destination operations when migration done */
> +typedef int (*vdpa_migration_done_t)(int vid);
> +
> +/* Get the vfio group fd */
> +typedef int (*vdpa_get_vfio_group_fd_t)(int vid);
> +
> +/* Get the vfio device fd */
> +typedef int (*vdpa_get_vfio_device_fd_t)(int vid);
> +
> +/* Get the notify area info of the queue */
> +typedef int (*vdpa_get_notify_area_t)(int vid, int qid, uint64_t *offset,
> +		uint64_t *size);
> +/* Device ops */
> +struct rte_vdpa_dev_ops {
> +	vdpa_dev_queue_num_get_t  queue_num_get;
> +	vdpa_dev_feature_get_t    feature_get;
> +	vdpa_dev_feature_get_t    protocol_feature_get;
> +	vdpa_dev_conf_t           dev_conf;
> +	vdpa_dev_close_t          dev_close;
> +	vdpa_vring_state_set_t    vring_state_set;
> +	vdpa_feature_set_t        feature_set;
> +	vdpa_migration_done_t     migration_done;
> +	vdpa_get_vfio_group_fd_t  get_vfio_group_fd;
> +	vdpa_get_vfio_device_fd_t get_vfio_device_fd;
> +	vdpa_get_notify_area_t    get_notify_area;

Maybe you could reserve some room here to avoid breaking the ABI in the
future if we need to add some optional ops.
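Something along these lines, for example (sketch only, the amount of
reserved space is arbitrary):

/* Device ops */
struct rte_vdpa_dev_ops {
	vdpa_dev_queue_num_get_t  queue_num_get;
	vdpa_dev_feature_get_t    feature_get;
	vdpa_dev_feature_get_t    protocol_feature_get;
	vdpa_dev_conf_t           dev_conf;
	vdpa_dev_close_t          dev_close;
	vdpa_vring_state_set_t    vring_state_set;
	vdpa_feature_set_t        feature_set;
	vdpa_migration_done_t     migration_done;
	vdpa_get_vfio_group_fd_t  get_vfio_group_fd;
	vdpa_get_vfio_device_fd_t get_vfio_device_fd;
	vdpa_get_notify_area_t    get_notify_area;

	/* Reserved for future extensions; keeping these NULL for now
	 * preserves the struct size, so adding optional ops later does
	 * not change the ABI.
	 */
	void *reserved[16];
};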

> +};
> +
> +struct rte_vdpa_device {
> +	struct rte_vdpa_dev_addr addr;
> +	struct rte_vdpa_dev_ops *ops;
> +} __rte_cache_aligned;
> +
> +extern struct rte_vdpa_device *vdpa_devices[];
> +extern uint32_t vdpa_device_num;
> +
> +/* Register a vdpa device, return did if successful, -1 on failure */
> +int __rte_experimental
> +rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr,
> +		struct rte_vdpa_dev_ops *ops);
> +
> +/* Unregister a vdpa device, return -1 on failure */
> +int __rte_experimental
> +rte_vdpa_unregister_device(int did);
> +
> +/* Find did of a vdpa device, return -1 on failure */
> +int __rte_experimental
> +rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr);
> +
> +#endif /* _RTE_VDPA_H_ */
> diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
> index df0103129..7bcffb490 100644
> --- a/lib/librte_vhost/rte_vhost_version.map
> +++ b/lib/librte_vhost/rte_vhost_version.map
> @@ -59,3 +59,9 @@ DPDK_18.02 {
>   	rte_vhost_vring_call;
>   
>   } DPDK_17.08;
> +
> +EXPERIMENTAL {
> +	rte_vdpa_register_device;
> +	rte_vdpa_unregister_device;
> +	rte_vdpa_find_device_id;

I think you also need to declare the new structs here,
not only the new functions.

> +} DPDK_18.02;
> diff --git a/lib/librte_vhost/vdpa.c b/lib/librte_vhost/vdpa.c
> new file mode 100644
> index 000000000..0c950d45f
> --- /dev/null
> +++ b/lib/librte_vhost/vdpa.c
> @@ -0,0 +1,96 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2018 Intel Corporation
> + */
> +
> +/**
> + * @file
> + *
> + * Device specific vhost lib
> + */
> +
> +#include <stdbool.h>
> +
> +#include <rte_malloc.h>
> +#include "rte_vdpa.h"
> +#include "vhost.h"
> +
> +struct rte_vdpa_device *vdpa_devices[MAX_VHOST_DEVICE];
> +uint32_t vdpa_device_num;
> +
> +static int is_same_vdpa_dev_addr(struct rte_vdpa_dev_addr *a,
> +		struct rte_vdpa_dev_addr *b)
> +{

Given the boolean nature of the function name, I would return 1 if same
device, 0 if different.
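I.e. something like (illustrative only):

static int
is_same_vdpa_dev_addr(const struct rte_vdpa_dev_addr *a,
		const struct rte_vdpa_dev_addr *b)
{
	if (a->type != b->type)
		return 0;

	switch (a->type) {
	case PCI_ADDR:
		return a->pci_addr.domain == b->pci_addr.domain &&
			a->pci_addr.bus == b->pci_addr.bus &&
			a->pci_addr.devid == b->pci_addr.devid &&
			a->pci_addr.function == b->pci_addr.function;
	default:
		/* unknown address types are not considered equal here */
		return 0;
	}
}

The caller in rte_vdpa_find_device_id() would then test the result
directly instead of comparing against 0.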

> +	int ret = 0;
> +
> +	if (a->type != b->type)
> +		return -1;
> +
> +	switch (a->type) {
> +	case PCI_ADDR:
> +		if (a->pci_addr.domain != b->pci_addr.domain ||
> +				a->pci_addr.bus != b->pci_addr.bus ||
> +				a->pci_addr.devid != b->pci_addr.devid ||
> +				a->pci_addr.function != b->pci_addr.function)
> +			ret = -1;
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
> +int rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr,
> +		struct rte_vdpa_dev_ops *ops)
> +{
> +	struct rte_vdpa_device *dev;
> +	char device_name[MAX_VDPA_NAME_LEN];
> +	int i;
> +
> +	if (vdpa_device_num >= MAX_VHOST_DEVICE)
> +		return -1;
> +
> +	for (i = 0; i < MAX_VHOST_DEVICE; i++) {
> +		if (vdpa_devices[i] == NULL)
> +			break;
You might want to check that the same device isn't being registered a second
time, and return an error in that case.

This is not a blocker though, and can be done in a dedicated patch.
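For reference, a minimal form of that check could simply reuse the lookup
helper added below (illustrative only):

	if (rte_vdpa_find_device_id(addr) >= 0)
		return -1;	/* device already registered */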

> +	}
> +
> +	sprintf(device_name, "vdpa-dev-%d", i);
> +	dev = rte_zmalloc(device_name, sizeof(struct rte_vdpa_device),
> +			RTE_CACHE_LINE_SIZE);
> +	if (!dev)
> +		return -1;
> +
> +	memcpy(&dev->addr, addr, sizeof(struct rte_vdpa_dev_addr));
> +	dev->ops = ops;
> +	vdpa_devices[i] = dev;
> +	vdpa_device_num++;
> +
> +	return i;
> +}
> +
> +int rte_vdpa_unregister_device(int did)
> +{
> +	if (did < 0 || did >= MAX_VHOST_DEVICE || vdpa_devices[did] == NULL)
> +		return -1;
> +
> +	rte_free(vdpa_devices[did]);
> +	vdpa_devices[did] = NULL;
> +	vdpa_device_num--;
> +
> +	return did;
> +}
> +
> +int rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr)
> +{
> +	struct rte_vdpa_device *dev;
> +	int i;
> +
> +	for (i = 0; i < MAX_VHOST_DEVICE; ++i) {
> +		dev = vdpa_devices[i];
> +		if (dev && is_same_vdpa_dev_addr(&dev->addr, addr) == 0)
> +			return i;
> +	}
> +
> +	return -1;
> +}
> 

^ permalink raw reply	[relevance 3%]

* Re: [dpdk-dev] [PATCH v2 0/4] ethdev: add per-PMD tuning of RxTx parmeters
  2018-03-30 10:34  0%   ` Ferruh Yigit
@ 2018-03-31  0:05  0%     ` Thomas Monjalon
  0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2018-03-31  0:05 UTC (permalink / raw)
  To: Ferruh Yigit, Remy Horton
  Cc: dev, John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang,
	Beilei Xing, Shreyansh Jain

30/03/2018 12:34, Ferruh Yigit:
> On 3/27/2018 7:43 PM, Ferruh Yigit wrote:
> > On 3/21/2018 2:27 PM, Remy Horton wrote:
> >> The optimal values of several transmission & reception related parameters,
> >> such as burst sizes, descriptor ring sizes, and number of queues, vary
> >> between different network interface devices. This patchset allows individual
> >> PMDs to specify their preferred parameter values, and if so indicated by an
> >> application, for them to be used automatically by the ethdev layer.
> >>
> >> rte_eth_dev_configure() has been changed so that specifying zero for both
> >> nb_rx_q AND nb_tx_q causes it to use driver preferred values, and if these
> >> are not available, falls back to EAL defaults. Setting one (but not both)
> >> to zero does not cause the use of defaults, as having one of them zeroed is
> >> a valid setup.
> >>
> >> This RFC/V1 includes per-PMD values for e1000 and i40e but it is expected
> >> that subsequent patchsets will cover other PMDs. A deprecation notice
> >> covering the API/ABI change is in place.
> >>
> >>
> >> Changes in v2:
> >> * Rebased to 
> >> * Removed fallback values from rte_eth_dev_info_get()
> >> * Added fallback values to rte_rte_[rt]x_queue_setup()
> >> * Added fallback values to rte_eth_dev_configure()
> >> * Corrected comment
> >> * Removed deprecation notice
> >> * Split RX and Tx into seperate structures
> >> * Changed parameter names
> >>
> >>
> >> Remy Horton (4):
> >>   ethdev: add support for PMD-tuned Tx/Rx parameters
> >>   net/e1000: add TxRx tuning parameters
> >>   net/i40e: add TxRx tuning parameters
> >>   testpmd: make use of per-PMD TxRx parameters
> > 
> > Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
> 
> Series applied to dpdk-next-net/master, thanks.

I prefer not pulling this series in master and give a chance to have
a more complete v3 for testpmd and examples.

^ permalink raw reply	[relevance 0%]

* [dpdk-dev] [PATCH v2 2/7] bpf: add BPF loading and execution framework
  @ 2018-03-30 17:32  2% ` Konstantin Ananyev
  0 siblings, 0 replies; 200+ results
From: Konstantin Ananyev @ 2018-03-30 17:32 UTC (permalink / raw)
  To: dev; +Cc: Konstantin Ananyev

librte_bpf provides a framework to load and execute eBPF bytecode
inside user-space dpdk based applications.
It supports a basic set of features from the eBPF spec
(https://www.kernel.org/doc/Documentation/networking/filter.txt).

Not currently supported features:
 - JIT
 - cBPF
 - tail-pointer call
 - eBPF MAP
 - skb

It also adds dependency on libelf.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 config/common_base                 |   5 +
 lib/Makefile                       |   2 +
 lib/librte_bpf/Makefile            |  30 +++
 lib/librte_bpf/bpf.c               |  59 +++++
 lib/librte_bpf/bpf_exec.c          | 452 +++++++++++++++++++++++++++++++++++++
 lib/librte_bpf/bpf_impl.h          |  41 ++++
 lib/librte_bpf/bpf_load.c          | 385 +++++++++++++++++++++++++++++++
 lib/librte_bpf/bpf_validate.c      |  55 +++++
 lib/librte_bpf/meson.build         |  18 ++
 lib/librte_bpf/rte_bpf.h           | 160 +++++++++++++
 lib/librte_bpf/rte_bpf_version.map |  12 +
 lib/meson.build                    |   2 +-
 mk/rte.app.mk                      |   2 +
 13 files changed, 1222 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_bpf/Makefile
 create mode 100644 lib/librte_bpf/bpf.c
 create mode 100644 lib/librte_bpf/bpf_exec.c
 create mode 100644 lib/librte_bpf/bpf_impl.h
 create mode 100644 lib/librte_bpf/bpf_load.c
 create mode 100644 lib/librte_bpf/bpf_validate.c
 create mode 100644 lib/librte_bpf/meson.build
 create mode 100644 lib/librte_bpf/rte_bpf.h
 create mode 100644 lib/librte_bpf/rte_bpf_version.map

diff --git a/config/common_base b/config/common_base
index ee10b449b..97b60f9ff 100644
--- a/config/common_base
+++ b/config/common_base
@@ -827,3 +827,8 @@ CONFIG_RTE_APP_CRYPTO_PERF=y
 # Compile the eventdev application
 #
 CONFIG_RTE_APP_EVENTDEV=y
+
+#
+# Compile librte_bpf
+#
+CONFIG_RTE_LIBRTE_BPF=y
diff --git a/lib/Makefile b/lib/Makefile
index ec965a606..a4a2329f9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -97,6 +97,8 @@ DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ether
 DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
 DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ether librte_net
 DEPDIRS-librte_gso += librte_mempool
+DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf
+DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf librte_ether
 
 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
 DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_bpf/Makefile b/lib/librte_bpf/Makefile
new file mode 100644
index 000000000..e0f434e77
--- /dev/null
+++ b/lib/librte_bpf/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_bpf.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+LDLIBS += -lrte_net -lrte_eal
+LDLIBS += -lrte_mempool -lrte_ring
+LDLIBS += -lrte_mbuf -lrte_ethdev
+LDLIBS += -lelf
+
+EXPORT_MAP := rte_bpf_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_exec.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_load.c
+SRCS-$(CONFIG_RTE_LIBRTE_BPF) += bpf_validate.c
+
+# install header files
+SYMLINK-$(CONFIG_RTE_LIBRTE_BPF)-include += rte_bpf.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_bpf/bpf.c b/lib/librte_bpf/bpf.c
new file mode 100644
index 000000000..d7f68c017
--- /dev/null
+++ b/lib/librte_bpf/bpf.c
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#include "bpf_impl.h"
+
+int rte_bpf_logtype;
+
+__rte_experimental void
+rte_bpf_destroy(struct rte_bpf *bpf)
+{
+	if (bpf != NULL) {
+		if (bpf->jit.func != NULL)
+			munmap(bpf->jit.func, bpf->jit.sz);
+		munmap(bpf, bpf->sz);
+	}
+}
+
+__rte_experimental int
+rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit)
+{
+	if (bpf == NULL || jit == NULL)
+		return -EINVAL;
+
+	jit[0] = bpf->jit;
+	return 0;
+}
+
+int
+bpf_jit(struct rte_bpf *bpf)
+{
+	int32_t rc;
+
+	rc = -ENOTSUP;
+	if (rc != 0)
+		RTE_BPF_LOG(WARNING, "%s(%p) failed, error code: %d;\n",
+			__func__, bpf, rc);
+	return rc;
+}
+
+RTE_INIT(rte_bpf_init_log);
+
+static void
+rte_bpf_init_log(void)
+{
+	rte_bpf_logtype = rte_log_register("lib.bpf");
+	if (rte_bpf_logtype >= 0)
+		rte_log_set_level(rte_bpf_logtype, RTE_LOG_INFO);
+}
diff --git a/lib/librte_bpf/bpf_exec.c b/lib/librte_bpf/bpf_exec.c
new file mode 100644
index 000000000..0382ade98
--- /dev/null
+++ b/lib/librte_bpf/bpf_exec.c
@@ -0,0 +1,452 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_byteorder.h>
+
+#include "bpf_impl.h"
+
+#define BPF_JMP_UNC(ins)	((ins) += (ins)->off)
+
+#define BPF_JMP_CND_REG(reg, ins, op, type)	\
+	((ins) += \
+		((type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg]) ? \
+		(ins)->off : 0)
+
+#define BPF_JMP_CND_IMM(reg, ins, op, type)	\
+	((ins) += \
+		((type)(reg)[(ins)->dst_reg] op (type)(ins)->imm) ? \
+		(ins)->off : 0)
+
+#define BPF_NEG_ALU(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(-(reg)[(ins)->dst_reg]))
+
+#define BPF_MOV_ALU_REG(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(reg)[(ins)->src_reg])
+
+#define BPF_OP_ALU_REG(reg, ins, op, type)	\
+	((reg)[(ins)->dst_reg] = \
+		(type)(reg)[(ins)->dst_reg] op (type)(reg)[(ins)->src_reg])
+
+#define BPF_MOV_ALU_IMM(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = (type)(ins)->imm)
+
+#define BPF_OP_ALU_IMM(reg, ins, op, type)	\
+	((reg)[(ins)->dst_reg] = \
+		(type)(reg)[(ins)->dst_reg] op (type)(ins)->imm)
+
+#define BPF_DIV_ZERO_CHECK(bpf, reg, ins, type) do { \
+	if ((type)(reg)[(ins)->src_reg] == 0) { \
+		RTE_BPF_LOG(ERR, \
+			"%s(%p): division by 0 at pc: %#zx;\n", \
+			__func__, bpf, \
+			(uintptr_t)(ins) - (uintptr_t)(bpf)->prm.ins); \
+		return 0; \
+	} \
+} while (0)
+
+#define BPF_LD_REG(reg, ins, type)	\
+	((reg)[(ins)->dst_reg] = \
+		*(type *)(uintptr_t)((reg)[(ins)->src_reg] + (ins)->off))
+
+#define BPF_ST_IMM(reg, ins, type)	\
+	(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \
+		(type)(ins)->imm)
+
+#define BPF_ST_REG(reg, ins, type)	\
+	(*(type *)(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off) = \
+		(type)(reg)[(ins)->src_reg])
+
+#define BPF_ST_XADD_REG(reg, ins, tp)	\
+	(rte_atomic##tp##_add((rte_atomic##tp##_t *) \
+		(uintptr_t)((reg)[(ins)->dst_reg] + (ins)->off), \
+		reg[ins->src_reg]))
+
+static inline void
+bpf_alu_be(uint64_t reg[MAX_BPF_REG], const struct bpf_insn *ins)
+{
+	uint64_t *v;
+
+	v = reg + ins->dst_reg;
+	switch (ins->imm) {
+	case 16:
+		*v = rte_cpu_to_be_16(*v);
+		break;
+	case 32:
+		*v = rte_cpu_to_be_32(*v);
+		break;
+	case 64:
+		*v = rte_cpu_to_be_64(*v);
+		break;
+	}
+}
+
+static inline void
+bpf_alu_le(uint64_t reg[MAX_BPF_REG], const struct bpf_insn *ins)
+{
+	uint64_t *v;
+
+	v = reg + ins->dst_reg;
+	switch (ins->imm) {
+	case 16:
+		*v = rte_cpu_to_le_16(*v);
+		break;
+	case 32:
+		*v = rte_cpu_to_le_32(*v);
+		break;
+	case 64:
+		*v = rte_cpu_to_le_64(*v);
+		break;
+	}
+}
+
+static inline uint64_t
+bpf_exec(const struct rte_bpf *bpf, uint64_t reg[MAX_BPF_REG])
+{
+	const struct bpf_insn *ins;
+
+	for (ins = bpf->prm.ins; ; ins++) {
+		switch (ins->code) {
+		/* 32 bit ALU IMM operations */
+		case (BPF_ALU | BPF_ADD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, +, uint32_t);
+			break;
+		case (BPF_ALU | BPF_SUB | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, -, uint32_t);
+			break;
+		case (BPF_ALU | BPF_AND | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, &, uint32_t);
+			break;
+		case (BPF_ALU | BPF_OR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, |, uint32_t);
+			break;
+		case (BPF_ALU | BPF_LSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, <<, uint32_t);
+			break;
+		case (BPF_ALU | BPF_RSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, uint32_t);
+			break;
+		case (BPF_ALU | BPF_XOR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, ^, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MUL | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, *, uint32_t);
+			break;
+		case (BPF_ALU | BPF_DIV | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, /, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, %, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOV | BPF_K):
+			BPF_MOV_ALU_IMM(reg, ins, uint32_t);
+			break;
+		/* 32 bit ALU REG operations */
+		case (BPF_ALU | BPF_ADD | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, +, uint32_t);
+			break;
+		case (BPF_ALU | BPF_SUB | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, -, uint32_t);
+			break;
+		case (BPF_ALU | BPF_AND | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, &, uint32_t);
+			break;
+		case (BPF_ALU | BPF_OR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, |, uint32_t);
+			break;
+		case (BPF_ALU | BPF_LSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, <<, uint32_t);
+			break;
+		case (BPF_ALU | BPF_RSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, uint32_t);
+			break;
+		case (BPF_ALU | BPF_XOR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, ^, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MUL | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, *, uint32_t);
+			break;
+		case (BPF_ALU | BPF_DIV | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t);
+			BPF_OP_ALU_REG(reg, ins, /, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOD | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint32_t);
+			BPF_OP_ALU_REG(reg, ins, %, uint32_t);
+			break;
+		case (BPF_ALU | BPF_MOV | BPF_X):
+			BPF_MOV_ALU_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_ALU | BPF_NEG):
+			BPF_NEG_ALU(reg, ins, uint32_t);
+			break;
+		case (BPF_ALU | BPF_END | BPF_TO_BE):
+			bpf_alu_be(reg, ins);
+			break;
+		case (BPF_ALU | BPF_END | BPF_TO_LE):
+			bpf_alu_le(reg, ins);
+			break;
+		/* 64 bit ALU IMM operations */
+		case (BPF_ALU64 | BPF_ADD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, +, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_SUB | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, -, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_AND | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, &, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_OR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, |, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_LSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, <<, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_RSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_ARSH | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, >>, int64_t);
+			break;
+		case (BPF_ALU64 | BPF_XOR | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, ^, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MUL | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, *, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_DIV | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, /, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOD | BPF_K):
+			BPF_OP_ALU_IMM(reg, ins, %, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOV | BPF_K):
+			BPF_MOV_ALU_IMM(reg, ins, uint64_t);
+			break;
+		/* 64 bit ALU REG operations */
+		case (BPF_ALU64 | BPF_ADD | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, +, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_SUB | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, -, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_AND | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, &, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_OR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, |, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_LSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, <<, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_RSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_ARSH | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, >>, int64_t);
+			break;
+		case (BPF_ALU64 | BPF_XOR | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, ^, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MUL | BPF_X):
+			BPF_OP_ALU_REG(reg, ins, *, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_DIV | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t);
+			BPF_OP_ALU_REG(reg, ins, /, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOD | BPF_X):
+			BPF_DIV_ZERO_CHECK(bpf, reg, ins, uint64_t);
+			BPF_OP_ALU_REG(reg, ins, %, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_MOV | BPF_X):
+			BPF_MOV_ALU_REG(reg, ins, uint64_t);
+			break;
+		case (BPF_ALU64 | BPF_NEG):
+			BPF_NEG_ALU(reg, ins, uint64_t);
+			break;
+		/* load instructions */
+		case (BPF_LDX | BPF_MEM | BPF_B):
+			BPF_LD_REG(reg, ins, uint8_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_H):
+			BPF_LD_REG(reg, ins, uint16_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_W):
+			BPF_LD_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_LDX | BPF_MEM | BPF_DW):
+			BPF_LD_REG(reg, ins, uint64_t);
+			break;
+		/* load 64 bit immediate value */
+		case (BPF_LD | BPF_IMM | BPF_DW):
+			reg[ins->dst_reg] = (uint32_t)ins[0].imm |
+				(uint64_t)(uint32_t)ins[1].imm << 32;
+			ins++;
+			break;
+		/* store instructions */
+		case (BPF_STX | BPF_MEM | BPF_B):
+			BPF_ST_REG(reg, ins, uint8_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_H):
+			BPF_ST_REG(reg, ins, uint16_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_W):
+			BPF_ST_REG(reg, ins, uint32_t);
+			break;
+		case (BPF_STX | BPF_MEM | BPF_DW):
+			BPF_ST_REG(reg, ins, uint64_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_B):
+			BPF_ST_IMM(reg, ins, uint8_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_H):
+			BPF_ST_IMM(reg, ins, uint16_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_W):
+			BPF_ST_IMM(reg, ins, uint32_t);
+			break;
+		case (BPF_ST | BPF_MEM | BPF_DW):
+			BPF_ST_IMM(reg, ins, uint64_t);
+			break;
+		/* atomic add instructions */
+		case (BPF_STX | BPF_XADD | BPF_W):
+			BPF_ST_XADD_REG(reg, ins, 32);
+			break;
+		case (BPF_STX | BPF_XADD | BPF_DW):
+			BPF_ST_XADD_REG(reg, ins, 64);
+			break;
+		/* jump instructions */
+		case (BPF_JMP | BPF_JA):
+			BPF_JMP_UNC(ins);
+			break;
+		/* jump IMM instructions */
+		case (BPF_JMP | BPF_JEQ | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, ==, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JNE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, !=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JSGT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLT | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSGE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, >=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLE | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, <=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSET | BPF_K):
+			BPF_JMP_CND_IMM(reg, ins, &, uint64_t);
+			break;
+		/* jump REG instructions */
+		case (BPF_JMP | BPF_JEQ | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, ==, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JNE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, !=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JGE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JLE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <=, uint64_t);
+			break;
+		case (BPF_JMP | BPF_JSGT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLT | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSGE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, >=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSLE | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, <=, int64_t);
+			break;
+		case (BPF_JMP | BPF_JSET | BPF_X):
+			BPF_JMP_CND_REG(reg, ins, &, uint64_t);
+			break;
+		/* call instructions */
+		case (BPF_JMP | BPF_CALL):
+			reg[BPF_REG_0] = bpf->prm.xsym[ins->imm].func(
+				reg[BPF_REG_1], reg[BPF_REG_2], reg[BPF_REG_3],
+				reg[BPF_REG_4], reg[BPF_REG_5]);
+			break;
+		/* return instruction */
+		case (BPF_JMP | BPF_EXIT):
+			return reg[BPF_REG_0];
+		default:
+			RTE_BPF_LOG(ERR,
+				"%s(%p): invalid opcode %#x at pc: %#zx;\n",
+				__func__, bpf, ins->code,
+				(uintptr_t)ins - (uintptr_t)bpf->prm.ins);
+			return 0;
+		}
+	}
+
+	/* should never be reached */
+	RTE_VERIFY(0);
+	return 0;
+}
+
+__rte_experimental uint32_t
+rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
+	uint32_t num)
+{
+	uint32_t i;
+	uint64_t reg[MAX_BPF_REG];
+	uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)];
+
+	for (i = 0; i != num; i++) {
+
+		reg[BPF_REG_1] = (uintptr_t)ctx[i];
+		reg[BPF_REG_10] = (uintptr_t)(stack + RTE_DIM(stack));
+
+		rc[i] = bpf_exec(bpf, reg);
+	}
+
+	return i;
+}
+
+__rte_experimental uint64_t
+rte_bpf_exec(const struct rte_bpf *bpf, void *ctx)
+{
+	uint64_t rc;
+
+	rte_bpf_exec_burst(bpf, &ctx, &rc, 1);
+	return rc;
+}
diff --git a/lib/librte_bpf/bpf_impl.h b/lib/librte_bpf/bpf_impl.h
new file mode 100644
index 000000000..5d7e65c31
--- /dev/null
+++ b/lib/librte_bpf/bpf_impl.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _BPF_H_
+#define _BPF_H_
+
+#include <rte_bpf.h>
+#include <sys/mman.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_BPF_STACK_SIZE	0x200
+
+struct rte_bpf {
+	struct rte_bpf_prm prm;
+	struct rte_bpf_jit jit;
+	size_t sz;
+	uint32_t stack_sz;
+};
+
+extern int bpf_validate(struct rte_bpf *bpf);
+
+extern int bpf_jit(struct rte_bpf *bpf);
+
+#ifdef RTE_ARCH_X86_64
+extern int bpf_jit_x86(struct rte_bpf *);
+#endif
+
+extern int rte_bpf_logtype;
+
+#define	RTE_BPF_LOG(lvl, fmt, args...) \
+	rte_log(RTE_LOG_## lvl, rte_bpf_logtype, fmt, ##args)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BPF_H_ */
diff --git a/lib/librte_bpf/bpf_load.c b/lib/librte_bpf/bpf_load.c
new file mode 100644
index 000000000..e1ff5714a
--- /dev/null
+++ b/lib/librte_bpf/bpf_load.c
@@ -0,0 +1,385 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <fcntl.h>
+
+#include <libelf.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_byteorder.h>
+#include <rte_errno.h>
+
+#include "bpf_impl.h"
+
+/* To overcome compatibility issue */
+#ifndef EM_BPF
+#define	EM_BPF	247
+#endif
+
+static uint32_t
+bpf_find_xsym(const char *sn, enum rte_bpf_xtype type,
+	const struct rte_bpf_xsym fp[], uint32_t fn)
+{
+	uint32_t i;
+
+	if (sn == NULL || fp == NULL)
+		return UINT32_MAX;
+
+	for (i = 0; i != fn; i++) {
+		if (fp[i].type == type && strcmp(sn, fp[i].name) == 0)
+			break;
+	}
+
+	return (i != fn) ? i : UINT32_MAX;
+}
+
+/*
+ * update BPF code at offset *ofs* with a proper address(index) for external
+ * symbol *sn*
+ */
+static int
+resolve_xsym(const char *sn, size_t ofs, struct bpf_insn *ins, size_t ins_sz,
+	const struct rte_bpf_prm *prm)
+{
+	uint32_t idx, fidx;
+	enum rte_bpf_xtype type;
+
+	if (ofs % sizeof(ins[0]) != 0 || ofs >= ins_sz)
+		return -EINVAL;
+
+	idx = ofs / sizeof(ins[0]);
+	if (ins[idx].code == (BPF_JMP | BPF_CALL))
+		type = RTE_BPF_XTYPE_FUNC;
+	else if (ins[idx].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+			ofs < ins_sz - sizeof(ins[idx]))
+		type = RTE_BPF_XTYPE_VAR;
+	else
+		return -EINVAL;
+
+	fidx = bpf_find_xsym(sn, type, prm->xsym, prm->nb_xsym);
+	if (fidx == UINT32_MAX)
+		return -ENOENT;
+
+	/* for function we just need an index in our xsym table */
+	if (type == RTE_BPF_XTYPE_FUNC)
+		ins[idx].imm = fidx;
+	/* for variable we need to store its absolute address */
+	else {
+		ins[idx].imm = (uintptr_t)prm->xsym[fidx].var;
+		ins[idx + 1].imm = (uintptr_t)prm->xsym[fidx].var >> 32;
+	}
+
+	return 0;
+}
+
+static int
+check_elf_header(const Elf64_Ehdr * eh)
+{
+	const char *err;
+
+	err = NULL;
+
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	if (eh->e_ident[EI_DATA] != ELFDATA2LSB)
+#else
+	if (eh->e_ident[EI_DATA] != ELFDATA2MSB)
+#endif
+		err = "not native byte order";
+	else if (eh->e_ident[EI_OSABI] != ELFOSABI_NONE)
+		err = "unexpected OS ABI";
+	else if (eh->e_type != ET_REL)
+		err = "unexpected ELF type";
+	else if (eh->e_machine != EM_NONE && eh->e_machine != EM_BPF)
+		err = "unexpected machine type";
+
+	if (err != NULL) {
+		RTE_BPF_LOG(ERR, "%s(): %s\n", __func__, err);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * helper function, find executable section by name.
+ */
+static int
+find_elf_code(Elf *elf, const char *section, Elf_Data **psd, size_t *pidx)
+{
+	Elf_Scn *sc;
+	const Elf64_Ehdr *eh;
+	const Elf64_Shdr *sh;
+	Elf_Data *sd;
+	const char *sn;
+	int32_t rc;
+
+	eh = elf64_getehdr(elf);
+	if (eh == NULL) {
+		rc = elf_errno();
+		RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n",
+			__func__, elf, section, rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	if (check_elf_header(eh) != 0)
+		return -EINVAL;
+
+	/* find given section by name */
+	for (sc = elf_nextscn(elf, NULL); sc != NULL;
+			sc = elf_nextscn(elf, sc)) {
+		sh = elf64_getshdr(sc);
+		sn = elf_strptr(elf, eh->e_shstrndx, sh->sh_name);
+		if (sn != NULL && strcmp(section, sn) == 0 &&
+				sh->sh_type == SHT_PROGBITS &&
+				sh->sh_flags == (SHF_ALLOC | SHF_EXECINSTR))
+			break;
+	}
+
+	sd = elf_getdata(sc, NULL);
+	if (sd == NULL || sd->d_size == 0 ||
+			sd->d_size % sizeof(struct bpf_insn) != 0) {
+		rc = elf_errno();
+		RTE_BPF_LOG(ERR, "%s(%p, %s) error code: %d(%s)\n",
+			__func__, elf, section, rc, elf_errmsg(rc));
+		return -EINVAL;
+	}
+
+	*psd = sd;
+	*pidx = elf_ndxscn(sc);
+	return 0;
+}
+
+/*
+ * helper function to process data from relocation table.
+ */
+static int
+process_reloc(Elf *elf, size_t sym_idx, Elf64_Rel *re, size_t re_sz,
+	struct bpf_insn *ins, size_t ins_sz, const struct rte_bpf_prm *prm)
+{
+	int32_t rc;
+	uint32_t i, n;
+	size_t ofs, sym;
+	const char *sn;
+	const Elf64_Ehdr *eh;
+	Elf_Scn *sc;
+	const Elf_Data *sd;
+	Elf64_Sym *sm;
+
+	eh = elf64_getehdr(elf);
+
+	/* get symtable by section index */
+	sc = elf_getscn(elf, sym_idx);
+	sd = elf_getdata(sc, NULL);
+	if (sd == NULL)
+		return -EINVAL;
+	sm = sd->d_buf;
+
+	n = re_sz / sizeof(re[0]);
+	for (i = 0; i != n; i++) {
+
+		ofs = re[i].r_offset;
+
+		/* retrieve index in the symtable */
+		sym = ELF64_R_SYM(re[i].r_info);
+		if (sym * sizeof(sm[0]) >= sd->d_size)
+			return -EINVAL;
+
+		sn = elf_strptr(elf, eh->e_shstrndx, sm[sym].st_name);
+
+		rc = resolve_xsym(sn, ofs, ins, ins_sz, prm);
+		if (rc != 0) {
+			RTE_BPF_LOG(ERR,
+				"resolve_xsym(%s, %zu) error code: %d\n",
+				sn, ofs, rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * helper function, find relocation information (if any)
+ * and update bpf code.
+ */
+static int
+elf_reloc_code(Elf *elf, Elf_Data *ed, size_t sidx,
+	const struct rte_bpf_prm *prm)
+{
+	Elf64_Rel *re;
+	Elf_Scn *sc;
+	const Elf64_Shdr *sh;
+	const Elf_Data *sd;
+	int32_t rc;
+
+	rc = 0;
+
+	/* walk through all sections */
+	for (sc = elf_nextscn(elf, NULL); sc != NULL && rc == 0;
+			sc = elf_nextscn(elf, sc)) {
+
+		sh = elf64_getshdr(sc);
+
+		/* relocation data for our code section */
+		if (sh->sh_type == SHT_REL && sh->sh_info == sidx) {
+			sd = elf_getdata(sc, NULL);
+			if (sd == NULL || sd->d_size == 0 ||
+					sd->d_size % sizeof(re[0]) != 0)
+				return -EINVAL;
+			rc = process_reloc(elf, sh->sh_link,
+				sd->d_buf, sd->d_size, ed->d_buf, ed->d_size,
+				prm);
+		}
+	}
+
+	return rc;
+}
+
+static struct rte_bpf *
+bpf_load(const struct rte_bpf_prm *prm)
+{
+	uint8_t *buf;
+	struct rte_bpf *bpf;
+	size_t sz, bsz, insz, xsz;
+
+	xsz =  prm->nb_xsym * sizeof(prm->xsym[0]);
+	insz = prm->nb_ins * sizeof(prm->ins[0]);
+	bsz = sizeof(bpf[0]);
+	sz = insz + xsz + bsz;
+
+	buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (buf == MAP_FAILED)
+		return NULL;
+
+	bpf = (void *)buf;
+	bpf->sz = sz;
+
+	memcpy(&bpf->prm, prm, sizeof(bpf->prm));
+
+	memcpy(buf + bsz, prm->xsym, xsz);
+	memcpy(buf + bsz + xsz, prm->ins, insz);
+
+	bpf->prm.xsym = (void *)(buf + bsz);
+	bpf->prm.ins = (void *)(buf + bsz + xsz);
+
+	return bpf;
+}
+
+__rte_experimental struct rte_bpf *
+rte_bpf_load(const struct rte_bpf_prm *prm)
+{
+	struct rte_bpf *bpf;
+	int32_t rc;
+
+	if (prm == NULL || prm->ins == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	bpf = bpf_load(prm);
+	if (bpf == NULL) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+
+	rc = bpf_validate(bpf);
+	if (rc == 0) {
+		bpf_jit(bpf);
+		if (mprotect(bpf, bpf->sz, PROT_READ) != 0)
+			rc = -ENOMEM;
+	}
+
+	if (rc != 0) {
+		rte_bpf_destroy(bpf);
+		rte_errno = -rc;
+		return NULL;
+	}
+
+	return bpf;
+}
+
+static struct rte_bpf *
+bpf_load_elf(const struct rte_bpf_prm *prm, int32_t fd, const char *section)
+{
+	Elf *elf;
+	Elf_Data *sd;
+	size_t sidx;
+	int32_t rc;
+	struct rte_bpf *bpf;
+	struct rte_bpf_prm np;
+
+	elf_version(EV_CURRENT);
+	elf = elf_begin(fd, ELF_C_READ, NULL);
+
+	rc = find_elf_code(elf, section, &sd, &sidx);
+	if (rc == 0)
+		rc = elf_reloc_code(elf, sd, sidx, prm);
+
+	if (rc == 0) {
+		np = prm[0];
+		np.ins = sd->d_buf;
+		np.nb_ins = sd->d_size / sizeof(struct bpf_insn);
+		bpf = rte_bpf_load(&np);
+	} else {
+		bpf = NULL;
+		rte_errno = -rc;
+	}
+
+	elf_end(elf);
+	return bpf;
+}
+
+__rte_experimental struct rte_bpf *
+rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
+	const char *sname)
+{
+	int32_t fd, rc;
+	struct rte_bpf *bpf;
+
+	if (prm == NULL || fname == NULL || sname == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	fd = open(fname, O_RDONLY);
+	if (fd < 0) {
+		rc = errno;
+		RTE_BPF_LOG(ERR, "%s(%s) error code: %d(%s)\n",
+			__func__, fname, rc, strerror(rc));
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	bpf = bpf_load_elf(prm, fd, sname);
+	close(fd);
+
+	if (bpf == NULL) {
+		RTE_BPF_LOG(ERR,
+			"%s(fname=\"%s\", sname=\"%s\") failed, "
+			"error code: %d\n",
+			__func__, fname, sname, rte_errno);
+		return NULL;
+	}
+
+	RTE_BPF_LOG(INFO, "%s(fname=\"%s\", sname=\"%s\") "
+		"successfully creates %p(jit={.func=%p,.sz=%zu});\n",
+		__func__, fname, sname, bpf, bpf->jit.func, bpf->jit.sz);
+	return bpf;
+}
diff --git a/lib/librte_bpf/bpf_validate.c b/lib/librte_bpf/bpf_validate.c
new file mode 100644
index 000000000..1911e1381
--- /dev/null
+++ b/lib/librte_bpf/bpf_validate.c
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#include "bpf_impl.h"
+
+/*
+ * Dummy implementation for now; more work is needed.
+ */
+int
+bpf_validate(struct rte_bpf *bpf)
+{
+	int32_t rc, ofs, stack_sz;
+	uint32_t i, op, dr;
+	const struct bpf_insn *ins;
+
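+	/*
+	 * Look for store instructions relative to the frame pointer (R10)
+	 * and track the lowest offset written to; every access is
+	 * conservatively assumed to be 8 bytes wide.
+	 */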
+	rc = 0;
+	stack_sz = 0;
+	for (i = 0; i != bpf->prm.nb_ins; i++) {
+
+		ins = bpf->prm.ins + i;
+		op = ins->code;
+		dr = ins->dst_reg;
+		ofs = ins->off;
+
+		if ((BPF_CLASS(op) == BPF_STX || BPF_CLASS(op) == BPF_ST) &&
+				dr == BPF_REG_10) {
+			ofs -= sizeof(uint64_t);
+			stack_sz = RTE_MIN(ofs, stack_sz);
+		}
+	}
+
+	if (stack_sz != 0) {
+		stack_sz = -stack_sz;
+		if (stack_sz > MAX_BPF_STACK_SIZE)
+			rc = -ERANGE;
+		else
+			bpf->stack_sz = stack_sz;
+	}
+
+	if (rc != 0)
+		RTE_BPF_LOG(ERR, "%s(%p) failed, error code: %d;\n",
+			__func__, bpf, rc);
+	return rc;
+}
diff --git a/lib/librte_bpf/meson.build b/lib/librte_bpf/meson.build
new file mode 100644
index 000000000..05c48c7ff
--- /dev/null
+++ b/lib/librte_bpf/meson.build
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+allow_experimental_apis = true
+sources = files('bpf.c',
+		'bpf_exec.c',
+		'bpf_load.c',
+		'bpf_validate.c')
+
+install_headers = files('rte_bpf.h')
+
+deps += ['mbuf', 'net']
+
+dep = dependency('libelf', required: false)
+if dep.found() == false
+	build = false
+endif
+ext_deps += dep
diff --git a/lib/librte_bpf/rte_bpf.h b/lib/librte_bpf/rte_bpf.h
new file mode 100644
index 000000000..4d4b93599
--- /dev/null
+++ b/lib/librte_bpf/rte_bpf.h
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_BPF_H_
+#define _RTE_BPF_H_
+
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <rte_bpf_def.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Possible types for external symbols.
+ */
+enum rte_bpf_xtype {
+	RTE_BPF_XTYPE_FUNC, /**< function */
+	RTE_BPF_XTYPE_VAR, /**< variable */
+	RTE_BPF_XTYPE_NUM
+};
+
+/**
+ * Definition for external symbols available in the BPF program.
+ */
+struct rte_bpf_xsym {
+	const char *name;        /**< name */
+	enum rte_bpf_xtype type; /**< type */
+	union {
+		uint64_t (*func)(uint64_t, uint64_t, uint64_t,
+				uint64_t, uint64_t);
+		void *var;
+	}; /**< value */
+};
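+
+/*
+ * Illustrative example (not part of the API itself): an application could
+ * expose a helper function to eBPF code as follows; "my_helper" is a
+ * made-up name used only for this sketch.
+ *
+ *   static uint64_t
+ *   my_helper(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
+ *   {
+ *       (void)b; (void)c; (void)d; (void)e;
+ *       return a * 2;
+ *   }
+ *
+ *   static const struct rte_bpf_xsym xsym[] = {
+ *       {
+ *           .name = "my_helper",
+ *           .type = RTE_BPF_XTYPE_FUNC,
+ *           .func = my_helper,
+ *       },
+ *   };
+ */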
+
+/**
+ * Possible BPF program types.
+ * Use negative values for DPDK-specific program types, to make sure they
+ * do not interfere with the Linux-related ones.
+ */
+enum rte_bpf_prog_type {
+	RTE_BPF_PROG_TYPE_UNSPEC = BPF_PROG_TYPE_UNSPEC,
+	/**< input is a pointer to raw data */
+	RTE_BPF_PROG_TYPE_MBUF = INT32_MIN,
+	/**< input is a pointer to rte_mbuf */
+};
+
+/**
+ * Input parameters for loading eBPF code.
+ */
+struct rte_bpf_prm {
+	const struct bpf_insn *ins; /**< array of eBPF instructions */
+	uint32_t nb_ins;            /**< number of instructions in ins */
+	const struct rte_bpf_xsym *xsym;
+	/**< array of external symbols that eBPF code is allowed to reference */
+	uint32_t nb_xsym; /**< number of elements in xsym */
+	enum rte_bpf_prog_type prog_type; /**< eBPF program type */
+};
+
+/**
+ * Information about eBPF code compiled into the native ISA.
+ */
+struct rte_bpf_jit {
+	uint64_t (*func)(void *);
+	size_t sz;
+};
+
+struct rte_bpf;
+
+/**
+ * De-allocate all memory used by this eBPF execution context.
+ *
+ * @param bpf
+ *   BPF handle to destroy.
+ */
+void rte_bpf_destroy(struct rte_bpf *bpf);
+
+/**
+ * Create a new eBPF execution context and load given BPF code into it.
+ *
+ * @param prm
+ *  Parameters used to create and initialise the BPF execution context.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENOMEM - can't reserve enough memory
+ */
+struct rte_bpf *rte_bpf_load(const struct rte_bpf_prm *prm);
+
+/**
+ * Create a new eBPF execution context and load BPF code from given ELF
+ * file into it.
+ *
+ * @param prm
+ *  Parameters used to create and initialise the BPF execution context.
+ * @param fname
+ *  Pathname of an ELF file.
+ * @param sname
+ *  Name of the executable section within the file to load.
+ * @return
+ *   BPF handle that is used in future BPF operations,
+ *   or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ *   - ENOMEM - can't reserve enough memory
+ */
+struct rte_bpf *rte_bpf_elf_load(const struct rte_bpf_prm *prm,
+	const char *fname, const char *sname);
+
+/**
+ * Execute given BPF bytecode.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   pointer to input context.
+ * @return
+ *   BPF execution return value.
+ */
+uint64_t rte_bpf_exec(const struct rte_bpf *bpf, void *ctx);
+
+/**
+ * Execute given BPF bytecode over a set of input contexts.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   array of pointers to the input contexts.
+ * @param rc
+ *   array of return values (one per input).
+ * @param num
+ *   number of elements in ctx[] (and rc[]).
+ * @return
+ *   number of successfully processed inputs.
+ */
+uint32_t rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[],
+	uint64_t rc[], uint32_t num);
+
+/**
+ * Provide information about natively compiled code for the given BPF handle.
+ *
+ * @param bpf
+ *   handle for the BPF code.
+ * @param jit
+ *   pointer to the rte_bpf_jit structure to be filled with related data.
+ * @return
+ *   - -EINVAL if the parameters are invalid.
+ *   - Zero if operation completed successfully.
+ */
+int rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_BPF_H_ */
diff --git a/lib/librte_bpf/rte_bpf_version.map b/lib/librte_bpf/rte_bpf_version.map
new file mode 100644
index 000000000..ff65144df
--- /dev/null
+++ b/lib/librte_bpf/rte_bpf_version.map
@@ -0,0 +1,12 @@
+EXPERIMENTAL {
+	global:
+
+	rte_bpf_destroy;
+	rte_bpf_elf_load;
+	rte_bpf_exec;
+	rte_bpf_exec_burst;
+	rte_bpf_get_jit;
+	rte_bpf_load;
+
+	local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index ef6159170..7ff7aaaa5 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -23,7 +23,7 @@ libraries = [ 'compat', # just a header, used for versioning
 	# add pkt framework libs which use other libs from above
 	'port', 'table', 'pipeline',
 	# flow_classify lib depends on pkt framework table lib
-	'flow_classify']
+	'flow_classify', 'bpf']
 
 foreach l:libraries
 	build = true
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 94525dc80..07a9bcfe2 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -83,6 +83,8 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_POWER)          += -lrte_power
 _LDLIBS-$(CONFIG_RTE_LIBRTE_TIMER)          += -lrte_timer
 _LDLIBS-$(CONFIG_RTE_LIBRTE_EFD)            += -lrte_efd
 
+_LDLIBS-$(CONFIG_RTE_LIBRTE_BPF)            += -lrte_bpf -lelf
+
 _LDLIBS-y += --whole-archive
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
-- 
2.13.6
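
For reference, a minimal usage sketch of the new API; the ELF file name
"t1.o" and section name "prog" below are placeholders, not something
defined by this patch:

	#include <rte_bpf.h>

	static uint64_t
	run_bpf(void *ctx)
	{
		struct rte_bpf *bpf;
		uint64_t rv;
		const struct rte_bpf_prm prm = {
			.xsym = NULL,
			.nb_xsym = 0,
			.prog_type = RTE_BPF_PROG_TYPE_UNSPEC,
		};

		/* placeholder ELF file and section names */
		bpf = rte_bpf_elf_load(&prm, "t1.o", "prog");
		if (bpf == NULL)
			return 0; /* rte_errno holds the failure reason */

		rv = rte_bpf_exec(bpf, ctx);
		rte_bpf_destroy(bpf);
		return rv;
	}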

^ permalink raw reply	[relevance 2%]

* Re: [dpdk-dev] [PATCH v2 0/4] ethdev: add per-PMD tuning of RxTx parmeters
  @ 2018-03-30 10:34  0%   ` Ferruh Yigit
  2018-03-31  0:05  0%     ` Thomas Monjalon
  0 siblings, 1 reply; 200+ results
From: Ferruh Yigit @ 2018-03-30 10:34 UTC (permalink / raw)
  To: Remy Horton, dev
  Cc: John McNamara, Wenzhuo Lu, Jingjing Wu, Qi Zhang, Beilei Xing,
	Shreyansh Jain, Thomas Monjalon

On 3/27/2018 7:43 PM, Ferruh Yigit wrote:
> On 3/21/2018 2:27 PM, Remy Horton wrote:
>> The optimal values of several transmission & reception related parameters,
>> such as burst sizes, descriptor ring sizes, and number of queues, vary
>> between different network interface devices. This patchset allows individual
>> PMDs to specify their preferred parameter values, and if so indicated by an
>> application, for them to be used automatically by the ethdev layer.
>>
>> rte_eth_dev_configure() has been changed so that specifying zero for both
>> nb_rx_q AND nb_tx_q causes it to use driver preferred values, and if these
>> are not available, falls back to EAL defaults. Setting one (but not both)
>> to zero does not cause the use of defaults, as having one of them zeroed is
>> a valid setup.
>>
>> This RFC/V1 includes per-PMD values for e1000 and i40e but it is expected
>> that subsequent patchsets will cover other PMDs. A deprecation notice
>> covering the API/ABI change is in place.
>>
>>
>> Changes in v2:
>> * Rebased to 
>> * Removed fallback values from rte_eth_dev_info_get()
>> * Added fallback values to rte_rte_[rt]x_queue_setup()
>> * Added fallback values to rte_eth_dev_configure()
>> * Corrected comment
>> * Removed deprecation notice
>> * Split Rx and Tx into separate structures
>> * Changed parameter names
>>
>>
>> Remy Horton (4):
>>   ethdev: add support for PMD-tuned Tx/Rx parameters
>>   net/e1000: add TxRx tuning parameters
>>   net/i40e: add TxRx tuning parameters
>>   testpmd: make use of per-PMD TxRx parameters
> 
> Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>

Series applied to dpdk-next-net/master, thanks.

^ permalink raw reply	[relevance 0%]

-- links below jump to the message on this page --
2017-06-30 14:26     [dpdk-dev] [RFC] ring: relax alignment constraint on ring structure Olivier Matz
2018-04-03 13:26  9% ` [dpdk-dev] [PATCH] " Olivier Matz
2018-04-03 15:07       ` Jerin Jacob
2018-04-03 15:25         ` Olivier Matz
2018-04-03 15:37           ` Jerin Jacob
2018-04-03 15:56  3%         ` Olivier Matz
2018-04-03 16:42  3%           ` Jerin Jacob
2018-04-04 23:38  0%             ` Ananyev, Konstantin
2018-04-05  8:01                   ` Jerin Jacob
2018-04-05 13:49                     ` Ananyev, Konstantin
2018-04-06  1:26                       ` Jerin Jacob
2018-04-11  0:33                         ` Ananyev, Konstantin
2018-04-11  2:48  4%                       ` Jerin Jacob
2018-04-11  8:40  0%                         ` Ananyev, Konstantin
2017-11-24 16:06     [dpdk-dev] [RFC PATCH 0/6] mempool: add bucket mempool driver Andrew Rybchenko
2018-04-16 13:24  2% ` [dpdk-dev] [PATCH v4 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
2018-04-16 13:24  7%   ` [dpdk-dev] [PATCH v4 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
2018-04-16 15:33  0%     ` Olivier Matz
2018-04-16 15:41  0%       ` Andrew Rybchenko
2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 05/11] mempool: add op to populate objects using provided memory Andrew Rybchenko
2018-04-16 13:24  6%   ` [dpdk-dev] [PATCH v4 06/11] mempool: remove callback to get capabilities Andrew Rybchenko
2018-04-16 13:24  4%   ` [dpdk-dev] [PATCH v4 07/11] mempool: deprecate xmem functions Andrew Rybchenko
2018-04-16 13:24  8%   ` [dpdk-dev] [PATCH v4 10/11] mempool: remove callback to register memory area Andrew Rybchenko
2018-04-16 13:33  3% ` [dpdk-dev] [PATCH v2 0/6] mempool: add bucket driver Andrew Rybchenko
2018-04-16 13:33  4%   ` [dpdk-dev] [PATCH v2 3/6] mempool: support block dequeue operation Andrew Rybchenko
2017-12-08 15:49     [dpdk-dev] [RFC] mbuf: remove control mbuf Olivier Matz
2018-04-03 13:39  3% ` [dpdk-dev] [PATCH] " Olivier Matz
2018-01-08 10:00     [dpdk-dev] [PATCH v3] lib/librte_meter: add meter configuration profile Jasvinder Singh
2018-01-08 15:43     ` [dpdk-dev] [PATCH v4] " Jasvinder Singh
2018-02-19 21:12       ` Thomas Monjalon
2018-04-05 10:12  0%     ` Thomas Monjalon
2018-04-05 11:00  0%       ` Dumitrescu, Cristian
2018-01-23 13:15     [dpdk-dev] [RFC v2 00/17] mempool: add bucket mempool driver Andrew Rybchenko
2018-03-26 16:09     ` [dpdk-dev] [PATCH v3 00/11] mempool: prepare to add bucket driver Andrew Rybchenko
2018-03-26 16:09       ` [dpdk-dev] [PATCH v3 04/11] mempool: add op to calculate memory size to be allocated Andrew Rybchenko
2018-04-04 15:08  0%     ` santosh
2018-04-06 15:51  0%     ` Olivier Matz
2018-04-12 15:22  0%     ` Burakov, Anatoly
2018-02-02 23:28     [dpdk-dev] [PATCH 0/7] vhost: support selective datapath Zhihong Wang
2018-03-30 10:00     ` [dpdk-dev] [PATCH v4 0/5] " Zhihong Wang
2018-03-30 10:01       ` [dpdk-dev] [PATCH v4 2/5] " Zhihong Wang
2018-03-31  6:10  3%     ` Maxime Coquelin
2018-04-02  1:58  0%       ` Wang, Zhihong
2018-02-17 10:49     [dpdk-dev] [PATCH 1/2] eal: add API to align integer to previous power of 2 Pavan Nikhilesh
2018-04-04 10:16     ` [dpdk-dev] [PATCH v3 " Pavan Nikhilesh
2018-04-04 16:10       ` Matan Azrad
2018-04-04 16:42         ` Pavan Nikhilesh
2018-04-04 17:11           ` Matan Azrad
2018-04-04 17:51             ` Pavan Nikhilesh
2018-04-04 18:10               ` Matan Azrad
2018-04-04 18:15                 ` Pavan Nikhilesh
2018-04-04 18:23                   ` Matan Azrad
2018-04-04 18:36  3%                 ` Pavan Nikhilesh
2018-04-04 19:41  3%                   ` Matan Azrad
2018-02-26 15:09     [dpdk-dev] [PATCH 01/18] ethdev: support tunnel RSS level Xueming Li
2018-04-10 13:00     ` [dpdk-dev] [PATCH v2 2/5] ethdev: introduce new tunnel VXLAN-GPE Xueming Li
2018-04-11  9:59  5%   ` Adrien Mazarguil
2018-04-11 12:04  0%     ` Xueming(Steven) Li
2018-04-10 13:00     ` [dpdk-dev] [PATCH v2 4/5] app/testpmd: " Xueming Li
2018-04-11  9:59  3%   ` Adrien Mazarguil
2018-04-12  7:33  3% ` [dpdk-dev] [PATCH v3 0/5] introduce new tunnel types Xueming Li
2018-04-13 11:02  3% ` [dpdk-dev] [PATCH v4 " Xueming Li
2018-03-08  1:29     [dpdk-dev] [RFC PATCH 0/5] add framework to load and execute BPF code Konstantin Ananyev
2018-03-08  1:30     ` [dpdk-dev] [RFC PATCH 5/5] test: add few eBPF samples Konstantin Ananyev
2018-03-13 14:01       ` Jerin Jacob
2018-03-13 18:14         ` Ananyev, Konstantin
2018-03-30 17:42           ` Ananyev, Konstantin
2018-04-02 22:26  3%         ` Jerin Jacob
2018-03-09 16:42     [dpdk-dev] [PATCH v1 0/5] add framework to load and execute BPF code Konstantin Ananyev
2018-03-30 17:32  2% ` [dpdk-dev] [PATCH v2 2/7] bpf: add BPF loading and execution framework Konstantin Ananyev
2018-03-09 22:22     [dpdk-dev] [RFC PATCH] ethdev: fix ports enumeration Thomas Monjalon
2018-04-05 15:33     ` [dpdk-dev] [PATCH v2 0/3] " Thomas Monjalon
2018-04-05 15:33  1%   ` [dpdk-dev] [PATCH v2 3/3] ethdev: deprecate port count function Thomas Monjalon
2018-03-10  1:25     [dpdk-dev] [PATCH v1 0/6] net/mlx5: add Multi-Packet Rx support Yongseok Koh
2018-04-02 18:50     ` [dpdk-dev] [PATCH v2 " Yongseok Koh
2018-04-02 18:50       ` [dpdk-dev] [PATCH v2 1/6] mbuf: add buffer offset field for flexible indirection Yongseok Koh
2018-04-03  8:26         ` Olivier Matz
2018-04-04  0:12           ` Yongseok Koh
2018-04-09 16:04             ` Olivier Matz
2018-04-10  1:59  3%           ` Yongseok Koh
2018-04-11  0:25  0%             ` Ananyev, Konstantin
2018-04-11  5:33  0%               ` Yongseok Koh
2018-04-11 11:39  0%                 ` Ananyev, Konstantin
2018-04-11 17:08  0%                   ` Yongseok Koh
2018-04-12 16:34  0%                     ` Ananyev, Konstantin
2018-04-12 18:58  0%                       ` Yongseok Koh
2018-03-21 14:27     [dpdk-dev] [PATCH v2 0/4] ethdev: add per-PMD tuning of RxTx parmeters Remy Horton
2018-03-27 18:43     ` Ferruh Yigit
2018-03-30 10:34  0%   ` Ferruh Yigit
2018-03-31  0:05  0%     ` Thomas Monjalon
2018-04-04 17:17  3% ` [dpdk-dev] [PATCH v3 " Remy Horton
2018-04-04 17:17       ` [dpdk-dev] [PATCH v3 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
2018-04-04 18:56  3%     ` De Lara Guarch, Pablo
2018-04-05 10:16  0%       ` Thomas Monjalon
2018-04-06 14:49  4%   ` [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters Remy Horton
2018-04-06 14:49  7%     ` [dpdk-dev] [PATCH v5 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
2018-04-06 17:01  0%     ` [dpdk-dev] [PATCH v5 0/4] ethdev: add per-PMD tuning of RxTx parmeters Ferruh Yigit
2018-04-10  9:43  4%     ` [dpdk-dev] [PATCH v6 " Remy Horton
2018-04-10  9:43  7%       ` [dpdk-dev] [PATCH v6 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
2018-04-10 18:56  0%       ` [dpdk-dev] [PATCH v6 0/4] ethdev: add per-PMD tuning of RxTx parmeters Ferruh Yigit
2018-03-22 12:36     [dpdk-dev] [PATCH v6] eal: provide API for querying valid socket id's Anatoly Burakov
2018-03-31 17:08  5% ` [dpdk-dev] [PATCH v7] " Anatoly Burakov
2018-04-04 22:31  3%   ` Thomas Monjalon
2018-03-23 12:58     [dpdk-dev] [PATCH v1 0/9] Bunch of flow API-related fixes Adrien Mazarguil
2018-04-04 14:57  3% ` [dpdk-dev] [PATCH v2 00/13] " Adrien Mazarguil
2018-04-04 14:58  4%   ` [dpdk-dev] [PATCH v2 12/13] ethdev: fix ABI version in meson build Adrien Mazarguil
2018-04-06 13:22  3%   ` [dpdk-dev] [PATCH v3 00/11] Bunch of flow API-related fixes Adrien Mazarguil
2018-04-10 16:34  3%     ` [dpdk-dev] [PATCH v4 " Adrien Mazarguil
2018-04-16 16:21  3%       ` [dpdk-dev] [PATCH v5 " Adrien Mazarguil
2018-03-26  9:51     [dpdk-dev] [PATCH v3 00/10] lib/librte_vhost: introduce new vhost user crypto backend support Fan Zhang
2018-03-29 12:52     ` [dpdk-dev] [PATCH v4 0/8] vhost: intdroduce vhost user crypto backend Fan Zhang
2018-03-29 12:52       ` [dpdk-dev] [PATCH v4 1/8] lib/librte_vhost: add external backend support Fan Zhang
2018-03-29 13:47         ` Wodkowski, PawelX
2018-04-01 19:53  0%       ` Zhang, Roy Fan
2018-04-03 13:44  0%         ` Maxime Coquelin
2018-04-03 13:55  0%           ` Zhang, Roy Fan
2018-04-03 14:42  0%           ` Tan, Jianfeng
2018-04-03 14:48  0%             ` Wodkowski, PawelX
2018-03-28 13:54     [dpdk-dev] [PATCH v6 0/7] switching device representation Declan Doherty
2018-03-28 13:54     ` [dpdk-dev] [PATCH v6 4/8] ethdev: Add port representor device flag Declan Doherty
2018-03-29  6:13       ` Shahaf Shuler
2018-03-29 14:53         ` Doherty, Declan
2018-04-01  6:14  0%       ` Shahaf Shuler
2018-03-28 23:29     [dpdk-dev] [PATCH 0/4] rte_flow extension for vSwitch acceleration Qi Zhang
2018-04-01 21:19     ` [dpdk-dev] [PATCH v2 " Qi Zhang
2018-04-01 21:19       ` [dpdk-dev] [PATCH v2 2/4] ether: add flow last hit query support Qi Zhang
2018-04-11 16:31  3%     ` Adrien Mazarguil
2018-04-01 21:19       ` [dpdk-dev] [PATCH v2 3/4] ether: add more protocol support in flow API Qi Zhang
2018-04-11 16:32  2%     ` Adrien Mazarguil
2018-04-12  5:12  0%       ` Zhang, Qi Z
2018-04-12  9:19  0%         ` Adrien Mazarguil
2018-04-12 10:00  0%           ` Zhang, Qi Z
2018-03-29 17:05     [dpdk-dev] [PATCH v3 0/2] gcc-8 build fixes Stephen Hemminger
2018-04-03  9:23     ` Ferruh Yigit
2018-04-03 15:10  3%   ` Stephen Hemminger
2018-03-29 17:52     [dpdk-dev] [PATCH v3] ethdev: replace bus specific struct with generic dev Ferruh Yigit
2018-03-30 15:17     ` [dpdk-dev] [PATCH v4] " Ferruh Yigit
2018-03-30 15:29       ` David Marchand
2018-04-02 16:13         ` santosh
2018-04-03  9:06           ` David Marchand
2018-04-03  9:50             ` Ferruh Yigit
2018-04-04 17:57  3%           ` De Lara Guarch, Pablo
2018-04-05  9:19  0%             ` Ferruh Yigit
2018-04-05 16:40  2%   ` [dpdk-dev] [PATCH v5] " Ferruh Yigit
2018-04-09 12:09  2%     ` [dpdk-dev] [PATCH v6] " Ferruh Yigit
2018-03-29 21:27     [dpdk-dev] [PATCH v8 0/9] eventtimer: introduce event timer adapter Erik Gabriel Carrillo
2018-04-02 19:39     ` [dpdk-dev] [PATCH v9 " Erik Gabriel Carrillo
2018-04-02 19:39  3%   ` [dpdk-dev] [PATCH v9 3/9] eventtimer: add common code Erik Gabriel Carrillo
2018-04-03 21:44       ` [dpdk-dev] [PATCH v10 0/9] eventtimer: introduce event timer adapter Erik Gabriel Carrillo
2018-04-03 21:44  3%     ` [dpdk-dev] [PATCH v10 3/9] eventtimer: add common code Erik Gabriel Carrillo
2018-04-04 21:51         ` [dpdk-dev] [PATCH v11 0/9] eventtimer: introduce event timer adapter Erik Gabriel Carrillo
2018-04-04 21:51  3%       ` [dpdk-dev] [PATCH v11 3/9] eventtimer: add common code Erik Gabriel Carrillo
2018-03-30 17:32     [dpdk-dev] [PATCH v2 1/7] net: move BPF related definitions into librte_net Konstantin Ananyev
2018-04-06 18:49  2% ` [dpdk-dev] [PATCH v3 02/10] bpf: add BPF loading and execution framework Konstantin Ananyev
2018-03-31  7:49     [dpdk-dev] [PATCH v4 0/7] crypto: add virtio poll mode driver Jay Zhou
2018-03-31  7:49  2% ` [dpdk-dev] [PATCH v4 1/7] crypto/virtio: add virtio related fundamental functions Jay Zhou
2018-03-31  9:18     [dpdk-dev] [PATCH v5 0/7] crypto: add virtio poll mode driver Jay Zhou
2018-03-31  9:18  2% ` [dpdk-dev] [PATCH v5 1/7] crypto/virtio: add virtio related fundamental functions Jay Zhou
2018-04-02  8:36     [dpdk-dev] [PATCH v2] eal/vfio: export internal vfio functions Hemant Agrawal
2018-04-03  8:28  4% ` [dpdk-dev] [PATCH v3 1/2] doc: add vfio api support Hemant Agrawal
2018-04-03 10:16  0%   ` Thomas Monjalon
2018-04-03  9:43     [dpdk-dev] [PATCH v6 00/10] crypto: add virtio poll mode driver Jay Zhou
2018-04-03  9:43  1% ` [dpdk-dev] [PATCH v6 02/10] crypto/virtio: support virtio device init Jay Zhou
2018-04-04 17:03     ` [dpdk-dev] [PATCH v7 00/10] crypto: add virtio poll mode driver Jay Zhou
2018-04-04 17:03  1%   ` [dpdk-dev] [PATCH v7 02/10] crypto/virtio: support virtio device init Jay Zhou
2018-04-14  9:34     ` [dpdk-dev] [PATCH v8 00/11] crypto: add virtio poll mode driver Jay Zhou
2018-04-14  9:34  1%   ` [dpdk-dev] [PATCH v8 02/11] crypto/virtio: support virtio device init Jay Zhou
2018-04-15  8:51     ` [dpdk-dev] [PATCH v9 00/11] crypto: add virtio poll mode driver Jay Zhou
2018-04-15  8:51  1%   ` [dpdk-dev] [PATCH v9 02/11] crypto/virtio: support virtio device init Jay Zhou
2018-04-16  2:21     ` [dpdk-dev] [PATCH v10 00/10] crypto: add virtio poll mode driver Jay Zhou
2018-04-16  2:21  1%   ` [dpdk-dev] [PATCH v10 02/10] crypto/virtio: support virtio device init Jay Zhou
2018-04-03 23:21     [dpdk-dev] [PATCH v3 00/68] Memory Hotplug for DPDK Anatoly Burakov
2018-03-07 16:56     ` [dpdk-dev] [PATCH v2 00/41] " Anatoly Burakov
2018-04-03 23:21  3%   ` [dpdk-dev] [PATCH v3 24/68] mempool: add support for the new allocation methods Anatoly Burakov
2018-04-04 11:27  3% [dpdk-dev] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 03/13] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 04/13] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 06/13] kni: replace rte_panic instances in kni Arnon Warshavsky
2018-04-04 11:27  3% ` [dpdk-dev] [PATCH 11/13] eal: replace rte_panic instances in ethdev Arnon Warshavsky
2018-04-04 11:27  2% ` [dpdk-dev] [PATCH 12/13] eal: replace rte_panic instances in init sequence Arnon Warshavsky
2018-04-04 15:56  4% [dpdk-dev] [PATCH v1 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
2018-04-04 15:56  7% ` [dpdk-dev] [PATCH v1 01/16] ethdev: update ABI for flow API functions Adrien Mazarguil
2018-04-05 10:06  4%   ` Thomas Monjalon
2018-04-05 12:44  9%     ` Adrien Mazarguil
2018-04-05 13:36  7%       ` Thomas Monjalon
2018-04-04 15:56  3% ` [dpdk-dev] [PATCH v1 05/16] ethdev: remove DUP action from flow API Adrien Mazarguil
2018-04-04 15:56  2% ` [dpdk-dev] [PATCH v1 10/16] ethdev: add encap level to RSS flow API action Adrien Mazarguil
2018-04-06 13:25  5% ` [dpdk-dev] [PATCH v2 00/15] Flow API overhaul for switch offloads Adrien Mazarguil
2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 01/15] ethdev: add error types to flow API Adrien Mazarguil
2018-04-07  9:15  0%     ` Andrew Rybchenko
2018-04-07  9:18  0%       ` Andrew Rybchenko
2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 04/15] ethdev: remove DUP action from " Adrien Mazarguil
2018-04-07  9:23  0%     ` Andrew Rybchenko
2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 05/15] ethdev: alter behavior of flow API actions Adrien Mazarguil
2018-04-06 15:06  0%     ` Andrew Rybchenko
2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 06/15] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 07/15] ethdev: flatten RSS configuration in " Adrien Mazarguil
2018-04-07  9:05  0%     ` Andrew Rybchenko
2018-04-09 14:42  0%       ` Adrien Mazarguil
2018-04-11 13:21  0%         ` Andrew Rybchenko
2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 08/15] ethdev: add hash function to RSS flow API action Adrien Mazarguil
2018-04-06 15:41  0%     ` Andrew Rybchenko
2018-04-09 14:41  0%       ` Adrien Mazarguil
2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 09/15] ethdev: add encap level " Adrien Mazarguil
2018-04-07  8:27  0%     ` Andrew Rybchenko
2018-04-06 13:25  1%   ` [dpdk-dev] [PATCH v2 10/15] ethdev: refine TPID handling in flow API Adrien Mazarguil
2018-04-06 17:11  0%     ` Andrew Rybchenko
2018-04-09 14:42  0%       ` Adrien Mazarguil
2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 11/15] ethdev: add transfer attribute to " Adrien Mazarguil
2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 12/15] ethdev: update behavior of VF/PF in " Adrien Mazarguil
2018-04-07  9:41  0%     ` Andrew Rybchenko
2018-04-09 14:49  0%       ` Adrien Mazarguil
2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 13/15] ethdev: rename physical port item " Adrien Mazarguil
2018-04-06 13:25  3%   ` [dpdk-dev] [PATCH v2 14/15] ethdev: add physical port action to " Adrien Mazarguil
2018-04-07  9:51  0%     ` Andrew Rybchenko
2018-04-09 15:00  0%       ` Adrien Mazarguil
2018-04-06 13:25  2%   ` [dpdk-dev] [PATCH v2 15/15] ethdev: add port ID item and " Adrien Mazarguil
2018-04-10 16:36  4%   ` [dpdk-dev] [PATCH v3 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 01/16] ethdev: add error types to flow API Adrien Mazarguil
2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 04/16] ethdev: remove DUP action from " Adrien Mazarguil
2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 07/16] ethdev: flatten RSS configuration in " Adrien Mazarguil
2018-04-11 13:06  0%       ` Andrew Rybchenko
2018-04-10 16:36  2%     ` [dpdk-dev] [PATCH v3 08/16] ethdev: add hash function to RSS flow API action Adrien Mazarguil
2018-04-11 12:40  0%       ` Andrew Rybchenko
2018-04-10 16:36  3%     ` [dpdk-dev] [PATCH v3 09/16] ethdev: add encap level " Adrien Mazarguil
2018-04-10 16:36  1%     ` [dpdk-dev] [PATCH v3 10/16] ethdev: refine TPID handling in flow API Adrien Mazarguil
2018-04-11 12:45  0%       ` Andrew Rybchenko
2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 12/16] ethdev: add transfer attribute to " Adrien Mazarguil
2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 13/16] ethdev: update behavior of VF/PF in " Adrien Mazarguil
2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 14/16] ethdev: rename physical port item " Adrien Mazarguil
2018-04-11 12:57  0%       ` Andrew Rybchenko
2018-04-10 16:37  3%     ` [dpdk-dev] [PATCH v3 15/16] ethdev: add physical port action to " Adrien Mazarguil
2018-04-11 13:00  0%       ` Andrew Rybchenko
2018-04-10 16:37  2%     ` [dpdk-dev] [PATCH v3 16/16] ethdev: add port ID item and " Adrien Mazarguil
2018-04-11 13:02  0%       ` Andrew Rybchenko
2018-04-16 16:22  4%     ` [dpdk-dev] [PATCH v4 00/16] Flow API overhaul for switch offloads Adrien Mazarguil
2018-04-16 16:22  3%       ` [dpdk-dev] [PATCH v4 01/16] ethdev: add error types to flow API Adrien Mazarguil
2018-04-16 16:22  2%       ` [dpdk-dev] [PATCH v4 04/16] ethdev: remove DUP action from " Adrien Mazarguil
2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 05/16] ethdev: alter behavior of flow API actions Adrien Mazarguil
2018-04-16 16:22  1%       ` [dpdk-dev] [PATCH v4 06/16] ethdev: remove C99 flexible arrays from flow API Adrien Mazarguil
2018-04-04 22:01  3% [dpdk-dev] [PATCH v2 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
2018-04-13  9:16  0% ` Burakov, Anatoly
2018-04-05 11:49  4% [dpdk-dev] [PATCH] doc: add meter API change to release notes Jasvinder Singh
2018-04-05 12:03  0% ` Dumitrescu, Cristian
2018-04-05 13:15  9% [dpdk-dev] [PATCH] eal/service: remove experimental tags Harry van Haaren
2018-04-06  6:18  0% ` Jerin Jacob
2018-04-05 14:28     [dpdk-dev] [PATCH v2 0/4] NFP PF support based on new CPP interface Alejandro Lucero
2018-04-05 14:28  1% ` [dpdk-dev] [PATCH 1/4] net/nfp: add NFP CPP support Alejandro Lucero
2018-04-05 14:28  6% ` [dpdk-dev] [PATCH 2/4] net/nfp: update PMD for using new CPP interface Alejandro Lucero
2018-04-05 14:42     [dpdk-dev] [PATCH v2 0/4] NFP PF support based on " Alejandro Lucero
2018-04-05 14:42  1% ` [dpdk-dev] [PATCH v2 1/4] net/nfp: add NFP CPP support Alejandro Lucero
2018-04-05 14:42  6% ` [dpdk-dev] [PATCH v2 2/4] net/nfp: update PMD for using new CPP interface Alejandro Lucero
2018-04-05 18:07     [dpdk-dev] [PATCH v5 3/4] net/ifcvf: add ifcvf vdpa driver Xiao Wang
2018-04-12  7:19     ` [dpdk-dev] [PATCH v6 0/4] " Xiao Wang
2018-04-12  7:19  3%   ` [dpdk-dev] [PATCH v6 4/4] doc: add ifcvf driver document and release note Xiao Wang
2018-04-05 19:13     [dpdk-dev] [PATCH 0/3] add Hyper-V bus and network driver Stephen Hemminger
2018-04-05 20:52     ` [dpdk-dev] [PATCH 3/3] net/netvsc: add hyper-v netvsc network device Thomas Monjalon
2018-04-05 20:59       ` Stephen Hemminger
2018-04-05 21:07  3%     ` Thomas Monjalon
2018-04-05 21:19  0%       ` Stephen Hemminger
2018-04-06 12:23     [dpdk-dev] [PATCH v3 0/4] ethdev: Additions to support tunnel encap/decap offload Declan Doherty
2018-04-06 12:24     ` [dpdk-dev] [PATCH v3 1/4] ethdev: add group counter support to rte_flow Declan Doherty
2018-04-06 20:26  3%   ` Adrien Mazarguil
2018-04-09 14:22  0%     ` Mohammad Abdul Awal
2018-04-09 15:23  0%       ` Adrien Mazarguil
2018-04-06 12:24     ` [dpdk-dev] [PATCH v3 2/4] ethdev: Add tunnel encap/decap actions Declan Doherty
2018-04-06 20:26  2%   ` Adrien Mazarguil
2018-04-09 16:10  0%     ` Mohammad Abdul Awal
2018-04-10 10:19  0%       ` Adrien Mazarguil
2018-04-10 11:06  0%         ` Shahaf Shuler
2018-04-06 12:24     ` [dpdk-dev] [PATCH v3 3/4] ethdev: Add group action type to rte_flow Declan Doherty
2018-04-06 20:26  3%   ` Adrien Mazarguil
2018-04-06 13:51     [dpdk-dev] [PATCH] app/test: enhance power manager unit tests Reshma Pattan
2018-04-10 14:19  3% ` Hunt, David
2018-04-06 13:54  4% [dpdk-dev] [PATCH v4 0/4] ethdev: add per-PMD tuning of RxTx parmeters Remy Horton
2018-04-06 13:54  7% ` [dpdk-dev] [PATCH v4 1/4] ethdev: add support for PMD-tuned Tx/Rx parameters Remy Horton
2018-04-06 18:49     [dpdk-dev] [PATCH v3 01/10] net: move BPF related definitions into librte_net Konstantin Ananyev
2018-04-13 14:43  2% ` [dpdk-dev] [PATCH v4 02/10] bpf: add BPF loading and execution framework Konstantin Ananyev
2018-04-09 12:49     [dpdk-dev] [PATCH] table: fix build error with gcc 8 Jasvinder Singh
2018-04-09 15:09     ` Stephen Hemminger
2018-04-09 15:58       ` Dumitrescu, Cristian
2018-04-09 16:38  4%     ` Van Haaren, Harry
2018-04-09 16:43  0%       ` Ferruh Yigit
2018-04-09 17:05  0%         ` Dumitrescu, Cristian
2018-04-09 17:02  4%       ` Dumitrescu, Cristian
2018-04-09 17:09  0%         ` Ananyev, Konstantin
2018-04-09 17:26  0%           ` Dumitrescu, Cristian
2018-04-10 12:32  0%             ` Van Haaren, Harry
2018-04-10 11:43  0%       ` Neil Horman
2018-04-09 13:11  5% [dpdk-dev] [PATCH v1] doc: add SPDX Licence to doc files Marko Kovacevic
2018-04-13 18:30  3% [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 02/13] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 03/13] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 04/13] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 06/13] kni: replace rte_panic instances in kni Arnon Warshavsky
2018-04-13 18:30  3% ` [dpdk-dev] [PATCH v3 11/13] eal: replace rte_panic instances in ethdev Arnon Warshavsky
2018-04-13 18:30  2% ` [dpdk-dev] [PATCH v3 12/13] eal: replace rte_panic instances in init sequence Arnon Warshavsky
2018-04-16 11:22  0% ` [dpdk-dev] [PATCH v3 00/13] eal: replace calls to rte_panic and refrain from new instances Burakov, Anatoly

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).