* [dpdk-dev] [PATCH v2] ci: Add the class_id support in pci probe
From: Ziye Yang @ 2016-05-19 12:25 UTC (permalink / raw)
To: dev
This patch adds class_id (class code, subclass code,
programming interface) support to PCI device probing.
With this patch, users can flexibly probe a whole
class of devices by class_id.
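As an aside, here is a minimal sketch (not part of the patch) of how a
driver could use the new field to match a whole device class, assuming
the 24-bit (class << 16) | (subclass << 8) | progif packing introduced
here; the NVMe class code 0x010802 is used purely as an illustration:

	/* Hypothetical id table entry matching every NVMe controller
	 * (class 0x01, subclass 0x08, prog IF 0x02 => 0x010802),
	 * regardless of vendor/device IDs. Field layout follows the
	 * struct rte_pci_id changes in this patch. */
	static const struct rte_pci_id nvme_pci_ids[] = {
		{
			.class_id = 0x010802,
			.vendor_id = PCI_ANY_ID,
			.device_id = PCI_ANY_ID,
			.subsystem_vendor_id = PCI_ANY_ID,
			.subsystem_device_id = PCI_ANY_ID,
		},
		{ .vendor_id = 0, }, /* sentinel */
	};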
Signed-off-by: Ziye Yang <ziye.yang@intel.com>
---
doc/guides/rel_notes/deprecation.rst | 6 ------
lib/librte_eal/bsdapp/eal/eal_pci.c | 5 +++++
lib/librte_eal/common/eal_common_pci.c | 3 +++
lib/librte_eal/common/include/rte_pci.h | 8 ++++++--
lib/librte_eal/linuxapp/eal/eal_pci.c | 9 +++++++++
5 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 7d94ba5..28f9c61 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -20,12 +20,6 @@ Deprecation Notices
do not need to care about the kind of devices that are being used, making it
easier to add new buses later.
-* ABI changes are planned for struct rte_pci_id, i.e., add new field ``class``.
- This new added ``class`` field can be used to probe pci device by class
- related info. This change should impact size of struct rte_pci_id and struct
- rte_pci_device. The release 16.04 does not contain these ABI changes, but
- release 16.07 will.
-
* The xstats API and rte_eth_xstats struct will be changed to allow retrieval
of values without any string copies or parsing.
No backwards compatibility is planned, as it would require code duplication
diff --git a/lib/librte_eal/bsdapp/eal/eal_pci.c b/lib/librte_eal/bsdapp/eal/eal_pci.c
index 2d16d78..7fdd6f1 100644
--- a/lib/librte_eal/bsdapp/eal/eal_pci.c
+++ b/lib/librte_eal/bsdapp/eal/eal_pci.c
@@ -278,6 +278,11 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
/* get subsystem_device id */
dev->id.subsystem_device_id = conf->pc_subdevice;
+ /* get class id */
+ dev->id.class_id = (conf->pc_class << 16) |
+ (conf->pc_subclass << 8) |
+ (conf->pc_progif);
+
/* TODO: get max_vfs */
dev->max_vfs = 0;
diff --git a/lib/librte_eal/common/eal_common_pci.c b/lib/librte_eal/common/eal_common_pci.c
index 3cae4cb..6c3117d 100644
--- a/lib/librte_eal/common/eal_common_pci.c
+++ b/lib/librte_eal/common/eal_common_pci.c
@@ -162,6 +162,9 @@ rte_eal_pci_probe_one_driver(struct rte_pci_driver *dr, struct rte_pci_device *d
if (id_table->subsystem_device_id != dev->id.subsystem_device_id &&
id_table->subsystem_device_id != PCI_ANY_ID)
continue;
+ if (id_table->class_id != dev->id.class_id &&
+ id_table->class_id != RTE_CLASS_ANY_ID)
+ continue;
struct rte_pci_addr *loc = &dev->addr;
diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h
index 8fa2712..c30adaf 100644
--- a/lib/librte_eal/common/include/rte_pci.h
+++ b/lib/librte_eal/common/include/rte_pci.h
@@ -125,6 +125,7 @@ struct rte_pci_resource {
* table of these IDs for each device that it supports.
*/
struct rte_pci_id {
+ uint32_t class_id; /**< Class ID (class, subclass, pi) or RTE_CLASS_ANY_ID. */
uint16_t vendor_id; /**< Vendor ID or PCI_ANY_ID. */
uint16_t device_id; /**< Device ID or PCI_ANY_ID. */
uint16_t subsystem_vendor_id; /**< Subsystem vendor ID or PCI_ANY_ID. */
@@ -170,6 +171,7 @@ struct rte_pci_device {
/** Any PCI device identifier (vendor, device, ...) */
#define PCI_ANY_ID (0xffff)
+#define RTE_CLASS_ANY_ID (0xffffff)
#ifdef __cplusplus
/** C++ macro used to help building up tables of device IDs */
@@ -177,14 +179,16 @@ struct rte_pci_device {
(vend), \
(dev), \
PCI_ANY_ID, \
- PCI_ANY_ID
+ PCI_ANY_ID, \
+ RTE_CLASS_ANY_ID
#else
/** Macro used to help building up tables of device IDs */
#define RTE_PCI_DEVICE(vend, dev) \
.vendor_id = (vend), \
.device_id = (dev), \
.subsystem_vendor_id = PCI_ANY_ID, \
- .subsystem_device_id = PCI_ANY_ID
+ .subsystem_device_id = PCI_ANY_ID, \
+ .class_id = RTE_CLASS_ANY_ID
#endif
struct rte_pci_driver;
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
index bdc08a0..ff255b4 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -306,6 +306,15 @@ pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
}
dev->id.subsystem_device_id = (uint16_t)tmp;
+ /* get class_id */
+ snprintf(filename, sizeof(filename), "%s/class",
+ dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ dev->id.class_id = (uint32_t)tmp & RTE_CLASS_ANY_ID;
+
/* get max_vfs */
dev->max_vfs = 0;
snprintf(filename, sizeof(filename), "%s/max_vfs", dirname);
--
1.9.3
* Re: [dpdk-dev] [PATCH 1/2] mbuf: new NSH packet type
From: Olivier Matz @ 2016-05-19 12:26 UTC (permalink / raw)
To: Jingjing Wu, helin.zhang; +Cc: dev
Hi Jingjing,
On 05/03/2016 07:51 AM, Jingjing Wu wrote:
> Signed-off-by: Jingjing Wu <jingjing.wu@intel.com>
> ---
> lib/librte_mbuf/rte_mbuf.h | 7 +++++++
> 1 file changed, 7 insertions(+)
>
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 529debb..79edae3 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -274,6 +274,13 @@ extern "C" {
> */
> #define RTE_PTYPE_L2_ETHER_LLDP 0x00000004
> /**
> + * NSH (Network Service Header) packet type.
> + *
> + * Packet format:
> + * <'ether type'=0x894F>
> + */
> +#define RTE_PTYPE_L2_ETHER_NSH 0x00000005
> +/**
> * Mask of layer 2 packet types.
> * It is used for outer packet for tunneling cases.
> */
>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
I have no objection to this patch, but it makes me think about
two things:
- we have room for 16 types for each layer; maybe we should
start to be careful about which types are supported, to
avoid running out of types in the future.
- The types supported in outer and inner have diverged. It would
have been better to have something like:
#define RTE_PTYPE_INNER_$type (RTE_PTYPE_$type << 16)
Because it would make the software using the packet types
simpler.
It's maybe a bit late now because it would break the ABI, but
this is something we could keep in mind in case we change the
ABI for another reason.
Regards,
Olivier
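For illustration, here is a hedged sketch of the macro scheme Olivier
suggests above; these RTE_PTYPE_INNER_* definitions are hypothetical
and are not how rte_mbuf.h is laid out today:

	/* Hypothetical scheme: each inner type is the outer type shifted
	 * by a fixed amount, so one classifier works for both levels. */
	#define RTE_PTYPE_INNER_L2_ETHER     (RTE_PTYPE_L2_ETHER << 16)
	#define RTE_PTYPE_INNER_L2_ETHER_NSH (RTE_PTYPE_L2_ETHER_NSH << 16)

	/* Software could then derive the inner variant generically: */
	static inline uint32_t ptype_outer_to_inner(uint32_t outer_ptype)
	{
		return outer_ptype << 16;
	}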
* Re: [dpdk-dev] [PATCH v3 00/35] mempool: rework memory allocation
From: Thomas Monjalon @ 2016-05-19 12:47 UTC (permalink / raw)
To: Olivier Matz; +Cc: dev, bruce.richardson, stephen, keith.wiles
2016-05-18 13:04, Olivier Matz:
> This series is a rework of mempool. For those who don't want to read
> all the cover letter, here is a summary:
>
> - it is not possible to allocate large mempools if there is not enough
> contiguous memory, this series solves this issue
> - introduce new APIs with less arguments: "create, populate, obj_init"
> - allow to free a mempool
> - split code in smaller functions, will ease the introduction of ext_handler
> - remove test-pmd anonymous mempool creation
> - remove most of dom0-specific mempool code
> - opens the door for an eal_memory rework: we probably don't need large
> contiguous memory areas anymore; working with pages would work.
>
> This breaks the ABI as it was indicated in the deprecation for 16.04.
> The API stays almost the same, no modification is needed in examples app
> or in test-pmd. Only kni and mellanox drivers are slightly modified.
Applied with a small change you sent me to fix the mlx build in the middle of the
patchset, and updated the removed Xen files in the MAINTAINERS file.
Thanks for the big rework!
* [dpdk-dev] [PATCH v3] ci: Add the class_id support in pci probe
From: Ziye Yang @ 2016-05-19 13:17 UTC (permalink / raw)
To: dev
This patch adds class_id (class code, subclass code,
programming interface) support to PCI device probing.
With this patch, users can flexibly probe a whole
class of devices by class_id.
Signed-off-by: Ziye Yang <ziye.yang@intel.com>
---
doc/guides/rel_notes/deprecation.rst | 6 ------
lib/librte_eal/bsdapp/eal/eal_pci.c | 5 +++++
lib/librte_eal/common/eal_common_pci.c | 3 +++
lib/librte_eal/common/include/rte_pci.h | 8 ++++++--
lib/librte_eal/linuxapp/eal/eal_pci.c | 10 ++++++++++
5 files changed, 24 insertions(+), 8 deletions(-)
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 7d94ba5..28f9c61 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -20,12 +20,6 @@ Deprecation Notices
do not need to care about the kind of devices that are being used, making it
easier to add new buses later.
-* ABI changes are planned for struct rte_pci_id, i.e., add new field ``class``.
- This new added ``class`` field can be used to probe pci device by class
- related info. This change should impact size of struct rte_pci_id and struct
- rte_pci_device. The release 16.04 does not contain these ABI changes, but
- release 16.07 will.
-
* The xstats API and rte_eth_xstats struct will be changed to allow retrieval
of values without any string copies or parsing.
No backwards compatibility is planned, as it would require code duplication
diff --git a/lib/librte_eal/bsdapp/eal/eal_pci.c b/lib/librte_eal/bsdapp/eal/eal_pci.c
index 2d16d78..7fdd6f1 100644
--- a/lib/librte_eal/bsdapp/eal/eal_pci.c
+++ b/lib/librte_eal/bsdapp/eal/eal_pci.c
@@ -278,6 +278,11 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
/* get subsystem_device id */
dev->id.subsystem_device_id = conf->pc_subdevice;
+ /* get class id */
+ dev->id.class_id = (conf->pc_class << 16) |
+ (conf->pc_subclass << 8) |
+ (conf->pc_progif);
+
/* TODO: get max_vfs */
dev->max_vfs = 0;
diff --git a/lib/librte_eal/common/eal_common_pci.c b/lib/librte_eal/common/eal_common_pci.c
index 3cae4cb..6c3117d 100644
--- a/lib/librte_eal/common/eal_common_pci.c
+++ b/lib/librte_eal/common/eal_common_pci.c
@@ -162,6 +162,9 @@ rte_eal_pci_probe_one_driver(struct rte_pci_driver *dr, struct rte_pci_device *d
if (id_table->subsystem_device_id != dev->id.subsystem_device_id &&
id_table->subsystem_device_id != PCI_ANY_ID)
continue;
+ if (id_table->class_id != dev->id.class_id &&
+ id_table->class_id != RTE_CLASS_ANY_ID)
+ continue;
struct rte_pci_addr *loc = &dev->addr;
diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h
index 8fa2712..c30adaf 100644
--- a/lib/librte_eal/common/include/rte_pci.h
+++ b/lib/librte_eal/common/include/rte_pci.h
@@ -125,6 +125,7 @@ struct rte_pci_resource {
* table of these IDs for each device that it supports.
*/
struct rte_pci_id {
+ uint32_t class_id; /**< Class ID (class, subclass, pi) or RTE_CLASS_ANY_ID. */
uint16_t vendor_id; /**< Vendor ID or PCI_ANY_ID. */
uint16_t device_id; /**< Device ID or PCI_ANY_ID. */
uint16_t subsystem_vendor_id; /**< Subsystem vendor ID or PCI_ANY_ID. */
@@ -170,6 +171,7 @@ struct rte_pci_device {
/** Any PCI device identifier (vendor, device, ...) */
#define PCI_ANY_ID (0xffff)
+#define RTE_CLASS_ANY_ID (0xffffff)
#ifdef __cplusplus
/** C++ macro used to help building up tables of device IDs */
@@ -177,14 +179,16 @@ struct rte_pci_device {
(vend), \
(dev), \
PCI_ANY_ID, \
- PCI_ANY_ID
+ PCI_ANY_ID, \
+ RTE_CLASS_ANY_ID
#else
/** Macro used to help building up tables of device IDs */
#define RTE_PCI_DEVICE(vend, dev) \
.vendor_id = (vend), \
.device_id = (dev), \
.subsystem_vendor_id = PCI_ANY_ID, \
- .subsystem_device_id = PCI_ANY_ID
+ .subsystem_device_id = PCI_ANY_ID, \
+ .class_id = RTE_CLASS_ANY_ID
#endif
struct rte_pci_driver;
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
index bdc08a0..e6f0f13 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -306,6 +306,16 @@ pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
}
dev->id.subsystem_device_id = (uint16_t)tmp;
+ /* get class_id */
+ snprintf(filename, sizeof(filename), "%s/class",
+ dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ /* the least significant 24 bits are valid: class, subclass, program interface */
+ dev->id.class_id = (uint32_t)tmp & RTE_CLASS_ANY_ID;
+
/* get max_vfs */
dev->max_vfs = 0;
snprintf(filename, sizeof(filename), "%s/max_vfs", dirname);
--
1.9.3
* Re: [dpdk-dev] [PATCH] mbuf: make rearm_data address naturally aligned
From: Jerin Jacob @ 2016-05-19 13:35 UTC (permalink / raw)
To: Ananyev, Konstantin
Cc: Richardson, Bruce, dev, thomas.monjalon, viktorin, jianbo.liu
On Thu, May 19, 2016 at 12:18:57PM +0000, Ananyev, Konstantin wrote:
>
> Hi everyone,
>
> > On Thu, May 19, 2016 at 12:20:16AM +0530, Jerin Jacob wrote:
> > > On Wed, May 18, 2016 at 05:43:00PM +0100, Bruce Richardson wrote:
> > > > On Wed, May 18, 2016 at 07:27:43PM +0530, Jerin Jacob wrote:
> > > > > To avoid multiple stores on fast path, Ethernet drivers
> > > > > aggregate the writes to data_off, refcnt, nb_segs and port
> > > > > to an uint64_t data and write the data in one shot
> > > > > with uint64_t* at &mbuf->rearm_data address.
> > > > >
> > > > > Some of the non-IA platforms have store operation overhead
> > > > > if the store address is not naturally aligned. This patch
> > > > > fixes the performance issue on those targets.
> > > > >
> > > > > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > > > ---
> > > > >
> > > > > Tested this patch on IA and non-IA(ThunderX) platforms.
> > > > > This patch shows 400Kpps/core improvement on ThunderX + ixgbe + vector environment.
> > > > > and this patch does not have any overhead on IA platform.
> > > > >
> > > > > Have tried another similar approach by replacing "buf_len" with "pad"
> > > > > (in this patch context).
> > > > > Since it has the additional overhead of a read and then a mask to keep "buf_len" intact,
> > > > > not much improvement was seen.
> > > > > ref: http://dpdk.org/ml/archives/dev/2016-May/038914.html
> > > > >
> > > > > ---
> > > > While this will work and from your tests doesn't seem to have a performance
> > > > impact, I'm not sure I particularly like it. It's extending out the end of
> > > > cacheline0 of the mbuf by 16 bytes, though I suppose it's not technically using
> > > > up any more space of it.
> > >
> > > Extending by 2 bytes, right? Yes, I guess. Now we are using only 56 out of 64 bytes
> > > in the first 64-byte cache line.
> > >
> > > >
> > > > What I'm wondering about though, is do we have any usecases where we need a
> > > > variable buf_len for packets for RX. These mbufs come directly from a mempool,
> > > > which is generally understood to be a set of fixed-sized buffers. I realise that
> > > > this change was made in the past after some discussion, but one of the key points
> > > > there [at least to my reading] was that - even though nobody actually made a
> > > > concrete case where they had variable-sized buffers - having support for them
> > > > made no performance difference.
>
> I was going to point to vhost zcp support, but as Thomas pointed out
> that functionality was removed from dpdk.org recently.
> So I am not aware whether such a case exists right now in the 'real world' or not.
> Though I still think the RX function should leave the buf_len field intact.
>
> > > >
> > > > The latter part of that has now changed, and supporting variable-sized mbufs
> > > > from an mbuf pool has a perf impact. Do we definitely need that functionality,
> > > > because the easiest fix here is just to move the rxrearm marker back above
> > > > mbuf_len as it was originally in releases like 1.8?
> > >
> > > And initialize the buf_len with mp->elt_size - sizeof(struct rte_mbuf).
> > > Right?
> > >
> > > I don't have a strong opinion on this, I can do this if there is no
> > > objection on this. Let me know.
> > >
> > > However, I do see that in future "buf_len" may belong at the end of the first
> > > 64-byte cache line, as "port" is currently defined as uint8_t, which IMO is too small.
> > > We may need to increase that to uint16_t. The reason I think that is
> > > because ThunderX HW currently has 128 VFs per socket for the
> > > built-in NIC, so a two-node configuration plus one external PCIe NW card
> > > configuration can easily go beyond 256 ports.
>
> I wonder, does anyone really use the mbuf port field?
> My thought was - could we drop it completely?
> Actually, after discussing it with Bruce offline, an interesting idea came out:
> if we drop port and make mbuf_prefree() reset nb_segs=1, then
> we can reduce RX rearm_data to 4B. So with that layout:
>
> struct rte_mbuf {
>
> MARKER cacheline0;
>
> void *buf_addr;
> phys_addr_t buf_physaddr;
> uint16_t buf_len;
> uint8_t nb_segs;
> uint8_t reserved_1byte; /* former port */
>
> MARKER32 rearm_data;
> uint16_t data_off;
> uint16_t refcnt;
>
> uint64_t ol_flags;
> ...
>
> We can keep buf_len at its place and avoid 2B gap, while making rearm_data
> 4B long and 4B aligned.
A couple of comments:
- IMO, it is good if nb_segs can move under rearm_data, as some
drivers (maybe not ixgbe) can then write nb_segs in one shot, also
in the segmented RX handler case.
- I think it makes sense to keep port in the mbuf so that the application
can make use of it (not sure what real application developers think of
this).
- If writing 4B and 8B consumes the same cycles (at least on arm64), then I think it
makes sense to make it 8B wide, with as many pre-built constants as possible.
>
> Another similar alternative is to make mbuf_prefree() set refcnt=1
> (as it updates it anyway). Then we can remove refcnt from the RX rearm_data,
> and again make rearm_data 4B long and 4B aligned:
>
> struct rte_mbuf {
>
> MARKER cacheline0;
>
> void *buf_addr;
> phys_addr_t buf_physaddr;
> uint16_t buf_len;
> uint16_t refcnt;
>
> MARKER32 rearm_data;
> uint16_t data_off;
> uint8_t nb_segs;
> uint8_t port;
The only problem I see with this approach is that the port data type cannot be
extended to uint16_t in the future.
>
> uint64_t ol_flags;
> ..
>
> As an additional plus, __rte_mbuf_raw_alloc() wouldn't need to modify mbuf contents at all -
> which is probably a good thing.
> As a drawback, we'll have free mbufs in the pool with refcnt==1, which probably reduces
> the debuggability of the mbuf code.
>
> Konstantin
>
> > >
> > Ok, good point. If you think it's needed, and if we are changing the mbuf
> > structure, it might be a good time to extend that field while you are at it, save
> > a second ABI break later on.
> >
> > /Bruce
> >
> > > >
> > > > Regards,
> > > > /Bruce
> > > >
> > > > Ref: http://dpdk.org/ml/archives/dev/2014-December/009432.html
> > > >
* [dpdk-dev] mempool: external mempool manager
From: David Hunt @ 2016-05-19 13:44 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, yuanhan.liu, pmatilai
Here's the latest version of the External Mempool Manager patchset.
It's rebased on top of the latest head as of 19/5/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments and duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies having one header file (rte_mempool.h); otherwise it
would generate cross-dependency issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file as the standard mempool tests,
avoiding code duplication
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degradation on sp cached operation
* removed stack handler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The External Mempool Manager is an extension to the mempool API that allows
users to add and use an external mempool manager, which allows external memory
subsystems such as external hardware memory management systems and software
based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool manager will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing. These changes are all contained within
RTE_NEXT_ABI defs, and the current or next code can be selected with
the CONFIG_RTE_NEXT_ABI config setting
There are two aspects to external mempool manager.
1. Adding the code for your new mempool handler. This is achieved by adding a
new mempool handler source file into the librte_mempool library, and
using the REGISTER_MEMPOOL_HANDLER macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_handler to create a new mempool
using the name parameter to identify which handler to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_handler() which sets the mempool's handler
3. The rte_mempool_populate_default() and rte_mempool_populate_anon() functions,
which populate the mempool using the relevant handler
Several external mempool managers may be used in the same application. A new
mempool can then be created by using the new 'create' function, providing the
mempool handler name to point the mempool to the relevant mempool manager
callback structure.
The old 'create' function can still be called by legacy programs, and will
internally work out the mempool handle based on the flags provided (single
producer, single consumer, etc). By default handles are created internally to
implement the built-in DPDK mempool manager and mempool types.
The external mempool manager needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. put - puts an object back into the mempool once an application has
finished with it
3. get - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time a get/put/get_count is called from the application/PMD, the
callback for that mempool is called. These functions are in the fastpath,
and any unoptimised handlers may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_handler()
int
rte_mempool_set_handler(struct rte_mempool *mp, const char *name);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
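To make the flow above concrete, here is a minimal usage sketch (error
handling trimmed; "my_handler" is a placeholder for a handler
registered as shown further below, and the sizes are arbitrary):

	struct rte_mempool *mp;

	/* 1. Create an empty mempool: 8192 objects of 2048 bytes,
	 *    a 256-object per-lcore cache, no private data. */
	mp = rte_mempool_create_empty("example_pool", 8192, 2048,
				      256, 0, SOCKET_ID_ANY, 0);
	if (mp == NULL)
		rte_exit(EXIT_FAILURE, "cannot create empty mempool\n");

	/* 2. Attach the external handler by name. */
	if (rte_mempool_set_handler(mp, "my_handler") < 0)
		rte_exit(EXIT_FAILURE, "cannot set mempool handler\n");

	/* 3. Populate the mempool through that handler. */
	if (rte_mempool_populate_default(mp) < 0)
		rte_exit(EXIT_FAILURE, "cannot populate mempool\n");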
The important thing to note is that the mempool handler is passed by name
to rte_mempool_set_handler, which looks through the handler array to
get the handler index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the handler index.
The mempool handler structure contains callbacks to the implementation of
the handler, and is set up for registration as follows:
static const struct rte_mempool_handler handler_sp_mc = {
.name = "ring_sp_mc",
.alloc = rte_mempool_common_ring_alloc,
.put = common_ring_sp_put,
.get = common_ring_mc_get,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the handler in the array of handlers:
REGISTER_MEMPOOL_HANDLER(handler_sp_mc);
For an example of a simple malloc-based mempool manager, see
lib/librte_mempool/custom_mempool.c
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool manager using simple mallocs
for each mempool object. This file also contains the callbacks and self
registration for the new handler.
David Hunt (2):
mempool: support external handler
mbuf: get default mempool handler from configuration
Olivier Matz (1):
app/test: test external mempool handler
* [dpdk-dev] [PATCH v2] doc: announce ABI change of struct rte_port_source_params and rte_port_sink_params
From: Fan Zhang @ 2016-05-19 14:18 UTC (permalink / raw)
To: dev
ABI changes are planned for rte_port_source_params and
rte_port_sink_params, to take effect from release 16.11. This notice
announces those ABI changes in detail.
Signed-off-by: Fan Zhang <roy.fan.zhang@intel.com>
Acked-by: Cristian Dumitrescu <cristian.dumitrescu@intel.com>
---
doc/guides/rel_notes/deprecation.rst | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index fffe9c7..4f3fefe 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -74,3 +74,11 @@ Deprecation Notices
a handle, like the way kernel exposes an fd to user for locating a
specific file, and to keep all major structures internally, so that
we are likely to be free from ABI violations in future.
+
+* ABI will change for rte_port_source_params struct. The member file_name
+ data type will be changed from char * to const char *. This change targets
+ release 16.11
+
+* ABI will change for rte_port_sink_params struct. The member file_name
+ data type will be changed from char * to const char *. This change targets
+ release 16.11
--
2.5.5
* Re: [dpdk-dev] [PATCH v3 00/35] mempool: rework memory allocation
From: Panu Matilainen @ 2016-05-20 8:42 UTC (permalink / raw)
To: Thomas Monjalon, Olivier Matz; +Cc: dev, bruce.richardson, stephen, keith.wiles
On 05/19/2016 03:47 PM, Thomas Monjalon wrote:
> 2016-05-18 13:04, Olivier Matz:
>> This series is a rework of mempool. For those who don't want to read
>> all the cover letter, here is a sumary:
>>
>> - it is not possible to allocate large mempools if there is not enough
>> contiguous memory, this series solves this issue
>> - introduce new APIs with less arguments: "create, populate, obj_init"
>> - allow to free a mempool
>> - split code in smaller functions, will ease the introduction of ext_handler
>> - remove test-pmd anonymous mempool creation
>> - remove most of dom0-specific mempool code
>> - opens the door for a eal_memory rework: we probably don't need large
>> contiguous memory area anymore, working with pages would work.
>>
>> This breaks the ABI as it was indicated in the deprecation for 16.04.
>> The API stays almost the same, no modification is needed in examples app
>> or in test-pmd. Only kni and mellanox drivers are slightly modified.
>
> Applied with a small change you sent me to fix mlx build in the middle of the patchset
> and update the removed Xen files in MAINTAINERS file.
>
> Thanks for the big rework!
>
Just noticed this series "breaks" --no-huge as a regular user, commit
593a084afc2b to be exact:
mmap(NULL, 4194304, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS|MAP_LOCKED, 0, 0) = -1 EAGAIN (Resource
temporarily unavailable)
write(1, "EAL: rte_eal_hugepage_init: mmap"..., 76EAL:
rte_eal_hugepage_init: mmap() failed: Resource temporarily unavailable
"Breaks" in quotes because I guess it always was broken (as the
non-locked pages might not be in physical memory) and because it's
possible to adjust resource limits to allow the operation to succeed.
If you're root, that is.
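For reference, the limit in question is RLIMIT_MEMLOCK (shown by
`ulimit -l`); a minimal sketch to inspect it programmatically:

	#include <stdio.h>
	#include <sys/resource.h>

	/* Print the locked-memory limit that makes a MAP_LOCKED mmap()
	 * fail with EAGAIN for unprivileged users. */
	int main(void)
	{
		struct rlimit rl;

		if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0)
			return 1;
		printf("RLIMIT_MEMLOCK: cur=%llu max=%llu\n",
		       (unsigned long long)rl.rlim_cur,
		       (unsigned long long)rl.rlim_max);
		return 0;
	}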
I was just looking into making the test-suite runnable by a regular user
with no special privileges, primarily to make it possible to run the
testsuite as part of rpm package builds (in %check), and no special
setup or extra privileges can be assumed there. Such tests are of course
of limited coverage but still better than nothing, and --no-huge was my
ticket there. Talk about bad timing :)
It'd be fine to have a limited subset of tests to run when non-privileged,
but since this one lives inside rte_eal_init() it practically prevents
everything, unless I'm missing some other magic switch or such. Thoughts?
- Panu -
* [dpdk-dev] [PATCH v1] doc: fix code section in abi versioning doc
From: John McNamara @ 2016-05-20 13:51 UTC (permalink / raw)
To: dev; +Cc: John McNamara
Fix broken console directive in the ABI validator section of the
ABI versioning docs.
Signed-off-by: John McNamara <john.mcnamara@intel.com>
---
doc/guides/contributing/versioning.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/doc/guides/contributing/versioning.rst b/doc/guides/contributing/versioning.rst
index ae10a98..92b4d7c 100644
--- a/doc/guides/contributing/versioning.rst
+++ b/doc/guides/contributing/versioning.rst
@@ -475,7 +475,7 @@ Where ``REV1`` and ``REV2`` are valid gitrevisions(7)
https://www.kernel.org/pub/software/scm/git/docs/gitrevisions.html
on the local repo and target is the usual DPDK compilation target.
-For example:
+For example::
# Check between the previous and latest commit:
./scripts/validate-abi.sh HEAD~1 HEAD x86_64-native-linuxapp-gcc
--
2.5.0
* [dpdk-dev] [PATCH v2] doc: fix code section in abi versioning doc
From: John McNamara @ 2016-05-20 14:08 UTC (permalink / raw)
To: dev; +Cc: John McNamara
Fix broken console directive in the ABI validator section of the
ABI versioning docs.
Fixes: f1ef9794f9bd ("doc: add ABI guidelines")
Signed-off-by: John McNamara <john.mcnamara@intel.com>
---
v2: Added fixline.
doc/guides/contributing/versioning.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/doc/guides/contributing/versioning.rst b/doc/guides/contributing/versioning.rst
index ae10a98..92b4d7c 100644
--- a/doc/guides/contributing/versioning.rst
+++ b/doc/guides/contributing/versioning.rst
@@ -475,7 +475,7 @@ Where ``REV1`` and ``REV2`` are valid gitrevisions(7)
https://www.kernel.org/pub/software/scm/git/docs/gitrevisions.html
on the local repo and target is the usual DPDK compilation target.
-For example:
+For example::
# Check between the previous and latest commit:
./scripts/validate-abi.sh HEAD~1 HEAD x86_64-native-linuxapp-gcc
--
2.5.0
* [dpdk-dev] [PATCH v4] i40e: configure MTU
From: Beilei Xing @ 2016-05-20 15:17 UTC (permalink / raw)
To: jingjing.wu; +Cc: dev, Beilei Xing
This patch enables configuring the MTU for i40e.
Since changing the MTU requires reconfiguring the queues, the port must
be stopped before the MTU is configured.
Signed-off-by: Beilei Xing <beilei.xing@intel.com>
---
v4 changes:
Revert the v2 change: if the port is running, return -EBUSY.
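An application-side sketch of the resulting flow (illustrative only;
with this patch, calling rte_eth_dev_set_mtu() on a running i40e port
returns -EBUSY):

	/* Illustrative only: stop the port before changing the MTU. */
	uint8_t port_id = 0;   /* example port */
	int ret;

	rte_eth_dev_stop(port_id);

	ret = rte_eth_dev_set_mtu(port_id, 9000);  /* example jumbo MTU */
	if (ret != 0)
		printf("cannot set MTU on port %u: %d\n", port_id, ret);

	ret = rte_eth_dev_start(port_id);
	if (ret != 0)
		printf("cannot restart port %u: %d\n", port_id, ret);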
doc/guides/rel_notes/release_16_07.rst | 3 +++
drivers/net/i40e/i40e_ethdev.c | 34 ++++++++++++++++++++++++++++++++++
lib/librte_ether/rte_ethdev.h | 1 +
3 files changed, 38 insertions(+)
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 30e78d4..4b1c176 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -116,6 +116,9 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* The function ``rte_eth_dev_set_mtu`` adds a new return value ``-EBUSY``, which
+ indicates the operation is forbidden because the port is running.
+
ABI Changes
-----------
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 24777d5..ffccaae 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -447,6 +447,8 @@ static int i40e_get_eeprom(struct rte_eth_dev *dev,
static void i40e_set_default_mac_addr(struct rte_eth_dev *dev,
struct ether_addr *mac_addr);
+static int i40e_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu);
+
static const struct rte_pci_id pci_id_i40e_map[] = {
#define RTE_PCI_DEV_ID_DECL_I40E(vend, dev) {RTE_PCI_DEVICE(vend, dev)},
#include "rte_pci_dev_ids.h"
@@ -520,6 +522,7 @@ static const struct eth_dev_ops i40e_eth_dev_ops = {
.get_eeprom_length = i40e_get_eeprom_length,
.get_eeprom = i40e_get_eeprom,
.mac_addr_set = i40e_set_default_mac_addr,
+ .mtu_set = i40e_dev_mtu_set,
};
/* store statistics names and its offset in stats structure */
@@ -9108,3 +9111,34 @@ static void i40e_set_default_mac_addr(struct rte_eth_dev *dev,
/* Flags: 0x3 updates port address */
i40e_aq_mac_address_write(hw, 0x3, mac_addr->addr_bytes, NULL);
}
+
+static int
+i40e_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+ struct i40e_pf *pf = I40E_DEV_PRIVATE_TO_PF(dev->data->dev_private);
+ struct rte_eth_dev_data *dev_data = pf->dev_data;
+ uint32_t frame_size = mtu + ETHER_HDR_LEN
+ + ETHER_CRC_LEN + I40E_VLAN_TAG_SIZE;
+ int ret = 0;
+
+ /* check if mtu is within the allowed range */
+ if ((mtu < ETHER_MIN_MTU) || (frame_size > I40E_FRAME_SIZE_MAX))
+ return -EINVAL;
+
+ /* mtu setting is forbidden if port is started */
+ if (dev_data->dev_started) {
+ PMD_DRV_LOG(ERR,
+ "port %d must be stopped before configuration\n",
+ dev_data->port_id);
+ return -EBUSY;
+ }
+
+ if (frame_size > ETHER_MAX_LEN)
+ dev_data->dev_conf.rxmode.jumbo_frame = 1;
+ else
+ dev_data->dev_conf.rxmode.jumbo_frame = 0;
+
+ dev_data->dev_conf.rxmode.max_rx_pkt_len = frame_size;
+
+ return ret;
+}
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 2757510..a8d9963 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -2398,6 +2398,7 @@ int rte_eth_dev_get_mtu(uint8_t port_id, uint16_t *mtu);
* - (-ENOTSUP) if operation is not supported.
* - (-ENODEV) if *port_id* invalid.
* - (-EINVAL) if *mtu* invalid.
+ * - (-EBUSY) if operation is not allowed when the port is running
*/
int rte_eth_dev_set_mtu(uint8_t port_id, uint16_t mtu);
--
2.5.0
* Re: [dpdk-dev] Suggestions for the dpdk stable tree
From: Yuanhan Liu @ 2016-05-23 2:21 UTC (permalink / raw)
To: Mcnamara, John
Cc: Christian Ehrhardt, dev, Stephen Hemminger, Thomas Monjalon
On Fri, May 20, 2016 at 02:49:31PM +0000, Mcnamara, John wrote:
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Christian Ehrhardt
> > Sent: Friday, May 20, 2016 9:07 AM
> > To: dev <dev@dpdk.org>; Stephen Hemminger <stephen@networkplumber.org>
> > Subject: Re: [dpdk-dev] Suggestions for the dpdk stable tree
> >
> > Hi,
> > I guess over time/releases fewer people mind the 2.2-stable.
> > But I still see a lot of people referring to 2.2 - so why not give this
> > thread a ping again.
> >
> > ack / nack / opinions ?
>
> Hi Christian,
>
> We are interested in having a LTS/Stable tree.
I didn't notice this thread, otherwise I would have commented earlier:
TBH, I had also thought of an LTS tree a few months back. But I was thinking,
hmm, it's just a library, what's the big deal of maintaining a stable
tree for it. I then hid it deep inside my mind, silently.
> We have been looking at identifying a maintainer and validation engineer internally to support the effort but haven't been able to finalize that. Once we do, we will come back to the mailing list with a proposal and a request for comments.
I would nominate myself as the LTS tree maintainer, if it makes sense
to have one.
> We would probably be looking at 16.04 or even 16.07 as the basis for the LTS at this stage.
Just one opinion from the view of vhost: since 16.07 is a vhost ABI/API
refactoring release, I'd suggest basing on 16.07; then we would
have fewer conflicts when applying later bug-fix patches.
However, I'm very open to choose any others as the base, say, even v2.2.
--yliu
> It would be great if we could get support from you or others as well.
>
> John.
> --
>
* [dpdk-dev] [PATCH] mbuf: new flag when Vlan is stripped
From: Olivier Matz @ 2016-05-23 8:46 UTC (permalink / raw)
To: dev
Cc: johndale, konstantin.ananyev, helin.zhang, adrien.mazarguil,
rahul.lakkireddy, alejandro.lucero, sony.chacko
The behavior of PKT_RX_VLAN_PKT was not very well defined, resulting in
PMDs not advertising the same flags in similar conditions.
Following discussion in [1], introduce 2 new flags PKT_RX_VLAN_STRIPPED
and PKT_RX_QINQ_STRIPPED that are better defined:
PKT_RX_VLAN_STRIPPED: a vlan has been stripped by the hardware and its
tci is saved in mbuf->vlan_tci. This can only happen if vlan stripping
is enabled in the RX configuration of the PMD.
For now, the old flag PKT_RX_VLAN_PKT is kept but marked as deprecated.
It should be removed from applications and PMDs in a future revision.
This patch also updates the drivers. For PKT_RX_VLAN_PKT:
- e1000, enic, i40e, mlx5, nfp, vmxnet3: done, PKT_RX_VLAN_PKT already
had the same meaning as PKT_RX_VLAN_STRIPPED; only a minor update is
required.
- fm10k: done, PKT_RX_VLAN_PKT already had the same meaning as
PKT_RX_VLAN_STRIPPED, and vlan stripping is always enabled on fm10k.
- ixgbe: modification done for standard mode (vector does not support
vlan stripping)
- the other drivers do not support vlan stripping.
For PKT_RX_QINQ_PKT, it was only supported on i40e, and the meaning was
already correct, so we can reuse the same value for PKT_RX_QINQ_STRIPPED.
[1] http://dpdk.org/ml/archives/dev/2016-April/037837.html,
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
RFC -> v1:
- fix checkpatch and check-git-log.sh issues
- add a deprecation notice for the old vlan flags
- rebase on head
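For illustration, an application-side sketch of the new flag semantics
(process_vlan() and process_outer_vlan() are hypothetical helpers):

	/* vlan_tci is only meaningful when the PMD reports that it
	 * actually stripped the tag; same for vlan_tci_outer with QinQ. */
	if (mb->ol_flags & PKT_RX_VLAN_STRIPPED)
		process_vlan(mb->vlan_tci);

	/* PKT_RX_QINQ_STRIPPED implies PKT_RX_VLAN_STRIPPED is also set. */
	if (mb->ol_flags & PKT_RX_QINQ_STRIPPED)
		process_outer_vlan(mb->vlan_tci_outer);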
app/test-pmd/rxonly.c | 4 +--
doc/guides/rel_notes/deprecation.rst | 5 ++++
drivers/net/e1000/em_rxtx.c | 3 ++-
drivers/net/e1000/igb_rxtx.c | 3 ++-
drivers/net/enic/enic_rx.c | 2 +-
drivers/net/i40e/i40e_rxtx.c | 2 +-
drivers/net/ixgbe/ixgbe_ethdev.c | 7 +++++
drivers/net/ixgbe/ixgbe_rxtx.c | 21 +++++++++++----
drivers/net/ixgbe/ixgbe_rxtx.h | 1 +
drivers/net/mlx5/mlx5_rxtx.c | 6 +++--
drivers/net/nfp/nfp_net.c | 2 +-
drivers/net/vmxnet3/vmxnet3_rxtx.c | 2 +-
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 50 ++++++++++++++++++++++++++++++++----
14 files changed, 90 insertions(+), 20 deletions(-)
diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
index 14555ab..c69b344 100644
--- a/app/test-pmd/rxonly.c
+++ b/app/test-pmd/rxonly.c
@@ -156,9 +156,9 @@ pkt_burst_receive(struct fwd_stream *fs)
printf("hash=0x%x ID=0x%x ",
mb->hash.fdir.hash, mb->hash.fdir.id);
}
- if (ol_flags & PKT_RX_VLAN_PKT)
+ if (ol_flags & PKT_RX_VLAN_STRIPPED)
printf(" - VLAN tci=0x%x", mb->vlan_tci);
- if (ol_flags & PKT_RX_QINQ_PKT)
+ if (ol_flags & PKT_RX_QINQ_STRIPPED)
printf(" - QinQ VLAN tci=0x%x, VLAN tci outer=0x%x",
mb->vlan_tci, mb->vlan_tci_outer);
if (mb->packet_type) {
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ad05eba..2233a90 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -57,3 +57,8 @@ Deprecation Notices
a handle, like the way kernel exposes an fd to user for locating a
specific file, and to keep all major structures internally, so that
we are likely to be free from ABI violations in future.
+
+* The mbuf flags PKT_RX_VLAN_PKT and PKT_RX_QINQ_PKT are deprecated and
+ are respectively replaced by PKT_RX_VLAN_STRIPPED and
+ PKT_RX_QINQ_STRIPPED, that are better described. The old flags and
+ their behavior will be kept in 16.07 and will be removed in 16.11.
diff --git a/drivers/net/e1000/em_rxtx.c b/drivers/net/e1000/em_rxtx.c
index 3d36f21..6d8750a 100644
--- a/drivers/net/e1000/em_rxtx.c
+++ b/drivers/net/e1000/em_rxtx.c
@@ -629,7 +629,8 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
uint64_t pkt_flags;
/* Check if VLAN present */
- pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0);
+ pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
+ PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
return pkt_flags;
}
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 18aeead..9d80a0b 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -729,7 +729,8 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
uint64_t pkt_flags;
/* Check if VLAN present */
- pkt_flags = (rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
+ pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
+ PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
#if defined(RTE_LIBRTE_IEEE1588)
if (rx_status & E1000_RXD_STAT_TMST)
diff --git a/drivers/net/enic/enic_rx.c b/drivers/net/enic/enic_rx.c
index f92f6bc..6459e97 100644
--- a/drivers/net/enic/enic_rx.c
+++ b/drivers/net/enic/enic_rx.c
@@ -197,7 +197,7 @@ enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
/* VLAN stripping */
if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
- pkt_flags |= PKT_RX_VLAN_PKT;
+ pkt_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
mbuf->vlan_tci = enic_cq_rx_desc_vlan(cqrd);
} else {
mbuf->vlan_tci = 0;
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index c833aa3..aa161a9 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -99,7 +99,7 @@ i40e_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union i40e_rx_desc *rxdp)
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
if (rte_le_to_cpu_16(rxdp->wb.qword2.ext_status) &
(1 << I40E_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) {
- mb->ol_flags |= PKT_RX_QINQ_PKT;
+ mb->ol_flags |= PKT_RX_QINQ_STRIPPED;
mb->vlan_tci_outer = mb->vlan_tci;
mb->vlan_tci = rte_le_to_cpu_16(rxdp->wb.qword2.l2tag2_2);
PMD_RX_LOG(DEBUG, "Descriptor l2tag2_1: %u, l2tag2_2: %u",
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index a2b170b..e7717e3 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -1636,6 +1636,7 @@ ixgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev, uint16_t queue, bool on)
{
struct ixgbe_hwstrip *hwstrip =
IXGBE_DEV_PRIVATE_TO_HWSTRIP_BITMAP(dev->data->dev_private);
+ struct ixgbe_rx_queue *rxq;
if (queue >= IXGBE_MAX_RX_QUEUE_NUM)
return;
@@ -1644,6 +1645,12 @@ ixgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev, uint16_t queue, bool on)
IXGBE_SET_HWSTRIP(hwstrip, queue);
else
IXGBE_CLEAR_HWSTRIP(hwstrip, queue);
+
+ if (queue >= dev->data->nb_rx_queues)
+ return;
+
+ rxq = dev->data->rx_queues[queue];
+ rxq->vlan_strip = on;
}
static void
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 9c6eaf2..3d740df 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -1221,16 +1221,23 @@ ixgbe_rxd_pkt_info_to_pkt_flags(uint16_t pkt_info)
}
static inline uint64_t
-rx_desc_status_to_pkt_flags(uint32_t rx_status)
+rx_desc_status_to_pkt_flags(uint32_t rx_status, uint8_t vlan_strip)
{
uint64_t pkt_flags;
+ uint64_t vlan_flags;
+
+ /* if vlan is stripped, set the proper flag */
+ if (vlan_strip)
+ vlan_flags = PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
+ else
+ vlan_flags = PKT_RX_VLAN_PKT;
/*
* Check if VLAN present only.
* Do not check whether L3/L4 rx checksum done by NIC or not,
* That can be found from rte_eth_rxmode.hw_ip_checksum flag
*/
- pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
+ pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? vlan_flags : 0;
#ifdef RTE_LIBRTE_IEEE1588
if (rx_status & IXGBE_RXD_STAT_TMST)
@@ -1287,6 +1294,7 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
uint32_t pkt_info[LOOK_AHEAD];
int i, j, nb_rx = 0;
uint32_t status;
+ uint8_t vlan_strip = rxq->vlan_strip;
/* get references to current descriptor and S/W ring entry */
rxdp = &rxq->rx_ring[rxq->rx_tail];
@@ -1328,7 +1336,8 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
mb->vlan_tci = rte_le_to_cpu_16(rxdp[j].wb.upper.vlan);
/* convert descriptor fields to rte mbuf flags */
- pkt_flags = rx_desc_status_to_pkt_flags(s[j]);
+ pkt_flags = rx_desc_status_to_pkt_flags(s[j],
+ vlan_strip);
pkt_flags |= rx_desc_error_to_pkt_flags(s[j]);
pkt_flags |= ixgbe_rxd_pkt_info_to_pkt_flags
((uint16_t)pkt_info[j]);
@@ -1544,6 +1553,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_rx;
uint16_t nb_hold;
uint64_t pkt_flags;
+ uint8_t vlan_strip;
nb_rx = 0;
nb_hold = 0;
@@ -1551,6 +1561,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
rx_id = rxq->rx_tail;
rx_ring = rxq->rx_ring;
sw_ring = rxq->sw_ring;
+ vlan_strip = rxq->vlan_strip;
while (nb_rx < nb_pkts) {
/*
* The order of operations here is important as the DD status
@@ -1660,7 +1671,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
/* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
- pkt_flags = rx_desc_status_to_pkt_flags(staterr);
+ pkt_flags = rx_desc_status_to_pkt_flags(staterr, vlan_strip);
pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
pkt_flags = pkt_flags |
ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
@@ -1753,7 +1764,7 @@ ixgbe_fill_cluster_head_buf(
*/
head->vlan_tci = rte_le_to_cpu_16(desc->wb.upper.vlan);
pkt_info = rte_le_to_cpu_32(desc->wb.lower.lo_dword.data);
- pkt_flags = rx_desc_status_to_pkt_flags(staterr);
+ pkt_flags = rx_desc_status_to_pkt_flags(staterr, rxq->vlan_strip);
pkt_flags |= rx_desc_error_to_pkt_flags(staterr);
pkt_flags |= ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
head->ol_flags = pkt_flags;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
index 3691a19..9ca0e8b 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.h
+++ b/drivers/net/ixgbe/ixgbe_rxtx.h
@@ -146,6 +146,7 @@ struct ixgbe_rx_queue {
uint8_t crc_len; /**< 0 if CRC stripped, 4 otherwise. */
uint8_t drop_en; /**< If not 0, set SRRCTL.Drop_En. */
uint8_t rx_deferred_start; /**< not in global dev start. */
+ uint8_t vlan_strip; /**< 1 if vlan stripping enabled. */
/** need to alloc dummy mbuf, for wraparound when scanning hw ring */
struct rte_mbuf fake_mbuf;
/** hold packets to return to application */
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 13c8d71..ac96fc9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1051,7 +1051,8 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
+ pkt_buf->ol_flags |= PKT_RX_VLAN_PKT |
+ PKT_RX_VLAN_STRIPPED;
pkt_buf->vlan_tci = vlan_tci;
}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
@@ -1207,7 +1208,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- seg->ol_flags |= PKT_RX_VLAN_PKT;
+ seg->ol_flags |= PKT_RX_VLAN_PKT |
+ PKT_RX_VLAN_STRIPPED;
seg->vlan_tci = vlan_tci;
}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index ea5a2a3..5c9f350 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -1800,7 +1800,7 @@ nfp_net_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if ((rxds->rxd.flags & PCIE_DESC_RX_VLAN) &&
(hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN)) {
mb->vlan_tci = rte_cpu_to_le_32(rxds->rxd.vlan);
- mb->ol_flags |= PKT_RX_VLAN_PKT;
+ mb->ol_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
}
/* Adding the mbuff to the mbuff array passed by the app */
diff --git a/drivers/net/vmxnet3/vmxnet3_rxtx.c b/drivers/net/vmxnet3/vmxnet3_rxtx.c
index 9fe8752..ccafc0c 100644
--- a/drivers/net/vmxnet3/vmxnet3_rxtx.c
+++ b/drivers/net/vmxnet3/vmxnet3_rxtx.c
@@ -579,7 +579,7 @@ vmxnet3_rx_offload(const Vmxnet3_RxCompDesc *rcd, struct rte_mbuf *rxm)
{
/* Check for hardware stripped VLAN tag */
if (rcd->ts) {
- rxm->ol_flags |= PKT_RX_VLAN_PKT;
+ rxm->ol_flags |= (PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
rxm->vlan_tci = rte_le_to_cpu_16((uint16_t)rcd->tci);
}
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index eec1456..2ece742 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -258,8 +258,10 @@ const char *rte_get_rx_ol_flag_name(uint64_t mask)
/* case PKT_RX_HBUF_OVERFLOW: return "PKT_RX_HBUF_OVERFLOW"; */
/* case PKT_RX_RECIP_ERR: return "PKT_RX_RECIP_ERR"; */
/* case PKT_RX_MAC_ERR: return "PKT_RX_MAC_ERR"; */
+ case PKT_RX_VLAN_STRIPPED: return "PKT_RX_VLAN_STRIPPED";
case PKT_RX_IEEE1588_PTP: return "PKT_RX_IEEE1588_PTP";
case PKT_RX_IEEE1588_TMST: return "PKT_RX_IEEE1588_TMST";
+ case PKT_RX_QINQ_STRIPPED: return "PKT_RX_QINQ_STRIPPED";
default: return NULL;
}
}
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 48911a6..5b8a11a 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -79,7 +79,16 @@ extern "C" {
* Keep these flags synchronized with rte_get_rx_ol_flag_name() and
* rte_get_tx_ol_flag_name().
*/
-#define PKT_RX_VLAN_PKT (1ULL << 0) /**< RX packet is a 802.1q VLAN packet. */
+
+/**
+ * Deprecated.
+ * RX packet is a 802.1q VLAN packet. This flag was set by PMDs when
+ * the packet is recognized as a VLAN, but the behavior between PMDs
+ * was not the same. This flag is kept for some time to avoid breaking
+ * applications and should be replaced by PKT_RX_VLAN_STRIPPED.
+ */
+#define PKT_RX_VLAN_PKT (1ULL << 0)
+
#define PKT_RX_RSS_HASH (1ULL << 1) /**< RX packet with RSS hash result. */
#define PKT_RX_FDIR (1ULL << 2) /**< RX packet with FDIR match indicate. */
#define PKT_RX_L4_CKSUM_BAD (1ULL << 3) /**< L4 cksum of RX pkt. is not OK. */
@@ -89,11 +98,37 @@ extern "C" {
#define PKT_RX_HBUF_OVERFLOW (0ULL << 0) /**< Header buffer overflow. */
#define PKT_RX_RECIP_ERR (0ULL << 0) /**< Hardware processing error. */
#define PKT_RX_MAC_ERR (0ULL << 0) /**< MAC error. */
+
+/**
+ * A vlan has been stripped by the hardware and its tci is saved in
+ * mbuf->vlan_tci. This can only happen if vlan stripping is enabled
+ * in the RX configuration of the PMD.
+ */
+#define PKT_RX_VLAN_STRIPPED (1ULL << 6)
+
+/* hole, some bits can be reused here */
+
#define PKT_RX_IEEE1588_PTP (1ULL << 9) /**< RX IEEE1588 L2 Ethernet PT Packet. */
#define PKT_RX_IEEE1588_TMST (1ULL << 10) /**< RX IEEE1588 L2/L4 timestamped packet.*/
#define PKT_RX_FDIR_ID (1ULL << 13) /**< FD id reported if FDIR match. */
#define PKT_RX_FDIR_FLX (1ULL << 14) /**< Flexible bytes reported if FDIR match. */
-#define PKT_RX_QINQ_PKT (1ULL << 15) /**< RX packet with double VLAN stripped. */
+
+/**
+ * The 2 vlans have been stripped by the hardware and their tci are
+ * saved in mbuf->vlan_tci (inner) and mbuf->vlan_tci_outer (outer).
+ * This can only happen if vlan stripping is enabled in the RX
+ * configuration of the PMD. If this flag is set, PKT_RX_VLAN_STRIPPED
+ * must also be set.
+ */
+#define PKT_RX_QINQ_STRIPPED (1ULL << 15)
+
+/**
+ * Deprecated.
+ * RX packet with double VLAN stripped.
+ * This flag is replaced by PKT_RX_QINQ_STRIPPED.
+ */
+#define PKT_RX_QINQ_PKT PKT_RX_QINQ_STRIPPED
+
/* add new RX flags here */
/* add new TX flags here */
@@ -761,7 +796,10 @@ struct rte_mbuf {
/*
* The packet type, which is the combination of outer/inner L2, L3, L4
- * and tunnel types.
+ * and tunnel types. The packet_type is about data really present in the
+ * mbuf. Example: if vlan stripping is enabled, a received vlan packet
+ * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
+ * vlan is stripped from the data.
*/
union {
uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
@@ -778,7 +816,8 @@ struct rte_mbuf {
uint32_t pkt_len; /**< Total pkt len: sum of all segments. */
uint16_t data_len; /**< Amount of data in segment buffer. */
- uint16_t vlan_tci; /**< VLAN Tag Control Identifier (CPU order) */
+ /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
+ uint16_t vlan_tci;
union {
uint32_t rss; /**< RSS hash result if RSS enabled */
@@ -804,7 +843,8 @@ struct rte_mbuf {
uint32_t seqn; /**< Sequence number. See also rte_reorder_insert() */
- uint16_t vlan_tci_outer; /**< Outer VLAN Tag Control Identifier (CPU order) */
+ /** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
+ uint16_t vlan_tci_outer;
/* second cache line - fields only used in slow path or on TX */
MARKER cacheline1 __rte_cache_min_aligned;
--
2.8.0.rc3
* Re: [dpdk-dev] [PATCH] mbuf: new flag when Vlan is stripped
From: Ananyev, Konstantin @ 2016-05-23 8:59 UTC (permalink / raw)
To: Olivier Matz, dev
Cc: johndale, Zhang, Helin, adrien.mazarguil, rahul.lakkireddy,
alejandro.lucero, sony.chacko
Hi Olivier,
> -----Original Message-----
> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Monday, May 23, 2016 9:47 AM
> To: dev@dpdk.org
> Cc: johndale@cisco.com; Ananyev, Konstantin; Zhang, Helin; adrien.mazarguil@6wind.com; rahul.lakkireddy@chelsio.com;
> alejandro.lucero@netronome.com; sony.chacko@qlogic.com
> Subject: [PATCH] mbuf: new flag when Vlan is stripped
>
> The behavior of PKT_RX_VLAN_PKT was not very well defined, resulting in
> PMDs not advertising the same flags in similar conditions.
>
> Following discussion in [1], introduce 2 new flags PKT_RX_VLAN_STRIPPED
> and PKT_RX_QINQ_STRIPPED that are better defined:
>
> PKT_RX_VLAN_STRIPPED: a vlan has been stripped by the hardware and its
> tci is saved in mbuf->vlan_tci. This can only happen if vlan stripping
> is enabled in the RX configuration of the PMD.
>
> For now, the old flag PKT_RX_VLAN_PKT is kept but marked as deprecated.
> It should be removed from applications and PMDs in a future revision.
>
> This patch also updates the drivers. For PKT_RX_VLAN_PKT:
>
> - e1000, enic, i40e, mlx5, nfp, vmxnet3: done, PKT_RX_VLAN_PKT already
> had the same meaning as PKT_RX_VLAN_STRIPPED; only a minor update is
> required.
> - fm10k: done, PKT_RX_VLAN_PKT already had the same meaning as
> PKT_RX_VLAN_STRIPPED, and vlan stripping is always enabled on fm10k.
> - ixgbe: modification done for standard mode (vector does not support
> vlan stripping)
> - the other drivers do not support vlan stripping.
>
> For PKT_RX_QINQ_PKT, it was only supported on i40e, and the meaning was
> already correct, so we can reuse the same value for PKT_RX_QINQ_STRIPPED.
>
> [1] http://dpdk.org/ml/archives/dev/2016-April/037837.html,
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>
> RFC -> v1:
> - fix checkpatch and check-git-log.sh issues
> - add a deprecation notice for the old vlan flags
> - rebase on head
>
>
> app/test-pmd/rxonly.c | 4 +--
> doc/guides/rel_notes/deprecation.rst | 5 ++++
> drivers/net/e1000/em_rxtx.c | 3 ++-
> drivers/net/e1000/igb_rxtx.c | 3 ++-
> drivers/net/enic/enic_rx.c | 2 +-
> drivers/net/i40e/i40e_rxtx.c | 2 +-
> drivers/net/ixgbe/ixgbe_ethdev.c | 7 +++++
> drivers/net/ixgbe/ixgbe_rxtx.c | 21 +++++++++++----
> drivers/net/ixgbe/ixgbe_rxtx.h | 1 +
> drivers/net/mlx5/mlx5_rxtx.c | 6 +++--
> drivers/net/nfp/nfp_net.c | 2 +-
> drivers/net/vmxnet3/vmxnet3_rxtx.c | 2 +-
> lib/librte_mbuf/rte_mbuf.c | 2 ++
> lib/librte_mbuf/rte_mbuf.h | 50 ++++++++++++++++++++++++++++++++----
> 14 files changed, 90 insertions(+), 20 deletions(-)
I don't see ixgbe_rxtx_vec.c / i40e_rxtx_vec.c updated.
Would it be another patch for them?
Thanks
Konstantin
>
> diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
> index 14555ab..c69b344 100644
> --- a/app/test-pmd/rxonly.c
> +++ b/app/test-pmd/rxonly.c
> @@ -156,9 +156,9 @@ pkt_burst_receive(struct fwd_stream *fs)
> printf("hash=0x%x ID=0x%x ",
> mb->hash.fdir.hash, mb->hash.fdir.id);
> }
> - if (ol_flags & PKT_RX_VLAN_PKT)
> + if (ol_flags & PKT_RX_VLAN_STRIPPED)
> printf(" - VLAN tci=0x%x", mb->vlan_tci);
> - if (ol_flags & PKT_RX_QINQ_PKT)
> + if (ol_flags & PKT_RX_QINQ_STRIPPED)
> printf(" - QinQ VLAN tci=0x%x, VLAN tci outer=0x%x",
> mb->vlan_tci, mb->vlan_tci_outer);
> if (mb->packet_type) {
> diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
> index ad05eba..2233a90 100644
> --- a/doc/guides/rel_notes/deprecation.rst
> +++ b/doc/guides/rel_notes/deprecation.rst
> @@ -57,3 +57,8 @@ Deprecation Notices
> a handle, like the way kernel exposes an fd to user for locating a
> specific file, and to keep all major structures internally, so that
> we are likely to be free from ABI violations in future.
> +
> +* The mbuf flags PKT_RX_VLAN_PKT and PKT_RX_QINQ_PKT are deprecated and
> + are respectively replaced by PKT_RX_VLAN_STRIPPED and
> + PKT_RX_QINQ_STRIPPED, that are better described. The old flags and
> + their behavior will be kept in 16.07 and will be removed in 16.11.
> diff --git a/drivers/net/e1000/em_rxtx.c b/drivers/net/e1000/em_rxtx.c
> index 3d36f21..6d8750a 100644
> --- a/drivers/net/e1000/em_rxtx.c
> +++ b/drivers/net/e1000/em_rxtx.c
> @@ -629,7 +629,8 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
> uint64_t pkt_flags;
>
> /* Check if VLAN present */
> - pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0);
> + pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
> + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
>
> return pkt_flags;
> }
> diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
> index 18aeead..9d80a0b 100644
> --- a/drivers/net/e1000/igb_rxtx.c
> +++ b/drivers/net/e1000/igb_rxtx.c
> @@ -729,7 +729,8 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
> uint64_t pkt_flags;
>
> /* Check if VLAN present */
> - pkt_flags = (rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
> + pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
> + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
>
> #if defined(RTE_LIBRTE_IEEE1588)
> if (rx_status & E1000_RXD_STAT_TMST)
> diff --git a/drivers/net/enic/enic_rx.c b/drivers/net/enic/enic_rx.c
> index f92f6bc..6459e97 100644
> --- a/drivers/net/enic/enic_rx.c
> +++ b/drivers/net/enic/enic_rx.c
> @@ -197,7 +197,7 @@ enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
>
> /* VLAN stripping */
> if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
> - pkt_flags |= PKT_RX_VLAN_PKT;
> + pkt_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
> mbuf->vlan_tci = enic_cq_rx_desc_vlan(cqrd);
> } else {
> mbuf->vlan_tci = 0;
> diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
> index c833aa3..aa161a9 100644
> --- a/drivers/net/i40e/i40e_rxtx.c
> +++ b/drivers/net/i40e/i40e_rxtx.c
> @@ -99,7 +99,7 @@ i40e_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union i40e_rx_desc *rxdp)
> #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> if (rte_le_to_cpu_16(rxdp->wb.qword2.ext_status) &
> (1 << I40E_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) {
> - mb->ol_flags |= PKT_RX_QINQ_PKT;
> + mb->ol_flags |= PKT_RX_QINQ_STRIPPED;
> mb->vlan_tci_outer = mb->vlan_tci;
> mb->vlan_tci = rte_le_to_cpu_16(rxdp->wb.qword2.l2tag2_2);
> PMD_RX_LOG(DEBUG, "Descriptor l2tag2_1: %u, l2tag2_2: %u",
> diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
> index a2b170b..e7717e3 100644
> --- a/drivers/net/ixgbe/ixgbe_ethdev.c
> +++ b/drivers/net/ixgbe/ixgbe_ethdev.c
> @@ -1636,6 +1636,7 @@ ixgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev, uint16_t queue, bool on)
> {
> struct ixgbe_hwstrip *hwstrip =
> IXGBE_DEV_PRIVATE_TO_HWSTRIP_BITMAP(dev->data->dev_private);
> + struct ixgbe_rx_queue *rxq;
>
> if (queue >= IXGBE_MAX_RX_QUEUE_NUM)
> return;
> @@ -1644,6 +1645,12 @@ ixgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev, uint16_t queue, bool on)
> IXGBE_SET_HWSTRIP(hwstrip, queue);
> else
> IXGBE_CLEAR_HWSTRIP(hwstrip, queue);
> +
> + if (queue >= dev->data->nb_rx_queues)
> + return;
> +
> + rxq = dev->data->rx_queues[queue];
> + rxq->vlan_strip = on;
> }
>
> static void
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index 9c6eaf2..3d740df 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
> @@ -1221,16 +1221,23 @@ ixgbe_rxd_pkt_info_to_pkt_flags(uint16_t pkt_info)
> }
>
> static inline uint64_t
> -rx_desc_status_to_pkt_flags(uint32_t rx_status)
> +rx_desc_status_to_pkt_flags(uint32_t rx_status, uint8_t vlan_strip)
> {
> uint64_t pkt_flags;
> + uint64_t vlan_flags;
> +
> + /* if vlan is stripped, set the proper flag */
> + if (vlan_strip)
> + vlan_flags = PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
> + else
> + vlan_flags = PKT_RX_VLAN_PKT;
>
> /*
> * Check if VLAN present only.
> * Do not check whether L3/L4 rx checksum done by NIC or not,
> * That can be found from rte_eth_rxmode.hw_ip_checksum flag
> */
> - pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
> + pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? vlan_flags : 0;
>
> #ifdef RTE_LIBRTE_IEEE1588
> if (rx_status & IXGBE_RXD_STAT_TMST)
> @@ -1287,6 +1294,7 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
> uint32_t pkt_info[LOOK_AHEAD];
> int i, j, nb_rx = 0;
> uint32_t status;
> + uint8_t vlan_strip = rxq->vlan_strip;
>
> /* get references to current descriptor and S/W ring entry */
> rxdp = &rxq->rx_ring[rxq->rx_tail];
> @@ -1328,7 +1336,8 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
> mb->vlan_tci = rte_le_to_cpu_16(rxdp[j].wb.upper.vlan);
>
> /* convert descriptor fields to rte mbuf flags */
> - pkt_flags = rx_desc_status_to_pkt_flags(s[j]);
> + pkt_flags = rx_desc_status_to_pkt_flags(s[j],
> + vlan_strip);
> pkt_flags |= rx_desc_error_to_pkt_flags(s[j]);
> pkt_flags |= ixgbe_rxd_pkt_info_to_pkt_flags
> ((uint16_t)pkt_info[j]);
> @@ -1544,6 +1553,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> uint16_t nb_rx;
> uint16_t nb_hold;
> uint64_t pkt_flags;
> + uint8_t vlan_strip;
>
> nb_rx = 0;
> nb_hold = 0;
> @@ -1551,6 +1561,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> rx_id = rxq->rx_tail;
> rx_ring = rxq->rx_ring;
> sw_ring = rxq->sw_ring;
> + vlan_strip = rxq->vlan_strip;
> while (nb_rx < nb_pkts) {
> /*
> * The order of operations here is important as the DD status
> @@ -1660,7 +1671,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
> rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
>
> - pkt_flags = rx_desc_status_to_pkt_flags(staterr);
> + pkt_flags = rx_desc_status_to_pkt_flags(staterr, vlan_strip);
> pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
> pkt_flags = pkt_flags |
> ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
> @@ -1753,7 +1764,7 @@ ixgbe_fill_cluster_head_buf(
> */
> head->vlan_tci = rte_le_to_cpu_16(desc->wb.upper.vlan);
> pkt_info = rte_le_to_cpu_32(desc->wb.lower.lo_dword.data);
> - pkt_flags = rx_desc_status_to_pkt_flags(staterr);
> + pkt_flags = rx_desc_status_to_pkt_flags(staterr, rxq->vlan_strip);
> pkt_flags |= rx_desc_error_to_pkt_flags(staterr);
> pkt_flags |= ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
> head->ol_flags = pkt_flags;
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
> index 3691a19..9ca0e8b 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.h
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.h
> @@ -146,6 +146,7 @@ struct ixgbe_rx_queue {
> uint8_t crc_len; /**< 0 if CRC stripped, 4 otherwise. */
> uint8_t drop_en; /**< If not 0, set SRRCTL.Drop_En. */
> uint8_t rx_deferred_start; /**< not in global dev start. */
> + uint8_t vlan_strip; /**< 1 if vlan stripping enabled. */
> /** need to alloc dummy mbuf, for wraparound when scanning hw ring */
> struct rte_mbuf fake_mbuf;
> /** hold packets to return to application */
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 13c8d71..ac96fc9 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -1051,7 +1051,8 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
> pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
> #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
> if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
> - pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
> + pkt_buf->ol_flags |= PKT_RX_VLAN_PKT |
> + PKT_RX_VLAN_STRIPPED;
> pkt_buf->vlan_tci = vlan_tci;
> }
> #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
> @@ -1207,7 +1208,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
> seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
> #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
> if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
> - seg->ol_flags |= PKT_RX_VLAN_PKT;
> + seg->ol_flags |= PKT_RX_VLAN_PKT |
> + PKT_RX_VLAN_STRIPPED;
> seg->vlan_tci = vlan_tci;
> }
> #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
> diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
> index ea5a2a3..5c9f350 100644
> --- a/drivers/net/nfp/nfp_net.c
> +++ b/drivers/net/nfp/nfp_net.c
> @@ -1800,7 +1800,7 @@ nfp_net_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
> if ((rxds->rxd.flags & PCIE_DESC_RX_VLAN) &&
> (hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN)) {
> mb->vlan_tci = rte_cpu_to_le_32(rxds->rxd.vlan);
> - mb->ol_flags |= PKT_RX_VLAN_PKT;
> + mb->ol_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
> }
>
> /* Adding the mbuff to the mbuff array passed by the app */
> diff --git a/drivers/net/vmxnet3/vmxnet3_rxtx.c b/drivers/net/vmxnet3/vmxnet3_rxtx.c
> index 9fe8752..ccafc0c 100644
> --- a/drivers/net/vmxnet3/vmxnet3_rxtx.c
> +++ b/drivers/net/vmxnet3/vmxnet3_rxtx.c
> @@ -579,7 +579,7 @@ vmxnet3_rx_offload(const Vmxnet3_RxCompDesc *rcd, struct rte_mbuf *rxm)
> {
> /* Check for hardware stripped VLAN tag */
> if (rcd->ts) {
> - rxm->ol_flags |= PKT_RX_VLAN_PKT;
> + rxm->ol_flags |= (PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
> rxm->vlan_tci = rte_le_to_cpu_16((uint16_t)rcd->tci);
> }
>
> diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> index eec1456..2ece742 100644
> --- a/lib/librte_mbuf/rte_mbuf.c
> +++ b/lib/librte_mbuf/rte_mbuf.c
> @@ -258,8 +258,10 @@ const char *rte_get_rx_ol_flag_name(uint64_t mask)
> /* case PKT_RX_HBUF_OVERFLOW: return "PKT_RX_HBUF_OVERFLOW"; */
> /* case PKT_RX_RECIP_ERR: return "PKT_RX_RECIP_ERR"; */
> /* case PKT_RX_MAC_ERR: return "PKT_RX_MAC_ERR"; */
> + case PKT_RX_VLAN_STRIPPED: return "PKT_RX_VLAN_STRIPPED";
> case PKT_RX_IEEE1588_PTP: return "PKT_RX_IEEE1588_PTP";
> case PKT_RX_IEEE1588_TMST: return "PKT_RX_IEEE1588_TMST";
> + case PKT_RX_QINQ_STRIPPED: return "PKT_RX_QINQ_STRIPPED";
> default: return NULL;
> }
> }
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 48911a6..5b8a11a 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -79,7 +79,16 @@ extern "C" {
> * Keep these flags synchronized with rte_get_rx_ol_flag_name() and
> * rte_get_tx_ol_flag_name().
> */
> -#define PKT_RX_VLAN_PKT (1ULL << 0) /**< RX packet is a 802.1q VLAN packet. */
> +
> +/**
> + * Deprecated.
> + * RX packet is a 802.1q VLAN packet. This flag was set by PMDs when
> + * the packet is recognized as a VLAN, but the behavior between PMDs
> + * was not the same. This flag is kept for some time to avoid breaking
> + * applications and should be replaced by PKT_RX_VLAN_STRIPPED.
> + */
> +#define PKT_RX_VLAN_PKT (1ULL << 0)
> +
> #define PKT_RX_RSS_HASH (1ULL << 1) /**< RX packet with RSS hash result. */
> #define PKT_RX_FDIR (1ULL << 2) /**< RX packet with FDIR match indicate. */
> #define PKT_RX_L4_CKSUM_BAD (1ULL << 3) /**< L4 cksum of RX pkt. is not OK. */
> @@ -89,11 +98,37 @@ extern "C" {
> #define PKT_RX_HBUF_OVERFLOW (0ULL << 0) /**< Header buffer overflow. */
> #define PKT_RX_RECIP_ERR (0ULL << 0) /**< Hardware processing error. */
> #define PKT_RX_MAC_ERR (0ULL << 0) /**< MAC error. */
> +
> +/**
> + * A vlan has been stripped by the hardware and its tci is saved in
> + * mbuf->vlan_tci. This can only happen if vlan stripping is enabled
> + * in the RX configuration of the PMD.
> + */
> +#define PKT_RX_VLAN_STRIPPED (1ULL << 6)
> +
> +/* hole, some bits can be reused here */
> +
> #define PKT_RX_IEEE1588_PTP (1ULL << 9) /**< RX IEEE1588 L2 Ethernet PT Packet. */
> #define PKT_RX_IEEE1588_TMST (1ULL << 10) /**< RX IEEE1588 L2/L4 timestamped packet.*/
> #define PKT_RX_FDIR_ID (1ULL << 13) /**< FD id reported if FDIR match. */
> #define PKT_RX_FDIR_FLX (1ULL << 14) /**< Flexible bytes reported if FDIR match. */
> -#define PKT_RX_QINQ_PKT (1ULL << 15) /**< RX packet with double VLAN stripped. */
> +
> +/**
> + * The 2 vlans have been stripped by the hardware and their tci are
> + * saved in mbuf->vlan_tci (inner) and mbuf->vlan_tci_outer (outer).
> + * This can only happen if vlan stripping is enabled in the RX
> + * configuration of the PMD. If this flag is set, PKT_RX_VLAN_STRIPPED
> + * must also be set.
> + */
> +#define PKT_RX_QINQ_STRIPPED (1ULL << 15)
> +
> +/**
> + * Deprecated.
> + * RX packet with double VLAN stripped.
> + * This flag is replaced by PKT_RX_QINQ_STRIPPED.
> + */
> +#define PKT_RX_QINQ_PKT PKT_RX_QINQ_STRIPPED
> +
> /* add new RX flags here */
>
> /* add new TX flags here */
> @@ -761,7 +796,10 @@ struct rte_mbuf {
>
> /*
> * The packet type, which is the combination of outer/inner L2, L3, L4
> - * and tunnel types.
> + * and tunnel types. The packet_type is about data really present in the
> + * mbuf. Example: if vlan stripping is enabled, a received vlan packet
> + * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
> + * vlan is stripped from the data.
> */
> union {
> uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
> @@ -778,7 +816,8 @@ struct rte_mbuf {
>
> uint32_t pkt_len; /**< Total pkt len: sum of all segments. */
> uint16_t data_len; /**< Amount of data in segment buffer. */
> - uint16_t vlan_tci; /**< VLAN Tag Control Identifier (CPU order) */
> + /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
> + uint16_t vlan_tci;
>
> union {
> uint32_t rss; /**< RSS hash result if RSS enabled */
> @@ -804,7 +843,8 @@ struct rte_mbuf {
>
> uint32_t seqn; /**< Sequence number. See also rte_reorder_insert() */
>
> - uint16_t vlan_tci_outer; /**< Outer VLAN Tag Control Identifier (CPU order) */
> + /** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
> + uint16_t vlan_tci_outer;
>
> /* second cache line - fields only used in slow path or on TX */
> MARKER cacheline1 __rte_cache_min_aligned;
> --
> 2.8.0.rc3
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH] mbuf: new flag when Vlan is stripped
2016-05-23 8:46 2% ` [dpdk-dev] [PATCH] mbuf: new flag when Vlan " Olivier Matz
2016-05-23 8:59 0% ` Ananyev, Konstantin
@ 2016-05-23 9:20 0% ` Ananyev, Konstantin
2016-05-27 14:33 2% ` [dpdk-dev] [PATCH v2] " Olivier Matz
2 siblings, 0 replies; 200+ results
From: Ananyev, Konstantin @ 2016-05-23 9:20 UTC (permalink / raw)
To: Olivier Matz, dev
Cc: johndale, Zhang, Helin, adrien.mazarguil, rahul.lakkireddy,
alejandro.lucero, sony.chacko
> -----Original Message-----
> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Monday, May 23, 2016 9:47 AM
> To: dev@dpdk.org
> Cc: johndale@cisco.com; Ananyev, Konstantin; Zhang, Helin; adrien.mazarguil@6wind.com; rahul.lakkireddy@chelsio.com;
> alejandro.lucero@netronome.com; sony.chacko@qlogic.com
> Subject: [PATCH] mbuf: new flag when Vlan is stripped
>
> The behavior of PKT_RX_VLAN_PKT was not very well defined, resulting in
> PMDs not advertising the same flags in similar conditions.
>
> Following discussion in [1], introduce 2 new flags PKT_RX_VLAN_STRIPPED
> and PKT_RX_QINQ_STRIPPED that are better defined:
>
> PKT_RX_VLAN_STRIPPED: a vlan has been stripped by the hardware and its
> tci is saved in mbuf->vlan_tci. This can only happen if vlan stripping
> is enabled in the RX configuration of the PMD.
>
> For now, the old flag PKT_RX_VLAN_PKT is kept but marked as deprecated.
> It should be removed from applications and PMDs in a future revision.
>
> This patch also updates the drivers. For PKT_RX_VLAN_PKT:
>
> - e1000, enic, i40e, mlx5, nfp, vmxnet3: done, PKT_RX_VLAN_PKT already
> had the same meaning as PKT_RX_VLAN_STRIPPED, a minor update is
> required.
> - fm10k: done, PKT_RX_VLAN_PKT already had the same meaning as
> PKT_RX_VLAN_STRIPPED, and vlan stripping is always enabled on fm10k.
> - ixgbe: modification done for standard mode (vector does not support
> vlan stripping)
> - the other drivers do not support vlan stripping.
>
> For PKT_RX_QINQ_PKT, it was only supported on i40e, and the meaning was
> already correct, so we can reuse the same value for PKT_RX_QINQ_STRIPPED.
>
> [1] http://dpdk.org/ml/archives/dev/2016-April/037837.html,
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>
> RFC -> v1:
> - fix checkpatch and check-git-log.sh issues
> - add a deprecation notice for the old vlan flags
> - rebase on head
>
>
> app/test-pmd/rxonly.c | 4 +--
> doc/guides/rel_notes/deprecation.rst | 5 ++++
> drivers/net/e1000/em_rxtx.c | 3 ++-
> drivers/net/e1000/igb_rxtx.c | 3 ++-
> drivers/net/enic/enic_rx.c | 2 +-
> drivers/net/i40e/i40e_rxtx.c | 2 +-
> drivers/net/ixgbe/ixgbe_ethdev.c | 7 +++++
> drivers/net/ixgbe/ixgbe_rxtx.c | 21 +++++++++++----
> drivers/net/ixgbe/ixgbe_rxtx.h | 1 +
> drivers/net/mlx5/mlx5_rxtx.c | 6 +++--
> drivers/net/nfp/nfp_net.c | 2 +-
> drivers/net/vmxnet3/vmxnet3_rxtx.c | 2 +-
> lib/librte_mbuf/rte_mbuf.c | 2 ++
> lib/librte_mbuf/rte_mbuf.h | 50 ++++++++++++++++++++++++++++++++----
> 14 files changed, 90 insertions(+), 20 deletions(-)
>
> diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
> index 14555ab..c69b344 100644
> --- a/app/test-pmd/rxonly.c
> +++ b/app/test-pmd/rxonly.c
> @@ -156,9 +156,9 @@ pkt_burst_receive(struct fwd_stream *fs)
> printf("hash=0x%x ID=0x%x ",
> mb->hash.fdir.hash, mb->hash.fdir.id);
> }
> - if (ol_flags & PKT_RX_VLAN_PKT)
> + if (ol_flags & PKT_RX_VLAN_STRIPPED)
> printf(" - VLAN tci=0x%x", mb->vlan_tci);
> - if (ol_flags & PKT_RX_QINQ_PKT)
> + if (ol_flags & PKT_RX_QINQ_STRIPPED)
> printf(" - QinQ VLAN tci=0x%x, VLAN tci outer=0x%x",
> mb->vlan_tci, mb->vlan_tci_outer);
> if (mb->packet_type) {
> diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
> index ad05eba..2233a90 100644
> --- a/doc/guides/rel_notes/deprecation.rst
> +++ b/doc/guides/rel_notes/deprecation.rst
> @@ -57,3 +57,8 @@ Deprecation Notices
> a handle, like the way kernel exposes an fd to user for locating a
> specific file, and to keep all major structures internally, so that
> we are likely to be free from ABI violations in future.
> +
> +* The mbuf flags PKT_RX_VLAN_PKT and PKT_RX_QINQ_PKT are deprecated and
> + are respectively replaced by PKT_RX_VLAN_STRIPPED and
> + PKT_RX_QINQ_STRIPPED, that are better described. The old flags and
> + their behavior will be kept in 16.07 and will be removed in 16.11.
> diff --git a/drivers/net/e1000/em_rxtx.c b/drivers/net/e1000/em_rxtx.c
> index 3d36f21..6d8750a 100644
> --- a/drivers/net/e1000/em_rxtx.c
> +++ b/drivers/net/e1000/em_rxtx.c
> @@ -629,7 +629,8 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
> uint64_t pkt_flags;
>
> /* Check if VLAN present */
> - pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0);
> + pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
> + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
>
> return pkt_flags;
> }
> diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
> index 18aeead..9d80a0b 100644
> --- a/drivers/net/e1000/igb_rxtx.c
> +++ b/drivers/net/e1000/igb_rxtx.c
> @@ -729,7 +729,8 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
> uint64_t pkt_flags;
>
> /* Check if VLAN present */
> - pkt_flags = (rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
> + pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
> + PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
>
> #if defined(RTE_LIBRTE_IEEE1588)
> if (rx_status & E1000_RXD_STAT_TMST)
> diff --git a/drivers/net/enic/enic_rx.c b/drivers/net/enic/enic_rx.c
> index f92f6bc..6459e97 100644
> --- a/drivers/net/enic/enic_rx.c
> +++ b/drivers/net/enic/enic_rx.c
> @@ -197,7 +197,7 @@ enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
>
> /* VLAN stripping */
> if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
> - pkt_flags |= PKT_RX_VLAN_PKT;
> + pkt_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
> mbuf->vlan_tci = enic_cq_rx_desc_vlan(cqrd);
> } else {
> mbuf->vlan_tci = 0;
> diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
> index c833aa3..aa161a9 100644
> --- a/drivers/net/i40e/i40e_rxtx.c
> +++ b/drivers/net/i40e/i40e_rxtx.c
> @@ -99,7 +99,7 @@ i40e_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union i40e_rx_desc *rxdp)
> #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> if (rte_le_to_cpu_16(rxdp->wb.qword2.ext_status) &
> (1 << I40E_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) {
> - mb->ol_flags |= PKT_RX_QINQ_PKT;
> + mb->ol_flags |= PKT_RX_QINQ_STRIPPED;
> mb->vlan_tci_outer = mb->vlan_tci;
> mb->vlan_tci = rte_le_to_cpu_16(rxdp->wb.qword2.l2tag2_2);
> PMD_RX_LOG(DEBUG, "Descriptor l2tag2_1: %u, l2tag2_2: %u",
> diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
> index a2b170b..e7717e3 100644
> --- a/drivers/net/ixgbe/ixgbe_ethdev.c
> +++ b/drivers/net/ixgbe/ixgbe_ethdev.c
> @@ -1636,6 +1636,7 @@ ixgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev, uint16_t queue, bool on)
> {
> struct ixgbe_hwstrip *hwstrip =
> IXGBE_DEV_PRIVATE_TO_HWSTRIP_BITMAP(dev->data->dev_private);
> + struct ixgbe_rx_queue *rxq;
>
> if (queue >= IXGBE_MAX_RX_QUEUE_NUM)
> return;
> @@ -1644,6 +1645,12 @@ ixgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev, uint16_t queue, bool on)
> IXGBE_SET_HWSTRIP(hwstrip, queue);
> else
> IXGBE_CLEAR_HWSTRIP(hwstrip, queue);
> +
> + if (queue >= dev->data->nb_rx_queues)
> + return;
> +
> + rxq = dev->data->rx_queues[queue];
> + rxq->vlan_strip = on;
> }
>
> static void
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index 9c6eaf2..3d740df 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
> @@ -1221,16 +1221,23 @@ ixgbe_rxd_pkt_info_to_pkt_flags(uint16_t pkt_info)
> }
>
> static inline uint64_t
> -rx_desc_status_to_pkt_flags(uint32_t rx_status)
> +rx_desc_status_to_pkt_flags(uint32_t rx_status, uint8_t vlan_strip)
> {
> uint64_t pkt_flags;
> + uint64_t vlan_flags;
> +
> + /* if vlan is stripped, set the proper flag */
> + if (vlan_strip)
> + vlan_flags = PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
> + else
> + vlan_flags = PKT_RX_VLAN_PKT;
>
> /*
> * Check if VLAN present only.
> * Do not check whether L3/L4 rx checksum done by NIC or not,
> * That can be found from rte_eth_rxmode.hw_ip_checksum flag
> */
> - pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
> + pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? vlan_flags : 0;
Instead of storing in rxq (and passing as a parameter) a bool value for vlan_strip (=on/off),
you probably can store in rxq and pass as a parameter here uint64_t vlan_flags;
Then it will be:
rx_desc_status_to_pkt_flags(uint32_t rx_status, uint64_t vlan_flags)
{
...
pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? vlan_flags : 0;
...
}
...
pkt_flags = rx_desc_status_to_pkt_flags(s[j], rxq->vlan_flags);
Might help to save a few cycles here.
Konstantin
>
> #ifdef RTE_LIBRTE_IEEE1588
> if (rx_status & IXGBE_RXD_STAT_TMST)
> @@ -1287,6 +1294,7 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
> uint32_t pkt_info[LOOK_AHEAD];
> int i, j, nb_rx = 0;
> uint32_t status;
> + uint8_t vlan_strip = rxq->vlan_strip;
>
> /* get references to current descriptor and S/W ring entry */
> rxdp = &rxq->rx_ring[rxq->rx_tail];
> @@ -1328,7 +1336,8 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
> mb->vlan_tci = rte_le_to_cpu_16(rxdp[j].wb.upper.vlan);
>
> /* convert descriptor fields to rte mbuf flags */
> - pkt_flags = rx_desc_status_to_pkt_flags(s[j]);
> + pkt_flags = rx_desc_status_to_pkt_flags(s[j],
> + vlan_strip);
> pkt_flags |= rx_desc_error_to_pkt_flags(s[j]);
> pkt_flags |= ixgbe_rxd_pkt_info_to_pkt_flags
> ((uint16_t)pkt_info[j]);
> @@ -1544,6 +1553,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> uint16_t nb_rx;
> uint16_t nb_hold;
> uint64_t pkt_flags;
> + uint8_t vlan_strip;
>
> nb_rx = 0;
> nb_hold = 0;
> @@ -1551,6 +1561,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> rx_id = rxq->rx_tail;
> rx_ring = rxq->rx_ring;
> sw_ring = rxq->sw_ring;
> + vlan_strip = rxq->vlan_strip;
> while (nb_rx < nb_pkts) {
> /*
> * The order of operations here is important as the DD status
> @@ -1660,7 +1671,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
> rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
>
> - pkt_flags = rx_desc_status_to_pkt_flags(staterr);
> + pkt_flags = rx_desc_status_to_pkt_flags(staterr, vlan_strip);
> pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
> pkt_flags = pkt_flags |
> ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
> @@ -1753,7 +1764,7 @@ ixgbe_fill_cluster_head_buf(
> */
> head->vlan_tci = rte_le_to_cpu_16(desc->wb.upper.vlan);
> pkt_info = rte_le_to_cpu_32(desc->wb.lower.lo_dword.data);
> - pkt_flags = rx_desc_status_to_pkt_flags(staterr);
> + pkt_flags = rx_desc_status_to_pkt_flags(staterr, rxq->vlan_strip);
> pkt_flags |= rx_desc_error_to_pkt_flags(staterr);
> pkt_flags |= ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
> head->ol_flags = pkt_flags;
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
> index 3691a19..9ca0e8b 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.h
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.h
> @@ -146,6 +146,7 @@ struct ixgbe_rx_queue {
> uint8_t crc_len; /**< 0 if CRC stripped, 4 otherwise. */
> uint8_t drop_en; /**< If not 0, set SRRCTL.Drop_En. */
> uint8_t rx_deferred_start; /**< not in global dev start. */
> + uint8_t vlan_strip; /**< 1 if vlan stripping enabled. */
> /** need to alloc dummy mbuf, for wraparound when scanning hw ring */
> struct rte_mbuf fake_mbuf;
> /** hold packets to return to application */
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v2 5/7] eal/linux: mmap ioports on ppc64
@ 2016-05-23 13:07 3% ` Yuanhan Liu
2016-05-23 13:40 3% ` Olivier Matz
0 siblings, 1 reply; 200+ results
From: Yuanhan Liu @ 2016-05-23 13:07 UTC (permalink / raw)
To: David Marchand; +Cc: Olivier Matz, dev, Chao Zhu, Xie, Huawei
On Tue, May 17, 2016 at 05:54:01PM +0200, David Marchand wrote:
> > +pci_uio_ioport_map(struct rte_pci_device *dev, int bar,
> > + struct rte_pci_ioport *p)
> > +{
> > + FILE *f;
> > + char buf[BUFSIZ];
> > + char filename[PATH_MAX];
> > + uint64_t phys_addr, end_addr, flags;
> > + int fd, i;
> > + void *addr;
> > +
> > + /* open and read addresses of the corresponding resource in sysfs */
> > + snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource",
> > + SYSFS_PCI_DEVICES, dev->addr.domain, dev->addr.bus,
> > + dev->addr.devid, dev->addr.function);
> > + f = fopen(filename, "r");
> > + if (f == NULL) {
> > + RTE_LOG(ERR, EAL, "Cannot open sysfs resource: %s\n",
> > + strerror(errno));
> > + return -1;
> > + }
> > + for (i = 0; i < bar + 1; i++) {
> > + if (fgets(buf, sizeof(buf), f) == NULL) {
> > + RTE_LOG(ERR, EAL, "Cannot read sysfs resource\n");
> > + goto error;
> > + }
> > + }
> > + if (pci_parse_one_sysfs_resource(buf, sizeof(buf), &phys_addr,
> > + &end_addr, &flags) < 0)
> > + goto error;
> > + if ((flags & IORESOURCE_IO) == 0) {
> > + RTE_LOG(ERR, EAL, "BAR %d is not an IO resource\n", bar);
> > + goto error;
> > + }
> > + snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource%d",
> > + SYSFS_PCI_DEVICES, dev->addr.domain, dev->addr.bus,
> > + dev->addr.devid, dev->addr.function, bar);
> > +
> > + /* mmap the pci resource */
> > + fd = open(filename, O_RDWR);
> > + if (fd < 0) {
> > + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
> > + strerror(errno));
> > + goto error;
> > + }
> > + addr = mmap(NULL, end_addr + 1, PROT_READ | PROT_WRITE,
> > + MAP_SHARED, fd, 0);
>
> Sorry, did not catch it in v1, but a close(fd) is missing here.
> With this, I think the patchset looks good.
>
> Just missing some opinion from the virtio maintainers ?
Apologies for being late with the review. Assuming you have done proper
testing, this patch set looks good to me. (Well, I don't quite like
the tons of "#ifdef ... #else ... #endif" blocks though.)
A side note is that I noticed an ABI breakage introduced by this
patch; so, perhaps this release is not a good fit for it?
--yliu
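For reference, the fix David asks for above is the usual mmap-then-close
idiom: once mmap() has succeeded, the mapping stays valid after the file
descriptor is closed, so the fd can be released immediately. A minimal
sketch of the corrected tail of pci_uio_ioport_map(), reusing the
variables and the error label from the quoted code:

	addr = mmap(NULL, end_addr + 1, PROT_READ | PROT_WRITE,
		MAP_SHARED, fd, 0);
	close(fd);	/* the mapping survives closing the descriptor */
	if (addr == MAP_FAILED)
		goto error;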
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v2 5/7] eal/linux: mmap ioports on ppc64
2016-05-23 13:07 3% ` Yuanhan Liu
@ 2016-05-23 13:40 3% ` Olivier Matz
2016-05-24 5:15 3% ` Yuanhan Liu
0 siblings, 1 reply; 200+ results
From: Olivier Matz @ 2016-05-23 13:40 UTC (permalink / raw)
To: Yuanhan Liu, David Marchand; +Cc: dev, Chao Zhu, Xie, Huawei
Hi Yuanhan,
On 05/23/2016 03:07 PM, Yuanhan Liu wrote:
> On Tue, May 17, 2016 at 05:54:01PM +0200, David Marchand wrote:
>>> +pci_uio_ioport_map(struct rte_pci_device *dev, int bar,
>>> + struct rte_pci_ioport *p)
>>> +{
>>> + FILE *f;
>>> + char buf[BUFSIZ];
>>> + char filename[PATH_MAX];
>>> + uint64_t phys_addr, end_addr, flags;
>>> + int fd, i;
>>> + void *addr;
>>> +
>>> + /* open and read addresses of the corresponding resource in sysfs */
>>> + snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource",
>>> + SYSFS_PCI_DEVICES, dev->addr.domain, dev->addr.bus,
>>> + dev->addr.devid, dev->addr.function);
>>> + f = fopen(filename, "r");
>>> + if (f == NULL) {
>>> + RTE_LOG(ERR, EAL, "Cannot open sysfs resource: %s\n",
>>> + strerror(errno));
>>> + return -1;
>>> + }
>>> + for (i = 0; i < bar + 1; i++) {
>>> + if (fgets(buf, sizeof(buf), f) == NULL) {
>>> + RTE_LOG(ERR, EAL, "Cannot read sysfs resource\n");
>>> + goto error;
>>> + }
>>> + }
>>> + if (pci_parse_one_sysfs_resource(buf, sizeof(buf), &phys_addr,
>>> + &end_addr, &flags) < 0)
>>> + goto error;
>>> + if ((flags & IORESOURCE_IO) == 0) {
>>> + RTE_LOG(ERR, EAL, "BAR %d is not an IO resource\n", bar);
>>> + goto error;
>>> + }
>>> + snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource%d",
>>> + SYSFS_PCI_DEVICES, dev->addr.domain, dev->addr.bus,
>>> + dev->addr.devid, dev->addr.function, bar);
>>> +
>>> + /* mmap the pci resource */
>>> + fd = open(filename, O_RDWR);
>>> + if (fd < 0) {
>>> + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
>>> + strerror(errno));
>>> + goto error;
>>> + }
>>> + addr = mmap(NULL, end_addr + 1, PROT_READ | PROT_WRITE,
>>> + MAP_SHARED, fd, 0);
>>
>> Sorry, did not catch it in v1, but a close(fd) is missing here.
>> With this, I think the patchset looks good.
>>
>> Just missing some opinion from the virtio maintainers ?
>
> Apologize for being late for review. Assuming you have done proper
> test, this patch set looks good to me. (well, I don't quite like
> the tons of "#ifdef ... #else ..#end" block though)
>
> A side note is that I noticed an ABI breakage introduced in this
> patch, so, this release is not a good fit?
Thank you for the review.
For reference, here is the report of the ABI checker for EAL:
[−] struct rte_pci_ioport (2)
1 Field len has been added to this type.
1) This field will not be initialized by old clients.
2) Size of the inclusive type has been changed.
NOTE: this field should be accessed only from the new library
functions, otherwise it may result in crash or incorrect behavior
of applications.
2 Size of this type has been changed from 16 bytes to 24 bytes.
The fields or parameters of such data type may be incorrectly
initialized or accessed by old client applications.
[−] affected symbols (4)
rte_eal_pci_ioport_map ( struct rte_pci_device* dev, int bar,
struct rte_pci_ioport* p ) @@ DPDK_16.04
3rd parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
rte_eal_pci_ioport_read ( struct rte_pci_ioport* p, void* data,
size_t len, off_t offset ) @@ DPDK_16.04
1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
rte_eal_pci_ioport_unmap ( struct rte_pci_ioport* p ) @@ DPDK_16.04
1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
rte_eal_pci_ioport_write ( struct rte_pci_ioport* p, void const* data,
size_t len, off_t offset ) @@ DPDK_16.04
1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
My understanding of the comment for this structure is that it's
internal to EAL:
/**
* A structure used to access io resources for a pci device.
* rte_pci_ioport is arch, os, driver specific, and should not be used
outside
* of pci ioport api.
*/
struct rte_pci_ioport {
...
}
So I'd say it's ok to have it integrated for 16.07.
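Since the fields are only meant to be touched through the pci ioport API,
a conforming application always lets the library fill the structure before
using it, along the lines of the sketch below (signatures taken from the
checker report above; the bar number and the device pointer are
illustrative):

	struct rte_pci_ioport p;
	uint8_t val = 0;

	/* the library initializes p, including the new len field */
	if (rte_eal_pci_ioport_map(dev, 0, &p) < 0)
		return;
	rte_eal_pci_ioport_read(&p, &val, sizeof(val), 0);
	rte_eal_pci_ioport_write(&p, &val, sizeof(val), 0);
	rte_eal_pci_ioport_unmap(&p);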
Regards,
Olivier
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v4 0/9] add packet capture framework
@ 2016-05-23 21:38 3% ` Reshma Pattan
` (3 more replies)
0 siblings, 4 replies; 200+ results
From: Reshma Pattan @ 2016-05-23 21:38 UTC (permalink / raw)
To: dev
This patchset includes the below changes:
1)Changes to librte_ether.
2)New library librte_pdump added for packet capture framework.
3)New app/pdump tool added for packet capturing.
4)Test pmd changes done to initialize packet capture framework.
5)Documentation update.
1)librte_pdump
==============
To support packet capturing on DPDK ethernet devices, a new library librte_pdump
is added. Users can develop their own packet capturing applications using the new library APIs.
Operation:
----------
The pdump library provides APIs to support packet capturing on DPDK ethernet
devices: APIs to initialize the packet capture framework, to enable/disable
the packet capture and to uninitialize the packet capture framework.
The pdump library works on a server and client based model.
The server is responsible for enabling/disabling the packet captures.
Clients are responsible for requesting the enabling/disabling of the
packet captures.
As part of the packet capture framework initialization, a pthread and
the server socket are created. Only one server socket is allowed on the system.
As part of enabling/disabling the packet capture, client sockets are created,
and multiple client sockets are allowed.
Whoever calls initialization first succeeds with the initialization;
subsequent calls to initialization are not allowed. So later users can only
request enabling/disabling the packet capture.
Applications using the below APIs need to pass port/device_id, queue, mempool and
ring parameters. The library uses the user-provided ring and mempool to mirror the
rx/tx packets of the port for users. Users need to dequeue the rings and write the
packets to a vdev (pcap/tuntap) to view the packets using any standard tools
(a usage sketch follows the API list below).
Note:
Mempool and Ring should be mc/mp capable.
Mempool mbuf size should be big enough to handle the rx/tx packets of a port.
APIs:
-----
rte_pdump_init()
rte_pdump_enable()
rte_pdump_enable_by_deviceid()
rte_pdump_disable()
rte_pdump_disable_by_deviceid()
rte_pdump_uninit()
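A rough usage sketch from the application side; the exact argument lists
of the rte_pdump_* calls, and the direction flag constant, are assumptions
based on the description above (not copied from the patch, see rte_pdump.h
for the real signatures), and the names and sizes are illustrative:

	struct rte_ring *ring;
	struct rte_mempool *mp;

	rte_pdump_init();	/* primary process, done once */

	/* mc/mp-capable ring and mempool, as the note above requires */
	ring = rte_ring_create("pdump_ring", 16384, rte_socket_id(), 0);
	mp = rte_pktmbuf_pool_create("pdump_pool", 32768, 0, 0, 2176,
			rte_socket_id());

	/* assumed order: port, queue, direction flags, ring, mempool,
	 * filter (reserved, passed as NULL here) */
	rte_pdump_enable(0, 0, RTE_PDUMP_FLAG_RXTX, ring, mp, NULL);
	/* ... rx/tx of port 0, queue 0 is now mirrored into the ring ... */
	rte_pdump_disable(0, 0, RTE_PDUMP_FLAG_RXTX);
	rte_pdump_uninit();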
2)app/pdump tool
================
The app/pdump tool is based on librte_pdump for packet capturing.
The tool by default runs as a secondary process, and provides command line
options for packet capture.
./build/app/dpdk_pdump --
--pdump '(port=<port id> | device_id=<pci id or vdev name>),
(queue=<queue id>),
(rx-dev=<iface or pcap file> |
tx-dev=<iface or pcap file>),
[ring-size=<ring size>],
[mbuf-size=<mbuf data size>],
[total-num-mbufs=<number of mbufs>]'
Parameters inside the parentheses are mandatory parameters.
Parameters inside the square brackets are optional parameters.
The user has to pass the packet capture parameters under --pdump; multiple
--pdump instances can be passed to capture packets on different port and
queue combinations.
Operation:
----------
*The tool parses the user command line arguments and
creates the mempool, the ring and the PCAP PMD vdev, with 'tx_stream' set to
either of the devices passed in the rx-dev|tx-dev parameters.
*It then calls the librte_pdump APIs, i.e. rte_pdump_enable()/rte_pdump_enable_by_deviceid(),
to enable packet capturing on a specific port/device_id and queue by passing on the
port|device_id, queue, mempool and ring info.
*The tool runs in a while loop to dequeue the packets from the ring and write them to the pcap device.
*The tool can be stopped using SIGINT, upon which it calls
rte_pdump_disable()/rte_pdump_disable_by_deviceid() and frees the allocated resources.
Note:
CONFIG_RTE_LIBRTE_PMD_PCAP flag should be set to yes to compile and run the pdump tool.
3)Test-pmd changes
==================
Changes are done to the test-pmd application to initialize/uninitialize the packet capture framework,
so the app/pdump tool can be run to see packets of the DPDK ports that are used by test-pmd.
Similarly, any application which needs packet capture should call the initialize/uninitialize APIs of
librte_pdump and use the pdump tool to start the capture.
4)Packet capture flow between pdump tool and librte_pdump
=========================================================
*The pdump tool (secondary process) requests packet capture
for specific port|device_id and queue combinations.
*The library, in the secondary process context, creates the client socket and communicates
the port|device_id, queue, ring and mempool to the server.
*The library initializes the server in the primary process 'test-pmd' context and serves the client
request to enable ethernet rxtx call-backs for the given port|device_id and queue.
*The server copies the rx/tx packets to the passed mempool and enqueues the packets to the ring for the secondary process.
*The pdump tool dequeues the packets from the ring and writes them to the PCAP PMD vdev,
so ultimately the packets will be seen on the device passed in rx-dev|tx-dev
(a sketch of this drain loop follows below).
*Once the pdump tool is terminated with SIGINT, it disables packet capturing.
*The library receives the disable packet capture request and communicates the info to the server;
the server removes the ethernet rxtx call-backs.
*The captured packets can be seen using the tcpdump command
"tcpdump -ni <iface>" (or) "tcpdump -nr <pcapfile>"
5)Example command line
======================
./build/app/dpdk_pdump -- --pdump 'device_id=0000:02:0.0,queue=*,tx-dev=/tmp/dt-file.pcap,rx-dev=/tmp/dr-file.pcap,ring-size=8192,mbuf-size=2176,total-num-mbufs=32768' --pdump 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/d-file.pcap,tx-dev=/tmp/d-file.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'
v4:
added missing deprecation notice for ABI changes of rte_eth_dev_info structure.
made doc changes as per doc guidelines.
replaced rte_eal_vdev_init with rte_eth_dev_attach in pdump tool.
removed rxtx-dev parameter from pdump tool command line.
v3:
app/pdump: Moved cleanup code from signal handler to main.
divided librte_ether changes into multiple patches.
example command changed in app/pdump application guide
v2:
fix compilation issues for 4.8.3
fix unnecessary #includes
Reshma Pattan (9):
librte_ether: protect add/remove of rxtx callbacks with spinlocks
librte_ether: add new api rte_eth_add_first_rx_callback
librte_ether: add new fields to rte_eth_dev_info struct
librte_ether: make rte_eth_dev_get_port_by_name api public
lib/librte_pdump: add new library for packet capturing support
app/pdump: add pdump tool for packet capturing
app/test-pmd: add pdump initialization uninitialization
doc: update doc for packet capture framework
doc: announce ABI change for rte_eth_dev_info structure
MAINTAINERS | 8 +
app/Makefile | 1 +
app/pdump/Makefile | 45 ++
app/pdump/main.c | 888 ++++++++++++++++++++++++++++++++
app/test-pmd/testpmd.c | 6 +
config/common_base | 5 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 106 ++++
doc/guides/rel_notes/deprecation.rst | 6 +
doc/guides/rel_notes/release_16_07.rst | 11 +
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 115 +++++
lib/Makefile | 1 +
lib/librte_ether/rte_ethdev.c | 121 +++--
lib/librte_ether/rte_ethdev.h | 45 ++
lib/librte_ether/rte_ether_version.map | 8 +
lib/librte_pdump/Makefile | 55 ++
lib/librte_pdump/rte_pdump.c | 816 +++++++++++++++++++++++++++++
lib/librte_pdump/rte_pdump.h | 186 +++++++
lib/librte_pdump/rte_pdump_version.map | 12 +
mk/rte.app.mk | 1 +
21 files changed, 2395 insertions(+), 43 deletions(-)
create mode 100644 app/pdump/Makefile
create mode 100644 app/pdump/main.c
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
create mode 100644 lib/librte_pdump/Makefile
create mode 100644 lib/librte_pdump/rte_pdump.c
create mode 100644 lib/librte_pdump/rte_pdump.h
create mode 100644 lib/librte_pdump/rte_pdump_version.map
--
2.5.0
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v4 9/9] doc: announce ABI change for rte_eth_dev_info structure
2016-05-23 21:38 3% ` [dpdk-dev] [PATCH v4 0/9] " Reshma Pattan
2016-05-23 21:38 6% ` [dpdk-dev] [PATCH v4 8/9] doc: update doc for packet capture framework Reshma Pattan
@ 2016-05-23 21:38 9% ` Reshma Pattan
2016-06-08 13:38 3% ` [dpdk-dev] [PATCH v5 0/9] add packet capture framework Reshma Pattan
3 siblings, 0 replies; 200+ results
From: Reshma Pattan @ 2016-05-23 21:38 UTC (permalink / raw)
To: dev; +Cc: Reshma Pattan
New fields nb_rx_queues and nb_tx_queues will be added to the
rte_eth_dev_info structure.
Changes to the API rte_eth_dev_info_get() will be done to fill
these new fields in the rte_eth_dev_info object.
Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
---
doc/guides/rel_notes/deprecation.rst | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ad05eba..04316fb 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -57,3 +57,9 @@ Deprecation Notices
a handle, like the way kernel exposes an fd to user for locating a
specific file, and to keep all major structures internally, so that
we are likely to be free from ABI violations in future.
+
+* The librte_ether public structure ``rte_eth_dev_info`` will be changed in 16.07.
+ The proposed change will add new parameters ``nb_rx_queues``, ``nb_tx_queues``
+ to the structure. These are the number of queues configured by software.
+ The definition of ``rte_eth_dev_info_get()`` will be modified
+ to fill the new parameters in the ``rte_eth_dev_info`` object.
--
2.5.0
^ permalink raw reply [relevance 9%]
* [dpdk-dev] [PATCH v4 8/9] doc: update doc for packet capture framework
2016-05-23 21:38 3% ` [dpdk-dev] [PATCH v4 0/9] " Reshma Pattan
@ 2016-05-23 21:38 6% ` Reshma Pattan
2016-05-23 21:38 9% ` [dpdk-dev] [PATCH v4 9/9] doc: announce ABI change for rte_eth_dev_info structure Reshma Pattan
2016-06-08 13:38 3% ` [dpdk-dev] [PATCH v5 0/9] add packet capture framework Reshma Pattan
3 siblings, 0 replies; 200+ results
From: Reshma Pattan @ 2016-05-23 21:38 UTC (permalink / raw)
To: dev; +Cc: Reshma Pattan
Added programmers guide for librte_pdump.
Added sample application guide for app/pdump application.
Updated release note for packet capture framework changes.
Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
---
MAINTAINERS | 3 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 106 +++++++++++++++++++++++++++++
doc/guides/rel_notes/release_16_07.rst | 11 +++
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 115 ++++++++++++++++++++++++++++++++
6 files changed, 237 insertions(+)
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index ae706b9..8b00f41 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -437,6 +437,9 @@ Pdump
M: Reshma Pattan <reshma.pattan@intel.com>
F: lib/librte_pdump/
F: app/pdump/
+F: doc/guides/prog_guide/pdump_library.rst
+F: doc/guides/sample_app_ug/pdump.rst
+
Hierarchical scheduler
M: Cristian Dumitrescu <cristian.dumitrescu@intel.com>
diff --git a/doc/guides/prog_guide/index.rst b/doc/guides/prog_guide/index.rst
index b862d0c..4caf969 100644
--- a/doc/guides/prog_guide/index.rst
+++ b/doc/guides/prog_guide/index.rst
@@ -71,6 +71,7 @@ Programmer's Guide
writing_efficient_code
profile_app
glossary
+ pdump_library
**Figures**
diff --git a/doc/guides/prog_guide/pdump_library.rst b/doc/guides/prog_guide/pdump_library.rst
new file mode 100644
index 0000000..8d9ef29
--- /dev/null
+++ b/doc/guides/prog_guide/pdump_library.rst
@@ -0,0 +1,106 @@
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+.. _Pdump_Library:
+
+pdump Library
+=============
+
+The ``pdump`` library provides the framework for packet capturing on DPDK.
+The library provides the below APIs to initialize the packet capture framework, to enable
+or disable the packet capture and to uninitialize the packet capture framework.
+
+``rte_pdump_init()``:
+This API initializes the packet capture framework.
+
+``rte_pdump_enable()``:
+This API enables the packet capture on a given port and the queue.
+Note: the filter option in the API is a placeholder for future enhancements.
+
+``rte_pdump_enable_by_deviceid()``:
+This API enables the packet capture on a given device id (``vdev name or pci address``) and the queue.
+Note: the filter option in the API is a placeholder for future enhancements.
+
+``rte_pdump_disable()``:
+This API disables the packet capture on a given port and the queue.
+
+``rte_pdump_disable_by_deviceid()``:
+This API disables the packet capture on a given device id (``vdev name or pci address``) and the queue.
+
+``rte_pdump_uninit()``:
+This API uninitializes the packet capture framework.
+
+
+Operation
+---------
+
+The ``pdump`` library works on a server and client based model. The server is responsible for enabling or
+disabling the packet capture and the clients are responsible for requesting the enabling or disabling of
+the packet capture.
+
+The packet capture framework, as part of its initialization, creates the pthread and creates the server socket in
+the pthread. The application that calls the framework initialization first will have the server socket created;
+further calls to the framework initialization by the same application or other applications are not allowed, i.e. only
+one server socket is allowed on the system. The other applications can only request the enabling or disabling of
+the packet capture, and a client socket is created to send the request to the server. The server socket will be
+listening to the client requests for enabling or disabling the packet capture.
+
+
+Implementation Details
+----------------------
+
+The library API ``rte_pdump_init()`` initializes the packet capture framework by creating the pthread and the server
+socket. The server socket in the pthread context will be listening to the client requests to enable or disable the
+packet capture. Whoever calls this API first will have the server socket created; subsequent calls to this API
+will not create any further server socket, i.e. only one server socket is allowed.
+
+The library APIs ``rte_pdump_enable()/rte_pdump_enable_by_deviceid()`` enable the packet capture. On each call to
+these APIs, the library creates a separate client socket, creates the pdump enable request and sends the request to the
+server. The server that is listening on the socket will take the request and enable the packet capture by registering the
+Ethernet rx/tx callbacks for the given port or device_id and queue combinations. The server will then mirror the packets
+to the new mempool and enqueue them to the ring that the clients have passed to these APIs. The server also sends the response
+back to the client about the status of the request that was processed. After the response is received from the server, the
+client socket is closed.
+
+The library APIs ``rte_pdump_disable()/rte_pdump_disable_by_deviceid()`` disable the packet capture. On each call to
+these APIs, the library creates a separate client socket, creates the pdump disable request and sends the request to the
+server. The server that is listening on the socket will take the request and disable the packet capture by removing the
+Ethernet rx/tx callbacks for the given port or device_id and queue combinations. The server sends the response back to the
+client about the status of the request that was processed. After the response is received from the server, the client
+socket is closed.
+
+The library API ``rte_pdump_uninit()`` uninitializes the packet capture framework by closing the pthread and the
+server socket.
+
+
+Use Case: Packet Capturing
+--------------------------
+
+The DPDK ``app/pdump`` tool is developed based on this library to capture packets in DPDK.
+Users can use this library to develop their own packet capturing applications.
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 30e78d4..e3cd64a 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -47,6 +47,10 @@ New Features
* Dropped specific Xen Dom0 code.
* Dropped specific anonymous mempool code in testpmd.
+* **Added packet capture framework.**
+
+ * The new library ``librte_pdump`` is added to provide packet capture APIs.
+ * The new ``app/pdump`` tool is added to capture packets on DPDK.
Resolved Issues
---------------
@@ -116,6 +120,11 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* The function ``rte_eth_dev_get_port_by_name`` was changed to a public API.
+
+* Function ``rte_eth_dev_info_get`` updated to return new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ in ``rte_eth_dev_info`` object.
+
ABI Changes
-----------
@@ -127,6 +136,8 @@ ABI Changes
* The ``rte_port_source_params`` structure has new fields to support PCAP file.
It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
+* The ``rte_eth_dev_info`` structure has new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ to support number of queues configured by software.
Shared Library Versions
-----------------------
diff --git a/doc/guides/sample_app_ug/index.rst b/doc/guides/sample_app_ug/index.rst
index 930f68c..96bb317 100644
--- a/doc/guides/sample_app_ug/index.rst
+++ b/doc/guides/sample_app_ug/index.rst
@@ -76,6 +76,7 @@ Sample Applications User Guide
ptpclient
performance_thread
ipsec_secgw
+ pdump
**Figures**
diff --git a/doc/guides/sample_app_ug/pdump.rst b/doc/guides/sample_app_ug/pdump.rst
new file mode 100644
index 0000000..89b14ec
--- /dev/null
+++ b/doc/guides/sample_app_ug/pdump.rst
@@ -0,0 +1,115 @@
+
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+dpdk_pdump Application
+======================
+
+The ``dpdk_pdump`` application is a Data Plane Development Kit (DPDK) application that runs as a DPDK secondary process and
+is capable of enabling packet capture on DPDK ports.
+
+
+Running the Application
+-----------------------
+
+The application has a ``--pdump`` command line option with various sub arguments:
+
+.. code-block:: console
+
+ ./build/app/dpdk_pdump --
+ --pdump '(port=<port id> | device_id=<pci id or vdev name>),
+ (queue=<queue_id>),
+ (rx-dev=<iface or pcap file> |
+ tx-dev=<iface or pcap file>),
+ [ring-size=<ring size>],
+ [mbuf-size=<mbuf data size>],
+ [total-num-mbufs=<number of mbufs>]'
+
+Note:
+
+* Parameters inside the parentheses represent mandatory parameters.
+
+* Parameters inside the square brackets represent optional parameters.
+
+Multiple instances of ``--pdump`` can be passed to capture packets on different port and queue combinations.
+
+
+Parameters
+~~~~~~~~~~
+
+``port``:
+Port id of the eth device on which packets should be captured.
+
+``device_id``:
+PCI address (or) name of the eth device on which packets should be captured.
+
+``queue``:
+Queue id of the eth device on which packets should be captured. The user can pass a queue value of ``*`` to enable
+packet capture on all queues of the eth device.
+
+``rx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+``tx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+ .. Note::
+
+ * To receive ingress packets only, ``rx-dev`` should be passed.
+
+ * To receive egress packets only, ``tx-dev`` should be passed.
+
+ * To receive ingress and egress packets separately, ``rx-dev`` and ``tx-dev``
+ should both be passed with different file names or different Linux iface names.
+
+ * To receive ingress and egress packets together, ``rx-dev`` and ``tx-dev``
+ should both be passed with the same file name or the same Linux iface name.
+
+``ring-size``:
+Size of the ring. This value is used internally for ring creation. The ring will be used to enqueue the packets from
+the primary application to the secondary. This is an optional parameter with default size 16384.
+
+``mbuf-size``:
+Size of the mbuf data. This is used internally for mempool creation. Ideally this value should be the same as
+the primary application's mempool's mbuf data size, which is used for packet RX. This is an optional parameter with
+default size 2176.
+
+``total-num-mbufs``:
+Total number of mbufs in the mempool. This is used internally for mempool creation. This is an optional parameter with default
+value 65535.
+
+
+Example
+-------
+
+.. code-block:: console
+
+ $ sudo ./build/app/dpdk_pdump -- --pdump 'port=0,queue=*,rx-dev=/tmp/rx.pcap'
--
2.5.0
^ permalink raw reply [relevance 6%]
* Re: [dpdk-dev] [PATCH v4 3/9] librte_ether: add new fields to rte_eth_dev_info struct
@ 2016-05-23 22:24 3% ` Stephen Hemminger
2016-05-24 8:09 3% ` Pattan, Reshma
0 siblings, 1 reply; 200+ results
From: Stephen Hemminger @ 2016-05-23 22:24 UTC (permalink / raw)
To: Reshma Pattan; +Cc: dev
On Mon, 23 May 2016 22:38:26 +0100
Reshma Pattan <reshma.pattan@intel.com> wrote:
> Add new fields to rte_eth_dev_info struct
> New fields nb_rx_queues and nb_tx_queues are added to
> rte_eth_dev_info structure.
> Changes to API rte_eth_dev_info_get() are done to update
> these new fields to rte_eth_dev_info object.
>
> Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
This is an ABI break because rte_eth_dev_info_get will clobber the
stack of the caller if the caller thinks dev_info is the old size.
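(To illustrate the failure mode -- a sketch, assuming an application
binary built against the old, smaller struct:)

	struct rte_eth_dev_info info;	/* old binary reserves the old,
					 * smaller layout on its stack */

	rte_eth_dev_info_get(port_id, &info);
	/* the new library fills sizeof(new struct) bytes, overrunning
	 * 'info' and clobbering whatever the caller stored next to it */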
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v2 5/7] eal/linux: mmap ioports on ppc64
2016-05-23 13:40 3% ` Olivier Matz
@ 2016-05-24 5:15 3% ` Yuanhan Liu
2016-05-30 8:45 0% ` Olivier Matz
0 siblings, 1 reply; 200+ results
From: Yuanhan Liu @ 2016-05-24 5:15 UTC (permalink / raw)
To: Olivier Matz
Cc: David Marchand, dev, Chao Zhu, Xie, Huawei, Panu Matilainen,
Thomas Monjalon
On Mon, May 23, 2016 at 03:40:58PM +0200, Olivier Matz wrote:
> For reference, here is the report of the ABI checker for EAL:
>
> [−] struct rte_pci_ioport (2)
>
> 1 Field len has been added to this type.
> 1) This field will not be initialized by old clients.
> 2) Size of the inclusive type has been changed.
> NOTE: this field should be accessed only from the new library
> functions, otherwise it may result in crash or incorrect behavior
> of applications.
> 2 Size of this type has been changed from 16 bytes to 24 bytes.
> The fields or parameters of such data type may be incorrectly
> initialized or accessed by old client applications.
>
> [−] affected symbols (4)
> rte_eal_pci_ioport_map ( struct rte_pci_device* dev, int bar,
> struct rte_pci_ioport* p ) @@ DPDK_16.04
> 3rd parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
> rte_eal_pci_ioport_read ( struct rte_pci_ioport* p, void* data,
> size_t len, off_t offset ) @@ DPDK_16.04
> 1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
> rte_eal_pci_ioport_unmap ( struct rte_pci_ioport* p ) @@ DPDK_16.04
> 1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
> rte_eal_pci_ioport_write ( struct rte_pci_ioport* p, void const* data,
> size_t len, off_t offset ) @@ DPDK_16.04
> 1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
>
>
> My understanding of the comment for this structure is that it's
> internal to EAL:
I'm not quite sure that is enough. Cc'ed Panu, the guru on ABI stuff,
hopefully he could shed some light on it.
> /**
> * A structure used to access io resources for a pci device.
> * rte_pci_ioport is arch, os, driver specific, and should not be used
> outside
> * of pci ioport api.
> */
> struct rte_pci_ioport {
> ...
> }
>
> So I'd say it's ok to have it integrated for 16.07.
I'll let Thomas to decide it :)
--yliu
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v4 3/9] librte_ether: add new fields to rte_eth_dev_info struct
2016-05-23 22:24 3% ` Stephen Hemminger
@ 2016-05-24 8:09 3% ` Pattan, Reshma
0 siblings, 0 replies; 200+ results
From: Pattan, Reshma @ 2016-05-24 8:09 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: dev
> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Monday, May 23, 2016 11:25 PM
> To: Pattan, Reshma <reshma.pattan@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v4 3/9] librte_ether: add new fields to
> rte_eth_dev_info struct
>
> On Mon, 23 May 2016 22:38:26 +0100
> Reshma Pattan <reshma.pattan@intel.com> wrote:
>
> > Add new fields to rte_eth_dev_info struct New fields nb_rx_queues and
> > nb_tx_queues are added to rte_eth_dev_info structure.
> > Changes to API rte_eth_dev_info_get() are done to update these new
> > fields to rte_eth_dev_info object.
> >
> > Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
>
> This is an ABI break because rte_eth_dev_info_get will clobber the stack of the
> caller if the caller thinks dev_info is the old size.
Yes, and the ABI breakage was announced earlier as an RFC; please check the mails below. This is now the formal patch for that change.
http://dpdk.org/ml/archives/dev/2016-April/037458.html
http://dpdk.org/ml/archives/dev/2016-April/037459.html
http://dpdk.org/ml/archives/dev/2016-April/037460.html
Thanks,
Reshma
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v4] Pci: Add the class_id support
2016-05-19 13:17 7% ` [dpdk-dev] [PATCH v3] " Ziye Yang
@ 2016-05-24 12:50 7% ` Ziye Yang
0 siblings, 1 reply; 200+ results
From: Ziye Yang @ 2016-05-24 12:50 UTC (permalink / raw)
To: dev
This patch is used to add the class_id (class_code,
subclass_code, programming_interface) support for
pci_device probe. With this patch, it will be
flexible for users to probe a class of devices
by class_id.
Signed-off-by: Ziye Yang <ziye.yang@intel.com>
---
Changes in v4: adjust title name and change RTE_PCI_DEVICE macro
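For illustration, a driver could then match a whole class of devices in
its ID table; a minimal sketch (this table is hypothetical, not part of
the patch):

	static const struct rte_pci_id nvme_pci_id_map[] = {
		{
			.class_id = 0x010802,	/* mass storage / NVM / NVMe */
			.vendor_id = PCI_ANY_ID,
			.device_id = PCI_ANY_ID,
			.subsystem_vendor_id = PCI_ANY_ID,
			.subsystem_device_id = PCI_ANY_ID,
		},
		{ .vendor_id = 0, /* sentinel */ },
	};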
doc/guides/rel_notes/deprecation.rst | 6 ------
lib/librte_eal/bsdapp/eal/eal_pci.c | 5 +++++
lib/librte_eal/common/eal_common_pci.c | 3 +++
lib/librte_eal/common/include/rte_pci.h | 4 ++++
lib/librte_eal/linuxapp/eal/eal_pci.c | 10 ++++++++++
5 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ad05eba..a300508 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -20,12 +20,6 @@ Deprecation Notices
do not need to care about the kind of devices that are being used, making it
easier to add new buses later.
-* ABI changes are planned for struct rte_pci_id, i.e., add new field ``class``.
- This new added ``class`` field can be used to probe pci device by class
- related info. This change should impact size of struct rte_pci_id and struct
- rte_pci_device. The release 16.04 does not contain these ABI changes, but
- release 16.07 will.
-
* The xstats API and rte_eth_xstats struct will be changed to allow retrieval
of values without any string copies or parsing.
No backwards compatibility is planned, as it would require code duplication
diff --git a/lib/librte_eal/bsdapp/eal/eal_pci.c b/lib/librte_eal/bsdapp/eal/eal_pci.c
index 2d16d78..7fdd6f1 100644
--- a/lib/librte_eal/bsdapp/eal/eal_pci.c
+++ b/lib/librte_eal/bsdapp/eal/eal_pci.c
@@ -278,6 +278,11 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf)
/* get subsystem_device id */
dev->id.subsystem_device_id = conf->pc_subdevice;
+ /* get class id */
+ dev->id.class_id = (conf->pc_class << 16) |
+ (conf->pc_subclass << 8) |
+ (conf->pc_progif);
+
/* TODO: get max_vfs */
dev->max_vfs = 0;
diff --git a/lib/librte_eal/common/eal_common_pci.c b/lib/librte_eal/common/eal_common_pci.c
index 3cae4cb..6c3117d 100644
--- a/lib/librte_eal/common/eal_common_pci.c
+++ b/lib/librte_eal/common/eal_common_pci.c
@@ -162,6 +162,9 @@ rte_eal_pci_probe_one_driver(struct rte_pci_driver *dr, struct rte_pci_device *d
if (id_table->subsystem_device_id != dev->id.subsystem_device_id &&
id_table->subsystem_device_id != PCI_ANY_ID)
continue;
+ if (id_table->class_id != dev->id.class_id &&
+ id_table->class_id != RTE_CLASS_ANY_ID)
+ continue;
struct rte_pci_addr *loc = &dev->addr;
diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h
index 8fa2712..debc9ca 100644
--- a/lib/librte_eal/common/include/rte_pci.h
+++ b/lib/librte_eal/common/include/rte_pci.h
@@ -125,6 +125,7 @@ struct rte_pci_resource {
* table of these IDs for each device that it supports.
*/
struct rte_pci_id {
+ uint32_t class_id; /**< Class ID (class, subclass, pi) or RTE_CLASS_ANY_ID. */
uint16_t vendor_id; /**< Vendor ID or PCI_ANY_ID. */
uint16_t device_id; /**< Device ID or PCI_ANY_ID. */
uint16_t subsystem_vendor_id; /**< Subsystem vendor ID or PCI_ANY_ID. */
@@ -170,10 +171,12 @@ struct rte_pci_device {
/** Any PCI device identifier (vendor, device, ...) */
#define PCI_ANY_ID (0xffff)
+#define RTE_CLASS_ANY_ID (0xffffff)
#ifdef __cplusplus
/** C++ macro used to help building up tables of device IDs */
#define RTE_PCI_DEVICE(vend, dev) \
+ RTE_CLASS_ANY_ID, \
(vend), \
(dev), \
PCI_ANY_ID, \
@@ -181,6 +184,7 @@ struct rte_pci_device {
#else
/** Macro used to help building up tables of device IDs */
#define RTE_PCI_DEVICE(vend, dev) \
+ .class_id = RTE_CLASS_ANY_ID, \
.vendor_id = (vend), \
.device_id = (dev), \
.subsystem_vendor_id = PCI_ANY_ID, \
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
index bdc08a0..e6f0f13 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -306,6 +306,16 @@ pci_scan_one(const char *dirname, uint16_t domain, uint8_t bus,
}
dev->id.subsystem_device_id = (uint16_t)tmp;
+ /* get class_id */
+ snprintf(filename, sizeof(filename), "%s/class",
+ dirname);
+ if (eal_parse_sysfs_value(filename, &tmp) < 0) {
+ free(dev);
+ return -1;
+ }
+ /* only the lower 24 bits are valid: class, subclass, program interface */
+ dev->id.class_id = (uint32_t)tmp & RTE_CLASS_ANY_ID;
+
/* get max_vfs */
dev->max_vfs = 0;
snprintf(filename, sizeof(filename), "%s/max_vfs", dirname);
--
1.9.3
^ permalink raw reply [relevance 7%]
* [dpdk-dev] [PATCH] ethdev: change comments of VLAN type
@ 2016-05-26 7:28 4% Beilei Xing
2016-06-13 8:03 4% ` [dpdk-dev] [PATCH v2] i40e: modify the meaning of single " Beilei Xing
0 siblings, 1 reply; 200+ results
From: Beilei Xing @ 2016-05-26 7:28 UTC (permalink / raw)
To: jingjing.wu; +Cc: dev, Beilei Xing
If the packet carries a single VLAN header, it is treated as the
outer header.
So change the comments of inner VLAN and outer VLAN.
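For example, under the clarified semantics a port carrying single-tagged
traffic configures its TPID through the OUTER type (a sketch; the TPID
value is only an example):

	ret = rte_eth_dev_set_vlan_ether_type(port_id, ETH_VLAN_TYPE_OUTER,
					      0x88a8);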
Signed-off-by: Beilei Xing <beilei.xing@intel.com>
---
doc/guides/rel_notes/release_16_07.rst | 3 +++
lib/librte_ether/rte_ethdev.h | 4 ++--
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 30e78d4..29db86c 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -116,6 +116,9 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* The comments of ``ETH_VLAN_TYPE_INNER`` and ``ETH_VLAN_TYPE_OUTER`` in
+ ``rte_vlan_type`` are changed.
+
ABI Changes
-----------
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 2757510..c5c29fb 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -363,8 +363,8 @@ struct rte_eth_rxmode {
*/
enum rte_vlan_type {
ETH_VLAN_TYPE_UNKNOWN = 0,
- ETH_VLAN_TYPE_INNER, /**< Single VLAN, or inner VLAN. */
- ETH_VLAN_TYPE_OUTER, /**< Outer VLAN. */
+ ETH_VLAN_TYPE_INNER, /**< Inner VLAN. */
+ ETH_VLAN_TYPE_OUTER, /**< Single VLAN, or outer VLAN. */
ETH_VLAN_TYPE_MAX,
};
--
2.5.0
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [PATCH v2 00/19] vhost ABI/API refactoring
@ 2016-05-26 17:04 4% ` Rich Lane
2016-05-27 1:36 4% ` Yuanhan Liu
2016-06-07 3:51 9% ` [dpdk-dev] [PATCH v3 00/20] " Yuanhan Liu
1 sibling, 1 reply; 200+ results
From: Rich Lane @ 2016-05-26 17:04 UTC (permalink / raw)
To: Yuanhan Liu
Cc: dev, Thomas Monjalon, huawei.xie, Panu Matilainen,
Tetsuya Mukawa, Traynor Kevin
On Thu, May 12, 2016 at 10:24 PM, Yuanhan Liu <yuanhan.liu@linux.intel.com>
wrote:
> v2: - exported ifname as well to fix a vhost-pmd issue reported
> by Rich
> - separated the big patch that introduces several new APIs
> into some small patches.
> - updated release note
> - updated version.map
>
Tested-by: Rich Lane <rich.lane@bigswitch.com>
Acked-by: Rich Lane <rich.lane@bigswitch.com>
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [PATCH v2 00/19] vhost ABI/API refactoring
2016-05-26 17:04 4% ` Rich Lane
@ 2016-05-27 1:36 4% ` Yuanhan Liu
0 siblings, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-05-27 1:36 UTC (permalink / raw)
To: Rich Lane
Cc: dev, Thomas Monjalon, huawei.xie, Panu Matilainen,
Tetsuya Mukawa, Traynor Kevin
On Thu, May 26, 2016 at 10:04:23AM -0700, Rich Lane wrote:
> On Thu, May 12, 2016 at 10:24 PM, Yuanhan Liu <yuanhan.liu@linux.intel.com>
> wrote:
>
> v2: - exported ifname as well to fix a vhost-pmd issue reported
> by Rich
> - separated the big patch that introduces several new APIs
> into some small patches.
> - updated release note
> - updated version.map
>
>
> Tested-by: Rich Lane <rich.lane@bigswitch.com>
> Acked-by: Rich Lane <rich.lane@bigswitch.com>
Rich, appreciate your time for reviewing and testing!
--yliu
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [PATCH 1/2] ethdev: add callback to get register size in bytes
@ 2016-05-27 10:28 4% ` Panu Matilainen
2016-05-27 14:43 3% ` Thomas Monjalon
2016-05-30 9:32 0% ` Zyta Szpak
0 siblings, 2 replies; 200+ results
From: Panu Matilainen @ 2016-05-27 10:28 UTC (permalink / raw)
To: zr, remy.horton, thomas.monjalon; +Cc: dev
On 05/25/2016 09:36 AM, zr@semihalf.com wrote:
> From: Zyta Szpak <zr@semihalf.com>
>
> Version 2 of fixing the fixed register width assumption.
> rte_eth_dev_get_reg_length and rte_eth_dev_get_reg callbacks
> do not provide register size to the app in any way. It is
> needed to allocate proper number of bytes before retrieving
> registers content with rte_eth_dev_get_reg.
>
> Signed-off-by: Zyta Szpak <zr@semihalf.com>
> ---
> lib/librte_ether/rte_ethdev.c | 12 ++++++++++++
> lib/librte_ether/rte_ethdev.h | 18 ++++++++++++++++++
> 2 files changed, 30 insertions(+)
>
> diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
> index a31018e..e0765f8 100644
> --- a/lib/librte_ether/rte_ethdev.c
> +++ b/lib/librte_ether/rte_ethdev.c
> @@ -3231,6 +3231,18 @@ rte_eth_dev_get_reg_length(uint8_t port_id)
> }
>
> int
> +rte_eth_dev_get_reg_width(uint8_t port_id)
> +{
> + struct rte_eth_dev *dev;
> +
> + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
> +
> + dev = &rte_eth_devices[port_id];
> + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->get_reg_width, -ENOTSUP);
> + return (*dev->dev_ops->get_reg_width)(dev);
> +}
> +
> +int
> rte_eth_dev_get_reg_info(uint8_t port_id, struct rte_dev_reg_info *info)
> {
> struct rte_eth_dev *dev;
> diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
> index 2757510..552eaed 100644
> --- a/lib/librte_ether/rte_ethdev.h
> +++ b/lib/librte_ether/rte_ethdev.h
> @@ -1292,6 +1292,9 @@ typedef int (*eth_timesync_write_time)(struct rte_eth_dev *dev,
> typedef int (*eth_get_reg_length_t)(struct rte_eth_dev *dev);
> /**< @internal Retrieve device register count */
>
> +typedef int (*eth_get_reg_width_t)(struct rte_eth_dev *dev);
> +/**< @internal Retrieve device register byte number */
> +
> typedef int (*eth_get_reg_t)(struct rte_eth_dev *dev,
> struct rte_dev_reg_info *info);
> /**< @internal Retrieve registers */
> @@ -1455,6 +1458,8 @@ struct eth_dev_ops {
>
> eth_get_reg_length_t get_reg_length;
> /**< Get # of registers */
> + eth_get_reg_width_t get_reg_width;
> + /**< Get # of bytes in register */
> eth_get_reg_t get_reg;
> /**< Get registers */
> eth_get_eeprom_length_t get_eeprom_length;
This is an ABI break, but maybe it is part of that "driver
implementation API" which is exempt from the ABI/API policies. Thomas?
> @@ -3971,6 +3976,19 @@ int rte_eth_tx_queue_info_get(uint8_t port_id, uint16_t queue_id,
> */
> int rte_eth_dev_get_reg_length(uint8_t port_id);
>
> +/*
> + * Retrieve the number of bytes in a register for a specific device
> + *
> + * @param port_id
> + * The port identifier of the Ethernet device.
> + * @return
> + * - (>=0) register width in bytes if successful.
> + * - (-ENOTSUP) if hardware doesn't support.
> + * - (-ENODEV) if *port_id* invalid.
> + * - others depends on the specific operations implementation.
> + */
> +int rte_eth_dev_get_reg_width(uint8_t port_id);
> +
> /**
> * Retrieve device registers and register attributes
> *
The function needs to be exported via rte_ether_version.map as well.
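For reference, the intended caller flow would be roughly this (a sketch
against the proposed API):

	int count = rte_eth_dev_get_reg_length(port_id);
	int width = rte_eth_dev_get_reg_width(port_id);

	if (count >= 0 && width >= 0) {
		struct rte_dev_reg_info info = { 0 };

		/* size the snapshot buffer from count * width instead of
		 * assuming a fixed 32-bit register width */
		info.data = malloc((size_t)count * (size_t)width);
		if (info.data != NULL)
			rte_eth_dev_get_reg_info(port_id, &info);
	}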
- Panu -
>
^ permalink raw reply [relevance 4%]
* [dpdk-dev] [PATCH v2] mbuf: new flag when Vlan is stripped
2016-05-23 8:46 2% ` [dpdk-dev] [PATCH] mbuf: new flag when Vlan " Olivier Matz
2016-05-23 8:59 0% ` Ananyev, Konstantin
2016-05-23 9:20 0% ` Ananyev, Konstantin
@ 2016-05-27 14:33 2% ` Olivier Matz
2016-06-15 11:48 2% ` [dpdk-dev] [PATCH v3] " Olivier Matz
2 siblings, 1 reply; 200+ results
From: Olivier Matz @ 2016-05-27 14:33 UTC (permalink / raw)
To: dev
Cc: johndale, konstantin.ananyev, helin.zhang, adrien.mazarguil,
rahul.lakkireddy, alejandro.lucero, sony.chacko
The behavior of PKT_RX_VLAN_PKT was not very well defined, resulting in
PMDs not advertising the same flags in similar conditions.
Following discussion in [1], introduce 2 new flags PKT_RX_VLAN_STRIPPED
and PKT_RX_QINQ_STRIPPED that are better defined:
PKT_RX_VLAN_STRIPPED: a vlan has been stripped by the hardware and its
tci is saved in mbuf->vlan_tci. This can only happen if vlan stripping
is enabled in the RX configuration of the PMD.
For now, the old flag PKT_RX_VLAN_PKT is kept but marked as deprecated.
It should be removed from applications and PMDs in a future revision.
This patch also updates the drivers. For PKT_RX_VLAN_PKT:
- e1000, enic, i40e, mlx5, nfp, vmxnet3: done, PKT_RX_VLAN_PKT already
had the same meaning as PKT_RX_VLAN_STRIPPED, only a minor update is
required.
- fm10k: done, PKT_RX_VLAN_PKT already had the same meaning as
PKT_RX_VLAN_STRIPPED, and vlan stripping is always enabled on fm10k.
- ixgbe: modification done (vector and normal), the old flag was set
when a vlan was recognized, even if vlan stripping was disabled.
- the other drivers do not support vlan stripping.
For PKT_RX_QINQ_PKT, it was only supported on i40e, and the behavior was
already correct, so we can reuse the same bit value for
PKT_RX_QINQ_STRIPPED.
[1] http://dpdk.org/ml/archives/dev/2016-April/037837.html
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
v1 -> v2:
- fix ixgbe (vector mode) and i40e (normal and vector mode)
- store vlan flags instead of a boolean value in ixgbe rxq, as
suggested by Konstantin
- replay tests on ixgbe (normal + vector) and i40e (normal +
vector). See below.
RFC -> v1:
- fix checkpatch and check-git-log.sh issues
- add a deprecation notice for the old vlan flags
- rebase on head
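As an illustration of the new semantics on the application side (a
sketch; the helpers are made-up names):

	if (m->ol_flags & PKT_RX_VLAN_STRIPPED) {
		/* HW removed the tag; the TCI is only in the metadata */
		handle_vlan(m->vlan_tci);	/* hypothetical helper */
	} else {
		/* a tag, if any, is still inline in the packet data */
		handle_inline_vlan(m);		/* hypothetical helper */
	}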
This patch is tested on ixgbe (normal + vector), i40e (normal +
vector) and igb (hardware is a 82575):
# we use scapy to send packets like this:
# Ether(src="00:01:02:03:04:05", dst="00:1B:21:AB:8F:10")/Dot1Q(vlan=0x666)/IP()/UDP()/Raw("x"*32)
cd dpdk.org/
make config T=x86_64-native-linuxapp-gcc
make -j32
mkdir -p /mnt/huge
mount -t hugetlbfs nodev /mnt/huge
echo 256 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
modprobe uio_pci_generic
# test-pmd is started with vlan stripping, using the rx-vector
# function if available (i40e and ixgbe)
./build/app/testpmd -l 2,4 -- --total-num-mbufs=65536 -i --port-topology=chained \
--disable-hw-vlan-filter
# to disable vlan stripping, add:
--disable-hw-vlan-strip
# to disable the vector mode (it can be checked in debug logs), add:
--enable-rx-cksum
# we run test-pmd in rxonly mode, displaying the packet information.
set fwd rxonly
set verbose 1
start
==== IXGBE normal rx function
# ixgbe: the behavior of the flag PKT_RX_VLAN_PKT is kept as before,
# and the new flag PKT_RX_VLAN_STRIPPED is introduced when vlan stripping
# is enabled and a vlan is stripped.
--- vlan stripping enabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - VLAN tci=0x666 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
PKT_RX_VLAN_PKT
PKT_RX_VLAN_STRIPPED
--- vlan stripping disabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x8100 - length=78 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
PKT_RX_VLAN_PKT
==== IXGBE vector rx function
--- vlan stripping enabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1Unknown packet type
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - VLAN tci=0x666Unknown packet type
- Receive queue=0x0
PKT_RX_VLAN_PKT
PKT_RX_VLAN_STRIPPED
--- vlan stripping disabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1Unknown packet type
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x8100 - length=78 - nb_segs=1Unknown packet type
- Receive queue=0x0
PKT_RX_VLAN_PKT
==== I40E normal rx function
--- vlan stripping enabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4_EXT_UNKNOWN - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - VLAN tci=0x666 - (outer) L2 type: ETHER - (outer) L3 type: IPV4_EXT_UNKNOWN - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
PKT_RX_VLAN_PKT
PKT_RX_VLAN_STRIPPED
--- vlan stripping disabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4_EXT_UNKNOWN - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x8100 - length=78 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4_EXT_UNKNOWN - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4
==== I40E vector rx function
--- vlan stripping enabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1Unknown packet type
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - VLAN tci=0x666Unknown packet type
- Receive queue=0x0
PKT_RX_VLAN_PKT
PKT_RX_VLAN_STRIPPED
--- vlan stripping disabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1Unknown packet type
- Receive queue=0x0
port 0/queue 0: received 1 packets
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x8100 - length=78 - nb_segs=1Unknown packet type
- Receive queue=0x0
==== IGB
(not retested since RFC patch, but there was no code modification)
--- vlan stripping enabled
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x8100 - length=78 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
--- vlan stripping disabled
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - VLAN tci=0x666 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
PKT_RX_VLAN_PKT
PKT_RX_VLAN_STRIPPED
app/test-pmd/rxonly.c | 4 +--
doc/guides/rel_notes/deprecation.rst | 5 ++++
drivers/net/e1000/em_rxtx.c | 3 ++-
drivers/net/e1000/igb_rxtx.c | 3 ++-
drivers/net/enic/enic_rx.c | 2 +-
drivers/net/i40e/i40e_rxtx.c | 4 +--
drivers/net/i40e/i40e_rxtx_vec.c | 2 +-
drivers/net/ixgbe/ixgbe_ethdev.c | 11 ++++++++
drivers/net/ixgbe/ixgbe_rxtx.c | 14 ++++++----
drivers/net/ixgbe/ixgbe_rxtx.h | 2 ++
drivers/net/ixgbe/ixgbe_rxtx_vec.c | 36 +++++++++++++++++---------
drivers/net/mlx5/mlx5_rxtx.c | 6 +++--
drivers/net/nfp/nfp_net.c | 2 +-
drivers/net/vmxnet3/vmxnet3_rxtx.c | 2 +-
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 50 ++++++++++++++++++++++++++++++++----
16 files changed, 114 insertions(+), 34 deletions(-)
diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
index 14555ab..c69b344 100644
--- a/app/test-pmd/rxonly.c
+++ b/app/test-pmd/rxonly.c
@@ -156,9 +156,9 @@ pkt_burst_receive(struct fwd_stream *fs)
printf("hash=0x%x ID=0x%x ",
mb->hash.fdir.hash, mb->hash.fdir.id);
}
- if (ol_flags & PKT_RX_VLAN_PKT)
+ if (ol_flags & PKT_RX_VLAN_STRIPPED)
printf(" - VLAN tci=0x%x", mb->vlan_tci);
- if (ol_flags & PKT_RX_QINQ_PKT)
+ if (ol_flags & PKT_RX_QINQ_STRIPPED)
printf(" - QinQ VLAN tci=0x%x, VLAN tci outer=0x%x",
mb->vlan_tci, mb->vlan_tci_outer);
if (mb->packet_type) {
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ad05eba..2233a90 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -57,3 +57,8 @@ Deprecation Notices
a handle, like the way kernel exposes an fd to user for locating a
specific file, and to keep all major structures internally, so that
we are likely to be free from ABI violations in future.
+
+* The mbuf flags PKT_RX_VLAN_PKT and PKT_RX_QINQ_PKT are deprecated and
+ are respectively replaced by PKT_RX_VLAN_STRIPPED and
+ PKT_RX_QINQ_STRIPPED, that are better described. The old flags and
+ their behavior will be kept in 16.07 and will be removed in 16.11.
diff --git a/drivers/net/e1000/em_rxtx.c b/drivers/net/e1000/em_rxtx.c
index 3d36f21..6d8750a 100644
--- a/drivers/net/e1000/em_rxtx.c
+++ b/drivers/net/e1000/em_rxtx.c
@@ -629,7 +629,8 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
uint64_t pkt_flags;
/* Check if VLAN present */
- pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0);
+ pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
+ PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
return pkt_flags;
}
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 18aeead..9d80a0b 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -729,7 +729,8 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
uint64_t pkt_flags;
/* Check if VLAN present */
- pkt_flags = (rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
+ pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
+ PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
#if defined(RTE_LIBRTE_IEEE1588)
if (rx_status & E1000_RXD_STAT_TMST)
diff --git a/drivers/net/enic/enic_rx.c b/drivers/net/enic/enic_rx.c
index f92f6bc..6459e97 100644
--- a/drivers/net/enic/enic_rx.c
+++ b/drivers/net/enic/enic_rx.c
@@ -197,7 +197,7 @@ enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
/* VLAN stripping */
if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
- pkt_flags |= PKT_RX_VLAN_PKT;
+ pkt_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
mbuf->vlan_tci = enic_cq_rx_desc_vlan(cqrd);
} else {
mbuf->vlan_tci = 0;
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index c833aa3..eea246b 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -88,7 +88,7 @@ i40e_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union i40e_rx_desc *rxdp)
{
if (rte_le_to_cpu_64(rxdp->wb.qword1.status_error_len) &
(1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) {
- mb->ol_flags |= PKT_RX_VLAN_PKT;
+ mb->ol_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
mb->vlan_tci =
rte_le_to_cpu_16(rxdp->wb.qword0.lo_dword.l2tag1);
PMD_RX_LOG(DEBUG, "Descriptor l2tag1: %u",
@@ -99,7 +99,7 @@ i40e_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union i40e_rx_desc *rxdp)
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
if (rte_le_to_cpu_16(rxdp->wb.qword2.ext_status) &
(1 << I40E_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) {
- mb->ol_flags |= PKT_RX_QINQ_PKT;
+ mb->ol_flags |= PKT_RX_QINQ_STRIPPED;
mb->vlan_tci_outer = mb->vlan_tci;
mb->vlan_tci = rte_le_to_cpu_16(rxdp->wb.qword2.l2tag2_2);
PMD_RX_LOG(DEBUG, "Descriptor l2tag2_1: %u, l2tag2_2: %u",
diff --git a/drivers/net/i40e/i40e_rxtx_vec.c b/drivers/net/i40e/i40e_rxtx_vec.c
index eef80d9..634bd39 100644
--- a/drivers/net/i40e/i40e_rxtx_vec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec.c
@@ -154,7 +154,7 @@ desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
/* map rss and vlan type to rss hash and vlan flag */
const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
0, 0, 0, 0,
- 0, 0, 0, PKT_RX_VLAN_PKT,
+ 0, 0, 0, PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
0, 0, 0, 0);
const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index a2b170b..5f3e047 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -1636,6 +1636,7 @@ ixgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev, uint16_t queue, bool on)
{
struct ixgbe_hwstrip *hwstrip =
IXGBE_DEV_PRIVATE_TO_HWSTRIP_BITMAP(dev->data->dev_private);
+ struct ixgbe_rx_queue *rxq;
if (queue >= IXGBE_MAX_RX_QUEUE_NUM)
return;
@@ -1644,6 +1645,16 @@ ixgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev, uint16_t queue, bool on)
IXGBE_SET_HWSTRIP(hwstrip, queue);
else
IXGBE_CLEAR_HWSTRIP(hwstrip, queue);
+
+ if (queue >= dev->data->nb_rx_queues)
+ return;
+
+ rxq = dev->data->rx_queues[queue];
+
+ if (on)
+ rxq->vlan_flags = PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
+ else
+ rxq->vlan_flags = PKT_RX_VLAN_PKT;
}
static void
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 9c6eaf2..5a7064c 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -1221,7 +1221,7 @@ ixgbe_rxd_pkt_info_to_pkt_flags(uint16_t pkt_info)
}
static inline uint64_t
-rx_desc_status_to_pkt_flags(uint32_t rx_status)
+rx_desc_status_to_pkt_flags(uint32_t rx_status, uint64_t vlan_flags)
{
uint64_t pkt_flags;
@@ -1230,7 +1230,7 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
* Do not check whether L3/L4 rx checksum done by NIC or not,
* That can be found from rte_eth_rxmode.hw_ip_checksum flag
*/
- pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
+ pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? vlan_flags : 0;
#ifdef RTE_LIBRTE_IEEE1588
if (rx_status & IXGBE_RXD_STAT_TMST)
@@ -1287,6 +1287,7 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
uint32_t pkt_info[LOOK_AHEAD];
int i, j, nb_rx = 0;
uint32_t status;
+ uint64_t vlan_flags = rxq->vlan_flags;
/* get references to current descriptor and S/W ring entry */
rxdp = &rxq->rx_ring[rxq->rx_tail];
@@ -1328,7 +1329,8 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
mb->vlan_tci = rte_le_to_cpu_16(rxdp[j].wb.upper.vlan);
/* convert descriptor fields to rte mbuf flags */
- pkt_flags = rx_desc_status_to_pkt_flags(s[j]);
+ pkt_flags = rx_desc_status_to_pkt_flags(s[j],
+ vlan_flags);
pkt_flags |= rx_desc_error_to_pkt_flags(s[j]);
pkt_flags |= ixgbe_rxd_pkt_info_to_pkt_flags
((uint16_t)pkt_info[j]);
@@ -1544,6 +1546,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_rx;
uint16_t nb_hold;
uint64_t pkt_flags;
+ uint64_t vlan_flags;
nb_rx = 0;
nb_hold = 0;
@@ -1551,6 +1554,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
rx_id = rxq->rx_tail;
rx_ring = rxq->rx_ring;
sw_ring = rxq->sw_ring;
+ vlan_flags = rxq->vlan_flags;
while (nb_rx < nb_pkts) {
/*
* The order of operations here is important as the DD status
@@ -1660,7 +1664,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
/* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
- pkt_flags = rx_desc_status_to_pkt_flags(staterr);
+ pkt_flags = rx_desc_status_to_pkt_flags(staterr, vlan_flags);
pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
pkt_flags = pkt_flags |
ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
@@ -1753,7 +1757,7 @@ ixgbe_fill_cluster_head_buf(
*/
head->vlan_tci = rte_le_to_cpu_16(desc->wb.upper.vlan);
pkt_info = rte_le_to_cpu_32(desc->wb.lower.lo_dword.data);
- pkt_flags = rx_desc_status_to_pkt_flags(staterr);
+ pkt_flags = rx_desc_status_to_pkt_flags(staterr, rxq->vlan_flags);
pkt_flags |= rx_desc_error_to_pkt_flags(staterr);
pkt_flags |= ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
head->ol_flags = pkt_flags;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
index 3691a19..2608b36 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.h
+++ b/drivers/net/ixgbe/ixgbe_rxtx.h
@@ -146,6 +146,8 @@ struct ixgbe_rx_queue {
uint8_t crc_len; /**< 0 if CRC stripped, 4 otherwise. */
uint8_t drop_en; /**< If not 0, set SRRCTL.Drop_En. */
uint8_t rx_deferred_start; /**< not in global dev start. */
+ /** flags to set in mbuf when a vlan is detected. */
+ uint64_t vlan_flags;
/** need to alloc dummy mbuf, for wraparound when scanning hw ring */
struct rte_mbuf fake_mbuf;
/** hold packets to return to application */
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index e97ea82..d895bf1 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -140,10 +140,9 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
*/
#ifdef RTE_IXGBE_RX_OLFLAGS_ENABLE
-#define VTAG_SHIFT (3)
-
static inline void
-desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
+desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
+ struct rte_mbuf **rx_pkts)
{
__m128i ptype0, ptype1, vtag0, vtag1;
union {
@@ -151,12 +150,6 @@ desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
uint64_t dword;
} vol;
- /* pkt type + vlan olflags mask */
- const __m128i pkttype_msk = _mm_set_epi16(
- 0x0000, 0x0000, 0x0000, 0x0000,
- PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
- PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);
-
/* mask everything except rss type */
const __m128i rsstype_msk = _mm_set_epi16(
0x0000, 0x0000, 0x0000, 0x0000,
@@ -168,6 +161,19 @@ desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);
+ /* mask everything except vlan present bit */
+ const __m128i vlan_msk = _mm_set_epi16(
+ 0x0000, 0x0000,
+ 0x0000, 0x0000,
+ IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
+ IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);
+ /* map vlan present (0x8) to ol_flags */
+ const __m128i vlan_map = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, vlan_flags,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0);
+
ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
@@ -178,8 +184,8 @@ desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);
vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
- vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);
- vtag1 = _mm_and_si128(vtag1, pkttype_msk);
+ vtag1 = _mm_and_si128(vtag1, vlan_msk);
+ vtag1 = _mm_shuffle_epi8(vlan_map, vtag1);
vtag1 = _mm_or_si128(ptype0, vtag1);
vol.dword = _mm_cvtsi128_si64(vtag1);
@@ -221,6 +227,7 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
0, 0 /* ignore pkt_type field */
);
__m128i dd_check, eop_check;
+ uint8_t vlan_flags;
/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
@@ -270,6 +277,11 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
*/
sw_ring = &rxq->sw_ring[rxq->rx_tail];
+ /* ensure these 2 flags are in the lower 8 bits */
+ RTE_BUILD_BUG_ON(((PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED) &
+ 0xffffffffffffff00ULL) != 0ULL);
+ vlan_flags = rxq->vlan_flags & 0xff;
+
/* A. load 4 packet in one loop
* [A*. mask out 4 unused dirty field in desc]
* B. copy 4 mbuf point from swring to rx_pkts
@@ -330,7 +342,7 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);
/* set ol_flags with vlan packet type */
- desc_to_olflags_v(descs, &rx_pkts[pos]);
+ desc_to_olflags_v(descs, vlan_flags, &rx_pkts[pos]);
/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 29bfcec..d5b2286 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1051,7 +1051,8 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
+ pkt_buf->ol_flags |= PKT_RX_VLAN_PKT |
+ PKT_RX_VLAN_STRIPPED;
pkt_buf->vlan_tci = vlan_tci;
}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
@@ -1207,7 +1208,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- seg->ol_flags |= PKT_RX_VLAN_PKT;
+ seg->ol_flags |= PKT_RX_VLAN_PKT |
+ PKT_RX_VLAN_STRIPPED;
seg->vlan_tci = vlan_tci;
}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index ea5a2a3..5c9f350 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -1800,7 +1800,7 @@ nfp_net_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if ((rxds->rxd.flags & PCIE_DESC_RX_VLAN) &&
(hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN)) {
mb->vlan_tci = rte_cpu_to_le_32(rxds->rxd.vlan);
- mb->ol_flags |= PKT_RX_VLAN_PKT;
+ mb->ol_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
}
/* Adding the mbuff to the mbuff array passed by the app */
diff --git a/drivers/net/vmxnet3/vmxnet3_rxtx.c b/drivers/net/vmxnet3/vmxnet3_rxtx.c
index 9fe8752..ccafc0c 100644
--- a/drivers/net/vmxnet3/vmxnet3_rxtx.c
+++ b/drivers/net/vmxnet3/vmxnet3_rxtx.c
@@ -579,7 +579,7 @@ vmxnet3_rx_offload(const Vmxnet3_RxCompDesc *rcd, struct rte_mbuf *rxm)
{
/* Check for hardware stripped VLAN tag */
if (rcd->ts) {
- rxm->ol_flags |= PKT_RX_VLAN_PKT;
+ rxm->ol_flags |= (PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
rxm->vlan_tci = rte_le_to_cpu_16((uint16_t)rcd->tci);
}
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index eec1456..2ece742 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -258,8 +258,10 @@ const char *rte_get_rx_ol_flag_name(uint64_t mask)
/* case PKT_RX_HBUF_OVERFLOW: return "PKT_RX_HBUF_OVERFLOW"; */
/* case PKT_RX_RECIP_ERR: return "PKT_RX_RECIP_ERR"; */
/* case PKT_RX_MAC_ERR: return "PKT_RX_MAC_ERR"; */
+ case PKT_RX_VLAN_STRIPPED: return "PKT_RX_VLAN_STRIPPED";
case PKT_RX_IEEE1588_PTP: return "PKT_RX_IEEE1588_PTP";
case PKT_RX_IEEE1588_TMST: return "PKT_RX_IEEE1588_TMST";
+ case PKT_RX_QINQ_STRIPPED: return "PKT_RX_QINQ_STRIPPED";
default: return NULL;
}
}
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 11fa06d..76b4f55 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -79,7 +79,16 @@ extern "C" {
* Keep these flags synchronized with rte_get_rx_ol_flag_name() and
* rte_get_tx_ol_flag_name().
*/
-#define PKT_RX_VLAN_PKT (1ULL << 0) /**< RX packet is a 802.1q VLAN packet. */
+
+/**
+ * Deprecated.
+ * RX packet is an 802.1q VLAN packet. This flag was set by PMDs when
+ * the packet is recognized as a VLAN, but the behavior between PMDs
+ * was not the same. This flag is kept for some time to avoid breaking
+ * applications and should be replaced by PKT_RX_VLAN_STRIPPED.
+ */
+#define PKT_RX_VLAN_PKT (1ULL << 0)
+
#define PKT_RX_RSS_HASH (1ULL << 1) /**< RX packet with RSS hash result. */
#define PKT_RX_FDIR (1ULL << 2) /**< RX packet with FDIR match indicate. */
#define PKT_RX_L4_CKSUM_BAD (1ULL << 3) /**< L4 cksum of RX pkt. is not OK. */
@@ -89,11 +98,37 @@ extern "C" {
#define PKT_RX_HBUF_OVERFLOW (0ULL << 0) /**< Header buffer overflow. */
#define PKT_RX_RECIP_ERR (0ULL << 0) /**< Hardware processing error. */
#define PKT_RX_MAC_ERR (0ULL << 0) /**< MAC error. */
+
+/**
+ * A vlan has been stripped by the hardware and its tci is saved in
+ * mbuf->vlan_tci. This can only happen if vlan stripping is enabled
+ * in the RX configuration of the PMD.
+ */
+#define PKT_RX_VLAN_STRIPPED (1ULL << 6)
+
+/* hole, some bits can be reused here */
+
#define PKT_RX_IEEE1588_PTP (1ULL << 9) /**< RX IEEE1588 L2 Ethernet PT Packet. */
#define PKT_RX_IEEE1588_TMST (1ULL << 10) /**< RX IEEE1588 L2/L4 timestamped packet.*/
#define PKT_RX_FDIR_ID (1ULL << 13) /**< FD id reported if FDIR match. */
#define PKT_RX_FDIR_FLX (1ULL << 14) /**< Flexible bytes reported if FDIR match. */
-#define PKT_RX_QINQ_PKT (1ULL << 15) /**< RX packet with double VLAN stripped. */
+
+/**
+ * The 2 vlans have been stripped by the hardware and their tci are
+ * saved in mbuf->vlan_tci (inner) and mbuf->vlan_tci_outer (outer).
+ * This can only happen if vlan stripping is enabled in the RX
+ * configuration of the PMD. If this flag is set, PKT_RX_VLAN_STRIPPED
+ * must also be set.
+ */
+#define PKT_RX_QINQ_STRIPPED (1ULL << 15)
+
+/**
+ * Deprecated.
+ * RX packet with double VLAN stripped.
+ * This flag is replaced by PKT_RX_QINQ_STRIPPED.
+ */
+#define PKT_RX_QINQ_PKT PKT_RX_QINQ_STRIPPED
+
/* add new RX flags here */
/* add new TX flags here */
@@ -761,7 +796,10 @@ struct rte_mbuf {
/*
* The packet type, which is the combination of outer/inner L2, L3, L4
- * and tunnel types.
+ * and tunnel types. The packet_type is about data really present in the
+ * mbuf. Example: if vlan stripping is enabled, a received vlan packet
+ * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
+ * vlan is stripped from the data.
*/
union {
uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
@@ -778,7 +816,8 @@ struct rte_mbuf {
uint32_t pkt_len; /**< Total pkt len: sum of all segments. */
uint16_t data_len; /**< Amount of data in segment buffer. */
- uint16_t vlan_tci; /**< VLAN Tag Control Identifier (CPU order) */
+ /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
+ uint16_t vlan_tci;
union {
uint32_t rss; /**< RSS hash result if RSS enabled */
@@ -804,7 +843,8 @@ struct rte_mbuf {
uint32_t seqn; /**< Sequence number. See also rte_reorder_insert() */
- uint16_t vlan_tci_outer; /**< Outer VLAN Tag Control Identifier (CPU order) */
+ /** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
+ uint16_t vlan_tci_outer;
/* second cache line - fields only used in slow path or on TX */
MARKER cacheline1 __rte_cache_min_aligned;
--
2.8.0.rc3
^ permalink raw reply [relevance 2%]
* Re: [dpdk-dev] [PATCH 1/2] ethdev: add callback to get register size in bytes
2016-05-27 10:28 4% ` Panu Matilainen
@ 2016-05-27 14:43 3% ` Thomas Monjalon
2016-05-30 9:32 0% ` Zyta Szpak
1 sibling, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-05-27 14:43 UTC (permalink / raw)
To: Panu Matilainen; +Cc: zr, remy.horton, dev
2016-05-27 13:28, Panu Matilainen:
> On 05/25/2016 09:36 AM, zr@semihalf.com wrote:
> > @@ -1455,6 +1458,8 @@ struct eth_dev_ops {
> >
> > eth_get_reg_length_t get_reg_length;
> > /**< Get # of registers */
> > + eth_get_reg_width_t get_reg_width;
> > + /**< Get # of bytes in register */
> > eth_get_reg_t get_reg;
> > /**< Get registers */
> > eth_get_eeprom_length_t get_eeprom_length;
>
> This is an ABI break, but maybe it is part of that "driver
> implementation API" which is exempt from the ABI/API policies. Thomas?
Yes dev_ops are for drivers, not for applications.
Thus it should not be impacted by the ABI policy.
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v2 5/7] eal/linux: mmap ioports on ppc64
2016-05-24 5:15 3% ` Yuanhan Liu
@ 2016-05-30 8:45 0% ` Olivier Matz
2016-06-15 16:13 3% ` Thomas Monjalon
0 siblings, 1 reply; 200+ results
From: Olivier Matz @ 2016-05-30 8:45 UTC (permalink / raw)
To: Yuanhan Liu
Cc: David Marchand, dev, Chao Zhu, Xie, Huawei, Panu Matilainen,
Thomas Monjalon
On 05/24/2016 07:15 AM, Yuanhan Liu wrote:
> On Mon, May 23, 2016 at 03:40:58PM +0200, Olivier Matz wrote:
>> For reference, here is the report of the ABI checker for EAL:
>>
>> [−] struct rte_pci_ioport (2)
>>
>> 1 Field len has been added to this type.
>> 1) This field will not be initialized by old clients.
>> 2) Size of the inclusive type has been changed.
>> NOTE: this field should be accessed only from the new library
>> functions, otherwise it may result in crash or incorrect behavior
>> of applications.
>> 2 Size of this type has been changed from 16 bytes to 24 bytes.
>> The fields or parameters of such data type may be incorrectly
>> initialized or accessed by old client applications.
>>
>> [−] affected symbols (4)
>> rte_eal_pci_ioport_map ( struct rte_pci_device* dev, int bar,
>> struct rte_pci_ioport* p ) @@ DPDK_16.04
>> 3rd parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
>> rte_eal_pci_ioport_read ( struct rte_pci_ioport* p, void* data,
>> size_t len, off_t offset ) @@ DPDK_16.04
>> 1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
>> rte_eal_pci_ioport_unmap ( struct rte_pci_ioport* p ) @@ DPDK_16.04
>> 1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
>> rte_eal_pci_ioport_write ( struct rte_pci_ioport* p, void const* data,
>> size_t len, off_t offset ) @@ DPDK_16.04
>> 1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
>>
>>
>> My understanding of the comment for this structure is that it's
>> internal to EAL:
>
> I'm not quite sure that is enough. Cc'ed Panu, the guru on ABI stuff,
> hopefully he could shed some light on it.
>
>> /**
>> * A structure used to access io resources for a pci device.
>> * rte_pci_ioport is arch, os, driver specific, and should not be used
>> outside
>> * of pci ioport api.
>> */
>> struct rte_pci_ioport {
>> ...
>> }
>>
>> So I'd say it's ok to have it integrated for 16.07.
>
> I'll let Thomas to decide it :)
Panu or Thomas, do you have any comment on this?
Thanks,
Olivier
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH 1/2] ethdev: add callback to get register size in bytes
2016-05-27 10:28 4% ` Panu Matilainen
2016-05-27 14:43 3% ` Thomas Monjalon
@ 2016-05-30 9:32 0% ` Zyta Szpak
1 sibling, 0 replies; 200+ results
From: Zyta Szpak @ 2016-05-30 9:32 UTC (permalink / raw)
To: Panu Matilainen, remy.horton, thomas.monjalon; +Cc: dev
On 27.05.2016 12:28, Panu Matilainen wrote:
> On 05/25/2016 09:36 AM, zr@semihalf.com wrote:
>> From: Zyta Szpak <zr@semihalf.com>
>>
>> Version 2 of fixing the fixed register width assumption.
>> rte_eth_dev_get_reg_length and rte_eth_dev_get_reg callbacks
>> do not provide register size to the app in any way. It is
>> needed to allocate proper number of bytes before retrieving
>> registers content with rte_eth_dev_get_reg.
>>
>> Signed-off-by: Zyta Szpak <zr@semihalf.com>
>> ---
>> lib/librte_ether/rte_ethdev.c | 12 ++++++++++++
>> lib/librte_ether/rte_ethdev.h | 18 ++++++++++++++++++
>> 2 files changed, 30 insertions(+)
>>
>> diff --git a/lib/librte_ether/rte_ethdev.c
>> b/lib/librte_ether/rte_ethdev.c
>> index a31018e..e0765f8 100644
>> --- a/lib/librte_ether/rte_ethdev.c
>> +++ b/lib/librte_ether/rte_ethdev.c
>> @@ -3231,6 +3231,18 @@ rte_eth_dev_get_reg_length(uint8_t port_id)
>> }
>>
>> int
>> +rte_eth_dev_get_reg_width(uint8_t port_id)
>> +{
>> + struct rte_eth_dev *dev;
>> +
>> + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
>> +
>> + dev = &rte_eth_devices[port_id];
>> + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->get_reg_width, -ENOTSUP);
>> + return (*dev->dev_ops->get_reg_width)(dev);
>> +}
>> +
>> +int
>> rte_eth_dev_get_reg_info(uint8_t port_id, struct rte_dev_reg_info
>> *info)
>> {
>> struct rte_eth_dev *dev;
>> diff --git a/lib/librte_ether/rte_ethdev.h
>> b/lib/librte_ether/rte_ethdev.h
>> index 2757510..552eaed 100644
>> --- a/lib/librte_ether/rte_ethdev.h
>> +++ b/lib/librte_ether/rte_ethdev.h
>> @@ -1292,6 +1292,9 @@ typedef int (*eth_timesync_write_time)(struct
>> rte_eth_dev *dev,
>> typedef int (*eth_get_reg_length_t)(struct rte_eth_dev *dev);
>> /**< @internal Retrieve device register count */
>>
>> +typedef int (*eth_get_reg_width_t)(struct rte_eth_dev *dev);
>> +/**< @internal Retrieve device register byte number */
>> +
>> typedef int (*eth_get_reg_t)(struct rte_eth_dev *dev,
>> struct rte_dev_reg_info *info);
>> /**< @internal Retrieve registers */
>> @@ -1455,6 +1458,8 @@ struct eth_dev_ops {
>>
>> eth_get_reg_length_t get_reg_length;
>> /**< Get # of registers */
>> + eth_get_reg_width_t get_reg_width;
>> + /**< Get # of bytes in register */
>> eth_get_reg_t get_reg;
>> /**< Get registers */
>> eth_get_eeprom_length_t get_eeprom_length;
>
> This is an ABI break, but maybe it is part of that "driver
> implementation API" which is exempt from the ABI/API policies. Thomas?
>
>> @@ -3971,6 +3976,19 @@ int rte_eth_tx_queue_info_get(uint8_t port_id,
>> uint16_t queue_id,
>> */
>> int rte_eth_dev_get_reg_length(uint8_t port_id);
>>
>> +/*
>> + * Retrieve the number of bytes in a register for a specific device
>> + *
>> + * @param port_id
>> + * The port identifier of the Ethernet device.
>> + * @return
>> + * - (>=0) register width in bytes if successful.
>> + * - (-ENOTSUP) if hardware doesn't support.
>> + * - (-ENODEV) if *port_id* invalid.
>> + * - others depends on the specific operations implementation.
>> + */
>> +int rte_eth_dev_get_reg_width(uint8_t port_id);
>> +
>> /**
>> * Retrieve device registers and register attributes
>> *
>
> The function needs to be exported via rte_ether_version.map as well.
OK, right!
>
> - Panu -
>>
>
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v5 1/3] mempool: support external handler
@ 2016-05-30 9:41 3% ` Jerin Jacob
2016-05-30 11:27 0% ` Hunt, David
0 siblings, 1 reply; 200+ results
From: Jerin Jacob @ 2016-05-30 9:41 UTC (permalink / raw)
To: Hunt, David; +Cc: dev, olivier.matz, yuanhan.liu, pmatilai
On Fri, May 27, 2016 at 03:44:31PM +0100, Hunt, David wrote:
>
>
Hi David,
[snip]
> That chunk of code above would be better moved all right. I'd suggest
> moving it to the
> rte_mempool_create function, as that's the one that needs the backward
> compatibility.
OK
>
> On the flags issue, each mempool handler can re-interpret the flags as
> needed. Maybe we
> could use the upper half of the bits for different handlers, changing the
> meaning of the
> bits depending on which handler is being set up. We can then keep the lower
> half for bits that are common across all handlers? That way the user can
Common lower half bit in flags looks good.
> just set the bits they
> are interested in for that handler. Also, the alloc function has access to
> the flags, so maybe the
> handler specific setup could be handled in the alloc function rather than
> adding a new function pointer?
Yes. I agree.
>
> Of course, that won't help if we need to pass in more data, in which case
> we'd probably need an
> opaque data pointer somewhere. It would probably be most useful to pass it
> in with the
> alloc, which may need the data. Any suggestions?
But the top level rte_mempool_create() function needs to pass the data. Right?
That would be an ABI change. IMO, we need to start thinking about
passing a struct of config data to rte_mempool_create(), so that adding
a new argument later does not break backward compatibility.
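Something like the below, just to illustrate (hypothetical API):

	struct rte_mempool_cfg {
		const char *name;
		unsigned int n;
		unsigned int elt_size;
		unsigned int cache_size;
		int socket_id;
		unsigned int flags;
		/* new members can be appended later without changing
		 * the function signature */
	};

	struct rte_mempool *
	rte_mempool_create_cfg(const struct rte_mempool_cfg *cfg);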
Other points in HW assisted pool manager perspective,
1) Maybe RING can be replaced with some other higher-level abstraction
name for the internal MEMPOOL_F_RING_CREATED flag
2) IMO, It is better to change void *pool in struct rte_mempool to
anonymous union type, something like below, so that mempool
implementation can choose the best type.
union {
void *pool;
uint64_t val;
}
3) int32_t handler_idx creates a 4-byte hole in struct rte_mempool on
64-bit systems. IMO it is better to rearrange (as const struct rte_memzone
*mz comes next).
4) IMO, it is better to change ext_alloc/ext_free to ext_create/ext_destroy,
as there is no allocation in the HW-assisted pool manager case;
it will mostly be doing some HW initialization.
>
> Regards,
> Dave.
>
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v5 1/3] mempool: support external handler
2016-05-30 9:41 3% ` Jerin Jacob
@ 2016-05-30 11:27 0% ` Hunt, David
0 siblings, 0 replies; 200+ results
From: Hunt, David @ 2016-05-30 11:27 UTC (permalink / raw)
To: Jerin Jacob; +Cc: dev, olivier.matz, yuanhan.liu, pmatilai
On 5/30/2016 10:41 AM, Jerin Jacob wrote:
--snip--
>> Of course, that won't help if we need to pass in more data, in which case
>> we'd probably need an
>> opaque data pointer somewhere. It would probably be most useful to pass it
>> in with the
>> alloc, which may need the data. Any suggestions?
> But the top level rte_mempool_create() function needs to pass the data. Right?
> That would be an ABI change. IMO, we need to start thinking about
> passing a struct of config data to rte_mempool_create to create
> backward compatibility on new argument addition to rte_mempool_create()
New mempool handlers will use rte_mempool_create_empty(),
rte_mempool_set_handler(), then rte_mempool_populate_*(). These three
functions are new to this release, so there is no problem adding a
parameter to one of them for the config data. Also, since we're adding
some new elements to the mempool structure, how about we add a void
pointer to a config data structure, as defined by the handler.
So, a new element in the rte_mempool struct alongside *pool:
void *pool;
void *pool_config;
Then add a param to the rte_mempool_set_handler function:
int
rte_mempool_set_handler(struct rte_mempool *mp, const char *name,
	void *pool_config)

The function would simply set the pointer in the mempool struct, and the
custom handler's alloc/create function would use it as appropriate.
Handlers that do not need this data can be passed NULL.
> Other points in HW assisted pool manager perspective,
>
> 1) May be RING can be replaced with some other higher abstraction name
> for the internal MEMPOOL_F_RING_CREATED flag
Agreed. I'll change it to MEMPOOL_F_POOL_CREATED, since we're already
changing the *ring element of the mempool struct to *pool.
> 2) IMO, It is better to change void *pool in struct rte_mempool to
> anonymous union type, something like below, so that mempool
> implementation can choose the best type.
> union {
> void *pool;
> uint64_t val;
> }
Could we do this by using the union for the *pool_config suggested
above? Would that give you what you need?
> 3) int32_t handler_idx creates 4 byte hole in struct rte_mempool in
> 64 bit system. IMO it better to rearrange.(as const struct rte_memzone
> *mz comes next)
OK, will look at this.
> 4) IMO, It is better to change ext_alloc/ext_free to ext_create/ext_destroy
> as their is no allocation in HW assisted pool manager case,
> it will be mostly creating some HW initialization.
OK, I'll change. I think that makes more sense.
Regards,
Dave.
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] about rx checksum flags
@ 2016-05-31 20:28 3% ` Stephen Hemminger
2016-05-31 20:58 0% ` Olivier MATZ
0 siblings, 1 reply; 200+ results
From: Stephen Hemminger @ 2016-05-31 20:28 UTC (permalink / raw)
To: Olivier MATZ
Cc: Yuanhan Liu, dev, Ananyev, Konstantin, Richardson, Bruce,
Adrien Mazarguil, Tan, Jianfeng
On Tue, 31 May 2016 21:11:59 +0200
Olivier MATZ <olivier.matz@6wind.com> wrote:
>
>
> On 05/31/2016 10:09 AM, Yuanhan Liu wrote:
> > On Mon, May 30, 2016 at 05:26:21PM +0200, Olivier Matz wrote:
> >> PKT_RX_L4_CKSUM_NONE: the L4 checksum is not correct in the packet
> >> data, but the integrity of the L4 header is verified.
> >> -> the application can process the packet but must not verify the
> >> checksum by sw. It has to take care to recalculate the cksum
> >> if the packet is transmitted (either by sw or using tx offload)
> >
> > I like the explanation you made at [1] better :)
> >
> > So in general, I think this proposal is good to have.
>
> Thanks everyone for your feedback.
>
> I'll try to send a first patch proposition soon.
>
> Regards,
> Olivier
I think it is time to ditch the old definitions of Rx checksum and instead
use something more compatible with virtio (and Linux), i.e. have three values:
1) checksum is known good for packet contents
2) checksum value is the one's complement sum for packet contents
3) checksum is undetermined
The original definition seems to be Intel HW centric and applies to a limited
range of devices, making it unusable by a general application.
Break the ABI, and ditch the old values (OK, mark PKT_RX_L4_CKSUM_BAD as
__deprecated and remove all usage).
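Sketching those three states as mbuf flag values (names here are
illustrative only, not an agreed API):

/* hypothetical three-state Rx L4 checksum report, per the list above */
enum rx_l4_cksum_state {
	RX_L4_CKSUM_UNKNOWN = 0, /* 3) undetermined: verify in software */
	RX_L4_CKSUM_GOOD,        /* 1) known good: skip the software check */
	RX_L4_CKSUM_RAW,         /* 2) raw one's complement sum available */
};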
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] about rx checksum flags
2016-05-31 20:28 3% ` Stephen Hemminger
@ 2016-05-31 20:58 0% ` Olivier MATZ
2016-05-31 22:02 0% ` Stephen Hemminger
0 siblings, 1 reply; 200+ results
From: Olivier MATZ @ 2016-05-31 20:58 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Yuanhan Liu, dev, Ananyev, Konstantin, Richardson, Bruce,
Adrien Mazarguil, Tan, Jianfeng
Hi Stephen,
On 05/31/2016 10:28 PM, Stephen Hemminger wrote:
> On Tue, 31 May 2016 21:11:59 +0200
> Olivier MATZ <olivier.matz@6wind.com> wrote:
>
>>
>>
>> On 05/31/2016 10:09 AM, Yuanhan Liu wrote:
>>> On Mon, May 30, 2016 at 05:26:21PM +0200, Olivier Matz wrote:
>>>> PKT_RX_L4_CKSUM_NONE: the L4 checksum is not correct in the packet
>>>> data, but the integrity of the L4 header is verified.
>>>> -> the application can process the packet but must not verify the
>>>> checksum by sw. It has to take care to recalculate the cksum
>>>> if the packet is transmitted (either by sw or using tx offload)
>>>
>>> I like the explanation you made at [1] better :)
>>>
>>> So in general, I think this proposal is good to have.
>>
>> Thanks everyone for your feedback.
>>
>> I'll try to send a first patch proposition soon.
>>
>> Regards,
>> Olivier
>
> I think it is time to ditch the old definitions of Rx checksum and instead
> use something more compatible with virtio (and Linux), i.e. have three values:
> 1) checksum is known good for packet contents
> 2) checksum value is the one's complement sum for packet contents
> 3) checksum is undetermined
> The original definition seems to be Intel HW centric and applies to a limited
> range of devices, making it unusable by a general application.
>
> Break the ABI, and ditch the old values (OK, mark PKT_RX_L4_CKSUM_BAD as
> __deprecated and remove all usage).
>
Don't you think knowing that a checksum is bad could be useful?
In that case the application can drop/log the packet without any
additional CPU cost.
What do you mean by being unusable by a general application?
I think the "2)" also requires a csum_start + csum_offset in the
mbuf structure, right?
Do you also suggest dropping the IP checksum flags?
Will it be possible to manage tunnel checksums?
I think this would be a pretty big change. If there is no additional
argument other than being more compatible with virtio/linux, I'm wondering
if it's worth breaking the API. Let's wait for other opinions.
Thanks for your feedback.
Olivier
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] about rx checksum flags
2016-05-31 20:58 0% ` Olivier MATZ
@ 2016-05-31 22:02 0% ` Stephen Hemminger
2016-06-01 9:06 0% ` Ananyev, Konstantin
0 siblings, 1 reply; 200+ results
From: Stephen Hemminger @ 2016-05-31 22:02 UTC (permalink / raw)
To: Olivier MATZ
Cc: Yuanhan Liu, dev, Ananyev, Konstantin, Richardson, Bruce,
Adrien Mazarguil, Tan, Jianfeng
On Tue, 31 May 2016 22:58:57 +0200
Olivier MATZ <olivier.matz@6wind.com> wrote:
> Hi Stephen,
>
> On 05/31/2016 10:28 PM, Stephen Hemminger wrote:
> > On Tue, 31 May 2016 21:11:59 +0200
> > Olivier MATZ <olivier.matz@6wind.com> wrote:
> >
> >>
> >>
> >> On 05/31/2016 10:09 AM, Yuanhan Liu wrote:
> >>> On Mon, May 30, 2016 at 05:26:21PM +0200, Olivier Matz wrote:
> >>>> PKT_RX_L4_CKSUM_NONE: the L4 checksum is not correct in the packet
> >>>> data, but the integrity of the L4 header is verified.
> >>>> -> the application can process the packet but must not verify the
> >>>> checksum by sw. It has to take care to recalculate the cksum
> >>>> if the packet is transmitted (either by sw or using tx offload)
> >>>
> >>> I like the explanation you made at [1] better :)
> >>>
> >>> So in general, I think this proposal is good to have.
> >>
> >> Thanks everyone for your feedback.
> >>
> >> I'll try to send a first patch proposition soon.
> >>
> >> Regards,
> >> Olivier
> >
> > I think it is time to ditch the old definitions of Rx checksum and instead
> > use something more compatible with virtio (and Linux), i.e. have three values:
> > 1) checksum is known good for packet contents
> > 2) checksum value is the one's complement sum for packet contents
> > 3) checksum is undetermined
> > The original definition seems to be Intel HW centric and applies to a limited
> > range of devices, making it unusable by a general application.
> >
> > Break the ABI, and ditch the old values (OK, mark PKT_RX_L4_CKSUM_BAD as
> > __deprecated and remove all usage).
> >
>
> Don't you think knowing that a checksum is bad could be useful?
Not really. They should be marked as undetermined; then software can
recheck, in case of possibly buggy hardware.
> In that case the application can drop/log the packet without any
> additional CPU cost.
>
> What do you mean by being unusable by a general application?
Right now an application can only see "known bad" or "indeterminate";
there is no way to know which packets are good. Since good is the
desired/expected case, software has to checksum every packet.
>
> I think the "2)" also requires a csum_start + csum_offset in the
> mbuf structure, right?
Not really, it would mean having a way to get the raw one's complement sum
out of the hardware. This is a good way to support the tunnel protocol du jour
without having to have firmware support. Unfortunately, most hardware vendors
don't believe in doing it that way.
> Do you also suggest dropping the IP checksum flags?
IP checksum offload is mostly useless. If an application needs to look
at IP, it can do the whole checksum in very few instructions; the whole header
is in the same cache line as src/dst, so the HW offload is really no help.
>
> Will it be possible to manage tunnel checksums?
>
> I think this would be a pretty big change. If there is no additional
> argument other than being more compatible with virtio/linux, I'm wondering
> if it's worth breaking the API. Let's wait for other opinions.
>
> Thanks for your feedback.
> Olivier
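For reference, the "very few instructions" software check mentioned above is
roughly this (a minimal portable sketch, not DPDK code):

#include <stdint.h>

/* Returns non-zero if the IPv4 header checksum is valid. hdr points to
 * the header (checksum field included); len is the header length in
 * bytes (IHL * 4). The one's complement sum over a valid header,
 * including its checksum field, folds to 0xffff. */
static int
ipv4_cksum_ok(const uint16_t *hdr, int len)
{
	uint32_t sum = 0;
	int i;

	for (i = 0; i < len / 2; i++)
		sum += hdr[i];
	while (sum > 0xffff)	/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return sum == 0xffff;
}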
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] about rx checksum flags
2016-05-31 22:02 0% ` Stephen Hemminger
@ 2016-06-01 9:06 0% ` Ananyev, Konstantin
2016-06-02 7:42 0% ` Chandran, Sugesh
0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2016-06-01 9:06 UTC (permalink / raw)
To: Stephen Hemminger, Olivier MATZ
Cc: Yuanhan Liu, dev, Richardson, Bruce, Adrien Mazarguil, Tan, Jianfeng
> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Tuesday, May 31, 2016 11:03 PM
> To: Olivier MATZ
> Cc: Yuanhan Liu; dev@dpdk.org; Ananyev, Konstantin; Richardson, Bruce; Adrien Mazarguil; Tan, Jianfeng
> Subject: Re: [dpdk-dev] about rx checksum flags
>
> On Tue, 31 May 2016 22:58:57 +0200
> Olivier MATZ <olivier.matz@6wind.com> wrote:
>
> > Hi Stephen,
> >
> > On 05/31/2016 10:28 PM, Stephen Hemminger wrote:
> > > On Tue, 31 May 2016 21:11:59 +0200
> > > Olivier MATZ <olivier.matz@6wind.com> wrote:
> > >
> > >>
> > >>
> > >> On 05/31/2016 10:09 AM, Yuanhan Liu wrote:
> > >>> On Mon, May 30, 2016 at 05:26:21PM +0200, Olivier Matz wrote:
> > >>>> PKT_RX_L4_CKSUM_NONE: the L4 checksum is not correct in the packet
> > >>>> data, but the integrity of the L4 header is verified.
> > >>>> -> the application can process the packet but must not verify the
> > >>>> checksum by sw. It has to take care to recalculate the cksum
> > >>>> if the packet is transmitted (either by sw or using tx offload)
> > >>>
> > >>> I like the explanation you made at [1] better :)
> > >>>
> > >>> So in general, I think this proposal is good to have.
> > >>
> > >> Thanks everyone for your feedback.
> > >>
> > >> I'll try to send a first patch proposition soon.
> > >>
> > >> Regards,
> > >> Olivier
> > >
> > > I think it is time to ditch the old definitions of Rx checksum and instead
> > > use something more compatible with virtio (and Linux), i.e. have three values:
> > > 1) checksum is known good for packet contents
> > > 2) checksum value is the one's complement sum for packet contents
> > > 3) checksum is undetermined
> > > The original definition seems to be Intel HW centric and applies to a limited
> > > range of devices, making it unusable by a general application.
> > >
> > > Break the ABI, and ditch the old values (OK, mark PKT_RX_L4_CKSUM_BAD as
> > > __deprecated and remove all usage).
> > >
> >
> > Don't you think knowing that a checksum is bad could be useful?
>
> Not really. They should be marked as undetermined; then software can
> recheck, in case of possibly buggy hardware.
Hmm, I don't see the point here.
If the HW clearly reports that the checksum is invalid (not unknown),
why does SW have to assume it is 'undetermined' and recheck it?
To me that just means wasted cycles.
In general, it sounds like a really strange approach to me:
writing your SW on the assumption that all the HW you are going to use
will not work correctly.
>
> > In that case the application can drop/log the packet without any
> > additional CPU cost.
> >
> > What do you mean by being unusable by a general application?
>
> Right now an application can only see "known bad" or "indeterminate";
> there is no way to know which packets are good. Since good is the desired/expected
> case, software has to checksum every packet.
>
> >
> > I think the "2)" also requires a csum_start + csum_offset in the
> > mbuf structure, right?
>
> Not really, it would mean having a way to get the raw one's complement sum
> out of the hardware. This is a good way to support the tunnel protocol du jour
> without having to have firmware support. Unfortunately, most hardware vendors
> don't believe in doing it that way.
It might be a good feature to have, but if most HW vendors don't support
it, why bother?
>
>
> > Do you also suggest dropping the IP checksum flags?
>
> IP checksum offload is mostly useless. If an application needs to look
> at IP, it can do the whole checksum in very few instructions; the whole header
> is in the same cache line as src/dst, so the HW offload is really no help.
>
> >
> > Will it be possible to manage tunnel checksums?
> >
> > I think this would be a pretty big change. If there is no additional
> > argument other than being more compatible with virtio/linux, I'm wondering
> > if it's worth breaking the API. Let's wait for other opinions.
I think that what Olivier proposed is good enough and
definitely a step forward from what we have right now.
Konstantin
> >
> > Thanks for your feedback.
> > Olivier
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v6 0/5] mempool: add external mempool manager
2016-05-19 13:44 2% ` [dpdk-dev] mempool: " David Hunt
@ 2016-06-01 16:19 2% ` David Hunt
2016-06-02 13:27 2% ` [dpdk-dev] [PATCH v7 " David Hunt
1 sibling, 1 reply; 200+ results
From: David Hunt @ 2016-06-01 16:19 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob
Here's the latest version of the External Mempool Manager patchset.
It's re-based on top of the latest head as of 1st June 2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
Note: After applying the last patch, run "make config ..." before
compiling. It introduces a config file change.
Note: Hopefully I've addressed all the extensive comments over the
last week. If I've missed any, please let me know, as it would
not have been intentional. I hope I've responded to all comments
via email on the mailing list.
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurrences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments, duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies having one header file (rte_mempool.h); otherwise it
would generate cross-dependency issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file as the standard mempool tests,
avoiding code duplication
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degradation on sp cached operation
* removed stack handler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The External Mempool Manager is an extension to the mempool API that allows
users to add and use an external mempool manager, which allows external memory
subsystems such as external hardware memory management systems and software
based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool manager will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing. These changes are all contained within
RTE_NEXT_ABI defs, and the current or next code can be selected with
the CONFIG_RTE_NEXT_ABI config setting.
There are two aspects to external mempool manager.
1. Adding the code for your new mempool handler. This is achieved by adding a
new mempool handler source file into the librte_mempool library, and
using the REGISTER_MEMPOOL_HANDLER macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_handler to create a new mempool
using the name parameter to identify which handler to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_handler() which sets the mempool's handler
3. rte_mempool_populate_default() and rte_mempool_populate_anon() functions,
which populate the mempool using the relevant handler
Several external mempool managers may be used in the same application. A new
mempool can then be created by using the new 'create' function, providing the
mempool handler name to point the mempool to the relevant mempool manager
callback structure.
The old 'create' function can still be called by legacy programs, and will
internally work out the mempool handle based on the flags provided (single
producer, single consumer, etc.). By default handles are created internally to
implement the built-in DPDK mempool manager and mempool types.
The external mempool manager needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. put - puts an object back into the mempool once an application has
finished with it
3. get - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time a get/put/get_count is called from the application/PMD, the
callback for that mempool is called. These functions are in the fastpath,
and any unoptimised handlers may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_handler()
int
rte_mempool_set_handler(struct rte_mempool *mp, const char *name);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
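As a rough end-to-end usage sketch of that flow (the pool name, sizes,
MY_ELT_SIZE and the "my_handler" name are placeholders):

struct rte_mempool *mp;

mp = rte_mempool_create_empty("my_pool", 8192, MY_ELT_SIZE,
			      256, 0, rte_socket_id(), 0);
if (mp == NULL)
	rte_exit(EXIT_FAILURE, "cannot create empty mempool\n");
if (rte_mempool_set_handler(mp, "my_handler") < 0)
	rte_exit(EXIT_FAILURE, "cannot set mempool handler\n");
if (rte_mempool_populate_default(mp) < 0)
	rte_exit(EXIT_FAILURE, "cannot populate mempool\n");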
The important thing to note is that the mempool handler is passed by name
to rte_mempool_set_handler, which looks through the handler array to
get the handler index, which is then stored in the rte_mempool structure.
This allows multiple processes to use the same mempool, as the function
pointers are accessed via the handler index.
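In other words, each fast-path call dispatches through the per-process
handler table by index, roughly like this (a sketch; field names and
callback signatures are approximate):

static inline int
mempool_handler_get(struct rte_mempool *mp, void **obj_table, unsigned n)
{
	/* resolve function pointers locally in each process */
	const struct rte_mempool_handler *h =
		&rte_mempool_handler_table.handler[mp->handler_idx];

	return h->get(mp->pool, obj_table, n);
}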
The mempool handler structure contains callbacks to the implementation of
the handler, and is set up for registration as follows:
static const struct rte_mempool_handler handler_sp_mc = {
.name = "ring_sp_mc",
.alloc = rte_mempool_common_ring_alloc,
.put = common_ring_sp_put,
.get = common_ring_mc_get,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the handler in the array of handlers
REGISTER_MEMPOOL_HANDLER(handler_sp_mc);
For an example of a simple malloc-based mempool manager, see
lib/librte_mempool/custom_mempool.c
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool manager using simple mallocs
for each mempool object. This file also contains the callbacks and self
registration for the new handler.
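To give a flavour of what such a handler's fast-path callbacks might look
like, a minimal sketch (hypothetical, not the actual custom_mempool.c;
callback signatures approximate; it treats a flat array as a LIFO free list):

#include <errno.h>

struct custom_pool {
	unsigned count;	/* number of free objects on the stack */
	void *objs[];	/* stack of pointers to free objects */
};

static int
custom_get(void *pool, void **obj_table, unsigned n)
{
	struct custom_pool *p = pool;
	unsigned i;

	if (p->count < n)
		return -ENOBUFS;
	for (i = 0; i < n; i++)
		obj_table[i] = p->objs[--p->count];
	return 0;
}

static int
custom_put(void *pool, void * const *obj_table, unsigned n)
{
	struct custom_pool *p = pool;
	unsigned i;

	/* sketch only: assumes objs[] was sized for all objects */
	for (i = 0; i < n; i++)
		p->objs[p->count++] = obj_table[i];
	return 0;
}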
David Hunt (4):
mempool: support external handler
mempool: remove rte_ring from rte_mempool struct
mempool: add default external mempool handler
mbuf: get default mempool handler from configuration
Olivier Matz (1):
app/test: test external mempool handler
^ permalink raw reply [relevance 2%]
* Re: [dpdk-dev] Suggestions for the dpdk stable tree
2016-05-23 2:21 3% ` Yuanhan Liu
@ 2016-06-01 19:01 0% ` Mcnamara, John
0 siblings, 0 replies; 200+ results
From: Mcnamara, John @ 2016-06-01 19:01 UTC (permalink / raw)
To: Yuanhan Liu; +Cc: Christian Ehrhardt, dev, Stephen Hemminger, Thomas Monjalon
> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Monday, May 23, 2016 3:22 AM
> To: Mcnamara, John <john.mcnamara@intel.com>
> Cc: Christian Ehrhardt <christian.ehrhardt@canonical.com>; dev
> <dev@dpdk.org>; Stephen Hemminger <stephen@networkplumber.org>; Thomas
> Monjalon <thomas.monjalon@6wind.com>
> Subject: Re: [dpdk-dev] Suggestions for the dpdk stable tree
>
> > We have been looking at identifying a maintainer and validation engineer
> internally to support the effort but haven't been able to finalize that.
> Once we do we will come back to the mailing list with a proposal and a
> request for comments.
>
> I would nominate myself as the LTS tree maintainer, if it makes sense to
> have one.
Hi Yuanhan,
Thanks for putting your name forward. I think your experience as the dpdk-next-virtio
maintainer will help with this.
> > We would probably be looking at 16.04 or even 16.07 as the basis for the
> LTS at this stage.
>
> Just one opinion from the view of vhost: since 16.07 is a vhost ABI/API
> refactoring release, I'd suggest basing it on 16.07; then we would have
> fewer conflicts when applying later bug-fix patches.
Agreed. At this stage 16.07 makes more sense.
I'll start a separate discussion thread about how the LTS process would work
to see if we can get some consensus from interested parties.
John.
--
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] about rx checksum flags
2016-06-01 9:06 0% ` Ananyev, Konstantin
@ 2016-06-02 7:42 0% ` Chandran, Sugesh
0 siblings, 0 replies; 200+ results
From: Chandran, Sugesh @ 2016-06-02 7:42 UTC (permalink / raw)
To: Ananyev, Konstantin, Stephen Hemminger, Olivier MATZ
Cc: Yuanhan Liu, dev, Richardson, Bruce, Adrien Mazarguil, Tan, Jianfeng
Hi Olivier,
Thank you for working on this.
A comment on the proposal is given below.
Regards
_Sugesh
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ananyev,
> Konstantin
> Sent: Wednesday, June 1, 2016 10:07 AM
> To: Stephen Hemminger <stephen@networkplumber.org>; Olivier MATZ
> <olivier.matz@6wind.com>
> Cc: Yuanhan Liu <yuanhan.liu@linux.intel.com>; dev@dpdk.org; Richardson,
> Bruce <bruce.richardson@intel.com>; Adrien Mazarguil
> <adrien.mazarguil@6wind.com>; Tan, Jianfeng <jianfeng.tan@intel.com>
> Subject: Re: [dpdk-dev] about rx checksum flags
>
>
>
> > -----Original Message-----
> > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > Sent: Tuesday, May 31, 2016 11:03 PM
> > To: Olivier MATZ
> > Cc: Yuanhan Liu; dev@dpdk.org; Ananyev, Konstantin; Richardson, Bruce;
> > Adrien Mazarguil; Tan, Jianfeng
> > Subject: Re: [dpdk-dev] about rx checksum flags
> >
> > On Tue, 31 May 2016 22:58:57 +0200
> > Olivier MATZ <olivier.matz@6wind.com> wrote:
> >
> > > Hi Stephen,
> > >
> > > On 05/31/2016 10:28 PM, Stephen Hemminger wrote:
> > > > On Tue, 31 May 2016 21:11:59 +0200 Olivier MATZ
> > > > <olivier.matz@6wind.com> wrote:
> > > >
> > > >>
> > > >>
> > > >> On 05/31/2016 10:09 AM, Yuanhan Liu wrote:
> > > >>> On Mon, May 30, 2016 at 05:26:21PM +0200, Olivier Matz wrote:
> > > >>>> PKT_RX_L4_CKSUM_NONE: the L4 checksum is not correct in the
> > > >>>> packet data, but the integrity of the L4 header is verified.
> > > >>>> -> the application can process the packet but must not verify the
> > > >>>> checksum by sw. It has to take care to recalculate the cksum
> > > >>>> if the packet is transmitted (either by sw or using tx
> > > >>>> offload)
> > > >>>
> > > >>> I like the explanation you made at [1] better :)
> > > >>>
> > > >>> So in general, I think this proposal is good to have.
> > > >>
> > > >> Thanks everyone for your feedback.
> > > >>
> > > >> I'll try to send a first patch proposition soon.
> > > >>
> > > >> Regards,
> > > >> Olivier
> > > >
> > > > I think it is time to ditch the old definitions of Rx checksum and
> > > > instead use something more compatible with virtio (and Linux), i.e.
> > > > have three values:
> > > > 1) checksum is known good for packet contents
> > > > 2) checksum value is the one's complement sum for packet contents
> > > > 3) checksum is undetermined
> > > > The original definition seems to be Intel HW centric and applies
> > > > to a limited range of devices, making it unusable by a general
> > > > application.
> > > >
> > > > Break the ABI, and ditch the old values (OK, mark
> > > > PKT_RX_L4_CKSUM_BAD as __deprecated and remove all usage).
> > > >
> > >
> > > Don't you think knowing that a checksum is bad could be useful?
> >
> > Not really. They should be marked as undetermined; then software can
> > recheck, in case of possibly buggy hardware.
>
> Hmm, I don't see the point here.
> If the HW clearly reports that the checksum is invalid (not unknown), why
> does SW have to assume it is 'undetermined' and recheck it?
> To me that just means wasted cycles.
> In general, it sounds like a really strange approach to me:
> writing your SW on the assumption that all the HW you are going to use will
> not work correctly.
>
> >
> > > In that case the application can drop/log the packet without any
> > > additional CPU cost.
> > >
> > > What do you mean by being unusable by a general application?
> >
> > Right now an application can only see "known bad" or "indeterminate";
> > there is no way to know which packets are good. Since good is the
> > desired/expected case, software has to checksum every packet.
> >
> > >
> > > I think the "2)" also requires a csum_start + csum_offset in the mbuf
> > > structure, right?
> >
> > Not really, it would mean having a way to get the raw one's complement
> > sum out of the hardware. This is a good way to support the tunnel
> > protocol du jour without having to have firmware support.
> > Unfortunately, most hardware vendors don't believe in doing it that way.
>
> It might be a good feature to have, but if most HW vendors don't support
> it, why bother?
>
> >
> >
> > > Do you also suggest dropping the IP checksum flags?
> >
> > IP checksum offload is mostly useless. If an application needs to look at
> > IP, it can do the whole checksum in very few instructions; the whole
> > header is in the same cache line as src/dst, so the HW offload is really
> > no help.
> >
[Sugesh] The checksum offload can boost the tunneling performance in OVS.
I guess the IP checksum is as important as L4. In some cases the UDP
checksum is zero and there is no need to validate it, but the IP checksum is
present on all packets and must be validated all the time. At higher packet
rates, IP checksum offload can offer a slight performance improvement. What
do you think?
> > >
> > > Will it be possible to manage tunnel checksums?
> > >
> > > I think this would be a pretty big change. If there is no additional
> > > argument other than being more compatible with virtio/linux, I'm
> > > wondering if it's worth breaking the API. Let's wait for other opinions.
>
> I think that what Olivier proposed is good enough and definitely a step
> forward from what we have right now.
>
> Konstantin
>
> > >
> > > Thanks for your feedback.
> > > Olivier
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v7 0/5] mempool: add external mempool manager
2016-06-01 16:19 2% ` [dpdk-dev] [PATCH v6 0/5] mempool: add external mempool manager David Hunt
@ 2016-06-02 13:27 2% ` David Hunt
2016-06-03 14:58 2% ` [dpdk-dev] [PATCH v8 " David Hunt
0 siblings, 2 replies; 200+ results
From: David Hunt @ 2016-06-02 13:27 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob
Here's the latest version of the External Mempool Manager patchset.
It's re-based on top of the latest head as of 19/5/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed handler_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_handler.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurrences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments, duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies having one header file (rte_mempool.h); otherwise it
would generate cross-dependency issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file as the standard mempool tests,
avoiding code duplication
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degradation on sp cached operation
* removed stack handler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The External Mempool Manager is an extension to the mempool API that allows
users to add and use an external mempool manager, which allows external memory
subsystems such as external hardware memory management systems and software
based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool manager will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing. These changes are all contained within
RTE_NEXT_ABI defs, and the current or next code can be selected with
the CONFIG_RTE_NEXT_ABI config setting.
There are two aspects to external mempool manager.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the REGISTER_MEMPOOL_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool,
using the name parameter to identify which ops to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname() which sets the mempool's ops (functions)
3. rte_mempool_populate_default() and rte_mempool_populate_anon() functions,
which populate the mempool using the relevant ops
Several external mempool managers may be used in the same application. A new
mempool can then be created by using the new 'create' function, providing the
mempool ops struct name to point the mempool to the relevant mempool manager
callback structure.
The old 'create' function can still be called by legacy programs, and will
internally work out the mempool handle based on the flags provided (single
producer, single consumer, etc.). By default handles are created internally to
implement the built-in DPDK mempool manager and mempool types.
The external mempool manager needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. put - puts an object back into the mempool once an application has
finished with it
3. get - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time a get/put/get_count is called from the application/PMD, the
callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the ops index.
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = rte_mempool_common_ring_alloc,
.put = common_ring_sp_put,
.get = common_ring_mc_get,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures
REGISTER_MEMPOOL_OPS(ops_sp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool manager using simple mallocs
for each mempool object. This file also contains the callbacks and self
registration for the new handler.
David Hunt (4):
mempool: support external mempool operations
mempool: remove rte_ring from rte_mempool struct
mempool: add default external mempool ops
mbuf: allow apps to change default mempool ops
Olivier Matz (1):
app/test: test external mempool manager
^ permalink raw reply [relevance 2%]
* [dpdk-dev] [PATCH v7 0/5] mempool: add external mempool manager
@ 2016-06-02 13:38 2% ` Hunt, David
0 siblings, 0 replies; 200+ results
From: Hunt, David @ 2016-06-02 13:38 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob
Since the cover letter seems to have gone missing, sending it again:
Here's the latest version of the External Mempool Manager patchset.
It's re-based on top of the latest head as of 19/5/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed handler_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_handler.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurrences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments,
duplicating the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is
required, as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies having one header file (rte_mempool.h); otherwise it
would generate cross-dependency issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file as the standard mempool
tests, avoiding code duplication
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degradation on sp cached
operation
* removed stack handler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The External Mempool Manager is an extension to the mempool API that allows
users to add and use an external mempool manager, which allows external
memory subsystems such as external hardware memory management systems and
software based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool manager will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing. These changes are all contained within
RTE_NEXT_ABI defs, and the current or next code can be selected with
the CONFIG_RTE_NEXT_ABI config setting.
There are two aspects to external mempool manager.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the REGISTER_MEMPOOL_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool,
using the name parameter to identify which ops to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname() which sets the mempool's ops (functions)
3. rte_mempool_populate_default() and rte_mempool_populate_anon() functions,
which populate the mempool using the relevant ops
Several external mempool managers may be used in the same application. A new
mempool can then be created by using the new 'create' function, providing
the mempool ops struct name to point the mempool to the relevant mempool
manager callback structure.
The old 'create' function can still be called by legacy programs, and will
internally work out the mempool handle based on the flags provided (single
producer, single consumer, etc.). By default handles are created internally
to implement the built-in DPDK mempool manager and mempool types.
The external mempool manager needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. put - puts an object back into the mempool once an application has
finished with it
3. get - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time a get/put/get_count is called from the application/PMD, the
callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the ops index.
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = rte_mempool_common_ring_alloc,
.put = common_ring_sp_put,
.get = common_ring_mc_get,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures
REGISTER_MEMPOOL_OPS(ops_sp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool manager using simple
mallocs for each mempool object. This file also contains the callbacks and
self registration for the new handler.
David Hunt (4):
mempool: support external mempool operations
mempool: remove rte_ring from rte_mempool struct
mempool: add default external mempool ops
mbuf: allow apps to change default mempool ops
Olivier Matz (1):
app/test: test external mempool manager
^ permalink raw reply [relevance 2%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
@ 2016-06-02 19:41 3% ` Wiles, Keith
2016-06-02 20:08 3% ` Neil Horman
0 siblings, 1 reply; 200+ results
From: Wiles, Keith @ 2016-06-02 19:41 UTC (permalink / raw)
To: Neil Horman
Cc: Thomas Monjalon, Yuanhan Liu, dev, Richardson, Bruce, Tan,
Jianfeng, Stephen Hemminger, Christian Ehrhardt, Panu Matilainen,
Olivier Matz
On 6/2/16, 12:11 PM, "Neil Horman" <nhorman@tuxdriver.com> wrote:
>
>1) The definition of a config structure that can be passed to rte_eal_init,
>defining the configuration for that running process
Having a configuration structure means we have to have an ABI change to that structure anytime we add or remove an option. I was thinking a very simple DB of some kind would be better. Have the code query the DB to obtain the needed information. The APIs used to query and set the DB need to be very easy to use as well.
Maybe each option can define its own structure if needed or just a simple variable type can be used for the basic types (int, string, bool, …)
Would this work better in the long run, does a fixed structure still make sense?
>
>2) The creation and use of an API that various DPDK libraries can use to
>retrieve that structure (or elements thereof), based on some explicit or imlicit
>id, so that the configuration can be used (I'm thinking here specifically of
>multiple dpdk applications using a dpdk shared library)
>
>3) The removal of the eal_parse_args code from the core dpdk library entirely,
>packaging it instead as its own library that interprets command line arguments
>as currently defined, and populates an instance of the structure defined in (1)
>
>4) Altering the Makefiles, so that the example apps link against the new library
>in (3), altering the app source code to work with the config structure defined
>in (1)
>
>With those steps, I think we will remove the command line bits from the dpdk
>core, and do so without altering the user experience for any of the sample apps
>(which will demonstrate to other developers that the same can be done with their
>applications). From there we will be free to create alternate methods of
>populating the config struct defined in (1) (via JSON file, YAML, XML, or
>whatever).
>
>Neil
>
>> >>
>> >> For the purposes of the example apps, it would seem that either JSON, YAML, or
>> >> the above Lua format would work just fine.
>> >
>> >+1
>> >
>>
>> Regards,
>> ++Keith
>>
>>
>
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-02 19:41 3% ` Wiles, Keith
@ 2016-06-02 20:08 3% ` Neil Horman
2016-06-03 10:29 0% ` Bruce Richardson
0 siblings, 2 replies; 200+ results
From: Neil Horman @ 2016-06-02 20:08 UTC (permalink / raw)
To: Wiles, Keith
Cc: Thomas Monjalon, Yuanhan Liu, dev, Richardson, Bruce, Tan,
Jianfeng, Stephen Hemminger, Christian Ehrhardt, Panu Matilainen,
Olivier Matz
On Thu, Jun 02, 2016 at 07:41:10PM +0000, Wiles, Keith wrote:
>
> On 6/2/16, 12:11 PM, "Neil Horman" <nhorman@tuxdriver.com> wrote:
>
> >
> >1) The definition of a config structure that can be passed to rte_eal_init,
> >defining the configuration for that running process
>
> Having a configuration structure means we have to have an ABI change to that structure anytime we add or remove an option. I was thinking a very simple DB of some kind would be better. Have the code query the DB to obtain the needed information. The APIs used to query and set the DB needs to be very easy to use as well.
That's a fair point. A decent starting point is likely a simple struct that
looks like this:
struct key_vals {
char *key;
union {
ulong longval;
void *ptrval;
} value;
};
struct config {
size_t count;
struct key_vals kvp[0];
};
>
> Maybe each option can define its own structure if needed or just a simple variable type can be used for the basic types (int, string, bool, …)
>
Well, if you have config sections that require multiple elements, I'd handle
that with naming, i.e. if you have a config group that has an int and char
value, I'd name them "group.intval", and "group.charval", so they are
independently searchable, but linked from a nomenclature standpoint.
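For example, a flat linear lookup over the struct sketched above is enough
to resolve such dotted names (sketch only):

#include <string.h>

/* look up a dotted key such as "group.intval" */
static const struct key_vals *
config_find(const struct config *cfg, const char *key)
{
	size_t i;

	for (i = 0; i < cfg->count; i++)
		if (strcmp(cfg->kvp[i].key, key) == 0)
			return &cfg->kvp[i];
	return NULL;
}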
> Would this work better in the long run, does a fixed structure still make sense?
>
No. I think your ABI concerns are valid, but the above is likely a good
starting point to address them.
Best
Neil
> >
> >2) The creation and use of an API that various DPDK libraries can use to
> >retrieve that structure (or elements thereof), based on some explicit or imlicit
> >id, so that the configuration can be used (I'm thinking here specifically of
> >multiple dpdk applications using a dpdk shared library)
> >
> >3) The removal of the eal_parse_args code from the core dpdk library entirely,
> >packaging it instead as its own library that interprets command line arguments
> >as currently defined, and populates an instance of the structure defined in (1)
> >
> >4) Altering the Makefiles, so that the example apps link against the new library
> >in (3), altering the app source code to work with the config structure defined
> >in (1)
> >
> >With those steps, I think we will remove the command line bits from the dpdk
> >core, and do so without altering the user experience for any of the sample apps
> >(which will demonstrate to other developers that the same can be done with their
> >applications). From there we will be free to create alternate methods of
> >populating the config struct defined in (1) (via JSON file, YAML, XML, or
> >whatever).
> >
> >Neil
> >
> >> >>
> >> >> For the purposes of the example apps, it would seem that either JSON, YAML, or
> >> >> the above Lua format would work just fine.
> >> >
> >> >+1
> >> >
> >>
> >> Regards,
> >> ++Keith
> >>
> >>
> >
>
>
>
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
@ 2016-06-02 22:34 3% ` Neil Horman
0 siblings, 0 replies; 200+ results
From: Neil Horman @ 2016-06-02 22:34 UTC (permalink / raw)
To: Matthew Hall
Cc: Wiles, Keith, Thomas Monjalon, Yuanhan Liu, dev, Richardson,
Bruce, Tan, Jianfeng, Stephen Hemminger, Christian Ehrhardt,
Panu Matilainen, Olivier Matz
On Thu, Jun 02, 2016 at 01:53:55PM -0700, Matthew Hall wrote:
> On Thu, Jun 02, 2016 at 04:08:37PM -0400, Neil Horman wrote:
> > struct key_vals {
> > char *key;
> > union {
> > ulong longval;
> > void *ptrval;
> > } value;
> > };
> >
> > struct config {
> > size_t count;
> > struct key_vals kvp[0];
> > };
>
> This sort of code is very 1970s / ioctl / messy binary. And doesn't buy any
> performance advantage because it's just for config.
>
What!? I can't even parse that sentence. Of course it's just for config; we're
talking about a configuration structure. If you want to make it more
complex/hierarchical/whatever, fine, propose a way to do that that isn't ABI
variant in response to config additions. It's just a starting point.
> Something that looks more like sysctl MIBs with hierarchical names or like
> JSON w/ a hierarchy of hash tables and arrays is much less user-hostile.
>
> https://www.freebsd.org/cgi/man.cgi?sysctl(3)
>
I can't even begin to understand what you're after here. sysctl provides a
hierarchy in _exactly_ the same way that I just proposed, by textual consistency
in naming.
> http://json-c.github.io/json-c/json-c-0.12/doc/html/json__object_8h.html
>
So, this is a fine interface to convert text config to a code format, but that's
a decision the application should be making, not something DPDK should mandate.
Neil
> Matthew.
>
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-02 20:08 3% ` Neil Horman
@ 2016-06-03 10:29 0% ` Bruce Richardson
2016-06-03 11:01 0% ` Bruce Richardson
1 sibling, 1 reply; 200+ results
From: Bruce Richardson @ 2016-06-03 10:29 UTC (permalink / raw)
To: Neil Horman
Cc: Wiles, Keith, Thomas Monjalon, Yuanhan Liu, dev, Tan, Jianfeng,
Stephen Hemminger, Christian Ehrhardt, Panu Matilainen,
Olivier Matz
On Thu, Jun 02, 2016 at 04:08:37PM -0400, Neil Horman wrote:
> On Thu, Jun 02, 2016 at 07:41:10PM +0000, Wiles, Keith wrote:
> >
> > On 6/2/16, 12:11 PM, "Neil Horman" <nhorman@tuxdriver.com> wrote:
> >
> > >
> > >1) The definition of a config structure that can be passed to rte_eal_init,
> > >defining the configuration for that running process
> >
> > Having a configuration structure means we have to have an ABI change to that structure anytime we add or remove an option. I was thinking a very simple DB of some kind would be better. Have the code query the DB to obtain the needed information. The APIs used to query and set the DB need to be very easy to use as well.
>
> That's a fair point. A decent starting point is likely a simple struct that
> looks like this:
>
> struct key_vals {
> char *key;
> union {
> ulong longval;
> void *ptrval;
> } value;
> };
>
> struct config {
> size_t count;
> struct key_vals kvp[0];
> };
>
> >
> > Maybe each option can define its own structure if needed or just a simple variable type can be used for the basic types (int, string, bool, …)
> >
> > Well, if you have config sections that require multiple elements, I'd handle
> that with naming, i.e. if you have a config group that has an int and char
> value, I'd name them "group.intval", and "group.charval", so they are
> independently searchable, but linked from a nomenclature standpoint.
>
> > Would this work better in the long run, does a fixed structure still make sense?
> >
> No. I think your ABI concerns are valid, but the above is likely a good
> starting point to address them.
>
> Best
> Neil
I'll throw out one implementation idea here that I looked at previously, for
the reason that it was simple enough to implement with existing code.
We already have the cfgfile library which works with name/value pairs read from
ini files on disk. However, it would be easy enough to add a couple of APIs to
that to allow the user to "set" values inside an ini structure as well. With
that done we can then just add a new eal_init api which takes a single
"struct rte_cfgfile *" as parameter. For those apps that want to just use
inifiles for configuration straight, they can then do:
cfg = rte_cfgfile_load("my_cfg_file");
rte_eal_newinit(cfg);
Those who want a different config can instead do:
cfg = rte_cfgfile_new();
rte_cfgfile_add_section(cfg, "dpdk");
foreach_eal_setting_wanted:
rte_cfgfile_set(cfg, "dpdk", mysetting, myvalue);
rte_eal_newinit(cfg);
We can standardize on a section name, or a couple of standard section names that
are used by DPDK, so that the rest of the config file can contain other data
for the app itself.
What do people think? I mainly like it because it gives us good reuse of what
is already there, and enhances our existing library. As well as this it makes
it trivially easy for apps to use ini files - which seem to be very popular here
- while still giving flexibility for others to use whatever other config format
their app prefers.
/Bruce
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 10:29 0% ` Bruce Richardson
@ 2016-06-03 11:01 0% ` Bruce Richardson
2016-06-03 11:50 0% ` Neil Horman
0 siblings, 1 reply; 200+ results
From: Bruce Richardson @ 2016-06-03 11:01 UTC (permalink / raw)
To: Neil Horman
Cc: Wiles, Keith, Thomas Monjalon, Yuanhan Liu, dev, Tan, Jianfeng,
Stephen Hemminger, Christian Ehrhardt, Panu Matilainen,
Olivier Matz
On Fri, Jun 03, 2016 at 11:29:43AM +0100, Bruce Richardson wrote:
> On Thu, Jun 02, 2016 at 04:08:37PM -0400, Neil Horman wrote:
> > On Thu, Jun 02, 2016 at 07:41:10PM +0000, Wiles, Keith wrote:
> > >
> > > On 6/2/16, 12:11 PM, "Neil Horman" <nhorman@tuxdriver.com> wrote:
> > >
> > > >
> > > >1) The definition of a config structure that can be passed to rte_eal_init,
> > > >defining the configuration for that running process
> > >
> > > Having a configuration structure means we have to have an ABI change to that structure anytime we add or remove an option. I was thinking a very simple DB of some kind would be better. Have the code query the DB to obtain the needed information. The APIs used to query and set the DB need to be very easy to use as well.
> >
> > That's a fair point. A decent starting point is likely a simple struct that
> > looks like this:
> >
> > struct key_vals {
> > char *key;
> > union {
> > ulong longval;
> > void *ptrval;
> > } value;
> > };
> >
> > struct config {
> > size_t count;
> > struct key_vals kvp[0];
> > };
> >
> > >
> > > Maybe each option can define its own structure if needed or just a simple variable type can be used for the basic types (int, string, bool, …)
> > >
> > > Well, if you have config sections that require multiple elements, I'd handle
> > that with naming, i.e. if you have a config group that has an int and char
> > value, I'd name them "group.intval", and "group.charval", so they are
> > independently searchable, but linked from a nomenclature standpoint.
> >
> > > Would this work better in the long run, does a fixed structure still make sense?
> > >
> > No. I think your ABI concerns are valid, but the above is likely a good
> > starting point to address them.
> >
> > Best
> > Neil
>
> I'll throw out one implementation idea here that I looked at previously, for
> the reason that it was simple enough to implement with existing code.
>
> We already have the cfgfile library which works with name/value pairs read from
> ini files on disk. However, it would be easy enough to add a couple of APIs to
> that to allow the user to "set" values inside an ini structure as well. With
> that done we can then just add a new eal_init api which takes a single
> "struct rte_cfgfile *" as parameter. For those apps that want to just use
> ini files for configuration directly, they can then do:
>
> cfg = rte_cfgfile_load("my_cfg_file");
> rte_eal_newinit(cfg);
>
> Those who want a different config can instead do:
>
> cfg = rte_cfgfile_new();
> rte_cfgfile_add_section(cfg, "dpdk");
> foreach_eal_setting_wanted:
> rte_cfgfile_set(cfg, "dpdk", mysetting, myvalue);
> rte_eal_newinit(cfg);
>
From chatting to a couple of other DPDK devs here I suspect I may not have
been entirely clear here with this example. What is being shown above is building
up a "config-file" in memory - or rather a config structure which happens to
have the idea of sections and values as an ini file has. There is no actual
file ever being written to disk, and for those using any non-ini config file
structure for their app, the code overhead of using the APIs above should be
pretty much the same as building up any other set of key-value pairs in
memory to pass to an init function.
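To sketch that equivalence: an app that already keeps its settings in some
other in-memory store (my_cfg_first()/my_cfg_next() and the entry type below
are placeholders for whatever the app happens to use, not real APIs) would
just copy them across before calling init:
    struct rte_cfgfile *cfg = rte_cfgfile_new();
    struct my_entry *e;             /* hypothetical app-side entry type */

    rte_cfgfile_add_section(cfg, "dpdk");
    for (e = my_cfg_first(); e != NULL; e = my_cfg_next(e))
        rte_cfgfile_set(cfg, "dpdk", e->name, e->value);
    rte_eal_newinit(cfg);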
Hope this is a little clearer now.
/Bruce
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 11:01 0% ` Bruce Richardson
@ 2016-06-03 11:50 0% ` Neil Horman
2016-06-03 12:01 0% ` Arnon Warshavsky
2016-06-03 12:14 0% ` Panu Matilainen
0 siblings, 2 replies; 200+ results
From: Neil Horman @ 2016-06-03 11:50 UTC (permalink / raw)
To: Bruce Richardson
Cc: Wiles, Keith, Thomas Monjalon, Yuanhan Liu, dev, Tan, Jianfeng,
Stephen Hemminger, Christian Ehrhardt, Panu Matilainen,
Olivier Matz
On Fri, Jun 03, 2016 at 12:01:30PM +0100, Bruce Richardson wrote:
> On Fri, Jun 03, 2016 at 11:29:43AM +0100, Bruce Richardson wrote:
> > On Thu, Jun 02, 2016 at 04:08:37PM -0400, Neil Horman wrote:
> > > On Thu, Jun 02, 2016 at 07:41:10PM +0000, Wiles, Keith wrote:
> > > >
> > > > On 6/2/16, 12:11 PM, "Neil Horman" <nhorman@tuxdriver.com> wrote:
> > > >
> > > > >
> > > > >1) The definition of a config structure that can be passed to rte_eal_init,
> > > > >defining the configuration for that running process
> > > >
> > > > Having a configuration structure means we have to have an ABI change to that structure anytime we add or remove an option. I was thinking a very simple DB of some kind would be better. Have the code query the DB to obtain the needed information. The APIs used to query and set the DB need to be very easy to use as well.
> > >
> > > That's a fair point. A decent starting point is likely a simple struct that
> > > looks like this:
> > >
> > > struct key_vals {
> > > char *key;
> > > union {
> > > ulong longval;
> > > void *ptrval;
> > > } value;
> > > };
> > >
> > > struct config {
> > > size_t count;
> > > struct key_vals kvp[0];
> > > };
> > >
> > > >
> > > > Maybe each option can define its own structure if needed or just a simple variable type can be used for the basic types (int, string, bool, …)
> > > >
> > > Well, if you have config sections that require multiple elements, I'd handle
> > > that with naming, i.e. if you have a config group that has an int and char
> > > value, I'd name them "group.intval", and "group.charval", so they are
> > > independently searchable, but linked from a nomenclature standpoint.
> > >
> > > > Would this work better in the long run, does a fixed structure still make sense?
> > > >
> > > No. I think your ABI concerns are valid, but the above is likely a good
> > > starting point to address them.
> > >
> > > Best
> > > Neil
> >
> > I'll throw out one implementation idea here that I looked at previously, for
> > the reason that it was simple enough to implement with existing code.
> >
> > We already have the cfgfile library which works with name/value pairs read from
> > ini files on disk. However, it would be easy enough to add a couple of APIs to
> > that to allow the user to "set" values inside an ini structure as well. With
> > that done we can then just add a new eal_init api which takes a single
> > "struct rte_cfgfile *" as parameter. For those apps that want to just use
> > ini files for configuration directly, they can then do:
> >
> > cfg = rte_cfgfile_load("my_cfg_file");
> > rte_eal_newinit(cfg);
> >
> > Those who want a different config can instead do:
> >
> > cfg = rte_cfgfile_new();
> > rte_cfgfile_add_section(cfg, "dpdk");
> > foreach_eal_setting_wanted:
> > rte_cfgfile_set(cfg, "dpdk", mysetting, myvalue);
> > rte_eal_newinit(cfg);
> >
> From chatting to a couple of other DPDK devs here I suspect I may not have
> been entirely clear here with this example. What is being shown above is building
> up a "config-file" in memory - or rather a config structure which happens to
> have the idea of sections and values as an ini file has. There is no actual
> file ever being written to disk, and for those using any non-ini config file
> structure for their app, the code overhead of using the APIs above should be
> pretty much the same as building up any other set of key-value pairs in
> memory to pass to an init function.
>
> Hope this is a little clearer now.
>
I'm fine with the idea of reusing the config file library that currently exists,
or more to the point, modifying it to be usable as a configuration API, rather
than a configuration file parser. My primary interest is in separating the user
configuration mechanism from the internal library configuration lookup
mechanism. What I would really like to be able to see is application developers
have the flexibility to choose their own configuration method and format, and
programmatically build a configuration for the DPDK on a per-instance basis prior
to calling rte_eal_init
It seems like this approach satisfies that requirement
Neil
> /Bruce
>
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 11:50 0% ` Neil Horman
@ 2016-06-03 12:01 0% ` Arnon Warshavsky
2016-06-03 12:53 0% ` Panu Matilainen
2016-06-03 12:14 0% ` Panu Matilainen
1 sibling, 1 reply; 200+ results
From: Arnon Warshavsky @ 2016-06-03 12:01 UTC (permalink / raw)
To: Neil Horman
Cc: Bruce Richardson, Wiles, Keith, Thomas Monjalon, Yuanhan Liu,
dev, Tan, Jianfeng, Stephen Hemminger, Christian Ehrhardt,
Panu Matilainen, Olivier Matz
On Fri, Jun 3, 2016 at 2:50 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
> On Fri, Jun 03, 2016 at 12:01:30PM +0100, Bruce Richardson wrote:
> > On Fri, Jun 03, 2016 at 11:29:43AM +0100, Bruce Richardson wrote:
> > > On Thu, Jun 02, 2016 at 04:08:37PM -0400, Neil Horman wrote:
> > > > On Thu, Jun 02, 2016 at 07:41:10PM +0000, Wiles, Keith wrote:
> > > > >
> > > > > On 6/2/16, 12:11 PM, "Neil Horman" <nhorman@tuxdriver.com> wrote:
> > > > >
> > > > > >
> > > > > >1) The definition of a config structure that can be passed to
> rte_eal_init,
> > > > > >defining the configuration for that running process
> > > > >
> > > > > Having a configuration structure means we have to have an ABI
> change to that structure anytime we add or remove an option. I was thinking
> a very simple DB of some kind would be better. Have the code query the DB
> to obtain the needed information. The APIs used to query and set the DB
> need to be very easy to use as well.
> > > >
> > > > That's a fair point. A decent starting point is likely a simple
> struct that
> > > > looks like this:
> > > >
> > > > struct key_vals {
> > > > char *key;
> > > > union {
> > > > ulong longval;
> > > > void *ptrval;
> > > > } value;
> > > > };
> > > >
> > > > struct config {
> > > > size_t count;
> > > > struct key_vals kvp[0];
> > > > };
> > > >
> > > > >
> > > > > Maybe each option can define its own structure if needed or just a
> simple variable type can be used for the basic types (int, string, bool, …)
> > > > >
> > > > Well, if you have config sections that require multiple
> I'd handle
> > > > that with naming, i.e. if you have a config group that has an int
> and char
> > > > value, I'd name them "group.intval", and "group.charval", so they are
> > > > independently searchable, but linked from a nomenclature standpoint.
> > > >
> > > > > Would this work better in the long run, does a fixed structure
> still make sense?
> > > > >
> > > > No. I think your ABI concerns are valid, but the above is likely a
> good
> > > > starting point to address them.
> > > >
> > > > Best
> > > > Neil
> > >
> > > I'll throw out one implementation idea here that I looked at
> previously, for
> > > the reason that it was simple enough to implement with existing code.
> > >
> > > We already have the cfgfile library which works with name/value pairs
> read from
> > > ini files on disk. However, it would be easy enough to add couple of
> APIs to
> > > that to allow the user to "set" values inside an ini structure as
> well. With
> > > that done we can then just add a new eal_init api which takes a single
> > > "struct rte_cfgfile *" as parameter. For those apps that want to just
> use
> > > ini files for configuration directly, they can then do:
> > >
> > > cfg = rte_cfgfile_load("my_cfg_file");
> > > rte_eal_newinit(cfg);
> > >
> > > Those who want a different config can instead do:
> > >
> > > cfg = rte_cfgfile_new();
> > > rte_cfgfile_add_section(cfg, "dpdk");
> > > foreach_eal_setting_wanted:
> > > rte_cfgfile_set(cfg, "dpdk", mysetting, myvalue);
> > > rte_eal_newinit(cfg);
> > >
> > From chatting to a couple of other DPDK devs here I suspect I may
> have
> > been entirely clear here with this example. What is being shown above is
> building
> > up a "config-file" in memory - or rather a config structure which
> happens to
> > have the idea of sections and values as an ini file has. There is no
> actual
> > file ever being written to disk, and for those using any non-ini config
> file
> > structure for their app, the code overhead of using the APIs above
> should be
> > pretty much the same as building up any other set of key-value pairs in
> > memory to pass to an init function.
> >
> > Hope this is a little clearer now.
> >
> I'm fine with the idea of reusing the config file library that currently
> exists,
> or more to the point, modifying it to be usable as a configuration API,
> rather
> than a configuration file parser. My primary interest is in separating
> the user
> configuration mechanism from the internal library configuration lookup
> mechanism. What I would really like to be able to see is application
> developers
> have the flexibility to choose their own configuration method and format,
> and programmatically build a configuration for the DPDK on a per-instance
> basis prior to calling rte_eal_init
>
> It seems like this approach satisfies that requirement
> Neil
>
>
If there is no configuration structure, but rather an opaque configuration
key/value store, the user applications can set and get knobs that are not
seen by any library that does not know them by name, e.g.:
int num_nodes = getCfgInt(cfgObject, "eal", "num_numa_nodes");
int delay = getCfgInt(cfgObject, "drivers.ixgbe", "some_delay");
setCfgInt(cfgObject, "my_app", "num_days", 7);
setCfgString(cfgObject, "my_app", "best_day", "Wednesday");
/Arnon
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 11:50 0% ` Neil Horman
2016-06-03 12:01 0% ` Arnon Warshavsky
@ 2016-06-03 12:14 0% ` Panu Matilainen
1 sibling, 0 replies; 200+ results
From: Panu Matilainen @ 2016-06-03 12:14 UTC (permalink / raw)
To: Neil Horman, Bruce Richardson
Cc: Wiles, Keith, Thomas Monjalon, Yuanhan Liu, dev, Tan, Jianfeng,
Stephen Hemminger, Christian Ehrhardt, Olivier Matz
On 06/03/2016 02:50 PM, Neil Horman wrote:
> On Fri, Jun 03, 2016 at 12:01:30PM +0100, Bruce Richardson wrote:
>> On Fri, Jun 03, 2016 at 11:29:43AM +0100, Bruce Richardson wrote:
>>> On Thu, Jun 02, 2016 at 04:08:37PM -0400, Neil Horman wrote:
>>>> On Thu, Jun 02, 2016 at 07:41:10PM +0000, Wiles, Keith wrote:
>>>>>
>>>>> On 6/2/16, 12:11 PM, "Neil Horman" <nhorman@tuxdriver.com> wrote:
>>>>>
>>>>>>
>>>>>> 1) The definition of a config structure that can be passed to rte_eal_init,
>>>>>> defining the configuration for that running process
>>>>>
>>>>> Having a configuration structure means we have to have an ABI change to that structure anytime we add or remove an option. I was thinking a very simple DB of some kind would be better. Have the code query the DB to obtain the needed information. The APIs used to query and set the DB need to be very easy to use as well.
>>>>
>>>> That's a fair point. A decent starting point is likely a simple struct that
>>>> looks like this:
>>>>
>>>> struct key_vals {
>>>> char *key;
>>>> union {
>>>> ulong longval;
>>>> void *ptrval;
>>>> } value;
>>>> };
>>>>
>>>> struct config {
>>>> size_t count;
>>>> struct key_vals kvp[0];
>>>> };
>>>>
>>>>>
>>>>> Maybe each option can define its own structure if needed or just a simple variable type can be used for the basic types (int, string, bool, …)
>>>>>
>>>> Well, if you have config sections that require multiple elements, I'd handle
>>>> that with naming, i.e. if you have a config group that has an int and char
>>>> value, I'd name them "group.intval", and "group.charval", so they are
>>>> independently searchable, but linked from a nomenclature standpoint.
>>>>
>>>>> Would this work better in the long run, does a fixed structure still make sense?
>>>>>
>>>> No. I think your ABI concerns are valid, but the above is likely a good
>>>> starting point to address them.
>>>>
>>>> Best
>>>> Neil
>>>
>>> I'll throw out one implementation idea here that I looked at previously, for
>>> the reason that it was simple enough to implement with existing code.
>>>
>>> We already have the cfgfile library which works with name/value pairs read from
>>> ini files on disk. However, it would be easy enough to add a couple of APIs to
>>> that to allow the user to "set" values inside an ini structure as well. With
>>> that done we can then just add a new eal_init api which takes a single
>>> "struct rte_cfgfile *" as parameter. For those apps that want to just use
>>> ini files for configuration directly, they can then do:
>>>
>>> cfg = rte_cfgfile_load("my_cfg_file");
>>> rte_eal_newinit(cfg);
>>>
>>> Those who want a different config can instead do:
>>>
>>> cfg = rte_cfgfile_new();
>>> rte_cfgfile_add_section(cfg, "dpdk");
>>> foreach_eal_setting_wanted:
>>> rte_cfgfile_set(cfg, "dpdk", mysetting, myvalue);
>>> rte_eal_newinit(cfg);
>>>
>> From chatting to a couple of other DPDK devs here I suspect I may not have
>> been entirely clear here with this example. What is being shown above is building
>> up a "config-file" in memory - or rather a config structure which happens to
>> have the idea of sections and values as an ini file has. There is no actual
>> file ever being written to disk, and for those using any non-ini config file
>> structure for their app, the code overhead of using the APIs above should be
>> pretty much the same as building up any other set of key-value pairs in
>> memory to pass to an init function.
/me nods.
This is pretty much exactly what I suggested (only in much less detail)
last year :) http://dpdk.org/ml/archives/dev/2015-October/024803.html
>> Hope this is a little clearer now.
> I'm fine with the idea of reusing the config file library that currently exists,
> or more to the point, modifying it to be usable as a configuration API, rather
> than a configuration file parser. My primary interest is in separating the user
> configuration mechanism from the internal library configuration lookup
> mechanism. What I would really like to be able to see is application developers
> have the flexibility to choose their own configuration method and format, and
> programmatically build a configuration for the DPDK on a per-instance basis prior
> to calling rte_eal_init
>
> It seems like this approach satisfies that requirement
/me nods some more.
What the key-value config can also buy us is a direct mapping to CLI
options (which is something Keith has been looking into IIRC), at which
point I think all the bases are quite nicely covered.
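As an illustration only, one hypothetical shape for that mapping would be a
generic option such as --cfg section:key=value feeding straight into the
store, with rte_cfgfile_set() being the proposed setter from earlier in the
thread:
    /* e.g. "--cfg dpdk:master_lcore=1"; needs <string.h> */
    static int
    parse_cfg_opt(struct rte_cfgfile *cfg, char *arg)
    {
        char *key = strchr(arg, ':');
        char *val = strchr(arg, '=');

        if (key == NULL || val == NULL)
            return -1;
        *key++ = '\0';   /* arg now holds the section name */
        *val++ = '\0';   /* key now holds the key name */
        return rte_cfgfile_set(cfg, arg, key, val);
    }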
- Panu -
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 12:01 0% ` Arnon Warshavsky
@ 2016-06-03 12:53 0% ` Panu Matilainen
2016-06-03 14:31 0% ` Arnon Warshavsky
0 siblings, 1 reply; 200+ results
From: Panu Matilainen @ 2016-06-03 12:53 UTC (permalink / raw)
To: Arnon Warshavsky, Neil Horman
Cc: Bruce Richardson, Wiles, Keith, Thomas Monjalon, Yuanhan Liu,
dev, Tan, Jianfeng, Stephen Hemminger, Christian Ehrhardt,
Olivier Matz
On 06/03/2016 03:01 PM, Arnon Warshavsky wrote:
>
>
> On Fri, Jun 3, 2016 at 2:50 PM, Neil Horman <nhorman@tuxdriver.com
> <mailto:nhorman@tuxdriver.com>> wrote:
>
> On Fri, Jun 03, 2016 at 12:01:30PM +0100, Bruce Richardson wrote:
> > On Fri, Jun 03, 2016 at 11:29:43AM +0100, Bruce Richardson wrote:
> > > On Thu, Jun 02, 2016 at 04:08:37PM -0400, Neil Horman wrote:
> > > > On Thu, Jun 02, 2016 at 07:41:10PM +0000, Wiles, Keith wrote:
> > > > >
> > > > > On 6/2/16, 12:11 PM, "Neil Horman" <nhorman@tuxdriver.com
> <mailto:nhorman@tuxdriver.com>> wrote:
> > > > >
> > > > > >
> > > > > >1) The definition of a config structure that can be passed
> to rte_eal_init,
> > > > > >defining the configuration for that running process
> > > > >
> > > > > Having a configuration structure means we have to have an
> ABI change to that structure anytime we add or remove an option. I
> was thinking a very simple DB of some kind would be better. Have the
> code query the DB to obtain the needed information. The APIs used to
> query and set the DB need to be very easy to use as well.
> > > >
> > > > That's a fair point. A decent starting point is likely a
> simple struct that
> > > > looks like this:
> > > >
> > > > struct key_vals {
> > > > char *key;
> > > > union {
> > > > ulong longval;
> > > > void *ptrval;
> > > > } value;
> > > > };
> > > >
> > > > struct config {
> > > > size_t count;
> > > > struct key_vals kvp[0];
> > > > };
> > > >
> > > > >
> > > > > Maybe each option can define its own structure if needed or
> just a simple variable type can be used for the basic types (int,
> string, bool, …)
> > > > >
> > > > Well, if you have config sections that require multiple
> elements, I'd handle
> > > > that with naming, i.e. if you have a config group that has an
> int and char
> > > > value, I'd name them "group.intval", and "group.charval", so
> they are
> > > > independently searchable, but linked from a nomenclature
> standpoint.
> > > >
> > > > > Would this work better in the long run, does a fixed
> structure still make sense?
> > > > >
> > > > No. I think your ABI concerns are valid, but the above is
> likely a good
> > > > starting point to address them.
> > > >
> > > > Best
> > > > Neil
> > >
> > > I'll throw out one implementation idea here that I looked at
> previously, for
> > > the reason that it was simple enough to implement with existing code.
> > >
> > > We already have the cfgfile library which works with name/value
> pairs read from
> > > ini files on disk. However, it would be easy enough to add
> couple of APIs to
> > > that to allow the user to "set" values inside an ini structure
> as well. With
> > > that done we can then just add a new eal_init api which takes a
> single
> > > "struct rte_cfgfile *" as parameter. For those apps that want to
> just use
> > > ini files for configuration directly, they can then do:
> > >
> > > cfg = rte_cfgfile_load("my_cfg_file");
> > > rte_eal_newinit(cfg);
> > >
> > > Those who want a different config can instead do:
> > >
> > > cfg = rte_cfgfile_new();
> > > rte_cfgfile_add_section(cfg, "dpdk");
> > > foreach_eal_setting_wanted:
> > > rte_cfgfile_set(cfg, "dpdk", mysetting, myvalue);
> > > rte_eal_newinit(cfg);
> > >
> > From chatting to a couple of other DPDK devs here I suspect I may
> not have
> > been entirely clear here with this example. What is being shown
> above is building
> > up a "config-file" in memory - or rather a config structure which
> happens to
> > have the idea of sections and values as an ini file has. There is
> no actual
> > file ever being written to disk, and for those using any non-ini
> config file
> > structure for their app, the code overhead of using the APIs above
> should be
> > pretty much the same as building up any other set of key-value
> pairs in
> > memory to pass to an init function.
> >
> > Hope this is a little clearer now.
> >
> I'm fine with the idea of reusing the config file library that
> currently exists,
> or more to the point, modifying it to be usable as a configuration
> API, rather
> than a configuration file parser. My primary interest is in
> separating the user
> configuration mechanism from the internal library configuration lookup
> mechanism. What I would really like to be able to see is
> application developers
> have the flexibility to choose their own configuration method and format,
> and programmatically build a configuration for the DPDK on a per-instance
> basis prior to calling rte_eal_init
>
> It seems like this approach satisfies that requirement
> Neil
>
>
> If there is no configuration structure, but rather an opaque
> configuration key/value store, the user applications can set and get
> knobs that are not seen by any library that does not know them by name,
>
> e.g.:
>
> int num_nodes = getCfgInt(cfgObject, "eal", "num_numa_nodes");
> int delay = getCfgInt(cfgObject, "drivers.ixgbe", "some_delay");
> setCfgInt(cfgObject, "my_app", "num_days", 7);
> setCfgString(cfgObject, "my_app", "best_day", "Wednesday");
I don't see why it would not be possible to have the libraries export
their known config keys in one way or the other. Or more.
One aspect is runtime queries which would need an API of some kind.
Being able to query default values should work for that purpose and be
handy for various other uses as well.
Another one is build-time sanity checking, which could be done by
auto-generating header(s) from the library's known keys, e.g.
#define CFG_NUM_NUMA_NODES "num_numa_nodes"
so if you use the macro instead of the actual string, you'll get a
compiler error in case of an unknown key instead of runtime misbehavior in
case of mistyped values etc. Whether that's worth it is an entirely
different question.
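A tiny sketch of what that buys, reusing the getCfgInt() pseudo-API from
earlier in the thread and assuming the auto-generated header exists:
    #define CFG_NUM_NUMA_NODES "num_numa_nodes"  /* assumed auto-generated */

    int a = getCfgInt(cfgObject, "eal", CFG_NUM_NUMA_NODES); /* ok */
    int b = getCfgInt(cfgObject, "eal", CFG_NUM_NUMA_NODE);  /* typo in macro name: compile error */
    int c = getCfgInt(cfgObject, "eal", "num_nuna_nodes");   /* typo in bare string: fails only at runtime */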
- Panu -
> /Arnon
>
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 12:53 0% ` Panu Matilainen
@ 2016-06-03 14:31 0% ` Arnon Warshavsky
0 siblings, 1 reply; 200+ results
From: Arnon Warshavsky @ 2016-06-03 14:31 UTC (permalink / raw)
To: Panu Matilainen
Cc: Neil Horman, Bruce Richardson, Wiles, Keith, Thomas Monjalon,
Yuanhan Liu, dev, Tan, Jianfeng, Stephen Hemminger,
Christian Ehrhardt, Olivier Matz
On Fri, Jun 3, 2016 at 3:53 PM, Panu Matilainen <pmatilai@redhat.com> wrote:
> On 06/03/2016 03:01 PM, Arnon Warshavsky wrote:
>
>>
>>
>> On Fri, Jun 3, 2016 at 2:50 PM, Neil Horman <nhorman@tuxdriver.com
>> <mailto:nhorman@tuxdriver.com>> wrote:
>>
>> On Fri, Jun 03, 2016 at 12:01:30PM +0100, Bruce Richardson wrote:
>> > On Fri, Jun 03, 2016 at 11:29:43AM +0100, Bruce Richardson wrote:
>> > > On Thu, Jun 02, 2016 at 04:08:37PM -0400, Neil Horman wrote:
>> > > > On Thu, Jun 02, 2016 at 07:41:10PM +0000, Wiles, Keith wrote:
>> > > > >
>> > > > > On 6/2/16, 12:11 PM, "Neil Horman" <nhorman@tuxdriver.com
>> <mailto:nhorman@tuxdriver.com>> wrote:
>> > > > >
>> > > > > >
>> > > > > >1) The definition of a config structure that can be passed
>> to rte_eal_init,
>> > > > > >defining the configuration for that running process
>> > > > >
>> > > > > Having a configuration structure means we have to have an
>> ABI change to that structure anytime we add or remove an option. I
>> was thinking a very simple DB of some kind would be better. Have the
>> code query the DB to obtain the needed information. The APIs used to
>> query and set the DB need to be very easy to use as well.
>> > > >
>> > > > That's a fair point. A decent starting point is likely a
>> simple struct that
>> > > > looks like this:
>> > > >
>> > > > struct key_vals {
>> > > > char *key;
>> > > > union {
>> > > > ulong longval;
>> > > > void *ptrval;
>> > > > } value;
>> > > > };
>> > > >
>> > > > struct config {
>> > > > size_t count;
>> > > > struct key_vals kvp[0];
>> > > > };
>> > > >
>> > > > >
>> > > > > Maybe each option can define its own structure if needed or
>> just a simple variable type can be used for the basic types (int,
>> string, bool, …)
>> > > > >
>> > > > Well, if you have config sections that require multiple
>> elements, I'd handle
>> > > > that with naming, i.e. if you have a config group that has an
>> int and char
>> > > > value, I'd name them "group.intval", and "group.charval", so
>> they are
>> > > > independently searchable, but linked from a nomenclature
>> standpoint.
>> > > >
>> > > > > Would this work better in the long run, does a fixed
>> structure still make sense?
>> > > > >
>> > > > No. I think your ABI concerns are valid, but the above is
>> likely a good
>> > > > starting point to address them.
>> > > >
>> > > > Best
>> > > > Neil
>> > >
>> > > I'll throw out one implementation idea here that I looked at
>> previously, for
>> > > the reason that it was simple enough to implement with existing code.
>> > >
>> > > We already have the cfgfile library which works with name/value
>> pairs read from
>> > > ini files on disk. However, it would be easy enough to add
>> couple of APIs to
>> > > that to allow the user to "set" values inside an ini structure
>> as well. With
>> > > that done we can then just add a new eal_init api which takes a
>> single
>> > > "struct rte_cfgfile *" as parameter. For those apps that want to
>> just use
>> > > ini files for configuration directly, they can then do:
>> > >
>> > > cfg = rte_cfgfile_load("my_cfg_file");
>> > > rte_eal_newinit(cfg);
>> > >
>> > > Those who want a different config can instead do:
>> > >
>> > > cfg = rte_cfgfile_new();
>> > > rte_cfgfile_add_section(cfg, "dpdk");
>> > > foreach_eal_setting_wanted:
>> > > rte_cfgfile_set(cfg, "dpdk", mysetting, myvalue);
>> > > rte_eal_newinit(cfg);
>> > >
>> > From chatting to a couple of other DPDK devs here I suspect I may
>> not have
>> > been entirely clear here with this example. What is being shown
>> above is building
>> > up a "config-file" in memory - or rather a config structure which
>> happens to
>> > have the idea of sections and values as an ini file has. There is
>> no actual
>> > file ever being written to disk, and for those using any non-ini
>> config file
>> > structure for their app, the code overhead of using the APIs above
>> should be
>> > pretty much the same as building up any other set of key-value
>> pairs in
>> > memory to pass to an init function.
>> >
>> > Hope this is a little clearer now.
>> >
>> I'm fine with the idea of reusing the config file library that
>> currently exists,
>> or more to the point, modifying it to be usable as a configuration
>> API, rather
>> than a configuration file parser. My primary interest is in
>> separating the user
>> configuration mechanism from the internal library configuration lookup
>> mechanism. What I would really like to be able to see is
>> application developers
>> have the flexibility to choose their own configuration method and format,
>> and programmatically build a configuration for the DPDK on a per-instance
>> basis prior to calling rte_eal_init
>>
>> It seems like this approach satisfies that requirement
>> Neil
>>
>>
>> If there is no configuration structure, but rather an opaque
>> configuration key/value store, the user applications can set and get
>> knobs that are not seen by any library that does not know them by name,
>>
>> e.g.:
>>
>> int num_nodes = getCfgInt(cfgObject, "eal", "num_numa_nodes");
>> int delay = getCfgInt(cfgObject, "drivers.ixgbe", "some_delay");
>> setCfgInt(cfgObject, "my_app", "num_days", 7);
>> setCfgString(cfgObject, "my_app", "best_day", "Wednesday");
>>
>
> I don't see why it would not be possible to have the libraries export their
> known config keys in one way or the other. Or more.
>
> One aspect is runtime queries which would need an API of some kind. Being
> able to query default values should work for that purpose and be handy for
> various other uses as well.
>
> Another one is build-time sanity checking, which could be done by
> auto-generating header(s) from the library's known keys, e.g.
>
> #define CFG_NUM_NUMA_NODES "num_numa_nodes"
>
> so if you use the macro instead of the actual string, you'll get a
> compiler error in case of an unknown key instead of runtime misbehavior in
> case of mistyped values etc. Whether that's worth it is an entirely different
> question.
>
> - Panu -
Thanks Panu.
I was not clear here.
Naturally, libraries are better off accessed using well-known macro keys.
It is the other way around that does not require the library to know the keys
of the applications.
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v8 0/5] mempool: add external mempool manager
2016-06-02 13:27 2% ` [dpdk-dev] [PATCH v7 " David Hunt
@ 2016-06-03 14:58 2% ` David Hunt
2016-06-10 15:16 2% ` [dpdk-dev] [PATCH v9 0/3] " David Hunt
1 sibling, 1 reply; 200+ results
From: David Hunt @ 2016-06-03 14:58 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob
Here's the latest version of the External Mempool Manager patchset.
It's re-based on top of the latest head as of 19/5/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v8 changes:
* merged first three patches in the series into one.
* changed parameters to the ops callbacks to all be an rte_mempool pointer
rather than a pointer to opaque data or uint64.
* comment fixes.
* fixed parameter to _free function (was inconsistent).
* changed MEMPOOL_F_RING_CREATED to MEMPOOL_F_POOL_CREATED
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed handler_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_handler.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurrences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments, duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies having one header file (rte_mempool.h), or it
would generate cross-dependency issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file as the standard mempool tests,
avoiding duplicating the code
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degradation on sp cached operation
* removed stack handler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The External Mempool Manager is an extension to the mempool API that allows
users to add and use an external mempool manager, which allows external memory
subsystems such as external hardware memory management systems and
software-based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool manager will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing. These changes are all contained within
RTE_NEXT_ABI defs, and the current or next code can be selected with
the CONFIG_RTE_NEXT_ABI config setting.
There are two aspects to external mempool manager.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the REGISTER_MEMPOOL_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool
using the name parameter to identify which ops to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname() which sets the mempool's ops (functions)
3. The rte_mempool_populate_default() and rte_mempool_populate_anon() functions,
which populate the mempool using the relevant ops
Several external mempool managers may be used in the same application. A new
mempool can then be created by using the new 'create' function, providing the
mempool ops struct name to point the mempool to the relevant mempool manager
callback structure.
The old 'create' function can still be called by legacy programs, and will
internally work out the mempool handle based on the flags provided (single
producer, single consumer, etc). By default handles are created internally to
implement the built-in DPDK mempool manager and mempool types.
The external mempool manager needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. put - puts an object back into the mempool once an application has
finished with it
3. get - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time a get/put/get_count is called from the application/PMD, the
callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the ops index.
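Putting the new calls together, creating a mempool with explicitly chosen
ops looks roughly like the sketch below (sizes and error handling are
illustrative only; the ops name matches the example that follows):
    struct rte_mempool *mp;

    mp = rte_mempool_create_empty("my_pool", 8192, 2048,
                                  256, 0, SOCKET_ID_ANY, 0);
    if (mp == NULL)
        rte_exit(EXIT_FAILURE, "cannot create empty mempool\n");
    if (rte_mempool_set_ops_byname(mp, "ring_sp_mc") != 0)
        rte_exit(EXIT_FAILURE, "cannot set mempool ops\n");
    if (rte_mempool_populate_default(mp) < 0)
        rte_exit(EXIT_FAILURE, "cannot populate mempool\n");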
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = rte_mempool_common_ring_alloc,
.put = common_ring_sp_put,
.get = common_ring_mc_get,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures
REGISTER_MEMPOOL_OPS(ops_sp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool manager using simple mallocs
for each mempool object. This file also contains the callbacks and
self-registration for the new handler.
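For a flavour of what the fastpath callbacks involve, a very rough sketch of
a malloc-backed get/put pair is below - illustrative only, as it drops real
pool semantics; see the test file above for the actual code:
    static int
    custom_get(struct rte_mempool *mp, void **obj_table, unsigned n)
    {
        unsigned i;

        for (i = 0; i < n; i++) {
            obj_table[i] = malloc(mp->elt_size);
            if (obj_table[i] == NULL)
                return -ENOMEM;  /* sketch: leaks the partial batch */
        }
        return 0;
    }

    static int
    custom_put(struct rte_mempool *mp, void * const *obj_table, unsigned n)
    {
        unsigned i;

        for (i = 0; i < n; i++)
            free(obj_table[i]);
        return 0;
    }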
David Hunt (2):
mempool: support external mempool operations
mbuf: make default mempool ops configurable at build
Olivier Matz (1):
app/test: test external mempool manager
^ permalink raw reply [relevance 2%]
* [dpdk-dev] RFC: DPDK Long Term Support
@ 2016-06-03 15:07 4% Mcnamara, John
2016-06-03 16:05 0% ` Thomas Monjalon
` (3 more replies)
0 siblings, 4 replies; 200+ results
From: Mcnamara, John @ 2016-06-03 15:07 UTC (permalink / raw)
To: dev; +Cc: Christian Ehrhardt, Markos Chandras, Panu Matilainen
Introduction
------------
This document sets out a proposal for a DPDK Long Term Support release (LTS).
The purpose of the DPDK LTS will be to maintain a stable release of DPDK with
backported bug fixes over an extended period of time. This will provide
downstream consumers of DPDK with a stable target on which to base
applications or packages.
As with previous DPDK guidelines this proposal is open for discussion within
the community. The consensus view will be included in the DPDK documentation
as a guideline.
LTS Maintainer
--------------
The proposed maintainer for the LTS is Yuanhan Liu
<yuanhan.liu@linux.intel.com>.
LTS Duration
------------
The proposed duration of the LTS support is 2 years.
There will only be one LTS branch being maintained at any time. At the end of
the 2 year cycle the maintenance on the previous LTS will be wound down.
LTS Version
------------
The proposed initial LTS version will be DPDK 16.07. The next versions, based
on a 2 year cycle, will be DPDK 18.08, 20.08, etc.
What changes should be backported
---------------------------------
* Bug fixes that don't break the ABI.
What changes should not be backported
-------------------------------------
* API or ABI breaking changes.
* Features should not be backported. Unless:
* There is a justifiable use case (for example a new PMD).
* The change is non-invasive.
* The work of preparing the backport is done by the proposer.
* There is support within the community.
Role of the maintainer
----------------------
* The maintainer will evaluate fixes to the DPDK master submitted by the
fixing developer and apply them to the LTS branch/tree.
* The maintainer will evaluate backported patches from downstream consumers
and apply them to the LTS branch/tree.
* The maintainer will not backport non-trivial fixes without assistance from
the downstream consumers or requester.
Role of the downstream consumers
--------------------------------
Developers submitting fixes to the mainline should also CC the maintainer so
that they can evaluate the patch. A <stable@dpdk.org> email address could be
provided for this so that it can be included as a CC in the commit messages
and documented in the Code Contribution Guidelines.
The downstream consumers (OSVs and DPDK dependent application and framework
developers) should identify issues in the field that have been fixed in the
mainline release and report them to the maintainer. They should, ideally,
assist with backporting any required fixes.
Testing
-------
Intel will provide validation engineers to test the LTS branch/tree. Tested
releases can be marked using a Git tag with an incremented revision number. For
example: 16.07.00_LTS -> 16.07.01_LTS. The testing cadence should be quarterly
but will be best effort only and dependent on available resources.
Validated OSes
--------------
In order to reduce the testing effort the number of OSes which will be
officially validated should be as small as possible. The proposal is that the
following long term OSes are used for validation:
(OSV reps please confirm.)
* Ubuntu 16.04 LTS
* RHEL 7.3
* SuSE 11 SP4 or 12
* FreeBSD 10.3
Fixes for newer OSes, kernels (and associated KNI fixes), and newer GCC/Clang
versions can be backported but the validation effort will be limited to the
above platforms.
Release Notes
-------------
The LTS release notes should be updated to include a section with backported
fixes. Patches for backporting should include additions to the release notes
like patches to the mainline branch.
LTS Review
----------
The LTS guidelines shall be reviewed after 1 year to adjust for any experiences
from LTS maintainership.
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
@ 2016-06-03 17:44 3% ` Neil Horman
2016-06-03 18:29 3% ` Wiles, Keith
0 siblings, 1 reply; 200+ results
From: Neil Horman @ 2016-06-03 17:44 UTC (permalink / raw)
To: Wiles, Keith
Cc: Arnon Warshavsky, Panu Matilainen, Richardson, Bruce,
Thomas Monjalon, Yuanhan Liu, dev, Tan, Jianfeng,
Stephen Hemminger, Christian Ehrhardt, Olivier Matz
On Fri, Jun 03, 2016 at 04:04:14PM +0000, Wiles, Keith wrote:
> Sorry, I deleted all of the text as it was getting a bit long.
>
> Here are my thoughts as of now, which is a combination of many suggestions I read from everyone’s emails. I hope this is not too hard to understand.
>
> - Break out the current command line options out of the DPDK common code and move into a new lib.
> - At this point I was thinking of keeping the rte_eal_init(args, argv) API and just have it pass the args/argv to the new lib to create the data storage.
> - Maybe move the rte_eal_init() API to the new lib or keep it in the common eal code. Do not want to go hog wild.
> - The rte_eal_init(args, argv) would then call to the new API rte_eal_initialize(void), which in turn queries the data storage. (still thinking here)
These three items seem to be the exact opposite of my suggestion. The point of
this change was to segregate the parsing of configuration away from the
initialization of DPDK using that configuration. By keeping rte_eal_init in such a
way that the command line is directly passed into it, you've not changed that
implicit binding to command line options.
I can understand if you want to keep rte_eal_init as is for ABI purposes, but
then you should create an rte_eal_init2(foo), where foo is some handle to an
in-memory parsed configuration, so that applications can perform that separation.
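To make that concrete, the application-side flow could look something like
this sketch - rte_eal_init2() is the proposal above, and rte_cfgfile_load()
is borrowed from the earlier cfgfile discussion (the path is made up):
    struct rte_cfgfile *cfg = rte_cfgfile_load("/etc/myapp/dpdk.ini");

    if (cfg == NULL)
        rte_exit(EXIT_FAILURE, "cannot load configuration\n");
    /* the app can tweak individual values here before init */
    if (rte_eal_init2(cfg) < 0)
        rte_exit(EXIT_FAILURE, "EAL initialization failed\n");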
Neil
> - The example apps args need to be passed to the examples as is for now, then we can convert them one at a time if needed.
>
> - I would like to keep the storage of the data separate from the file parser as they can use the ‘set’ routines to build the data storage up.
> - Keeping them split allows for new parsers to be created, while keeping the data storage from changing.
> - The rte_cfg code could be modified to use the new configuration if someone wants to take on that task ☺
>
> - Next is the data storage and how we can access the data in a clean simple way.
> - I want to have some simple level of hierarchy in the data.
> - Having a string containing at least two levels “primary:secondary”.
> - Primary string is something like “EAL” or “Pktgen” or “testpmd” to divide the data storage into logical major groups.
> - The primary allows us to have groups and then we can have common secondary strings in different groups if needed.
> - Secondary string can be whatever the developer of that group would like e.g. simple “EAL:foobar”, two levels “testpmd:foo.bar”
>
> - The secondary string is treated as a single string if it has a hierarchy or not, but referencing a single value in the data storage.
> - Key value pairs (KVP) or a hashmap data store.
> - The key here is the whole string “EAL:foobar” not just “foobar” secondary string.
> - If we want to have the two split I am ok with that as well meaning the API would be:
> rte_map_get(mapObj, “EAL”, “foo.bar”);
> rte_map_set(mapObj, “EAL”, “foo.bar”, value);
> - Have the primary as a different section in the data store, would allow for dumping that section maybe easier, not sure.
> - I am leaning toward
> - Not going to try splitting up the string or parse it as it is up to the developer to make it unique in the data store.
> - Use a code design to make the strings simple to use without having typos be a problem.
> - Not sure what the design is yet, but I do not want to have to concat two string or split strings in the code.
>
> This is as far as I have gotten and got tired of typing ☺
>
> I hope this will satisfy most everyone’s needs for now.
>
>
> Regards,
> Keith
>
>
>
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-03 15:07 4% [dpdk-dev] RFC: DPDK Long Term Support Mcnamara, John
2016-06-03 16:05 0% ` Thomas Monjalon
@ 2016-06-03 18:17 3% ` Matthew Hall
2016-06-07 12:53 3% ` Mcnamara, John
2016-06-05 18:15 5% ` Neil Horman
2016-06-07 12:36 3% ` Christian Ehrhardt
3 siblings, 1 reply; 200+ results
From: Matthew Hall @ 2016-06-03 18:17 UTC (permalink / raw)
To: Mcnamara, John; +Cc: dev, Christian Ehrhardt, Markos Chandras, Panu Matilainen
On Fri, Jun 03, 2016 at 03:07:49PM +0000, Mcnamara, John wrote:
> What changes should be backported
> ---------------------------------
>
> * Bug fixes that don't break the ABI.
>
>
> What changes should not be backported
> -------------------------------------
>
> * API or ABI breaking changes.
I think this part needs some adjusting.
It seems like there should be allowance for bug fixes where the original does
break ABI but it is possible to make a version that doesn't.
A lot of DPDK bug fixes I see would fall into this category and it isn't
discussed.
Matthew.
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 17:44 3% ` Neil Horman
@ 2016-06-03 18:29 3% ` Wiles, Keith
2016-06-03 18:38 0% ` Neil Horman
0 siblings, 1 reply; 200+ results
From: Wiles, Keith @ 2016-06-03 18:29 UTC (permalink / raw)
To: Neil Horman
Cc: Arnon Warshavsky, Panu Matilainen, Richardson, Bruce,
Thomas Monjalon, Yuanhan Liu, dev, Tan, Jianfeng,
Stephen Hemminger, Christian Ehrhardt, Olivier Matz
On 6/3/16, 12:44 PM, "Neil Horman" <nhorman@tuxdriver.com> wrote:
>On Fri, Jun 03, 2016 at 04:04:14PM +0000, Wiles, Keith wrote:
>> Sorry, I deleted all of the text as it was getting a bit long.
>>
>> Here are my thoughts as of now, which is a combination of many suggestions I read from everyone’s emails. I hope this is not too hard to understand.
>>
>> - Break out the current command line options out of the DPDK common code and move into a new lib.
>> - At this point I was thinking of keeping the rte_eal_init(args, argv) API and just have it pass the args/argv to the new lib to create the data storage.
>> - Maybe move the rte_eal_init() API to the new lib or keep it in the common eal code. Do not want to go hog wild.
>> - The rte_eal_init(args, argv) would then call to the new API rte_eal_initialize(void), which in turn queries the data storage. (still thinking here)
>These three items seem to be the exact opposite of my suggestion. The point of
>this change was to segregate the parsing of configuration away from the
>initialization of DPDK using that configuration. By keeping rte_eal_init in such a
>way that the command line is directly passed into it, you've not changed that
>implicit binding to command line options.
Neil,
You may be reading the above wrong or I wrote it wrong, which is a high possibility. I want to move the command line parsing out of DPDK and into a library, but I still believe I need to provide some backward compatibility for ABI and to reduce the learning curve. The current applications can still call the rte_eal_init(), which then calls the new lib parser for DPDK command line options and then calls rte_eal_initialize(), or move to the new API rte_eal_initialize() preceded by a new library call to parse the old command line args. At some point we can deprecate the rte_eal_init() if we think it is reasonable.
>
>I can understand if you want to keep rte_eal_init as is for ABI purposes, but
>then you should create an rte_eal_init2(foo), where foo is some handle to an
>in-memory parsed configuration, so that applications can perform that separation.
I think you describe what I had planned here. The rte_eal_initialize() routine is the new rte_eal_init2() API and the rte_eal_init() was only for backward compatibility, was my thinking. I figured the argument to rte_eal_initialize() would be something to be decided, but it will most likely be some type of pointer to the storage.
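A minimal sketch of that backward-compatible wrapper, assuming the new
parsing helper ends up looking something like rte_eal_parse_args() and the
storage handle something like rte_map_create() - both made-up names:
    int
    rte_eal_init(int argc, char **argv)
    {
        /* hypothetical: the new args-parsing lib fills the data storage,
         * then the new init API consumes it */
        struct rte_map *store = rte_map_create();

        if (store == NULL)
            return -1;
        if (rte_eal_parse_args(store, argc, argv) < 0)
            return -1;
        return rte_eal_initialize(store);
    }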
I hope that clears that up, but let me know.
++Keith
>
>Neil
>
> >> - The example apps args need to be passed to the examples as is for now, then we can convert them one at a time if needed.
>>
>> - I would like to keep the storage of the data separate from the file parser as they can use the ‘set’ routines to build the data storage up.
>> - Keeping them split allows for new parsers to be created, while keeping the data storage from changing.
>> - The rte_cfg code could be modified to use the new configuration if someone wants to take on that task ☺
>>
>> - Next is the data storage and how we can access the data in a clean simple way.
>> - I want to have some simple level of hierarchy in the data.
>> - Having a string containing at least two levels “primary:secondary”.
>> - Primary string is something like “EAL” or “Pktgen” or “testpmd” to divide the data storage into logical major groups.
>> - The primary allows us to have groups and then we can have common secondary strings in different groups if needed.
>> - Secondary string can be whatever the developer of that group would like e.g. simple “EAL:foobar”, two levels “testpmd:foo.bar”
>>
>> - The secondary string is treated as a single string if it has a hierarchy or not, but referencing a single value in the data storage.
>> - Key value pairs (KVP) or a hashmap data store.
>> - The key here is the whole string “EAL:foobar” not just “foobar” secondary string.
>> - If we want to have the two split I am ok with that as well meaning the API would be:
>> rte_map_get(mapObj, “EAL”, “foo.bar”);
>> rte_map_set(mapObj, “EAL”, “foo.bar”, value);
>> - Have the primary as a different section in the data store, would allow for dumping that section maybe easier, not sure.
>> - I am leaning toward
>> - Not going to try splitting up the string or parse it as it is up to the developer to make it unique in the data store.
>> - Use a code design to make the strings simple to use without having typos be a problem.
>> - Not sure what the design is yet, but I do not want to have to concat two string or split strings in the code.
>>
>> This is as far as I have gotten and got tired of typing ☺
>>
>> I hope this will satisfy most everyone’s needs for now.
>>
>>
>> Regards,
>> Keith
>>
>>
>>
>
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 18:29 3% ` Wiles, Keith
@ 2016-06-03 18:38 0% ` Neil Horman
2016-06-03 18:52 0% ` Arnon Warshavsky
0 siblings, 1 reply; 200+ results
From: Neil Horman @ 2016-06-03 18:38 UTC (permalink / raw)
To: Wiles, Keith
Cc: Arnon Warshavsky, Panu Matilainen, Richardson, Bruce,
Thomas Monjalon, Yuanhan Liu, dev, Tan, Jianfeng,
Stephen Hemminger, Christian Ehrhardt, Olivier Matz
On Fri, Jun 03, 2016 at 06:29:13PM +0000, Wiles, Keith wrote:
>
> On 6/3/16, 12:44 PM, "Neil Horman" <nhorman@tuxdriver.com> wrote:
>
> >On Fri, Jun 03, 2016 at 04:04:14PM +0000, Wiles, Keith wrote:
> >> Sorry, I deleted all of the text as it was getting a bit long.
> >>
> >> Here are my thoughts as of now, which is a combination of many suggestions I read from everyone’s emails. I hope this is not too hard to understand.
> >>
> >> - Break out the current command line options out of the DPDK common code and move into a new lib.
> >> - At this point I was thinking of keeping the rte_eal_init(args, argv) API and just have it pass the args/argv to the new lib to create the data storage.
> >> - Maybe move the rte_eal_init() API to the new lib or keep it in the common eal code. Do not want to go hog wild.
> >> - The rte_eal_init(args, argv) would then call to the new API rte_eal_initialize(void), which in turn queries the data storage. (still thinking here)
> >These three items seem to be the exact opposite of my suggestion. The point of
> >this change was to segregate the parsing of configuration away from the
> >initialization of DPDK using that configuration. By keeping rte_eal_init in such a
> >way that the command line is directly passed into it, you've not changed that
> >implicit binding to command line options.
>
> Neil,
>
> You may be reading the above wrong or I wrote it wrong, which is a high possibility. I want to move the command line parsing out of DPDK and into a library, but I still believe I need to provide some backward compatibility for ABI and to reduce the learning curve. The current applications can still call the rte_eal_init(), which then calls the new lib parser for DPDK command line options and then calls rte_eal_initialize(), or move to the new API rte_eal_initialize() preceded by a new library call to parse the old command line args. At some point we can deprecate the rte_eal_init() if we think it is reasonable.
>
> >
> >I can understand if you want to keep rte_eal_init as is for ABI purposes, but
> >then you should create an rte_eal_init2(foo), where foo is some handle to an
> >in-memory parsed configuration, so that applications can perform that separation.
>
> I think you describe what I had planned here. The rte_eal_initialize() routine is the new rte_eal_init2() API and the rte_eal_init() was only for backward compatibility, was my thinking. I figured the argument to rte_eal_initialize() would be something to be decided, but it will most likely be some type of pointer to the storage.
>
> I hope that clears that up, but let me know.
>
yes, that clarifies your thinking, and I agree with it. Thank you!
Neil
> ++Keith
>
> >
> >Neil
> >
> >> - The example apps args need to be passed to the examples as is for now, then we can convert them one at a time if needed.
> >>
> >> - I would like to keep the storage of the data separate from the file parser as they can use the ‘set’ routines to build the data storage up.
> >> - Keeping them split allows for new parsers to be created, while keeping the data storage from changing.
> >> - The rte_cfg code could be modified to use the new configuration if someone wants to take on that task ☺
> >>
> >> - Next is the data storage and how we can access the data in a clean simple way.
> >> - I want to have some simple level of hierarchy in the data.
> >> - Having a string containing at least two levels “primary:secondary”.
> >> - Primary string is something like “EAL” or “Pktgen” or “testpmd” to divide the data storage into logical major groups.
> >> - The primary allows us to have groups and then we can have common secondary strings in different groups if needed.
> >> - Secondary string can be whatever the developer of that group would like, e.g. simple “EAL:foobar” or two levels “testpmd:foo.bar”.
> >>
> >> - The secondary string is treated as a single string whether it has a hierarchy or not, and references a single value in the data storage.
> >> - Key value pairs (KVP) or a hashmap data store.
> >> - The key here is the whole string “EAL:foobar”, not just the “foobar” secondary string.
> >> - If we want to have the two split I am OK with that as well, meaning the API would be (a sketch follows this message):
> >> rte_map_get(mapObj, "EAL", "foo.bar");
> >> rte_map_set(mapObj, "EAL", "foo.bar", value);
> >> - Having the primary as a different section in the data store would allow for dumping that section maybe more easily, not sure.
> >> - I am leaning toward
> >> - Not going to try splitting up the string or parsing it, as it is up to the developer to make it unique in the data store.
> >> - Use a code design to make the strings simple to use without having typos be a problem.
> >> - Not sure what the design is yet, but I do not want to have to concat two strings or split strings in the code.
> >>
> >> This is as far as I have gotten and got tired of typing ☺
> >>
> >> I hope this will satisfy most everyone’s needs for now.
> >>
> >>
> >> Regards,
> >> Keith
> >>
> >>
> >>
> >
>
>
>
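A minimal C sketch of the "primary:secondary" keystore idea in the list above (struct rte_map and all function names here are illustrative assumptions, not an existing DPDK API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct rte_map_entry {
	char key[64];			/* full "primary:secondary" key */
	char value[64];
	struct rte_map_entry *next;
};

struct rte_map {
	struct rte_map_entry *head;	/* a real store would use a hash */
};

static struct rte_map *
rte_map_create(void)
{
	return calloc(1, sizeof(struct rte_map));
}

/* Store value under "section:key"; uniqueness of the combined string
 * is left to the developer, as proposed above. Newer entries are
 * prepended, so they shadow older ones on lookup. */
static int
rte_map_set(struct rte_map *map, const char *section,
	    const char *key, const char *value)
{
	struct rte_map_entry *e = calloc(1, sizeof(*e));

	if (e == NULL)
		return -1;
	snprintf(e->key, sizeof(e->key), "%s:%s", section, key);
	snprintf(e->value, sizeof(e->value), "%s", value);
	e->next = map->head;
	map->head = e;
	return 0;
}

/* Look up "section:key"; return NULL when the key is absent. */
static const char *
rte_map_get(struct rte_map *map, const char *section, const char *key)
{
	char full[64];
	struct rte_map_entry *e;

	snprintf(full, sizeof(full), "%s:%s", section, key);
	for (e = map->head; e != NULL; e = e->next)
		if (strcmp(e->key, full) == 0)
			return e->value;
	return NULL;
}

Usage then matches the proposal: rte_map_set(map, "EAL", "foo.bar", "42"); followed by rte_map_get(map, "EAL", "foo.bar");.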
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 18:38 0% ` Neil Horman
@ 2016-06-03 18:52 0% ` Arnon Warshavsky
2016-06-03 19:00 0% ` Wiles, Keith
0 siblings, 1 reply; 200+ results
From: Arnon Warshavsky @ 2016-06-03 18:52 UTC (permalink / raw)
To: Neil Horman
Cc: Wiles, Keith, Panu Matilainen, Richardson, Bruce,
Thomas Monjalon, Yuanhan Liu, dev, Tan, Jianfeng,
Stephen Hemminger, Christian Ehrhardt, Olivier Matz
On Fri, Jun 3, 2016 at 9:38 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
> [...]
Keith
What about the data types of the values?
I would assume that as a library it can provide the service of typed
get/set and not leave conversion and validation to the app.
rte_map_get_int(map, section, key)
rte_map_get_double(...)
rte_map_get_string(...)
rte_map_get_bytes(..., destBuff, destBuffSize) // e.g. a byte array for the RSS key
This may also allow some basic validity checking of the configuration file.
Another point I forgot about is default values.
We sometimes use a notation where the app also specifies a default value in
case the configuration did not specify it:
rte_map_get_int(map, section, key, defaultValue)
and a variant for a mandatory key that has no default:
rte_map_get_int_crash_if_missing(map, section, key)
/Arnon
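A minimal sketch of such typed accessors in C, layered over the hypothetical rte_map_get() from the keystore sketch earlier (all names remain illustrative, not an existing DPDK API):

#include <stdlib.h>

/* Typed wrapper with a caller-supplied default, as proposed above. */
static int
rte_map_get_int(struct rte_map *map, const char *section,
		const char *key, int defaultValue)
{
	const char *val = rte_map_get(map, section, key);

	return val != NULL ? (int)strtol(val, NULL, 0) : defaultValue;
}

/* Mandatory variant: abort when a required key has no value. */
static int
rte_map_get_int_crash_if_missing(struct rte_map *map,
				 const char *section, const char *key)
{
	const char *val = rte_map_get(map, section, key);

	if (val == NULL)
		abort();
	return (int)strtol(val, NULL, 0);
}

Double, string and byte-array variants follow the same pattern; only the conversion step differs.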
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 18:52 0% ` Arnon Warshavsky
@ 2016-06-03 19:00 0% ` Wiles, Keith
2016-06-03 19:07 0% ` Wiles, Keith
0 siblings, 1 reply; 200+ results
From: Wiles, Keith @ 2016-06-03 19:00 UTC (permalink / raw)
To: Arnon Warshavsky, Neil Horman
Cc: Panu Matilainen, Richardson, Bruce, Thomas Monjalon, Yuanhan Liu,
dev, Tan, Jianfeng, Stephen Hemminger, Christian Ehrhardt,
Olivier Matz
On 6/3/16, 1:52 PM, "Arnon Warshavsky" <arnon@qwilt.com> wrote:
[...]
Arnon,
Yes, I too was thinking about access type APIs, but had not come to a full conclusion yet. As long as the API for get/put can return any value, we can add a layer on top of these primary get/put APIs to do some basic type checking. This way the developer can add his/her own type-checking APIs, or we can provide a couple of basic types for simple values.
Does that make sense?
++Keith
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 19:00 0% ` Wiles, Keith
@ 2016-06-03 19:07 0% ` Wiles, Keith
2016-06-03 19:18 0% ` Neil Horman
0 siblings, 1 reply; 200+ results
From: Wiles, Keith @ 2016-06-03 19:07 UTC (permalink / raw)
To: Arnon Warshavsky, Neil Horman
Cc: Panu Matilainen, Richardson, Bruce, Thomas Monjalon, Yuanhan Liu,
dev, Tan, Jianfeng, Stephen Hemminger, Christian Ehrhardt,
Olivier Matz
On 6/3/16, 2:00 PM, "dev on behalf of Wiles, Keith" <dev-bounces@dpdk.org on behalf of keith.wiles@intel.com> wrote:
>[...]
One more thing. I had not thought about default values, as the defaults are handled directly by the code when an option is not supplied. I think it should be left up to the developer to add default values to the storage or handle it when an option is not found in the storage.
If I understand your code above, the API would pass in a default value if one did not exist in the storage, which I guess is reasonable. Does anyone think this is a good idea or not?
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 19:07 0% ` Wiles, Keith
@ 2016-06-03 19:18 0% ` Neil Horman
2016-06-03 19:23 0% ` Wiles, Keith
0 siblings, 1 reply; 200+ results
From: Neil Horman @ 2016-06-03 19:18 UTC (permalink / raw)
To: Wiles, Keith
Cc: Arnon Warshavsky, Panu Matilainen, Richardson, Bruce,
Thomas Monjalon, Yuanhan Liu, dev, Tan, Jianfeng,
Stephen Hemminger, Christian Ehrhardt, Olivier Matz
On Fri, Jun 03, 2016 at 07:07:50PM +0000, Wiles, Keith wrote:
> [...]
I'm not opposed to default values, but it seems to me that if we are splitting
out a configuration storage library from DPDK, part of the initialization of that
library can be installing default values. That is to say, instead of having the
code-specific areas assume a default value if none is present in the config, an
init function for the configuration storage library would just populate the
keystore. That way all DPDK itself has to do is a key lookup.
Neil
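A rough sketch of that flow, reusing the illustrative keystore names from earlier (rte_cfg_init() and rte_cfg_parse_args() are assumed helpers here, not real APIs): defaults are seeded at init, parsing overwrites them, and DPDK code afterwards only ever does plain lookups.

/* Hypothetical init: install defaults first, then let command line
 * (or file) parsing overwrite them, so later lookups always succeed. */
struct rte_map *
rte_cfg_init(int argc, char **argv)
{
	struct rte_map *map = rte_map_create();

	if (map == NULL)
		return NULL;

	/* Built-in defaults, populated by the library itself. */
	rte_map_set(map, "EAL", "log.level", "7");
	rte_map_set(map, "EAL", "mem.channels", "4");

	/* User configuration shadows the defaults. */
	rte_cfg_parse_args(map, argc, argv);

	return map;
}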
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 19:18 0% ` Neil Horman
@ 2016-06-03 19:23 0% ` Wiles, Keith
2016-06-03 19:28 0% ` Arnon Warshavsky
0 siblings, 1 reply; 200+ results
From: Wiles, Keith @ 2016-06-03 19:23 UTC (permalink / raw)
To: Neil Horman
Cc: Arnon Warshavsky, Panu Matilainen, Richardson, Bruce,
Thomas Monjalon, Yuanhan Liu, dev, Tan, Jianfeng,
Stephen Hemminger, Christian Ehrhardt, Olivier Matz
On 6/3/16, 2:18 PM, "Neil Horman" <nhorman@tuxdriver.com> wrote:
>[...]
>I'm not opposed to default values, but it seems to me that if we are splitting
>out a configuration storage library from DPDK, part of the initialization of that
>library can be installing default values. That is to say, instead of having the
>code-specific areas assume a default value if none is present in the config, an
>init function for the configuration storage library would just populate the
>keystore. That way all DPDK itself has to do is a key lookup.
+1
If someone needs or wants default values in the API call, then wrapper functions around the basic keystore APIs can be written by the developer, or we can add a new set of APIs to provide that type of feature, just like the variable-type APIs. As long as the basic APIs do not exclude it, we can add it later.
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Yet another option for DPDK options
2016-06-03 19:23 0% ` Wiles, Keith
@ 2016-06-03 19:28 0% ` Arnon Warshavsky
0 siblings, 0 replies; 200+ results
From: Arnon Warshavsky @ 2016-06-03 19:28 UTC (permalink / raw)
To: Wiles, Keith
Cc: Neil Horman, Panu Matilainen, Richardson, Bruce, Thomas Monjalon,
Yuanhan Liu, dev, Tan, Jianfeng, Stephen Hemminger,
Christian Ehrhardt, Olivier Matz
On Fri, Jun 3, 2016 at 10:23 PM, Wiles, Keith <keith.wiles@intel.com> wrote:
> [...]
Yes.
I like to use the getValue(myAlternativeDefault) concept when I have
defaults of different granularity coming from different hierarchies,
but for DPDK as a single configuration, separating defaults into an init
phase indeed makes more sense, so +1 here too.
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-03 15:07 4% [dpdk-dev] RFC: DPDK Long Term Support Mcnamara, John
2016-06-03 16:05 0% ` Thomas Monjalon
2016-06-03 18:17 3% ` Matthew Hall
@ 2016-06-05 18:15 5% ` Neil Horman
2016-06-06 9:27 5% ` Thomas Monjalon
2016-06-07 15:55 5% ` Mcnamara, John
2016-06-07 12:36 3% ` Christian Ehrhardt
3 siblings, 2 replies; 200+ results
From: Neil Horman @ 2016-06-05 18:15 UTC (permalink / raw)
To: Mcnamara, John; +Cc: dev, Christian Ehrhardt, Markos Chandras, Panu Matilainen
On Fri, Jun 03, 2016 at 03:07:49PM +0000, Mcnamara, John wrote:
> Introduction
> ------------
>
> This document sets out a proposal for a DPDK Long Term Support release (LTS).
>
> The purpose of the DPDK LTS will be to maintain a stable release of DPDK with
> backported bug fixes over an extended period of time. This will provide
> downstream consumers of DPDK with a stable target on which to base
> applications or packages.
>
> As with previous DPDK guidelines this proposal is open for discussion within
> the community. The consensus view will be included in the DPDK documentation
> as a guideline.
>
>
> LTS Maintainer
> --------------
>
> The proposed maintainer for the LTS is Yuanhan Liu
> <yuanhan.liu@linux.intel.com>.
>
>
> LTS Duration
> ------------
>
> The proposed duration of the LTS support is 2 years.
>
> There will only be one LTS branch being maintained at any time. At the end of
> the 2 year cycle the maintenance on the previous LTS will be wound down.
>
>
> LTS Version
> ------------
>
> The proposed initial LTS version will be DPDK 16.07. The next versions, based
> on a 2 year cycle, will be DPDK 18.08, 20.08, etc.
>
>
> What changes should be backported
> ---------------------------------
>
> * Bug fixes that don't break the ABI.
>
>
> What changes should not be backported
> -------------------------------------
>
> * API or ABI breaking changes.
>
> * Features should not be backported. Unless:
>
> * There is a justifiable use case (for example a new PMD).
> * The change is non-invasive.
> * The work of preparing the backport is done by the proposer.
> * There is support within the community.
>
>
> Role of the maintainer
> ----------------------
>
> * The maintainer will evaluate fixes to the DPDK master submitted by the
> fixing developer and apply them to the LTS branch/tree.
>
> * The maintainer will evaluate backported patches from downstream consumers
> and apply them to the LTS branch/tree.
>
> * The maintainer will not backport non-trivial fixes without assistance from
> the downstream consumers or requester.
>
>
> Role of the downstream consumers
> --------------------------------
>
> Developers submitting fixes to the mainline should also CC the maintainer so
> that they can evaluate the patch. A <stable@dpdk.org> email address could be
> provided for this so that it can be included as a CC in the commit messages
> and documented in the Code Contribution Guidelines.
>
> The downstream consumers (OSVs and DPDK dependent application and framework
> developers) should identify issues in the field that have been fixed in the
> mainline release and report them to the maintainer. They should, ideally,
> assist with backporting any required fixes.
>
>
> Testing
> -------
>
> Intel will provide validation engineers to test the LTS branch/tree. Tested
> releases can be marked using a Git tag with an incremented revision number. For
> example: 16.07.00_LTS -> 16.07.01_LTS. The testing cadence should be quarterly
> but will be best effort only and dependent on available resources.
>
>
> Validated OSes
> --------------
>
> In order to reduce the testing effort the number of OSes which will be
> officially validated should be as small as possible. The proposal is that the
> following long term OSes are used for validation:
>
> (OSV reps please confirm.)
>
> * Ubuntu 16.04 LTS
> * RHEL 7.3
> * SuSE 11 SP4 or 12
> * FreeBSD 10.3
>
> Fixes for newer OSes, kernels (and associated KNI fixes), and newer GCC/Clang
> versions can be backported but the validation effort will be limited to the
> above platforms.
>
>
> Release Notes
> -------------
>
> The LTS release notes should be updated to include a section with backported
> fixes. Patches for backporting should include additions to the release notes
> like patches to the mainline branch.
>
>
> LTS Review
> ----------
>
> The LTS guidelines shall be reviewed after 1 year to adjust for any experiences
> from LTS maintainership.
>
>
>
>
>
>
>
I'm not opposed to an LTS release, but it seems to be re-solving the issue of
ABI breakage. That is to say, there is already a process in place for managing
ABI changes to DPDK, which is designed to help ensure that:
1) ABI changes are signaled at least 2 releases early
2) ABI changes whenever possible are designed such that backward-compatibility
versions can be encoded at the same time with versioning tags
Those two mechanisms are expressly intended to allow application upgrades of DPDK
libraries without worrying about ABI breakage. While LTS releases are a fine
approach for some things, they sacrifice upstream efficiency (by creating work
for backporting teams), while allowing upstream developers more leverage to just
create ABI-breaking changes on a whim, ignoring the existing ABI compatibility
mechanism.
LTS is a fine process for projects in which API/ABI breakage is either uncommon
or fairly isolated, but that in my mind doesn't really describe DPDK.
Neil
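The versioning tags Neil mentions are the symbol versioning macros in rte_compat.h. A rough sketch of carrying both an old and a new ABI for one function (rte_foo_init is a made-up example; the macro usage is simplified from the DPDK ABI versioning guide, and a real patch also needs matching entries in the library's version map):

#include <rte_compat.h>

/* Old ABI: kept so binaries linked against DPDK 2.0 keep working. */
int
rte_foo_init_v20(int bar)
{
	return bar;		/* legacy behaviour */
}
VERSION_SYMBOL(rte_foo_init, _v20, 2.0);

/* New ABI: bound as the default symbol for newly linked applications. */
int
rte_foo_init_v1607(int bar, int baz)
{
	return bar + baz;	/* new behaviour */
}
BIND_DEFAULT_SYMBOL(rte_foo_init, _v1607, 16.07);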
^ permalink raw reply [relevance 5%]
* [dpdk-dev] [PATCH 2/8] lib/librte_ether: define RX/TX lock mode
@ 2016-06-06 5:40 4% ` Wenzhuo Lu
2016-06-08 2:15 0% ` Stephen Hemminger
0 siblings, 1 reply; 200+ results
From: Wenzhuo Lu @ 2016-06-06 5:40 UTC (permalink / raw)
To: dev; +Cc: Wenzhuo Lu, Zhe Tao
Define a lock mode for the RX/TX queues, because when resetting
the device we want the resetting thread to take the lock
of each RX/TX queue to make sure RX/TX is stopped.
The next-ABI macro is used for this ABI change as it has too
much impact: 7 APIs and 1 global variable are affected.
Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
Signed-off-by: Zhe Tao <zhe.tao@intel.com>
---
lib/librte_ether/rte_ethdev.h | 62 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 62 insertions(+)
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 74e895f..4efb5e9 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -354,7 +354,12 @@ struct rte_eth_rxmode {
jumbo_frame : 1, /**< Jumbo Frame Receipt enable. */
hw_strip_crc : 1, /**< Enable CRC stripping by hardware. */
enable_scatter : 1, /**< Enable scatter packets rx handler */
+#ifndef RTE_NEXT_ABI
enable_lro : 1; /**< Enable LRO */
+#else
+ enable_lro : 1, /**< Enable LRO */
+ lock_mode : 1; /**< Using lock path */
+#endif
};
/**
@@ -634,11 +639,68 @@ struct rte_eth_txmode {
/**< If set, reject sending out tagged pkts */
hw_vlan_reject_untagged : 1,
/**< If set, reject sending out untagged pkts */
+#ifndef RTE_NEXT_ABI
hw_vlan_insert_pvid : 1;
/**< If set, enable port based VLAN insertion */
+#else
+ hw_vlan_insert_pvid : 1,
+ /**< If set, enable port based VLAN insertion */
+ lock_mode : 1;
+ /**< If set, using lock path */
+#endif
};
/**
+ * The macros for the RX/TX lock mode functions
+ */
+#ifdef RTE_NEXT_ABI
+#define RX_LOCK_FUNCTION(dev, func) \
+ (dev->data->dev_conf.rxmode.lock_mode ? \
+ func ## _lock : func)
+
+#define TX_LOCK_FUNCTION(dev, func) \
+ (dev->data->dev_conf.txmode.lock_mode ? \
+ func ## _lock : func)
+#else
+#define RX_LOCK_FUNCTION(dev, func) func
+
+#define TX_LOCK_FUNCTION(dev, func) func
+#endif
+
+/* Add the lock RX/TX function for VF reset */
+#define GENERATE_RX_LOCK(func, nic) \
+uint16_t func ## _lock(void *rx_queue, \
+ struct rte_mbuf **rx_pkts, \
+ uint16_t nb_pkts) \
+{ \
+ struct nic ## _rx_queue *rxq = rx_queue; \
+ uint16_t nb_rx = 0; \
+ \
+ if (rte_spinlock_trylock(&rxq->rx_lock)) { \
+ nb_rx = func(rx_queue, rx_pkts, nb_pkts); \
+ rte_spinlock_unlock(&rxq->rx_lock); \
+ } \
+ \
+ return nb_rx; \
+}
+
+#define GENERATE_TX_LOCK(func, nic) \
+uint16_t func ## _lock(void *tx_queue, \
+ struct rte_mbuf **tx_pkts, \
+ uint16_t nb_pkts) \
+{ \
+ struct nic ## _tx_queue *txq = tx_queue; \
+ uint16_t nb_tx = 0; \
+ \
+ if (rte_spinlock_trylock(&txq->tx_lock)) { \
+ nb_tx = func(tx_queue, tx_pkts, nb_pkts); \
+ rte_spinlock_unlock(&txq->tx_lock); \
+ } \
+ \
+ return nb_tx; \
+}
+
+/**
* A structure used to configure an RX ring of an Ethernet port.
*/
struct rte_eth_rxconf {
--
1.9.3
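For illustration, this is roughly how a driver would consume the macros above (ixgbe names are used purely as a hypothetical example, and it is assumed the queue structure gains the rx_lock spinlock the generated wrapper expects):

/* Emit ixgbe_recv_pkts_lock(), a trylock-protected wrapper around the
 * normal burst function, via the GENERATE_RX_LOCK macro above. */
GENERATE_RX_LOCK(ixgbe_recv_pkts, ixgbe)

/* At setup time, select the locked or plain variant depending on the
 * lock_mode bit configured in rte_eth_rxmode. */
static void
ixgbe_set_rx_function(struct rte_eth_dev *dev)
{
	dev->rx_pkt_burst = RX_LOCK_FUNCTION(dev, ixgbe_recv_pkts);
}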
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-03 15:07 4% [dpdk-dev] RFC: DPDK Long Term Support Mcnamara, John
@ 2016-06-03 16:05 0% ` Thomas Monjalon
2016-06-06 11:49 0% ` Yuanhan Liu
2016-06-07 13:17 3% ` Mcnamara, John
2016-06-03 18:17 3% ` Matthew Hall
` (2 subsequent siblings)
3 siblings, 2 replies; 200+ results
From: Thomas Monjalon @ 2016-06-03 16:05 UTC (permalink / raw)
To: Mcnamara, John; +Cc: dev, Christian Ehrhardt, Markos Chandras, Panu Matilainen
Hi,
2016-06-03 15:07, Mcnamara, John:
> Introduction
> ------------
>
> This document sets out a proposal for a DPDK Long Term Support release (LTS).
In general, LTS refers to a longer maintenance period than the regular one.
Here we are talking about doing some maintenance as stable releases first.
Currently we have no maintenance at all.
So I suggest differentiating "stable branches" and "LTS" for some stable branches.
> The purpose of the DPDK LTS will be to maintain a stable release of DPDK with
> backported bug fixes over an extended period of time. This will provide
> downstream consumers of DPDK with a stable target on which to base
> applications or packages.
[...]
> The proposed maintainer for the LTS is Yuanhan Liu
> <yuanhan.liu@linux.intel.com>.
I wonder if Yuanhan is OK with maintaining every stable release which could be
requested/needed? Or should we have other committers for the stable releases
that Yuanhan does not want to maintain himself?
The Linux model is to let people declare themselves when they want to maintain
a stable branch.
> The proposed duration of the LTS support is 2 years.
I think we should discuss the support duration for each release separately.
> There will only be one LTS branch being maintained at any time. At the end of
> the 2 year cycle the maintenance on the previous LTS will be wound down.
Seems a bit too restrictive.
Currently, there is no maintenance at all because nobody volunteered.
If Yuanhan volunteers for a stable branch every 2 years, fine.
If someone else volunteers for other branches, why not let him do it?
> The proposed initial LTS version will be DPDK 16.07. The next versions, based
> on a 2 year cycle, will be DPDK 18.08, 20.08, etc.
Let's do a first run with 16.07 and see later what we want to do next.
How long before its initial release must a stable branch be announced?
> What changes should be backported
> ---------------------------------
>
> * Bug fixes that don't break the ABI.
And API?
And behaviour (if not clearly documented in the API)?
[...]
> Developers submitting fixes to the mainline should also CC the maintainer so
> that they can evaluate the patch. A <stable@dpdk.org> email address could be
> provided for this so that it can be included as a CC in the commit messages
> and documented in the Code Contribution Guidelines.
Why?
We must avoid putting too many restrictions on the contributors.
> Intel will provide validation engineers to test the LTS branch/tree. Tested
> releases can be marked using a Git tag with an incremented revision number. For
> example: 16.07.00_LTS -> 16.07.01_LTS. The testing cadence should be quarterly
> but will be best effort only and dependent on available resources.
Thanks
It must not be just a tag. There should be an announcement and a tarball ready
to download.
[...]
> In order to reduce the testing effort the number of OSes which will be
> officially validated should be as small as possible. The proposal is that the
> following long term OSes are used for validation:
>
> (OSV reps please confirm.)
>
> * Ubuntu 16.04 LTS
> * RHEL 7.3
> * SuSE 11 SP4 or 12
> * FreeBSD 10.3
I'm sure there will be more validation in the field or from contributors.
[...]
> The LTS guidelines shall be reviewed after 1 year to adjust for any experiences
> from LTS maintainership.
Yes, seems very reasonable.
Thanks
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-05 18:15 5% ` Neil Horman
@ 2016-06-06 9:27 5% ` Thomas Monjalon
2016-06-06 13:47 5% ` Neil Horman
2016-06-07 15:55 5% ` Mcnamara, John
1 sibling, 1 reply; 200+ results
From: Thomas Monjalon @ 2016-06-06 9:27 UTC (permalink / raw)
To: Neil Horman
Cc: dev, Mcnamara, John, Christian Ehrhardt, Markos Chandras,
Panu Matilainen
2016-06-05 14:15, Neil Horman:
> On Fri, Jun 03, 2016 at 03:07:49PM +0000, Mcnamara, John wrote:
> > Introduction
> > ------------
> >
> > This document sets out a proposal for a DPDK Long Term Support release (LTS).
> >
> > The purpose of the DPDK LTS will be to maintain a stable release of DPDK with
> > backported bug fixes over an extended period of time. This will provide
> > downstream consumers of DPDK with a stable target on which to base
> > applications or packages.
[...]
> I'm not opposed to an LTS release, but it seems to be re-solving the issue of
> ABI breakage. That is to say, there is already a process in place for managing
> ABI changes to the DPDK, which is designed to help ensure that:
>
> 1) ABI changes are signaled at least 2 releases early
> 2) ABI changes whenever possible are designed such that backward compatibility
> versions can be encoded at the same time with versioning tags
Sorry I don't understand your point.
We are talking about two different things:
1/ ABI care for each new major release
2/ Minor release for bug fixes
I think both may exist.
> Those two mechanisms are expressly intended to allow application upgrades of DPDK
> libraries without worrying about ABI breakage. While LTS releases are a fine
> approach for some things, they sacrifice upstream efficiency (by creating work
> for backporting teams), while allowing upstream developers more leverage to just
> create ABI breaking changes on a whim, ignoring the existing ABI compatibility
> mechanism.
No it was not stated that upstream developers should ignore ABI compatibility.
Do you mean having a stable branch means ABI preservation for the next major
release is less important?
> LTS is a fine process for projects in which API/ABI breakage is either uncommon
> or fairly isolated, but that in my mind doesn't really describe DPDK.
Yes API/ABI breakages are still common in DPDK.
So it's even more important to have some stable branches.
^ permalink raw reply [relevance 5%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-03 16:05 0% ` Thomas Monjalon
@ 2016-06-06 11:49 0% ` Yuanhan Liu
2016-06-07 13:17 3% ` Mcnamara, John
1 sibling, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-06-06 11:49 UTC (permalink / raw)
To: Thomas Monjalon
Cc: Mcnamara, John, dev, Christian Ehrhardt, Markos Chandras,
Panu Matilainen
On Fri, Jun 03, 2016 at 06:05:15PM +0200, Thomas Monjalon wrote:
> Hi,
>
> 2016-06-03 15:07, Mcnamara, John:
> > Introduction
> > ------------
> >
> > This document sets out a proposal for a DPDK Long Term Support release (LTS).
>
> In general, LTS refers to a longer maintenance period than the regular one.
> Here we are talking about doing some maintenance as stable releases first.
> Currently we have no maintenance at all.
> So I suggest differentiating "stable branches" and "LTS" for some stable branches.
>
> > The purpose of the DPDK LTS will be to maintain a stable release of DPDK with
> > backported bug fixes over an extended period of time. This will provide
> > downstream consumers of DPDK with a stable target on which to base
> > applications or packages.
> [...]
> > The proposed maintainer for the LTS is Yuanhan Liu
> > <yuanhan.liu@linux.intel.com>.
>
> I wonder if Yuanhan is OK with maintaining every stable release which could be
> requested/needed?
I'm okay, since I assume the maintenance effort would be small: mainly
picking acked and tested *bug fix* patches.
> Or should we have other committers for the stable releases
> that Yuanhan does not want to maintain himself?
> The Linux model is to let people declare themselves when they want to maintain
> a stable branch.
I have no objection though, if somebody volunteers himself as a stable branch
maintainer.
>
> > The proposed duration of the LTS support is 2 years.
>
> I think we should discuss the support duration for each release separately.
>
> > There will only be one LTS branch being maintained at any time. At the end of
> > the 2 year cycle the maintenance on the previous LTS will be wound down.
>
> Seems a bit too restrictive.
> Currently, there is no maintenance at all because nobody volunteered.
> If Yuanhan volunteers for a stable branch every 2 years, fine.
> If someone else volunteers for other branches, why not let him do it?
>
> > The proposed initial LTS version will be DPDK 16.07. The next versions, based
> > on a 2 year cycle, will be DPDK 18.08, 20.08, etc.
>
> Let's do a first run with 16.07 and see later what we want to do next.
> How long before its initial release must a stable branch be announced?
>
> > What changes should be backported
> > ---------------------------------
> >
> > * Bug fixes that don't break the ABI.
>
> And API?
> And behaviour (if not clearly documented in the API)?
Agreed, we should not include those changes, either.
>
> [...]
> > Developers submitting fixes to the mainline should also CC the maintainer so
> > that they can evaluate the patch. A <stable@dpdk.org> email address could be
> > provided for this so that it can be included as a CC in the commit messages
> > and documented in the Code Contribution Guidelines.
>
> Why?
> We must avoid putting too many restrictions on the contributors.
This is actually requested by me, following a practice similar to the one
the Linux kernel community takes. Here is the thing: the developer normally
knows better than a generic maintainer (assume it's me) whether a patch
applies to a stable branch or not. This is especially true for DPDK,
since we ask the developer to note down the buggy commit by adding a
Fixes line.
It wouldn't be a burden for an active contributor, as CCing the related
people (including the right mailing list) is a good habit they already
have. For some one-time contributors, it's okay if they don't know
and follow it.
In such a case, I guess we need help from the related subsystem
maintainer: if it's a good bug fix that applies to the stable branch,
and the contributor forgot to make an explicit CC to the stable mailing
list, the subsystem maintainer should forward it, or ask him to forward
it, to the stable mailing list.
The reason I'm asking is that, as a generic maintainer, there is
simply not enough energy to keep an eye on all patches: you have to
be aware that we get thousands of emails per month from the dpdk dev
mailing list; the number for last month was 1808.
Doing so would make it possible for one person to maintain several
stable trees.
For more info, you could check linux/Documentation/stable_kernel_rules.txt.
>
> > Intel will provide validation engineers to test the LTS branch/tree. Tested
> > releases can be marked using a Git tag with an incremented revision number. For
> > example: 16.07.00_LTS -> 16.07.01_LTS. The testing cadence should be quarterly
> > but will be best effort only and dependent on available resources.
>
> Thanks
> It must not be just a tag. There should be an announcement and a tarball ready
> to download.
Agreed.
--yliu
^ permalink raw reply [relevance 0%]
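To make the convention under discussion concrete, here is a sketch of what
such a commit message could look like, assuming the proposed stable@dpdk.org
address is adopted; the subject, hash and names are placeholders, not real
commits:

    vhost: fix crash on queue teardown

    Do not dereference the virtqueue after it has been freed.

    Fixes: 0123456789ab ("vhost: rework queue allocation")
    Cc: stable@dpdk.org

    Signed-off-by: Some Developer <dev@example.com>

The Fixes line lets the stable maintainer locate the buggy commit, and the
CC flags the patch for backporting without the maintainer having to scan
every message on the list.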
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-06 9:27 5% ` Thomas Monjalon
@ 2016-06-06 13:47 5% ` Neil Horman
2016-06-06 14:21 4% ` Thomas Monjalon
2016-06-07 16:21 3% ` Mcnamara, John
0 siblings, 2 replies; 200+ results
From: Neil Horman @ 2016-06-06 13:47 UTC (permalink / raw)
To: Thomas Monjalon
Cc: dev, Mcnamara, John, Christian Ehrhardt, Markos Chandras,
Panu Matilainen
On Mon, Jun 06, 2016 at 11:27:29AM +0200, Thomas Monjalon wrote:
> 2016-06-05 14:15, Neil Horman:
> > On Fri, Jun 03, 2016 at 03:07:49PM +0000, Mcnamara, John wrote:
> > > Introduction
> > > ------------
> > >
> > > This document sets out a proposal for a DPDK Long Term Support release (LTS).
> > >
> > > The purpose of the DPDK LTS will be to maintain a stable release of DPDK with
> > > backported bug fixes over an extended period of time. This will provide
> > > downstream consumers of DPDK with a stable target on which to base
> > > applications or packages.
> [...]
> > I'm not opposed to an LTS release, but it seems to be re-solving the issue of
> > ABI breakage. That is to say, there is already a process in place for managing
> > ABI changes to the DPDK, which is designed to help ensure that:
> >
> > 1) ABI changes are signaled at least 2 releases early
> > 2) ABI changes whenever possible are designed such that backward compatibility
> > versions can be encoded at the same time with versioning tags
>
> Sorry I don't understand your point.
> We are talking about two different things:
> 1/ ABI care for each new major release
> 2/ Minor release for bug fixes
>
> I think both may exist.
>
Sure, they can exist together (they being both an ABI backwards compatible HEAD
and a set of LTS releases). The point I'm trying to make is that if you do your
ABI compatible HEAD well enough, you don't really need an LTS release.
That's not to say that you can't do both, but an LTS release is a significant
workload item, especially given the rapid pace of change in HEAD. The longer
you maintain an LTS release, the more difficult "minor" bugfixes are to
integrate, especially if you wind up skipping any ABI breaking patches. I think
it's worth calling attention to that as this approach gets considered.
> > Those two mechanisms are expressly intended to allow application upgrades of DPDK
> > libraries without worrying about ABI breakage. While LTS releases are a fine
> > approach for some things, they sacrifice upstream efficiency (by creating work
> > for backporting teams), while allowing upstream developers more leverage to just
> > create ABI breaking changes on a whim, ignoring the existing ABI compatibility
> > mechanism.
>
> No it was not stated that upstream developers should ignore ABI compatibility.
> Do you mean having a stable branch means ABI preservation for the next major
> release is less important?
>
I never stated that developers should ignore ABI compatibility, I stated that
creating an LTS release will make it that much easier for developers to do so.
And I think, pragmatically speaking, that is a concern, given that the
existence of an LTS release will make it tempting for developers to simply
follow the deprecation process rather than try to create ABI backward compatible
paths.
Looking at the git history, it seems clear to me that this is already happening.
I'm able to find a multitude of instances in which the deprecation process has
been followed reasonably well, but I can find no instances in which any efforts
have been made for backward compatibility.
> > LTS is a fine process for projects in which API/ABI breakage is either uncommon
> > or fairly isolated, but that in my mind doesn't really describe DPDK.
>
> Yes API/ABI breakages are still common in DPDK.
> So it's even more important to have some stable branches.
We seem to be coming to different conclusions based on the same evidence. We
agree that API/ABI changes continue to be frequent occurrences, but my position is
that we already have a process in place to mitigate that, which is simply not
being used (i.e. versioning symbols to provide backward compatible paths),
whereas you seem to be asserting that an LTS model will allow for ABI stability
and bug fixes.
While I don't disagree with that statement (LTS does provide both of those
things if the maintainer does it properly), I'm forced to ask the question:
before we solve this problem in a new way, let's ask why the existing way isn't
being used. Do developers just not care about backwards compatibility? Is the
process too hard? Something else? I really don't like the idea of abandoning
what currently exists to replace it with something else, without first
addressing why what we have isn't working.
Neil
>
^ permalink raw reply [relevance 5%]
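For reference, the existing mechanism Neil refers to is the symbol versioning
support in rte_compat.h. A minimal sketch, with a hypothetical function and
version numbers, of how an ABI change can keep a backward compatible path
(both symbols must also be listed in the library's version .map file):

#include <rte_compat.h>

/* Old binding: applications linked against DPDK 2.1 keep resolving
 * rte_foo_do_thing to this implementation. */
int
rte_foo_do_thing_v21(int arg)
{
	/* old behaviour, kept bit-for-bit compatible */
	return arg;
}
VERSION_SYMBOL(rte_foo_do_thing, _v21, 2.1);

/* New binding: newly built applications get the extended behaviour. */
int
rte_foo_do_thing_v1607(int arg, int new_arg)
{
	return arg + new_arg;
}
BIND_DEFAULT_SYMBOL(rte_foo_do_thing, _v1607, 16.07);

With this pattern an ABI change does not force a soname bump: old binaries
bind to the @DPDK_2.1 symbol, new ones to the @@DPDK_16.07 default.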
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-06 13:47 5% ` Neil Horman
@ 2016-06-06 14:21 4% ` Thomas Monjalon
2016-06-06 15:07 5% ` Neil Horman
2016-06-07 16:21 3% ` Mcnamara, John
1 sibling, 1 reply; 200+ results
From: Thomas Monjalon @ 2016-06-06 14:21 UTC (permalink / raw)
To: Neil Horman
Cc: dev, Mcnamara, John, Christian Ehrhardt, Markos Chandras,
Panu Matilainen
2016-06-06 09:47, Neil Horman:
> On Mon, Jun 06, 2016 at 11:27:29AM +0200, Thomas Monjalon wrote:
> > 2016-06-05 14:15, Neil Horman:
> > > On Fri, Jun 03, 2016 at 03:07:49PM +0000, Mcnamara, John wrote:
> > > > Introduction
> > > > ------------
> > > >
> > > > This document sets out a proposal for a DPDK Long Term Support release (LTS).
> > > >
> > > > The purpose of the DPDK LTS will be to maintain a stable release of DPDK with
> > > > backported bug fixes over an extended period of time. This will provide
> > > > downstream consumers of DPDK with a stable target on which to base
> > > > applications or packages.
> > [...]
> > > I'm not opposed to an LTS release, but it seems to be re-solving the issue of
> > > ABI breakage. That is to say, there is already a process in place for managing
> > > ABI changes to the DPDK, which is designed to help ensure that:
> > >
> > > 1) ABI changes are signaled at least 2 releases early
> > > 2) ABI changes whenever possible are designed such that backward compatibility
> > > versions can be encoded at the same time with versioning tags
> >
> > Sorry I don't understand your point.
> > We are talking about two different things:
> > 1/ ABI care for each new major release
> > 2/ Minor release for bug fixes
> >
> > I think both may exist.
> >
> Sure, they can exist together (they being both an ABI backwards compatible HEAD
> and a set of LTS releases). The point I'm trying to make is that if you do your
> ABI compatible HEAD well enough, you don't really need an LTS release.
>
> That's not to say that you can't do both, but an LTS release is a significant
> workload item, especially given the rapid pace of change in HEAD. The longer
> you maintain an LTS release, the more difficult "minor" bugfixes are to
> integrate, especially if you wind up skipping any ABI breaking patches. I think
> it's worth calling attention to that as this approach gets considered.
>
> > > Those two mechanisms are expressly intended to allow application upgrades of DPDK
> > > libraries without worrying about ABI breakage. While LTS releases are a fine
> > > approach for some things, they sacrifice upstream efficiency (by creating work
> > > for backporting teams), while allowing upstream developers more leverage to just
> > > create ABI breaking changes on a whim, ignoring the existing ABI compatibility
> > > mechanism.
> >
> > No it was not stated that upstream developers should ignore ABI compatibility.
> > Do you mean having a stable branch means ABI preservation for the next major
> > release is less important?
> >
> I never stated that developers should ignore ABI compatibility, I stated that
> creating an LTS release will make it that much easier for developers to do so.
>
> And I think, pragmatically speaking, that is a concern. Given that the
> existence of an LTS release will make it tempting for developers to simply
> follow the deprecation process rather than try to create ABI backward compatible
> paths.
>
> Looking at the git history, it seems clear to me that this is already happening.
> I'm able to find a multitude of instances in which the deprecation process has
> been followed reasonably well, but I can find no instances in which any efforts
> have been made for backward compatibility.
There were some examples of backward compatibility in hash and lpm libraries.
> > > LTS is a fine process for projects in which API/ABI breakage is either uncommon
> > > or fairly isolated, but that in my mind doesn't really describe DPDK.
> >
> > Yes API/ABI breakages are still common in DPDK.
> > So it's even more important to have some stable branches.
>
> We seem to be coming to different conclusions based on the same evidence. We
> agree that API/ABI changes continue to be frequent occurrences, but my position is
> that we already have a process in place to mitigate that, which is simply not
> being used (i.e. versioning symbols to provide backward compatible paths),
> whereas you seem to be asserting that an LTS model will allow for ABI stability
> and bug fixes.
>
> While I don't disagree with that statement (LTS does provide both of those
> things if the maintainer does it properly), I'm forced to ask the question,
> before we solve this problem in a new way,
The following questions are interesting, but please don't assume the stable
branch addresses the same issue as ABI compat.
In each major release, we add some new bugs because of new features, even
if the ABI is kept.
In a minor stable release there are only some bug fixes. So the only way
to have a "bug free" version in a stable environment is to do some
maintenance in a stable branch.
> let's ask why the existing way isn't
> being used. Do developers just not care about backwards compatibility? Is the
> process too hard? Something else? I really don't like the idea of abandoning
> what currently exists to replace it with something else, without first
> addressing why what we have isn't working.
We can address both. But I strongly think the ABI compat is another topic.
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-06 14:21 4% ` Thomas Monjalon
@ 2016-06-06 15:07 5% ` Neil Horman
0 siblings, 0 replies; 200+ results
From: Neil Horman @ 2016-06-06 15:07 UTC (permalink / raw)
To: Thomas Monjalon
Cc: dev, Mcnamara, John, Christian Ehrhardt, Markos Chandras,
Panu Matilainen
On Mon, Jun 06, 2016 at 04:21:11PM +0200, Thomas Monjalon wrote:
> 2016-06-06 09:47, Neil Horman:
> > On Mon, Jun 06, 2016 at 11:27:29AM +0200, Thomas Monjalon wrote:
> > > 2016-06-05 14:15, Neil Horman:
> > > > On Fri, Jun 03, 2016 at 03:07:49PM +0000, Mcnamara, John wrote:
> > > > > Introduction
> > > > > ------------
> > > > >
> > > > > This document sets out a proposal for a DPDK Long Term Support release (LTS).
> > > > >
> > > > > The purpose of the DPDK LTS will be to maintain a stable release of DPDK with
> > > > > backported bug fixes over an extended period of time. This will provide
> > > > > downstream consumers of DPDK with a stable target on which to base
> > > > > applications or packages.
> > > [...]
> > > > I'm not opposed to an LTS release, but it seems to be re-solving the issue of
> > > > ABI breakage. That is to say, there is already a process in place for managing
> > > > ABI changes to the DPDK, which is designed to help ensure that:
> > > >
> > > > 1) ABI changes are signaled at least 2 releases early
> > > > 2) ABI changes whenever possible are designed such that backward compatibility
> > > > versions can be encoded at the same time with versioning tags
> > >
> > > Sorry I don't understand your point.
> > > We are talking about two different things:
> > > 1/ ABI care for each new major release
> > > 2/ Minor release for bug fixes
> > >
> > > I think both may exist.
> > >
> > Sure, they can exist together (they being both an ABI backwards compatible HEAD
> > and a set of LTS releases). The point I'm trying to make is that if you do your
> > ABI compatible HEAD well enough, you don't really need an LTS release.
> >
> > That's not to say that you can't do both, but an LTS release is a significant
> > workload item, especially given the rapid pace of change in HEAD. The longer
> > you maintain an LTS release, the more difficult "minor" bugfixes are to
> > integrate, especially if you wind up skipping any ABI breaking patches. I think
> > it's worth calling attention to that as this approach gets considered.
> >
> > > > Those two mechanisms are expressly intended to allow application upgrades of DPDK
> > > > libraries without worrying about ABI breakage. While LTS releases are a fine
> > > > approach for some things, they sacrifice upstream efficiency (by creating work
> > > > for backporting teams), while allowing upstream developers more leverage to just
> > > > create ABI breaking changes on a whim, ignoring the existing ABI compatibility
> > > > mechanism.
> > >
> > > No it was not stated that upstream developers should ignore ABI compatibility.
> > > Do you mean having a stable branch means ABI preservation for the next major
> > > release is less important?
> > >
> > I never stated that developers should ignore ABI compatibility, I stated that
> > creating an LTS release will make it that much easier for developers to do so.
> >
> > And I think, pragmatically speaking, that is a concern. Given that the
> > existence of an LTS release will make it tempting for developers to simply
> > follow the deprecation process rather than try to create ABI backward compatible
> > paths.
> >
> > Looking at the git history, it seems clear to me that this is already happening.
> > I'm able to find a multitude of instances in which the deprecation process has
> > been followed reasonably well, but I can find no instances in which any efforts
> > have been made for backward compatibility.
>
> There were some examples of backward compatibility in hash and lpm libraries.
>
Ok, apologies, but you still see my point: a relatively minor number of
instances of creating backward compatibility among a much larger set of easier
deprecate-and-replace instances. It's not really having the effect it was
intended to.
> > > > LTS is a fine process for projects in which API/ABI breakage is either uncommon
> > > > or fairly isolated, but that in my mind doesn't really describe DPDK.
> > >
> > > Yes API/ABI breakages are still common in DPDK.
> > > So it's even more important to have some stable branches.
> >
> > We seem to be coming to different conclusions based on the same evidence. We
> > agree that API/ABI changes continue to be frequent occurrences, but my position is
> > that we already have a process in place to mitigate that, which is simply not
> > being used (i.e. versioning symbols to provide backward compatible paths),
> > whereas you seem to be asserting that an LTS model will allow for ABI stability
> > and bug fixes.
> >
> > While I don't disagree with that statement (LTS does provide both of those
> > things if the maintainer does it properly), I'm forced to ask the question,
> > before we solve this problem in a new way,
>
> The following questions are interesting, but please don't assume the stable
> branch addresses the same issue as ABI compat.
Given your perspective on what LTS/stable branches should be, I absolutely agree,
but that's not what John M. was proposing. From his initial proposal, he
specifically called out which changes were acceptable:
What changes should not be backported
-------------------------------------
* API or ABI breaking changes.
* Features should not be backported. Unless:
  * There is a justifiable use case (for example a new PMD).
  * The change is non-invasive.
  * The work of preparing the backport is done by the proposer.
  * There is support within the community.
The above list in my mind amounts to "Any change that there is sufficient
consumer demand for and doesn't present too much validation difficulty, except
ABI or API breaking changes".
While there's nothing really wrong with that, if we want to go down that path,
that really says to me that this is a way around ABI compatibility problems,
because the inclusion of any other fix, given sufficient demand, can be
potentially justified. So, in John's proposal, a stable branch / LTS release is
going to effectively be a way to allow consumers to stay on one API/ABI level
for a longer period of time before having to make a major change to catch up to
the HEAD release.
> In each major release, we add some new bugs because of new features, even
> if the ABI is kept.
> In a minor stable release there are only some bug fixes. So the only way
> to have a "bug free" version in a stable environment is to do some
> maintenance in a stable branch.
>
Again, I agree with your perspective on what a stable branch should be, but
that's not what John was proposing, and that's what I'm raising a concern about.
> > let's ask why the existing way isn't
> > being used. Do developers just not care about backwards compatibility? Is the
> > process too hard? Something else? I really don't like the idea of abandoning
> > what currently exists to replace it with something else, without first
> > addressing why what we have isn't working.
>
> We can address both. But I strongly think the ABI compat is another topic.
I agree it can be a separate topic, but given the proposal here, it seems like
an awfully tempting way to avoid having to address it. Not saying it's a bad
plan, mind you, just that ABI compatibility is something that does need to be
kept at the forefront, because it still changes often (more often than it has
to).
Neil
>
^ permalink raw reply [relevance 5%]
* [dpdk-dev] [PATCH v3 00/20] vhost ABI/API refactoring
2016-05-26 17:04 4% ` Rich Lane
@ 2016-06-07 3:51 9% ` Yuanhan Liu
2016-06-07 3:52 7% ` [dpdk-dev] [PATCH v3 11/20] vhost: introduce new API to export queue free entries Yuanhan Liu
` (6 more replies)
1 sibling, 7 replies; 200+ results
From: Yuanhan Liu @ 2016-06-07 3:51 UTC (permalink / raw)
To: dev
Cc: huawei.xie, Thomas Monjalon, Panu Matilainen, Traynor Kevin,
Rich Lane, Tetsuya Mukawa, Yuanhan Liu
v3: - adapted the new vhost ABI/API changes to the tep_term example, to make
sure the build is not broken, at least.
- bumped the ABI version to 3
NOTE: I created a branch at dpdk.org [0] for more convenient testing:
[0]: git://dpdk.org/next/dpdk-next-virtio for-testing
Every time we introduce a new feature to vhost, we are likely to break
ABI. Moreover, some cleanups (such as the one from Ilya to remove vec_buf
from vhost_virtqueue struct) also break ABI.
This patch set is meant to resolve the above issue ultimately, by hiding the
virtio_net structure (as well as a few others) internally, and exporting the
virtio_net dev struct to applications as a number, vid, the way the
kernel exposes an fd to user space.
Back to the patch set: the first part of this set makes some changes to the
vhost example, vhost-pmd and vhost, bit by bit, to remove the dependency
on the "virtio_net" struct, and then makes the final change to adapt the
current APIs to using "vid".
After that, "virtio_net_device_ops" is the only open struct left that an
application can access; therefore, it's the only place that might introduce
potential ABI breakage in the future when extended. Hence, I reserved a few
more (5) spaces, to make sure we will not break the ABI for a long time, and
hopefully, forever.
The last bit of this patch set is some cleanups, including the one from
Ilya.
v2: - exported ifname as well to fix a vhost-pmd issue reported by Rich
- separated the big patch that introduces several new APIs into some
small patches.
- updated release note
- updated version.map
Thanks.
--yliu
---
Ilya Maximets (1):
vhost: make buf vector for scatter Rx local
Yuanhan Liu (19):
vhost: declare backend with int type
vhost: set/reset dev flags internally
vhost: declare device fh as int
examples/vhost: make a copy of virtio device id
vhost: rename device fh to vid
vhost: get device by vid only
vhost: move vhost device ctx to cuse
vhost: introduce new API to export numa node
vhost: introduce new API to export number of queues
vhost: introduce new API to export ifname
vhost: introduce new API to export queue free entries
vhost: remove dependency on priv field
vhost: export vid as the only interface to applications
vhost: hide internal structs/macros/functions
vhost: remove unnecessary fields
vhost: remove virtio-net.h
vhost: reserve few more space for future extension
examples/tep_term: adapt to new vhost ABI/API changes
vhost: per device virtio net header len
doc/guides/rel_notes/release_16_07.rst | 11 +-
drivers/net/vhost/rte_eth_vhost.c | 79 ++++-----
examples/tep_termination/main.c | 83 +++++-----
examples/tep_termination/main.h | 5 +-
examples/tep_termination/vxlan_setup.c | 20 +--
examples/tep_termination/vxlan_setup.h | 6 +-
examples/vhost/main.c | 116 +++++++------
examples/vhost/main.h | 3 +-
lib/librte_vhost/Makefile | 2 +-
lib/librte_vhost/rte_vhost_version.map | 10 ++
lib/librte_vhost/rte_virtio_net.h | 223 +++++++------------------
lib/librte_vhost/vhost-net.h | 201 ++++++++++++++++++----
lib/librte_vhost/vhost_cuse/vhost-net-cdev.c | 83 +++++-----
lib/librte_vhost/vhost_cuse/virtio-net-cdev.c | 30 ++--
lib/librte_vhost/vhost_cuse/virtio-net-cdev.h | 12 +-
lib/librte_vhost/vhost_rxtx.c | 133 ++++++++-------
lib/librte_vhost/vhost_user/vhost-net-user.c | 53 +++---
lib/librte_vhost/vhost_user/vhost-net-user.h | 2 +
lib/librte_vhost/vhost_user/virtio-net-user.c | 64 +++----
lib/librte_vhost/vhost_user/virtio-net-user.h | 18 +-
lib/librte_vhost/virtio-net.c | 229 +++++++++++++++++---------
lib/librte_vhost/virtio-net.h | 43 -----
22 files changed, 752 insertions(+), 674 deletions(-)
delete mode 100644 lib/librte_vhost/virtio-net.h
--
1.9.0
^ permalink raw reply [relevance 9%]
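As a quick sketch of the application-facing result of this series, callbacks
and lookups are keyed by the integer vid rather than by a virtio_net pointer;
the app-side helper names here are hypothetical:

#include <limits.h>
#include <rte_virtio_net.h>

static int
app_new_device(int vid)
{
	char ifname[PATH_MAX];

	/* Map the vid to application state, e.g. via its socket path. */
	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
	return 0;
}

static void
app_destroy_device(int vid)
{
	/* Tear down whatever state was attached to this vid. */
}

static const struct virtio_net_device_ops app_ops = {
	.new_device     = app_new_device,
	.destroy_device = app_destroy_device,
};

static void
app_init(void)
{
	rte_vhost_driver_callback_register(&app_ops);
}

Because the application only ever holds the vid, the library is free to
reshape virtio_net internally without touching the exported ABI.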
* [dpdk-dev] [PATCH v3 11/20] vhost: introduce new API to export queue free entries
2016-06-07 3:51 9% ` [dpdk-dev] [PATCH v3 00/20] " Yuanhan Liu
@ 2016-06-07 3:52 7% ` Yuanhan Liu
2016-06-07 3:52 3% ` [dpdk-dev] [PATCH v3 12/20] vhost: remove dependency on priv field Yuanhan Liu
` (5 subsequent siblings)
6 siblings, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-06-07 3:52 UTC (permalink / raw)
To: dev
Cc: huawei.xie, Thomas Monjalon, Panu Matilainen, Traynor Kevin,
Rich Lane, Tetsuya Mukawa, Yuanhan Liu
The new API rte_vhost_avail_entries() is actually a rename of
rte_vring_available_entries(), with the "vring" to "vhost" name
change to keep consistency with the other exported vhost APIs.
This change lets us avoid the dependency on the "virtio_net"
struct, to prepare for the ABI refactoring.
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Tested-by: Rich Lane <rich.lane@bigswitch.com>
Acked-by: Rich Lane <rich.lane@bigswitch.com>
---
doc/guides/rel_notes/release_16_07.rst | 2 ++
examples/vhost/main.c | 4 ++--
lib/librte_vhost/rte_vhost_version.map | 1 +
lib/librte_vhost/rte_virtio_net.h | 24 +++++++++++++-----------
lib/librte_vhost/virtio-net.c | 17 +++++++++++++++++
5 files changed, 35 insertions(+), 13 deletions(-)
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 30e78d4..7b602b7 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -116,6 +116,8 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* ``rte_vring_available_entries`` is renamed to ``rte_vhost_avail_entries``.
+
ABI Changes
-----------
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index d04f779..3ae302f 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -1055,13 +1055,13 @@ drain_eth_rx(struct vhost_dev *vdev)
* to diminish packet loss.
*/
if (enable_retry &&
- unlikely(rx_count > rte_vring_available_entries(dev,
+ unlikely(rx_count > rte_vhost_avail_entries(dev->vid,
VIRTIO_RXQ))) {
uint32_t retry;
for (retry = 0; retry < burst_rx_retry_num; retry++) {
rte_delay_us(burst_rx_delay_time);
- if (rx_count <= rte_vring_available_entries(dev,
+ if (rx_count <= rte_vhost_avail_entries(dev->vid,
VIRTIO_RXQ))
break;
}
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 4608e3b..93f1188 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -24,6 +24,7 @@ DPDK_2.1 {
DPDK_16.07 {
global:
+ rte_vhost_avail_entries;
rte_vhost_get_ifname;
rte_vhost_get_numa_node;
rte_vhost_get_queue_num;
diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
index 0898e8b..0427461 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -184,17 +184,6 @@ struct virtio_net_device_ops {
int (*vring_state_changed)(struct virtio_net *dev, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */
};
-static inline uint16_t __attribute__((always_inline))
-rte_vring_available_entries(struct virtio_net *dev, uint16_t queue_id)
-{
- struct vhost_virtqueue *vq = dev->virtqueue[queue_id];
-
- if (!vq->enabled)
- return 0;
-
- return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx_res;
-}
-
/**
* Function to convert guest physical addresses to vhost virtual addresses.
* This is used to convert guest virtio buffer addresses.
@@ -285,6 +274,19 @@ uint32_t rte_vhost_get_queue_num(int vid);
int rte_vhost_get_ifname(int vid, char *buf, size_t len);
/**
+ * Get how many avail entries are left in the queue
+ *
+ * @param vid
+ * virtio-net device ID
+ * @param queue_id
+ * virtio queue index
+ *
+ * @return
+ * num of avail entries left
+ */
+uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
+
+/**
* This function adds buffers to the virtio devices RX virtqueue. Buffers can
* be received from the physical port or from another virtual device. A packet
* count is returned to indicate the number of packets that were successfully
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 375c9d4..115eba4 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -783,6 +783,23 @@ rte_vhost_get_ifname(int vid, char *buf, size_t len)
return 0;
}
+uint16_t
+rte_vhost_avail_entries(int vid, uint16_t queue_id)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return 0;
+
+ vq = dev->virtqueue[queue_id];
+ if (!vq->enabled)
+ return 0;
+
+ return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx_res;
+}
+
int rte_vhost_enable_guest_notification(struct virtio_net *dev,
uint16_t queue_id, int enable)
{
--
1.9.0
^ permalink raw reply [relevance 7%]
* [dpdk-dev] [PATCH v3 12/20] vhost: remove dependency on priv field
2016-06-07 3:51 9% ` [dpdk-dev] [PATCH v3 00/20] " Yuanhan Liu
2016-06-07 3:52 7% ` [dpdk-dev] [PATCH v3 11/20] vhost: introduce new API to export queue free entries Yuanhan Liu
@ 2016-06-07 3:52 3% ` Yuanhan Liu
2016-06-07 3:52 13% ` [dpdk-dev] [PATCH v3 13/20] vhost: export vid as the only interface to applications Yuanhan Liu
` (4 subsequent siblings)
6 siblings, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-06-07 3:52 UTC (permalink / raw)
To: dev
Cc: huawei.xie, Thomas Monjalon, Panu Matilainen, Traynor Kevin,
Rich Lane, Tetsuya Mukawa, Yuanhan Liu
This change lets us avoid the dependency on the "virtio_net"
struct, to prepare for the ABI refactoring.
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Tested-by: Rich Lane <rich.lane@bigswitch.com>
Acked-by: Rich Lane <rich.lane@bigswitch.com>
---
drivers/net/vhost/rte_eth_vhost.c | 13 +++++++------
examples/vhost/main.c | 10 +++++++---
2 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/drivers/net/vhost/rte_eth_vhost.c b/drivers/net/vhost/rte_eth_vhost.c
index 6fa9f6b..de0f25e 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -275,7 +275,6 @@ new_device(struct virtio_net *dev)
for (i = 0; i < rte_vhost_get_queue_num(dev->vid) * VIRTIO_QNUM; i++)
rte_vhost_enable_guest_notification(dev, i, 0);
- dev->priv = eth_dev;
eth_dev->data->dev_link.link_status = ETH_LINK_UP;
for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
@@ -303,6 +302,8 @@ destroy_device(volatile struct virtio_net *dev)
{
struct rte_eth_dev *eth_dev;
struct vhost_queue *vq;
+ struct internal_list *list;
+ char ifname[PATH_MAX];
unsigned i;
if (dev == NULL) {
@@ -310,11 +311,13 @@ destroy_device(volatile struct virtio_net *dev)
return;
}
- eth_dev = (struct rte_eth_dev *)dev->priv;
- if (eth_dev == NULL) {
- RTE_LOG(INFO, PMD, "Failed to find a ethdev\n");
+ rte_vhost_get_ifname(dev->vid, ifname, sizeof(ifname));
+ list = find_internal_resource(ifname);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
return;
}
+ eth_dev = list->eth_dev;
/* Wait until rx/tx_pkt_burst stops accessing vhost device */
for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
@@ -336,8 +339,6 @@ destroy_device(volatile struct virtio_net *dev)
eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
- dev->priv = NULL;
-
for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
vq = eth_dev->data->rx_queues[i];
if (vq == NULL)
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 3ae302f..9b74a16 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -1173,10 +1173,15 @@ switch_worker(void *arg __rte_unused)
static void
destroy_device (volatile struct virtio_net *dev)
{
- struct vhost_dev *vdev;
+ struct vhost_dev *vdev = NULL;
int lcore;
- vdev = (struct vhost_dev *)dev->priv;
+ TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
+ if (vdev->vid == dev->vid)
+ break;
+ }
+ if (!vdev)
+ return;
/*set the remove flag. */
vdev->remove = 1;
while(vdev->ready != DEVICE_SAFE_REMOVE) {
@@ -1231,7 +1236,6 @@ new_device (struct virtio_net *dev)
return -1;
}
vdev->dev = dev;
- dev->priv = vdev;
vdev->vid = vid;
TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
--
1.9.0
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v3 13/20] vhost: export vid as the only interface to applications
2016-06-07 3:51 9% ` [dpdk-dev] [PATCH v3 00/20] " Yuanhan Liu
2016-06-07 3:52 7% ` [dpdk-dev] [PATCH v3 11/20] vhost: introduce new API to export queue free entries Yuanhan Liu
2016-06-07 3:52 3% ` [dpdk-dev] [PATCH v3 12/20] vhost: remove dependency on priv field Yuanhan Liu
@ 2016-06-07 3:52 13% ` Yuanhan Liu
2016-06-07 3:52 4% ` [dpdk-dev] [PATCH v3 17/20] vhost: reserve few more space for future extension Yuanhan Liu
` (3 subsequent siblings)
6 siblings, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-06-07 3:52 UTC (permalink / raw)
To: dev
Cc: huawei.xie, Thomas Monjalon, Panu Matilainen, Traynor Kevin,
Rich Lane, Tetsuya Mukawa, Yuanhan Liu
With all the previous preparation work, we are just one step away from
the final ABI refactoring: changing the current APIs to make them
stick to vid instead of the old virtio_net dev.
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Tested-by: Rich Lane <rich.lane@bigswitch.com>
Acked-by: Rich Lane <rich.lane@bigswitch.com>
---
v2: update release note
v3: - bump the ABI version to 3.
- remove "struct virtio_net *dev" field in vhost example
---
doc/guides/rel_notes/release_16_07.rst | 9 ++++-
drivers/net/vhost/rte_eth_vhost.c | 47 +++++++++------------------
examples/vhost/main.c | 25 +++++++-------
examples/vhost/main.h | 2 --
lib/librte_vhost/Makefile | 2 +-
lib/librte_vhost/rte_virtio_net.h | 18 +++++-----
lib/librte_vhost/vhost_rxtx.c | 15 +++++++--
lib/librte_vhost/vhost_user/virtio-net-user.c | 14 ++++----
lib/librte_vhost/virtio-net.c | 17 ++++++----
9 files changed, 77 insertions(+), 72 deletions(-)
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 7b602b7..8dbcf8a 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -118,6 +118,10 @@ API Changes
* ``rte_vring_available_entries`` is renamed to ``rte_vhost_avail_entries``.
+* All existing vhost APIs and callbacks with ``virtio_net`` struct pointer
+ as the parameter have been changed due to the ABI refactoring mentioned
+ below: it's replaced by ``int vid``.
+
ABI Changes
-----------
@@ -129,6 +133,9 @@ ABI Changes
* The ``rte_port_source_params`` structure has new fields to support PCAP file.
It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
+* vhost ABI refactoring has been made: the ``virtio_net`` structure is never
+ exported to the application any more. Instead, a handle, ``vid``, is
+ used to represent this structure internally.
Shared Library Versions
-----------------------
@@ -165,7 +172,7 @@ The libraries prepended with a plus sign were incremented in this version.
librte_sched.so.1
librte_table.so.2
librte_timer.so.1
- librte_vhost.so.2
+ + librte_vhost.so.3
Tested Platforms
diff --git a/drivers/net/vhost/rte_eth_vhost.c b/drivers/net/vhost/rte_eth_vhost.c
index de0f25e..56c1c36 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -71,9 +71,9 @@ static struct ether_addr base_eth_addr = {
};
struct vhost_queue {
+ int vid;
rte_atomic32_t allow_queuing;
rte_atomic32_t while_queuing;
- struct virtio_net *device;
struct pmd_internal *internal;
struct rte_mempool *mb_pool;
uint8_t port;
@@ -139,7 +139,7 @@ eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
goto out;
/* Dequeue packets from guest TX queue */
- nb_rx = rte_vhost_dequeue_burst(r->device,
+ nb_rx = rte_vhost_dequeue_burst(r->vid,
r->virtqueue_id, r->mb_pool, bufs, nb_bufs);
r->rx_pkts += nb_rx;
@@ -170,7 +170,7 @@ eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
goto out;
/* Enqueue packets to guest RX queue */
- nb_tx = rte_vhost_enqueue_burst(r->device,
+ nb_tx = rte_vhost_enqueue_burst(r->vid,
r->virtqueue_id, bufs, nb_bufs);
r->tx_pkts += nb_tx;
@@ -222,7 +222,7 @@ find_internal_resource(char *ifname)
}
static int
-new_device(struct virtio_net *dev)
+new_device(int vid)
{
struct rte_eth_dev *eth_dev;
struct internal_list *list;
@@ -234,12 +234,7 @@ new_device(struct virtio_net *dev)
int newnode;
#endif
- if (dev == NULL) {
- RTE_LOG(INFO, PMD, "Invalid argument\n");
- return -1;
- }
-
- rte_vhost_get_ifname(dev->vid, ifname, sizeof(ifname));
+ rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
list = find_internal_resource(ifname);
if (list == NULL) {
RTE_LOG(INFO, PMD, "Invalid device name: %s\n", ifname);
@@ -250,7 +245,7 @@ new_device(struct virtio_net *dev)
internal = eth_dev->data->dev_private;
#ifdef RTE_LIBRTE_VHOST_NUMA
- newnode = rte_vhost_get_numa_node(dev->vid);
+ newnode = rte_vhost_get_numa_node(vid);
if (newnode > 0)
eth_dev->data->numa_node = newnode;
#endif
@@ -259,7 +254,7 @@ new_device(struct virtio_net *dev)
vq = eth_dev->data->rx_queues[i];
if (vq == NULL)
continue;
- vq->device = dev;
+ vq->vid = vid;
vq->internal = internal;
vq->port = eth_dev->data->port_id;
}
@@ -267,13 +262,13 @@ new_device(struct virtio_net *dev)
vq = eth_dev->data->tx_queues[i];
if (vq == NULL)
continue;
- vq->device = dev;
+ vq->vid = vid;
vq->internal = internal;
vq->port = eth_dev->data->port_id;
}
- for (i = 0; i < rte_vhost_get_queue_num(dev->vid) * VIRTIO_QNUM; i++)
- rte_vhost_enable_guest_notification(dev, i, 0);
+ for (i = 0; i < rte_vhost_get_queue_num(vid) * VIRTIO_QNUM; i++)
+ rte_vhost_enable_guest_notification(vid, i, 0);
eth_dev->data->dev_link.link_status = ETH_LINK_UP;
@@ -298,7 +293,7 @@ new_device(struct virtio_net *dev)
}
static void
-destroy_device(volatile struct virtio_net *dev)
+destroy_device(int vid)
{
struct rte_eth_dev *eth_dev;
struct vhost_queue *vq;
@@ -306,12 +301,7 @@ destroy_device(volatile struct virtio_net *dev)
char ifname[PATH_MAX];
unsigned i;
- if (dev == NULL) {
- RTE_LOG(INFO, PMD, "Invalid argument\n");
- return;
- }
-
- rte_vhost_get_ifname(dev->vid, ifname, sizeof(ifname));
+ rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
list = find_internal_resource(ifname);
if (list == NULL) {
RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
@@ -343,13 +333,13 @@ destroy_device(volatile struct virtio_net *dev)
vq = eth_dev->data->rx_queues[i];
if (vq == NULL)
continue;
- vq->device = NULL;
+ vq->vid = -1;
}
for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
vq = eth_dev->data->tx_queues[i];
if (vq == NULL)
continue;
- vq->device = NULL;
+ vq->vid = -1;
}
RTE_LOG(INFO, PMD, "Connection closed\n");
@@ -358,19 +348,14 @@ destroy_device(volatile struct virtio_net *dev)
}
static int
-vring_state_changed(struct virtio_net *dev, uint16_t vring, int enable)
+vring_state_changed(int vid, uint16_t vring, int enable)
{
struct rte_vhost_vring_state *state;
struct rte_eth_dev *eth_dev;
struct internal_list *list;
char ifname[PATH_MAX];
- if (dev == NULL) {
- RTE_LOG(ERR, PMD, "Invalid argument\n");
- return -1;
- }
-
- rte_vhost_get_ifname(dev->vid, ifname, sizeof(ifname));
+ rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
list = find_internal_resource(ifname);
if (list == NULL) {
RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 9b74a16..c854660 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -795,7 +795,7 @@ virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
{
uint16_t ret;
- ret = rte_vhost_enqueue_burst(dst_vdev->dev, VIRTIO_RXQ, &m, 1);
+ ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
if (enable_stats) {
rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
@@ -1041,7 +1041,6 @@ static inline void __attribute__((always_inline))
drain_eth_rx(struct vhost_dev *vdev)
{
uint16_t rx_count, enqueue_count;
- struct virtio_net *dev = vdev->dev;
struct rte_mbuf *pkts[MAX_PKT_BURST];
rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
@@ -1055,19 +1054,19 @@ drain_eth_rx(struct vhost_dev *vdev)
* to diminish packet loss.
*/
if (enable_retry &&
- unlikely(rx_count > rte_vhost_avail_entries(dev->vid,
+ unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
VIRTIO_RXQ))) {
uint32_t retry;
for (retry = 0; retry < burst_rx_retry_num; retry++) {
rte_delay_us(burst_rx_delay_time);
- if (rx_count <= rte_vhost_avail_entries(dev->vid,
+ if (rx_count <= rte_vhost_avail_entries(vdev->vid,
VIRTIO_RXQ))
break;
}
}
- enqueue_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ,
+ enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
pkts, rx_count);
if (enable_stats) {
rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
@@ -1084,7 +1083,7 @@ drain_virtio_tx(struct vhost_dev *vdev)
uint16_t count;
uint16_t i;
- count = rte_vhost_dequeue_burst(vdev->dev, VIRTIO_TXQ, mbuf_pool,
+ count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ, mbuf_pool,
pkts, MAX_PKT_BURST);
/* setup VMDq for the first packet */
@@ -1171,13 +1170,13 @@ switch_worker(void *arg __rte_unused)
* of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
*/
static void
-destroy_device (volatile struct virtio_net *dev)
+destroy_device(int vid)
{
struct vhost_dev *vdev = NULL;
int lcore;
TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
- if (vdev->vid == dev->vid)
+ if (vdev->vid == vid)
break;
}
if (!vdev)
@@ -1221,12 +1220,11 @@ destroy_device (volatile struct virtio_net *dev)
* and the allocated to a specific data core.
*/
static int
-new_device (struct virtio_net *dev)
+new_device(int vid)
{
int lcore, core_add = 0;
uint32_t device_num_min = num_devices;
struct vhost_dev *vdev;
- int vid = dev->vid;
vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
if (vdev == NULL) {
@@ -1235,7 +1233,6 @@ new_device (struct virtio_net *dev)
vid);
return -1;
}
- vdev->dev = dev;
vdev->vid = vid;
TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
@@ -1259,8 +1256,8 @@ new_device (struct virtio_net *dev)
lcore_info[vdev->coreid].device_num++;
/* Disable notifications. */
- rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
- rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
+ rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
+ rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
RTE_LOG(INFO, VHOST_DATA,
"(%d) device has been added to data core %d\n",
@@ -1316,7 +1313,7 @@ print_stats(void)
"RX total: %" PRIu64 "\n"
"RX dropped: %" PRIu64 "\n"
"RX successful: %" PRIu64 "\n",
- vdev->dev->vid,
+ vdev->vid,
tx_total, tx_dropped, tx,
rx_total, rx_dropped, rx);
}
diff --git a/examples/vhost/main.h b/examples/vhost/main.h
index e99c436..6bb42e8 100644
--- a/examples/vhost/main.h
+++ b/examples/vhost/main.h
@@ -49,8 +49,6 @@ struct device_statistics {
};
struct vhost_dev {
- /**< Pointer to device created by vhost lib. */
- struct virtio_net *dev;
/**< Number of memory regions for gpa to hpa translation. */
uint32_t nregions_hpa;
/**< Device MAC address (Obtained on first TX packet). */
diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index e33ff53..7ef8d34 100644
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -36,7 +36,7 @@ LIB = librte_vhost.a
EXPORT_MAP := rte_vhost_version.map
-LIBABIVER := 2
+LIBABIVER := 3
CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64
ifeq ($(CONFIG_RTE_LIBRTE_VHOST_USER),y)
diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
index 0427461..370345e 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -178,10 +178,10 @@ struct virtio_memory {
*
*/
struct virtio_net_device_ops {
- int (*new_device)(struct virtio_net *); /**< Add device. */
- void (*destroy_device)(volatile struct virtio_net *); /**< Remove device. */
+ int (*new_device)(int vid); /**< Add device. */
+ void (*destroy_device)(int vid); /**< Remove device. */
- int (*vring_state_changed)(struct virtio_net *dev, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */
+ int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */
};
/**
@@ -220,7 +220,7 @@ int rte_vhost_feature_enable(uint64_t feature_mask);
/* Returns currently supported vhost features */
uint64_t rte_vhost_feature_get(void);
-int rte_vhost_enable_guest_notification(struct virtio_net *dev, uint16_t queue_id, int enable);
+int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
/* Register vhost driver. dev_name could be different for multiple instance support. */
int rte_vhost_driver_register(const char *dev_name);
@@ -291,8 +291,8 @@ uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
* be received from the physical port or from another virtual device. A packet
* count is returned to indicate the number of packets that were successfully
* added to the RX queue.
- * @param dev
- * virtio-net device
+ * @param vid
+ * virtio-net device ID
* @param queue_id
* virtio queue index in mq case
* @param pkts
@@ -302,14 +302,14 @@ uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
* @return
* num of packets enqueued
*/
-uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
+uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count);
/**
* This function gets guest buffers from the virtio device TX virtqueue,
* construct host mbufs, copies guest buffer content to host mbufs and
* store them in pkts to be processed.
- * @param dev
+ * @param vid
* virtio-net device
* @param queue_id
* virtio queue index in mq case
@@ -322,7 +322,7 @@ uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
* @return
* num of packets dequeued
*/
-uint16_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
+uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
#endif /* _VIRTIO_NET_H_ */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 8d87508..08cab08 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -46,6 +46,7 @@
#include <rte_arp.h>
#include "vhost-net.h"
+#include "virtio-net.h"
#define MAX_PKT_BURST 32
#define VHOST_LOG_PAGE 4096
@@ -587,9 +588,14 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
}
uint16_t
-rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
{
+ struct virtio_net *dev = get_device(vid);
+
+ if (!dev)
+ return 0;
+
if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
return virtio_dev_merge_rx(dev, queue_id, pkts, count);
else
@@ -815,9 +821,10 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
}
uint16_t
-rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
+rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
+ struct virtio_net *dev;
struct rte_mbuf *rarp_mbuf = NULL;
struct vhost_virtqueue *vq;
uint32_t desc_indexes[MAX_PKT_BURST];
@@ -826,6 +833,10 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
uint16_t free_entries;
uint16_t avail_idx;
+ dev = get_device(vid);
+ if (!dev)
+ return 0;
+
if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
dev->vid, __func__, queue_id);
diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
index 9385af1..7fa69a7 100644
--- a/lib/librte_vhost/vhost_user/virtio-net-user.c
+++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
@@ -117,7 +117,7 @@ user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
/* Remove from the data plane. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
- notify_ops->destroy_device(dev);
+ notify_ops->destroy_device(vid);
}
if (dev->mem) {
@@ -279,6 +279,9 @@ user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
struct vhost_vring_file file;
struct virtio_net *dev = get_device(vid);
+ if (!dev)
+ return;
+
file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
file.fd = VIRTIO_INVALID_EVENTFD;
@@ -289,7 +292,7 @@ user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
vhost_set_vring_kick(vid, &file);
if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
- if (notify_ops->new_device(dev) == 0)
+ if (notify_ops->new_device(vid) == 0)
dev->flags |= VIRTIO_DEV_RUNNING;
}
}
@@ -307,7 +310,7 @@ user_get_vring_base(int vid,
return -1;
/* We have to stop the queue (virtio) if it is running. */
if (dev->flags & VIRTIO_DEV_RUNNING)
- notify_ops->destroy_device(dev);
+ notify_ops->destroy_device(vid);
/* Here we are safe to get the last used index */
vhost_get_vring_base(vid, state->index, state);
@@ -342,9 +345,8 @@ user_set_vring_enable(int vid,
"set queue enable: %d to qp idx: %d\n",
enable, state->index);
- if (notify_ops->vring_state_changed) {
- notify_ops->vring_state_changed(dev, state->index, enable);
- }
+ if (notify_ops->vring_state_changed)
+ notify_ops->vring_state_changed(vid, state->index, enable);
dev->virtqueue[state->index]->enabled = enable;
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index 115eba4..ea216c0 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -296,7 +296,7 @@ vhost_destroy_device(int vid)
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
- notify_ops->destroy_device(dev);
+ notify_ops->destroy_device(vid);
}
cleanup_device(dev, 1);
@@ -354,7 +354,7 @@ vhost_reset_owner(int vid)
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
- notify_ops->destroy_device(dev);
+ notify_ops->destroy_device(vid);
}
cleanup_device(dev, 0);
@@ -718,13 +718,13 @@ vhost_set_backend(int vid, struct vhost_vring_file *file)
if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
if (dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED &&
dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED) {
- if (notify_ops->new_device(dev) < 0)
+ if (notify_ops->new_device(vid) < 0)
return -1;
dev->flags |= VIRTIO_DEV_RUNNING;
}
} else if (file->fd == VIRTIO_DEV_STOPPED) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
- notify_ops->destroy_device(dev);
+ notify_ops->destroy_device(vid);
}
return 0;
@@ -800,9 +800,14 @@ rte_vhost_avail_entries(int vid, uint16_t queue_id)
return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx_res;
}
-int rte_vhost_enable_guest_notification(struct virtio_net *dev,
- uint16_t queue_id, int enable)
+int
+rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return -1;
+
if (enable) {
RTE_LOG(ERR, VHOST_CONFIG,
"guest notification isn't supported.\n");
--
1.9.0
^ permalink raw reply [relevance 13%]
* [dpdk-dev] [PATCH v3 17/20] vhost: reserve few more space for future extension
2016-06-07 3:51 9% ` [dpdk-dev] [PATCH v3 00/20] " Yuanhan Liu
` (2 preceding siblings ...)
2016-06-07 3:52 13% ` [dpdk-dev] [PATCH v3 13/20] vhost: export vid as the only interface to applications Yuanhan Liu
@ 2016-06-07 3:52 4% ` Yuanhan Liu
2016-06-07 3:52 6% ` [dpdk-dev] [PATCH v3 18/20] examples/tep_term: adapt to new vhost ABI/API changes Yuanhan Liu
` (2 subsequent siblings)
6 siblings, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-06-07 3:52 UTC (permalink / raw)
To: dev
Cc: huawei.xie, Thomas Monjalon, Panu Matilainen, Traynor Kevin,
Rich Lane, Tetsuya Mukawa, Yuanhan Liu
"virtio_net_device_ops" is the only left open struct that an application
can access; therefore, it's the only remaining place that might introduce
a potential ABI break in the future when extended.
So, reserve some space in it. 5 entries should be plenty, considering
that we have barely touched it for a long while. Another reason to
choose 5 is cache alignment: 5 makes the struct 64 bytes on a 64-bit
machine.
With this, we can be reasonably confident that this struct will stay
free of ABI breakage for the foreseeable future.
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Tested-by: Rich Lane <rich.lane@bigswitch.com>
Acked-by: Rich Lane <rich.lane@bigswitch.com>
---
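[Editor's illustration, not part of the patch: the intended size could be
asserted at build time. This sketch assumes the struct carries three
callbacks plus the five reserved pointers, i.e. eight pointers in total:
	/* illustrative check: 3 callbacks + 5 reserved pointers = 8 pointers,
	 * i.e. 64 bytes (one cache line) on a 64-bit machine */
	RTE_BUILD_BUG_ON(sizeof(struct virtio_net_device_ops) != 8 * sizeof(void *));
]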
lib/librte_vhost/rte_virtio_net.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
index fc1d799..bc2b74b 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -66,6 +66,8 @@ struct virtio_net_device_ops {
void (*destroy_device)(int vid); /**< Remove device. */
int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */
+
+ void *reserved[5]; /**< Reserved for future extension */
};
/**
--
1.9.0
^ permalink raw reply [relevance 4%]
* [dpdk-dev] [PATCH v3 18/20] examples/tep_term: adapt to new vhost ABI/API changes
2016-06-07 3:51 9% ` [dpdk-dev] [PATCH v3 00/20] " Yuanhan Liu
` (3 preceding siblings ...)
2016-06-07 3:52 4% ` [dpdk-dev] [PATCH v3 17/20] vhost: reserve few more space for future extension Yuanhan Liu
@ 2016-06-07 3:52 6% ` Yuanhan Liu
2016-06-14 12:00 4% ` [dpdk-dev] [PATCH v3 00/20] vhost ABI/API refactoring Yuanhan Liu
2016-06-30 7:39 9% ` Panu Matilainen
6 siblings, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-06-07 3:52 UTC (permalink / raw)
To: dev
Cc: huawei.xie, Thomas Monjalon, Panu Matilainen, Traynor Kevin,
Rich Lane, Tetsuya Mukawa, Yuanhan Liu
Adapt to the new vhost ABI/API refactoring changes so as not to break the
build. It's a straightforward change: replace "struct virtio_net *dev"
with "int vid". Simple build test only so far.
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
---
tep_term is built on top of the vhost switch example; they shared a lot
of code (before the vhost example cleanup). Ideally, we should move the
vxlan part into the vhost example and introduce an option to enable it.
However, that would take more effort, including the effort of making it
work together with the VLAN and VMDq stuff as well as the effort of not
breaking anything, so it's better to start simple first: just do a
plain ABI/API adaptation.
---
examples/tep_termination/main.c | 83 +++++++++++++++++-----------------
examples/tep_termination/main.h | 5 +-
examples/tep_termination/vxlan_setup.c | 20 ++++----
examples/tep_termination/vxlan_setup.h | 6 +--
4 files changed, 57 insertions(+), 57 deletions(-)
diff --git a/examples/tep_termination/main.c b/examples/tep_termination/main.c
index b8297dd..32eb925 100644
--- a/examples/tep_termination/main.c
+++ b/examples/tep_termination/main.c
@@ -566,10 +566,9 @@ virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m)
struct rte_mbuf **m_table;
unsigned len, ret = 0;
const uint16_t lcore_id = rte_lcore_id();
- struct virtio_net *dev = vdev->dev;
+ int vid = vdev->vid;
- RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: MAC address is external\n",
- dev->device_fh);
+ RTE_LOG(DEBUG, VHOST_DATA, "(%d) TX: MAC address is external\n", vid);
/* Add packet to the port tx queue */
tx_q = &lcore_tx_queue[lcore_id];
@@ -578,8 +577,8 @@ virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m)
tx_q->m_table[len] = m;
len++;
if (enable_stats) {
- dev_statistics[dev->device_fh].tx_total++;
- dev_statistics[dev->device_fh].tx++;
+ dev_statistics[vid].tx_total++;
+ dev_statistics[vid].tx++;
}
if (unlikely(len == MAX_PKT_BURST)) {
@@ -614,7 +613,7 @@ static int
switch_worker(__rte_unused void *arg)
{
struct rte_mempool *mbuf_pool = arg;
- struct virtio_net *dev = NULL;
+ int vid;
struct vhost_dev *vdev = NULL;
struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
struct virtio_net_data_ll *dev_ll;
@@ -688,7 +687,7 @@ switch_worker(__rte_unused void *arg)
while (dev_ll != NULL) {
vdev = dev_ll->vdev;
- dev = vdev->dev;
+ vid = vdev->vid;
if (unlikely(vdev->remove)) {
dev_ll = dev_ll->next;
@@ -709,22 +708,22 @@ switch_worker(__rte_unused void *arg)
* must be less than virtio queue size
*/
if (enable_retry && unlikely(rx_count >
- rte_vring_available_entries(dev, VIRTIO_RXQ))) {
+ rte_vhost_avail_entries(vid, VIRTIO_RXQ))) {
for (retry = 0; retry < burst_rx_retry_num;
retry++) {
rte_delay_us(burst_rx_delay_time);
- if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
+ if (rx_count <= rte_vhost_avail_entries(vid, VIRTIO_RXQ))
break;
}
}
- ret_count = overlay_options.rx_handle(dev, pkts_burst, rx_count);
+ ret_count = overlay_options.rx_handle(vid, pkts_burst, rx_count);
if (enable_stats) {
rte_atomic64_add(
- &dev_statistics[dev->device_fh].rx_total_atomic,
+ &dev_statistics[vid].rx_total_atomic,
rx_count);
rte_atomic64_add(
- &dev_statistics[dev->device_fh].rx_atomic, ret_count);
+ &dev_statistics[vid].rx_atomic, ret_count);
}
while (likely(rx_count)) {
rx_count--;
@@ -736,7 +735,7 @@ switch_worker(__rte_unused void *arg)
if (likely(!vdev->remove)) {
/* Handle guest TX*/
- tx_count = rte_vhost_dequeue_burst(dev,
+ tx_count = rte_vhost_dequeue_burst(vid,
VIRTIO_TXQ, mbuf_pool,
pkts_burst, MAX_PKT_BURST);
/* If this is the first received packet we need to learn the MAC */
@@ -913,18 +912,24 @@ init_data_ll(void)
* loop in the rte_pause loop.
*/
static void
-destroy_device(volatile struct virtio_net *dev)
+destroy_device(int vid)
{
struct virtio_net_data_ll *ll_lcore_dev_cur;
struct virtio_net_data_ll *ll_main_dev_cur;
struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
struct virtio_net_data_ll *ll_main_dev_last = NULL;
- struct vhost_dev *vdev;
+ struct vhost_dev *vdev = NULL;
int lcore;
- dev->flags &= ~VIRTIO_DEV_RUNNING;
-
- vdev = (struct vhost_dev *)dev->priv;
+ ll_main_dev_cur = ll_root_used;
+ while (ll_main_dev_cur != NULL) {
+ if (ll_main_dev_cur->vdev->vid == vid) {
+ vdev = ll_main_dev_cur->vdev;
+ break;
+ }
+ ll_main_dev_cur = ll_main_dev_cur->next;
+ }
+ if (!vdev)
+ return;
/* set the remove flag. */
vdev->remove = 1;
@@ -944,8 +949,7 @@ destroy_device(volatile struct virtio_net *dev)
if (ll_lcore_dev_cur == NULL) {
RTE_LOG(ERR, VHOST_CONFIG,
- "(%"PRIu64") Failed to find the dev to be destroy.\n",
- dev->device_fh);
+ "(%d) Failed to find the dev to be destroyed.\n", vid);
return;
}
@@ -992,8 +996,8 @@ destroy_device(volatile struct virtio_net *dev)
/* Decrement number of device on the lcore. */
lcore_info[vdev->coreid].lcore_ll->device_num--;
- RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed "
- "from data core\n", dev->device_fh);
+ RTE_LOG(INFO, VHOST_DATA, "(%d) Device has been removed "
+ "from data core\n", vid);
rte_free(vdev);
@@ -1004,7 +1008,7 @@ destroy_device(volatile struct virtio_net *dev)
* to the main linked list and the allocated to a specific data core.
*/
static int
-new_device(struct virtio_net *dev)
+new_device(int vid)
{
struct virtio_net_data_ll *ll_dev;
int lcore, core_add = 0;
@@ -1014,18 +1018,16 @@ new_device(struct virtio_net *dev)
vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
if (vdev == NULL) {
RTE_LOG(INFO, VHOST_DATA,
- "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
- dev->device_fh);
+ "(%d) Couldn't allocate memory for vhost dev\n", vid);
return -1;
}
- vdev->dev = dev;
- dev->priv = vdev;
+ vdev->vid = vid;
/* Add device to main ll */
ll_dev = get_data_ll_free_entry(&ll_root_free);
if (ll_dev == NULL) {
- RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in"
+ RTE_LOG(INFO, VHOST_DATA, "(%d) No free entry found in"
" linked list Device limit of %d devices per core"
- " has been reached\n", dev->device_fh, nb_devices);
+ " has been reached\n", vid, nb_devices);
if (vdev->regions_hpa)
rte_free(vdev->regions_hpa);
rte_free(vdev);
@@ -1033,7 +1035,7 @@ new_device(struct virtio_net *dev)
}
ll_dev->vdev = vdev;
add_data_ll_entry(&ll_root_used, ll_dev);
- vdev->rx_q = dev->device_fh;
+ vdev->rx_q = vid;
/* reset ready flag */
vdev->ready = DEVICE_MAC_LEARNING;
@@ -1050,10 +1052,9 @@ new_device(struct virtio_net *dev)
ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
if (ll_dev == NULL) {
RTE_LOG(INFO, VHOST_DATA,
- "(%"PRIu64") Failed to add device to data core\n",
- dev->device_fh);
+ "(%d) Failed to add device to data core\n", vid);
vdev->ready = DEVICE_SAFE_REMOVE;
- destroy_device(dev);
+ destroy_device(vid);
rte_free(vdev->regions_hpa);
rte_free(vdev);
return -1;
@@ -1065,17 +1066,17 @@ new_device(struct virtio_net *dev)
ll_dev);
/* Initialize device stats */
- memset(&dev_statistics[dev->device_fh], 0,
+ memset(&dev_statistics[vid], 0,
sizeof(struct device_statistics));
/* Disable notifications. */
- rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
- rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
+ rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
+ rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
lcore_info[vdev->coreid].lcore_ll->device_num++;
- dev->flags |= VIRTIO_DEV_RUNNING;
- RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n",
- dev->device_fh, vdev->coreid);
+ RTE_LOG(INFO, VHOST_DATA,
+ "(%d) Device has been added to data core %d\n",
+ vid, vdev->coreid);
return 0;
}
@@ -1113,7 +1114,7 @@ print_stats(void)
dev_ll = ll_root_used;
while (dev_ll != NULL) {
- device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
+ device_fh = dev_ll->vdev->vid;
tx_total = dev_statistics[device_fh].tx_total;
tx = dev_statistics[device_fh].tx;
tx_dropped = tx_total - tx;
@@ -1257,7 +1258,7 @@ main(int argc, char *argv[])
rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
/* Register CUSE device to handle IOCTLs. */
- ret = rte_vhost_driver_register((char *)&dev_basename);
+ ret = rte_vhost_driver_register(dev_basename, 0);
if (ret != 0)
rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
diff --git a/examples/tep_termination/main.h b/examples/tep_termination/main.h
index 4b123ab..c0ea766 100644
--- a/examples/tep_termination/main.h
+++ b/examples/tep_termination/main.h
@@ -71,8 +71,7 @@ struct device_statistics {
* Device linked list structure for data path.
*/
struct vhost_dev {
- /**< Pointer to device created by vhost lib. */
- struct virtio_net *dev;
+ int vid;
/**< Number of memory regions for gpa to hpa translation. */
uint32_t nregions_hpa;
/**< Memory region information for gpa to hpa translation. */
@@ -116,6 +115,6 @@ struct virtio_net_data_ll {
};
uint32_t
-virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count);
+virtio_dev_rx(int vid, struct rte_mbuf **pkts, uint32_t count);
#endif /* _MAIN_H_ */
diff --git a/examples/tep_termination/vxlan_setup.c b/examples/tep_termination/vxlan_setup.c
index 2a48e14..58bc334 100644
--- a/examples/tep_termination/vxlan_setup.c
+++ b/examples/tep_termination/vxlan_setup.c
@@ -244,17 +244,17 @@ vxlan_link(struct vhost_dev *vdev, struct rte_mbuf *m)
{
int i, ret;
struct ether_hdr *pkt_hdr;
- struct virtio_net *dev = vdev->dev;
- uint64_t portid = dev->device_fh;
+ int vid = vdev->vid;
+ uint64_t portid = vid;
struct ipv4_hdr *ip;
struct rte_eth_tunnel_filter_conf tunnel_filter_conf;
if (unlikely(portid > VXLAN_N_PORTS)) {
RTE_LOG(INFO, VHOST_DATA,
- "(%"PRIu64") WARNING: Not configuring device,"
+ "(%d) WARNING: Not configuring device,"
"as already have %d ports for VXLAN.",
- dev->device_fh, VXLAN_N_PORTS);
+ vid, VXLAN_N_PORTS);
return -1;
}
@@ -262,9 +262,9 @@ vxlan_link(struct vhost_dev *vdev, struct rte_mbuf *m)
pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
if (is_same_ether_addr(&(pkt_hdr->s_addr), &vdev->mac_address)) {
RTE_LOG(INFO, VHOST_DATA,
- "(%"PRIu64") WARNING: This device is using an existing"
+ "(%d) WARNING: This device is using an existing"
" MAC address and has not been registered.\n",
- dev->device_fh);
+ vid);
return -1;
}
@@ -425,7 +425,7 @@ vxlan_tx_pkts(uint8_t port_id, uint16_t queue_id,
/* Check for decapsulation and pass packets directly to VIRTIO device */
int
-vxlan_rx_pkts(struct virtio_net *dev, struct rte_mbuf **pkts_burst,
+vxlan_rx_pkts(int vid, struct rte_mbuf **pkts_burst,
uint32_t rx_count)
{
uint32_t i = 0;
@@ -436,11 +436,11 @@ vxlan_rx_pkts(struct virtio_net *dev, struct rte_mbuf **pkts_burst,
for (i = 0; i < rx_count; i++) {
if (enable_stats) {
rte_atomic64_add(
- &dev_statistics[dev->device_fh].rx_bad_ip_csum,
+ &dev_statistics[vid].rx_bad_ip_csum,
(pkts_burst[i]->ol_flags & PKT_RX_IP_CKSUM_BAD)
!= 0);
rte_atomic64_add(
- &dev_statistics[dev->device_fh].rx_bad_ip_csum,
+ &dev_statistics[vid].rx_bad_ip_csum,
(pkts_burst[i]->ol_flags & PKT_RX_L4_CKSUM_BAD)
!= 0);
}
@@ -452,6 +452,6 @@ vxlan_rx_pkts(struct virtio_net *dev, struct rte_mbuf **pkts_burst,
count++;
}
- ret = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_valid, count);
+ ret = rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts_valid, count);
return ret;
}
diff --git a/examples/tep_termination/vxlan_setup.h b/examples/tep_termination/vxlan_setup.h
index 1846540..8d26461 100644
--- a/examples/tep_termination/vxlan_setup.h
+++ b/examples/tep_termination/vxlan_setup.h
@@ -55,10 +55,10 @@ typedef void (*ol_tunnel_destroy_t)(struct vhost_dev *vdev);
typedef int (*ol_tx_handle_t)(uint8_t port_id, uint16_t queue_id,
struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
-typedef int (*ol_rx_handle_t)(struct virtio_net *dev, struct rte_mbuf **pkts,
+typedef int (*ol_rx_handle_t)(int vid, struct rte_mbuf **pkts,
uint32_t count);
-typedef int (*ol_param_handle)(struct virtio_net *dev);
+typedef int (*ol_param_handle)(int vid);
struct ol_switch_ops {
ol_port_configure_t port_configure;
@@ -82,6 +82,6 @@ int
vxlan_tx_pkts(uint8_t port_id, uint16_t queue_id,
struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
int
-vxlan_rx_pkts(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count);
+vxlan_rx_pkts(int vid, struct rte_mbuf **pkts, uint32_t count);
#endif /* VXLAN_SETUP_H_ */
--
1.9.0
^ permalink raw reply [relevance 6%]
* [dpdk-dev] [PATCH v3 0/6] vhost: add vhost-user client mode and reconnect ability
@ 2016-06-07 4:05 3% ` Yuanhan Liu
2016-06-14 12:00 0% ` Yuanhan Liu
0 siblings, 1 reply; 200+ results
From: Yuanhan Liu @ 2016-06-07 4:05 UTC (permalink / raw)
To: dev; +Cc: huawei.xie, Traynor Kevin, marcandre.lureau, Yuanhan Liu
v3: - make the "reconnect" feature the default for client mode, as it's
good to handle guest OS restart with less effort.
- fix var not-initialized error pointed out by Rich
NOTE: I created a branch at dpdk.org [0] for more convenient testing:
[0]: git://dpdk.org/next/dpdk-next-virtio for-testing
When the DPDK vhost-user application (such as OVS) restarts (due to a
crash, or an update), the vhost-user connection between DPDK and QEMU
won't be established automatically again. In other words, the virtio
net is broken.
The reason it doesn't work is that DPDK acts as server only.
A restart of the server needs a reconnection from the client (QEMU);
however, reconnecting is not supported by QEMU.
Adding support for client mode and letting DPDK be the client would
resolve the above issue more easily: a restart of DPDK would naturally
try to connect to the server (QEMU) automatically.
Therefore, this patchset implements the DPDK vhost-user client mode, by
introducing a new arg (flags) for the rte_vhost_driver_register() API.
The client mode is enabled when RTE_VHOST_USER_CLIENT is given. Note that
this implies an API breakage. However, since this release deals with
ABI/API refactoring, it should not be an issue.
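[Editor's illustration: a registration call before and after this series
would look like the following (the socket path is made up):
	/* before: server mode only */
	ret = rte_vhost_driver_register("/tmp/vhost-user.sock");
	/* after: a flags argument is added; RTE_VHOST_USER_CLIENT enables
	 * client mode, while 0 keeps the old server-mode behaviour */
	ret = rte_vhost_driver_register("/tmp/vhost-user.sock", RTE_VHOST_USER_CLIENT);
]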
Another interesting aspect of making it work is that you not only have
to consider the case where the DPDK vhost-user app restarts, but also
the case where QEMU restarts: the guest OS sometimes just reboots. In
such a case, when the server is down, the client has to keep
reconnecting to the server until the server is back and the connection
is established again. That's what the "reconnect" patch is for.
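[Editor's illustration: a rough sketch of that retry logic. The struct and
helper names here are made up, and as noted in the v2 changelog below, the
actual series uses a single thread to handle all reconnections:
	static void *
	vhost_user_client_reconnect(void *arg)
	{
		struct vhost_user_socket *vsocket = arg;	/* hypothetical struct */
		/* keep retrying until the server (QEMU) is back */
		while (connect(vsocket->connfd, (struct sockaddr *)&vsocket->un,
				sizeof(vsocket->un)) < 0)
			sleep(1);
		vhost_user_add_connection(vsocket->connfd, vsocket);	/* hypothetical */
		return NULL;
	}
]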
Note that current QEMU does not support a second connection from the
client, thus a restart of DPDK vhost-user will not work. This is
because current QEMU won't be able to detect the disconnect caused by
the restart, thus it will not listen for later connections. Patches [1]
have been sent; they are just not merged yet. But unlike the vhost-user
multiple queue case, where we had a critical dependency on the QEMU
implementation, here we have no such dependency. Therefore, I think
it's okay to make DPDK ready for the "reconnect" stuff first. (Note
that I also mentioned this fact in the release doc.)
[1]: http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg01507.html
v2: - added release doc
- do not remove socket file for the client mode
- create one thread only to handle all reconnects
Thanks.
--yliu
---
Yuanhan Liu (6):
vhost: rename structs for enabling client mode
vhost: add vhost-user client mode
vhost: add reconnect ability
vhost: workaround stale vring base
examples/vhost: add client option
vhost: add pmd client option
doc/guides/rel_notes/release_16_07.rst | 21 ++
drivers/net/vhost/rte_eth_vhost.c | 38 ++-
examples/vhost/main.c | 12 +-
lib/librte_vhost/rte_virtio_net.h | 12 +-
lib/librte_vhost/vhost_cuse/vhost-net-cdev.c | 8 +-
lib/librte_vhost/vhost_user/vhost-net-user.c | 403 ++++++++++++++++++---------
lib/librte_vhost/vhost_user/vhost-net-user.h | 6 -
lib/librte_vhost/virtio-net.c | 9 +
8 files changed, 361 insertions(+), 148 deletions(-)
--
1.9.0
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v2 2/8] lib/librte_ether: define RX/TX lock mode
@ 2016-06-07 5:45 4% ` Zhe Tao
0 siblings, 0 replies; 200+ results
From: Zhe Tao @ 2016-06-07 5:45 UTC (permalink / raw)
To: dev
Cc: wenzhuo.lu, zhe.tao, konstantin.ananyev, bruce.richardson,
jing.d.chen, cunming.liang, jingjing.wu, helin.zhang
From: Wenzhuo Lu <wenzhuo.lu@intel.com>
Define a lock mode for RX/TX queues, because when resetting
the device we want the resetting thread to take the lock
of each RX/TX queue to make sure RX/TX is stopped.
Use the NEXT_ABI macro for this ABI change, as it has too
much impact: 7 APIs and 1 global variable are affected.
Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
Signed-off-by: Zhe Tao <zhe.tao@intel.com>
Signed-off-by: zhe.tao <zhe.tao@intel.com>
---
lib/librte_ether/rte_ethdev.h | 62 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 62 insertions(+)
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 74e895f..4efb5e9 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -354,7 +354,12 @@ struct rte_eth_rxmode {
jumbo_frame : 1, /**< Jumbo Frame Receipt enable. */
hw_strip_crc : 1, /**< Enable CRC stripping by hardware. */
enable_scatter : 1, /**< Enable scatter packets rx handler */
+#ifndef RTE_NEXT_ABI
enable_lro : 1; /**< Enable LRO */
+#else
+ enable_lro : 1, /**< Enable LRO */
+ lock_mode : 1; /**< Using lock path */
+#endif
};
/**
@@ -634,11 +639,68 @@ struct rte_eth_txmode {
/**< If set, reject sending out tagged pkts */
hw_vlan_reject_untagged : 1,
/**< If set, reject sending out untagged pkts */
+#ifndef RTE_NEXT_ABI
hw_vlan_insert_pvid : 1;
/**< If set, enable port based VLAN insertion */
+#else
+ hw_vlan_insert_pvid : 1,
+ /**< If set, enable port based VLAN insertion */
+ lock_mode : 1;
+ /**< If set, using lock path */
+#endif
};
/**
+ * The macros for the RX/TX lock mode functions
+ */
+#ifdef RTE_NEXT_ABI
+#define RX_LOCK_FUNCTION(dev, func) \
+ (dev->data->dev_conf.rxmode.lock_mode ? \
+ func ## _lock : func)
+
+#define TX_LOCK_FUNCTION(dev, func) \
+ (dev->data->dev_conf.txmode.lock_mode ? \
+ func ## _lock : func)
+#else
+#define RX_LOCK_FUNCTION(dev, func) func
+
+#define TX_LOCK_FUNCTION(dev, func) func
+#endif
+
+/* Add the lock RX/TX function for VF reset */
+#define GENERATE_RX_LOCK(func, nic) \
+uint16_t func ## _lock(void *rx_queue, \
+ struct rte_mbuf **rx_pkts, \
+ uint16_t nb_pkts) \
+{ \
+ struct nic ## _rx_queue *rxq = rx_queue; \
+ uint16_t nb_rx = 0; \
+ \
+ if (rte_spinlock_trylock(&rxq->rx_lock)) { \
+ nb_rx = func(rx_queue, rx_pkts, nb_pkts); \
+ rte_spinlock_unlock(&rxq->rx_lock); \
+ } \
+ \
+ return nb_rx; \
+}
+
+#define GENERATE_TX_LOCK(func, nic) \
+uint16_t func ## _lock(void *tx_queue, \
+ struct rte_mbuf **tx_pkts, \
+ uint16_t nb_pkts) \
+{ \
+ struct nic ## _tx_queue *txq = tx_queue; \
+ uint16_t nb_tx = 0; \
+ \
+ if (rte_spinlock_trylock(&txq->tx_lock)) { \
+ nb_tx = func(tx_queue, tx_pkts, nb_pkts); \
+ rte_spinlock_unlock(&txq->tx_lock); \
+ } \
+ \
+ return nb_tx; \
+}
+
+/**
* A structure used to configure an RX ring of an Ethernet port.
*/
struct rte_eth_rxconf {
--
2.1.4
^ permalink raw reply [relevance 4%]
* [dpdk-dev] [PATCH v3 2/8] lib/librte_ether: define RX/TX lock mode
@ 2016-06-07 6:12 4% ` Zhe Tao
0 siblings, 0 replies; 200+ results
From: Zhe Tao @ 2016-06-07 6:12 UTC (permalink / raw)
To: dev
Cc: wenzhuo.lu, zhe.tao, konstantin.ananyev, bruce.richardson,
jing.d.chen, cunming.liang, jingjing.wu, helin.zhang
Define a lock mode for RX/TX queues, because when resetting
the device we want the resetting thread to take the lock
of each RX/TX queue to make sure RX/TX is stopped.
Use the NEXT_ABI macro for this ABI change, as it has too
much impact: 7 APIs and 1 global variable are affected.
Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
Signed-off-by: Zhe Tao <zhe.tao@intel.com>
---
lib/librte_ether/rte_ethdev.h | 62 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 62 insertions(+)
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 74e895f..4efb5e9 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -354,7 +354,12 @@ struct rte_eth_rxmode {
jumbo_frame : 1, /**< Jumbo Frame Receipt enable. */
hw_strip_crc : 1, /**< Enable CRC stripping by hardware. */
enable_scatter : 1, /**< Enable scatter packets rx handler */
+#ifndef RTE_NEXT_ABI
enable_lro : 1; /**< Enable LRO */
+#else
+ enable_lro : 1, /**< Enable LRO */
+ lock_mode : 1; /**< Using lock path */
+#endif
};
/**
@@ -634,11 +639,68 @@ struct rte_eth_txmode {
/**< If set, reject sending out tagged pkts */
hw_vlan_reject_untagged : 1,
/**< If set, reject sending out untagged pkts */
+#ifndef RTE_NEXT_ABI
hw_vlan_insert_pvid : 1;
/**< If set, enable port based VLAN insertion */
+#else
+ hw_vlan_insert_pvid : 1,
+ /**< If set, enable port based VLAN insertion */
+ lock_mode : 1;
+ /**< If set, using lock path */
+#endif
};
/**
+ * The macros for the RX/TX lock mode functions
+ */
+#ifdef RTE_NEXT_ABI
+#define RX_LOCK_FUNCTION(dev, func) \
+ (dev->data->dev_conf.rxmode.lock_mode ? \
+ func ## _lock : func)
+
+#define TX_LOCK_FUNCTION(dev, func) \
+ (dev->data->dev_conf.txmode.lock_mode ? \
+ func ## _lock : func)
+#else
+#define RX_LOCK_FUNCTION(dev, func) func
+
+#define TX_LOCK_FUNCTION(dev, func) func
+#endif
+
+/* Add the lock RX/TX function for VF reset */
+#define GENERATE_RX_LOCK(func, nic) \
+uint16_t func ## _lock(void *rx_queue, \
+ struct rte_mbuf **rx_pkts, \
+ uint16_t nb_pkts) \
+{ \
+ struct nic ## _rx_queue *rxq = rx_queue; \
+ uint16_t nb_rx = 0; \
+ \
+ if (rte_spinlock_trylock(&rxq->rx_lock)) { \
+ nb_rx = func(rx_queue, rx_pkts, nb_pkts); \
+ rte_spinlock_unlock(&rxq->rx_lock); \
+ } \
+ \
+ return nb_rx; \
+}
+
+#define GENERATE_TX_LOCK(func, nic) \
+uint16_t func ## _lock(void *tx_queue, \
+ struct rte_mbuf **tx_pkts, \
+ uint16_t nb_pkts) \
+{ \
+ struct nic ## _tx_queue *txq = tx_queue; \
+ uint16_t nb_tx = 0; \
+ \
+ if (rte_spinlock_trylock(&txq->tx_lock)) { \
+ nb_tx = func(tx_queue, tx_pkts, nb_pkts); \
+ rte_spinlock_unlock(&txq->tx_lock); \
+ } \
+ \
+ return nb_tx; \
+}
+
+/**
* A structure used to configure an RX ring of an Ethernet port.
*/
struct rte_eth_rxconf {
--
2.1.4
^ permalink raw reply [relevance 4%]
* [dpdk-dev] [PATCH v4 2/8] lib/librte_ether: define RX/TX lock mode
@ 2016-06-07 6:53 4% ` Zhe Tao
2016-06-07 9:58 0% ` Ananyev, Konstantin
0 siblings, 1 reply; 200+ results
From: Zhe Tao @ 2016-06-07 6:53 UTC (permalink / raw)
To: dev
Cc: wenzhuo.lu, zhe.tao, konstantin.ananyev, bruce.richardson,
jing.d.chen, cunming.liang, jingjing.wu, helin.zhang
Define a lock mode for RX/TX queues, because when resetting
the device we want the resetting thread to take the lock
of each RX/TX queue to make sure RX/TX is stopped.
Use the NEXT_ABI macro for this ABI change, as it has too
much impact: 7 APIs and 1 global variable are affected.
Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
Signed-off-by: Zhe Tao <zhe.tao@intel.com>
---
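[Editor's illustration: this is how a PMD could consume the macros below.
ixgbe is used as a hypothetical example, and it assumes struct
ixgbe_rx_queue gains an rx_lock spinlock field:
	/* emits ixgbe_recv_pkts_lock(), a spinlock-guarded wrapper
	 * around ixgbe_recv_pkts() */
	GENERATE_RX_LOCK(ixgbe_recv_pkts, ixgbe)
	/* at setup time, pick the lock or lock-free variant */
	dev->rx_pkt_burst = RX_LOCK_FUNCTION(dev, ixgbe_recv_pkts);
]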
lib/librte_ether/rte_ethdev.h | 62 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 62 insertions(+)
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 74e895f..4efb5e9 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -354,7 +354,12 @@ struct rte_eth_rxmode {
jumbo_frame : 1, /**< Jumbo Frame Receipt enable. */
hw_strip_crc : 1, /**< Enable CRC stripping by hardware. */
enable_scatter : 1, /**< Enable scatter packets rx handler */
+#ifndef RTE_NEXT_ABI
enable_lro : 1; /**< Enable LRO */
+#else
+ enable_lro : 1, /**< Enable LRO */
+ lock_mode : 1; /**< Using lock path */
+#endif
};
/**
@@ -634,11 +639,68 @@ struct rte_eth_txmode {
/**< If set, reject sending out tagged pkts */
hw_vlan_reject_untagged : 1,
/**< If set, reject sending out untagged pkts */
+#ifndef RTE_NEXT_ABI
hw_vlan_insert_pvid : 1;
/**< If set, enable port based VLAN insertion */
+#else
+ hw_vlan_insert_pvid : 1,
+ /**< If set, enable port based VLAN insertion */
+ lock_mode : 1;
+ /**< If set, using lock path */
+#endif
};
/**
+ * The macros for the RX/TX lock mode functions
+ */
+#ifdef RTE_NEXT_ABI
+#define RX_LOCK_FUNCTION(dev, func) \
+ (dev->data->dev_conf.rxmode.lock_mode ? \
+ func ## _lock : func)
+
+#define TX_LOCK_FUNCTION(dev, func) \
+ (dev->data->dev_conf.txmode.lock_mode ? \
+ func ## _lock : func)
+#else
+#define RX_LOCK_FUNCTION(dev, func) func
+
+#define TX_LOCK_FUNCTION(dev, func) func
+#endif
+
+/* Add the lock RX/TX function for VF reset */
+#define GENERATE_RX_LOCK(func, nic) \
+uint16_t func ## _lock(void *rx_queue, \
+ struct rte_mbuf **rx_pkts, \
+ uint16_t nb_pkts) \
+{ \
+ struct nic ## _rx_queue *rxq = rx_queue; \
+ uint16_t nb_rx = 0; \
+ \
+ if (rte_spinlock_trylock(&rxq->rx_lock)) { \
+ nb_rx = func(rx_queue, rx_pkts, nb_pkts); \
+ rte_spinlock_unlock(&rxq->rx_lock); \
+ } \
+ \
+ return nb_rx; \
+}
+
+#define GENERATE_TX_LOCK(func, nic) \
+uint16_t func ## _lock(void *tx_queue, \
+ struct rte_mbuf **tx_pkts, \
+ uint16_t nb_pkts) \
+{ \
+ struct nic ## _tx_queue *txq = tx_queue; \
+ uint16_t nb_tx = 0; \
+ \
+ if (rte_spinlock_trylock(&txq->tx_lock)) { \
+ nb_tx = func(tx_queue, tx_pkts, nb_pkts); \
+ rte_spinlock_unlock(&txq->tx_lock); \
+ } \
+ \
+ return nb_tx; \
+}
+
+/**
* A structure used to configure an RX ring of an Ethernet port.
*/
struct rte_eth_rxconf {
--
2.1.4
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [PATCH v4 2/8] lib/librte_ether: define RX/TX lock mode
2016-06-07 6:53 4% ` [dpdk-dev] [PATCH v4 2/8] lib/librte_ether: define RX/TX lock mode Zhe Tao
@ 2016-06-07 9:58 0% ` Ananyev, Konstantin
2016-06-08 7:24 3% ` Lu, Wenzhuo
0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2016-06-07 9:58 UTC (permalink / raw)
To: Tao, Zhe, dev
Cc: Lu, Wenzhuo, Richardson, Bruce, Chen, Jing D, Liang, Cunming, Wu,
Jingjing, Zhang, Helin
Hi Zhe & Wenzhuo,
Please find my comments below.
BTW, for clarification - is that patch for 16.11?
I believe it's too late to introduce such a significant change in 16.07.
Thanks
Konstantin
> Define a lock mode for RX/TX queues, because when resetting
> the device we want the resetting thread to take the lock
> of each RX/TX queue to make sure RX/TX is stopped.
>
> Use the NEXT_ABI macro for this ABI change, as it has too
> much impact: 7 APIs and 1 global variable are affected.
>
> Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Signed-off-by: Zhe Tao <zhe.tao@intel.com>
> ---
> lib/librte_ether/rte_ethdev.h | 62 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 62 insertions(+)
>
> diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
> index 74e895f..4efb5e9 100644
> --- a/lib/librte_ether/rte_ethdev.h
> +++ b/lib/librte_ether/rte_ethdev.h
> @@ -354,7 +354,12 @@ struct rte_eth_rxmode {
> jumbo_frame : 1, /**< Jumbo Frame Receipt enable. */
> hw_strip_crc : 1, /**< Enable CRC stripping by hardware. */
> enable_scatter : 1, /**< Enable scatter packets rx handler */
> +#ifndef RTE_NEXT_ABI
> enable_lro : 1; /**< Enable LRO */
> +#else
> + enable_lro : 1, /**< Enable LRO */
> + lock_mode : 1; /**< Using lock path */
> +#endif
> };
>
> /**
> @@ -634,11 +639,68 @@ struct rte_eth_txmode {
> /**< If set, reject sending out tagged pkts */
> hw_vlan_reject_untagged : 1,
> /**< If set, reject sending out untagged pkts */
> +#ifndef RTE_NEXT_ABI
> hw_vlan_insert_pvid : 1;
> /**< If set, enable port based VLAN insertion */
> +#else
> + hw_vlan_insert_pvid : 1,
> + /**< If set, enable port based VLAN insertion */
> + lock_mode : 1;
> + /**< If set, using lock path */
> +#endif
> };
>
> /**
> + * The macros for the RX/TX lock mode functions
> + */
> +#ifdef RTE_NEXT_ABI
> +#define RX_LOCK_FUNCTION(dev, func) \
> + (dev->data->dev_conf.rxmode.lock_mode ? \
> + func ## _lock : func)
> +
> +#define TX_LOCK_FUNCTION(dev, func) \
> + (dev->data->dev_conf.txmode.lock_mode ? \
> + func ## _lock : func)
> +#else
> +#define RX_LOCK_FUNCTION(dev, func) func
> +
> +#define TX_LOCK_FUNCTION(dev, func) func
> +#endif
> +
> +/* Add the lock RX/TX function for VF reset */
> +#define GENERATE_RX_LOCK(func, nic) \
> +uint16_t func ## _lock(void *rx_queue, \
> + struct rte_mbuf **rx_pkts, \
> + uint16_t nb_pkts) \
> +{ \
> + struct nic ## _rx_queue *rxq = rx_queue; \
> + uint16_t nb_rx = 0; \
> + \
> + if (rte_spinlock_trylock(&rxq->rx_lock)) { \
> + nb_rx = func(rx_queue, rx_pkts, nb_pkts); \
> + rte_spinlock_unlock(&rxq->rx_lock); \
> + } \
> + \
> + return nb_rx; \
> +}
> +
> +#define GENERATE_TX_LOCK(func, nic) \
> +uint16_t func ## _lock(void *tx_queue, \
> + struct rte_mbuf **tx_pkts, \
> + uint16_t nb_pkts) \
> +{ \
> + struct nic ## _tx_queue *txq = tx_queue; \
> + uint16_t nb_tx = 0; \
> + \
> + if (rte_spinlock_trylock(&txq->tx_lock)) { \
> + nb_tx = func(tx_queue, tx_pkts, nb_pkts); \
> + rte_spinlock_unlock(&txq->tx_lock); \
> + } \
> + \
> + return nb_tx; \
> +}
1. As I said in the off-line discussion, I think this locking could
(and, I think, better be) implemented completely at the rte_ethdev layer,
so the actual PMD code will be unaffected.
That also avoids introducing a _lock version of every RX/TX function
in each PMD.
2. Again, as discussed offline, I think it is better to have an explicit
rte_eth_(rx|tx)_burst_lock(sync?) API, instead of adding new fields into
the RX/TX config structures.
That would help to avoid any confusion, I think.
3. I thought the plan was to introduce locking in all appropriate control path
functions (dev_start/dev_stop etc.).
Without that, the locking version of RX/TX seems a bit useless.
Yes, I understand that you do use locking inside dev_reset, but I suppose
the plan was to have a generic solution, no?
Again, the interrupt can fire while the user invokes dev_start/stop, so we
still need some synchronisation between them.
To be more specific, I thought about something like that:
static inline uint16_t
rte_eth_rx_burst_lock(uint8_t port_id, uint16_t queue_id,
struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)
{
struct rte_eth_dev *dev = &rte_eth_devices[port_id];
#ifdef RTE_LIBRTE_ETHDEV_DEBUG
RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0);
if (queue_id >= dev->data->nb_rx_queues) {
RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id);
return 0;
}
#endif
+ if (rte_spinlock_trylock(&dev->data->rx_queue_state[queue_id].lock) == 0)
+ return 0;
+ else if (dev->data->rx_queue_state[queue_id].state == RTE_ETH_QUEUE_STATE_STOPPED) {
+ rte_spinlock_unlock(&dev->data->rx_queue_state[queue_id].lock);
+ return 0;
+ }
nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
rx_pkts, nb_pkts);
+ rte_spinlock_unlock(&dev->data->rx_queue_state[queue_id].lock);
....
return nb_rx;
}
And inside queue_start:
int
rte_eth_dev_rx_queue_start(uint8_t port_id, uint16_t rx_queue_id)
{
struct rte_eth_dev *dev;
RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
dev = &rte_eth_devices[port_id];
if (rx_queue_id >= dev->data->nb_rx_queues) {
RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id);
return -EINVAL;
}
RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_start, -ENOTSUP);
rte_spinlock_lock(&dev->data->rx_queue_state[rx_queue_id].lock);
if (dev->data->rx_queue_state[rx_queue_id].state != RTE_ETH_QUEUE_STATE_STOPPED) {
RTE_PMD_DEBUG_TRACE("Queue %" PRIu16" of device with port_id=%" PRIu8
" already started\n",
rx_queue_id, port_id);
ret = -EINVAL;
} else
ret = dev->dev_ops->rx_queue_start(dev, rx_queue_id);
rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].lock);
return ret;
}
Then again, we don't need to do explicit locking inside dev_reset().
Does it make sense to you guys?
> +
> +/**
> * A structure used to configure an RX ring of an Ethernet port.
> */
> struct rte_eth_rxconf {
> --
> 2.1.4
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-03 15:07 4% [dpdk-dev] RFC: DPDK Long Term Support Mcnamara, John
` (2 preceding siblings ...)
2016-06-05 18:15 5% ` Neil Horman
@ 2016-06-07 12:36 3% ` Christian Ehrhardt
2016-06-07 19:39 0% ` Martinx - ジェームズ
3 siblings, 1 reply; 200+ results
From: Christian Ehrhardt @ 2016-06-07 12:36 UTC (permalink / raw)
To: Mcnamara, John; +Cc: dev, Markos Chandras, Panu Matilainen
On Fri, Jun 3, 2016 at 5:07 PM, Mcnamara, John <john.mcnamara@intel.com>
wrote:
[...]
>
> LTS Version
> ------------
>
> The proposed initial LTS version will be DPDK 16.07. The next versions,
> based
> on a 2 year cycle, will be DPDK 18.08, 20.08, etc.
>
I can see from the discussions that many more things around this have to
be discussed and agreed, but to some extent we will also just "have to
wait and see" how things work out.
I fully agree with the API change argument to start with 16.07 and the
2 year cycle (more would be nice, but this means effort, and after a
while almost nothing is "easily" backportable).
Nevertheless I have to ask - I'd personally be much happier if it were
the odd years' autumn release that made the LTS, as it would match our
LTS releases much better.
Otherwise we (Ubuntu) will always "just miss" the LTS by a few months.
First I'd have thought of xx.02 releases, but consuming applications
might need time to adapt, and while there are the nice API/ABI
guarantees, experience tells me to better leave some kind of time buffer.
Also, this would give all of us a first shot with a shorter (not so Long,
as in the L) LTS to see if the process we defined works out, before
jumping to a full 2 year cadence.
So while I see that this is kind of "my problem", I would at least try to
personally ask and vote for the LTS being: 16.07, 17.11, 19.11, 21.11, ...
Christian Ehrhardt
Software Engineer, Ubuntu Server
Canonical Ltd
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-03 18:17 3% ` Matthew Hall
@ 2016-06-07 12:53 3% ` Mcnamara, John
0 siblings, 0 replies; 200+ results
From: Mcnamara, John @ 2016-06-07 12:53 UTC (permalink / raw)
To: Matthew Hall; +Cc: dev, Christian Ehrhardt, Markos Chandras, Panu Matilainen
> -----Original Message-----
> From: Matthew Hall [mailto:mhall@mhcomputing.net]
> Sent: Friday, June 3, 2016 7:17 PM
> To: Mcnamara, John <john.mcnamara@intel.com>
> Cc: dev <dev@dpdk.org>; Christian Ehrhardt
> <christian.ehrhardt@canonical.com>; Markos Chandras <mchandras@suse.de>;
> Panu Matilainen <pmatilai@redhat.com>
> Subject: Re: [dpdk-dev] RFC: DPDK Long Term Support
>
> >
> > What changes should not be backported
> > -------------------------------------
> >
> > * API or ABI breaking changes.
>
> I think this part needs some adjusting.
>
> It seems like there should be allowance for bug fixes where the original
> does break ABI but it is possible to make a version that doesn't.
>
> A lot of DPDK bug fixes I see would fall into this category and it isn't
> discussed.
Hi Matthew,
Agreed, we should allow a fix to be backported even if the patch itself cannot be, for ABI reasons.
John
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-03 16:05 0% ` Thomas Monjalon
2016-06-06 11:49 0% ` Yuanhan Liu
@ 2016-06-07 13:17 3% ` Mcnamara, John
1 sibling, 0 replies; 200+ results
From: Mcnamara, John @ 2016-06-07 13:17 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: dev, Christian Ehrhardt, Markos Chandras, Panu Matilainen
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Friday, June 3, 2016 5:05 PM
> To: Mcnamara, John <john.mcnamara@intel.com>
> Cc: dev@dpdk.org; Christian Ehrhardt <christian.ehrhardt@canonical.com>;
> Markos Chandras <mchandras@suse.de>; Panu Matilainen <pmatilai@redhat.com>
> Subject: Re: [dpdk-dev] RFC: DPDK Long Term Support
>
> Hi,
>
> 2016-06-03 15:07, Mcnamara, John:
> > Introduction
> > ------------
> >
> > This document sets out a proposal for a DPDK Long Term Support release
> (LTS).
>
> In general, LTS refers to a longer maintenance period than the regular one.
> Here we are talking about doing some maintenance as stable releases first.
> Currently we have no maintenance at all.
> So I suggest to differentiate "stable branches" and "LTS" for some stable
> branches.
Hi Thomas,
I have no argument against this. It would be great to have a series of stable
branches of which some are LTS.
But at a minimum we are committing to have at least one maintained stable branch
that will also be an LTS.
> I wonder if Yuanhan is OK to maintain every stable release which could be
> requested/needed? Or should we have other committers for the stable
> releases that Yuanhan would not want to maintain himself?
> The Linux model is to let people declare themselves when they want to
> maintain a stable branch.
I think it is fine to have other committers.
> > The proposed duration of the LTS support is 2 years.
>
> I think we should discuss the support duration for each release
> separately.
>
> > There will only be one LTS branch being maintained at any time. At the
> > end of the 2 year cycle the maintenance on the previous LTS will be
> wound down.
>
> Seems a bit too restrictive.
> Currently, there is no maintenance at all because nobody volunteered.
> If Yuanhan volunteers for a stable branch every 2 years, fine.
> If someone else volunteers for other branches, why not let him do it?
I see no problem with that. This proposal just reflects the fact that we
have only had one volunteer to date and is based on what could be reasonably
done by one person (plus the validation support). If more maintainers come
forward we can have more/more frequent stable branches.
We will, however, be constrained by the validation effort that can be offered,
unless there are other validation volunteers.
> > The proposed initial LTS version will be DPDK 16.07. The next
> > versions, based on a 2 year cycle, will be DPDK 18.08, 20.08, etc.
>
> Let's do a first run with 16.07 and see later what we want to do next.
> How long in advance must a stable branch be announced before its initial
> release?
Ok. The statement at the end about reviewing at the end of the first year
is meant to cover adjustments like this. I think that we will have to see
how things work out in practice and adjust as we go.
> > What changes should be backported
> > ---------------------------------
> >
> > * Bug fixes that don't break the ABI.
>
> And API?
> And behaviour (if not clearly documented in the API)?
Yes. It should say ABI and API. Undocumented but implied or existing
behaviour should also be maintained.
> > (OSV reps please confirm.)
> >
> > * Ubuntu 16.04 LTS
> > * RHEL 7.3
> > * SuSE 11 SP4 or 12
> > * FreeBSD 10.3
>
> I'm sure there will be more validation in the field or from contributors.
Hopefully. :-)
John.
--
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-05 18:15 5% ` Neil Horman
2016-06-06 9:27 5% ` Thomas Monjalon
@ 2016-06-07 15:55 5% ` Mcnamara, John
1 sibling, 0 replies; 200+ results
From: Mcnamara, John @ 2016-06-07 15:55 UTC (permalink / raw)
To: Neil Horman; +Cc: dev, Christian Ehrhardt, Markos Chandras, Panu Matilainen
> -----Original Message-----
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Sunday, June 5, 2016 7:15 PM
> To: Mcnamara, John <john.mcnamara@intel.com>
> Cc: dev <dev@dpdk.org>; Christian Ehrhardt
> <christian.ehrhardt@canonical.com>; Markos Chandras <mchandras@suse.de>;
> Panu Matilainen <pmatilai@redhat.com>
> Subject: Re: [dpdk-dev] RFC: DPDK Long Term Support
>
> >
> I'm not opposed to an LTS release, but it seems to be re-solving the issue
> of ABI breakage. That is to say, there is already a process in place for
> managing ABI changes to the DPDK, which is designed to help ensure that:
>
> 1) ABI changes are signaled at least 2 releases early
> 2) ABI changes whenever possible are designed such that backward
> compatibility versions can be encoded at the same time with versioning
> tags
>
> Those two mechanisms are expressly intended to allow application upgrades
> of DPDK libraries without worrying about ABI breakage.
Hi,
The purpose of the LTS proposal isn't to replace or circumvent the ABI policy.
In fact backporting of patches would be very difficult without an upstream
ABI policy.
Even if the ABI policy was working perfectly there would still be a use case
for an LTS among consumers who want a fixed version with bug fixes or minor
changes. There are already several companies maintaining their own branches
like this. The purpose of this proposal is to get them to converge on a
single version (or, if there is support, versions) and combine their efforts.
> While LTS releases
> are a fine approach for some things, they sacrifice upstream efficiency
> (by creating work for backporting teams), while allowing upstream
> developers more leverage to just create ABI breaking changes on a whim,
> ignoring the existing ABI compatibility mechanism
An LTS release doesn't prevent us from maintaining upstream ABI compatibility
and it only gives developers leverage if we allow it to.
John.
--
^ permalink raw reply [relevance 5%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-06 13:47 5% ` Neil Horman
2016-06-06 14:21 4% ` Thomas Monjalon
@ 2016-06-07 16:21 3% ` Mcnamara, John
1 sibling, 0 replies; 200+ results
From: Mcnamara, John @ 2016-06-07 16:21 UTC (permalink / raw)
To: Neil Horman, Thomas Monjalon
Cc: dev, Christian Ehrhardt, Markos Chandras, Panu Matilainen
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman
> Sent: Monday, June 6, 2016 2:48 PM
> To: Thomas Monjalon <thomas.monjalon@6wind.com>
> Cc: dev@dpdk.org; Mcnamara, John <john.mcnamara@intel.com>; Christian
> Ehrhardt <christian.ehrhardt@canonical.com>; Markos Chandras
> <mchandras@suse.de>; Panu Matilainen <pmatilai@redhat.com>
> Subject: Re: [dpdk-dev] RFC: DPDK Long Term Support
>
> While I don't disagree with that statement (LTS does provide both of those
> things if the maintainer does it properly), I'm forced to ask the
> question, before we solve this problem in a new way, let's ask why the
> existing way isn't being used. Do developers just not care about
> backwards compatibility? Is the process too hard? Something else? I
> really don't like the idea of abandoning what currently exists to replace
> it with something else, without first addressing why what we have isn't
> working.
Hi Neil,
I think these questions around why the current ABI policy isn't working
(or at least not working well) and how it can be fixed are worth raising
as a new discussion.
John.
--
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] RFC: DPDK Long Term Support
2016-06-07 12:36 3% ` Christian Ehrhardt
@ 2016-06-07 19:39 0% ` Martinx - ジェームズ
0 siblings, 0 replies; 200+ results
From: Martinx - ジェームズ @ 2016-06-07 19:39 UTC (permalink / raw)
To: Christian Ehrhardt; +Cc: dev
On 7 June 2016 at 08:36, Christian Ehrhardt <
christian.ehrhardt@canonical.com> wrote:
> On Fri, Jun 3, 2016 at 5:07 PM, Mcnamara, John <john.mcnamara@intel.com>
> wrote:
> [...]
> >
> > LTS Version
> > ------------
> >
> > The proposed initial LTS version will be DPDK 16.07. The next versions,
> > based
> > on a 2 year cycle, will be DPDK 18.08, 20.08, etc.
> >
>
> I can see from the discussions that many more things around this have to
> be discussed and agreed, but to some extent we will also just "have to
> wait and see" how things work out.
> I fully agree with the API change argument to start with 16.07 and the
> 2 year cycle (more would be nice, but this means effort, and after a
> while almost nothing is "easily" backportable).
>
> Nevertheless I have to ask - I'd personally be much happier if it were
> the odd years' autumn release that made the LTS, as it would match our
> LTS releases much better.
> Otherwise we (Ubuntu) will always "just miss" the LTS by a few months.
> First I'd have thought of xx.02 releases, but consuming applications
> might need time to adapt, and while there are the nice API/ABI
> guarantees, experience tells me to better leave some kind of time buffer.
>
> Also, this would give all of us a first shot with a shorter (not so Long,
> as in the L) LTS to see if the process we defined works out, before
> jumping to a full 2 year cadence.
>
> So while I see that this is kind of "my problem", I would at least try to
> personally ask and vote for the LTS being: 16.07, 17.11, 19.11, 21.11, ...
>
> Christian Ehrhardt
> Software Engineer, Ubuntu Server
> Canonical Ltd
>
+1 For DPDK LTS being:
16.07, 17.11, 19.11, 21.11, ...
This way, DPDK LTS 17.11 will be part of Ubuntu LTS 18.04... DPDK LTS 19.11
on Ubuntu LTS 20.04...
Very likely that DPDK LTS 16.07 will be available to Ubuntu 16.04 via
Ubuntu Cloud Archive Newton.
Cheers!
Thiago
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH 2/8] lib/librte_ether: define RX/TX lock mode
2016-06-06 5:40 4% ` [dpdk-dev] [PATCH 2/8] lib/librte_ether: define RX/TX lock mode Wenzhuo Lu
@ 2016-06-08 2:15 0% ` Stephen Hemminger
2016-06-08 7:34 0% ` Lu, Wenzhuo
0 siblings, 1 reply; 200+ results
From: Stephen Hemminger @ 2016-06-08 2:15 UTC (permalink / raw)
To: Wenzhuo Lu; +Cc: dev, Zhe Tao
On Mon, 6 Jun 2016 13:40:47 +0800
Wenzhuo Lu <wenzhuo.lu@intel.com> wrote:
> Define a lock mode for RX/TX queues, because when resetting
> the device we want the resetting thread to take the lock
> of each RX/TX queue to make sure RX/TX is stopped.
>
> Use the NEXT_ABI macro for this ABI change, as it has too
> much impact: 7 APIs and 1 global variable are affected.
>
> Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> Signed-off-by: Zhe Tao <zhe.tao@intel.com>
Why does this patch set make a different assumption than the rest of the DPDK?
The rest of the DPDK operates on the principle that the application
is smart enough to stop the device before making changes. There is no
equivalent to the Linux kernel RTNL mutex. The API assumes application
threads are well behaved and will not try and sabotage each other.
If you restrict the reset operation to only being available when RX/TX is stopped,
then no lock is needed.
The fact that it requires lots more locking inside each device driver implies
to me that this is not the correct way to architect this.
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v4 2/8] lib/librte_ether: define RX/TX lock mode
2016-06-07 9:58 0% ` Ananyev, Konstantin
@ 2016-06-08 7:24 3% ` Lu, Wenzhuo
2016-06-08 9:19 0% ` Ananyev, Konstantin
0 siblings, 1 reply; 200+ results
From: Lu, Wenzhuo @ 2016-06-08 7:24 UTC (permalink / raw)
To: Ananyev, Konstantin, Tao, Zhe, dev
Cc: Richardson, Bruce, Chen, Jing D, Liang, Cunming, Wu, Jingjing,
Zhang, Helin
Hi Konstantin,
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Tuesday, June 7, 2016 5:59 PM
> To: Tao, Zhe; dev@dpdk.org
> Cc: Lu, Wenzhuo; Richardson, Bruce; Chen, Jing D; Liang, Cunming; Wu, Jingjing;
> Zhang, Helin
> Subject: RE: [PATCH v4 2/8] lib/librte_ether: define RX/TX lock mode
>
>
> Hi Zhe & Wenzhuo,
>
> Please find my comments below.
> BTW, for clarification - is that patch for 16.11?
> I believe it's too late to introduce such a significant change in 16.07.
> Thanks
> Konstantin
Thanks for the comments.
Honestly, our target is 16.07. Realizing the big impact, we guard our change with the NEXT_ABI macro. So, although we want to merge it in 16.07, the change will only become effective after we remove NEXT_ABI in 16.11.
>
> > Define a lock mode for RX/TX queues, because when resetting the device we
> > want the resetting thread to take the lock of each RX/TX queue to make
> > sure RX/TX is stopped.
> >
> > Use the NEXT_ABI macro for this ABI change, as it has too much impact:
> > 7 APIs and 1 global variable are affected.
> >
> > Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > Signed-off-by: Zhe Tao <zhe.tao@intel.com>
> > ---
> > lib/librte_ether/rte_ethdev.h | 62
> > +++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 62 insertions(+)
> >
> > diff --git a/lib/librte_ether/rte_ethdev.h
> > b/lib/librte_ether/rte_ethdev.h index 74e895f..4efb5e9 100644
> > --- a/lib/librte_ether/rte_ethdev.h
> > +++ b/lib/librte_ether/rte_ethdev.h
> > @@ -354,7 +354,12 @@ struct rte_eth_rxmode {
> > jumbo_frame : 1, /**< Jumbo Frame Receipt enable. */
> > hw_strip_crc : 1, /**< Enable CRC stripping by hardware. */
> > enable_scatter : 1, /**< Enable scatter packets rx handler */
> > +#ifndef RTE_NEXT_ABI
> > enable_lro : 1; /**< Enable LRO */
> > +#else
> > + enable_lro : 1, /**< Enable LRO */
> > + lock_mode : 1; /**< Using lock path */
> > +#endif
> > };
> >
> > /**
> > @@ -634,11 +639,68 @@ struct rte_eth_txmode {
> > /**< If set, reject sending out tagged pkts */
> > hw_vlan_reject_untagged : 1,
> > /**< If set, reject sending out untagged pkts */
> > +#ifndef RTE_NEXT_ABI
> > hw_vlan_insert_pvid : 1;
> > /**< If set, enable port based VLAN insertion */
> > +#else
> > + hw_vlan_insert_pvid : 1,
> > + /**< If set, enable port based VLAN insertion */
> > + lock_mode : 1;
> > + /**< If set, using lock path */
> > +#endif
> > };
> >
> > /**
> > + * The macros for the RX/TX lock mode functions */ #ifdef
> > +RTE_NEXT_ABI #define RX_LOCK_FUNCTION(dev, func) \
> > + (dev->data->dev_conf.rxmode.lock_mode ? \
> > + func ## _lock : func)
> > +
> > +#define TX_LOCK_FUNCTION(dev, func) \
> > + (dev->data->dev_conf.txmode.lock_mode ? \
> > + func ## _lock : func)
> > +#else
> > +#define RX_LOCK_FUNCTION(dev, func) func
> > +
> > +#define TX_LOCK_FUNCTION(dev, func) func #endif
> > +
> > +/* Add the lock RX/TX function for VF reset */ #define
> > +GENERATE_RX_LOCK(func, nic) \ uint16_t func ## _lock(void *rx_queue,
> > +\
> > + struct rte_mbuf **rx_pkts, \
> > + uint16_t nb_pkts) \
> > +{ \
> > + struct nic ## _rx_queue *rxq = rx_queue; \
> > + uint16_t nb_rx = 0; \
> > + \
> > + if (rte_spinlock_trylock(&rxq->rx_lock)) { \
> > + nb_rx = func(rx_queue, rx_pkts, nb_pkts); \
> > + rte_spinlock_unlock(&rxq->rx_lock); \
> > + } \
> > + \
> > + return nb_rx; \
> > +}
> > +
> > +#define GENERATE_TX_LOCK(func, nic) \ uint16_t func ## _lock(void
> > +*tx_queue, \
> > + struct rte_mbuf **tx_pkts, \
> > + uint16_t nb_pkts) \
> > +{ \
> > + struct nic ## _tx_queue *txq = tx_queue; \
> > + uint16_t nb_tx = 0; \
> > + \
> > + if (rte_spinlock_trylock(&txq->tx_lock)) { \
> > + nb_tx = func(tx_queue, tx_pkts, nb_pkts); \
> > + rte_spinlock_unlock(&txq->tx_lock); \
> > + } \
> > + \
> > + return nb_tx; \
> > +}
>
> 1. As I said in the off-line discussion, I think this locking could (and, I
> think, better be) implemented completely at the rte_ethdev layer,
> so the actual PMD code will be unaffected.
> That also avoids introducing a _lock version of every RX/TX function in each
> PMD.
One purpose of implementing the lock in the PMD layer is to avoid an ABI change. But we introduce the field lock_mode in struct rte_eth_rx/txmode, so it seems that's not a good reason now :)
The other purpose is that we want a lock for every queue. But at the rte layer the queue is a void *, so we added the lock in the NIC-specific structures. But as you mentioned below, we can add the lock, like dev->data->rx_queue_state, in the struct rte_eth_dev_data.
So, I prefer to add the lock at the rte layer now.
>
> 2. Again, as discussed offline, I think it is better to have an explicit
> rte_eth_(rx|tx)_burst_lock(sync?) API, instead of adding new fields into the
> RX/TX config structures.
> That would help to avoid any confusion, I think.
We want users to be able to choose the lock-free RX/TX path if they're performance-sensitive and can handle the reset event in their application. With the new config struct fields, users can change the config to choose a different path.
If we introduce a new API, it may be harder for users to use it. I mean, when users want to use lock mode, they may need to replace all the rte_eth_rx/tx_burst calls with rte_eth_rx/tx_burst_lock. So if we add the lock at the rte layer, I still prefer adding lock_mode to the configuration, and rte_eth_rx/tx_burst is changed like this:
rte_eth_rx/tx_burst
{
+ if lock_mode
+ try_lock
......
+ if lock_mode
+ release_lock
}
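[For illustration, a hedged C sketch of the behaviour described above; the per-queue rx_queue_lock array is an assumed field, not part of the existing struct rte_eth_dev_data.]

static inline uint16_t
rte_eth_rx_burst_sketch(uint8_t port_id, uint16_t queue_id,
                struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)
{
        struct rte_eth_dev *dev = &rte_eth_devices[port_id];
        uint16_t nb_rx;
        int locked = 0;

        if (dev->data->dev_conf.rxmode.lock_mode) {
                /* reset thread may hold the lock; skip this poll */
                if (rte_spinlock_trylock(
                                &dev->data->rx_queue_lock[queue_id]) == 0)
                        return 0;
                locked = 1;
        }

        nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
                        rx_pkts, nb_pkts);

        if (locked)
                rte_spinlock_unlock(&dev->data->rx_queue_lock[queue_id]);

        return nb_rx;
}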
>
> 3. I thought the plan was to introduce a locking in all appropriate control path
> functions (dev_start/dev_stop etc.) Without that locking version of RX/TX seems
> a bit useless.
> Yes, I understand that you do use locking inside dev_reset, but I suppose the
> plan was to have a generic solution, no?
> Again, interrupt fire when user invokes dev_start/stop or so, so we still need
> some synchronisation between them.
>
> To be more specific, I thought about something like that:
>
> static inline uint16_t
> rte_eth_rx_burst_lock(uint8_t port_id, uint16_t queue_id,
> struct rte_mbuf **rx_pkts, const uint16_t nb_pkts) {
> struct rte_eth_dev *dev = &rte_eth_devices[port_id];
>
> #ifdef RTE_LIBRTE_ETHDEV_DEBUG
> RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
> RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0);
>
> if (queue_id >= dev->data->nb_rx_queues) {
> RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id);
> return 0;
> }
> #endif
>
> + if (rte_spinlock_trylock(&dev->data->rx_queue_state[queue_id].lock) == 0)
> + return 0;
> + else if (dev->data->rx_queue_state[queue_id].state ==
> RTE_ETH_QUEUE_STATE_STOPPED) {
> + rte_spinlock_unlock(&dev->data->rx_queue_state[queue_id].lock);
> + return 0;
> + }
>
> nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
> rx_pkts, nb_pkts);
>
> + rte_spinlock_unlock(&dev->data->rx_queue_state[queue_id].lock);
>
> ....
>
> return nb_rx;
> }
>
> And inside queue_start:
>
> int
> rte_eth_dev_rx_queue_start(uint8_t port_id, uint16_t rx_queue_id) {
> struct rte_eth_dev *dev;
>
> RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
>
> dev = &rte_eth_devices[port_id];
> if (rx_queue_id >= dev->data->nb_rx_queues) {
> RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id);
> return -EINVAL;
> }
>
> RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_start, -ENOTSUP);
>
> rte_spinlock_lock(&dev->data->rx_queue_state[rx_queue_id].lock);
I think you add the lock here to stop the rx/tx.
But in my opinion, we should lock the rx/tx much earlier, before starting the queue. For example, when stopping the port, the resources of the queues may be released, so the rx/tx cannot be executed. That's why I prefer to take the lock before stopping the ports. Maybe it's better to keep the spinlock in dev_reset.
>
> if (dev->data->rx_queue_state[rx_queue_id].state !=
> RTE_ETH_QUEUE_STATE_STOPPED) {
> RTE_PMD_DEBUG_TRACE("Queue %" PRIu16" of device with
> port_id=%" PRIu8
> " already started\n",
> rx_queue_id, port_id);
> ret = -EINVAL;
> } else
> ret = dev->dev_ops->rx_queue_start(dev, rx_queue_id);
>
> rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].lock);
>
> return ret;
> }
>
> Then again, we don't need to do explicit locking inside dev_reset().
> Does it make sense to you guys?
Please see the answer above.
>
>
> > +
> > +/**
> > * A structure used to configure an RX ring of an Ethernet port.
> > */
> > struct rte_eth_rxconf {
> > --
> > 2.1.4
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH 2/8] lib/librte_ether: define RX/TX lock mode
2016-06-08 2:15 0% ` Stephen Hemminger
@ 2016-06-08 7:34 0% ` Lu, Wenzhuo
2016-06-09 7:50 0% ` Olivier Matz
0 siblings, 1 reply; 200+ results
From: Lu, Wenzhuo @ 2016-06-08 7:34 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: dev, Tao, Zhe
Hi Stephen,
> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Wednesday, June 8, 2016 10:16 AM
> To: Lu, Wenzhuo
> Cc: dev@dpdk.org; Tao, Zhe
> Subject: Re: [dpdk-dev] [PATCH 2/8] lib/librte_ether: define RX/TX lock mode
>
> On Mon, 6 Jun 2016 13:40:47 +0800
> Wenzhuo Lu <wenzhuo.lu@intel.com> wrote:
>
> > Define lock mode for RX/TX queue. Because when resetting the device we
> > want the resetting thread to get the lock of the RX/TX queue to make
> > sure the RX/TX is stopped.
> >
> > Using next ABI macro for this ABI change as it has too much impact. 7
> > APIs and 1 global variable are impacted.
> >
> > Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > Signed-off-by: Zhe Tao <zhe.tao@intel.com>
>
> Why does this patch set make a different assumption the rest of the DPDK?
>
> The rest of the DPDK operates on the principle that the application is smart
> enough to stop the device before making changes. There is no equivalent to the
> Linux kernel RTNL mutex. The API assumes application threads are well behaved
> and will not try and sabotage each other.
>
> If you restrict the reset operation to only being available when RX/TX is stopped,
> then no lock is needed.
>
> The fact that it requires lots more locking inside each device driver implies to me
> this is not correct way to architect this.
It's a good question. This patch set doesn't follow the regular assumption of DPDK.
But it's a requirement we've got from some customers. The users want the driver to do as much as it can; ideally the link state change is transparent to the users.
The patch set tries to provide another choice if the users don't want to stop their rx/tx to handle the reset event.
And as discussed in the other thread, most probably we will move the lock from the PMD layer to the rte layer. That will avoid the change in every device.
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v4 2/8] lib/librte_ether: define RX/TX lock mode
2016-06-08 7:24 3% ` Lu, Wenzhuo
@ 2016-06-08 9:19 0% ` Ananyev, Konstantin
2016-06-12 2:00 0% ` Lu, Wenzhuo
0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2016-06-08 9:19 UTC (permalink / raw)
To: Lu, Wenzhuo, Tao, Zhe, dev
Cc: Richardson, Bruce, Chen, Jing D, Liang, Cunming, Wu, Jingjing,
Zhang, Helin
>
> Hi Konstantin,
>
>
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Tuesday, June 7, 2016 5:59 PM
> > To: Tao, Zhe; dev@dpdk.org
> > Cc: Lu, Wenzhuo; Richardson, Bruce; Chen, Jing D; Liang, Cunming; Wu, Jingjing;
> > Zhang, Helin
> > Subject: RE: [PATCH v4 2/8] lib/librte_ether: define RX/TX lock mode
> >
> >
> > Hi Zhe & Wenzhuo,
> >
> > Please find my comments below.
> > BTW, for clarification - is that patch for 16.11?
> > I believe it's too late to introduce such significant change in 16.07.
> > Thanks
> > Konstantin
> Thanks for the comments.
> Honestly, our target is 16.07. Realizing the big impact, we guard our change with NEXT_ABI. So, I think although we want to
> merge it in 16.07, this change will only become effective after we remove NEXT_ABI in 16.11.
I don't think that is achievable.
First, I think your code is not in proper shape yet.
Second, as you said, it is a significant change and I would like to hear opinions from the rest of the community.
>
> >
> > > Define lock mode for RX/TX queue. Because when resetting the device we
> > > want the resetting thread to get the lock of the RX/TX queue to make
> > > sure the RX/TX is stopped.
> > >
> > > Using next ABI macro for this ABI change as it has too much impact. 7
> > > APIs and 1 global variable are impacted.
> > >
> > > Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > > Signed-off-by: Zhe Tao <zhe.tao@intel.com>
> > > ---
> > > lib/librte_ether/rte_ethdev.h | 62
> > > +++++++++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 62 insertions(+)
> > >
> > > diff --git a/lib/librte_ether/rte_ethdev.h
> > > b/lib/librte_ether/rte_ethdev.h index 74e895f..4efb5e9 100644
> > > --- a/lib/librte_ether/rte_ethdev.h
> > > +++ b/lib/librte_ether/rte_ethdev.h
> > > @@ -354,7 +354,12 @@ struct rte_eth_rxmode {
> > > jumbo_frame : 1, /**< Jumbo Frame Receipt enable. */
> > > hw_strip_crc : 1, /**< Enable CRC stripping by hardware. */
> > > enable_scatter : 1, /**< Enable scatter packets rx handler */
> > > +#ifndef RTE_NEXT_ABI
> > > enable_lro : 1; /**< Enable LRO */
> > > +#else
> > > + enable_lro : 1, /**< Enable LRO */
> > > + lock_mode : 1; /**< Using lock path */
> > > +#endif
> > > };
> > >
> > > /**
> > > @@ -634,11 +639,68 @@ struct rte_eth_txmode {
> > > /**< If set, reject sending out tagged pkts */
> > > hw_vlan_reject_untagged : 1,
> > > /**< If set, reject sending out untagged pkts */
> > > +#ifndef RTE_NEXT_ABI
> > > hw_vlan_insert_pvid : 1;
> > > /**< If set, enable port based VLAN insertion */
> > > +#else
> > > + hw_vlan_insert_pvid : 1,
> > > + /**< If set, enable port based VLAN insertion */
> > > + lock_mode : 1;
> > > + /**< If set, using lock path */
> > > +#endif
> > > };
> > >
> > > /**
> > > + * The macros for the RX/TX lock mode functions */
> > > +#ifdef RTE_NEXT_ABI
> > > +#define RX_LOCK_FUNCTION(dev, func) \
> > > + (dev->data->dev_conf.rxmode.lock_mode ? \
> > > + func ## _lock : func)
> > > +
> > > +#define TX_LOCK_FUNCTION(dev, func) \
> > > + (dev->data->dev_conf.txmode.lock_mode ? \
> > > + func ## _lock : func)
> > > +#else
> > > +#define RX_LOCK_FUNCTION(dev, func) func
> > > +
> > > +#define TX_LOCK_FUNCTION(dev, func) func
> > > +#endif
> > > +
> > > +/* Add the lock RX/TX function for VF reset */
> > > +#define GENERATE_RX_LOCK(func, nic) \
> > > +uint16_t func ## _lock(void *rx_queue, \
> > > + struct rte_mbuf **rx_pkts, \
> > > + uint16_t nb_pkts) \
> > > +{ \
> > > + struct nic ## _rx_queue *rxq = rx_queue; \
> > > + uint16_t nb_rx = 0; \
> > > + \
> > > + if (rte_spinlock_trylock(&rxq->rx_lock)) { \
> > > + nb_rx = func(rx_queue, rx_pkts, nb_pkts); \
> > > + rte_spinlock_unlock(&rxq->rx_lock); \
> > > + } \
> > > + \
> > > + return nb_rx; \
> > > +}
> > > +
> > > +#define GENERATE_TX_LOCK(func, nic) \
> > > +uint16_t func ## _lock(void *tx_queue, \
> > > + struct rte_mbuf **tx_pkts, \
> > > + uint16_t nb_pkts) \
> > > +{ \
> > > + struct nic ## _tx_queue *txq = tx_queue; \
> > > + uint16_t nb_tx = 0; \
> > > + \
> > > + if (rte_spinlock_trylock(&txq->tx_lock)) { \
> > > + nb_tx = func(tx_queue, tx_pkts, nb_pkts); \
> > > + rte_spinlock_unlock(&txq->tx_lock); \
> > > + } \
> > > + \
> > > + return nb_tx; \
> > > +}
> >
> > 1. As I said in the off-line discussion, I think this locking could (and I think better be)
> > implemented completely in the rte_ethdev layer.
> > So actual PMD code will be unaffected.
> > Again, that avoids us having to introduce a _lock version of every RX/TX function in each
> > PMD.
> One purpose of implementing the lock in PMD layer is to avoid ABI change. But we introduce the field lock_mode in struct
> rte_eth_rx/txmode. So seems it's not a good reason now :)
> The other purpose is we want to add a lock for every queue. But in rte layer the queue is void *, so we add the lock in the specific
> structures of the NICs. But as you mentioned below, we can add the lock as dev->data->rx_queue_state it the struct
> rte_eth_dev_data.
> So, I prefer to add the lock in rte layer now.
OK.
>
> >
> > 2. Again, as discussed offline, I think it is better to have an explicit
> > rte_eth_(rx|tx)_burst_lock(sync?) API, instead of adding new fields into the RX/TX
> > config structures.
> > Would help to avoid any confusion, I think.
> We want the users to choose the rx/tx path without the lock if they're sensitive to performance and can handle the reset event in
> their app. After introducing the new fields of the config struct, users can change the config to choose a different path.
I understand what you are doing.
> If we introduce a new API, it may be harder for the user to use it. I mean, when users want to use lock mode, they may need to replace
> all the rte_eth_rx/tx_burst calls with rte_eth_rx/tx_burst_lock.
Yes, in my opinion, if users would like to use the locking API they need to call it explicitly.
>So if we add the lock in rte layer, I still prefer adding lock_mode in the
> configuration, and the rte_eth_rx/tx_burst is changed like this,
> rte_eth_rx/tx_burst
> {
> + if lock_mode
> + try_lock
> ......
> + if lock_mode
> + release_lock
> }
My preference is to keep existing rx/tx_burst() functions unaffected by that patch.
At least for now.
I suppose that will minimise the risks and help users to avoid confusion about which API
(locking/non-locking) is in use.
>
>
> >
> > 3. I thought the plan was to introduce a locking in all appropriate control path
> > functions (dev_start/dev_stop etc.) Without that locking version of RX/TX seems
> > a bit useless.
> > Yes, I understand that you do use locking inside dev_reset, but I suppose the
> > plan was to have a generic solution, no?
> > Again, interrupt fire when user invokes dev_start/stop or so, so we still need
> > some synchronisation between them.
> >
> > To be more specific, I thought about something like that:
> >
> > static inline uint16_t
> > rte_eth_rx_burst_lock(uint8_t port_id, uint16_t queue_id,
> > struct rte_mbuf **rx_pkts, const uint16_t nb_pkts) {
> > struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> >
> > #ifdef RTE_LIBRTE_ETHDEV_DEBUG
> > RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
> > RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0);
> >
> > if (queue_id >= dev->data->nb_rx_queues) {
> > RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id);
> > return 0;
> > }
> > #endif
> >
> > + if (rte_spinlock_trylock(&dev->data->rx_queue_state[queue_id].lock) == 0)
> > + return 0;
> > + else if (dev->data->rx_queue_state[queue_id].state ==
> > RTE_ETH_QUEUE_STATE_STOPPED) {
> > + rte_spinlock_unlock(&dev->data->rx_queue_state[queue_id].lock);
> > + return 0;
> > + }
> >
> > nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
> > rx_pkts, nb_pkts);
> >
> > + rte_spinlock_unlock(&dev->data->rx_queue_state[queue_id].lock);
> >
> > ....
> >
> > return nb_rx;
> > }
> >
> > And inside queue_start:
> >
> > int
> > rte_eth_dev_rx_queue_start(uint8_t port_id, uint16_t rx_queue_id) {
> > struct rte_eth_dev *dev;
> >
> > RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> >
> > dev = &rte_eth_devices[port_id];
> > if (rx_queue_id >= dev->data->nb_rx_queues) {
> > RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id);
> > return -EINVAL;
> > }
> >
> > RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_start, -ENOTSUP);
> >
> > rte_spinlock_lock(&dev->data->rx_queue_state[rx_queue_id].lock);
> I think you add the lock here to stop the rx/tx.
> But in my opinion, we should lock the rx/tx much earlier, before starting the queue. For example, when stopping the port, the resources of
> the queues may be released.
I didn't get you here...
Before releasing the queue resources, queue_stop() has to be executed, right?
>The rx/tx cannot be executed then. So I prefer to take the lock before stopping the ports.
Maybe I wasn't clear enough here.
What I think we need to have:
- To stop/start/rx/tx the queue (or do any other action that might change the queue internal structure)
you have to grab the lock.
After the queue is stopped, its state has to be changed to QUEUE_STATE_STOPPED (with the queue lock grabbed),
so rx/tx_locked wouldn't proceed with that queue.
- dev_stop() has to stop all its queues first, i.e. it needs to call queue_stop() for all of them.
So after dev_stop() has finished, all device queues have to be in QUEUE_STATE_STOPPED.
Same about dev_start(): after it does all other things, it will call queue_start() for all its queues,
which will bring them into QUEUE_STARTED.
After that rx/tx_locked can use them again.
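[A hedged sketch of the per-queue element this protocol implies. Today rx_queue_state[] is a plain uint8_t array in struct rte_eth_dev_data; the combined lock+state element below is only the proposal paraphrased.]

struct rte_eth_queue_sync {
        rte_spinlock_t lock; /* held by rx/tx_burst_lock and queue start/stop */
        uint8_t state;       /* RTE_ETH_QUEUE_STATE_STOPPED or _STARTED */
};

/* dev_stop() would then reduce to stopping every queue under its lock: */
static void
dev_stop_sketch(struct rte_eth_dev *dev, struct rte_eth_queue_sync *sync)
{
        uint16_t q;

        for (q = 0; q < dev->data->nb_rx_queues; q++) {
                rte_spinlock_lock(&sync[q].lock);
                /* ... PMD-specific queue stop ... */
                sync[q].state = RTE_ETH_QUEUE_STATE_STOPPED;
                rte_spinlock_unlock(&sync[q].lock);
        }
}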
>Maybe better to keep the spinlock in the dev_reset.
Might be not :)
>
> >
> > if (dev->data->rx_queue_state[rx_queue_id].state !=
> > RTE_ETH_QUEUE_STATE_STOPPED) {
> > RTE_PMD_DEBUG_TRACE("Queue %" PRIu16" of device with
> > port_id=%" PRIu8
> > " already started\n",
> > rx_queue_id, port_id);
> > ret = -EINVAL;
> > } else
> > ret = dev->dev_ops->rx_queue_start(dev, rx_queue_id);
> >
> > rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].lock);
> >
> > return ret;
> > }
> >
> > Then again, we don't need to do explicit locking inside dev_reset().
> > Does it make sense to you guys?
> Please see the answer above.
>
> >
> >
> > > +
> > > +/**
> > > * A structure used to configure an RX ring of an Ethernet port.
> > > */
> > > struct rte_eth_rxconf {
> > > --
> > > 2.1.4
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v3 01/10] rte: change xstats to use integer ids
@ 2016-06-08 11:16 3% ` Remy Horton
2016-06-08 12:22 0% ` Thomas Monjalon
0 siblings, 1 reply; 200+ results
From: Remy Horton @ 2016-06-08 11:16 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: dev
'noon,
On 08/06/2016 10:37, Thomas Monjalon wrote:
> 2016-05-30 11:48, Remy Horton:
>> struct rte_eth_xstats {
>> + /* FIXME: Remove name[] once remaining drivers converted */
>> char name[RTE_ETH_XSTATS_NAME_SIZE];
>
> What is the plan? This field must be deprecated with an attribute.
> We cannot have 2 different APIs depending of the driver.
This is where it gets logistically tricky..
Since there's an API/ABI breakage notice in place on this, my own
preference would be to have the entire patchset squashed into a single
patch. Problem is that rte/app changes (patches 1 & 7-9) are normally
applied via master whereas driver changes (patches 2-6) go in via
dpdk-next-net - it is not clear to me how patches should be submitted
for this case..
> What are the remaining drivers to convert?
Oops, none. All relevant drivers are converted.
> This structure and the other one (rte_eth_xstats) are badly named.
> There is only one stat in each. So they should not have the plural form.
> rte_eth_xstat and rte_eth_xstat_name would be better.
I kept rte_eth_xstats as it was the name already in use within DPDK.
Will change the other.
>> +int rte_eth_xstats_count(uint8_t port_id);
>
> This function is useless because we can have the count with
> rte_eth_xstats_get(p, NULL, 0)
> By the way it would be more consistent to have the same behaviour
> in rte_eth_xstats_names().
Feedback I got with earlier patches was that a separate count function
was preferable to overloading the fetch function using *data==NULL - is
the use of the latter specifically preferred?
Other comments noted.
..Remy
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v3 01/10] rte: change xstats to use integer ids
2016-06-08 11:16 3% ` Remy Horton
@ 2016-06-08 12:22 0% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-06-08 12:22 UTC (permalink / raw)
To: Remy Horton; +Cc: dev
2016-06-08 12:16, Remy Horton:
> 'noon,
>
> On 08/06/2016 10:37, Thomas Monjalon wrote:
> > 2016-05-30 11:48, Remy Horton:
> >> struct rte_eth_xstats {
> >> + /* FIXME: Remove name[] once remaining drivers converted */
> >> char name[RTE_ETH_XSTATS_NAME_SIZE];
> >
> > What is the plan? This field must be deprecated with an attribute.
> > We cannot have 2 different APIs depending of the driver.
>
> This is where it gets logistically tricky..
>
> Since there's an API/ABI breakage notice in place on this, my own
> preference would be to have the entire patchset squashed into a single
> patch. Problem is that rte/app changes (patches 1 & 7-9) are normally
> applied via master whereas driver changes (patches 2-6) go in via
> dpdk-next-net - it is not clear to me how patches should be submitted
> for this case..
Misunderstanding here. The patches are fine and will be integrated in the
main tree because they are not only driver changes.
I was talking about the old API with the name field in rte_eth_xstats.
I had not seen patch 9, which removes it.
> >> +int rte_eth_xstats_count(uint8_t port_id);
> >
> > This function is useless because we can have the count with
> > rte_eth_xstats_get(p, NULL, 0)
> > By the way it would be more consistent to have the same behaviour
> > in rte_eth_xstats_names().
>
> Feedback I got with earlier patches was that a separate count function
> was preferable to overloading the fetch function using *data==NULL - is
> the use of the latter specifically preferred?
I prefer the fetch/NULL style to get a count.
It also handles nicely the fetch error because of a too small buffer.
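[For reference, a hedged sketch of that fetch/NULL idiom; the rte_eth_xstat layout with id/value fields follows the rename discussed above and is an assumption about the final API.]

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <inttypes.h>

static int
dump_xstats(uint8_t port_id)
{
        struct rte_eth_xstat *xs;
        int cnt, ret, i;

        cnt = rte_eth_xstats_get(port_id, NULL, 0); /* count only */
        if (cnt < 0)
                return cnt;
        xs = malloc(cnt * sizeof(*xs));
        if (xs == NULL)
                return -ENOMEM;
        ret = rte_eth_xstats_get(port_id, xs, cnt);
        if (ret < 0 || ret > cnt) { /* error, or table grew: give up */
                free(xs);
                return ret < 0 ? ret : -EAGAIN;
        }
        for (i = 0; i < ret; i++)
                printf("xstat[%d] id=%" PRIu64 " value=%" PRIu64 "\n",
                                i, xs[i].id, xs[i].value);
        free(xs);
        return 0;
}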
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v5 0/9] add packet capture framework
2016-05-23 21:38 3% ` [dpdk-dev] [PATCH v4 0/9] " Reshma Pattan
` (2 preceding siblings ...)
2016-05-23 21:38 9% ` [dpdk-dev] [PATCH v4 9/9] doc: announce ABI change for rte_eth_dev_info structure Reshma Pattan
@ 2016-06-08 13:38 3% ` Reshma Pattan
2016-06-08 13:38 5% ` [dpdk-dev] [PATCH v5 8/9] doc: update doc for " Reshma Pattan
` (2 more replies)
3 siblings, 3 replies; 200+ results
From: Reshma Pattan @ 2016-06-08 13:38 UTC (permalink / raw)
To: dev
This patch set includes the below changes:
1) Changes to librte_ether.
2) A new library librte_pdump added for the packet capture framework.
3) A new app/pdump tool added for packet capturing.
4) Test-pmd changes done to initialize the packet capture framework.
5) Documentation update.
1)librte_pdump
==============
To support packet capturing on dpdk Ethernet devices, a new library librte_pdump
is added. Users can develop their own packet capturing application using the new library APIs.
Operation:
----------
The pdump library provides APIs to support packet capturing on dpdk Ethernet devices.
The library provides APIs to initialize the packet capture framework, enable/disable
the packet capture and uninitialize the packet capture framework.
The pdump library works on a client/server model.
The server is responsible for enabling/disabling the packet captures.
Clients are responsible for requesting enable/disable of the
packet captures.
As part of the packet capture framework initialization, a pthread and
the server socket are created. Only one server socket is allowed on the system.
As part of enabling/disabling the packet capture, client sockets are created
and multiple client sockets are allowed.
Whoever calls initialization first succeeds; subsequent
initialization calls are not allowed. So later users can only
request enabling/disabling the packet capture.
Applications using the below APIs need to pass port/device_id, queue, mempool and
ring parameters. The library uses the user-provided ring and mempool to mirror the rx/tx
packets of the port for users. Users need to dequeue the rings and write the packets
to a vdev (pcap/tuntap) to view the packets using any standard tools.
Note:
The mempool and ring should be multi-consumer/multi-producer (mc/mp) capable.
The mempool mbuf size should be big enough to handle the rx/tx packets of a port.
APIs:
-----
rte_pdump_init()
rte_pdump_enable()
rte_pdump_enable_by_deviceid()
rte_pdump_disable()
rte_pdump_disable_by_deviceid()
rte_pdump_uninit()
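[A hedged usage sketch of these APIs from the capture application's point of view. The rte_pdump_enable() argument order and the RTE_PDUMP_FLAG_RXTX name follow the patch as posted and may not match the final rte_pdump.h exactly; the ring/mempool sizes are the defaults mentioned below.]

#include <rte_ring.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_pdump.h>

/* Runs in the secondary (capture) process; the primary process is
 * assumed to have called rte_pdump_init() already. */
static int
capture_port0(void)
{
        struct rte_ring *ring;
        struct rte_mempool *mp;

        /* flags == 0 keeps the ring multi-producer/multi-consumer,
         * as the note above requires. */
        ring = rte_ring_create("pdump_ring", 16384, rte_socket_id(), 0);
        mp = rte_pktmbuf_pool_create("pdump_pool", 32768, 0, 0, 2176,
                        rte_socket_id());
        if (ring == NULL || mp == NULL)
                return -1;

        /* Mirror RX and TX of port 0, queue 0 into the ring; the
         * filter argument is a placeholder for future enhancements. */
        if (rte_pdump_enable(0, 0, RTE_PDUMP_FLAG_RXTX, ring, mp, NULL) < 0)
                return -1;

        /* ... dequeue mbufs from 'ring' and write them to a pcap vdev ... */

        return rte_pdump_disable(0, 0, RTE_PDUMP_FLAG_RXTX);
}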
2)app/pdump tool
================
The app/pdump tool is designed based on librte_pdump for packet capturing in DPDK.
This tool runs as a secondary process by default, and provides support for
the command line options for packet capture.
./build/app/dpdk_pdump --
--pdump '(port=<port id> | device_id=<pci id or vdev name>),
(queue=<queue id>),
(rx-dev=<iface or pcap file> |
tx-dev=<iface or pcap file>),
[ring-size=<ring size>],
[mbuf-size=<mbuf data size>],
[total-num-mbufs=<number of mbufs>]'
Parameters inside the parentheses represent the mandatory parameters.
Parameters inside the square brackets represent optional parameters.
Users have to pass the packet capture parameters under the --pdump option; multiple
--pdump options can be passed to capture packets on different port and queue combinations.
Operation:
----------
* The tool parses the user command line arguments, and
creates the mempool, the ring and the PCAP PMD vdev with 'tx_stream' set to either
of the devices passed in the rx-dev|tx-dev parameters.
* It then calls the librte_pdump APIs, i.e. rte_pdump_enable()/rte_pdump_enable_by_deviceid(),
to enable packet capturing on a specific port/device_id and queue by passing on the
port|device_id, queue, mempool and ring info.
* The tool runs in a while loop to dequeue the packets from the ring and write them to the pcap device.
* The tool can be stopped using SIGINT, upon which it calls
rte_pdump_disable()/rte_pdump_disable_by_deviceid() and frees the allocated resources.
Note:
CONFIG_RTE_LIBRTE_PMD_PCAP flag should be set to yes to compile and run the pdump tool.
3)Test-pmd changes
==================
Changes are done to the test-pmd application to initialize/uninitialize the packet capture framework,
so the app/pdump tool can be run to see packets of dpdk ports that are used by test-pmd.
Similarly, any application which needs packet capture should call the initialize/uninitialize APIs of
librte_pdump and use the pdump tool to start the capture.
4)Packet capture flow between pdump tool and librte_pdump
=========================================================
* The pdump tool (secondary process) requests packet capture
for specific port|device_id and queue combinations.
* The library, in the secondary process context, creates a client socket and communicates
the port|device_id, queue, ring and mempool to the server.
* The library initializes the server in the primary process 'test-pmd' context, and the server serves
the client request to enable Ethernet rx/tx call-backs for a given port|device_id and queue.
* The rx/tx packets are copied to the passed mempool and enqueued to the ring for the secondary process.
* The pdump tool dequeues the packets from the ring and writes them to the PCAP PMD vdev,
so ultimately the packets will be seen on the device that is passed in rx-dev|tx-dev.
* Once the pdump tool is terminated with SIGINT, it disables the packet capturing.
* The library receives the disable packet capture request, communicates the info to the server,
and the server removes the Ethernet rx/tx call-backs.
* The packet capture can be viewed using the tcpdump command
"tcpdump -ni <iface>" (or) "tcpdump -nr <pcapfile>"
5)Example command line
======================
./build/app/dpdk_pdump -- --pdump 'device_id=0000:02:0.0,queue=*,tx-dev=/tmp/dt-file.pcap,rx-dev=/tmp/dr-file.pcap,ring-size=8192,mbuf-size=2176,total-num-mbufs=32768' --pdump 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/d-file.pcap,tx-dev=/tmp/d-file.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'
v5:
addressed code review comments for below patches
http://dpdk.org/dev/patchwork/patch/12955/
http://dpdk.org/dev/patchwork/patch/12951/
v4:
added missing deprecation notice for ABI changes of rte_eth_dev_info structure.
made doc changes as per doc guidelines.
replaced rte_eal_vdev_init with rte_eth_dev_attach in pdump tool.
removed rxtx-dev parameter from pdump tool command line.
v3:
app/pdump: Moved cleanup code from signal handler to main.
divided librte_ether changes into multiple patches.
example command changed in app/pdump application guide
v2:
fix compilation issues for 4.8.3
fix unnecessary #includes
Reshma Pattan (9):
librte_ether: protect add/remove of rxtx callbacks with spinlocks
librte_ether: add new api rte_eth_add_first_rx_callback
librte_ether: add new fields to rte_eth_dev_info struct
librte_ether: make rte_eth_dev_get_port_by_name
rte_eth_dev_get_name_by_port public
lib/librte_pdump: add new library for packet capturing support
app/pdump: add pdump tool for packet capturing
app/test-pmd: add pdump initialization uninitialization
doc: update doc for packet capture framework
doc: announce ABI change for rte_eth_dev_info structure
MAINTAINERS | 8 +
app/Makefile | 1 +
app/pdump/Makefile | 45 ++
app/pdump/main.c | 814 +++++++++++++++++++++++++++++++
app/test-pmd/testpmd.c | 6 +
config/common_base | 5 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 107 ++++
doc/guides/rel_notes/deprecation.rst | 6 +
doc/guides/rel_notes/release_16_07.rst | 13 +
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 +++++
lib/Makefile | 1 +
lib/librte_ether/rte_ethdev.c | 123 +++--
lib/librte_ether/rte_ethdev.h | 59 +++
lib/librte_ether/rte_ether_version.map | 9 +
lib/librte_pdump/Makefile | 55 +++
lib/librte_pdump/rte_pdump.c | 841 ++++++++++++++++++++++++++++++++
lib/librte_pdump/rte_pdump.h | 186 +++++++
lib/librte_pdump/rte_pdump_version.map | 12 +
mk/rte.app.mk | 1 +
21 files changed, 2372 insertions(+), 44 deletions(-)
create mode 100644 app/pdump/Makefile
create mode 100644 app/pdump/main.c
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
create mode 100644 lib/librte_pdump/Makefile
create mode 100644 lib/librte_pdump/rte_pdump.c
create mode 100644 lib/librte_pdump/rte_pdump.h
create mode 100644 lib/librte_pdump/rte_pdump_version.map
--
2.5.0
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v5 9/9] doc: announce ABI change for rte_eth_dev_info structure
2016-06-08 13:38 3% ` [dpdk-dev] [PATCH v5 0/9] add packet capture framework Reshma Pattan
2016-06-08 13:38 5% ` [dpdk-dev] [PATCH v5 8/9] doc: update doc for " Reshma Pattan
@ 2016-06-08 13:38 9% ` Reshma Pattan
2016-06-08 16:15 4% ` Mcnamara, John
2016-06-09 8:50 2% ` [dpdk-dev] [PATCH v6 0/8] add packet capture framework Reshma Pattan
2 siblings, 1 reply; 200+ results
From: Reshma Pattan @ 2016-06-08 13:38 UTC (permalink / raw)
To: dev; +Cc: Reshma Pattan
New fields nb_rx_queues and nb_tx_queues will be added to
rte_eth_dev_info structure.
Changes to API rte_eth_dev_info_get() will be done to update
these new fields to rte_eth_dev_info object.
Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
---
doc/guides/rel_notes/deprecation.rst | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ad05eba..04316fb 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -57,3 +57,9 @@ Deprecation Notices
a handle, like the way kernel exposes an fd to user for locating a
specific file, and to keep all major structures internally, so that
we are likely to be free from ABI violations in future.
+
+* A librte_ether public structure ``rte_eth_dev_info`` will be changed in 16.07.
+ The proposed change will add new parameters ``nb_rx_queues``, ``nb_tx_queues``
+ to the structure. These are the number of queues configured by software.
+ Modification to definition of ``rte_eth_dev_info_get()`` will be done
+ to update new parameters to ``rte_eth_dev_info`` object.
--
2.5.0
^ permalink raw reply [relevance 9%]
* [dpdk-dev] [PATCH v5 8/9] doc: update doc for packet capture framework
2016-06-08 13:38 3% ` [dpdk-dev] [PATCH v5 0/9] add packet capture framework Reshma Pattan
@ 2016-06-08 13:38 5% ` Reshma Pattan
2016-06-08 13:38 9% ` [dpdk-dev] [PATCH v5 9/9] doc: announce ABI change for rte_eth_dev_info structure Reshma Pattan
2016-06-09 8:50 2% ` [dpdk-dev] [PATCH v6 0/8] add packet capture framework Reshma Pattan
2 siblings, 0 replies; 200+ results
From: Reshma Pattan @ 2016-06-08 13:38 UTC (permalink / raw)
To: dev; +Cc: Reshma Pattan
Added programmers guide for librte_pdump.
Added sample application guide for app/pdump application.
Updated release note for packet capture framework changes.
Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
---
MAINTAINERS | 3 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 107 ++++++++++++++++++++++++++++
doc/guides/rel_notes/release_16_07.rst | 13 ++++
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 ++++++++++++++++++++++++++++++++
6 files changed, 247 insertions(+)
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index a48c8de..ce7c941 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -436,6 +436,9 @@ Pdump
M: Reshma Pattan <reshma.pattan@intel.com>
F: lib/librte_pdump/
F: app/pdump/
+F: doc/guides/prog_guide/pdump_library.rst
+F: doc/guides/sample_app_ug/pdump.rst
+
Hierarchical scheduler
M: Cristian Dumitrescu <cristian.dumitrescu@intel.com>
diff --git a/doc/guides/prog_guide/index.rst b/doc/guides/prog_guide/index.rst
index b862d0c..4caf969 100644
--- a/doc/guides/prog_guide/index.rst
+++ b/doc/guides/prog_guide/index.rst
@@ -71,6 +71,7 @@ Programmer's Guide
writing_efficient_code
profile_app
glossary
+ pdump_library
**Figures**
diff --git a/doc/guides/prog_guide/pdump_library.rst b/doc/guides/prog_guide/pdump_library.rst
new file mode 100644
index 0000000..1809234
--- /dev/null
+++ b/doc/guides/prog_guide/pdump_library.rst
@@ -0,0 +1,107 @@
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+.. _pdump_library:
+
+The librte_pdump Library
+========================
+
+The ``librte_pdump`` library provides a framework for packet capturing in DPDK.
+The library provides the following APIs to initialize the packet capture framework, to enable
+or disable the packet capture, and to uninitialize it:
+
+* ``rte_pdump_init()``:
+ This API initializes the packet capture framework.
+
+* ``rte_pdump_enable()``:
+ This API enables the packet capture on a given port and queue.
+ Note: The filter option in the API is a place holder for future enhancements.
+
+* ``rte_pdump_enable_by_deviceid()``:
+ This API enables the packet capture on a given device id (``vdev name or pci address``) and queue.
+ Note: The filter option in the API is a place holder for future enhancements.
+
+* ``rte_pdump_disable()``:
+ This API disables the packet capture on a given port and queue.
+
+* ``rte_pdump_disable_by_deviceid()``:
+ This API disables the packet capture on a given device id (``vdev name or pci address``) and queue.
+
+* ``rte_pdump_uninit()``:
+ This API uninitializes the packet capture framework.
+
+
+Operation
+---------
+
+The ``librte_pdump`` library works on a client/server model. The server is responsible for enabling or
+disabling the packet capture and the clients are responsible for requesting the enabling or disabling of
+the packet capture.
+
+The packet capture framework, as part of its initialization, creates the pthread and the server socket in
+the pthread. The application that calls the framework initialization first will have the server socket created.
+Further calls to the framework initialization by the same application or other applications are not allowed, i.e., only
+one server socket is allowed on the system. So the other applications can only request enabling or disabling of
+the packet capture at which point the client socket is created for them to send the request to the server.
+The server socket will listen for client requests for enabling or disabling the packet capture.
+
+
+Implementation Details
+----------------------
+
+The library API ``rte_pdump_init()``, initializes the packet capture framework by creating the pthread and the server
+socket. The server socket in the pthread context will be listening to the client requests to enable or disable the
+packet capture. Whoever calls this API first will have the server socket created; subsequent calls to this API
+will not create any further server socket, i.e. only one server socket is allowed.
+
+The library APIs ``rte_pdump_enable()`` and ``rte_pdump_enable_by_deviceid()`` enables the packet capture.
+On each call to these APIs, the library creates a separate client socket, creates the "pdump enable" request and sends
+the request to the server. The server that is listening on the socket will take the request and enable the packet capture
+by registering the Ethernet RX and TX callbacks for the given port or device_id and queue combinations.
+Then the server will mirror the packets to the new mempool and enqueue them to the rte_ring that clients have passed
+to these APIs. The server also sends the response back to the client about the status of the request that was processed.
+After the response is received from the server, the client socket is closed.
+
+The library APIs ``rte_pdump_disable()`` and ``rte_pdump_disable_by_deviceid()`` disables the packet capture.
+On each call to these APIs, the library creates a separate client socket, creates the "pdump disable" request and sends
+the request to the server. The server that is listening on the socket will take the request and disable the packet
+capture by removing the Ethernet RX and TX callbacks for the given port or device_id and queue combinations. The server
+also sends the response back to the client about the status of the request that was processed. After the response is
+received from the server, the client socket is closed.
+
+The library API ``rte_pdump_uninit()``, uninitializes the packet capture framework by closing the pthread and the
+server socket.
+
+
+Use Case: Packet Capturing
+--------------------------
+
+The DPDK ``app/pdump`` tool is developed based on this library to capture packets in DPDK.
+Users can use this as an example to develop their own packet capturing application.
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 307e7c4..438b705 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -57,6 +57,11 @@ New Features
Enabled support for the AES CTR algorithm for Intel QuickAssist devices.
Provided support for algorithm-chaining operations.
+* **Added packet capture framework.**
+
+ * A new library ``librte_pdump`` is added to provide packet capture APIs.
+ * A new ``app/pdump`` tool is added to capture packets in DPDK.
+
Resolved Issues
---------------
@@ -126,6 +131,11 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* Function ``rte_eth_dev_get_port_by_name`` changed to a public API.
+
+* Function ``rte_eth_dev_info_get`` updated to return new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ in the ``rte_eth_dev_info`` object.
+
ABI Changes
-----------
@@ -137,6 +147,9 @@ ABI Changes
* The ``rte_port_source_params`` structure has new fields to support PCAP file.
It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
+* The ``rte_eth_dev_info`` structure has new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ to support number of queues configured by software.
+
Shared Library Versions
-----------------------
diff --git a/doc/guides/sample_app_ug/index.rst b/doc/guides/sample_app_ug/index.rst
index 930f68c..96bb317 100644
--- a/doc/guides/sample_app_ug/index.rst
+++ b/doc/guides/sample_app_ug/index.rst
@@ -76,6 +76,7 @@ Sample Applications User Guide
ptpclient
performance_thread
ipsec_secgw
+ pdump
**Figures**
diff --git a/doc/guides/sample_app_ug/pdump.rst b/doc/guides/sample_app_ug/pdump.rst
new file mode 100644
index 0000000..96c8709
--- /dev/null
+++ b/doc/guides/sample_app_ug/pdump.rst
@@ -0,0 +1,122 @@
+
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+dpdk_pdump Application
+======================
+
+The ``dpdk_pdump`` application is a Data Plane Development Kit (DPDK) application that runs as a DPDK secondary process and
+is capable of enabling packet capture on dpdk ports.
+
+
+Running the Application
+-----------------------
+
+The application has a ``--pdump`` command line option with various sub arguments:
+
+.. code-block:: console
+
+ ./build/app/dpdk_pdump --
+ --pdump '(port=<port id> | device_id=<pci id or vdev name>),
+ (queue=<queue_id>),
+ (rx-dev=<iface or pcap file> |
+ tx-dev=<iface or pcap file>),
+ [ring-size=<ring size>],
+ [mbuf-size=<mbuf data size>],
+ [total-num-mbufs=<number of mbufs>]'
+
+Note:
+
+* Parameters inside the parentheses represent mandatory parameters.
+
+* Parameters inside the square brackets represent optional parameters.
+
+Multiple instances of ``--pdump`` can be passed to capture packets on different port and queue combinations.
+
+
+Parameters
+~~~~~~~~~~
+
+``port``:
+Port id of the eth device on which packets should be captured.
+
+``device_id``:
+PCI address (or) name of the eth device on which packets should be captured.
+
+ .. Note::
+
+ * As of now the ``dpdk_pdump`` tool cannot capture the packets of virtual devices
+ in the primary process due to a bug in the ethdev library. Due to this bug, in a multi process context,
+ when the primary and secondary have different ports set, then the secondary process
+ (here the ``dpdk_pdump`` tool) overwrites the ``rte_eth_devices[]`` entries of the primary process.
+
+``queue``:
+Queue id of the eth device on which packets should be captured. The user can pass a queue value of ``*`` to enable
+packet capture on all queues of the eth device.
+
+``rx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+``tx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+ .. Note::
+
+ * To receive ingress packets only, ``rx-dev`` should be passed.
+
+ * To receive egress packets only, ``tx-dev`` should be passed.
+
+ * To receive ingress and egress packets separately ``rx-dev`` and ``tx-dev``
+ should both be passed with the different file names or the Linux iface names.
+
+ * To receive ingress and egress packets together, ``rx-dev`` and ``tx-dev``
+ should both be passed with the same file name or the same Linux iface name.
+
+``ring-size``:
+Size of the ring. This value is used internally for ring creation. The ring will be used to enqueue the packets from
+the primary application to the secondary. This is an optional parameter with default size 16384.
+
+``mbuf-size``:
+Size of the mbuf data. This is used internally for mempool creation. Ideally this value must be the same as
+the primary application's mempool mbuf data size, which is used for packet RX. This is an optional parameter with a
+default size of 2176.
+
+``total-num-mbufs``:
+Total number of mbufs in the mempool. This is used internally for mempool creation. This is an optional parameter with a default
+value of 65535.
+
+
+Example
+-------
+
+.. code-block:: console
+
+ $ sudo ./build/app/dpdk_pdump -- --pdump 'port=0,queue=*,rx-dev=/tmp/rx.pcap'
--
2.5.0
^ permalink raw reply [relevance 5%]
* Re: [dpdk-dev] [PATCH] mbuf: remove inconsistent assert statements
@ 2016-06-08 16:07 3% ` Ananyev, Konstantin
2016-06-09 7:46 0% ` Olivier Matz
0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2016-06-08 16:07 UTC (permalink / raw)
To: Olivier Matz, dev, Adrien Mazarguil
Hi Olivier,
>
> Hi Adrien, Konstantin,
>
> I'm jumping in this (interesting) discussion. I already talked a
> bit with Adrien in point to point, and I think its patch is valid.
> Please see some comments below.
>
>
> On 06/08/2016 03:57 PM, Adrien Mazarguil wrote:
> > On Wed, Jun 08, 2016 at 01:09:18PM +0000, Ananyev, Konstantin wrote:
> >>>
> >>> Hi Konstantin,
> >>>
> >>> On Wed, Jun 08, 2016 at 10:34:17AM +0000, Ananyev, Konstantin wrote:
> >>>> Hi Adrien,
> >>>>
> >>>>>
> >>>>> An assertion failure occurs in __rte_mbuf_raw_free() (called by a few PMDs)
> >>>>> when compiling DPDK with CONFIG_RTE_LOG_LEVEL=RTE_LOG_DEBUG and starting
> >>>>> applications with a log level high enough to trigger it.
> >>>>>
> >>>>> While rte_mbuf_raw_alloc() sets refcount to 1, __rte_mbuf_raw_free()
> >>>>> expects it to be 0.
> >>>>> Considering users are not expected to reset the
> >>>>> reference count to satisfy assert() and that raw functions are designed on
> >>>>> purpose without safety belts, remove these checks.
> >>>>
> >>>> Yes, its refcnt is supposed to be set to 0 by __rte_pktmbuf_prefree_seg().
> >>>> Right now, it is the user's responsibility to make sure refcnt==0 before pushing
> >>>> the mbuf back to the pool.
> >>>> Not sure why you consider that wrong?
> >>>
> >>> I do not consider this wrong and I'm all for using assert() to catch
> >>> programming errors, however in this specific case, I think they are
> >>> inconsistent and misleading.
> >>
> >> Honestly, I don't understand why.
> >> Right now the rule of thumb is - when mbuf is in the pool, it's refcnt should be equal zero.
>
> What is the purpose of this? Is there some code relying on this?
The whole current implementation of mbuf_free code path relies on that.
Straight here:
if (likely(NULL != (m = __rte_pktmbuf_prefree_seg(m)))) {
m->next = NULL;
__rte_mbuf_raw_free(m);
}
If we'll exclude indirect mbuf logic, all it does:
if (rte_mbuf_refcnt_update(m, -1) == 0) {
m->next = NULL;
__rte_mbuf_raw_free(m);
}
I.E.:
decrement mbuf->refcnt.
If new value of refcnt is zero, then put it back into the pool.
So having RTE_ASSERT(mbuf->refcnt==0) inside
__rte_mbuf_raw_free()/__rte_mbuf_raw_alloc()
looks absolutely valid to me.
It *has* to be zero at that point with the current implementation,
and if it is not then we probably have (or will have) a silent memory corruption.
>
> >> Yes, as you pointed out below - that rule probably can be changed to:
> >> when the mbuf is in the pool, its refcnt should equal one, and that would probably allow us
> >> to speed things up a bit, but I suppose that's the matter of another patch/discussion.
> >
> > Agreed.
>
> I agree this is somehow another discussion, but let's dive into :)
OK :)
>
> But since [1], it is allowed to call rte_mbuf_raw_alloc() in PMDs (all
> PMDs were calling an internal function before).
Yes, raw_alloc is public, NP with that.
> We could argue that
> rte_mbuf_raw_free() should also be made public for PMDs.
We could, but right now it is not.
Again, as I said, user could use it on his own but he obviously has to
obey the rules and do manually what __rte_pktmbuf_prefree_seg() does.
At least:
rte_mbuf_refcnt_set(m, 0);
__rte_mbuf_raw_free(m);
>
> As you said below, no-one promised that the free() reverts the malloc(),
> but given the function names, one can legitimately expect that the
> following code is valid:
>
> m = __rte_mbuf_raw_alloc();
> /* do nothing here */
> __rte_mbuf_raw_free(m);
Surely people could (and would) expect various things...
But the reality right now is: __rte_mbuf_raw_free() is an internal
function and not the counterpart of __rte_mbuf_raw_alloc().
If people don't bother to read the API docs and/or the actual code,
I can't see how we can help them :)
>
> If no code relies on having the refcnt set to 0 when a mbuf is in
> the pool, I suggest to relax this constraint as Adrien proposed.
Why not just rename it to __rte_mbuf_raw_free_dont_use_after_raw_alloc()?
To avoid any further confusion :)
Seriously speaking I would prefer to leave it as it is.
If you feel we have to introduce a counterpart of rte_mbuf_raw_alloc(),
we can make a new public one:
rte_mbuf_raw_free(struct rte_mbuf *m)
{
if (rte_mbuf_refcnt_update(m, -1) == 0)
__rte_mbuf_raw_free(m);
}
>
> Then, my opinion is that the refcount should be set to 1 in
> rte_pktmbuf_reset().
I don't think we need to update rte_pktmbuf_reset();
it doesn't touch refcnt at all, and it's probably better to keep it that way.
To achieve what you're suggesting, we probably need to:
1) update _rte_pktmbuf_prefree_seg and rte_pktmbuf_detach() to
set refcnt back to 1, i.e:
static inline struct rte_mbuf* __attribute__((always_inline))
__rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
{
__rte_mbuf_sanity_check(m, 0);
if (likely(rte_mbuf_refcnt_update(m, -1) == 0)) {
/* if this is an indirect mbuf, it is detached. */
if (RTE_MBUF_INDIRECT(m))
rte_pktmbuf_detach(m);
+ rte_mbuf_refcnt_set(m, 1);
return m;
}
return NULL;
}
2) either:
a) update mbuf constructor function, so it sets refcnt=1.
I suppose that is easy for rte_pktmbuf_init(), but it means that all custom
constructors should do the same.
Which means possible changes in existing user code and all ABI change related hassle.
b) keep rte_mbuf_raw_alloc() setting mbuf->refcnt=1.
But then I don't see how will get any performance gain here.
So not sure is it really worth it.
> And rte_pktmbuf_free() should not be allowed on
> an uninitialized mbuf (yes, this would require some changes in PMDs).
Not sure I understand you here...
free() would not be allowed on an mbuf whose refcnt==0?
But it is not allowed right now anyway.
> This would open the door for bulk allocation/free in the mbuf api.
Hmm, and what stops us from having one right now?
As far as I know we do have rte_pktmbuf_alloc_bulk(); I don't see
why we can't have rte_pktmbuf_free_bulk() right now.
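[A free_bulk counterpart could be a thin loop over the existing API; a hedged sketch, since no such helper exists in the tree at this point.]

#include <rte_mbuf.h>

static inline void
rte_pktmbuf_free_bulk_sketch(struct rte_mbuf **mbufs, unsigned int n)
{
        unsigned int i;

        /* Each free decrements refcnt and returns the mbuf to its
         * pool once the count reaches zero, exactly as today. */
        for (i = 0; i < n; i++)
                rte_pktmbuf_free(mbufs[i]);
}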
>
> This could be done in several steps:
>
> 1/ remove these assert(), add introduce a public rte_mbuf_raw_free()
> 2/ announce that rte_pktmbuf_free() won't work on uninitialized mbufs
> in a future version, and ensure that all PMD are inline with this
> requirement
> 3/ later, move refcount initialization in rte_pktmbuf_reset()
>
> What do you think?
I still think that assert() is in the right place :)
*If* we'll change mbuf free code in a way that mbufs inside the pool
should have refcnt==1, then I think we'll change it to:
RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
But as I stated above, that change might cause some backward compatibility hassle: 2.a)
Or might not give us any performance gain: 2.b).
>
> Another option is to remove the rte_mbuf_raw_alloc()/rte_mbuf_raw_free()
> and use mempool calls.
Don't see why we have to remove them...
Basically we have a bug in a PMD, but instead of fixing it,
you guys suggest changing the mbuf code.
Sounds a bit strange to me.
Why not just make these PMDs call rte_mempool_put() directly, if you are sure it is safe here?
> But having a mbuf wrapper does not seem a bad
> thing to me.
We can add some extra wrapper then, something like:
#define __RTE_MBUF_PUT_FREED(m) (rte_mempool_put((m)->pool, m))
?
Konstantin
>
> By the way, __rte_pktmbuf_prefree_seg() is also an internal function.
> I think we should try to update the mbuf API so that the PMDs do not
> need to call these internal functions.
>
>
> [1]
> http://dpdk.org/browse/dpdk/commit/?id=fbfd99551ca370266f4bfff58ce441cf5cb1203a
>
>
> Regards,
> Olivier
>
>
> >
> >>>> If the user calls __rte_mbuf_raw_free() manualy it is his responsibility to make
> >>>> sure the mbuf's refcnt==0.
> >>>
> >>> Sure, however what harm does it cause (besides assert() to fail), since the
> >>> allocation function sets refcount to 1?
> >>>
> >>> Why having the allocation function set the refcount if we are sure it is
> >>> already 0 (assert() proves it). Removing rte_mbuf_refcnt_set(m, 1) can
> >>> surely improve performance.
> >>
> >> That's' just an assert() enabled when MBUF_DEBUG is on.
> >> Its sole purpose is to help troubleshoot the bugs and help to catch situations
> >> when someone silently updates mbufs supposed to be free.
> >
> > I perfectly understand and I cannot agree more with this explanation,
> > however the fact these functions are not symmetrical remains an issue that
> > needs to be addressed somehow in my opinion.
> >
> >>>> BTW, why are you doing it?
> >>>> The comment clearly states that the function is for internal use:
> >>>> /**
> >>>> * @internal Put mbuf back into its original mempool.
> >>>> * The use of that function is reserved for RTE internal needs.
> >>>> * Please use rte_pktmbuf_free().
> >>>> *
> >>>> * @param m
> >>>> * The mbuf to be freed.
> >>>> */
> >>>> static inline void __attribute__((always_inline))
> >>>> __rte_mbuf_raw_free(struct rte_mbuf *m)
> >>>
> >>> Several PMDs are using it anyway (won't name names, but I know one of them
> >>> quite well).
> >>
> >> Then it probably is a bug in these PMDs that need to be fixed.
> >>
> >>> I chose to modify this code instead of its users for the
> >>> following reasons:
> >>>
> >>> - Considering their names, these functions should be opposites and able to
> >>> work together like malloc()/free().
> >>
> >> These are internal functions.
> >> Comments in mbuf clearly state that library users shouldn't call them directly.
> >> They are written to fit internal librte_mbuf needs, and no-one ever promised
> >> malloc/free() compatibility here.
> >
> > So it cannot be provided for the sake of not providing it or is there a good
> > reason?
> >
> > What I meant is that since PMDs already made the mistake of using these
> > functions to benefit from the improved performance, DPDK being all about
> > performance and stuff, let them use it as intended. Perhaps we should drop
> > those "__" like for rte_mbuf_raw_alloc().
> >
> >>>
> >>> - PMDs are relying on these functions for performance reasons, we can assume
> >>> they took the extra care necessary to make sure it would work properly.
> >>
> >> That just doesn't seem correct to me.
> >> The proper way to do free fo mbuf segment is:
> >>
> >> static inline void __attribute__((always_inline))
> >> rte_pktmbuf_free_seg(struct rte_mbuf *m)
> >> {
> >> if (likely(NULL != (m = __rte_pktmbuf_prefree_seg(m)))) {
> >> m->next = NULL;
> >> __rte_mbuf_raw_free(m);
> >> }
> >> }
> >>
> >> If by some reason you choose not to use this function, then it is your
> >> responsibility to perform similar actions on your own before pushing mbuf into the pool.
> >> That's what some TX functions for some Intel NICs do to improve performance:
> >> they call _prefree_seg() manually and try to put mbufs into the pool in groups.
> >
> > Not anymore it seems, but in the current code base both ena and mpipe PMDs
> > (soon mlx5 as well) seem to get this wrong.
> >
> >>> - Preventing it would make these PMDs slower and is not acceptable either.
> >>
> >> I can hardly imagine that __rte_pktmbuf_prefree_seg() impact would be that severe...
> >> But ok, probably you do have some very specific case, but then why you PMD just doesn't call:
> >> rte_mempool_put(m->pool, m);
> >> directly?
> >
> > To survive the upcoming transition to mbufs backed by libc malloc() without
> > having to fix them? Joke aside, I guess the reason is to use functions with
> > "mbuf" in their names when dealing with mbufs.
> >
> >> Why instead you choose to change common functions and compromise
> >> librte_mbuf debug ability?
> >
> > No, I'm fine with keeping the debug ability, however I did not find a way to
> > both keep it and fix the consistency issue without breaking something
> > (performance or refcount assumptions I'm not familiar with elsewhere).
> >
> >>> What remains is the consistency issue, I think these statements were only
> >>> added to catch multiple frees,
> >>
> >> Yes these asserts() here to help catch bugs,
> >> and I think it is a good thing to have them here.
> >>
> >>> and those should be caught at a higher
> >>> level, where other consistency checks are also performed.
> >>
> >> Like where?
> >
> > Possibly rte_pktmbuf_free_seg().
> >
> >>>>> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> >>>>> ---
> >>>>> lib/librte_mbuf/rte_mbuf.h | 2 --
> >>>>> 1 file changed, 2 deletions(-)
> >>>>>
> >>>>> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> >>>>> index 11fa06d..7070bb8 100644
> >>>>> --- a/lib/librte_mbuf/rte_mbuf.h
> >>>>> +++ b/lib/librte_mbuf/rte_mbuf.h
> >>>>> @@ -1108,7 +1108,6 @@ static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
> >>>>> if (rte_mempool_get(mp, &mb) < 0)
> >>>>> return NULL;
> >>>>> m = (struct rte_mbuf *)mb;
> >>>>> - RTE_ASSERT(rte_mbuf_refcnt_read(m) == 0);
> >>>>> rte_mbuf_refcnt_set(m, 1);
> >>>>> __rte_mbuf_sanity_check(m, 0);
> >>>>>
> >>>>> @@ -1133,7 +1132,6 @@ __rte_mbuf_raw_alloc(struct rte_mempool *mp)
> >>>>> static inline void __attribute__((always_inline))
> >>>>> __rte_mbuf_raw_free(struct rte_mbuf *m)
> >>>>> {
> >>>>> - RTE_ASSERT(rte_mbuf_refcnt_read(m) == 0);
> >>>>> rte_mempool_put(m->pool, m);
> >>>>> }
> >>>>>
> >>>>> --
> >>>>> 2.1.4
> >>>>
> >>>
> >>> --
> >>> Adrien Mazarguil
> >>> 6WIND
> >
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v2] doc: fix code section in abi versioning doc
2016-05-20 14:08 13% [dpdk-dev] [PATCH v2] " John McNamara
@ 2016-06-08 16:46 4% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-06-08 16:46 UTC (permalink / raw)
To: John McNamara; +Cc: dev
2016-05-20 15:08, John McNamara:
> Fix broken console directive in the ABI validator section of the
> ABI versioning docs.
>
> Fixes: f1ef9794f9bd ("doc: add ABI guidelines")
>
> Signed-off-by: John McNamara <john.mcnamara@intel.com>
Applied, thanks
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [PATCH v5 9/9] doc: announce ABI change for rte_eth_dev_info structure
2016-06-08 13:38 9% ` [dpdk-dev] [PATCH v5 9/9] doc: announce ABI change for rte_eth_dev_info structure Reshma Pattan
@ 2016-06-08 16:15 4% ` Mcnamara, John
0 siblings, 0 replies; 200+ results
From: Mcnamara, John @ 2016-06-08 16:15 UTC (permalink / raw)
To: Pattan, Reshma, dev; +Cc: Pattan, Reshma
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Reshma Pattan
> Sent: Wednesday, June 8, 2016 2:38 PM
> To: dev@dpdk.org
> Cc: Pattan, Reshma <reshma.pattan@intel.com>
> Subject: [dpdk-dev] [PATCH v5 9/9] doc: announce ABI change for
> rte_eth_dev_info structure
>
> New fields nb_rx_queues and nb_tx_queues will be added to the rte_eth_dev_info
> structure.
> Changes to the API rte_eth_dev_info_get() will be done to update these new
> fields in the rte_eth_dev_info object.
>
> Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
> ---
> doc/guides/rel_notes/deprecation.rst | 6 ++++++
> 1 file changed, 6 insertions(+)
>
Hi,
This isn't required in this patchset since the deprecation has already occurred in this release.
John
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [PATCH] mbuf: remove inconsistent assert statements
2016-06-08 16:07 3% ` Ananyev, Konstantin
@ 2016-06-09 7:46 0% ` Olivier Matz
2016-06-09 13:21 0% ` Ananyev, Konstantin
0 siblings, 1 reply; 200+ results
From: Olivier Matz @ 2016-06-09 7:46 UTC (permalink / raw)
To: Ananyev, Konstantin, dev, Adrien Mazarguil
Hi Konstantin,
>>>>>> Yes, the refcnt is supposed to be set to 0 by __rte_pktmbuf_prefree_seg().
>>>>>> Right now, it is the user's responsibility to make sure refcnt==0 before pushing
>>>>>> the mbuf back to the pool.
>>>>>> Not sure why you consider that wrong?
>>>>>
>>>>> I do not consider this wrong and I'm all for using assert() to catch
>>>>> programming errors, however in this specific case, I think they are
>>>>> inconsistent and misleading.
>>>>
>>>> Honestly, I don't understand why.
>>>> Right now the rule of thumb is - when an mbuf is in the pool, its refcnt should be equal to zero.
>>
>> What is the purpose of this? Is there some code relying on this?
>
> The whole current implementation of mbuf_free code path relies on that.
> Straight here:
> if (likely(NULL != (m = __rte_pktmbuf_prefree_seg(m)))) {
> m->next = NULL;
> __rte_mbuf_raw_free(m);
> }
>
> If we'll exclude indirect mbuf logic, all it does:
> if (rte_mbuf_refcnt_update(m, -1) == 0) {
> m->next = NULL;
> __rte_mbuf_raw_free(m);
> }
>
> I.E.:
> decrement mbuf->refcnt.
> If new value of refcnt is zero, then put it back into the pool.
>
> So having ASERT(mbuf->refcnt==0) inside
> __rte_mbuf_raw_free()/__rte_mbuf_raw_alloc()
> looks absolutely valid to me.
> It *has* to be zero at that point with the current implementation,
> And if it is not then we probably have (or will have) a silent memory corruption.
This explains how the refcount is used, and why it is set
to zero before returning the mbuf to the pool with the mbuf
free functions.
It does not explain which code relies on the refcnt being 0
while the mbuf is in the pool.
>> But since [1], it is allowed to call rte_mbuf_raw_alloc() in PMDs (all
>> PMDs were calling an internal function before).
>
> Yes, raw_alloc is public, NP with that.
>
>> We could argue that
>> rte_mbuf_raw_free() should also be made public for PMDs.
>
> We could, but right now it is not.
> Again, as I said, user could use it on his own but he obviously has to
> obey the rules and do manually what __rte_pktmbuf_prefree_seg() does.
> At least:
>
> rte_mbuf_refcnt_set(m, 0);
> __rte_mbuf_raw_free(m);
>
>>
>> As you said below, no-one promised that the free() reverts the malloc(),
>> but given the function names, one can legitimately expect that the
>> following code is valid:
>>
>> m = __rte_mbuf_raw_alloc();
>> /* do nothing here */
>> __rte_mbuf_raw_free(m);
>
> Surely people could (and would) expect various things...
> But the reality right now is: __rte_mbuf_raw_free() is an internal
> function and not the counterpart of __rte_mbuf_raw_alloc().
> If people don't bother to read the API docs and/or the actual code,
> I can't see how we can help them :)
Yes, of course, people should read the doc.
This does not prevent having a nice API that behaves in a
natural way :)
By the way, the fact that today the mbuf refcnt should be 0 while
in a pool is not in the api doc, but in the code.
>> If no code relies on having the refcnt set to 0 when a mbuf is in
>> the pool, I suggest to relax this constraint as Adrien proposed.
>
> Why not just rename it to __rte_mbuf_raw_free_dont_use_after_raw_alloc()?
> To avoid any further confusion :)
> Seriously speaking I would prefer to leave it as it is.
> If you feel we have to introduce a counterpart of rte_mbuf_raw_alloc(),
> we can make a new public one:
>
> rte_mbuf_raw_free(struct rte_mbuf *m)
> {
> if (rte_mbuf_refcnt_update(m, -1) == 0)
> __rte_mbuf_raw_free(m);
> }
This is an option, but I think it's not efficient to touch
the mbuf structure when allocating/freeing. See below.
>> Then, my opinion is that the refcount should be set to 1 in
>> rte_pktmbuf_reset().
>
> I don't think we need to update rte_pktmbuf_reset(),
> it doesn't touch refcnt at all and it's probably better to keep it that way.
Why would it be better?
All mbuf struct initializations are done in that function, why would
it be different for the refcnt?
> To achieve what you suggesting, we probably need to:
> 1) update _rte_pktmbuf_prefree_seg and rte_pktmbuf_detach() to
> set refcnt back to 1, i.e:
>
> static inline struct rte_mbuf* __attribute__((always_inline))
> __rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
> {
> __rte_mbuf_sanity_check(m, 0);
>
> if (likely(rte_mbuf_refcnt_update(m, -1) == 0)) {
> /* if this is an indirect mbuf, it is detached. */
> if (RTE_MBUF_INDIRECT(m))
> rte_pktmbuf_detach(m);
> + rte_mbuf_refcnt_set(m, 1);
> return m;
> }
> return NULL;
> }
>
> 2) either:
> a) update mbuf constructor function, so it sets refcnt=1.
> I suppose that is easy for rte_pktmbuf_init(), but it means that all custom
> constructors should do the same.
> Which means possible changes in existing user code and all ABI change related hassle.
> b) keep rte_mbuf_raw_alloc() setting mbuf->refcnt=1.
> But then I don't see how will get any performance gain here.
>
> So not sure is it really worth it.
>
>> And rte_pktmbuf_free() should not be allowed on
>> an uninitialized mbuf (yes, this would require some changes in PMDs).
>
> Not sure I understand you here...
> free() would not be allowed on an mbuf whose refcnt==0?
> But it is not allowed right now anyway.
Today:
/* allowed */
m = rte_pktmbuf_alloc();
rte_pktmbuf_free(m);
/* not allowed */
m = rte_mbuf_raw_alloc();
__rte_mbuf_raw_free(m);
/* we should do instead (strange): */
m = rte_mbuf_raw_alloc();
rte_pktmbuf_free(m);
What I suggest to have:
/* allowed, no change */
m = rte_pktmbuf_alloc();
rte_pktmbuf_free(m);
/* allowed, these functions would be symmetrical */
m = rte_mbuf_raw_alloc();
rte_mbuf_raw_free(m);
/* not allowed, m->refcnt is uninitialized */
m = rte_mbuf_raw_alloc();
rte_pktmbuf_free(m);
>
>> This would open the door for bulk allocation/free in the mbuf api.
>
> Hmm and what stops us having one right now?
> As I know we do have rte_pktmbuf_alloc_bulk(), I don't see
> why we can't have rte_pktmbuf_free_bulk() right now.
I think, not touching the refcnt on raw allocation allows a PMD to
do the following:
rte_mbuf_alloc_raw_bulk(pool, mbufs, n)
for (i = 0; i < n; i++) {
prefetch(mbufs[i+1]);
rte_pktmbuf_reset(mbufs[i]); /* including refcnt = 1 */
do_stuff(mbufs[i]);
}
/* if not all mbufs are used, free the remaining ones */
rte_mbuf_free_raw_bulk(&mbufs[i], n-i);
In that case, we can prefetch mbufs before using them, and
we can free the unused mbufs without touching the structure.
It looks to be a good advantage.
Yes, we can do that with mempool functions. But I feel having a mbuf
API with type checking is better.
>> This could be done in several steps:
>>
>> 1/ remove these assert(), and introduce a public rte_mbuf_raw_free()
>> 2/ announce that rte_pktmbuf_free() won't work on uninitialized mbufs
>> in a future version, and ensure that all PMDs are in line with this
>> requirement
>> 3/ later, move refcount initialization in rte_pktmbuf_reset()
>>
>> What do you think?
>
> I still think that assert() is in the right place :)
> *If* we'll change mbuf free code in a way that mbufs inside the pool
> should have refcnt==1, then I think we'll change it to:
> RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
> But as I stated above, that change might cause some backward compatibility hassle: 2.a)
> Or might not give us any performance gain: 2.b).
I suggest instead to have no constraint on the value of the refcnt
in the pool.
>> Another option is to remove the rte_mbuf_raw_alloc()/rte_mbuf_raw_free()
>> and use mempool calls.
>
> Don't see why we have to remove them...
> Basically we have a bug in a PMD, but instead of fixing it,
> you guys suggest changing the mbuf code.
> Sounds a bit strange to me.
> Why not just make these PMDs call rte_mempool_put() directly, if you are sure it is safe here?
Yes, there are some bugs in the PMDs regarding the refcnt value when
compiled with MBUF_DEBUG=y. By the way, looking at the i40e code, we
have:
i40e_alloc_rx_queue_mbufs()
{
for (...) {
struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mp);
...
rte_mbuf_refcnt_set(mbuf, 1); /* not needed */
...
}
i40e_tx_free_bufs()
{
...
if (txq->txq_flags & (uint32_t)ETH_TXQ_FLAGS_NOREFCOUNT) {
for (...) {
/* returning a mbuf with refcnt=1 */
rte_mempool_put(txep->mbuf->pool, txep->mbuf);
...
The fact that we can find many bugs related to that in PMDs is a
sign that the API is not understandable enough.
The point is not to fix the bugs in PMDs. I think the point is
to enhance the mbuf API.
Hope I have convinced you ;)
>
>> But having a mbuf wrapper does not seem a bad
>> thing to me.
>
> We can add some extra wrapper then, something like:
> #define __RTE_MBUF_PUT_FREED(m) (rte_mempool_put((m)->pool, m))
> ?
I think a wrapper should do the type checking (struct mbuf).
Thanks for this exchange.
Regards,
Olivier
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH 2/8] lib/librte_ether: define RX/TX lock mode
2016-06-08 7:34 0% ` Lu, Wenzhuo
@ 2016-06-09 7:50 0% ` Olivier Matz
2016-06-12 5:25 0% ` Lu, Wenzhuo
0 siblings, 1 reply; 200+ results
From: Olivier Matz @ 2016-06-09 7:50 UTC (permalink / raw)
To: Lu, Wenzhuo, Stephen Hemminger; +Cc: dev, Tao, Zhe
Hi,
On 06/08/2016 09:34 AM, Lu, Wenzhuo wrote:
> Hi Stephen,
>
>
>> -----Original Message-----
>> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
>> Sent: Wednesday, June 8, 2016 10:16 AM
>> To: Lu, Wenzhuo
>> Cc: dev@dpdk.org; Tao, Zhe
>> Subject: Re: [dpdk-dev] [PATCH 2/8] lib/librte_ether: define RX/TX lock mode
>>
>> On Mon, 6 Jun 2016 13:40:47 +0800
>> Wenzhuo Lu <wenzhuo.lu@intel.com> wrote:
>>
>>> Define lock mode for RX/TX queues, because when resetting the device we
>>> want the resetting thread to take the lock of the RX/TX queue to make
>>> sure RX/TX is stopped.
>>>
>>> Using the next ABI macro for this ABI change as it has too much impact: 7
>>> APIs and 1 global variable are impacted.
>>>
>>> Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
>>> Signed-off-by: Zhe Tao <zhe.tao@intel.com>
>>
>> Why does this patch set make a different assumption than the rest of the DPDK?
>>
>> The rest of the DPDK operates on the principle that the application is smart
>> enough to stop the device before making changes. There is no equivalent to the
>> Linux kernel RTNL mutex. The API assumes application threads are well behaved
>> and will not try and sabotage each other.
>>
>> If you restrict the reset operation to only being available when RX/TX is stopped,
>> then no lock is needed.
>>
>> The fact that it requires lots more locking inside each device driver implies to me
>> this is not the correct way to architect this.
+1
I'm not sure adding locks is the proper way to do this.
This is the application responsibility to ensure that:
- control functions are not called concurrently on the same port
- rx/tx functions are not called when the device is stopped/reset/...
However, I do think the usage paradigms of the ethdev api should be
better documented in rte_ethdev.h (ex: which functions can be called
concurrently). This would be a first step.
If we really want a helper API to do that in DPDK, the _next_ step
could be to add helpers to the ethdev API. Maybe
something like (the function names could be better):
- to be called on one control thread:
rte_eth_stop_rxtx(port)
rte_eth_start_rxtx(port)
rte_eth_get_rxtx_state(port)
-> return "running" if at least one core is inside the rx/tx code
-> return "stopped" if all cores are outside the rx/tx code
- to be called on dataplane cores:
/* same than rte_eth_rx_burst(), but checks if rx/tx is allowed
* first, else do nothing */
rte_eth_rx_burst_interruptible()
rte_eth_tx_burst_interruptible()
The code of control thread could be:
rte_eth_stop_rxtx(port);
/* wait that all dataplane cores finished their processing */
while (rte_eth_get_rxtx_state(port) != stopped)
;
rte_eth_some_control_operation(port);
rte_eth_start_rxtx(port);
I think this could be done without any lock, just with the proper
memory barriers and a per-core status.
But this API may impose a paradigm to the application, and I'm not
sure the DPDK should do that.
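To illustrate the rx side of that idea, a rough sketch only (the names
are illustrative, not a proposal for the final API):

static volatile int rxtx_allowed = 1;
static volatile int lcore_in_rxtx[RTE_MAX_LCORE];

static inline uint16_t
rte_eth_rx_burst_interruptible(uint8_t port, uint16_t queue,
			       struct rte_mbuf **pkts, uint16_t n)
{
	uint16_t nb_rx = 0;

	lcore_in_rxtx[rte_lcore_id()] = 1;
	rte_smp_mb(); /* publish our state before testing the flag */
	if (rxtx_allowed)
		nb_rx = rte_eth_rx_burst(port, queue, pkts, n);
	lcore_in_rxtx[rte_lcore_id()] = 0;
	return nb_rx;
}

The control thread would clear rxtx_allowed, issue a barrier, and poll
lcore_in_rxtx[] until all entries are zero before touching the device.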
Regards,
Olivier
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v6 8/8] doc: update doc for packet capture framework
2016-06-09 8:50 2% ` [dpdk-dev] [PATCH v6 0/8] add packet capture framework Reshma Pattan
@ 2016-06-09 8:50 5% ` Reshma Pattan
2016-06-09 16:10 2% ` [dpdk-dev] [PATCH v7 0/8] add " Reshma Pattan
1 sibling, 0 replies; 200+ results
From: Reshma Pattan @ 2016-06-09 8:50 UTC (permalink / raw)
To: dev; +Cc: Reshma Pattan
Added a programmer's guide for librte_pdump.
Added a sample application guide for the app/pdump application.
Updated the release notes for the packet capture framework changes.
Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
Acked-by: John McNamara <john.mcnamara@intel.com>
---
MAINTAINERS | 3 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 107 ++++++++++++++++++++++++++++
doc/guides/rel_notes/release_16_07.rst | 13 ++++
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 ++++++++++++++++++++++++++++++++
6 files changed, 247 insertions(+)
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index a48c8de..ce7c941 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -436,6 +436,9 @@ Pdump
M: Reshma Pattan <reshma.pattan@intel.com>
F: lib/librte_pdump/
F: app/pdump/
+F: doc/guides/prog_guide/pdump_library.rst
+F: doc/guides/sample_app_ug/pdump.rst
+
Hierarchical scheduler
M: Cristian Dumitrescu <cristian.dumitrescu@intel.com>
diff --git a/doc/guides/prog_guide/index.rst b/doc/guides/prog_guide/index.rst
index b862d0c..4caf969 100644
--- a/doc/guides/prog_guide/index.rst
+++ b/doc/guides/prog_guide/index.rst
@@ -71,6 +71,7 @@ Programmer's Guide
writing_efficient_code
profile_app
glossary
+ pdump_library
**Figures**
diff --git a/doc/guides/prog_guide/pdump_library.rst b/doc/guides/prog_guide/pdump_library.rst
new file mode 100644
index 0000000..1809234
--- /dev/null
+++ b/doc/guides/prog_guide/pdump_library.rst
@@ -0,0 +1,107 @@
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+.. _pdump_library:
+
+The librte_pdump Library
+========================
+
+The ``librte_pdump`` library provides a framework for packet capturing in DPDK.
+The library provides the following APIs to initialize the packet capture framework, to enable
+or disable the packet capture, and to uninitialize it:
+
+* ``rte_pdump_init()``:
+ This API initializes the packet capture framework.
+
+* ``rte_pdump_enable()``:
+ This API enables the packet capture on a given port and queue.
+ Note: The filter option in the API is a place holder for future enhancements.
+
+* ``rte_pdump_enable_by_deviceid()``:
+ This API enables the packet capture on a given device id (``vdev name or pci address``) and queue.
+ Note: The filter option in the API is a place holder for future enhancements.
+
+* ``rte_pdump_disable()``:
+ This API disables the packet capture on a given port and queue.
+
+* ``rte_pdump_disable_by_deviceid()``:
+ This API disables the packet capture on a given device id (``vdev name or pci address``) and queue.
+
+* ``rte_pdump_uninit()``:
+ This API uninitializes the packet capture framework.
+
+
+Operation
+---------
+
+The ``librte_pdump`` library works on a client/server model. The server is responsible for enabling or
+disabling the packet capture and the clients are responsible for requesting the enabling or disabling of
+the packet capture.
+
+The packet capture framework, as part of its initialization, creates the pthread and the server socket in
+the pthread. The application that calls the framework initialization first will have the server socket created.
+Further calls to the framework initialization by the same application or other applications are not allowed, i.e., only
+one server socket is allowed on the system. So the other applications can only request enabling or disabling of
+the packet capture at which point the client socket is created for them to send the request to the server.
+The server socket will listen for client requests for enabling or disabling the packet capture.
+
+
+Implementation Details
+----------------------
+
+The library API ``rte_pdump_init()``, initializes the packet capture framework by creating the pthread and the server
+socket. The server socket in the pthread context will be listening to the client requests to enable or disable the
+packet capture. Whoever calls this API first will have the server socket created; subsequent calls to this API
+will not create any further server socket, i.e. only one server socket is allowed.
+
+The library APIs ``rte_pdump_enable()`` and ``rte_pdump_enable_by_deviceid()`` enables the packet capture.
+On each call to these APIs, the library creates a separate client socket, creates the "pdump enable" request and sends
+the request to the server. The server that is listening on the socket will take the request and enable the packet capture
+by registering the Ethernet RX and TX callbacks for the given port or device_id and queue combinations.
+Then the server will mirror the packets to the new mempool and enqueue them to the rte_ring that clients have passed
+to these APIs. The server also sends the response back to the client about the status of the request that was processed.
+After the response is received from the server, the client socket is closed.
+
+The library APIs ``rte_pdump_disable()`` and ``rte_pdump_disable_by_deviceid()`` disables the packet capture.
+On each call to these APIs, the library creates a separate client socket, creates the "pdump disable" request and sends
+the request to the server. The server that is listening on the socket will take the request and disable the packet
+capture by removing the Ethernet RX and TX callbacks for the given port or device_id and queue combinations. The server
+also sends the response back to the client about the status of the request that was processed. After the response is
+received from the server, the client socket is closed.
+
+The library API ``rte_pdump_uninit()``, uninitializes the packet capture framework by closing the pthread and the
+server socket.
+
+
+Use Case: Packet Capturing
+--------------------------
+
+The DPDK ``app/pdump`` tool is developed based on this library to capture packets in DPDK.
+Users can use this as an example to develop their own packet capturing application.
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index c0f6b02..a4de2a2 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -66,6 +66,11 @@ New Features
* Enable RSS per network interface through the configuration file.
* Streamline the CLI code.
+* **Added packet capture framework.**
+
+ * A new library ``librte_pdump`` is added to provide packet capture APIs.
+ * A new ``app/pdump`` tool is added to capture packets in DPDK.
+
Resolved Issues
---------------
@@ -135,6 +140,11 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* Function ``rte_eth_dev_get_port_by_name`` changed to a public API.
+
+* Function ``rte_eth_dev_info_get`` updated to return new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ in the ``rte_eth_dev_info`` object.
+
ABI Changes
-----------
@@ -146,6 +156,9 @@ ABI Changes
* The ``rte_port_source_params`` structure has new fields to support PCAP file.
It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
+* The ``rte_eth_dev_info`` structure has new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ to support number of queues configured by software.
+
Shared Library Versions
-----------------------
diff --git a/doc/guides/sample_app_ug/index.rst b/doc/guides/sample_app_ug/index.rst
index 930f68c..96bb317 100644
--- a/doc/guides/sample_app_ug/index.rst
+++ b/doc/guides/sample_app_ug/index.rst
@@ -76,6 +76,7 @@ Sample Applications User Guide
ptpclient
performance_thread
ipsec_secgw
+ pdump
**Figures**
diff --git a/doc/guides/sample_app_ug/pdump.rst b/doc/guides/sample_app_ug/pdump.rst
new file mode 100644
index 0000000..96c8709
--- /dev/null
+++ b/doc/guides/sample_app_ug/pdump.rst
@@ -0,0 +1,122 @@
+
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+dpdk_pdump Application
+======================
+
+The ``dpdk_pdump`` application is a Data Plane Development Kit (DPDK) application that runs as a DPDK secondary process and
+is capable of enabling packet capture on dpdk ports.
+
+
+Running the Application
+-----------------------
+
+The application has a ``--pdump`` command line option with various sub arguments:
+
+.. code-block:: console
+
+ ./build/app/dpdk_pdump --
+ --pdump '(port=<port id> | device_id=<pci id or vdev name>),
+ (queue=<queue_id>),
+ (rx-dev=<iface or pcap file> |
+ tx-dev=<iface or pcap file>),
+ [ring-size=<ring size>],
+ [mbuf-size=<mbuf data size>],
+ [total-num-mbufs=<number of mbufs>]'
+
+Note:
+
+* Parameters inside the parentheses represent mandatory parameters.
+
+* Parameters inside the square brackets represent optional parameters.
+
+Multiple instances of ``--pdump`` can be passed to capture packets on different port and queue combinations.
+
+
+Parameters
+~~~~~~~~~~
+
+``port``:
+Port id of the eth device on which packets should be captured.
+
+``device_id``:
+PCI address (or) name of the eth device on which packets should be captured.
+
+ .. Note::
+
+ * As of now the ``dpdk_pdump`` tool cannot capture the packets of virtual devices
+ in the primary process due to a bug in the ethdev library. Due to this bug, in a multi-process context,
+ when the primary and secondary processes have different ports set, the secondary process
+ (here the ``dpdk_pdump`` tool) overwrites the ``rte_eth_devices[]`` entries of the primary process.
+
+``queue``:
+Queue id of the eth device on which packets should be captured. The user can pass a queue value of ``*`` to enable
+packet capture on all queues of the eth device.
+
+``rx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+``tx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+ .. Note::
+
+ * To receive ingress packets only, ``rx-dev`` should be passed.
+
+ * To receive egress packets only, ``tx-dev`` should be passed.
+
+ * To receive ingress and egress packets separately, ``rx-dev`` and ``tx-dev``
+ should both be passed with different file names or Linux iface names.
+
+ * To receive ingress and egress packets together, ``rx-dev`` and ``tx-dev``
+ should both be passed with the same file name or the same Linux iface name.
+
+``ring-size``:
+Size of the ring. This value is used internally for ring creation. The ring will be used to enqueue the packets from
+the primary application to the secondary. This is an optional parameter with default size 16384.
+
+``mbuf-size``:
+Size of the mbuf data. This is used internally for mempool creation. Ideally this value must be the same as
+the primary application's mempool's mbuf data size which is used for packet RX. This is an optional parameter with
+default size 2176.
+
+``total-num-mbufs``:
+Total number of mbufs in the mempool. This is used internally for mempool creation. This is an optional parameter with default
+value 65535.
+
+
+Example
+-------
+
+.. code-block:: console
+
+ $ sudo ./build/app/dpdk_pdump -- --pdump 'port=0,queue=*,rx-dev=/tmp/rx.pcap'
--
2.5.0
^ permalink raw reply [relevance 5%]
* [dpdk-dev] [PATCH v6 0/8] add packet capture framework
2016-06-08 13:38 3% ` [dpdk-dev] [PATCH v5 0/9] add packet capture framework Reshma Pattan
2016-06-08 13:38 5% ` [dpdk-dev] [PATCH v5 8/9] doc: update doc for " Reshma Pattan
2016-06-08 13:38 9% ` [dpdk-dev] [PATCH v5 9/9] doc: announce ABI change for rte_eth_dev_info structure Reshma Pattan
@ 2016-06-09 8:50 2% ` Reshma Pattan
2016-06-09 8:50 5% ` [dpdk-dev] [PATCH v6 8/8] doc: update doc for " Reshma Pattan
2016-06-09 16:10 2% ` [dpdk-dev] [PATCH v7 0/8] add " Reshma Pattan
2 siblings, 2 replies; 200+ results
From: Reshma Pattan @ 2016-06-09 8:50 UTC (permalink / raw)
To: dev
This patch set include below changes
1)Changes to librte_ether.
2)A new library librte_pdump added for packet capture framework.
3)A new app/pdump tool added for packet capturing.
4)Test pmd changes done to initialize packet capture framework.
5)Documentation update.
1)librte_pdump
==============
To support packet capturing on dpdk Ethernet devices, a new library librte_pdump
is added. Users can develop their own packet capturing applications using the new library APIs.
Operation:
----------
The pdump library provides APIs to support packet capturing on dpdk Ethernet devices.
It provides APIs to initialize the packet capture framework, enable/disable
the packet capture, and uninitialize the packet capture framework.
The pdump library works on a client/server model.
The server is responsible for enabling/disabling the packet captures.
Clients are responsible for requesting the enabling/disabling of the
packet captures.
As part of the packet capture framework initialization, a pthread and
the server socket are created. Only one server socket is allowed on the system.
As part of enabling/disabling the packet capture, client sockets are created
and multiple client sockets are allowed.
Whoever calls initialization first will succeed with the initialization;
subsequent initialization calls are not allowed. So later users can only
request enabling/disabling of the packet capture.
Applications using the below APIs need to pass the port/device_id, queue, mempool and
ring parameters. The library uses the user-provided ring and mempool to mirror the rx/tx
packets of the port for users. Users need to dequeue the rings and write the packets
to a vdev (pcap/tuntap) to view the packets using standard tools.
Note:
The mempool and ring should be multi-consumer/multi-producer (mc/mp) capable.
The mempool mbuf size should be big enough to handle the rx/tx packets of a port.
APIs:
-----
rte_pdump_init()
rte_pdump_enable()
rte_pdump_enable_by_deviceid()
rte_pdump_disable()
rte_pdump_disable_by_deviceid()
rte_pdump_uninit()
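
For illustration, the enable path of a capturing application could look
roughly like below (a minimal sketch; the ring/mempool names and sizes are
arbitrary, see rte_pdump.h for the exact prototypes):

#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_ring.h>
#include <rte_pdump.h>

static int
start_capture(uint8_t port, uint16_t queue)
{
	/* ring and mempool must be mc/mp capable */
	struct rte_ring *ring = rte_ring_create("pdump_ring", 16384,
			rte_socket_id(), 0);
	struct rte_mempool *mp = rte_pktmbuf_pool_create("pdump_pool",
			65535, 0, 0, 2176, rte_socket_id());

	if (ring == NULL || mp == NULL)
		return -1;

	/* ask the server (primary process) to mirror the rx/tx packets
	 * of this port and queue into our ring; the filter argument is
	 * a place holder, so NULL is passed */
	return rte_pdump_enable(port, queue, RTE_PDUMP_FLAG_RXTX,
			ring, mp, NULL);
}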
2)app/pdump tool
================
Tool app/pdump is designed based on librte_pdump for packet capturing in DPDK.
This tool runs as a DPDK secondary process by default, and provides
command line options for packet capture.
./build/app/dpdk_pdump --
--pdump '(port=<port id> | device_id=<pci id or vdev name>),
(queue=<queue id>),
(rx-dev=<iface or pcap file> |
tx-dev=<iface or pcap file>),
[ring-size=<ring size>],
[mbuf-size=<mbuf data size>],
[total-num-mbufs=<number of mbufs>]'
Parameters inside the parentheses represent the mandatory parameters.
Parameters inside the square brackets represent optional parameters.
The user has to pass the packet capture parameters under --pdump; multiple
--pdump instances can be passed to capture packets on different port and queue combinations.
Operation:
----------
*Tool parses the user command line arguments,
creates the mempool, ring and the PCAP PMD vdev with 'tx_stream' as the
device passed in the rx-dev|tx-dev parameters.
*Then it calls the librte_pdump APIs, i.e. rte_pdump_enable()/rte_pdump_enable_by_deviceid(),
to enable packet capturing on a specific port/device_id and queue by passing the
port|device_id, queue, mempool and ring info.
*Tool runs in a while loop to dequeue the packets from the ring and write them to the
pcap device (see the sketch below).
*Tool can be stopped using SIGINT, upon which the tool calls
rte_pdump_disable()/rte_pdump_disable_by_deviceid() and frees the allocated resources.
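
In pseudo-C, the dequeue part of that loop is roughly (a simplified
sketch; error handling omitted, 'quit_signal' and 'vdev_port' are
illustrative names):

struct rte_mbuf *pkts[32];
uint16_t i, nb, sent;

while (!quit_signal) {
	nb = rte_ring_dequeue_burst(ring, (void **)pkts, 32);
	sent = rte_eth_tx_burst(vdev_port, 0, pkts, nb);
	/* drop whatever the pcap vdev did not accept */
	for (i = sent; i < nb; i++)
		rte_pktmbuf_free(pkts[i]);
}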
Note:
CONFIG_RTE_LIBRTE_PMD_PCAP flag should be set to yes to compile and run the pdump tool.
3)Test-pmd changes
==================
Changes are done to the test-pmd application to initialize/uninitialize the packet capture framework,
so the app/pdump tool can be run to see packets of the dpdk ports used by test-pmd.
Similarly, any application which needs packet capture should call the initialize/uninitialize APIs of
librte_pdump and use the pdump tool to start the capture.
4)Packet capture flow between pdump tool and librte_pdump
=========================================================
* Pdump tool (secondary process) requests packet capture
for specific port|device_id and queue combinations.
*The library, in the secondary process context, creates a client socket and communicates
the port|device_id, queue, ring and mempool to the server.
*The library initializes the server in the primary process 'test-pmd' context, and the server serves
the client request to enable the Ethernet rxtx call-backs for a given port|device_id and queue.
*The rx/tx packets are copied to the passed mempool and enqueued to the ring for the secondary process.
*Pdump tool dequeues the packets from the ring and writes them to the PCAP PMD vdev,
so ultimately the packets will be seen on the device that is passed in rx-dev|tx-dev.
*Once the pdump tool is terminated with SIGINT it will disable the packet capturing.
*The library receives the disable packet capture request, communicates the info to the server,
and the server removes the Ethernet rxtx call-backs.
*Packet capture can be seen using the tcpdump command
"tcpdump -ni <iface>" (or) "tcpdump -nr <pcapfile>"
5)Example command line
======================
./build/app/dpdk_pdump -- --pdump 'device_id=0000:02:0.0,queue=*,tx-dev=/tmp/dt-file.pcap,rx-dev=/tmp/dr-file.pcap,ring-size=8192,mbuf-size=2176,total-num-mbufs=32768' --pdump 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/d-file.pcap,tx-dev=/tmp/d-file.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'
v6:
removed below deprecation notice patch from patch set.
http://dpdk.org/dev/patchwork/patch/13372/
v5:
addressed code review comments for below patches
http://dpdk.org/dev/patchwork/patch/12955/
http://dpdk.org/dev/patchwork/patch/12951/
v4:
added missing deprecation notice for ABI changes of rte_eth_dev_info structure.
made doc changes as per doc guidelines.
replaced rte_eal_vdev_init with rte_eth_dev_attach in pdump tool.
removed rxtx-dev parameter from pdump tool command line.
v3:
app/pdump: Moved cleanup code from signal handler to main.
divided librte_ether changes into multiple patches.
example command changed in app/pdump application guide
v2:
fix compilation issues for 4.8.3
fix unnecessary #includes
Reshma Pattan (8):
librte_ether: protect add/remove of rxtx callbacks with spinlocks
librte_ether: add new api rte_eth_add_first_rx_callback
librte_ether: add new fields to rte_eth_dev_info struct
librte_ether: make rte_eth_dev_get_port_by_name
rte_eth_dev_get_name_by_port public
lib/librte_pdump: add new library for packet capturing support
app/pdump: add pdump tool for packet capturing
app/test-pmd: add pdump initialization uninitialization
doc: update doc for packet capture framework
MAINTAINERS | 8 +
app/Makefile | 1 +
app/pdump/Makefile | 45 ++
app/pdump/main.c | 814 +++++++++++++++++++++++++++++++
app/test-pmd/testpmd.c | 6 +
config/common_base | 5 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 107 ++++
doc/guides/rel_notes/release_16_07.rst | 13 +
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 +++++
lib/Makefile | 1 +
lib/librte_ether/rte_ethdev.c | 123 +++--
lib/librte_ether/rte_ethdev.h | 59 +++
lib/librte_ether/rte_ether_version.map | 9 +
lib/librte_pdump/Makefile | 55 +++
lib/librte_pdump/rte_pdump.c | 841 ++++++++++++++++++++++++++++++++
lib/librte_pdump/rte_pdump.h | 186 +++++++
lib/librte_pdump/rte_pdump_version.map | 12 +
mk/rte.app.mk | 1 +
20 files changed, 2366 insertions(+), 44 deletions(-)
create mode 100644 app/pdump/Makefile
create mode 100644 app/pdump/main.c
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
create mode 100644 lib/librte_pdump/Makefile
create mode 100644 lib/librte_pdump/rte_pdump.c
create mode 100644 lib/librte_pdump/rte_pdump.h
create mode 100644 lib/librte_pdump/rte_pdump_version.map
--
2.5.0
^ permalink raw reply [relevance 2%]
* Re: [dpdk-dev] [PATCH] mbuf: remove inconsistent assert statements
2016-06-09 7:46 0% ` Olivier Matz
@ 2016-06-09 13:21 0% ` Ananyev, Konstantin
0 siblings, 0 replies; 200+ results
From: Ananyev, Konstantin @ 2016-06-09 13:21 UTC (permalink / raw)
To: Olivier Matz, dev, Adrien Mazarguil; +Cc: Zhang, Helin
Hi Olivier,
> -----Original Message-----
> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Thursday, June 09, 2016 8:47 AM
> To: Ananyev, Konstantin; dev@dpdk.org; Adrien Mazarguil
> Subject: Re: [dpdk-dev] [PATCH] mbuf: remove inconsistent assert statements
>
> Hi Konstantin,
>
> >>>>>> Yes, the refcnt is supposed to be set to 0 by __rte_pktmbuf_prefree_seg().
> >>>>>> Right now, it is the user's responsibility to make sure refcnt==0 before pushing
> >>>>>> the mbuf back to the pool.
> >>>>>> Not sure why you consider that wrong?
> >>>>>
> >>>>> I do not consider this wrong and I'm all for using assert() to catch
> >>>>> programming errors, however in this specific case, I think they are
> >>>>> inconsistent and misleading.
> >>>>
> >>>> Honestly, I don't understand why.
> >>>> Right now the rule of thumb is - when an mbuf is in the pool, its refcnt should be equal to zero.
> >>
> >> What is the purpose of this? Is there some code relying on this?
> >
> > The whole current implementation of mbuf_free code path relies on that.
> > Straight here:
> > if (likely(NULL != (m = __rte_pktmbuf_prefree_seg(m)))) {
> > m->next = NULL;
> > __rte_mbuf_raw_free(m);
> > }
> >
> > If we'll exclude indirect mbuf logic, all it does:
> > if (rte_mbuf_refcnt_update(m, -1) == 0) {
> > m->next = NULL;
> > __rte_mbuf_raw_free(m);
> > }
> >
> > I.E.:
> > decrement mbuf->refcnt.
> > If new value of refcnt is zero, then put it back into the pool.
> >
> > So having ASERT(mbuf->refcnt==0) inside
> > __rte_mbuf_raw_free()/__rte_mbuf_raw_alloc()
> > looks absolutely valid to me.
> > It *has* to be zero at that point with the current implementation,
> > And if it is not then we probably have (or will have) a silent memory corruption.
>
> This explains how the refcount is used, and why it is set
> to zero before returning the mbuf to the pool with the mbuf
> free functions.
From my point of view, that shows that rte_pktmbuf_free() relies on the value of the refcnt
to decide whether it is ok to put the mbuf back to the pool or not.
Right now it puts the mbuf to the pool *only* if its refcnt==0.
As discussed below, we probably can change it to be refcnt==1
(if there really would be a noticeable performance gain).
But I think it still should be just one predefined value of refcnt (0 or 1).
In theory it is possible to allow both (0 and 1),
but that would make it hard to debug any alloc/free issues,
plus it would negate any possible performance gain -
as in that case raw_alloc (or its analog) should still do
mbuf->refcnt=1;
>
> It does not explain which code relies on the refcnt being 0
> while the mbuf is in the pool.
>
>
> >> But since [1], it is allowed to call rte_mbuf_raw_alloc() in PMDs (all
> >> PMDs were calling an internal function before).
> >
> > Yes, raw_alloc is public, NP with that.
> >
> >> We could argue that
> >> rte_mbuf_raw_free() should also be made public for PMDs.
> >
> > We could, but right now it is not.
> > Again, as I said, user could use it on his own but he obviously has to
> > obey the rules and do manually what __rte_pktmbuf_prefree_seg() does.
> > At least:
> >
> > rte_mbuf_refcnt_set(m, 0);
> > __rte_mbuf_raw_free(m);
> >
> >>
> >> As you said below, no-one promised that the free() reverts the malloc(),
> >> but given the function names, one can legitimately expect that the
> >> following code is valid:
> >>
> >> m = __rte_mbuf_raw_alloc();
> >> /* do nothing here */
> >> __rte_mbuf_raw_free(m);
> >
> > Surely people could (and would) expect various things...
> > But the reality right now is: __rte_mbuf_raw_free() is an internal
> > function and not the counterpart of __rte_mbuf_raw_alloc().
> > If people don't bother to read the API docs and/or the actual code,
> > I can't see how we can help them :)
>
> Yes, of course, people should read the doc.
> This does not prevent having a nice API that behaves in a
> natural way :)
>
> By the way, the fact that today the mbuf refcnt should be 0 while
> in a pool is not in the api doc, but in the code.
Ok, I admit there is a bug in the docs, let's add this line to the PG and fix it :)
>
> >> If no code relies on having the refcnt set to 0 when a mbuf is in
> >> the pool, I suggest to relax this constraint as Adrien proposed.
> >
> > Why not just rename it to __rte_mbuf_raw_free_dont_use_after_raw_alloc()?
> > To avoid any further confusion :)
> > Seriously speaking I would prefer to leave it as it is.
> > If you feel we have to introduce a counterpart of rte_mbuf_raw_alloc(),
> > we can make a new public one:
> >
> > rte_mbuf_raw_free(struct rte_mbuf *m)
> > {
> > if (rte_mbuf_refcnt_update(m, -1) == 0)
> > __rte_mbuf_raw_free(m);
> > }
>
> This is an option, but I think it's not efficient to touch
> the mbuf structure when allocating/freeing. See below.
I don't think it is totally avoidable for the generic case anyway.
>
> >> Then, my opinion is that the refcount should be set to 1 in
> >> rte_pktmbuf_reset().
> >
> > I don't think we need to update rte_pktmbuf_reset(),
> > it doesn't touch refcnt at all and it's probably better to keep it that way.
>
> Why would it be better?
First, because pktmbuf_reset() is not a very efficient one, and many PMDs
avoid using it.
Second, I think managing the refcnt is more part of alloc/free than of the init procedure.
> All mbuf struct initializations are done in that function, why would
> it be different for the refcnt?
>
> > To achieve what you suggesting, we probably need to:
> > 1) update _rte_pktmbuf_prefree_seg and rte_pktmbuf_detach() to
> > set refcnt back to 1, i.e:
> >
> > static inline struct rte_mbuf* __attribute__((always_inline))
> > __rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
> > {
> > __rte_mbuf_sanity_check(m, 0);
> >
> > if (likely(rte_mbuf_refcnt_update(m, -1) == 0)) {
> > /* if this is an indirect mbuf, it is detached. */
> > if (RTE_MBUF_INDIRECT(m))
> > rte_pktmbuf_detach(m);
> > + rte_mbuf_refcnt_set(m, 1);
> > return m;
> > }
> > return NULL;
> > }
> >
> > 2) either:
> > a) update mbuf constructor function, so it sets refcnt=1.
> > I suppose that is easy for rte_pktmbuf_init(), but it means that all custom
> > constructors should do the same.
> > Which means possible changes in existing user code and all ABI change related hassle.
> > b) keep rte_mbuf_raw_alloc() setting mbuf->refcnt=1.
> > But then I don't see how will get any performance gain here.
> >
> > So not sure is it really worth it.
> >
> >> And rte_pktmbuf_free() should not be allowed on
> >> an uninitialized mbuf (yes, this would require some changes in PMDs).
> >
> > Not sure I understand you here...
> > free() wouldn not be allowed on mbuf whose recnf==0?
> > But it is not allowed right now anyway.
>
> Today:
>
> /* allowed */
> m = rte_pktmbuf_alloc();
> rte_pktmbuf_free(m);
>
> /* not allowed */
> m = rte_mbuf_raw_alloc();
> __rte_mbuf_raw_free(m);
>
> /* we should do instead (strange): */
> m = rte_mbuf_raw_alloc();
> rte_pktmbuf_free(m);
>
> What I suggest to have:
>
> /* allowed, no change */
> m = rte_pktmbuf_alloc();
> rte_pktmbuf_free(m);
>
> /* allowed, these functions would be symetrical */
> m = rte_mbuf_raw_alloc();
> rte_mbuf_raw_free(m);
>
> /* not allowed, m->refcnt is uninitialized */
> m = rte_mbuf_raw_alloc();
> rte_pktmbuf_free(m);
Hmm, and what will it buy us (except symmetry)?
>
>
>
> >
> >> This would open the door for bulk allocation/free in the mbuf api.
> >
> > Hmm and what stops us having one right now?
> > As I know we do have rte_pktmbuf_alloc_bulk(), I don't see
> > why we can't have rte_pktmbuf_free_bulk() right now.
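> > E.g. a trivial version would be just (sketch):
> >
> > static inline void
> > rte_pktmbuf_free_bulk(struct rte_mbuf **pkts, unsigned n)
> > {
> > 	unsigned i;
> >
> > 	for (i = 0; i != n; i++)
> > 		rte_pktmbuf_free(pkts[i]);
> > }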
>
> I think, not touching the refcnt on raw allocation allows a PMD to
> do the following:
>
> rte_mbuf_alloc_raw_bulk(pool, mbufs, n)
> for (i = 0; i < n; i++) {
> prefetch(mbufs[i+1]);
> rte_pktmbuf_reset(mbufs[i]); /* including refcnt = 1 */
> do_stuff(mbufs[i]);
> }
You can do that part right now.
That's what many PMDs do; just the function names are a bit different:
rte_mempool_get_bulk(pool, (void **)mbufs, n);
for (i = 0; i < n; i++) {
custom_rte_pktmbuf_fill(mbufs[i]); /* including refcnt = 1 */
do_stuff(mbufs[i]);
}
> /* if not all mbufs are used, free the remaining ones */
> rte_mbuf_free_raw_bulk(&mbufs[i], n-i);
Ok, and what would be a performance-critical use case for that?
As far as I know, the only possible cases inside a PMD where it allocates & frees the same
mbuf internally are:
- error handling and queue stop (which are usually not that performance critical).
- the situation when you have cut CRC bytes and they are in a separate segment
(again a corner case).
What is the use case you have in mind for that?
>
> In that case, we can prefetch mbufs before using them,
You can do that now, look at rte_pktmbuf_alloc_bulk().
Nothing stops you from adding prefetch() here, if it really is faster -
an extra parameter for rte_pktmbuf_alloc_bulk() or a new function rte_pktmbuf_alloc_bulk_prefetch().
> and
> we can free the unused mbufs without touching the structure.
In many cases this is not possible anyway:
you can't guarantee that all mbufs you get from the upper layer are from the same pool,
and that they all have refcnt==1.
There are probably some custom cases where you can safely assume that,
but I am not sure we should restructure the whole mbuf API for that particular case.
In such situations some custom function should do.
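For example, something like this would do (a sketch; the caller must
guarantee that all mbufs are direct, have refcnt==1 and come from the
same pool):

static inline void
custom_tx_free_bulk(struct rte_mempool *mp, struct rte_mbuf **pkts,
		    unsigned n)
{
	unsigned i;

	/* mbufs go back to their pool in one bulk operation */
	for (i = 0; i != n; i++)
		pkts[i]->next = NULL;
	rte_mempool_put_bulk(mp, (void **)pkts, n);
}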
> It looks to be a good advantage.
>
> Yes, we can do that with mempool functions. But I feel having a mbuf
> API with type checking is better.
>
> >> This could be done in several steps:
> >>
> >> 1/ remove these assert(), and introduce a public rte_mbuf_raw_free()
> >> 2/ announce that rte_pktmbuf_free() won't work on uninitialized mbufs
> >> in a future version, and ensure that all PMDs are in line with this
> >> requirement
> >> 3/ later, move refcount initialization in rte_pktmbuf_reset()
> >>
> >> What do you think?
> >
> > I still think that assert() is on a right place :)
> > *If* we'll change mbuf free code in a way that mbufs inside the pool
> > should have refcnt==1, then I think we'll change it to:
> > RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
> > But as I stated above, that change might cause some backward compatibility hassle: 2.a)
> > Or might not give us any performance gain: 2.b).
>
> I suggest instead to have no constraint on the value of the refcnt
> in the pool.
>
> >> Another option is to remove the rte_mbuf_raw_alloc()/rte_mbuf_raw_free()
> >> and use mempool calls.
> >
> > Don't see why we have to remove them...
> > Basically we have a bug in a PMD, but instead of fixing it,
> > you guys suggest changing the mbuf code.
> > Sounds a bit strange to me.
> > Why not just make these PMDs call rte_mempool_put() directly, if you are sure it is safe here?
>
> Yes, there are some bugs in the PMDs regarding the refcnt value when
> compiled with MBUF_DEBUG=y.
Ok, then I suggest we fix them first, then start to discuss possible mbuf optimisations.
It is really strange when invalid usage of a generic API in some submodules drives a change of the generic API.
> By the way, looking at the i40e code, we
> have:
>
> i40e_alloc_rx_queue_mbufs()
> {
> for (...) {
> struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mp);
> ...
> rte_mbuf_refcnt_set(mbuf, 1); /* not needed */
> ...
> }
Yes, I agree it is unnecessary here, and could be safely removed,
but how does it relate to our discussion?
>
> i40e_tx_free_bufs()
> {
> ...
> if (txq->txq_flags & (uint32_t)ETH_TXQ_FLAGS_NOREFCOUNT) {
> for (...) {
> /* returning a mbuf with refcnt=1 */
> rte_mempool_put(txep->mbuf->pool, txep->mbuf);
> ...
Hmm, from what I see, that code is not correct anyway -
as we still can't guarantee that all mbufs we TX-ed are from the same pool.
So I think that code has to be removed.
Probably the i40e maintainers have a better explanation of why it is here.
Again, personally I think the ETH_TXQ_FLAGS_NOREFCOUNT flag should
also be removed (or deprecated) - I think it was left over from the old days.
>
> The fact that we can find many bugs related to that in PMDs is a
> sign that the API is not understandable enough.
For me it is just a sign that we need to put more effort into writing/reviewing the code.
>
> The point is not to fix the bugs in PMDs. I think the point is
> to enhance the mbuf API.
As I said - I think bugs in PMDs shouldn't drive API changes :)
They need to be fixed first.
BTW, I looked at who is using __rte_mbuf_raw_free() in the current DPDK code.
I found only two instances:
1) drivers/net/mpipe/mpipe_tilegx.c:
Inside mpipe_recv_flush_stack(struct mpipe_dev_priv *priv)
As far as I can see, that function does set mbuf->refcnt=1 before calling __raw_free.
So we should be fine here, I think.
2) drivers/net/ena/ena_ethdev.c
Inside ena_rx_queue_release_bufs().
Again, that one seems ok, as far as I can see from the code:
the ena PMD calls rte_mempool_get_bulk() and keeps mbuf->refcnt==0
until either:
- that mbuf is freed by ena_rx_queue_release_bufs()
- or it is filled by eth_ena_recv_pkts() - at that stage mbuf->refcnt is set to 1.
So another question is: which PMD is failing?
>
> Hope I have convinced you ;)
>
> >
> >> But having a mbuf wrapper does not seem a bad
> >> thing to me.
> >
> > We can add some extra wrapper then, something like:
> > #define __RTE_MBUF_PUT_FREED(m) (rte_mempool_put((m)->pool, m))
> > ?
>
> I think a wrapper should do the type checking (struct mbuf).
Sure, we can have a wrapper inline function, this is just an example.
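E.g. something like (sketch):

static inline void
rte_mbuf_put_freed(struct rte_mbuf *m)
{
	rte_mempool_put(m->pool, m);
}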
>
> Thanks for this exchange.
> Regards,
> Olivier
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH] log: deprecate history dump
@ 2016-06-09 14:09 6% Thomas Monjalon
2016-06-09 14:45 0% ` David Marchand
2016-06-09 15:06 5% ` [dpdk-dev] [PATCH v2] " Thomas Monjalon
0 siblings, 2 replies; 200+ results
From: Thomas Monjalon @ 2016-06-09 14:09 UTC (permalink / raw)
To: david.marchand; +Cc: dev
The log history uses rte_mempool. In order to remove the mempool
dependency in EAL (and improve the build), this feature is deprecated.
The ABI is kept but the behaviour is now voided because it seems this
function was not used. The history can be read from syslog.
Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
---
app/test-pmd/cmdline.c | 3 -
app/test/autotest_test_funcs.py | 5 --
app/test/commands.c | 4 +-
app/test/test_logs.c | 3 -
doc/guides/rel_notes/deprecation.rst | 3 +
lib/librte_eal/bsdapp/eal/eal_debug.c | 6 --
lib/librte_eal/common/eal_common_log.c | 128 +--------------------------
lib/librte_eal/common/include/rte_log.h | 8 ++
lib/librte_eal/linuxapp/eal/eal_debug.c | 6 --
lib/librte_eal/linuxapp/eal/eal_interrupts.c | 1 -
lib/librte_eal/linuxapp/eal/eal_ivshmem.c | 1 -
lib/librte_eal/linuxapp/eal/eal_log.c | 3 -
lib/librte_mempool/rte_mempool.c | 4 -
13 files changed, 16 insertions(+), 159 deletions(-)
diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 1921612..fd389ac 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -7268,8 +7268,6 @@ static void cmd_dump_parsed(void *parsed_result,
rte_dump_physmem_layout(stdout);
else if (!strcmp(res->dump, "dump_memzone"))
rte_memzone_dump(stdout);
- else if (!strcmp(res->dump, "dump_log_history"))
- rte_log_dump_history(stdout);
else if (!strcmp(res->dump, "dump_struct_sizes"))
dump_struct_sizes();
else if (!strcmp(res->dump, "dump_ring"))
@@ -7284,7 +7282,6 @@ cmdline_parse_token_string_t cmd_dump_dump =
TOKEN_STRING_INITIALIZER(struct cmd_dump_result, dump,
"dump_physmem#"
"dump_memzone#"
- "dump_log_history#"
"dump_struct_sizes#"
"dump_ring#"
"dump_mempool#"
diff --git a/app/test/autotest_test_funcs.py b/app/test/autotest_test_funcs.py
index b60b941..14cffd0 100644
--- a/app/test/autotest_test_funcs.py
+++ b/app/test/autotest_test_funcs.py
@@ -144,16 +144,11 @@ def logs_autotest(child, test_name):
i = 0
child.sendline(test_name)
- # logs sequence is printed twice because of history dump
log_list = [
"TESTAPP1: error message",
"TESTAPP1: critical message",
"TESTAPP2: critical message",
"TESTAPP1: error message",
- "TESTAPP1: error message",
- "TESTAPP1: critical message",
- "TESTAPP2: critical message",
- "TESTAPP1: error message",
]
for log_msg in log_list:
diff --git a/app/test/commands.c b/app/test/commands.c
index e0af8e4..2df46b0 100644
--- a/app/test/commands.c
+++ b/app/test/commands.c
@@ -150,8 +150,6 @@ static void cmd_dump_parsed(void *parsed_result,
rte_dump_physmem_layout(stdout);
else if (!strcmp(res->dump, "dump_memzone"))
rte_memzone_dump(stdout);
- else if (!strcmp(res->dump, "dump_log_history"))
- rte_log_dump_history(stdout);
else if (!strcmp(res->dump, "dump_struct_sizes"))
dump_struct_sizes();
else if (!strcmp(res->dump, "dump_ring"))
@@ -164,7 +162,7 @@ static void cmd_dump_parsed(void *parsed_result,
cmdline_parse_token_string_t cmd_dump_dump =
TOKEN_STRING_INITIALIZER(struct cmd_dump_result, dump,
- "dump_physmem#dump_memzone#dump_log_history#"
+ "dump_physmem#dump_memzone#"
"dump_struct_sizes#dump_ring#dump_mempool#"
"dump_devargs");
diff --git a/app/test/test_logs.c b/app/test/test_logs.c
index 05aa862..d0a9962 100644
--- a/app/test/test_logs.c
+++ b/app/test/test_logs.c
@@ -83,9 +83,6 @@ test_logs(void)
RTE_LOG(ERR, TESTAPP1, "error message\n");
RTE_LOG(ERR, TESTAPP2, "error message (not displayed)\n");
- /* print again the previous logs */
- rte_log_dump_history(stdout);
-
return 0;
}
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ad05eba..f11df93 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -8,6 +8,9 @@ API and ABI deprecation notices are to be posted here.
Deprecation Notices
-------------------
+* The log history is deprecated in release 16.07.
+ It is voided and will be completely removed in release 16.11.
+
* The ethdev hotplug API is going to be moved to EAL with a notification
mechanism added to crypto and ethdev libraries so that hotplug is now
available to both of them. This API will be stripped of the device arguments
diff --git a/lib/librte_eal/bsdapp/eal/eal_debug.c b/lib/librte_eal/bsdapp/eal/eal_debug.c
index 907fbfa..5fbc17c 100644
--- a/lib/librte_eal/bsdapp/eal/eal_debug.c
+++ b/lib/librte_eal/bsdapp/eal/eal_debug.c
@@ -77,9 +77,6 @@ void __rte_panic(const char *funcname, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
va_start(ap, format);
rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
@@ -98,9 +95,6 @@ rte_exit(int exit_code, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
if (exit_code != 0)
RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
" Cause: ", exit_code);
diff --git a/lib/librte_eal/common/eal_common_log.c b/lib/librte_eal/common/eal_common_log.c
index b5e37bb..94ecdd2 100644
--- a/lib/librte_eal/common/eal_common_log.c
+++ b/lib/librte_eal/common/eal_common_log.c
@@ -56,29 +56,11 @@
#include <rte_spinlock.h>
#include <rte_branch_prediction.h>
#include <rte_ring.h>
-#include <rte_mempool.h>
#include "eal_private.h"
#define LOG_ELT_SIZE 2048
-#define LOG_HISTORY_MP_NAME "log_history"
-
-STAILQ_HEAD(log_history_list, log_history);
-
-/**
- * The structure of a message log in the log history.
- */
-struct log_history {
- STAILQ_ENTRY(log_history) next;
- unsigned size;
- char buf[0];
-};
-
-static struct rte_mempool *log_history_mp = NULL;
-static unsigned log_history_size = 0;
-static struct log_history_list log_history;
-
/* global log structure */
struct rte_logs rte_logs = {
.type = ~0,
@@ -86,10 +68,7 @@ struct rte_logs rte_logs = {
.file = NULL,
};
-static rte_spinlock_t log_dump_lock = RTE_SPINLOCK_INITIALIZER;
-static rte_spinlock_t log_list_lock = RTE_SPINLOCK_INITIALIZER;
static FILE *default_log_stream;
-static int history_enabled = 1;
/**
* This global structure stores some informations about the message
@@ -106,59 +85,14 @@ static RTE_DEFINE_PER_LCORE(struct log_cur_msg, log_cur_msg);
/* default logs */
int
-rte_log_add_in_history(const char *buf, size_t size)
+rte_log_add_in_history(const char *buf __rte_unused, size_t size __rte_unused)
{
- struct log_history *hist_buf = NULL;
- static const unsigned hist_buf_size = LOG_ELT_SIZE - sizeof(*hist_buf);
- void *obj;
-
- if (history_enabled == 0)
- return 0;
-
- rte_spinlock_lock(&log_list_lock);
-
- /* get a buffer for adding in history */
- if (log_history_size > RTE_LOG_HISTORY) {
- hist_buf = STAILQ_FIRST(&log_history);
- if (hist_buf) {
- STAILQ_REMOVE_HEAD(&log_history, next);
- log_history_size--;
- }
- }
- else {
- if (rte_mempool_mc_get(log_history_mp, &obj) < 0)
- obj = NULL;
- hist_buf = obj;
- }
-
- /* no buffer */
- if (hist_buf == NULL) {
- rte_spinlock_unlock(&log_list_lock);
- return -ENOBUFS;
- }
-
- /* not enough room for msg, buffer go back in mempool */
- if (size >= hist_buf_size) {
- rte_mempool_mp_put(log_history_mp, hist_buf);
- rte_spinlock_unlock(&log_list_lock);
- return -ENOBUFS;
- }
-
- /* add in history */
- memcpy(hist_buf->buf, buf, size);
- hist_buf->buf[size] = hist_buf->buf[hist_buf_size-1] = '\0';
- hist_buf->size = size;
- STAILQ_INSERT_TAIL(&log_history, hist_buf, next);
- log_history_size++;
- rte_spinlock_unlock(&log_list_lock);
-
return 0;
}
void
-rte_log_set_history(int enable)
+rte_log_set_history(int enable __rte_unused)
{
- history_enabled = enable;
}
/* Change the stream that will be used by logging system */
@@ -217,44 +151,8 @@ int rte_log_cur_msg_logtype(void)
/* Dump log history to file */
void
-rte_log_dump_history(FILE *out)
+rte_log_dump_history(FILE *out __rte_unused)
{
- struct log_history_list tmp_log_history;
- struct log_history *hist_buf;
- unsigned i;
-
- /* only one dump at a time */
- rte_spinlock_lock(&log_dump_lock);
-
- /* save list, and re-init to allow logging during dump */
- rte_spinlock_lock(&log_list_lock);
- tmp_log_history = log_history;
- STAILQ_INIT(&log_history);
- log_history_size = 0;
- rte_spinlock_unlock(&log_list_lock);
-
- for (i=0; i<RTE_LOG_HISTORY; i++) {
-
- /* remove one message from history list */
- hist_buf = STAILQ_FIRST(&tmp_log_history);
-
- if (hist_buf == NULL)
- break;
-
- STAILQ_REMOVE_HEAD(&tmp_log_history, next);
-
- /* write on stdout */
- if (fwrite(hist_buf->buf, hist_buf->size, 1, out) == 0) {
- rte_mempool_mp_put(log_history_mp, hist_buf);
- break;
- }
-
- /* put back message structure in pool */
- rte_mempool_mp_put(log_history_mp, hist_buf);
- }
- fflush(out);
-
- rte_spinlock_unlock(&log_dump_lock);
}
/*
@@ -297,29 +195,11 @@ rte_log(uint32_t level, uint32_t logtype, const char *format, ...)
}
/*
- * called by environment-specific log init function to initialize log
- * history
+ * called by environment-specific log init function
*/
int
rte_eal_common_log_init(FILE *default_log)
{
- STAILQ_INIT(&log_history);
-
- /* reserve RTE_LOG_HISTORY*2 elements, so we can dump and
- * keep logging during this time */
- log_history_mp = rte_mempool_create(LOG_HISTORY_MP_NAME, RTE_LOG_HISTORY*2,
- LOG_ELT_SIZE, 0, 0,
- NULL, NULL,
- NULL, NULL,
- SOCKET_ID_ANY, MEMPOOL_F_NO_PHYS_CONTIG);
-
- if ((log_history_mp == NULL) &&
- ((log_history_mp = rte_mempool_lookup(LOG_HISTORY_MP_NAME)) == NULL)){
- RTE_LOG(ERR, EAL, "%s(): cannot create log_history mempool\n",
- __func__);
- return -1;
- }
-
default_log_stream = default_log;
rte_openlog_stream(default_log);
diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index 2e47e7f..b1add04 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -42,6 +42,8 @@
* This file provides a log API to RTE applications.
*/
+#include "rte_common.h" /* for __rte_deprecated macro */
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -179,22 +181,27 @@ int rte_log_cur_msg_loglevel(void);
int rte_log_cur_msg_logtype(void);
/**
+ * @deprecated
* Enable or disable the history (enabled by default)
*
* @param enable
* true to enable, or 0 to disable history.
*/
+__rte_deprecated
void rte_log_set_history(int enable);
/**
+ * @deprecated
* Dump the log history to a file
*
* @param f
* A pointer to a file for output
*/
+__rte_deprecated
void rte_log_dump_history(FILE *f);
/**
+ * @deprecated
* Add a log message to the history.
*
* This function can be called from a user-defined log stream. It adds
@@ -209,6 +216,7 @@ void rte_log_dump_history(FILE *f);
* - 0: Success.
* - (-ENOBUFS) if there is no room to store the message.
*/
+__rte_deprecated
int rte_log_add_in_history(const char *buf, size_t size);
/**
diff --git a/lib/librte_eal/linuxapp/eal/eal_debug.c b/lib/librte_eal/linuxapp/eal/eal_debug.c
index 907fbfa..5fbc17c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_debug.c
+++ b/lib/librte_eal/linuxapp/eal/eal_debug.c
@@ -77,9 +77,6 @@ void __rte_panic(const char *funcname, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
va_start(ap, format);
rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
@@ -98,9 +95,6 @@ rte_exit(int exit_code, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
if (exit_code != 0)
RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
" Cause: ", exit_code);
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index 06b26a9..0bee8aa 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -60,7 +60,6 @@
#include <rte_ring.h>
#include <rte_debug.h>
#include <rte_log.h>
-#include <rte_mempool.h>
#include <rte_pci.h>
#include <rte_malloc.h>
#include <rte_errno.h>
diff --git a/lib/librte_eal/linuxapp/eal/eal_ivshmem.c b/lib/librte_eal/linuxapp/eal/eal_ivshmem.c
index eea0314..67b3caf 100644
--- a/lib/librte_eal/linuxapp/eal/eal_ivshmem.c
+++ b/lib/librte_eal/linuxapp/eal/eal_ivshmem.c
@@ -49,7 +49,6 @@
#include <rte_string_fns.h>
#include <rte_errno.h>
#include <rte_ring.h>
-#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_common.h>
#include <rte_ivshmem.h>
diff --git a/lib/librte_eal/linuxapp/eal/eal_log.c b/lib/librte_eal/linuxapp/eal/eal_log.c
index 0b133c3..8464152 100644
--- a/lib/librte_eal/linuxapp/eal/eal_log.c
+++ b/lib/librte_eal/linuxapp/eal/eal_log.c
@@ -60,9 +60,6 @@ console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
ssize_t ret;
uint32_t loglevel;
- /* add this log in history */
- rte_log_add_in_history(buf, size);
-
/* write on stdout */
ret = fwrite(buf, 1, size, stdout);
fflush(stdout);
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index b54de43..22a5645 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -1003,7 +1003,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
if (free == 0) {
if (cookie != RTE_MEMPOOL_HEADER_COOKIE1) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
@@ -1012,7 +1011,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE2;
} else if (free == 1) {
if (cookie != RTE_MEMPOOL_HEADER_COOKIE2) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
@@ -1022,7 +1020,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
} else if (free == 2) {
if (cookie != RTE_MEMPOOL_HEADER_COOKIE1 &&
cookie != RTE_MEMPOOL_HEADER_COOKIE2) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
@@ -1032,7 +1029,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
tlr = __mempool_get_trailer(obj);
cookie = tlr->cookie;
if (cookie != RTE_MEMPOOL_TRAILER_COOKIE) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
--
2.7.0
^ permalink raw reply [relevance 6%]
* Re: [dpdk-dev] [PATCH] log: deprecate history dump
2016-06-09 14:09 6% [dpdk-dev] [PATCH] log: deprecate history dump Thomas Monjalon
@ 2016-06-09 14:45 0% ` David Marchand
2016-06-09 15:01 0% ` Thomas Monjalon
2016-06-09 15:01 0% ` Christian Ehrhardt
2016-06-09 15:06 5% ` [dpdk-dev] [PATCH v2] " Thomas Monjalon
1 sibling, 2 replies; 200+ results
From: David Marchand @ 2016-06-09 14:45 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: dev
Thomas,
On Thu, Jun 9, 2016 at 4:09 PM, Thomas Monjalon
<thomas.monjalon@6wind.com> wrote:
> The log history uses rte_mempool. In order to remove the mempool
> dependency in EAL (and improve the build), this feature is deprecated.
> The ABI is kept but the behaviour is now voided because it seems this
> function was not used. The history can be read from syslog.
It does look like it is not really used.
I am for this change unless someone complains.
Comments below.
> Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
> ---
> app/test-pmd/cmdline.c | 3 -
> app/test/autotest_test_funcs.py | 5 --
> app/test/commands.c | 4 +-
> app/test/test_logs.c | 3 -
> doc/guides/rel_notes/deprecation.rst | 3 +
> lib/librte_eal/bsdapp/eal/eal_debug.c | 6 --
> lib/librte_eal/common/eal_common_log.c | 128 +--------------------------
> lib/librte_eal/common/include/rte_log.h | 8 ++
> lib/librte_eal/linuxapp/eal/eal_debug.c | 6 --
> lib/librte_eal/linuxapp/eal/eal_interrupts.c | 1 -
> lib/librte_eal/linuxapp/eal/eal_ivshmem.c | 1 -
> lib/librte_eal/linuxapp/eal/eal_log.c | 3 -
> lib/librte_mempool/rte_mempool.c | 4 -
> 13 files changed, 16 insertions(+), 159 deletions(-)
- You missed autotest_data.py.
$ git grep dump_log_history
app/test/autotest_data.py: "Command" : "dump_log_history",
- eal Makefile still refers to librte_mempool.
- Since you are looking at this, what keeps us from removing the
dependency on librte_ring ?
I would say it was mainly because of mempool.
Maybe ivshmem ?
[snip]
> diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
> index ad05eba..f11df93 100644
> --- a/doc/guides/rel_notes/deprecation.rst
> +++ b/doc/guides/rel_notes/deprecation.rst
> @@ -8,6 +8,9 @@ API and ABI deprecation notices are to be posted here.
> Deprecation Notices
> -------------------
>
> +* The log history is deprecated in release 16.07.
> + It is voided and will be completely removed in release 16.11.
> +
It is voided in 16.07 and will be completely removed in release 16.11.
> * The ethdev hotplug API is going to be moved to EAL with a notification
> mechanism added to crypto and ethdev libraries so that hotplug is now
> available to both of them. This API will be stripped of the device arguments
> diff --git a/lib/librte_eal/common/eal_common_log.c b/lib/librte_eal/common/eal_common_log.c
> index b5e37bb..94ecdd2 100644
> --- a/lib/librte_eal/common/eal_common_log.c
> +++ b/lib/librte_eal/common/eal_common_log.c
> @@ -56,29 +56,11 @@
> #include <rte_spinlock.h>
> #include <rte_branch_prediction.h>
> #include <rte_ring.h>
> -#include <rte_mempool.h>
>
> #include "eal_private.h"
>
> #define LOG_ELT_SIZE 2048
We don't need LOG_ELT_SIZE.
>
> -#define LOG_HISTORY_MP_NAME "log_history"
> -
> -STAILQ_HEAD(log_history_list, log_history);
> -
> -/**
> - * The structure of a message log in the log history.
> - */
> -struct log_history {
> - STAILQ_ENTRY(log_history) next;
> - unsigned size;
> - char buf[0];
> -};
> -
> -static struct rte_mempool *log_history_mp = NULL;
> -static unsigned log_history_size = 0;
> -static struct log_history_list log_history;
> -
> /* global log structure */
> struct rte_logs rte_logs = {
> .type = ~0,
> @@ -86,10 +68,7 @@ struct rte_logs rte_logs = {
> .file = NULL,
> };
>
> -static rte_spinlock_t log_dump_lock = RTE_SPINLOCK_INITIALIZER;
> -static rte_spinlock_t log_list_lock = RTE_SPINLOCK_INITIALIZER;
> static FILE *default_log_stream;
> -static int history_enabled = 1;
>
> /**
> * This global structure stores some informations about the message
> @@ -106,59 +85,14 @@ static RTE_DEFINE_PER_LCORE(struct log_cur_msg, log_cur_msg);
> /* default logs */
>
> int
> -rte_log_add_in_history(const char *buf, size_t size)
> +rte_log_add_in_history(const char *buf __rte_unused, size_t size __rte_unused)
> {
> - struct log_history *hist_buf = NULL;
> - static const unsigned hist_buf_size = LOG_ELT_SIZE - sizeof(*hist_buf);
> - void *obj;
> -
> - if (history_enabled == 0)
> - return 0;
> -
> - rte_spinlock_lock(&log_list_lock);
> -
> - /* get a buffer for adding in history */
> - if (log_history_size > RTE_LOG_HISTORY) {
> - hist_buf = STAILQ_FIRST(&log_history);
> - if (hist_buf) {
> - STAILQ_REMOVE_HEAD(&log_history, next);
> - log_history_size--;
> - }
> - }
> - else {
> - if (rte_mempool_mc_get(log_history_mp, &obj) < 0)
> - obj = NULL;
> - hist_buf = obj;
> - }
> -
> - /* no buffer */
> - if (hist_buf == NULL) {
> - rte_spinlock_unlock(&log_list_lock);
> - return -ENOBUFS;
> - }
> -
> - /* not enough room for msg, buffer go back in mempool */
> - if (size >= hist_buf_size) {
> - rte_mempool_mp_put(log_history_mp, hist_buf);
> - rte_spinlock_unlock(&log_list_lock);
> - return -ENOBUFS;
> - }
> -
> - /* add in history */
> - memcpy(hist_buf->buf, buf, size);
> - hist_buf->buf[size] = hist_buf->buf[hist_buf_size-1] = '\0';
> - hist_buf->size = size;
> - STAILQ_INSERT_TAIL(&log_history, hist_buf, next);
> - log_history_size++;
> - rte_spinlock_unlock(&log_list_lock);
> -
> return 0;
> }
>
> void
> -rte_log_set_history(int enable)
> +rte_log_set_history(int enable __rte_unused)
> {
> - history_enabled = enable;
> }
Maybe a warning here for the people who did not read the deprecation notices?
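For instance, a minimal sketch of such a warning:

	void
	rte_log_set_history(int enable)
	{
		if (enable)
			RTE_LOG(WARNING, EAL, "The log history is deprecated.\n");
	}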
--
David Marchand
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH] log: deprecate history dump
2016-06-09 14:45 0% ` David Marchand
@ 2016-06-09 15:01 0% ` Thomas Monjalon
2016-06-09 15:01 0% ` Christian Ehrhardt
1 sibling, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-06-09 15:01 UTC (permalink / raw)
To: David Marchand; +Cc: dev
2016-06-09 16:45, David Marchand:
> On Thu, Jun 9, 2016 at 4:09 PM, Thomas Monjalon
> <thomas.monjalon@6wind.com> wrote:
> > The log history uses rte_mempool. In order to remove the mempool
> > dependency in EAL (and improve the build), this feature is deprecated.
> > The ABI is kept but the behaviour is now voided because it seems this
> > function was not used. The history can be read from syslog.
>
> It does look like it is not really used.
> I am for this change unless someone complains.
>
> Comments below.
All your comments will be addressed in a v2. Thanks
> - Since you are looking at this, what keeps us from removing the
> dependency on librte_ring ?
Please see this first small cleanup:
http://dpdk.org/ml/archives/dev/2016-June/040798.html
> I would say it was mainly because of mempool.
> Maybe ivshmem ?
Yes, CONFIG_RTE_LIBRTE_IVSHMEM brings dependencies on rte_ring and rte_ivshmem.
This "feature" also pollutes the memory allocator and makes rework harder.
That's why I would be in favor of removing CONFIG_RTE_LIBRTE_IVSHMEM.
Otherwise, as an alternative proposal, the file
lib/librte_eal/linuxapp/eal/eal_ivshmem.c
could be moved outside of EAL. Probably lib/librte_ivshmem/
would be a good place.
The tricky operation would be to remove ivshmem init from eal:
#ifdef RTE_LIBRTE_IVSHMEM
if (rte_eal_ivshmem_init() < 0)
rte_panic("Cannot init IVSHMEM\n");
#endif
if (rte_eal_memory_init() < 0)
rte_panic("Cannot init memory\n");
/* the directories are locked during eal_hugepage_info_init */
eal_hugedirs_unlock();
if (rte_eal_memzone_init() < 0)
rte_panic("Cannot init memzone\n");
if (rte_eal_tailqs_init() < 0)
rte_panic("Cannot init tail queues for objects\n");
#ifdef RTE_LIBRTE_IVSHMEM
if (rte_eal_ivshmem_obj_init() < 0)
rte_panic("Cannot init IVSHMEM objects\n");
#endif
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH] log: deprecate history dump
2016-06-09 14:45 0% ` David Marchand
2016-06-09 15:01 0% ` Thomas Monjalon
@ 2016-06-09 15:01 0% ` Christian Ehrhardt
1 sibling, 0 replies; 200+ results
From: Christian Ehrhardt @ 2016-06-09 15:01 UTC (permalink / raw)
To: David Marchand; +Cc: Thomas Monjalon, dev
Hi,
In general I totally like it - thanks Thomas for picking that up.
I just wanted to mention that the Makefile still refers to mempool, but
David beat me to it, both in speed and in detail.
I'll certainly try to follow along and help where I can.
Christian Ehrhardt
Software Engineer, Ubuntu Server
Canonical Ltd
On Thu, Jun 9, 2016 at 4:45 PM, David Marchand <david.marchand@6wind.com>
wrote:
> Thomas,
>
> On Thu, Jun 9, 2016 at 4:09 PM, Thomas Monjalon
> <thomas.monjalon@6wind.com> wrote:
> > The log history uses rte_mempool. In order to remove the mempool
> > dependency in EAL (and improve the build), this feature is deprecated.
> > The ABI is kept but the behaviour is now voided because it seems this
> > function was not used. The history can be read from syslog.
>
> It does look like it is not really used.
> I am for this change unless someone complains.
>
> Comments below.
>
> > Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
> > ---
> > app/test-pmd/cmdline.c | 3 -
> > app/test/autotest_test_funcs.py | 5 --
> > app/test/commands.c | 4 +-
> > app/test/test_logs.c | 3 -
> > doc/guides/rel_notes/deprecation.rst | 3 +
> > lib/librte_eal/bsdapp/eal/eal_debug.c | 6 --
> > lib/librte_eal/common/eal_common_log.c | 128
> +--------------------------
> > lib/librte_eal/common/include/rte_log.h | 8 ++
> > lib/librte_eal/linuxapp/eal/eal_debug.c | 6 --
> > lib/librte_eal/linuxapp/eal/eal_interrupts.c | 1 -
> > lib/librte_eal/linuxapp/eal/eal_ivshmem.c | 1 -
> > lib/librte_eal/linuxapp/eal/eal_log.c | 3 -
> > lib/librte_mempool/rte_mempool.c | 4 -
> > 13 files changed, 16 insertions(+), 159 deletions(-)
>
> - You missed autotest_data.py.
>
> $ git grep dump_log_history
> app/test/autotest_data.py: "Command" : "dump_log_history",
>
> - eal Makefile still refers to librte_mempool.
>
> - Since you are looking at this, what keeps us from removing the
> dependency on librte_ring ?
> I would say it was mainly because of mempool.
> Maybe ivshmem ?
>
>
> [snip]
>
> > diff --git a/doc/guides/rel_notes/deprecation.rst
> b/doc/guides/rel_notes/deprecation.rst
> > index ad05eba..f11df93 100644
> > --- a/doc/guides/rel_notes/deprecation.rst
> > +++ b/doc/guides/rel_notes/deprecation.rst
> > @@ -8,6 +8,9 @@ API and ABI deprecation notices are to be posted here.
> > Deprecation Notices
> > -------------------
> >
> > +* The log history is deprecated in release 16.07.
> > + It is voided and will be completely removed in release 16.11.
> > +
>
> It is voided in 16.07 and will be completely removed in release 16.11.
>
>
> > * The ethdev hotplug API is going to be moved to EAL with a notification
> > mechanism added to crypto and ethdev libraries so that hotplug is now
> > available to both of them. This API will be stripped of the device
> arguments
>
> > diff --git a/lib/librte_eal/common/eal_common_log.c
> b/lib/librte_eal/common/eal_common_log.c
> > index b5e37bb..94ecdd2 100644
> > --- a/lib/librte_eal/common/eal_common_log.c
> > +++ b/lib/librte_eal/common/eal_common_log.c
> > @@ -56,29 +56,11 @@
> > #include <rte_spinlock.h>
> > #include <rte_branch_prediction.h>
> > #include <rte_ring.h>
> > -#include <rte_mempool.h>
> >
> > #include "eal_private.h"
> >
> > #define LOG_ELT_SIZE 2048
>
> We don't need LOG_ELT_SIZE.
>
> >
> > -#define LOG_HISTORY_MP_NAME "log_history"
> > -
> > -STAILQ_HEAD(log_history_list, log_history);
> > -
> > -/**
> > - * The structure of a message log in the log history.
> > - */
> > -struct log_history {
> > - STAILQ_ENTRY(log_history) next;
> > - unsigned size;
> > - char buf[0];
> > -};
> > -
> > -static struct rte_mempool *log_history_mp = NULL;
> > -static unsigned log_history_size = 0;
> > -static struct log_history_list log_history;
> > -
> > /* global log structure */
> > struct rte_logs rte_logs = {
> > .type = ~0,
> > @@ -86,10 +68,7 @@ struct rte_logs rte_logs = {
> > .file = NULL,
> > };
> >
> > -static rte_spinlock_t log_dump_lock = RTE_SPINLOCK_INITIALIZER;
> > -static rte_spinlock_t log_list_lock = RTE_SPINLOCK_INITIALIZER;
> > static FILE *default_log_stream;
> > -static int history_enabled = 1;
> >
> > /**
> > * This global structure stores some informations about the message
> > @@ -106,59 +85,14 @@ static RTE_DEFINE_PER_LCORE(struct log_cur_msg,
> log_cur_msg);
> > /* default logs */
> >
> > int
> > -rte_log_add_in_history(const char *buf, size_t size)
> > +rte_log_add_in_history(const char *buf __rte_unused, size_t size
> __rte_unused)
> > {
> > - struct log_history *hist_buf = NULL;
> > - static const unsigned hist_buf_size = LOG_ELT_SIZE -
> sizeof(*hist_buf);
> > - void *obj;
> > -
> > - if (history_enabled == 0)
> > - return 0;
> > -
> > - rte_spinlock_lock(&log_list_lock);
> > -
> > - /* get a buffer for adding in history */
> > - if (log_history_size > RTE_LOG_HISTORY) {
> > - hist_buf = STAILQ_FIRST(&log_history);
> > - if (hist_buf) {
> > - STAILQ_REMOVE_HEAD(&log_history, next);
> > - log_history_size--;
> > - }
> > - }
> > - else {
> > - if (rte_mempool_mc_get(log_history_mp, &obj) < 0)
> > - obj = NULL;
> > - hist_buf = obj;
> > - }
> > -
> > - /* no buffer */
> > - if (hist_buf == NULL) {
> > - rte_spinlock_unlock(&log_list_lock);
> > - return -ENOBUFS;
> > - }
> > -
> > - /* not enough room for msg, buffer go back in mempool */
> > - if (size >= hist_buf_size) {
> > - rte_mempool_mp_put(log_history_mp, hist_buf);
> > - rte_spinlock_unlock(&log_list_lock);
> > - return -ENOBUFS;
> > - }
> > -
> > - /* add in history */
> > - memcpy(hist_buf->buf, buf, size);
> > - hist_buf->buf[size] = hist_buf->buf[hist_buf_size-1] = '\0';
> > - hist_buf->size = size;
> > - STAILQ_INSERT_TAIL(&log_history, hist_buf, next);
> > - log_history_size++;
> > - rte_spinlock_unlock(&log_list_lock);
> > -
> > return 0;
> > }
> >
> > void
> > -rte_log_set_history(int enable)
> > +rte_log_set_history(int enable __rte_unused)
> > {
> > - history_enabled = enable;
> > }
>
> Maybe a warning here for the people who did not read the deprecation
> notices ?
>
>
> --
> David Marchand
>
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v2] log: deprecate history dump
2016-06-09 14:09 6% [dpdk-dev] [PATCH] log: deprecate history dump Thomas Monjalon
2016-06-09 14:45 0% ` David Marchand
@ 2016-06-09 15:06 5% ` Thomas Monjalon
2016-06-09 22:10 5% ` [dpdk-dev] [PATCH v3] " Thomas Monjalon
1 sibling, 1 reply; 200+ results
From: Thomas Monjalon @ 2016-06-09 15:06 UTC (permalink / raw)
To: david.marchand; +Cc: dev
The log history uses rte_mempool. In order to remove the mempool
dependency in EAL (and improve the build), this feature is deprecated.
The ABI is kept but the behaviour is now voided because it seems this
function was not used. The history can be read from syslog.
When enabling the log history, a warning is logged.
Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
---
v2:
- remove more mempool and log history traces
- add a warning if enabling log history
- move not related mempool includes cleanup in another patch
---
app/test-pmd/cmdline.c | 3 -
app/test/autotest_data.py | 6 --
app/test/autotest_test_funcs.py | 5 --
app/test/commands.c | 4 +-
app/test/test_logs.c | 3 -
doc/guides/prog_guide/mempool_lib.rst | 4 +-
doc/guides/rel_notes/deprecation.rst | 3 +
lib/librte_eal/bsdapp/eal/Makefile | 1 -
lib/librte_eal/bsdapp/eal/eal_debug.c | 6 --
lib/librte_eal/common/eal_common_log.c | 143 ++------------------------------
lib/librte_eal/common/eal_private.h | 3 -
lib/librte_eal/common/include/rte_log.h | 8 ++
lib/librte_eal/linuxapp/eal/Makefile | 1 -
lib/librte_eal/linuxapp/eal/eal_debug.c | 6 --
lib/librte_eal/linuxapp/eal/eal_log.c | 9 +-
lib/librte_mempool/rte_mempool.c | 4 -
16 files changed, 20 insertions(+), 189 deletions(-)
diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 1921612..fd389ac 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -7268,8 +7268,6 @@ static void cmd_dump_parsed(void *parsed_result,
rte_dump_physmem_layout(stdout);
else if (!strcmp(res->dump, "dump_memzone"))
rte_memzone_dump(stdout);
- else if (!strcmp(res->dump, "dump_log_history"))
- rte_log_dump_history(stdout);
else if (!strcmp(res->dump, "dump_struct_sizes"))
dump_struct_sizes();
else if (!strcmp(res->dump, "dump_ring"))
@@ -7284,7 +7282,6 @@ cmdline_parse_token_string_t cmd_dump_dump =
TOKEN_STRING_INITIALIZER(struct cmd_dump_result, dump,
"dump_physmem#"
"dump_memzone#"
- "dump_log_history#"
"dump_struct_sizes#"
"dump_ring#"
"dump_mempool#"
diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index 78d2edd..6c87809 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -94,12 +94,6 @@ parallel_test_group_list = [
"Report" : None,
},
{
- "Name" : "Dump log history",
- "Command" : "dump_log_history",
- "Func" : dump_autotest,
- "Report" : None,
- },
- {
"Name" : "Dump rings",
"Command" : "dump_ring",
"Func" : dump_autotest,
diff --git a/app/test/autotest_test_funcs.py b/app/test/autotest_test_funcs.py
index b60b941..14cffd0 100644
--- a/app/test/autotest_test_funcs.py
+++ b/app/test/autotest_test_funcs.py
@@ -144,16 +144,11 @@ def logs_autotest(child, test_name):
i = 0
child.sendline(test_name)
- # logs sequence is printed twice because of history dump
log_list = [
"TESTAPP1: error message",
"TESTAPP1: critical message",
"TESTAPP2: critical message",
"TESTAPP1: error message",
- "TESTAPP1: error message",
- "TESTAPP1: critical message",
- "TESTAPP2: critical message",
- "TESTAPP1: error message",
]
for log_msg in log_list:
diff --git a/app/test/commands.c b/app/test/commands.c
index e0af8e4..2df46b0 100644
--- a/app/test/commands.c
+++ b/app/test/commands.c
@@ -150,8 +150,6 @@ static void cmd_dump_parsed(void *parsed_result,
rte_dump_physmem_layout(stdout);
else if (!strcmp(res->dump, "dump_memzone"))
rte_memzone_dump(stdout);
- else if (!strcmp(res->dump, "dump_log_history"))
- rte_log_dump_history(stdout);
else if (!strcmp(res->dump, "dump_struct_sizes"))
dump_struct_sizes();
else if (!strcmp(res->dump, "dump_ring"))
@@ -164,7 +162,7 @@ static void cmd_dump_parsed(void *parsed_result,
cmdline_parse_token_string_t cmd_dump_dump =
TOKEN_STRING_INITIALIZER(struct cmd_dump_result, dump,
- "dump_physmem#dump_memzone#dump_log_history#"
+ "dump_physmem#dump_memzone#"
"dump_struct_sizes#dump_ring#dump_mempool#"
"dump_devargs");
diff --git a/app/test/test_logs.c b/app/test/test_logs.c
index 05aa862..d0a9962 100644
--- a/app/test/test_logs.c
+++ b/app/test/test_logs.c
@@ -83,9 +83,6 @@ test_logs(void)
RTE_LOG(ERR, TESTAPP1, "error message\n");
RTE_LOG(ERR, TESTAPP2, "error message (not displayed)\n");
- /* print again the previous logs */
- rte_log_dump_history(stdout);
-
return 0;
}
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index 5fae79a..c3afc2e 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -38,9 +38,7 @@ In the DPDK, it is identified by name and uses a ring to store free objects.
It provides some other optional services such as a per-core object cache and
an alignment helper to ensure that objects are padded to spread them equally on all DRAM or DDR3 channels.
-This library is used by the
-:ref:`Mbuf Library <Mbuf_Library>` and the
-:ref:`Environment Abstraction Layer <Environment_Abstraction_Layer>` (for logging history).
+This library is used by the :ref:`Mbuf Library <Mbuf_Library>`.
Cookies
-------
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ad05eba..bda40c1 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -8,6 +8,9 @@ API and ABI deprecation notices are to be posted here.
Deprecation Notices
-------------------
+* The log history is deprecated.
+ It is voided in 16.07 and will be removed in release 16.11.
+
* The ethdev hotplug API is going to be moved to EAL with a notification
mechanism added to crypto and ethdev libraries so that hotplug is now
available to both of them. This API will be stripped of the device arguments
diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 9054ad6..474651b 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -41,7 +41,6 @@ CFLAGS += -I$(SRCDIR)/include
CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
CFLAGS += -I$(RTE_SDK)/lib/librte_ring
-CFLAGS += -I$(RTE_SDK)/lib/librte_mempool
CFLAGS += $(WERROR_FLAGS) -O3
LDLIBS += -lexecinfo
diff --git a/lib/librte_eal/bsdapp/eal/eal_debug.c b/lib/librte_eal/bsdapp/eal/eal_debug.c
index 907fbfa..5fbc17c 100644
--- a/lib/librte_eal/bsdapp/eal/eal_debug.c
+++ b/lib/librte_eal/bsdapp/eal/eal_debug.c
@@ -77,9 +77,6 @@ void __rte_panic(const char *funcname, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
va_start(ap, format);
rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
@@ -98,9 +95,6 @@ rte_exit(int exit_code, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
if (exit_code != 0)
RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
" Cause: ", exit_code);
diff --git a/lib/librte_eal/common/eal_common_log.c b/lib/librte_eal/common/eal_common_log.c
index b5e37bb..0739331 100644
--- a/lib/librte_eal/common/eal_common_log.c
+++ b/lib/librte_eal/common/eal_common_log.c
@@ -38,47 +38,14 @@
#include <sys/types.h>
#include <stdlib.h>
#include <unistd.h>
-#include <inttypes.h>
-#include <errno.h>
-#include <sys/queue.h>
#include <rte_log.h>
-#include <rte_memory.h>
-#include <rte_memzone.h>
-#include <rte_launch.h>
#include <rte_common.h>
-#include <rte_cycles.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_atomic.h>
-#include <rte_debug.h>
-#include <rte_spinlock.h>
-#include <rte_branch_prediction.h>
-#include <rte_ring.h>
-#include <rte_mempool.h>
#include "eal_private.h"
-#define LOG_ELT_SIZE 2048
-
-#define LOG_HISTORY_MP_NAME "log_history"
-
-STAILQ_HEAD(log_history_list, log_history);
-
-/**
- * The structure of a message log in the log history.
- */
-struct log_history {
- STAILQ_ENTRY(log_history) next;
- unsigned size;
- char buf[0];
-};
-
-static struct rte_mempool *log_history_mp = NULL;
-static unsigned log_history_size = 0;
-static struct log_history_list log_history;
-
/* global log structure */
struct rte_logs rte_logs = {
.type = ~0,
@@ -86,10 +53,7 @@ struct rte_logs rte_logs = {
.file = NULL,
};
-static rte_spinlock_t log_dump_lock = RTE_SPINLOCK_INITIALIZER;
-static rte_spinlock_t log_list_lock = RTE_SPINLOCK_INITIALIZER;
static FILE *default_log_stream;
-static int history_enabled = 1;
/**
* This global structure stores some informations about the message
@@ -106,59 +70,16 @@ static RTE_DEFINE_PER_LCORE(struct log_cur_msg, log_cur_msg);
/* default logs */
int
-rte_log_add_in_history(const char *buf, size_t size)
+rte_log_add_in_history(const char *buf __rte_unused, size_t size __rte_unused)
{
- struct log_history *hist_buf = NULL;
- static const unsigned hist_buf_size = LOG_ELT_SIZE - sizeof(*hist_buf);
- void *obj;
-
- if (history_enabled == 0)
- return 0;
-
- rte_spinlock_lock(&log_list_lock);
-
- /* get a buffer for adding in history */
- if (log_history_size > RTE_LOG_HISTORY) {
- hist_buf = STAILQ_FIRST(&log_history);
- if (hist_buf) {
- STAILQ_REMOVE_HEAD(&log_history, next);
- log_history_size--;
- }
- }
- else {
- if (rte_mempool_mc_get(log_history_mp, &obj) < 0)
- obj = NULL;
- hist_buf = obj;
- }
-
- /* no buffer */
- if (hist_buf == NULL) {
- rte_spinlock_unlock(&log_list_lock);
- return -ENOBUFS;
- }
-
- /* not enough room for msg, buffer go back in mempool */
- if (size >= hist_buf_size) {
- rte_mempool_mp_put(log_history_mp, hist_buf);
- rte_spinlock_unlock(&log_list_lock);
- return -ENOBUFS;
- }
-
- /* add in history */
- memcpy(hist_buf->buf, buf, size);
- hist_buf->buf[size] = hist_buf->buf[hist_buf_size-1] = '\0';
- hist_buf->size = size;
- STAILQ_INSERT_TAIL(&log_history, hist_buf, next);
- log_history_size++;
- rte_spinlock_unlock(&log_list_lock);
-
return 0;
}
void
rte_log_set_history(int enable)
{
- history_enabled = enable;
+ if (enable)
+ RTE_LOG(WARNING, EAL, "The log history is deprecated.\n");
}
/* Change the stream that will be used by logging system */
@@ -217,44 +138,8 @@ int rte_log_cur_msg_logtype(void)
/* Dump log history to file */
void
-rte_log_dump_history(FILE *out)
+rte_log_dump_history(FILE *out __rte_unused)
{
- struct log_history_list tmp_log_history;
- struct log_history *hist_buf;
- unsigned i;
-
- /* only one dump at a time */
- rte_spinlock_lock(&log_dump_lock);
-
- /* save list, and re-init to allow logging during dump */
- rte_spinlock_lock(&log_list_lock);
- tmp_log_history = log_history;
- STAILQ_INIT(&log_history);
- log_history_size = 0;
- rte_spinlock_unlock(&log_list_lock);
-
- for (i=0; i<RTE_LOG_HISTORY; i++) {
-
- /* remove one message from history list */
- hist_buf = STAILQ_FIRST(&tmp_log_history);
-
- if (hist_buf == NULL)
- break;
-
- STAILQ_REMOVE_HEAD(&tmp_log_history, next);
-
- /* write on stdout */
- if (fwrite(hist_buf->buf, hist_buf->size, 1, out) == 0) {
- rte_mempool_mp_put(log_history_mp, hist_buf);
- break;
- }
-
- /* put back message structure in pool */
- rte_mempool_mp_put(log_history_mp, hist_buf);
- }
- fflush(out);
-
- rte_spinlock_unlock(&log_dump_lock);
}
/*
@@ -297,29 +182,11 @@ rte_log(uint32_t level, uint32_t logtype, const char *format, ...)
}
/*
- * called by environment-specific log init function to initialize log
- * history
+ * called by environment-specific log init function
*/
int
rte_eal_common_log_init(FILE *default_log)
{
- STAILQ_INIT(&log_history);
-
- /* reserve RTE_LOG_HISTORY*2 elements, so we can dump and
- * keep logging during this time */
- log_history_mp = rte_mempool_create(LOG_HISTORY_MP_NAME, RTE_LOG_HISTORY*2,
- LOG_ELT_SIZE, 0, 0,
- NULL, NULL,
- NULL, NULL,
- SOCKET_ID_ANY, MEMPOOL_F_NO_PHYS_CONTIG);
-
- if ((log_history_mp == NULL) &&
- ((log_history_mp = rte_mempool_lookup(LOG_HISTORY_MP_NAME)) == NULL)){
- RTE_LOG(ERR, EAL, "%s(): cannot create log_history mempool\n",
- __func__);
- return -1;
- }
-
default_log_stream = default_log;
rte_openlog_stream(default_log);
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 2342fa1..857dc3e 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -49,9 +49,6 @@ int rte_eal_memzone_init(void);
/**
* Common log initialization function (private to eal).
*
- * Called by environment-specific log initialization function to initialize
- * log history.
- *
* @param default_log
* The default log stream to be used.
* @return
diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index 2e47e7f..b1add04 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -42,6 +42,8 @@
* This file provides a log API to RTE applications.
*/
+#include "rte_common.h" /* for __rte_deprecated macro */
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -179,22 +181,27 @@ int rte_log_cur_msg_loglevel(void);
int rte_log_cur_msg_logtype(void);
/**
+ * @deprecated
* Enable or disable the history (enabled by default)
*
* @param enable
* true to enable, or 0 to disable history.
*/
+__rte_deprecated
void rte_log_set_history(int enable);
/**
+ * @deprecated
* Dump the log history to a file
*
* @param f
* A pointer to a file for output
*/
+__rte_deprecated
void rte_log_dump_history(FILE *f);
/**
+ * @deprecated
* Add a log message to the history.
*
* This function can be called from a user-defined log stream. It adds
@@ -209,6 +216,7 @@ void rte_log_dump_history(FILE *f);
* - 0: Success.
* - (-ENOBUFS) if there is no room to store the message.
*/
+__rte_deprecated
int rte_log_add_in_history(const char *buf, size_t size);
/**
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index e109361..a0ea698 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -45,7 +45,6 @@ CFLAGS += -I$(SRCDIR)/include
CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
CFLAGS += -I$(RTE_SDK)/lib/librte_ring
-CFLAGS += -I$(RTE_SDK)/lib/librte_mempool
CFLAGS += -I$(RTE_SDK)/lib/librte_ivshmem
CFLAGS += $(WERROR_FLAGS) -O3
diff --git a/lib/librte_eal/linuxapp/eal/eal_debug.c b/lib/librte_eal/linuxapp/eal/eal_debug.c
index 907fbfa..5fbc17c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_debug.c
+++ b/lib/librte_eal/linuxapp/eal/eal_debug.c
@@ -77,9 +77,6 @@ void __rte_panic(const char *funcname, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
va_start(ap, format);
rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
@@ -98,9 +95,6 @@ rte_exit(int exit_code, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
if (exit_code != 0)
RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
" Cause: ", exit_code);
diff --git a/lib/librte_eal/linuxapp/eal/eal_log.c b/lib/librte_eal/linuxapp/eal/eal_log.c
index 0b133c3..d391100 100644
--- a/lib/librte_eal/linuxapp/eal/eal_log.c
+++ b/lib/librte_eal/linuxapp/eal/eal_log.c
@@ -50,8 +50,7 @@
#include "eal_private.h"
/*
- * default log function, used once mempool (hence log history) is
- * available
+ * default log function
*/
static ssize_t
console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
@@ -60,9 +59,6 @@ console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
ssize_t ret;
uint32_t loglevel;
- /* add this log in history */
- rte_log_add_in_history(buf, size);
-
/* write on stdout */
ret = fwrite(buf, 1, size, stdout);
fflush(stdout);
@@ -110,8 +106,7 @@ rte_eal_log_init(const char *id, int facility)
/* early logs */
/*
- * early log function, used during boot when mempool (hence log
- * history) is not available
+ * early log function, used before rte_eal_log_init
*/
static ssize_t
early_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index b54de43..22a5645 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -1003,7 +1003,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
if (free == 0) {
if (cookie != RTE_MEMPOOL_HEADER_COOKIE1) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
@@ -1012,7 +1011,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE2;
} else if (free == 1) {
if (cookie != RTE_MEMPOOL_HEADER_COOKIE2) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
@@ -1022,7 +1020,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
} else if (free == 2) {
if (cookie != RTE_MEMPOOL_HEADER_COOKIE1 &&
cookie != RTE_MEMPOOL_HEADER_COOKIE2) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
@@ -1032,7 +1029,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
tlr = __mempool_get_trailer(obj);
cookie = tlr->cookie;
if (cookie != RTE_MEMPOOL_TRAILER_COOKIE) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
--
2.7.0
^ permalink raw reply [relevance 5%]
* [dpdk-dev] [PATCH v7 0/8] add packet capture framework
2016-06-09 8:50 2% ` [dpdk-dev] [PATCH v6 0/8] add packet capture framework Reshma Pattan
2016-06-09 8:50 5% ` [dpdk-dev] [PATCH v6 8/8] doc: update doc for " Reshma Pattan
@ 2016-06-09 16:10 2% ` Reshma Pattan
2016-06-09 16:10 5% ` [dpdk-dev] [PATCH v7 8/8] doc: update doc for " Reshma Pattan
2016-06-09 17:34 0% ` [dpdk-dev] [PATCH v7 0/8] add " Ananyev, Konstantin
1 sibling, 2 replies; 200+ results
From: Reshma Pattan @ 2016-06-09 16:10 UTC (permalink / raw)
To: dev
This patch set includes the below changes:
1)Changes to librte_ether.
2)A new library librte_pdump added for packet capture framework.
3)A new app/pdump tool added for packet capturing.
4)Test pmd changes done to initialize packet capture framework.
5)Documentation update.
1)librte_pdump
==============
To support packet capturing on dpdk Ethernet devices, a new library librte_pdump
is added. Users can develop their own packet capturing applications using the new library APIs.
Operation:
----------
Pdump library provides APIs to support packet capturing on dpdk Ethernet devices.
The library provides APIs to initialize the packet capture framework, enable/disable
the packet capture, and uninitialize the packet capture framework.
Pdump library works on a client/server model.
The server is responsible for enabling/disabling the packet capture.
Clients are responsible for requesting enable/disable of the
packet capture.
As part of packet capture framework initialization, a pthread and
the server socket are created. Only one server socket is allowed on the system.
As part of enabling/disabling the packet capture, client sockets are created
and multiple client sockets are allowed.
Whoever calls initialization first succeeds; subsequent initialization
calls are not allowed, so later users can only request enabling/disabling
of the packet capture.
Applications using the below APIs need to pass port/device_id, queue, mempool and
ring parameters. The library uses the user-provided ring and mempool to mirror the rx/tx
packets of the port. Users then dequeue the ring and write the packets
to a vdev (pcap/tuntap) to view the packets with any standard tool.
Note:
Mempool and ring should be multi-consumer/multi-producer (mc/mp) capable.
Mempool mbuf size should be big enough to handle the rx/tx packets of a port.
APIs:
-----
rte_pdump_init()
rte_pdump_enable()
rte_pdump_enable_by_deviceid()
rte_pdump_disable()
rte_pdump_disable_by_deviceid()
rte_pdump_uninit()
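
A minimal usage sketch of these APIs (error handling trimmed; the ring and
mempool names and sizes below are illustrative, and exact prototypes should
be taken from rte_pdump.h rather than from this sketch):

	#include <rte_pdump.h>
	#include <rte_ring.h>
	#include <rte_mbuf.h>
	#include <rte_lcore.h>

	static int
	capture_port0(void)
	{
		struct rte_ring *ring;
		struct rte_mempool *mp;

		/* ring and mempool must be mc/mp capable */
		ring = rte_ring_create("pdump_ring", 16384, rte_socket_id(), 0);
		mp = rte_pktmbuf_pool_create("pdump_pool", 32768, 0, 0, 2176,
				rte_socket_id());
		if (ring == NULL || mp == NULL)
			return -1;

		/* mirror rx/tx of port 0, queue 0 into ring/mp; the last
		 * argument is the filter, a place holder for future use */
		if (rte_pdump_enable(0, 0, RTE_PDUMP_FLAG_RXTX,
				ring, mp, NULL) < 0)
			return -1;

		/* ... dequeue mbufs from the ring, write them to a pcap vdev ... */

		return rte_pdump_disable(0, 0, RTE_PDUMP_FLAG_RXTX);
	}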
2)app/pdump tool
================
The app/pdump tool is designed on top of librte_pdump for packet capturing in DPDK.
By default this tool runs as a secondary process, and provides command line
options for packet capture.
./build/app/dpdk_pdump --
--pdump '(port=<port id> | device_id=<pci id or vdev name>),
(queue=<queue id>),
(rx-dev=<iface or pcap file> |
tx-dev=<iface or pcap file>),
[ring-size=<ring size>],
[mbuf-size=<mbuf data size>],
[total-num-mbufs=<number of mbufs>]'
Parameters inside the parentheses are mandatory.
Parameters inside the square brackets are optional.
Users pass the packet capture parameters under the --pdump option; multiple
--pdump options can be passed to capture packets on different port and queue
combinations.
Operation:
----------
*Tool parses the user command line arguments,
creates the mempool, the ring and the PCAP PMD vdev whose 'tx_stream' is
the device passed in the rx-dev|tx-dev parameters.
*Then calls the APIs of librte_pdump i.e. rte_pdump_enable()/rte_pdump_enable_by_deviceid()
to enable packet capturing on a specific port/device_id and queue by passing on
port|device_id, queue, mempool and ring info.
*Tool runs a while loop to dequeue the packets from the ring and write them to the pcap device.
*Tool can be stopped using SIGINT, upon which it calls
rte_pdump_disable()/rte_pdump_disable_by_deviceid() and frees the allocated resources.
Note:
CONFIG_RTE_LIBRTE_PMD_PCAP flag should be set to yes to compile and run the pdump tool.
3)Test-pmd changes
==================
Changes are done to the test-pmd application to initialize/uninitialize the packet capture framework,
so the app/pdump tool can be run to see packets of the dpdk ports used by test-pmd.
Similarly, any application which needs packet capture should call the initialize/uninitialize APIs of
librte_pdump and use the pdump tool to start the capture.
4)Packet capture flow between pdump tool and librte_pdump
=========================================================
* Pdump tool (Secondary process) requests packet capture
for specific port|device_id and queue combinations.
*Library in secondary process context creates client socket and communicates
the port|device_id, queue, ring and mempool to server.
*Library initializes server in primary process 'test-pmd' context and server serves
the client request to enable Ethernet rxtx call-backs for a given port|device_id and queue.
*The server copies the rx/tx packets to the passed mempool and enqueues them to the ring for the secondary process.
*Pdump tool dequeues the packets from the ring and writes them to the PCAP PMD vdev,
so ultimately packets will be seen on the device that is passed in rx-dev|tx-dev.
*Once the pdump tool is terminated with SIGINT it will disable the packet capturing.
*Library receives the disable packet capture request, communicates the info to the server,
and the server removes the Ethernet rxtx call-backs.
*Captured packets can be seen using the tcpdump command
"tcpdump -ni <iface>" (or) "tcpdump -nr <pcapfile>"
5)Example command line
======================
./build/app/dpdk_pdump -- --pdump 'device_id=0000:02:0.0,queue=*,tx-dev=/tmp/dt-file.pcap,rx-dev=/tmp/dr-file.pcap,ring-size=8192,mbuf-size=2176,total-num-mbufs=32768' --pdump 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/d-file.pcap,tx-dev=/tmp/d-file.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'
v7:
fixed lines over 90 characters.
v6:
removed below deprecation notice patch from patch set.
http://dpdk.org/dev/patchwork/patch/13372/
v5:
addressed code review comments for below patches
http://dpdk.org/dev/patchwork/patch/12955/
http://dpdk.org/dev/patchwork/patch/12951/
v4:
added missing deprecation notice for ABI changes of rte_eth_dev_info structure.
made doc changes as per doc guidelines.
replaced rte_eal_vdev_init with rte_eth_dev_attach in pdump tool.
removed rxtx-dev parameter from pdump tool command line.
v3:
app/pdump: Moved cleanup code from signal handler to main.
divided librte_ether changes into multiple patches.
example command changed in app/pdump application guide
v2:
fix compilation issues for 4.8.3
fix unnecessary #includes
Reshma Pattan (8):
librte_ether: protect add/remove of rxtx callbacks with spinlocks
librte_ether: add new api rte_eth_add_first_rx_callback
librte_ether: add new fields to rte_eth_dev_info struct
librte_ether: make rte_eth_dev_get_port_by_name
rte_eth_dev_get_name_by_port public
lib/librte_pdump: add new library for packet capturing support
app/pdump: add pdump tool for packet capturing
app/test-pmd: add pdump initialization uninitialization
doc: update doc for packet capture framework
MAINTAINERS | 8 +
app/Makefile | 1 +
app/pdump/Makefile | 45 ++
app/pdump/main.c | 844 +++++++++++++++++++++++++++++++
app/test-pmd/testpmd.c | 6 +
config/common_base | 5 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 107 ++++
doc/guides/rel_notes/release_16_07.rst | 13 +
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 +++++
lib/Makefile | 1 +
lib/librte_ether/rte_ethdev.c | 123 +++--
lib/librte_ether/rte_ethdev.h | 60 +++
lib/librte_ether/rte_ether_version.map | 9 +
lib/librte_pdump/Makefile | 55 ++
lib/librte_pdump/rte_pdump.c | 872 ++++++++++++++++++++++++++++++++
lib/librte_pdump/rte_pdump.h | 186 +++++++
lib/librte_pdump/rte_pdump_version.map | 12 +
mk/rte.app.mk | 1 +
20 files changed, 2428 insertions(+), 44 deletions(-)
create mode 100644 app/pdump/Makefile
create mode 100644 app/pdump/main.c
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
create mode 100644 lib/librte_pdump/Makefile
create mode 100644 lib/librte_pdump/rte_pdump.c
create mode 100644 lib/librte_pdump/rte_pdump.h
create mode 100644 lib/librte_pdump/rte_pdump_version.map
--
2.5.0
^ permalink raw reply [relevance 2%]
* [dpdk-dev] [PATCH v7 8/8] doc: update doc for packet capture framework
2016-06-09 16:10 2% ` [dpdk-dev] [PATCH v7 0/8] add " Reshma Pattan
@ 2016-06-09 16:10 5% ` Reshma Pattan
2016-06-09 17:34 0% ` [dpdk-dev] [PATCH v7 0/8] add " Ananyev, Konstantin
1 sibling, 0 replies; 200+ results
From: Reshma Pattan @ 2016-06-09 16:10 UTC (permalink / raw)
To: dev; +Cc: Reshma Pattan
Added programmers guide for librte_pdump.
Added sample application guide for app/pdump application.
Updated release note for packet capture framework changes.
Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
Acked-by: John McNamara <john.mcnamara@intel.com>
---
MAINTAINERS | 3 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 107 ++++++++++++++++++++++++++++
doc/guides/rel_notes/release_16_07.rst | 13 ++++
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 ++++++++++++++++++++++++++++++++
6 files changed, 247 insertions(+)
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index a48c8de..ce7c941 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -436,6 +436,9 @@ Pdump
M: Reshma Pattan <reshma.pattan@intel.com>
F: lib/librte_pdump/
F: app/pdump/
+F: doc/guides/prog_guide/pdump_library.rst
+F: doc/guides/sample_app_ug/pdump.rst
+
Hierarchical scheduler
M: Cristian Dumitrescu <cristian.dumitrescu@intel.com>
diff --git a/doc/guides/prog_guide/index.rst b/doc/guides/prog_guide/index.rst
index b862d0c..4caf969 100644
--- a/doc/guides/prog_guide/index.rst
+++ b/doc/guides/prog_guide/index.rst
@@ -71,6 +71,7 @@ Programmer's Guide
writing_efficient_code
profile_app
glossary
+ pdump_library
**Figures**
diff --git a/doc/guides/prog_guide/pdump_library.rst b/doc/guides/prog_guide/pdump_library.rst
new file mode 100644
index 0000000..1809234
--- /dev/null
+++ b/doc/guides/prog_guide/pdump_library.rst
@@ -0,0 +1,107 @@
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+.. _pdump_library:
+
+The librte_pdump Library
+========================
+
+The ``librte_pdump`` library provides a framework for packet capturing in DPDK.
+The library provides the following APIs to initialize the packet capture framework, to enable
+or disable the packet capture, and to uninitialize it:
+
+* ``rte_pdump_init()``:
+ This API initializes the packet capture framework.
+
+* ``rte_pdump_enable()``:
+ This API enables the packet capture on a given port and queue.
+ Note: The filter option in the API is a place holder for future enhancements.
+
+* ``rte_pdump_enable_by_deviceid()``:
+ This API enables the packet capture on a given device id (``vdev name or pci address``) and queue.
+ Note: The filter option in the API is a place holder for future enhancements.
+
+* ``rte_pdump_disable()``:
+ This API disables the packet capture on a given port and queue.
+
+* ``rte_pdump_disable_by_deviceid()``:
+ This API disables the packet capture on a given device id (``vdev name or pci address``) and queue.
+
+* ``rte_pdump_uninit()``:
+ This API uninitializes the packet capture framework.
+
+
+Operation
+---------
+
+The ``librte_pdump`` library works on a client/server model. The server is responsible for enabling or
+disabling the packet capture and the clients are responsible for requesting the enabling or disabling of
+the packet capture.
+
+The packet capture framework, as part of its initialization, creates the pthread and the server socket in
+the pthread. The application that calls the framework initialization first will have the server socket created.
+Further calls to the framework initialization by the same application or other applications is not allowed i.e., only
+one server socket is allowed on the system. So the other applications can only request enabling or disabling of
+the packet capture at which point the client socket is created for them to send the request to the server.
+The server socket will listen for client requests for enabling or disabling the packet capture.
+
+
+Implementation Details
+----------------------
+
+The library API ``rte_pdump_init()``, initializes the packet capture framework by creating the pthread and the server
+socket. The server socket in the pthread context will be listening to the client requests to enable or disable the
+packet capture. Whoever calls this API first will have the server socket created, the subsequent calls to this APIs
+will not create any further server socket. i.e. only one server socket is allowed.
+
+The library APIs ``rte_pdump_enable()`` and ``rte_pdump_enable_by_deviceid()`` enables the packet capture.
+On each call to these APIs, the library creates a separate client socket, creates the "pdump enable" request and sends
+the request to the server. The server that is listening on the socket will take the request and enable the packet capture
+by registering the Ethernet RX and TX callbacks for the given port or device_id and queue combinations.
+Then the server will mirror the packets to the new mempool and enqueue them to the rte_ring that clients have passed
+to these APIs. The server also sends the response back to the client about the status of the request that was processed.
+After the response is received from the server, the client socket is closed.
+
+The library APIs ``rte_pdump_disable()`` and ``rte_pdump_disable_by_deviceid()`` disables the packet capture.
+On each call to these APIs, the library creates a separate client socket, creates the "pdump disable" request and sends
+the request to the server. The server that is listening on the socket will take the request and disable the packet
+capture by removing the Ethernet RX and TX callbacks for the given port or device_id and queue combinations. The server
+also sends the response back to the client about the status of the request that was processed. After the response is
+received from the server, the client socket is closed.
+
+The library API ``rte_pdump_uninit()`` uninitializes the packet capture framework by closing the pthread and the
+server socket.
+
+
+Use Case: Packet Capturing
+--------------------------
+
+The DPDK ``app/pdump`` tool is developed based on this library to capture packets in DPDK.
+Users can use this as an example to develop their own packet capturing application.
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index c0f6b02..a4de2a2 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -66,6 +66,11 @@ New Features
* Enable RSS per network interface through the configuration file.
* Streamline the CLI code.
+* **Added packet capture framework.**
+
+ * A new library ``librte_pdump`` is added to provide packet capture APIs.
+ * A new ``app/pdump`` tool is added to capture packets in DPDK.
+
Resolved Issues
---------------
@@ -135,6 +140,11 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* Function ``rte_eth_dev_get_port_by_name`` changed to a public API.
+
+* Function ``rte_eth_dev_info_get`` updated to return new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ in the ``rte_eth_dev_info`` object.
+
ABI Changes
-----------
@@ -146,6 +156,9 @@ ABI Changes
* The ``rte_port_source_params`` structure has new fields to support PCAP file.
It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
+* The ``rte_eth_dev_info`` structure has new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ to support number of queues configured by software.
+
Shared Library Versions
-----------------------
diff --git a/doc/guides/sample_app_ug/index.rst b/doc/guides/sample_app_ug/index.rst
index 930f68c..96bb317 100644
--- a/doc/guides/sample_app_ug/index.rst
+++ b/doc/guides/sample_app_ug/index.rst
@@ -76,6 +76,7 @@ Sample Applications User Guide
ptpclient
performance_thread
ipsec_secgw
+ pdump
**Figures**
diff --git a/doc/guides/sample_app_ug/pdump.rst b/doc/guides/sample_app_ug/pdump.rst
new file mode 100644
index 0000000..96c8709
--- /dev/null
+++ b/doc/guides/sample_app_ug/pdump.rst
@@ -0,0 +1,122 @@
+
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+dpdk_pdump Application
+======================
+
+The ``dpdk_pdump`` application is a Data Plane Development Kit (DPDK) application that runs as a DPDK secondary process and
+is capable of enabling the packet capture on DPDK ports.
+
+
+Running the Application
+-----------------------
+
+The application has a ``--pdump`` command line option with various sub-arguments:
+
+.. code-block:: console
+
+ ./build/app/dpdk_pdump --
+ --pdump '(port=<port id> | device_id=<pci id or vdev name>),
+ (queue=<queue_id>),
+ (rx-dev=<iface or pcap file> |
+ tx-dev=<iface or pcap file>),
+ [ring-size=<ring size>],
+ [mbuf-size=<mbuf data size>],
+ [total-num-mbufs=<number of mbufs>]'
+
+Note:
+
+* Parameters inside the parentheses represent mandatory parameters.
+
+* Parameters inside the square brackets represent optional parameters.
+
+Multiple instances of ``--pdump`` can be passed to capture packets on different port and queue combinations.
+
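+For example, the following illustrative command captures queue 0 of ports 0 and 1 at the same time:
+
+.. code-block:: console
+
+ ./build/app/dpdk_pdump -- --pdump 'port=0,queue=0,rx-dev=/tmp/rx0.pcap' \
+  --pdump 'port=1,queue=0,rx-dev=/tmp/rx1.pcap'
+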
+
+Parameters
+~~~~~~~~~~
+
+``port``:
+Port id of the eth device on which packets should be captured.
+
+``device_id``:
+PCI address or name of the eth device on which packets should be captured.
+
+ .. Note::
+
+ * As of now the ``dpdk_pdump`` tool cannot capture the packets of virtual devices
+ in the primary process due to a bug in the ethdev library. Due to this bug, in a multi-process context,
+ when the primary and secondary processes have different ports set, the secondary process
+ (here the ``dpdk_pdump`` tool) overwrites the ``rte_eth_devices[]`` entries of the primary process.
+
+``queue``:
+Queue id of the eth device on which packets should be captured. The user can pass a queue value of ``*`` to enable
+packet capture on all queues of the eth device.
+
+``rx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+``tx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+ .. Note::
+
+ * To receive ingress packets only, ``rx-dev`` should be passed.
+
+ * To receive egress packets only, ``tx-dev`` should be passed.
+
+ * To receive ingress and egress packets separately, ``rx-dev`` and ``tx-dev``
+ should both be passed with different file names or different Linux iface names.
+
+ * To receive ingress and egress packets together, ``rx-dev`` and ``tx-dev``
+ should both be passed with the same file name or the same Linux iface name.
+
+``ring-size``:
+Size of the ring. This value is used internally for ring creation. The ring will be used to enqueue the packets from
+the primary application to the secondary. This is an optional parameter with default size 16384.
+
+``mbuf-size``:
+Size of the mbuf data. This is used internally for mempool creation. Ideally this value must be the same as
+the primary application's mempool's mbuf data size, which is used for packet RX. This is an optional parameter with
+default size 2176.
+
+``total-num-mbufs``:
+Total number of mbufs in the mempool. This is used internally for mempool creation. This is an optional parameter with default
+value 65535.
+
+
+Example
+-------
+
+.. code-block:: console
+
+ $ sudo ./build/app/dpdk_pdump -- --pdump 'port=0,queue=*,rx-dev=/tmp/rx.pcap'
--
2.5.0
^ permalink raw reply [relevance 5%]
* Re: [dpdk-dev] [PATCH v7 0/8] add packet capture framework
2016-06-09 16:10 2% ` [dpdk-dev] [PATCH v7 0/8] add " Reshma Pattan
2016-06-09 16:10 5% ` [dpdk-dev] [PATCH v7 8/8] doc: update doc for " Reshma Pattan
@ 2016-06-09 17:34 0% ` Ananyev, Konstantin
1 sibling, 0 replies; 200+ results
From: Ananyev, Konstantin @ 2016-06-09 17:34 UTC (permalink / raw)
To: Pattan, Reshma, dev
>
> This patch set include below changes
>
> 1)Changes to librte_ether.
> 2)A new library librte_pdump added for packet capture framework.
> 3)A new app/pdump tool added for packet capturing.
> 4)Test pmd changes done to initialize packet capture framework.
> 5)Documentation update.
>
> 1)librte_pdump
> ==============
> To support packet capturing on dpdk Ethernet devices, a new library librte_pdump
> is added. Users can develop their own packet capturing application using the new library APIs.
>
> Operation:
> ----------
> Pdump library provides APIs to support packet capturing on dpdk Ethernet devices.
> Library provides APIs to initialize the packet capture framework, enable/disable
> the packet capture and uninitialize the packet capture framework.
>
> Pdump library works on client/server based model.
>
> Server is responsible for enabling/disabling the packet captures.
> Clients are responsible for requesting enable/disable of the
> packet captures.
>
> As part of packet capture framework initialization, pthread and
> the server socket is created. Only one server socket is allowed on the system.
> As part of enabling/disabling the packet capture, client sockets are created
> and multiple client sockets are allowed.
> Whoever calls initialization first will succeed with the initialization;
> subsequent calls to initialization are not allowed. So subsequent users can only
> request enabling/disabling the packet capture.
>
> Applications using below APIs need to pass port/device_id, queue, mempool and
> ring parameters. Library uses user provided ring and mempool to mirror the rx/tx
> packets of the port for users. Users need to dequeue the rings and write the packets
> to vdev(pcap/tuntap) to view the packets using any standard tools.
>
> Note:
> Mempool and Ring should be mc/mp supportable.
> Mempool mbuf size should be big enough to handle the rx/tx packets of a port.
>
> APIs:
> -----
> rte_pdump_init()
> rte_pdump_enable()
> rte_pdump_enable_by_deviceid()
> rte_pdump_disable()
> rte_pdump_disable_by_deviceid()
> rte_pdump_uninit()
>
> 2)app/pdump tool
> ================
> Tool app/pdump is designed based on librte_pdump for packet capturing in DPDK.
> This tool by default runs as secondary process, and provides the support for
> the command line options for packet capture.
>
> ./build/app/dpdk_pdump --
> --pdump '(port=<port id> | device_id=<pci id or vdev name>),
> (queue=<queue id>),
> (rx-dev=<iface or pcap file> |
> tx-dev=<iface or pcap file>),
> [ring-size=<ring size>],
> [mbuf-size=<mbuf data size>],
> [total-num-mbufs=<number of mbufs>]'
>
> Parameters inside the parentheses represent the mandatory parameters.
> Parameters inside the square brackets represent optional parameters.
> Users have to pass the packet capture parameters under --pdump; multiple
> --pdump options can be passed to capture packets on different port and queue combinations.
>
> Operation:
> ----------
> *Tool parses the user command line arguments,
> creates the mempool, ring and the PCAP PMD vdev with 'tx_stream' as either
> of the devices passed in the rx-dev|tx-dev parameters.
>
> *Then calls the APIs of librte_pdump i.e. rte_pdump_enable()/rte_pdump_enable_by_deviceid()
> to enable packet capturing on a specific port/device_id and queue by passing on
> port|device_id, queue, mempool and ring info.
>
> *Tool runs in a while loop to dequeue the packets from the ring and write them to the pcap device.
>
> *Tool can be stopped using SIGINT, upon which the tool calls
> rte_pdump_disable()/rte_pdump_disable_by_deviceid() and frees the allocated resources.
>
> Note:
> CONFIG_RTE_LIBRTE_PMD_PCAP flag should be set to yes to compile and run the pdump tool.
>
> 3)Test-pmd changes
> ==================
> Changes are done to test-pmd application to initialize/uninitialize the packet capture framework.
> So app/pdump tool can be run to see packets of dpdk ports that are used by test-pmd.
>
> Similarly any application which needs packet capture should call initialize/uninitialize APIs of
> librte_pdump and use pdump tool to start the capture.
>
> 4)Packet capture flow between pdump tool and librte_pdump
> =========================================================
> * Pdump tool (Secondary process) requests packet capture
> for specific port|device_id and queue combinations.
>
> *Library in secondary process context creates client socket and communicates
> the port|device_id, queue, ring and mempool to server.
>
> *Library initializes server in primary process 'test-pmd' context and server serves
> the client request to enable Ethernet rxtx call-backs for a given port|device_id and queue.
>
> *Copy the rx/tx packets to the passed mempool and enqueue them to the ring for the secondary process.
>
> *Pdump tool will dequeue the packets from the ring and write them to the PCAP PMD vdev,
> so ultimately packets will be seen on the device that is passed in rx-dev|tx-dev.
>
> *Once the pdump tool is terminated with SIGINT it will disable the packet capturing.
>
> *Library receives the disable packet capture request, communicates the info to the server,
> and the server will remove the Ethernet rxtx call-backs.
>
> *Packet capture can be seen using tcpdump command
> "tcpdump -ni <iface>" (or) "tcpdump –nr <pcapfile>"
>
> 5)Example command line
> ======================
> ./build/app/dpdk_pdump -- --pdump 'device_id=0000:02:0.0,queue=*,tx-dev=/tmp/dt-file.pcap,rx-dev=/tmp/dr-file.pcap,ring-
> size=8192,mbuf-size=2176,total-num-mbufs=32768' --pdump 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/d-file.pcap,tx-
> dev=/tmp/d-file.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'
>
> v7:
> fixed lines over 90 characters.
>
> v6:
> removed below deprecation notice patch from patch set.
> http://dpdk.org/dev/patchwork/patch/13372/
>
> v5:
> addressed code review comments for below patches
> http://dpdk.org/dev/patchwork/patch/12955/
> http://dpdk.org/dev/patchwork/patch/12951/
>
> v4:
> added missing deprecation notice for ABI changes of rte_eth_dev_info structure.
> made doc changes as per doc guidelines.
> replaced rte_eal_vdev_init with rte_eth_dev_attach in pdump tool.
> removed rxtx-dev parameter from pdump tool command line.
>
> v3:
> app/pdump: Moved cleanup code from signal handler to main.
> divided librte_ether changes into multiple patches.
> example command changed in app/pdump application guide
>
> v2:
> fix compilation issues for 4.8.3
> fix unnecessary #includes
>
>
> Reshma Pattan (8):
> librte_ether: protect add/remove of rxtx callbacks with spinlocks
> librte_ether: add new api rte_eth_add_first_rx_callback
> librte_ether: add new fields to rte_eth_dev_info struct
> librte_ether: make rte_eth_dev_get_port_by_name
> rte_eth_dev_get_name_by_port public
> lib/librte_pdump: add new library for packet capturing support
> app/pdump: add pdump tool for packet capturing
> app/test-pmd: add pdump initialization uninitialization
> doc: update doc for packet capture framework
>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> --
> 2.5.0
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v3] log: deprecate history dump
2016-06-09 15:06 5% ` [dpdk-dev] [PATCH v2] " Thomas Monjalon
@ 2016-06-09 22:10 5% ` Thomas Monjalon
2016-06-10 9:50 0% ` David Marchand
0 siblings, 1 reply; 200+ results
From: Thomas Monjalon @ 2016-06-09 22:10 UTC (permalink / raw)
To: david.marchand; +Cc: dev
The log history uses rte_mempool. In order to remove the mempool
dependency in EAL (and improve the build), this feature is deprecated.
The ABI is kept but the behaviour is now voided because it seems this
function was not used. The history can be read from syslog.
Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
---
v3:
- keep mempool header path in linuxapp for CONFIG_RTE_LIBRTE_IVSHMEM
v2:
- remove more mempool and log history traces
- add a warning if enabling log history
- move not related mempool includes cleanup in another patch
---
app/test-pmd/cmdline.c | 3 -
app/test/autotest_data.py | 6 --
app/test/autotest_test_funcs.py | 5 --
app/test/commands.c | 4 +-
app/test/test_logs.c | 3 -
doc/guides/prog_guide/mempool_lib.rst | 4 +-
doc/guides/rel_notes/deprecation.rst | 3 +
lib/librte_eal/bsdapp/eal/Makefile | 1 -
lib/librte_eal/bsdapp/eal/eal_debug.c | 6 --
lib/librte_eal/common/eal_common_log.c | 148 ++------------------------------
lib/librte_eal/common/eal_private.h | 3 -
lib/librte_eal/common/include/rte_log.h | 8 ++
lib/librte_eal/linuxapp/eal/eal_debug.c | 6 --
lib/librte_eal/linuxapp/eal/eal_log.c | 9 +-
lib/librte_mempool/rte_mempool.c | 4 -
15 files changed, 20 insertions(+), 193 deletions(-)
diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 1921612..fd389ac 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -7268,8 +7268,6 @@ static void cmd_dump_parsed(void *parsed_result,
rte_dump_physmem_layout(stdout);
else if (!strcmp(res->dump, "dump_memzone"))
rte_memzone_dump(stdout);
- else if (!strcmp(res->dump, "dump_log_history"))
- rte_log_dump_history(stdout);
else if (!strcmp(res->dump, "dump_struct_sizes"))
dump_struct_sizes();
else if (!strcmp(res->dump, "dump_ring"))
@@ -7284,7 +7282,6 @@ cmdline_parse_token_string_t cmd_dump_dump =
TOKEN_STRING_INITIALIZER(struct cmd_dump_result, dump,
"dump_physmem#"
"dump_memzone#"
- "dump_log_history#"
"dump_struct_sizes#"
"dump_ring#"
"dump_mempool#"
diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index 78d2edd..6c87809 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -94,12 +94,6 @@ parallel_test_group_list = [
"Report" : None,
},
{
- "Name" : "Dump log history",
- "Command" : "dump_log_history",
- "Func" : dump_autotest,
- "Report" : None,
- },
- {
"Name" : "Dump rings",
"Command" : "dump_ring",
"Func" : dump_autotest,
diff --git a/app/test/autotest_test_funcs.py b/app/test/autotest_test_funcs.py
index b60b941..14cffd0 100644
--- a/app/test/autotest_test_funcs.py
+++ b/app/test/autotest_test_funcs.py
@@ -144,16 +144,11 @@ def logs_autotest(child, test_name):
i = 0
child.sendline(test_name)
- # logs sequence is printed twice because of history dump
log_list = [
"TESTAPP1: error message",
"TESTAPP1: critical message",
"TESTAPP2: critical message",
"TESTAPP1: error message",
- "TESTAPP1: error message",
- "TESTAPP1: critical message",
- "TESTAPP2: critical message",
- "TESTAPP1: error message",
]
for log_msg in log_list:
diff --git a/app/test/commands.c b/app/test/commands.c
index e0af8e4..2df46b0 100644
--- a/app/test/commands.c
+++ b/app/test/commands.c
@@ -150,8 +150,6 @@ static void cmd_dump_parsed(void *parsed_result,
rte_dump_physmem_layout(stdout);
else if (!strcmp(res->dump, "dump_memzone"))
rte_memzone_dump(stdout);
- else if (!strcmp(res->dump, "dump_log_history"))
- rte_log_dump_history(stdout);
else if (!strcmp(res->dump, "dump_struct_sizes"))
dump_struct_sizes();
else if (!strcmp(res->dump, "dump_ring"))
@@ -164,7 +162,7 @@ static void cmd_dump_parsed(void *parsed_result,
cmdline_parse_token_string_t cmd_dump_dump =
TOKEN_STRING_INITIALIZER(struct cmd_dump_result, dump,
- "dump_physmem#dump_memzone#dump_log_history#"
+ "dump_physmem#dump_memzone#"
"dump_struct_sizes#dump_ring#dump_mempool#"
"dump_devargs");
diff --git a/app/test/test_logs.c b/app/test/test_logs.c
index 05aa862..d0a9962 100644
--- a/app/test/test_logs.c
+++ b/app/test/test_logs.c
@@ -83,9 +83,6 @@ test_logs(void)
RTE_LOG(ERR, TESTAPP1, "error message\n");
RTE_LOG(ERR, TESTAPP2, "error message (not displayed)\n");
- /* print again the previous logs */
- rte_log_dump_history(stdout);
-
return 0;
}
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index 5fae79a..c3afc2e 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -38,9 +38,7 @@ In the DPDK, it is identified by name and uses a ring to store free objects.
It provides some other optional services such as a per-core object cache and
an alignment helper to ensure that objects are padded to spread them equally on all DRAM or DDR3 channels.
-This library is used by the
-:ref:`Mbuf Library <Mbuf_Library>` and the
-:ref:`Environment Abstraction Layer <Environment_Abstraction_Layer>` (for logging history).
+This library is used by the :ref:`Mbuf Library <Mbuf_Library>`.
Cookies
-------
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ad05eba..bda40c1 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -8,6 +8,9 @@ API and ABI deprecation notices are to be posted here.
Deprecation Notices
-------------------
+* The log history is deprecated.
+ It is voided in 16.07 and will be removed in release 16.11.
+
* The ethdev hotplug API is going to be moved to EAL with a notification
mechanism added to crypto and ethdev libraries so that hotplug is now
available to both of them. This API will be stripped of the device arguments
diff --git a/lib/librte_eal/bsdapp/eal/Makefile b/lib/librte_eal/bsdapp/eal/Makefile
index 9054ad6..474651b 100644
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@@ -41,7 +41,6 @@ CFLAGS += -I$(SRCDIR)/include
CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
CFLAGS += -I$(RTE_SDK)/lib/librte_ring
-CFLAGS += -I$(RTE_SDK)/lib/librte_mempool
CFLAGS += $(WERROR_FLAGS) -O3
LDLIBS += -lexecinfo
diff --git a/lib/librte_eal/bsdapp/eal/eal_debug.c b/lib/librte_eal/bsdapp/eal/eal_debug.c
index 907fbfa..5fbc17c 100644
--- a/lib/librte_eal/bsdapp/eal/eal_debug.c
+++ b/lib/librte_eal/bsdapp/eal/eal_debug.c
@@ -77,9 +77,6 @@ void __rte_panic(const char *funcname, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
va_start(ap, format);
rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
@@ -98,9 +95,6 @@ rte_exit(int exit_code, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
if (exit_code != 0)
RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
" Cause: ", exit_code);
diff --git a/lib/librte_eal/common/eal_common_log.c b/lib/librte_eal/common/eal_common_log.c
index b5e37bb..7916c78 100644
--- a/lib/librte_eal/common/eal_common_log.c
+++ b/lib/librte_eal/common/eal_common_log.c
@@ -31,54 +31,16 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <stdarg.h>
-#include <sys/types.h>
#include <stdlib.h>
-#include <unistd.h>
-#include <inttypes.h>
-#include <errno.h>
-#include <sys/queue.h>
#include <rte_log.h>
-#include <rte_memory.h>
-#include <rte_memzone.h>
-#include <rte_launch.h>
-#include <rte_common.h>
-#include <rte_cycles.h>
-#include <rte_eal.h>
#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_atomic.h>
-#include <rte_debug.h>
-#include <rte_spinlock.h>
-#include <rte_branch_prediction.h>
-#include <rte_ring.h>
-#include <rte_mempool.h>
#include "eal_private.h"
-#define LOG_ELT_SIZE 2048
-
-#define LOG_HISTORY_MP_NAME "log_history"
-
-STAILQ_HEAD(log_history_list, log_history);
-
-/**
- * The structure of a message log in the log history.
- */
-struct log_history {
- STAILQ_ENTRY(log_history) next;
- unsigned size;
- char buf[0];
-};
-
-static struct rte_mempool *log_history_mp = NULL;
-static unsigned log_history_size = 0;
-static struct log_history_list log_history;
-
/* global log structure */
struct rte_logs rte_logs = {
.type = ~0,
@@ -86,10 +48,7 @@ struct rte_logs rte_logs = {
.file = NULL,
};
-static rte_spinlock_t log_dump_lock = RTE_SPINLOCK_INITIALIZER;
-static rte_spinlock_t log_list_lock = RTE_SPINLOCK_INITIALIZER;
static FILE *default_log_stream;
-static int history_enabled = 1;
/**
* This global structure stores some informations about the message
@@ -106,59 +65,16 @@ static RTE_DEFINE_PER_LCORE(struct log_cur_msg, log_cur_msg);
/* default logs */
int
-rte_log_add_in_history(const char *buf, size_t size)
+rte_log_add_in_history(const char *buf __rte_unused, size_t size __rte_unused)
{
- struct log_history *hist_buf = NULL;
- static const unsigned hist_buf_size = LOG_ELT_SIZE - sizeof(*hist_buf);
- void *obj;
-
- if (history_enabled == 0)
- return 0;
-
- rte_spinlock_lock(&log_list_lock);
-
- /* get a buffer for adding in history */
- if (log_history_size > RTE_LOG_HISTORY) {
- hist_buf = STAILQ_FIRST(&log_history);
- if (hist_buf) {
- STAILQ_REMOVE_HEAD(&log_history, next);
- log_history_size--;
- }
- }
- else {
- if (rte_mempool_mc_get(log_history_mp, &obj) < 0)
- obj = NULL;
- hist_buf = obj;
- }
-
- /* no buffer */
- if (hist_buf == NULL) {
- rte_spinlock_unlock(&log_list_lock);
- return -ENOBUFS;
- }
-
- /* not enough room for msg, buffer go back in mempool */
- if (size >= hist_buf_size) {
- rte_mempool_mp_put(log_history_mp, hist_buf);
- rte_spinlock_unlock(&log_list_lock);
- return -ENOBUFS;
- }
-
- /* add in history */
- memcpy(hist_buf->buf, buf, size);
- hist_buf->buf[size] = hist_buf->buf[hist_buf_size-1] = '\0';
- hist_buf->size = size;
- STAILQ_INSERT_TAIL(&log_history, hist_buf, next);
- log_history_size++;
- rte_spinlock_unlock(&log_list_lock);
-
return 0;
}
void
rte_log_set_history(int enable)
{
- history_enabled = enable;
+ if (enable)
+ RTE_LOG(WARNING, EAL, "The log history is deprecated.\n");
}
/* Change the stream that will be used by logging system */
@@ -217,44 +133,8 @@ int rte_log_cur_msg_logtype(void)
/* Dump log history to file */
void
-rte_log_dump_history(FILE *out)
+rte_log_dump_history(FILE *out __rte_unused)
{
- struct log_history_list tmp_log_history;
- struct log_history *hist_buf;
- unsigned i;
-
- /* only one dump at a time */
- rte_spinlock_lock(&log_dump_lock);
-
- /* save list, and re-init to allow logging during dump */
- rte_spinlock_lock(&log_list_lock);
- tmp_log_history = log_history;
- STAILQ_INIT(&log_history);
- log_history_size = 0;
- rte_spinlock_unlock(&log_list_lock);
-
- for (i=0; i<RTE_LOG_HISTORY; i++) {
-
- /* remove one message from history list */
- hist_buf = STAILQ_FIRST(&tmp_log_history);
-
- if (hist_buf == NULL)
- break;
-
- STAILQ_REMOVE_HEAD(&tmp_log_history, next);
-
- /* write on stdout */
- if (fwrite(hist_buf->buf, hist_buf->size, 1, out) == 0) {
- rte_mempool_mp_put(log_history_mp, hist_buf);
- break;
- }
-
- /* put back message structure in pool */
- rte_mempool_mp_put(log_history_mp, hist_buf);
- }
- fflush(out);
-
- rte_spinlock_unlock(&log_dump_lock);
}
/*
@@ -297,29 +177,11 @@ rte_log(uint32_t level, uint32_t logtype, const char *format, ...)
}
/*
- * called by environment-specific log init function to initialize log
- * history
+ * called by environment-specific log init function
*/
int
rte_eal_common_log_init(FILE *default_log)
{
- STAILQ_INIT(&log_history);
-
- /* reserve RTE_LOG_HISTORY*2 elements, so we can dump and
- * keep logging during this time */
- log_history_mp = rte_mempool_create(LOG_HISTORY_MP_NAME, RTE_LOG_HISTORY*2,
- LOG_ELT_SIZE, 0, 0,
- NULL, NULL,
- NULL, NULL,
- SOCKET_ID_ANY, MEMPOOL_F_NO_PHYS_CONTIG);
-
- if ((log_history_mp == NULL) &&
- ((log_history_mp = rte_mempool_lookup(LOG_HISTORY_MP_NAME)) == NULL)){
- RTE_LOG(ERR, EAL, "%s(): cannot create log_history mempool\n",
- __func__);
- return -1;
- }
-
default_log_stream = default_log;
rte_openlog_stream(default_log);
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 2342fa1..857dc3e 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -49,9 +49,6 @@ int rte_eal_memzone_init(void);
/**
* Common log initialization function (private to eal).
*
- * Called by environment-specific log initialization function to initialize
- * log history.
- *
* @param default_log
* The default log stream to be used.
* @return
diff --git a/lib/librte_eal/common/include/rte_log.h b/lib/librte_eal/common/include/rte_log.h
index 2e47e7f..b1add04 100644
--- a/lib/librte_eal/common/include/rte_log.h
+++ b/lib/librte_eal/common/include/rte_log.h
@@ -42,6 +42,8 @@
* This file provides a log API to RTE applications.
*/
+#include "rte_common.h" /* for __rte_deprecated macro */
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -179,22 +181,27 @@ int rte_log_cur_msg_loglevel(void);
int rte_log_cur_msg_logtype(void);
/**
+ * @deprecated
* Enable or disable the history (enabled by default)
*
* @param enable
* true to enable, or 0 to disable history.
*/
+__rte_deprecated
void rte_log_set_history(int enable);
/**
+ * @deprecated
* Dump the log history to a file
*
* @param f
* A pointer to a file for output
*/
+__rte_deprecated
void rte_log_dump_history(FILE *f);
/**
+ * @deprecated
* Add a log message to the history.
*
* This function can be called from a user-defined log stream. It adds
@@ -209,6 +216,7 @@ void rte_log_dump_history(FILE *f);
* - 0: Success.
* - (-ENOBUFS) if there is no room to store the message.
*/
+__rte_deprecated
int rte_log_add_in_history(const char *buf, size_t size);
/**
diff --git a/lib/librte_eal/linuxapp/eal/eal_debug.c b/lib/librte_eal/linuxapp/eal/eal_debug.c
index 907fbfa..5fbc17c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_debug.c
+++ b/lib/librte_eal/linuxapp/eal/eal_debug.c
@@ -77,9 +77,6 @@ void __rte_panic(const char *funcname, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
va_start(ap, format);
rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
@@ -98,9 +95,6 @@ rte_exit(int exit_code, const char *format, ...)
{
va_list ap;
- /* disable history */
- rte_log_set_history(0);
-
if (exit_code != 0)
RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
" Cause: ", exit_code);
diff --git a/lib/librte_eal/linuxapp/eal/eal_log.c b/lib/librte_eal/linuxapp/eal/eal_log.c
index 0b133c3..d391100 100644
--- a/lib/librte_eal/linuxapp/eal/eal_log.c
+++ b/lib/librte_eal/linuxapp/eal/eal_log.c
@@ -50,8 +50,7 @@
#include "eal_private.h"
/*
- * default log function, used once mempool (hence log history) is
- * available
+ * default log function
*/
static ssize_t
console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
@@ -60,9 +59,6 @@ console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
ssize_t ret;
uint32_t loglevel;
- /* add this log in history */
- rte_log_add_in_history(buf, size);
-
/* write on stdout */
ret = fwrite(buf, 1, size, stdout);
fflush(stdout);
@@ -110,8 +106,7 @@ rte_eal_log_init(const char *id, int facility)
/* early logs */
/*
- * early log function, used during boot when mempool (hence log
- * history) is not available
+ * early log function, used before rte_eal_log_init
*/
static ssize_t
early_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index b54de43..22a5645 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -1003,7 +1003,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
if (free == 0) {
if (cookie != RTE_MEMPOOL_HEADER_COOKIE1) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
@@ -1012,7 +1011,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
hdr->cookie = RTE_MEMPOOL_HEADER_COOKIE2;
} else if (free == 1) {
if (cookie != RTE_MEMPOOL_HEADER_COOKIE2) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
@@ -1022,7 +1020,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
} else if (free == 2) {
if (cookie != RTE_MEMPOOL_HEADER_COOKIE1 &&
cookie != RTE_MEMPOOL_HEADER_COOKIE2) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
@@ -1032,7 +1029,6 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
tlr = __mempool_get_trailer(obj);
cookie = tlr->cookie;
if (cookie != RTE_MEMPOOL_TRAILER_COOKIE) {
- rte_log_set_history(0);
RTE_LOG(CRIT, MEMPOOL,
"obj=%p, mempool=%p, cookie=%" PRIx64 "\n",
obj, (const void *) mp, cookie);
--
2.7.0
^ permalink raw reply [relevance 5%]
* Re: [dpdk-dev] [PATCH v3] log: deprecate history dump
2016-06-09 22:10 5% ` [dpdk-dev] [PATCH v3] " Thomas Monjalon
@ 2016-06-10 9:50 0% ` David Marchand
2016-06-10 13:09 0% ` Thomas Monjalon
0 siblings, 1 reply; 200+ results
From: David Marchand @ 2016-06-10 9:50 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: dev
On Fri, Jun 10, 2016 at 12:10 AM, Thomas Monjalon
<thomas.monjalon@6wind.com> wrote:
> The log history uses rte_mempool. In order to remove the mempool
> dependency in EAL (and improve the build), this feature is deprecated.
> The ABI is kept but the behaviour is now voided because it seems this
> function was not used. The history can be read from syslog.
>
> Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
Acked-by: David Marchand <david.marchand@6wind.com>
--
David Marchand
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v1] hash: add tsx support for cuckoo hash
@ 2016-06-10 11:09 0% ` De Lara Guarch, Pablo
0 siblings, 0 replies; 200+ results
From: De Lara Guarch, Pablo @ 2016-06-10 11:09 UTC (permalink / raw)
To: Shen, Wei1, Stephen Hemminger; +Cc: dev, Maciocco, Christian, Gobriel, Sameh
> -----Original Message-----
> From: Shen, Wei1
> Sent: Monday, May 09, 2016 5:52 PM
> To: Stephen Hemminger
> Cc: dev@dpdk.org; De Lara Guarch, Pablo; Maciocco, Christian; Gobriel,
> Sameh
> Subject: Re: [dpdk-dev] [PATCH v1] hash: add tsx support for cuckoo hash
>
> Hi Stephen,
>
> Greetings. Thanks for your great feedback. Let me address your concerns
> here.
>
> 1) It changes ABI, so it breaks old programs
> The patch uses the extra_flag field in the rte_hash_parameters struct to set
> the default insertion behavior. Today there is only one bit used by this flag
> (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT 0x1) and we used the
> next unused bit (RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD 0x2) in this
> patch. So the ABI is maintained.
Agree on this. Also, if the problem is on the rte_hash (because of the change of
the cache size or the addition of a new field at the end), that should not be a problem,
as far as I understand, since it is modifying an internal structure
(rte_hash was made internal back in v2.1).
>
> 2) What about older processors, need to detect and handle them at runtime.
> Correct. This patch is based on the previous Transactional Memory patch.
> Since these previous patches already assume the user checks for the presence
> of TSX, we build on top of this assumption. But I personally agree with you
> that handling the TSX check should be made easier.
> http://dpdk.org/ml/archives/dev/2015-June/018571.html
> http://dpdk.org/ml/archives/dev/2015-June/018566.html
>
> 3) Why can't this just be the default behavior with correct fallback to locking
> on older processors.
> This is an excellent point. We discussed this before. Our thought at that time
> was, since TSX insertion is a bit slower than without anything (TSX or other
> locks), it would benefit apps that are designed to have a single writer to the
> hash table (for instance in some master-slave model). We might need more
> feedback from users about whether making it the default is more desirable if
> most apps are designed in a multi-writer manner.
>
> Thanks,
>
>
> --
> Best,
>
> Wei Shen.
>
>
>
>
>
>
> On 5/6/16, 9:56 PM, "Stephen Hemminger"
> <stephen@networkplumber.org> wrote:
>
> >On Fri, 6 May 2016 21:05:02 +0100
> >Shen Wei <wei1.shen@intel.com> wrote:
> >
> >> --- a/lib/librte_hash/rte_cuckoo_hash.c
> >> +++ b/lib/librte_hash/rte_cuckoo_hash.c
> >> @@ -1,7 +1,7 @@
> >> /*-
> >> * BSD LICENSE
> >> *
> >> - * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
> >> + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
> >> * All rights reserved.
> >> *
> >> * Redistribution and use in source and binary forms, with or without
> >> @@ -100,7 +100,9 @@ EAL_REGISTER_TAILQ(rte_hash_tailq)
> >>
> >> #define KEY_ALIGNMENT 16
> >>
> >> -#define LCORE_CACHE_SIZE 8
> >> +#define LCORE_CACHE_SIZE 64
> >> +
> >> +#define RTE_HASH_BFS_QUEUEs_MAX_LEN 5000
> >>
> >> #if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
> >> /*
> >> @@ -190,6 +192,7 @@ struct rte_hash {
> >> memory support */
> >> struct lcore_cache *local_free_slots;
> >> /**< Local cache per lcore, storing some indexes of the free slots */
> >> + uint8_t multiwrite_add; /**< Multi-write safe hash add behavior */
> >> } __rte_cache_aligned;
> >>
> >
> >I like the idea of using TSX to allow multi-writer safety, but there are
> >several problems with this patch.
> >
> >1) It changes ABI, so it breaks old programs
> >2) What about older processors, need to detect and handle them at
> runtime.
> >3) Why can't this just be the default behavior with correct
> > fallback to locking on older processors.
> >
> >Actually lock elision in DPDK is an interesting topic in general that
> >needs to be addressed.
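For reference, opting in to the proposed behaviour would look roughly like the
sketch below; RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD is the bit introduced by
this patch, the remaining fields follow the existing rte_hash API, and the
values are only illustrative:

    struct rte_hash_parameters params = {
        .name = "multi_writer_hash",
        .entries = 1024,
        .key_len = sizeof(uint32_t),
        .hash_func = rte_jhash,
        .socket_id = rte_socket_id(),
        .extra_flag = RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD,
    };
    struct rte_hash *h = rte_hash_create(&params);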
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v3] log: deprecate history dump
2016-06-10 9:50 0% ` David Marchand
@ 2016-06-10 13:09 0% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-06-10 13:09 UTC (permalink / raw)
To: dev; +Cc: David Marchand
> > The log history uses rte_mempool. In order to remove the mempool
> > dependency in EAL (and improve the build), this feature is deprecated.
> > The ABI is kept but the behaviour is now voided because it seems this
> > function was not used. The history can be read from syslog.
> >
> > Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
>
> Acked-by: David Marchand <david.marchand@6wind.com>
Applied
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v9 0/3] mempool: add external mempool manager
2016-06-03 14:58 2% ` [dpdk-dev] [PATCH v8 " David Hunt
@ 2016-06-10 15:16 2% ` David Hunt
2016-06-14 9:46 2% ` [dpdk-dev] [PATCH v10 " David Hunt
0 siblings, 1 reply; 200+ results
From: David Hunt @ 2016-06-10 15:16 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain
Here's the latest version of the External Mempool Manager patchset.
It's re-based on top of the latest head as of 09/6/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v9 changes:
* added a check for NULL alloc in rte_mempool_ops_register
* rte_mempool_alloc_t now returns int instead of void*
* fixed some comment typos
* removed some unneeded typecasts
* changed a return NULL to return -EEXIST in rte_mempool_ops_register
* fixed rte_mempool_version.map file so builds ok as shared libs
* moved flags check from rte_mempool_create_empty to rte_mempool_create
v8 changes:
* merged first three patches in the series into one.
* changed parameters to ops callback to all be rte_mempool pointer
rather than than pointer to opaque data or uint64.
* comment fixes.
* fixed parameter to _free function (was inconsistent).
* changed MEMPOOL_F_RING_CREATED to MEMPOOL_F_POOL_CREATED
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed hander_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_hander.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurrences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments, duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies to have one header file (rte_mempool.h), or it
would have generate cross dependencies issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file than standard mempool tests,
avoiding to duplicate the code
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degredation on sp cached operation
* removed stack hanler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The External Mempool Manager is an extension to the mempool API that allows
users to add and use an external mempool manager, which allows external memory
subsystems such as external hardware memory management systems and software
based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool manager will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing. These changes are all contained withing
RTE_NEXT_ABI defs, and the current or next code can be changed with
the CONFIG_RTE_NEXT_ABI config setting
There are two aspects to external mempool manager.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the REGISTER_MEMPOOL_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool
using the name parameter to identify which ops to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname() which sets the mempool's ops (functions)
3. An rte_mempool_populate_default() and rte_mempool_populate_anon() functions
which populates the mempool using the relevant ops
Several external mempool managers may be used in the same application. A new
mempool can then be created by using the new 'create' function, providing the
mempool ops struct name to point the mempool to the relevant mempool manager
callback structure.
The old 'create' function can still be called by legacy programs, and will
internally work out the mempool handle based on the flags provided (single
producer, single consumer, etc). By default handles are created internally to
implement the built-in DPDK mempool manager and mempool types.
The external mempool manager needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. put - puts an object back into the mempool once an application has
finished with it
3. get - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time a get/put/get_count is called from the application/PMD, the
callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the ops index.
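As a quick sketch of that sequence (error checks omitted; "my_ops" is just a
stand-in for whatever name the ops were registered under):

struct rte_mempool *mp;

mp = rte_mempool_create_empty("example_pool", 1024, 2048, 0, 0,
                              SOCKET_ID_ANY, 0);
rte_mempool_set_ops_byname(mp, "my_ops");
rte_mempool_populate_default(mp);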
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = rte_mempool_common_ring_alloc,
.put = common_ring_sp_put,
.get = common_ring_mc_get,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures
REGISTER_MEMPOOL_OPS(ops_sp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool manager using simple mallocs
for each mempool object. This file also contains the callbacks and self
registration for the new handler.
David Hunt (2):
mempool: support external mempool operations
mbuf: make default mempool ops configurable at build
Olivier Matz (1):
app/test: test external mempool manager
^ permalink raw reply [relevance 2%]
* [dpdk-dev] [PATCH v8 0/8] add packet capture framework
[not found] <1465487895-5870-1-git-send-email-reshma.pattan@intel.com>
@ 2016-06-10 16:18 2% ` Reshma Pattan
2016-06-10 16:18 5% ` [dpdk-dev] [PATCH v8 8/8] doc: update doc for " Reshma Pattan
` (2 more replies)
0 siblings, 3 replies; 200+ results
From: Reshma Pattan @ 2016-06-10 16:18 UTC (permalink / raw)
To: dev
This patch set include below changes
1)Changes to librte_ether.
2)A new library librte_pdump added for packet capture framework.
3)A new app/pdump tool added for packet capturing.
4)Test pmd changes done to initialize packet capture framework.
5)Documentation update.
1)librte_pdump
==============
To support packet capturing on dpdk Ethernet devices, a new library librte_pdump
is added. Users can develop their own packet capturing application using the new library APIs.
Operation:
----------
The librte_pdump library provides APIs to support packet capturing on DPDK Ethernet devices.
The library provides APIs to initialize the packet capture framework, enable/disable
the packet capture and uninitialize the packet capture framework.
The librte_pdump library works on a client/server model. The server is responsible for enabling or
disabling the packet capture and the clients are responsible for requesting the enabling or disabling of
the packet capture.
The packet capture framework, as part of its initialization, creates the pthread and the server socket in
the pthread. The application that calls the framework initialization will have the server socket created,
either under the path that the application has passed or under the default path, i.e. ''/var/run'' for
the root user or ''$HOME'' for a non-root user.
Applications that request enabling or disabling of the packet capture will have the client socket created either under
''/var/run/'' for root users or ''$HOME'' for non-root users to send the requests to the server.
The server socket will listen for client requests for enabling or disabling the packet capture.
Applications using below APIs need to pass port/device_id, queue, mempool and
ring parameters. Library uses user provided ring and mempool to mirror the rx/tx
packets of the port for users. Users need to dequeue the rings and write the packets
to vdev(pcap/tuntap) to view the packets using any standard tools.
Note:
Mempool and Ring should support multi-consumer/multi-producer (mc/mp) operation.
Mempool mbuf size should be big enough to handle the rx/tx packets of a port.
APIs:
-----
rte_pdump_init()
rte_pdump_enable()
rte_pdump_enable_by_deviceid()
rte_pdump_disable()
rte_pdump_disable_by_deviceid()
rte_pdump_uninit()
rte_pdump_set_socket_dir()
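A rough sketch of the calls described above (names and sizes are only
illustrative, error checks are omitted, and the prototypes are those of this
v8 patch set):

/* primary application (server side) */
rte_pdump_init(NULL); /* NULL selects the default socket path */

/* capturing application (client side); the ring and mempool must be
 * multi-producer/multi-consumer capable */
struct rte_ring *ring = rte_ring_create("pdump_ring", 16384,
                                        rte_socket_id(), 0);
struct rte_mempool *mp = rte_pktmbuf_pool_create("pdump_pool", 65535,
                                        256, 0, 2176, rte_socket_id());
rte_pdump_enable(0 /* port */, 0 /* queue */, RTE_PDUMP_FLAG_RXTX,
                 ring, mp, NULL /* filter: placeholder */);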
2)app/pdump tool
================
Tool app/pdump is designed based on librte_pdump for packet capturing in DPDK.
This tool by default runs as secondary process, and provides the support for
the command line options for packet capture.
./build/app/dpdk_pdump --
--pdump '(port=<port id> | device_id=<pci id or vdev name>),
(queue=<queue id>),
(rx-dev=<iface or pcap file> |
tx-dev=<iface or pcap file>),
[ring-size=<ring size>],
[mbuf-size=<mbuf data size>],
[total-num-mbufs=<number of mbufs>]'
Parameters inside the parentheses represent the mandatory parameters.
Parameters inside the square brackets represent optional parameters.
Users have to pass the packet capture parameters under --pdump; multiple
--pdump options can be passed to capture packets on different port and queue combinations.
Operation:
----------
*Tool parses the user command line arguments,
creates the mempool, ring and the PCAP PMD vdev with 'tx_stream' as either
of the devices passed in the rx-dev|tx-dev parameters.
*Then calls the APIs of librte_pdump i.e. rte_pdump_enable()/rte_pdump_enable_by_deviceid()
to enable packet capturing on a specific port/device_id and queue by passing on
port|device_id, queue, mempool and ring info.
*Tool runs in a while loop to dequeue the packets from the ring and write them to the pcap device.
*Tool can be stopped using SIGINT, upon which the tool calls
rte_pdump_disable()/rte_pdump_disable_by_deviceid() and frees the allocated resources.
Note:
CONFIG_RTE_LIBRTE_PMD_PCAP flag should be set to yes to compile and run the pdump tool.
3)Test-pmd changes
==================
Changes are done to test-pmd application to initialize/uninitialize the packet capture framework.
So app/pdump tool can be run to see packets of dpdk ports that are used by test-pmd.
Similarly any application which needs packet capture should call initialize/uninitialize APIs of
librte_pdump and use pdump tool to start the capture.
4)Packet capture flow between pdump tool and librte_pdump
=========================================================
* Pdump tool (Secondary process) requests packet capture
for specific port|device_id and queue combinations.
*Library in secondary process context creates client socket and communicates
the port|device_id, queue, ring and mempool to server.
*Library initializes server in primary process 'test-pmd' context and server serves
the client request to enable Ethernet rxtx call-backs for a given port|device_id and queue.
*Copy the rx/tx packets to passed mempool and enqueue the packets to ring for secondary process.
*Pdump tool will dequeue the packets from ring and writes them to PCAPMD vdev,
so ultimately packets will be seen on the device that is passed in rx-dev|tx-dev.
*Once the pdump tool is terminated with SIGINT it will disable the packet capturing.
*Library receives the disable packet capture request, communicates the info to the server,
and the server will remove the Ethernet rxtx call-backs.
*Packet capture can be seen using tcpdump command
"tcpdump -ni <iface>" (or) "tcpdump –nr <pcapfile>"
5)Example command line
======================
./build/app/dpdk_pdump -- --pdump 'device_id=0000:02:0.0,queue=*,tx-dev=/tmp/dt-file.pcap,rx-dev=/tmp/dr-file.pcap,ring-size=8192,mbuf-size=2176,total-num-mbufs=32768' --pdump 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/d-file.pcap,tx-dev=/tmp/d-file.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'
v8:
added server socket argument to rte_pdump_init() API ==> http://dpdk.org/dev/patchwork/patch/13402/
added rte_pdump_set_socket_dir() API.
updated documentation for new changes.
v7:
fixed lines over 90 characters.
v6:
removed below deprecation notice patch from patch set.
http://dpdk.org/dev/patchwork/patch/13372/
v5:
addressed code review comments for below patches
http://dpdk.org/dev/patchwork/patch/12955/
http://dpdk.org/dev/patchwork/patch/12951/
v4:
added missing deprecation notice for ABI changes of rte_eth_dev_info structure.
made doc changes as per doc guidelines.
replaced rte_eal_vdev_init with rte_eth_dev_attach in pdump tool.
removed rxtx-dev parameter from pdump tool command line.
v3:
app/pdump: Moved cleanup code from signal handler to main.
divided librte_ether changes into multiple patches.
example command changed in app/pdump application guide
v2:
fix compilation issues for 4.8.3
fix unnecessary #includes
Reshma Pattan (8):
librte_ether: protect add/remove of rxtx callbacks with spinlocks
librte_ether: add new api rte_eth_add_first_rx_callback
librte_ether: add new fields to rte_eth_dev_info struct
librte_ether: make rte_eth_dev_get_port_by_name
rte_eth_dev_get_name_by_port public
lib/librte_pdump: add new library for packet capturing support
app/pdump: add pdump tool for packet capturing
app/test-pmd: add pdump initialization uninitialization
doc: update doc for packet capture framework
MAINTAINERS | 8 +
app/Makefile | 1 +
app/pdump/Makefile | 45 ++
app/pdump/main.c | 844 +++++++++++++++++++++++++++++
app/test-pmd/testpmd.c | 6 +
config/common_base | 5 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 117 +++++
doc/guides/rel_notes/release_16_07.rst | 13 +
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 +++++
lib/Makefile | 1 +
lib/librte_ether/rte_ethdev.c | 123 +++--
lib/librte_ether/rte_ethdev.h | 60 +++
lib/librte_ether/rte_ether_version.map | 9 +
lib/librte_pdump/Makefile | 55 ++
lib/librte_pdump/rte_pdump.c | 904 ++++++++++++++++++++++++++++++++
lib/librte_pdump/rte_pdump.h | 208 ++++++++
lib/librte_pdump/rte_pdump_version.map | 13 +
mk/rte.app.mk | 1 +
20 files changed, 2493 insertions(+), 44 deletions(-)
create mode 100644 app/pdump/Makefile
create mode 100644 app/pdump/main.c
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
create mode 100644 lib/librte_pdump/Makefile
create mode 100644 lib/librte_pdump/rte_pdump.c
create mode 100644 lib/librte_pdump/rte_pdump.h
create mode 100644 lib/librte_pdump/rte_pdump_version.map
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
--
2.5.0
^ permalink raw reply [relevance 2%]
* [dpdk-dev] [PATCH v8 8/8] doc: update doc for packet capture framework
2016-06-10 16:18 2% ` [dpdk-dev] [PATCH v8 0/8] add packet capture framework Reshma Pattan
@ 2016-06-10 16:18 5% ` Reshma Pattan
2016-06-10 23:23 0% ` [dpdk-dev] [PATCH v8 0/8] add " Neil Horman
2016-06-14 9:38 2% ` [dpdk-dev] [PATCH v9 " Reshma Pattan
2 siblings, 0 replies; 200+ results
From: Reshma Pattan @ 2016-06-10 16:18 UTC (permalink / raw)
To: dev; +Cc: Reshma Pattan
Added programmers guide for librte_pdump.
Added sample application guide for app/pdump application.
Updated release note for packet capture framework changes.
Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
Acked-by: John McNamara <john.mcnamara@intel.com>
---
MAINTAINERS | 3 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 117 ++++++++++++++++++++++++++++++
doc/guides/rel_notes/release_16_07.rst | 13 ++++
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 ++++++++++++++++++++++++++++++++
6 files changed, 257 insertions(+)
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index a48c8de..ce7c941 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -436,6 +436,9 @@ Pdump
M: Reshma Pattan <reshma.pattan@intel.com>
F: lib/librte_pdump/
F: app/pdump/
+F: doc/guides/prog_guide/pdump_library.rst
+F: doc/guides/sample_app_ug/pdump.rst
+
Hierarchical scheduler
M: Cristian Dumitrescu <cristian.dumitrescu@intel.com>
diff --git a/doc/guides/prog_guide/index.rst b/doc/guides/prog_guide/index.rst
index b862d0c..4caf969 100644
--- a/doc/guides/prog_guide/index.rst
+++ b/doc/guides/prog_guide/index.rst
@@ -71,6 +71,7 @@ Programmer's Guide
writing_efficient_code
profile_app
glossary
+ pdump_library
**Figures**
diff --git a/doc/guides/prog_guide/pdump_library.rst b/doc/guides/prog_guide/pdump_library.rst
new file mode 100644
index 0000000..3088063
--- /dev/null
+++ b/doc/guides/prog_guide/pdump_library.rst
@@ -0,0 +1,117 @@
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+.. _pdump_library:
+
+The librte_pdump Library
+========================
+
+The ``librte_pdump`` library provides a framework for packet capturing in DPDK.
+The library provides the following APIs to initialize the packet capture framework, to enable
+or disable the packet capture, and to uninitialize it (a usage sketch follows the list):
+
+* ``rte_pdump_init()``:
+ This API initializes the packet capture framework.
+
+* ``rte_pdump_enable()``:
+ This API enables the packet capture on a given port and queue.
+ Note: The filter option in the API is a placeholder for future enhancements.
+
+* ``rte_pdump_enable_by_deviceid()``:
+ This API enables the packet capture on a given device id (``vdev name or pci address``) and queue.
+ Note: The filter option in the API is a placeholder for future enhancements.
+
+* ``rte_pdump_disable()``:
+ This API disables the packet capture on a given port and queue.
+
+* ``rte_pdump_disable_by_deviceid()``:
+ This API disables the packet capture on a given device id (``vdev name or pci address``) and queue.
+
+* ``rte_pdump_uninit()``:
+ This API uninitializes the packet capture framework.
+
+* ``rte_pdump_set_socket_dir()``:
+ This API sets the server socket path.
+ Note: This API is not thread-safe.
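+
+A minimal usage sketch is shown below. It is illustrative only: the port,
+queue, and the mempool/ring creation parameters are placeholders, and the
+exact function signatures should be taken from ``rte_pdump.h`` of this series.
+
+.. code-block:: c
+
+    #include <rte_pdump.h>
+    #include <rte_ring.h>
+    #include <rte_mbuf.h>
+
+    /* Initialize the framework; NULL selects the default socket path. */
+    rte_pdump_init(NULL);
+
+    /* Client side: the ring and mempool that will carry the mirrored
+     * packets (sizes are illustrative; both must be mc/mp capable). */
+    struct rte_ring *ring = rte_ring_create("pdump_ring", 16384,
+                                            rte_socket_id(), 0);
+    struct rte_mempool *mp = rte_pktmbuf_pool_create("pdump_pool", 65535,
+            0, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
+
+    /* Capture RX and TX packets of port 0, queue 0; the last argument
+     * is the filter placeholder. */
+    rte_pdump_enable(0, 0, RTE_PDUMP_FLAG_RXTX, ring, mp, NULL);
+
+    /* ... dequeue mbufs from "ring" and write them out ... */
+
+    rte_pdump_disable(0, 0, RTE_PDUMP_FLAG_RXTX);
+    rte_pdump_uninit();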
+
+
+Operation
+---------
+
+The ``librte_pdump`` library works on a client/server model. The server is responsible for enabling or
+disabling the packet capture and the clients are responsible for requesting the enabling or disabling of
+the packet capture.
+
+The packet capture framework, as part of its initialization, creates a pthread and the server socket in
+that pthread. The application that calls the framework initialization will have the server socket created,
+either under the path that the application has passed or under the default path, i.e. ``/var/run`` for
+root users or ``$HOME`` for non-root users.
+
+Applications that request enabling or disabling of the packet capture will have the client socket created either under
+``/var/run/`` for root users or ``$HOME`` for non-root users to send the requests to the server.
+The server socket will listen for client requests for enabling or disabling the packet capture.
+
+
+Implementation Details
+----------------------
+
+The library API ``rte_pdump_init()`` initializes the packet capture framework by creating the pthread and the server
+socket. The server socket in the pthread context listens for client requests to enable or disable the
+packet capture.
+
+The library APIs ``rte_pdump_enable()`` and ``rte_pdump_enable_by_deviceid()`` enable the packet capture.
+On each call to these APIs, the library creates a separate client socket, creates the "pdump enable" request and sends
+the request to the server. The server that is listening on the socket will take the request and enable the packet capture
+by registering the Ethernet RX and TX callbacks for the given port or device_id and queue combinations.
+Then the server will mirror the packets to the given mempool and enqueue them to the rte_ring that clients have passed
+to these APIs. The server also sends the response back to the client about the status of the request that was processed.
+After the response is received from the server, the client socket is closed.
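+
+The consuming side is left to the client: it must drain the ring that it
+passed to the enable APIs. A minimal consumer loop (illustrative; the
+``capture_running`` flag and error handling are placeholders, and a real
+tool would write the mbufs to a pcap vdev instead of freeing them) might
+look like:
+
+.. code-block:: c
+
+    struct rte_mbuf *pkts[32];
+    unsigned int i, n;
+
+    while (capture_running) {
+        /* Fetch mirrored packets enqueued by the server's callbacks. */
+        n = rte_ring_dequeue_burst(ring, (void **)pkts, 32);
+        for (i = 0; i < n; i++)
+            rte_pktmbuf_free(pkts[i]);
+    }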
+
+The library APIs ``rte_pdump_disable()`` and ``rte_pdump_disable_by_deviceid()`` disable the packet capture.
+On each call to these APIs, the library creates a separate client socket, creates the "pdump disable" request and sends
+the request to the server. The server that is listening on the socket will take the request and disable the packet
+capture by removing the Ethernet RX and TX callbacks for the given port or device_id and queue combinations. The server
+also sends the response back to the client about the status of the request that was processed. After the response is
+received from the server, the client socket is closed.
+
+The library API ``rte_pdump_uninit()`` uninitializes the packet capture framework by closing the pthread and the
+server socket.
+
+The library API ``rte_pdump_set_socket_dir()`` sets the given path as the server socket path.
+If the given path is ``NULL``, the default path will be selected, i.e. either ``/var/run/`` for root users or ``$HOME``
+for non-root users. Clients need to call this API only when their server socket path is not the default one.
+The given server socket path will be used by clients to send the pdump enable and disable requests to the server.
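+
+For example, a client whose server listens on a non-default path (the
+directory below is illustrative; the signature is as documented for this
+series) would call, before sending any request:
+
+.. code-block:: c
+
+    /* Must match the path the server passed to rte_pdump_init();
+     * note that this API is not thread-safe. */
+    rte_pdump_set_socket_dir("/tmp/pdump_sockets");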
+
+
+Use Case: Packet Capturing
+--------------------------
+
+The DPDK ``app/pdump`` tool is developed on top of this library to capture packets in DPDK.
+Users can use it as an example to develop their own packet capturing applications.
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index c0f6b02..a4de2a2 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -66,6 +66,11 @@ New Features
* Enable RSS per network interface through the configuration file.
* Streamline the CLI code.
+* **Added packet capture framework.**
+
+ * A new library ``librte_pdump`` is added to provide packet capture APIs.
+ * A new ``app/pdump`` tool is added to capture packets in DPDK.
+
Resolved Issues
---------------
@@ -135,6 +140,11 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* Function ``rte_eth_dev_get_port_by_name`` changed to a public API.
+
+* Function ``rte_eth_dev_info_get`` updated to return new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ in the ``rte_eth_dev_info`` object.
+
ABI Changes
-----------
@@ -146,6 +156,9 @@ ABI Changes
* The ``rte_port_source_params`` structure has new fields to support PCAP file.
It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
+* The ``rte_eth_dev_info`` structure has new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ to support number of queues configured by software.
+
Shared Library Versions
-----------------------
diff --git a/doc/guides/sample_app_ug/index.rst b/doc/guides/sample_app_ug/index.rst
index 930f68c..96bb317 100644
--- a/doc/guides/sample_app_ug/index.rst
+++ b/doc/guides/sample_app_ug/index.rst
@@ -76,6 +76,7 @@ Sample Applications User Guide
ptpclient
performance_thread
ipsec_secgw
+ pdump
**Figures**
diff --git a/doc/guides/sample_app_ug/pdump.rst b/doc/guides/sample_app_ug/pdump.rst
new file mode 100644
index 0000000..96c8709
--- /dev/null
+++ b/doc/guides/sample_app_ug/pdump.rst
@@ -0,0 +1,122 @@
+
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+dpdk_pdump Application
+======================
+
+The ``dpdk_pdump`` application is a Data Plane Development Kit (DPDK) application that runs as a DPDK secondary process and
+is capable of enabling packet capture on DPDK ports.
+
+
+Running the Application
+-----------------------
+
+The application has a ``--pdump`` command line option with various sub-arguments:
+
+.. code-block:: console
+
+ ./build/app/dpdk_pdump --
+ --pdump '(port=<port id> | device_id=<pci id or vdev name>),
+ (queue=<queue_id>),
+ (rx-dev=<iface or pcap file> |
+ tx-dev=<iface or pcap file>),
+ [ring-size=<ring size>],
+ [mbuf-size=<mbuf data size>],
+ [total-num-mbufs=<number of mbufs>]'
+
+Note:
+
+* Parameters inside the parentheses represent mandatory parameters.
+
+* Parameters inside the square brackets represent optional parameters.
+
+Multiple instances of ``--pdump`` can be passed to capture packets on different port and queue combinations.
+
+
+Parameters
+~~~~~~~~~~
+
+``port``:
+Port id of the eth device on which packets should be captured.
+
+``device_id``:
+PCI address or name of the eth device on which packets should be captured.
+
+ .. Note::
+
+ * As of now the ``dpdk_pdump`` tool cannot capture the packets of virtual devices
+ in the primary process due to a bug in the ethdev library. Due to this bug, in a multi-process context,
+ when the primary and secondary processes have different port sets, the secondary process
+ (here the ``dpdk_pdump`` tool) overwrites the ``rte_eth_devices[]`` entries of the primary process.
+
+``queue``:
+Queue id of the eth device on which packets should be captured. The user can pass a queue value of ``*`` to enable
+packet capture on all queues of the eth device.
+
+``rx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+``tx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+ .. Note::
+
+ * To receive ingress packets only, ``rx-dev`` should be passed.
+
+ * To receive egress packets only, ``tx-dev`` should be passed.
+
+ * To receive ingress and egress packets separately, ``rx-dev`` and ``tx-dev``
+ should both be passed with different file names or Linux iface names.
+
+ * To receive ingress and egress packets together, ``rx-dev`` and ``tx-dev``
+ should both be passed with the same file name or the same Linux iface name.
+
+``ring-size``:
+Size of the ring. This value is used internally for ring creation. The ring will be used to enqueue the packets from
+the primary application to the secondary. This is an optional parameter with default size 16384.
+
+``mbuf-size``:
+Size of the mbuf data. This is used internally for mempool creation. Ideally this value should be the same as
+the mbuf data size of the primary application's mempool that is used for packet RX. This is an optional parameter with a
+default size of 2176.
+
+``total-num-mbufs``:
+Total number of mbufs in the mempool. This is used internally for mempool creation. This is an optional parameter with a default
+value of 65535.
+
+
+Example
+-------
+
+.. code-block:: console
+
+ $ sudo ./build/app/dpdk_pdump -- --pdump 'port=0,queue=*,rx-dev=/tmp/rx.pcap'
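+
+A second invocation, with values taken from the cover letter of this series
+(illustrative only), captures the ingress and egress traffic of a PCI device
+into separate pcap files with explicit ring and mempool sizing:
+
+.. code-block:: console
+
+ $ sudo ./build/app/dpdk_pdump -- --pdump 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/rx.pcap,tx-dev=/tmp/tx.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'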
--
2.5.0
^ permalink raw reply [relevance 5%]
* Re: [dpdk-dev] [PATCH v8 0/8] add packet capture framework
2016-06-10 16:18 2% ` [dpdk-dev] [PATCH v8 0/8] add packet capture framework Reshma Pattan
2016-06-10 16:18 5% ` [dpdk-dev] [PATCH v8 8/8] doc: update doc for " Reshma Pattan
@ 2016-06-10 23:23 0% ` Neil Horman
2016-06-13 8:47 0% ` Pattan, Reshma
2016-06-14 9:38 2% ` [dpdk-dev] [PATCH v9 " Reshma Pattan
2 siblings, 1 reply; 200+ results
From: Neil Horman @ 2016-06-10 23:23 UTC (permalink / raw)
To: Reshma Pattan; +Cc: dev
On Fri, Jun 10, 2016 at 05:18:46PM +0100, Reshma Pattan wrote:
> This patch set include below changes
>
> 1)Changes to librte_ether.
> 2)A new library librte_pdump added for packet capture framework.
> 3)A new app/pdump tool added for packet capturing.
> 4)Test pmd changes done to initialize packet capture framework.
> 5)Documentation update.
>
> 1)librte_pdump
> ==============
> To support packet capturing on DPDK Ethernet devices, a new library librte_pdump
> is added. Users can develop their own packet capturing applications using the new library APIs.
>
> Operation:
> ----------
> The librte_pdump library provides APIs to support packet capturing on DPDK Ethernet devices.
> The library provides APIs to initialize the packet capture framework, enable/disable
> the packet capture and uninitialize the packet capture framework.
>
> The librte_pdump library works on a client/server model. The server is responsible for enabling or
> disabling the packet capture and the clients are responsible for requesting the enabling or disabling of
> the packet capture.
>
> The packet capture framework, as part of its initialization, creates the pthread and the server socket in
> the pthread. The application that calls the framework initialization will have the server socket created,
> either under the path that the application has passed or under the default path, i.e. either ''/var/run'' for
> root users or ''$HOME'' for non-root users.
>
> Applications that request enabling or disabling of the packet capture will have the client socket created either under
> ''/var/run/'' for root users or ''$HOME'' for non-root users to send the requests to the server.
> The server socket will listen for client requests for enabling or disabling the packet capture.
>
> Applications using the below APIs need to pass port/device_id, queue, mempool and
> ring parameters. The library uses the user-provided ring and mempool to mirror the rx/tx
> packets of the port for users. Users need to dequeue the rings and write the packets
> to a vdev (pcap/tuntap) to view the packets using any standard tools.
>
> Note:
> Mempool and Ring should support mc/mp (multi-consumer/multi-producer) operation.
> Mempool mbuf size should be big enough to handle the rx/tx packets of a port.
>
> APIs:
> -----
> rte_pdump_init()
> rte_pdump_enable()
> rte_pdump_enable_by_deviceid()
> rte_pdump_disable()
> rte_pdump_disable_by_deviceid()
> rte_pdump_uninit()
> rte_pdump_set_socket_dir()
>
> 2)app/pdump tool
> ================
> The app/pdump tool is designed on top of librte_pdump for packet capturing in DPDK.
> This tool by default runs as a secondary process, and provides support for
> the command line options for packet capture.
>
> ./build/app/dpdk_pdump --
> --pdump '(port=<port id> | device_id=<pci id or vdev name>),
> (queue=<queue id>),
> (rx-dev=<iface or pcap file> |
> tx-dev=<iface or pcap file>),
> [ring-size=<ring size>],
> [mbuf-size=<mbuf data size>],
> [total-num-mbufs=<number of mbufs>]'
>
> Parameters inside the parentheses represent the mandatory parameters.
> Parameters inside the square brackets represent optional parameters.
> Users have to pass the packet capture parameters under --pdump; multiple instances of
> --pdump can be passed to capture packets on different port and queue combinations.
>
> Operation:
> ----------
> *The tool parses the user command line arguments and
> creates the mempool, ring and the PCAP PMD vdev with 'tx_stream' set to either
> of the devices passed in the rx-dev|tx-dev parameters.
>
> *Then calls the APIs of librte_pdump i.e. rte_pdump_enable()/rte_pdump_enable_by_deviceid()
> to enable packet capturing on a specific port/device_id and queue by passing on
> port|device_id, queue, mempool and ring info.
>
> *The tool runs in a while loop to dequeue the packets from the ring and write them to the pcap device.
>
> *The tool can be stopped using SIGINT, upon which it calls
> rte_pdump_disable()/rte_pdump_disable_by_deviceid() and frees the allocated resources.
>
> Note:
> CONFIG_RTE_LIBRTE_PMD_PCAP flag should be set to yes to compile and run the pdump tool.
>
> 3)Test-pmd changes
> ==================
> Changes are done to the test-pmd application to initialize/uninitialize the packet capture framework,
> so the app/pdump tool can be run to see packets of DPDK ports that are used by test-pmd.
>
> Similarly, any application which needs packet capture should call the initialize/uninitialize APIs of
> librte_pdump and use the pdump tool to start the capture.
>
> 4)Packet capture flow between pdump tool and librte_pdump
> =========================================================
> * Pdump tool (Secondary process) requests packet capture
> for specific port|device_id and queue combinations.
>
> *The library, in the secondary process context, creates the client socket and communicates
> the port|device_id, queue, ring and mempool to the server.
>
> *The library initializes the server in the primary process 'test-pmd' context, and the server serves
> the client request to enable Ethernet rxtx call-backs for a given port|device_id and queue.
>
> *The rx/tx packets are copied to the passed mempool and enqueued to the ring for the secondary process.
>
> *The pdump tool dequeues the packets from the ring and writes them to the PCAP PMD vdev,
> so ultimately packets will be seen on the device that is passed in rx-dev|tx-dev.
>
> *Once the pdump tool is terminated with SIGINT it will disable the packet capturing.
>
> *The library receives the disable packet capture request and communicates the info to the server;
> the server will remove the Ethernet rxtx call-backs.
>
> *Captured packets can be viewed using the tcpdump command:
> "tcpdump -ni <iface>" or "tcpdump -nr <pcapfile>"
>
> 5)Example command line
> ======================
> ./build/app/dpdk_pdump -- --pdump 'device_id=0000:02:0.0,queue=*,tx-dev=/tmp/dt-file.pcap,rx-dev=/tmp/dr-file.pcap,ring-size=8192,mbuf-size=2176,total-num-mbufs=32768' --pdump 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/d-file.pcap,tx-dev=/tmp/d-file.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'
>
> v8:
> added server socket argument to rte_pdump_init() API ==> http://dpdk.org/dev/patchwork/patch/13402/
> added rte_pdump_set_socket_dir() API.
> updated documentation for new changes.
>
> v7:
> fixed lines over 90 characters.
>
> v6:
> removed below deprecation notice patch from patch set.
> http://dpdk.org/dev/patchwork/patch/13372/
>
> v5:
> addressed code review comments for below patches
> http://dpdk.org/dev/patchwork/patch/12955/
> http://dpdk.org/dev/patchwork/patch/12951/
>
> v4:
> added missing deprecation notice for ABI changes of rte_eth_dev_info structure.
> made doc changes as per doc guidelines.
> replaced rte_eal_vdev_init with rte_eth_dev_attach in pdump tool.
> removed rxtx-dev parameter from pdump tool command line.
>
> v3:
> app/pdump: Moved cleanup code from signal handler to main.
> divided librte_ether changes into multiple patches.
> example command changed in app/pdump application guide
>
> v2:
> fix compilation issues for 4.8.3
> fix unnecessary #includes
>
>
> Reshma Pattan (8):
> librte_ether: protect add/remove of rxtx callbacks with spinlocks
> librte_ether: add new api rte_eth_add_first_rx_callback
> librte_ether: add new fields to rte_eth_dev_info struct
> librte_ether: make rte_eth_dev_get_port_by_name
> rte_eth_dev_get_name_by_port public
> lib/librte_pdump: add new library for packet capturing support
> app/pdump: add pdump tool for packet capturing
> app/test-pmd: add pdump initialization uninitialization
> doc: update doc for packet capture framework
>
> MAINTAINERS | 8 +
> app/Makefile | 1 +
> app/pdump/Makefile | 45 ++
> app/pdump/main.c | 844 +++++++++++++++++++++++++++++
> app/test-pmd/testpmd.c | 6 +
> config/common_base | 5 +
> doc/guides/prog_guide/index.rst | 1 +
> doc/guides/prog_guide/pdump_library.rst | 117 +++++
> doc/guides/rel_notes/release_16_07.rst | 13 +
> doc/guides/sample_app_ug/index.rst | 1 +
> doc/guides/sample_app_ug/pdump.rst | 122 +++++
> lib/Makefile | 1 +
> lib/librte_ether/rte_ethdev.c | 123 +++--
> lib/librte_ether/rte_ethdev.h | 60 +++
> lib/librte_ether/rte_ether_version.map | 9 +
> lib/librte_pdump/Makefile | 55 ++
> lib/librte_pdump/rte_pdump.c | 904 ++++++++++++++++++++++++++++++++
> lib/librte_pdump/rte_pdump.h | 208 ++++++++
> lib/librte_pdump/rte_pdump_version.map | 13 +
> mk/rte.app.mk | 1 +
> 20 files changed, 2493 insertions(+), 44 deletions(-)
> create mode 100644 app/pdump/Makefile
> create mode 100644 app/pdump/main.c
> create mode 100644 doc/guides/prog_guide/pdump_library.rst
> create mode 100644 doc/guides/sample_app_ug/pdump.rst
> create mode 100644 lib/librte_pdump/Makefile
> create mode 100644 lib/librte_pdump/rte_pdump.c
> create mode 100644 lib/librte_pdump/rte_pdump.h
> create mode 100644 lib/librte_pdump/rte_pdump_version.map
>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> --
> 2.5.0
>
>
This seems useful, but the pcap PMD already accepts pcap-formatted files for
input to send using the pcap library. Shouldn't this functionality be
integrated with that PMD instead of breaking it out into its own library?
Neil
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v4 2/8] lib/librte_ether: defind RX/TX lock mode
2016-06-08 9:19 0% ` Ananyev, Konstantin
@ 2016-06-12 2:00 0% ` Lu, Wenzhuo
2016-06-12 23:16 0% ` Ananyev, Konstantin
0 siblings, 1 reply; 200+ results
From: Lu, Wenzhuo @ 2016-06-12 2:00 UTC (permalink / raw)
To: Ananyev, Konstantin, Tao, Zhe, dev
Cc: Richardson, Bruce, Chen, Jing D, Liang, Cunming, Wu, Jingjing,
Zhang, Helin
Hi Konstantin,
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Wednesday, June 8, 2016 5:20 PM
> To: Lu, Wenzhuo; Tao, Zhe; dev@dpdk.org
> Cc: Richardson, Bruce; Chen, Jing D; Liang, Cunming; Wu, Jingjing; Zhang, Helin
> Subject: RE: [PATCH v4 2/8] lib/librte_ether: defind RX/TX lock mode
>
>
>
> >
> > Hi Konstantin,
> >
> >
> > > -----Original Message-----
> > > From: Ananyev, Konstantin
> > > Sent: Tuesday, June 7, 2016 5:59 PM
> > > To: Tao, Zhe; dev@dpdk.org
> > > Cc: Lu, Wenzhuo; Richardson, Bruce; Chen, Jing D; Liang, Cunming;
> > > Wu, Jingjing; Zhang, Helin
> > > Subject: RE: [PATCH v4 2/8] lib/librte_ether: defind RX/TX lock mode
> > >
> > >
> > > Hi Zhe & Wenzhuo,
> > >
> > > Please find my comments below.
> > > BTW, for clarification - is that patch for 16.11?
> > > I believe it's too late to introduce such significant change in 16.07.
> > > Thanks
> > > Konstantin
> > Thanks for the comments.
> > Honestly, our purpose is 16.07. Realizing the big impact, we use
> > NEXT_ABI to comment our change. So, I think although we want to merge it in
> 16.07 this change will become effective after we remove NEXT_ABI in 16.11.
>
> I don't think it is achievable.
> First I think your code is not in proper shape yet, right now.
> Second, as you said, it is a significant change and I would like to hear opinions
> from the rest of the community.
Agreed, there is some risk. I mean our target is 16.07, but whether it can be achieved surely depends on the feedback from the community.
>
> >
> > >
> > > > Define lock mode for RX/TX queue. Because when resetting the
> > > > device we want the resetting thread to get the lock of the RX/TX
> > > > queue to make sure the RX/TX is stopped.
> > > >
> > > > Using next ABI macro for this ABI change as it has too much
> > > > impact. 7 APIs and 1 global variable are impacted.
> > > >
> > > > Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > > > Signed-off-by: Zhe Tao <zhe.tao@intel.com>
> > > > ---
> > > > lib/librte_ether/rte_ethdev.h | 62
> > > > +++++++++++++++++++++++++++++++++++++++++++
> > > > 1 file changed, 62 insertions(+)
> > > >
> > > > diff --git a/lib/librte_ether/rte_ethdev.h
> > > > b/lib/librte_ether/rte_ethdev.h index 74e895f..4efb5e9 100644
> > > > --- a/lib/librte_ether/rte_ethdev.h
> > > > +++ b/lib/librte_ether/rte_ethdev.h
> > > > @@ -354,7 +354,12 @@ struct rte_eth_rxmode {
> > > > jumbo_frame : 1, /**< Jumbo Frame Receipt enable. */
> > > > hw_strip_crc : 1, /**< Enable CRC stripping by hardware. */
> > > > enable_scatter : 1, /**< Enable scatter packets rx handler */
> > > > +#ifndef RTE_NEXT_ABI
> > > > enable_lro : 1; /**< Enable LRO */
> > > > +#else
> > > > + enable_lro : 1, /**< Enable LRO */
> > > > + lock_mode : 1; /**< Using lock path */
> > > > +#endif
> > > > };
> > > >
> > > > /**
> > > > @@ -634,11 +639,68 @@ struct rte_eth_txmode {
> > > > /**< If set, reject sending out tagged pkts */
> > > > hw_vlan_reject_untagged : 1,
> > > > /**< If set, reject sending out untagged pkts */
> > > > +#ifndef RTE_NEXT_ABI
> > > > hw_vlan_insert_pvid : 1;
> > > > /**< If set, enable port based VLAN insertion */
> > > > +#else
> > > > + hw_vlan_insert_pvid : 1,
> > > > + /**< If set, enable port based VLAN insertion */
> > > > + lock_mode : 1;
> > > > + /**< If set, using lock path */ #endif
> > > > };
> > > >
> > > > /**
> > > > + * The macros for the RX/TX lock mode functions */ #ifdef
> > > > +RTE_NEXT_ABI #define RX_LOCK_FUNCTION(dev, func) \
> > > > + (dev->data->dev_conf.rxmode.lock_mode ? \
> > > > + func ## _lock : func)
> > > > +
> > > > +#define TX_LOCK_FUNCTION(dev, func) \
> > > > + (dev->data->dev_conf.txmode.lock_mode ? \
> > > > + func ## _lock : func)
> > > > +#else
> > > > +#define RX_LOCK_FUNCTION(dev, func) func
> > > > +
> > > > +#define TX_LOCK_FUNCTION(dev, func) func #endif
> > > > +
> > > > +/* Add the lock RX/TX function for VF reset */ #define
> > > > +GENERATE_RX_LOCK(func, nic) \ uint16_t func ## _lock(void
> > > > +*rx_queue, \
> > > > + struct rte_mbuf **rx_pkts, \
> > > > + uint16_t nb_pkts) \
> > > > +{ \
> > > > + struct nic ## _rx_queue *rxq = rx_queue; \
> > > > + uint16_t nb_rx = 0; \
> > > > + \
> > > > + if (rte_spinlock_trylock(&rxq->rx_lock)) { \
> > > > + nb_rx = func(rx_queue, rx_pkts, nb_pkts); \
> > > > + rte_spinlock_unlock(&rxq->rx_lock); \
> > > > + } \
> > > > + \
> > > > + return nb_rx; \
> > > > +}
> > > > +
> > > > +#define GENERATE_TX_LOCK(func, nic) \ uint16_t func ## _lock(void
> > > > +*tx_queue, \
> > > > + struct rte_mbuf **tx_pkts, \
> > > > + uint16_t nb_pkts) \
> > > > +{ \
> > > > + struct nic ## _tx_queue *txq = tx_queue; \
> > > > + uint16_t nb_tx = 0; \
> > > > + \
> > > > + if (rte_spinlock_trylock(&txq->tx_lock)) { \
> > > > + nb_tx = func(tx_queue, tx_pkts, nb_pkts); \
> > > > + rte_spinlock_unlock(&txq->tx_lock); \
> > > > + } \
> > > > + \
> > > > + return nb_tx; \
> > > > +}
> > >
> > > 1. As I said in the off-line discussion, I think this locking could (and
> > > I think better be) implemented completely on the rte_ethdev layer.
> > > So actual PMD code will be unaffected.
> > > Again, that avoids having to introduce a _lock version of every RX/Tx
> > > function in each PMD.
> > One purpose of implementing the lock in the PMD layer is to avoid ABI
> > change. But we introduce the field lock_mode in struct
> > rte_eth_rx/txmode, so it seems it's not a good reason now :) The other
> > purpose is we want to add a lock for every queue. But in the rte layer the
> > queue is void *, so we add the lock in the specific structures of the NICs. But as
> you mentioned below, we can add the lock as dev->data->rx_queue_state in the
> struct rte_eth_dev_data.
> > So, I prefer to add the lock in the rte layer now.
>
> OK.
>
> >
> > >
> > > 2. Again, as discussed offline, I think it is better to have an
> > > explicit
> > > rte_eth_(rx|tx)_burst_lock(sync?) API, instead of adding new fields
> > > into RX/TX config structures.
> > > Would help to avoid any confusion, I think.
> > We want the users to choose the rx/tx path without lock if they're
> > sensitive to the performance and can handle the reset event in their APP. After
> introducing new fields of config struct, users can change the config to choose
> the different path.
>
> I understand what you are doing.
>
> > If we introduce a new API, it may be harder for the user to use it. I
> > mean when users want to use lock mode, they may need to replace all the
> rte_eth_rx/tx_burst by rte_eth_rx/tx_burst_lock.
>
> Yes, my opinion if users would like to use locking API they need to call it
> explicitly.
>
>
> >So if we add the lock in rte layer, I still prefer adding lock_mode in
> >the configuration, and the rte_eth_rx/tx_burst is changed like this,
> >rte_eth_rx/tx_burst {
> > + if lock_mode
> > + try_lock
> > ......
> > + if lock_mode
> > + release_lock
> > }
>
> My preference is to keep existing rx/tx_burst() functions unaffected by that
> patch.
> At least for now.
> I suppose that will minimise the risks and help users to avoid confusion what API
> (locking/non-locking) is in use.
OK. Let me add new APIs.
>
> >
> >
> > >
> > > 3. I thought the plan was to introduce a locking in all appropriate
> > > control path functions (dev_start/dev_stop etc.) Without that
> > > locking version of RX/TX seems a bit useless.
> > > Yes, I understand that you do use locking inside dev_reset, but I
> > > suppose the plan was to have a generic solution, no?
> > > Again, interrupt fire when user invokes dev_start/stop or so, so we
> > > still need some synchronisation between them.
> > >
> > > To be more specific, I thought about something like that:
> > >
> > > static inline uint16_t
> > > rte_eth_rx_burst_lock(uint8_t port_id, uint16_t queue_id,
> > > struct rte_mbuf **rx_pkts, const uint16_t nb_pkts) {
> > > struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> > >
> > > #ifdef RTE_LIBRTE_ETHDEV_DEBUG
> > > RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
> > > RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0);
> > >
> > > if (queue_id >= dev->data->nb_rx_queues) {
> > > RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id);
> > > return 0;
> > > }
> > > #endif
> > >
> > > + if (rte_spinlock_trylock(&dev->data->rx_queue_state[rx_queue_id].lock)
> == 0)
> > > + return 0;
> > > + else if (dev->data->rx_queue_state[rx_queue_id] ==
> > > RTE_ETH_QUEUE_STATE_STOPPED)) {
> > > + rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].unlock);
> > > + return 0;
> > > +
> > >
> > > nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
> > > rx_pkts, nb_pkts);
> > >
> > > + rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].unlock
> > > + );
> > >
> > > ....
> > >
> > > return nb_rx;
> > > }
> > >
> > > And inside queue_start:
> > >
> > > int
> > > rte_eth_dev_rx_queue_start(uint8_t port_id, uint16_t rx_queue_id) {
> > > struct rte_eth_dev *dev;
> > >
> > > RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> > >
> > > dev = &rte_eth_devices[port_id];
> > > if (rx_queue_id >= dev->data->nb_rx_queues) {
> > > RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n",
> rx_queue_id);
> > > return -EINVAL;
> > > }
> > >
> > > RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_start,
> > > -ENOTSUP);
> > >
> > > rte_spinlock_lock(&dev->data->rx_queue_state[rx_queue_id].lock)
> > I think you add the lock here to stop the rx/tx.
> > But in my opinion, we should lock the rx/tx much earlier, before
> > starting the queue. For example, when stopping the port, the resources of the
> queues may be released.
>
> I didn't get you here...
> Before releasing the queue resources, queue_stop() has to be executed, right?
Sorry, I saw your example with rte_eth_dev_rx_queue_start; I didn't know you also want to change rte_eth_dev_rx_queue_stop too.
Agreed, this should work if we call queue_start/stop when resetting the port. But we will not call them; I find that queue_stop/start are per-queue functions and are not supported by all NICs.
Our solution now is to stop the whole port and restart the whole port. We will not stop/restart queue by queue.
>
> >The rx/tx cannot be executed. So I prefer to get the lock before stopping the
> ports.
>
> Might be I wasn't clear enough here.
> What I think we need to have:
> -To stop/start/rx/tx the queue (or do any other action that might change the
> queue internal structure)
> you have to grab the lock.
> After the queue is stopped its state has to be changed to
> QUEUE_STATE_STOPPED (with the queue lock grabbed),
> so rx/tx_locked wouldn't proceed with that queue.
> - dev_stop() - has to stop all its queues first, i.e. it needs to call queue_stop()
> for all of them.
> So after dev_stop() had finished - all device queues have to be in
> QUEUE_STATE_STOPPED
> Same about dev_start() - after it does all other things - it will call queue_start()
> for all its queues.
> that will bring them into QUEUE_STARTED.
> After that rx/tx_locked can use them again.
>
> >Maybe better to keep the spinlock in the dev_reset.
>
> Might be not :)
>
> >
> > >
> > > if (dev->data->rx_queue_state[rx_queue_id] !=
> > > RTE_ETH_QUEUE_STATE_STOPPED) {
> > > RTE_PMD_DEBUG_TRACE("Queue %" PRIu16" of device with
> > > port_id=%" PRIu8
> > > " already started\n",
> > > rx_queue_id, port_id);
> > > ret = -EINVAL 0;
> > > } else
> > > ret = dev->dev_ops->rx_queue_start(dev, rx_queue_id);
> > >
> > >
> > > rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].unlock);
> > >
> > > return ret;
> > > }
> > >
> > > Then again, we don't need to do explicit locking inside dev_reset().
> > > Does it make sense to you guys?
> > Please see the answer above.
> >
> > >
> > >
> > > > +
> > > > +/**
> > > > * A structure used to configure an RX ring of an Ethernet port.
> > > > */
> > > > struct rte_eth_rxconf {
> > > > --
> > > > 2.1.4
^ permalink raw reply [relevance 0%]
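A condensed, self-contained sketch of the ethdev-layer locked receive being
discussed in the thread above (illustrative only; the guard structure and
field names such as rxq_guard are placeholders, not part of any posted patch):

    #include <rte_spinlock.h>
    #include <rte_mbuf.h>

    /* Hypothetical per-queue guard; the thread debates keeping this in
     * dev->data rather than in each PMD's private queue structure. */
    struct rxq_guard {
        rte_spinlock_t lock;
        int stopped;   /* set by queue_stop(), cleared by queue_start() */
        void *rxq;     /* PMD private RX queue */
        uint16_t (*rx_burst)(void *rxq, struct rte_mbuf **pkts, uint16_t n);
    };

    static uint16_t
    rx_burst_locked(struct rxq_guard *g, struct rte_mbuf **pkts, uint16_t n)
    {
        uint16_t nb_rx = 0;

        /* trylock: if a control-path thread (e.g. a reset) holds the
         * lock, RX becomes a no-op instead of blocking the lcore. */
        if (rte_spinlock_trylock(&g->lock) == 0)
            return 0;
        if (!g->stopped)
            nb_rx = g->rx_burst(g->rxq, pkts, n);
        rte_spinlock_unlock(&g->lock);
        return nb_rx;
    }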
* Re: [dpdk-dev] [PATCH 2/8] lib/librte_ether: defind RX/TX lock mode
2016-06-09 7:50 0% ` Olivier Matz
@ 2016-06-12 5:25 0% ` Lu, Wenzhuo
0 siblings, 0 replies; 200+ results
From: Lu, Wenzhuo @ 2016-06-12 5:25 UTC (permalink / raw)
To: Olivier Matz, Stephen Hemminger; +Cc: dev, Tao, Zhe
Hi Olivier,
> -----Original Message-----
> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Thursday, June 9, 2016 3:51 PM
> To: Lu, Wenzhuo; Stephen Hemminger
> Cc: dev@dpdk.org; Tao, Zhe
> Subject: Re: [dpdk-dev] [PATCH 2/8] lib/librte_ether: defind RX/TX lock mode
>
> Hi,
>
> On 06/08/2016 09:34 AM, Lu, Wenzhuo wrote:
> > Hi Stephen,
> >
> >
> >> -----Original Message-----
> >> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> >> Sent: Wednesday, June 8, 2016 10:16 AM
> >> To: Lu, Wenzhuo
> >> Cc: dev@dpdk.org; Tao, Zhe
> >> Subject: Re: [dpdk-dev] [PATCH 2/8] lib/librte_ether: defind RX/TX
> >> lock mode
> >>
> >> On Mon, 6 Jun 2016 13:40:47 +0800
> >> Wenzhuo Lu <wenzhuo.lu@intel.com> wrote:
> >>
> >>> Define lock mode for RX/TX queue. Because when resetting the device
> >>> we want the resetting thread to get the lock of the RX/TX queue to
> >>> make sure the RX/TX is stopped.
> >>>
> >>> Using next ABI macro for this ABI change as it has too much impact.
> >>> 7 APIs and 1 global variable are impacted.
> >>>
> >>> Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> >>> Signed-off-by: Zhe Tao <zhe.tao@intel.com>
> >>
> >> Why does this patch set make a different assumption the rest of the DPDK?
> >>
> >> The rest of the DPDK operates on the principle that the application
> >> is smart enough to stop the device before making changes. There is no
> >> equivalent to the Linux kernel RTNL mutex. The API assumes
> >> application threads are well behaved and will not try and sabotage each
> other.
> >>
> >> If you restrict the reset operation to only being available when
> >> RX/TX is stopped, then no lock is needed.
> >>
> >> The fact that it requires lots more locking inside each device driver
> >> implies to me this is not correct way to architect this.
>
> +1
>
> I'm not sure adding locks is the proper way to do it.
> This is the application responsibility to ensure that:
> - control functions are not called concurrently on the same port
> - rx/tx functions are not called when the device is stopped/reset/...
>
> However, I do think the usage paradigms of the ethdev api should be better
> documented in rte_ethdev.h (ex: which functions can be called concurrently).
> This would be a first step.
>
> If we really want a helper API to do that in DPDK, the _next_ step could be to
> add them in the ethdev api to achieve this. Maybe something like (the function
> names could be better):
>
> - to be called on one control thread:
>
> rte_eth_stop_rxtx(port)
> rte_eth_start_rxtx(port)
>
> rte_eth_get_rxtx_state(port)
> -> return "running" if at least one core is inside the rx/tx code
> -> return "stopped" if all cores are outside the rx/tx code
>
> - to be called on dataplane cores:
>
> /* same than rte_eth_rx_burst(), but checks if rx/tx is allowed
> * first, else do nothing */
> rte_eth_rx_burst_interruptible()
> rte_eth_tx_burst_interruptible()
>
>
> The code of control thread could be:
>
> rte_eth_stop_rxtx(port);
> /* wait that all dataplane cores finished their processing */
> while (rte_eth_get_rxtx_state(port) != stopped)
> ;
> rte_eth_some_control_operation(port);
> rte_eth_start_rxtx(port);
>
>
> I think this could be done without any lock, just with the proper memory barriers
> and a per-core status.
>
> But this API may impose a paradigm to the application, and I'm not sure the
> DPDK should do that.
I don't quite catch your point. It seems your solution still needs the APP to change its code. I think it's more complex than just letting the APP stop the rx/tx and reset the port. The purpose of this patch set is to let the APP do as little as possible; it's not a good choice if we make it more complex.
And it seems hard to stop and start the rx/tx in the rte layer; normally the APP should do that. In my opinion, we have to introduce a lock in rte to achieve that.
>
> Regards,
> Olivier
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v4 2/8] lib/librte_ether: defind RX/TX lock mode
2016-06-12 2:00 0% ` Lu, Wenzhuo
@ 2016-06-12 23:16 0% ` Ananyev, Konstantin
2016-06-13 1:06 0% ` Lu, Wenzhuo
0 siblings, 1 reply; 200+ results
From: Ananyev, Konstantin @ 2016-06-12 23:16 UTC (permalink / raw)
To: Lu, Wenzhuo, Tao, Zhe, dev
Cc: Richardson, Bruce, Chen, Jing D, Liang, Cunming, Wu, Jingjing,
Zhang, Helin
Hi Wenzhuo,
>
> Hi Konstantin,
>
>
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Wednesday, June 8, 2016 5:20 PM
> > To: Lu, Wenzhuo; Tao, Zhe; dev@dpdk.org
> > Cc: Richardson, Bruce; Chen, Jing D; Liang, Cunming; Wu, Jingjing; Zhang, Helin
> > Subject: RE: [PATCH v4 2/8] lib/librte_ether: defind RX/TX lock mode
> >
> >
> >
> > >
> > > Hi Konstantin,
> > >
> > >
> > > > -----Original Message-----
> > > > From: Ananyev, Konstantin
> > > > Sent: Tuesday, June 7, 2016 5:59 PM
> > > > To: Tao, Zhe; dev@dpdk.org
> > > > Cc: Lu, Wenzhuo; Richardson, Bruce; Chen, Jing D; Liang, Cunming;
> > > > Wu, Jingjing; Zhang, Helin
> > > > Subject: RE: [PATCH v4 2/8] lib/librte_ether: defind RX/TX lock mode
> > > >
> > > >
> > > > Hi Zhe & Wenzhuo,
> > > >
> > > > Please find my comments below.
> > > > BTW, for clarification - is that patch for 16.11?
> > > > I believe it's too late to introduce such significant change in 16.07.
> > > > Thanks
> > > > Konstantin
> > > Thanks for the comments.
> > > Honestly, our purpose is 16.07. Realizing the big impact, we use
> > > NEXT_ABI to comment our change. So, I think although we want to merge it in
> > 16.07 this change will become effective after we remove NEXT_ABI in 16.11.
> >
> > I don't think it is achievable.
> > First I think your code is not in proper shape yet, right now.
> > Second, as you said, it is a significant change and I would like to hear opinions
> > from the rest of the community.
> Agreed, there is some risk. I mean our target is 16.07, but whether it can be achieved surely depends on the feedback from the community.
>
> >
> > >
> > > >
> > > > > Define lock mode for RX/TX queue. Because when resetting the
> > > > > device we want the resetting thread to get the lock of the RX/TX
> > > > > queue to make sure the RX/TX is stopped.
> > > > >
> > > > > Using next ABI macro for this ABI change as it has too much
> > > > > impact. 7 APIs and 1 global variable are impacted.
> > > > >
> > > > > Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > > > > Signed-off-by: Zhe Tao <zhe.tao@intel.com>
> > > > > ---
> > > > > lib/librte_ether/rte_ethdev.h | 62
> > > > > +++++++++++++++++++++++++++++++++++++++++++
> > > > > 1 file changed, 62 insertions(+)
> > > > >
> > > > > diff --git a/lib/librte_ether/rte_ethdev.h
> > > > > b/lib/librte_ether/rte_ethdev.h index 74e895f..4efb5e9 100644
> > > > > --- a/lib/librte_ether/rte_ethdev.h
> > > > > +++ b/lib/librte_ether/rte_ethdev.h
> > > > > @@ -354,7 +354,12 @@ struct rte_eth_rxmode {
> > > > > jumbo_frame : 1, /**< Jumbo Frame Receipt enable. */
> > > > > hw_strip_crc : 1, /**< Enable CRC stripping by hardware. */
> > > > > enable_scatter : 1, /**< Enable scatter packets rx handler */
> > > > > +#ifndef RTE_NEXT_ABI
> > > > > enable_lro : 1; /**< Enable LRO */
> > > > > +#else
> > > > > + enable_lro : 1, /**< Enable LRO */
> > > > > + lock_mode : 1; /**< Using lock path */
> > > > > +#endif
> > > > > };
> > > > >
> > > > > /**
> > > > > @@ -634,11 +639,68 @@ struct rte_eth_txmode {
> > > > > /**< If set, reject sending out tagged pkts */
> > > > > hw_vlan_reject_untagged : 1,
> > > > > /**< If set, reject sending out untagged pkts */
> > > > > +#ifndef RTE_NEXT_ABI
> > > > > hw_vlan_insert_pvid : 1;
> > > > > /**< If set, enable port based VLAN insertion */
> > > > > +#else
> > > > > + hw_vlan_insert_pvid : 1,
> > > > > + /**< If set, enable port based VLAN insertion */
> > > > > + lock_mode : 1;
> > > > > + /**< If set, using lock path */ #endif
> > > > > };
> > > > >
> > > > > /**
> > > > > + * The macros for the RX/TX lock mode functions */ #ifdef
> > > > > +RTE_NEXT_ABI #define RX_LOCK_FUNCTION(dev, func) \
> > > > > + (dev->data->dev_conf.rxmode.lock_mode ? \
> > > > > + func ## _lock : func)
> > > > > +
> > > > > +#define TX_LOCK_FUNCTION(dev, func) \
> > > > > + (dev->data->dev_conf.txmode.lock_mode ? \
> > > > > + func ## _lock : func)
> > > > > +#else
> > > > > +#define RX_LOCK_FUNCTION(dev, func) func
> > > > > +
> > > > > +#define TX_LOCK_FUNCTION(dev, func) func #endif
> > > > > +
> > > > > +/* Add the lock RX/TX function for VF reset */ #define
> > > > > +GENERATE_RX_LOCK(func, nic) \ uint16_t func ## _lock(void
> > > > > +*rx_queue, \
> > > > > + struct rte_mbuf **rx_pkts, \
> > > > > + uint16_t nb_pkts) \
> > > > > +{ \
> > > > > + struct nic ## _rx_queue *rxq = rx_queue; \
> > > > > + uint16_t nb_rx = 0; \
> > > > > + \
> > > > > + if (rte_spinlock_trylock(&rxq->rx_lock)) { \
> > > > > + nb_rx = func(rx_queue, rx_pkts, nb_pkts); \
> > > > > + rte_spinlock_unlock(&rxq->rx_lock); \
> > > > > + } \
> > > > > + \
> > > > > + return nb_rx; \
> > > > > +}
> > > > > +
> > > > > +#define GENERATE_TX_LOCK(func, nic) \ uint16_t func ## _lock(void
> > > > > +*tx_queue, \
> > > > > + struct rte_mbuf **tx_pkts, \
> > > > > + uint16_t nb_pkts) \
> > > > > +{ \
> > > > > + struct nic ## _tx_queue *txq = tx_queue; \
> > > > > + uint16_t nb_tx = 0; \
> > > > > + \
> > > > > + if (rte_spinlock_trylock(&txq->tx_lock)) { \
> > > > > + nb_tx = func(tx_queue, tx_pkts, nb_pkts); \
> > > > > + rte_spinlock_unlock(&txq->tx_lock); \
> > > > > + } \
> > > > > + \
> > > > > + return nb_tx; \
> > > > > +}
> > > >
> > > > 1. As I said in the off-line discussion, I think this locking could (and
> > > > I think better be) implemented completely on the rte_ethdev layer.
> > > > So actual PMD code will be unaffected.
> > > > Again, that avoids having to introduce a _lock version of every RX/Tx
> > > > function in each PMD.
> > > One purpose of implementing the lock in the PMD layer is to avoid ABI
> > > change. But we introduce the field lock_mode in struct
> > > rte_eth_rx/txmode, so it seems it's not a good reason now :) The other
> > > purpose is we want to add a lock for every queue. But in the rte layer the
> > > queue is void *, so we add the lock in the specific structures of the NICs. But as
> > you mentioned below, we can add the lock as dev->data->rx_queue_state in the
> > struct rte_eth_dev_data.
> > > So, I prefer to add the lock in the rte layer now.
> >
> > OK.
> >
> > >
> > > >
> > > > 2. Again, as discussed offline, I think it is better to have an
> > > > explicit
> > > > rte_eth_(rx|tx)_burst_lock(sync?) API, instead of adding new fields
> > > > into RX/TX config structures.
> > > > Would help to avoid any confusion, I think.
> > > We want the users to choose the rx/tx path without lock if they're
> > > sensitive to the performance and can handle the reset event in their APP. After
> > introducing new fields of config struct, users can change the config to choose
> > the different path.
> >
> > I understand what you are doing.
> >
> > > If we introduce a new API, it may be harder for the user to use it. I
> > > mean when users want to use lock mode, they may need to replace all the
> > rte_eth_rx/tx_burst by rte_eth_rx/tx_burst_lock.
> >
> > Yes, my opinion if users would like to use locking API they need to call it
> > explicitly.
> >
> >
> > >So if we add the lock in rte layer, I still prefer adding lock_mode in
> > >the configuration, and the rte_eth_rx/tx_burst is changed like this,
> > >rte_eth_rx/tx_burst {
> > > + if lock_mode
> > > + try_lock
> > > ......
> > > + if lock_mode
> > > + release_lock
> > > }
> >
> > My preference is to keep existing rx/tx_burst() functions unaffected by that
> > patch.
> > At least for now.
> > I suppose that will minimise the risks and help users to avoid confusion what API
> > (locking/non-locking) is in use.
> OK. Let me add new APIs.
>
> >
> > >
> > >
> > > >
> > > > 3. I thought the plan was to introduce a locking in all appropriate
> > > > control path functions (dev_start/dev_stop etc.) Without that
> > > > locking version of RX/TX seems a bit useless.
> > > > Yes, I understand that you do use locking inside dev_reset, but I
> > > > suppose the plan was to have a generic solution, no?
> > > > Again, interrupt fire when user invokes dev_start/stop or so, so we
> > > > still need some synchronisation between them.
> > > >
> > > > To be more specific, I thought about something like that:
> > > >
> > > > static inline uint16_t
> > > > rte_eth_rx_burst_lock(uint8_t port_id, uint16_t queue_id,
> > > > struct rte_mbuf **rx_pkts, const uint16_t nb_pkts) {
> > > > struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> > > >
> > > > #ifdef RTE_LIBRTE_ETHDEV_DEBUG
> > > > RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
> > > > RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0);
> > > >
> > > > if (queue_id >= dev->data->nb_rx_queues) {
> > > > RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id);
> > > > return 0;
> > > > }
> > > > #endif
> > > >
> > > > + if (rte_spinlock_trylock(&dev->data->rx_queue_state[rx_queue_id].lock)
> > == 0)
> > > > + return 0;
> > > > + else if (dev->data->rx_queue_state[rx_queue_id] ==
> > > > RTE_ETH_QUEUE_STATE_STOPPED)) {
> > > > + rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].unlock);
> > > > + return 0;
> > > > +
> > > >
> > > > nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
> > > > rx_pkts, nb_pkts);
> > > >
> > > > + rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].unlock
> > > > + );
> > > >
> > > > ....
> > > >
> > > > return nb_rx;
> > > > }
> > > >
> > > > And inside queue_start:
> > > >
> > > > int
> > > > rte_eth_dev_rx_queue_start(uint8_t port_id, uint16_t rx_queue_id) {
> > > > struct rte_eth_dev *dev;
> > > >
> > > > RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> > > >
> > > > dev = &rte_eth_devices[port_id];
> > > > if (rx_queue_id >= dev->data->nb_rx_queues) {
> > > > RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n",
> > rx_queue_id);
> > > > return -EINVAL;
> > > > }
> > > >
> > > > RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_start,
> > > > -ENOTSUP);
> > > >
> > > > rte_spinlock_lock(&dev->data->rx_queue_state[rx_queue_id].lock)
> > > I think you add the lock here to stop the rx/tx.
> > > But in my opinion, we should lock the rx/tx much earlier, before
> > > starting the queue. For example, when stopping the port, the resources of the
> > queues may be released.
> >
> > I didn't get you here...
> > Before releasing the queue resources, queue_stop() has to be executed, right?
> Sorry, I saw your example with rte_eth_dev_rx_queue_start; I didn't know you also want to change rte_eth_dev_rx_queue_stop
> too.
> Agreed, this should work if we call queue_start/stop when resetting the port. But we will not call them; I find that queue_stop/start are per-
> queue functions and are not supported by all NICs.
But right now you do reset only for ixgbe/i40e.
For these devices we definitely do support queue start/stop.
And again, it is not only about the reset op.
If we want to add locked (synced) rx, I think it should be in sync with every control API
that changes queue state.
As I said before, it is a lot of work and a lot of hassle...
So probably the easiest (and maybe safest) way is to just leave things as they are right now:
we allow the user to set up a callback on VF reset, and it is the user's responsibility to make
sure no RX/TX is active while the reset operation is performed.
Pretty much what Olivier and Stephen suggested, as I understand.
Konstantin
> Our solution now is to stop the whole port and restart the whole port. We will not stop/restart queue by queue.
>
> >
> > >The rx/tx cannot be executed. So I prefer to get the lock before stopping the
> > ports.
> >
> > Might be I wasn't clear enough here.
> > What I think we need to have:
> > -To stop/start/rx/tx the queue (or do any other action that might change the
> > queue internal structure)
> > you have to grab the lock.
> > After the queue is stopped its state has to be changed to
> > QUEUE_STATE_STOPPED (with the queue lock grabbed),
> > so rx/tx_locked wouldn't proceed with that queue.
> > - dev_stop() - has to stop all its queues first, i.e. it needs to call queue_stop()
> > for all of them.
> > So after dev_stop() had finished - all device queues have to be in
> > QUEUE_STATE_STOPPED
> > Same about dev_start() - after it does all other things - it will call queue_start()
> > for all its queues.
> > that will bring them into QUEUE_STARTED.
> > After that rx/tx_locked can use them again.
> >
> > >Maybe better to keep the spinlock in the dev_reset.
> >
> > Might be not :)
> >
> > >
> > > >
> > > > if (dev->data->rx_queue_state[rx_queue_id] !=
> > > > RTE_ETH_QUEUE_STATE_STOPPED) {
> > > > RTE_PMD_DEBUG_TRACE("Queue %" PRIu16" of device with
> > > > port_id=%" PRIu8
> > > > " already started\n",
> > > > rx_queue_id, port_id);
> > > > ret = -EINVAL 0;
> > > > } else
> > > > ret = dev->dev_ops->rx_queue_start(dev, rx_queue_id);
> > > >
> > > >
> > > > rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].unlock);
> > > >
> > > > return ret;
> > > > }
> > > >
> > > > Then again, we don't need to do explicit locking inside dev_reset().
> > > > Does it make sense to you guys?
> > > Please see the answer above.
> > >
> > > >
> > > >
> > > > > +
> > > > > +/**
> > > > > * A structure used to configure an RX ring of an Ethernet port.
> > > > > */
> > > > > struct rte_eth_rxconf {
> > > > > --
> > > > > 2.1.4
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v4 2/8] lib/librte_ether: defind RX/TX lock mode
2016-06-12 23:16 0% ` Ananyev, Konstantin
@ 2016-06-13 1:06 0% ` Lu, Wenzhuo
0 siblings, 0 replies; 200+ results
From: Lu, Wenzhuo @ 2016-06-13 1:06 UTC (permalink / raw)
To: Ananyev, Konstantin, Tao, Zhe, dev
Cc: Richardson, Bruce, Chen, Jing D, Liang, Cunming, Wu, Jingjing,
Zhang, Helin
Hi Konstantin,
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Monday, June 13, 2016 7:17 AM
> To: Lu, Wenzhuo; Tao, Zhe; dev@dpdk.org
> Cc: Richardson, Bruce; Chen, Jing D; Liang, Cunming; Wu, Jingjing; Zhang,
> Helin
> Subject: RE: [PATCH v4 2/8] lib/librte_ether: defind RX/TX lock mode
>
> Hi Wenzhuo,
>
> >
> > Hi Konstantin,
> >
> >
> > > -----Original Message-----
> > > From: Ananyev, Konstantin
> > > Sent: Wednesday, June 8, 2016 5:20 PM
> > > To: Lu, Wenzhuo; Tao, Zhe; dev@dpdk.org
> > > Cc: Richardson, Bruce; Chen, Jing D; Liang, Cunming; Wu, Jingjing;
> > > Zhang, Helin
> > > Subject: RE: [PATCH v4 2/8] lib/librte_ether: defind RX/TX lock mode
> > >
> > >
> > >
> > > >
> > > > Hi Konstantin,
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Ananyev, Konstantin
> > > > > Sent: Tuesday, June 7, 2016 5:59 PM
> > > > > To: Tao, Zhe; dev@dpdk.org
> > > > > Cc: Lu, Wenzhuo; Richardson, Bruce; Chen, Jing D; Liang,
> > > > > Cunming; Wu, Jingjing; Zhang, Helin
> > > > > Subject: RE: [PATCH v4 2/8] lib/librte_ether: defind RX/TX lock
> > > > > mode
> > > > >
> > > > >
> > > > > Hi Zhe & Wenzhuo,
> > > > >
> > > > > Please find my comments below.
> > > > > BTW, for clarification - is that patch for 16.11?
> > > > > I believe it's too late to introduce such significant change in 16.07.
> > > > > Thanks
> > > > > Konstantin
> > > > Thanks for the comments.
> > > > Honestly, our purpose is 16.07. Realizing the big impact, we use
> > > > NEXT_ABI to comment our change. So, I think although we want to
> > > > merge it in
> > > 16.07 this change will become effective after we remove NEXT_ABI in
> 16.11.
> > >
> > > I don't think it is achievable.
> > > First I think your code is not in proper shape yet, right now.
> > > Second, as you said, it is a significant change and I would like to
> > > hear opinions from the rest of the community.
> > Agree it should have risk. I mean our target is 16.07. But surely if it can be
> achieved depends on the feedback from the community.
> >
> > >
> > > >
> > > > >
> > > > > > Define lock mode for RX/TX queue. Because when resetting the
> > > > > > device we want the resetting thread to get the lock of the
> > > > > > RX/TX queue to make sure the RX/TX is stopped.
> > > > > >
> > > > > > Using next ABI macro for this ABI change as it has too much
> > > > > > impact. 7 APIs and 1 global variable are impacted.
> > > > > >
> > > > > > Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > > > > > Signed-off-by: Zhe Tao <zhe.tao@intel.com>
> > > > > > ---
> > > > > > lib/librte_ether/rte_ethdev.h | 62
> > > > > > +++++++++++++++++++++++++++++++++++++++++++
> > > > > > 1 file changed, 62 insertions(+)
> > > > > >
> > > > > > diff --git a/lib/librte_ether/rte_ethdev.h
> > > > > > b/lib/librte_ether/rte_ethdev.h index 74e895f..4efb5e9 100644
> > > > > > --- a/lib/librte_ether/rte_ethdev.h
> > > > > > +++ b/lib/librte_ether/rte_ethdev.h
> > > > > > @@ -354,7 +354,12 @@ struct rte_eth_rxmode {
> > > > > > jumbo_frame : 1, /**< Jumbo Frame Receipt
> enable. */
> > > > > > hw_strip_crc : 1, /**< Enable CRC stripping by
> hardware. */
> > > > > > enable_scatter : 1, /**< Enable scatter packets rx
> handler */
> > > > > > +#ifndef RTE_NEXT_ABI
> > > > > > enable_lro : 1; /**< Enable LRO */
> > > > > > +#else
> > > > > > + enable_lro : 1, /**< Enable LRO */
> > > > > > + lock_mode : 1; /**< Using lock path */
> > > > > > +#endif
> > > > > > };
> > > > > >
> > > > > > /**
> > > > > > @@ -634,11 +639,68 @@ struct rte_eth_txmode {
> > > > > > /**< If set, reject sending out tagged pkts */
> > > > > > hw_vlan_reject_untagged : 1,
> > > > > > /**< If set, reject sending out untagged pkts */
> > > > > > +#ifndef RTE_NEXT_ABI
> > > > > > hw_vlan_insert_pvid : 1;
> > > > > > /**< If set, enable port based VLAN insertion */
> > > > > > +#else
> > > > > > + hw_vlan_insert_pvid : 1,
> > > > > > + /**< If set, enable port based VLAN insertion */
> > > > > > + lock_mode : 1;
> > > > > > + /**< If set, using lock path */ #endif
> > > > > > };
> > > > > >
> > > > > > /**
> > > > > > + * The macros for the RX/TX lock mode functions */ #ifdef
> > > > > > +RTE_NEXT_ABI #define RX_LOCK_FUNCTION(dev, func) \
> > > > > > + (dev->data->dev_conf.rxmode.lock_mode ? \
> > > > > > + func ## _lock : func)
> > > > > > +
> > > > > > +#define TX_LOCK_FUNCTION(dev, func) \
> > > > > > + (dev->data->dev_conf.txmode.lock_mode ? \
> > > > > > + func ## _lock : func)
> > > > > > +#else
> > > > > > +#define RX_LOCK_FUNCTION(dev, func) func
> > > > > > +
> > > > > > +#define TX_LOCK_FUNCTION(dev, func) func
> > > > > > +#endif
> > > > > > +
> > > > > > +/* Add the lock RX/TX function for VF reset */
> > > > > > +#define GENERATE_RX_LOCK(func, nic) \
> > > > > > +uint16_t func ## _lock(void *rx_queue, \
> > > > > > + struct rte_mbuf **rx_pkts, \
> > > > > > + uint16_t nb_pkts) \
> > > > > > +{ \
> > > > > > + struct nic ## _rx_queue *rxq = rx_queue; \
> > > > > > + uint16_t nb_rx = 0; \
> > > > > > + \
> > > > > > + if (rte_spinlock_trylock(&rxq->rx_lock)) { \
> > > > > > + nb_rx = func(rx_queue, rx_pkts, nb_pkts); \
> > > > > > + rte_spinlock_unlock(&rxq->rx_lock); \
> > > > > > + } \
> > > > > > + \
> > > > > > + return nb_rx; \
> > > > > > +}
> > > > > > +
> > > > > > +#define GENERATE_TX_LOCK(func, nic) \
> > > > > > +uint16_t func ## _lock(void *tx_queue, \
> > > > > > + struct rte_mbuf **tx_pkts, \
> > > > > > + uint16_t nb_pkts) \
> > > > > > +{ \
> > > > > > + struct nic ## _tx_queue *txq = tx_queue; \
> > > > > > + uint16_t nb_tx = 0; \
> > > > > > + \
> > > > > > + if (rte_spinlock_trylock(&txq->tx_lock)) { \
> > > > > > + nb_tx = func(tx_queue, tx_pkts, nb_pkts); \
> > > > > > + rte_spinlock_unlock(&txq->tx_lock); \
> > > > > > + } \
> > > > > > + \
> > > > > > + return nb_tx; \
> > > > > > +}
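
For illustration, a PMD would presumably instantiate the proposed macros
roughly as below (a sketch with hypothetical ixgbe-style names, not part of
the patch):

    /* Emit ixgbe_recv_pkts_lock() wrapping the existing ixgbe_recv_pkts(),
     * using the per-queue spinlock assumed to exist in
     * struct ixgbe_rx_queue (hypothetical). */
    GENERATE_RX_LOCK(ixgbe_recv_pkts, ixgbe)

    /* At setup time, pick the locked or lock-free variant depending on
     * the lock_mode bit in the device configuration. */
    dev->rx_pkt_burst = RX_LOCK_FUNCTION(dev, ixgbe_recv_pkts);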
> > > > >
> > > > > 1. As I said in the off-line discussion, I think this locking could
> > > > > (and, I think, had better be) implemented completely in the
> > > > > rte_ethdev layer, so the actual PMD code will be unaffected.
> > > > > Again, that avoids introducing a _lock version of every RX/TX
> > > > > function in each PMD.
> > > > One purpose of implementing the lock in the PMD layer is to avoid an
> > > > ABI change. But since we introduce the field lock_mode in struct
> > > > rte_eth_rx/txmode, that is no longer a good reason :) The other
> > > > purpose is that we want a lock for every queue. But in the rte layer
> > > > the queue is a void *, so we added the lock in the NIC-specific
> > > > structures. But as you mentioned below, we can add the lock as
> > > > dev->data->rx_queue_state in the struct rte_eth_dev_data.
> > > > So, I prefer to add the lock in the rte layer now.
> > >
> > > OK.
> > >
> > > >
> > > > >
> > > > > 2. Again, as discussed offline, I think it is better to have an
> > > > > explicit rte_eth_(rx|tx)_burst_lock(sync?) API, instead of adding
> > > > > new fields into the RX/TX config structures.
> > > > > That would help to avoid any confusion, I think.
> > > > We want users to be able to choose the rx/tx path without the lock if
> > > > they're sensitive to performance and can handle the reset event in
> > > > their APP. After introducing the new fields in the config struct,
> > > > users can change the config to choose a different path.
> > >
> > > I understand what you are doing.
> > >
> > > > If we introduce a new API, it may be harder for the user to use it. I
> > > > mean, when users want to use lock mode, they may need to replace all
> > > > the rte_eth_rx/tx_burst calls with rte_eth_rx/tx_burst_lock.
> > >
> > > Yes, my opinion if users would like to use locking API they need to
> > > call it explicitly.
> > >
> > >
> > > >So if we add the lock in the rte layer, I still prefer adding lock_mode
> > > >in the configuration, and rte_eth_rx/tx_burst is changed like this:
> > > > rte_eth_rx/tx_burst {
> > > > + if lock_mode
> > > > + try_lock
> > > > ......
> > > > + if lock_mode
> > > > + release_lock
> > > > }
> > >
> > > My preference is to keep existing rx/tx_burst() functions unaffected
> > > by that patch.
> > > At least for now.
> > > I suppose that will minimise the risks and help users avoid
> > > confusion about which API
> > > (locking/non-locking) is in use.
> > OK. Let me add new APIs.
> >
> > >
> > > >
> > > >
> > > > >
> > > > > 3. I thought the plan was to introduce locking in all
> > > > > appropriate control path functions (dev_start/dev_stop etc.)
> > > > > Without that, the locking version of RX/TX seems a bit useless.
> > > > > Yes, I understand that you do use locking inside dev_reset, but
> > > > > I suppose the plan was to have a generic solution, no?
> > > > > Again, an interrupt may fire when the user invokes dev_start/stop
> > > > > or so, so we still need some synchronisation between them.
> > > > >
> > > > > To be more specific, I thought about something like that:
> > > > >
> > > > > static inline uint16_t
> > > > > rte_eth_rx_burst_lock(uint8_t port_id, uint16_t queue_id,
> > > > > struct rte_mbuf **rx_pkts, const uint16_t nb_pkts) {
> > > > > struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> > > > >
> > > > > #ifdef RTE_LIBRTE_ETHDEV_DEBUG
> > > > > RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
> > > > > RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0);
> > > > >
> > > > > if (queue_id >= dev->data->nb_rx_queues) {
> > > > > RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n",
> queue_id);
> > > > > return 0;
> > > > > }
> > > > > #endif
> > > > >
> > > > > + if (rte_spinlock_trylock(&dev->data->rx_queue_state[rx_queue_id].lock) == 0)
> > > > > + return 0;
> > > > > + else if (dev->data->rx_queue_state[rx_queue_id] ==
> > > > > RTE_ETH_QUEUE_STATE_STOPPED) {
> > > > > + rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].lock);
> > > > > + return 0;
> > > > > + }
> > > > >
> > > > > nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
> > > > > rx_pkts, nb_pkts);
> > > > >
> > > > > + rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].lock);
> > > > >
> > > > > ....
> > > > >
> > > > > return nb_rx;
> > > > > }
> > > > >
> > > > > And inside queue_start:
> > > > >
> > > > > int
> > > > > rte_eth_dev_rx_queue_start(uint8_t port_id, uint16_t rx_queue_id)
> {
> > > > > struct rte_eth_dev *dev;
> > > > >
> > > > > RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> > > > >
> > > > > dev = &rte_eth_devices[port_id];
> > > > > if (rx_queue_id >= dev->data->nb_rx_queues) {
> > > > > RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n",
> > > rx_queue_id);
> > > > > return -EINVAL;
> > > > > }
> > > > >
> > > > > RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_start,
> > > > > -ENOTSUP);
> > > > >
> > > > >
> > > > > rte_spinlock_lock(&dev->data->rx_queue_state[rx_queue_id].lock)
> > > > I think you add the lock here to stop the rx/tx.
> > > > But in my opinion, we should lock the rx/tx much earlier, before
> > > > starting the queue. For example, when stopping the port, the
> > > > resources of the queues may be released.
> > >
> > > I didn't get you here...
> > > Before releasing the queue resources, queue_stop() has to be
> > > executed, right?
> > Sorry, I saw your example with rte_eth_dev_rx_queue_start; I didn't
> > know you also want to change rte_eth_dev_rx_queue_stop too.
> > Agree, this should work if we call queue_start/stop when resetting the
> > port. But we will not call them. I find that queue_stop/start are
> > per-queue functions and not supported by all NICs.
>
> But right now you do reset only for ixgbe/i40e.
Not only for ixgbe/i40e. You forgot igb, which doesn't support queue_start/stop :)
> For these devices we definitely do support queue start/stop.
> And again, it is not only about reset op.
> If we want to add locked (synced) rx, I think it should be in sync with all
> control APIs that change queue state.
> As I said before, it is a lot of work and a lot of hassle...
> So probably the easiest (and maybe safest) way is just to leave things as
> they are right now:
> we allow the user to set up a callback on VF reset, and it is the user's
> responsibility to make sure no RX/TX is active while the reset operation
> is performed.
> Pretty much what Olivier and Stephen suggested, as I understand.
Agree. It's not a good way to add a lock for just one feature. It could be tricky if we want to extend the lock to other features. A whole picture is needed.
We've sent another patch set to let the user set up a callback on VF reset. Based on that, the user can use existing rte APIs to reset the VF port. But what's your opinion on adding a specific rte_reset API? It may be easier for the user.
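
For reference, the callback-based approach would look roughly like this (a
sketch; RTE_ETH_EVENT_INTR_RESET is the event type that callback patch set
proposes, and restart_port() is a hypothetical application helper):

    /* Application side: register for the VF reset event and quiesce
     * RX/TX before restarting the port. */
    static void
    vf_reset_cb(uint8_t port_id, enum rte_eth_event_type event, void *arg)
    {
        if (event != RTE_ETH_EVENT_INTR_RESET)
            return;
        /* It is the application's responsibility to make sure no RX/TX
         * burst is in flight on this port before restarting it. */
        restart_port(port_id);
    }

    rte_eth_dev_callback_register(port_id, RTE_ETH_EVENT_INTR_RESET,
                                  vf_reset_cb, NULL);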
> Konstantin
>
> > Our solution now is to stop the whole port and restart the whole port. We
> > will not stop/restart queue by queue.
> >
> > >
> > > >The rx/tx cannot be executed then. So I prefer to get the lock before
> > > >stopping the ports.
> > >
> > > Might be I wasn't clear enough here.
> > > What I think we need to have:
> > > - To stop/start/rx/tx the queue (or do any other action that might
> > > change the queue internal structure)
> > > you have to grab the lock.
> > > After the queue is stopped, its state has to be changed to
> > > QUEUE_STATE_STOPPED (with the queue lock grabbed),
> > > so rx/tx_locked wouldn't proceed with that queue.
> > > - dev_stop() - has to stop all its queues first, i.e. it needs to
> > > call queue_stop() for all of them.
> > > So after dev_stop() has finished, all device queues have to be in
> > > QUEUE_STATE_STOPPED. Same about dev_start() - after it does all other
> > > things - it will call queue_start() for all its queues,
> > > which will bring them into QUEUE_STARTED.
> > > After that rx/tx_locked can use them again.
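
A mirror-image sketch of the queue_stop side, following the same scheme as
the queue_start example earlier in this thread (the per-queue .lock field is
that sketch's assumption, not an existing rte_eth_dev_data member):

    int
    rte_eth_dev_rx_queue_stop(uint8_t port_id, uint16_t rx_queue_id)
    {
        struct rte_eth_dev *dev = &rte_eth_devices[port_id];
        int ret;

        rte_spinlock_lock(&dev->data->rx_queue_state[rx_queue_id].lock);

        if (dev->data->rx_queue_state[rx_queue_id] ==
                RTE_ETH_QUEUE_STATE_STOPPED)
            ret = 0; /* already stopped, nothing to do */
        else
            ret = dev->dev_ops->rx_queue_stop(dev, rx_queue_id);

        /* The state change happens with the queue lock held, so
         * rx_burst_lock() can never race with the stop operation. */
        rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].lock);
        return ret;
    }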
> > >
> > > >Maybe better to keep the spinlock in the dev_reset.
> > >
> > > Might be not :)
> > >
> > > >
> > > > >
> > > > > if (dev->data->rx_queue_state[rx_queue_id] !=
> > > > > RTE_ETH_QUEUE_STATE_STOPPED) {
> > > > > RTE_PMD_DEBUG_TRACE("Queue %" PRIu16" of device
> > > > > with port_id=%" PRIu8
> > > > > " already started\n",
> > > > > rx_queue_id, port_id);
> > > > > ret = -EINVAL;
> > > > > } else
> > > > > ret = dev->dev_ops->rx_queue_start(dev, rx_queue_id);
> > > > >
> > > > >
> > > > > rte_spinlock_unlock(&dev->data->rx_queue_state[rx_queue_id].lock);
> > > > >
> > > > > return ret;
> > > > > }
> > > > >
> > > > > Then again, we don't need to do explicit locking inside dev_reset().
> > > > > Does it make sense to you guys?
> > > > Please see the answer above.
> > > >
> > > > >
> > > > >
> > > > > > +
> > > > > > +/**
> > > > > > * A structure used to configure an RX ring of an Ethernet port.
> > > > > > */
> > > > > > struct rte_eth_rxconf {
> > > > > > --
> > > > > > 2.1.4
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v2] i40e: modify the meaning of single VLAN type
2016-05-26 7:28 4% [dpdk-dev] [PATCH] ethdev: change comments of VLAN type Beilei Xing
@ 2016-06-13 8:03 4% ` Beilei Xing
2016-06-21 10:29 4% ` Bruce Richardson
0 siblings, 1 reply; 200+ results
From: Beilei Xing @ 2016-06-13 8:03 UTC (permalink / raw)
To: jingjing.wu; +Cc: dev, Beilei Xing
In the current i40e codebase, if a single VLAN header is added to a packet,
it's treated as the inner VLAN. Generally, a single VLAN header is
treated as the outer VLAN header. So change the corresponding register
for single VLAN.
Meanwhile, change the meanings of inner VLAN and outer VLAN.
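
With this change, an application setting the TPID for a port carrying plain
(single) VLAN tags uses the OUTER type; a sketch, assuming the standard
0x8100/0x88A8 TPID values:

    /* Single VLAN: the tag is now the outer one, so use ETH_VLAN_TYPE_OUTER;
     * ETH_VLAN_TYPE_INNER is only valid once QinQ (hw_vlan_extend) is on. */
    rte_eth_dev_set_vlan_ether_type(port_id, ETH_VLAN_TYPE_OUTER, 0x8100);

    /* QinQ enabled: both types are valid. */
    rte_eth_dev_set_vlan_ether_type(port_id, ETH_VLAN_TYPE_OUTER, 0x88A8);
    rte_eth_dev_set_vlan_ether_type(port_id, ETH_VLAN_TYPE_INNER, 0x8100);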
Signed-off-by: Beilei Xing <beilei.xing@intel.com>
---
v2 changes:
Combine corresponding i40e driver changes into this patch.
doc/guides/rel_notes/release_16_07.rst | 3 +++
drivers/net/i40e/i40e_ethdev.c | 29 ++++++++++++++++++++---------
lib/librte_ether/rte_ethdev.h | 4 ++--
3 files changed, 25 insertions(+), 11 deletions(-)
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index c0f6b02..ae02824 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -135,6 +135,9 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* The meanings of ``ETH_VLAN_TYPE_INNER`` and ``ETH_VLAN_TYPE_OUTER`` in
+ ``rte_vlan_type`` are changed.
+
ABI Changes
-----------
diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 24777d5..3672c52 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -920,12 +920,6 @@ eth_i40e_dev_init(struct rte_eth_dev *dev)
"VLAN ether type");
goto err_setup_pf_switch;
}
- ret = i40e_vlan_tpid_set(dev, ETH_VLAN_TYPE_INNER, ETHER_TYPE_VLAN);
- if (ret != I40E_SUCCESS) {
- PMD_INIT_LOG(ERR, "Failed to set the default outer "
- "VLAN ether type");
- goto err_setup_pf_switch;
- }
/* PF setup, which includes VSI setup */
ret = i40e_pf_setup(pf);
@@ -2392,13 +2386,24 @@ i40e_vlan_tpid_set(struct rte_eth_dev *dev,
uint64_t reg_r = 0, reg_w = 0;
uint16_t reg_id = 0;
int ret = 0;
+ int qinq = dev->data->dev_conf.rxmode.hw_vlan_extend;
switch (vlan_type) {
case ETH_VLAN_TYPE_OUTER:
- reg_id = 2;
+ if (qinq)
+ reg_id = 2;
+ else
+ reg_id = 3;
break;
case ETH_VLAN_TYPE_INNER:
- reg_id = 3;
+ if (qinq)
+ reg_id = 3;
+ else {
+ ret = -EINVAL;
+ PMD_DRV_LOG(ERR, "Unsupported vlan type "
+ "in single vlan.\n");
+ return ret;
+ }
break;
default:
ret = -EINVAL;
@@ -2460,8 +2465,14 @@ i40e_vlan_offload_set(struct rte_eth_dev *dev, int mask)
}
if (mask & ETH_VLAN_EXTEND_MASK) {
- if (dev->data->dev_conf.rxmode.hw_vlan_extend)
+ if (dev->data->dev_conf.rxmode.hw_vlan_extend) {
i40e_vsi_config_double_vlan(vsi, TRUE);
+ /* Set global registers with default ether type value */
+ i40e_vlan_tpid_set(dev, ETH_VLAN_TYPE_OUTER,
+ ETHER_TYPE_VLAN);
+ i40e_vlan_tpid_set(dev, ETH_VLAN_TYPE_INNER,
+ ETHER_TYPE_VLAN);
+ }
else
i40e_vsi_config_double_vlan(vsi, FALSE);
}
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 2757510..c5c29fb 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -363,8 +363,8 @@ struct rte_eth_rxmode {
*/
enum rte_vlan_type {
ETH_VLAN_TYPE_UNKNOWN = 0,
- ETH_VLAN_TYPE_INNER, /**< Single VLAN, or inner VLAN. */
- ETH_VLAN_TYPE_OUTER, /**< Outer VLAN. */
+ ETH_VLAN_TYPE_INNER, /**< Inner VLAN. */
+ ETH_VLAN_TYPE_OUTER, /**< Single VLAN, or outer VLAN. */
ETH_VLAN_TYPE_MAX,
};
--
2.5.0
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [PATCH v8 0/8] add packet capture framework
2016-06-10 23:23 0% ` [dpdk-dev] [PATCH v8 0/8] add " Neil Horman
@ 2016-06-13 8:47 0% ` Pattan, Reshma
0 siblings, 0 replies; 200+ results
From: Pattan, Reshma @ 2016-06-13 8:47 UTC (permalink / raw)
To: Neil Horman; +Cc: dev
Hi,
> -----Original Message-----
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Saturday, June 11, 2016 12:23 AM
> To: Pattan, Reshma <reshma.pattan@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v8 0/8] add packet capture framework
>
> On Fri, Jun 10, 2016 at 05:18:46PM +0100, Reshma Pattan wrote:
> > This patch set include below changes
> >
> > 1)Changes to librte_ether.
> > 2)A new library librte_pdump added for packet capture framework.
> > 3)A new app/pdump tool added for packet capturing.
> > 4)Test pmd changes done to initialize packet capture framework.
> > 5)Documentation update.
> >
> > 1)librte_pdump
> > ==============
> > To support packet capturing on dpdk Ethernet devices, a new library
> > librte_pdump is added.Users can develop their own packet capturing
> application using new library APIs.
> >
> > Operation:
> > ----------
> > The librte_pdump provides APIs to support packet capturing on dpdk Ethernet
> devices.
> > Library provides APIs to initialize the packet capture framework,
> > enable/disable the packet capture and uninitialize the packet capture
> framework.
> >
> > The librte_pdump library works on a client/server model. The server is
> > responsible for enabling or disabling the packet capture and the
> > clients are responsible for requesting the enabling or disabling of the packet
> capture.
> >
> > The packet capture framework, as part of its initialization, creates
> > the pthread and the server socket in the pthread. The application that
> > calls the framework initialization will have the server socket
> > created, either under the path that the application has passed or under the
> default path i.e. either ''/var/run'' for root user or ''$HOME'' for non root user.
> >
> > Applications that request enabling or disabling of the packet capture
> > will have the client socket created either under ''/var/run/'' for root users
> > or ''$HOME'' for non root users to send the requests to the server.
> > The server socket will listen for client requests for enabling or disabling the
> packet capture.
> >
> > Applications using below APIs need to pass port/device_id, queue,
> > mempool and ring parameters. Library uses user provided ring and
> > mempool to mirror the rx/tx packets of the port for users. Users need
> > to dequeue the rings and write the packets to vdev(pcap/tuntap) to view the
> packets using any standard tools.
> >
> > Note:
> > Mempool and Ring should be mc/mp supportable.
> > Mempool mbuf size should be big enough to handle the rx/tx packets of a
> port.
> >
> > APIs:
> > -----
> > rte_pdump_init()
> > rte_pdump_enable()
> > rte_pdump_enable_by_deviceid()
> > rte_pdump_disable()
> > rte_pdump_disable_by_deviceid()
> > rte_pdump_uninit()
> > rte_pdump_set_socket_dir()
> >
> > 2)app/pdump tool
> > ================
> > Tool app/pdump is designed based on librte_pdump for packet capturing in
> DPDK.
> > This tool by default runs as secondary process, and provides the
> > support for the command line options for packet capture.
> >
> > ./build/app/dpdk_pdump --
> > --pdump '(port=<port id> | device_id=<pci id or vdev name>),
> > (queue=<queue id>),
> > (rx-dev=<iface or pcap file> |
> > tx-dev=<iface or pcap file>),
> > [ring-size=<ring size>],
> > [mbuf-size=<mbuf data size>],
> > [total-num-mbufs=<number of mbufs>]'
> >
> > Parameters inside the parentheses represent the mandatory parameters.
> > Parameters inside the square brackets represent optional parameters.
> > User has to pass on packet capture parameters under --pdump
> > parameters, multiples of --pdump can be passed to capture packets on
> > different port and queue combinations
> >
> > Operation:
> > ----------
> > *Tool parses the user command line arguments, creates the mempool, ring
> > and the PCAP PMD vdev with 'tx_stream' as either of the devices passed
> > in the rx-dev|tx-dev parameters.
> >
> > *Then calls the APIs of librte_pdump i.e.
> > rte_pdump_enable()/rte_pdump_enable_by_deviceid()
> > to enable packet capturing on a specific port/device_id and queue by
> > passing on
> > port|device_id, queue, mempool and ring info.
> >
> > *Tool runs in a while loop to dequeue the packets from the ring and write
> > them to the pcap device.
> >
> > *Tool can be stopped using SIGINT, upon which tool calls
> > rte_pdump_disable()/rte_pdump_disable_by_deviceid() and free the allocated
> resources.
> >
> > Note:
> > CONFIG_RTE_LIBRTE_PMD_PCAP flag should be set to yes to compile and run
> the pdump tool.
> >
> > 3)Test-pmd changes
> > ==================
> > Changes are done to test-pmd application to initialize/uninitialize the packet
> capture framework.
> > So app/pdump tool can be run to see packets of dpdk ports that are used by
> test-pmd.
> >
> > Similarly any application which needs packet capture should call
> > initialize/uninitialize APIs of librte_pdump and use pdump tool to start the
> capture.
> >
> > 4)Packet capture flow between pdump tool and librte_pdump
> > =========================================================
> > * Pdump tool (Secondary process) requests packet capture for specific
> > port|device_id and queue combinations.
> >
> > *Library in secondary process context creates client socket and
> > communicates the port|device_id, queue, ring and mempool to server.
> >
> > *Library initializes server in primary process 'test-pmd' context and
> > server serves the client request to enable Ethernet rxtx call-backs for a given
> port|device_id and queue.
> >
> > *Copy the rx/tx packets to passed mempool and enqueue the packets to ring
> for secondary process.
> >
> > *Pdump tool will dequeue the packets from ring and writes them to
> > PCAPMD vdev, so ultimately packets will be seen on the device that is passed
> in rx-dev|tx-dev.
> >
> > *Once the pdump tool is terminated with SIGINT it will disable the packet
> capturing.
> >
> > *Library receives the disable packet capture request, communicate the
> > info to server, server will remove the Ethernet rxtx call-backs.
> >
> > *Packet capture can be seen using tcpdump command "tcpdump -ni
> > <iface>" (or) "tcpdump –nr <pcapfile>"
> >
> > 5)Example command line
> > ======================
> > ./build/app/dpdk_pdump -- --pdump 'device_id=0000:02:0.0,queue=*,tx-
> dev=/tmp/dt-file.pcap,rx-dev=/tmp/dr-file.pcap,ring-size=8192,mbuf-
> size=2176,total-num-mbufs=32768' --pdump
> 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/d-file.pcap,tx-dev=/tmp/d-
> file.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'
> >
> > v8:
> > added server socket argument to rte_pdump_init() API ==>
> > http://dpdk.org/dev/patchwork/patch/13402/
> > added rte_pdump_set_socket_dir() API.
> > updated documentation for new changes.
> >
> > v7:
> > fixed lines over 90 characters.
> >
> > v6:
> > removed below deprecation notice patch from patch set.
> > http://dpdk.org/dev/patchwork/patch/13372/
> >
> > v5:
> > addressed code review comments for below patches
> > http://dpdk.org/dev/patchwork/patch/12955/
> > http://dpdk.org/dev/patchwork/patch/12951/
> >
> > v4:
> > added missing deprecation notice for ABI changes of rte_eth_dev_info
> structure.
> > made doc changes as per doc guidelines.
> > replaced rte_eal_vdev_init with rte_eth_dev_attach in pdump tool.
> > removed rxtx-dev parameter from pdump tool command line.
> >
> > v3:
> > app/pdump: Moved cleanup code from signal handler to main.
> > divided librte_ether changes into multiple patches.
> > example command changed in app/pdump application guide
> >
> > v2:
> > fix compilation issues for 4.8.3
> > fix unnecessary #includes
> >
> >
> > Reshma Pattan (8):
> > librte_ether: protect add/remove of rxtx callbacks with spinlocks
> > librte_ether: add new api rte_eth_add_first_rx_callback
> > librte_ether: add new fields to rte_eth_dev_info struct
> > librte_ether: make rte_eth_dev_get_port_by_name
> > rte_eth_dev_get_name_by_port public
> > lib/librte_pdump: add new library for packet capturing support
> > app/pdump: add pdump tool for packet capturing
> > app/test-pmd: add pdump initialization uninitialization
> > doc: update doc for packet capture framework
> >
> > MAINTAINERS | 8 +
> > app/Makefile | 1 +
> > app/pdump/Makefile | 45 ++
> > app/pdump/main.c | 844 +++++++++++++++++++++++++++++
> > app/test-pmd/testpmd.c | 6 +
> > config/common_base | 5 +
> > doc/guides/prog_guide/index.rst | 1 +
> > doc/guides/prog_guide/pdump_library.rst | 117 +++++
> > doc/guides/rel_notes/release_16_07.rst | 13 +
> > doc/guides/sample_app_ug/index.rst | 1 +
> > doc/guides/sample_app_ug/pdump.rst | 122 +++++
> > lib/Makefile | 1 +
> > lib/librte_ether/rte_ethdev.c | 123 +++--
> > lib/librte_ether/rte_ethdev.h | 60 +++
> > lib/librte_ether/rte_ether_version.map | 9 +
> > lib/librte_pdump/Makefile | 55 ++
> > lib/librte_pdump/rte_pdump.c | 904
> ++++++++++++++++++++++++++++++++
> > lib/librte_pdump/rte_pdump.h | 208 ++++++++
> > lib/librte_pdump/rte_pdump_version.map | 13 +
> > mk/rte.app.mk | 1 +
> > 20 files changed, 2493 insertions(+), 44 deletions(-) create mode
> > 100644 app/pdump/Makefile create mode 100644 app/pdump/main.c create
> > mode 100644 doc/guides/prog_guide/pdump_library.rst
> > create mode 100644 doc/guides/sample_app_ug/pdump.rst
> > create mode 100644 lib/librte_pdump/Makefile create mode 100644
> > lib/librte_pdump/rte_pdump.c create mode 100644
> > lib/librte_pdump/rte_pdump.h create mode 100644
> > lib/librte_pdump/rte_pdump_version.map
> >
> > Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > --
> > 2.5.0
> >
> >
> This seems useful, but the pcap pmd already accepts pcap formatted files for
> input to send using the pcap library. Shouldn't this functionality be integrated
> with that pmd instead of breaking it out to its own library?
>
The librte_pdump library doesn't deal with any PCAP functionality; it solely sends packets of DPDK ports to applications over the rings.
It is up to the applications to decide which vdev PMD they would like to use to send the packets further on to the Linux devices.
In this patch set, the pdump tool (an application based on librte_pdump) uses the pcap PMD vdev. In future we can also replace the pcap PMD vdev usage with other virtual device PMD types.
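
A consumer of that ring is only a few lines; a sketch, where ring is the
object passed to rte_pdump_enable() and vdev_port is a hypothetical pcap
vdev port (rte_ring_dequeue_burst() is the 16.07-era 3-argument variant):

    static void
    drain_ring(struct rte_ring *ring, uint8_t vdev_port, volatile int *quit)
    {
        struct rte_mbuf *pkts[32];
        unsigned i, nb, sent;

        while (!*quit) {
            /* drain mirrored packets from the pdump ring */
            nb = rte_ring_dequeue_burst(ring, (void **)pkts, 32);
            if (nb == 0)
                continue;
            /* the pcap vdev writes them to the pcap file/iface */
            sent = rte_eth_tx_burst(vdev_port, 0, pkts, nb);
            for (i = sent; i < nb; i++)
                rte_pktmbuf_free(pkts[i]); /* free what was refused */
        }
    }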
> Neil
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v9 0/8] add packet capture framework
2016-06-10 16:18 2% ` [dpdk-dev] [PATCH v8 0/8] add packet capture framework Reshma Pattan
2016-06-10 16:18 5% ` [dpdk-dev] [PATCH v8 8/8] doc: update doc for " Reshma Pattan
2016-06-10 23:23 0% ` [dpdk-dev] [PATCH v8 0/8] add " Neil Horman
@ 2016-06-14 9:38 2% ` Reshma Pattan
2016-06-14 9:38 5% ` [dpdk-dev] [PATCH v9 8/8] doc: update doc for " Reshma Pattan
2016-06-15 14:06 2% ` [dpdk-dev] [PATCH v10 0/7] add " Reshma Pattan
2 siblings, 2 replies; 200+ results
From: Reshma Pattan @ 2016-06-14 9:38 UTC (permalink / raw)
To: dev
This patch set include below changes
1)Changes to librte_ether.
2)A new library librte_pdump added for packet capture framework.
3)A new app/pdump tool added for packet capturing.
4)Test pmd changes done to initialize packet capture framework.
5)Documentation update.
1)librte_pdump
==============
To support packet capturing on dpdk Ethernet devices, a new library librte_pdump
is added.Users can develop their own packet capturing application using new library APIs.
Operation:
----------
The librte_pdump provides APIs to support packet capturing on dpdk Ethernet devices.
Library provides APIs to initialize the packet capture framework, enable/disable
the packet capture and uninitialize the packet capture framework.
The librte_pdump library works on a client/server model. The server is responsible for enabling or
disabling the packet capture and the clients are responsible for requesting the enabling or disabling of
the packet capture.
The packet capture framework, as part of its initialization, creates the pthread and the server socket in
the pthread. The application that calls the framework initialization will have the server socket created,
either under the path that the application has passed or under the default path i.e. either ''/var/run'' for
root user or ''$HOME'' for non root user.
Applications that request enabling or disabling of the packet capture will have the client socket created
either under the path that the application has passed or under the default path i.e. either ''/var/run/''
for root users or ''$HOME'' for non root users to send the requests to the server.
The server socket will listen for client requests for enabling or disabling the packet capture.
Applications using below APIs need to pass port/device_id, queue, mempool and
ring parameters. Library uses user provided ring and mempool to mirror the rx/tx
packets of the port for users. Users need to dequeue the rings and write the packets
to vdev(pcap/tuntap) to view the packets using any standard tools.
Note:
Mempool and Ring should be mc/mp supportable.
Mempool mbuf size should be big enough to handle the rx/tx packets of a port.
APIs:
-----
rte_pdump_init()
rte_pdump_enable()
rte_pdump_enable_by_deviceid()
rte_pdump_disable()
rte_pdump_disable_by_deviceid()
rte_pdump_uninit()
rte_pdump_set_socket_dir()
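
Taken together, a minimal client-side sketch of these APIs (sizes are
arbitrary; the flag and signature spellings are those of rte_pdump.h in this
series):

    static int
    setup_capture(void)
    {
        struct rte_ring *ring;
        struct rte_mempool *mp;

        /* primary application side: bring up the packet capture server,
         * NULL selects the default socket path */
        rte_pdump_init(NULL);

        /* client side (e.g. the pdump tool): mirror RX+TX of port 0,
         * queue 0 into a caller-provided ring and mempool (both MP/MC) */
        ring = rte_ring_create("pdump_ring", 16384, rte_socket_id(), 0);
        mp = rte_pktmbuf_pool_create("pdump_pool", 32768, 256, 0,
                                     2176, rte_socket_id());
        if (ring == NULL || mp == NULL)
            return -1;

        return rte_pdump_enable(0, 0, RTE_PDUMP_FLAG_RXTX, ring, mp, NULL);
    }

    /* teardown: rte_pdump_disable(0, 0, RTE_PDUMP_FLAG_RXTX);
     *           rte_pdump_uninit(); */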
2)app/pdump tool
================
Tool app/pdump is designed based on librte_pdump for packet capturing in DPDK.
This tool by default runs as secondary process, and provides the support for
the command line options for packet capture.
./build/app/dpdk_pdump --
--pdump '(port=<port id> | device_id=<pci id or vdev name>),
(queue=<queue id>),
(rx-dev=<iface or pcap file> |
tx-dev=<iface or pcap file>),
[ring-size=<ring size>],
[mbuf-size=<mbuf data size>],
[total-num-mbufs=<number of mbufs>]'
Parameters inside the parentheses represent the mandatory parameters.
Parameters inside the square brackets represent optional parameters.
The user has to pass the packet capture parameters under the --pdump parameter; multiple
--pdump instances can be passed to capture packets on different port and queue combinations.
Operation:
----------
*Tool parses the user command line arguments,
creates the mempool, ring and the PCAP PMD vdev with 'tx_stream' as either
of the devices passed in the rx-dev|tx-dev parameters.
*Then calls the APIs of librte_pdump i.e. rte_pdump_enable()/rte_pdump_enable_by_deviceid()
to enable packet capturing on a specific port/device_id and queue by passing on
port|device_id, queue, mempool and ring info.
*Tool runs in a while loop to dequeue the packets from the ring and write them to the pcap device.
*Tool can be stopped using SIGINT, upon which tool calls
rte_pdump_disable()/rte_pdump_disable_by_deviceid() and free the allocated resources.
Note:
CONFIG_RTE_LIBRTE_PMD_PCAP flag should be set to yes to compile and run the pdump tool.
3)Test-pmd changes
==================
Changes are done to test-pmd application to initialize/uninitialize the packet capture framework.
So app/pdump tool can be run to see packets of dpdk ports that are used by test-pmd.
Similarly any application which needs packet capture should call initialize/uninitialize APIs of
librte_pdump and use pdump tool to start the capture.
4)Packet capture flow between pdump tool and librte_pdump
=========================================================
* Pdump tool (Secondary process) requests packet capture
for specific port|device_id and queue combinations.
*Library in secondary process context creates client socket and communicates
the port|device_id, queue, ring and mempool to server.
*Library initializes server in primary process 'test-pmd' context and server serves
the client request to enable Ethernet rxtx call-backs for a given port|device_id and queue.
*Copy the rx/tx packets to passed mempool and enqueue the packets to ring for secondary process.
*Pdump tool will dequeue the packets from ring and writes them to PCAPMD vdev,
so ultimately packets will be seen on the device that is passed in rx-dev|tx-dev.
*Once the pdump tool is terminated with SIGINT it will disable the packet capturing.
*Library receives the disable packet capture request, communicate the info to server,
server will remove the Ethernet rxtx call-backs.
*Packet capture can be seen using tcpdump command
"tcpdump -ni <iface>" (or) "tcpdump –nr <pcapfile>"
5)Example command line
======================
./build/app/dpdk_pdump -- --pdump 'device_id=0000:02:0.0,queue=*,tx-dev=/tmp/dt-file.pcap,rx-dev=/tmp/dr-file.pcap,ring-size=8192,mbuf-size=2176,total-num-mbufs=32768' --pdump 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/d-file.pcap,tx-dev=/tmp/d-file.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'
v9:
added a support in rte_pdump_set_socket_dir() to set server and client socket paths
==> http://dpdk.org/dev/patchwork/patch/13450/
updated the documentation for the new changes.
updated the commit messages.
v8:
added server socket argument to rte_pdump_init() API ==> http://dpdk.org/dev/patchwork/patch/13402/
added rte_pdump_set_socket_dir() API.
updated documentation for new changes.
v7:
fixed lines over 90 characters.
v6:
removed below deprecation notice patch from patch set.
http://dpdk.org/dev/patchwork/patch/13372/
v5:
addressed code review comments for below patches
http://dpdk.org/dev/patchwork/patch/12955/
http://dpdk.org/dev/patchwork/patch/12951/
v4:
added missing deprecation notice for ABI changes of rte_eth_dev_info structure.
made doc changes as per doc guidelines.
replaced rte_eal_vdev_init with rte_eth_dev_attach in pdump tool.
removed rxtx-dev parameter from pdump tool command line.
v3:
app/pdump: Moved cleanup code from signal handler to main.
divided librte_ether changes into multiple patches.
example command changed in app/pdump application guide
v2:
fix compilation issues for 4.8.3
fix unnecessary #includes
Reshma Pattan (8):
ethdev: use locks to protect Rx/Tx callback lists
ethdev: add new api to add Rx callback as head of the list
ethdev: add new fields to ethdev info struct
ethdev: make get port by name and get name by port public
pdump: add new library for packet capturing support
app/pdump: add pdump tool for packet capturing
app/testpmd: add pdump initialization uninitialization
doc: update doc for packet capture framework
MAINTAINERS | 8 +
app/Makefile | 1 +
app/pdump/Makefile | 45 ++
app/pdump/main.c | 844 +++++++++++++++++++++++++++++
app/test-pmd/testpmd.c | 6 +
config/common_base | 5 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 119 +++++
doc/guides/rel_notes/release_16_07.rst | 13 +
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 +++++
lib/Makefile | 1 +
lib/librte_ether/rte_ethdev.c | 123 +++--
lib/librte_ether/rte_ethdev.h | 60 +++
lib/librte_ether/rte_ether_version.map | 9 +
lib/librte_pdump/Makefile | 55 ++
lib/librte_pdump/rte_pdump.c | 913 ++++++++++++++++++++++++++++++++
lib/librte_pdump/rte_pdump.h | 216 ++++++++
lib/librte_pdump/rte_pdump_version.map | 13 +
mk/rte.app.mk | 1 +
20 files changed, 2512 insertions(+), 44 deletions(-)
create mode 100644 app/pdump/Makefile
create mode 100644 app/pdump/main.c
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
create mode 100644 lib/librte_pdump/Makefile
create mode 100644 lib/librte_pdump/rte_pdump.c
create mode 100644 lib/librte_pdump/rte_pdump.h
create mode 100644 lib/librte_pdump/rte_pdump_version.map
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
--
2.5.0
^ permalink raw reply [relevance 2%]
* [dpdk-dev] [PATCH v9 8/8] doc: update doc for packet capture framework
2016-06-14 9:38 2% ` [dpdk-dev] [PATCH v9 " Reshma Pattan
@ 2016-06-14 9:38 5% ` Reshma Pattan
2016-06-14 20:41 3% ` Thomas Monjalon
2016-06-15 14:06 2% ` [dpdk-dev] [PATCH v10 0/7] add " Reshma Pattan
1 sibling, 1 reply; 200+ results
From: Reshma Pattan @ 2016-06-14 9:38 UTC (permalink / raw)
To: dev; +Cc: Reshma Pattan
Added programmers guide for librte_pdump.
Added sample application guide for app/pdump application.
Updated release note for packet capture framework changes.
Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
Acked-by: John McNamara <john.mcnamara@intel.com>
---
MAINTAINERS | 3 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 119 +++++++++++++++++++++++++++++++
doc/guides/rel_notes/release_16_07.rst | 13 ++++
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 ++++++++++++++++++++++++++++++++
6 files changed, 259 insertions(+)
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index c46cf86..9a84f59 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -437,6 +437,9 @@ Pdump
M: Reshma Pattan <reshma.pattan@intel.com>
F: lib/librte_pdump/
F: app/pdump/
+F: doc/guides/prog_guide/pdump_library.rst
+F: doc/guides/sample_app_ug/pdump.rst
+
Hierarchical scheduler
M: Cristian Dumitrescu <cristian.dumitrescu@intel.com>
diff --git a/doc/guides/prog_guide/index.rst b/doc/guides/prog_guide/index.rst
index b862d0c..4caf969 100644
--- a/doc/guides/prog_guide/index.rst
+++ b/doc/guides/prog_guide/index.rst
@@ -71,6 +71,7 @@ Programmer's Guide
writing_efficient_code
profile_app
glossary
+ pdump_library
**Figures**
diff --git a/doc/guides/prog_guide/pdump_library.rst b/doc/guides/prog_guide/pdump_library.rst
new file mode 100644
index 0000000..8781ffb
--- /dev/null
+++ b/doc/guides/prog_guide/pdump_library.rst
@@ -0,0 +1,119 @@
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+.. _pdump_library:
+
+The librte_pdump Library
+========================
+
+The ``librte_pdump`` library provides a framework for packet capturing in DPDK.
+The library provides the following APIs to initialize the packet capture framework, to enable
+or disable the packet capture, and to uninitialize it:
+
+* ``rte_pdump_init()``:
+ This API initializes the packet capture framework.
+
+* ``rte_pdump_enable()``:
+ This API enables the packet capture on a given port and queue.
+ Note: The filter option in the API is a place holder for future enhancements.
+
+* ``rte_pdump_enable_by_deviceid()``:
+ This API enables the packet capture on a given device id (``vdev name or pci address``) and queue.
+ Note: The filter option in the API is a place holder for future enhancements.
+
+* ``rte_pdump_disable()``:
+ This API disables the packet capture on a given port and queue.
+
+* ``rte_pdump_disable_by_deviceid()``:
+ This API disables the packet capture on a given device id (``vdev name or pci address``) and queue.
+
+* ``rte_pdump_uninit()``:
+ This API uninitializes the packet capture framework.
+
+* ``rte_pdump_set_socket_dir()``:
+ This API sets the server and client socket paths.
+ Note: This API is not thread-safe.
+
+
+Operation
+---------
+
+The ``librte_pdump`` library works on a client/server model. The server is responsible for enabling or
+disabling the packet capture and the clients are responsible for requesting the enabling or disabling of
+the packet capture.
+
+The packet capture framework, as part of its initialization, creates the pthread and the server socket in
+the pthread. The application that calls the framework initialization will have the server socket created,
+either under the path that the application has passed or under the default path i.e. either ``/var/run`` for
+root user or ``$HOME`` for non root user.
+
+Applications that request enabling or disabling of the packet capture will have the client socket created either under
+the path that the application has passed or under the default path i.e. either ``/var/run/`` for root user or ``$HOME``
+for not root user to send the requests to the server.
+The server socket will listen for client requests for enabling or disabling the packet capture.
+
+
+Implementation Details
+----------------------
+
+The library API ``rte_pdump_init()``, initializes the packet capture framework by creating the pthread and the server
+socket. The server socket in the pthread context will be listening to the client requests to enable or disable the
+packet capture.
+
+The library APIs ``rte_pdump_enable()`` and ``rte_pdump_enable_by_deviceid()`` enables the packet capture.
+On each call to these APIs, the library creates a separate client socket, creates the "pdump enable" request and sends
+the request to the server. The server that is listening on the socket will take the request and enable the packet capture
+by registering the Ethernet RX and TX callbacks for the given port or device_id and queue combinations.
+Then the server will mirror the packets to the new mempool and enqueue them to the rte_ring that clients have passed
+to these APIs. The server also sends the response back to the client about the status of the request that was processed.
+After the response is received from the server, the client socket is closed.
+
+The library APIs ``rte_pdump_disable()`` and ``rte_pdump_disable_by_deviceid()`` disables the packet capture.
+On each call to these APIs, the library creates a separate client socket, creates the "pdump disable" request and sends
+the request to the server. The server that is listening on the socket will take the request and disable the packet
+capture by removing the Ethernet RX and TX callbacks for the given port or device_id and queue combinations. The server
+also sends the response back to the client about the status of the request that was processed. After the response is
+received from the server, the client socket is closed.
+
+The library API ``rte_pdump_uninit()``, uninitializes the packet capture framework by closing the pthread and the
+server socket.
+
+The library API ``rte_pdump_set_socket_dir()``, sets the given path as either server socket path
+or client socket path based on the ``type`` argument of the API.
+If the given path is ``NULL``, default path will be selected, i.e. either ``/var/run/`` for root user or ``$HOME``
+for non root user. Clients also need to call this API to set their server socket path if the server socket
+path is different from default path.
+
+
+Use Case: Packet Capturing
+--------------------------
+
+The DPDK ``app/pdump`` tool is developed based on this library to capture packets in DPDK.
+Users can use this as an example to develop their own packet capturing application.
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index c0f6b02..a4de2a2 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -66,6 +66,11 @@ New Features
* Enable RSS per network interface through the configuration file.
* Streamline the CLI code.
+* **Added packet capture framework.**
+
+ * A new library ``librte_pdump`` is added to provide packet capture APIs.
+ * A new ``app/pdump`` tool is added to capture packets in DPDK.
+
Resolved Issues
---------------
@@ -135,6 +140,11 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* Function ``rte_eth_dev_get_port_by_name`` changed to a public API.
+
+* Function ``rte_eth_dev_info_get`` updated to return new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ in the ``rte_eth_dev_info`` object.
+
ABI Changes
-----------
@@ -146,6 +156,9 @@ ABI Changes
* The ``rte_port_source_params`` structure has new fields to support PCAP file.
It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
+* The ``rte_eth_dev_info`` structure has new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ to support number of queues configured by software.
+
Shared Library Versions
-----------------------
diff --git a/doc/guides/sample_app_ug/index.rst b/doc/guides/sample_app_ug/index.rst
index 930f68c..96bb317 100644
--- a/doc/guides/sample_app_ug/index.rst
+++ b/doc/guides/sample_app_ug/index.rst
@@ -76,6 +76,7 @@ Sample Applications User Guide
ptpclient
performance_thread
ipsec_secgw
+ pdump
**Figures**
diff --git a/doc/guides/sample_app_ug/pdump.rst b/doc/guides/sample_app_ug/pdump.rst
new file mode 100644
index 0000000..96c8709
--- /dev/null
+++ b/doc/guides/sample_app_ug/pdump.rst
@@ -0,0 +1,122 @@
+
+.. BSD LICENSE
+ Copyright(c) 2016 Intel Corporation. All rights reserved.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+dpdk_pdump Application
+======================
+
+The ``dpdk_pdump`` application is a Data Plane Development Kit (DPDK) application that runs as a DPDK secondary process and
+is capable of enabling packet capture on dpdk ports.
+
+
+Running the Application
+-----------------------
+
+The application has a ``--pdump`` command line option with various sub arguments:
+
+.. code-block:: console
+
+ ./build/app/dpdk_pdump --
+ --pdump '(port=<port id> | device_id=<pci id or vdev name>),
+ (queue=<queue_id>),
+ (rx-dev=<iface or pcap file> |
+ tx-dev=<iface or pcap file>),
+ [ring-size=<ring size>],
+ [mbuf-size=<mbuf data size>],
+ [total-num-mbufs=<number of mbufs>]'
+
+Note:
+
+* Parameters inside the parentheses represent mandatory parameters.
+
+* Parameters inside the square brackets represent optional parameters.
+
+Multiple instances of ``--pdump`` can be passed to capture packets on different port and queue combinations.
+
+
+Parameters
+~~~~~~~~~~
+
+``port``:
+Port id of the eth device on which packets should be captured.
+
+``device_id``:
+PCI address (or) name of the eth device on which packets should be captured.
+
+ .. Note::
+
+ * As of now the ``dpdk_pdump`` tool cannot capture the packets of virtual devices
+ in the primary process due to a bug in the ethdev library. Due to this bug, in a multi process context,
+ when the primary and secondary have different ports set, then the secondary process
+ (here the ``dpdk_pdump`` tool) overwrites the ``rte_eth_devices[]`` entries of the primary process.
+
+``queue``:
+Queue id of the eth device on which packets should be captured. The user can pass a queue value of ``*`` to enable
+packet capture on all queues of the eth device.
+
+``rx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+``tx-dev``:
+Can be either a pcap file name or any Linux iface.
+
+ .. Note::
+
+ * To receive ingress packets only, ``rx-dev`` should be passed.
+
+ * To receive egress packets only, ``tx-dev`` should be passed.
+
+ * To receive ingress and egress packets separately, ``rx-dev`` and ``tx-dev``
+ should both be passed with different file names or different Linux iface names.
+
+ * To receive ingress and egress packets together, ``rx-dev`` and ``tx-dev``
+ should both be passed with the same file name or the same Linux iface name.
+
+``ring-size``:
+Size of the ring. This value is used internally for ring creation. The ring will be used to enqueue the packets from
+the primary application to the secondary. This is an optional parameter with default size 16384.
+
+``mbuf-size``:
+Size of the mbuf data. This is used internally for mempool creation. Ideally this value must be the same as
+the primary application's mempool's mbuf data size, which is used for packet RX. This is an optional parameter with
+default size 2176.
+
+``total-num-mbufs``:
+Total number of mbufs in the mempool. This is used internally for mempool creation. This is an optional parameter with default
+value 65535.
+
+
+Example
+-------
+
+.. code-block:: console
+
+ $ sudo ./build/app/dpdk_pdump -- --pdump 'port=0,queue=*,rx-dev=/tmp/rx.pcap'
--
2.5.0
^ permalink raw reply [relevance 5%]
* [dpdk-dev] [PATCH v10 0/3] mempool: add external mempool manager
2016-06-10 15:16 2% ` [dpdk-dev] [PATCH v9 0/3] " David Hunt
@ 2016-06-14 9:46 2% ` David Hunt
2016-06-14 15:48 3% ` [dpdk-dev] [PATCH v11 " David Hunt
0 siblings, 1 reply; 200+ results
From: David Hunt @ 2016-06-14 9:46 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain
Here's the latest version of the External Mempool Manager patchset.
It's re-based on top of the latest head as of 14/6/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v10 changes:
* changed the _put/_get op names to _enqueue/_dequeue to be consistent
with the function names
* some rte_errno cleanup
* comment tweaks about when to set pool_data
* removed an un-needed check for ops->alloc == NULL
v9 changes:
* added a check for NULL alloc in rte_mempool_ops_register
* rte_mempool_alloc_t now returns int instead of void*
* fixed some comment typo's
* removed some unneeded typecasts
* changed a return NULL to return -EEXIST in rte_mempool_ops_register
* fixed rte_mempool_version.map file so builds ok as shared libs
* moved flags check from rte_mempool_create_empty to rte_mempool_create
v8 changes:
* merged first three patches in the series into one.
* changed parameters to ops callback to all be rte_mempool pointer
rather than than pointer to opaque data or uint64.
* comment fixes.
* fixed parameter to _free function (was inconsistent).
* changed MEMPOOL_F_RING_CREATED to MEMPOOL_F_POOL_CREATED
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed hander_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_handler.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments, duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies to have one header file (rte_mempool.h), or it
would have generate cross dependencies issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file than standard mempool tests,
avoiding to duplicate the code
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degredation on sp cached operation
* removed stack hanler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The External Mempool Manager is an extension to the mempool API that allows
users to add and use an external mempool manager, which allows external memory
subsystems such as external hardware memory management systems and software
based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool manager will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing. These changes are all contained within
RTE_NEXT_ABI defs, and the current or next code can be selected with
the CONFIG_RTE_NEXT_ABI config setting.
There are two aspects to the external mempool manager.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the REGISTER_MEMPOOL_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool,
using the name parameter to identify which ops to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname() which sets the mempool's ops (functions)
3. The rte_mempool_populate_default() and rte_mempool_populate_anon() functions,
which populate the mempool using the relevant ops
Several external mempool managers may be used in the same application. A new
mempool can then be created by using the new 'create' function, providing the
mempool ops struct name to point the mempool to the relevant mempool manager
callback structure.
The old 'create' function can still be called by legacy programs, and will
internally work out the mempool handle based on the flags provided (single
producer, single consumer, etc). By default handles are created internally to
implement the built-in DPDK mempool manager and mempool types.
The external mempool manager needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. enqueue - puts an object back into the mempool once an application has
finished with it
3. dequeue - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time an enqueue/dequeue/get_count is called from the application/PMD,
the callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
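
Putting the new calls together, creating a mempool with custom ops would
look roughly like this (a sketch using the two-argument
rte_mempool_set_ops_byname() shown above; "my_ops" is a hypothetical
registered ops name):

    static struct rte_mempool *
    create_pool_with_custom_ops(void)
    {
        struct rte_mempool *mp;

        /* 1. create an empty mempool (no memory populated yet) */
        mp = rte_mempool_create_empty("my_pool", 8192, 2048, 256, 0,
                                      rte_socket_id(), 0);
        if (mp == NULL)
            return NULL;

        /* 2. select the ops before populating, so the custom alloc
         * callback is the one that backs the pool */
        if (rte_mempool_set_ops_byname(mp, "my_ops") < 0)
            return NULL;

        /* 3. populate using the selected ops */
        if (rte_mempool_populate_default(mp) < 0)
            return NULL;

        return mp;
    }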
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the ops index.
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = rte_mempool_common_ring_alloc,
.enqueue = common_ring_sp_enqueue,
.dequeue = common_ring_mc_dequeue,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures:
MEMPOOL_REGISTER_OPS(ops_sp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool manager using simple mallocs
for each mempool object. This file also contains the callbacks and the
self-registration for the new handler.
David Hunt (2):
mempool: support external mempool operations
mbuf: make default mempool ops configurable at build
Olivier Matz (1):
app/test: test external mempool manager
^ permalink raw reply [relevance 2%]
* Re: [dpdk-dev] [PATCH v3 00/20] vhost ABI/API refactoring
2016-06-07 3:51 9% ` [dpdk-dev] [PATCH v3 00/20] " Yuanhan Liu
` (4 preceding siblings ...)
2016-06-07 3:52 6% ` [dpdk-dev] [PATCH v3 18/20] examples/tep_term: adapt to new vhost ABI/API changes Yuanhan Liu
@ 2016-06-14 12:00 4% ` Yuanhan Liu
2016-06-30 7:39 9% ` Panu Matilainen
6 siblings, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-06-14 12:00 UTC (permalink / raw)
To: dev
Cc: huawei.xie, Thomas Monjalon, Panu Matilainen, Traynor Kevin,
Rich Lane, Tetsuya Mukawa
Applied to dpdk-next-virtio.
--yliu
On Tue, Jun 07, 2016 at 11:51:50AM +0800, Yuanhan Liu wrote:
> v3: - adapted the new vhost ABI/API changes to the tep_term example, to make
>       sure at least the build is not broken.
>     - bumped the ABI version to 3
>
> NOTE: I created a branch at dpdk.org [0] for more convenient testing:
>
> [0]: git://dpdk.org/next/dpdk-next-virtio for-testing
>
>
> Every time we introduce a new feature to vhost, we are likely to break the
> ABI. Moreover, some cleanups (such as the one from Ilya to remove vec_buf
> from the vhost_virtqueue struct) also break the ABI.
>
> This patch set is meant to resolve the above issue ultimately, by hiding the
> virtio_net structure (as well as a few others) internally, and exporting the
> virtio_net dev struct to applications as a number, vid, much like the way the
> kernel exposes an fd to user space.
>
> Back to the patch set: the first part of this set makes some changes to the
> vhost example, vhost-pmd and vhost, bit by bit, to remove the dependency
> on the "virtio_net" struct. It then makes the final change to adapt the
> current APIs to using "vid".
>
> After that, "vrtio_net_device_ops" is the only left open struct that an
> application can acces, therefore, it's the only place that might introduce
> potential ABI breakage in future for extension. Hence, I made few more
> (5) space reservation, to make sure we will not break ABI for a long time,
> and hopefuly, forever.
>
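As a rough sketch of the resulting programming model, an application callback
now works purely with the vid; the getters below are the ones added in this
series, while the surrounding code and buffer size are illustrative:

    #include <stdio.h>
    #include <rte_virtio_net.h>

    static int
    new_device(int vid)   /* callbacks now receive a vid, not a struct */
    {
        char ifname[64];  /* buffer size is an illustrative choice */

        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
        printf("device %d up: ifname %s, %u queues, numa node %d\n",
               vid, ifname, rte_vhost_get_queue_num(vid),
               rte_vhost_get_numa_node(vid));
        return 0;
    }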
> The last bit of this patch set is some cleanups, including the one from
> Ilya.
>
> v2: - exported ifname as well to fix a vhost-pmd issue reported by Rich
> - separated the big patch that introduces several new APIs into some
> small patches.
> - updated release note
> - updated version.map
>
> Thanks.
>
> --yliu
>
> ---
> Ilya Maximets (1):
> vhost: make buf vector for scatter Rx local
>
> Yuanhan Liu (19):
> vhost: declare backend with int type
> vhost: set/reset dev flags internally
> vhost: declare device fh as int
> examples/vhost: make a copy of virtio device id
> vhost: rename device fh to vid
> vhost: get device by vid only
> vhost: move vhost device ctx to cuse
> vhost: introduce new API to export numa node
> vhost: introduce new API to export number of queues
> vhost: introduce new API to export ifname
> vhost: introduce new API to export queue free entries
> vhost: remove dependency on priv field
> vhost: export vid as the only interface to applications
> vhost: hide internal structs/macros/functions
> vhost: remove unnecessary fields
> vhost: remove virtio-net.h
> vhost: reserve few more space for future extension
> examples/tep_term: adapt to new vhost ABI/API changes
> vhost: per device virtio net header len
>
> doc/guides/rel_notes/release_16_07.rst | 11 +-
> drivers/net/vhost/rte_eth_vhost.c | 79 ++++-----
> examples/tep_termination/main.c | 83 +++++-----
> examples/tep_termination/main.h | 5 +-
> examples/tep_termination/vxlan_setup.c | 20 +--
> examples/tep_termination/vxlan_setup.h | 6 +-
> examples/vhost/main.c | 116 +++++++------
> examples/vhost/main.h | 3 +-
> lib/librte_vhost/Makefile | 2 +-
> lib/librte_vhost/rte_vhost_version.map | 10 ++
> lib/librte_vhost/rte_virtio_net.h | 223 +++++++------------------
> lib/librte_vhost/vhost-net.h | 201 ++++++++++++++++++----
> lib/librte_vhost/vhost_cuse/vhost-net-cdev.c | 83 +++++-----
> lib/librte_vhost/vhost_cuse/virtio-net-cdev.c | 30 ++--
> lib/librte_vhost/vhost_cuse/virtio-net-cdev.h | 12 +-
> lib/librte_vhost/vhost_rxtx.c | 133 ++++++++-------
> lib/librte_vhost/vhost_user/vhost-net-user.c | 53 +++---
> lib/librte_vhost/vhost_user/vhost-net-user.h | 2 +
> lib/librte_vhost/vhost_user/virtio-net-user.c | 64 +++----
> lib/librte_vhost/vhost_user/virtio-net-user.h | 18 +-
> lib/librte_vhost/virtio-net.c | 229 +++++++++++++++++---------
> lib/librte_vhost/virtio-net.h | 43 -----
> 22 files changed, 752 insertions(+), 674 deletions(-)
> delete mode 100644 lib/librte_vhost/virtio-net.h
>
> --
> 1.9.0
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [PATCH v3 0/6] vhost: add vhost-user client mode and reconnect ability
2016-06-07 4:05 3% ` [dpdk-dev] [PATCH v3 " Yuanhan Liu
@ 2016-06-14 12:00 0% ` Yuanhan Liu
0 siblings, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-06-14 12:00 UTC (permalink / raw)
To: dev; +Cc: huawei.xie, Traynor Kevin, marcandre.lureau
Applied to dpdk-next-virtio.
--yliu
On Tue, Jun 07, 2016 at 12:05:02PM +0800, Yuanhan Liu wrote:
> v3: - make the "reconnect" feature the default for client mode, as it's
>       good for handling guest OS restarts with less effort.
>     - fix a var-not-initialized error pointed out by Rich
>
>
> NOTE: I created a branch at dpdk.org [0] for more convenient testing:
>
> [0]: git://dpdk.org/next/dpdk-next-virtio for-testing
>
>
> When the DPDK vhost-user application (such as OVS) restarts (due to
> crash, or update), the vhost-user connection between DPDK and QEMU
> won't be established automatically again. In other words, the virtio
> net device is broken.
>
> The reason it doesn't work is that DPDK acts as the server only.
> A restart of the server needs a reconnection from the client (QEMU).
> However, reconnecting is not supported by QEMU.
>
> Adding support for client mode and letting DPDK be the client resolves
> the above issue more easily: a restart of DPDK will naturally
> try to connect to the server (QEMU) automatically.
>
> Therefore, this patchset implements the DPDK vhost-user client mode, by
> introducing a new arg (flags) for the API rte_vhost_driver_register(). The
> client mode is enabled when RTE_VHOST_USER_CLIENT is given. Note that this
> implies an API breakage. However, since this release deals with ABI/API
> refactoring, it should not be an issue.
>
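A minimal sketch of what registration looks like with the new flag (the
socket path is an illustrative choice; in client mode, reconnection is
handled internally by the library):

    #include <rte_virtio_net.h>

    static int
    register_vhost_client(void)
    {
        /* DPDK connects to the QEMU-side server at this path */
        return rte_vhost_driver_register("/tmp/vhost-user.sock",
                                         RTE_VHOST_USER_CLIENT);
    }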
> Another interesting thing needed to make it work is that you not only have
> to consider the case where the DPDK vhost-user app might restart, but also
> that QEMU might restart as well: the guest OS sometimes
> just reboots. In such a case, when the server is down, the client has
> to keep reconnecting to the server until the server is back and the
> connection is established again. That's what the "reconnect" patch is for.
>
> Note that current QEMU does not support a second connection
> from a client, thus a restart of DPDK vhost-user will not work. This is
> because current QEMU won't be able to detect the disconnect caused by the
> restart, thus it will not listen for later connections. Patches [1] have
> been sent; they are just not merged yet. But unlike the vhost-user multiple
> queue case, where we had a critical dependency on the QEMU implementation,
> here we have no such dependency; therefore, I think it's okay to make DPDK
> ready for the "reconnect" stuff first. (Note that I also mentioned
> this fact in the release doc.)
>
> [1]: http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg01507.html
>
> v2: - added release doc
> - do not remove socket file for the client mode
> - create one thread ony to handle all reconnects
>
>
> Thanks.
> --yliu
>
> ---
> Yuanhan Liu (6):
> vhost: rename structs for enabling client mode
> vhost: add vhost-user client mode
> vhost: add reconnect ability
> vhost: workaround stale vring base
> examples/vhost: add client option
> vhost: add pmd client option
>
> doc/guides/rel_notes/release_16_07.rst | 21 ++
> drivers/net/vhost/rte_eth_vhost.c | 38 ++-
> examples/vhost/main.c | 12 +-
> lib/librte_vhost/rte_virtio_net.h | 12 +-
> lib/librte_vhost/vhost_cuse/vhost-net-cdev.c | 8 +-
> lib/librte_vhost/vhost_user/vhost-net-user.c | 403 ++++++++++++++++++---------
> lib/librte_vhost/vhost_user/vhost-net-user.h | 6 -
> lib/librte_vhost/virtio-net.c | 9 +
> 8 files changed, 361 insertions(+), 148 deletions(-)
>
> --
> 1.9.0
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH 0/3] [RFC] vhost: micro vhost optimization
@ 2016-06-14 12:42 0% ` Yuanhan Liu
0 siblings, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-06-14 12:42 UTC (permalink / raw)
To: dev; +Cc: huawei.xie
Applied to dpdk-next-virtio.
--yliu
On Mon, May 02, 2016 at 05:46:15PM -0700, Yuanhan Liu wrote:
> Here is a small patch set that does some micro optimization, which brings
> about a 10% performance boost in my 64B packet testing, with the following topo:
>
> pkt generator <----> NIC <-----> Virtio NIC
>
> Patch 1 pre-updates the used ring and updates the entries in batch. It should
> be feasible from my understanding: there will be no issue, as the guest driver
> will not start processing the entries as long as we haven't updated "used->idx"
> yet. I could have missed something though. (See the conceptual sketch below.)
>
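For reference, the batching idea can be sketched as follows; this is a
conceptual illustration only, not the patch code, and it assumes the internal
vhost_virtqueue layout from lib/librte_vhost/vhost-net.h plus the standard
virtio used-ring structs:

    #include <rte_atomic.h>   /* rte_smp_wmb() */

    static void
    publish_used_batch(struct vhost_virtqueue *vq, const uint32_t *desc_ids,
                       const uint32_t *lens, uint32_t count)
    {
        uint16_t used_idx = vq->used->idx;
        uint32_t i;

        /* fill all used entries first; the guest ignores them until
         * used->idx moves */
        for (i = 0; i < count; i++) {
            uint16_t slot = (used_idx + i) & (vq->size - 1);
            vq->used->ring[slot].id  = desc_ids[i];
            vq->used->ring[slot].len = lens[i];
        }

        rte_smp_wmb();  /* entries must be visible before the index update */
        vq->used->idx = (uint16_t)(used_idx + count);  /* publish the batch */
    }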
> Patch 2 saves one check for small packets (those that can be held in one
> desc buf and mbuf).
>
> Patch 3 moves several frequently used fields into one cache line, for
> better cache sharing.
>
> Note that this patch set is based on my latest vhost ABI refactoring patchset.
>
>
> ---
> Yuanhan Liu (3):
> vhost: pre update used ring for Tx and Rx
> vhost: optimize dequeue for small packets
> vhost: arrange virtio_net fields for better cache sharing
>
> lib/librte_vhost/vhost-net.h | 8 +--
> lib/librte_vhost/vhost_rxtx.c | 110 ++++++++++++++++++++++++------------------
> 2 files changed, 68 insertions(+), 50 deletions(-)
>
> --
> 1.9.0
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH] vhost: remove internal lockless enqueue
@ 2016-06-14 14:07 3% ` Yuanhan Liu
0 siblings, 0 replies; 200+ results
From: Yuanhan Liu @ 2016-06-14 14:07 UTC (permalink / raw)
To: Huawei Xie
Cc: dev, yuanhan.liu, jianfeng.tan, mukawa, kevin.traynor, haifeng.lin
Firstly, it's V2. So don't forget to add the version number and link it to
the previous version next time.
On Mon, Jun 13, 2016 at 07:52:12PM +0800, Huawei Xie wrote:
> All other DPDK PMDs don't support concurrent receiving or sending
> packets to the same queue. The upper application should deal with
> this, normally through queue and core bindings.
>
> Due to historical reasons, vhost internally supports concurrent lockless
> enqueuing of packets to the same virtio queue through a costly cmpset operation.
> This patch removes this internal lockless implementation and should improve
> performance a bit.
>
> Luckily DPDK OVS doesn't rely on this behavior.
>
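For reference, the application-level serialization mentioned above can be
sketched as follows (an illustrative pattern, since PMD queues themselves
are not thread-safe; the lock placement is an assumption):

    #include <rte_spinlock.h>
    #include <rte_ethdev.h>

    static rte_spinlock_t tx_lock = RTE_SPINLOCK_INITIALIZER;

    /* serialize two lcores that must share one TX queue */
    static uint16_t
    locked_tx_burst(uint8_t port_id, uint16_t queue_id,
                    struct rte_mbuf **pkts, uint16_t n)
    {
        uint16_t sent;

        rte_spinlock_lock(&tx_lock);
        sent = rte_eth_tx_burst(port_id, queue_id, pkts, n);
        rte_spinlock_unlock(&tx_lock);
        return sent;
    }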
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
Applied to dpdk-next-virtio, with the rebase on top of my vhost ABI/API
changes.
FYI, I also renamed the title a bit to "remove concurrent enqueue" since, as
Thomas mentioned in the last version, it's confusing to say "remove
lockless" here, given that we are actually removing the lock.
Thanks.
--yliu
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v11 0/3] mempool: add external mempool manager
2016-06-14 9:46 2% ` [dpdk-dev] [PATCH v10 " David Hunt
@ 2016-06-14 15:48 3% ` David Hunt
2016-06-14 15:48 1% ` [dpdk-dev] [PATCH v11 1/3] mempool: support external mempool operations David Hunt
2016-06-15 7:47 3% ` [dpdk-dev] [PATCH v12 0/3] mempool: add external mempool manager David Hunt
0 siblings, 2 replies; 200+ results
From: David Hunt @ 2016-06-14 15:48 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain
Here's the latest version of the External Mempool Manager patchset.
It's re-based on top of the latest head as of 14/6/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v11 changes:
* Fixed comments (added '.' where needed for consistency)
* removed ABI breakage notice for mempool manager in deprecation.rst
* Added description of the external mempool manager functionality to
doc/guides/prog_guide/mempool_lib.rst (John Mc reviewed)
* renamed rte_mempool_default.c to rte_mempool_ring.c
* Kept the v10 ACK from Shreyansh and Olivier for v11
v10 changes:
* changed the _put/_get op names to _enqueue/_dequeue to be consistent
with the function names
* some rte_errno cleanup
* comment tweaks about when to set pool_data
* removed an un-needed check for ops->alloc == NULL
v9 changes:
* added a check for NULL alloc in rte_mempool_ops_register
* rte_mempool_alloc_t now returns int instead of void*
* fixed some comment typos
* removed some unneeded typecasts
* changed a return NULL to return -EEXIST in rte_mempool_ops_register
* fixed rte_mempool_version.map file so builds ok as shared libs
* moved flags check from rte_mempool_create_empty to rte_mempool_create
v8 changes:
* merged first three patches in the series into one.
* changed parameters to ops callback to all be rte_mempool pointer
rather than pointer to opaque data or uint64.
* comment fixes.
* fixed parameter to _free function (was inconsistent).
* changed MEMPOOL_F_RING_CREATED to MEMPOOL_F_POOL_CREATED
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed handler_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_handler.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurrences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments, duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies having one header file (rte_mempool.h), or it
would have generated cross-dependency issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file as the standard mempool tests,
avoiding duplication of the code
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degradation on sp cached operation
* removed stack handler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The External Mempool Manager is an extension to the mempool API that allows
users to add and use an external mempool manager, enabling external memory
subsystems, such as external hardware memory management systems and software
based memory allocators, to be used with DPDK.
The existing API to the internal DPDK mempool manager will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing. These changes are all contained within
RTE_NEXT_ABI defines, and the current or next code can be selected with
the CONFIG_RTE_NEXT_ABI config setting.
There are two aspects to the external mempool manager.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the MEMPOOL_REGISTER_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool,
using the name parameter to identify which ops to use.
New API calls added:
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname(), which sets the mempool's ops (functions)
3. New rte_mempool_populate_default() and rte_mempool_populate_anon() functions,
which populate the mempool using the relevant ops
Several external mempool managers may be used in the same application. A new
mempool can then be created by using the new rte_mempool_create_empty function,
then calling rte_mempool_set_ops_byname to point the mempool to the relevant
mempool manager callback structure.
Legacy applications will continue to use the old rte_mempool_create API call,
which uses a ring-based mempool manager by default. These applications
will need to be modified if they are to use an external mempool manager.
The external mempool manager needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. enqueue - puts an object back into the mempool once an application has
finished with it
3. dequeue - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time an enqueue/dequeue/get_count is called from the application/PMD,
the callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the ops index.
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = rte_mempool_common_ring_alloc,
.enqueue = common_ring_sp_enqueue,
.dequeue = common_ring_mc_dequeue,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures:
MEMPOOL_REGISTER_OPS(ops_sp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool manager using simple mallocs
for each mempool object. This file also contains the callbacks and the
self-registration for the new handler.
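As a starting point, a skeleton of such a custom ops struct and its
registration might look like the following; the function bodies are stubs
to be filled in by the external allocator, and the handler name is an
illustrative assumption:

    #include <errno.h>
    #include <rte_mempool.h>

    static int
    custom_alloc(struct rte_mempool *mp)
    {
        /* allocate/attach the external pool and set mp->pool_data */
        mp->pool_data = NULL;   /* stub */
        return 0;
    }

    static void
    custom_free(struct rte_mempool *mp)
    {
        /* release whatever custom_alloc() created */
    }

    static int
    custom_enqueue(struct rte_mempool *mp, void * const *obj_table,
                   unsigned int n)
    {
        /* return n objects to the external pool; 0 on success */
        return 0;
    }

    static int
    custom_dequeue(struct rte_mempool *mp, void **obj_table, unsigned int n)
    {
        /* fetch n objects from the external pool; <0 if unavailable */
        return -ENOBUFS;   /* stub */
    }

    static unsigned int
    custom_get_count(const struct rte_mempool *mp)
    {
        return 0;   /* stub: number of objects currently available */
    }

    static const struct rte_mempool_ops ops_custom = {
        .name = "custom_handler",
        .alloc = custom_alloc,
        .free = custom_free,
        .enqueue = custom_enqueue,
        .dequeue = custom_dequeue,
        .get_count = custom_get_count,
    };

    MEMPOOL_REGISTER_OPS(ops_custom);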
David Hunt (2):
mempool: support external mempool operations
mbuf: make default mempool ops configurable at build
Olivier Matz (1):
app/test: test external mempool handler
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v11 1/3] mempool: support external mempool operations
2016-06-14 15:48 3% ` [dpdk-dev] [PATCH v11 " David Hunt
@ 2016-06-14 15:48 1% ` David Hunt
2016-06-15 7:47 3% ` [dpdk-dev] [PATCH v12 0/3] mempool: add external mempool manager David Hunt
1 sibling, 0 replies; 200+ results
From: David Hunt @ 2016-06-14 15:48 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain, David Hunt
Until now, the objects stored in a mempool were internally stored in a
ring. This patch introduces the possibility to register external handlers
replacing the ring.
The default behavior remains unchanged, but calling the new function
rte_mempool_set_ops_byname() right after rte_mempool_create_empty() allows
the user to change the handler that will be used when populating
the mempool.
This patch also adds a set of default ops (function callbacks) based
on rte_ring.
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: David Hunt <david.hunt@intel.com>
Acked-by: Shreyansh Jain <shreyansh.jain@nxp.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
app/test/test_mempool_perf.c | 1 -
doc/guides/prog_guide/mempool_lib.rst | 31 +++-
doc/guides/rel_notes/deprecation.rst | 9 --
lib/librte_mempool/Makefile | 2 +
lib/librte_mempool/rte_mempool.c | 66 +++-----
lib/librte_mempool/rte_mempool.h | 251 ++++++++++++++++++++++++++---
lib/librte_mempool/rte_mempool_ops.c | 148 +++++++++++++++++
lib/librte_mempool/rte_mempool_ring.c | 161 ++++++++++++++++++
lib/librte_mempool/rte_mempool_version.map | 13 +-
9 files changed, 601 insertions(+), 81 deletions(-)
create mode 100644 lib/librte_mempool/rte_mempool_ops.c
create mode 100644 lib/librte_mempool/rte_mempool_ring.c
diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
index c5e3576..c5f8455 100644
--- a/app/test/test_mempool_perf.c
+++ b/app/test/test_mempool_perf.c
@@ -161,7 +161,6 @@ per_lcore_mempool_test(__attribute__((unused)) void *arg)
n_get_bulk);
if (unlikely(ret < 0)) {
rte_mempool_dump(stdout, mp);
- rte_ring_dump(stdout, mp->ring);
/* in this case, objects are lost... */
return -1;
}
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index c3afc2e..6e358d5 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -34,7 +34,7 @@ Mempool Library
===============
A memory pool is an allocator of a fixed-sized object.
-In the DPDK, it is identified by name and uses a ring to store free objects.
+In the DPDK, it is identified by name and uses a ring or an external mempool manager to store free objects.
It provides some other optional services such as a per-core object cache and
an alignment helper to ensure that objects are padded to spread them equally on all DRAM or DDR3 channels.
@@ -127,6 +127,35 @@ The maximum size of the cache is static and is defined at compilation time (CONF
A mempool in Memory with its Associated Ring
+External Mempool Manager
+------------------------
+
+This allows external memory subsystems, such as external hardware memory
+management systems and software based memory allocators, to be used with DPDK.
+
+There are two aspects to external mempool manager.
+
+* Adding the code for your new mempool operations (ops). This is achieved by
+ adding a new mempool ops code, and using the ``REGISTER_MEMPOOL_OPS`` macro.
+
+* Using the new API to call ``rte_mempool_create_empty()`` and
+ ``rte_mempool_set_ops_byname()`` to create a new mempool and specifying which
+ ops to use.
+
+Several external mempool managers may be used in the same application. A new
+mempool can be created by using the ``rte_mempool_create_empty()`` function,
+then using ``rte_mempool_set_ops_byname()`` to point the mempool to the
+relevant mempool manager callbacki (ops) structure.
+
+Legacy applications may continue to use the old ``rte_mempool_create()`` API
+call, which uses a ring based mempool manager by default. These applications
+will need to be modified to use a new external mempool manager.
+
+For applications that use ``rte_pktmbuf_create()``, there is a config setting
+(``RTE_MBUF_DEFAULT_MEMPOOL_OPS``) that allows the application to make use of
+an external mempool manager.
+
+
Use Cases
---------
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index bda40c1..5708eef 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -45,15 +45,6 @@ Deprecation Notices
compact API. The ones that remain are backwards compatible and use the
per-lcore default cache if available. This change targets release 16.07.
-* The rte_mempool struct will be changed in 16.07 to facilitate the new
- external mempool manager functionality.
- The ring element will be replaced with a more generic 'pool' opaque pointer
- to allow new mempool handlers to use their own user-defined mempool
- layout. Also newly added to rte_mempool is a handler index.
- The existing API will be backward compatible, but there will be new API
- functions added to facilitate the creation of mempools using an external
- handler. The 16.07 release will contain these changes.
-
* A librte_vhost public structures refactor is planned for DPDK 16.07
that requires both ABI and API change.
The proposed refactor would expose DPDK vhost dev to applications as
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 43423e0..a4c089e 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 2
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ops.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ring.c
# install includes
SYMLINK-$(CONFIG_RTE_LIBRTE_MEMPOOL)-include := rte_mempool.h
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 22a5645..ac40cb3 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -148,7 +148,7 @@ mempool_add_elem(struct rte_mempool *mp, void *obj, phys_addr_t physaddr)
#endif
/* enqueue in ring */
- rte_ring_sp_enqueue(mp->ring, obj);
+ rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
}
/* call obj_cb() for each mempool element */
@@ -303,40 +303,6 @@ rte_mempool_xmem_usage(__rte_unused void *vaddr, uint32_t elt_num,
return (size_t)paddr_idx << pg_shift;
}
-/* create the internal ring */
-static int
-rte_mempool_ring_create(struct rte_mempool *mp)
-{
- int rg_flags = 0, ret;
- char rg_name[RTE_RING_NAMESIZE];
- struct rte_ring *r;
-
- ret = snprintf(rg_name, sizeof(rg_name),
- RTE_MEMPOOL_MZ_FORMAT, mp->name);
- if (ret < 0 || ret >= (int)sizeof(rg_name))
- return -ENAMETOOLONG;
-
- /* ring flags */
- if (mp->flags & MEMPOOL_F_SP_PUT)
- rg_flags |= RING_F_SP_ENQ;
- if (mp->flags & MEMPOOL_F_SC_GET)
- rg_flags |= RING_F_SC_DEQ;
-
- /* Allocate the ring that will be used to store objects.
- * Ring functions will return appropriate errors if we are
- * running as a secondary process etc., so no checks made
- * in this function for that condition.
- */
- r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
- mp->socket_id, rg_flags);
- if (r == NULL)
- return -rte_errno;
-
- mp->ring = r;
- mp->flags |= MEMPOOL_F_RING_CREATED;
- return 0;
-}
-
/* free a memchunk allocated with rte_memzone_reserve() */
static void
rte_mempool_memchunk_mz_free(__rte_unused struct rte_mempool_memhdr *memhdr,
@@ -354,7 +320,7 @@ rte_mempool_free_memchunks(struct rte_mempool *mp)
void *elt;
while (!STAILQ_EMPTY(&mp->elt_list)) {
- rte_ring_sc_dequeue(mp->ring, &elt);
+ rte_mempool_ops_dequeue_bulk(mp, &elt, 1);
(void)elt;
STAILQ_REMOVE_HEAD(&mp->elt_list, next);
mp->populated_size--;
@@ -386,9 +352,9 @@ rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr,
int ret;
/* create the internal ring if not already done */
- if ((mp->flags & MEMPOOL_F_RING_CREATED) == 0) {
- ret = rte_mempool_ring_create(mp);
- if (ret < 0)
+ if ((mp->flags & MEMPOOL_F_POOL_CREATED) == 0) {
+ ret = rte_mempool_ops_alloc(mp);
+ if (ret != 0)
return ret;
}
@@ -703,7 +669,7 @@ rte_mempool_free(struct rte_mempool *mp)
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
rte_mempool_free_memchunks(mp);
- rte_ring_free(mp->ring);
+ rte_mempool_ops_free(mp);
rte_memzone_free(mp->mz);
}
@@ -815,6 +781,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
RTE_PTR_ADD(mp, MEMPOOL_HEADER_SIZE(mp, 0));
te->data = mp;
+
rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
TAILQ_INSERT_TAIL(mempool_list, te, next);
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
@@ -844,6 +811,19 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
if (mp == NULL)
return NULL;
+ /*
+ * Since we have 4 combinations of the SP/SC/MP/MC examine the flags to
+ * set the correct index into the table of ops structs.
+ */
+ if (flags & (MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET))
+ rte_mempool_set_ops_byname(mp, "ring_sp_sc");
+ else if (flags & MEMPOOL_F_SP_PUT)
+ rte_mempool_set_ops_byname(mp, "ring_sp_mc");
+ else if (flags & MEMPOOL_F_SC_GET)
+ rte_mempool_set_ops_byname(mp, "ring_mp_sc");
+ else
+ rte_mempool_set_ops_byname(mp, "ring_mp_mc");
+
/* call the mempool priv initializer */
if (mp_init)
mp_init(mp, mp_init_arg);
@@ -930,7 +910,7 @@ rte_mempool_count(const struct rte_mempool *mp)
unsigned count;
unsigned lcore_id;
- count = rte_ring_count(mp->ring);
+ count = rte_mempool_ops_get_count(mp);
if (mp->cache_size == 0)
return count;
@@ -1119,7 +1099,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
fprintf(f, "mempool <%s>@%p\n", mp->name, mp);
fprintf(f, " flags=%x\n", mp->flags);
- fprintf(f, " ring=<%s>@%p\n", mp->ring->name, mp->ring);
+ fprintf(f, " pool=%p\n", mp->pool_data);
fprintf(f, " phys_addr=0x%" PRIx64 "\n", mp->mz->phys_addr);
fprintf(f, " nb_mem_chunks=%u\n", mp->nb_mem_chunks);
fprintf(f, " size=%"PRIu32"\n", mp->size);
@@ -1140,7 +1120,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
}
cache_count = rte_mempool_dump_cache(f, mp);
- common_count = rte_ring_count(mp->ring);
+ common_count = rte_mempool_ops_get_count(mp);
if ((cache_count + common_count) > mp->size)
common_count = mp->size - cache_count;
fprintf(f, " common_pool_count=%u\n", common_count);
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 60339bd..e429f3f 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -67,6 +67,7 @@
#include <inttypes.h>
#include <sys/queue.h>
+#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_lcore.h>
@@ -203,10 +204,13 @@ struct rte_mempool_memhdr {
*/
struct rte_mempool {
char name[RTE_MEMPOOL_NAMESIZE]; /**< Name of mempool. */
- struct rte_ring *ring; /**< Ring to store objects. */
- const struct rte_memzone *mz; /**< Memzone where pool is allocated */
+ union {
+ void *pool_data; /**< Ring or pool to store objects. */
+ uint64_t pool_id; /**< External mempool identifier. */
+ };
+ const struct rte_memzone *mz; /**< Memzone where pool is alloc'd. */
int flags; /**< Flags of the mempool. */
- int socket_id; /**< Socket id passed at mempool creation. */
+ int socket_id; /**< Socket id passed at create. */
uint32_t size; /**< Max size of the mempool. */
uint32_t cache_size; /**< Size of per-lcore local cache. */
uint32_t cache_flushthresh;
@@ -217,6 +221,14 @@ struct rte_mempool {
uint32_t trailer_size; /**< Size of trailer (after elt). */
unsigned private_data_size; /**< Size of private data. */
+ /**
+ * Index into rte_mempool_ops_table array of mempool ops
+ * structs, which contain callback function pointers.
+ * We're using an index here rather than pointers to the callbacks
+ * to facilitate any secondary processes that may want to use
+ * this mempool.
+ */
+ int32_t ops_index;
struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
@@ -235,7 +247,7 @@ struct rte_mempool {
#define MEMPOOL_F_NO_CACHE_ALIGN 0x0002 /**< Do not align objs on cache lines.*/
#define MEMPOOL_F_SP_PUT 0x0004 /**< Default put is "single-producer".*/
#define MEMPOOL_F_SC_GET 0x0008 /**< Default get is "single-consumer".*/
-#define MEMPOOL_F_RING_CREATED 0x0010 /**< Internal: ring is created */
+#define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */
#define MEMPOOL_F_NO_PHYS_CONTIG 0x0020 /**< Don't need physically contiguous objs. */
/**
@@ -325,6 +337,212 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
#define __mempool_check_cookies(mp, obj_table_const, n, free) do {} while(0)
#endif /* RTE_LIBRTE_MEMPOOL_DEBUG */
+#define RTE_MEMPOOL_OPS_NAMESIZE 32 /**< Max length of ops struct name. */
+
+/**
+ * Prototype for implementation specific data provisioning function.
+ *
+ * The function should provide the implementation specific memory for
+ * for use by the other mempool ops functions in a given mempool ops struct.
+ * E.g. the default ops provides an instance of the rte_ring for this purpose.
+ * it will most likely point to a different type of data structure, and
+ * will be transparent to the application programmer.
+ * This function should set mp->pool_data.
+ */
+typedef int (*rte_mempool_alloc_t)(struct rte_mempool *mp);
+
+/**
+ * Free the opaque private data pointed to by mp->pool_data pointer.
+ */
+typedef void (*rte_mempool_free_t)(struct rte_mempool *mp);
+
+/**
+ * Enqueue an object into the external pool.
+ */
+typedef int (*rte_mempool_enqueue_t)(struct rte_mempool *mp,
+ void * const *obj_table, unsigned int n);
+
+/**
+ * Dequeue an object from the external pool.
+ */
+typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
+ void **obj_table, unsigned int n);
+
+/**
+ * Return the number of available objects in the external pool.
+ */
+typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
+
+/** Structure defining mempool operations structure */
+struct rte_mempool_ops {
+ char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
+ rte_mempool_alloc_t alloc; /**< Allocate private data. */
+ rte_mempool_free_t free; /**< Free the external pool. */
+ rte_mempool_enqueue_t enqueue; /**< Enqueue an object. */
+ rte_mempool_dequeue_t dequeue; /**< Dequeue an object. */
+ rte_mempool_get_count get_count; /**< Get qty of available objs. */
+} __rte_cache_aligned;
+
+#define RTE_MEMPOOL_MAX_OPS_IDX 16 /**< Max registered ops structs */
+
+/**
+ * Structure storing the table of registered ops structs, each of which contain
+ * the function pointers for the mempool ops functions.
+ * Each process has its own storage for this ops struct array so that
+ * the mempools can be shared across primary and secondary processes.
+ * The indices used to access the array are valid across processes, whereas
+ * any function pointers stored directly in the mempool struct would not be.
+ * This results in us simply having "ops_index" in the mempool struct.
+ */
+struct rte_mempool_ops_table {
+ rte_spinlock_t sl; /**< Spinlock for add/delete. */
+ uint32_t num_ops; /**< Number of used ops structs in the table. */
+ /**
+ * Storage for all possible ops structs.
+ */
+ struct rte_mempool_ops ops[RTE_MEMPOOL_MAX_OPS_IDX];
+} __rte_cache_aligned;
+
+/** Array of registered ops structs. */
+extern struct rte_mempool_ops_table rte_mempool_ops_table;
+
+/**
+ * @internal Get the mempool ops struct from its index.
+ *
+ * @param ops_index
+ * The index of the ops struct in the ops struct table. It must be a valid
+ * index: (0 <= idx < num_ops).
+ * @return
+ * The pointer to the ops struct in the table.
+ */
+static inline struct rte_mempool_ops *
+rte_mempool_ops_get(int ops_index)
+{
+ RTE_VERIFY(ops_index < RTE_MEMPOOL_MAX_OPS_IDX);
+
+ return &rte_mempool_ops_table.ops[ops_index];
+}
+
+/**
+ * @internal Wrapper for mempool_ops alloc callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * - 0: Success; successfully allocated mempool pool_data.
+ * - <0: Error; code of alloc function.
+ */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp);
+
+/**
+ * @internal Wrapper for mempool_ops get callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to get.
+ * @return
+ * - 0: Success; got n objects.
+ * - <0: Error; code of get function.
+ */
+static inline int
+rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp,
+ void **obj_table, unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->dequeue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops put callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to put.
+ * @return
+ * - 0: Success; n objects supplied.
+ * - <0: Error; code of put function.
+ */
+static inline int
+rte_mempool_ops_enqueue_bulk(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->enqueue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops get_count callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * The number of available objects in the external pool.
+ */
+unsigned
+rte_mempool_ops_get_count(const struct rte_mempool *mp);
+
+/**
+ * @internal wrapper for mempool_ops free callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ */
+void
+rte_mempool_ops_free(struct rte_mempool *mp);
+
+/**
+ * Set the ops of a mempool.
+ *
+ * This can only be done on a mempool that is not populated, i.e. just after
+ * a call to rte_mempool_create_empty().
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param name
+ * Name of the ops structure to use for this mempool.
+ * @return
+ * - 0: Success; the mempool is now using the requested ops functions.
+ * - -EINVAL - Invalid ops struct name provided.
+ * - -EEXIST - mempool already has an ops struct assigned.
+ */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name);
+
+/**
+ * Register mempool operations.
+ *
+ * @param h
+ * Pointer to and ops structure to register.
+ * @return
+ * - >=0: Success; return the index of the ops struct in the table.
+ * - -EINVAL - some missing callbacks while registering ops struct.
+ * - -ENOSPC - the maximum number of ops structs has been reached.
+ */
+int rte_mempool_ops_register(const struct rte_mempool_ops *ops);
+
+/**
+ * Macro to statically register the ops of an external mempool manager.
+ * Note that the rte_mempool_ops_register fails silently here when
+ * more then RTE_MEMPOOL_MAX_OPS_IDX is registered.
+ */
+#define MEMPOOL_REGISTER_OPS(ops) \
+ void mp_hdlr_init_##ops(void); \
+ void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\
+ { \
+ rte_mempool_ops_register(&ops); \
+ }
+
/**
* An object callback function for mempool.
*
@@ -774,7 +992,7 @@ __mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table,
cache->len += n;
if (cache->len >= flushthresh) {
- rte_ring_mp_enqueue_bulk(mp->ring, &cache->objs[cache_size],
+ rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache_size],
cache->len - cache_size);
cache->len = cache_size;
}
@@ -785,19 +1003,10 @@ ring_enqueue:
/* push remaining objects in ring */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
- if (is_mp) {
- if (rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
- else {
- if (rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
+ if (rte_mempool_ops_enqueue_bulk(mp, obj_table, n) < 0)
+ rte_panic("cannot put objects in mempool\n");
#else
- if (is_mp)
- rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n);
- else
- rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n);
+ rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
#endif
}
@@ -945,7 +1154,8 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
uint32_t req = n + (cache_size - cache->len);
/* How many do we require i.e. number to fill the cache + the request */
- ret = rte_ring_mc_dequeue_bulk(mp->ring, &cache->objs[cache->len], req);
+ ret = rte_mempool_ops_dequeue_bulk(mp,
+ &cache->objs[cache->len], req);
if (unlikely(ret < 0)) {
/*
* In the offchance that we are buffer constrained,
@@ -972,10 +1182,7 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
ring_dequeue:
/* get remaining objects from ring */
- if (is_mc)
- ret = rte_ring_mc_dequeue_bulk(mp->ring, obj_table, n);
- else
- ret = rte_ring_sc_dequeue_bulk(mp->ring, obj_table, n);
+ ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
if (ret < 0)
__MEMPOOL_STAT_ADD(mp, get_fail, n);
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
new file mode 100644
index 0000000..9328b77
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -0,0 +1,148 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 6WIND S.A.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_mempool.h>
+#include <rte_errno.h>
+
+/* indirect jump table to support external memory pools. */
+struct rte_mempool_ops_table rte_mempool_ops_table = {
+ .sl = RTE_SPINLOCK_INITIALIZER,
+ .num_ops = 0
+};
+
+/* add a new ops struct in rte_mempool_ops_table, return its index. */
+int
+rte_mempool_ops_register(const struct rte_mempool_ops *h)
+{
+ struct rte_mempool_ops *ops;
+ int16_t ops_index;
+
+ rte_spinlock_lock(&rte_mempool_ops_table.sl);
+
+ if (rte_mempool_ops_table.num_ops >=
+ RTE_MEMPOOL_MAX_OPS_IDX) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Maximum number of mempool ops structs exceeded\n");
+ return -ENOSPC;
+ }
+
+ if (h->alloc == NULL || h->enqueue == NULL ||
+ h->dequeue == NULL || h->get_count == NULL) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Missing callback while registering mempool ops\n");
+ return -EINVAL;
+ }
+
+ if (strlen(h->name) >= sizeof(ops->name) - 1) {
+ RTE_LOG(DEBUG, EAL, "%s(): mempool_ops <%s>: name too long\n",
+ __func__, h->name);
+ rte_errno = EEXIST;
+ return -EEXIST;
+ }
+
+ ops_index = rte_mempool_ops_table.num_ops++;
+ ops = &rte_mempool_ops_table.ops[ops_index];
+ snprintf(ops->name, sizeof(ops->name), "%s", h->name);
+ ops->alloc = h->alloc;
+ ops->enqueue = h->enqueue;
+ ops->dequeue = h->dequeue;
+ ops->get_count = h->get_count;
+
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+
+ return ops_index;
+}
+
+/* wrapper to allocate an external mempool's private (pool) data. */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->alloc(mp);
+}
+
+/* wrapper to free an external pool ops. */
+void
+rte_mempool_ops_free(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ if (ops->free == NULL)
+ return;
+ return ops->free(mp);
+}
+
+/* wrapper to get available objects in an external mempool. */
+unsigned int
+rte_mempool_ops_get_count(const struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->get_count(mp);
+}
+
+/* sets mempool ops previously registered by rte_mempool_ops_register. */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name)
+{
+ struct rte_mempool_ops *ops = NULL;
+ unsigned i;
+
+ /* too late, the mempool is already populated. */
+ if (mp->flags & MEMPOOL_F_POOL_CREATED)
+ return -EEXIST;
+
+ for (i = 0; i < rte_mempool_ops_table.num_ops; i++) {
+ if (!strcmp(name,
+ rte_mempool_ops_table.ops[i].name)) {
+ ops = &rte_mempool_ops_table.ops[i];
+ break;
+ }
+ }
+
+ if (ops == NULL)
+ return -EINVAL;
+
+ mp->ops_index = i;
+ return 0;
+}
diff --git a/lib/librte_mempool/rte_mempool_ring.c b/lib/librte_mempool/rte_mempool_ring.c
new file mode 100644
index 0000000..626786e
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ring.c
@@ -0,0 +1,161 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_errno.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+
+static int
+common_ring_mp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_mp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_sp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_mc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_mc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_sc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static unsigned
+common_ring_get_count(const struct rte_mempool *mp)
+{
+ return rte_ring_count(mp->pool_data);
+}
+
+
+static int
+common_ring_alloc(struct rte_mempool *mp)
+{
+ int rg_flags = 0, ret;
+ char rg_name[RTE_RING_NAMESIZE];
+ struct rte_ring *r;
+
+ ret = snprintf(rg_name, sizeof(rg_name),
+ RTE_MEMPOOL_MZ_FORMAT, mp->name);
+ if (ret < 0 || ret >= (int)sizeof(rg_name)) {
+ rte_errno = ENAMETOOLONG;
+ return -rte_errno;
+ }
+
+ /* ring flags */
+ if (mp->flags & MEMPOOL_F_SP_PUT)
+ rg_flags |= RING_F_SP_ENQ;
+ if (mp->flags & MEMPOOL_F_SC_GET)
+ rg_flags |= RING_F_SC_DEQ;
+
+ /*
+ * Allocate the ring that will be used to store objects.
+ * Ring functions will return appropriate errors if we are
+ * running as a secondary process etc., so no checks made
+ * in this function for that condition.
+ */
+ r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
+ mp->socket_id, rg_flags);
+ if (r == NULL)
+ return -rte_errno;
+
+ mp->pool_data = r;
+
+ return 0;
+}
+
+static void
+common_ring_free(struct rte_mempool *mp)
+{
+ rte_ring_free(mp->pool_data);
+}
+
+/*
+ * The following 4 declarations of mempool ops structs address
+ * the need for the backward compatible mempool managers for
+ * single/multi producers and single/multi consumers as dictated by the
+ * flags provided to the rte_mempool_create function
+ */
+static const struct rte_mempool_ops ops_mp_mc = {
+ .name = "ring_mp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_sc = {
+ .name = "ring_sp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_mp_sc = {
+ .name = "ring_mp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_mc = {
+ .name = "ring_sp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+MEMPOOL_REGISTER_OPS(ops_mp_mc);
+MEMPOOL_REGISTER_OPS(ops_sp_sc);
+MEMPOOL_REGISTER_OPS(ops_mp_sc);
+MEMPOOL_REGISTER_OPS(ops_sp_mc);
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index f63461b..6209ec2 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -20,15 +20,18 @@ DPDK_16.7 {
global:
rte_mempool_check_cookies;
- rte_mempool_obj_iter;
- rte_mempool_mem_iter;
rte_mempool_create_empty;
+ rte_mempool_free;
+ rte_mempool_mem_iter;
+ rte_mempool_obj_iter;
+ rte_mempool_ops_register;
+ rte_mempool_ops_table;
+ rte_mempool_populate_anon;
+ rte_mempool_populate_default;
rte_mempool_populate_phys;
rte_mempool_populate_phys_tab;
rte_mempool_populate_virt;
- rte_mempool_populate_default;
- rte_mempool_populate_anon;
- rte_mempool_free;
+ rte_mempool_set_ops_byname;
local: *;
} DPDK_2.0;
--
2.5.5
^ permalink raw reply [relevance 1%]
* Re: [dpdk-dev] [PATCH v9 8/8] doc: update doc for packet capture framework
2016-06-14 9:38 5% ` [dpdk-dev] [PATCH v9 8/8] doc: update doc for " Reshma Pattan
@ 2016-06-14 20:41 3% ` Thomas Monjalon
2016-06-15 5:44 0% ` Pattan, Reshma
0 siblings, 1 reply; 200+ results
From: Thomas Monjalon @ 2016-06-14 20:41 UTC (permalink / raw)
To: Reshma Pattan; +Cc: dev, john.mcnamara
When commenting on previous patches, I missed these docs.
Please move them into the appropriate patches.
2016-06-14 10:38, Reshma Pattan:
> --- a/doc/guides/prog_guide/index.rst
> +++ b/doc/guides/prog_guide/index.rst
> @@ -71,6 +71,7 @@ Programmer's Guide
> writing_efficient_code
> profile_app
> glossary
> + pdump_library
There is probably a better place than after the glossary.
[...]
> +The librte_pdump Library
> +========================
> +
> +The ``librte_pdump`` library provides a framework for packet capturing in DPDK.
Here you need to explain what you mean by "packet capturing".
Doing a copy?
Slowing down the normal processing?
Which usage do you target? Debugging? Fast mirroring?
> +Use Case: Packet Capturing
> +--------------------------
> +
> +The DPDK ``app/pdump`` tool is developed based on this library to capture packets in DPDK.
> +Users can use this as an example to develop their own packet capturing application.
Is it an example or a debugging tool?
If it is an example, it should be in the examples/ directory.
> ABI Changes
> -----------
> @@ -146,6 +156,9 @@ ABI Changes
> * The ``rte_port_source_params`` structure has new fields to support PCAP file.
> It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
>
> +* The ``rte_eth_dev_info`` structure has new fields ``nb_rx_queues`` and ``nb_tx_queues``
> + to support number of queues configured by software.
There was no deprecation notice in 16.04 for this ABI change.
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v9 8/8] doc: update doc for packet capture framework
2016-06-14 20:41 3% ` Thomas Monjalon
@ 2016-06-15 5:44 0% ` Pattan, Reshma
2016-06-15 8:24 0% ` Thomas Monjalon
0 siblings, 1 reply; 200+ results
From: Pattan, Reshma @ 2016-06-15 5:44 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: dev, Mcnamara, John
> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Tuesday, June 14, 2016 9:41 PM
> To: Pattan, Reshma <reshma.pattan@intel.com>
> Cc: dev@dpdk.org; Mcnamara, John <john.mcnamara@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v9 8/8] doc: update doc for packet capture
> framework
>
> When commenting previous patches, I missed these docs.
> Please move them in the appropriate patches.
>
> > +Use Case: Packet Capturing
> > +--------------------------
> > +
> > +The DPDK ``app/pdump`` tool is developed based on this library to capture
> packets in DPDK.
> > +Users can use this as an example to develop their own packet capturing
> application.
>
> Is it an example or a debugging tool?
It is a debugging tool.
> If it is an example, it should be in the examples/ directory.
>
> > ABI Changes
> > -----------
> > @@ -146,6 +156,9 @@ ABI Changes
> > * The ``rte_port_source_params`` structure has new fields to support PCAP
> file.
> > It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
> >
> > +* The ``rte_eth_dev_info`` structure has new fields ``nb_rx_queues`` and
> ``nb_tx_queues``
> > + to support number of queues configured by software.
>
> There was no deprecation notice in 16.04 for this ABI change.
The deprecation notice and relevant planned changes were sent as an RFC at the start of the 16.07 cycle; the links for the same are below.
http://dpdk.org/dev/patchwork/patch/12033/
http://dpdk.org/dev/patchwork/patch/12034/
Thanks,
Reshma
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v12 0/3] mempool: add external mempool manager
2016-06-14 15:48 3% ` [dpdk-dev] [PATCH v11 " David Hunt
2016-06-14 15:48 1% ` [dpdk-dev] [PATCH v11 1/3] mempool: support external mempool operations David Hunt
@ 2016-06-15 7:47 3% ` David Hunt
2016-06-15 7:47 1% ` [dpdk-dev] [PATCH v12 1/3] mempool: support external mempool operations David Hunt
` (2 more replies)
1 sibling, 3 replies; 200+ results
From: David Hunt @ 2016-06-15 7:47 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain
Here's the latest version of the External Mempool Manager patchset.
It's re-based on top of the latest head as of 14/6/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v12 changes:
* Fixed a comment (function param h -> ops)
* Fixed a typo in mempool docs (callbacki)
v11 changes:
* Fixed comments (added '.' where needed for consistency)
* removed ABI breakage notice for mempool manager in deprecation.rst
* Added description of the external mempool manager functionality to
doc/guides/prog_guide/mempool_lib.rst (John Mc reviewed)
* renamed rte_mempool_default.c to rte_mempool_ring.c
v10 changes:
* changed the _put/_get op names to _enqueue/_dequeue to be consistent
with the function names
* some rte_errno cleanup
* comment tweaks about when to set pool_data
* removed an unneeded check for ops->alloc == NULL
v9 changes:
* added a check for NULL alloc in rte_mempool_ops_register
* rte_mempool_alloc_t now returns int instead of void*
* fixed some comment typos
* removed some unneeded typecasts
* changed a return NULL to return -EEXIST in rte_mempool_ops_register
* fixed rte_mempool_version.map file so builds ok as shared libs
* moved flags check from rte_mempool_create_empty to rte_mempool_create
v8 changes:
* merged first three patches in the series into one.
* changed parameters to ops callbacks to all be an rte_mempool pointer
rather than a pointer to opaque data or a uint64.
* comment fixes.
* fixed parameter to _free function (was inconsistent).
* changed MEMPOOL_F_RING_CREATED to MEMPOOL_F_POOL_CREATED
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed hander_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_handler.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurrences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments and
duplicating the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies having one header file (rte_mempool.h); otherwise it
would generate cross-dependency issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file as the standard mempool tests,
avoiding code duplication
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degradation on sp cached operation
* removed stack handler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The External Mempool Manager is an extension to the mempool API that allows
users to add and use an external mempool manager, which allows external memory
subsystems such as external hardware memory management systems and software
based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool manager will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing. These changes are all contained within
RTE_NEXT_ABI defs, and the current or next code can be selected with
the CONFIG_RTE_NEXT_ABI config setting.
There are two aspects to the external mempool manager.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the MEMPOOL_REGISTER_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool
using the name parameter to identify which ops to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname() which sets the mempool's ops (functions)
3. rte_mempool_populate_default() and rte_mempool_populate_anon() functions
which populate the mempool using the relevant ops
Several external mempool managers may be used in the same application. A new
mempool can then be created by using the new rte_mempool_create_empty function,
then calling rte_mempool_set_ops_byname to point the mempool to the relevant
mempool manager callback structure.
Legacy applications will continue to use the old rte_mempool_create API call,
which uses a ring based mempool manager by default. These applications
need to be modified only if they want to use a new external mempool manager.
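For illustration, here is a minimal sketch of the new-style creation
sequence (the sizes and the "my_ops" name are placeholders, not taken
from this patchset):

#include <rte_mempool.h>

static struct rte_mempool *
create_example_pool(void)
{
	struct rte_mempool *mp;

	mp = rte_mempool_create_empty("example_pool", 8192, 2048,
				      256, 0, SOCKET_ID_ANY, 0);
	if (mp == NULL)
		return NULL;

	/* select the ops; must happen before the pool is populated */
	if (rte_mempool_set_ops_byname(mp, "my_ops") != 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	if (rte_mempool_populate_default(mp) < 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	return mp;
}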
The external mempool manager needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. enqueue - puts an object back into the mempool once an application has
finished with it
3. dequeue - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time an enqueue/dequeue/get_count is called from the application/PMD,
the callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
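The dispatch itself is just an index lookup plus an indirect call;
simplified from the inline wrappers added to rte_mempool.h in this patch:

	struct rte_mempool_ops *ops = rte_mempool_ops_get(mp->ops_index);
	ret = ops->dequeue(mp, obj_table, n);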
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the ops index.
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = common_ring_alloc,
.enqueue = common_ring_sp_enqueue,
.dequeue = common_ring_mc_dequeue,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures:
MEMPOOL_REGISTER_OPS(ops_sp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool manager using simple mallocs
for each mempool object. This file also contains the callbacks and self
registration for the new handler.
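To give a flavour of what such a handler involves, below is a condensed,
hypothetical sketch of a spinlock-protected stack handler (all names are
invented for this sketch; the actual custom_handler in test_mempool.c
differs):

#include <errno.h>
#include <rte_malloc.h>
#include <rte_spinlock.h>
#include <rte_mempool.h>

struct stack_pool {
	rte_spinlock_t lock;
	uint32_t top;	/* number of objects currently stored */
	uint32_t size;	/* capacity, taken from mp->size */
	void *objs[];	/* object pointer storage */
};

static int
stack_alloc(struct rte_mempool *mp)
{
	struct stack_pool *p;

	p = rte_zmalloc_socket("stack_pool",
			sizeof(*p) + mp->size * sizeof(void *),
			RTE_CACHE_LINE_SIZE, mp->socket_id);
	if (p == NULL)
		return -ENOMEM;
	rte_spinlock_init(&p->lock);
	p->size = mp->size;
	mp->pool_data = p;
	return 0;
}

static int
stack_enqueue(struct rte_mempool *mp, void * const *obj_table, unsigned n)
{
	struct stack_pool *p = mp->pool_data;
	unsigned i;

	rte_spinlock_lock(&p->lock);
	if (p->top + n > p->size) {
		rte_spinlock_unlock(&p->lock);
		return -ENOBUFS;
	}
	for (i = 0; i < n; i++)
		p->objs[p->top++] = obj_table[i];
	rte_spinlock_unlock(&p->lock);
	return 0;
}

static int
stack_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
{
	struct stack_pool *p = mp->pool_data;
	unsigned i;

	rte_spinlock_lock(&p->lock);
	if (n > p->top) {
		rte_spinlock_unlock(&p->lock);
		return -ENOBUFS;
	}
	for (i = 0; i < n; i++)
		obj_table[i] = p->objs[--p->top];
	rte_spinlock_unlock(&p->lock);
	return 0;
}

static unsigned
stack_get_count(const struct rte_mempool *mp)
{
	const struct stack_pool *p = mp->pool_data;

	return p->top;
}

static void
stack_free(struct rte_mempool *mp)
{
	rte_free(mp->pool_data);
}

static const struct rte_mempool_ops ops_stack = {
	.name = "custom_stack",
	.alloc = stack_alloc,
	.free = stack_free,
	.enqueue = stack_enqueue,
	.dequeue = stack_dequeue,
	.get_count = stack_get_count,
};

MEMPOOL_REGISTER_OPS(ops_stack);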
David Hunt (2):
mempool: support external mempool operations
mbuf: make default mempool ops configurable at build
Olivier Matz (1):
app/test: test external mempool manager
* [dpdk-dev] [PATCH v12 1/3] mempool: support external mempool operations
2016-06-15 7:47 3% ` [dpdk-dev] [PATCH v12 0/3] mempool: add external mempool manager David Hunt
@ 2016-06-15 7:47 1% ` David Hunt
2016-06-15 10:13 0% ` [dpdk-dev] [PATCH v12 0/3] mempool: add external mempool manager Jan Viktorin
2016-06-16 12:30 3% ` [dpdk-dev] [PATCH v13 " David Hunt
2 siblings, 0 replies; 200+ results
From: David Hunt @ 2016-06-15 7:47 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain, David Hunt
Until now, the objects stored in a mempool were internally stored in a
ring. This patch introduces the possibility to register external handlers
replacing the ring.
The default behavior remains unchanged, but calling the new function
rte_mempool_set_ops_byname() right after rte_mempool_create_empty() allows
the user to change the handler that will be used when populating
the mempool.
This patch also adds a set of default ops (function callbacks) based
on rte_ring.
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: David Hunt <david.hunt@intel.com>
Acked-by: Shreyansh Jain <shreyansh.jain@nxp.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
app/test/test_mempool_perf.c | 1 -
doc/guides/prog_guide/mempool_lib.rst | 31 +++-
doc/guides/rel_notes/deprecation.rst | 9 --
lib/librte_mempool/Makefile | 2 +
lib/librte_mempool/rte_mempool.c | 66 +++-----
lib/librte_mempool/rte_mempool.h | 251 ++++++++++++++++++++++++++---
lib/librte_mempool/rte_mempool_ops.c | 148 +++++++++++++++++
lib/librte_mempool/rte_mempool_ring.c | 161 ++++++++++++++++++
lib/librte_mempool/rte_mempool_version.map | 13 +-
9 files changed, 601 insertions(+), 81 deletions(-)
create mode 100644 lib/librte_mempool/rte_mempool_ops.c
create mode 100644 lib/librte_mempool/rte_mempool_ring.c
diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
index c5e3576..c5f8455 100644
--- a/app/test/test_mempool_perf.c
+++ b/app/test/test_mempool_perf.c
@@ -161,7 +161,6 @@ per_lcore_mempool_test(__attribute__((unused)) void *arg)
n_get_bulk);
if (unlikely(ret < 0)) {
rte_mempool_dump(stdout, mp);
- rte_ring_dump(stdout, mp->ring);
/* in this case, objects are lost... */
return -1;
}
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index c3afc2e..2e3116e 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -34,7 +34,7 @@ Mempool Library
===============
A memory pool is an allocator of a fixed-sized object.
-In the DPDK, it is identified by name and uses a ring to store free objects.
+In the DPDK, it is identified by name and uses a ring or an external mempool manager to store free objects.
It provides some other optional services such as a per-core object cache and
an alignment helper to ensure that objects are padded to spread them equally on all DRAM or DDR3 channels.
@@ -127,6 +127,35 @@ The maximum size of the cache is static and is defined at compilation time (CONF
A mempool in Memory with its Associated Ring
+External Mempool Manager
+------------------------
+
+This allows external memory subsystems, such as external hardware memory
+management systems and software based memory allocators, to be used with DPDK.
+
+There are two aspects to the external mempool manager.
+
+* Adding the code for your new mempool operations (ops). This is achieved by
+ adding a new mempool ops code, and using the ``MEMPOOL_REGISTER_OPS`` macro.
+
+* Using the new API to call ``rte_mempool_create_empty()`` and
+ ``rte_mempool_set_ops_byname()`` to create a new mempool and specifying which
+ ops to use.
+
+Several external mempool managers may be used in the same application. A new
+mempool can be created by using the ``rte_mempool_create_empty()`` function,
+then using ``rte_mempool_set_ops_byname()`` to point the mempool to the
+relevant mempool manager callback (ops) structure.
+
+Legacy applications may continue to use the old ``rte_mempool_create()`` API
+call, which uses a ring based mempool manager by default. These applications
+need to be modified only if they are to use a new external mempool manager.
+
+For applications that use ``rte_pktmbuf_pool_create()``, there is a config setting
+(``RTE_MBUF_DEFAULT_MEMPOOL_OPS``) that allows the application to make use of
+an external mempool manager.
+
+
Use Cases
---------
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 7d947ae..c415095 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -39,15 +39,6 @@ Deprecation Notices
compact API. The ones that remain are backwards compatible and use the
per-lcore default cache if available. This change targets release 16.07.
-* The rte_mempool struct will be changed in 16.07 to facilitate the new
- external mempool manager functionality.
- The ring element will be replaced with a more generic 'pool' opaque pointer
- to allow new mempool handlers to use their own user-defined mempool
- layout. Also newly added to rte_mempool is a handler index.
- The existing API will be backward compatible, but there will be new API
- functions added to facilitate the creation of mempools using an external
- handler. The 16.07 release will contain these changes.
-
* A librte_vhost public structures refactor is planned for DPDK 16.07
that requires both ABI and API change.
The proposed refactor would expose DPDK vhost dev to applications as
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 43423e0..a4c089e 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 2
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ops.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ring.c
# install includes
SYMLINK-$(CONFIG_RTE_LIBRTE_MEMPOOL)-include := rte_mempool.h
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 22a5645..ac40cb3 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -148,7 +148,7 @@ mempool_add_elem(struct rte_mempool *mp, void *obj, phys_addr_t physaddr)
#endif
/* enqueue in ring */
- rte_ring_sp_enqueue(mp->ring, obj);
+ rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
}
/* call obj_cb() for each mempool element */
@@ -303,40 +303,6 @@ rte_mempool_xmem_usage(__rte_unused void *vaddr, uint32_t elt_num,
return (size_t)paddr_idx << pg_shift;
}
-/* create the internal ring */
-static int
-rte_mempool_ring_create(struct rte_mempool *mp)
-{
- int rg_flags = 0, ret;
- char rg_name[RTE_RING_NAMESIZE];
- struct rte_ring *r;
-
- ret = snprintf(rg_name, sizeof(rg_name),
- RTE_MEMPOOL_MZ_FORMAT, mp->name);
- if (ret < 0 || ret >= (int)sizeof(rg_name))
- return -ENAMETOOLONG;
-
- /* ring flags */
- if (mp->flags & MEMPOOL_F_SP_PUT)
- rg_flags |= RING_F_SP_ENQ;
- if (mp->flags & MEMPOOL_F_SC_GET)
- rg_flags |= RING_F_SC_DEQ;
-
- /* Allocate the ring that will be used to store objects.
- * Ring functions will return appropriate errors if we are
- * running as a secondary process etc., so no checks made
- * in this function for that condition.
- */
- r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
- mp->socket_id, rg_flags);
- if (r == NULL)
- return -rte_errno;
-
- mp->ring = r;
- mp->flags |= MEMPOOL_F_RING_CREATED;
- return 0;
-}
-
/* free a memchunk allocated with rte_memzone_reserve() */
static void
rte_mempool_memchunk_mz_free(__rte_unused struct rte_mempool_memhdr *memhdr,
@@ -354,7 +320,7 @@ rte_mempool_free_memchunks(struct rte_mempool *mp)
void *elt;
while (!STAILQ_EMPTY(&mp->elt_list)) {
- rte_ring_sc_dequeue(mp->ring, &elt);
+ rte_mempool_ops_dequeue_bulk(mp, &elt, 1);
(void)elt;
STAILQ_REMOVE_HEAD(&mp->elt_list, next);
mp->populated_size--;
@@ -386,9 +352,9 @@ rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr,
int ret;
/* create the internal ring if not already done */
- if ((mp->flags & MEMPOOL_F_RING_CREATED) == 0) {
- ret = rte_mempool_ring_create(mp);
- if (ret < 0)
+ if ((mp->flags & MEMPOOL_F_POOL_CREATED) == 0) {
+ ret = rte_mempool_ops_alloc(mp);
+ if (ret != 0)
return ret;
}
@@ -703,7 +669,7 @@ rte_mempool_free(struct rte_mempool *mp)
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
rte_mempool_free_memchunks(mp);
- rte_ring_free(mp->ring);
+ rte_mempool_ops_free(mp);
rte_memzone_free(mp->mz);
}
@@ -815,6 +781,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
RTE_PTR_ADD(mp, MEMPOOL_HEADER_SIZE(mp, 0));
te->data = mp;
+
rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
TAILQ_INSERT_TAIL(mempool_list, te, next);
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
@@ -844,6 +811,19 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
if (mp == NULL)
return NULL;
+ /*
+ * Since we have 4 combinations of the SP/SC/MP/MC examine the flags to
+ * set the correct index into the table of ops structs.
+ */
+ if (flags & (MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET))
+ rte_mempool_set_ops_byname(mp, "ring_sp_sc");
+ else if (flags & MEMPOOL_F_SP_PUT)
+ rte_mempool_set_ops_byname(mp, "ring_sp_mc");
+ else if (flags & MEMPOOL_F_SC_GET)
+ rte_mempool_set_ops_byname(mp, "ring_mp_sc");
+ else
+ rte_mempool_set_ops_byname(mp, "ring_mp_mc");
+
/* call the mempool priv initializer */
if (mp_init)
mp_init(mp, mp_init_arg);
@@ -930,7 +910,7 @@ rte_mempool_count(const struct rte_mempool *mp)
unsigned count;
unsigned lcore_id;
- count = rte_ring_count(mp->ring);
+ count = rte_mempool_ops_get_count(mp);
if (mp->cache_size == 0)
return count;
@@ -1119,7 +1099,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
fprintf(f, "mempool <%s>@%p\n", mp->name, mp);
fprintf(f, " flags=%x\n", mp->flags);
- fprintf(f, " ring=<%s>@%p\n", mp->ring->name, mp->ring);
+ fprintf(f, " pool=%p\n", mp->pool_data);
fprintf(f, " phys_addr=0x%" PRIx64 "\n", mp->mz->phys_addr);
fprintf(f, " nb_mem_chunks=%u\n", mp->nb_mem_chunks);
fprintf(f, " size=%"PRIu32"\n", mp->size);
@@ -1140,7 +1120,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
}
cache_count = rte_mempool_dump_cache(f, mp);
- common_count = rte_ring_count(mp->ring);
+ common_count = rte_mempool_ops_get_count(mp);
if ((cache_count + common_count) > mp->size)
common_count = mp->size - cache_count;
fprintf(f, " common_pool_count=%u\n", common_count);
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 60339bd..92deb42 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -67,6 +67,7 @@
#include <inttypes.h>
#include <sys/queue.h>
+#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_lcore.h>
@@ -203,10 +204,13 @@ struct rte_mempool_memhdr {
*/
struct rte_mempool {
char name[RTE_MEMPOOL_NAMESIZE]; /**< Name of mempool. */
- struct rte_ring *ring; /**< Ring to store objects. */
- const struct rte_memzone *mz; /**< Memzone where pool is allocated */
+ union {
+ void *pool_data; /**< Ring or pool to store objects. */
+ uint64_t pool_id; /**< External mempool identifier. */
+ };
+ const struct rte_memzone *mz; /**< Memzone where pool is alloc'd. */
int flags; /**< Flags of the mempool. */
- int socket_id; /**< Socket id passed at mempool creation. */
+ int socket_id; /**< Socket id passed at create. */
uint32_t size; /**< Max size of the mempool. */
uint32_t cache_size; /**< Size of per-lcore local cache. */
uint32_t cache_flushthresh;
@@ -217,6 +221,14 @@ struct rte_mempool {
uint32_t trailer_size; /**< Size of trailer (after elt). */
unsigned private_data_size; /**< Size of private data. */
+ /**
+ * Index into rte_mempool_ops_table array of mempool ops
+ * structs, which contain callback function pointers.
+ * We're using an index here rather than pointers to the callbacks
+ * to facilitate any secondary processes that may want to use
+ * this mempool.
+ */
+ int32_t ops_index;
struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
@@ -235,7 +247,7 @@ struct rte_mempool {
#define MEMPOOL_F_NO_CACHE_ALIGN 0x0002 /**< Do not align objs on cache lines.*/
#define MEMPOOL_F_SP_PUT 0x0004 /**< Default put is "single-producer".*/
#define MEMPOOL_F_SC_GET 0x0008 /**< Default get is "single-consumer".*/
-#define MEMPOOL_F_RING_CREATED 0x0010 /**< Internal: ring is created */
+#define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */
#define MEMPOOL_F_NO_PHYS_CONTIG 0x0020 /**< Don't need physically contiguous objs. */
/**
@@ -325,6 +337,212 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
#define __mempool_check_cookies(mp, obj_table_const, n, free) do {} while(0)
#endif /* RTE_LIBRTE_MEMPOOL_DEBUG */
+#define RTE_MEMPOOL_OPS_NAMESIZE 32 /**< Max length of ops struct name. */
+
+/**
+ * Prototype for implementation specific data provisioning function.
+ *
+ * The function should provide the implementation specific memory
+ * for use by the other mempool ops functions in a given mempool ops struct.
+ * E.g. the default ops provides an instance of the rte_ring for this purpose.
+ * Other implementations will most likely point to a different type of
+ * data structure, which will be transparent to the application programmer.
+ * This function should set mp->pool_data.
+ */
+typedef int (*rte_mempool_alloc_t)(struct rte_mempool *mp);
+
+/**
+ * Free the opaque private data pointed to by mp->pool_data pointer.
+ */
+typedef void (*rte_mempool_free_t)(struct rte_mempool *mp);
+
+/**
+ * Enqueue an object into the external pool.
+ */
+typedef int (*rte_mempool_enqueue_t)(struct rte_mempool *mp,
+ void * const *obj_table, unsigned int n);
+
+/**
+ * Dequeue an object from the external pool.
+ */
+typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
+ void **obj_table, unsigned int n);
+
+/**
+ * Return the number of available objects in the external pool.
+ */
+typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
+
+/** Structure defining mempool operations structure */
+struct rte_mempool_ops {
+ char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
+ rte_mempool_alloc_t alloc; /**< Allocate private data. */
+ rte_mempool_free_t free; /**< Free the external pool. */
+ rte_mempool_enqueue_t enqueue; /**< Enqueue an object. */
+ rte_mempool_dequeue_t dequeue; /**< Dequeue an object. */
+ rte_mempool_get_count get_count; /**< Get qty of available objs. */
+} __rte_cache_aligned;
+
+#define RTE_MEMPOOL_MAX_OPS_IDX 16 /**< Max registered ops structs */
+
+/**
+ * Structure storing the table of registered ops structs, each of which contain
+ * the function pointers for the mempool ops functions.
+ * Each process has its own storage for this ops struct array so that
+ * the mempools can be shared across primary and secondary processes.
+ * The indices used to access the array are valid across processes, whereas
+ * any function pointers stored directly in the mempool struct would not be.
+ * This results in us simply having "ops_index" in the mempool struct.
+ */
+struct rte_mempool_ops_table {
+ rte_spinlock_t sl; /**< Spinlock for add/delete. */
+ uint32_t num_ops; /**< Number of used ops structs in the table. */
+ /**
+ * Storage for all possible ops structs.
+ */
+ struct rte_mempool_ops ops[RTE_MEMPOOL_MAX_OPS_IDX];
+} __rte_cache_aligned;
+
+/** Array of registered ops structs. */
+extern struct rte_mempool_ops_table rte_mempool_ops_table;
+
+/**
+ * @internal Get the mempool ops struct from its index.
+ *
+ * @param ops_index
+ * The index of the ops struct in the ops struct table. It must be a valid
+ * index: (0 <= idx < num_ops).
+ * @return
+ * The pointer to the ops struct in the table.
+ */
+static inline struct rte_mempool_ops *
+rte_mempool_ops_get(int ops_index)
+{
+ RTE_VERIFY(ops_index < RTE_MEMPOOL_MAX_OPS_IDX);
+
+ return &rte_mempool_ops_table.ops[ops_index];
+}
+
+/**
+ * @internal Wrapper for mempool_ops alloc callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * - 0: Success; successfully allocated mempool pool_data.
+ * - <0: Error; code of alloc function.
+ */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp);
+
+/**
+ * @internal Wrapper for mempool_ops get callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to get.
+ * @return
+ * - 0: Success; got n objects.
+ * - <0: Error; code of get function.
+ */
+static inline int
+rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp,
+ void **obj_table, unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->dequeue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops put callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to put.
+ * @return
+ * - 0: Success; n objects supplied.
+ * - <0: Error; code of put function.
+ */
+static inline int
+rte_mempool_ops_enqueue_bulk(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->enqueue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops get_count callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * The number of available objects in the external pool.
+ */
+unsigned
+rte_mempool_ops_get_count(const struct rte_mempool *mp);
+
+/**
+ * @internal wrapper for mempool_ops free callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ */
+void
+rte_mempool_ops_free(struct rte_mempool *mp);
+
+/**
+ * Set the ops of a mempool.
+ *
+ * This can only be done on a mempool that is not populated, i.e. just after
+ * a call to rte_mempool_create_empty().
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param name
+ * Name of the ops structure to use for this mempool.
+ * @return
+ * - 0: Success; the mempool is now using the requested ops functions.
+ * - -EINVAL - Invalid ops struct name provided.
+ * - -EEXIST - mempool already has an ops struct assigned.
+ */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name);
+
+/**
+ * Register mempool operations.
+ *
+ * @param ops
+ * Pointer to an ops structure to register.
+ * @return
+ * - >=0: Success; return the index of the ops struct in the table.
+ * - -EINVAL - some missing callbacks while registering ops struct.
+ * - -ENOSPC - the maximum number of ops structs has been reached.
+ */
+int rte_mempool_ops_register(const struct rte_mempool_ops *ops);
+
+/**
+ * Macro to statically register the ops of an external mempool manager.
+ * Note that rte_mempool_ops_register fails silently here when
+ * more than RTE_MEMPOOL_MAX_OPS_IDX ops structs are registered.
+ */
+#define MEMPOOL_REGISTER_OPS(ops) \
+ void mp_hdlr_init_##ops(void); \
+ void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\
+ { \
+ rte_mempool_ops_register(&ops); \
+ }
+
/**
* An object callback function for mempool.
*
@@ -774,7 +992,7 @@ __mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table,
cache->len += n;
if (cache->len >= flushthresh) {
- rte_ring_mp_enqueue_bulk(mp->ring, &cache->objs[cache_size],
+ rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache_size],
cache->len - cache_size);
cache->len = cache_size;
}
@@ -785,19 +1003,10 @@ ring_enqueue:
/* push remaining objects in ring */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
- if (is_mp) {
- if (rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
- else {
- if (rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
+ if (rte_mempool_ops_enqueue_bulk(mp, obj_table, n) < 0)
+ rte_panic("cannot put objects in mempool\n");
#else
- if (is_mp)
- rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n);
- else
- rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n);
+ rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
#endif
}
@@ -945,7 +1154,8 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
uint32_t req = n + (cache_size - cache->len);
/* How many do we require i.e. number to fill the cache + the request */
- ret = rte_ring_mc_dequeue_bulk(mp->ring, &cache->objs[cache->len], req);
+ ret = rte_mempool_ops_dequeue_bulk(mp,
+ &cache->objs[cache->len], req);
if (unlikely(ret < 0)) {
/*
* In the offchance that we are buffer constrained,
@@ -972,10 +1182,7 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
ring_dequeue:
/* get remaining objects from ring */
- if (is_mc)
- ret = rte_ring_mc_dequeue_bulk(mp->ring, obj_table, n);
- else
- ret = rte_ring_sc_dequeue_bulk(mp->ring, obj_table, n);
+ ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
if (ret < 0)
__MEMPOOL_STAT_ADD(mp, get_fail, n);
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
new file mode 100644
index 0000000..9328b77
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -0,0 +1,148 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 6WIND S.A.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_mempool.h>
+#include <rte_errno.h>
+
+/* indirect jump table to support external memory pools. */
+struct rte_mempool_ops_table rte_mempool_ops_table = {
+ .sl = RTE_SPINLOCK_INITIALIZER,
+ .num_ops = 0
+};
+
+/* add a new ops struct in rte_mempool_ops_table, return its index. */
+int
+rte_mempool_ops_register(const struct rte_mempool_ops *h)
+{
+ struct rte_mempool_ops *ops;
+ int16_t ops_index;
+
+ rte_spinlock_lock(&rte_mempool_ops_table.sl);
+
+ if (rte_mempool_ops_table.num_ops >=
+ RTE_MEMPOOL_MAX_OPS_IDX) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Maximum number of mempool ops structs exceeded\n");
+ return -ENOSPC;
+ }
+
+ if (h->alloc == NULL || h->enqueue == NULL ||
+ h->dequeue == NULL || h->get_count == NULL) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Missing callback while registering mempool ops\n");
+ return -EINVAL;
+ }
+
+ if (strlen(h->name) >= sizeof(ops->name) - 1) {
+ RTE_LOG(DEBUG, EAL, "%s(): mempool_ops <%s>: name too long\n",
+ __func__, h->name);
+ rte_errno = EEXIST;
+ return -EEXIST;
+ }
+
+ ops_index = rte_mempool_ops_table.num_ops++;
+ ops = &rte_mempool_ops_table.ops[ops_index];
+ snprintf(ops->name, sizeof(ops->name), "%s", h->name);
+ ops->alloc = h->alloc;
+ ops->enqueue = h->enqueue;
+ ops->dequeue = h->dequeue;
+ ops->get_count = h->get_count;
+
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+
+ return ops_index;
+}
+
+/* wrapper to allocate an external mempool's private (pool) data. */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->alloc(mp);
+}
+
+/* wrapper to free an external pool ops. */
+void
+rte_mempool_ops_free(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ if (ops->free == NULL)
+ return;
+ return ops->free(mp);
+}
+
+/* wrapper to get available objects in an external mempool. */
+unsigned int
+rte_mempool_ops_get_count(const struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->get_count(mp);
+}
+
+/* sets mempool ops previously registered by rte_mempool_ops_register. */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name)
+{
+ struct rte_mempool_ops *ops = NULL;
+ unsigned i;
+
+ /* too late, the mempool is already populated. */
+ if (mp->flags & MEMPOOL_F_POOL_CREATED)
+ return -EEXIST;
+
+ for (i = 0; i < rte_mempool_ops_table.num_ops; i++) {
+ if (!strcmp(name,
+ rte_mempool_ops_table.ops[i].name)) {
+ ops = &rte_mempool_ops_table.ops[i];
+ break;
+ }
+ }
+
+ if (ops == NULL)
+ return -EINVAL;
+
+ mp->ops_index = i;
+ return 0;
+}
diff --git a/lib/librte_mempool/rte_mempool_ring.c b/lib/librte_mempool/rte_mempool_ring.c
new file mode 100644
index 0000000..626786e
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ring.c
@@ -0,0 +1,161 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_errno.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+
+static int
+common_ring_mp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_mp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_sp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_mc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_mc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_sc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static unsigned
+common_ring_get_count(const struct rte_mempool *mp)
+{
+ return rte_ring_count(mp->pool_data);
+}
+
+
+static int
+common_ring_alloc(struct rte_mempool *mp)
+{
+ int rg_flags = 0, ret;
+ char rg_name[RTE_RING_NAMESIZE];
+ struct rte_ring *r;
+
+ ret = snprintf(rg_name, sizeof(rg_name),
+ RTE_MEMPOOL_MZ_FORMAT, mp->name);
+ if (ret < 0 || ret >= (int)sizeof(rg_name)) {
+ rte_errno = ENAMETOOLONG;
+ return -rte_errno;
+ }
+
+ /* ring flags */
+ if (mp->flags & MEMPOOL_F_SP_PUT)
+ rg_flags |= RING_F_SP_ENQ;
+ if (mp->flags & MEMPOOL_F_SC_GET)
+ rg_flags |= RING_F_SC_DEQ;
+
+ /*
+ * Allocate the ring that will be used to store objects.
+ * Ring functions will return appropriate errors if we are
+ * running as a secondary process etc., so no checks made
+ * in this function for that condition.
+ */
+ r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
+ mp->socket_id, rg_flags);
+ if (r == NULL)
+ return -rte_errno;
+
+ mp->pool_data = r;
+
+ return 0;
+}
+
+static void
+common_ring_free(struct rte_mempool *mp)
+{
+ rte_ring_free(mp->pool_data);
+}
+
+/*
+ * The following 4 declarations of mempool ops structs address
+ * the need for the backward compatible mempool managers for
+ * single/multi producers and single/multi consumers as dictated by the
+ * flags provided to the rte_mempool_create function
+ */
+static const struct rte_mempool_ops ops_mp_mc = {
+ .name = "ring_mp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_sc = {
+ .name = "ring_sp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_mp_sc = {
+ .name = "ring_mp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_mc = {
+ .name = "ring_sp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+MEMPOOL_REGISTER_OPS(ops_mp_mc);
+MEMPOOL_REGISTER_OPS(ops_sp_sc);
+MEMPOOL_REGISTER_OPS(ops_mp_sc);
+MEMPOOL_REGISTER_OPS(ops_sp_mc);
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index f63461b..6209ec2 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -20,15 +20,18 @@ DPDK_16.7 {
global:
rte_mempool_check_cookies;
- rte_mempool_obj_iter;
- rte_mempool_mem_iter;
rte_mempool_create_empty;
+ rte_mempool_free;
+ rte_mempool_mem_iter;
+ rte_mempool_obj_iter;
+ rte_mempool_ops_register;
+ rte_mempool_ops_table;
+ rte_mempool_populate_anon;
+ rte_mempool_populate_default;
rte_mempool_populate_phys;
rte_mempool_populate_phys_tab;
rte_mempool_populate_virt;
- rte_mempool_populate_default;
- rte_mempool_populate_anon;
- rte_mempool_free;
+ rte_mempool_set_ops_byname;
local: *;
} DPDK_2.0;
--
2.5.5
* Re: [dpdk-dev] [PATCH v9 8/8] doc: update doc for packet capture framework
2016-06-15 5:44 0% ` Pattan, Reshma
@ 2016-06-15 8:24 0% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-06-15 8:24 UTC (permalink / raw)
To: Pattan, Reshma; +Cc: dev, Mcnamara, John
2016-06-15 05:44, Pattan, Reshma:
> > > ABI Changes
> > > -----------
> > > @@ -146,6 +156,9 @@ ABI Changes
> > > * The ``rte_port_source_params`` structure has new fields to support PCAP
> > file.
> > > It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
> > >
> > > +* The ``rte_eth_dev_info`` structure has new fields ``nb_rx_queues`` and
> > ``nb_tx_queues``
> > > + to support number of queues configured by software.
> >
> > There was no deprecation notice in 16.04 for this ABI change.
>
> The deprecation notice and the relevant planned changes were sent as an RFC at the start of 16.07; please find the links below:
> http://dpdk.org/dev/patchwork/patch/12033/
> http://dpdk.org/dev/patchwork/patch/12034/
Yes, there was no notice in 16.04.
* Re: [dpdk-dev] [PATCH v12 0/3] mempool: add external mempool manager
2016-06-15 7:47 3% ` [dpdk-dev] [PATCH v12 0/3] mempool: add external mempool manager David Hunt
2016-06-15 7:47 1% ` [dpdk-dev] [PATCH v12 1/3] mempool: support external mempool operations David Hunt
@ 2016-06-15 10:13 0% ` Jan Viktorin
2016-06-16 12:30 3% ` [dpdk-dev] [PATCH v13 " David Hunt
2 siblings, 0 replies; 200+ results
From: Jan Viktorin @ 2016-06-15 10:13 UTC (permalink / raw)
To: David Hunt; +Cc: dev, olivier.matz, jerin.jacob, shreyansh.jain
Hi,
I've got one last question. Initially, I was interested in creating
my own external memory provider based on a Linux Kernel driver.
So, I've got an opened file descriptor that points to a device which
can mmap memory regions for me.
...
int fd = open("/dev/uio0" ...);
...
struct rte_mempool *pool = rte_mempool_create_empty(...);
rte_mempool_set_ops_byname(pool, "uio_allocator_ops");
I am not sure how to pass the file descriptor pointer. I thought it would
be possible via rte_mempool_alloc but it's not... Is it possible
to solve this case?
The allocator is device-specific.
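Something like the following is what I have in mind (purely hypothetical,
this extra pool_config argument does not exist in v12):

int rte_mempool_set_ops_byname(struct rte_mempool *mp,
		const char *name, void *pool_config);

int fd = open("/dev/uio0", O_RDWR);
struct rte_mempool *pool = rte_mempool_create_empty(...);
rte_mempool_set_ops_byname(pool, "uio_allocator_ops", &fd);
/* the uio_allocator_ops alloc callback could then retrieve the fd
 * from pool_config when it creates mp->pool_data */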
Regards
Jan
On Wed, 15 Jun 2016 08:47:01 +0100
David Hunt <david.hunt@intel.com> wrote:
> Here's the latest version of the External Mempool Manager patchset.
> It's re-based on top of the latest head as of 14/6/2016, including
> Olivier's 35-part patch series on mempool re-org [1]
>
> [1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
> [...]
--
Jan Viktorin E-mail: Viktorin@RehiveTech.com
System Architect Web: www.RehiveTech.com
RehiveTech
Brno, Czech Republic
* [dpdk-dev] [PATCH v3] mbuf: new flag when Vlan is stripped
2016-05-27 14:33 2% ` [dpdk-dev] [PATCH v2] " Olivier Matz
@ 2016-06-15 11:48 2% ` Olivier Matz
0 siblings, 0 replies; 200+ results
From: Olivier Matz @ 2016-06-15 11:48 UTC (permalink / raw)
To: dev
Cc: johndale, konstantin.ananyev, helin.zhang, adrien.mazarguil,
rahul.lakkireddy, alejandro.lucero, sony.chacko
The behavior of PKT_RX_VLAN_PKT was not very well defined, resulting in
PMDs not advertising the same flags in similar conditions.
Following discussion in [1], introduce 2 new flags PKT_RX_VLAN_STRIPPED
and PKT_RX_QINQ_STRIPPED that are better defined:
PKT_RX_VLAN_STRIPPED: a vlan has been stripped by the hardware and its
tci is saved in mbuf->vlan_tci. This can only happen if vlan stripping
is enabled in the RX configuration of the PMD.
For now, the old flag PKT_RX_VLAN_PKT is kept but marked as deprecated.
It should be removed from applications and PMDs in a future revision.
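As a minimal illustration (this snippet is not part of the patch), an
application could test the new flag on a received mbuf like this:

	struct rte_mbuf *m;	/* as returned by rte_eth_rx_burst() */

	if (m->ol_flags & PKT_RX_VLAN_STRIPPED) {
		/* hardware stripped the vlan; tci is in the mbuf */
		handle_vlan(m->vlan_tci);	/* hypothetical helper */
	}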
This patch also updates the drivers. For PKT_RX_VLAN_PKT:
- e1000, enic, i40e, mlx5, nfp, vmxnet3: done, PKT_RX_VLAN_PKT already
had the same meaning as PKT_RX_VLAN_STRIPPED, minor update is
required.
- fm10k: done, PKT_RX_VLAN_PKT already had the same meaning as
PKT_RX_VLAN_STRIPPED, and vlan stripping is always enabled on fm10k.
- ixgbe: modification done (vector and normal), the old flag was set
when a vlan was recognized, even if vlan stripping was disabled.
- the other drivers do not support vlan stripping.
For PKT_RX_QINQ_PKT, it was only supported on i40e, and the behavior was
already correct, so we can reuse the same bit value for
PKT_RX_QINQ_STRIPPED.
[1] http://dpdk.org/ml/archives/dev/2016-April/037837.html
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
v2 -> v3:
- remove the "deprecated" notice from PKT_RX_VLAN_PKT comment.
- use UINT8_MAX when appropriate in ixgbe_vec
- simplify check of vlan flags in RTE_BUILD_BUG_ON() in ixgbe_vec
v1 -> v2:
- fix ixgbe (vector mode) and i40e (normal and vector mode)
- store vlan flags instead of a boolean value in ixgbe rxq, as
suggested by Konstantin
- replay tests on ixgbe (normal + vector) and i40e (normal +
vector). See below.
RFC -> v1:
- fix checkpatch and check-git-log.sh issues
- add a deprecation notice for the old vlan flags
- rebase on head
app/test-pmd/rxonly.c | 4 +--
doc/guides/rel_notes/deprecation.rst | 5 ++++
drivers/net/e1000/em_rxtx.c | 3 ++-
drivers/net/e1000/igb_rxtx.c | 3 ++-
drivers/net/enic/enic_rx.c | 2 +-
drivers/net/i40e/i40e_rxtx.c | 4 +--
drivers/net/i40e/i40e_rxtx_vec.c | 2 +-
drivers/net/ixgbe/ixgbe_ethdev.c | 11 ++++++++
drivers/net/ixgbe/ixgbe_rxtx.c | 14 +++++++----
drivers/net/ixgbe/ixgbe_rxtx.h | 2 ++
drivers/net/ixgbe/ixgbe_rxtx_vec.c | 35 +++++++++++++++++---------
drivers/net/mlx5/mlx5_rxtx.c | 6 +++--
drivers/net/nfp/nfp_net.c | 2 +-
drivers/net/vmxnet3/vmxnet3_rxtx.c | 2 +-
lib/librte_mbuf/rte_mbuf.c | 2 ++
lib/librte_mbuf/rte_mbuf.h | 49 ++++++++++++++++++++++++++++++++----
16 files changed, 112 insertions(+), 34 deletions(-)
This patch is tested on ixgbe (normal + vector), i40e (normal +
vector) and igb (hardware is a 82575):
# we use scapy to send packets like this:
# Ether(src="00:01:02:03:04:05", dst="00:1B:21:AB:8F:10")/Dot1Q(vlan=0x666)/IP()/UDP()/Raw("x"*32)
cd dpdk.org/
make config T=x86_64-native-linuxapp-gcc
make -j32
mkdir -p /mnt/huge
mount -t hugetlbfs nodev /mnt/huge
echo 256 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
modprobe uio_pci_generic
# test-pmd is started with vlan stripping, using the rx-vector
# function if available (i40e and ixgbe)
./build/app/testpmd -l 2,4 -- --total-num-mbufs=65536 -i --port-topology=chained \
--disable-hw-vlan-filter
# to disable vlan stripping, add:
--disable-hw-vlan-strip
# to disable the vector mode (it can be checked in debug logs), add:
--enable-rx-cksum
# we run test-pmd in rxonly mode, displaying the packet information.
set fwd rxonly
set verbose 1
start
==== IXGBE normal rx function
# ixgbe: the behavior of the flag PKT_RX_VLAN_PKT is kept as before,
# and the new flag PKT_RX_VLAN_STRIPPED is introduced when vlan stripping
# is enabled and a vlan is stripped.
--- vlan stripping enabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - VLAN tci=0x666 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
PKT_RX_VLAN_PKT
PKT_RX_VLAN_STRIPPED
--- vlan stripping disabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x8100 - length=78 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
PKT_RX_VLAN_PKT
==== IXGBE vector rx function
--- vlan stripping enabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1Unknown packet type
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - VLAN tci=0x666Unknown packet type
- Receive queue=0x0
PKT_RX_VLAN_PKT
PKT_RX_VLAN_STRIPPED
--- vlan stripping disabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1Unknown packet type
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x8100 - length=78 - nb_segs=1Unknown packet type
- Receive queue=0x0
PKT_RX_VLAN_PKT
==== I40E normal rx function
--- vlan stripping enabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4_EXT_UNKNOWN - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - VLAN tci=0x666 - (outer) L2 type: ETHER - (outer) L3 type: IPV4_EXT_UNKNOWN - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
PKT_RX_VLAN_PKT
PKT_RX_VLAN_STRIPPED
--- vlan stripping disabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4_EXT_UNKNOWN - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x8100 - length=78 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4_EXT_UNKNOWN - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4
==== I40E vector rx function
--- vlan stripping enabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1Unknown packet type
- Receive queue=0x0
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - VLAN tci=0x666Unknown packet type
- Receive queue=0x0
PKT_RX_VLAN_PKT
PKT_RX_VLAN_STRIPPED
--- vlan stripping disabled
# packet without vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1Unknown packet type
- Receive queue=0x0
port 0/queue 0: received 1 packets
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x8100 - length=78 - nb_segs=1Unknown packet type
- Receive queue=0x0
==== IGB
(not retested since RFC patch, but there was no code modification)
--- vlan stripping enabled
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x8100 - length=78 - nb_segs=1 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
--- vlan stripping disabled
# packet with vlan
src=00:01:02:03:04:05 - dst=00:1B:21:AB:8F:10 - type=0x0800 - length=74 - nb_segs=1 - VLAN tci=0x666 - (outer) L2 type: ETHER - (outer) L3 type: IPV4 - (outer) L4 type: UDP - Tunnel type: Unknown - Inner L2 type: Unknown - Inner L3 type: Unknown - Inner L4 type: Unknown
- Receive queue=0x0
PKT_RX_VLAN_PKT
PKT_RX_VLAN_STRIPPED
diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
index 14555ab..c69b344 100644
--- a/app/test-pmd/rxonly.c
+++ b/app/test-pmd/rxonly.c
@@ -156,9 +156,9 @@ pkt_burst_receive(struct fwd_stream *fs)
printf("hash=0x%x ID=0x%x ",
mb->hash.fdir.hash, mb->hash.fdir.id);
}
- if (ol_flags & PKT_RX_VLAN_PKT)
+ if (ol_flags & PKT_RX_VLAN_STRIPPED)
printf(" - VLAN tci=0x%x", mb->vlan_tci);
- if (ol_flags & PKT_RX_QINQ_PKT)
+ if (ol_flags & PKT_RX_QINQ_STRIPPED)
printf(" - QinQ VLAN tci=0x%x, VLAN tci outer=0x%x",
mb->vlan_tci, mb->vlan_tci_outer);
if (mb->packet_type) {
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 7d947ae..702dfce 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -54,3 +54,8 @@ Deprecation Notices
a handle, like the way kernel exposes an fd to user for locating a
specific file, and to keep all major structures internally, so that
we are likely to be free from ABI violations in future.
+
+* The mbuf flags PKT_RX_VLAN_PKT and PKT_RX_QINQ_PKT are deprecated and
+ are respectively replaced by PKT_RX_VLAN_STRIPPED and
+ PKT_RX_QINQ_STRIPPED, that are better described. The old flags and
+ their behavior will be kept in 16.07 and will be removed in 16.11.
diff --git a/drivers/net/e1000/em_rxtx.c b/drivers/net/e1000/em_rxtx.c
index 3d36f21..6d8750a 100644
--- a/drivers/net/e1000/em_rxtx.c
+++ b/drivers/net/e1000/em_rxtx.c
@@ -629,7 +629,8 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
uint64_t pkt_flags;
/* Check if VLAN present */
- pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0);
+ pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
+ PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
return pkt_flags;
}
diff --git a/drivers/net/e1000/igb_rxtx.c b/drivers/net/e1000/igb_rxtx.c
index 18aeead..9d80a0b 100644
--- a/drivers/net/e1000/igb_rxtx.c
+++ b/drivers/net/e1000/igb_rxtx.c
@@ -729,7 +729,8 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
uint64_t pkt_flags;
/* Check if VLAN present */
- pkt_flags = (rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
+ pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
+ PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED : 0);
#if defined(RTE_LIBRTE_IEEE1588)
if (rx_status & E1000_RXD_STAT_TMST)
diff --git a/drivers/net/enic/enic_rx.c b/drivers/net/enic/enic_rx.c
index f92f6bc..6459e97 100644
--- a/drivers/net/enic/enic_rx.c
+++ b/drivers/net/enic/enic_rx.c
@@ -197,7 +197,7 @@ enic_cq_rx_to_pkt_flags(struct cq_desc *cqd, struct rte_mbuf *mbuf)
/* VLAN stripping */
if (bwflags & CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) {
- pkt_flags |= PKT_RX_VLAN_PKT;
+ pkt_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
mbuf->vlan_tci = enic_cq_rx_desc_vlan(cqrd);
} else {
mbuf->vlan_tci = 0;
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index c833aa3..eea246b 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -88,7 +88,7 @@ i40e_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union i40e_rx_desc *rxdp)
{
if (rte_le_to_cpu_64(rxdp->wb.qword1.status_error_len) &
(1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) {
- mb->ol_flags |= PKT_RX_VLAN_PKT;
+ mb->ol_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
mb->vlan_tci =
rte_le_to_cpu_16(rxdp->wb.qword0.lo_dword.l2tag1);
PMD_RX_LOG(DEBUG, "Descriptor l2tag1: %u",
@@ -99,7 +99,7 @@ i40e_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union i40e_rx_desc *rxdp)
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
if (rte_le_to_cpu_16(rxdp->wb.qword2.ext_status) &
(1 << I40E_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) {
- mb->ol_flags |= PKT_RX_QINQ_PKT;
+ mb->ol_flags |= PKT_RX_QINQ_STRIPPED;
mb->vlan_tci_outer = mb->vlan_tci;
mb->vlan_tci = rte_le_to_cpu_16(rxdp->wb.qword2.l2tag2_2);
PMD_RX_LOG(DEBUG, "Descriptor l2tag2_1: %u, l2tag2_2: %u",
diff --git a/drivers/net/i40e/i40e_rxtx_vec.c b/drivers/net/i40e/i40e_rxtx_vec.c
index eef80d9..634bd39 100644
--- a/drivers/net/i40e/i40e_rxtx_vec.c
+++ b/drivers/net/i40e/i40e_rxtx_vec.c
@@ -154,7 +154,7 @@ desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
/* map rss and vlan type to rss hash and vlan flag */
const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
0, 0, 0, 0,
- 0, 0, 0, PKT_RX_VLAN_PKT,
+ 0, 0, 0, PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
0, 0, 0, 0);
const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index a2b170b..5f3e047 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -1636,6 +1636,7 @@ ixgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev, uint16_t queue, bool on)
{
struct ixgbe_hwstrip *hwstrip =
IXGBE_DEV_PRIVATE_TO_HWSTRIP_BITMAP(dev->data->dev_private);
+ struct ixgbe_rx_queue *rxq;
if (queue >= IXGBE_MAX_RX_QUEUE_NUM)
return;
@@ -1644,6 +1645,16 @@ ixgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev, uint16_t queue, bool on)
IXGBE_SET_HWSTRIP(hwstrip, queue);
else
IXGBE_CLEAR_HWSTRIP(hwstrip, queue);
+
+ if (queue >= dev->data->nb_rx_queues)
+ return;
+
+ rxq = dev->data->rx_queues[queue];
+
+ if (on)
+ rxq->vlan_flags = PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
+ else
+ rxq->vlan_flags = PKT_RX_VLAN_PKT;
}
static void
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 9c6eaf2..5a7064c 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -1221,7 +1221,7 @@ ixgbe_rxd_pkt_info_to_pkt_flags(uint16_t pkt_info)
}
static inline uint64_t
-rx_desc_status_to_pkt_flags(uint32_t rx_status)
+rx_desc_status_to_pkt_flags(uint32_t rx_status, uint64_t vlan_flags)
{
uint64_t pkt_flags;
@@ -1230,7 +1230,7 @@ rx_desc_status_to_pkt_flags(uint32_t rx_status)
* Do not check whether L3/L4 rx checksum done by NIC or not,
* That can be found from rte_eth_rxmode.hw_ip_checksum flag
*/
- pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
+ pkt_flags = (rx_status & IXGBE_RXD_STAT_VP) ? vlan_flags : 0;
#ifdef RTE_LIBRTE_IEEE1588
if (rx_status & IXGBE_RXD_STAT_TMST)
@@ -1287,6 +1287,7 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
uint32_t pkt_info[LOOK_AHEAD];
int i, j, nb_rx = 0;
uint32_t status;
+ uint64_t vlan_flags = rxq->vlan_flags;
/* get references to current descriptor and S/W ring entry */
rxdp = &rxq->rx_ring[rxq->rx_tail];
@@ -1328,7 +1329,8 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
mb->vlan_tci = rte_le_to_cpu_16(rxdp[j].wb.upper.vlan);
/* convert descriptor fields to rte mbuf flags */
- pkt_flags = rx_desc_status_to_pkt_flags(s[j]);
+ pkt_flags = rx_desc_status_to_pkt_flags(s[j],
+ vlan_flags);
pkt_flags |= rx_desc_error_to_pkt_flags(s[j]);
pkt_flags |= ixgbe_rxd_pkt_info_to_pkt_flags
((uint16_t)pkt_info[j]);
@@ -1544,6 +1546,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_rx;
uint16_t nb_hold;
uint64_t pkt_flags;
+ uint64_t vlan_flags;
nb_rx = 0;
nb_hold = 0;
@@ -1551,6 +1554,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
rx_id = rxq->rx_tail;
rx_ring = rxq->rx_ring;
sw_ring = rxq->sw_ring;
+ vlan_flags = rxq->vlan_flags;
while (nb_rx < nb_pkts) {
/*
* The order of operations here is important as the DD status
@@ -1660,7 +1664,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
/* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
- pkt_flags = rx_desc_status_to_pkt_flags(staterr);
+ pkt_flags = rx_desc_status_to_pkt_flags(staterr, vlan_flags);
pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
pkt_flags = pkt_flags |
ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
@@ -1753,7 +1757,7 @@ ixgbe_fill_cluster_head_buf(
*/
head->vlan_tci = rte_le_to_cpu_16(desc->wb.upper.vlan);
pkt_info = rte_le_to_cpu_32(desc->wb.lower.lo_dword.data);
- pkt_flags = rx_desc_status_to_pkt_flags(staterr);
+ pkt_flags = rx_desc_status_to_pkt_flags(staterr, rxq->vlan_flags);
pkt_flags |= rx_desc_error_to_pkt_flags(staterr);
pkt_flags |= ixgbe_rxd_pkt_info_to_pkt_flags((uint16_t)pkt_info);
head->ol_flags = pkt_flags;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.h b/drivers/net/ixgbe/ixgbe_rxtx.h
index 3691a19..2608b36 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.h
+++ b/drivers/net/ixgbe/ixgbe_rxtx.h
@@ -146,6 +146,8 @@ struct ixgbe_rx_queue {
uint8_t crc_len; /**< 0 if CRC stripped, 4 otherwise. */
uint8_t drop_en; /**< If not 0, set SRRCTL.Drop_En. */
uint8_t rx_deferred_start; /**< not in global dev start. */
+ /** flags to set in mbuf when a vlan is detected. */
+ uint64_t vlan_flags;
/** need to alloc dummy mbuf, for wraparound when scanning hw ring */
struct rte_mbuf fake_mbuf;
/** hold packets to return to application */
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index e97ea82..12190d2 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -140,10 +140,9 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
*/
#ifdef RTE_IXGBE_RX_OLFLAGS_ENABLE
-#define VTAG_SHIFT (3)
-
static inline void
-desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
+desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
+ struct rte_mbuf **rx_pkts)
{
__m128i ptype0, ptype1, vtag0, vtag1;
union {
@@ -151,12 +150,6 @@ desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
uint64_t dword;
} vol;
- /* pkt type + vlan olflags mask */
- const __m128i pkttype_msk = _mm_set_epi16(
- 0x0000, 0x0000, 0x0000, 0x0000,
- PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
- PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);
-
/* mask everything except rss type */
const __m128i rsstype_msk = _mm_set_epi16(
0x0000, 0x0000, 0x0000, 0x0000,
@@ -168,6 +161,19 @@ desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);
+ /* mask everything except vlan present bit */
+ const __m128i vlan_msk = _mm_set_epi16(
+ 0x0000, 0x0000,
+ 0x0000, 0x0000,
+ IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
+ IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);
+ /* map vlan present (0x8) to ol_flags */
+ const __m128i vlan_map = _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, vlan_flags,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0);
+
ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
@@ -178,8 +184,8 @@ desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);
vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
- vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);
- vtag1 = _mm_and_si128(vtag1, pkttype_msk);
+ vtag1 = _mm_and_si128(vtag1, vlan_msk);
+ vtag1 = _mm_shuffle_epi8(vlan_map, vtag1);
vtag1 = _mm_or_si128(ptype0, vtag1);
vol.dword = _mm_cvtsi128_si64(vtag1);
@@ -221,6 +227,7 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
0, 0 /* ignore pkt_type field */
);
__m128i dd_check, eop_check;
+ uint8_t vlan_flags;
/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
@@ -270,6 +277,10 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
*/
sw_ring = &rxq->sw_ring[rxq->rx_tail];
+ /* ensure these 2 flags are in the lower 8 bits */
+ RTE_BUILD_BUG_ON((PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED) > UINT8_MAX);
+ vlan_flags = rxq->vlan_flags & UINT8_MAX;
+
/* A. load 4 packet in one loop
* [A*. mask out 4 unused dirty field in desc]
* B. copy 4 mbuf point from swring to rx_pkts
@@ -330,7 +341,7 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);
/* set ol_flags with vlan packet type */
- desc_to_olflags_v(descs, &rx_pkts[pos]);
+ desc_to_olflags_v(descs, vlan_flags, &rx_pkts[pos]);
/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 29bfcec..d5b2286 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1051,7 +1051,8 @@ mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
+ pkt_buf->ol_flags |= PKT_RX_VLAN_PKT |
+ PKT_RX_VLAN_STRIPPED;
pkt_buf->vlan_tci = vlan_tci;
}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
@@ -1207,7 +1208,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- seg->ol_flags |= PKT_RX_VLAN_PKT;
+ seg->ol_flags |= PKT_RX_VLAN_PKT |
+ PKT_RX_VLAN_STRIPPED;
seg->vlan_tci = vlan_tci;
}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
diff --git a/drivers/net/nfp/nfp_net.c b/drivers/net/nfp/nfp_net.c
index ea5a2a3..5c9f350 100644
--- a/drivers/net/nfp/nfp_net.c
+++ b/drivers/net/nfp/nfp_net.c
@@ -1800,7 +1800,7 @@ nfp_net_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if ((rxds->rxd.flags & PCIE_DESC_RX_VLAN) &&
(hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN)) {
mb->vlan_tci = rte_cpu_to_le_32(rxds->rxd.vlan);
- mb->ol_flags |= PKT_RX_VLAN_PKT;
+ mb->ol_flags |= PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED;
}
/* Adding the mbuff to the mbuff array passed by the app */
diff --git a/drivers/net/vmxnet3/vmxnet3_rxtx.c b/drivers/net/vmxnet3/vmxnet3_rxtx.c
index 9fe8752..ccafc0c 100644
--- a/drivers/net/vmxnet3/vmxnet3_rxtx.c
+++ b/drivers/net/vmxnet3/vmxnet3_rxtx.c
@@ -579,7 +579,7 @@ vmxnet3_rx_offload(const Vmxnet3_RxCompDesc *rcd, struct rte_mbuf *rxm)
{
/* Check for hardware stripped VLAN tag */
if (rcd->ts) {
- rxm->ol_flags |= PKT_RX_VLAN_PKT;
+ rxm->ol_flags |= (PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
rxm->vlan_tci = rte_le_to_cpu_16((uint16_t)rcd->tci);
}
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index eec1456..2ece742 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -258,8 +258,10 @@ const char *rte_get_rx_ol_flag_name(uint64_t mask)
/* case PKT_RX_HBUF_OVERFLOW: return "PKT_RX_HBUF_OVERFLOW"; */
/* case PKT_RX_RECIP_ERR: return "PKT_RX_RECIP_ERR"; */
/* case PKT_RX_MAC_ERR: return "PKT_RX_MAC_ERR"; */
+ case PKT_RX_VLAN_STRIPPED: return "PKT_RX_VLAN_STRIPPED";
case PKT_RX_IEEE1588_PTP: return "PKT_RX_IEEE1588_PTP";
case PKT_RX_IEEE1588_TMST: return "PKT_RX_IEEE1588_TMST";
+ case PKT_RX_QINQ_STRIPPED: return "PKT_RX_QINQ_STRIPPED";
default: return NULL;
}
}
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 11fa06d..8798c41 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -79,7 +79,15 @@ extern "C" {
* Keep these flags synchronized with rte_get_rx_ol_flag_name() and
* rte_get_tx_ol_flag_name().
*/
-#define PKT_RX_VLAN_PKT (1ULL << 0) /**< RX packet is a 802.1q VLAN packet. */
+
+/**
+ * RX packet is a 802.1q VLAN packet. This flag was set by PMDs when
+ * the packet is recognized as a VLAN, but the behavior between PMDs
+ * was not the same. This flag is kept for some time to avoid breaking
+ * applications and should be replaced by PKT_RX_VLAN_STRIPPED.
+ */
+#define PKT_RX_VLAN_PKT (1ULL << 0)
+
#define PKT_RX_RSS_HASH (1ULL << 1) /**< RX packet with RSS hash result. */
#define PKT_RX_FDIR (1ULL << 2) /**< RX packet with FDIR match indicate. */
#define PKT_RX_L4_CKSUM_BAD (1ULL << 3) /**< L4 cksum of RX pkt. is not OK. */
@@ -89,11 +97,37 @@ extern "C" {
#define PKT_RX_HBUF_OVERFLOW (0ULL << 0) /**< Header buffer overflow. */
#define PKT_RX_RECIP_ERR (0ULL << 0) /**< Hardware processing error. */
#define PKT_RX_MAC_ERR (0ULL << 0) /**< MAC error. */
+
+/**
+ * A vlan has been stripped by the hardware and its tci is saved in
+ * mbuf->vlan_tci. This can only happen if vlan stripping is enabled
+ * in the RX configuration of the PMD.
+ */
+#define PKT_RX_VLAN_STRIPPED (1ULL << 6)
+
+/* hole, some bits can be reused here */
+
#define PKT_RX_IEEE1588_PTP (1ULL << 9) /**< RX IEEE1588 L2 Ethernet PT Packet. */
#define PKT_RX_IEEE1588_TMST (1ULL << 10) /**< RX IEEE1588 L2/L4 timestamped packet.*/
#define PKT_RX_FDIR_ID (1ULL << 13) /**< FD id reported if FDIR match. */
#define PKT_RX_FDIR_FLX (1ULL << 14) /**< Flexible bytes reported if FDIR match. */
-#define PKT_RX_QINQ_PKT (1ULL << 15) /**< RX packet with double VLAN stripped. */
+
+/**
+ * The 2 vlans have been stripped by the hardware and their tci are
+ * saved in mbuf->vlan_tci (inner) and mbuf->vlan_tci_outer (outer).
+ * This can only happen if vlan stripping is enabled in the RX
+ * configuration of the PMD. If this flag is set, PKT_RX_VLAN_STRIPPED
+ * must also be set.
+ */
+#define PKT_RX_QINQ_STRIPPED (1ULL << 15)
+
+/**
+ * Deprecated.
+ * RX packet with double VLAN stripped.
+ * This flag is replaced by PKT_RX_QINQ_STRIPPED.
+ */
+#define PKT_RX_QINQ_PKT PKT_RX_QINQ_STRIPPED
+
/* add new RX flags here */
/* add new TX flags here */
@@ -761,7 +795,10 @@ struct rte_mbuf {
/*
* The packet type, which is the combination of outer/inner L2, L3, L4
- * and tunnel types.
+ * and tunnel types. The packet_type is about data really present in the
+ * mbuf. Example: if vlan stripping is enabled, a received vlan packet
+ * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the
+ * vlan is stripped from the data.
*/
union {
uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */
@@ -778,7 +815,8 @@ struct rte_mbuf {
uint32_t pkt_len; /**< Total pkt len: sum of all segments. */
uint16_t data_len; /**< Amount of data in segment buffer. */
- uint16_t vlan_tci; /**< VLAN Tag Control Identifier (CPU order) */
+ /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */
+ uint16_t vlan_tci;
union {
uint32_t rss; /**< RSS hash result if RSS enabled */
@@ -804,7 +842,8 @@ struct rte_mbuf {
uint32_t seqn; /**< Sequence number. See also rte_reorder_insert() */
- uint16_t vlan_tci_outer; /**< Outer VLAN Tag Control Identifier (CPU order) */
+ /** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */
+ uint16_t vlan_tci_outer;
/* second cache line - fields only used in slow path or on TX */
MARKER cacheline1 __rte_cache_min_aligned;
--
2.8.0.rc3
^ permalink raw reply [relevance 2%]
* [dpdk-dev] [PATCH v10 4/7] ethdev: make get port by name and get name by port public
2016-06-15 14:06 2% ` [dpdk-dev] [PATCH v10 0/7] add " Reshma Pattan
2016-06-15 14:06 10% ` [dpdk-dev] [PATCH v10 3/7] ethdev: add new fields to ethdev info struct Reshma Pattan
@ 2016-06-15 14:06 4% ` Reshma Pattan
1 sibling, 0 replies; 200+ results
From: Reshma Pattan @ 2016-06-15 14:06 UTC (permalink / raw)
To: dev; +Cc: Reshma Pattan
Converted rte_eth_dev_get_port_by_name to a public API.
Converted rte_eth_dev_get_name_by_port to a public API.
Updated the release notes with the changes.
The librte_pdump library provides the APIs to enable or disable
packet capture using either the port id, the pci address or the device name.
The pdump library therefore needs to map a name to a port and a port to a name
internally, both to validate the device name and to register the Rx and Tx
callbacks for the mapped ports. These two APIs are made public so the
pdump library can do those mappings.
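For illustration only, a client of these two APIs could look like the
sketch below; the name buffer size is an assumption, since the API
takes no length parameter:
uint8_t port_id;
char name[64]; /* size assumed; the API has no length parameter */
if (rte_eth_dev_get_port_by_name("0000:02:00.0", &port_id) == 0)
	printf("mapped to port %u\n", port_id);
if (rte_eth_dev_get_name_by_port(port_id, name) == 0)
	printf("mapped back to device %s\n", name);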
Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
---
doc/guides/rel_notes/release_16_07.rst | 3 +++
lib/librte_ether/rte_ethdev.c | 4 ++--
lib/librte_ether/rte_ethdev.h | 29 +++++++++++++++++++++++++++++
lib/librte_ether/rte_ether_version.map | 2 ++
4 files changed, 36 insertions(+), 2 deletions(-)
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 004ecee..c6222f8 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -138,6 +138,9 @@ API Changes
* Function ``rte_eth_dev_info_get`` updated to return new fields ``nb_rx_queues`` and ``nb_tx_queues``
in the ``rte_eth_dev_info`` object.
+* Functions ``rte_eth_dev_get_port_by_name`` and ``rte_eth_dev_get_name_by_port``
+ are changed to public APIs.
+
ABI Changes
-----------
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 1f634c9..0b19569 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -406,7 +406,7 @@ rte_eth_dev_get_addr_by_port(uint8_t port_id, struct rte_pci_addr *addr)
return 0;
}
-static int
+int
rte_eth_dev_get_name_by_port(uint8_t port_id, char *name)
{
char *tmp;
@@ -425,7 +425,7 @@ rte_eth_dev_get_name_by_port(uint8_t port_id, char *name)
return 0;
}
-static int
+int
rte_eth_dev_get_port_by_name(const char *name, uint8_t *port_id)
{
int i;
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 8ad7c01..fab281e 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -4284,6 +4284,35 @@ rte_eth_dev_l2_tunnel_offload_set(uint8_t port_id,
uint32_t mask,
uint8_t en);
+/**
+* Get the port id from pci address or device name
+* Ex: 0000:2:00.0 or vdev name eth_pcap0
+*
+* @param name
+* pci address or name of the device
+* @param port_id
+* pointer to port identifier of the device
+* @return
+* - (0) if successful.
+* - (-ENODEV or -EINVAL) on failure.
+*/
+int
+rte_eth_dev_get_port_by_name(const char *name, uint8_t *port_id);
+
+/**
+* Get the device name from port id
+*
+* @param port_id
+* port identifier of the device
+* @param name
+* pci address or name of the device
+* @return
+* - (0) if successful.
+* - (-EINVAL) on failure.
+*/
+int
+rte_eth_dev_get_name_by_port(uint8_t port_id, char *name);
+
#ifdef __cplusplus
}
#endif
diff --git a/lib/librte_ether/rte_ether_version.map b/lib/librte_ether/rte_ether_version.map
index d06d648..73e730d 100644
--- a/lib/librte_ether/rte_ether_version.map
+++ b/lib/librte_ether/rte_ether_version.map
@@ -137,5 +137,7 @@ DPDK_16.07 {
global:
rte_eth_add_first_rx_callback;
+ rte_eth_dev_get_name_by_port;
+ rte_eth_dev_get_port_by_name;
rte_eth_dev_info_get;
} DPDK_16.04;
--
2.5.0
^ permalink raw reply [relevance 4%]
* [dpdk-dev] [PATCH v10 0/7] add packet capture framework
2016-06-14 9:38 2% ` [dpdk-dev] [PATCH v9 " Reshma Pattan
2016-06-14 9:38 5% ` [dpdk-dev] [PATCH v9 8/8] doc: update doc for " Reshma Pattan
@ 2016-06-15 14:06 2% ` Reshma Pattan
2016-06-15 14:06 10% ` [dpdk-dev] [PATCH v10 3/7] ethdev: add new fields to ethdev info struct Reshma Pattan
2016-06-15 14:06 4% ` [dpdk-dev] [PATCH v10 4/7] ethdev: make get port by name and get name by port public Reshma Pattan
1 sibling, 2 replies; 200+ results
From: Reshma Pattan @ 2016-06-15 14:06 UTC (permalink / raw)
To: dev
This patch set includes the below changes:
1)Changes to librte_ether.
2)A new library librte_pdump added for packet capture framework.
3)A new app/pdump tool added for packet capturing.
4)Test pmd changes done to initialize packet capture framework.
5)Documentation update.
1)librte_pdump
==============
To support packet capturing on dpdk Ethernet devices, a new library librte_pdump
is added. Users can develop their own packet capturing applications using the new library APIs.
Operation:
----------
The librte_pdump provides APIs to support packet capturing on dpdk Ethernet devices.
Library provides APIs to initialize the packet capture framework, enable/disable
the packet capture and uninitialize the packet capture framework.
The librte_pdump library works on a client/server model. The server is responsible for enabling or
disabling the packet capture and the clients are responsible for requesting the enabling or disabling of
the packet capture.
The packet capture framework, as part of its initialization, creates a pthread and the server socket within
that pthread. The application that calls the framework initialization will have the server socket created,
either under the path that the application has passed or under the default path, i.e. ''/var/run'' for
the root user or ''$HOME'' for non-root users.
Applications that request enabling or disabling of the packet capture will have the client socket created,
either under the path that the application has passed or under the default path, i.e. ''/var/run/''
for root users or ''$HOME'' for non-root users, to send the requests to the server.
The server socket will listen for client requests for enabling or disabling the packet capture.
Applications using the below APIs need to pass port/device_id, queue, mempool and
ring parameters. The library uses the user-provided ring and mempool to mirror the rx/tx
packets of the port for users. Users then need to dequeue the rings and write the packets
to a vdev (pcap/tuntap) to view them using any standard tools.
Note:
The mempool and ring should be multi-producer/multi-consumer (mp/mc) capable.
The mempool mbuf size should be big enough to handle the rx/tx packets of a port.
APIs:
-----
rte_pdump_init()
rte_pdump_enable()
rte_pdump_enable_by_deviceid()
rte_pdump_disable()
rte_pdump_disable_by_deviceid()
rte_pdump_uninit()
rte_pdump_set_socket_dir()
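As a rough sketch of the client side (the exact rte_pdump_enable()
prototype and flag names are assumptions here, not copied from
rte_pdump.h; the usual DPDK headers are omitted), an application would
create an mp/mc ring and a mempool, initialize the framework and
enable capture:
struct rte_ring *ring;
struct rte_mempool *mp;
/* flags == 0 gives a multi-producer/multi-consumer ring */
ring = rte_ring_create("pdump_ring", 16384, rte_socket_id(), 0);
mp = rte_pktmbuf_pool_create("pdump_pool", 32768, 0, 0,
	RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
rte_pdump_init(NULL); /* NULL: use the default server socket path */
/* enable rx+tx capture on port 0, queue 0; argument list assumed */
rte_pdump_enable(0, 0, RTE_PDUMP_FLAG_RXTX, ring, mp, NULL);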
2)app/pdump tool
================
Tool app/pdump is designed based on librte_pdump for packet capturing in DPDK.
This tool by default runs as secondary process, and provides the support for
the command line options for packet capture.
./build/app/dpdk_pdump --
--pdump '(port=<port id> | device_id=<pci id or vdev name>),
(queue=<queue id>),
(rx-dev=<iface or pcap file> |
tx-dev=<iface or pcap file>),
[ring-size=<ring size>],
[mbuf-size=<mbuf data size>],
[total-num-mbufs=<number of mbufs>]'
Parameters inside the parentheses are mandatory.
Parameters inside the square brackets are optional.
The user has to pass the packet capture parameters under the --pdump parameter; multiple
--pdump instances can be passed to capture packets on different port and queue combinations.
Operation:
----------
*The tool parses the user command line arguments and
creates the mempool, the ring and the PCAP PMD vdev with 'tx_stream' as either
of the devices passed in the rx-dev|tx-dev parameters.
*It then calls the librte_pdump APIs, i.e. rte_pdump_enable()/rte_pdump_enable_by_deviceid(),
to enable packet capturing on a specific port/device_id and queue by passing on the
port|device_id, queue, mempool and ring info.
*The tool runs in a while loop to dequeue the packets from the ring and write them to the pcap device.
*The tool can be stopped using SIGINT, upon which it calls
rte_pdump_disable()/rte_pdump_disable_by_deviceid() and frees the allocated resources.
Note:
The CONFIG_RTE_LIBRTE_PMD_PCAP flag should be set to yes to compile and run the pdump tool.
3)Test-pmd changes
==================
Changes are made to the test-pmd application to initialize/uninitialize the packet capture framework,
so the app/pdump tool can be run to see packets of the dpdk ports that are used by test-pmd.
Similarly, any application that needs packet capture should call the initialize/uninitialize APIs of
librte_pdump and use the pdump tool to start the capture.
4)Packet capture flow between pdump tool and librte_pdump
=========================================================
*The pdump tool (secondary process) requests packet capture
for specific port|device_id and queue combinations.
*The library, in the secondary process context, creates a client socket and communicates
the port|device_id, queue, ring and mempool to the server.
*The library initializes the server in the primary process 'test-pmd' context, and the server serves
the client request to enable Ethernet rxtx callbacks for a given port|device_id and queue.
*The callbacks copy the rx/tx packets to the passed mempool and enqueue them to the ring for the secondary process.
*The pdump tool dequeues the packets from the ring and writes them to the PCAP PMD vdev,
so ultimately the packets are seen on the device that is passed in rx-dev|tx-dev.
*Once the pdump tool is terminated with SIGINT, it disables the packet capturing.
*The library receives the disable packet capture request and communicates the info to the server;
the server then removes the Ethernet rxtx callbacks.
*The packet capture can be seen using the tcpdump command:
"tcpdump -ni <iface>" (or) "tcpdump -nr <pcapfile>"
5)Example command line
======================
./build/app/dpdk_pdump -- --pdump 'device_id=0000:02:0.0,queue=*,tx-dev=/tmp/dt-file.pcap,rx-dev=/tmp/dr-file.pcap,ring-size=8192,mbuf-size=2176,total-num-mbufs=32768' --pdump 'device_id=0000:01:00.0,queue=*,rx-dev=/tmp/d-file.pcap,tx-dev=/tmp/d-file.pcap,ring-size=16384,mbuf-size=2176,total-num-mbufs=32768'
v10:
fixed commit messages description.
fixed compilation issue when CONFIG_RTE_LIBRTE_PDUMP is disabled.
removed wrong config option CONFIG_RTE_EXEC_ENV_LINUXAPP inside app/Makefile
for pdump tool.
moved document changes to appropriate patches.
v9:
added a support in rte_pdump_set_socket_dir() to set server and client socket paths
==> http://dpdk.org/dev/patchwork/patch/13450/
updated the documentation for the new changes.
updated the commit messages.
v8:
added server socket argument to rte_pdump_init() API ==> http://dpdk.org/dev/patchwork/patch/13402/
added rte_pdump_set_socket_dir() API.
updated documentation for new changes.
v7:
fixed lines over 90 characters.
v6:
removed below deprecation notice patch from patch set.
http://dpdk.org/dev/patchwork/patch/13372/
v5:
addressed code review comments for below patches
http://dpdk.org/dev/patchwork/patch/12955/
http://dpdk.org/dev/patchwork/patch/12951/
v4:
added missing deprecation notice for ABI changes of rte_eth_dev_info structure.
made doc changes as per doc guidelines.
replaced rte_eal_vdev_init with rte_eth_dev_attach in pdump tool.
removed rxtx-dev parameter from pdump tool command line.
v3:
app/pdump: Moved cleanup code from signal handler to main.
divided librte_ether changes into multiple patches.
example command changed in app/pdump application guide
v2:
fix compilation issues for 4.8.3
fix unnecessary #includes
Reshma Pattan (7):
ethdev: use locks to protect Rx/Tx callback lists
ethdev: add new api to add Rx callback as head of the list
ethdev: add new fields to ethdev info struct
ethdev: make get port by name and get name by port public
pdump: add new library for packet capturing support
app/pdump: add pdump tool for packet capturing
app/testpmd: add pdump initialization uninitialization
MAINTAINERS | 7 +
app/Makefile | 1 +
app/pdump/Makefile | 49 ++
app/pdump/main.c | 844 +++++++++++++++++++++++++++++
app/test-pmd/testpmd.c | 12 +
config/common_base | 5 +
doc/guides/prog_guide/index.rst | 1 +
doc/guides/prog_guide/pdump_library.rst | 123 +++++
doc/guides/rel_notes/release_16_07.rst | 14 +
doc/guides/sample_app_ug/index.rst | 1 +
doc/guides/sample_app_ug/pdump.rst | 122 +++++
lib/Makefile | 1 +
lib/librte_ether/rte_ethdev.c | 123 +++--
lib/librte_ether/rte_ethdev.h | 60 +++
lib/librte_ether/rte_ether_version.map | 9 +
lib/librte_pdump/Makefile | 55 ++
lib/librte_pdump/rte_pdump.c | 913 ++++++++++++++++++++++++++++++++
lib/librte_pdump/rte_pdump.h | 216 ++++++++
lib/librte_pdump/rte_pdump_version.map | 13 +
mk/rte.app.mk | 1 +
20 files changed, 2526 insertions(+), 44 deletions(-)
create mode 100644 app/pdump/Makefile
create mode 100644 app/pdump/main.c
create mode 100644 doc/guides/prog_guide/pdump_library.rst
create mode 100644 doc/guides/sample_app_ug/pdump.rst
create mode 100644 lib/librte_pdump/Makefile
create mode 100644 lib/librte_pdump/rte_pdump.c
create mode 100644 lib/librte_pdump/rte_pdump.h
create mode 100644 lib/librte_pdump/rte_pdump_version.map
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
--
2.5.0
^ permalink raw reply [relevance 2%]
* [dpdk-dev] [PATCH v10 3/7] ethdev: add new fields to ethdev info struct
2016-06-15 14:06 2% ` [dpdk-dev] [PATCH v10 0/7] add " Reshma Pattan
@ 2016-06-15 14:06 10% ` Reshma Pattan
2016-06-16 19:14 4% ` Thomas Monjalon
2016-06-15 14:06 4% ` [dpdk-dev] [PATCH v10 4/7] ethdev: make get port by name and get name by port public Reshma Pattan
1 sibling, 1 reply; 200+ results
From: Reshma Pattan @ 2016-06-15 14:06 UTC (permalink / raw)
To: dev; +Cc: Reshma Pattan
The new fields nb_rx_queues and nb_tx_queues are added to the
rte_eth_dev_info structure.
The API rte_eth_dev_info_get() is changed to fill these new fields
in the rte_eth_dev_info object.
The release notes are updated with the changes.
The librte_pdump library needs to register Rx and Tx callbacks for all of
nb_rx_queues and nb_tx_queues when an application wants to capture
packets on all the software-configured Rx and Tx queues of the
device. So far there has been no way to get the nb_rx_queues and nb_tx_queues
information from the ethdev library; hence these changes are introduced.
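A sketch of how a library can use the new fields to cover every
configured queue; the callback here is a do-nothing placeholder, and
the registration loop is assumed to run in the library's setup path:
/* placeholder rx callback: passes packets through unchanged */
static uint16_t
dump_rx_cb(uint8_t port, uint16_t queue, struct rte_mbuf *pkts[],
	uint16_t nb_pkts, uint16_t max_pkts, void *user)
{
	(void)port; (void)queue; (void)max_pkts; (void)user;
	return nb_pkts;
}
/* in the setup path: */
uint8_t port_id = 0; /* assumed: a configured port */
struct rte_eth_dev_info dev_info;
uint16_t q;
rte_eth_dev_info_get(port_id, &dev_info);
for (q = 0; q < dev_info.nb_rx_queues; q++)
	rte_eth_add_rx_callback(port_id, q, dump_rx_cb, NULL);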
Signed-off-by: Reshma Pattan <reshma.pattan@intel.com>
---
doc/guides/rel_notes/release_16_07.rst | 6 ++++++
lib/librte_ether/rte_ethdev.c | 2 ++
lib/librte_ether/rte_ethdev.h | 3 +++
lib/librte_ether/rte_ether_version.map | 1 +
4 files changed, 12 insertions(+)
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index c0f6b02..004ecee 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -135,6 +135,9 @@ API Changes
ibadcrc, ibadlen, imcasts, fdirmatch, fdirmiss,
tx_pause_xon, rx_pause_xon, tx_pause_xoff, rx_pause_xoff.
+* Function ``rte_eth_dev_info_get`` updated to return new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ in the ``rte_eth_dev_info`` object.
+
ABI Changes
-----------
@@ -146,6 +149,9 @@ ABI Changes
* The ``rte_port_source_params`` structure has new fields to support PCAP file.
It was already in release 16.04 with ``RTE_NEXT_ABI`` flag.
+* The ``rte_eth_dev_info`` structure has new fields ``nb_rx_queues`` and ``nb_tx_queues``
+ to support number of queues configured by software.
+
Shared Library Versions
-----------------------
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index 97d167e..1f634c9 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -1661,6 +1661,8 @@ rte_eth_dev_info_get(uint8_t port_id, struct rte_eth_dev_info *dev_info)
(*dev->dev_ops->dev_infos_get)(dev, dev_info);
dev_info->pci_dev = dev->pci_dev;
dev_info->driver_name = dev->data->drv_name;
+ dev_info->nb_rx_queues = dev->data->nb_rx_queues;
+ dev_info->nb_tx_queues = dev->data->nb_tx_queues;
}
int
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 237e6ef..8ad7c01 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -882,6 +882,9 @@ struct rte_eth_dev_info {
struct rte_eth_desc_lim rx_desc_lim; /**< RX descriptors limits */
struct rte_eth_desc_lim tx_desc_lim; /**< TX descriptors limits */
uint32_t speed_capa; /**< Supported speeds bitmap (ETH_LINK_SPEED_). */
+ /** Configured number of rx/tx queues */
+ uint16_t nb_rx_queues; /**< Number of RX queues. */
+ uint16_t nb_tx_queues; /**< Number of TX queues. */
};
/**
diff --git a/lib/librte_ether/rte_ether_version.map b/lib/librte_ether/rte_ether_version.map
index c990b04..d06d648 100644
--- a/lib/librte_ether/rte_ether_version.map
+++ b/lib/librte_ether/rte_ether_version.map
@@ -137,4 +137,5 @@ DPDK_16.07 {
global:
rte_eth_add_first_rx_callback;
+ rte_eth_dev_info_get;
} DPDK_16.04;
--
2.5.0
^ permalink raw reply [relevance 10%]
* Re: [dpdk-dev] [PATCH v2 5/7] eal/linux: mmap ioports on ppc64
2016-05-30 8:45 0% ` Olivier Matz
@ 2016-06-15 16:13 3% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-06-15 16:13 UTC (permalink / raw)
To: Olivier Matz, Yuanhan Liu
Cc: David Marchand, dev, Chao Zhu, Xie, Huawei, Panu Matilainen
2016-05-30 10:45, Olivier Matz:
> On 05/24/2016 07:15 AM, Yuanhan Liu wrote:
> > On Mon, May 23, 2016 at 03:40:58PM +0200, Olivier Matz wrote:
> >> For reference, here is the report of the ABI checker for EAL:
> >>
> >> [−] struct rte_pci_ioport (2)
> >>
> >> 1 Field len has been added to this type.
> >> 1) This field will not be initialized by old clients.
> >> 2) Size of the inclusive type has been changed.
> >> NOTE: this field should be accessed only from the new library
> >> functions, otherwise it may result in crash or incorrect behavior
> >> of applications.
> >> 2 Size of this type has been changed from 16 bytes to 24 bytes.
> >> The fields or parameters of such data type may be incorrectly
> >> initialized or accessed by old client applications.
> >>
> >> [−] affected symbols (4)
> >> rte_eal_pci_ioport_map ( struct rte_pci_device* dev, int bar,
> >> struct rte_pci_ioport* p ) @@ DPDK_16.04
> >> 3rd parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
> >> rte_eal_pci_ioport_read ( struct rte_pci_ioport* p, void* data,
> >> size_t len, off_t offset ) @@ DPDK_16.04
> >> 1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
> >> rte_eal_pci_ioport_unmap ( struct rte_pci_ioport* p ) @@ DPDK_16.04
> >> 1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
> >> rte_eal_pci_ioport_write ( struct rte_pci_ioport* p, void const* data,
> >> size_t len, off_t offset ) @@ DPDK_16.04
> >> 1st parameter 'p' (pointer) has base type 'struct rte_pci_ioport'.
> >>
> >>
> >> My understanding of the comment for this structure is that it's
> >> internal to EAL:
> >
> > I'm not quite sure that is enough. Cc'ed Panu, the guru on ABI stuff,
> > hopefully he could shed some light on it.
> >
> >> /**
> >> * A structure used to access io resources for a pci device.
> >> * rte_pci_ioport is arch, os, driver specific, and should not be used
> >> outside
> >> * of pci ioport api.
> >> */
> >> struct rte_pci_ioport {
> >> ...
> >> }
> >>
> >> So I'd say it's ok to have it integrated for 16.07.
> >
> > I'll let Thomas to decide it :)
>
> Panu or Thomas, do you have any comment on this?
The user of this struct is virtio.
The ABI policy does not apply to drivers:
- the 'A' means Application
- external drivers must be rebuilt for each new release
Thus there is no problem here.
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v13 0/3] mempool: add external mempool manager
2016-06-15 7:47 3% ` [dpdk-dev] [PATCH v12 0/3] mempool: add external mempool manager David Hunt
2016-06-15 7:47 1% ` [dpdk-dev] [PATCH v12 1/3] mempool: support external mempool operations David Hunt
2016-06-15 10:13 0% ` [dpdk-dev] [PATCH v12 0/3] mempool: add external mempool manager Jan Viktorin
@ 2016-06-16 12:30 3% ` David Hunt
2016-06-16 12:30 1% ` [dpdk-dev] [PATCH v13 1/3] mempool: support external mempool operations David Hunt
2016-06-17 13:53 3% ` [dpdk-dev] [PATCH v14 0/3] mempool: add mempool handler feature David Hunt
2 siblings, 2 replies; 200+ results
From: David Hunt @ 2016-06-16 12:30 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain
Here's the latest version of the External Mempool Manager patchset.
It's re-based on top of the latest head as of 15/6/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v13 changes:
* Added in extra opaque data (pool_config) to mempool struct for mempool
configuration by the ops functions. For example, this can be used to pass
device names or device flags to the underlying alloc function.
* Added mempool_config param to rte_mempool_set_ops_byname()
v12 changes:
* Fixed a comment (function param h -> ops)
* fixed a typo (callbacki)
v11 changes:
* Fixed comments (added '.' where needed for consistency)
* removed ABI breakage notice for mempool manager in deprecation.rst
* Added description of the external mempool manager functionality to
doc/guides/prog_guide/mempool_lib.rst (John Mc reviewed)
* renamed rte_mempool_default.c to rte_mempool_ring.c
v10 changes:
* changed the _put/_get op names to _enqueue/_dequeue to be consistent
with the function names
* some rte_errno cleanup
* comment tweaks about when to set pool_data
* removed an un-needed check for ops->alloc == NULL
v9 changes:
* added a check for NULL alloc in rte_mempool_ops_register
* rte_mempool_alloc_t now returns int instead of void*
* fixed some comment typo's
* removed some unneeded typecasts
* changed a return NULL to return -EEXIST in rte_mempool_ops_register
* fixed rte_mempool_version.map file so builds ok as shared libs
* moved flags check from rte_mempool_create_empty to rte_mempool_create
v8 changes:
* merged first three patches in the series into one.
* changed parameters to ops callbacks to all be an rte_mempool pointer
rather than a pointer to opaque data or a uint64.
* comment fixes.
* fixed parameter to _free function (was inconsistent).
* changed MEMPOOL_F_RING_CREATED to MEMPOOL_F_POOL_CREATED
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed hander_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_hander.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments, duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies to have one header file (rte_mempool.h), or it
would have generate cross dependencies issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file than standard mempool tests,
avoiding to duplicate the code
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degredation on sp cached operation
* removed stack hanler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The External Mempool Manager is an extension to the mempool API that lets
users add and use an external mempool manager, so that external memory
subsystems such as external hardware memory management systems and
software-based memory allocators can be used with DPDK.
The existing API to the internal DPDK mempool manager will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing. These changes are all contained within
RTE_NEXT_ABI defs, and the current or next code can be selected with
the CONFIG_RTE_NEXT_ABI config setting.
There are two aspects to the external mempool manager.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the REGISTER_MEMPOOL_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool
using the name parameter to identify which ops to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname() which sets the mempool's ops (functions)
3. An rte_mempool_populate_default() and rte_mempool_populate_anon() functions
which populates the mempool using the relevant ops
Several external mempool managers may be used in the same application. A new
mempool can then be created by using the new rte_mempool_create_empty function,
then calling rte_mempool_set_ops_byname to point the mempool to the relevant
mempool manager callback structure.
Legacy applications can continue to use the old rte_mempool_create API call,
which uses a ring-based mempool manager by default. Such applications
need to be modified only if they want to use a new external mempool manager.
The external mempool manager needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. enqueue - puts an object back into the mempool once an application has
finished with it
3. dequeue - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time an enqueue/dequeue/get_count is called from the application/PMD,
the callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
void *pool_config);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the ops index.
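Putting those calls together, creating a mempool with explicitly
chosen ops looks roughly like the sketch below; the sizing values are
arbitrary and "ring_mp_mc" is assumed to be one of the default
ring-based ops names:
struct rte_mempool *mp;
mp = rte_mempool_create_empty("my_pool", 8192, 2048, 256, 0,
	rte_socket_id(), 0);
if (mp == NULL)
	rte_exit(EXIT_FAILURE, "cannot create empty mempool\n");
/* pool_config is the new opaque argument, unused by the ring ops */
if (rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL) != 0)
	rte_exit(EXIT_FAILURE, "cannot set mempool ops\n");
if (rte_mempool_populate_default(mp) < 0)
	rte_exit(EXIT_FAILURE, "cannot populate mempool\n");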
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = rte_mempool_common_ring_alloc,
.enqueue = common_ring_sp_enqueue,
.dequeue = common_ring_mc_dequeue,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures
REGISTER_MEMPOOL_OPS(ops_mp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool manager using simple mallocs
for each mempool object. This file also contains the callbacks and self
registration for the new handler.
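In outline, a custom handler registers itself as below; the callback
prototypes are inferred from this cover letter (alloc returning int
per the v9 notes) and should be checked against rte_mempool.h:
/* callback stubs for a hypothetical custom handler */
static int custom_alloc(struct rte_mempool *mp);
static void custom_free(struct rte_mempool *mp);
static int custom_enqueue(struct rte_mempool *mp,
	void * const *obj_table, unsigned n);
static int custom_dequeue(struct rte_mempool *mp,
	void **obj_table, unsigned n);
static unsigned custom_get_count(const struct rte_mempool *mp);
static const struct rte_mempool_ops ops_custom = {
	.name = "custom_handler",
	.alloc = custom_alloc,
	.free = custom_free,
	.enqueue = custom_enqueue,
	.dequeue = custom_dequeue,
	.get_count = custom_get_count,
};
REGISTER_MEMPOOL_OPS(ops_custom);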
David Hunt (2):
mempool: support external mempool operations
mbuf: make default mempool ops configurable at build
Olivier Matz (1):
app/test: test external mempool handler
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v13 1/3] mempool: support external mempool operations
2016-06-16 12:30 3% ` [dpdk-dev] [PATCH v13 " David Hunt
@ 2016-06-16 12:30 1% ` David Hunt
2016-06-17 13:53 3% ` [dpdk-dev] [PATCH v14 0/3] mempool: add mempool handler feature David Hunt
1 sibling, 0 replies; 200+ results
From: David Hunt @ 2016-06-16 12:30 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain, David Hunt
Until now, the objects stored in a mempool were internally stored in a
ring. This patch introduces the possibility to register external handlers
replacing the ring.
The default behavior remains unchanged, but calling the new function
rte_mempool_set_ops_byname() right after rte_mempool_create_empty() allows
the user to change the handler that will be used when populating
the mempool.
This patch also adds a set of default ops (function callbacks) based
on rte_ring.
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: David Hunt <david.hunt@intel.com>
Acked-by: Shreyansh Jain <shreyansh.jain@nxp.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
app/test/test_mempool_perf.c | 1 -
doc/guides/prog_guide/mempool_lib.rst | 31 +++-
doc/guides/rel_notes/deprecation.rst | 9 -
lib/librte_mempool/Makefile | 2 +
lib/librte_mempool/rte_mempool.c | 66 +++-----
lib/librte_mempool/rte_mempool.h | 253 ++++++++++++++++++++++++++---
lib/librte_mempool/rte_mempool_ops.c | 150 +++++++++++++++++
lib/librte_mempool/rte_mempool_ring.c | 161 ++++++++++++++++++
lib/librte_mempool/rte_mempool_version.map | 13 +-
9 files changed, 605 insertions(+), 81 deletions(-)
create mode 100644 lib/librte_mempool/rte_mempool_ops.c
create mode 100644 lib/librte_mempool/rte_mempool_ring.c
diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
index c5e3576..c5f8455 100644
--- a/app/test/test_mempool_perf.c
+++ b/app/test/test_mempool_perf.c
@@ -161,7 +161,6 @@ per_lcore_mempool_test(__attribute__((unused)) void *arg)
n_get_bulk);
if (unlikely(ret < 0)) {
rte_mempool_dump(stdout, mp);
- rte_ring_dump(stdout, mp->ring);
/* in this case, objects are lost... */
return -1;
}
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index c3afc2e..2e3116e 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -34,7 +34,7 @@ Mempool Library
===============
A memory pool is an allocator of a fixed-sized object.
-In the DPDK, it is identified by name and uses a ring to store free objects.
+In the DPDK, it is identified by name and uses a ring or an external mempool manager to store free objects.
It provides some other optional services such as a per-core object cache and
an alignment helper to ensure that objects are padded to spread them equally on all DRAM or DDR3 channels.
@@ -127,6 +127,35 @@ The maximum size of the cache is static and is defined at compilation time (CONF
A mempool in Memory with its Associated Ring
+External Mempool Manager
+------------------------
+
+This allows external memory subsystems, such as external hardware memory
+management systems and software based memory allocators, to be used with DPDK.
+
+There are two aspects to external mempool manager.
+
+* Adding the code for your new mempool operations (ops). This is achieved by
+ adding a new mempool ops code, and using the ``REGISTER_MEMPOOL_OPS`` macro.
+
+* Using the new API to call ``rte_mempool_create_empty()`` and
+ ``rte_mempool_set_ops_byname()`` to create a new mempool and specifying which
+ ops to use.
+
+Several external mempool managers may be used in the same application. A new
+mempool can be created by using the ``rte_mempool_create_empty()`` function,
+then using ``rte_mempool_set_ops_byname()`` to point the mempool to the
+relevant mempool manager callback (ops) structure.
+
+Legacy applications may continue to use the old ``rte_mempool_create()`` API
+call, which uses a ring based mempool manager by default. These applications
+will need to be modified if they are to use a new external mempool manager.
+
+For applications that use ``rte_pktmbuf_pool_create()``, there is a config setting
+(``RTE_MBUF_DEFAULT_MEMPOOL_OPS``) that allows the application to make use of
+an external mempool manager.
+
+
Use Cases
---------
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 7d947ae..c415095 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -39,15 +39,6 @@ Deprecation Notices
compact API. The ones that remain are backwards compatible and use the
per-lcore default cache if available. This change targets release 16.07.
-* The rte_mempool struct will be changed in 16.07 to facilitate the new
- external mempool manager functionality.
- The ring element will be replaced with a more generic 'pool' opaque pointer
- to allow new mempool handlers to use their own user-defined mempool
- layout. Also newly added to rte_mempool is a handler index.
- The existing API will be backward compatible, but there will be new API
- functions added to facilitate the creation of mempools using an external
- handler. The 16.07 release will contain these changes.
-
* A librte_vhost public structures refactor is planned for DPDK 16.07
that requires both ABI and API change.
The proposed refactor would expose DPDK vhost dev to applications as
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 43423e0..a4c089e 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 2
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ops.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ring.c
# install includes
SYMLINK-$(CONFIG_RTE_LIBRTE_MEMPOOL)-include := rte_mempool.h
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 22a5645..0fb84ad 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -148,7 +148,7 @@ mempool_add_elem(struct rte_mempool *mp, void *obj, phys_addr_t physaddr)
#endif
/* enqueue in ring */
- rte_ring_sp_enqueue(mp->ring, obj);
+ rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
}
/* call obj_cb() for each mempool element */
@@ -303,40 +303,6 @@ rte_mempool_xmem_usage(__rte_unused void *vaddr, uint32_t elt_num,
return (size_t)paddr_idx << pg_shift;
}
-/* create the internal ring */
-static int
-rte_mempool_ring_create(struct rte_mempool *mp)
-{
- int rg_flags = 0, ret;
- char rg_name[RTE_RING_NAMESIZE];
- struct rte_ring *r;
-
- ret = snprintf(rg_name, sizeof(rg_name),
- RTE_MEMPOOL_MZ_FORMAT, mp->name);
- if (ret < 0 || ret >= (int)sizeof(rg_name))
- return -ENAMETOOLONG;
-
- /* ring flags */
- if (mp->flags & MEMPOOL_F_SP_PUT)
- rg_flags |= RING_F_SP_ENQ;
- if (mp->flags & MEMPOOL_F_SC_GET)
- rg_flags |= RING_F_SC_DEQ;
-
- /* Allocate the ring that will be used to store objects.
- * Ring functions will return appropriate errors if we are
- * running as a secondary process etc., so no checks made
- * in this function for that condition.
- */
- r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
- mp->socket_id, rg_flags);
- if (r == NULL)
- return -rte_errno;
-
- mp->ring = r;
- mp->flags |= MEMPOOL_F_RING_CREATED;
- return 0;
-}
-
/* free a memchunk allocated with rte_memzone_reserve() */
static void
rte_mempool_memchunk_mz_free(__rte_unused struct rte_mempool_memhdr *memhdr,
@@ -354,7 +320,7 @@ rte_mempool_free_memchunks(struct rte_mempool *mp)
void *elt;
while (!STAILQ_EMPTY(&mp->elt_list)) {
- rte_ring_sc_dequeue(mp->ring, &elt);
+ rte_mempool_ops_dequeue_bulk(mp, &elt, 1);
(void)elt;
STAILQ_REMOVE_HEAD(&mp->elt_list, next);
mp->populated_size--;
@@ -386,9 +352,9 @@ rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr,
int ret;
/* create the internal ring if not already done */
- if ((mp->flags & MEMPOOL_F_RING_CREATED) == 0) {
- ret = rte_mempool_ring_create(mp);
- if (ret < 0)
+ if ((mp->flags & MEMPOOL_F_POOL_CREATED) == 0) {
+ ret = rte_mempool_ops_alloc(mp);
+ if (ret != 0)
return ret;
}
@@ -703,7 +669,7 @@ rte_mempool_free(struct rte_mempool *mp)
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
rte_mempool_free_memchunks(mp);
- rte_ring_free(mp->ring);
+ rte_mempool_ops_free(mp);
rte_memzone_free(mp->mz);
}
@@ -815,6 +781,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
RTE_PTR_ADD(mp, MEMPOOL_HEADER_SIZE(mp, 0));
te->data = mp;
+
rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
TAILQ_INSERT_TAIL(mempool_list, te, next);
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
@@ -844,6 +811,19 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
if (mp == NULL)
return NULL;
+ /*
+ * Since we have 4 combinations of the SP/SC/MP/MC flags, examine them to
+ * set the correct index into the table of ops structs.
+ */
+ if (flags & (MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET))
+ rte_mempool_set_ops_byname(mp, "ring_sp_sc", NULL);
+ else if (flags & MEMPOOL_F_SP_PUT)
+ rte_mempool_set_ops_byname(mp, "ring_sp_mc", NULL);
+ else if (flags & MEMPOOL_F_SC_GET)
+ rte_mempool_set_ops_byname(mp, "ring_mp_sc", NULL);
+ else
+ rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL);
+
/* call the mempool priv initializer */
if (mp_init)
mp_init(mp, mp_init_arg);
@@ -930,7 +910,7 @@ rte_mempool_count(const struct rte_mempool *mp)
unsigned count;
unsigned lcore_id;
- count = rte_ring_count(mp->ring);
+ count = rte_mempool_ops_get_count(mp);
if (mp->cache_size == 0)
return count;
@@ -1119,7 +1099,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
fprintf(f, "mempool <%s>@%p\n", mp->name, mp);
fprintf(f, " flags=%x\n", mp->flags);
- fprintf(f, " ring=<%s>@%p\n", mp->ring->name, mp->ring);
+ fprintf(f, " pool=%p\n", mp->pool_data);
fprintf(f, " phys_addr=0x%" PRIx64 "\n", mp->mz->phys_addr);
fprintf(f, " nb_mem_chunks=%u\n", mp->nb_mem_chunks);
fprintf(f, " size=%"PRIu32"\n", mp->size);
@@ -1140,7 +1120,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
}
cache_count = rte_mempool_dump_cache(f, mp);
- common_count = rte_ring_count(mp->ring);
+ common_count = rte_mempool_ops_get_count(mp);
if ((cache_count + common_count) > mp->size)
common_count = mp->size - cache_count;
fprintf(f, " common_pool_count=%u\n", common_count);
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 60339bd..a763fb5 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -67,6 +67,7 @@
#include <inttypes.h>
#include <sys/queue.h>
+#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_lcore.h>
@@ -203,10 +204,14 @@ struct rte_mempool_memhdr {
*/
struct rte_mempool {
char name[RTE_MEMPOOL_NAMESIZE]; /**< Name of mempool. */
- struct rte_ring *ring; /**< Ring to store objects. */
- const struct rte_memzone *mz; /**< Memzone where pool is allocated */
+ union {
+ void *pool_data; /**< Ring or pool to store objects. */
+ uint64_t pool_id; /**< External mempool identifier. */
+ };
+ void *pool_config; /**< optional args for ops alloc. */
+ const struct rte_memzone *mz; /**< Memzone where pool is alloc'd. */
int flags; /**< Flags of the mempool. */
- int socket_id; /**< Socket id passed at mempool creation. */
+ int socket_id; /**< Socket id passed at create. */
uint32_t size; /**< Max size of the mempool. */
uint32_t cache_size; /**< Size of per-lcore local cache. */
uint32_t cache_flushthresh;
@@ -217,6 +222,14 @@ struct rte_mempool {
uint32_t trailer_size; /**< Size of trailer (after elt). */
unsigned private_data_size; /**< Size of private data. */
+ /**
+ * Index into rte_mempool_ops_table array of mempool ops
+ * structs, which contain callback function pointers.
+ * We're using an index here rather than pointers to the callbacks
+ * to facilitate any secondary processes that may want to use
+ * this mempool.
+ */
+ int32_t ops_index;
struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
@@ -235,7 +248,7 @@ struct rte_mempool {
#define MEMPOOL_F_NO_CACHE_ALIGN 0x0002 /**< Do not align objs on cache lines.*/
#define MEMPOOL_F_SP_PUT 0x0004 /**< Default put is "single-producer".*/
#define MEMPOOL_F_SC_GET 0x0008 /**< Default get is "single-consumer".*/
-#define MEMPOOL_F_RING_CREATED 0x0010 /**< Internal: ring is created */
+#define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */
#define MEMPOOL_F_NO_PHYS_CONTIG 0x0020 /**< Don't need physically contiguous objs. */
/**
@@ -325,6 +338,213 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
#define __mempool_check_cookies(mp, obj_table_const, n, free) do {} while(0)
#endif /* RTE_LIBRTE_MEMPOOL_DEBUG */
+#define RTE_MEMPOOL_OPS_NAMESIZE 32 /**< Max length of ops struct name. */
+
+/**
+ * Prototype for implementation specific data provisioning function.
+ *
+ * The function should provide the implementation specific memory
+ * for use by the other mempool ops functions in a given mempool ops struct.
+ * E.g. the default ops provides an instance of the rte_ring for this purpose.
+ * Other handlers will most likely point to a different type of data
+ * structure, which will be transparent to the application programmer.
+ * This function should set mp->pool_data.
+ */
+typedef int (*rte_mempool_alloc_t)(struct rte_mempool *mp);
+
+/**
+ * Free the opaque private data pointed to by mp->pool_data pointer.
+ */
+typedef void (*rte_mempool_free_t)(struct rte_mempool *mp);
+
+/**
+ * Enqueue an object into the external pool.
+ */
+typedef int (*rte_mempool_enqueue_t)(struct rte_mempool *mp,
+ void * const *obj_table, unsigned int n);
+
+/**
+ * Dequeue an object from the external pool.
+ */
+typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
+ void **obj_table, unsigned int n);
+
+/**
+ * Return the number of available objects in the external pool.
+ */
+typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
+
+/** Structure defining mempool operations structure */
+struct rte_mempool_ops {
+ char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
+ rte_mempool_alloc_t alloc; /**< Allocate private data. */
+ rte_mempool_free_t free; /**< Free the external pool. */
+ rte_mempool_enqueue_t enqueue; /**< Enqueue an object. */
+ rte_mempool_dequeue_t dequeue; /**< Dequeue an object. */
+ rte_mempool_get_count get_count; /**< Get qty of available objs. */
+} __rte_cache_aligned;
+
+#define RTE_MEMPOOL_MAX_OPS_IDX 16 /**< Max registered ops structs */
+
+/**
+ * Structure storing the table of registered ops structs, each of which contain
+ * the function pointers for the mempool ops functions.
+ * Each process has its own storage for this ops struct array so that
+ * the mempools can be shared across primary and secondary processes.
+ * The indices used to access the array are valid across processes, whereas
+ * any function pointers stored directly in the mempool struct would not be.
+ * This results in us simply having "ops_index" in the mempool struct.
+ */
+struct rte_mempool_ops_table {
+ rte_spinlock_t sl; /**< Spinlock for add/delete. */
+ uint32_t num_ops; /**< Number of used ops structs in the table. */
+ /**
+ * Storage for all possible ops structs.
+ */
+ struct rte_mempool_ops ops[RTE_MEMPOOL_MAX_OPS_IDX];
+} __rte_cache_aligned;
+
+/** Array of registered ops structs. */
+extern struct rte_mempool_ops_table rte_mempool_ops_table;
+
+/**
+ * @internal Get the mempool ops struct from its index.
+ *
+ * @param ops_index
+ * The index of the ops struct in the ops struct table. It must be a valid
+ * index: (0 <= idx < num_ops).
+ * @return
+ * The pointer to the ops struct in the table.
+ */
+static inline struct rte_mempool_ops *
+rte_mempool_ops_get(int ops_index)
+{
+ RTE_VERIFY(ops_index < RTE_MEMPOOL_MAX_OPS_IDX);
+
+ return &rte_mempool_ops_table.ops[ops_index];
+}
+
+/**
+ * @internal Wrapper for mempool_ops alloc callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * - 0: Success; successfully allocated mempool pool_data.
+ * - <0: Error; code of alloc function.
+ */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp);
+
+/**
+ * @internal Wrapper for mempool_ops get callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to get.
+ * @return
+ * - 0: Success; got n objects.
+ * - <0: Error; code of get function.
+ */
+static inline int
+rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp,
+ void **obj_table, unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->dequeue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops put callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to put.
+ * @return
+ * - 0: Success; n objects supplied.
+ * - <0: Error; code of put function.
+ */
+static inline int
+rte_mempool_ops_enqueue_bulk(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->enqueue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops get_count callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * The number of available objects in the external pool.
+ */
+unsigned
+rte_mempool_ops_get_count(const struct rte_mempool *mp);
+
+/**
+ * @internal wrapper for mempool_ops free callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ */
+void
+rte_mempool_ops_free(struct rte_mempool *mp);
+
+/**
+ * Set the ops of a mempool.
+ *
+ * This can only be done on a mempool that is not populated, i.e. just after
+ * a call to rte_mempool_create_empty().
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param name
+ * Name of the ops structure to use for this mempool.
+ * @return
+ * - 0: Success; the mempool is now using the requested ops functions.
+ * - -EINVAL - Invalid ops struct name provided.
+ * - -EEXIST - mempool already has an ops struct assigned.
+ */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
+ void *pool_config);
+
+/**
+ * Register mempool operations.
+ *
+ * @param ops
+ * Pointer to an ops structure to register.
+ * @return
+ * - >=0: Success; return the index of the ops struct in the table.
+ * - -EINVAL - some missing callbacks while registering ops struct.
+ * - -ENOSPC - the maximum number of ops structs has been reached.
+ */
+int rte_mempool_ops_register(const struct rte_mempool_ops *ops);
+
+/**
+ * Macro to statically register the ops of an external mempool manager.
+ * Note that the rte_mempool_ops_register fails silently here when
+ * more than RTE_MEMPOOL_MAX_OPS_IDX ops structs are registered.
+ */
+#define MEMPOOL_REGISTER_OPS(ops) \
+ void mp_hdlr_init_##ops(void); \
+ void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\
+ { \
+ rte_mempool_ops_register(&ops); \
+ }
+
/**
* An object callback function for mempool.
*
@@ -774,7 +994,7 @@ __mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table,
cache->len += n;
if (cache->len >= flushthresh) {
- rte_ring_mp_enqueue_bulk(mp->ring, &cache->objs[cache_size],
+ rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache_size],
cache->len - cache_size);
cache->len = cache_size;
}
@@ -785,19 +1005,10 @@ ring_enqueue:
/* push remaining objects in ring */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
- if (is_mp) {
- if (rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
- else {
- if (rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
+ if (rte_mempool_ops_enqueue_bulk(mp, obj_table, n) < 0)
+ rte_panic("cannot put objects in mempool\n");
#else
- if (is_mp)
- rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n);
- else
- rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n);
+ rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
#endif
}
@@ -945,7 +1156,8 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
uint32_t req = n + (cache_size - cache->len);
/* How many do we require i.e. number to fill the cache + the request */
- ret = rte_ring_mc_dequeue_bulk(mp->ring, &cache->objs[cache->len], req);
+ ret = rte_mempool_ops_dequeue_bulk(mp,
+ &cache->objs[cache->len], req);
if (unlikely(ret < 0)) {
/*
* In the offchance that we are buffer constrained,
@@ -972,10 +1184,7 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
ring_dequeue:
/* get remaining objects from ring */
- if (is_mc)
- ret = rte_ring_mc_dequeue_bulk(mp->ring, obj_table, n);
- else
- ret = rte_ring_sc_dequeue_bulk(mp->ring, obj_table, n);
+ ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
if (ret < 0)
__MEMPOOL_STAT_ADD(mp, get_fail, n);
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
new file mode 100644
index 0000000..7977a14
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -0,0 +1,150 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 6WIND S.A.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_mempool.h>
+#include <rte_errno.h>
+
+/* indirect jump table to support external memory pools. */
+struct rte_mempool_ops_table rte_mempool_ops_table = {
+ .sl = RTE_SPINLOCK_INITIALIZER,
+ .num_ops = 0
+};
+
+/* add a new ops struct in rte_mempool_ops_table, return its index. */
+int
+rte_mempool_ops_register(const struct rte_mempool_ops *h)
+{
+ struct rte_mempool_ops *ops;
+ int16_t ops_index;
+
+ rte_spinlock_lock(&rte_mempool_ops_table.sl);
+
+ if (rte_mempool_ops_table.num_ops >=
+ RTE_MEMPOOL_MAX_OPS_IDX) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Maximum number of mempool ops structs exceeded\n");
+ return -ENOSPC;
+ }
+
+ if (h->alloc == NULL || h->enqueue == NULL ||
+ h->dequeue == NULL || h->get_count == NULL) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Missing callback while registering mempool ops\n");
+ return -EINVAL;
+ }
+
+ if (strlen(h->name) >= sizeof(ops->name) - 1) {
+ RTE_LOG(DEBUG, EAL, "%s(): mempool_ops <%s>: name too long\n",
+ __func__, h->name);
+ rte_errno = EEXIST;
+ return -EEXIST;
+ }
+
+ ops_index = rte_mempool_ops_table.num_ops++;
+ ops = &rte_mempool_ops_table.ops[ops_index];
+ snprintf(ops->name, sizeof(ops->name), "%s", h->name);
+ ops->alloc = h->alloc;
+ ops->enqueue = h->enqueue;
+ ops->dequeue = h->dequeue;
+ ops->get_count = h->get_count;
+
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+
+ return ops_index;
+}
+
+/* wrapper to allocate an external mempool's private (pool) data. */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->alloc(mp);
+}
+
+/* wrapper to free an external pool ops. */
+void
+rte_mempool_ops_free(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ if (ops->free == NULL)
+ return;
+ return ops->free(mp);
+}
+
+/* wrapper to get available objects in an external mempool. */
+unsigned int
+rte_mempool_ops_get_count(const struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->get_count(mp);
+}
+
+/* sets mempool ops previously registered by rte_mempool_ops_register. */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
+ void *pool_config)
+{
+ struct rte_mempool_ops *ops = NULL;
+ unsigned i;
+
+ /* too late, the mempool is already populated. */
+ if (mp->flags & MEMPOOL_F_POOL_CREATED)
+ return -EEXIST;
+
+ for (i = 0; i < rte_mempool_ops_table.num_ops; i++) {
+ if (!strcmp(name,
+ rte_mempool_ops_table.ops[i].name)) {
+ ops = &rte_mempool_ops_table.ops[i];
+ break;
+ }
+ }
+
+ if (ops == NULL)
+ return -EINVAL;
+
+ mp->ops_index = i;
+ mp->pool_config = pool_config;
+ return 0;
+}
diff --git a/lib/librte_mempool/rte_mempool_ring.c b/lib/librte_mempool/rte_mempool_ring.c
new file mode 100644
index 0000000..626786e
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ring.c
@@ -0,0 +1,161 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_errno.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+
+static int
+common_ring_mp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_mp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_sp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_mc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_mc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_sc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static unsigned
+common_ring_get_count(const struct rte_mempool *mp)
+{
+ return rte_ring_count(mp->pool_data);
+}
+
+
+static int
+common_ring_alloc(struct rte_mempool *mp)
+{
+ int rg_flags = 0, ret;
+ char rg_name[RTE_RING_NAMESIZE];
+ struct rte_ring *r;
+
+ ret = snprintf(rg_name, sizeof(rg_name),
+ RTE_MEMPOOL_MZ_FORMAT, mp->name);
+ if (ret < 0 || ret >= (int)sizeof(rg_name)) {
+ rte_errno = ENAMETOOLONG;
+ return -rte_errno;
+ }
+
+ /* ring flags */
+ if (mp->flags & MEMPOOL_F_SP_PUT)
+ rg_flags |= RING_F_SP_ENQ;
+ if (mp->flags & MEMPOOL_F_SC_GET)
+ rg_flags |= RING_F_SC_DEQ;
+
+ /*
+ * Allocate the ring that will be used to store objects.
+ * Ring functions will return appropriate errors if we are
+ * running as a secondary process etc., so no checks made
+ * in this function for that condition.
+ */
+ r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
+ mp->socket_id, rg_flags);
+ if (r == NULL)
+ return -rte_errno;
+
+ mp->pool_data = r;
+
+ return 0;
+}
+
+static void
+common_ring_free(struct rte_mempool *mp)
+{
+ rte_ring_free(mp->pool_data);
+}
+
+/*
+ * The following 4 declarations of mempool ops structs address
+ * the need for the backward compatible mempool managers for
+ * single/multi producers and single/multi consumers as dictated by the
+ * flags provided to the rte_mempool_create function
+ */
+static const struct rte_mempool_ops ops_mp_mc = {
+ .name = "ring_mp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_sc = {
+ .name = "ring_sp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_mp_sc = {
+ .name = "ring_mp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_mc = {
+ .name = "ring_sp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+MEMPOOL_REGISTER_OPS(ops_mp_mc);
+MEMPOOL_REGISTER_OPS(ops_sp_sc);
+MEMPOOL_REGISTER_OPS(ops_mp_sc);
+MEMPOOL_REGISTER_OPS(ops_sp_mc);
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index f63461b..6209ec2 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -20,15 +20,18 @@ DPDK_16.7 {
global:
rte_mempool_check_cookies;
- rte_mempool_obj_iter;
- rte_mempool_mem_iter;
rte_mempool_create_empty;
+ rte_mempool_free;
+ rte_mempool_mem_iter;
+ rte_mempool_obj_iter;
+ rte_mempool_ops_register;
+ rte_mempool_ops_table;
+ rte_mempool_populate_anon;
+ rte_mempool_populate_default;
rte_mempool_populate_phys;
rte_mempool_populate_phys_tab;
rte_mempool_populate_virt;
- rte_mempool_populate_default;
- rte_mempool_populate_anon;
- rte_mempool_free;
+ rte_mempool_set_ops_byname;
local: *;
} DPDK_2.0;
--
2.5.5
^ permalink raw reply [relevance 1%]
* [dpdk-dev] [PATCH v3 04/17] eal: remove duplicate function declaration
@ 2016-06-16 14:06 3% ` Shreyansh Jain
0 siblings, 0 replies; 200+ results
From: Shreyansh Jain @ 2016-06-16 14:06 UTC (permalink / raw)
To: dev; +Cc: viktorin, thomas.monjalon, David Marchand
From: David Marchand <david.marchand@6wind.com>
rte_eal_dev_init is declared in both eal_private.h and rte_dev.h since its
introduction.
This function has been exported in ABI, so remove it from eal_private.h
Fixes: e57f20e05177 ("eal: make vdev init path generic for both virtual and pci devices")
Signed-off-by: David Marchand <david.marchand@6wind.com>
---
lib/librte_eal/common/eal_private.h | 7 -------
lib/librte_eal/linuxapp/eal/eal.c | 1 +
2 files changed, 1 insertion(+), 7 deletions(-)
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 857dc3e..06a68f6 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -259,13 +259,6 @@ int rte_eal_intr_init(void);
int rte_eal_alarm_init(void);
/**
- * This function initialises any virtual devices
- *
- * This function is private to the EAL.
- */
-int rte_eal_dev_init(void);
-
-/**
* Function is to check if the kernel module(like, vfio, vfio_iommu_type1,
* etc.) loaded.
*
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index bba8fea..5ec3d4e 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -70,6 +70,7 @@
#include <rte_cpuflags.h>
#include <rte_interrupts.h>
#include <rte_pci.h>
+#include <rte_dev.h>
#include <rte_devargs.h>
#include <rte_common.h>
#include <rte_version.h>
--
2.7.4
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v10 3/7] ethdev: add new fields to ethdev info struct
2016-06-15 14:06 10% ` [dpdk-dev] [PATCH v10 3/7] ethdev: add new fields to ethdev info struct Reshma Pattan
@ 2016-06-16 19:14 4% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-06-16 19:14 UTC (permalink / raw)
To: Reshma Pattan; +Cc: dev
2016-06-15 15:06, Reshma Pattan:
> The new fields nb_rx_queues and nb_tx_queues are added to the
> rte_eth_dev_info structure.
> Changes to API rte_eth_dev_info_get() are done to update these new fields
> to the rte_eth_dev_info object.
The ABI is changed, not the API.
> Release notes is updated with the changes.
[...]
> --- a/lib/librte_ether/rte_ether_version.map
> +++ b/lib/librte_ether/rte_ether_version.map
> @@ -137,4 +137,5 @@ DPDK_16.07 {
> global:
>
> rte_eth_add_first_rx_callback;
> + rte_eth_dev_info_get;
> } DPDK_16.04;
Why duplicating this symbol in 16.07?
The ABI is broken anyway.
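For background, a GNU ld version script assigns each symbol to exactly one
version node, so an already-exported symbol normally stays in the node where
it first appeared. A hedged sketch of that convention (not the actual
rte_ether map):

    DPDK_16.04 {
            global:

            rte_eth_dev_info_get;   /* stays where it was first exported */
            local: *;
    };

    DPDK_16.07 {
            global:

            rte_eth_add_first_rx_callback;  /* only genuinely new symbols */
    } DPDK_16.04;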
^ permalink raw reply [relevance 4%]
* [dpdk-dev] [PATCH v14 0/3] mempool: add mempool handler feature
2016-06-16 12:30 3% ` [dpdk-dev] [PATCH v13 " David Hunt
2016-06-16 12:30 1% ` [dpdk-dev] [PATCH v13 1/3] mempool: support external mempool operations David Hunt
@ 2016-06-17 13:53 3% ` David Hunt
2016-06-17 13:53 1% ` [dpdk-dev] [PATCH v14 1/3] mempool: support mempool handler operations David Hunt
2016-06-19 12:05 3% ` [dpdk-dev] [PATCH v15 0/3] mempool: add mempool handler feature David Hunt
1 sibling, 2 replies; 200+ results
From: David Hunt @ 2016-06-17 13:53 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain
Here's the latest version of the Mempool Handler feature (previously
known as the External Mempool Manager).
It's re-based on top of the latest head as of 17/6/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v14 changes:
* set MEMPOOL_F_POOL_CREATED flag after rte_mempool_ops_alloc() is called.
* Changed name of feature from "external mempool manager" to "mempool handler"
and updated comments and release notes accordingly.
* Added a comment for newly added pool_config param in
rte_mempool_set_ops_byname.
v13 changes:
* Added in extra opaque data (pool_config) to mempool struct for mempool
configuration by the ops functions. For example, this can be used to pass
device names or device flags to the underlying alloc function.
* Added mempool_config param to rte_mempool_set_ops_byname()
v12 changes:
* Fixed a comment (function pram h -> ops)
* fixed a typo (callbacki)
v11 changes:
* Fixed comments (added '.' where needed for consistency)
* removed ABI breakage notice for mempool manager in deprecation.rst
* Added description of the external mempool manager functionality to
doc/guides/prog_guide/mempool_lib.rst (John Mc reviewed)
* renamed rte_mempool_default.c to rte_mempool_ring.c
v10 changes:
* changed the _put/_get op names to _enqueue/_dequeue to be consistent
with the function names
* some rte_errno cleanup
* comment tweaks about when to set pool_data
* removed an un-needed check for ops->alloc == NULL
v9 changes:
* added a check for NULL alloc in rte_mempool_ops_register
* rte_mempool_alloc_t now returns int instead of void*
* fixed some comment typo's
* removed some unneeded typecasts
* changed a return NULL to return -EEXIST in rte_mempool_ops_register
* fixed rte_mempool_version.map file so builds ok as shared libs
* moved flags check from rte_mempool_create_empty to rte_mempool_create
v8 changes:
* merged first three patches in the series into one.
* changed parameters to ops callback to all be rte_mempool pointer
rather than a pointer to opaque data or a uint64.
* comment fixes.
* fixed parameter to _free function (was inconsistent).
* changed MEMPOOL_F_RING_CREATED to MEMPOOL_F_POOL_CREATED
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed hander_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_hander.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurrences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments, duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies to have one header file (rte_mempool.h), or it
would have generate cross dependencies issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file as the standard mempool tests,
avoiding code duplication
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degradation on sp cached operation
* removed stack handler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The Mempool Handler feature is an extension to the mempool API that allows
users to add and use an alternative mempool handler, which allows
external memory subsystems such as external hardware memory management
systems and software based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool handler will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing.
There are two aspects to mempool handlers.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the MEMPOOL_REGISTER_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool
using the name parameter to identify which ops to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname() which sets the mempool's ops (functions)
3. The rte_mempool_populate_default() and rte_mempool_populate_anon() functions
which populate the mempool using the relevant ops
Several mempool handlers may be used in the same application. A new
mempool can then be created by using the new rte_mempool_create_empty function,
then calling rte_mempool_set_ops_byname to point the mempool to the relevant
mempool handler callback (ops) structure.
Legacy applications will continue to use the old rte_mempool_create API call,
which uses a ring based mempool handler by default. These applications
will need to be modified to use a new mempool handler.
A mempool handler needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. enqueue - puts an object back into the mempool once an application has
finished with it
3. dequeue - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time an enqueue/dequeue/get_count is called from the application/PMD,
the callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
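To make those five callbacks concrete, here is a hedged toy sketch of a
handler (a spinlock-protected LIFO array; all toy_* names and the
"toy_stack" ops name are invented for illustration and are not the handler
shipped in this patch set):

    #include <errno.h>
    #include <rte_malloc.h>
    #include <rte_mempool.h>
    #include <rte_spinlock.h>

    struct toy_stack {
            rte_spinlock_t lock;
            unsigned int len;     /* objects currently stored */
            unsigned int size;    /* capacity, taken from mp->size */
            void *objs[];         /* flexible array of object pointers */
    };

    static int
    toy_alloc(struct rte_mempool *mp)
    {
            struct toy_stack *s;

            s = rte_zmalloc_socket("toy_stack",
                            sizeof(*s) + sizeof(void *) * mp->size,
                            RTE_CACHE_LINE_SIZE, mp->socket_id);
            if (s == NULL)
                    return -ENOMEM;
            rte_spinlock_init(&s->lock);
            s->size = mp->size;
            mp->pool_data = s;    /* required: alloc must set pool_data */
            return 0;
    }

    static int
    toy_enqueue(struct rte_mempool *mp, void * const *obj_table, unsigned n)
    {
            struct toy_stack *s = mp->pool_data;
            unsigned i;

            rte_spinlock_lock(&s->lock);
            if (s->len + n > s->size) {
                    rte_spinlock_unlock(&s->lock);
                    return -ENOBUFS;
            }
            for (i = 0; i < n; i++)
                    s->objs[s->len++] = obj_table[i];
            rte_spinlock_unlock(&s->lock);
            return 0;
    }

    static int
    toy_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
    {
            struct toy_stack *s = mp->pool_data;
            unsigned i;

            rte_spinlock_lock(&s->lock);
            if (n > s->len) {
                    rte_spinlock_unlock(&s->lock);
                    return -ENOENT;
            }
            for (i = 0; i < n; i++)
                    obj_table[i] = s->objs[--s->len];
            rte_spinlock_unlock(&s->lock);
            return 0;
    }

    static unsigned
    toy_get_count(const struct rte_mempool *mp)
    {
            const struct toy_stack *s = mp->pool_data;

            return s->len;        /* unlocked read, fine for statistics */
    }

    static void
    toy_free(struct rte_mempool *mp)
    {
            rte_free(mp->pool_data);
    }

    static const struct rte_mempool_ops toy_ops = {
            .name = "toy_stack",
            .alloc = toy_alloc,
            .free = toy_free,
            .enqueue = toy_enqueue,
            .dequeue = toy_dequeue,
            .get_count = toy_get_count,
    };

    MEMPOOL_REGISTER_OPS(toy_ops);

A pool would then select it with rte_mempool_set_ops_byname(mp, "toy_stack",
NULL) before population.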
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name
void *pool_config);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the ops index.
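Condensed from the inline wrappers this patch set adds to rte_mempool.h, the
per-call indirection looks like this (sketch):

    static inline int
    pool_enqueue(struct rte_mempool *mp, void * const *objs, unsigned n)
    {
            struct rte_mempool_ops *ops;

            /* Only the integer ops_index is shared between processes; each
             * process resolves it against its own rte_mempool_ops_table. */
            ops = rte_mempool_ops_get(mp->ops_index);
            return ops->enqueue(mp, objs, n);
    }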
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = rte_mempool_common_ring_alloc,
.enqueue = common_ring_sp_enqueue,
.dequeue = common_ring_mc_dequeue,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures
MEMPOOL_REGISTER_OPS(ops_mp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool handler using simple mallocs
for each mempool object. This file also contains the callbacks and self
registration for the new handler.
David Hunt (2):
mempool: support mempool handler operations
mbuf: make default mempool ops configurable at build
Olivier Matz (1):
app/test: test mempool handler
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v14 1/3] mempool: support mempool handler operations
2016-06-17 13:53 3% ` [dpdk-dev] [PATCH v14 0/3] mempool: add mempool handler feature David Hunt
@ 2016-06-17 13:53 1% ` David Hunt
2016-06-19 12:05 3% ` [dpdk-dev] [PATCH v15 0/3] mempool: add mempool handler feature David Hunt
1 sibling, 0 replies; 200+ results
From: David Hunt @ 2016-06-17 13:53 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain, David Hunt
Until now, the objects stored in a mempool were internally stored in a
ring. This patch introduces the possibility of registering external handlers
to replace the ring.
The default behavior remains unchanged, but calling the new function
rte_mempool_set_ops_byname() right after rte_mempool_create_empty() allows
the user to change the handler that will be used when populating
the mempool.
This patch also adds a set of default ops (function callbacks) based
on rte_ring.
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: David Hunt <david.hunt@intel.com>
Acked-by: Shreyansh Jain <shreyansh.jain@nxp.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
app/test/test_mempool_perf.c | 1 -
doc/guides/prog_guide/mempool_lib.rst | 32 +++-
doc/guides/rel_notes/deprecation.rst | 9 -
lib/librte_mempool/Makefile | 2 +
lib/librte_mempool/rte_mempool.c | 67 +++-----
lib/librte_mempool/rte_mempool.h | 255 ++++++++++++++++++++++++++---
lib/librte_mempool/rte_mempool_ops.c | 150 +++++++++++++++++
lib/librte_mempool/rte_mempool_ring.c | 161 ++++++++++++++++++
lib/librte_mempool/rte_mempool_version.map | 13 +-
9 files changed, 609 insertions(+), 81 deletions(-)
create mode 100644 lib/librte_mempool/rte_mempool_ops.c
create mode 100644 lib/librte_mempool/rte_mempool_ring.c
diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
index c5e3576..c5f8455 100644
--- a/app/test/test_mempool_perf.c
+++ b/app/test/test_mempool_perf.c
@@ -161,7 +161,6 @@ per_lcore_mempool_test(__attribute__((unused)) void *arg)
n_get_bulk);
if (unlikely(ret < 0)) {
rte_mempool_dump(stdout, mp);
- rte_ring_dump(stdout, mp->ring);
/* in this case, objects are lost... */
return -1;
}
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index c3afc2e..1943fc4 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -34,7 +34,8 @@ Mempool Library
===============
A memory pool is an allocator of a fixed-sized object.
-In the DPDK, it is identified by name and uses a ring to store free objects.
+In the DPDK, it is identified by name and uses a mempool handler to store free objects.
+The default mempool handler is ring based.
It provides some other optional services such as a per-core object cache and
an alignment helper to ensure that objects are padded to spread them equally on all DRAM or DDR3 channels.
@@ -127,6 +128,35 @@ The maximum size of the cache is static and is defined at compilation time (CONF
A mempool in Memory with its Associated Ring
+Mempool Handlers
+----------------
+
+This allows external memory subsystems, such as external hardware memory
+management systems and software based memory allocators, to be used with DPDK.
+
+There are two aspects to a mempool handler.
+
+* Adding the code for your new mempool operations (ops). This is achieved by
+ adding a new mempool ops code, and using the ``MEMPOOL_REGISTER_OPS`` macro.
+
+* Using the new API to call ``rte_mempool_create_empty()`` and
+ ``rte_mempool_set_ops_byname()`` to create a new mempool and specify which
+ ops to use.
+
+Several different mempool handlers may be used in the same application. A new
+mempool can be created by using the ``rte_mempool_create_empty()`` function,
+then using ``rte_mempool_set_ops_byname()`` to point the mempool to the
+relevant mempool handler callback (ops) structure.
+
+Legacy applications may continue to use the old ``rte_mempool_create()`` API
+call, which uses a ring based mempool handler by default. These applications
+will need to be modified if they are to use a new mempool handler.
+
+For applications that use ``rte_pktmbuf_pool_create()``, there is a config setting
+(``RTE_MBUF_DEFAULT_MEMPOOL_OPS``) that allows the application to make use of
+an alternative mempool handler.
+
+
Use Cases
---------
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index f75183f..3cbc19e 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -34,15 +34,6 @@ Deprecation Notices
compact API. The ones that remain are backwards compatible and use the
per-lcore default cache if available. This change targets release 16.07.
-* The rte_mempool struct will be changed in 16.07 to facilitate the new
- external mempool manager functionality.
- The ring element will be replaced with a more generic 'pool' opaque pointer
- to allow new mempool handlers to use their own user-defined mempool
- layout. Also newly added to rte_mempool is a handler index.
- The existing API will be backward compatible, but there will be new API
- functions added to facilitate the creation of mempools using an external
- handler. The 16.07 release will contain these changes.
-
* A librte_vhost public structures refactor is planned for DPDK 16.07
that requires both ABI and API change.
The proposed refactor would expose DPDK vhost dev to applications as
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 43423e0..a4c089e 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 2
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ops.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ring.c
# install includes
SYMLINK-$(CONFIG_RTE_LIBRTE_MEMPOOL)-include := rte_mempool.h
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index af71edd..e6a83d0 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -148,7 +148,7 @@ mempool_add_elem(struct rte_mempool *mp, void *obj, phys_addr_t physaddr)
#endif
/* enqueue in ring */
- rte_ring_sp_enqueue(mp->ring, obj);
+ rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
}
/* call obj_cb() for each mempool element */
@@ -303,40 +303,6 @@ rte_mempool_xmem_usage(__rte_unused void *vaddr, uint32_t elt_num,
return (size_t)paddr_idx << pg_shift;
}
-/* create the internal ring */
-static int
-rte_mempool_ring_create(struct rte_mempool *mp)
-{
- int rg_flags = 0, ret;
- char rg_name[RTE_RING_NAMESIZE];
- struct rte_ring *r;
-
- ret = snprintf(rg_name, sizeof(rg_name),
- RTE_MEMPOOL_MZ_FORMAT, mp->name);
- if (ret < 0 || ret >= (int)sizeof(rg_name))
- return -ENAMETOOLONG;
-
- /* ring flags */
- if (mp->flags & MEMPOOL_F_SP_PUT)
- rg_flags |= RING_F_SP_ENQ;
- if (mp->flags & MEMPOOL_F_SC_GET)
- rg_flags |= RING_F_SC_DEQ;
-
- /* Allocate the ring that will be used to store objects.
- * Ring functions will return appropriate errors if we are
- * running as a secondary process etc., so no checks made
- * in this function for that condition.
- */
- r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
- mp->socket_id, rg_flags);
- if (r == NULL)
- return -rte_errno;
-
- mp->ring = r;
- mp->flags |= MEMPOOL_F_RING_CREATED;
- return 0;
-}
-
/* free a memchunk allocated with rte_memzone_reserve() */
static void
rte_mempool_memchunk_mz_free(__rte_unused struct rte_mempool_memhdr *memhdr,
@@ -354,7 +320,7 @@ rte_mempool_free_memchunks(struct rte_mempool *mp)
void *elt;
while (!STAILQ_EMPTY(&mp->elt_list)) {
- rte_ring_sc_dequeue(mp->ring, &elt);
+ rte_mempool_ops_dequeue_bulk(mp, &elt, 1);
(void)elt;
STAILQ_REMOVE_HEAD(&mp->elt_list, next);
mp->populated_size--;
@@ -386,10 +352,11 @@ rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr,
int ret;
/* create the internal ring if not already done */
- if ((mp->flags & MEMPOOL_F_RING_CREATED) == 0) {
- ret = rte_mempool_ring_create(mp);
- if (ret < 0)
+ if ((mp->flags & MEMPOOL_F_POOL_CREATED) == 0) {
+ ret = rte_mempool_ops_alloc(mp);
+ if (ret != 0)
return ret;
+ mp->flags |= MEMPOOL_F_POOL_CREATED;
}
/* mempool is already populated */
@@ -703,7 +670,7 @@ rte_mempool_free(struct rte_mempool *mp)
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
rte_mempool_free_memchunks(mp);
- rte_ring_free(mp->ring);
+ rte_mempool_ops_free(mp);
rte_memzone_free(mp->mz);
}
@@ -815,6 +782,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
RTE_PTR_ADD(mp, MEMPOOL_HEADER_SIZE(mp, 0));
te->data = mp;
+
rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
TAILQ_INSERT_TAIL(mempool_list, te, next);
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
@@ -844,6 +812,19 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
if (mp == NULL)
return NULL;
+ /*
+ * Since we have 4 combinations of the SP/SC/MP/MC flags, examine them to
+ * set the correct index into the table of ops structs.
+ */
+ if (flags & (MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET))
+ rte_mempool_set_ops_byname(mp, "ring_sp_sc", NULL);
+ else if (flags & MEMPOOL_F_SP_PUT)
+ rte_mempool_set_ops_byname(mp, "ring_sp_mc", NULL);
+ else if (flags & MEMPOOL_F_SC_GET)
+ rte_mempool_set_ops_byname(mp, "ring_mp_sc", NULL);
+ else
+ rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL);
+
/* call the mempool priv initializer */
if (mp_init)
mp_init(mp, mp_init_arg);
@@ -930,7 +911,7 @@ rte_mempool_count(const struct rte_mempool *mp)
unsigned count;
unsigned lcore_id;
- count = rte_ring_count(mp->ring);
+ count = rte_mempool_ops_get_count(mp);
if (mp->cache_size == 0)
return count;
@@ -1119,7 +1100,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
fprintf(f, "mempool <%s>@%p\n", mp->name, mp);
fprintf(f, " flags=%x\n", mp->flags);
- fprintf(f, " ring=<%s>@%p\n", mp->ring->name, mp->ring);
+ fprintf(f, " pool=%p\n", mp->pool_data);
fprintf(f, " phys_addr=0x%" PRIx64 "\n", mp->mz->phys_addr);
fprintf(f, " nb_mem_chunks=%u\n", mp->nb_mem_chunks);
fprintf(f, " size=%"PRIu32"\n", mp->size);
@@ -1140,7 +1121,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
}
cache_count = rte_mempool_dump_cache(f, mp);
- common_count = rte_ring_count(mp->ring);
+ common_count = rte_mempool_ops_get_count(mp);
if ((cache_count + common_count) > mp->size)
common_count = mp->size - cache_count;
fprintf(f, " common_pool_count=%u\n", common_count);
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 60339bd..2d7c980 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -67,6 +67,7 @@
#include <inttypes.h>
#include <sys/queue.h>
+#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_lcore.h>
@@ -203,10 +204,14 @@ struct rte_mempool_memhdr {
*/
struct rte_mempool {
char name[RTE_MEMPOOL_NAMESIZE]; /**< Name of mempool. */
- struct rte_ring *ring; /**< Ring to store objects. */
- const struct rte_memzone *mz; /**< Memzone where pool is allocated */
+ union {
+ void *pool_data; /**< Ring or pool to store objects. */
+ uint64_t pool_id; /**< External mempool identifier. */
+ };
+ void *pool_config; /**< optional args for ops alloc. */
+ const struct rte_memzone *mz; /**< Memzone where pool is alloc'd. */
int flags; /**< Flags of the mempool. */
- int socket_id; /**< Socket id passed at mempool creation. */
+ int socket_id; /**< Socket id passed at create. */
uint32_t size; /**< Max size of the mempool. */
uint32_t cache_size; /**< Size of per-lcore local cache. */
uint32_t cache_flushthresh;
@@ -217,6 +222,14 @@ struct rte_mempool {
uint32_t trailer_size; /**< Size of trailer (after elt). */
unsigned private_data_size; /**< Size of private data. */
+ /**
+ * Index into rte_mempool_ops_table array of mempool ops
+ * structs, which contain callback function pointers.
+ * We're using an index here rather than pointers to the callbacks
+ * to facilitate any secondary processes that may want to use
+ * this mempool.
+ */
+ int32_t ops_index;
struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
@@ -235,7 +248,7 @@ struct rte_mempool {
#define MEMPOOL_F_NO_CACHE_ALIGN 0x0002 /**< Do not align objs on cache lines.*/
#define MEMPOOL_F_SP_PUT 0x0004 /**< Default put is "single-producer".*/
#define MEMPOOL_F_SC_GET 0x0008 /**< Default get is "single-consumer".*/
-#define MEMPOOL_F_RING_CREATED 0x0010 /**< Internal: ring is created */
+#define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */
#define MEMPOOL_F_NO_PHYS_CONTIG 0x0020 /**< Don't need physically contiguous objs. */
/**
@@ -325,6 +338,215 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
#define __mempool_check_cookies(mp, obj_table_const, n, free) do {} while(0)
#endif /* RTE_LIBRTE_MEMPOOL_DEBUG */
+#define RTE_MEMPOOL_OPS_NAMESIZE 32 /**< Max length of ops struct name. */
+
+/**
+ * Prototype for implementation specific data provisioning function.
+ *
+ * The function should provide the implementation specific memory
+ * for use by the other mempool ops functions in a given mempool ops struct.
+ * E.g. the default ops provides an instance of the rte_ring for this purpose.
+ * Other handlers will most likely point to a different type of data
+ * structure, which will be transparent to the application programmer.
+ * This function should set mp->pool_data.
+ */
+typedef int (*rte_mempool_alloc_t)(struct rte_mempool *mp);
+
+/**
+ * Free the opaque private data pointed to by mp->pool_data pointer.
+ */
+typedef void (*rte_mempool_free_t)(struct rte_mempool *mp);
+
+/**
+ * Enqueue an object into the external pool.
+ */
+typedef int (*rte_mempool_enqueue_t)(struct rte_mempool *mp,
+ void * const *obj_table, unsigned int n);
+
+/**
+ * Dequeue an object from the external pool.
+ */
+typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
+ void **obj_table, unsigned int n);
+
+/**
+ * Return the number of available objects in the external pool.
+ */
+typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
+
+/** Structure defining mempool operations structure */
+struct rte_mempool_ops {
+ char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
+ rte_mempool_alloc_t alloc; /**< Allocate private data. */
+ rte_mempool_free_t free; /**< Free the external pool. */
+ rte_mempool_enqueue_t enqueue; /**< Enqueue an object. */
+ rte_mempool_dequeue_t dequeue; /**< Dequeue an object. */
+ rte_mempool_get_count get_count; /**< Get qty of available objs. */
+} __rte_cache_aligned;
+
+#define RTE_MEMPOOL_MAX_OPS_IDX 16 /**< Max registered ops structs */
+
+/**
+ * Structure storing the table of registered ops structs, each of which contain
+ * the function pointers for the mempool ops functions.
+ * Each process has its own storage for this ops struct array so that
+ * the mempools can be shared across primary and secondary processes.
+ * The indices used to access the array are valid across processes, whereas
+ * any function pointers stored directly in the mempool struct would not be.
+ * This results in us simply having "ops_index" in the mempool struct.
+ */
+struct rte_mempool_ops_table {
+ rte_spinlock_t sl; /**< Spinlock for add/delete. */
+ uint32_t num_ops; /**< Number of used ops structs in the table. */
+ /**
+ * Storage for all possible ops structs.
+ */
+ struct rte_mempool_ops ops[RTE_MEMPOOL_MAX_OPS_IDX];
+} __rte_cache_aligned;
+
+/** Array of registered ops structs. */
+extern struct rte_mempool_ops_table rte_mempool_ops_table;
+
+/**
+ * @internal Get the mempool ops struct from its index.
+ *
+ * @param ops_index
+ * The index of the ops struct in the ops struct table. It must be a valid
+ * index: (0 <= idx < num_ops).
+ * @return
+ * The pointer to the ops struct in the table.
+ */
+static inline struct rte_mempool_ops *
+rte_mempool_ops_get(int ops_index)
+{
+ RTE_VERIFY(ops_index < RTE_MEMPOOL_MAX_OPS_IDX);
+
+ return &rte_mempool_ops_table.ops[ops_index];
+}
+
+/**
+ * @internal Wrapper for mempool_ops alloc callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * - 0: Success; successfully allocated mempool pool_data.
+ * - <0: Error; code of alloc function.
+ */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp);
+
+/**
+ * @internal Wrapper for mempool_ops get callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to get.
+ * @return
+ * - 0: Success; got n objects.
+ * - <0: Error; code of get function.
+ */
+static inline int
+rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp,
+ void **obj_table, unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->dequeue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops put callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to put.
+ * @return
+ * - 0: Success; n objects supplied.
+ * - <0: Error; code of put function.
+ */
+static inline int
+rte_mempool_ops_enqueue_bulk(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->enqueue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops get_count callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * The number of available objects in the external pool.
+ */
+unsigned
+rte_mempool_ops_get_count(const struct rte_mempool *mp);
+
+/**
+ * @internal wrapper for mempool_ops free callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ */
+void
+rte_mempool_ops_free(struct rte_mempool *mp);
+
+/**
+ * Set the ops of a mempool.
+ *
+ * This can only be done on a mempool that is not populated, i.e. just after
+ * a call to rte_mempool_create_empty().
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param name
+ * Name of the ops structure to use for this mempool.
+ * @param pool_config
+ * Opaque data that can be passed by the application to the ops functions.
+ * @return
+ * - 0: Success; the mempool is now using the requested ops functions.
+ * - -EINVAL - Invalid ops struct name provided.
+ * - -EEXIST - mempool already has an ops struct assigned.
+ */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
+ void *pool_config);
+
+/**
+ * Register mempool operations.
+ *
+ * @param ops
+ * Pointer to an ops structure to register.
+ * @return
+ * - >=0: Success; return the index of the ops struct in the table.
+ * - -EINVAL - some missing callbacks while registering ops struct.
+ * - -ENOSPC - the maximum number of ops structs has been reached.
+ */
+int rte_mempool_ops_register(const struct rte_mempool_ops *ops);
+
+/**
+ * Macro to statically register the ops of a mempool handler.
+ * Note that rte_mempool_ops_register fails silently here when more
+ * than RTE_MEMPOOL_MAX_OPS_IDX ops structs are registered.
+ */
+#define MEMPOOL_REGISTER_OPS(ops) \
+ void mp_hdlr_init_##ops(void); \
+ void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\
+ { \
+ rte_mempool_ops_register(&ops); \
+ }
+
/**
* An object callback function for mempool.
*
@@ -774,7 +996,7 @@ __mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table,
cache->len += n;
if (cache->len >= flushthresh) {
- rte_ring_mp_enqueue_bulk(mp->ring, &cache->objs[cache_size],
+ rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache_size],
cache->len - cache_size);
cache->len = cache_size;
}
@@ -785,19 +1007,10 @@ ring_enqueue:
/* push remaining objects in ring */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
- if (is_mp) {
- if (rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
- else {
- if (rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
+ if (rte_mempool_ops_enqueue_bulk(mp, obj_table, n) < 0)
+ rte_panic("cannot put objects in mempool\n");
#else
- if (is_mp)
- rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n);
- else
- rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n);
+ rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
#endif
}
@@ -945,7 +1158,8 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
uint32_t req = n + (cache_size - cache->len);
/* How many do we require i.e. number to fill the cache + the request */
- ret = rte_ring_mc_dequeue_bulk(mp->ring, &cache->objs[cache->len], req);
+ ret = rte_mempool_ops_dequeue_bulk(mp,
+ &cache->objs[cache->len], req);
if (unlikely(ret < 0)) {
/*
* In the offchance that we are buffer constrained,
@@ -972,10 +1186,7 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
ring_dequeue:
/* get remaining objects from ring */
- if (is_mc)
- ret = rte_ring_mc_dequeue_bulk(mp->ring, obj_table, n);
- else
- ret = rte_ring_sc_dequeue_bulk(mp->ring, obj_table, n);
+ ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
if (ret < 0)
__MEMPOOL_STAT_ADD(mp, get_fail, n);
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
new file mode 100644
index 0000000..7977a14
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -0,0 +1,150 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 6WIND S.A.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_mempool.h>
+#include <rte_errno.h>
+
+/* indirect jump table to support external memory pools. */
+struct rte_mempool_ops_table rte_mempool_ops_table = {
+ .sl = RTE_SPINLOCK_INITIALIZER,
+ .num_ops = 0
+};
+
+/* add a new ops struct in rte_mempool_ops_table, return its index. */
+int
+rte_mempool_ops_register(const struct rte_mempool_ops *h)
+{
+ struct rte_mempool_ops *ops;
+ int16_t ops_index;
+
+ rte_spinlock_lock(&rte_mempool_ops_table.sl);
+
+ if (rte_mempool_ops_table.num_ops >=
+ RTE_MEMPOOL_MAX_OPS_IDX) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Maximum number of mempool ops structs exceeded\n");
+ return -ENOSPC;
+ }
+
+ if (h->alloc == NULL || h->enqueue == NULL ||
+ h->dequeue == NULL || h->get_count == NULL) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Missing callback while registering mempool ops\n");
+ return -EINVAL;
+ }
+
+ if (strlen(h->name) >= sizeof(ops->name) - 1) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(DEBUG, EAL, "%s(): mempool_ops <%s>: name too long\n",
+ __func__, h->name);
+ rte_errno = EEXIST;
+ return -EEXIST;
+ }
+
+ ops_index = rte_mempool_ops_table.num_ops++;
+ ops = &rte_mempool_ops_table.ops[ops_index];
+ snprintf(ops->name, sizeof(ops->name), "%s", h->name);
+ ops->alloc = h->alloc;
+ ops->enqueue = h->enqueue;
+ ops->dequeue = h->dequeue;
+ ops->get_count = h->get_count;
+
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+
+ return ops_index;
+}
+
+/* wrapper to allocate an external mempool's private (pool) data. */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->alloc(mp);
+}
+
+/* wrapper to free an external mempool's pool data. */
+void
+rte_mempool_ops_free(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ if (ops->free == NULL)
+ return;
+ return ops->free(mp);
+}
+
+/* wrapper to get available objects in an external mempool. */
+unsigned int
+rte_mempool_ops_get_count(const struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->get_count(mp);
+}
+
+/* sets mempool ops previously registered by rte_mempool_ops_register. */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
+ void *pool_config)
+{
+ struct rte_mempool_ops *ops = NULL;
+ unsigned i;
+
+ /* too late, the mempool is already populated. */
+ if (mp->flags & MEMPOOL_F_POOL_CREATED)
+ return -EEXIST;
+
+ for (i = 0; i < rte_mempool_ops_table.num_ops; i++) {
+ if (!strcmp(name,
+ rte_mempool_ops_table.ops[i].name)) {
+ ops = &rte_mempool_ops_table.ops[i];
+ break;
+ }
+ }
+
+ if (ops == NULL)
+ return -EINVAL;
+
+ mp->ops_index = i;
+ mp->pool_config = pool_config;
+ return 0;
+}
diff --git a/lib/librte_mempool/rte_mempool_ring.c b/lib/librte_mempool/rte_mempool_ring.c
new file mode 100644
index 0000000..b9aa64d
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ring.c
@@ -0,0 +1,161 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_errno.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+
+static int
+common_ring_mp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_mp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_sp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_mc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_mc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_sc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static unsigned
+common_ring_get_count(const struct rte_mempool *mp)
+{
+ return rte_ring_count(mp->pool_data);
+}
+
+
+static int
+common_ring_alloc(struct rte_mempool *mp)
+{
+ int rg_flags = 0, ret;
+ char rg_name[RTE_RING_NAMESIZE];
+ struct rte_ring *r;
+
+ ret = snprintf(rg_name, sizeof(rg_name),
+ RTE_MEMPOOL_MZ_FORMAT, mp->name);
+ if (ret < 0 || ret >= (int)sizeof(rg_name)) {
+ rte_errno = ENAMETOOLONG;
+ return -rte_errno;
+ }
+
+ /* ring flags */
+ if (mp->flags & MEMPOOL_F_SP_PUT)
+ rg_flags |= RING_F_SP_ENQ;
+ if (mp->flags & MEMPOOL_F_SC_GET)
+ rg_flags |= RING_F_SC_DEQ;
+
+ /*
+ * Allocate the ring that will be used to store objects.
+ * Ring functions will return appropriate errors if we are
+ * running as a secondary process etc., so no checks made
+ * in this function for that condition.
+ */
+ r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
+ mp->socket_id, rg_flags);
+ if (r == NULL)
+ return -rte_errno;
+
+ mp->pool_data = r;
+
+ return 0;
+}
+
+static void
+common_ring_free(struct rte_mempool *mp)
+{
+ rte_ring_free(mp->pool_data);
+}
+
+/*
+ * The following 4 declarations of mempool ops structs address
+ * the need for the backward compatible mempool handlers for
+ * single/multi producers and single/multi consumers as dictated by the
+ * flags provided to the rte_mempool_create function
+ */
+static const struct rte_mempool_ops ops_mp_mc = {
+ .name = "ring_mp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_sc = {
+ .name = "ring_sp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_mp_sc = {
+ .name = "ring_mp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_mc = {
+ .name = "ring_sp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+MEMPOOL_REGISTER_OPS(ops_mp_mc);
+MEMPOOL_REGISTER_OPS(ops_sp_sc);
+MEMPOOL_REGISTER_OPS(ops_mp_sc);
+MEMPOOL_REGISTER_OPS(ops_sp_mc);
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index f63461b..6209ec2 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -20,15 +20,18 @@ DPDK_16.7 {
global:
rte_mempool_check_cookies;
- rte_mempool_obj_iter;
- rte_mempool_mem_iter;
rte_mempool_create_empty;
+ rte_mempool_free;
+ rte_mempool_mem_iter;
+ rte_mempool_obj_iter;
+ rte_mempool_ops_register;
+ rte_mempool_ops_table;
+ rte_mempool_populate_anon;
+ rte_mempool_populate_default;
rte_mempool_populate_phys;
rte_mempool_populate_phys_tab;
rte_mempool_populate_virt;
- rte_mempool_populate_default;
- rte_mempool_populate_anon;
- rte_mempool_free;
+ rte_mempool_set_ops_byname;
local: *;
} DPDK_2.0;
--
2.5.5
^ permalink raw reply [relevance 1%]
* [dpdk-dev] [RFC] librte_vhost: Add unix domain socket fd registration
@ 2016-06-17 15:32 3% Aaron Conole
0 siblings, 0 replies; 200+ results
From: Aaron Conole @ 2016-06-17 15:32 UTC (permalink / raw)
To: dev, Huawei Xie, Yuanhan Liu
Prior to this commit, the only way to add a vhost-user socket to the
system is by relying on librte_vhost to open the unix domain socket and
add it to the unix socket list. This is problematic for applications
which would like to set the permissions, or applications which are not
directly allowed to open sockets due to policy restrictions.
This patch provides a new API and ABI to allow application developers to
acquire the unix domain socket via whatever mechanism fits and pass it
to the vhost driver registration process.
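For illustration, a minimal sketch of the intended calling sequence (the
helper name app_register_vhost and the backlog value are invented for this
example; rte_vhost_driver_register_socket is the new API proposed below):

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <rte_virtio_net.h>

/* The application opens, binds and listens on the socket itself
 * (e.g. to control permissions), then hands the listening fd to
 * the vhost library via the new API. A pre-existing path may need
 * unlink(2) before bind(2). */
static int
app_register_vhost(const char *path)
{
	struct sockaddr_un un;
	int fd;

	fd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;

	memset(&un, 0, sizeof(un));
	un.sun_family = AF_UNIX;
	strncpy(un.sun_path, path, sizeof(un.sun_path) - 1);

	if (bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0 ||
	    listen(fd, 128) < 0) {
		close(fd);
		return -1;
	}

	/* The fd must already be in a listening state, per the API. */
	return rte_vhost_driver_register_socket(path, fd);
}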
Signed-off-by: Aaron Conole <aconole@redhat.com>
---
doc/guides/prog_guide/vhost_lib.rst | 8 +++++
lib/librte_vhost/rte_vhost_version.map | 6 ++++
lib/librte_vhost/rte_virtio_net.h | 6 ++++
lib/librte_vhost/vhost_user/vhost-net-user.c | 47 ++++++++++++++++++----------
4 files changed, 50 insertions(+), 17 deletions(-)
diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
index 48e1fff..22d0c6d 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -49,6 +49,14 @@ Vhost API Overview
For vhost-user, a Unix domain socket server will be created with the parameter as
the local socket path.
+ Alternatively, rte_vhost_driver_register_socket registers a unix domain
+ socket into the system.
+ This socket descriptor should be acquired by the host application through
+ some mechanism (either fd passing or by performing the unix domain socket
+ allocation).
+ The file descriptor passed in this way must still be a Unix domain socket
+ server.
+
* Vhost session start
rte_vhost_driver_session_start starts the vhost session loop.
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 3d8709e..fe58967 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -20,3 +20,9 @@ DPDK_2.1 {
rte_vhost_driver_unregister;
} DPDK_2.0;
+
+DPDK_16.7 {
+ global:
+
+ rte_vhost_driver_register_socket;
+} DPDK_2.1;
diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
index 600b20b..d2959ff 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -236,6 +236,12 @@ int rte_vhost_enable_guest_notification(struct virtio_net *dev, uint16_t queue_i
/* Register vhost driver. dev_name could be different for multiple instance support. */
int rte_vhost_driver_register(const char *dev_name);
+/* Register vhost driver using the provided unix domain socket. The socket MUST
+ * already be fully created and in a listening state (by calling listen()).
+ */
+int rte_vhost_driver_register_socket(const char *dev_name,
+ int vhost_unix_socket);
+
/* Unregister vhost driver. This is only meaningful to vhost user. */
int rte_vhost_driver_unregister(const char *dev_name);
diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c
index df2bd64..0fe72db 100644
--- a/lib/librte_vhost/vhost_user/vhost-net-user.c
+++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
@@ -446,45 +446,58 @@ vserver_message_handler(int connfd, void *dat, int *remove)
}
}
+
/**
- * Creates and initialise the vhost server.
+ * Appends a socket to the vhost server polling list
*/
int
-rte_vhost_driver_register(const char *path)
+rte_vhost_driver_register_socket(const char *dev_name, int vhost_unix_socket)
{
struct vhost_server *vserver;
- pthread_mutex_lock(&g_vhost_server.server_mutex);
-
- if (g_vhost_server.vserver_cnt == MAX_VHOST_SERVER) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "error: the number of servers reaches maximum\n");
- pthread_mutex_unlock(&g_vhost_server.server_mutex);
- return -1;
- }
-
vserver = calloc(sizeof(struct vhost_server), 1);
if (vserver == NULL) {
- pthread_mutex_unlock(&g_vhost_server.server_mutex);
return -1;
}
- vserver->listenfd = uds_socket(path);
- if (vserver->listenfd < 0) {
+ vserver->listenfd = vhost_unix_socket;
+ vserver->path = strdup(dev_name);
+ if (!vserver->path) {
free(vserver);
- pthread_mutex_unlock(&g_vhost_server.server_mutex);
return -1;
}
- vserver->path = strdup(path);
+ pthread_mutex_lock(&g_vhost_server.server_mutex);
+
+ if (g_vhost_server.vserver_cnt == MAX_VHOST_SERVER) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: the number of servers reaches maximum\n");
+ pthread_mutex_unlock(&g_vhost_server.server_mutex);
+ free(vserver->path);
+ free(vserver);
+ return -1;
+ }
fdset_add(&g_vhost_server.fdset, vserver->listenfd,
vserver_new_vq_conn, NULL, vserver);
g_vhost_server.server[g_vhost_server.vserver_cnt++] = vserver;
pthread_mutex_unlock(&g_vhost_server.server_mutex);
+
+ return 0;
+}
- return 0;
+
+/**
+ * Creates and initialises the vhost server.
+ */
+int
+rte_vhost_driver_register(const char *dev_name)
+{
+
+ int listenfd = uds_socket(dev_name);
+ if (listenfd < 0)
+ return -1;
+
+ return rte_vhost_driver_register_socket(dev_name, listenfd);
}
--
2.5.5
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v15 0/3] mempool: add mempool handler feature
2016-06-17 13:53 3% ` [dpdk-dev] [PATCH v14 0/3] mempool: add mempool handler feature David Hunt
2016-06-17 13:53 1% ` [dpdk-dev] [PATCH v14 1/3] mempool: support mempool handler operations David Hunt
@ 2016-06-19 12:05 3% ` David Hunt
2016-06-19 12:05 1% ` [dpdk-dev] [PATCH v15 1/3] mempool: support mempool handler operations David Hunt
2016-06-22 9:27 3% ` [dpdk-dev] [PATCH v16 0/3] mempool: add mempool handler feature David Hunt
1 sibling, 2 replies; 200+ results
From: David Hunt @ 2016-06-19 12:05 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain
Here's the latest version of the Mempool Handler feature patch set.
It's rebased on top of the latest head as of 19/6/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v15 changes:
* Changed rte_mempool_ops_get() to rte_mempool_get_ops()
* Did some minor tweaks to comments after the previous change of function
names from put/get to enqueue/dequeue
* Added missing spinlock_unlock in rte_mempool_ops_register()
* Added check for null in ops_free
* Removed an unneeded return statement
v14 changes:
* set MEMPOOL_F_RING_CREATED flag after rte_mempool_ring_create() is called.
* Changed name of feature from "external mempool manager" to "mempool handler"
and updated comments and release notes accordingly.
* Added a comment for newly added pool_config param in
rte_mempool_set_ops_byname.
v13 changes:
* Added in extra opaque data (pool_config) to mempool struct for mempool
configuration by the ops functions. For example, this can be used to pass
device names or device flags to the underlying alloc function.
* Added mempool_config param to rte_mempool_set_ops_byname()
v12 changes:
* Fixed a comment (function param h -> ops)
* fixed a typo (callbacki)
v11 changes:
* Fixed comments (added '.' where needed for consistency)
* removed ABI breakage notice for mempool manager in deprecation.rst
* Added description of the external mempool manager functionality to
doc/guides/prog_guide/mempool_lib.rst (John Mc reviewed)
* renamed rte_mempool_default.c to rte_mempool_ring.c
v10 changes:
* changed the _put/_get op names to _enqueue/_dequeue to be consistent
with the function names
* some rte_errno cleanup
* comment tweaks about when to set pool_data
* removed an un-needed check for ops->alloc == NULL
v9 changes:
* added a check for NULL alloc in rte_mempool_ops_register
* rte_mempool_alloc_t now returns int instead of void*
* fixed some comment typos
* removed some unneeded typecasts
* changed a return NULL to return -EEXIST in rte_mempool_ops_register
* fixed rte_mempool_version.map file so builds ok as shared libs
* moved flags check from rte_mempool_create_empty to rte_mempool_create
v8 changes:
* merged first three patches in the series into one.
* changed parameters to ops callback to all be rte_mempool pointer
rather than pointer to opaque data or uint64.
* comment fixes.
* fixed parameter to _free function (was inconsistent).
* changed MEMPOOL_F_RING_CREATED to MEMPOOL_F_POOL_CREATED
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed handler_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_handler.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurrences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments, duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies having one header file (rte_mempool.h), or it
would generate cross-dependency issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file as the standard mempool tests,
avoiding code duplication
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degradation on sp cached operation
* removed stack handler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The Mempool Handler feature is an extension to the mempool API that allows
users to add and use an alternative mempool handler, enabling
external memory subsystems such as external hardware memory management
systems and software based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool handler will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing.
There are two aspects to mempool handlers.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the MEMPOOL_REGISTER_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool
using the name parameter to identify which ops to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname() which sets the mempool's ops (functions)
3. The rte_mempool_populate_default() and rte_mempool_populate_anon() functions,
which populate the mempool using the relevant ops
Several mempool handlers may be used in the same application. A new
mempool can then be created by using the new rte_mempool_create_empty function,
followed by a call to rte_mempool_set_ops_byname to point the mempool to the relevant
mempool handler callback (ops) structure.
Legacy applications will continue to use the old rte_mempool_create API call,
which uses a ring based mempool handler by default. These applications
will need to be modified to use a new mempool handler.
A mempool handler needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. enqueue - puts an object back into the mempool once an application has
finished with it
3. dequeue - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time an enqueue/dequeue/get_count is called from the application/PMD,
the callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
void *pool_config);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
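As a short, hedged sketch of that flow (the pool name, sizes and the
"ring_mp_mc" handler choice are illustrative; error handling reduced to
rte_panic for brevity):

#include <rte_mempool.h>
#include <rte_memory.h>
#include <rte_debug.h>

static struct rte_mempool *
create_pool_with_handler(void)
{
	struct rte_mempool *mp;

	/* Step 1: create an empty (unpopulated) mempool. */
	mp = rte_mempool_create_empty("example_pool", 4096, 2048,
				      256, 0, SOCKET_ID_ANY, 0);
	if (mp == NULL)
		rte_panic("cannot create empty mempool\n");

	/* Step 2: select the ops by name, before populating. */
	if (rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL) < 0)
		rte_panic("cannot set mempool handler\n");

	/* Step 3: populate the mempool using the selected ops. */
	if (rte_mempool_populate_default(mp) < 0)
		rte_panic("cannot populate mempool\n");

	return mp;
}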
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via ops index.
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = common_ring_alloc,
.enqueue = common_ring_sp_enqueue,
.dequeue = common_ring_mc_dequeue,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures
MEMPOOL_REGISTER_OPS(ops_sp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool handler using simple mallocs
for each mempool object. This file also contains the callbacks and self
registration for the new handler.
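To make the handler shape concrete, here is a hedged, minimal sketch of a
custom handler (all toy_* names and "toy_handler" are invented for this
example and are not part of the patch; unlike the test's custom_handler,
this one is single-threaded only, with no locking):

#include <errno.h>
#include <stdlib.h>
#include <rte_mempool.h>

/* Toy pool: a malloc'd LIFO array of object pointers. */
struct toy_pool {
	unsigned int top;
	void *objs[];		/* room for mp->size pointers */
};

static int
toy_alloc(struct rte_mempool *mp)
{
	struct toy_pool *p;

	p = calloc(1, sizeof(*p) + mp->size * sizeof(void *));
	if (p == NULL)
		return -ENOMEM;
	mp->pool_data = p;	/* required by the alloc op contract */
	return 0;
}

static void
toy_free(struct rte_mempool *mp)
{
	free(mp->pool_data);
}

static int
toy_enqueue(struct rte_mempool *mp, void * const *obj_table, unsigned int n)
{
	struct toy_pool *p = mp->pool_data;
	unsigned int i;

	if (p->top + n > mp->size)
		return -ENOBUFS;
	for (i = 0; i < n; i++)
		p->objs[p->top++] = obj_table[i];
	return 0;
}

static int
toy_dequeue(struct rte_mempool *mp, void **obj_table, unsigned int n)
{
	struct toy_pool *p = mp->pool_data;
	unsigned int i;

	if (p->top < n)
		return -ENOENT;
	for (i = 0; i < n; i++)
		obj_table[i] = p->objs[--p->top];
	return 0;
}

static unsigned
toy_get_count(const struct rte_mempool *mp)
{
	const struct toy_pool *p = mp->pool_data;

	return p->top;
}

static const struct rte_mempool_ops toy_ops = {
	.name = "toy_handler",
	.alloc = toy_alloc,
	.free = toy_free,
	.enqueue = toy_enqueue,
	.dequeue = toy_dequeue,
	.get_count = toy_get_count,
};

MEMPOOL_REGISTER_OPS(toy_ops);

A mempool then selects it with
rte_mempool_set_ops_byname(mp, "toy_handler", NULL).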
David Hunt (2):
mempool: support mempool handler operations
mbuf: make default mempool ops configurable at build
Olivier Matz (1):
app/test: test mempool handler
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH v15 1/3] mempool: support mempool handler operations
2016-06-19 12:05 3% ` [dpdk-dev] [PATCH v15 0/3] mempool: add mempool handler feature David Hunt
@ 2016-06-19 12:05 1% ` David Hunt
2016-06-22 9:27 3% ` [dpdk-dev] [PATCH v16 0/3] mempool: add mempool handler feature David Hunt
1 sibling, 0 replies; 200+ results
From: David Hunt @ 2016-06-19 12:05 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain, David Hunt
Until now, the objects stored in a mempool were internally stored in a
ring. This patch introduces the possibility to register external handlers
replacing the ring.
The default behavior remains unchanged, but calling the new function
rte_mempool_set_ops_byname() right after rte_mempool_create_empty() allows
the user to change the handler that will be used when populating
the mempool.
This patch also adds a set of default ops (function callbacks) based
on rte_ring.
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: David Hunt <david.hunt@intel.com>
Acked-by: Shreyansh Jain <shreyansh.jain@nxp.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
app/test/test_mempool_perf.c | 1 -
doc/guides/prog_guide/mempool_lib.rst | 32 +++-
doc/guides/rel_notes/deprecation.rst | 9 -
lib/librte_mempool/Makefile | 2 +
lib/librte_mempool/rte_mempool.c | 67 +++-----
lib/librte_mempool/rte_mempool.h | 255 ++++++++++++++++++++++++++---
lib/librte_mempool/rte_mempool_ops.c | 150 +++++++++++++++++
lib/librte_mempool/rte_mempool_ring.c | 161 ++++++++++++++++++
lib/librte_mempool/rte_mempool_version.map | 13 +-
9 files changed, 609 insertions(+), 81 deletions(-)
create mode 100644 lib/librte_mempool/rte_mempool_ops.c
create mode 100644 lib/librte_mempool/rte_mempool_ring.c
diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
index c5e3576..c5f8455 100644
--- a/app/test/test_mempool_perf.c
+++ b/app/test/test_mempool_perf.c
@@ -161,7 +161,6 @@ per_lcore_mempool_test(__attribute__((unused)) void *arg)
n_get_bulk);
if (unlikely(ret < 0)) {
rte_mempool_dump(stdout, mp);
- rte_ring_dump(stdout, mp->ring);
/* in this case, objects are lost... */
return -1;
}
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index c3afc2e..1943fc4 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -34,7 +34,8 @@ Mempool Library
===============
A memory pool is an allocator of a fixed-sized object.
-In the DPDK, it is identified by name and uses a ring to store free objects.
+In the DPDK, it is identified by name and uses a mempool handler to store free objects.
+The default mempool handler is ring based.
It provides some other optional services such as a per-core object cache and
an alignment helper to ensure that objects are padded to spread them equally on all DRAM or DDR3 channels.
@@ -127,6 +128,35 @@ The maximum size of the cache is static and is defined at compilation time (CONF
A mempool in Memory with its Associated Ring
+Mempool Handlers
+------------------------
+
+This allows external memory subsystems, such as external hardware memory
+management systems and software based memory allocators, to be used with DPDK.
+
+There are two aspects to a mempool handler.
+
+* Adding the code for your new mempool operations (ops). This is achieved by
+ adding a new mempool ops code, and using the ``MEMPOOL_REGISTER_OPS`` macro.
+
+* Using the new API to call ``rte_mempool_create_empty()`` and
+ ``rte_mempool_set_ops_byname()`` to create a new mempool and specifying which
+ ops to use.
+
+Several different mempool handlers may be used in the same application. A new
+mempool can be created by using the ``rte_mempool_create_empty()`` function,
+then using ``rte_mempool_set_ops_byname()`` to point the mempool to the
+relevant mempool handler callback (ops) structure.
+
+Legacy applications may continue to use the old ``rte_mempool_create()`` API
+call, which uses a ring based mempool handler by default. These applications
+will need to be modified to use a new mempool handler.
+
+For applications that use ``rte_pktmbuf_pool_create()``, there is a config setting
+(``RTE_MBUF_DEFAULT_MEMPOOL_OPS``) that allows the application to make use of
+an alternative mempool handler.
+
+
Use Cases
---------
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index f75183f..3cbc19e 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -34,15 +34,6 @@ Deprecation Notices
compact API. The ones that remain are backwards compatible and use the
per-lcore default cache if available. This change targets release 16.07.
-* The rte_mempool struct will be changed in 16.07 to facilitate the new
- external mempool manager functionality.
- The ring element will be replaced with a more generic 'pool' opaque pointer
- to allow new mempool handlers to use their own user-defined mempool
- layout. Also newly added to rte_mempool is a handler index.
- The existing API will be backward compatible, but there will be new API
- functions added to facilitate the creation of mempools using an external
- handler. The 16.07 release will contain these changes.
-
* A librte_vhost public structures refactor is planned for DPDK 16.07
that requires both ABI and API change.
The proposed refactor would expose DPDK vhost dev to applications as
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 43423e0..a4c089e 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 2
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ops.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ring.c
# install includes
SYMLINK-$(CONFIG_RTE_LIBRTE_MEMPOOL)-include := rte_mempool.h
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index af71edd..e6a83d0 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -148,7 +148,7 @@ mempool_add_elem(struct rte_mempool *mp, void *obj, phys_addr_t physaddr)
#endif
/* enqueue in ring */
- rte_ring_sp_enqueue(mp->ring, obj);
+ rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
}
/* call obj_cb() for each mempool element */
@@ -303,40 +303,6 @@ rte_mempool_xmem_usage(__rte_unused void *vaddr, uint32_t elt_num,
return (size_t)paddr_idx << pg_shift;
}
-/* create the internal ring */
-static int
-rte_mempool_ring_create(struct rte_mempool *mp)
-{
- int rg_flags = 0, ret;
- char rg_name[RTE_RING_NAMESIZE];
- struct rte_ring *r;
-
- ret = snprintf(rg_name, sizeof(rg_name),
- RTE_MEMPOOL_MZ_FORMAT, mp->name);
- if (ret < 0 || ret >= (int)sizeof(rg_name))
- return -ENAMETOOLONG;
-
- /* ring flags */
- if (mp->flags & MEMPOOL_F_SP_PUT)
- rg_flags |= RING_F_SP_ENQ;
- if (mp->flags & MEMPOOL_F_SC_GET)
- rg_flags |= RING_F_SC_DEQ;
-
- /* Allocate the ring that will be used to store objects.
- * Ring functions will return appropriate errors if we are
- * running as a secondary process etc., so no checks made
- * in this function for that condition.
- */
- r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
- mp->socket_id, rg_flags);
- if (r == NULL)
- return -rte_errno;
-
- mp->ring = r;
- mp->flags |= MEMPOOL_F_RING_CREATED;
- return 0;
-}
-
/* free a memchunk allocated with rte_memzone_reserve() */
static void
rte_mempool_memchunk_mz_free(__rte_unused struct rte_mempool_memhdr *memhdr,
@@ -354,7 +320,7 @@ rte_mempool_free_memchunks(struct rte_mempool *mp)
void *elt;
while (!STAILQ_EMPTY(&mp->elt_list)) {
- rte_ring_sc_dequeue(mp->ring, &elt);
+ rte_mempool_ops_dequeue_bulk(mp, &elt, 1);
(void)elt;
STAILQ_REMOVE_HEAD(&mp->elt_list, next);
mp->populated_size--;
@@ -386,10 +352,11 @@ rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr,
int ret;
/* create the internal ring if not already done */
- if ((mp->flags & MEMPOOL_F_RING_CREATED) == 0) {
- ret = rte_mempool_ring_create(mp);
- if (ret < 0)
+ if ((mp->flags & MEMPOOL_F_POOL_CREATED) == 0) {
+ ret = rte_mempool_ops_alloc(mp);
+ if (ret != 0)
return ret;
+ mp->flags |= MEMPOOL_F_POOL_CREATED;
}
/* mempool is already populated */
@@ -703,7 +670,7 @@ rte_mempool_free(struct rte_mempool *mp)
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
rte_mempool_free_memchunks(mp);
- rte_ring_free(mp->ring);
+ rte_mempool_ops_free(mp);
rte_memzone_free(mp->mz);
}
@@ -815,6 +782,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
RTE_PTR_ADD(mp, MEMPOOL_HEADER_SIZE(mp, 0));
te->data = mp;
+
rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
TAILQ_INSERT_TAIL(mempool_list, te, next);
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
@@ -844,6 +812,19 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
if (mp == NULL)
return NULL;
+ /*
+ * Since we have 4 combinations of the SP/SC/MP/MC examine the flags to
+ * set the correct index into the table of ops structs.
+ */
+ if (flags & (MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET))
+ rte_mempool_set_ops_byname(mp, "ring_sp_sc", NULL);
+ else if (flags & MEMPOOL_F_SP_PUT)
+ rte_mempool_set_ops_byname(mp, "ring_sp_mc", NULL);
+ else if (flags & MEMPOOL_F_SC_GET)
+ rte_mempool_set_ops_byname(mp, "ring_mp_sc", NULL);
+ else
+ rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL);
+
/* call the mempool priv initializer */
if (mp_init)
mp_init(mp, mp_init_arg);
@@ -930,7 +911,7 @@ rte_mempool_count(const struct rte_mempool *mp)
unsigned count;
unsigned lcore_id;
- count = rte_ring_count(mp->ring);
+ count = rte_mempool_ops_get_count(mp);
if (mp->cache_size == 0)
return count;
@@ -1119,7 +1100,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
fprintf(f, "mempool <%s>@%p\n", mp->name, mp);
fprintf(f, " flags=%x\n", mp->flags);
- fprintf(f, " ring=<%s>@%p\n", mp->ring->name, mp->ring);
+ fprintf(f, " pool=%p\n", mp->pool_data);
fprintf(f, " phys_addr=0x%" PRIx64 "\n", mp->mz->phys_addr);
fprintf(f, " nb_mem_chunks=%u\n", mp->nb_mem_chunks);
fprintf(f, " size=%"PRIu32"\n", mp->size);
@@ -1140,7 +1121,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
}
cache_count = rte_mempool_dump_cache(f, mp);
- common_count = rte_ring_count(mp->ring);
+ common_count = rte_mempool_ops_get_count(mp);
if ((cache_count + common_count) > mp->size)
common_count = mp->size - cache_count;
fprintf(f, " common_pool_count=%u\n", common_count);
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 60339bd..2d7c980 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -67,6 +67,7 @@
#include <inttypes.h>
#include <sys/queue.h>
+#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_lcore.h>
@@ -203,10 +204,14 @@ struct rte_mempool_memhdr {
*/
struct rte_mempool {
char name[RTE_MEMPOOL_NAMESIZE]; /**< Name of mempool. */
- struct rte_ring *ring; /**< Ring to store objects. */
- const struct rte_memzone *mz; /**< Memzone where pool is allocated */
+ union {
+ void *pool_data; /**< Ring or pool to store objects. */
+ uint64_t pool_id; /**< External mempool identifier. */
+ };
+ void *pool_config; /**< optional args for ops alloc. */
+ const struct rte_memzone *mz; /**< Memzone where pool is alloc'd. */
int flags; /**< Flags of the mempool. */
- int socket_id; /**< Socket id passed at mempool creation. */
+ int socket_id; /**< Socket id passed at create. */
uint32_t size; /**< Max size of the mempool. */
uint32_t cache_size; /**< Size of per-lcore local cache. */
uint32_t cache_flushthresh;
@@ -217,6 +222,14 @@ struct rte_mempool {
uint32_t trailer_size; /**< Size of trailer (after elt). */
unsigned private_data_size; /**< Size of private data. */
+ /**
+ * Index into rte_mempool_ops_table array of mempool ops
+ * structs, which contain callback function pointers.
+ * We're using an index here rather than pointers to the callbacks
+ * to facilitate any secondary processes that may want to use
+ * this mempool.
+ */
+ int32_t ops_index;
struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
@@ -235,7 +248,7 @@ struct rte_mempool {
#define MEMPOOL_F_NO_CACHE_ALIGN 0x0002 /**< Do not align objs on cache lines.*/
#define MEMPOOL_F_SP_PUT 0x0004 /**< Default put is "single-producer".*/
#define MEMPOOL_F_SC_GET 0x0008 /**< Default get is "single-consumer".*/
-#define MEMPOOL_F_RING_CREATED 0x0010 /**< Internal: ring is created */
+#define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */
#define MEMPOOL_F_NO_PHYS_CONTIG 0x0020 /**< Don't need physically contiguous objs. */
/**
@@ -325,6 +338,215 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
#define __mempool_check_cookies(mp, obj_table_const, n, free) do {} while(0)
#endif /* RTE_LIBRTE_MEMPOOL_DEBUG */
+#define RTE_MEMPOOL_OPS_NAMESIZE 32 /**< Max length of ops struct name. */
+
+/**
+ * Prototype for implementation specific data provisioning function.
+ *
+ * The function should provide the implementation specific memory for
+ * for use by the other mempool ops functions in a given mempool ops struct.
+ * E.g. the default ops provides an instance of the rte_ring for this purpose.
+ * it will most likely point to a different type of data structure, and
+ * will be transparent to the application programmer.
+ * This function should set mp->pool_data.
+ */
+typedef int (*rte_mempool_alloc_t)(struct rte_mempool *mp);
+
+/**
+ * Free the opaque private data pointed to by mp->pool_data pointer.
+ */
+typedef void (*rte_mempool_free_t)(struct rte_mempool *mp);
+
+/**
+ * Enqueue an object into the external pool.
+ */
+typedef int (*rte_mempool_enqueue_t)(struct rte_mempool *mp,
+ void * const *obj_table, unsigned int n);
+
+/**
+ * Dequeue an object from the external pool.
+ */
+typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
+ void **obj_table, unsigned int n);
+
+/**
+ * Return the number of available objects in the external pool.
+ */
+typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
+
+/** Structure defining mempool operations structure */
+struct rte_mempool_ops {
+ char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
+ rte_mempool_alloc_t alloc; /**< Allocate private data. */
+ rte_mempool_free_t free; /**< Free the external pool. */
+ rte_mempool_enqueue_t enqueue; /**< Enqueue an object. */
+ rte_mempool_dequeue_t dequeue; /**< Dequeue an object. */
+ rte_mempool_get_count get_count; /**< Get qty of available objs. */
+} __rte_cache_aligned;
+
+#define RTE_MEMPOOL_MAX_OPS_IDX 16 /**< Max registered ops structs */
+
+/**
+ * Structure storing the table of registered ops structs, each of which contains
+ * the function pointers for the mempool ops functions.
+ * Each process has its own storage for this ops struct array so that
+ * the mempools can be shared across primary and secondary processes.
+ * The indices used to access the array are valid across processes, whereas
+ * any function pointers stored directly in the mempool struct would not be.
+ * This results in us simply having "ops_index" in the mempool struct.
+ */
+struct rte_mempool_ops_table {
+ rte_spinlock_t sl; /**< Spinlock for add/delete. */
+ uint32_t num_ops; /**< Number of used ops structs in the table. */
+ /**
+ * Storage for all possible ops structs.
+ */
+ struct rte_mempool_ops ops[RTE_MEMPOOL_MAX_OPS_IDX];
+} __rte_cache_aligned;
+
+/** Array of registered ops structs. */
+extern struct rte_mempool_ops_table rte_mempool_ops_table;
+
+/**
+ * @internal Get the mempool ops struct from its index.
+ *
+ * @param ops_index
+ * The index of the ops struct in the ops struct table. It must be a valid
+ * index: (0 <= idx < num_ops).
+ * @return
+ * The pointer to the ops struct in the table.
+ */
+static inline struct rte_mempool_ops *
+rte_mempool_ops_get(int ops_index)
+{
+ RTE_VERIFY(ops_index < RTE_MEMPOOL_MAX_OPS_IDX);
+
+ return &rte_mempool_ops_table.ops[ops_index];
+}
+
+/**
+ * @internal Wrapper for mempool_ops alloc callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * - 0: Success; successfully allocated mempool pool_data.
+ * - <0: Error; code of alloc function.
+ */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp);
+
+/**
+ * @internal Wrapper for mempool_ops get callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to get.
+ * @return
+ * - 0: Success; got n objects.
+ * - <0: Error; code of get function.
+ */
+static inline int
+rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp,
+ void **obj_table, unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->dequeue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops put callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to put.
+ * @return
+ * - 0: Success; n objects supplied.
+ * - <0: Error; code of put function.
+ */
+static inline int
+rte_mempool_ops_enqueue_bulk(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->enqueue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops get_count callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * The number of available objects in the external pool.
+ */
+unsigned
+rte_mempool_ops_get_count(const struct rte_mempool *mp);
+
+/**
+ * @internal wrapper for mempool_ops free callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ */
+void
+rte_mempool_ops_free(struct rte_mempool *mp);
+
+/**
+ * Set the ops of a mempool.
+ *
+ * This can only be done on a mempool that is not populated, i.e. just after
+ * a call to rte_mempool_create_empty().
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param name
+ * Name of the ops structure to use for this mempool.
+ * @param pool_config
+ * Opaque data that can be passed by the application to the ops functions.
+ * @return
+ * - 0: Success; the mempool is now using the requested ops functions.
+ * - -EINVAL - Invalid ops struct name provided.
+ * - -EEXIST - mempool already has an ops struct assigned.
+ */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
+ void *pool_config);
+
+/**
+ * Register mempool operations.
+ *
+ * @param ops
+ * Pointer to an ops structure to register.
+ * @return
+ * - >=0: Success; return the index of the ops struct in the table.
+ * - -EINVAL - some missing callbacks while registering ops struct.
+ * - -ENOSPC - the maximum number of ops structs has been reached.
+ */
+int rte_mempool_ops_register(const struct rte_mempool_ops *ops);
+
+/**
+ * Macro to statically register the ops of a mempool handler.
+ * Note that rte_mempool_ops_register fails silently here when more
+ * than RTE_MEMPOOL_MAX_OPS_IDX ops structs are registered.
+ */
+#define MEMPOOL_REGISTER_OPS(ops) \
+ void mp_hdlr_init_##ops(void); \
+ void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\
+ { \
+ rte_mempool_ops_register(&ops); \
+ }
+
/**
* An object callback function for mempool.
*
@@ -774,7 +996,7 @@ __mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table,
cache->len += n;
if (cache->len >= flushthresh) {
- rte_ring_mp_enqueue_bulk(mp->ring, &cache->objs[cache_size],
+ rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache_size],
cache->len - cache_size);
cache->len = cache_size;
}
@@ -785,19 +1007,10 @@ ring_enqueue:
/* push remaining objects in ring */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
- if (is_mp) {
- if (rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
- else {
- if (rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
+ if (rte_mempool_ops_enqueue_bulk(mp, obj_table, n) < 0)
+ rte_panic("cannot put objects in mempool\n");
#else
- if (is_mp)
- rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n);
- else
- rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n);
+ rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
#endif
}
@@ -945,7 +1158,8 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
uint32_t req = n + (cache_size - cache->len);
/* How many do we require i.e. number to fill the cache + the request */
- ret = rte_ring_mc_dequeue_bulk(mp->ring, &cache->objs[cache->len], req);
+ ret = rte_mempool_ops_dequeue_bulk(mp,
+ &cache->objs[cache->len], req);
if (unlikely(ret < 0)) {
/*
* In the offchance that we are buffer constrained,
@@ -972,10 +1186,7 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
ring_dequeue:
/* get remaining objects from ring */
- if (is_mc)
- ret = rte_ring_mc_dequeue_bulk(mp->ring, obj_table, n);
- else
- ret = rte_ring_sc_dequeue_bulk(mp->ring, obj_table, n);
+ ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
if (ret < 0)
__MEMPOOL_STAT_ADD(mp, get_fail, n);
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
new file mode 100644
index 0000000..7977a14
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -0,0 +1,150 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 6WIND S.A.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_mempool.h>
+#include <rte_errno.h>
+
+/* indirect jump table to support external memory pools. */
+struct rte_mempool_ops_table rte_mempool_ops_table = {
+ .sl = RTE_SPINLOCK_INITIALIZER,
+ .num_ops = 0
+};
+
+/* add a new ops struct in rte_mempool_ops_table, return its index. */
+int
+rte_mempool_ops_register(const struct rte_mempool_ops *h)
+{
+ struct rte_mempool_ops *ops;
+ int16_t ops_index;
+
+ rte_spinlock_lock(&rte_mempool_ops_table.sl);
+
+ if (rte_mempool_ops_table.num_ops >=
+ RTE_MEMPOOL_MAX_OPS_IDX) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Maximum number of mempool ops structs exceeded\n");
+ return -ENOSPC;
+ }
+
+ if (h->alloc == NULL || h->enqueue == NULL ||
+ h->dequeue == NULL || h->get_count == NULL) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Missing callback while registering mempool ops\n");
+ return -EINVAL;
+ }
+
+ if (strlen(h->name) >= sizeof(ops->name) - 1) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(DEBUG, EAL, "%s(): mempool_ops <%s>: name too long\n",
+ __func__, h->name);
+ rte_errno = EEXIST;
+ return -EEXIST;
+ }
+
+ ops_index = rte_mempool_ops_table.num_ops++;
+ ops = &rte_mempool_ops_table.ops[ops_index];
+ snprintf(ops->name, sizeof(ops->name), "%s", h->name);
+ ops->alloc = h->alloc;
+ ops->enqueue = h->enqueue;
+ ops->dequeue = h->dequeue;
+ ops->get_count = h->get_count;
+
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+
+ return ops_index;
+}
+
+/* wrapper to allocate an external mempool's private (pool) data. */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->alloc(mp);
+}
+
+/* wrapper to free an external mempool's pool data. */
+void
+rte_mempool_ops_free(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ if (ops->free == NULL)
+ return;
+ return ops->free(mp);
+}
+
+/* wrapper to get available objects in an external mempool. */
+unsigned int
+rte_mempool_ops_get_count(const struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_ops_get(mp->ops_index);
+ return ops->get_count(mp);
+}
+
+/* sets mempool ops previously registered by rte_mempool_ops_register. */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
+ void *pool_config)
+{
+ struct rte_mempool_ops *ops = NULL;
+ unsigned i;
+
+ /* too late, the mempool is already populated. */
+ if (mp->flags & MEMPOOL_F_POOL_CREATED)
+ return -EEXIST;
+
+ for (i = 0; i < rte_mempool_ops_table.num_ops; i++) {
+ if (!strcmp(name,
+ rte_mempool_ops_table.ops[i].name)) {
+ ops = &rte_mempool_ops_table.ops[i];
+ break;
+ }
+ }
+
+ if (ops == NULL)
+ return -EINVAL;
+
+ mp->ops_index = i;
+ mp->pool_config = pool_config;
+ return 0;
+}
diff --git a/lib/librte_mempool/rte_mempool_ring.c b/lib/librte_mempool/rte_mempool_ring.c
new file mode 100644
index 0000000..b9aa64d
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ring.c
@@ -0,0 +1,161 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_errno.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+
+static int
+common_ring_mp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_mp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_sp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_mc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_mc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_sc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static unsigned
+common_ring_get_count(const struct rte_mempool *mp)
+{
+ return rte_ring_count(mp->pool_data);
+}
+
+
+static int
+common_ring_alloc(struct rte_mempool *mp)
+{
+ int rg_flags = 0, ret;
+ char rg_name[RTE_RING_NAMESIZE];
+ struct rte_ring *r;
+
+ ret = snprintf(rg_name, sizeof(rg_name),
+ RTE_MEMPOOL_MZ_FORMAT, mp->name);
+ if (ret < 0 || ret >= (int)sizeof(rg_name)) {
+ rte_errno = ENAMETOOLONG;
+ return -rte_errno;
+ }
+
+ /* ring flags */
+ if (mp->flags & MEMPOOL_F_SP_PUT)
+ rg_flags |= RING_F_SP_ENQ;
+ if (mp->flags & MEMPOOL_F_SC_GET)
+ rg_flags |= RING_F_SC_DEQ;
+
+ /*
+ * Allocate the ring that will be used to store objects.
+ * Ring functions will return appropriate errors if we are
+ * running as a secondary process etc., so no checks made
+ * in this function for that condition.
+ */
+ r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
+ mp->socket_id, rg_flags);
+ if (r == NULL)
+ return -rte_errno;
+
+ mp->pool_data = r;
+
+ return 0;
+}
+
+static void
+common_ring_free(struct rte_mempool *mp)
+{
+ rte_ring_free(mp->pool_data);
+}
+
+/*
+ * The following 4 declarations of mempool ops structs address
+ * the need for the backward compatible mempool handlers for
+ * single/multi producers and single/multi consumers as dictated by the
+ * flags provided to the rte_mempool_create function
+ */
+static const struct rte_mempool_ops ops_mp_mc = {
+ .name = "ring_mp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_sc = {
+ .name = "ring_sp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_mp_sc = {
+ .name = "ring_mp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_mc = {
+ .name = "ring_sp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+MEMPOOL_REGISTER_OPS(ops_mp_mc);
+MEMPOOL_REGISTER_OPS(ops_sp_sc);
+MEMPOOL_REGISTER_OPS(ops_mp_sc);
+MEMPOOL_REGISTER_OPS(ops_sp_mc);
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index f63461b..6209ec2 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -20,15 +20,18 @@ DPDK_16.7 {
global:
rte_mempool_check_cookies;
- rte_mempool_obj_iter;
- rte_mempool_mem_iter;
rte_mempool_create_empty;
+ rte_mempool_free;
+ rte_mempool_mem_iter;
+ rte_mempool_obj_iter;
+ rte_mempool_ops_register;
+ rte_mempool_ops_table;
+ rte_mempool_populate_anon;
+ rte_mempool_populate_default;
rte_mempool_populate_phys;
rte_mempool_populate_phys_tab;
rte_mempool_populate_virt;
- rte_mempool_populate_default;
- rte_mempool_populate_anon;
- rte_mempool_free;
+ rte_mempool_set_ops_byname;
local: *;
} DPDK_2.0;
--
2.5.5
* Re: [dpdk-dev] [PATCH v3 1/2] mempool: add stack (lifo) mempool handler
@ 2016-06-21 9:44 3% ` Olivier Matz
0 siblings, 0 replies; 200+ results
From: Olivier Matz @ 2016-06-21 9:44 UTC (permalink / raw)
To: Ananyev, Konstantin, Jerin Jacob
Cc: Thomas Monjalon, dev, Hunt, David, viktorin, shreyansh.jain
Hi,
On 06/21/2016 11:28 AM, Ananyev, Konstantin wrote:
>>>> I was proposing only to move only the new
>>>> handler(lib/librte_mempool/rte_mempool_stack.c). Not any library or any
>>>> other common code.
>>>>
>>>> Just like DPDK crypto device, Even if it is software implementation its
>>>> better to move in driver/crypto instead of lib/librte_cryptodev
>>>>
>>>> "lib/librte_mempool/arch/" is not correct place as it is platform specific
>>>> not architecture specific and HW mempool device may be PCIe or platform
>>>> device.
>>>
>>> Ok, but why rte_mempool_stack.c has to be moved?
>>
>> Just thought of having all the mempool handlers at one place.
>> We can't move all HW mempool handlers at lib/librte_mempool/
>
> Yep, but from your previous mail I thought we might have specific ones
> for specific devices, no?
> If so, why to put them in one place, why just not in:
> Drivers/xxx_dev/xxx_mempool.[h,c]
> ?
> And keep generic ones in lib/librte_mempool
> ?
I think all drivers (generic or not) should be at the same place for
consistency.
I'm not sure having them in lib/librte_mempool is really a problem today,
but once hardware-dependent handlers are pushed, we may move all of them
into drivers/mempool, because I think we should avoid having hw-specific
code in lib/.
I don't think it will cause ABI/API breakage, since the user always
talks to the generic mempool API.
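To illustrate with a minimal, hypothetical application-side sketch
("stack" below is the handler name proposed in this thread, not a merged
one): the handler is selected purely by its registered name string, so
moving a handler's source between lib/ and drivers/ changes nothing that
the application links against.

#include <rte_mempool.h>
#include <rte_lcore.h>

static struct rte_mempool *
make_pool(void)
{
	struct rte_mempool *mp;

	mp = rte_mempool_create_empty("test_pool", 4096, 2048,
				      256, 0, rte_socket_id(), 0);
	if (mp == NULL)
		return NULL;

	/* lookup is by name in the ops table; no handler symbol is
	 * referenced here */
	if (rte_mempool_set_ops_byname(mp, "stack", NULL) < 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	if (rte_mempool_populate_default(mp) < 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	return mp;
}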
Regards
Olivier
* Re: [dpdk-dev] [PATCH v2] i40e: modify the meaning of single VLAN type
2016-06-13 8:03 4% ` [dpdk-dev] [PATCH v2] i40e: modify the meaning of single " Beilei Xing
@ 2016-06-21 10:29 4% ` Bruce Richardson
2016-06-21 11:06 0% ` Panu Matilainen
0 siblings, 1 reply; 200+ results
From: Bruce Richardson @ 2016-06-21 10:29 UTC (permalink / raw)
To: Beilei Xing; +Cc: jingjing.wu, dev, thomas.monjalon, nhorman, pmatilai
On Mon, Jun 13, 2016 at 04:03:32PM +0800, Beilei Xing wrote:
> In current i40e codebase, if single VLAN header is added in a packet,
> it's treated as inner VLAN. Generally, a single VLAN header is
> treated as the outer VLAN header. So change corresponding register
> for single VLAN.
> At the same time, change the meanings of inner VLAN and outer VLAN.
>
> Signed-off-by: Beilei Xing <beilei.xing@intel.com>
This patch changes the ABI, since an app written to the original API as specified
e.g. to set a single vlan header, would no longer work with this change.
Therefore, even though the original behaviour was inconsistent with other drivers
it may still need to be preserved.
I'm thinking that we may need to provide appropriately versioned copies of the
vlan_offload_set and vlan_tpid_set functions for backward compatibility with
the old ABI.
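For illustration, a versioned pair would look roughly like the sketch
below, assuming the rte_compat.h macros; all function names here are
hypothetical stand-ins rather than the actual i40e callbacks:

#include <stdint.h>
#include <rte_compat.h>

int set_inner_vlan_reg(uint8_t port_id, uint16_t tpid); /* hypothetical */
int set_outer_vlan_reg(uint8_t port_id, uint16_t tpid); /* hypothetical */

/* old behaviour, kept for binaries linked against DPDK 16.04 */
int
rte_foo_tpid_set_v1604(uint8_t port_id, uint16_t tpid)
{
	return set_inner_vlan_reg(port_id, tpid);
}
VERSION_SYMBOL(rte_foo_tpid_set, _v1604, 16.04);

/* corrected behaviour, the default binding from 16.07 onwards */
int
rte_foo_tpid_set_v1607(uint8_t port_id, uint16_t tpid)
{
	return set_outer_vlan_reg(port_id, tpid);
}
BIND_DEFAULT_SYMBOL(rte_foo_tpid_set, _v1607, 16.07);

Both versions would also need to be listed under their respective version
nodes in the library's .map file.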
Any other comments or thoughts on this?
Neil, Thomas, Panu - is this fix something that we need to provide backward
version-compatibility for, or given that the functions are being called through
a generic ethdev API mean that this can just go in as a straight bug-fix?
/Bruce
* Re: [dpdk-dev] [PATCH v2] i40e: modify the meaning of single VLAN type
2016-06-21 10:29 4% ` Bruce Richardson
@ 2016-06-21 11:06 0% ` Panu Matilainen
2016-06-21 11:28 0% ` Bruce Richardson
0 siblings, 1 reply; 200+ results
From: Panu Matilainen @ 2016-06-21 11:06 UTC (permalink / raw)
To: Bruce Richardson, Beilei Xing; +Cc: jingjing.wu, dev, thomas.monjalon, nhorman
On 06/21/2016 01:29 PM, Bruce Richardson wrote:
> On Mon, Jun 13, 2016 at 04:03:32PM +0800, Beilei Xing wrote:
>> In current i40e codebase, if single VLAN header is added in a packet,
>> it's treated as inner VLAN. Generally, a single VLAN header is
>> treated as the outer VLAN header. So change corresponding register
>> for single VLAN.
>> At the same time, change the meanings of inner VLAN and outer VLAN.
>>
>> Signed-off-by: Beilei Xing <beilei.xing@intel.com>
>
> This patch changes the ABI, since an app written to the original API as specified
> e.g. to set a single vlan header, would no longer work with this change.
> Therefore, even though the original behaviour was inconsistent with other drivers
> it may still need to be preserved.
>
> I'm thinking that we may need to provide appropriately versioned copies of the
> vlan_offload_set and vlan_tpid_set functions for backward compatibility with
> the old ABI.
>
> Any other comments or thoughts on this?
> Neil, Thomas, Panu - is this fix something that we need to provide backward
> version-compatibility for, or given that the functions are being called through
> a generic ethdev API mean that this can just go in as a straight bug-fix?
Since it's currently inconsistent with everything else, I'd just call it
a bug-fix and leave it at that.
Besides, I don't think you could version it via the ordinary means even
if you wanted to, due to the way it's called through eth_dev_ops etc.
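A simplified, hypothetical sketch of the dispatch path shows why (this is
not the exact ethdev code):

#include <stdint.h>

/* simplified stand-ins for the real ethdev structures */
struct dev_ops {
	int (*vlan_tpid_set)(void *dev, uint16_t tpid);
};

struct eth_dev {
	struct dev_ops *dev_ops; /* installed by the PMD at init time */
};

int
tpid_set(struct eth_dev *dev, uint16_t tpid)
{
	/*
	 * The driver callback is reached through a run-time function
	 * pointer, so link-time symbol versioning never gets a chance
	 * to choose between an old and a new driver implementation.
	 */
	return dev->dev_ops->vlan_tpid_set(dev, tpid);
}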
- Panu -
* Re: [dpdk-dev] [PATCH v2] i40e: modify the meaning of single VLAN type
2016-06-21 11:06 0% ` Panu Matilainen
@ 2016-06-21 11:28 0% ` Bruce Richardson
0 siblings, 0 replies; 200+ results
From: Bruce Richardson @ 2016-06-21 11:28 UTC (permalink / raw)
To: Panu Matilainen; +Cc: Beilei Xing, jingjing.wu, dev, thomas.monjalon, nhorman
On Tue, Jun 21, 2016 at 02:06:38PM +0300, Panu Matilainen wrote:
> On 06/21/2016 01:29 PM, Bruce Richardson wrote:
> >On Mon, Jun 13, 2016 at 04:03:32PM +0800, Beilei Xing wrote:
> >>In current i40e codebase, if single VLAN header is added in a packet,
> >>it's treated as inner VLAN. Generally, a single VLAN header is
> >>treated as the outer VLAN header. So change corresponding register
> >>for single VLAN.
>>At the same time, change the meanings of inner VLAN and outer VLAN.
> >>
> >>Signed-off-by: Beilei Xing <beilei.xing@intel.com>
> >
> >This patch changes the ABI, since an app written to the original API as specified
> >e.g. to set a single vlan header, would no longer work with this change.
> >Therefore, even though the original behaviour was inconsistent with other drivers
> >it may still need to be preserved.
> >
> >I'm thinking that we may need to provide appropriately versioned copies of the
> >vlan_offload_set and vlan_tpid_set functions for backward compatibility with
> >the old ABI.
> >
> >Any other comments or thoughts on this?
> >Neil, Thomas, Panu - is this fix something that we need to provide backward
> >version-compatibility for, or given that the functions are being called through
> >a generic ethdev API mean that this can just go in as a straight bug-fix?
>
> Since it's currently inconsistent with everything else, I'd just call it a
> bug-fix and leave it at that.
>
Yep, makes sense.
> Besides, I don't think you could version it via the ordinary means even if
> you wanted to, due to the way it's called through eth_dev_ops etc.
>
Good point, never thought of that! :-(
> - Panu -
Thanks for the guidance.
/Bruce
* [dpdk-dev] [PATCH v4 04/17] eal: remove duplicate function declaration
@ 2016-06-21 12:02 3% ` Shreyansh Jain
` (2 subsequent siblings)
3 siblings, 0 replies; 200+ results
From: Shreyansh Jain @ 2016-06-21 12:02 UTC (permalink / raw)
To: dev; +Cc: viktorin, thomas.monjalon
rte_eal_dev_init has been declared in both eal_private.h and rte_dev.h since
its introduction.
This function has been exported in ABI, so remove it from eal_private.h
Fixes: e57f20e05177 ("eal: make vdev init path generic for both virtual and pci devices")
Signed-off-by: David Marchand <david.marchand@6wind.com>
Signed-off-by: Shreyansh Jain <shreyansh.jain@nxp.com>
---
lib/librte_eal/common/eal_private.h | 7 -------
lib/librte_eal/linuxapp/eal/eal.c | 1 +
2 files changed, 1 insertion(+), 7 deletions(-)
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 857dc3e..06a68f6 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -259,13 +259,6 @@ int rte_eal_intr_init(void);
int rte_eal_alarm_init(void);
/**
- * This function initialises any virtual devices
- *
- * This function is private to the EAL.
- */
-int rte_eal_dev_init(void);
-
-/**
* Function is to check if the kernel module(like, vfio, vfio_iommu_type1,
* etc.) loaded.
*
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 4f22c18..29fba52 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -70,6 +70,7 @@
#include <rte_cpuflags.h>
#include <rte_interrupts.h>
#include <rte_pci.h>
+#include <rte_dev.h>
#include <rte_devargs.h>
#include <rte_common.h>
#include <rte_version.h>
--
2.7.4
* [dpdk-dev] [PATCH v5 04/17] eal: remove duplicate function declaration
@ 2016-06-22 9:06 3% ` Shreyansh Jain
0 siblings, 0 replies; 200+ results
From: Shreyansh Jain @ 2016-06-22 9:06 UTC (permalink / raw)
To: dev; +Cc: viktorin, thomas.monjalon
rte_eal_dev_init has been declared in both eal_private.h and rte_dev.h since
its introduction.
This function has been exported in ABI, so remove it from eal_private.h
Fixes: e57f20e05177 ("eal: make vdev init path generic for both virtual and pci devices")
Signed-off-by: David Marchand <david.marchand@6wind.com>
Signed-off-by: Shreyansh Jain <shreyansh.jain@nxp.com>
---
lib/librte_eal/common/eal_private.h | 7 -------
lib/librte_eal/linuxapp/eal/eal.c | 1 +
2 files changed, 1 insertion(+), 7 deletions(-)
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 857dc3e..06a68f6 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -259,13 +259,6 @@ int rte_eal_intr_init(void);
int rte_eal_alarm_init(void);
/**
- * This function initialises any virtual devices
- *
- * This function is private to the EAL.
- */
-int rte_eal_dev_init(void);
-
-/**
* Function is to check if the kernel module(like, vfio, vfio_iommu_type1,
* etc.) loaded.
*
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 4f22c18..29fba52 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -70,6 +70,7 @@
#include <rte_cpuflags.h>
#include <rte_interrupts.h>
#include <rte_pci.h>
+#include <rte_dev.h>
#include <rte_devargs.h>
#include <rte_common.h>
#include <rte_version.h>
--
2.7.4
* [dpdk-dev] [PATCH v16 0/3] mempool: add mempool handler feature
2016-06-19 12:05 3% ` [dpdk-dev] [PATCH v15 0/3] mempool: add mempool handler feature David Hunt
2016-06-19 12:05 1% ` [dpdk-dev] [PATCH v15 1/3] mempool: support mempool handler operations David Hunt
@ 2016-06-22 9:27 3% ` David Hunt
2016-06-22 9:27 1% ` [dpdk-dev] [PATCH v16 1/3] mempool: support mempool handler operations David Hunt
1 sibling, 1 reply; 200+ results
From: David Hunt @ 2016-06-22 9:27 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain
Here's the latest version of the Mempool Handler patch set.
It's rebased on top of the latest head as of 20/6/2016, including
Olivier's 35-part patch series on mempool re-org [1]
[1] http://dpdk.org/ml/archives/dev/2016-May/039229.html
v16 changes:
* Changed rte_mempool_ops_get() to rte_mempool_get_ops()
* Changed rte_mempool_ops_register() to rte_mempool_register_ops()
* Applied missing changes that should have been in v15
v15 changes:
* Changed rte_mempool_ops_get() to rte_mempool_get_ops()
* Did some minor tweaks to comments after the previous change of function
names from put/get to enqueue/dequeue
* Added missing spinlock_unlock in rte_mempool_ops_register()
* Added check for null in ops_free
* removed un-needed return statement
v14 changes:
* set MEMPOOL_F_RING_CREATED flag after rte_mempool_ring_create() is called.
* Changed name of feature from "external mempool manager" to "mempool handler"
and updated comments and release notes accordingly.
* Added a comment for newly added pool_config param in
rte_mempool_set_ops_byname.
v13 changes:
* Added in extra opaque data (pool_config) to mempool struct for mempool
configuration by the ops functions. For example, this can be used to pass
device names or device flags to the underlying alloc function.
* Added mempool_config param to rte_mempool_set_ops_byname()
v12 changes:
* Fixed a comment (function param h -> ops)
* fixed a typo (callbacki)
v11 changes:
* Fixed comments (added '.' where needed for consistency)
* removed ABI breakage notice for mempool manager in deprecation.rst
* Added description of the external mempool manager functionality to
doc/guides/prog_guide/mempool_lib.rst (John Mc reviewed)
* renamed rte_mempool_default.c to rte_mempool_ring.c
v10 changes:
* changed the _put/_get op names to _enqueue/_dequeue to be consistent
with the function names
* some rte_errno cleanup
* comment tweaks about when to set pool_data
* removed an un-needed check for ops->alloc == NULL
v9 changes:
* added a check for NULL alloc in rte_mempool_ops_register
* rte_mempool_alloc_t now returns int instead of void*
* fixed some comment typo's
* removed some unneeded typecasts
* changed a return NULL to return -EEXIST in rte_mempool_ops_register
* fixed rte_mempool_version.map file so builds ok as shared libs
* moved flags check from rte_mempool_create_empty to rte_mempool_create
v8 changes:
* merged first three patches in the series into one.
* changed parameters to ops callback to all be rte_mempool pointer
rather than than pointer to opaque data or uint64.
* comment fixes.
* fixed parameter to _free function (was inconsistent).
* changed MEMPOOL_F_RING_CREATED to MEMPOOL_F_POOL_CREATED
v7 changes:
* Changed rte_mempool_handler_table to rte_mempool_ops_table
* Changed hander_idx to ops_index in rte_mempool struct
* Reworked comments in rte_mempool.h around ops functions
* Changed rte_mempool_hander.c to rte_mempool_ops.c
* Changed all functions containing _handler_ to _ops_
* Now there is no mention of 'handler' left
* Other small changes out of review of mailing list
v6 changes:
* Moved the flags handling from rte_mempool_create_empty to
rte_mempool_create, as it's only there for backward compatibility
* Various comment additions and cleanup
* Renamed rte_mempool_handler to rte_mempool_ops
* Added a union for *pool and u64 pool_id in struct rte_mempool
* split the original patch into a few parts for easier review.
* rename functions with _ext_ to _ops_.
* addressed review comments
* renamed put and get functions to enqueue and dequeue
* changed occurrences of rte_mempool_ops to const, as they
contain function pointers (security)
* split out the default external mempool handler into a separate
patch for easier review
v5 changes:
* rebasing, as it is dependent on another patch series [1]
v4 changes (Olivier Matz):
* remove the rte_mempool_create_ext() function. To change the handler, the
user has to do the following:
- mp = rte_mempool_create_empty()
- rte_mempool_set_handler(mp, "my_handler")
- rte_mempool_populate_default(mp)
This avoids adding another function with more than 10 arguments, duplicating
the doxygen comments
* change the api of rte_mempool_alloc_t: only the mempool pointer is required
as all information is available in it
* change the api of rte_mempool_free_t: remove return value
* move inline wrapper functions from the .c to the .h (else they won't be
inlined). This implies to have one header file (rte_mempool.h), or it
would have generate cross dependencies issues.
* remove now unused MEMPOOL_F_INT_HANDLER (note: it was misused anyway due
to the use of && instead of &)
* fix build in debug mode (__MEMPOOL_STAT_ADD(mp, put_pool, n) remaining)
* fix build with shared libraries (global handler has to be declared in
the .map file)
* rationalize #include order
* remove unused function rte_mempool_get_handler_name()
* rename some structures, fields, functions
* remove the static in front of rte_tailq_elem rte_mempool_tailq (comment
from Yuanhan)
* test the ext mempool handler in the same file than standard mempool tests,
avoiding to duplicate the code
* rework the custom handler in mempool_test
* rework a bit the patch selecting default mbuf pool handler
* fix some doxygen comments
v3 changes:
* simplified the file layout, renamed to rte_mempool_handler.[hc]
* moved the default handlers into rte_mempool_default.c
* moved the example handler out into app/test/test_ext_mempool.c
* removed is_mc/is_mp change, slight perf degradation on sp cached operation
* removed stack hanler, may re-introduce at a later date
* Changes out of code reviews
v2 changes:
* There was a lot of duplicate code between rte_mempool_xmem_create and
rte_mempool_create_ext. This has now been refactored and is now
hopefully cleaner.
* The RTE_NEXT_ABI define is now used to allow building of the library
in a format that is compatible with binaries built against previous
versions of DPDK.
* Changes out of code reviews. Hopefully I've got most of them included.
The Mempool Handler feature is an extension to the mempool API that allows
users to add and use an alternative mempool handler, which allows
external memory subsystems such as external hardware memory management
systems and software based memory allocators to be used with DPDK.
The existing API to the internal DPDK mempool handler will remain unchanged
and will be backward compatible. However, there will be an ABI breakage, as
the mempool struct is changing.
There are two aspects to mempool handlers.
1. Adding the code for your new mempool operations (ops). This is
achieved by adding a new mempool ops source file into the
librte_mempool library, and using the MEMPOOL_REGISTER_OPS macro.
2. Using the new API to call rte_mempool_create_empty and
rte_mempool_set_ops_byname to create a new mempool
using the name parameter to identify which ops to use.
New API calls added
1. A new rte_mempool_create_empty() function
2. rte_mempool_set_ops_byname(), which sets the mempool's ops (functions)
3. The rte_mempool_populate_default() and rte_mempool_populate_anon() functions,
which populate the mempool using the relevant ops
Several mempool handlers may be used in the same application. A new
mempool can then be created by using the new rte_mempool_create_empty function,
then calling rte_mempool_set_ops_byname to point the mempool to the relevant
mempool handler callback (ops) structure.
Legacy applications will continue to work with the old rte_mempool_create API
call, which uses a ring based mempool handler by default. Applications that
want a different handler will need to be modified to select one.
A mempool handler needs to provide the following functions.
1. alloc - allocates the mempool memory, and adds each object onto a ring
2. enqueue - puts an object back into the mempool once an application has
finished with it
3. dequeue - gets an object from the mempool for use by the application
4. get_count - gets the number of available objects in the mempool
5. free - frees the mempool memory
Every time an enqueue/dequeue/get_count is called from the application/PMD,
the callback for that mempool is called. These functions are in the fastpath,
and any unoptimised ops may limit performance.
The new APIs are as follows:
1. rte_mempool_create_empty
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags);
2. rte_mempool_set_ops_byname()
int
rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
void *pool_config);
3. rte_mempool_populate_default()
int rte_mempool_populate_default(struct rte_mempool *mp);
4. rte_mempool_populate_anon()
int rte_mempool_populate_anon(struct rte_mempool *mp);
Please see rte_mempool.h for further information on the parameters.
The important thing to note is that the mempool ops struct is passed by name
to rte_mempool_set_ops_byname, which looks through the ops struct array to
get the ops_index, which is then stored in the rte_mempool structure. This
allows multiple processes to use the same mempool, as the function pointers
are accessed via the ops index.
The mempool ops structure contains callbacks to the implementation of
the ops function, and is set up for registration as follows:
static const struct rte_mempool_ops ops_sp_mc = {
.name = "ring_sp_mc",
.alloc = common_ring_alloc,
.enqueue = common_ring_sp_enqueue,
.dequeue = common_ring_mc_dequeue,
.get_count = common_ring_get_count,
.free = common_ring_free,
};
And then the following macro will register the ops in the array of ops
structures
MEMPOOL_REGISTER_OPS(ops_mp_mc);
For an example of API usage, please see app/test/test_mempool.c, which
implements a rudimentary "custom_handler" mempool handler using simple mallocs
for each mempool object. This file also contains the callbacks and self
registration for the new handler.
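As a rough sketch of the shape such a handler takes, here is a toy,
single-threaded LIFO kept in a malloc'd array. All names are illustrative,
locking is omitted, and it is not meant as a real handler:

#include <errno.h>
#include <stdlib.h>

#include <rte_mempool.h>

#define TOY_MAX_OBJS 8192

struct toy_stack {
	unsigned len;
	void *objs[TOY_MAX_OBJS];
};

static int
toy_alloc(struct rte_mempool *mp)
{
	mp->pool_data = calloc(1, sizeof(struct toy_stack));
	return mp->pool_data == NULL ? -ENOMEM : 0;
}

static void
toy_free(struct rte_mempool *mp)
{
	free(mp->pool_data);
}

static int
toy_enqueue(struct rte_mempool *mp, void * const *obj_table, unsigned n)
{
	struct toy_stack *s = mp->pool_data;
	unsigned i;

	if (s->len + n > TOY_MAX_OBJS)
		return -ENOBUFS;
	for (i = 0; i < n; i++)
		s->objs[s->len++] = obj_table[i];
	return 0;
}

static int
toy_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
{
	struct toy_stack *s = mp->pool_data;
	unsigned i;

	if (s->len < n)
		return -ENOENT;
	for (i = 0; i < n; i++)
		obj_table[i] = s->objs[--s->len];
	return 0;
}

static unsigned
toy_get_count(const struct rte_mempool *mp)
{
	return ((const struct toy_stack *)mp->pool_data)->len;
}

static const struct rte_mempool_ops toy_ops = {
	.name = "toy_handler",
	.alloc = toy_alloc,
	.free = toy_free,
	.enqueue = toy_enqueue,
	.dequeue = toy_dequeue,
	.get_count = toy_get_count,
};

MEMPOOL_REGISTER_OPS(toy_ops);

A real handler must of course be safe against concurrent enqueue/dequeue
from multiple lcores, which the ring-based defaults inherit from rte_ring.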
David Hunt (2):
mempool: support mempool handler operations
mbuf: make default mempool ops configurable at build
Olivier Matz (1):
app/test: test mempool handler
* [dpdk-dev] [PATCH v16 1/3] mempool: support mempool handler operations
2016-06-22 9:27 3% ` [dpdk-dev] [PATCH v16 0/3] mempool: add mempool handler feature David Hunt
@ 2016-06-22 9:27 1% ` David Hunt
0 siblings, 0 replies; 200+ results
From: David Hunt @ 2016-06-22 9:27 UTC (permalink / raw)
To: dev; +Cc: olivier.matz, viktorin, jerin.jacob, shreyansh.jain, David Hunt
Until now, the objects stored in a mempool were kept internally in a
ring. This patch introduces the possibility to register external handlers
that replace the ring.
The default behavior remains unchanged, but calling the new function
rte_mempool_set_ops_byname() right after rte_mempool_create_empty() allows
the user to change the handler that will be used when populating
the mempool.
This patch also adds a set of default ops (function callbacks) based
on rte_ring.
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: David Hunt <david.hunt@intel.com>
Acked-by: Shreyansh Jain <shreyansh.jain@nxp.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
app/test/test_mempool_perf.c | 1 -
doc/guides/prog_guide/mempool_lib.rst | 32 +++-
doc/guides/rel_notes/deprecation.rst | 9 -
lib/librte_mempool/Makefile | 2 +
lib/librte_mempool/rte_mempool.c | 67 +++-----
lib/librte_mempool/rte_mempool.h | 255 ++++++++++++++++++++++++++---
lib/librte_mempool/rte_mempool_ops.c | 151 +++++++++++++++++
lib/librte_mempool/rte_mempool_ring.c | 161 ++++++++++++++++++
lib/librte_mempool/rte_mempool_version.map | 13 +-
9 files changed, 610 insertions(+), 81 deletions(-)
create mode 100644 lib/librte_mempool/rte_mempool_ops.c
create mode 100644 lib/librte_mempool/rte_mempool_ring.c
diff --git a/app/test/test_mempool_perf.c b/app/test/test_mempool_perf.c
index c5e3576..c5f8455 100644
--- a/app/test/test_mempool_perf.c
+++ b/app/test/test_mempool_perf.c
@@ -161,7 +161,6 @@ per_lcore_mempool_test(__attribute__((unused)) void *arg)
n_get_bulk);
if (unlikely(ret < 0)) {
rte_mempool_dump(stdout, mp);
- rte_ring_dump(stdout, mp->ring);
/* in this case, objects are lost... */
return -1;
}
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index c3afc2e..1943fc4 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -34,7 +34,8 @@ Mempool Library
===============
A memory pool is an allocator of a fixed-sized object.
-In the DPDK, it is identified by name and uses a ring to store free objects.
+In the DPDK, it is identified by name and uses a mempool handler to store free objects.
+The default mempool handler is ring based.
It provides some other optional services such as a per-core object cache and
an alignment helper to ensure that objects are padded to spread them equally on all DRAM or DDR3 channels.
@@ -127,6 +128,35 @@ The maximum size of the cache is static and is defined at compilation time (CONF
A mempool in Memory with its Associated Ring
+Mempool Handlers
+------------------------
+
+This allows external memory subsystems, such as external hardware memory
+management systems and software based memory allocators, to be used with DPDK.
+
+There are two aspects to a mempool handler.
+
+* Adding the code for your new mempool operations (ops). This is achieved by
+ adding a new mempool ops code, and using the ``MEMPOOL_REGISTER_OPS`` macro.
+
+* Using the new API to call ``rte_mempool_create_empty()`` and
+ ``rte_mempool_set_ops_byname()`` to create a new mempool and specifying which
+ ops to use.
+
+Several different mempool handlers may be used in the same application. A new
+mempool can be created by using the ``rte_mempool_create_empty()`` function,
+then using ``rte_mempool_set_ops_byname()`` to point the mempool to the
+relevant mempool handler callback (ops) structure.
+
+Legacy applications may continue to use the old ``rte_mempool_create()`` API
+call, which uses a ring based mempool handler by default. These applications
+will need to be modified to use a new mempool handler.
+
+For applications that use ``rte_pktmbuf_pool_create()``, there is a config setting
+(``RTE_MBUF_DEFAULT_MEMPOOL_OPS``) that allows the application to make use of
+an alternative mempool handler.
+
+
Use Cases
---------
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index f75183f..3cbc19e 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -34,15 +34,6 @@ Deprecation Notices
compact API. The ones that remain are backwards compatible and use the
per-lcore default cache if available. This change targets release 16.07.
-* The rte_mempool struct will be changed in 16.07 to facilitate the new
- external mempool manager functionality.
- The ring element will be replaced with a more generic 'pool' opaque pointer
- to allow new mempool handlers to use their own user-defined mempool
- layout. Also newly added to rte_mempool is a handler index.
- The existing API will be backward compatible, but there will be new API
- functions added to facilitate the creation of mempools using an external
- handler. The 16.07 release will contain these changes.
-
* A librte_vhost public structures refactor is planned for DPDK 16.07
that requires both ABI and API change.
The proposed refactor would expose DPDK vhost dev to applications as
diff --git a/lib/librte_mempool/Makefile b/lib/librte_mempool/Makefile
index 43423e0..a4c089e 100644
--- a/lib/librte_mempool/Makefile
+++ b/lib/librte_mempool/Makefile
@@ -42,6 +42,8 @@ LIBABIVER := 2
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ops.c
+SRCS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += rte_mempool_ring.c
# install includes
SYMLINK-$(CONFIG_RTE_LIBRTE_MEMPOOL)-include := rte_mempool.h
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index af71edd..e6a83d0 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -148,7 +148,7 @@ mempool_add_elem(struct rte_mempool *mp, void *obj, phys_addr_t physaddr)
#endif
/* enqueue in ring */
- rte_ring_sp_enqueue(mp->ring, obj);
+ rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
}
/* call obj_cb() for each mempool element */
@@ -303,40 +303,6 @@ rte_mempool_xmem_usage(__rte_unused void *vaddr, uint32_t elt_num,
return (size_t)paddr_idx << pg_shift;
}
-/* create the internal ring */
-static int
-rte_mempool_ring_create(struct rte_mempool *mp)
-{
- int rg_flags = 0, ret;
- char rg_name[RTE_RING_NAMESIZE];
- struct rte_ring *r;
-
- ret = snprintf(rg_name, sizeof(rg_name),
- RTE_MEMPOOL_MZ_FORMAT, mp->name);
- if (ret < 0 || ret >= (int)sizeof(rg_name))
- return -ENAMETOOLONG;
-
- /* ring flags */
- if (mp->flags & MEMPOOL_F_SP_PUT)
- rg_flags |= RING_F_SP_ENQ;
- if (mp->flags & MEMPOOL_F_SC_GET)
- rg_flags |= RING_F_SC_DEQ;
-
- /* Allocate the ring that will be used to store objects.
- * Ring functions will return appropriate errors if we are
- * running as a secondary process etc., so no checks made
- * in this function for that condition.
- */
- r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
- mp->socket_id, rg_flags);
- if (r == NULL)
- return -rte_errno;
-
- mp->ring = r;
- mp->flags |= MEMPOOL_F_RING_CREATED;
- return 0;
-}
-
/* free a memchunk allocated with rte_memzone_reserve() */
static void
rte_mempool_memchunk_mz_free(__rte_unused struct rte_mempool_memhdr *memhdr,
@@ -354,7 +320,7 @@ rte_mempool_free_memchunks(struct rte_mempool *mp)
void *elt;
while (!STAILQ_EMPTY(&mp->elt_list)) {
- rte_ring_sc_dequeue(mp->ring, &elt);
+ rte_mempool_ops_dequeue_bulk(mp, &elt, 1);
(void)elt;
STAILQ_REMOVE_HEAD(&mp->elt_list, next);
mp->populated_size--;
@@ -386,10 +352,11 @@ rte_mempool_populate_phys(struct rte_mempool *mp, char *vaddr,
int ret;
/* create the internal ring if not already done */
- if ((mp->flags & MEMPOOL_F_RING_CREATED) == 0) {
- ret = rte_mempool_ring_create(mp);
- if (ret < 0)
+ if ((mp->flags & MEMPOOL_F_POOL_CREATED) == 0) {
+ ret = rte_mempool_ops_alloc(mp);
+ if (ret != 0)
return ret;
+ mp->flags |= MEMPOOL_F_POOL_CREATED;
}
/* mempool is already populated */
@@ -703,7 +670,7 @@ rte_mempool_free(struct rte_mempool *mp)
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
rte_mempool_free_memchunks(mp);
- rte_ring_free(mp->ring);
+ rte_mempool_ops_free(mp);
rte_memzone_free(mp->mz);
}
@@ -815,6 +782,7 @@ rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
RTE_PTR_ADD(mp, MEMPOOL_HEADER_SIZE(mp, 0));
te->data = mp;
+
rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
TAILQ_INSERT_TAIL(mempool_list, te, next);
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
@@ -844,6 +812,19 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
if (mp == NULL)
return NULL;
+ /*
+ * Since we have 4 combinations of SP/SC/MP/MC, examine the flags to
+ * set the correct index into the table of ops structs.
+ */
+ if (flags & (MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET))
+ rte_mempool_set_ops_byname(mp, "ring_sp_sc", NULL);
+ else if (flags & MEMPOOL_F_SP_PUT)
+ rte_mempool_set_ops_byname(mp, "ring_sp_mc", NULL);
+ else if (flags & MEMPOOL_F_SC_GET)
+ rte_mempool_set_ops_byname(mp, "ring_mp_sc", NULL);
+ else
+ rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL);
+
/* call the mempool priv initializer */
if (mp_init)
mp_init(mp, mp_init_arg);
@@ -930,7 +911,7 @@ rte_mempool_count(const struct rte_mempool *mp)
unsigned count;
unsigned lcore_id;
- count = rte_ring_count(mp->ring);
+ count = rte_mempool_ops_get_count(mp);
if (mp->cache_size == 0)
return count;
@@ -1119,7 +1100,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
fprintf(f, "mempool <%s>@%p\n", mp->name, mp);
fprintf(f, " flags=%x\n", mp->flags);
- fprintf(f, " ring=<%s>@%p\n", mp->ring->name, mp->ring);
+ fprintf(f, " pool=%p\n", mp->pool_data);
fprintf(f, " phys_addr=0x%" PRIx64 "\n", mp->mz->phys_addr);
fprintf(f, " nb_mem_chunks=%u\n", mp->nb_mem_chunks);
fprintf(f, " size=%"PRIu32"\n", mp->size);
@@ -1140,7 +1121,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
}
cache_count = rte_mempool_dump_cache(f, mp);
- common_count = rte_ring_count(mp->ring);
+ common_count = rte_mempool_ops_get_count(mp);
if ((cache_count + common_count) > mp->size)
common_count = mp->size - cache_count;
fprintf(f, " common_pool_count=%u\n", common_count);
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 60339bd..0a1777c 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -67,6 +67,7 @@
#include <inttypes.h>
#include <sys/queue.h>
+#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_lcore.h>
@@ -203,10 +204,14 @@ struct rte_mempool_memhdr {
*/
struct rte_mempool {
char name[RTE_MEMPOOL_NAMESIZE]; /**< Name of mempool. */
- struct rte_ring *ring; /**< Ring to store objects. */
- const struct rte_memzone *mz; /**< Memzone where pool is allocated */
+ union {
+ void *pool_data; /**< Ring or pool to store objects. */
+ uint64_t pool_id; /**< External mempool identifier. */
+ };
+ void *pool_config; /**< optional args for ops alloc. */
+ const struct rte_memzone *mz; /**< Memzone where pool is alloc'd. */
int flags; /**< Flags of the mempool. */
- int socket_id; /**< Socket id passed at mempool creation. */
+ int socket_id; /**< Socket id passed at create. */
uint32_t size; /**< Max size of the mempool. */
uint32_t cache_size; /**< Size of per-lcore local cache. */
uint32_t cache_flushthresh;
@@ -217,6 +222,14 @@ struct rte_mempool {
uint32_t trailer_size; /**< Size of trailer (after elt). */
unsigned private_data_size; /**< Size of private data. */
+ /**
+ * Index into rte_mempool_ops_table array of mempool ops
+ * structs, which contain callback function pointers.
+ * We're using an index here rather than pointers to the callbacks
+ * to facilitate any secondary processes that may want to use
+ * this mempool.
+ */
+ int32_t ops_index;
struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
@@ -235,7 +248,7 @@ struct rte_mempool {
#define MEMPOOL_F_NO_CACHE_ALIGN 0x0002 /**< Do not align objs on cache lines.*/
#define MEMPOOL_F_SP_PUT 0x0004 /**< Default put is "single-producer".*/
#define MEMPOOL_F_SC_GET 0x0008 /**< Default get is "single-consumer".*/
-#define MEMPOOL_F_RING_CREATED 0x0010 /**< Internal: ring is created */
+#define MEMPOOL_F_POOL_CREATED 0x0010 /**< Internal: pool is created. */
#define MEMPOOL_F_NO_PHYS_CONTIG 0x0020 /**< Don't need physically contiguous objs. */
/**
@@ -325,6 +338,215 @@ void rte_mempool_check_cookies(const struct rte_mempool *mp,
#define __mempool_check_cookies(mp, obj_table_const, n, free) do {} while(0)
#endif /* RTE_LIBRTE_MEMPOOL_DEBUG */
+#define RTE_MEMPOOL_OPS_NAMESIZE 32 /**< Max length of ops struct name. */
+
+/**
+ * Prototype for implementation specific data provisioning function.
+ *
+ * The function should provide the implementation-specific memory
+ * for use by the other mempool ops functions in a given mempool ops struct.
+ * E.g. the default ops provides an instance of the rte_ring for this purpose.
+ * For other handlers, it will most likely point to a different type of data
+ * structure, and will be transparent to the application programmer.
+ * This function should set mp->pool_data.
+ */
+typedef int (*rte_mempool_alloc_t)(struct rte_mempool *mp);
+
+/**
+ * Free the opaque private data pointed to by mp->pool_data pointer.
+ */
+typedef void (*rte_mempool_free_t)(struct rte_mempool *mp);
+
+/**
+ * Enqueue an object into the external pool.
+ */
+typedef int (*rte_mempool_enqueue_t)(struct rte_mempool *mp,
+ void * const *obj_table, unsigned int n);
+
+/**
+ * Dequeue an object from the external pool.
+ */
+typedef int (*rte_mempool_dequeue_t)(struct rte_mempool *mp,
+ void **obj_table, unsigned int n);
+
+/**
+ * Return the number of available objects in the external pool.
+ */
+typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
+
+/** Structure defining mempool operations structure */
+struct rte_mempool_ops {
+ char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
+ rte_mempool_alloc_t alloc; /**< Allocate private data. */
+ rte_mempool_free_t free; /**< Free the external pool. */
+ rte_mempool_enqueue_t enqueue; /**< Enqueue an object. */
+ rte_mempool_dequeue_t dequeue; /**< Dequeue an object. */
+ rte_mempool_get_count get_count; /**< Get qty of available objs. */
+} __rte_cache_aligned;
+
+#define RTE_MEMPOOL_MAX_OPS_IDX 16 /**< Max registered ops structs */
+
+/**
+ * Structure storing the table of registered ops structs, each of which contain
+ * the function pointers for the mempool ops functions.
+ * Each process has its own storage for this ops struct array so that
+ * the mempools can be shared across primary and secondary processes.
+ * The indices used to access the array are valid across processes, whereas
+ * any function pointers stored directly in the mempool struct would not be.
+ * This results in us simply having "ops_index" in the mempool struct.
+ */
+struct rte_mempool_ops_table {
+ rte_spinlock_t sl; /**< Spinlock for add/delete. */
+ uint32_t num_ops; /**< Number of used ops structs in the table. */
+ /**
+ * Storage for all possible ops structs.
+ */
+ struct rte_mempool_ops ops[RTE_MEMPOOL_MAX_OPS_IDX];
+} __rte_cache_aligned;
+
+/** Array of registered ops structs. */
+extern struct rte_mempool_ops_table rte_mempool_ops_table;
+
+/**
+ * @internal Get the mempool ops struct from its index.
+ *
+ * @param ops_index
+ * The index of the ops struct in the ops struct table. It must be a valid
+ * index: (0 <= idx < num_ops).
+ * @return
+ * The pointer to the ops struct in the table.
+ */
+static inline struct rte_mempool_ops *
+rte_mempool_get_ops(int ops_index)
+{
+ RTE_VERIFY((ops_index >= 0) && (ops_index < RTE_MEMPOOL_MAX_OPS_IDX));
+
+ return &rte_mempool_ops_table.ops[ops_index];
+}
+
+/**
+ * @internal Wrapper for mempool_ops alloc callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * - 0: Success; successfully allocated mempool pool_data.
+ * - <0: Error; code of alloc function.
+ */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp);
+
+/**
+ * @internal Wrapper for mempool_ops dequeue callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to get.
+ * @return
+ * - 0: Success; got n objects.
+ * - <0: Error; code of dequeue function.
+ */
+static inline int
+rte_mempool_ops_dequeue_bulk(struct rte_mempool *mp,
+ void **obj_table, unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_get_ops(mp->ops_index);
+ return ops->dequeue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops enqueue callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param obj_table
+ * Pointer to a table of void * pointers (objects).
+ * @param n
+ * Number of objects to put.
+ * @return
+ * - 0: Success; n objects supplied.
+ * - <0: Error; code of enqueue function.
+ */
+static inline int
+rte_mempool_ops_enqueue_bulk(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_get_ops(mp->ops_index);
+ return ops->enqueue(mp, obj_table, n);
+}
+
+/**
+ * @internal wrapper for mempool_ops get_count callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @return
+ * The number of available objects in the external pool.
+ */
+unsigned
+rte_mempool_ops_get_count(const struct rte_mempool *mp);
+
+/**
+ * @internal wrapper for mempool_ops free callback.
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ */
+void
+rte_mempool_ops_free(struct rte_mempool *mp);
+
+/**
+ * Set the ops of a mempool.
+ *
+ * This can only be done on a mempool that is not populated, i.e. just after
+ * a call to rte_mempool_create_empty().
+ *
+ * @param mp
+ * Pointer to the memory pool.
+ * @param name
+ * Name of the ops structure to use for this mempool.
+ * @param pool_config
+ * Opaque data that can be passed by the application to the ops functions.
+ * @return
+ * - 0: Success; the mempool is now using the requested ops functions.
+ * - -EINVAL - Invalid ops struct name provided.
+ * - -EEXIST - mempool already has an ops struct assigned.
+ */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
+ void *pool_config);
+
+/**
+ * Register mempool operations.
+ *
+ * @param ops
+ * Pointer to an ops structure to register.
+ * @return
+ * - >=0: Success; return the index of the ops struct in the table.
+ * - -EINVAL - some missing callbacks while registering ops struct.
+ * - -ENOSPC - the maximum number of ops structs has been reached.
+ */
+int rte_mempool_register_ops(const struct rte_mempool_ops *ops);
+
+/**
+ * Macro to statically register the ops of a mempool handler.
+ * Note that rte_mempool_register_ops fails silently here when
+ * more than RTE_MEMPOOL_MAX_OPS_IDX ops structs are registered.
+ */
+#define MEMPOOL_REGISTER_OPS(ops) \
+ void mp_hdlr_init_##ops(void); \
+ void __attribute__((constructor, used)) mp_hdlr_init_##ops(void)\
+ { \
+ rte_mempool_register_ops(&ops); \
+ }
+
/**
* An object callback function for mempool.
*
@@ -774,7 +996,7 @@ __mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table,
cache->len += n;
if (cache->len >= flushthresh) {
- rte_ring_mp_enqueue_bulk(mp->ring, &cache->objs[cache_size],
+ rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache_size],
cache->len - cache_size);
cache->len = cache_size;
}
@@ -785,19 +1007,10 @@ ring_enqueue:
/* push remaining objects in ring */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
- if (is_mp) {
- if (rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
- else {
- if (rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n) < 0)
- rte_panic("cannot put objects in mempool\n");
- }
+ if (rte_mempool_ops_enqueue_bulk(mp, obj_table, n) < 0)
+ rte_panic("cannot put objects in mempool\n");
#else
- if (is_mp)
- rte_ring_mp_enqueue_bulk(mp->ring, obj_table, n);
- else
- rte_ring_sp_enqueue_bulk(mp->ring, obj_table, n);
+ rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
#endif
}
@@ -945,7 +1158,8 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
uint32_t req = n + (cache_size - cache->len);
/* How many do we require i.e. number to fill the cache + the request */
- ret = rte_ring_mc_dequeue_bulk(mp->ring, &cache->objs[cache->len], req);
+ ret = rte_mempool_ops_dequeue_bulk(mp,
+ &cache->objs[cache->len], req);
if (unlikely(ret < 0)) {
/*
* In the offchance that we are buffer constrained,
@@ -972,10 +1186,7 @@ __mempool_get_bulk(struct rte_mempool *mp, void **obj_table,
ring_dequeue:
/* get remaining objects from ring */
- if (is_mc)
- ret = rte_ring_mc_dequeue_bulk(mp->ring, obj_table, n);
- else
- ret = rte_ring_sc_dequeue_bulk(mp->ring, obj_table, n);
+ ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
if (ret < 0)
__MEMPOOL_STAT_ADD(mp, get_fail, n);
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
new file mode 100644
index 0000000..fd0b64c
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -0,0 +1,151 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2016 6WIND S.A.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_mempool.h>
+#include <rte_errno.h>
+
+/* indirect jump table to support external memory pools. */
+struct rte_mempool_ops_table rte_mempool_ops_table = {
+ .sl = RTE_SPINLOCK_INITIALIZER,
+ .num_ops = 0
+};
+
+/* add a new ops struct in rte_mempool_ops_table, return its index. */
+int
+rte_mempool_register_ops(const struct rte_mempool_ops *h)
+{
+ struct rte_mempool_ops *ops;
+ int16_t ops_index;
+
+ rte_spinlock_lock(&rte_mempool_ops_table.sl);
+
+ if (rte_mempool_ops_table.num_ops >=
+ RTE_MEMPOOL_MAX_OPS_IDX) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Maximum number of mempool ops structs exceeded\n");
+ return -ENOSPC;
+ }
+
+ if (h->alloc == NULL || h->enqueue == NULL ||
+ h->dequeue == NULL || h->get_count == NULL) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(ERR, MEMPOOL,
+ "Missing callback while registering mempool ops\n");
+ return -EINVAL;
+ }
+
+ if (strlen(h->name) >= sizeof(ops->name) - 1) {
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+ RTE_LOG(DEBUG, EAL, "%s(): mempool_ops <%s>: name too long\n",
+ __func__, h->name);
+ rte_errno = EEXIST;
+ return -EEXIST;
+ }
+
+ ops_index = rte_mempool_ops_table.num_ops++;
+ ops = &rte_mempool_ops_table.ops[ops_index];
+ snprintf(ops->name, sizeof(ops->name), "%s", h->name);
+ ops->alloc = h->alloc;
+ ops->enqueue = h->enqueue;
+ ops->dequeue = h->dequeue;
+ ops->get_count = h->get_count;
+
+ rte_spinlock_unlock(&rte_mempool_ops_table.sl);
+
+ return ops_index;
+}
+
+/* wrapper to allocate an external mempool's private (pool) data. */
+int
+rte_mempool_ops_alloc(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_get_ops(mp->ops_index);
+ return ops->alloc(mp);
+}
+
+/* wrapper to free an external pool ops. */
+void
+rte_mempool_ops_free(struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_get_ops(mp->ops_index);
+ if (ops->free == NULL)
+ return;
+ ops->free(mp);
+}
+
+/* wrapper to get available objects in an external mempool. */
+unsigned int
+rte_mempool_ops_get_count(const struct rte_mempool *mp)
+{
+ struct rte_mempool_ops *ops;
+
+ ops = rte_mempool_get_ops(mp->ops_index);
+ return ops->get_count(mp);
+}
+
+/* sets mempool ops previously registered by rte_mempool_register_ops. */
+int
+rte_mempool_set_ops_byname(struct rte_mempool *mp, const char *name,
+ void *pool_config)
+{
+ struct rte_mempool_ops *ops = NULL;
+ unsigned i;
+
+ /* too late, the mempool is already populated. */
+ if (mp->flags & MEMPOOL_F_POOL_CREATED)
+ return -EEXIST;
+
+ for (i = 0; i < rte_mempool_ops_table.num_ops; i++) {
+ if (!strcmp(name,
+ rte_mempool_ops_table.ops[i].name)) {
+ ops = &rte_mempool_ops_table.ops[i];
+ break;
+ }
+ }
+
+ if (ops == NULL)
+ return -EINVAL;
+
+ mp->ops_index = i;
+ mp->pool_config = pool_config;
+ return 0;
+}
diff --git a/lib/librte_mempool/rte_mempool_ring.c b/lib/librte_mempool/rte_mempool_ring.c
new file mode 100644
index 0000000..b9aa64d
--- /dev/null
+++ b/lib/librte_mempool/rte_mempool_ring.c
@@ -0,0 +1,161 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_errno.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+
+static int
+common_ring_mp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_mp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sp_enqueue(struct rte_mempool *mp, void * const *obj_table,
+ unsigned n)
+{
+ return rte_ring_sp_enqueue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_mc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_mc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static int
+common_ring_sc_dequeue(struct rte_mempool *mp, void **obj_table, unsigned n)
+{
+ return rte_ring_sc_dequeue_bulk(mp->pool_data, obj_table, n);
+}
+
+static unsigned
+common_ring_get_count(const struct rte_mempool *mp)
+{
+ return rte_ring_count(mp->pool_data);
+}
+
+
+static int
+common_ring_alloc(struct rte_mempool *mp)
+{
+ int rg_flags = 0, ret;
+ char rg_name[RTE_RING_NAMESIZE];
+ struct rte_ring *r;
+
+ ret = snprintf(rg_name, sizeof(rg_name),
+ RTE_MEMPOOL_MZ_FORMAT, mp->name);
+ if (ret < 0 || ret >= (int)sizeof(rg_name)) {
+ rte_errno = ENAMETOOLONG;
+ return -rte_errno;
+ }
+
+ /* ring flags */
+ if (mp->flags & MEMPOOL_F_SP_PUT)
+ rg_flags |= RING_F_SP_ENQ;
+ if (mp->flags & MEMPOOL_F_SC_GET)
+ rg_flags |= RING_F_SC_DEQ;
+
+ /*
+ * Allocate the ring that will be used to store objects.
+ * Ring functions will return appropriate errors if we are
+ * running as a secondary process etc., so no checks made
+ * in this function for that condition.
+ */
+ r = rte_ring_create(rg_name, rte_align32pow2(mp->size + 1),
+ mp->socket_id, rg_flags);
+ if (r == NULL)
+ return -rte_errno;
+
+ mp->pool_data = r;
+
+ return 0;
+}
+
+static void
+common_ring_free(struct rte_mempool *mp)
+{
+ rte_ring_free(mp->pool_data);
+}
+
+/*
+ * The following 4 declarations of mempool ops structs address
+ * the need for the backward compatible mempool handlers for
+ * single/multi producers and single/multi consumers as dictated by the
+ * flags provided to the rte_mempool_create function
+ */
+static const struct rte_mempool_ops ops_mp_mc = {
+ .name = "ring_mp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_sc = {
+ .name = "ring_sp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_mp_sc = {
+ .name = "ring_mp_sc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_mp_enqueue,
+ .dequeue = common_ring_sc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+static const struct rte_mempool_ops ops_sp_mc = {
+ .name = "ring_sp_mc",
+ .alloc = common_ring_alloc,
+ .free = common_ring_free,
+ .enqueue = common_ring_sp_enqueue,
+ .dequeue = common_ring_mc_dequeue,
+ .get_count = common_ring_get_count,
+};
+
+MEMPOOL_REGISTER_OPS(ops_mp_mc);
+MEMPOOL_REGISTER_OPS(ops_sp_sc);
+MEMPOOL_REGISTER_OPS(ops_mp_sc);
+MEMPOOL_REGISTER_OPS(ops_sp_mc);
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index f63461b..a4a6c1f 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -20,15 +20,18 @@ DPDK_16.7 {
global:
rte_mempool_check_cookies;
- rte_mempool_obj_iter;
- rte_mempool_mem_iter;
rte_mempool_create_empty;
+ rte_mempool_free;
+ rte_mempool_mem_iter;
+ rte_mempool_obj_iter;
+ rte_mempool_ops_table;
+ rte_mempool_populate_anon;
+ rte_mempool_populate_default;
rte_mempool_populate_phys;
rte_mempool_populate_phys_tab;
rte_mempool_populate_virt;
- rte_mempool_populate_default;
- rte_mempool_populate_anon;
- rte_mempool_free;
+ rte_mempool_register_ops;
+ rte_mempool_set_ops_byname;
local: *;
} DPDK_2.0;
--
2.5.5
^ permalink raw reply [relevance 1%]
* Re: [dpdk-dev] [PATCH v5 09/17] crypto: get rid of crypto driver register callback
@ 2016-06-22 13:44 3% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-06-22 13:44 UTC (permalink / raw)
To: Neil Horman; +Cc: Shreyansh Jain, dev, viktorin
2016-06-22 09:27, Neil Horman:
> > +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> > - rte_cryptodev_pmd_driver_register;
> NAK, you can't just remove exported symbols without going through the
> deprecation process. Better still would be to only expose it for DPDK_16.04 and
> hide it in the next release
This function is not called by the application.
Thus there is no ABI break.
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH 0/3] ethdev: add helper functions to get eth_dev and dev private data
@ 2016-06-22 21:47 0% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-06-22 21:47 UTC (permalink / raw)
To: Ferruh Yigit; +Cc: dev, Declan Doherty
2016-02-17 14:20, Ferruh Yigit:
> This is to provide abstraction and reduce global variable access.
>
> Global variable rte_eth_devices kept exported to not break ABI.
>
> Bonding driver not selected on purpose; it just seems it is using
> rte_eth_devices heavily.
The struct rte_eth_dev is marked internal.
It is a good goal to remove access to the global array rte_eth_devices,
but the fix must be in the code accessing it only (bonding).
This patchset is rejected.
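(For context, a minimal sketch, with a hypothetical helper name, of the kind
of accessor the patchset proposed -- a thin wrapper around the exported
array, which is why the abstraction was judged to belong in the calling code
instead:)

#include <rte_ethdev.h>

static inline struct rte_eth_dev *
eth_dev_get(uint8_t port_id)
{
	return &rte_eth_devices[port_id];
}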
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v3 00/20] vhost ABI/API refactoring
2016-06-07 3:51 9% ` [dpdk-dev] [PATCH v3 00/20] " Yuanhan Liu
` (5 preceding siblings ...)
2016-06-14 12:00 4% ` [dpdk-dev] [PATCH v3 00/20] vhost ABI/API refactoring Yuanhan Liu
@ 2016-06-30 7:39 9% ` Panu Matilainen
2016-06-30 7:57 4% ` Yuanhan Liu
6 siblings, 1 reply; 200+ results
From: Panu Matilainen @ 2016-06-30 7:39 UTC (permalink / raw)
To: Yuanhan Liu, dev
Cc: huawei.xie, Thomas Monjalon, Traynor Kevin, Rich Lane, Tetsuya Mukawa
On 06/07/2016 06:51 AM, Yuanhan Liu wrote:
> v3: - adapted the new vhost ABI/API changes to the tep_term example, to
> make sure not to break the build at least.
> - bumped the ABI version to 3
>
> NOTE: I created a branch at dpdk.org [0] for more convenient testing:
>
> [0]: git://dpdk.org/next/dpdk-next-virtio for-testing
>
>
> Every time we introduce a new feature to vhost, we are likely to break
> ABI. Moreover, some cleanups (such as the one from Ilya to remove vec_buf
> from vhost_virtqueue struct) also break ABI.
>
> This patch set is meant to resolve the above issue ultimately, by hiding
> the virtio_net structure (as well as a few others) internally, and exporting
> the virtio_net dev struct to applications as a number, vid, the way the
> kernel exposes an fd to user space.
>
> Back to the patch set, the first part of this set makes some changes to
> the vhost example, vhost-pmd and vhost, bit by bit, to remove the
> dependence on the "virtio_net" struct. It then makes the final change to
> adapt the current APIs to using "vid".
>
> After that, "vrtio_net_device_ops" is the only left open struct that an
> application can acces, therefore, it's the only place that might introduce
> potential ABI breakage in future for extension. Hence, I made few more
> (5) space reservation, to make sure we will not break ABI for a long time,
> and hopefuly, forever.
I've been intending to say this for a while but it seems I never actually
got around to doing so:
This is a really fine example of how to refactor an API against constant
ABI breakages, thank you Yuanhan! Exported structs are one of the
biggest obstacles in keeping a stable ABI while adding new features, and
while it's not always possible to hide everything to this extent, the
damage (erm, exposure) can usually be considerably limited by careful
API design.
Since the first and foremost objection against doing this in the
DPDK context is always "but performance!", I'm curious as to what sort
of numbers you're getting with the new API vs the old one? I'm really
hoping other libraries would follow suit after seeing that it's possible
to provide a future-proof API/ABI without sacrificing performance :)
Thanks again,
- Panu -
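(To make the pattern concrete: a minimal sketch, with hypothetical names, of
the handle-based design praised above. The private struct lives only inside
the library, so fields can be added freely; applications only ever hold an
integer id. The real vhost API differs in detail.)

#include <stdint.h>
#include <stdlib.h>

#define MAX_DEVICES 64

/* Never exposed in a public header: growing this struct later
 * cannot change the application-visible ABI. */
struct dev_private {
	uint64_t features;
};

static struct dev_private *devices[MAX_DEVICES];

/* The public API traffics only in integer handles ("vid"). */
int dev_new(void)
{
	int i;

	for (i = 0; i < MAX_DEVICES; i++) {
		if (devices[i] == NULL) {
			devices[i] = calloc(1, sizeof(*devices[i]));
			return devices[i] != NULL ? i : -1;
		}
	}
	return -1;
}

int dev_get_features(int vid, uint64_t *features)
{
	if (vid < 0 || vid >= MAX_DEVICES || devices[vid] == NULL)
		return -1;
	*features = devices[vid]->features;
	return 0;
}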
^ permalink raw reply [relevance 9%]
* Re: [dpdk-dev] [PATCH v3 00/20] vhost ABI/API refactoring
2016-06-30 7:39 9% ` Panu Matilainen
@ 2016-06-30 7:57 4% ` Yuanhan Liu
2016-06-30 9:05 7% ` Panu Matilainen
0 siblings, 1 reply; 200+ results
From: Yuanhan Liu @ 2016-06-30 7:57 UTC (permalink / raw)
To: Panu Matilainen
Cc: dev, huawei.xie, Thomas Monjalon, Rich Lane, Tetsuya Mukawa
On Thu, Jun 30, 2016 at 10:39:45AM +0300, Panu Matilainen wrote:
> On 06/07/2016 06:51 AM, Yuanhan Liu wrote:
> >v3: - adapted the new vhost ABI/API changes to the tep_term example, to
> > make sure not to break the build at least.
> > - bumped the ABI version to 3
> >
> >NOTE: I created a branch at dpdk.org [0] for more convenient testing:
> >
> > [0]: git://dpdk.org/next/dpdk-next-virtio for-testing
> >
> >
> >Every time we introduce a new feature to vhost, we are likely to break
> >ABI. Moreover, some cleanups (such as the one from Ilya to remove vec_buf
> >from vhost_virtqueue struct) also break ABI.
> >
> >This patch set is meant to resolve the above issue ultimately, by hiding
> >the virtio_net structure (as well as a few others) internally, and exporting
> >the virtio_net dev struct to applications as a number, vid, the way the
> >kernel exposes an fd to user space.
> >
> >Back to the patch set, the first part of this set makes some changes to
> >the vhost example, vhost-pmd and vhost, bit by bit, to remove the
> >dependence on the "virtio_net" struct. It then makes the final change to
> >adapt the current APIs to using "vid".
> >
> >After that, "virtio_net_device_ops" is the only open struct left that an
> >application can access; therefore, it's the only place that might introduce
> >potential ABI breakage in future extensions. Hence, I made a few more
> >(5) space reservations, to make sure we will not break the ABI for a long
> >time, and hopefully, forever.
>
> I've been intending to say this for a while but it seems I never actually
> got around to doing so:
>
> This is a really fine example of how to refactor an API against constant ABI
> breakages, thank you Yuanhan!
Panu, thanks!
> Exported structs are one of the biggest
> obstacles in keeping a stable ABI while adding new features, and while its
> not always possible to hide everything to this extent, the damage (erm,
> exposure) can usually be considerably limited by careful API design.
Agreed.
> Since the first and foremost objection against doing this in the DPDK
> context is always "but performance!", I'm curious as to what sort of numbers
> you're getting with the new API vs the old one? I'm really hoping other
> libraries would follow suit after seeing that its possible to provide a
> future-proof API/ABI without sacrificing performance :)
From my (limited) test, nope, I see no performance drop at all, not even a
little.
--yliu
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [PATCH v3 00/20] vhost ABI/API refactoring
2016-06-30 7:57 4% ` Yuanhan Liu
@ 2016-06-30 9:05 7% ` Panu Matilainen
2016-06-30 11:15 7% ` Mcnamara, John
0 siblings, 1 reply; 200+ results
From: Panu Matilainen @ 2016-06-30 9:05 UTC (permalink / raw)
To: Yuanhan Liu; +Cc: dev, huawei.xie, Thomas Monjalon, Rich Lane, Tetsuya Mukawa
On 06/30/2016 10:57 AM, Yuanhan Liu wrote:
> On Thu, Jun 30, 2016 at 10:39:45AM +0300, Panu Matilainen wrote:
>> On 06/07/2016 06:51 AM, Yuanhan Liu wrote:
>>> v3: - adapted the new vhost ABI/API changes to the tep_term example, to
>>> make sure not to break the build at least.
>>> - bumped the ABI version to 3
>>>
>>> NOTE: I created a branch at dpdk.org [0] for more convenient testing:
>>>
>>> [0]: git://dpdk.org/next/dpdk-next-virtio for-testing
>>>
>>>
>>> Every time we introduce a new feature to vhost, we are likely to break
>>> ABI. Moreover, some cleanups (such as the one from Ilya to remove vec_buf
>>> from vhost_virtqueue struct) also break ABI.
>>>
>>> This patch set is meant to resolve the above issue ultimately, by hiding
>>> the virtio_net structure (as well as a few others) internally, and
>>> exporting the virtio_net dev struct to applications as a number, vid,
>>> the way the kernel exposes an fd to user space.
>>>
>>> Back to the patch set, the first part of this set makes some changes to
>>> the vhost example, vhost-pmd and vhost, bit by bit, to remove the
>>> dependence on the "virtio_net" struct. It then makes the final change to
>>> adapt the current APIs to using "vid".
>>>
>>> After that, "virtio_net_device_ops" is the only open struct left that an
>>> application can access; therefore, it's the only place that might introduce
>>> potential ABI breakage in future extensions. Hence, I made a few more
>>> (5) space reservations, to make sure we will not break the ABI for a long
>>> time, and hopefully, forever.
>>
>> I've been intending to say this for a while but it seems I never actually
>> got around to doing so:
>>
>> This is a really fine example of how to refactor an API against constant ABI
>> breakages, thank you Yuanhan!
>
> Panu, thanks!
>
>> Exported structs are one of the biggest
>> obstacles in keeping a stable ABI while adding new features, and while it's
>> not always possible to hide everything to this extent, the damage (erm,
>> exposure) can usually be considerably limited by careful API design.
>
> Agreed.
>
>> Since the first and foremost objection against doing this in the DPDK
>> context is always "but performance!", I'm curious as to what sort of numbers
>> you're getting with the new API vs the old one? I'm really hoping other
>> libraries would follow suit after seeing that it's possible to provide a
>> future-proof API/ABI without sacrificing performance :)
>
> From my (limited) test, nope, I see no performance drop at all, not even a
> little.
Awesome!
With that, hopefully others will see the light and follow its example.
If nothing else, they ought to get a bit envious when you can add
features left and right without ever having to wait for API/ABI break
periods etc ;)
- Panu -
>
> --yliu
>
^ permalink raw reply [relevance 7%]
* Re: [dpdk-dev] [PATCH v3 00/20] vhost ABI/API refactoring
2016-06-30 9:05 7% ` Panu Matilainen
@ 2016-06-30 11:15 7% ` Mcnamara, John
2016-06-30 11:40 4% ` Thomas Monjalon
0 siblings, 1 reply; 200+ results
From: Mcnamara, John @ 2016-06-30 11:15 UTC (permalink / raw)
To: Panu Matilainen, Yuanhan Liu
Cc: dev, Xie, Huawei, Thomas Monjalon, Rich Lane, Tetsuya Mukawa
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Panu Matilainen
> Sent: Thursday, June 30, 2016 10:05 AM
> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> Cc: dev@dpdk.org; Xie, Huawei <huawei.xie@intel.com>; Thomas Monjalon
> <thomas.monjalon@6wind.com>; Rich Lane <rich.lane@bigswitch.com>; Tetsuya
> Mukawa <mukawa@igel.co.jp>
> Subject: Re: [dpdk-dev] [PATCH v3 00/20] vhost ABI/API refactoring
>
> On 06/30/2016 10:57 AM, Yuanhan Liu wrote:
> > On Thu, Jun 30, 2016 at 10:39:45AM +0300, Panu Matilainen wrote:
> >> On 06/07/2016 06:51 AM, Yuanhan Liu wrote:
> >>> v3: - adapted the new vhost ABI/API changes to the tep_term example,
> >>> to make sure not to break the build at least.
> >>> - bumped the ABI version to 3
> >>>
> >>> NOTE: I created a branch at dpdk.org [0] for more convenient testing:
> >>>
> >>> [0]: git://dpdk.org/next/dpdk-next-virtio for-testing
> >>>
> >>>
> >>> Every time we introduce a new feature to vhost, we are likely to
> >>> break ABI. Moreover, some cleanups (such as the one from Ilya to
> >>> remove vec_buf
> >>> from vhost_virtqueue struct) also break ABI.
> >>>
> >>> This patch set is meant to resolve the above issue ultimately, by
> >>> hiding the virtio_net structure (as well as a few others) internally,
> >>> and exporting the virtio_net dev struct to applications as a number,
> >>> vid, the way the kernel exposes an fd to user space.
> >>>
> >>> Back to the patch set, the first part of this set makes some changes
> >>> to the vhost example, vhost-pmd and vhost, bit by bit, to remove the
> >>> dependence on the "virtio_net" struct. It then makes the final change
> >>> to adapt the current APIs to using "vid".
> >>>
> >>> After that, "virtio_net_device_ops" is the only open struct left that
> >>> an application can access; therefore, it's the only place that might
> >>> introduce potential ABI breakage in future extensions. Hence, I made
> >>> a few more (5) space reservations, to make sure we will not break the
> >>> ABI for a long time, and hopefully, forever.
> >>
> >> I've been intending to say this for a while but it seems I never
> >> actually got around to doing so:
> >>
> >> This is a really fine example of how to refactor an API against
> >> constant ABI breakages, thank you Yuanhan!
> >
> > Panu, thanks!
> >
> >> Exported structs are one of the biggest obstacles in keeping a stable
> >> ABI while adding new features, and while it's not always possible to
> >> hide everything to this extent, the damage (erm,
> >> exposure) can usually be considerably limited by careful API design.
> >
> > Agreed.
> >
> >> Since the first and foremost objection against doing this in the
> >> DPDK context is always "but performance!", I'm curious as to what
> >> sort of numbers you're getting with the new API vs the old one? I'm
> >> really hoping other libraries would follow suit after seeing that it's
> >> possible to provide a future-proof API/ABI without sacrificing
> >> performance :)
> >
> > From my (limited) test, nope, I see no performance drop at all, not
> > even a little.
>
> Awesome!
>
> With that, hopefully others will see the light and follow its example.
> If nothing else, they ought to get a bit envious when you can add features
> left and right without ever having to wait for API/ABI break periods etc
> ;)
Agreed. We should be doing more of this type of refactoring work to make the API/ABI harder to break.
John
^ permalink raw reply [relevance 7%]
* Re: [dpdk-dev] [PATCH v3 00/20] vhost ABI/API refactoring
2016-06-30 11:15 7% ` Mcnamara, John
@ 2016-06-30 11:40 4% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-06-30 11:40 UTC (permalink / raw)
To: Mcnamara, John
Cc: Panu Matilainen, Yuanhan Liu, dev, Xie, Huawei, Rich Lane,
Tetsuya Mukawa
2016-06-30 11:15, Mcnamara, John:
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Panu Matilainen
> > On 06/30/2016 10:57 AM, Yuanhan Liu wrote:
> > > On Thu, Jun 30, 2016 at 10:39:45AM +0300, Panu Matilainen wrote:
> > >> On 06/07/2016 06:51 AM, Yuanhan Liu wrote:
> > >>> v3: - adapted the new vhost ABI/API changes to the tep_term example,
> > >>> to make sure not to break the build at least.
> > >>> - bumped the ABI version to 3
> > >>>
> > >>> NOTE: I created a branch at dpdk.org [0] for more convenient testing:
> > >>>
> > >>> [0]: git://dpdk.org/next/dpdk-next-virtio for-testing
> > >>>
> > >>>
> > >>> Every time we introduce a new feature to vhost, we are likely to
> > >>> break ABI. Moreover, some cleanups (such as the one from Ilya to
> > >>> remove vec_buf
> > >>> from vhost_virtqueue struct) also break ABI.
> > >>>
> > >>> This patch set is meant to resolve the above issue ultimately, by
> > >>> hiding the virtio_net structure (as well as a few others) internally,
> > >>> and exporting the virtio_net dev struct to applications as a number,
> > >>> vid, the way the kernel exposes an fd to user space.
> > >>>
> > >>> Back to the patch set, the first part of this set makes some changes
> > >>> to the vhost example, vhost-pmd and vhost, bit by bit, to remove the
> > >>> dependence on the "virtio_net" struct. It then makes the final change
> > >>> to adapt the current APIs to using "vid".
> > >>>
> > >>> After that, "virtio_net_device_ops" is the only open struct left
> > >>> that an application can access; therefore, it's the only place that
> > >>> might introduce potential ABI breakage in future extensions. Hence,
> > >>> I made a few more (5) space reservations, to make sure we will not
> > >>> break the ABI for a long time, and hopefully, forever.
> > >>
> > >> I've been intending to say this for a while but it seems I never
> > >> actually got around to doing so:
> > >>
> > >> This is a really fine example of how to refactor an API against
> > >> constant ABI breakages, thank you Yuanhan!
> > >
> > > Panu, thanks!
> > >
> > >> Exported structs are one of the biggest obstacles in keeping a stable
> > >> ABI while adding new features, and while it's not always possible to
> > >> hide everything to this extent, the damage (erm,
> > >> exposure) can usually be considerably limited by careful API design.
> > >
> > > Agreed.
> > >
> > >> Since the first and foremost objection against doing this in the
> > >> DPDK context is always "but performance!", I'm curious as to what
> > >> sort of numbers you're getting with the new API vs the old one? I'm
> > >> really hoping other libraries would follow suit after seeing that it's
> > >> possible to provide a future-proof API/ABI without sacrificing
> > >> performance :)
> > >
> > > From my (limited) test, nope, I see no performance drop at all, not
> > > even a little.
> >
> > Awesome!
> >
> > With that, hopefully others will see the light and follow its example.
> > If nothing else, they ought to get a bit envious when you can add features
> > left and right without ever having to wait for API/ABI break periods etc
> > ;)
>
> Agreed. We should be doing more of this type of refactoring work to make the API/ABI less easier to break.
+1
But we must check the possible performance degradation with care :)
^ permalink raw reply [relevance 4%]
* [dpdk-dev] [PATCH v9 4/7] pmdinfogen: parse driver to generate code to export
@ 2016-07-04 1:14 2% ` Thomas Monjalon
2016-07-04 1:14 2% ` [dpdk-dev] [PATCH v9 7/7] tools: query binaries for support information Thomas Monjalon
1 sibling, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-07-04 1:14 UTC (permalink / raw)
To: Neil Horman; +Cc: dev, Panu Matilainen
From: Neil Horman <nhorman@tuxdriver.com>
dpdk-pmdinfogen is a tool used to parse object files and build JSON
strings for use in later determining hardware support in a DSO or
application binary.
dpdk-pmdinfogen looks for the non-exported symbol names rte_pmd_name<n>
(where n is an integer counter) and <name>_pci_table_export.
It records the name of each of these tuples, using the latter to find
the symbolic name of the PCI table for physical devices that the object
supports. With this information, it outputs a C file with a single line
of the form:
const char <name>_pmd_info[] __attribute__((used)) =
	"PMD_INFO_STRING=<json_string>";
Where <name> is the arbitrary name of the PMD, and <json_string> is the
JSON encoded string that hold relevant PMD information, including the PMD
name, type and optional array of PCI device/vendor IDs that the driver
supports.
This C file is suitable for compiling to object code, then relocatably
linking into the parent file from which the C was generated. This creates
an entry in the string table of the object that can inform a later tool
about hardware support.
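For a hypothetical driver named "foo" with a single, purely illustrative PCI
ID entry, the generated line would look roughly as follows (wrapped here for
readability; the tool prints the IDs in decimal, with 65535 standing for
PCI_ANY_ID):

const char foo_pmd_info[] __attribute__((used)) =
	"PMD_INFO_STRING= {\"name\" : \"foo\", \"pci_ids\" : [[32902, 5546, 65535, 65535] ]}";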
Note 1: When installed as part of an SDK package, dpdk-pmdinfogen should
be built for the SDK target. It is not handled currently.
Note 2: Some generated files are not cleaned by "make clean".
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Panu Matilainen <pmatilai@redhat.com>
Acked-by: Remy Horton <remy.horton@intel.com>
Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
---
GNUmakefile | 2 +-
MAINTAINERS | 4 +
GNUmakefile => buildtools/Makefile | 17 +-
GNUmakefile => buildtools/pmdinfogen/Makefile | 21 +-
buildtools/pmdinfogen/pmdinfogen.c | 451 +++++++++++++++++++++++++
buildtools/pmdinfogen/pmdinfogen.h | 99 ++++++
doc/guides/prog_guide/dev_kit_build_system.rst | 15 +-
mk/rte.sdkbuild.mk | 2 +-
mk/rte.sdkinstall.mk | 3 +
9 files changed, 587 insertions(+), 27 deletions(-)
copy GNUmakefile => buildtools/Makefile (87%)
copy GNUmakefile => buildtools/pmdinfogen/Makefile (84%)
create mode 100644 buildtools/pmdinfogen/pmdinfogen.c
create mode 100644 buildtools/pmdinfogen/pmdinfogen.h
diff --git a/GNUmakefile b/GNUmakefile
index b59e4b6..00fe0db 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -40,6 +40,6 @@ export RTE_SDK
# directory list
#
-ROOTDIRS-y := lib drivers app
+ROOTDIRS-y := buildtools lib drivers app
include $(RTE_SDK)/mk/rte.sdkroot.mk
diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..1a8a3b7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -68,6 +68,10 @@ F: lib/librte_compat/
F: doc/guides/rel_notes/deprecation.rst
F: scripts/validate-abi.sh
+Driver information
+M: Neil Horman <nhorman@tuxdriver.com>
+F: buildtools/pmdinfogen/
+
Environment Abstraction Layer
-----------------------------
diff --git a/GNUmakefile b/buildtools/Makefile
similarity index 87%
copy from GNUmakefile
copy to buildtools/Makefile
index b59e4b6..35a42ff 100644
--- a/GNUmakefile
+++ b/buildtools/Makefile
@@ -1,6 +1,6 @@
# BSD LICENSE
#
-# Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+# Copyright(c) 2016 Neil Horman. All rights reserved.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -29,17 +29,8 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-# Head Makefile for compiling rte SDK
-#
-
-RTE_SDK := $(CURDIR)
-export RTE_SDK
-
-#
-# directory list
-#
+include $(RTE_SDK)/mk/rte.vars.mk
-ROOTDIRS-y := lib drivers app
+DIRS-y += pmdinfogen
-include $(RTE_SDK)/mk/rte.sdkroot.mk
+include $(RTE_SDK)/mk/rte.subdir.mk
diff --git a/GNUmakefile b/buildtools/pmdinfogen/Makefile
similarity index 84%
copy from GNUmakefile
copy to buildtools/pmdinfogen/Makefile
index b59e4b6..327927e 100644
--- a/GNUmakefile
+++ b/buildtools/pmdinfogen/Makefile
@@ -1,6 +1,6 @@
# BSD LICENSE
#
-# Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+# Copyright(c) 2016 Neil Horman. All rights reserved.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -29,17 +29,16 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-# Head Makefile for compiling rte SDK
-#
+include $(RTE_SDK)/mk/rte.vars.mk
-RTE_SDK := $(CURDIR)
-export RTE_SDK
+HOSTAPP_DIR = buildtools
+HOSTAPP = dpdk-pmdinfogen
-#
-# directory list
-#
+SRCS-y += pmdinfogen.c
+
+HOST_CFLAGS += $(WERROR_FLAGS) -g
+HOST_CFLAGS += -I$(RTE_OUTPUT)/include
-ROOTDIRS-y := lib drivers app
+DEPDIRS-y += lib/librte_eal
-include $(RTE_SDK)/mk/rte.sdkroot.mk
+include $(RTE_SDK)/mk/rte.hostapp.mk
diff --git a/buildtools/pmdinfogen/pmdinfogen.c b/buildtools/pmdinfogen/pmdinfogen.c
new file mode 100644
index 0000000..101bce1
--- /dev/null
+++ b/buildtools/pmdinfogen/pmdinfogen.c
@@ -0,0 +1,451 @@
+/* Postprocess pmd object files to export hw support
+ *
+ * Copyright 2016 Neil Horman <nhorman@tuxdriver.com>
+ * Based in part on modpost.c from the linux kernel
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License V2, incorporated herein by reference.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <rte_common.h>
+#include "pmdinfogen.h"
+
+#ifdef RTE_ARCH_64
+#define ADDR_SIZE 64
+#else
+#define ADDR_SIZE 32
+#endif
+
+
+static void *
+grab_file(const char *filename, unsigned long *size)
+{
+ struct stat st;
+ void *map = MAP_FAILED;
+ int fd;
+
+ fd = open(filename, O_RDONLY);
+ if (fd < 0)
+ return NULL;
+ if (fstat(fd, &st))
+ goto failed;
+
+ *size = st.st_size;
+ map = mmap(NULL, *size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+
+failed:
+ close(fd);
+ if (map == MAP_FAILED)
+ return NULL;
+ return map;
+}
+
+/*
+ * Release a file previously mapped with grab_file().
+ */
+static void
+release_file(void *file, unsigned long size)
+{
+ munmap(file, size);
+}
+
+/*
+ * Note, it seems odd that we have both a CONVERT_NATIVE and a TO_NATIVE macro
+ * below. We do this because the values passed to TO_NATIVE may themselves be
+ * macros and need both macros here to get expanded. Specifically it's the width
+ * variable we are concerned with, because it needs to get expanded prior to
+ * string concatenation
+ */
+#define CONVERT_NATIVE(fend, width, x) ({ \
+typeof(x) ___x; \
+if ((fend) == ELFDATA2LSB) \
+ ___x = rte_le_to_cpu_##width(x); \
+else \
+ ___x = rte_be_to_cpu_##width(x); \
+ ___x; \
+})
+
+#define TO_NATIVE(fend, width, x) CONVERT_NATIVE(fend, width, x)
+
+static int
+parse_elf(struct elf_info *info, const char *filename)
+{
+ unsigned int i;
+ Elf_Ehdr *hdr;
+ Elf_Shdr *sechdrs;
+ Elf_Sym *sym;
+ int endian;
+ unsigned int symtab_idx = ~0U, symtab_shndx_idx = ~0U;
+
+ hdr = grab_file(filename, &info->size);
+ if (!hdr) {
+ perror(filename);
+ return -ENOENT;
+ }
+ info->hdr = hdr;
+ if (info->size < sizeof(*hdr)) {
+ /* file too small, assume this is an empty .o file */
+ return 0;
+ }
+ /* Is this a valid ELF file? */
+ if ((hdr->e_ident[EI_MAG0] != ELFMAG0) ||
+ (hdr->e_ident[EI_MAG1] != ELFMAG1) ||
+ (hdr->e_ident[EI_MAG2] != ELFMAG2) ||
+ (hdr->e_ident[EI_MAG3] != ELFMAG3)) {
+ /* Not an ELF file - silently ignore it */
+ return 0;
+ }
+
+ if (!hdr->e_ident[EI_DATA]) {
+ /* Unknown endian */
+ return 0;
+ }
+
+ endian = hdr->e_ident[EI_DATA];
+
+ /* Fix endianness in ELF header */
+ hdr->e_type = TO_NATIVE(endian, 16, hdr->e_type);
+ hdr->e_machine = TO_NATIVE(endian, 16, hdr->e_machine);
+ hdr->e_version = TO_NATIVE(endian, 32, hdr->e_version);
+ hdr->e_entry = TO_NATIVE(endian, ADDR_SIZE, hdr->e_entry);
+ hdr->e_phoff = TO_NATIVE(endian, ADDR_SIZE, hdr->e_phoff);
+ hdr->e_shoff = TO_NATIVE(endian, ADDR_SIZE, hdr->e_shoff);
+ hdr->e_flags = TO_NATIVE(endian, 32, hdr->e_flags);
+ hdr->e_ehsize = TO_NATIVE(endian, 16, hdr->e_ehsize);
+ hdr->e_phentsize = TO_NATIVE(endian, 16, hdr->e_phentsize);
+ hdr->e_phnum = TO_NATIVE(endian, 16, hdr->e_phnum);
+ hdr->e_shentsize = TO_NATIVE(endian, 16, hdr->e_shentsize);
+ hdr->e_shnum = TO_NATIVE(endian, 16, hdr->e_shnum);
+ hdr->e_shstrndx = TO_NATIVE(endian, 16, hdr->e_shstrndx);
+
+ sechdrs = RTE_PTR_ADD(hdr, hdr->e_shoff);
+ info->sechdrs = sechdrs;
+
+ /* Check if file offset is correct */
+ if (hdr->e_shoff > info->size) {
+ fprintf(stderr, "section header offset=%lu in file '%s' "
+ "is bigger than filesize=%lu\n",
+ (unsigned long)hdr->e_shoff,
+ filename, info->size);
+ return 0;
+ }
+
+ if (hdr->e_shnum == SHN_UNDEF) {
+ /*
+ * There are more than 64k sections,
+ * read count from .sh_size.
+ */
+ info->num_sections = TO_NATIVE(endian, 32, sechdrs[0].sh_size);
+ } else {
+ info->num_sections = hdr->e_shnum;
+ }
+ if (hdr->e_shstrndx == SHN_XINDEX)
+ info->secindex_strings =
+ TO_NATIVE(endian, 32, sechdrs[0].sh_link);
+ else
+ info->secindex_strings = hdr->e_shstrndx;
+
+ /* Fix endianness in section headers */
+ for (i = 0; i < info->num_sections; i++) {
+ sechdrs[i].sh_name =
+ TO_NATIVE(endian, 32, sechdrs[i].sh_name);
+ sechdrs[i].sh_type =
+ TO_NATIVE(endian, 32, sechdrs[i].sh_type);
+ sechdrs[i].sh_flags =
+ TO_NATIVE(endian, 32, sechdrs[i].sh_flags);
+ sechdrs[i].sh_addr =
+ TO_NATIVE(endian, ADDR_SIZE, sechdrs[i].sh_addr);
+ sechdrs[i].sh_offset =
+ TO_NATIVE(endian, ADDR_SIZE, sechdrs[i].sh_offset);
+ sechdrs[i].sh_size =
+ TO_NATIVE(endian, 32, sechdrs[i].sh_size);
+ sechdrs[i].sh_link =
+ TO_NATIVE(endian, 32, sechdrs[i].sh_link);
+ sechdrs[i].sh_info =
+ TO_NATIVE(endian, 32, sechdrs[i].sh_info);
+ sechdrs[i].sh_addralign =
+ TO_NATIVE(endian, ADDR_SIZE, sechdrs[i].sh_addralign);
+ sechdrs[i].sh_entsize =
+ TO_NATIVE(endian, ADDR_SIZE, sechdrs[i].sh_entsize);
+ }
+ /* Find symbol table. */
+ for (i = 1; i < info->num_sections; i++) {
+ int nobits = sechdrs[i].sh_type == SHT_NOBITS;
+
+ if (!nobits && sechdrs[i].sh_offset > info->size) {
+ fprintf(stderr, "%s is truncated. "
+ "sechdrs[i].sh_offset=%lu > sizeof(*hrd)=%zu\n",
+ filename, (unsigned long)sechdrs[i].sh_offset,
+ sizeof(*hdr));
+ return 0;
+ }
+
+ if (sechdrs[i].sh_type == SHT_SYMTAB) {
+ unsigned int sh_link_idx;
+ symtab_idx = i;
+ info->symtab_start = RTE_PTR_ADD(hdr,
+ sechdrs[i].sh_offset);
+ info->symtab_stop = RTE_PTR_ADD(hdr,
+ sechdrs[i].sh_offset + sechdrs[i].sh_size);
+ sh_link_idx = sechdrs[i].sh_link;
+ info->strtab = RTE_PTR_ADD(hdr,
+ sechdrs[sh_link_idx].sh_offset);
+ }
+
+ /* 32bit section no. table? ("more than 64k sections") */
+ if (sechdrs[i].sh_type == SHT_SYMTAB_SHNDX) {
+ symtab_shndx_idx = i;
+ info->symtab_shndx_start = RTE_PTR_ADD(hdr,
+ sechdrs[i].sh_offset);
+ info->symtab_shndx_stop = RTE_PTR_ADD(hdr,
+ sechdrs[i].sh_offset + sechdrs[i].sh_size);
+ }
+ }
+ if (!info->symtab_start)
+ fprintf(stderr, "%s has no symtab?\n", filename);
+
+ /* Fix endianness in symbols */
+ for (sym = info->symtab_start; sym < info->symtab_stop; sym++) {
+ sym->st_shndx = TO_NATIVE(endian, 16, sym->st_shndx);
+ sym->st_name = TO_NATIVE(endian, 32, sym->st_name);
+ sym->st_value = TO_NATIVE(endian, ADDR_SIZE, sym->st_value);
+ sym->st_size = TO_NATIVE(endian, ADDR_SIZE, sym->st_size);
+ }
+
+ if (symtab_shndx_idx != ~0U) {
+ Elf32_Word *p;
+ if (symtab_idx != sechdrs[symtab_shndx_idx].sh_link)
+ fprintf(stderr,
+ "%s: SYMTAB_SHNDX has bad sh_link: %u!=%u\n",
+ filename, sechdrs[symtab_shndx_idx].sh_link,
+ symtab_idx);
+ /* Fix endianness */
+ for (p = info->symtab_shndx_start; p < info->symtab_shndx_stop; p++)
+ *p = TO_NATIVE(endian, 32, *p);
+ }
+
+ return 1;
+}
+
+static void
+parse_elf_finish(struct elf_info *info)
+{
+ struct pmd_driver *tmp, *idx = info->drivers;
+ release_file(info->hdr, info->size);
+ while (idx) {
+ tmp = idx->next;
+ free(idx);
+ idx = tmp;
+ }
+}
+
+static const char *
+get_sym_name(struct elf_info *elf, Elf_Sym *sym)
+{
+ if (sym)
+ return elf->strtab + sym->st_name;
+ else
+ return "(unknown)";
+}
+
+static void *
+get_sym_value(struct elf_info *info, const Elf_Sym *sym)
+{
+ return RTE_PTR_ADD(info->hdr,
+ info->sechdrs[sym->st_shndx].sh_offset + sym->st_value);
+}
+
+static Elf_Sym *
+find_sym_in_symtab(struct elf_info *info, const char *name, Elf_Sym *last)
+{
+ Elf_Sym *idx;
+ if (last)
+ idx = last+1;
+ else
+ idx = info->symtab_start;
+
+ for (; idx < info->symtab_stop; idx++) {
+ const char *n = get_sym_name(info, idx);
+ if (!strncmp(n, name, strlen(name)))
+ return idx;
+ }
+ return NULL;
+}
+
+struct opt_tag {
+ const char *suffix;
+ const char *json_id;
+};
+
+static const struct opt_tag opt_tags[] = {
+ {"_param_string_export", "params"},
+};
+
+static int
+complete_pmd_entry(struct elf_info *info, struct pmd_driver *drv)
+{
+ const char *tname;
+ int i;
+ char tmpsymname[128];
+ Elf_Sym *tmpsym;
+
+ drv->name = get_sym_value(info, drv->name_sym);
+
+ for (i = 0; i < PMD_OPT_MAX; i++) {
+ memset(tmpsymname, 0, 128);
+ sprintf(tmpsymname, "__%s%s", drv->name, opt_tags[i].suffix);
+ tmpsym = find_sym_in_symtab(info, tmpsymname, NULL);
+ if (!tmpsym)
+ continue;
+ drv->opt_vals[i] = get_sym_value(info, tmpsym);
+ }
+
+ memset(tmpsymname, 0, 128);
+ sprintf(tmpsymname, "__%s_pci_table_export", drv->name);
+
+ tmpsym = find_sym_in_symtab(info, tmpsymname, NULL);
+
+ /*
+ * If this returns NULL, then this is a PMD_VDEV, because
+ * it has no pci table reference
+ */
+ if (!tmpsym) {
+ drv->pci_tbl = NULL;
+ return 0;
+ }
+
+ tname = get_sym_value(info, tmpsym);
+ tmpsym = find_sym_in_symtab(info, tname, NULL);
+ if (!tmpsym) {
+ fprintf(stderr, "No symbol %s\n", tname);
+ return -ENOENT;
+ }
+
+ drv->pci_tbl = (struct rte_pci_id *)get_sym_value(info, tmpsym);
+ if (!drv->pci_tbl) {
+ fprintf(stderr, "Failed to get PCI table %s\n", tname);
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static int
+locate_pmd_entries(struct elf_info *info)
+{
+ Elf_Sym *last = NULL;
+ struct pmd_driver *new;
+
+ info->drivers = NULL;
+
+ do {
+ new = calloc(1, sizeof(struct pmd_driver));
+ new->name_sym = find_sym_in_symtab(info, "rte_pmd_name", last);
+ last = new->name_sym;
+ if (!new->name_sym)
+ free(new);
+ else {
+ if (complete_pmd_entry(info, new)) {
+ fprintf(stderr, "Failed to complete pmd entry\n");
+ free(new);
+ return -ENOENT;
+ } else {
+ new->next = info->drivers;
+ info->drivers = new;
+ }
+ }
+ } while (last);
+
+ return 0;
+}
+
+static void
+output_pmd_info_string(struct elf_info *info, char *outfile)
+{
+ FILE *ofd;
+ struct pmd_driver *drv;
+ struct rte_pci_id *pci_ids;
+ int idx = 0;
+
+ ofd = fopen(outfile, "w+");
+ if (!ofd) {
+ fprintf(stderr, "Unable to open output file\n");
+ return;
+ }
+
+ drv = info->drivers;
+
+ while (drv) {
+ fprintf(ofd, "const char %s_pmd_info[] __attribute__((used)) = "
+ "\"PMD_INFO_STRING= {",
+ drv->name);
+ fprintf(ofd, "\\\"name\\\" : \\\"%s\\\", ", drv->name);
+
+ for (idx = 0; idx < PMD_OPT_MAX; idx++) {
+ if (drv->opt_vals[idx])
+ fprintf(ofd, "\\\"%s\\\" : \\\"%s\\\", ",
+ opt_tags[idx].json_id,
+ drv->opt_vals[idx]);
+ }
+
+ pci_ids = drv->pci_tbl;
+ fprintf(ofd, "\\\"pci_ids\\\" : [");
+
+ while (pci_ids && pci_ids->device_id) {
+ fprintf(ofd, "[%d, %d, %d, %d]",
+ pci_ids->vendor_id, pci_ids->device_id,
+ pci_ids->subsystem_vendor_id,
+ pci_ids->subsystem_device_id);
+ pci_ids++;
+ if (pci_ids->device_id)
+ fprintf(ofd, ",");
+ else
+ fprintf(ofd, " ");
+ }
+ fprintf(ofd, "]}\";");
+ drv = drv->next;
+ }
+
+ fclose(ofd);
+}
+
+int main(int argc, char **argv)
+{
+ struct elf_info info;
+ int rc;
+
+ if (argc < 3) {
+ fprintf(stderr,
+ "usage: dpdk-pmdinfogen <object file> <c output file>\n");
+ exit(127);
+ }
+
+ rc = parse_elf(&info, argv[1]);
+ if (rc < 0)
+ exit(-rc);
+
+ rc = locate_pmd_entries(&info);
+ if (rc < 0)
+ goto error;
+
+ if (info.drivers) {
+ output_pmd_info_string(&info, argv[2]);
+ rc = 0;
+ } else {
+ rc = -1;
+ fprintf(stderr, "No drivers registered\n");
+ }
+
+error:
+ parse_elf_finish(&info);
+ exit(-rc);
+}
diff --git a/buildtools/pmdinfogen/pmdinfogen.h b/buildtools/pmdinfogen/pmdinfogen.h
new file mode 100644
index 0000000..7e57702
--- /dev/null
+++ b/buildtools/pmdinfogen/pmdinfogen.h
@@ -0,0 +1,99 @@
+/* Postprocess pmd object files to export hw support
+ *
+ * Copyright 2016 Neil Horman <nhorman@tuxdriver.com>
+ * Based in part on modpost.c from the linux kernel
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License V2, incorporated herein by reference.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <elf.h>
+#include <rte_config.h>
+#include <rte_pci.h>
+#include <rte_byteorder.h>
+
+/* On BSD-alike OSes elf.h defines these according to host's word size */
+#undef ELF_ST_BIND
+#undef ELF_ST_TYPE
+#undef ELF_R_SYM
+#undef ELF_R_TYPE
+
+/*
+ * Define ELF64_* to ELF_*, the latter being defined in both 32 and 64 bit
+ * flavors in elf.h. This makes our code a bit more generic between arches
+ * and allows us to support 32 bit code in the future should we ever want to
+ */
+#ifdef RTE_ARCH_64
+#define Elf_Ehdr Elf64_Ehdr
+#define Elf_Shdr Elf64_Shdr
+#define Elf_Sym Elf64_Sym
+#define Elf_Addr Elf64_Addr
+#define Elf_Sword Elf64_Sxword
+#define Elf_Section Elf64_Half
+#define ELF_ST_BIND ELF64_ST_BIND
+#define ELF_ST_TYPE ELF64_ST_TYPE
+
+#define Elf_Rel Elf64_Rel
+#define Elf_Rela Elf64_Rela
+#define ELF_R_SYM ELF64_R_SYM
+#define ELF_R_TYPE ELF64_R_TYPE
+#else
+#define Elf_Ehdr Elf32_Ehdr
+#define Elf_Shdr Elf32_Shdr
+#define Elf_Sym Elf32_Sym
+#define Elf_Addr Elf32_Addr
+#define Elf_Sword Elf32_Sxword
+#define Elf_Section Elf32_Half
+#define ELF_ST_BIND ELF32_ST_BIND
+#define ELF_ST_TYPE ELF32_ST_TYPE
+
+#define Elf_Rel Elf32_Rel
+#define Elf_Rela Elf32_Rela
+#define ELF_R_SYM ELF32_R_SYM
+#define ELF_R_TYPE ELF32_R_TYPE
+#endif
+
+
+enum opt_params {
+ PMD_PARAM_STRING = 0,
+ PMD_OPT_MAX
+};
+
+struct pmd_driver {
+ Elf_Sym *name_sym;
+ const char *name;
+ struct rte_pci_id *pci_tbl;
+ struct pmd_driver *next;
+
+ const char *opt_vals[PMD_OPT_MAX];
+};
+
+struct elf_info {
+ unsigned long size;
+ Elf_Ehdr *hdr;
+ Elf_Shdr *sechdrs;
+ Elf_Sym *symtab_start;
+ Elf_Sym *symtab_stop;
+ char *strtab;
+
+ /* support for 32bit section numbers */
+
+ unsigned int num_sections; /* max_secindex + 1 */
+ unsigned int secindex_strings;
+ /* if Nth symbol table entry has .st_shndx = SHN_XINDEX,
+ * take shndx from symtab_shndx_start[N] instead
+ */
+ Elf32_Word *symtab_shndx_start;
+ Elf32_Word *symtab_shndx_stop;
+
+ struct pmd_driver *drivers;
+};
diff --git a/doc/guides/prog_guide/dev_kit_build_system.rst b/doc/guides/prog_guide/dev_kit_build_system.rst
index dedd18a..fa34fe0 100644
--- a/doc/guides/prog_guide/dev_kit_build_system.rst
+++ b/doc/guides/prog_guide/dev_kit_build_system.rst
@@ -70,7 +70,7 @@ Each build directory contains include files, libraries, and applications:
...
~/DEV/DPDK$ ls i686-native-linuxapp-gcc
- app build hostapp include kmod lib Makefile
+ app build buildtools include kmod lib Makefile
~/DEV/DPDK$ ls i686-native-linuxapp-gcc/app/
@@ -307,6 +307,7 @@ Misc
Internally Generated Build Tools
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+``dpdk-pmdinfogen`` scans an object (.o) file for various well known symbol names.
These well known symbol names are defined by various macros and used to export
important information about hardware support and usage for PMD files. For
instance the macro:
@@ -321,6 +322,18 @@ Creates the following symbol:
static char rte_pmd_name0[] __attribute__((used)) = "<name>";
+Which dpdk-pmdinfogen scans for. Using this information, other relevant bits of
+data can be exported from the object file and used to produce a hardware support
+description, which dpdk-pmdinfogen then encodes into a JSON formatted string in
+the following format:
+
+.. code-block:: C
+
+ static char <name_pmd_string>="PMD_INFO_STRING=\"{'name' : '<name>', ...}\"";
+
+These strings can then be searched for by external tools to determine the
+hardware support of a given library or application.
+
.. _Useful_Variables_Provided_by_the_Build_System:
Useful Variables Provided by the Build System
diff --git a/mk/rte.sdkbuild.mk b/mk/rte.sdkbuild.mk
index f1a163a..5edbf50 100644
--- a/mk/rte.sdkbuild.mk
+++ b/mk/rte.sdkbuild.mk
@@ -63,7 +63,7 @@ build: $(ROOTDIRS-y)
.PHONY: clean
clean: $(CLEANDIRS)
@rm -rf $(RTE_OUTPUT)/include $(RTE_OUTPUT)/app \
- $(RTE_OUTPUT)/lib $(RTE_OUTPUT)/kmod
+ $(RTE_OUTPUT)/lib $(RTE_OUTPUT)/kmod $(RTE_OUTPUT)/buildtools
@[ -d $(RTE_OUTPUT)/include ] || mkdir -p $(RTE_OUTPUT)/include
@$(RTE_SDK)/scripts/gen-config-h.sh $(RTE_OUTPUT)/.config \
> $(RTE_OUTPUT)/include/rte_config.h
diff --git a/mk/rte.sdkinstall.mk b/mk/rte.sdkinstall.mk
index abdab0f..2b92157 100644
--- a/mk/rte.sdkinstall.mk
+++ b/mk/rte.sdkinstall.mk
@@ -141,10 +141,13 @@ install-sdk:
$(Q)$(call rte_mkdir, $(DESTDIR)$(sdkdir))
$(Q)cp -a $(RTE_SDK)/mk $(DESTDIR)$(sdkdir)
$(Q)cp -a $(RTE_SDK)/scripts $(DESTDIR)$(sdkdir)
+ $(Q)cp -a $O/buildtools $(DESTDIR)$(sdkdir)
$(Q)$(call rte_mkdir, $(DESTDIR)$(targetdir))
$(Q)cp -a $O/.config $(DESTDIR)$(targetdir)
$(Q)$(call rte_symlink, $(DESTDIR)$(includedir), $(DESTDIR)$(targetdir)/include)
$(Q)$(call rte_symlink, $(DESTDIR)$(libdir), $(DESTDIR)$(targetdir)/lib)
+ $(Q)$(call rte_symlink, $(DESTDIR)$(sdkdir)/buildtools, \
+ $(DESTDIR)$(targetdir)/buildtools)
install-doc:
ifneq ($(wildcard $O/doc),)
--
2.7.0
^ permalink raw reply [relevance 2%]
* [dpdk-dev] [PATCH v9 7/7] tools: query binaries for support information
2016-07-04 1:14 2% ` [dpdk-dev] [PATCH v9 4/7] pmdinfogen: parse driver to generate code to export Thomas Monjalon
@ 2016-07-04 1:14 2% ` Thomas Monjalon
1 sibling, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-07-04 1:14 UTC (permalink / raw)
To: Neil Horman; +Cc: dev, Panu Matilainen
From: Neil Horman <nhorman@tuxdriver.com>
This tool searches for the primer string PMD_INFO_STRING= in any ELF binary,
and, if found, parses the remainder of the string as a JSON encoded string,
outputting the results in either a human readable or raw, script parseable
format.
Note that, in the case of dynamically linked applications, pmdinfo.py will
scan for implicitly linked PMDs by searching the specified binary's
.dynamic section for DT_NEEDED entries that contain the substring
librte_pmd. The DT_RUNPATH, LD_LIBRARY_PATH, /usr/lib and /lib are
searched for these libraries, in that order.
If a file is specified with no path, it is assumed to be a PMD DSO, and the
LD_LIBRARY_PATH, /usr/lib[64]/ and /lib[64] are searched for it.
Currently the tool can output data in 3 formats:
a) raw, suitable for scripting, where the raw JSON strings are dumped out
b) table format (default) where hex pci ids are dumped in a table format
c) pretty, where a user supplied pci.ids file is used to print out vendor
and device strings
There is a dependency on pyelftools.
The script is not yet compatible with Python 3.
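Typical invocations look like the following; the binary path is illustrative
and the output depends on the PMDs actually linked in:

	# dump hw support (pretty names if the default pci.ids is present)
	tools/dpdk-pmdinfo.py build/app/testpmd
	# raw json strings, suitable for scripting
	tools/dpdk-pmdinfo.py -r build/app/testpmd
	# force the hex table output
	tools/dpdk-pmdinfo.py -t build/app/testpmd
	# pretty-print vendor/device names from a specific pci.ids file
	tools/dpdk-pmdinfo.py -d /usr/share/hwdata/pci.ids build/app/testpmd
	# scan the plugin autoload path recorded in the binary
	tools/dpdk-pmdinfo.py -p build/app/testpmd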
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Panu Matilainen <pmatilai@redhat.com>
Acked-by: Remy Horton <remy.horton@intel.com>
---
MAINTAINERS | 1 +
lib/librte_eal/common/eal_common_options.c | 2 +-
mk/rte.sdkinstall.mk | 2 +
tools/dpdk-pmdinfo.py | 628 +++++++++++++++++++++++++++++
4 files changed, 632 insertions(+), 1 deletion(-)
create mode 100755 tools/dpdk-pmdinfo.py
diff --git a/MAINTAINERS b/MAINTAINERS
index 1a8a3b7..1e972f0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -71,6 +71,7 @@ F: scripts/validate-abi.sh
Driver information
M: Neil Horman <nhorman@tuxdriver.com>
F: buildtools/pmdinfogen/
+F: tools/dpdk-pmdinfo.py
Environment Abstraction Layer
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 7e9f7b8..b562c8a 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -115,7 +115,7 @@ TAILQ_HEAD_INITIALIZER(solib_list);
/* Default path of external loadable drivers */
static const char *default_solib_dir = RTE_EAL_PMD_PATH;
-/* Stringified version of default solib path */
+/* Stringified version of default solib path used by dpdk-pmdinfo.py */
static const char dpdk_solib_path[] __attribute__((used)) =
"DPDK_PLUGIN_PATH=" RTE_EAL_PMD_PATH;
diff --git a/mk/rte.sdkinstall.mk b/mk/rte.sdkinstall.mk
index 2b92157..76be308 100644
--- a/mk/rte.sdkinstall.mk
+++ b/mk/rte.sdkinstall.mk
@@ -126,6 +126,8 @@ install-runtime:
$(Q)$(call rte_mkdir, $(DESTDIR)$(sbindir))
$(Q)$(call rte_symlink, $(DESTDIR)$(datadir)/tools/dpdk_nic_bind.py, \
$(DESTDIR)$(sbindir)/dpdk_nic_bind)
+ $(Q)$(call rte_symlink, $(DESTDIR)$(datadir)/tools/dpdk-pmdinfo.py, \
+ $(DESTDIR)$(bindir)/dpdk-pmdinfo)
install-kmod:
ifneq ($(wildcard $O/kmod/*),)
diff --git a/tools/dpdk-pmdinfo.py b/tools/dpdk-pmdinfo.py
new file mode 100755
index 0000000..b8a9be2
--- /dev/null
+++ b/tools/dpdk-pmdinfo.py
@@ -0,0 +1,628 @@
+#!/usr/bin/python
+# -------------------------------------------------------------------------
+#
+# Utility to dump PMD_INFO_STRING support from an object file
+#
+# -------------------------------------------------------------------------
+import os
+import sys
+from optparse import OptionParser
+import string
+import json
+
+# For running from development directory. It should take precedence over the
+# installed pyelftools.
+sys.path.insert(0, '.')
+
+
+from elftools import __version__
+from elftools.common.exceptions import ELFError
+from elftools.common.py3compat import (
+ ifilter, byte2int, bytes2str, itervalues, str2bytes)
+from elftools.elf.elffile import ELFFile
+from elftools.elf.dynamic import DynamicSection, DynamicSegment
+from elftools.elf.enums import ENUM_D_TAG
+from elftools.elf.segments import InterpSegment
+from elftools.elf.sections import SymbolTableSection
+from elftools.elf.gnuversions import (
+ GNUVerSymSection, GNUVerDefSection,
+ GNUVerNeedSection,
+)
+from elftools.elf.relocation import RelocationSection
+from elftools.elf.descriptions import (
+ describe_ei_class, describe_ei_data, describe_ei_version,
+ describe_ei_osabi, describe_e_type, describe_e_machine,
+ describe_e_version_numeric, describe_p_type, describe_p_flags,
+ describe_sh_type, describe_sh_flags,
+ describe_symbol_type, describe_symbol_bind, describe_symbol_visibility,
+ describe_symbol_shndx, describe_reloc_type, describe_dyn_tag,
+ describe_ver_flags,
+)
+from elftools.elf.constants import E_FLAGS
+from elftools.dwarf.dwarfinfo import DWARFInfo
+from elftools.dwarf.descriptions import (
+ describe_reg_name, describe_attr_value, set_global_machine_arch,
+ describe_CFI_instructions, describe_CFI_register_rule,
+ describe_CFI_CFA_rule,
+)
+from elftools.dwarf.constants import (
+ DW_LNS_copy, DW_LNS_set_file, DW_LNE_define_file)
+from elftools.dwarf.callframe import CIE, FDE
+
+raw_output = False
+pcidb = None
+
+# ===========================================
+
+
+class Vendor:
+ """
+ Class for vendors. This is the top level class
+ for the devices belong to a specific vendor.
+ self.devices is the device dictionary
+ subdevices are in each device.
+ """
+
+ def __init__(self, vendorStr):
+ """
+ Class initializes with the raw line from pci.ids
+ Parsing takes place inside __init__
+ """
+ self.ID = vendorStr.split()[0]
+ self.name = vendorStr.replace("%s " % self.ID, "").rstrip()
+ self.devices = {}
+
+ def add_device(self, deviceStr):
+ """
+ Adds a device to self.devices
+ takes the raw line from pci.ids
+ """
+ s = deviceStr.strip()
+ devID = s.split()[0]
+ if devID in self.devices:
+ pass
+ else:
+ self.devices[devID] = Device(deviceStr)
+
+ def report(self):
+ print self.ID, self.name
+ for id, dev in self.devices.items():
+ dev.report()
+
+ def find_device(self, devid):
+ # convert to a hex string and remove 0x
+ devid = hex(devid)[2:]
+ try:
+ return self.devices[devid]
+ except:
+ return Device("%s Unknown Device" % devid)
+
+
+class Device:
+
+ def __init__(self, deviceStr):
+ """
+ Class for each device.
+ Each vendor has its own devices dictionary.
+ """
+ s = deviceStr.strip()
+ self.ID = s.split()[0]
+ self.name = s.replace("%s " % self.ID, "")
+ self.subdevices = {}
+
+ def report(self):
+ print "\t%s\t%s" % (self.ID, self.name)
+ for subID, subdev in self.subdevices.items():
+ subdev.report()
+
+ def add_sub_device(self, subDeviceStr):
+ """
+ Adds a subvendor, subdevice to device.
+ Uses raw line from pci.ids
+ """
+ s = subDeviceStr.strip()
+ spl = s.split()
+ subVendorID = spl[0]
+ subDeviceID = spl[1]
+ subDeviceName = s.split(" ")[-1]
+ devID = "%s:%s" % (subVendorID, subDeviceID)
+ self.subdevices[devID] = SubDevice(
+ subVendorID, subDeviceID, subDeviceName)
+
+ def find_subid(self, subven, subdev):
+ subven = hex(subven)[2:]
+ subdev = hex(subdev)[2:]
+ devid = "%s:%s" % (subven, subdev)
+
+ try:
+ return self.subdevices[devid]
+ except:
+ if (subven == "ffff" and subdev == "ffff"):
+ return SubDevice("ffff", "ffff", "(All Subdevices)")
+ else:
+ return SubDevice(subven, subdev, "(Unknown Subdevice)")
+
+
+class SubDevice:
+ """
+ Class for subdevices.
+ """
+
+ def __init__(self, vendor, device, name):
+ """
+ Class initializes with vendorid, deviceid and name
+ """
+ self.vendorID = vendor
+ self.deviceID = device
+ self.name = name
+
+ def report(self):
+ print "\t\t%s\t%s\t%s" % (self.vendorID, self.deviceID, self.name)
+
+
+class PCIIds:
+ """
+ Top class for all pci.ids entries.
+ All queries will be asked to this class.
+ PCIIds.vendors["0e11"].devices["0046"].\
+ subdevices["0e11:4091"].name = "Smart Array 6i"
+ """
+
+ def __init__(self, filename):
+ """
+ Prepares the directories.
+ Checks local data file.
+ Tries to load from local, if not found, downloads from web
+ """
+ self.version = ""
+ self.date = ""
+ self.vendors = {}
+ self.contents = None
+ self.read_local(filename)
+ self.parse()
+
+ def report_vendors(self):
+ """Reports the vendors
+ """
+ for vid, v in self.vendors.items():
+ print v.ID, v.name
+
+ def report(self, vendor=None):
+ """
+ Reports everything for all vendors or a specific vendor
+ PCIIds.report() reports everything
+ PCIIDs.report("0e11") reports only "Compaq Computer Corporation"
+ """
+ if vendor is not None:
+ self.vendors[vendor].report()
+ else:
+ for vID, v in self.vendors.items():
+ v.report()
+
+ def find_vendor(self, vid):
+ # convert vid to a hex string and remove the 0x
+ vid = hex(vid)[2:]
+
+ try:
+ return self.vendors[vid]
+ except:
+ return Vendor("%s Unknown Vendor" % (vid))
+
+ def find_date(self, content):
+ for l in content:
+ if l.find("Date:") > -1:
+ return l.split()[-2].replace("-", "")
+ return None
+
+ def parse(self):
+ if len(self.contents) < 1:
+ print "data/%s-pci.ids not found" % self.date
+ else:
+ vendorID = ""
+ deviceID = ""
+ for l in self.contents:
+ if l[0] == "#":
+ continue
+ elif len(l.strip()) == 0:
+ continue
+ else:
+ if l.find("\t\t") == 0:
+ self.vendors[vendorID].devices[
+ deviceID].add_sub_device(l)
+ elif l.find("\t") == 0:
+ deviceID = l.strip().split()[0]
+ self.vendors[vendorID].add_device(l)
+ else:
+ vendorID = l.split()[0]
+ self.vendors[vendorID] = Vendor(l)
+
+ def read_local(self, filename):
+ """
+ Reads the local file
+ """
+ self.contents = open(filename).readlines()
+ self.date = self.find_date(self.contents)
+
+ def load_local(self):
+ """
+ Loads database from local. If there is no file,
+ it creates a new one from web
+ """
+ self.date = idsfile[0].split("/")[1].split("-")[0]
+ self.read_local()
+
+
+# =======================================
+
+def search_file(filename, search_path):
+ """ Given a search path, find file with requested name """
+ for path in string.split(search_path, ":"):
+ candidate = os.path.join(path, filename)
+ if os.path.exists(candidate):
+ return os.path.abspath(candidate)
+ return None
+
+
+class ReadElf(object):
+ """ display_* methods are used to emit output into the output stream
+ """
+
+ def __init__(self, file, output):
+ """ file:
+ stream object with the ELF file to read
+
+ output:
+ output stream to write to
+ """
+ self.elffile = ELFFile(file)
+ self.output = output
+
+ # Lazily initialized if a debug dump is requested
+ self._dwarfinfo = None
+
+ self._versioninfo = None
+
+ def _section_from_spec(self, spec):
+ """ Retrieve a section given a "spec" (either number or name).
+ Return None if no such section exists in the file.
+ """
+ try:
+ num = int(spec)
+ if num < self.elffile.num_sections():
+ return self.elffile.get_section(num)
+ else:
+ return None
+ except ValueError:
+ # Not a number. Must be a name then
+ return self.elffile.get_section_by_name(str2bytes(spec))
+
+ def pretty_print_pmdinfo(self, pmdinfo):
+ global pcidb
+
+ for i in pmdinfo["pci_ids"]:
+ vendor = pcidb.find_vendor(i[0])
+ device = vendor.find_device(i[1])
+ subdev = device.find_subid(i[2], i[3])
+ print("%s (%s) : %s (%s) %s" %
+ (vendor.name, vendor.ID, device.name,
+ device.ID, subdev.name))
+
+ def parse_pmd_info_string(self, mystring):
+ global raw_output
+ global pcidb
+
+ optional_pmd_info = [{'id': 'params', 'tag': 'PMD PARAMETERS'}]
+
+ i = mystring.index("=")
+ mystring = mystring[i + 2:]
+ pmdinfo = json.loads(mystring)
+
+ if raw_output:
+ print(pmdinfo)
+ return
+
+ print("PMD NAME: " + pmdinfo["name"])
+ for i in optional_pmd_info:
+ try:
+ print("%s: %s" % (i['tag'], pmdinfo[i['id']]))
+ except KeyError as e:
+ continue
+
+ if (len(pmdinfo["pci_ids"]) != 0):
+ print("PMD HW SUPPORT:")
+ if pcidb is not None:
+ self.pretty_print_pmdinfo(pmdinfo)
+ else:
+ print("VENDOR\t DEVICE\t SUBVENDOR\t SUBDEVICE")
+ for i in pmdinfo["pci_ids"]:
+ print("0x%04x\t 0x%04x\t 0x%04x\t\t 0x%04x" %
+ (i[0], i[1], i[2], i[3]))
+
+ print("")
+
+ def display_pmd_info_strings(self, section_spec):
+ """ Display a strings dump of a section. section_spec is either a
+ section number or a name.
+ """
+ section = self._section_from_spec(section_spec)
+ if section is None:
+ return
+
+ data = section.data()
+ dataptr = 0
+
+ while dataptr < len(data):
+ while (dataptr < len(data) and
+ not (32 <= byte2int(data[dataptr]) <= 127)):
+ dataptr += 1
+
+ if dataptr >= len(data):
+ break
+
+ endptr = dataptr
+ while endptr < len(data) and byte2int(data[endptr]) != 0:
+ endptr += 1
+
+ mystring = bytes2str(data[dataptr:endptr])
+ rc = mystring.find("PMD_INFO_STRING")
+ if (rc != -1):
+ self.parse_pmd_info_string(mystring)
+
+ dataptr = endptr
+
+ def find_librte_eal(self, section):
+ for tag in section.iter_tags():
+ if tag.entry.d_tag == 'DT_NEEDED':
+ if "librte_eal" in tag.needed:
+ return tag.needed
+ return None
+
+ def search_for_autoload_path(self):
+ scanelf = self
+ scanfile = None
+ library = None
+
+ section = self._section_from_spec(".dynamic")
+ try:
+ eallib = self.find_librte_eal(section)
+ if eallib is not None:
+ ldlibpath = os.environ.get('LD_LIBRARY_PATH')
+ if ldlibpath is None:
+ ldlibpath = ""
+ dtr = self.get_dt_runpath(section)
+ library = search_file(eallib,
+ dtr + ":" + ldlibpath +
+ ":/usr/lib64:/lib64:/usr/lib:/lib")
+ if library is None:
+ return (None, None)
+ if raw_output is False:
+ print("Scanning for autoload path in %s" % library)
+ scanfile = open(library, 'rb')
+ scanelf = ReadElf(scanfile, sys.stdout)
+ except AttributeError:
+ # Not a dynamic binary
+ pass
+ except ELFError:
+ scanfile.close()
+ return (None, None)
+
+ section = scanelf._section_from_spec(".rodata")
+ if section is None:
+ if scanfile is not None:
+ scanfile.close()
+ return (None, None)
+
+ data = section.data()
+ dataptr = 0
+
+ while dataptr < len(data):
+ while (dataptr < len(data) and
+ not (32 <= byte2int(data[dataptr]) <= 127)):
+ dataptr += 1
+
+ if dataptr >= len(data):
+ break
+
+ endptr = dataptr
+ while endptr < len(data) and byte2int(data[endptr]) != 0:
+ endptr += 1
+
+ mystring = bytes2str(data[dataptr:endptr])
+ rc = mystring.find("DPDK_PLUGIN_PATH")
+ if (rc != -1):
+ rc = mystring.find("=")
+ return (mystring[rc + 1:], library)
+
+ dataptr = endptr
+ if scanfile is not None:
+ scanfile.close()
+ return (None, None)
+
+ def get_dt_runpath(self, dynsec):
+ for tag in dynsec.iter_tags():
+ if tag.entry.d_tag == 'DT_RUNPATH':
+ return tag.runpath
+ return ""
+
+ def process_dt_needed_entries(self):
+ """ Look to see if there are any DT_NEEDED entries in the binary
+ And process those if there are
+ """
+ global raw_output
+ runpath = ""
+ ldlibpath = os.environ.get('LD_LIBRARY_PATH')
+ if ldlibpath is None:
+ ldlibpath = ""
+
+ dynsec = self._section_from_spec(".dynamic")
+ try:
+ runpath = self.get_dt_runpath(dynsec)
+ except AttributeError:
+ # dynsec is None, just return
+ return
+
+ for tag in dynsec.iter_tags():
+ if tag.entry.d_tag == 'DT_NEEDED':
+ rc = tag.needed.find("librte_pmd")
+ if (rc != -1):
+ library = search_file(tag.needed,
+ runpath + ":" + ldlibpath +
+ ":/usr/lib64:/lib64:/usr/lib:/lib")
+ if library is not None:
+ if raw_output is False:
+ print("Scanning %s for pmd information" % library)
+ with open(library, 'rb') as file:
+ try:
+ libelf = ReadElf(file, sys.stdout)
+ except ELFError as e:
+ print("%s is no an ELF file" % library)
+ continue
+ libelf.process_dt_needed_entries()
+ libelf.display_pmd_info_strings(".rodata")
+
+
+def scan_autoload_path(autoload_path):
+ global raw_output
+
+ if os.path.exists(autoload_path) is False:
+ return
+
+ try:
+ dirs = os.listdir(autoload_path)
+ except OSError as e:
+ # Couldn't read the directory, give up
+ return
+
+ for d in dirs:
+ dpath = os.path.join(autoload_path, d)
+ if os.path.isdir(dpath):
+ scan_autoload_path(dpath)
+ if os.path.isfile(dpath):
+ try:
+ file = open(dpath, 'rb')
+ readelf = ReadElf(file, sys.stdout)
+ except ELFError as e:
+ # this is likely not an elf file, skip it
+ continue
+ except IOError as e:
+ # No permission to read the file, skip it
+ continue
+
+ if raw_output is False:
+ print("Hw Support for library %s" % d)
+ readelf.display_pmd_info_strings(".rodata")
+ file.close()
+
+
+def scan_for_autoload_pmds(dpdk_path):
+ """
+ search the specified application or path for a pmd autoload path
+ then scan said path for pmds and report hw support
+ """
+ global raw_output
+
+ if (os.path.isfile(dpdk_path) is False):
+ if raw_output is False:
+ print("Must specify a file name")
+ return
+
+ file = open(dpdk_path, 'rb')
+ try:
+ readelf = ReadElf(file, sys.stdout)
+ except ELFError as e:
+ if raw_output is False:
+ print("Unable to parse %s" % dpdk_path)
+ return
+
+ (autoload_path, scannedfile) = readelf.search_for_autoload_path()
+ if (autoload_path is None or autoload_path == ""):
+ if (raw_output is False):
+ print("No autoload path configured in %s" % dpdk_path)
+ return
+ if (raw_output is False):
+ if (scannedfile is None):
+ scannedfile = dpdk_path
+ print("Found autoload path %s in %s" % (autoload_path, scannedfile))
+
+ file.close()
+ if (raw_output is False):
+ print("Discovered Autoload HW Support:")
+ scan_autoload_path(autoload_path)
+ return
+
+
+def main(stream=None):
+ global raw_output
+ global pcidb
+
+ optparser = OptionParser(
usage='usage: %prog [-hrtp] [-d <pci id file>] <elf-file>',
+ description="Dump pmd hardware support info",
+ add_help_option=True,
+ prog='pmdinfo.py')
+ optparser.add_option('-r', '--raw',
+ action='store_true', dest='raw_output',
+ help='Dump raw json strings')
+ optparser.add_option("-d", "--pcidb", dest="pcifile",
+ help="specify a pci database "
+ "to get vendor names from",
+ default="/usr/share/hwdata/pci.ids", metavar="FILE")
+ optparser.add_option("-t", "--table", dest="tblout",
+ help="output information on hw support as a hex table",
+ action='store_true')
+ optparser.add_option("-p", "--plugindir", dest="pdir",
+ help="scan dpdk for autoload plugins",
+ action='store_true')
+
+ options, args = optparser.parse_args()
+
+ if options.raw_output:
+ raw_output = True
+
+ if options.pcifile:
+ pcidb = PCIIds(options.pcifile)
+ if pcidb is None:
+ print("Pci DB file not found")
+ exit(1)
+
+ if options.tblout:
+ options.pcifile = None
+ pcidb = None
+
+ if (len(args) == 0):
+ optparser.print_usage()
+ exit(1)
+
+ if options.pdir is True:
+ exit(scan_for_autoload_pmds(args[0]))
+
+ ldlibpath = os.environ.get('LD_LIBRARY_PATH')
+ if (ldlibpath is None):
+ ldlibpath = ""
+
+ if (os.path.exists(args[0]) is True):
+ myelffile = args[0]
+ else:
+ myelffile = search_file(
+ args[0], ldlibpath + ":/usr/lib64:/lib64:/usr/lib:/lib")
+
+ if (myelffile is None):
+ print("File not found")
+ sys.exit(1)
+
+ with open(myelffile, 'rb') as file:
+ try:
+ readelf = ReadElf(file, sys.stdout)
+ readelf.process_dt_needed_entries()
+ readelf.display_pmd_info_strings(".rodata")
+ sys.exit(0)
+
+ except ELFError as ex:
+ sys.stderr.write('ELF error: %s\n' % ex)
+ sys.exit(1)
+
+
+# -------------------------------------------------------------------------
+if __name__ == '__main__':
+ main()
--
2.7.0
^ permalink raw reply [relevance 2%]
* [dpdk-dev] [PATCH 01/18] doc: add template for release notes 16.11
@ 2016-07-05 15:41 6% ` Olivier Matz
0 siblings, 0 replies; 200+ results
From: Olivier Matz @ 2016-07-05 15:41 UTC (permalink / raw)
To: dev
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
doc/guides/rel_notes/release_16_11.rst | 160 +++++++++++++++++++++++++++++++++
1 file changed, 160 insertions(+)
create mode 100644 doc/guides/rel_notes/release_16_11.rst
diff --git a/doc/guides/rel_notes/release_16_11.rst b/doc/guides/rel_notes/release_16_11.rst
new file mode 100644
index 0000000..0106bc9
--- /dev/null
+++ b/doc/guides/rel_notes/release_16_11.rst
@@ -0,0 +1,160 @@
+DPDK Release 16.11
+==================
+
+.. **Read this first.**
+
+ The text below explains how to update the release notes.
+
+ Use proper spelling, capitalization and punctuation in all sections.
+
+ Variable and config names should be quoted as fixed width text: ``LIKE_THIS``.
+
+ Build the docs and view the output file to ensure the changes are correct::
+
+ make doc-guides-html
+
+ firefox build/doc/html/guides/rel_notes/release_16_11.html
+
+
+New Features
+------------
+
+.. This section should contain new features added in this release. Sample format:
+
+ * **Add a title in the past tense with a full stop.**
+
+ Add a short 1-2 sentence description in the past tense. The description
+ should be enough to allow someone scanning the release notes to understand
+ the new feature.
+
+ If the feature adds a lot of sub-features you can use a bullet list like this.
+
+ * Added feature foo to do something.
+ * Enhanced feature bar to do something else.
+
+ Refer to the previous release notes for examples.
+
+
+Resolved Issues
+---------------
+
+.. This section should contain bug fixes added to the relevant sections. Sample format:
+
+ * **code/section Fixed issue in the past tense with a full stop.**
+
+ Add a short 1-2 sentence description of the resolved issue in the past tense.
+ The title should contain the code/lib section like a commit message.
+ Add the entries in alphabetic order in the relevant sections below.
+
+
+EAL
+~~~
+
+
+Drivers
+~~~~~~~
+
+
+Libraries
+~~~~~~~~~
+
+
+Examples
+~~~~~~~~
+
+
+Other
+~~~~~
+
+
+Known Issues
+------------
+
+.. This section should contain new known issues in this release. Sample format:
+
+ * **Add title in present tense with full stop.**
+
+ Add a short 1-2 sentence description of the known issue in the present
+ tense. Add information on any known workarounds.
+
+
+API Changes
+-----------
+
+.. This section should contain API changes. Sample format:
+
+ * Add a short 1-2 sentence description of the API change. Use fixed width
+ quotes for ``rte_function_names`` or ``rte_struct_names``. Use the past tense.
+
+
+ABI Changes
+-----------
+
+.. * Add a short 1-2 sentence description of the ABI change that was announced in
+ the previous releases and made in this release. Use fixed width quotes for
+ ``rte_function_names`` or ``rte_struct_names``. Use the past tense.
+
+
+Shared Library Versions
+-----------------------
+
+.. Update any library version updated in this release and prepend with a ``+`` sign.
+
+The libraries prepended with a plus sign were incremented in this version.
+
+.. code-block:: diff
+
+ libethdev.so.3
+ librte_acl.so.2
+ librte_cfgfile.so.2
+ librte_cmdline.so.2
+ librte_distributor.so.1
+ librte_eal.so.2
+ librte_hash.so.2
+ librte_ip_frag.so.1
+ librte_ivshmem.so.1
+ librte_jobstats.so.1
+ librte_kni.so.2
+ librte_kvargs.so.1
+ librte_lpm.so.2
+ librte_mbuf.so.2
+ librte_mempool.so.2
+ librte_meter.so.1
+ librte_pipeline.so.3
+ librte_pmd_bond.so.1
+ librte_pmd_ring.so.2
+ librte_port.so.2
+ librte_power.so.1
+ librte_reorder.so.1
+ librte_ring.so.1
+ librte_sched.so.1
+ librte_table.so.2
+ librte_timer.so.1
+ librte_vhost.so.2
+
+
+Tested Platforms
+----------------
+
+.. This section should contain a list of platforms that were tested with this
+ release.
+
+ The format is:
+
+ #. Platform name.
+
+ - Platform details.
+ - Platform details.
+
+
+Tested NICs
+-----------
+
+.. This section should contain a list of NICs that were tested with this release.
+
+ The format is:
+
+ #. NIC name.
+
+ - NIC details.
+ - NIC details.
--
2.8.1
^ permalink raw reply [relevance 6%]
* [dpdk-dev] [RFC] Generic flow director/filtering/classification API
@ 2016-07-05 18:16 2% Adrien Mazarguil
2016-07-07 7:14 0% ` Lu, Wenzhuo
` (2 more replies)
0 siblings, 3 replies; 200+ results
From: Adrien Mazarguil @ 2016-07-05 18:16 UTC (permalink / raw)
To: dev
Cc: Thomas Monjalon, Helin Zhang, Jingjing Wu, Rasesh Mody,
Ajit Khaparde, Rahul Lakkireddy, Wenzhuo Lu, Jan Medala,
John Daley, Jing Chen, Konstantin Ananyev, Matej Vido,
Alejandro Lucero, Sony Chacko, Jerin Jacob, Pablo de Lara,
Olga Shern
Hi All,
First, forgive me for this large message, I know our mailboxes already
suffer quite a bit from the amount of traffic on this ML.
This is not exactly yet another thread about how flow director should be
extended, rather about a brand new API to handle filtering and
classification for incoming packets in the most PMD-generic and
application-friendly fashion we can come up with. Reasons described below.
I think this topic is important enough to include both the users of this API
as well as PMD maintainers. So far I have CC'ed librte_ether (especially
rte_eth_ctrl.h contributors), testpmd and PMD maintainers (with and without
a .filter_ctrl implementation), but if you know application maintainers
other than testpmd who use FDIR or might be interested in this discussion,
feel free to add them.
The issues we found with the current approach are already summarized in the
following document, but here is a quick summary for TL;DR folks:
- PMDs do not expose a common set of filter types and even when they do,
their behavior more or less differs.
- Applications need to determine and adapt to device-specific limitations
and quirks on their own, without help from PMDs.
- Writing an application that creates flow rules targeting all devices
supported by DPDK is thus difficult, if not impossible.
- The current API has too many unspecified areas (particularly regarding
side effects of flow rules) that make PMD implementation tricky.
This RFC API handles everything currently supported by .filter_ctrl, the
idea being to reimplement all of these to make them fully usable by
applications in a more generic and well defined fashion. It has a very small
set of mandatory features and an easy method to let applications probe for
supported capabilities.
The only downside is more work for the software control side of PMDs because
they have to adapt to the API instead of the reverse. I think helpers can be
added to EAL to assist with this.
HTML version:
https://rawgit.com/6WIND/rte_flow/master/rte_flow.html
PDF version:
https://rawgit.com/6WIND/rte_flow/master/rte_flow.pdf
Related draft header file (for reference while reading the specification):
https://raw.githubusercontent.com/6WIND/rte_flow/master/rte_flow.h
Git tree for completeness (latest .rst version can be retrieved from here):
https://github.com/6WIND/rte_flow
What follows is the ReST source of the above, for inline comments and
discussion. I intend to update that specification accordingly.
========================
Generic filter interface
========================
.. footer::
v0.6
.. contents::
.. sectnum::
.. raw:: pdf
PageBreak
Overview
========
DPDK provides several competing interfaces added over time to perform packet
matching and related actions such as filtering and classification.
They must be extended to implement the features supported by newer devices
in order to expose them to applications, however the current design has
several drawbacks:
- Complicated filter combinations which have not been hard-coded cannot be
expressed.
- Prone to API/ABI breakage when new features must be added to an existing
filter type, which frequently happens.
From an application point of view:
- Having disparate interfaces, all optional and lacking in features does not
make this API easy to use.
- Seemingly arbitrary built-in limitations of filter types based on the
device they were initially designed for.
- Undefined relationship between different filter types.
- High complexity, considerable undocumented and/or undefined behavior.
Considering the growing number of devices supported by DPDK, adding a new
filter type each time a new feature must be implemented is not sustainable
in the long term. Applications not written to target a specific device
cannot really benefit from such an API.
For these reasons, this document defines an extensible unified API that
encompasses and supersedes these legacy filter types.
.. raw:: pdf
PageBreak
Current API
===========
Rationale
---------
The reason several competing (and mostly overlapping) filtering APIs are
present in DPDK is due to its nature as a thin layer between hardware and
software.
Each subsequent interface has been added to better match the capabilities
and limitations of the latest supported device, which usually happened to
need an incompatible configuration approach. Because of this, many ended up
device-centric and not usable by applications that were not written for that
particular device.
This document is not the first attempt to address this proliferation issue;
in fact a lot of work has already been done both to create a more generic
interface while somewhat keeping compatibility with legacy ones through a
common call interface (``rte_eth_dev_filter_ctrl()`` with the
``.filter_ctrl`` PMD callback in ``rte_ethdev.h``).
Today, these previously incompatible interfaces are known as filter types
(``RTE_ETH_FILTER_*`` from ``enum rte_filter_type`` in ``rte_eth_ctrl.h``).
However while trivial to extend with new types, it only shifted the
underlying problem as applications still need to be written for one kind of
filter type, which, as described in the following sections, is not
necessarily implemented by all PMDs that support filtering.
.. raw:: pdf
PageBreak
Filter types
------------
This section summarizes the capabilities of each filter type.
Although the following list is exhaustive, the description of individual
types may contain inaccuracies due to the lack of documentation or usage
examples.
Note: names are prefixed with ``RTE_ETH_FILTER_``.
``MACVLAN``
~~~~~~~~~~~
Matching:
- L2 source/destination addresses.
- Optional 802.1Q VLAN ID.
- Masking individual fields on a rule basis is not supported.
Action:
- Packets are redirected either to a given VF device using its ID or to the
PF.
``ETHERTYPE``
~~~~~~~~~~~~~
Matching:
- L2 source/destination addresses (optional).
- Ethertype (no VLAN ID?).
- Masking individual fields on a rule basis is not supported.
Action:
- Receive packets on a given queue.
- Drop packets.
``FLEXIBLE``
~~~~~~~~~~~~
Matching:
- At most 128 consecutive bytes anywhere in packets.
- Masking is supported with byte granularity.
- Priorities are supported (relative to this filter type, undefined
otherwise).
Action:
- Receive packets on a given queue.
``SYN``
~~~~~~~
Matching:
- TCP SYN packets only.
- One high priority bit can be set to give the highest possible priority to
this type when other filters with different types are configured.
Action:
- Receive packets on a given queue.
``NTUPLE``
~~~~~~~~~~
Matching:
- Source/destination IPv4 addresses (optional in 2-tuple mode).
- Source/destination TCP/UDP port (mandatory in 2 and 5-tuple modes).
- L4 protocol (2 and 5-tuple modes).
- Masking individual fields is supported.
- TCP flags.
- Up to 7 levels of priority relative to this filter type, undefined
otherwise.
- No IPv6.
Action:
- Receive packets on a given queue.
``TUNNEL``
~~~~~~~~~~
Matching:
- Outer L2 source/destination addresses.
- Inner L2 source/destination addresses.
- Inner VLAN ID.
- IPv4/IPv6 source (destination?) address.
- Tunnel type to match (VXLAN, GENEVE, TEREDO, NVGRE, IP over GRE, 802.1BR
E-Tag).
- Tenant ID for tunneling protocols that have one.
- Any combination of the above can be specified.
- Masking individual fields on a rule basis is not supported.
Action:
- Receive packets on a given queue.
.. raw:: pdf
PageBreak
``FDIR``
~~~~~~~~
Queries:
- Device capabilities and limitations.
- Device statistics about configured filters (resource usage, collisions).
- Device configuration (matching input set and masks).
Matching:
- Device mode of operation: none (to disable filtering), signature
(hash-based dispatching from masked fields) or perfect (either MAC VLAN or
tunnel).
- L2 Ethertype.
- Outer L2 destination address (MAC VLAN mode).
- Inner L2 destination address, tunnel type (NVGRE, VXLAN) and tunnel ID
(tunnel mode).
- IPv4 source/destination addresses, ToS, TTL and protocol fields.
- IPv6 source/destination addresses, TC, protocol and hop limits fields.
- UDP source/destination IPv4/IPv6 and ports.
- TCP source/destination IPv4/IPv6 and ports.
- SCTP source/destination IPv4/IPv6, ports and verification tag field.
- Note, only one protocol type at once (either only L2 Ethertype, basic
IPv6, IPv4+UDP, IPv4+TCP and so on).
- VLAN TCI (extended API).
- At most 16 bytes to match in payload (extended API). A global device
look-up table specifies for each possible protocol layer (unknown, raw,
L2, L3, L4) the offset to use for each byte (they do not need to be
contiguous) and the related bitmask.
- Whether packet is addressed to PF or VF, in that case its ID can be
matched as well (extended API).
- Masking most of the above fields is supported, but simultaneously affects
all filters configured on a device.
- Input set can be modified in a similar fashion for a given device to
ignore individual fields of filters (i.e. do not match the destination
address in a IPv4 filter, refer to **RTE_ETH_INPUT_SET_**
macros). Configuring this also affects RSS processing on **i40e**.
- Filters can also provide 32 bits of arbitrary data to return as part of
matched packets.
Action:
- **RTE_ETH_FDIR_ACCEPT**: receive (accept) packet on a given queue.
- **RTE_ETH_FDIR_REJECT**: drop packet immediately.
- **RTE_ETH_FDIR_PASSTHRU**: similar to accept for the last filter in list,
otherwise process it with subsequent filters.
- For accepted packets and if requested by filter, either 32 bits of
arbitrary data and four bytes of matched payload (only in case of flex
bytes matching), or eight bytes of matched payload (flex also) are added
to meta data.
.. raw:: pdf
PageBreak
``HASH``
~~~~~~~~
Not an actual filter type. Provides and retrieves the global device
configuration (per port or entire NIC) for hash functions and their
properties.
Hash function selection: "default" (keep current), XOR or Toeplitz.
This function can be configured per flow type (**RTE_ETH_FLOW_**
definitions), supported types are:
- Unknown.
- Raw.
- Fragmented or non-fragmented IPv4.
- Non-fragmented IPv4 with L4 (TCP, UDP, SCTP or other).
- Fragmented or non-fragmented IPv6.
- Non-fragmented IPv6 with L4 (TCP, UDP, SCTP or other).
- L2 payload.
- IPv6 with extensions.
- IPv6 with L4 (TCP, UDP) and extensions.
``L2_TUNNEL``
~~~~~~~~~~~~~
Matching:
- All packets received on a given port.
Action:
- Add tunnel encapsulation (VXLAN, GENEVE, TEREDO, NVGRE, IP over GRE,
802.1BR E-Tag) using the provided Ethertype and tunnel ID (only E-Tag
is implemented at the moment).
- VF ID to use for tag insertion (currently unused).
- Destination pool for tag based forwarding (pools are IDs that can be
assigned to ports; duplication occurs if the same ID is shared by several
ports of the same NIC).
.. raw:: pdf
PageBreak
Driver support
--------------
======== ======= ========= ======== === ====== ====== ==== ==== =========
Driver MACVLAN ETHERTYPE FLEXIBLE SYN NTUPLE TUNNEL FDIR HASH L2_TUNNEL
======== ======= ========= ======== === ====== ====== ==== ==== =========
bnx2x
cxgbe
e1000 yes yes yes yes
ena
enic yes
fm10k
i40e yes yes yes yes yes
ixgbe yes yes yes yes yes
mlx4
mlx5 yes
szedata2
======== ======= ========= ======== === ====== ====== ==== ==== =========
Flow director
-------------
Flow director (FDIR) is the name of the most capable filter type, which
covers most features offered by others. As such, it is the most widespread
in PMDs that support filtering (i.e. all of them besides **e1000**).
It is also the only type that allows an arbitrary 32 bits value provided by
applications to be attached to a filter and returned with matching packets
instead of relying on the destination queue to recognize flows.
Unfortunately, even FDIR requires applications to be aware of low-level
capabilities and limitations (most of which come directly from **ixgbe** and
**i40e**):
- Bitmasks are set globally per device (port?), not per filter.
- Configuration state is not expected to be saved by the driver, and
stopping/restarting a port requires the application to perform it again
(API documentation is also unclear about this).
- Monolithic approach with ABI issues as soon as a new kind of flow or
combination needs to be supported.
- Cryptic global statistics/counters.
- Unclear about how priorities are managed; filters seem to be arranged as a
linked list in hardware (possibly related to configuration order).
Packet alteration
-----------------
One interesting feature is that the L2 tunnel filter type implements the
ability to alter incoming packets through a filter (in this case to
encapsulate them), thus the **mlx5** flow encap/decap features are not a
foreign concept.
.. raw:: pdf
PageBreak
Proposed API
============
Terminology
-----------
- **Filtering API**: overall framework affecting the fate of selected
packets, covers everything described in this document.
- **Matching pattern**: properties to look for in received packets, a
combination of any number of items.
- **Pattern item**: part of a pattern that either matches packet data
(protocol header, payload or derived information), or specifies properties
of the pattern itself.
- **Actions**: what needs to be done when a packet matches a pattern.
- **Flow rule**: this is the result of combining a *matching pattern* with
*actions*.
- **Filter rule**: a less generic term than *flow rule*, can otherwise be
used interchangeably.
- **Hit**: a flow rule is said to be *hit* when processing a matching
packet.
Requirements
------------
As described in the previous section, there is a growing need for a common
method to configure filtering and related actions in a hardware independent
fashion.
The filtering API should not disallow any filter combination by design and
must remain as simple as possible to use. It can simply be defined as a
method to perform one or several actions on selected packets.
PMDs are aware of the capabilities of the device they manage and should be
responsible for preventing unsupported or conflicting combinations.
This approach is fundamentally different as it places most of the burden on
the software side of the PMD instead of having device capabilities directly
mapped to API functions, then expecting applications to work around ensuing
compatibility issues.
Requirements for a new API:
- Flexible and extensible without causing API/ABI problems for existing
applications.
- Should be unambiguous and easy to use.
- Support existing filtering features and actions listed in `Filter types`_.
- Support packet alteration.
- In case of overlapping filters, their priority should be well documented.
- Support filter queries (for example to retrieve counters).
.. raw:: pdf
PageBreak
High level design
-----------------
The chosen approach to make filtering as generic as possible is by
expressing matching patterns through lists of items instead of the flat
structures used in DPDK today, enabling combinations that are not predefined
and thus being more versatile.
Flow rules can have several distinct actions (such as counting,
encapsulating, decapsulating before redirecting packets to a particular
queue, etc.), instead of relying on several rules to achieve this and having
applications deal with hardware implementation details regarding their
order.
Support for different priority levels on a rule basis is provided, for
example in order to force a more specific rule come before a more generic
one for packets matched by both, however hardware support for more than a
single priority level cannot be guaranteed. When supported, the number of
available priority levels is usually low, which is why they can also be
implemented in software by PMDs (e.g. to simulate missing priority levels by
reordering rules).
In order to remain as hardware agnostic as possible, by default all rules
are considered to have the same priority, which means that the order between
overlapping rules (when a packet is matched by several filters) is
undefined, packet duplication may even occur as a result.
PMDs may refuse to create overlapping rules at a given priority level when
they can be detected (e.g. if a pattern matches an existing filter).
Thus predictable results for a given priority level can only be achieved
with non-overlapping rules, using perfect matching on all protocol layers.
Support for multiple actions per rule may be implemented internally on top
of non-default hardware priorities, as a result both features may not be
simultaneously available to applications.
Considering that allowed pattern/actions combinations cannot be known in
advance and would result in an impractically large number of capabilities to
expose, a method is provided to validate a given rule from the current
device configuration state without actually adding it (akin to a "dry run"
mode).
This enables applications to check if the rule types they need are supported
at initialization time, before starting their data path. This method can be
used anytime, its only requirement being that the resources needed by a rule
must exist (e.g. a target RX queue must be configured first).
Each defined rule is associated with an opaque handle managed by the PMD,
applications are responsible for keeping it. These can be used for queries
and rules management, such as retrieving counters or other data and
destroying them.
Handles must be destroyed before releasing associated resources such as
queues.
Integration
-----------
To avoid ABI breakage, this new interface will be implemented through the
existing filtering control framework (``rte_eth_dev_filter_ctrl()``) using
**RTE_ETH_FILTER_GENERIC** as a new filter type.
However a public front-end API described in `Rules management`_ will
be added as the preferred method to use it.
Once discussions with the community have converged to a definite API, legacy
filter types should be deprecated and a deadline defined to remove their
support entirely.
PMDs will have to be gradually converted to **RTE_ETH_FILTER_GENERIC** or
drop filtering support entirely. Less maintained PMDs for older hardware may
lose support at this point.
The notion of filter type will then be deprecated and subsequently dropped
to avoid confusion between both frameworks.
Implementation details
======================
Flow rule
---------
A flow rule is the combination of a matching pattern with a list of actions,
and is the basis of this API.
Priorities
~~~~~~~~~~
A priority can be assigned to a matching pattern.
The default priority level is 0 and is also the highest. Support for more
than a single priority level in hardware is not guaranteed.
If a packet is matched by several filters at a given priority level, the
outcome is undefined. It can take any path and can even be duplicated.
Matching pattern
~~~~~~~~~~~~~~~~
A matching pattern comprises any number of items of various types.
Items are arranged in a list to form a matching pattern for packets. They
fall in two categories:
- Protocol matching (ANY, RAW, ETH, IPV4, IPV6, ICMP, UDP, TCP, VXLAN and so
on), usually associated with a specification structure. These must be
stacked in the same order as the protocol layers to match, starting from
L2.
- Affecting how the pattern is processed (END, VOID, INVERT, PF, VF,
SIGNATURE and so on), often without a specification structure. Since they
are meta data that does not match packet contents, these can be specified
anywhere within item lists without affecting the protocol matching items.
Most item specifications can be optionally paired with a mask to narrow the
specific fields or bits to be matched.
- Items are defined with ``struct rte_flow_item``.
- Patterns are defined with ``struct rte_flow_pattern``.
Example of an item specification matching an Ethernet header:
+-----------------------------------------+
| Ethernet |
+==========+=========+====================+
| ``spec`` | ``src`` | ``00:01:02:03:04`` |
| +---------+--------------------+
| | ``dst`` | ``00:2a:66:00:01`` |
+----------+---------+--------------------+
| ``mask`` | ``src`` | ``00:ff:ff:ff:00`` |
| +---------+--------------------+
| | ``dst`` | ``00:00:00:00:ff`` |
+----------+---------+--------------------+
Non-masked bits stand for any value, Ethernet headers with the following
properties are thus matched:
- ``src``: ``??:01:02:03:??``
- ``dst``: ``??:??:??:??:01``
Except for meta types that do not need one, ``spec`` must be a valid pointer
to a structure of the related item type. A ``mask`` of the same type can be
provided to tell which bits in ``spec`` are to be matched.
A mask is normally only needed for ``spec`` fields matching packet data,
ignored otherwise. See individual item types for more information.
A ``NULL`` mask pointer is allowed and is similar to providing a full mask
(all ones) for the ``spec`` fields supported by hardware; the remaining
fields are treated as ignored (all zeroes), so there is no error checking
for unsupported fields.
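As an illustration, the Ethernet example above could be expressed as an item
with the draft header. This is only a sketch: the item members (``type``,
``spec``, ``mask``), the ``RTE_FLOW_ITEM_TYPE_ETH`` constant and the
``rte_flow_item_eth`` layout are assumptions derived from this document (see
the `ETH`_ description below), the five-byte addresses of the table are
padded to six bytes, and header inclusions are omitted::

    /* Assumed layout; field names follow the ETH item description. */
    static const struct rte_flow_item_eth eth_spec = {
        .src = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x00 },
        .dst = { 0x00, 0x2a, 0x66, 0x00, 0x01, 0x00 },
    };
    static const struct rte_flow_item_eth eth_mask = {
        .src = { 0x00, 0xff, 0xff, 0xff, 0x00, 0x00 }, /* ??:01:02:03:??:?? */
        .dst = { 0x00, 0x00, 0x00, 0x00, 0xff, 0x00 }, /* match byte 4 only */
    };
    static const struct rte_flow_item eth_item = {
        .type = RTE_FLOW_ITEM_TYPE_ETH,
        .spec = &eth_spec,
        .mask = &eth_mask,
    };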
Matching pattern items for packet data must be naturally stacked (ordered
from lowest to highest protocol layer), as in the following examples:
+--------------+
| TCPv4 as L4 |
+===+==========+
| 0 | Ethernet |
+---+----------+
| 1 | IPv4 |
+---+----------+
| 2 | TCP |
+---+----------+
+----------------+
| TCPv6 in VXLAN |
+===+============+
| 0 | Ethernet |
+---+------------+
| 1 | IPv4 |
+---+------------+
| 2 | UDP |
+---+------------+
| 3 | VXLAN |
+---+------------+
| 4 | Ethernet |
+---+------------+
| 5 | IPv6 |
+---+------------+
| 6 | TCP |
+---+------------+
+-----------------------------+
| TCPv4 as L4 with meta items |
+===+=========================+
| 0 | VOID |
+---+-------------------------+
| 1 | Ethernet |
+---+-------------------------+
| 2 | VOID |
+---+-------------------------+
| 3 | IPv4 |
+---+-------------------------+
| 4 | TCP |
+---+-------------------------+
| 5 | VOID |
+---+-------------------------+
| 6 | VOID |
+---+-------------------------+
The above example shows how meta items do not affect packet data matching
items, as long as those remain stacked properly. The resulting matching
pattern is identical to "TCPv4 as L4".
+----------------+
| UDPv6 anywhere |
+===+============+
| 0 | IPv6 |
+---+------------+
| 1 | UDP |
+---+------------+
If supported by the PMD, omitting one or several protocol layers at the
bottom of the stack as in the above example (missing an Ethernet
specification) enables hardware to look anywhere in packets.
It is unspecified whether the payload of supported encapsulations
(e.g. VXLAN inner packet) is matched by such a pattern, which may apply to
inner, outer or both packets.
+---------------------+
| Invalid, missing L3 |
+===+=================+
| 0 | Ethernet |
+---+-----------------+
| 1 | UDP |
+---+-----------------+
The above pattern is invalid due to a missing L3 specification between L2
and L4. Omitting protocol layers is only allowed at the bottom and at the
top of the stack.
Meta item types
~~~~~~~~~~~~~~~
These do not match packet data but affect how the pattern is processed, most
of them do not need a specification structure. This particularity allows
them to be specified anywhere without affecting other item types.
``END``
^^^^^^^
End marker for item lists. Prevents further processing of items, thereby
ending the pattern.
- Its numeric value is **0** for convenience.
- PMD support is mandatory.
- Both ``spec`` and ``mask`` are ignored.
+--------------------+
| END |
+==========+=========+
| ``spec`` | ignored |
+----------+---------+
| ``mask`` | ignored |
+----------+---------+
``VOID``
^^^^^^^^
Used as a placeholder for convenience. It is ignored and simply discarded by
PMDs.
- PMD support is mandatory.
- Both ``spec`` and ``mask`` are ignored.
+--------------------+
| VOID |
+==========+=========+
| ``spec`` | ignored |
+----------+---------+
| ``mask`` | ignored |
+----------+---------+
One usage example for this type is generating rules that share a common
prefix quickly without reallocating memory, only by updating item types:
+------------------------+
| TCP, UDP or ICMP as L4 |
+===+====================+
| 0 | Ethernet |
+---+--------------------+
| 1 | IPv4 |
+---+------+------+------+
| 2 | UDP | VOID | VOID |
+---+------+------+------+
| 3 | VOID | TCP | VOID |
+---+------+------+------+
| 4 | VOID | VOID | ICMP |
+---+------+------+------+
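A minimal sketch of this technique follows, assuming each ``struct
rte_flow_item`` exposes a writable ``type`` member and that item type
constants are named ``RTE_FLOW_ITEM_TYPE_*`` (both assumptions; only the
structure name comes from this document)::

    /* items[0] = ETH and items[1] = IPV4 are set up once; slots 2-4 map
     * to the UDP/TCP/ICMP columns of the table above. */
    static void
    select_l4(struct rte_flow_item *items, enum rte_flow_item_type l4)
    {
        items[2].type = l4 == RTE_FLOW_ITEM_TYPE_UDP ?
            l4 : RTE_FLOW_ITEM_TYPE_VOID;
        items[3].type = l4 == RTE_FLOW_ITEM_TYPE_TCP ?
            l4 : RTE_FLOW_ITEM_TYPE_VOID;
        items[4].type = l4 == RTE_FLOW_ITEM_TYPE_ICMP ?
            l4 : RTE_FLOW_ITEM_TYPE_VOID;
    }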
.. raw:: pdf
PageBreak
``INVERT``
^^^^^^^^^^
Inverted matching, i.e. process packets that do not match the pattern.
- Both ``spec`` and ``mask`` are ignored.
+--------------------+
| INVERT |
+==========+=========+
| ``spec`` | ignored |
+----------+---------+
| ``mask`` | ignored |
+----------+---------+
Usage example in order to match non-TCPv4 packets only:
+--------------------+
| Anything but TCPv4 |
+===+================+
| 0 | INVERT |
+---+----------------+
| 1 | Ethernet |
+---+----------------+
| 2 | IPv4 |
+---+----------------+
| 3 | TCP |
+---+----------------+
``PF``
^^^^^^
Matches packets addressed to the physical function of the device.
- Both ``spec`` and ``mask`` are ignored.
+--------------------+
| PF |
+==========+=========+
| ``spec`` | ignored |
+----------+---------+
| ``mask`` | ignored |
+----------+---------+
``VF``
^^^^^^
Matches packets addressed to the given virtual function ID of the device.
- Only ``spec`` needs to be defined, ``mask`` is ignored.
+----------------------------------------+
| VF |
+==========+=========+===================+
| ``spec`` | ``vf`` | destination VF ID |
+----------+---------+-------------------+
| ``mask`` | ignored |
+----------+-----------------------------+
``SIGNATURE``
^^^^^^^^^^^^^
Requests hash-based signature dispatching for this rule.
Considering this is a global setting on devices that support it, all
subsequent filter rules may have to be created with it as well.
- Only ``spec`` needs to be defined, ``mask`` is ignored.
+--------------------+
| SIGNATURE |
+==========+=========+
| ``spec`` | TBD |
+----------+---------+
| ``mask`` | ignored |
+----------+---------+
.. raw:: pdf
PageBreak
Data matching item types
~~~~~~~~~~~~~~~~~~~~~~~~
Most of these are basically protocol header definitions with associated
bitmasks. They must be specified (stacked) from lowest to highest protocol
layer.
The following list is not exhaustive as new protocols will be added in the
future.
``ANY``
^^^^^^^
Matches any protocol in place of the current layer; a single ANY may also
stand for several protocol layers.
This is usually specified as the first pattern item when looking for a
protocol anywhere in a packet.
- A maximum value of **0** requests matching any number of protocol layers
above or equal to the minimum value, a maximum value lower than the
minimum one is otherwise invalid.
- Only ``spec`` needs to be defined, ``mask`` is ignored.
+-----------------------------------------------------------------------+
| ANY |
+==========+=========+==================================================+
| ``spec`` | ``min`` | minimum number of layers covered |
| +---------+--------------------------------------------------+
| | ``max`` | maximum number of layers covered, 0 for infinity |
+----------+---------+--------------------------------------------------+
| ``mask`` | ignored |
+----------+------------------------------------------------------------+
Example for VXLAN TCP payload matching regardless of outer L3 (IPv4 or IPv6)
and L4 (UDP) both matched by the first ANY specification, and inner L3 (IPv4
or IPv6) matched by the second ANY specification:
+----------------------------------+
| TCP in VXLAN with wildcards |
+===+==============================+
| 0 | Ethernet |
+---+-----+----------+---------+---+
| 1 | ANY | ``spec`` | ``min`` | 2 |
| | | +---------+---+
| | | | ``max`` | 2 |
+---+-----+----------+---------+---+
| 2 | VXLAN |
+---+------------------------------+
| 3 | Ethernet |
+---+-----+----------+---------+---+
| 4 | ANY | ``spec`` | ``min`` | 1 |
| | | +---------+---+
| | | | ``max`` | 1 |
+---+-----+----------+---------+---+
| 5 | TCP |
+---+------------------------------+
.. raw:: pdf
PageBreak
``RAW``
^^^^^^^
Matches a string of a given length at a given offset (in bytes), or anywhere
in the payload of the current protocol layer (including L2 header if used as
the first item in the stack).
This does not increment the protocol layer count as it is not a protocol
definition. Subsequent RAW items modulate the first absolute one with
relative offsets.
- Using **-1** as the ``offset`` of the first RAW item makes its absolute
offset not fixed, i.e. the pattern is searched everywhere.
- ``mask`` only affects the pattern.
+--------------------------------------------------------------+
| RAW |
+==========+=============+=====================================+
| ``spec`` | ``offset`` | absolute or relative pattern offset |
| +-------------+-------------------------------------+
| | ``length`` | pattern length |
| +-------------+-------------------------------------+
| | ``pattern`` | byte string of the above length |
+----------+-------------+-------------------------------------+
| ``mask`` | ``offset`` | ignored |
| +-------------+-------------------------------------+
| | ``length`` | ignored |
| +-------------+-------------------------------------+
| | ``pattern`` | bitmask with the same byte length |
+----------+-------------+-------------------------------------+
Example pattern looking for several strings at various offsets of a UDP
payload, using combined RAW items:
+------------------------------------------+
| UDP payload matching |
+===+======================================+
| 0 | Ethernet |
+---+--------------------------------------+
| 1 | IPv4 |
+---+--------------------------------------+
| 2 | UDP |
+---+-----+----------+-------------+-------+
| 3 | RAW | ``spec`` | ``offset`` | -1 |
| | | +-------------+-------+
| | | | ``length`` | 3 |
| | | +-------------+-------+
| | | | ``pattern`` | "foo" |
+---+-----+----------+-------------+-------+
| 4 | RAW | ``spec`` | ``offset`` | 20 |
| | | +-------------+-------+
| | | | ``length`` | 3 |
| | | +-------------+-------+
| | | | ``pattern`` | "bar" |
+---+-----+----------+-------------+-------+
| 5 | RAW | ``spec`` | ``offset`` | -30 |
| | | +-------------+-------+
| | | | ``length`` | 3 |
| | | +-------------+-------+
| | | | ``pattern`` | "baz" |
+---+-----+----------+-------------+-------+
This translates to:
- Locate "foo" in UDP payload, remember its offset.
- Check "bar" at "foo"'s offset plus 20 bytes.
- Check "baz" at "foo"'s offset minus 30 bytes.
.. raw:: pdf
PageBreak
``ETH``
^^^^^^^
Matches an Ethernet header.
- ``dst``: destination MAC.
- ``src``: source MAC.
- ``type``: EtherType.
- ``tags``: number of 802.1Q/ad tags defined.
- ``tag[]``: 802.1Q/ad tag definitions, innermost first. For each one:
- ``tpid``: Tag protocol identifier.
- ``tci``: Tag control information.
``IPV4``
^^^^^^^^
Matches an IPv4 header.
- ``src``: source IP address.
- ``dst``: destination IP address.
- ``tos``: ToS/DSCP field.
- ``ttl``: TTL field.
- ``proto``: protocol number for the next layer.
``IPV6``
^^^^^^^^
Matches an IPv6 header.
- ``src``: source IP address.
- ``dst``: destination IP address.
- ``tc``: traffic class field.
- ``nh``: Next header field (protocol).
- ``hop_limit``: hop limit field (TTL).
``ICMP``
^^^^^^^^
Matches an ICMP header.
- TBD.
``UDP``
^^^^^^^
Matches a UDP header.
- ``sport``: source port.
- ``dport``: destination port.
- ``length``: UDP length.
- ``checksum``: UDP checksum.
.. raw:: pdf
PageBreak
``TCP``
^^^^^^^
Matches a TCP header.
- ``sport``: source port.
- ``dport``: destination port.
- All other TCP fields and bits.
``VXLAN``
^^^^^^^^^
Matches a VXLAN header.
- TBD.
.. raw:: pdf
PageBreak
Actions
~~~~~~~
Each possible action is represented by a type. Some have associated
configuration structures. Several actions combined in a list can be assigned
to a flow rule. That list is not ordered.
At least one action must be defined in a filter rule in order to do
something with matched packets.
- Actions are defined with ``struct rte_flow_action``.
- A list of actions is defined with ``struct rte_flow_actions``.
They fall in three categories:
- Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
processing matched packets by subsequent flow rules, unless overridden
with PASSTHRU.
- Non terminating actions (PASSTHRU, DUP) that leave matched packets up for
additional processing by subsequent flow rules.
- Other non terminating meta actions that do not affect the fate of packets
(END, VOID, ID, COUNT).
When several actions are combined in a flow rule, they should all have
different types (e.g. dropping a packet twice is not possible). The VOID
type is an exception to this rule; for other duplicated types, the defined
behavior is for PMDs to take into account only the last action of a given
type found in the list. PMDs still perform error checking on the entire list.
*Note that PASSTHRU is the only action able to override a terminating rule.*
.. raw:: pdf
PageBreak
Example of an action that redirects packets to queue index 10:
+----------------+
| QUEUE |
+===========+====+
| ``queue`` | 10 |
+-----------+----+
Action list examples; their order is not significant, as applications must
consider all actions to be performed simultaneously:
+----------------+
| Count and drop |
+=======+========+
| COUNT | |
+-------+--------+
| DROP | |
+-------+--------+
+--------------------------+
| Tag, count and redirect |
+=======+===========+======+
| ID | ``id`` | 0x2a |
+-------+-----------+------+
| COUNT | |
+-------+-----------+------+
| QUEUE | ``queue`` | 10 |
+-------+-----------+------+
+-----------------------+
| Redirect to queue 5 |
+=======+===============+
| DROP | |
+-------+-----------+---+
| QUEUE | ``queue`` | 5 |
+-------+-----------+---+
In the above example, considering both actions are performed simultaneously,
the end result is that only QUEUE has any effect.
+-----------------------+
| Redirect to queue 3 |
+=======+===========+===+
| QUEUE | ``queue`` | 5 |
+-------+-----------+---+
| VOID | |
+-------+-----------+---+
| QUEUE | ``queue`` | 3 |
+-------+-----------+---+
As previously described, only the last action of a given type found in the
list is taken into account. The above example also shows that VOID is
ignored.
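For reference, the "Tag, count and redirect" list above might look as
follows in C. This is a sketch: ``struct rte_flow_action`` is named in this
document, but the ``type``/``conf`` members, the ``RTE_FLOW_ACTION_TYPE_*``
constants and the per-action configuration structures are assumed names::

    static const struct rte_flow_action_id id = { .id = 0x2a };
    static const struct rte_flow_action_queue queue = { .queue = 10 };
    static const struct rte_flow_action list[] = {
        { .type = RTE_FLOW_ACTION_TYPE_ID, .conf = &id },
        { .type = RTE_FLOW_ACTION_TYPE_COUNT }, /* no configuration */
        { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
        { .type = RTE_FLOW_ACTION_TYPE_END }, /* terminates the list */
    };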
.. raw:: pdf
PageBreak
Action types
~~~~~~~~~~~~
Common action types are described in this section. Like pattern item types,
this list is not exhaustive as new actions will be added in the future.
``END`` (action)
^^^^^^^^^^^^^^^^
End marker for action lists. Prevents further processing of actions, thereby
ending the list.
- Its numeric value is **0** for convenience.
- PMD support is mandatory.
- No configurable property.
+---------------+
| END |
+===============+
| no properties |
+---------------+
``VOID`` (action)
^^^^^^^^^^^^^^^^^
Used as a placeholder for convenience. It is ignored and simply discarded by
PMDs.
- PMD support is mandatory.
- No configurable property.
+---------------+
| VOID |
+===============+
| no properties |
+---------------+
``PASSTHRU``
^^^^^^^^^^^^
Leaves packets up for additional processing by subsequent flow rules. This
is the default when a rule does not contain a terminating action, but can be
specified to force a rule to become non-terminating.
- No configurable property.
+---------------+
| PASSTHRU |
+===============+
| no properties |
+---------------+
Example to copy a packet to a queue and continue processing by subsequent
flow rules:
+--------------------------+
| Copy to queue 8 |
+==========+===============+
| PASSTHRU | |
+----------+-----------+---+
| QUEUE | ``queue`` | 8 |
+----------+-----------+---+
``ID``
^^^^^^
Attaches a 32 bit value to packets.
+----------------------------------------------+
| ID |
+========+=====================================+
| ``id`` | 32 bit value to return with packets |
+--------+-------------------------------------+
.. raw:: pdf
PageBreak
``QUEUE``
^^^^^^^^^
Assigns packets to a given queue index.
- Terminating by default.
+--------------------------------+
| QUEUE |
+===========+====================+
| ``queue`` | queue index to use |
+-----------+--------------------+
``DROP``
^^^^^^^^
Drop packets.
- No configurable property.
- Terminating by default.
- PASSTHRU overrides this action if both are specified.
+---------------+
| DROP |
+===============+
| no properties |
+---------------+
``COUNT``
^^^^^^^^^
Enables hits counter for this rule.
This counter can be retrieved and reset through ``rte_flow_query()``, see
``struct rte_flow_query_count``.
- Counters can be retrieved with ``rte_flow_query()``.
- No configurable property.
+---------------+
| COUNT |
+===============+
| no properties |
+---------------+
Query structure to retrieve and reset the flow rule hits counter:
+------------------------------------------------+
| COUNT query |
+===========+=====+==============================+
| ``reset`` | in | reset counter after query |
+-----------+-----+------------------------------+
| ``hits`` | out | number of hits for this flow |
+-----------+-----+------------------------------+
``DUP``
^^^^^^^
Duplicates packets to a given queue index.
This is normally combined with QUEUE, however when used alone, it is
actually similar to QUEUE + PASSTHRU.
- Non-terminating by default.
+------------------------------------------------+
| DUP |
+===========+====================================+
| ``queue`` | queue index to duplicate packet to |
+-----------+------------------------------------+
.. raw:: pdf
PageBreak
``RSS``
^^^^^^^
Similar to QUEUE, except RSS is additionally performed on packets to spread
them among several queues according to the provided parameters.
- Terminating by default.
+---------------------------------------------+
| RSS |
+==============+==============================+
| ``rss_conf`` | RSS parameters |
+--------------+------------------------------+
| ``queues`` | number of entries in queue[] |
+--------------+------------------------------+
| ``queue[]`` | queue indices to use |
+--------------+------------------------------+
``PF`` (action)
^^^^^^^^^^^^^^^
Redirects packets to the physical function (PF) of the current device.
- No configurable property.
- Terminating by default.
+---------------+
| PF |
+===============+
| no properties |
+---------------+
``VF`` (action)
^^^^^^^^^^^^^^^
Redirects packets to the virtual function (VF) of the current device with
the specified ID.
- Terminating by default.
+---------------------------------------+
| VF |
+========+==============================+
| ``id`` | VF ID to redirect packets to |
+--------+------------------------------+
Planned types
~~~~~~~~~~~~~
Other action types are planned but not defined yet. These actions will add
the ability to alter matching packets in several ways, such as performing
encapsulation/decapsulation of tunnel headers on specific flows.
.. raw:: pdf
PageBreak
Rules management
----------------
A simple API with only four functions is provided to fully manage flows.
Each created flow rule is associated with an opaque, PMD-specific handle
pointer. The application is responsible for keeping it until the rule is
destroyed.
Flow rules are defined with ``struct rte_flow``.
Validation
~~~~~~~~~~
Given that expressing a definite set of device capabilities with this API is
not practical, a dedicated function is provided to check if a flow rule is
supported and can be created.
::
int
rte_flow_validate(uint8_t port_id,
const struct rte_flow_pattern *pattern,
const struct rte_flow_actions *actions);
While this function has no effect on the target device, the flow rule is
validated against its current configuration state and the returned value
should be considered valid by the caller for that state only.
The returned value is guaranteed to remain valid only as long as no
successful calls to ``rte_flow_create()`` or ``rte_flow_destroy()`` are made
in the meantime and no device parameter affecting flow rules in any way is
modified, due to possible collisions or resource limitations (although in
such cases ``EINVAL`` should not be returned).
Arguments:
- ``port_id``: port identifier of Ethernet device.
- ``pattern``: pattern specification to check.
- ``actions``: actions associated with the flow definition.
Return value:
- **0** if flow rule is valid and can be created. A negative errno value
otherwise (``rte_errno`` is also set), the following errors are defined.
- ``-EINVAL``: unknown or invalid rule specification.
- ``-ENOTSUP``: valid but unsupported rule specification (e.g. partial masks
are unsupported).
- ``-EEXIST``: collision with an existing rule.
- ``-ENOMEM``: not enough resources.
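Typical initialization-time usage, as a sketch; only the
``rte_flow_validate()`` prototype above is taken from this document, while
``build_pattern()`` and ``build_actions()`` stand for hypothetical
application helpers and standard headers are omitted::

    static int
    probe_flow_support(uint8_t port_id)
    {
        /* Hypothetical helpers assembling the structures described in
         * previous sections. */
        const struct rte_flow_pattern *pattern = build_pattern();
        const struct rte_flow_actions *actions = build_actions();
        int ret = rte_flow_validate(port_id, pattern, actions);

        if (ret == -ENOTSUP)
            ; /* valid rule, but unsupported by this device */
        return ret; /* 0 means the rule can be created as-is */
    }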
.. raw:: pdf
PageBreak
Creation
~~~~~~~~
Creating a flow rule is similar to validating one, except the rule is
actually created.
::
struct rte_flow *
rte_flow_create(uint8_t port_id,
const struct rte_flow_pattern *pattern,
const struct rte_flow_actions *actions);
Arguments:
- ``port_id``: port identifier of Ethernet device.
- ``pattern``: pattern specification to add.
- ``actions``: actions associated with the flow definition.
Return value:
A valid flow pointer in case of success, NULL otherwise and ``rte_errno`` is
set to the positive version of one of the error codes defined for
``rte_flow_validate()``.
Destruction
~~~~~~~~~~~
Flow rule destruction is not automatic, and a queue should not be released
if any are still attached to it. Applications must take care of performing
this step before releasing resources.
::
int
rte_flow_destroy(uint8_t port_id,
struct rte_flow *flow);
Failure to destroy a flow rule may occur when other flow rules depend on it,
and destroying it would result in an inconsistent state.
This function is only guaranteed to succeed if flow rules are destroyed in
reverse order of their creation.
Arguments:
- ``port_id``: port identifier of Ethernet device.
- ``flow``: flow rule to destroy.
Return value:
- **0** on success, a negative errno value otherwise and ``rte_errno`` is
set.
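A teardown sketch honoring the reverse-order guarantee described above; the
bookkeeping (``flows[]`` array and ``nb_flows`` counter) is a hypothetical
application-side construct::

    /* Destroy rules in reverse order of creation before releasing the
     * queues they reference. */
    while (nb_flows > 0) {
        if (rte_flow_destroy(port_id, flows[--nb_flows]) != 0)
            fprintf(stderr, "failed to destroy flow rule %u\n", nb_flows);
    }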
.. raw:: pdf
PageBreak
Query
~~~~~
Query an existing flow rule.
This function allows retrieving flow-specific data such as counters. Data
is gathered by special actions which must be present in the flow rule
definition.
::
int
rte_flow_query(uint8_t port_id,
struct rte_flow *flow,
enum rte_flow_action_type action,
void *data);
Arguments:
- ``port_id``: port identifier of Ethernet device.
- ``flow``: flow rule to query.
- ``action``: action type to query.
- ``data``: pointer to storage for the associated query data type.
Return value:
- **0** on success, a negative errno value otherwise and ``rte_errno`` is
set.
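Sketch of a counter query for a rule that includes a `COUNT`_ action. The
``reset`` and ``hits`` fields follow the query structure table above; the
``RTE_FLOW_ACTION_TYPE_COUNT`` constant is an assumed name::

    struct rte_flow_query_count count = { .reset = 1 }; /* clear on read */

    if (rte_flow_query(port_id, flow, RTE_FLOW_ACTION_TYPE_COUNT,
                       &count) == 0)
        printf("flow rule hits: %" PRIu64 "\n", (uint64_t)count.hits);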
.. raw:: pdf
PageBreak
Behavior
--------
- API operations are synchronous and blocking (``EAGAIN`` cannot be
returned).
- There is no provision for reentrancy/multi-thread safety, although nothing
should prevent different devices from being configured at the same
time. PMDs may protect their control path functions accordingly.
- Stopping the data path (TX/RX) should not be necessary when managing flow
rules. If this cannot be achieved naturally or with workarounds (such as
temporarily replacing the burst function pointers), an appropriate error
code must be returned (``EBUSY``).
- PMDs, not applications, are responsible for maintaining flow rules
configuration when stopping and restarting a port or performing other
actions which may affect them. They can only be destroyed explicitly.
.. raw:: pdf
PageBreak
Compatibility
-------------
No known hardware implementation supports all the features described in this
document.
Unsupported features or combinations are not expected to be fully emulated
in software by PMDs for performance reasons. Partially supported features
may be completed in software as long as hardware performs most of the work
(such as queue redirection and packet recognition).
However PMDs are expected to do their best to satisfy application requests
by working around hardware limitations as long as doing so does not affect
the behavior of existing flow rules.
The following sections provide a few examples of such cases, they are based
on limitations built into the previous APIs.
Global bitmasks
~~~~~~~~~~~~~~~
Each flow rule comes with its own, per-layer bitmasks, while hardware may
support only a single, device-wide bitmask for a given layer type, so that
two IPv4 rules cannot use different bitmasks.
The expected behavior in this case is that PMDs automatically configure
global bitmasks according to the needs of the first created flow rule.
Subsequent rules are allowed only if their bitmasks match those, the
``EEXIST`` error code should be returned otherwise.
Unsupported layer types
~~~~~~~~~~~~~~~~~~~~~~~
Many protocols can be simulated by crafting patterns with the `RAW`_ type.
PMDs can rely on this capability to simulate support for protocols with
fixed headers not directly recognized by hardware.
``ANY`` pattern item
~~~~~~~~~~~~~~~~~~~~
This pattern item stands for anything, which can be difficult to translate
to something hardware would understand, particularly if followed by more
specific types.
Consider the following pattern:
+---+--------------------------------+
| 0 | ETHER |
+---+--------------------------------+
| 1 | ANY (``min`` = 1, ``max`` = 1) |
+---+--------------------------------+
| 2 | TCP |
+---+--------------------------------+
Knowing that TCP does not make sense with something other than IPv4 or IPv6
as L3, such a pattern may be translated to two flow rules instead:
+---+--------------------+
| 0 | ETHER |
+---+--------------------+
| 1 | IPV4 (zeroed mask) |
+---+--------------------+
| 2 | TCP |
+---+--------------------+
+---+--------------------+
| 0 | ETHER |
+---+--------------------+
| 1 | IPV6 (zeroed mask) |
+---+--------------------+
| 2 | TCP |
+---+--------------------+
Note that as soon as an ANY rule covers several layers, this approach may
yield a large number of hidden flow rules. It is thus suggested to only
support the most common scenarios (anything as L2 and/or L3).
.. raw:: pdf
PageBreak
Unsupported actions
~~~~~~~~~~~~~~~~~~~
- When combined with a `QUEUE`_ action, packet counting (`COUNT`_) and
tagging (`ID`_) may be implemented in software as long as the target queue
is used by a single rule.
- A rule specifying both `DUP`_ + `QUEUE`_ may be translated to two hidden
rules combining `QUEUE`_ and `PASSTHRU`_.
- When a single target queue is provided, `RSS`_ can also be implemented
through `QUEUE`_.
Flow rules priority
~~~~~~~~~~~~~~~~~~~
While it would naturally make sense, flow rules cannot be assumed to be
processed by hardware in the same order as their creation for several
reasons:
- They may be managed internally as a tree or a hash table instead of a
list.
- Removing a flow rule before adding another one can either put the new rule
at the end of the list or reuse a freed entry.
- Duplication may occur when packets are matched by several rules.
For overlapping rules (particularly in order to use the `PASSTHRU`_ action)
predictable behavior is only guaranteed by using different priority levels.
Priority levels are not necessarily implemented in hardware, or may be
severely limited (e.g. a single priority bit).
For these reasons, priority levels may be implemented purely in software by
PMDs.
- For devices expecting flow rules to be added in the correct order, PMDs
may destroy and re-create existing rules after adding a new one with
a higher priority.
- A configurable number of dummy or empty rules can be created at
initialization time to save high priority slots for later.
- In order to save priority levels, PMDs may evaluate whether rules are
likely to collide and adjust their priority accordingly.
.. raw:: pdf
PageBreak
API migration
=============
Exhaustive list of deprecated filter types and how to convert them to
generic flow rules.
``MACVLAN`` to ``ETH`` → ``VF``, ``PF``
---------------------------------------
`MACVLAN`_ can be translated to a basic `ETH`_ flow rule with a `VF
(action)`_ or `PF (action)`_ terminating action.
+------------------------------------+
| MACVLAN |
+--------------------------+---------+
| Pattern | Actions |
+===+=====+==========+=====+=========+
| 0 | ETH | ``spec`` | any | VF, |
| | +----------+-----+ PF |
| | | ``mask`` | any | |
+---+-----+----------+-----+---------+
``ETHERTYPE`` to ``ETH`` → ``QUEUE``, ``DROP``
----------------------------------------------
`ETHERTYPE`_ is basically an `ETH`_ flow rule with `QUEUE`_ or `DROP`_ as
a terminating action.
+------------------------------------+
| ETHERTYPE |
+--------------------------+---------+
| Pattern | Actions |
+===+=====+==========+=====+=========+
| 0 | ETH | ``spec`` | any | QUEUE, |
| | +----------+-----+ DROP |
| | | ``mask`` | any | |
+---+-----+----------+-----+---------+
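For example, matching the ARP Ethertype (0x0806) and redirecting it to RX
queue 1 could be sketched as follows (structure and enumeration names are
assumptions taken from the draft header, not the final API)::

    /* Hypothetical sketch of an ETHERTYPE-equivalent flow rule. */
    struct rte_flow_item_eth eth_spec = {
        .type = rte_cpu_to_be_16(0x0806),
    };
    struct rte_flow_item_eth eth_mask = {
        .type = rte_cpu_to_be_16(0xffff), /* match the Ethertype only */
    };
    struct rte_flow_action_queue queue = { .index = 1 };
    struct rte_flow_item pattern[] = {
        { .type = RTE_FLOW_ITEM_TYPE_ETH,
          .spec = &eth_spec, .mask = &eth_mask },
        { .type = RTE_FLOW_ITEM_TYPE_END },
    };
    struct rte_flow_action actions[] = {
        { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
        { .type = RTE_FLOW_ACTION_TYPE_END },
    };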
``FLEXIBLE`` to ``RAW`` → ``QUEUE``
-----------------------------------
`FLEXIBLE`_ can be translated to one `RAW`_ pattern with `QUEUE`_ as the
terminating action and a defined priority level.
+------------------------------------+
| FLEXIBLE |
+--------------------------+---------+
| Pattern | Actions |
+===+=====+==========+=====+=========+
| 0 | RAW | ``spec`` | any | QUEUE |
| | +----------+-----+ |
| | | ``mask`` | any | |
+---+-----+----------+-----+---------+
``SYN`` to ``TCP`` → ``QUEUE``
------------------------------
`SYN`_ is a `TCP`_ rule with only the ``syn`` bit enabled and masked, and
`QUEUE`_ as the terminating action.
Priority level can be set to simulate the high priority bit.
+---------------------------------------------+
| SYN |
+-----------------------------------+---------+
| Pattern | Actions |
+===+======+==========+=============+=========+
| 0 | ETH | ``spec`` | N/A | QUEUE |
| | +----------+-------------+ |
| | | ``mask`` | empty | |
+---+------+----------+-------------+ |
| 1 | IPV4 | ``spec`` | N/A | |
| | +----------+-------------+ |
| | | ``mask`` | empty | |
+---+------+----------+-------------+ |
| 2 | TCP | ``spec`` | ``syn`` = 1 | |
| | +----------+-------------+ |
| | | ``mask`` | ``syn`` = 1 | |
+---+------+----------+-------------+---------+
``NTUPLE`` to ``IPV4``, ``TCP``, ``UDP`` → ``QUEUE``
----------------------------------------------------
`NTUPLE`_ is similar to specifying an empty L2, `IPV4`_ as L3 with `TCP`_ or
`UDP`_ as L4 and `QUEUE`_ as the terminating action.
A priority level can be specified as well.
+---------------------------------------+
| NTUPLE |
+-----------------------------+---------+
| Pattern | Actions |
+===+======+==========+=======+=========+
| 0 | ETH | ``spec`` | N/A | QUEUE |
| | +----------+-------+ |
| | | ``mask`` | empty | |
+---+------+----------+-------+ |
| 1 | IPV4 | ``spec`` | any | |
| | +----------+-------+ |
| | | ``mask`` | any | |
+---+------+----------+-------+ |
| 2 | TCP, | ``spec`` | any | |
| | UDP +----------+-------+ |
| | | ``mask`` | any | |
+---+------+----------+-------+---------+
``TUNNEL`` to ``ETH``, ``IPV4``, ``IPV6``, ``VXLAN`` (or other) → ``QUEUE``
---------------------------------------------------------------------------
`TUNNEL`_ matches common IPv4 and IPv6 L3/L4-based tunnel types.
In the following table, `ANY`_ is used to cover the optional L4.
+------------------------------------------------+
| TUNNEL |
+--------------------------------------+---------+
| Pattern | Actions |
+===+=========+==========+=============+=========+
| 0 | ETH | ``spec`` | any | QUEUE |
| | +----------+-------------+ |
| | | ``mask`` | any | |
+---+---------+----------+-------------+ |
| 1 | IPV4, | ``spec`` | any | |
| | IPV6 +----------+-------------+ |
| | | ``mask`` | any | |
+---+---------+----------+-------------+ |
| 2 | ANY | ``spec`` | ``min`` = 0 | |
| | | +-------------+ |
| | | | ``max`` = 0 | |
| | +----------+-------------+ |
| | | ``mask`` | N/A | |
+---+---------+----------+-------------+ |
| 3 | VXLAN, | ``spec`` | any | |
| | GENEVE, +----------+-------------+ |
| | TEREDO, | ``mask`` | any | |
| | NVGRE, | | | |
| | GRE, | | | |
| | ... | | | |
+---+---------+----------+-------------+---------+
.. raw:: pdf
PageBreak
``FDIR`` to most item types → ``QUEUE``, ``DROP``, ``PASSTHRU``
---------------------------------------------------------------
`FDIR`_ is more complex than any other type; there are several methods to
emulate its functionality. It is summarized for the most part in the table
below.
A few features are intentionally not supported:
- The ability to configure the matching input set and masks for the entire
device; PMDs should take care of this automatically according to flow rules.
- Returning four or eight bytes of matched data when using flex bytes
filtering. Although a specific action could implement it, it conflicts
with the much more useful 32-bit tagging on devices that support it.
- Side effects on RSS processing of the entire device. Flow rules that
conflict with the current device configuration should not be
allowed. Similarly, device configuration should not be allowed when it
affects existing flow rules.
- Device modes of operation. "none" is unsupported since filtering cannot be
disabled as long as a flow rule is present.
- "MAC VLAN" or "tunnel" perfect matching modes should be automatically set
according to the created flow rules.
+----------------------------------------------+
| FDIR |
+---------------------------------+------------+
| Pattern | Actions |
+===+============+==========+=====+============+
| 0 | ETH, | ``spec`` | any | QUEUE, |
| | RAW +----------+-----+ DROP, |
| | | ``mask`` | any | PASSTHRU |
+---+------------+----------+-----+------------+
| 1 | IPV4, | ``spec`` | any | ID |
| | IPV6 +----------+-----+ (optional) |
| | | ``mask`` | any | |
+---+------------+----------+-----+ |
| 2 | TCP, | ``spec`` | any | |
| | UDP, +----------+-----+ |
| | SCTP | ``mask`` | any | |
+---+------------+----------+-----+ |
| 3 | VF, | ``spec`` | any | |
| | PF, +----------+-----+ |
| | SIGNATURE | ``mask`` | any | |
| | (optional) | | | |
+---+------------+----------+-----+------------+
``HASH``
~~~~~~~~
Hashing configuration is set per rule through the `SIGNATURE`_ item.
Since it is usually a global device setting, all flow rules created with
this item may have to share the same specification.
``L2_TUNNEL`` to ``VOID`` → ``VXLAN`` (or others)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
All packets are matched. This type alters incoming packets to encapsulate
them in a chosen tunnel type, optionally redirecting them to a VF as well.
The destination pool for tag based forwarding can be emulated with other
flow rules using `DUP`_ as the action.
+----------------------------------------+
| L2_TUNNEL |
+---------------------------+------------+
| Pattern | Actions |
+===+======+==========+=====+============+
| 0 | VOID | ``spec`` | N/A | VXLAN, |
| | | | | GENEVE, |
| | | | | ... |
| | +----------+-----+------------+
| | | ``mask`` | N/A | VF |
| | | | | (optional) |
+---+------+----------+-----+------------+
--
Adrien Mazarguil
6WIND
^ permalink raw reply [relevance 2%]
* Re: [dpdk-dev] [PATCH v2] mk: filter duplicate configuration entries
@ 2016-07-06 5:37 3% ` Christian Ehrhardt
0 siblings, 0 replies; 200+ results
From: Christian Ehrhardt @ 2016-07-06 5:37 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: Ferruh Yigit, dev
Hi,
I came up with something very similar when looking for tac replacements
yesterday, but had no time to finish things.
But your suggestion is even shorter - I had found "sed -n '1{h;T;};G;h;$p;'
file" or "sed -n '1!G;h;$p'".
That removes the tac dependency, which I agree is a good thing.
To chain things up without a temp file one would need the "in-place"
features of sed and awk, which I'm not sure are available everywhere
(awk >= 4.1, and only GNU awk).
sed -i is only used in validate-abi.sh, which might not be used on all
platforms, so it does not count as "-i is there already so I can use it".
And I really don't want to break anyone due to that change; it is just
meant to naively clean up the resulting config a bit.
Also, we already have a temp file .config_tmp in the same scope and remove
it on our own, so it is not that much different to create and remove a
second one for that section.
Thanks for both of your feedback, submitting v3 now ...
Christian Ehrhardt
Software Engineer, Ubuntu Server
Canonical Ltd
On Tue, Jul 5, 2016 at 9:47 PM, Thomas Monjalon <thomas.monjalon@6wind.com>
wrote:
> 2016-07-05 17:47, Ferruh Yigit:
> > On 6/30/2016 1:00 PM, Christian Ehrhardt wrote:
> > > + tac $(RTE_OUTPUT)/.config_tmp >
> $(RTE_OUTPUT)/.config_tmp_reverse ; \
> > Now we are adding a new binary dependency (tac) to the build system
>
> tac can be replaced by sed '1!G;h;$!d'
>
>
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v4] Pci: Add the class_id support
@ 2016-07-06 11:08 3% ` Ferruh Yigit
2016-07-07 7:46 0% ` Thomas Monjalon
0 siblings, 1 reply; 200+ results
From: Ferruh Yigit @ 2016-07-06 11:08 UTC (permalink / raw)
To: Thomas Monjalon, Ziye Yang; +Cc: dev
On 6/14/2016 3:52 PM, Thomas Monjalon wrote:
> 2016-05-24 20:50, Ziye Yang:
>> This patch is used to add the class_id (class_code,
>> subclass_code, programming_interface) support for
>> pci_device probe. With this patch, it will be
>> flexible for users to probe a class of devices
>> by class_id.
>>
>>
>> Signed-off-by: Ziye Yang <ziye.yang@intel.com>
>
> Applied, thanks
>
Hi Thomas, Ziye,
Is the modification in the public "struct rte_pci_id" an ABI break?
If so, it requires an EAL LIBABIVER increase and a release notes update.
Regards,
ferruh
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v3 1/3] kasumi: add new KASUMI PMD
@ 2016-07-06 11:26 3% ` Ferruh Yigit
2016-07-06 13:07 0% ` Thomas Monjalon
2016-07-06 13:22 0% ` De Lara Guarch, Pablo
0 siblings, 2 replies; 200+ results
From: Ferruh Yigit @ 2016-07-06 11:26 UTC (permalink / raw)
To: Pablo de Lara, dev; +Cc: declan.doherty, deepak.k.jain
On 6/20/2016 3:40 PM, Pablo de Lara wrote:
> Added new SW PMD which makes use of the libsso_kasumi SW library,
> which provides wireless algorithms KASUMI F8 and F9
> in software.
>
> This PMD supports cipher-only, hash-only and chained operations
> ("cipher then hash" and "hash then cipher") of the following
> algorithms:
> - RTE_CRYPTO_SYM_CIPHER_KASUMI_F8
> - RTE_CRYPTO_SYM_AUTH_KASUMI_F9
>
> Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> Acked-by: Jain, Deepak K <deepak.k.jain@intel.com>
...
> --- a/lib/librte_cryptodev/rte_cryptodev.h
> +++ b/lib/librte_cryptodev/rte_cryptodev.h
> @@ -59,12 +59,15 @@ extern "C" {
> /**< Intel QAT Symmetric Crypto PMD device name */
> #define CRYPTODEV_NAME_SNOW3G_PMD ("cryptodev_snow3g_pmd")
> /**< SNOW 3G PMD device name */
> +#define CRYPTODEV_NAME_KASUMI_PMD ("cryptodev_kasumi_pmd")
> +/**< KASUMI PMD device name */
>
> /** Crypto device type */
> enum rte_cryptodev_type {
> RTE_CRYPTODEV_NULL_PMD = 1, /**< Null crypto PMD */
> RTE_CRYPTODEV_AESNI_GCM_PMD, /**< AES-NI GCM PMD */
> RTE_CRYPTODEV_AESNI_MB_PMD, /**< AES-NI multi buffer PMD */
> + RTE_CRYPTODEV_KASUMI_PMD, /**< KASUMI PMD */
Does adding a new field into the middle cause an ABI breakage?
Since the values of the fields below have now changed.
Btw, librte_cryptodev is not listed in the release notes "shared library
versions" section; not sure if this is intentional.
> RTE_CRYPTODEV_QAT_SYM_PMD, /**< QAT PMD Symmetric Crypto */
> RTE_CRYPTODEV_SNOW3G_PMD, /**< SNOW 3G PMD */
> };
...
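To make the renumbering concrete, the values before and after this
insertion (taken from the enum quoted above) are:

/* Before the patch:                    After the patch:
 *   RTE_CRYPTODEV_NULL_PMD      = 1      RTE_CRYPTODEV_NULL_PMD      = 1
 *   RTE_CRYPTODEV_AESNI_GCM_PMD = 2      RTE_CRYPTODEV_AESNI_GCM_PMD = 2
 *   RTE_CRYPTODEV_AESNI_MB_PMD  = 3      RTE_CRYPTODEV_AESNI_MB_PMD  = 3
 *   RTE_CRYPTODEV_QAT_SYM_PMD   = 4      RTE_CRYPTODEV_KASUMI_PMD    = 4
 *   RTE_CRYPTODEV_SNOW3G_PMD    = 5      RTE_CRYPTODEV_QAT_SYM_PMD   = 5
 *                                        RTE_CRYPTODEV_SNOW3G_PMD    = 6
 * A binary built against the old header still passes 4 to mean QAT,
 * which a library built from the new header decodes as KASUMI. */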
^ permalink raw reply [relevance 3%]
* [dpdk-dev] [PATCH] librte_pmd_bond: fix exported symbol versioning
@ 2016-07-06 11:39 3% Christian Ehrhardt
2016-07-11 11:27 3% ` [dpdk-dev] [PATCH v2] " Christian Ehrhardt
0 siblings, 1 reply; 200+ results
From: Christian Ehrhardt @ 2016-07-06 11:39 UTC (permalink / raw)
To: Eric Kinzie, christian.ehrhardt, thomas.monjalon, dev
The older versions of rte_eth_bond_8023ad_conf_get and
rte_eth_bond_8023ad_setup have been available in their old form since
2.0 - at least according to the map file.
But the versioning in the code was set to 16.04.
That breaks compatibility checks for 2.0 on that library.
For example with the dpdk abi checker:
http://people.canonical.com/~paelzer/compat_report.html
To fix this, version the old symbols as 2.0, matching when they were
initially added to the map file.
Fixes: dc40f17a ("net/bonding: allow external state machine in mode 4")
Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
---
drivers/net/bonding/rte_eth_bond_8023ad.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c b/drivers/net/bonding/rte_eth_bond_8023ad.c
index 48a50e4..2f7ae70 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad.c
+++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
@@ -1068,7 +1068,7 @@ bond_mode_8023ad_conf_assign(struct mode8023ad_private *mode4,
}
static void
-bond_mode_8023ad_setup_v1604(struct rte_eth_dev *dev,
+bond_mode_8023ad_setup_v20(struct rte_eth_dev *dev,
struct rte_eth_bond_8023ad_conf *conf)
{
struct rte_eth_bond_8023ad_conf def_conf;
@@ -1214,7 +1214,7 @@ free_out:
}
int
-rte_eth_bond_8023ad_conf_get_v1604(uint8_t port_id,
+rte_eth_bond_8023ad_conf_get_v20(uint8_t port_id,
struct rte_eth_bond_8023ad_conf *conf)
{
struct rte_eth_dev *bond_dev;
@@ -1229,7 +1229,7 @@ rte_eth_bond_8023ad_conf_get_v1604(uint8_t port_id,
bond_mode_8023ad_conf_get(bond_dev, conf);
return 0;
}
-VERSION_SYMBOL(rte_eth_bond_8023ad_conf_get, _v1604, 16.04);
+VERSION_SYMBOL(rte_eth_bond_8023ad_conf_get, _v20, 2.0);
int
rte_eth_bond_8023ad_conf_get_v1607(uint8_t port_id,
@@ -1278,7 +1278,7 @@ bond_8023ad_setup_validate(uint8_t port_id,
}
int
-rte_eth_bond_8023ad_setup_v1604(uint8_t port_id,
+rte_eth_bond_8023ad_setup_v20(uint8_t port_id,
struct rte_eth_bond_8023ad_conf *conf)
{
struct rte_eth_dev *bond_dev;
@@ -1289,11 +1289,11 @@ rte_eth_bond_8023ad_setup_v1604(uint8_t port_id,
return err;
bond_dev = &rte_eth_devices[port_id];
- bond_mode_8023ad_setup_v1604(bond_dev, conf);
+ bond_mode_8023ad_setup_v20(bond_dev, conf);
return 0;
}
-VERSION_SYMBOL(rte_eth_bond_8023ad_setup, _v1604, 16.04);
+VERSION_SYMBOL(rte_eth_bond_8023ad_setup, _v20, 2.0);
int
rte_eth_bond_8023ad_setup_v1607(uint8_t port_id,
--
2.7.4
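For readers unfamiliar with the macros involved, the resulting pattern is
one implementation per ABI revision, with the newest bound as the default
(a sketch with abridged signatures; see rte_compat.h and the map file for
the authoritative pieces):

int rte_eth_bond_8023ad_setup_v20(uint8_t port_id,
		struct rte_eth_bond_8023ad_conf *conf);
VERSION_SYMBOL(rte_eth_bond_8023ad_setup, _v20, 2.0);   /* legacy ABI */

int rte_eth_bond_8023ad_setup_v1607(uint8_t port_id,
		struct rte_eth_bond_8023ad_conf *conf);
BIND_DEFAULT_SYMBOL(rte_eth_bond_8023ad_setup, _v1607, 16.07); /* default */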
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [PATCH v3 1/3] kasumi: add new KASUMI PMD
2016-07-06 11:26 3% ` Ferruh Yigit
@ 2016-07-06 13:07 0% ` Thomas Monjalon
2016-07-06 13:22 0% ` De Lara Guarch, Pablo
1 sibling, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-07-06 13:07 UTC (permalink / raw)
To: Ferruh Yigit
Cc: dev, Pablo de Lara, declan.doherty, deepak.k.jain, reshma.pattan
2016-07-06 12:26, Ferruh Yigit:
> On 6/20/2016 3:40 PM, Pablo de Lara wrote:
> > enum rte_cryptodev_type {
> > RTE_CRYPTODEV_NULL_PMD = 1, /**< Null crypto PMD */
> > RTE_CRYPTODEV_AESNI_GCM_PMD, /**< AES-NI GCM PMD */
> > RTE_CRYPTODEV_AESNI_MB_PMD, /**< AES-NI multi buffer PMD */
> > + RTE_CRYPTODEV_KASUMI_PMD, /**< KASUMI PMD */
> Does adding a new field into the middle cause an ABI breakage?
> Since the values of the fields below have now changed.
>
> Btw, librte_cryptodev is not listed in the release notes "shared library
> versions" section; not sure if this is intentional.
Good catch!
Now that crypto is not experimental anymore, we must add cryptodev in
release notes. librte_pdump is also missing in this list.
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v3 1/3] kasumi: add new KASUMI PMD
2016-07-06 11:26 3% ` Ferruh Yigit
2016-07-06 13:07 0% ` Thomas Monjalon
@ 2016-07-06 13:22 0% ` De Lara Guarch, Pablo
1 sibling, 0 replies; 200+ results
From: De Lara Guarch, Pablo @ 2016-07-06 13:22 UTC (permalink / raw)
To: Yigit, Ferruh, dev; +Cc: Doherty, Declan, Jain, Deepak K
> -----Original Message-----
> From: Yigit, Ferruh
> Sent: Wednesday, July 06, 2016 12:26 PM
> To: De Lara Guarch, Pablo; dev@dpdk.org
> Cc: Doherty, Declan; Jain, Deepak K
> Subject: Re: [dpdk-dev] [PATCH v3 1/3] kasumi: add new KASUMI PMD
>
> On 6/20/2016 3:40 PM, Pablo de Lara wrote:
> > Added new SW PMD which makes use of the libsso_kasumi SW library,
> > which provides wireless algorithms KASUMI F8 and F9
> > in software.
> >
> > This PMD supports cipher-only, hash-only and chained operations
> > ("cipher then hash" and "hash then cipher") of the following
> > algorithms:
> > - RTE_CRYPTO_SYM_CIPHER_KASUMI_F8
> > - RTE_CRYPTO_SYM_AUTH_KASUMI_F9
> >
> > Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> > Acked-by: Jain, Deepak K <deepak.k.jain@intel.com>
>
> ...
>
> > --- a/lib/librte_cryptodev/rte_cryptodev.h
> > +++ b/lib/librte_cryptodev/rte_cryptodev.h
> > @@ -59,12 +59,15 @@ extern "C" {
> > /**< Intel QAT Symmetric Crypto PMD device name */
> > #define CRYPTODEV_NAME_SNOW3G_PMD ("cryptodev_snow3g_pmd")
> > /**< SNOW 3G PMD device name */
> > +#define CRYPTODEV_NAME_KASUMI_PMD ("cryptodev_kasumi_pmd")
> > +/**< KASUMI PMD device name */
> >
> > /** Crypto device type */
> > enum rte_cryptodev_type {
> > RTE_CRYPTODEV_NULL_PMD = 1, /**< Null crypto PMD */
> > RTE_CRYPTODEV_AESNI_GCM_PMD, /**< AES-NI GCM PMD */
> > RTE_CRYPTODEV_AESNI_MB_PMD, /**< AES-NI multi buffer PMD
> */
> > + RTE_CRYPTODEV_KASUMI_PMD, /**< KASUMI PMD */
> Does adding a new field into the middle cause an ABI breakage?
> Since the values of the fields below have now changed.
Right! Thanks for the catch, will send a patch to fix that.
>
> Btw, librte_cryptodev is not listed in the release notes "shared library
> versions" section; not sure if this is intentional.
>
> > RTE_CRYPTODEV_QAT_SYM_PMD, /**< QAT PMD Symmetric
> Crypto */
> > RTE_CRYPTODEV_SNOW3G_PMD, /**< SNOW 3G PMD */
> > };
>
> ...
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH] cryptodev: move new cryptodev type to bottom of enum
@ 2016-07-06 14:05 3% Pablo de Lara
2016-07-08 17:52 0% ` Thomas Monjalon
0 siblings, 1 reply; 200+ results
From: Pablo de Lara @ 2016-07-06 14:05 UTC (permalink / raw)
To: dev; +Cc: declan.doherty, Pablo de Lara
A new cryptodev type for the new KASUMI PMD was added
to the cryptodev type enum, but not at the end of it,
causing an ABI breakage.
Fixes: 2773c86d061a ("crypto/kasumi: add driver for KASUMI library")
Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
Reported-by: Ferruh Yigit <ferruh.yigit@intel.com>
---
lib/librte_cryptodev/rte_cryptodev.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/librte_cryptodev/rte_cryptodev.h b/lib/librte_cryptodev/rte_cryptodev.h
index 7768f0a..508c1f7 100644
--- a/lib/librte_cryptodev/rte_cryptodev.h
+++ b/lib/librte_cryptodev/rte_cryptodev.h
@@ -67,9 +67,9 @@ enum rte_cryptodev_type {
RTE_CRYPTODEV_NULL_PMD = 1, /**< Null crypto PMD */
RTE_CRYPTODEV_AESNI_GCM_PMD, /**< AES-NI GCM PMD */
RTE_CRYPTODEV_AESNI_MB_PMD, /**< AES-NI multi buffer PMD */
- RTE_CRYPTODEV_KASUMI_PMD, /**< KASUMI PMD */
RTE_CRYPTODEV_QAT_SYM_PMD, /**< QAT PMD Symmetric Crypto */
RTE_CRYPTODEV_SNOW3G_PMD, /**< SNOW 3G PMD */
+ RTE_CRYPTODEV_KASUMI_PMD, /**< KASUMI PMD */
};
extern const char **rte_cyptodev_names;
--
2.5.5
^ permalink raw reply [relevance 3%]
* Re: [dpdk-dev] [RFC] Generic flow director/filtering/classification API
2016-07-05 18:16 2% [dpdk-dev] [RFC] Generic flow director/filtering/classification API Adrien Mazarguil
@ 2016-07-07 7:14 0% ` Lu, Wenzhuo
2016-07-07 10:26 2% ` Adrien Mazarguil
2016-07-07 23:15 0% ` Chandran, Sugesh
2016-07-08 11:11 0% ` Liang, Cunming
2 siblings, 1 reply; 200+ results
From: Lu, Wenzhuo @ 2016-07-07 7:14 UTC (permalink / raw)
To: Adrien Mazarguil, dev
Cc: Thomas Monjalon, Zhang, Helin, Wu, Jingjing, Rasesh Mody,
Ajit Khaparde, Rahul Lakkireddy, Jan Medala, John Daley, Chen,
Jing D, Ananyev, Konstantin, Matej Vido, Alejandro Lucero,
Sony Chacko, Jerin Jacob, De Lara Guarch, Pablo, Olga Shern
Hi Adrien,
I have some questions, please see inline, thanks.
> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Wednesday, July 6, 2016 2:17 AM
> To: dev@dpdk.org
> Cc: Thomas Monjalon; Zhang, Helin; Wu, Jingjing; Rasesh Mody; Ajit Khaparde;
> Rahul Lakkireddy; Lu, Wenzhuo; Jan Medala; John Daley; Chen, Jing D; Ananyev,
> Konstantin; Matej Vido; Alejandro Lucero; Sony Chacko; Jerin Jacob; De Lara
> Guarch, Pablo; Olga Shern
> Subject: [RFC] Generic flow director/filtering/classification API
>
>
> Requirements for a new API:
>
> - Flexible and extensible without causing API/ABI problems for existing
> applications.
> - Should be unambiguous and easy to use.
> - Support existing filtering features and actions listed in `Filter types`_.
> - Support packet alteration.
> - In case of overlapping filters, their priority should be well documented.
Does that mean we don't guarantee the consistency of priority? The priority can be different on different NICs, so the behavior of the actions can be different. Right?
It seems users still need to be aware of some details of the HW. Do we need to add negotiation for the priority?
>
> Flow rules can have several distinct actions (such as counting,
> encapsulating, decapsulating before redirecting packets to a particular
> queue, etc.), instead of relying on several rules to achieve this and having
> applications deal with hardware implementation details regarding their
> order.
I think normally HW doesn't support several actions in one rule. If a rule has several actions, it seems HW has to split it into several rules. The order can still be a problem.
>
> ``ETH``
> ^^^^^^^
>
> Matches an Ethernet header.
>
> - ``dst``: destination MAC.
> - ``src``: source MAC.
> - ``type``: EtherType.
> - ``tags``: number of 802.1Q/ad tags defined.
> - ``tag[]``: 802.1Q/ad tag definitions, innermost first. For each one:
>
> - ``tpid``: Tag protocol identifier.
> - ``tci``: Tag control information.
"ETH" means all the parameters, dst, src, type... need to be matched? The same question for IPv4, IPv6 ...
>
> ``UDP``
> ^^^^^^^
>
> Matches a UDP header.
>
> - ``sport``: source port.
> - ``dport``: destination port.
> - ``length``: UDP length.
> - ``checksum``: UDP checksum.
Why checksum? Do we need to filter the packets by checksum value?
>
> ``VOID`` (action)
> ^^^^^^^^^^^^^^^^^
>
> Used as a placeholder for convenience. It is ignored and simply discarded by
> PMDs.
I don't understand why we need VOID. If it's about the format, why not guarantee it in the rte layer?
>
> Behavior
> --------
>
> - API operations are synchronous and blocking (``EAGAIN`` cannot be
> returned).
>
> - There is no provision for reentrancy/multi-thread safety, although nothing
> should prevent different devices from being configured at the same
> time. PMDs may protect their control path functions accordingly.
>
> - Stopping the data path (TX/RX) should not be necessary when managing flow
> rules. If this cannot be achieved naturally or with workarounds (such as
> temporarily replacing the burst function pointers), an appropriate error
> code must be returned (``EBUSY``).
The PMD cannot stop the data path without adding a lock. So I think if some rules cannot be applied without stopping RX/TX, the PMD has to return failure,
or let the APP stop the data path.
>
> - PMDs, not applications, are responsible for maintaining flow rules
> configuration when stopping and restarting a port or performing other
> actions which may affect them. They can only be destroyed explicitly.
I don't understand "They can only be destroyed explicitly." If a new rule conflicts with an old one, what should we do? Return failure?
>
> ``ANY`` pattern item
> ~~~~~~~~~~~~~~~~~~~~
>
> This pattern item stands for anything, which can be difficult to translate
> to something hardware would understand, particularly if followed by more
> specific types.
>
> Consider the following pattern:
>
> +---+--------------------------------+
> | 0 | ETHER |
> +---+--------------------------------+
> | 1 | ANY (``min`` = 1, ``max`` = 1) |
> +---+--------------------------------+
> | 2 | TCP |
> +---+--------------------------------+
>
> Knowing that TCP does not make sense with something other than IPv4 and IPv6
> as L3, such a pattern may be translated to two flow rules instead:
>
> +---+--------------------+
> | 0 | ETHER |
> +---+--------------------+
> | 1 | IPV4 (zeroed mask) |
> +---+--------------------+
> | 2 | TCP |
> +---+--------------------+
>
> +---+--------------------+
> | 0 | ETHER |
> +---+--------------------+
> | 1 | IPV6 (zeroed mask) |
> +---+--------------------+
> | 2 | TCP |
> +---+--------------------+
>
> Note that as soon as an ANY rule covers several layers, this approach may
> yield a large number of hidden flow rules. It is thus suggested to only
> support the most common scenarios (anything as L2 and/or L3).
I think "any" may make things confusing. How about if the NIC doesn't support IPv6? Should we return fail for this rule?
>
> Flow rules priority
> ~~~~~~~~~~~~~~~~~~~
>
> While it would naturally make sense, flow rules cannot be assumed to be
> processed by hardware in the same order as their creation for several
> reasons:
>
> - They may be managed internally as a tree or a hash table instead of a
> list.
> - Removing a flow rule before adding another one can either put the new rule
> at the end of the list or reuse a freed entry.
> - Duplication may occur when packets are matched by several rules.
>
> For overlapping rules (particularly in order to use the `PASSTHRU`_ action)
> predictable behavior is only guaranteed by using different priority levels.
>
> Priority levels are not necessarily implemented in hardware, or may be
> severely limited (e.g. a single priority bit).
>
> For these reasons, priority levels may be implemented purely in software by
> PMDs.
>
> - For devices expecting flow rules to be added in the correct order, PMDs
> may destroy and re-create existing rules after adding a new one with
> a higher priority.
>
> - A configurable number of dummy or empty rules can be created at
> initialization time to save high priority slots for later.
>
> - In order to save priority levels, PMDs may evaluate whether rules are
> likely to collide and adjust their priority accordingly.
If there are 3 rules, r1, r2, r3, and the rules say the priority is r1 > r2 > r3, but the PMD can only support r1 > r3 > r2, or doesn't support r2, should the PMD apply r1 and r3 or not support them at all?
A generic question: is the parsing supposed to be done by rte or the PMD?
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [PATCH v4] Pci: Add the class_id support
2016-07-06 11:08 3% ` Ferruh Yigit
@ 2016-07-07 7:46 0% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-07-07 7:46 UTC (permalink / raw)
To: Ferruh Yigit; +Cc: Ziye Yang, dev
2016-07-06 12:08, Ferruh Yigit:
> On 6/14/2016 3:52 PM, Thomas Monjalon wrote:
> > 2016-05-24 20:50, Ziye Yang:
> >> This patch is used to add the class_id (class_code,
> >> subclass_code, programming_interface) support for
> >> pci_device probe. With this patch, it will be
> >> flexible for users to probe a class of devices
> >> by class_id.
> >>
> >>
> >> Signed-off-by: Ziye Yang <ziye.yang@intel.com>
> >
> > Applied, thanks
> >
> Hi Thomas, Ziye,
>
> Is the modification in the public "struct rte_pci_id" an ABI break?
> If so, it requires an EAL LIBABIVER increase and a release notes update.
Not really sure. I was thinking that it is used only by drivers
but not by applications.
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Generic flow director/filtering/classification API
2016-07-07 7:14 0% ` Lu, Wenzhuo
@ 2016-07-07 10:26 2% ` Adrien Mazarguil
0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2016-07-07 10:26 UTC (permalink / raw)
To: Lu, Wenzhuo
Cc: dev, Thomas Monjalon, Zhang, Helin, Wu, Jingjing, Rasesh Mody,
Ajit Khaparde, Rahul Lakkireddy, Jan Medala, John Daley, Chen,
Jing D, Ananyev, Konstantin, Matej Vido, Alejandro Lucero,
Sony Chacko, Jerin Jacob, De Lara Guarch, Pablo, Olga Shern
Hi Lu Wenzhuo,
Thanks for your feedback, I'm replying below as well.
On Thu, Jul 07, 2016 at 07:14:18AM +0000, Lu, Wenzhuo wrote:
> Hi Adrien,
> I have some questions, please see inline, thanks.
>
> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Wednesday, July 6, 2016 2:17 AM
> > To: dev@dpdk.org
> > Cc: Thomas Monjalon; Zhang, Helin; Wu, Jingjing; Rasesh Mody; Ajit Khaparde;
> > Rahul Lakkireddy; Lu, Wenzhuo; Jan Medala; John Daley; Chen, Jing D; Ananyev,
> > Konstantin; Matej Vido; Alejandro Lucero; Sony Chacko; Jerin Jacob; De Lara
> > Guarch, Pablo; Olga Shern
> > Subject: [RFC] Generic flow director/filtering/classification API
> >
> >
> > Requirements for a new API:
> >
> > - Flexible and extensible without causing API/ABI problems for existing
> > applications.
> > - Should be unambiguous and easy to use.
> > - Support existing filtering features and actions listed in `Filter types`_.
> > - Support packet alteration.
> > - In case of overlapping filters, their priority should be well documented.
> Does that mean we don't guarantee the consistency of priority? The priority can be different on different NICs, so the behavior of the actions can be different. Right?
No, the intent is precisely to define what happens in order to get a
consistent result across different devices, and document cases with
undefined behavior. There must be no room left for interpretation.
For example, the API must describe what happens when two overlapping filters
(e.g. one matching an Ethernet header, another one matching an IP header)
match a given packet at a given priority level.
It is documented in section 4.1.1 (priorities) as "undefined behavior".
Applications remain free to do it and deal with consequences, at least they
know they cannot expect a consistent outcome, unless they use different
priority levels for both rules, see also 4.4.5 (flow rules priority).
> It seems users still need to be aware of some details of the HW. Do we need to add negotiation for the priority?
Priorities as defined in this document may not be directly mappable to HW
capabilities (e.g. HW does not support enough priorities, or that some
corner case make them not work as described), in which case the PMD may
choose to simulate priorities (again 4.4.5), as long as the end result
follows the specification.
So users do not need to be aware of HW details; the PMD does, and it must
perform the needed workarounds to suit their expectations. Users may only be
impacted by errors while attempting to create rules that are either
unsupported or would cause them (or existing rules) to diverge from the
spec.
> > Flow rules can have several distinct actions (such as counting,
> > encapsulating, decapsulating before redirecting packets to a particular
> > queue, etc.), instead of relying on several rules to achieve this and having
> > applications deal with hardware implementation details regarding their
> > order.
> I think normally HW doesn't support several actions in one rule. If a rule has several actions, it seems HW has to split it into several rules. The order can still be a problem.
Note that, except for a very small subset of pattern items and actions,
supporting multiple actions for a given rule is not mandatory, and can be
emulated as you said by splitting them into several rules, each with
its own priority if possible (see 3.3 "high level design").
Also, a rule "action" as defined in this API can be just about anything, for
example combining a queue redirection with 32-bit tagging. FDIR supports
many cases of what can be described as several actions, see 5.7 "FDIR to
most item types → QUEUE, DROP, PASSTHRU".
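For instance, a single rule combining a queue redirection with 32-bit
tagging might be sketched as follows (structure and enumeration names
follow the draft header and may differ):

/* Hypothetical sketch: redirect to RX queue 3 and attach a 32-bit tag
 * that is returned with each matched packet. */
struct rte_flow_action_queue queue = { .index = 3 };
struct rte_flow_action_id id = { .id = 0x2a };
struct rte_flow_action actions[] = {
	{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
	{ .type = RTE_FLOW_ACTION_TYPE_ID, .conf = &id },
	{ .type = RTE_FLOW_ACTION_TYPE_END },
};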
If you were thinking about having two queue targets for a given rule, then
I agree with you - that is why a rule cannot have more than a single action
of a given type (see 4.1.5 actions), to avoid such abuse from applications.
Applications must use several pass-through rules with different priority
levels if they want to perform a given action several times on a given
packet. Again, PMDs support is not mandatory as pass-through is optional.
> > ``ETH``
> > ^^^^^^^
> >
> > Matches an Ethernet header.
> >
> > - ``dst``: destination MAC.
> > - ``src``: source MAC.
> > - ``type``: EtherType.
> > - ``tags``: number of 802.1Q/ad tags defined.
> > - ``tag[]``: 802.1Q/ad tag definitions, innermost first. For each one:
> >
> > - ``tpid``: Tag protocol identifier.
> > - ``tci``: Tag control information.
> "ETH" means all the parameters, dst, src, type... need to be matched? The same question for IPv4, IPv6 ...
Yes, it's basically the description of all Ethernet header fields including
VLAN tags (same for other protocols). Please see the linked draft header
file which should make all of this easier to understand:
https://raw.githubusercontent.com/6WIND/rte_flow/master/rte_flow.h
> > ``UDP``
> > ^^^^^^^
> >
> > Matches a UDP header.
> >
> > - ``sport``: source port.
> > - ``dport``: destination port.
> > - ``length``: UDP length.
> > - ``checksum``: UDP checksum.
> Why checksum? Do we need to filter the packets by checksum value?
Well, I've decided to include all protocol header fields for completeness
(so the ABI does not need to be broken later then they become necessary, or
require another pattern item), not that all of them make sense in a pattern.
In this specific case, all PMDs I know of must reject a pattern
specification with a nonzero mask for the checksum field, because none of
them support it.
> > ``VOID`` (action)
> > ^^^^^^^^^^^^^^^^^
> >
> > Used as a placeholder for convenience. It is ignored and simply discarded by
> > PMDs.
> I don't understand why we need VOID. If it's about the format, why not guarantee it in the rte layer?
I'm not sure I understand your question about the rte layer, but this type is
fully managed by the PMD and is not supposed to be translated to a hardware
action.
I think it may come handy in some cases (like the VOID pattern item), so it
is defined just in case. Should be relatively trivial to support.
Applications may find a use for it when they want to statically define
templates for flow rules, when they need room for some reason.
> > Behavior
> > --------
> >
> > - API operations are synchronous and blocking (``EAGAIN`` cannot be
> > returned).
> >
> > - There is no provision for reentrancy/multi-thread safety, although nothing
> > should prevent different devices from being configured at the same
> > time. PMDs may protect their control path functions accordingly.
> >
> > - Stopping the data path (TX/RX) should not be necessary when managing flow
> > rules. If this cannot be achieved naturally or with workarounds (such as
> > temporarily replacing the burst function pointers), an appropriate error
> > code must be returned (``EBUSY``).
> The PMD cannot stop the data path without adding a lock. So I think if some rules cannot be applied without stopping RX/TX, the PMD has to return failure,
> or let the APP stop the data path.
Agreed, that is the intent. If the PMD cannot touch flow rules for some
reason even after trying really hard, then it just returns EBUSY.
Perhaps we should write down that applications may get a different outcome
after stopping the data path if they get EBUSY?
> > - PMDs, not applications, are responsible for maintaining flow rules
> > configuration when stopping and restarting a port or performing other
> > actions which may affect them. They can only be destroyed explicitly.
> I don't understand "They can only be destroyed explicitly."
This part says that as long as an application has not called
rte_flow_destroy() on a flow rule, it never disappears, whatever happens to
the port (stopped, restarted). The application is not responsible for
re-creating rules after that.
Note that according to the specification, this may translate to not being
able to stop a port as long as a flow rule is present, depending on how nice
the PMD intends to be with applications. Implementation can be done in small
steps with minimal amount of code on the PMD side.
> If a new rule conflicts with an old one, what should we do? Return failure?
That should not happen. If say 100 rules have been created with various
priorities and the port is happily running with them, stopping the port may
require the PMD to destroy them, it then has to re-create all 100 of them
exactly as they were automatically when restarting the port.
If re-creating them is not possible for some reason, the port cannot be
restarted as long as rules that cannot be added back haven't been destroyed
by the application. Frankly, this should not happen.
To manage this case, I suggest preventing applications from doing things
that conflict with existing flow rules while the port is stopped (just like
when it is not stopped, as described in 5.7 "FDIR to most item types").
> > ``ANY`` pattern item
> > ~~~~~~~~~~~~~~~~~~~~
> >
> > This pattern item stands for anything, which can be difficult to translate
> > to something hardware would understand, particularly if followed by more
> > specific types.
> >
> > Consider the following pattern:
> >
> > +---+--------------------------------+
> > | 0 | ETHER |
> > +---+--------------------------------+
> > | 1 | ANY (``min`` = 1, ``max`` = 1) |
> > +---+--------------------------------+
> > | 2 | TCP |
> > +---+--------------------------------+
> >
> > Knowing that TCP does not make sense with something other than IPv4 and IPv6
> > as L3, such a pattern may be translated to two flow rules instead:
> >
> > +---+--------------------+
> > | 0 | ETHER |
> > +---+--------------------+
> > | 1 | IPV4 (zeroed mask) |
> > +---+--------------------+
> > | 2 | TCP |
> > +---+--------------------+
> >
> > +---+--------------------+
> > | 0 | ETHER |
> > +---+--------------------+
> > | 1 | IPV6 (zeroed mask) |
> > +---+--------------------+
> > | 2 | TCP |
> > +---+--------------------+
> >
> > Note that as soon as an ANY rule covers several layers, this approach may
> > yield a large number of hidden flow rules. It is thus suggested to only
> > support the most common scenarios (anything as L2 and/or L3).
> I think "any" may make things confusing. How about if the NIC doesn't support IPv6? Should we return fail for this rule?
In a sense you are right: ANY relies on HW capabilities, so you cannot know
that it won't match unsupported protocols. The above example would be
somewhat useless for a conscious application which should really have
created two separate flow rules (and gotten an error on the IPv6 one).
So an ANY flow rule only able to match v4 packets won't return an error.
ANY can be useful to match outer packets when only a tunnel header and the
inner packet are meaningful to the application. HW that does not recognize
the outer packet is not able to recognize the inner one anyway.
This section only says that PMDs should do their best to make HW match what
they can when faced with ANY.
Also once again, ANY support is not mandatory.
> > Flow rules priority
> > ~~~~~~~~~~~~~~~~~~~
> >
> > While it would naturally make sense, flow rules cannot be assumed to be
> > processed by hardware in the same order as their creation for several
> > reasons:
> >
> > - They may be managed internally as a tree or a hash table instead of a
> > list.
> > - Removing a flow rule before adding another one can either put the new rule
> > at the end of the list or reuse a freed entry.
> > - Duplication may occur when packets are matched by several rules.
> >
> > For overlapping rules (particularly in order to use the `PASSTHRU`_ action)
> > predictable behavior is only guaranteed by using different priority levels.
> >
> > Priority levels are not necessarily implemented in hardware, or may be
> > severely limited (e.g. a single priority bit).
> >
> > For these reasons, priority levels may be implemented purely in software by
> > PMDs.
> >
> > - For devices expecting flow rules to be added in the correct order, PMDs
> > may destroy and re-create existing rules after adding a new one with
> > a higher priority.
> >
> > - A configurable number of dummy or empty rules can be created at
> > initialization time to save high priority slots for later.
> >
> > - In order to save priority levels, PMDs may evaluate whether rules are
> > likely to collide and adjust their priority accordingly.
> If there are 3 rules, r1, r2, r3, and the rules say the priority is r1 > r2 > r3, but the PMD can only support r1 > r3 > r2, or doesn't support r2, should the PMD apply r1 and r3 or not support them at all?
Remember that the API lets applications create only one rule at a time. If
all 3 rules are not supported together but individually are, the answer
depends on what the application does:
1. r1 OK, r2 FAIL => application chooses to stop here, thus only r1 works as
expected (may roll back and remove r1 as a result).
2. r1 OK, r2 FAIL, r3 OK => application chooses to ignore the fact that r2
failed and adds r3 anyway, so it should end up with r1 > r3.
Applications should do as described in 1, they need to check for errors if
they want consistency.
This document describes only the basic functions, but may be extended later
with methods to add several flow rules at once, so rules that depend on
others can be added together and a single failure is returned without the
need for a rollback at the application level.
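To illustrate option 1, an application wanting all-or-nothing semantics can
roll back on the first failure (a sketch; function names and signatures
follow the draft header and may differ):

/* Create dependent rules in order; destroy the ones already created
 * if any fails, leaving the device in a consistent state.
 * pattern[] and actions[] describe r1..r3 (not shown). */
struct rte_flow *flow[3];
unsigned int i;

for (i = 0; i != 3; ++i) {
	flow[i] = rte_flow_create(port_id, &pattern[i], &actions[i]);
	if (flow[i] == NULL) {
		while (i--)
			rte_flow_destroy(port_id, flow[i]);
		return -1;
	}
}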
> A generic question: is the parsing supposed to be done by rte or the PMD?
Actually, a bit of both. EAL will certainly at least provide helpers to
assist PMDs. This specification defines only the public-facing API for now,
but our goal is really to have something that is not too difficult to
implement both for applications and PMDs.
These helpers can be defined later with the first implementation.
--
Adrien Mazarguil
6WIND
^ permalink raw reply [relevance 2%]
* [dpdk-dev] [PATCH 11/11] maintainers: add section for pmdinfo
@ 2016-07-07 15:36 4% ` Thomas Monjalon
2016-07-07 16:14 0% ` Neil Horman
1 sibling, 1 reply; 200+ results
From: Thomas Monjalon @ 2016-07-07 15:36 UTC (permalink / raw)
To: Neil Horman; +Cc: dev
The author of this feature is Neil Horman.
Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
---
MAINTAINERS | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..f996c2e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -68,6 +68,10 @@ F: lib/librte_compat/
F: doc/guides/rel_notes/deprecation.rst
F: scripts/validate-abi.sh
+Driver information
+F: buildtools/pmdinfogen/
+F: tools/pmdinfo.py
+
Environment Abstraction Layer
-----------------------------
--
2.7.0
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [PATCH 11/11] maintainers: add section for pmdinfo
2016-07-07 15:36 4% ` [dpdk-dev] [PATCH 11/11] maintainers: add section for pmdinfo Thomas Monjalon
@ 2016-07-07 16:14 0% ` Neil Horman
0 siblings, 0 replies; 200+ results
From: Neil Horman @ 2016-07-07 16:14 UTC (permalink / raw)
To: Thomas Monjalon; +Cc: dev
On Thu, Jul 07, 2016 at 05:36:30PM +0200, Thomas Monjalon wrote:
> The author of this feature is Neil Horman.
>
> Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
> ---
> MAINTAINERS | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index a59191e..f996c2e 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -68,6 +68,10 @@ F: lib/librte_compat/
> F: doc/guides/rel_notes/deprecation.rst
> F: scripts/validate-abi.sh
>
> +Driver information
> +F: buildtools/pmdinfogen/
> +F: tools/pmdinfo.py
> +
>
> Environment Abstraction Layer
> -----------------------------
> --
> 2.7.0
>
>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
^ permalink raw reply [relevance 0%]
* Re: [dpdk-dev] [RFC] Generic flow director/filtering/classification API
2016-07-05 18:16 2% [dpdk-dev] [RFC] Generic flow director/filtering/classification API Adrien Mazarguil
2016-07-07 7:14 0% ` Lu, Wenzhuo
@ 2016-07-07 23:15 0% ` Chandran, Sugesh
2016-07-08 13:03 0% ` Adrien Mazarguil
2016-07-08 11:11 0% ` Liang, Cunming
2 siblings, 1 reply; 200+ results
From: Chandran, Sugesh @ 2016-07-07 23:15 UTC (permalink / raw)
To: Adrien Mazarguil, dev
Cc: Thomas Monjalon, Zhang, Helin, Wu, Jingjing, Rasesh Mody,
Ajit Khaparde, Rahul Lakkireddy, Lu, Wenzhuo, Jan Medala,
John Daley, Chen, Jing D, Ananyev, Konstantin, Matej Vido,
Alejandro Lucero, Sony Chacko, Jerin Jacob, De Lara Guarch,
Pablo, Olga Shern
Hi Adrien,
Thank you for proposing this. It would be really useful for applications such as OVS-DPDK.
Please find my comments and questions inline below, prefixed with [Sugesh]. Most of them are from the perspective of enabling these APIs in applications such as OVS-DPDK.
Regards
_Sugesh
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Adrien Mazarguil
> Sent: Tuesday, July 5, 2016 7:17 PM
> To: dev@dpdk.org
> Cc: Thomas Monjalon <thomas.monjalon@6wind.com>; Zhang, Helin
> <helin.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Rasesh
> Mody <rasesh.mody@qlogic.com>; Ajit Khaparde
> <ajit.khaparde@broadcom.com>; Rahul Lakkireddy
> <rahul.lakkireddy@chelsio.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>;
> Jan Medala <jan@semihalf.com>; John Daley <johndale@cisco.com>; Chen,
> Jing D <jing.d.chen@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Matej Vido <matejvido@gmail.com>;
> Alejandro Lucero <alejandro.lucero@netronome.com>; Sony Chacko
> <sony.chacko@qlogic.com>; Jerin Jacob
> <jerin.jacob@caviumnetworks.com>; De Lara Guarch, Pablo
> <pablo.de.lara.guarch@intel.com>; Olga Shern <olgas@mellanox.com>
> Subject: [dpdk-dev] [RFC] Generic flow director/filtering/classification API
>
> Hi All,
>
> First, forgive me for this large message, I know our mailboxes already
> suffer quite a bit from the amount of traffic on this ML.
>
> This is not exactly yet another thread about how flow director should be
> extended, rather about a brand new API to handle filtering and
> classification for incoming packets in the most PMD-generic and
> application-friendly fashion we can come up with. Reasons described below.
>
> I think this topic is important enough to include both the users of this API
> as well as PMD maintainers. So far I have CC'ed librte_ether (especially
> rte_eth_ctrl.h contributors), testpmd and PMD maintainers (with and
> without a .filter_ctrl implementation), but if you know application
> maintainers other than testpmd who use FDIR or might be interested in this
> discussion, feel free to add them.
>
> The issues we found with the current approach are already summarized in the
> following document, but here is a quick summary for TL;DR folks:
>
> - PMDs do not expose a common set of filter types and even when they do,
> their behavior more or less differs.
>
> - Applications need to determine and adapt to device-specific limitations
> and quirks on their own, without help from PMDs.
>
> - Writing an application that creates flow rules targeting all devices
> supported by DPDK is thus difficult, if not impossible.
>
> - The current API has too many unspecified areas (particularly regarding
> side effects of flow rules) that make PMD implementation tricky.
>
> This RFC API handles everything currently supported by .filter_ctrl, the
> idea being to reimplement all of these to make them fully usable by
> applications in a more generic and well defined fashion. It has a very small
> set of mandatory features and an easy method to let applications probe for
> supported capabilities.
>
> The only downside is more work for the software control side of PMDs
> because they have to adapt to the API instead of the reverse. I think
> helpers can be added to EAL to assist with this.
>
> HTML version:
>
> https://rawgit.com/6WIND/rte_flow/master/rte_flow.html
>
> PDF version:
>
> https://rawgit.com/6WIND/rte_flow/master/rte_flow.pdf
>
> Related draft header file (for reference while reading the specification):
>
> https://raw.githubusercontent.com/6WIND/rte_flow/master/rte_flow.h
>
> Git tree for completeness (latest .rst version can be retrieved from here):
>
> https://github.com/6WIND/rte_flow
>
> What follows is the ReST source of the above, for inline comments and
> discussion. I intend to update that specification accordingly.
>
> ========================
> Generic filter interface
> ========================
>
> .. footer::
>
> v0.6
>
> .. contents::
> .. sectnum::
> .. raw:: pdf
>
> PageBreak
>
> Overview
> ========
>
> DPDK provides several competing interfaces added over time to perform
> packet
> matching and related actions such as filtering and classification.
>
> They must be extended to implement the features supported by newer
> devices
> in order to expose them to applications, however the current design has
> several drawbacks:
>
> - Complicated filter combinations which have not been hard-coded cannot be
> expressed.
> - Prone to API/ABI breakage when new features must be added to an
> existing filter type, which frequently happens.
>
> From an application point of view:
>
> - Having disparate interfaces, all optional and lacking in features does not
> make this API easy to use.
> - Seemingly arbitrary built-in limitations of filter types based on the
> device they were initially designed for.
> - Undefined relationship between different filter types.
> - High complexity, considerable undocumented and/or undefined behavior.
>
> Considering the growing number of devices supported by DPDK, adding a new
> filter type each time a new feature must be implemented is not sustainable
> in the long term. Applications not written to target a specific device
> cannot really benefit from such an API.
>
> For these reasons, this document defines an extensible unified API that
> encompasses and supersedes these legacy filter types.
>
> .. raw:: pdf
>
> PageBreak
>
> Current API
> ===========
>
> Rationale
> ---------
>
> The reason several competing (and mostly overlapping) filtering APIs are
> present in DPDK is due to its nature as a thin layer between hardware and
> software.
>
> Each subsequent interface has been added to better match the capabilities
> and limitations of the latest supported device, which usually happened to
> need an incompatible configuration approach. Because of this, many ended up
> device-centric and not usable by applications that were not written for that
> particular device.
>
> This document is not the first attempt to address this proliferation issue,
> in fact a lot of work has already been done both to create a more generic
> interface while somewhat keeping compatibility with legacy ones through a
> common call interface (``rte_eth_dev_filter_ctrl()`` with the
> ``.filter_ctrl`` PMD callback in ``rte_ethdev.h``).
>
> Today, these previously incompatible interfaces are known as filter types
> (``RTE_ETH_FILTER_*`` from ``enum rte_filter_type`` in ``rte_eth_ctrl.h``).
>
> However while trivial to extend with new types, it only shifted the
> underlying problem as applications still need to be written for one kind of
> filter type, which, as described in the following sections, is not
> necessarily implemented by all PMDs that support filtering.
>
> .. raw:: pdf
>
> PageBreak
>
> Filter types
> ------------
>
> This section summarizes the capabilities of each filter type.
>
> Although the following list is exhaustive, the description of individual
> types may contain inaccuracies due to the lack of documentation or usage
> examples.
>
> Note: names are prefixed with ``RTE_ETH_FILTER_``.
>
> ``MACVLAN``
> ~~~~~~~~~~~
>
> Matching:
>
> - L2 source/destination addresses.
> - Optional 802.1Q VLAN ID.
> - Masking individual fields on a rule basis is not supported.
>
> Action:
>
> - Packets are redirected either to a given VF device using its ID or to the
> PF.
>
> ``ETHERTYPE``
> ~~~~~~~~~~~~~
>
> Matching:
>
> - L2 source/destination addresses (optional).
> - Ethertype (no VLAN ID?).
> - Masking individual fields on a rule basis is not supported.
>
> Action:
>
> - Receive packets on a given queue.
> - Drop packets.
>
> ``FLEXIBLE``
> ~~~~~~~~~~~~
>
> Matching:
>
> - At most 128 consecutive bytes anywhere in packets.
> - Masking is supported with byte granularity.
> - Priorities are supported (relative to this filter type, undefined
> otherwise).
>
> Action:
>
> - Receive packets on a given queue.
>
> ``SYN``
> ~~~~~~~
>
> Matching:
>
> - TCP SYN packets only.
> - One high priority bit can be set to give the highest possible priority to
> this type when other filters with different types are configured.
>
> Action:
>
> - Receive packets on a given queue.
>
> ``NTUPLE``
> ~~~~~~~~~~
>
> Matching:
>
> - Source/destination IPv4 addresses (optional in 2-tuple mode).
> - Source/destination TCP/UDP port (mandatory in 2 and 5-tuple modes).
> - L4 protocol (2 and 5-tuple modes).
> - Masking individual fields is supported.
> - TCP flags.
> - Up to 7 levels of priority relative to this filter type, undefined
> otherwise.
> - No IPv6.
>
> Action:
>
> - Receive packets on a given queue.
>
> ``TUNNEL``
> ~~~~~~~~~~
>
> Matching:
>
> - Outer L2 source/destination addresses.
> - Inner L2 source/destination addresses.
> - Inner VLAN ID.
> - IPv4/IPv6 source (destination?) address.
> - Tunnel type to match (VXLAN, GENEVE, TEREDO, NVGRE, IP over GRE,
> 802.1BR E-Tag).
> - Tenant ID for tunneling protocols that have one.
> - Any combination of the above can be specified.
> - Masking individual fields on a rule basis is not supported.
>
> Action:
>
> - Receive packets on a given queue.
>
> .. raw:: pdf
>
> PageBreak
>
> ``FDIR``
> ~~~~~~~~
>
> Queries:
>
> - Device capabilities and limitations.
> - Device statistics about configured filters (resource usage, collisions).
> - Device configuration (matching input set and masks)
>
> Matching:
>
> - Device mode of operation: none (to disable filtering), signature
> (hash-based dispatching from masked fields) or perfect (either MAC VLAN or
> tunnel).
> - L2 Ethertype.
> - Outer L2 destination address (MAC VLAN mode).
> - Inner L2 destination address, tunnel type (NVGRE, VXLAN) and tunnel ID
> (tunnel mode).
> - IPv4 source/destination addresses, ToS, TTL and protocol fields.
> - IPv6 source/destination addresses, TC, protocol and hop limits fields.
> - UDP source/destination IPv4/IPv6 and ports.
> - TCP source/destination IPv4/IPv6 and ports.
> - SCTP source/destination IPv4/IPv6, ports and verification tag field.
> - Note, only one protocol type at once (either only L2 Ethertype, basic
> IPv6, IPv4+UDP, IPv4+TCP and so on).
> - VLAN TCI (extended API).
> - At most 16 bytes to match in payload (extended API). A global device
> look-up table specifies for each possible protocol layer (unknown, raw,
> L2, L3, L4) the offset to use for each byte (they do not need to be
> contiguous) and the related bitmask.
> - Whether packet is addressed to PF or VF, in that case its ID can be
> matched as well (extended API).
> - Masking most of the above fields is supported, but simultaneously affects
> all filters configured on a device.
> - Input set can be modified in a similar fashion for a given device to
> ignore individual fields of filters (i.e. do not match the destination
> address in a IPv4 filter, refer to **RTE_ETH_INPUT_SET_**
> macros). Configuring this also affects RSS processing on **i40e**.
> - Filters can also provide 32 bits of arbitrary data to return as part of
> matched packets.
>
> Action:
>
> - **RTE_ETH_FDIR_ACCEPT**: receive (accept) packet on a given queue.
> - **RTE_ETH_FDIR_REJECT**: drop packet immediately.
> - **RTE_ETH_FDIR_PASSTHRU**: similar to accept for the last filter in list,
> otherwise process it with subsequent filters.
> - For accepted packets and if requested by filter, either 32 bits of
> arbitrary data and four bytes of matched payload (only in case of flex
> bytes matching), or eight bytes of matched payload (flex also) are added
> to meta data.
>
> .. raw:: pdf
>
> PageBreak
>
> ``HASH``
> ~~~~~~~~
>
> Not an actual filter type. Provides and retrieves the global device
> configuration (per port or entire NIC) for hash functions and their
> properties.
>
> Hash function selection: "default" (keep current), XOR or Toeplitz.
>
> This function can be configured per flow type (**RTE_ETH_FLOW_**
> definitions), supported types are:
>
> - Unknown.
> - Raw.
> - Fragmented or non-fragmented IPv4.
> - Non-fragmented IPv4 with L4 (TCP, UDP, SCTP or other).
> - Fragmented or non-fragmented IPv6.
> - Non-fragmented IPv6 with L4 (TCP, UDP, SCTP or other).
> - L2 payload.
> - IPv6 with extensions.
> - IPv6 with L4 (TCP, UDP) and extensions.
>
> ``L2_TUNNEL``
> ~~~~~~~~~~~~~
>
> Matching:
>
> - All packets received on a given port.
>
> Action:
>
> - Add tunnel encapsulation (VXLAN, GENEVE, TEREDO, NVGRE, IP over GRE,
> 802.1BR E-Tag) using the provided Ethertype and tunnel ID (only E-Tag
> is implemented at the moment).
> - VF ID to use for tag insertion (currently unused).
> - Destination pool for tag based forwarding (pools are IDs that can be
> affected to ports, duplication occurs if the same ID is shared by several
> ports of the same NIC).
>
> .. raw:: pdf
>
> PageBreak
>
> Driver support
> --------------
>
> ======== ======= ========= ======== === ====== ====== ==== ==== =========
> Driver   MACVLAN ETHERTYPE FLEXIBLE SYN NTUPLE TUNNEL FDIR HASH L2_TUNNEL
> ======== ======= ========= ======== === ====== ====== ==== ==== =========
> bnx2x
> cxgbe
> e1000            yes       yes      yes yes
> ena
> enic                                                  yes
> fm10k
> i40e     yes     yes                           yes    yes  yes
> ixgbe            yes                yes yes           yes       yes
> mlx4
> mlx5                                                  yes
> szedata2
> ======== ======= ========= ======== === ====== ====== ==== ==== =========
>
> Flow director
> -------------
>
> Flow director (FDIR) is the name of the most capable filter type, which
> covers most features offered by others. As such, it is the most widespread
> in PMDs that support filtering (i.e. all of them besides **e1000**).
>
> It is also the only type that allows an arbitrary 32-bit value provided by
> applications to be attached to a filter and returned with matching packets
> instead of relying on the destination queue to recognize flows.
>
> Unfortunately, even FDIR requires applications to be aware of low-level
> capabilities and limitations (most of which come directly from **ixgbe** and
> **i40e**):
>
> - Bitmasks are set globally per device (port?), not per filter.
[Sugesh] Does this mean an application cannot define filters that match on arbitrary
different offsets? If that's the case, I assume the application has to program the
bitmask in advance. Otherwise, how does the API framework deduce this bitmask
information from the rules? It's not very clear to me how an application passes down
the bitmask information for multiple filters on the same port.
> - Configuration state is not expected to be saved by the driver, and
> stopping/restarting a port requires the application to perform it again
> (API documentation is also unclear about this).
> - Monolithic approach with ABI issues as soon as a new kind of flow or
> combination needs to be supported.
> - Cryptic global statistics/counters.
> - Unclear about how priorities are managed; filters seem to be arranged as a
> linked list in hardware (possibly related to configuration order).
>
> Packet alteration
> -----------------
>
> One interesting feature is that the L2 tunnel filter type implements the
> ability to alter incoming packets through a filter (in this case to
> encapsulate them), thus the **mlx5** flow encap/decap features are not a
> foreign concept.
>
> .. raw:: pdf
>
> PageBreak
>
> Proposed API
> ============
>
> Terminology
> -----------
>
> - **Filtering API**: overall framework affecting the fate of selected
> packets, covers everything described in this document.
> - **Matching pattern**: properties to look for in received packets, a
> combination of any number of items.
> - **Pattern item**: part of a pattern that either matches packet data
> (protocol header, payload or derived information), or specifies properties
> of the pattern itself.
> - **Actions**: what needs to be done when a packet matches a pattern.
> - **Flow rule**: this is the result of combining a *matching pattern* with
> *actions*.
> - **Filter rule**: a less generic term than *flow rule*, can otherwise be
> used interchangeably.
> - **Hit**: a flow rule is said to be *hit* when processing a matching
> packet.
>
> Requirements
> ------------
>
> As described in the previous section, there is a growing need for a common
> method to configure filtering and related actions in a hardware independent
> fashion.
>
> The filtering API should not disallow any filter combination by design and
> must remain as simple as possible to use. It can simply be defined as a
> method to perform one or several actions on selected packets.
>
> PMDs are aware of the capabilities of the device they manage and should be
> responsible for preventing unsupported or conflicting combinations.
>
> This approach is fundamentally different as it places most of the burden on
> the software side of the PMD instead of having device capabilities directly
> mapped to API functions, then expecting applications to work around ensuing
> compatibility issues.
>
> Requirements for a new API:
>
> - Flexible and extensible without causing API/ABI problems for existing
> applications.
> - Should be unambiguous and easy to use.
> - Support existing filtering features and actions listed in `Filter types`_.
> - Support packet alteration.
> - In case of overlapping filters, their priority should be well documented.
> - Support filter queries (for example to retrieve counters).
>
> .. raw:: pdf
>
> PageBreak
>
> High level design
> -----------------
>
> The chosen approach to make filtering as generic as possible is by
> expressing matching patterns through lists of items instead of the flat
> structures used in DPDK today, enabling combinations that are not
> predefined and thus being more versatile.
>
> Flow rules can have several distinct actions (such as counting,
> encapsulating, decapsulating before redirecting packets to a particular
> queue, etc.), instead of relying on several rules to achieve this and having
> applications deal with hardware implementation details regarding their
> order.
>
> Support for different priority levels on a rule basis is provided, for
> example in order to force a more specific rule to come before a more generic
> one for packets matched by both; however hardware support for more than a
> single priority level cannot be guaranteed. When supported, the number of
> available priority levels is usually low, which is why they can also be
> implemented in software by PMDs (e.g. to simulate missing priority levels by
> reordering rules).
>
> In order to remain as hardware agnostic as possible, by default all rules
> are considered to have the same priority, which means that the order between
> overlapping rules (when a packet is matched by several filters) is
> undefined; packet duplication may even occur as a result.
>
> PMDs may refuse to create overlapping rules at a given priority level when
> they can be detected (e.g. if a pattern matches an existing filter).
>
> Thus predictable results for a given priority level can only be achieved
> with non-overlapping rules, using perfect matching on all protocol layers.
>
> Support for multiple actions per rule may be implemented internally on top
> of non-default hardware priorities, as a result both features may not be
> simultaneously available to applications.
>
> Considering that allowed pattern/actions combinations cannot be known in
> advance and would result in an impractically large number of capabilities to
> expose, a method is provided to validate a given rule from the current
> device configuration state without actually adding it (akin to a "dry run"
> mode).
>
> This enables applications to check if the rule types they need are supported
> at initialization time, before starting their data path. This method can be
> used anytime, its only requirement being that the resources needed by a rule
> must exist (e.g. a target RX queue must be configured first).
>
> Each defined rule is associated with an opaque handle managed by the PMD;
> applications are responsible for keeping it. These handles can be used for
> queries and rule management, such as retrieving counters or other data and
> destroying them.
>
> Handles must be destroyed before releasing associated resources such as
> queues.
>
> Integration
> -----------
>
> To avoid ABI breakage, this new interface will be implemented through the
> existing filtering control framework (``rte_eth_dev_filter_ctrl()``) using
> **RTE_ETH_FILTER_GENERIC** as a new filter type.
>
> However a public front-end API described in `Rules management`_ will
> be added as the preferred method to use it.
>
> Once discussions with the community have converged to a definite API, legacy
> filter types should be deprecated and a deadline defined to remove their
> support entirely.
>
> PMDs will have to be gradually converted to **RTE_ETH_FILTER_GENERIC** or
> drop filtering support entirely. Less maintained PMDs for older hardware may
> lose support at this point.
>
> The notion of filter type will then be deprecated and subsequently dropped
> to avoid confusion between both frameworks.
>
> Implementation details
> ======================
>
> Flow rule
> ---------
>
> A flow rule is the combination of a matching pattern with a list of actions,
> and is the basis of this API.
>
> Priorities
> ~~~~~~~~~~
>
> A priority can be assigned to a matching pattern.
>
> The default priority level is 0 and is also the highest. Support for more
> than a single priority level in hardware is not guaranteed.
>
> If a packet is matched by several filters at a given priority level, the
> outcome is undefined. It can take any path and can even be duplicated.
>
> Matching pattern
> ~~~~~~~~~~~~~~~~
>
> A matching pattern comprises any number of items of various types.
>
> Items are arranged in a list to form a matching pattern for packets. They
> fall into two categories:
>
> - Protocol matching (ANY, RAW, ETH, IPV4, IPV6, ICMP, UDP, TCP, VXLAN and
> so on), usually associated with a specification structure. These must be
> stacked in the same order as the protocol layers to match, starting from
> L2.
>
> - Affecting how the pattern is processed (END, VOID, INVERT, PF, VF,
> SIGNATURE and so on), often without a specification structure. Since they
> are meta data that does not match packet contents, these can be specified
> anywhere within item lists without affecting the protocol matching items.
>
> Most item specifications can be optionally paired with a mask to narrow the
> specific fields or bits to be matched.
>
> - Items are defined with ``struct rte_flow_item``.
> - Patterns are defined with ``struct rte_flow_pattern``.
>
> Example of an item specification matching an Ethernet header:
>
> +-----------------------------------------+
> | Ethernet |
> +==========+=========+====================+
> | ``spec`` | ``src`` | ``00:01:02:03:04`` |
> | +---------+--------------------+
> | | ``dst`` | ``00:2a:66:00:01`` |
> +----------+---------+--------------------+
> | ``mask`` | ``src`` | ``00:ff:ff:ff:00`` |
> | +---------+--------------------+
> | | ``dst`` | ``00:00:00:00:ff`` |
> +----------+---------+--------------------+
>
> Non-masked bits stand for any value; Ethernet headers with the following
> properties are thus matched:
>
> - ``src``: ``??:01:02:03:??``
> - ``dst``: ``??:??:??:??:01``
>
> Except for meta types that do not need one, ``spec`` must be a valid pointer
> to a structure of the related item type. A ``mask`` of the same type can be
> provided to tell which bits in ``spec`` are to be matched.
>
> A mask is normally only needed for ``spec`` fields matching packet data,
> ignored otherwise. See individual item types for more information.
>
> A ``NULL`` mask pointer is allowed and is similar to matching with a full
> mask (all ones) on the ``spec`` fields supported by hardware; the remaining
> fields are ignored (all zeroes), there is thus no error checking for
> unsupported fields.
>
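> In C, this example could be written as follows. This is only a sketch based
> on the draft ``rte_flow.h``: it assumes ``struct rte_flow_item`` carries
> ``type``, ``spec`` and ``mask`` pointers, that ``struct rte_flow_item_eth``
> stores addresses as six ``addr_bytes`` (the five-byte addresses above are
> padded with a trailing zero) and that item types are named
> ``RTE_FLOW_ITEM_TYPE_*``; final definitions may differ::
>
>  /* Values to match (assumed draft definitions, for illustration only). */
>  static const struct rte_flow_item_eth eth_spec = {
>      .src.addr_bytes = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x00 },
>      .dst.addr_bytes = { 0x00, 0x2a, 0x66, 0x00, 0x01, 0x00 },
>  };
>
>  /* Bits of the above values to take into account. */
>  static const struct rte_flow_item_eth eth_mask = {
>      .src.addr_bytes = { 0x00, 0xff, 0xff, 0xff, 0x00, 0x00 },
>      .dst.addr_bytes = { 0x00, 0x00, 0x00, 0x00, 0xff, 0x00 },
>  };
>
>  static const struct rte_flow_item eth_item = {
>      .type = RTE_FLOW_ITEM_TYPE_ETH,
>      .spec = &eth_spec,
>      .mask = &eth_mask,
>  };
>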
> Matching pattern items for packet data must be naturally stacked (ordered
> from lowest to highest protocol layer), as in the following examples:
>
> +--------------+
> | TCPv4 as L4 |
> +===+==========+
> | 0 | Ethernet |
> +---+----------+
> | 1 | IPv4 |
> +---+----------+
> | 2 | TCP |
> +---+----------+
>
> +----------------+
> | TCPv6 in VXLAN |
> +===+============+
> | 0 | Ethernet |
> +---+------------+
> | 1 | IPv4 |
> +---+------------+
> | 2 | UDP |
> +---+------------+
> | 3 | VXLAN |
> +---+------------+
> | 4 | Ethernet |
> +---+------------+
> | 5 | IPv6 |
> +---+------------+
> | 6 | TCP |
> +---+------------+
>
> +-----------------------------+
> | TCPv4 as L4 with meta items |
> +===+=========================+
> | 0 | VOID |
> +---+-------------------------+
> | 1 | Ethernet |
> +---+-------------------------+
> | 2 | VOID |
> +---+-------------------------+
> | 3 | IPv4 |
> +---+-------------------------+
> | 4 | TCP |
> +---+-------------------------+
> | 5 | VOID |
> +---+-------------------------+
> | 6 | VOID |
> +---+-------------------------+
>
> The above example shows how meta items do not affect packet data matching
> items, as long as those remain stacked properly. The resulting matching
> pattern is identical to "TCPv4 as L4".
>
> +----------------+
> | UDPv6 anywhere |
> +===+============+
> | 0 | IPv6 |
> +---+------------+
> | 1 | UDP |
> +---+------------+
>
> If supported by the PMD, omitting one or several protocol layers at the
> bottom of the stack as in the above example (missing an Ethernet
> specification) enables hardware to look anywhere in packets.
>
> It is unspecified whether the payload of supported encapsulations
> (e.g. VXLAN inner packet) is matched by such a pattern, which may apply to
> inner, outer or both packets.
>
> +---------------------+
> | Invalid, missing L3 |
> +===+=================+
> | 0 | Ethernet |
> +---+-----------------+
> | 1 | UDP |
> +---+-----------------+
>
> The above pattern is invalid due to a missing L3 specification between L2
> and L4. Omitting protocol layers is only allowed at the bottom and at the
> top of the stack.
>
> Meta item types
> ~~~~~~~~~~~~~~~
>
> These do not match packet data but affect how the pattern is processed;
> most of them do not need a specification structure. This particularity
> allows them to be specified anywhere without affecting other item types.
>
> ``END``
> ^^^^^^^
>
> End marker for item lists. Prevents further processing of items, thereby
> ending the pattern.
>
> - Its numeric value is **0** for convenience.
> - PMD support is mandatory.
> - Both ``spec`` and ``mask`` are ignored.
>
> +--------------------+
> | END |
> +==========+=========+
> | ``spec`` | ignored |
> +----------+---------+
> | ``mask`` | ignored |
> +----------+---------+
>
> ``VOID``
> ^^^^^^^^
>
> Used as a placeholder for convenience. It is ignored and simply discarded by
> PMDs.
>
> - PMD support is mandatory.
> - Both ``spec`` and ``mask`` are ignored.
>
> +--------------------+
> | VOID |
> +==========+=========+
> | ``spec`` | ignored |
> +----------+---------+
> | ``mask`` | ignored |
> +----------+---------+
>
> One usage example for this type is generating rules that share a common
> prefix quickly without reallocating memory, only by updating item types:
>
> +------------------------+
> | TCP, UDP or ICMP as L4 |
> +===+====================+
> | 0 | Ethernet |
> +---+--------------------+
> | 1 | IPv4 |
> +---+------+------+------+
> | 2 | UDP | VOID | VOID |
> +---+------+------+------+
> | 3 | VOID | TCP | VOID |
> +---+------+------+------+
> | 4 | VOID | VOID | ICMP |
> +---+------+------+------+
>
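> In C, such a shared prefix could be reused by rewriting only the relevant
> ``type`` fields between rule creations, e.g. (a sketch under the same
> assumptions as the previous example)::
>
>  struct rte_flow_item items[] = {
>      { .type = RTE_FLOW_ITEM_TYPE_ETH },
>      { .type = RTE_FLOW_ITEM_TYPE_IPV4 },
>      { .type = RTE_FLOW_ITEM_TYPE_UDP },  /* L4 slot */
>      { .type = RTE_FLOW_ITEM_TYPE_VOID },
>      { .type = RTE_FLOW_ITEM_TYPE_VOID },
>      { .type = RTE_FLOW_ITEM_TYPE_END },
>  };
>
>  /* ... create the UDP rule, then switch to TCP without reallocating. */
>  items[2].type = RTE_FLOW_ITEM_TYPE_VOID;
>  items[3].type = RTE_FLOW_ITEM_TYPE_TCP;
>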
> .. raw:: pdf
>
> PageBreak
>
> ``INVERT``
> ^^^^^^^^^^
>
> Inverted matching, i.e. process packets that do not match the pattern.
>
> - Both ``spec`` and ``mask`` are ignored.
>
> +--------------------+
> | INVERT |
> +==========+=========+
> | ``spec`` | ignored |
> +----------+---------+
> | ``mask`` | ignored |
> +----------+---------+
>
> Usage example in order to match non-TCPv4 packets only:
>
> +--------------------+
> | Anything but TCPv4 |
> +===+================+
> | 0 | INVERT |
> +---+----------------+
> | 1 | Ethernet |
> +---+----------------+
> | 2 | IPv4 |
> +---+----------------+
> | 3 | TCP |
> +---+----------------+
>
> ``PF``
> ^^^^^^
>
> Matches packets addressed to the physical function of the device.
>
> - Both ``spec`` and ``mask`` are ignored.
>
> +--------------------+
> | PF |
> +==========+=========+
> | ``spec`` | ignored |
> +----------+---------+
> | ``mask`` | ignored |
> +----------+---------+
>
> ``VF``
> ^^^^^^
>
> Matches packets addressed to the given virtual function ID of the device.
>
> - Only ``spec`` needs to be defined, ``mask`` is ignored.
>
> +----------------------------------------+
> | VF |
> +==========+=========+===================+
> | ``spec`` | ``vf`` | destination VF ID |
> +----------+---------+-------------------+
> | ``mask`` | ignored |
> +----------+-----------------------------+
>
> ``SIGNATURE``
> ^^^^^^^^^^^^^
>
> Requests hash-based signature dispatching for this rule.
>
> Considering this is a global setting on devices that support it, all
> subsequent filter rules may have to be created with it as well.
>
> - Only ``spec`` needs to be defined, ``mask`` is ignored.
>
> +--------------------+
> | SIGNATURE |
> +==========+=========+
> | ``spec`` | TBD |
> +----------+---------+
> | ``mask`` | ignored |
> +----------+---------+
>
> .. raw:: pdf
>
> PageBreak
>
> Data matching item types
> ~~~~~~~~~~~~~~~~~~~~~~~~
>
> Most of these are basically protocol header definitions with associated
> bitmasks. They must be specified (stacked) from lowest to highest protocol
> layer.
>
> The following list is not exhaustive as new protocols will be added in the
> future.
>
> ``ANY``
> ^^^^^^^
>
> Matches any protocol in place of the current layer; a single ANY may also
> stand for several protocol layers.
>
> This is usually specified as the first pattern item when looking for a
> protocol anywhere in a packet.
>
> - A maximum value of **0** requests matching any number of protocol layers
> above or equal to the minimum value; a maximum value lower than the
> minimum one is otherwise invalid.
> - Only ``spec`` needs to be defined, ``mask`` is ignored.
>
> +-----------------------------------------------------------------------+
> | ANY                                                                   |
> +==========+=========+==================================================+
> | ``spec`` | ``min`` | minimum number of layers covered                 |
> |          +---------+--------------------------------------------------+
> |          | ``max`` | maximum number of layers covered, 0 for infinity |
> +----------+---------+--------------------------------------------------+
> | ``mask`` | ignored                                                    |
> +----------+------------------------------------------------------------+
>
> Example for VXLAN TCP payload matching regardless of outer L3 (IPv4 or
> IPv6) and L4 (UDP) both matched by the first ANY specification, and inner
> L3 (IPv4 or IPv6) matched by the second ANY specification:
>
> +----------------------------------+
> | TCP in VXLAN with wildcards |
> +===+==============================+
> | 0 | Ethernet |
> +---+-----+----------+---------+---+
> | 1 | ANY | ``spec`` | ``min`` | 2 |
> | | | +---------+---+
> | | | | ``max`` | 2 |
> +---+-----+----------+---------+---+
> | 2 | VXLAN |
> +---+------------------------------+
> | 3 | Ethernet |
> +---+-----+----------+---------+---+
> | 4 | ANY | ``spec`` | ``min`` | 1 |
> | | | +---------+---+
> | | | | ``max`` | 1 |
> +---+-----+----------+---------+---+
> | 5 | TCP |
> +---+------------------------------+
>
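> The two ANY specifications used above could be written as follows (a
> sketch, assuming ``struct rte_flow_item_any`` exposes the ``min`` and
> ``max`` fields from the previous table)::
>
>  static const struct rte_flow_item_any outer_any = { .min = 2, .max = 2 };
>  static const struct rte_flow_item_any inner_any = { .min = 1, .max = 1 };
>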
> .. raw:: pdf
>
> PageBreak
>
> ``RAW``
> ^^^^^^^
>
> Matches a string of a given length at a given offset (in bytes), or anywhere
> in the payload of the current protocol layer (including L2 header if used as
> the first item in the stack).
>
> This does not increment the protocol layer count as it is not a protocol
> definition. Subsequent RAW items modulate the first absolute one with
> relative offsets.
>
> - Using **-1** as the ``offset`` of the first RAW item makes its absolute
> offset not fixed, i.e. the pattern is searched everywhere.
> - ``mask`` only affects the pattern.
>
> +--------------------------------------------------------------+
> | RAW |
> +==========+=============+=====================================+
> | ``spec`` | ``offset`` | absolute or relative pattern offset |
> | +-------------+-------------------------------------+
> | | ``length`` | pattern length |
> | +-------------+-------------------------------------+
> | | ``pattern`` | byte string of the above length |
> +----------+-------------+-------------------------------------+
> | ``mask`` | ``offset`` | ignored |
> | +-------------+-------------------------------------+
> | | ``length`` | ignored |
> | +-------------+-------------------------------------+
> | | ``pattern`` | bitmask with the same byte length |
> +----------+-------------+-------------------------------------+
>
> Example pattern looking for several strings at various offsets of a UDP
> payload, using combined RAW items:
>
> +------------------------------------------+
> | UDP payload matching |
> +===+======================================+
> | 0 | Ethernet |
> +---+--------------------------------------+
> | 1 | IPv4 |
> +---+--------------------------------------+
> | 2 | UDP |
> +---+-----+----------+-------------+-------+
> | 3 | RAW | ``spec`` | ``offset`` | -1 |
> | | | +-------------+-------+
> | | | | ``length`` | 3 |
> | | | +-------------+-------+
> | | | | ``pattern`` | "foo" |
> +---+-----+----------+-------------+-------+
> | 4 | RAW | ``spec`` | ``offset`` | 20 |
> | | | +-------------+-------+
> | | | | ``length`` | 3 |
> | | | +-------------+-------+
> | | | | ``pattern`` | "bar" |
> +---+-----+----------+-------------+-------+
> | 5 | RAW | ``spec`` | ``offset`` | -30 |
> | | | +-------------+-------+
> | | | | ``length`` | 3 |
> | | | +-------------+-------+
> | | | | ``pattern`` | "baz" |
> +---+-----+----------+-------------+-------+
>
> This translates to:
>
> - Locate "foo" in UDP payload, remember its offset.
> - Check "bar" at "foo"'s offset plus 20 bytes.
> - Check "baz" at "foo"'s offset minus 30 bytes.
>
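> The first two RAW items of this pattern might look as follows (a sketch;
> field names are taken from the table above and ``pattern`` is assumed to be
> a byte pointer)::
>
>  static const struct rte_flow_item_raw raw_foo = {
>      .offset = -1, /* search anywhere in the payload */
>      .length = 3,
>      .pattern = (const uint8_t *)"foo",
>  };
>
>  static const struct rte_flow_item_raw raw_bar = {
>      .offset = 20, /* relative to the location of "foo" */
>      .length = 3,
>      .pattern = (const uint8_t *)"bar",
>  };
>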
> .. raw:: pdf
>
> PageBreak
>
> ``ETH``
> ^^^^^^^
>
> Matches an Ethernet header.
>
> - ``dst``: destination MAC.
> - ``src``: source MAC.
> - ``type``: EtherType.
> - ``tags``: number of 802.1Q/ad tags defined.
> - ``tag[]``: 802.1Q/ad tag definitions, innermost first. For each one:
>
> - ``tpid``: Tag protocol identifier.
> - ``tci``: Tag control information.
>
> ``IPV4``
> ^^^^^^^^
>
> Matches an IPv4 header.
>
> - ``src``: source IP address.
> - ``dst``: destination IP address.
> - ``tos``: ToS/DSCP field.
> - ``ttl``: TTL field.
> - ``proto``: protocol number for the next layer.
>
> ``IPV6``
> ^^^^^^^^
>
> Matches an IPv6 header.
>
> - ``src``: source IP address.
> - ``dst``: destination IP address.
> - ``tc``: traffic class field.
> - ``nh``: Next header field (protocol).
> - ``hop_limit``: hop limit field (TTL).
>
> ``ICMP``
> ^^^^^^^^
>
> Matches an ICMP header.
>
> - TBD.
>
> ``UDP``
> ^^^^^^^
>
> Matches a UDP header.
>
> - ``sport``: source port.
> - ``dport``: destination port.
> - ``length``: UDP length.
> - ``checksum``: UDP checksum.
>
> .. raw:: pdf
>
> PageBreak
>
> ``TCP``
> ^^^^^^^
>
> Matches a TCP header.
>
> - ``sport``: source port.
> - ``dport``: destination port.
> - All other TCP fields and bits.
>
> ``VXLAN``
> ^^^^^^^^^
>
> Matches a VXLAN header.
>
> - TBD.
>
> .. raw:: pdf
>
> PageBreak
>
> Actions
> ~~~~~~~
>
> Each possible action is represented by a type. Some have associated
> configuration structures. Several actions combined in a list can be
> assigned to a flow rule. That list is not ordered.
>
> At least one action must be defined in a filter rule in order to do
> something with matched packets.
>
> - Actions are defined with ``struct rte_flow_action``.
> - A list of actions is defined with ``struct rte_flow_actions``.
>
> They fall in three categories:
>
> - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
> processing matched packets by subsequent flow rules, unless overridden
> with PASSTHRU.
>
> - Non terminating actions (PASSTHRU, DUP) that leave matched packets up for
> additional processing by subsequent flow rules.
>
> - Other non terminating meta actions that do not affect the fate of packets
> (END, VOID, ID, COUNT).
>
> When several actions are combined in a flow rule, they should all have
> different types (e.g. dropping a packet twice is not possible). However,
> since the VOID type is an exception to this rule, the defined behavior is
> for PMDs to only take into account the last action of a given type found in
> the list. PMDs still perform error checking on the entire list.
>
> *Note that PASSTHRU is the only action able to override a terminating rule.*
>
> .. raw:: pdf
>
> PageBreak
>
> Example of an action that redirects packets to queue index 10:
>
> +----------------+
> | QUEUE |
> +===========+====+
> | ``queue`` | 10 |
> +-----------+----+
>
> Examples of action lists follow; their order is not significant, and
> applications must consider all actions to be performed simultaneously:
>
> +----------------+
> | Count and drop |
> +=======+========+
> | COUNT | |
> +-------+--------+
> | DROP | |
> +-------+--------+
>
> +--------------------------+
> | Tag, count and redirect |
> +=======+===========+======+
> | ID | ``id`` | 0x2a |
> +-------+-----------+------+
> | COUNT | |
> +-------+-----------+------+
> | QUEUE | ``queue`` | 10 |
> +-------+-----------+------+
>
> +-----------------------+
> | Redirect to queue 5 |
> +=======+===============+
> | DROP | |
> +-------+-----------+---+
> | QUEUE | ``queue`` | 5 |
> +-------+-----------+---+
>
> In the above example, considering both actions are performed simultaneously,
> the end result is that only QUEUE has any effect.
>
> +-----------------------+
> | Redirect to queue 3 |
> +=======+===========+===+
> | QUEUE | ``queue`` | 5 |
> +-------+-----------+---+
> | VOID | |
> +-------+-----------+---+
> | QUEUE | ``queue`` | 3 |
> +-------+-----------+---+
>
> As previously described, only the last action of a given type found in the
> list is taken into account. The above example also shows that VOID is
> ignored.
>
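> The "tag, count and redirect" list above could be built as follows (a
> sketch, assuming actions carry a ``type`` and an optional ``conf`` pointer,
> with ``RTE_FLOW_ACTION_TYPE_*`` naming and per-action configuration
> structures such as ``struct rte_flow_action_queue``; exact definitions may
> differ)::
>
>  static const struct rte_flow_action_id id = { .id = 0x2a };
>  static const struct rte_flow_action_queue queue = { .queue = 10 };
>
>  static const struct rte_flow_action action_list[] = {
>      { .type = RTE_FLOW_ACTION_TYPE_ID, .conf = &id },
>      { .type = RTE_FLOW_ACTION_TYPE_COUNT },
>      { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
>      { .type = RTE_FLOW_ACTION_TYPE_END },
>  };
>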
> .. raw:: pdf
>
> PageBreak
>
> Action types
> ~~~~~~~~~~~~
>
> Common action types are described in this section. Like pattern item types,
> this list is not exhaustive as new actions will be added in the future.
>
> ``END`` (action)
> ^^^^^^^^^^^^^^^^
>
> End marker for action lists. Prevents further processing of actions, thereby
> ending the list.
>
> - Its numeric value is **0** for convenience.
> - PMD support is mandatory.
> - No configurable property.
>
> +---------------+
> | END |
> +===============+
> | no properties |
> +---------------+
>
> ``VOID`` (action)
> ^^^^^^^^^^^^^^^^^
>
> Used as a placeholder for convenience. It is ignored and simply discarded by
> PMDs.
>
> - PMD support is mandatory.
> - No configurable property.
>
> +---------------+
> | VOID |
> +===============+
> | no properties |
> +---------------+
>
> ``PASSTHRU``
> ^^^^^^^^^^^^
>
> Leaves packets up for additional processing by subsequent flow rules. This
> is the default when a rule does not contain a terminating action, but can be
> specified to force a rule to become non-terminating.
>
> - No configurable property.
>
> +---------------+
> | PASSTHRU |
> +===============+
> | no properties |
> +---------------+
>
> Example to copy a packet to a queue and continue processing by subsequent
> flow rules:
[Sugesh] If a packet gets copied to a queue, that is a terminating action.
How is it possible to perform subsequent actions after the packet has
already been moved to the queue? How does this differ from the DUP action?
Am I missing anything here?
>
> +--------------------------+
> | Copy to queue 8 |
> +==========+===============+
> | PASSTHRU | |
> +----------+-----------+---+
> | QUEUE | ``queue`` | 8 |
> +----------+-----------+---+
>
> ``ID``
> ^^^^^^
>
> Attaches a 32-bit value to packets.
>
> +----------------------------------------------+
> | ID |
> +========+=====================================+
> | ``id`` | 32-bit value to return with packets |
> +--------+-------------------------------------+
>
[Sugesh] I assume the application has to program the flow
with a unique ID and matching packets are stamped with this ID
when reported to the software. The uniqueness of the ID is NOT
guaranteed by the API framework. Correct me if I am wrong here.
[Sugesh] Is it a limitation to use only a 32-bit ID? Is it possible to have a
64-bit ID, so that the application can use the control plane flow pointer
itself as an ID? Does it make sense?
> .. raw:: pdf
>
> PageBreak
>
> ``QUEUE``
> ^^^^^^^^^
>
> Assigns packets to a given queue index.
>
> - Terminating by default.
>
> +--------------------------------+
> | QUEUE |
> +===========+====================+
> | ``queue`` | queue index to use |
> +-----------+--------------------+
>
> ``DROP``
> ^^^^^^^^
>
> Drop packets.
>
> - No configurable property.
> - Terminating by default.
> - PASSTHRU overrides this action if both are specified.
>
> +---------------+
> | DROP |
> +===============+
> | no properties |
> +---------------+
>
> ``COUNT``
> ^^^^^^^^^
>
[Sugesh] Should we really have to set the count action explicitly for every rule?
IMHO it would be great for it to be an implicit action, as most applications would
be interested in the stats of almost all the filters/flows.
> Enables a hits counter for this rule.
>
> This counter can be retrieved and reset through ``rte_flow_query()``, see
> ``struct rte_flow_query_count``.
>
> - Counters can be retrieved with ``rte_flow_query()``.
> - No configurable property.
>
> +---------------+
> | COUNT |
> +===============+
> | no properties |
> +---------------+
>
> Query structure to retrieve and reset the flow rule hits counter:
>
> +------------------------------------------------+
> | COUNT query |
> +===========+=====+==============================+
> | ``reset`` | in | reset counter after query |
> +-----------+-----+------------------------------+
> | ``hits`` | out | number of hits for this flow |
> +-----------+-----+------------------------------+
>
> ``DUP``
> ^^^^^^^
>
> Duplicates packets to a given queue index.
>
> This is normally combined with QUEUE, however when used alone, it is
> actually similar to QUEUE + PASSTHRU.
>
> - Non-terminating by default.
>
> +------------------------------------------------+
> | DUP |
> +===========+====================================+
> | ``queue`` | queue index to duplicate packet to |
> +-----------+------------------------------------+
>
> .. raw:: pdf
>
> PageBreak
>
> ``RSS``
> ^^^^^^^
>
> Similar to QUEUE, except RSS is additionally performed on packets to spread
> them among several queues according to the provided parameters.
>
> - Terminating by default.
>
> +---------------------------------------------+
> | RSS |
> +==============+==============================+
> | ``rss_conf`` | RSS parameters |
> +--------------+------------------------------+
> | ``queues`` | number of entries in queue[] |
> +--------------+------------------------------+
> | ``queue[]`` | queue indices to use |
> +--------------+------------------------------+
>
> ``PF`` (action)
> ^^^^^^^^^^^^^^^
>
> Redirects packets to the physical function (PF) of the current device.
>
> - No configurable property.
> - Terminating by default.
>
> +---------------+
> | PF |
> +===============+
> | no properties |
> +---------------+
>
> ``VF`` (action)
> ^^^^^^^^^^^^^^^
>
> Redirects packets to the virtual function (VF) of the current device with
> the specified ID.
>
> - Terminating by default.
>
> +---------------------------------------+
> | VF |
> +========+==============================+
> | ``id`` | VF ID to redirect packets to |
> +--------+------------------------------+
>
> Planned types
> ~~~~~~~~~~~~~
>
> Other action types are planned but not defined yet. These actions will add
> the ability to alter matching packets in several ways, such as performing
> encapsulation/decapsulation of tunnel headers on specific flows.
>
> .. raw:: pdf
>
> PageBreak
>
> Rules management
> ----------------
>
> A simple API with only four functions is provided to fully manage flows.
>
> Each created flow rule is associated with an opaque, PMD-specific handle
> pointer. The application is responsible for keeping it until the rule is
> destroyed.
>
> Flow rules are defined with ``struct rte_flow``.
>
> Validation
> ~~~~~~~~~~
>
> Given that expressing a definite set of device capabilities with this API is
> not practical, a dedicated function is provided to check if a flow rule is
> supported and can be created.
>
> ::
>
>  int
>  rte_flow_validate(uint8_t port_id,
>                    const struct rte_flow_pattern *pattern,
>                    const struct rte_flow_actions *actions);
>
> While this function has no effect on the target device, the flow rule is
> validated against its current configuration state and the returned value
> should be considered valid by the caller for that state only.
>
> The returned value is guaranteed to remain valid only as long as no
> successful calls to rte_flow_create() or rte_flow_destroy() are made in the
> meantime and no device parameters affecting flow rules are modified in any
> way, due to possible collisions or resource limitations (although in
> such cases ``EINVAL`` should not be returned).
>
> Arguments:
>
> - ``port_id``: port identifier of Ethernet device.
> - ``pattern``: pattern specification to check.
> - ``actions``: actions associated with the flow definition.
>
> Return value:
>
> - **0** if flow rule is valid and can be created. A negative errno value
> otherwise (``rte_errno`` is also set); the following errors are defined.
> - ``-EINVAL``: unknown or invalid rule specification.
> - ``-ENOTSUP``: valid but unsupported rule specification (e.g. partial masks
> are unsupported).
> - ``-EEXIST``: collision with an existing rule.
> - ``-ENOMEM``: not enough resources.
>
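> For example, an application could check at initialization time whether its
> most demanding rule type is usable (a sketch; ``pattern`` and ``actions``
> are assumed to have been filled in as described in previous sections, and
> ``use_fallback_rule`` is a hypothetical application flag)::
>
>  int err = rte_flow_validate(port_id, &pattern, &actions);
>
>  if (err == -ENOTSUP)
>      use_fallback_rule = 1; /* valid rule, unsupported by this device */
>  else if (err)
>      rte_exit(EXIT_FAILURE, "flow rule cannot be created\n");
>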
> .. raw:: pdf
>
> PageBreak
>
> Creation
> ~~~~~~~~
>
> Creating a flow rule is similar to validating one, except the rule is
> actually created.
>
> ::
>
>  struct rte_flow *
>  rte_flow_create(uint8_t port_id,
>                  const struct rte_flow_pattern *pattern,
>                  const struct rte_flow_actions *actions);
>
> Arguments:
>
> - ``port_id``: port identifier of Ethernet device.
> - ``pattern``: pattern specification to add.
> - ``actions``: actions associated with the flow definition.
>
> Return value:
>
> A valid flow pointer in case of success, NULL otherwise and ``rte_errno`` is
> set to the positive version of one of the error codes defined for
> ``rte_flow_validate()``.
[Sugesh] Kind of an implementation-specific query: what if the application
tries to add duplicate rules? Does the API create a new flow entry for every
call?
[Sugesh] Another concern is the cost and time of installing these rules
in the hardware. Can we make these APIs time bound (or at least provide an
option to set a time limit for executing them), so that the application does
not have to wait so long when installing and deleting flows on slow
hardware/NICs? What do you think? Most datapath flow installations are
dynamic and triggered only when there is ingress traffic. Delays in flow
insertion/deletion have unpredictable consequences.
[Sugesh] Another query is on the synchronization part. What if the same rules
are handled from different threads? Is the application responsible for
handling the concurrent hardware programming?
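> A minimal creation sketch, assuming ``struct rte_flow_pattern`` wraps an
> item list with a priority and ``struct rte_flow_actions`` wraps an action
> list as in the draft header (exact field names may differ), with the
> ``items`` array and ``action_list`` built as in previous examples::
>
>  const struct rte_flow_pattern pattern = {
>      .priority = 0,
>      .items = items,
>  };
>  const struct rte_flow_actions actions = {
>      .actions = action_list,
>  };
>  struct rte_flow *flow;
>
>  flow = rte_flow_create(port_id, &pattern, &actions);
>  if (flow == NULL)
>      rte_exit(EXIT_FAILURE, "flow creation failed: %s\n",
>               strerror(rte_errno));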
>
> Destruction
> ~~~~~~~~~~~
>
> Flow rule destruction is not automatic, and a queue should not be released
> if any are still attached to it. Applications must take care of performing
> this step before releasing resources.
>
> ::
>
>  int
>  rte_flow_destroy(uint8_t port_id,
>                   struct rte_flow *flow);
>
>
[Sugesh] I would suggest that having a clean-up API would be really useful, as
releasing a queue (is it applicable to releasing a port too?) does not guarantee
automatic flow destruction. This way the application can initialize the port,
clean up all the existing rules and create new rules on a clean slate.
> Failure to destroy a flow rule may occur when other flow rules depend on it,
> and destroying it would result in an inconsistent state.
>
> This function is only guaranteed to succeed if flow rules are destroyed in
> reverse order of their creation.
>
> Arguments:
>
> - ``port_id``: port identifier of Ethernet device.
> - ``flow``: flow rule to destroy.
>
> Return value:
>
> - **0** on success, a negative errno value otherwise and ``rte_errno`` is
> set.
>
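> Since destruction is only guaranteed to succeed in reverse creation order,
> applications may store handles in creation order and release them backwards
> (a sketch; ``flows[]`` and ``count`` are hypothetical application storage)::
>
>  while (count--)
>      if (rte_flow_destroy(port_id, flows[count]))
>          rte_exit(EXIT_FAILURE, "cannot destroy flow rule\n");
>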
> .. raw:: pdf
>
> PageBreak
>
> Query
> ~~~~~
>
> Query an existing flow rule.
>
> This function allows retrieving flow-specific data such as counters. Data
> is gathered by special actions which must be present in the flow rule
> definition.
>
> ::
>
>  int
>  rte_flow_query(uint8_t port_id,
>                 struct rte_flow *flow,
>                 enum rte_flow_action_type action,
>                 void *data);
>
> Arguments:
>
> - ``port_id``: port identifier of Ethernet device.
> - ``flow``: flow rule to query.
> - ``action``: action type to query.
> - ``data``: pointer to storage for the associated query data type.
>
> Return value:
>
> - **0** on success, a negative errno value otherwise and ``rte_errno`` is
> set.
>
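> Retrieving and resetting the hit counter of a rule created with a `COUNT`_
> action could look as follows (a sketch, assuming the ``reset`` and ``hits``
> fields listed earlier, with ``hits`` narrow enough for a cast to unsigned
> long long)::
>
>  struct rte_flow_query_count count = { .reset = 1 };
>
>  if (!rte_flow_query(port_id, flow, RTE_FLOW_ACTION_TYPE_COUNT, &count))
>      printf("hits: %llu\n", (unsigned long long)count.hits);
>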
> .. raw:: pdf
>
> PageBreak
>
> Behavior
> --------
>
> - API operations are synchronous and blocking (``EAGAIN`` cannot be
> returned).
>
> - There is no provision for reentrancy/multi-thread safety, although nothing
> should prevent different devices from being configured at the same
> time. PMDs may protect their control path functions accordingly.
>
> - Stopping the data path (TX/RX) should not be necessary when managing flow
> rules. If this cannot be achieved naturally or with workarounds (such as
> temporarily replacing the burst function pointers), an appropriate error
> code must be returned (``EBUSY``).
>
> - PMDs, not applications, are responsible for maintaining flow rules
> configuration when stopping and restarting a port or performing other
> actions which may affect them. They can only be destroyed explicitly.
>
> .. raw:: pdf
>
> PageBreak
>
[Sugesh] What about querying all the rules for a specific port/queue? This would
be useful when adding and deleting ports and queues dynamically according to
need. I am not sure what the other use cases for these APIs are, but I feel it
would make it much easier to manage flows from the application. What do you think?
> Compatibility
> -------------
>
> No known hardware implementation supports all the features described in
> this document.
>
> Unsupported features or combinations are not expected to be fully emulated
> in software by PMDs for performance reasons. Partially supported features
> may be completed in software as long as hardware performs most of the work
> (such as queue redirection and packet recognition).
>
> However PMDs are expected to do their best to satisfy application requests
> by working around hardware limitations as long as doing so does not affect
> the behavior of existing flow rules.
>
> The following sections provide a few examples of such cases, they are based
> on limitations built into the previous APIs.
>
> Global bitmasks
> ~~~~~~~~~~~~~~~
>
> Each flow rule comes with its own, per-layer bitmasks, while hardware may
> support only a single, device-wide bitmask for a given layer type, so that
> two IPv4 rules cannot use different bitmasks.
>
> The expected behavior in this case is that PMDs automatically configure
> global bitmasks according to the needs of the first created flow rule.
>
> Subsequent rules are allowed only if their bitmasks match those, the
> ``EEXIST`` error code should be returned otherwise.
>
> Unsupported layer types
> ~~~~~~~~~~~~~~~~~~~~~~~
>
> Many protocols can be simulated by crafting patterns with the `RAW`_ type.
>
> PMDs can rely on this capability to simulate support for protocols with
> fixed headers not directly recognized by hardware.
>
> ``ANY`` pattern item
> ~~~~~~~~~~~~~~~~~~~~
>
> This pattern item stands for anything, which can be difficult to translate
> to something hardware would understand, particularly if followed by more
> specific types.
>
> Consider the following pattern:
>
> +---+--------------------------------+
> | 0 | ETHER |
> +---+--------------------------------+
> | 1 | ANY (``min`` = 1, ``max`` = 1) |
> +---+--------------------------------+
> | 2 | TCP |
> +---+--------------------------------+
>
> Knowing that TCP does not make sense with something other than IPv4 and
> IPv6 as L3, such a pattern may be translated to two flow rules instead:
>
> +---+--------------------+
> | 0 | ETHER |
> +---+--------------------+
> | 1 | IPV4 (zeroed mask) |
> +---+--------------------+
> | 2 | TCP |
> +---+--------------------+
>
> +---+--------------------+
> | 0 | ETHER |
> +---+--------------------+
> | 1 | IPV6 (zeroed mask) |
> +---+--------------------+
> | 2 | TCP |
> +---+--------------------+
>
> Note that as soon as an ANY rule covers several layers, this approach may
> yield a large number of hidden flow rules. It is thus suggested to only
> support the most common scenarios (anything as L2 and/or L3).
>
> .. raw:: pdf
>
> PageBreak
>
> Unsupported actions
> ~~~~~~~~~~~~~~~~~~~
>
> - When combined with a `QUEUE`_ action, packet counting (`COUNT`_) and
> tagging (`ID`_) may be implemented in software as long as the target queue
> is used by a single rule.
>
> - A rule specifying both `DUP`_ + `QUEUE`_ may be translated to two hidden
> rules combining `QUEUE`_ and `PASSTHRU`_.
>
> - When a single target queue is provided, `RSS`_ can also be implemented
> through `QUEUE`_.
>
> Flow rules priority
> ~~~~~~~~~~~~~~~~~~~
>
> While it would naturally make sense, flow rules cannot be assumed to be
> processed by hardware in the same order as their creation for several
> reasons:
>
> - They may be managed internally as a tree or a hash table instead of a
> list.
> - Removing a flow rule before adding another one can either put the new rule
> at the end of the list or reuse a freed entry.
> - Duplication may occur when packets are matched by several rules.
>
> For overlapping rules (particularly in order to use the `PASSTHRU`_ action)
> predictable behavior is only guaranteed by using different priority levels.
>
> Priority levels are not necessarily implemented in hardware, or may be
> severely limited (e.g. a single priority bit).
>
> For these reasons, priority levels may be implemented purely in software by
> PMDs.
>
> - For devices expecting flow rules to be added in the correct order, PMDs
> may destroy and re-create existing rules after adding a new one with
> a higher priority.
>
> - A configurable number of dummy or empty rules can be created at
> initialization time to save high priority slots for later.
>
> - In order to save priority levels, PMDs may evaluate whether rules are
> likely to collide and adjust their priority accordingly.
>
> .. raw:: pdf
>
> PageBreak
>
> API migration
> =============
>
> Exhaustive list of deprecated filter types and how to convert them to
> generic flow rules.
>
> ``MACVLAN`` to ``ETH`` → ``VF``, ``PF``
> ---------------------------------------
>
> `MACVLAN`_ can be translated to a basic `ETH`_ flow rule with a `VF
> (action)`_ or `PF (action)`_ terminating action.
>
> +------------------------------------+
> | MACVLAN |
> +--------------------------+---------+
> | Pattern | Actions |
> +===+=====+==========+=====+=========+
> | 0 | ETH | ``spec`` | any | VF, |
> | | +----------+-----+ PF |
> | | | ``mask`` | any | |
> +---+-----+----------+-----+---------+
>
> ``ETHERTYPE`` to ``ETH`` → ``QUEUE``, ``DROP``
> ----------------------------------------------
>
> `ETHERTYPE`_ is basically an `ETH`_ flow rule with `QUEUE`_ or `DROP`_ as
> a terminating action.
>
> +------------------------------------+
> | ETHERTYPE |
> +--------------------------+---------+
> | Pattern | Actions |
> +===+=====+==========+=====+=========+
> | 0 | ETH | ``spec`` | any | QUEUE, |
> | | +----------+-----+ DROP |
> | | | ``mask`` | any | |
> +---+-----+----------+-----+---------+
>
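> For instance, an ETHERTYPE filter redirecting IPv4 packets (EtherType
> 0x0800) to queue 1 could migrate to the following flow rule (a sketch under
> the same assumptions as the examples of previous sections)::
>
>  static const struct rte_flow_item_eth eth_spec = { .type = 0x0800 };
>  static const struct rte_flow_item_eth eth_mask = { .type = 0xffff };
>
>  static const struct rte_flow_item items[] = {
>      { .type = RTE_FLOW_ITEM_TYPE_ETH,
>        .spec = &eth_spec, .mask = &eth_mask },
>      { .type = RTE_FLOW_ITEM_TYPE_END },
>  };
>
>  static const struct rte_flow_action_queue queue = { .queue = 1 };
>
>  static const struct rte_flow_action actions[] = {
>      { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
>      { .type = RTE_FLOW_ACTION_TYPE_END },
>  };
>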
> ``FLEXIBLE`` to ``RAW`` → ``QUEUE``
> -----------------------------------
>
> `FLEXIBLE`_ can be translated to one `RAW`_ pattern with `QUEUE`_ as the
> terminating action and a defined priority level.
>
> +------------------------------------+
> | FLEXIBLE |
> +--------------------------+---------+
> | Pattern | Actions |
> +===+=====+==========+=====+=========+
> | 0 | RAW | ``spec`` | any | QUEUE |
> | | +----------+-----+ |
> | | | ``mask`` | any | |
> +---+-----+----------+-----+---------+
>
> ``SYN`` to ``TCP`` → ``QUEUE``
> ------------------------------
>
> `SYN`_ is a `TCP`_ rule with only the ``syn`` bit enabled and masked, and
> `QUEUE`_ as the terminating action.
>
> Priority level can be set to simulate the high priority bit.
>
> +---------------------------------------------+
> | SYN |
> +-----------------------------------+---------+
> | Pattern | Actions |
> +===+======+==========+=============+=========+
> | 0 | ETH | ``spec`` | N/A | QUEUE |
> | | +----------+-------------+ |
> | | | ``mask`` | empty | |
> +---+------+----------+-------------+ |
> | 1 | IPV4 | ``spec`` | N/A | |
> | | +----------+-------------+ |
> | | | ``mask`` | empty | |
> +---+------+----------+-------------+ |
> | 2 | TCP | ``spec`` | ``syn`` = 1 | |
> | | +----------+-------------+ |
> | | | ``mask`` | ``syn`` = 1 | |
> +---+------+----------+-------------+---------+
>
> ``NTUPLE`` to ``IPV4``, ``TCP``, ``UDP`` → ``QUEUE``
> ----------------------------------------------------
>
> `NTUPLE`_ is similar to specifying an empty L2, `IPV4`_ as L3 with `TCP`_ or
> `UDP`_ as L4 and `QUEUE`_ as the terminating action.
>
> A priority level can be specified as well.
>
> +---------------------------------------+
> | NTUPLE |
> +-----------------------------+---------+
> | Pattern | Actions |
> +===+======+==========+=======+=========+
> | 0 | ETH | ``spec`` | N/A | QUEUE |
> | | +----------+-------+ |
> | | | ``mask`` | empty | |
> +---+------+----------+-------+ |
> | 1 | IPV4 | ``spec`` | any | |
> | | +----------+-------+ |
> | | | ``mask`` | any | |
> +---+------+----------+-------+ |
> | 2 | TCP, | ``spec`` | any | |
> | | UDP +----------+-------+ |
> | | | ``mask`` | any | |
> +---+------+----------+-------+---------+
>
> ``TUNNEL`` to ``ETH``, ``IPV4``, ``IPV6``, ``VXLAN`` (or other) → ``QUEUE``
> ---------------------------------------------------------------------------
>
> `TUNNEL`_ matches common IPv4 and IPv6 L3/L4-based tunnel types.
>
> In the following table, `ANY`_ is used to cover the optional L4.
>
> +------------------------------------------------+
> | TUNNEL |
> +--------------------------------------+---------+
> | Pattern | Actions |
> +===+=========+==========+=============+=========+
> | 0 | ETH | ``spec`` | any | QUEUE |
> | | +----------+-------------+ |
> | | | ``mask`` | any | |
> +---+---------+----------+-------------+ |
> | 1 | IPV4, | ``spec`` | any | |
> | | IPV6 +----------+-------------+ |
> | | | ``mask`` | any | |
> +---+---------+----------+-------------+ |
> | 2 | ANY | ``spec`` | ``min`` = 0 | |
> | | | +-------------+ |
> | | | | ``max`` = 0 | |
> | | +----------+-------------+ |
> | | | ``mask`` | N/A | |
> +---+---------+----------+-------------+ |
> | 3 | VXLAN, | ``spec`` | any | |
> | | GENEVE, +----------+-------------+ |
> | | TEREDO, | ``mask`` | any | |
> | | NVGRE, | | | |
> | | GRE, | | | |
> | | ... | | | |
> +---+---------+----------+-------------+---------+
>
> .. raw:: pdf
>
> PageBreak
>
> ``FDIR`` to most item types → ``QUEUE``, ``DROP``, ``PASSTHRU``
> ---------------------------------------------------------------
>
> `FDIR`_ is more complex than any other type, there are several methods to
> emulate its functionality. It is summarized for the most part in the table
> below.
>
> A few features are intentionally not supported:
>
> - The ability to configure the matching input set and masks for the entire
> device; PMDs should take care of it automatically according to flow rules.
>
> - Returning four or eight bytes of matched data when using flex bytes
> filtering. Although a specific action could implement it, it conflicts
> with the much more useful 32 bits tagging on devices that support it.
>
> - Side effects on RSS processing of the entire device. Flow rules that
> conflict with the current device configuration should not be
> allowed. Similarly, device configuration should not be allowed when it
> affects existing flow rules.
>
> - Device modes of operation. "none" is unsupported since filtering cannot be
> disabled as long as a flow rule is present.
>
> - "MAC VLAN" or "tunnel" perfect matching modes should be automatically
> set
> according to the created flow rules.
>
> +----------------------------------------------+
> | FDIR |
> +---------------------------------+------------+
> | Pattern | Actions |
> +===+============+==========+=====+============+
> | 0 | ETH, | ``spec`` | any | QUEUE, |
> | | RAW +----------+-----+ DROP, |
> | | | ``mask`` | any | PASSTHRU |
> +---+------------+----------+-----+------------+
> | 1 | IPV4, | ``spec`` | any | ID |
> | | IPV6 +----------+-----+ (optional) |
> | | | ``mask`` | any | |
> +---+------------+----------+-----+ |
> | 2 | TCP, | ``spec`` | any | |
> | | UDP, +----------+-----+ |
> | | SCTP | ``mask`` | any | |
> +---+------------+----------+-----+ |
> | 3 | VF, | ``spec`` | any | |
> | | PF, +----------+-----+ |
> | | SIGNATURE | ``mask`` | any | |
> | | (optional) | | | |
> +---+------------+----------+-----+------------+
>
> ``HASH``
> ~~~~~~~~
>
> Hashing configuration is set per rule through the `SIGNATURE`_ item.
>
> Since it is usually a global device setting, all flow rules created with
> this item may have to share the same specification.
>
> ``L2_TUNNEL`` to ``VOID`` → ``VXLAN`` (or others)
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> All packets are matched. This type alters incoming packets to encapsulate
> them in a chosen tunnel type, optionally redirecting them to a VF as well.
>
> The destination pool for tag based forwarding can be emulated with other
> flow rules using `DUP`_ as the action.
>
> +----------------------------------------+
> | L2_TUNNEL |
> +---------------------------+------------+
> | Pattern | Actions |
> +===+======+==========+=====+============+
> | 0 | VOID | ``spec`` | N/A | VXLAN, |
> | | | | | GENEVE, |
> | | | | | ... |
> | | +----------+-----+------------+
> | | | ``mask`` | N/A | VF |
> | | | | | (optional) |
> +---+------+----------+-----+------------+
>
> --
> Adrien Mazarguil
> 6WIND
^ permalink raw reply [relevance 0%]
* [dpdk-dev] [PATCH v2 10/10] maintainers: add section for pmdinfo
@ 2016-07-08 10:14 4% ` Thomas Monjalon
1 sibling, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-07-08 10:14 UTC (permalink / raw)
To: Neil Horman; +Cc: dev
The author of this feature is Neil Horman.
Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
MAINTAINERS | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..f996c2e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -68,6 +68,10 @@ F: lib/librte_compat/
F: doc/guides/rel_notes/deprecation.rst
F: scripts/validate-abi.sh
+Driver information
+F: buildtools/pmdinfogen/
+F: tools/pmdinfo.py
+
Environment Abstraction Layer
-----------------------------
--
2.7.0
^ permalink raw reply [relevance 4%]
* Re: [dpdk-dev] [RFC] Generic flow director/filtering/classification API
2016-07-05 18:16 2% [dpdk-dev] [RFC] Generic flow director/filtering/classification API Adrien Mazarguil
2016-07-07 7:14 0% ` Lu, Wenzhuo
2016-07-07 23:15 0% ` Chandran, Sugesh
@ 2016-07-08 11:11 0% ` Liang, Cunming
2 siblings, 0 replies; 200+ results
From: Liang, Cunming @ 2016-07-08 11:11 UTC (permalink / raw)
To: dev, Thomas Monjalon, Helin Zhang, Jingjing Wu, Rasesh Mody,
Ajit Khaparde, Rahul Lakkireddy, Wenzhuo Lu, Jan Medala,
John Daley, Jing Chen, Konstantin Ananyev, Matej Vido,
Alejandro Lucero, Sony Chacko, Jerin Jacob, Pablo de Lara,
Olga Shern
Hi Adrien,
On 7/6/2016 2:16 AM, Adrien Mazarguil wrote:
> Hi All,
>
> First, forgive me for this large message, I know our mailboxes already
> suffer quite a bit from the amount of traffic on this ML.
>
> This is not exactly yet another thread about how flow director should be
> extended, rather about a brand new API to handle filtering and
> classification for incoming packets in the most PMD-generic and
> application-friendly fashion we can come up with. Reasons described below.
>
> I think this topic is important enough to include both the users of this API
> as well as PMD maintainers. So far I have CC'ed librte_ether (especially
> rte_eth_ctrl.h contributors), testpmd and PMD maintainers (with and without
> a .filter_ctrl implementation), but if you know application maintainers
> other than testpmd who use FDIR or might be interested in this discussion,
> feel free to add them.
>
> The issues we found with the current approach are already summarized in the
> following document, but here is a quick summary for TL;DR folks:
>
> - PMDs do not expose a common set of filter types and even when they do,
> their behavior more or less differs.
>
> - Applications need to determine and adapt to device-specific limitations
> and quirks on their own, without help from PMDs.
>
> - Writing an application that creates flow rules targeting all devices
> supported by DPDK is thus difficult, if not impossible.
>
> - The current API has too many unspecified areas (particularly regarding
> side effects of flow rules) that make PMD implementation tricky.
>
> This RFC API handles everything currently supported by .filter_ctrl, the
> idea being to reimplement all of these to make them fully usable by
> applications in a more generic and well defined fashion. It has a very small
> set of mandatory features and an easy method to let applications probe for
> supported capabilities.
>
> The only downside is more work for the software control side of PMDs because
> they have to adapt to the API instead of the reverse. I think helpers can be
> added to EAL to assist with this.
>
> HTML version:
>
> https://rawgit.com/6WIND/rte_flow/master/rte_flow.html
>
> PDF version:
>
> https://rawgit.com/6WIND/rte_flow/master/rte_flow.pdf
>
> Related draft header file (for reference while reading the specification):
>
> https://raw.githubusercontent.com/6WIND/rte_flow/master/rte_flow.h
>
> Git tree for completeness (latest .rst version can be retrieved from here):
>
> https://github.com/6WIND/rte_flow
>
> What follows is the ReST source of the above, for inline comments and
> discussion. I intend to update that specification accordingly.
>
> ========================
> Generic filter interface
> ========================
>
> .. footer::
>
> v0.6
>
> .. contents::
> .. sectnum::
> .. raw:: pdf
>
> PageBreak
>
> Overview
> ========
>
> DPDK provides several competing interfaces added over time to perform packet
> matching and related actions such as filtering and classification.
>
> They must be extended to implement the features supported by newer devices
> in order to expose them to applications, however the current design has
> several drawbacks:
>
> - Complicated filter combinations which have not been hard-coded cannot be
> expressed.
> - Prone to API/ABI breakage when new features must be added to an existing
> filter type, which frequently happens.
>
> From an application point of view:
>
> - Having disparate interfaces, all optional and lacking in features does not
> make this API easy to use.
> - Seemingly arbitrary built-in limitations of filter types based on the
> device they were initially designed for.
> - Undefined relationship between different filter types.
> - High complexity, considerable undocumented and/or undefined behavior.
>
> Considering the growing number of devices supported by DPDK, adding a new
> filter type each time a new feature must be implemented is not sustainable
> in the long term. Applications not written to target a specific device
> cannot really benefit from such an API.
>
> For these reasons, this document defines an extensible unified API that
> encompasses and supersedes these legacy filter types.
>
> .. raw:: pdf
>
> PageBreak
>
> Current API
> ===========
>
> Rationale
> ---------
>
> The reason several competing (and mostly overlapping) filtering APIs are
> present in DPDK is due to its nature as a thin layer between hardware and
> software.
>
> Each subsequent interface has been added to better match the capabilities
> and limitations of the latest supported device, which usually happened to
> need an incompatible configuration approach. Because of this, many ended up
> device-centric and not usable by applications that were not written for that
> particular device.
>
> This document is not the first attempt to address this proliferation issue,
> in fact a lot of work has already been done both to create a more generic
> interface while somewhat keeping compatibility with legacy ones through a
> common call interface (``rte_eth_dev_filter_ctrl()`` with the
> ``.filter_ctrl`` PMD callback in ``rte_ethdev.h``).
>
> Today, these previously incompatible interfaces are known as filter types
> (``RTE_ETH_FILTER_*`` from ``enum rte_filter_type`` in ``rte_eth_ctrl.h``).
>
> However while trivial to extend with new types, it only shifted the
> underlying problem as applications still need to be written for one kind of
> filter type, which, as described in the following sections, is not
> necessarily implemented by all PMDs that support filtering.
>
> .. raw:: pdf
>
> PageBreak
>
> Filter types
> ------------
>
> This section summarizes the capabilities of each filter type.
>
> Although the following list is exhaustive, the description of individual
> types may contain inaccuracies due to the lack of documentation or usage
> examples.
>
> Note: names are prefixed with ``RTE_ETH_FILTER_``.
>
> ``MACVLAN``
> ~~~~~~~~~~~
>
> Matching:
>
> - L2 source/destination addresses.
> - Optional 802.1Q VLAN ID.
> - Masking individual fields on a rule basis is not supported.
>
> Action:
>
> - Packets are redirected either to a given VF device using its ID or to the
> PF.
>
> ``ETHERTYPE``
> ~~~~~~~~~~~~~
>
> Matching:
>
> - L2 source/destination addresses (optional).
> - Ethertype (no VLAN ID?).
> - Masking individual fields on a rule basis is not supported.
>
> Action:
>
> - Receive packets on a given queue.
> - Drop packets.
>
> ``FLEXIBLE``
> ~~~~~~~~~~~~
>
> Matching:
>
> - At most 128 consecutive bytes anywhere in packets.
> - Masking is supported with byte granularity.
> - Priorities are supported (relative to this filter type, undefined
> otherwise).
>
> Action:
>
> - Receive packets on a given queue.
>
> ``SYN``
> ~~~~~~~
>
> Matching:
>
> - TCP SYN packets only.
> - One high priority bit can be set to give the highest possible priority to
> this type when other filters with different types are configured.
>
> Action:
>
> - Receive packets on a given queue.
>
> ``NTUPLE``
> ~~~~~~~~~~
>
> Matching:
>
> - Source/destination IPv4 addresses (optional in 2-tuple mode).
> - Source/destination TCP/UDP port (mandatory in 2 and 5-tuple modes).
> - L4 protocol (2 and 5-tuple modes).
> - Masking individual fields is supported.
> - TCP flags.
> - Up to 7 levels of priority relative to this filter type, undefined
> otherwise.
> - No IPv6.
>
> Action:
>
> - Receive packets on a given queue.
>
> ``TUNNEL``
> ~~~~~~~~~~
>
> Matching:
>
> - Outer L2 source/destination addresses.
> - Inner L2 source/destination addresses.
> - Inner VLAN ID.
> - IPv4/IPv6 source (destination?) address.
> - Tunnel type to match (VXLAN, GENEVE, TEREDO, NVGRE, IP over GRE, 802.1BR
> E-Tag).
> - Tenant ID for tunneling protocols that have one.
> - Any combination of the above can be specified.
> - Masking individual fields on a rule basis is not supported.
>
> Action:
>
> - Receive packets on a given queue.
>
> .. raw:: pdf
>
> PageBreak
>
> ``FDIR``
> ~~~~~~~~
>
> Queries:
>
> - Device capabilities and limitations.
> - Device statistics about configured filters (resource usage, collisions).
> - Device configuration (matching input set and masks).
>
> Matching:
>
> - Device mode of operation: none (to disable filtering), signature
> (hash-based dispatching from masked fields) or perfect (either MAC VLAN or
> tunnel).
> - L2 Ethertype.
> - Outer L2 destination address (MAC VLAN mode).
> - Inner L2 destination address, tunnel type (NVGRE, VXLAN) and tunnel ID
> (tunnel mode).
> - IPv4 source/destination addresses, ToS, TTL and protocol fields.
> - IPv6 source/destination addresses, TC, protocol and hop limits fields.
> - UDP source/destination IPv4/IPv6 and ports.
> - TCP source/destination IPv4/IPv6 and ports.
> - SCTP source/destination IPv4/IPv6, ports and verification tag field.
> - Note: only one protocol type at a time (either only L2 Ethertype, basic
> IPv6, IPv4+UDP, IPv4+TCP and so on).
> - VLAN TCI (extended API).
> - At most 16 bytes to match in payload (extended API). A global device
> look-up table specifies for each possible protocol layer (unknown, raw,
> L2, L3, L4) the offset to use for each byte (they do not need to be
> contiguous) and the related bitmask.
> - Whether a packet is addressed to the PF or a VF; in the latter case its
> ID can be matched as well (extended API).
> - Masking most of the above fields is supported, but simultaneously affects
> all filters configured on a device.
> - Input set can be modified in a similar fashion for a given device to
> ignore individual fields of filters (i.e. do not match the destination
> address in an IPv4 filter, refer to **RTE_ETH_INPUT_SET_**
> macros). Configuring this also affects RSS processing on **i40e**.
> - Filters can also provide 32 bits of arbitrary data to return as part of
> matched packets.
>
> Action:
>
> - **RTE_ETH_FDIR_ACCEPT**: receive (accept) packet on a given queue.
> - **RTE_ETH_FDIR_REJECT**: drop packet immediately.
> - **RTE_ETH_FDIR_PASSTHRU**: similar to accept for the last filter in the
> list, otherwise process the packet with subsequent filters.
> - For accepted packets, and if requested by the filter, either 32 bits of
> arbitrary data and four bytes of matched payload (only in case of flex
> bytes matching), or eight bytes of matched payload (flex also) are added
> to meta data.
>
> .. raw:: pdf
>
> PageBreak
>
> ``HASH``
> ~~~~~~~~
>
> Not an actual filter type. Provides and retrieves the global device
> configuration (per port or entire NIC) for hash functions and their
> properties.
>
> Hash function selection: "default" (keep current), XOR or Toeplitz.
>
> This function can be configured per flow type (**RTE_ETH_FLOW_**
> definitions); supported types are:
>
> - Unknown.
> - Raw.
> - Fragmented or non-fragmented IPv4.
> - Non-fragmented IPv4 with L4 (TCP, UDP, SCTP or other).
> - Fragmented or non-fragmented IPv6.
> - Non-fragmented IPv6 with L4 (TCP, UDP, SCTP or other).
> - L2 payload.
> - IPv6 with extensions.
> - IPv6 with L4 (TCP, UDP) and extensions.
>
> ``L2_TUNNEL``
> ~~~~~~~~~~~~~
>
> Matching:
>
> - All packets received on a given port.
>
> Action:
>
> - Add tunnel encapsulation (VXLAN, GENEVE, TEREDO, NVGRE, IP over GRE,
> 802.1BR E-Tag) using the provided Ethertype and tunnel ID (only E-Tag
> is implemented at the moment).
> - VF ID to use for tag insertion (currently unused).
> - Destination pool for tag based forwarding (pools are IDs that can be
> assigned to ports, duplication occurs if the same ID is shared by several
> ports of the same NIC).
>
> .. raw:: pdf
>
> PageBreak
>
> Driver support
> --------------
>
> ======== ======= ========= ======== === ====== ====== ==== ==== =========
> Driver   MACVLAN ETHERTYPE FLEXIBLE SYN NTUPLE TUNNEL FDIR HASH L2_TUNNEL
> ======== ======= ========= ======== === ====== ====== ==== ==== =========
> bnx2x
> cxgbe
> e1000            yes       yes      yes yes
> ena
> enic                                                  yes
> fm10k
> i40e     yes     yes                           yes    yes  yes
> ixgbe            yes                yes yes           yes       yes
> mlx4
> mlx5                                                  yes
> szedata2
> ======== ======= ========= ======== === ====== ====== ==== ==== =========
>
> Flow director
> -------------
>
> Flow director (FDIR) is the name of the most capable filter type, which
> covers most features offered by others. As such, it is the most widespread
> in PMDs that support filtering (i.e. all of them besides **e1000**).
>
> It is also the only type that allows an arbitrary 32 bits value provided by
> applications to be attached to a filter and returned with matching packets
> instead of relying on the destination queue to recognize flows.
>
> Unfortunately, even FDIR requires applications to be aware of low-level
> capabilities and limitations (most of which come directly from **ixgbe** and
> **i40e**):
>
> - Bitmasks are set globally per device (port?), not per filter.
> - Configuration state is not expected to be saved by the driver, and
> stopping/restarting a port requires the application to perform it again
> (API documentation is also unclear about this).
> - Monolithic approach with ABI issues as soon as a new kind of flow or
> combination needs to be supported.
> - Cryptic global statistics/counters.
> - Unclear about how priorities are managed; filters seem to be arranged as a
> linked list in hardware (possibly related to configuration order).
>
> Packet alteration
> -----------------
>
> One interesting feature is that the L2 tunnel filter type implements the
> ability to alter incoming packets through a filter (in this case to
> encapsulate them), thus the **mlx5** flow encap/decap features are not a
> foreign concept.
>
> .. raw:: pdf
>
> PageBreak
>
> Proposed API
> ============
>
> Terminology
> -----------
>
> - **Filtering API**: overall framework affecting the fate of selected
> packets, covers everything described in this document.
> - **Matching pattern**: properties to look for in received packets, a
> combination of any number of items.
> - **Pattern item**: part of a pattern that either matches packet data
> (protocol header, payload or derived information), or specifies properties
> of the pattern itself.
> - **Actions**: what needs to be done when a packet matches a pattern.
> - **Flow rule**: this is the result of combining a *matching pattern* with
> *actions*.
> - **Filter rule**: a less generic term than *flow rule*, can otherwise be
> used interchangeably.
> - **Hit**: a flow rule is said to be *hit* when processing a matching
> packet.
>
> Requirements
> ------------
>
> As described in the previous section, there is a growing need for a common
> method to configure filtering and related actions in a hardware independent
> fashion.
>
> The filtering API should not disallow any filter combination by design and
> must remain as simple as possible to use. It can simply be defined as a
> method to perform one or several actions on selected packets.
>
> PMDs are aware of the capabilities of the device they manage and should be
> responsible for preventing unsupported or conflicting combinations.
>
> This approach is fundamentally different as it places most of the burden on
> the software side of the PMD instead of having device capabilities directly
> mapped to API functions, then expecting applications to work around ensuing
> compatibility issues.
>
> Requirements for a new API:
>
> - Flexible and extensible without causing API/ABI problems for existing
> applications.
> - Should be unambiguous and easy to use.
> - Support existing filtering features and actions listed in `Filter types`_.
> - Support packet alteration.
> - In case of overlapping filters, their priority should be well documented.
> - Support filter queries (for example to retrieve counters).
>
> .. raw:: pdf
>
> PageBreak
>
> High level design
> -----------------
>
> The chosen approach to make filtering as generic as possible is to express
> matching patterns through lists of items instead of the flat structures used
> in DPDK today, enabling combinations that are not predefined and thus more
> versatile.
>
> Flow rules can have several distinct actions (such as counting,
> encapsulating, decapsulating before redirecting packets to a particular
> queue, etc.), instead of relying on several rules to achieve this and having
> applications deal with hardware implementation details regarding their
> order.
>
> Support for different priority levels on a rule basis is provided, for
> example in order to force a more specific rule to come before a more generic
> one for packets matched by both; however, hardware support for more than a
> single priority level cannot be guaranteed. When supported, the number of
> available priority levels is usually low, which is why they can also be
> implemented in software by PMDs (e.g. to simulate missing priority levels by
> reordering rules).
>
> In order to remain as hardware agnostic as possible, by default all rules
> are considered to have the same priority, which means that the order between
> overlapping rules (when a packet is matched by several filters) is
> undefined; packet duplication may even occur as a result.
>
> PMDs may refuse to create overlapping rules at a given priority level when
> they can be detected (e.g. if a pattern matches an existing filter).
>
> Thus predictable results for a given priority level can only be achieved
> with non-overlapping rules, using perfect matching on all protocol layers.
>
> Support for multiple actions per rule may be implemented internally on top
> of non-default hardware priorities, as a result both features may not be
> simultaneously available to applications.
>
> Considering that allowed pattern/actions combinations cannot be known in
> advance and would result in an impractically large number of capabilities to
> expose, a method is provided to validate a given rule from the current
> device configuration state without actually adding it (akin to a "dry run"
> mode).
>
> This enables applications to check if the rule types they need are supported
> at initialization time, before starting their data path. This method can be
> used anytime, its only requirement being that the resources needed by a rule
> must exist (e.g. a target RX queue must be configured first).
>
> Each defined rule is associated with an opaque handle managed by the PMD;
> applications are responsible for keeping it. These handles can be used for
> queries and rule management, such as retrieving counters or other data and
> destroying rules.
>
> Handles must be destroyed before releasing associated resources such as
> queues.
>
> Integration
> -----------
>
> To avoid ABI breakage, this new interface will be implemented through the
> existing filtering control framework (``rte_eth_dev_filter_ctrl()``) using
> **RTE_ETH_FILTER_GENERIC** as a new filter type.
>
> However a public front-end API described in `Rules management`_ will
> be added as the preferred method to use it.
>
> Once discussions with the community have converged to a definite API, legacy
> filter types should be deprecated and a deadline defined to remove their
> support entirely.
>
> PMDs will have to be gradually converted to **RTE_ETH_FILTER_GENERIC** or
> drop filtering support entirely. Less maintained PMDs for older hardware may
> lose support at this point.
>
> The notion of filter type will then be deprecated and subsequently dropped
> to avoid confusion between both frameworks.
>
> Implementation details
> ======================
>
> Flow rule
> ---------
>
> A flow rule is the combination of a matching pattern with a list of actions,
> and is the basis of this API.
>
> Priorities
> ~~~~~~~~~~
>
> A priority can be assigned to a matching pattern.
>
> The default priority level is 0 and is also the highest. Support for more
> than a single priority level in hardware is not guaranteed.
>
> If a packet is matched by several filters at a given priority level, the
> outcome is undefined. It can take any path and can even be duplicated.
>
> Matching pattern
> ~~~~~~~~~~~~~~~~
>
> A matching pattern comprises any number of items of various types.
>
> Items are arranged in a list to form a matching pattern for packets. They
> fall in two categories:
>
> - Protocol matching (ANY, RAW, ETH, IPV4, IPV6, ICMP, UDP, TCP, VXLAN and so
> on), usually associated with a specification structure. These must be
> stacked in the same order as the protocol layers to match, starting from
> L2.
>
> - Affecting how the pattern is processed (END, VOID, INVERT, PF, VF,
> SIGNATURE and so on), often without a specification structure. Since they
> are meta data that does not match packet contents, these can be specified
> anywhere within item lists without affecting the protocol matching items.
>
> Most item specifications can be optionally paired with a mask to narrow the
> specific fields or bits to be matched.
>
> - Items are defined with ``struct rte_flow_item``.
> - Patterns are defined with ``struct rte_flow_pattern``.
>
> Example of an item specification matching an Ethernet header:
>
> +-----------------------------------------+
> | Ethernet |
> +==========+=========+====================+
> | ``spec`` | ``src`` | ``00:01:02:03:04`` |
> | +---------+--------------------+
> | | ``dst`` | ``00:2a:66:00:01`` |
> +----------+---------+--------------------+
> | ``mask`` | ``src`` | ``00:ff:ff:ff:00`` |
> | +---------+--------------------+
> | | ``dst`` | ``00:00:00:00:ff`` |
> +----------+---------+--------------------+
>
> Non-masked bits stand for any value; Ethernet headers with the following
> properties are thus matched:
>
> - ``src``: ``??:01:02:03:??``
> - ``dst``: ``??:??:??:??:01``
>
> Except for meta types that do not need one, ``spec`` must be a valid pointer
> to a structure of the related item type. A ``mask`` of the same type can be
> provided to tell which bits in ``spec`` are to be matched.
>
> A mask is normally only needed for ``spec`` fields matching packet data,
> ignored otherwise. See individual item types for more information.
>
> A ``NULL`` mask pointer is allowed and is similar to matching with a full
> mask (all ones) on the ``spec`` fields supported by hardware; the remaining
> fields are ignored (all zeroes), thus there is no error checking for
> unsupported fields.
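>
> As an illustration only, the Ethernet matching example above could be
> expressed in C as follows, assuming a hypothetical layout for
> ``struct rte_flow_item_eth`` based on the ``dst`` and ``src`` fields
> described in `ETH`_; none of these definitions are final, and full 6-byte
> addresses are used here::
>
>  /* Hypothetical layout, for illustration purposes only. */
>  struct rte_flow_item_eth {
>          uint8_t dst[6]; /* destination MAC */
>          uint8_t src[6]; /* source MAC */
>  };
>
>  /* Match src ??:01:02:03:??:?? and dst ??:??:??:??:??:01. */
>  static const struct rte_flow_item_eth eth_spec = {
>          .src = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x00 },
>          .dst = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
>  };
>  static const struct rte_flow_item_eth eth_mask = {
>          .src = { 0x00, 0xff, 0xff, 0xff, 0x00, 0x00 },
>          .dst = { 0x00, 0x00, 0x00, 0x00, 0x00, 0xff },
>  };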
>
> Matching pattern items for packet data must be naturally stacked (ordered
> from lowest to highest protocol layer), as in the following examples:
>
> +--------------+
> | TCPv4 as L4 |
> +===+==========+
> | 0 | Ethernet |
> +---+----------+
> | 1 | IPv4 |
> +---+----------+
> | 2 | TCP |
> +---+----------+
>
> +----------------+
> | TCPv6 in VXLAN |
> +===+============+
> | 0 | Ethernet |
> +---+------------+
> | 1 | IPv4 |
> +---+------------+
> | 2 | UDP |
> +---+------------+
> | 3 | VXLAN |
> +---+------------+
> | 4 | Ethernet |
> +---+------------+
> | 5 | IPv6 |
> +---+------------+
> | 6 | TCP |
> +---+------------+
>
> +-----------------------------+
> | TCPv4 as L4 with meta items |
> +===+=========================+
> | 0 | VOID |
> +---+-------------------------+
> | 1 | Ethernet |
> +---+-------------------------+
> | 2 | VOID |
> +---+-------------------------+
> | 3 | IPv4 |
> +---+-------------------------+
> | 4 | TCP |
> +---+-------------------------+
> | 5 | VOID |
> +---+-------------------------+
> | 6 | VOID |
> +---+-------------------------+
>
> The above example shows how meta items do not affect packet data matching
> items, as long as those remain stacked properly. The resulting matching
> pattern is identical to "TCPv4 as L4".
>
> +----------------+
> | UDPv6 anywhere |
> +===+============+
> | 0 | IPv6 |
> +---+------------+
> | 1 | UDP |
> +---+------------+
>
> If supported by the PMD, omitting one or several protocol layers at the
> bottom of the stack as in the above example (missing an Ethernet
> specification) enables hardware to look anywhere in packets.
>
> It is unspecified whether the payload of supported encapsulations
> (e.g. VXLAN inner packet) is matched by such a pattern, which may apply to
> inner, outer or both packets.
>
> +---------------------+
> | Invalid, missing L3 |
> +===+=================+
> | 0 | Ethernet |
> +---+-----------------+
> | 1 | UDP |
> +---+-----------------+
>
> The above pattern is invalid due to a missing L3 specification between L2
> and L4. Omitting layers is only allowed at the bottom and at the top of the
> stack.
>
> Meta item types
> ~~~~~~~~~~~~~~~
>
> These do not match packet data but affect how the pattern is processed, most
> of them do not need a specification structure. This particularity allows
> them to be specified anywhere without affecting other item types.
[LC] For the meta items (END, VOID, INVERT) and some data matching types
like ANY and RAW, is it entirely the PMD's responsibility to understand
the key characteristics and to parse the header graph?
>
> ``END``
> ^^^^^^^
>
> End marker for item lists. Prevents further processing of items, thereby
> ending the pattern.
>
> - Its numeric value is **0** for convenience.
> - PMD support is mandatory.
> - Both ``spec`` and ``mask`` are ignored.
>
> +--------------------+
> | END |
> +==========+=========+
> | ``spec`` | ignored |
> +----------+---------+
> | ``mask`` | ignored |
> +----------+---------+
>
> ``VOID``
> ^^^^^^^^
>
> Used as a placeholder for convenience. It is ignored and simply discarded by
> PMDs.
>
> - PMD support is mandatory.
> - Both ``spec`` and ``mask`` are ignored.
>
> +--------------------+
> | VOID |
> +==========+=========+
> | ``spec`` | ignored |
> +----------+---------+
> | ``mask`` | ignored |
> +----------+---------+
>
> One usage example for this type is generating rules that share a common
> prefix quickly, without reallocating memory, by updating item types only:
>
> +------------------------+
> | TCP, UDP or ICMP as L4 |
> +===+====================+
> | 0 | Ethernet |
> +---+--------------------+
> | 1 | IPv4 |
> +---+------+------+------+
> | 2 | UDP | VOID | VOID |
> +---+------+------+------+
> | 3 | VOID | TCP | VOID |
> +---+------+------+------+
> | 4 | VOID | VOID | ICMP |
> +---+------+------+------+
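>
> As a sketch, assuming a hypothetical ``struct rte_flow_item`` holding an
> item ``type`` and matching enumerator names (none of which are final),
> the three variants above could share the same pattern memory::
>
>  /* Common prefix: ETH / IPV4, followed by three interchangeable slots. */
>  struct rte_flow_item pattern[] = {
>          { .type = RTE_FLOW_ITEM_TYPE_ETH },
>          { .type = RTE_FLOW_ITEM_TYPE_IPV4 },
>          { .type = RTE_FLOW_ITEM_TYPE_UDP },  /* slot 2 */
>          { .type = RTE_FLOW_ITEM_TYPE_VOID }, /* slot 3 */
>          { .type = RTE_FLOW_ITEM_TYPE_VOID }, /* slot 4 */
>          { .type = RTE_FLOW_ITEM_TYPE_END },
>  };
>
>  /* Switch to the TCP variant without reallocating anything. */
>  pattern[2].type = RTE_FLOW_ITEM_TYPE_VOID;
>  pattern[3].type = RTE_FLOW_ITEM_TYPE_TCP;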
>
> .. raw:: pdf
>
> PageBreak
>
> ``INVERT``
> ^^^^^^^^^^
>
> Inverted matching, i.e. process packets that do not match the pattern.
>
> - Both ``spec`` and ``mask`` are ignored.
>
> +--------------------+
> | INVERT |
> +==========+=========+
> | ``spec`` | ignored |
> +----------+---------+
> | ``mask`` | ignored |
> +----------+---------+
>
> Usage example in order to match non-TCPv4 packets only:
>
> +--------------------+
> | Anything but TCPv4 |
> +===+================+
> | 0 | INVERT |
> +---+----------------+
> | 1 | Ethernet |
> +---+----------------+
> | 2 | IPv4 |
> +---+----------------+
> | 3 | TCP |
> +---+----------------+
>
> ``PF``
> ^^^^^^
>
> Matches packets addressed to the physical function of the device.
>
> - Both ``spec`` and ``mask`` are ignored.
>
> +--------------------+
> | PF |
> +==========+=========+
> | ``spec`` | ignored |
> +----------+---------+
> | ``mask`` | ignored |
> +----------+---------+
>
> ``VF``
> ^^^^^^
>
> Matches packets addressed to the given virtual function ID of the device.
>
> - Only ``spec`` needs to be defined, ``mask`` is ignored.
>
> +----------------------------------------+
> | VF |
> +==========+=========+===================+
> | ``spec`` | ``vf`` | destination VF ID |
> +----------+---------+-------------------+
> | ``mask`` | ignored |
> +----------+-----------------------------+
>
> ``SIGNATURE``
> ^^^^^^^^^^^^^
>
> Requests hash-based signature dispatching for this rule.
>
> Considering this is a global setting on devices that support it, all
> subsequent filter rules may have to be created with it as well.
>
> - Only ``spec`` needs to be defined, ``mask`` is ignored.
>
> +--------------------+
> | SIGNATURE |
> +==========+=========+
> | ``spec`` | TBD |
> +----------+---------+
> | ``mask`` | ignored |
> +----------+---------+
>
> .. raw:: pdf
>
> PageBreak
>
> Data matching item types
> ~~~~~~~~~~~~~~~~~~~~~~~~
>
> Most of these are basically protocol header definitions with associated
> bitmasks. They must be specified (stacked) from lowest to highest protocol
> layer.
>
> The following list is not exhaustive as new protocols will be added in the
> future.
>
> ``ANY``
> ^^^^^^^
>
> Matches any protocol in place of the current layer; a single ANY may also
> stand for several protocol layers.
>
> This is usually specified as the first pattern item when looking for a
> protocol anywhere in a packet.
>
> - A maximum value of **0** requests matching any number of protocol layers
> above or equal to the minimum value; a maximum value lower than the
> minimum one is otherwise invalid.
> - Only ``spec`` needs to be defined, ``mask`` is ignored.
>
> +-----------------------------------------------------------------------+
> | ANY |
> +==========+=========+==================================================+
> | ``spec`` | ``min`` | minimum number of layers covered |
> | +---------+--------------------------------------------------+
> | | ``max`` | maximum number of layers covered, 0 for infinity |
> +----------+---------+--------------------------------------------------+
> | ``mask`` | ignored |
> +----------+------------------------------------------------------------+
>
> Example for VXLAN TCP payload matching regardless of outer L3 (IPv4 or IPv6)
> and L4 (UDP) both matched by the first ANY specification, and inner L3 (IPv4
> or IPv6) matched by the second ANY specification:
>
> +----------------------------------+
> | TCP in VXLAN with wildcards |
> +===+==============================+
> | 0 | Ethernet |
> +---+-----+----------+---------+---+
> | 1 | ANY | ``spec`` | ``min`` | 2 |
> | | | +---------+---+
> | | | | ``max`` | 2 |
> +---+-----+----------+---------+---+
> | 2 | VXLAN |
> +---+------------------------------+
> | 3 | Ethernet |
> +---+-----+----------+---------+---+
> | 4 | ANY | ``spec`` | ``min`` | 1 |
> | | | +---------+---+
> | | | | ``max`` | 1 |
> +---+-----+----------+---------+---+
> | 5 | TCP |
> +---+------------------------------+
>
> .. raw:: pdf
>
> PageBreak
>
> ``RAW``
> ^^^^^^^
>
> Matches a string of a given length at a given offset (in bytes), or anywhere
> in the payload of the current protocol layer (including L2 header if used as
> the first item in the stack).
>
> This does not increment the protocol layer count as it is not a protocol
> definition. Subsequent RAW items modulate the first absolute one with
> relative offsets.
>
> - Using **-1** as the ``offset`` of the first RAW item makes its absolute
> offset not fixed, i.e. the pattern is searched everywhere.
> - ``mask`` only affects the pattern.
The RAW matching type allows {offset, length}, which supports anchor
setting and string matching.
It's not defined for a user-defined packet layout. Sometimes, comparing
payload raw data after a header requires
{offset, length}. One typical case is 5-tuple matching: the 'PORT' of the
transport layer is at an offset from the IP header.
It can't be addressed by IP/ANY, as that requires extracting the key from
a field within ANY.
>
> +--------------------------------------------------------------+
> | RAW |
> +==========+=============+=====================================+
> | ``spec`` | ``offset`` | absolute or relative pattern offset |
> | +-------------+-------------------------------------+
> | | ``length`` | pattern length |
> | +-------------+-------------------------------------+
> | | ``pattern`` | byte string of the above length |
> +----------+-------------+-------------------------------------+
> | ``mask`` | ``offset`` | ignored |
> | +-------------+-------------------------------------+
> | | ``length`` | ignored |
> | +-------------+-------------------------------------+
> | | ``pattern`` | bitmask with the same byte length |
> +----------+-------------+-------------------------------------+
>
> Example pattern looking for several strings at various offsets of a UDP
> payload, using combined RAW items:
>
> +------------------------------------------+
> | UDP payload matching |
> +===+======================================+
> | 0 | Ethernet |
> +---+--------------------------------------+
> | 1 | IPv4 |
> +---+--------------------------------------+
> | 2 | UDP |
> +---+-----+----------+-------------+-------+
> | 3 | RAW | ``spec`` | ``offset`` | -1 |
> | | | +-------------+-------+
> | | | | ``length`` | 3 |
> | | | +-------------+-------+
> | | | | ``pattern`` | "foo" |
> +---+-----+----------+-------------+-------+
> | 4 | RAW | ``spec`` | ``offset`` | 20 |
> | | | +-------------+-------+
> | | | | ``length`` | 3 |
> | | | +-------------+-------+
> | | | | ``pattern`` | "bar" |
> +---+-----+----------+-------------+-------+
> | 5 | RAW | ``spec`` | ``offset`` | -30 |
> | | | +-------------+-------+
> | | | | ``length`` | 3 |
> | | | +-------------+-------+
> | | | | ``pattern`` | "baz" |
> +---+-----+----------+-------------+-------+
>
> This translates to:
>
> - Locate "foo" in UDP payload, remember its offset.
> - Check "bar" at "foo"'s offset plus 20 bytes.
> - Check "baz" at "foo"'s offset minus 30 bytes.
>
> .. raw:: pdf
>
> PageBreak
>
> ``ETH``
> ^^^^^^^
>
> Matches an Ethernet header.
>
> - ``dst``: destination MAC.
> - ``src``: source MAC.
> - ``type``: EtherType.
> - ``tags``: number of 802.1Q/ad tags defined.
> - ``tag[]``: 802.1Q/ad tag definitions, innermost first. For each one:
>
> - ``tpid``: Tag protocol identifier.
> - ``tci``: Tag control information.
>
> ``IPV4``
> ^^^^^^^^
>
> Matches an IPv4 header.
>
> - ``src``: source IP address.
> - ``dst``: destination IP address.
> - ``tos``: ToS/DSCP field.
> - ``ttl``: TTL field.
> - ``proto``: protocol number for the next layer.
>
> ``IPV6``
> ^^^^^^^^
>
> Matches an IPv6 header.
>
> - ``src``: source IP address.
> - ``dst``: destination IP address.
> - ``tc``: traffic class field.
> - ``nh``: Next header field (protocol).
> - ``hop_limit``: hop limit field (TTL).
>
> ``ICMP``
> ^^^^^^^^
>
> Matches an ICMP header.
>
> - TBD.
>
> ``UDP``
> ^^^^^^^
>
> Matches a UDP header.
>
> - ``sport``: source port.
> - ``dport``: destination port.
> - ``length``: UDP length.
> - ``checksum``: UDP checksum.
>
> .. raw:: pdf
>
> PageBreak
>
> ``TCP``
> ^^^^^^^
>
> Matches a TCP header.
>
> - ``sport``: source port.
> - ``dport``: destination port.
> - All other TCP fields and bits.
>
> ``VXLAN``
> ^^^^^^^^^
>
> Matches a VXLAN header.
>
> - TBD.
>
> .. raw:: pdf
>
> PageBreak
>
> Actions
> ~~~~~~~
>
> Each possible action is represented by a type. Some have associated
> configuration structures. Several actions combined in a list can be assigned
> to a flow rule. That list is not ordered.
>
> At least one action must be defined in a filter rule in order to do
> something with matched packets.
>
> - Actions are defined with ``struct rte_flow_action``.
> - A list of actions is defined with ``struct rte_flow_actions``.
>
> They fall in three categories:
>
> - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
> processing matched packets by subsequent flow rules, unless overridden
> with PASSTHRU.
>
> - Non terminating actions (PASSTHRU, DUP) that leave matched packets up for
> additional processing by subsequent flow rules.
>
> - Other non terminating meta actions that do not affect the fate of packets
> (END, VOID, ID, COUNT).
>
> When several actions are combined in a flow rule, they should all have
> different types (e.g. dropping a packet twice is not possible). However,
> considering the VOID type is an exception to this rule, the defined behavior
> is for PMDs to only take into account the last action of a given type found
> in the list. PMDs still perform error checking on the entire list.
>
> *Note that PASSTHRU is the only action able to override a terminating rule.*
[LC] I'm wondering how to address the meta data carried by the mbuf;
it's not mentioned here.
For packets that hit one specific flow, usually there's something for the
CPU to identify the flow.
FDIR and RSS, as an example, have an id or key in the mbuf. In addition,
some meta data may be pointed to by userdata in the mbuf.
Any view on it?
>
> .. raw:: pdf
>
> PageBreak
>
> Example of an action that redirects packets to queue index 10:
>
> +----------------+
> | QUEUE |
> +===========+====+
> | ``queue`` | 10 |
> +-----------+----+
>
> Action list examples; their order is not significant, and applications must
> consider all actions to be performed simultaneously:
>
> +----------------+
> | Count and drop |
> +=======+========+
> | COUNT | |
> +-------+--------+
> | DROP | |
> +-------+--------+
>
> +--------------------------+
> | Tag, count and redirect |
> +=======+===========+======+
> | ID | ``id`` | 0x2a |
> +-------+-----------+------+
> | COUNT | |
> +-------+-----------+------+
> | QUEUE | ``queue`` | 10 |
> +-------+-----------+------+
>
> +-----------------------+
> | Redirect to queue 5 |
> +=======+===============+
> | DROP | |
> +-------+-----------+---+
> | QUEUE | ``queue`` | 5 |
> +-------+-----------+---+
>
> In the above example, considering both actions are performed simultaneously,
> the end result is that only QUEUE has any effect.
>
> +-----------------------+
> | Redirect to queue 3 |
> +=======+===========+===+
> | QUEUE | ``queue`` | 5 |
> +-------+-----------+---+
> | VOID | |
> +-------+-----------+---+
> | QUEUE | ``queue`` | 3 |
> +-------+-----------+---+
>
> As previously described, only the last action of a given type found in the
> list is taken into account. The above example also shows that VOID is
> ignored.
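>
> For illustration, the "tag, count and redirect" list above could be built
> as follows, assuming hypothetical ``struct rte_flow_action`` and
> configuration structure layouts (none of which are final)::
>
>  struct rte_flow_action_id id = { .id = 0x2a };
>  struct rte_flow_action_queue queue = { .queue = 10 };
>
>  struct rte_flow_action actions[] = {
>          { .type = RTE_FLOW_ACTION_TYPE_ID, .conf = &id },
>          { .type = RTE_FLOW_ACTION_TYPE_COUNT },
>          { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
>          { .type = RTE_FLOW_ACTION_TYPE_END },
>  };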
>
> .. raw:: pdf
>
> PageBreak
>
> Action types
> ~~~~~~~~~~~~
>
> Common action types are described in this section. Like pattern item types,
> this list is not exhaustive as new actions will be added in the future.
>
> ``END`` (action)
> ^^^^^^^^^^^^^^^^
>
> End marker for action lists. Prevents further processing of actions, thereby
> ending the list.
>
> - Its numeric value is **0** for convenience.
> - PMD support is mandatory.
> - No configurable property.
>
> +---------------+
> | END |
> +===============+
> | no properties |
> +---------------+
>
> ``VOID`` (action)
> ^^^^^^^^^^^^^^^^^
>
> Used as a placeholder for convenience. It is ignored and simply discarded by
> PMDs.
>
> - PMD support is mandatory.
> - No configurable property.
>
> +---------------+
> | VOID |
> +===============+
> | no properties |
> +---------------+
>
> ``PASSTHRU``
> ^^^^^^^^^^^^
>
> Leaves packets up for additional processing by subsequent flow rules. This
> is the default when a rule does not contain a terminating action, but can be
> specified to force a rule to become non-terminating.
>
> - No configurable property.
>
> +---------------+
> | PASSTHRU |
> +===============+
> | no properties |
> +---------------+
>
> Example to copy a packet to a queue and continue processing by subsequent
> flow rules:
>
> +--------------------------+
> | Copy to queue 8 |
> +==========+===============+
> | PASSTHRU | |
> +----------+-----------+---+
> | QUEUE | ``queue`` | 8 |
> +----------+-----------+---+
>
> ``ID``
> ^^^^^^
>
> Attaches a 32 bit value to packets.
>
> +----------------------------------------------+
> | ID |
> +========+=====================================+
> | ``id`` | 32 bit value to return with packets |
> +--------+-------------------------------------+
>
> .. raw:: pdf
>
> PageBreak
>
> ``QUEUE``
> ^^^^^^^^^
>
> Assigns packets to a given queue index.
>
> - Terminating by default.
>
> +--------------------------------+
> | QUEUE |
> +===========+====================+
> | ``queue`` | queue index to use |
> +-----------+--------------------+
>
> ``DROP``
> ^^^^^^^^
>
> Drops packets.
>
> - No configurable property.
> - Terminating by default.
> - PASSTHRU overrides this action if both are specified.
>
> +---------------+
> | DROP |
> +===============+
> | no properties |
> +---------------+
>
> ``COUNT``
> ^^^^^^^^^
>
> Enables a hit counter for this rule.
>
> This counter can be retrieved and reset through ``rte_flow_query()``, see
> ``struct rte_flow_query_count``.
>
> - Counters can be retrieved with ``rte_flow_query()``.
> - No configurable property.
>
> +---------------+
> | COUNT |
> +===============+
> | no properties |
> +---------------+
>
> Query structure to retrieve and reset the flow rule hits counter:
>
> +------------------------------------------------+
> | COUNT query |
> +===========+=====+==============================+
> | ``reset`` | in | reset counter after query |
> +-----------+-----+------------------------------+
> | ``hits`` | out | number of hits for this flow |
> +-----------+-----+------------------------------+
>
> ``DUP``
> ^^^^^^^
>
> Duplicates packets to a given queue index.
>
> This is normally combined with QUEUE; however, when used alone, it is
> actually similar to QUEUE + PASSTHRU.
>
> - Non-terminating by default.
>
> +------------------------------------------------+
> | DUP |
> +===========+====================================+
> | ``queue`` | queue index to duplicate packet to |
> +-----------+------------------------------------+
>
> .. raw:: pdf
>
> PageBreak
>
> ``RSS``
> ^^^^^^^
>
> Similar to QUEUE, except RSS is additionally performed on packets to spread
> them among several queues according to the provided parameters.
>
> - Terminating by default.
>
> +---------------------------------------------+
> | RSS |
> +==============+==============================+
> | ``rss_conf`` | RSS parameters |
> +--------------+------------------------------+
> | ``queues`` | number of entries in queue[] |
> +--------------+------------------------------+
> | ``queue[]`` | queue indices to use |
> +--------------+------------------------------+
>
> ``PF`` (action)
> ^^^^^^^^^^^^^^^
>
> Redirects packets to the physical function (PF) of the current device.
>
> - No configurable property.
> - Terminating by default.
>
> +---------------+
> | PF |
> +===============+
> | no properties |
> +---------------+
>
> ``VF`` (action)
> ^^^^^^^^^^^^^^^
>
> Redirects packets to the virtual function (VF) of the current device with
> the specified ID.
>
> - Terminating by default.
>
> +---------------------------------------+
> | VF |
> +========+==============================+
> | ``id`` | VF ID to redirect packets to |
> +--------+------------------------------+
>
> Planned types
> ~~~~~~~~~~~~~
>
> Other action types are planned but not defined yet. These actions will add
> the ability to alter matching packets in several ways, such as performing
> encapsulation/decapsulation of tunnel headers on specific flows.
>
> .. raw:: pdf
>
> PageBreak
>
> Rules management
> ----------------
>
> A simple API with only four functions is provided to fully manage flows.
>
> Each created flow rule is associated with an opaque, PMD-specific handle
> pointer. The application is responsible for keeping it until the rule is
> destroyed.
>
> Flow rules are defined with ``struct rte_flow``.
>
> Validation
> ~~~~~~~~~~
>
> Given that expressing a definite set of device capabilities with this API is
> not practical, a dedicated function is provided to check if a flow rule is
> supported and can be created.
>
> ::
>
> int
> rte_flow_validate(uint8_t port_id,
>                   const struct rte_flow_pattern *pattern,
>                   const struct rte_flow_actions *actions);
>
> While this function has no effect on the target device, the flow rule is
> validated against its current configuration state and the returned value
> should be considered valid by the caller for that state only.
>
> The returned value is guaranteed to remain valid only as long as no
> successful calls to rte_flow_create() or rte_flow_destroy() are made in the
> meantime and no device parameters affecting flow rules in any way are
> modified, due to possible collisions or resource limitations (although in
> such cases ``EINVAL`` should not be returned).
>
> Arguments:
>
> - ``port_id``: port identifier of Ethernet device.
> - ``pattern``: pattern specification to check.
> - ``actions``: actions associated with the flow definition.
>
> Return value:
>
> - **0** if flow rule is valid and can be created. A negative errno value
> otherwise (``rte_errno`` is also set); the following errors are defined:
> - ``-EINVAL``: unknown or invalid rule specification.
> - ``-ENOTSUP``: valid but unsupported rule specification (e.g. partial masks
> are unsupported).
> - ``-EEXIST``: collision with an existing rule.
> - ``-ENOMEM``: not enough resources.
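>
> Usage sketch, checking at initialization time whether a rule could be
> created; ``pattern`` and ``actions`` are assumed to be filled in already::
>
>  int ret = rte_flow_validate(port_id, &pattern, &actions);
>
>  if (ret == -ENOTSUP)
>          printf("rule is valid but not supported by this device\n");
>  else if (ret)
>          printf("rule is invalid: %s\n", strerror(-ret));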
>
> .. raw:: pdf
>
> PageBreak
>
> Creation
> ~~~~~~~~
>
> Creating a flow rule is similar to validating one, except the rule is
> actually created.
>
> ::
>
> struct rte_flow *
> rte_flow_create(uint8_t port_id,
>                 const struct rte_flow_pattern *pattern,
>                 const struct rte_flow_actions *actions);
>
> Arguments:
>
> - ``port_id``: port identifier of Ethernet device.
> - ``pattern``: pattern specification to add.
> - ``actions``: actions associated with the flow definition.
>
> Return value:
>
> A valid flow pointer in case of success, NULL otherwise and ``rte_errno`` is
> set to the positive version of one of the error codes defined for
> ``rte_flow_validate()``.
>
> Destruction
> ~~~~~~~~~~~
>
> Flow rule destruction is not automatic, and a queue should not be released
> if any are still attached to it. Applications must take care of performing
> this step before releasing resources.
>
> ::
>
> int
> rte_flow_destroy(uint8_t port_id,
>                  struct rte_flow *flow);
>
>
> Failure to destroy a flow rule may occur when other flow rules depend on it,
> and destroying it would result in an inconsistent state.
>
> This function is only guaranteed to succeed if flow rules are destroyed in
> reverse order of their creation.
>
> Arguments:
>
> - ``port_id``: port identifier of Ethernet device.
> - ``flow``: flow rule to destroy.
>
> Return value:
>
> - **0** on success, a negative errno value otherwise and ``rte_errno`` is
> set.
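>
> A minimal lifecycle sketch combining creation and destruction, with error
> reporting through ``rte_errno`` as described above::
>
>  struct rte_flow *flow = rte_flow_create(port_id, &pattern, &actions);
>
>  if (flow == NULL)
>          printf("cannot create rule: %s\n", strerror(rte_errno));
>
>  /* ... data path runs ... */
>
>  /* Destroy before releasing queues, in reverse order of creation. */
>  if (rte_flow_destroy(port_id, flow))
>          printf("cannot destroy rule: %s\n", strerror(rte_errno));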
>
> .. raw:: pdf
>
> PageBreak
>
> Query
> ~~~~~
>
> Query an existing flow rule.
>
> This function allows retrieving flow-specific data such as counters. Data
> is gathered by special actions which must be present in the flow rule
> definition.
>
> ::
>
> int
> rte_flow_query(uint8_t port_id,
>                struct rte_flow *flow,
>                enum rte_flow_action_type action,
>                void *data);
>
> Arguments:
>
> - ``port_id``: port identifier of Ethernet device.
> - ``flow``: flow rule to query.
> - ``action``: action type to query.
> - ``data``: pointer to storage for the associated query data type.
>
> Return value:
>
> - **0** on success, a negative errno value otherwise and ``rte_errno`` is
> set.
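>
> Usage sketch for the `COUNT`_ action, assuming the
> ``struct rte_flow_query_count`` fields shown in its query table and a
> hypothetical ``RTE_FLOW_ACTION_TYPE_COUNT`` enumerator name::
>
>  struct rte_flow_query_count count = { .reset = 1 }; /* reset after read */
>
>  if (rte_flow_query(port_id, flow, RTE_FLOW_ACTION_TYPE_COUNT, &count))
>          printf("query failed: %s\n", strerror(rte_errno));
>  else
>          printf("hits: %" PRIu64 "\n", (uint64_t)count.hits);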
>
> .. raw:: pdf
>
> PageBreak
>
> Behavior
> --------
>
> - API operations are synchronous and blocking (``EAGAIN`` cannot be
> returned).
>
> - There is no provision for reentrancy/multi-thread safety, although nothing
> should prevent different devices from being configured at the same
> time. PMDs may protect their control path functions accordingly.
>
> - Stopping the data path (TX/RX) should not be necessary when managing flow
> rules. If this cannot be achieved naturally or with workarounds (such as
> temporarily replacing the burst function pointers), an appropriate error
> code must be returned (``EBUSY``).
>
> - PMDs, not applications, are responsible for maintaining flow rule
> configuration when stopping and restarting a port or performing other
> actions which may affect them. Flow rules can only be destroyed explicitly.
>
> .. raw:: pdf
>
> PageBreak
>
> Compatibility
> -------------
>
> No known hardware implementation supports all the features described in this
> document.
>
> Unsupported features or combinations are not expected to be fully emulated
> in software by PMDs for performance reasons. Partially supported features
> may be completed in software as long as hardware performs most of the work
> (such as queue redirection and packet recognition).
>
> However PMDs are expected to do their best to satisfy application requests
> by working around hardware limitations as long as doing so does not affect
> the behavior of existing flow rules.
>
> The following sections provide a few examples of such cases, they are based
> on limitations built into the previous APIs.
>
> Global bitmasks
> ~~~~~~~~~~~~~~~
>
> Each flow rule comes with its own, per-layer bitmasks, while hardware may
> support only a single, device-wide bitmask for a given layer type, so that
> two IPv4 rules cannot use different bitmasks.
>
> The expected behavior in this case is that PMDs automatically configure
> global bitmasks according to the needs of the first created flow rule.
>
> Subsequent rules are allowed only if their bitmasks match those; the
> ``EEXIST`` error code should be returned otherwise.
>
> Unsupported layer types
> ~~~~~~~~~~~~~~~~~~~~~~~
>
> Many protocols can be simulated by crafting patterns with the `RAW`_ type.
>
> PMDs can rely on this capability to simulate support for protocols with
> fixed headers not directly recognized by hardware.
>
> ``ANY`` pattern item
> ~~~~~~~~~~~~~~~~~~~~
>
> This pattern item stands for anything, which can be difficult to translate
> to something hardware would understand, particularly if followed by more
> specific types.
>
> Consider the following pattern:
>
> +---+--------------------------------+
> | 0 | ETHER |
> +---+--------------------------------+
> | 1 | ANY (``min`` = 1, ``max`` = 1) |
> +---+--------------------------------+
> | 2 | TCP |
> +---+--------------------------------+
>
> Knowing that TCP does not make sense with something other than IPv4 and IPv6
> as L3, such a pattern may be translated to two flow rules instead:
>
> +---+--------------------+
> | 0 | ETHER |
> +---+--------------------+
> | 1 | IPV4 (zeroed mask) |
> +---+--------------------+
> | 2 | TCP |
> +---+--------------------+
>
> +---+--------------------+
> | 0 | ETHER |
> +---+--------------------+
> | 1 | IPV6 (zeroed mask) |
> +---+--------------------+
> | 2 | TCP |
> +---+--------------------+
>
> Note that as soon as an ANY rule covers several layers, this approach may
> yield a large number of hidden flow rules. It is thus suggested to only
> support the most common scenarios (anything as L2 and/or L3).
>
> .. raw:: pdf
>
> PageBreak
>
> Unsupported actions
> ~~~~~~~~~~~~~~~~~~~
>
> - When combined with a `QUEUE`_ action, packet counting (`COUNT`_) and
> tagging (`ID`_) may be implemented in software as long as the target queue
> is used by a single rule.
>
> - A rule specifying both `DUP`_ + `QUEUE`_ may be translated to two hidden
> rules combining `QUEUE`_ and `PASSTHRU`_.
>
> - When a single target queue is provided, `RSS`_ can also be implemented
> through `QUEUE`_.
>
> Flow rules priority
> ~~~~~~~~~~~~~~~~~~~
>
> While it would naturally make sense, flow rules cannot be assumed to be
> processed by hardware in the same order as their creation for several
> reasons:
>
> - They may be managed internally as a tree or a hash table instead of a
> list.
> - Removing a flow rule before adding another one can either put the new rule
> at the end of the list or reuse a freed entry.
> - Duplication may occur when packets are matched by several rules.
>
> For overlapping rules (particularly in order to use the `PASSTHRU`_ action)
> predictable behavior is only guaranteed by using different priority levels.
>
> Priority levels are not necessarily implemented in hardware, or may be
> severely limited (e.g. a single priority bit).
>
> For these reasons, priority levels may be implemented purely in software by
> PMDs.
>
> - For devices expecting flow rules to be added in the correct order, PMDs
> may destroy and re-create existing rules after adding a new one with
> a higher priority.
>
> - A configurable number of dummy or empty rules can be created at
> initialization time to save high priority slots for later.
>
> - In order to save priority levels, PMDs may evaluate whether rules are
> likely to collide and adjust their priority accordingly.
>
> .. raw:: pdf
>
> PageBreak
>
> API migration
> =============
>
> Exhaustive list of deprecated filter types and how to convert them to
> generic flow rules.
>
> ``MACVLAN`` to ``ETH`` → ``VF``, ``PF``
> ---------------------------------------
>
> `MACVLAN`_ can be translated to a basic `ETH`_ flow rule with a `VF
> (action)`_ or `PF (action)`_ terminating action.
>
> +------------------------------------+
> | MACVLAN |
> +--------------------------+---------+
> | Pattern | Actions |
> +===+=====+==========+=====+=========+
> | 0 | ETH | ``spec`` | any | VF, |
> | | +----------+-----+ PF |
> | | | ``mask`` | any | |
> +---+-----+----------+-----+---------+
>
> ``ETHERTYPE`` to ``ETH`` → ``QUEUE``, ``DROP``
> ----------------------------------------------
>
> `ETHERTYPE`_ is basically an `ETH`_ flow rule with `QUEUE`_ or `DROP`_ as
> a terminating action.
>
> +------------------------------------+
> | ETHERTYPE |
> +--------------------------+---------+
> | Pattern | Actions |
> +===+=====+==========+=====+=========+
> | 0 | ETH | ``spec`` | any | QUEUE, |
> | | +----------+-----+ DROP |
> | | | ``mask`` | any | |
> +---+-----+----------+-----+---------+
>
> ``FLEXIBLE`` to ``RAW`` → ``QUEUE``
> -----------------------------------
>
> `FLEXIBLE`_ can be translated to one `RAW`_ pattern with `QUEUE`_ as the
> terminating action and a defined priority level.
>
> +------------------------------------+
> | FLEXIBLE |
> +--------------------------+---------+
> | Pattern | Actions |
> +===+=====+==========+=====+=========+
> | 0 | RAW | ``spec`` | any | QUEUE |
> | | +----------+-----+ |
> | | | ``mask`` | any | |
> +---+-----+----------+-----+---------+
>
> ``SYN`` to ``TCP`` → ``QUEUE``
> ------------------------------
>
> `SYN`_ is a `TCP`_ rule with only the ``syn`` bit enabled and masked, and
> `QUEUE`_ as the terminating action.
>
> Priority level can be set to simulate the high priority bit.
>
> +---------------------------------------------+
> | SYN |
> +-----------------------------------+---------+
> | Pattern | Actions |
> +===+======+==========+=============+=========+
> | 0 | ETH | ``spec`` | N/A | QUEUE |
> | | +----------+-------------+ |
> | | | ``mask`` | empty | |
> +---+------+----------+-------------+ |
> | 1 | IPV4 | ``spec`` | N/A | |
> | | +----------+-------------+ |
> | | | ``mask`` | empty | |
> +---+------+----------+-------------+ |
> | 2 | TCP | ``spec`` | ``syn`` = 1 | |
> | | +----------+-------------+ |
> | | | ``mask`` | ``syn`` = 1 | |
> +---+------+----------+-------------+---------+
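>
> A sketch of the corresponding TCP item, assuming a hypothetical
> ``struct rte_flow_item_tcp`` exposing individual flag bits (layout not
> final)::
>
>  /* Match and mask only the SYN bit, all other fields are ignored. */
>  static const struct rte_flow_item_tcp tcp_syn_spec = { .syn = 1 };
>  static const struct rte_flow_item_tcp tcp_syn_mask = { .syn = 1 };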
>
> ``NTUPLE`` to ``IPV4``, ``TCP``, ``UDP`` → ``QUEUE``
> ----------------------------------------------------
>
> `NTUPLE`_ is similar to specifying an empty L2, `IPV4`_ as L3 with `TCP`_ or
> `UDP`_ as L4 and `QUEUE`_ as the terminating action.
>
> A priority level can be specified as well.
>
> +---------------------------------------+
> | NTUPLE |
> +-----------------------------+---------+
> | Pattern | Actions |
> +===+======+==========+=======+=========+
> | 0 | ETH | ``spec`` | N/A | QUEUE |
> | | +----------+-------+ |
> | | | ``mask`` | empty | |
> +---+------+----------+-------+ |
> | 1 | IPV4 | ``spec`` | any | |
> | | +----------+-------+ |
> | | | ``mask`` | any | |
> +---+------+----------+-------+ |
> | 2 | TCP, | ``spec`` | any | |
> | | UDP +----------+-------+ |
> | | | ``mask`` | any | |
> +---+------+----------+-------+---------+
>
> ``TUNNEL`` to ``ETH``, ``IPV4``, ``IPV6``, ``VXLAN`` (or other) → ``QUEUE``
> ---------------------------------------------------------------------------
>
> `TUNNEL`_ matches common IPv4 and IPv6 L3/L4-based tunnel types.
>
> In the following table, `ANY`_ is used to cover the optional L4.
>
> +------------------------------------------------+
> | TUNNEL |
> +--------------------------------------+---------+
> | Pattern | Actions |
> +===+=========+==========+=============+=========+
> | 0 | ETH | ``spec`` | any | QUEUE |
> | | +----------+-------------+ |
> | | | ``mask`` | any | |
> +---+---------+----------+-------------+ |
> | 1 | IPV4, | ``spec`` | any | |
> | | IPV6 +----------+-------------+ |
> | | | ``mask`` | any | |
> +---+---------+----------+-------------+ |
> | 2 | ANY | ``spec`` | ``min`` = 0 | |
> | | | +-------------+ |
> | | | | ``max`` = 0 | |
> | | +----------+-------------+ |
> | | | ``mask`` | N/A | |
> +---+---------+----------+-------------+ |
> | 3 | VXLAN, | ``spec`` | any | |
> | | GENEVE, +----------+-------------+ |
> | | TEREDO, | ``mask`` | any | |
> | | NVGRE, | | | |
> | | GRE, | | | |
> | | ... | | | |
> +---+---------+----------+-------------+---------+
>
> .. raw:: pdf
>
> PageBreak
>
> ``FDIR`` to most item types → ``QUEUE``, ``DROP``, ``PASSTHRU``
> ---------------------------------------------------------------
>
> `FDIR`_ is more complex than any other type; there are several methods to
> emulate its functionality. It is summarized for the most part in the table
> below.
>
> A few features are intentionally not supported:
>
> - The ability to configure the matching input set and masks for the entire
> device; PMDs should take care of it automatically according to flow rules.
>
> - Returning four or eight bytes of matched data when using flex bytes
> filtering. Although a specific action could implement it, it conflicts
> with the much more useful 32 bits tagging on devices that support it.
>
> - Side effects on RSS processing of the entire device. Flow rules that
> conflict with the current device configuration should not be
> allowed. Similarly, device configuration should not be allowed when it
> affects existing flow rules.
>
> - Device modes of operation. "none" is unsupported since filtering cannot be
> disabled as long as a flow rule is present.
>
> - "MAC VLAN" or "tunnel" perfect matching modes should be automatically set
> according to the created flow rules.
>
> +----------------------------------------------+
> | FDIR |
> +---------------------------------+------------+
> | Pattern | Actions |
> +===+============+==========+=====+============+
> | 0 | ETH, | ``spec`` | any | QUEUE, |
> | | RAW +----------+-----+ DROP, |
> | | | ``mask`` | any | PASSTHRU |
> +---+------------+----------+-----+------------+
> | 1 | IPV4, | ``spec`` | any | ID |
> | | IPV6 +----------+-----+ (optional) |
> | | | ``mask`` | any | |
> +---+------------+----------+-----+ |
> | 2 | TCP, | ``spec`` | any | |
> | | UDP, +----------+-----+ |
> | | SCTP | ``mask`` | any | |
> +---+------------+----------+-----+ |
> | 3 | VF, | ``spec`` | any | |
> | | PF, +----------+-----+ |
> | | SIGNATURE | ``mask`` | any | |
> | | (optional) | | | |
> +---+------------+----------+-----+------------+
>
> ``HASH``
> --------
>
> Hashing configuration is set per rule through the `SIGNATURE`_ item.
>
> Since it is usually a global device setting, all flow rules created with
> this item may have to share the same specification.
>
> ``L2_TUNNEL`` to ``VOID`` → ``VXLAN`` (or others)
> -------------------------------------------------
>
> All packets are matched. This type alters incoming packets to encapsulate
> them in a chosen tunnel type, optionally redirecting them to a VF as well.
>
> The destination pool for tag based forwarding can be emulated with other
> flow rules using `DUP`_ as the action.
>
> +----------------------------------------+
> | L2_TUNNEL |
> +---------------------------+------------+
> | Pattern | Actions |
> +===+======+==========+=====+============+
> | 0 | VOID | ``spec`` | N/A | VXLAN, |
> | | | | | GENEVE, |
> | | | | | ... |
> | | +----------+-----+------------+
> | | | ``mask`` | N/A | VF |
> | | | | | (optional) |
> +---+------+----------+-----+------------+
>
* Re: [dpdk-dev] [RFC] Generic flow director/filtering/classification API
2016-07-07 23:15 0% ` Chandran, Sugesh
@ 2016-07-08 13:03 0% ` Adrien Mazarguil
0 siblings, 0 replies; 200+ results
From: Adrien Mazarguil @ 2016-07-08 13:03 UTC (permalink / raw)
To: Chandran, Sugesh
Cc: dev, Thomas Monjalon, Zhang, Helin, Wu, Jingjing, Rasesh Mody,
Ajit Khaparde, Rahul Lakkireddy, Lu, Wenzhuo, Jan Medala,
John Daley, Chen, Jing D, Ananyev, Konstantin, Matej Vido,
Alejandro Lucero, Sony Chacko, Jerin Jacob, De Lara Guarch,
Pablo, Olga Shern
Hi Sugesh,
On Thu, Jul 07, 2016 at 11:15:07PM +0000, Chandran, Sugesh wrote:
> Hi Adrien,
>
> Thank you for proposing this. It would be really useful for applications such as OVS-DPDK.
> Please find my comments and questions inline below prefixed with [Sugesh]. Most of them are from the perspective of enabling these APIs in applications such as OVS-DPDK.
Thanks, I'm replying below.
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Adrien Mazarguil
> > Sent: Tuesday, July 5, 2016 7:17 PM
> > To: dev@dpdk.org
> > Cc: Thomas Monjalon <thomas.monjalon@6wind.com>; Zhang, Helin
> > <helin.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; Rasesh
> > Mody <rasesh.mody@qlogic.com>; Ajit Khaparde
> > <ajit.khaparde@broadcom.com>; Rahul Lakkireddy
> > <rahul.lakkireddy@chelsio.com>; Lu, Wenzhuo <wenzhuo.lu@intel.com>;
> > Jan Medala <jan@semihalf.com>; John Daley <johndale@cisco.com>; Chen,
> > Jing D <jing.d.chen@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; Matej Vido <matejvido@gmail.com>;
> > Alejandro Lucero <alejandro.lucero@netronome.com>; Sony Chacko
> > <sony.chacko@qlogic.com>; Jerin Jacob
> > <jerin.jacob@caviumnetworks.com>; De Lara Guarch, Pablo
> > <pablo.de.lara.guarch@intel.com>; Olga Shern <olgas@mellanox.com>
> > Subject: [dpdk-dev] [RFC] Generic flow director/filtering/classification API
> >
> > Hi All,
> >
> > First, forgive me for this large message, I know our mailboxes already
> > suffer quite a bit from the amount of traffic on this ML.
> >
> > This is not exactly yet another thread about how flow director should be
> > extended, rather about a brand new API to handle filtering and
> > classification for incoming packets in the most PMD-generic and
> > application-friendly fashion we can come up with. Reasons described below.
> >
> > I think this topic is important enough to include both the users of this API
> > as well as PMD maintainers. So far I have CC'ed librte_ether (especially
> > rte_eth_ctrl.h contributors), testpmd and PMD maintainers (with and
> > without
> > a .filter_ctrl implementation), but if you know application maintainers
> > other than testpmd who use FDIR or might be interested in this discussion,
> > feel free to add them.
> >
> > The issues we found with the current approach are already summarized in
> > the
> > following document, but here is a quick summary for TL;DR folks:
> >
> > - PMDs do not expose a common set of filter types and even when they do,
> > their behavior more or less differs.
> >
> > - Applications need to determine and adapt to device-specific limitations
> > and quirks on their own, without help from PMDs.
> >
> > - Writing an application that creates flow rules targeting all devices
> > supported by DPDK is thus difficult, if not impossible.
> >
> > - The current API has too many unspecified areas (particularly regarding
> > side effects of flow rules) that make PMD implementation tricky.
> >
> > This RFC API handles everything currently supported by .filter_ctrl, the
> > idea being to reimplement all of these to make them fully usable by
> > applications in a more generic and well defined fashion. It has a very small
> > set of mandatory features and an easy method to let applications probe for
> > supported capabilities.
> >
> > The only downside is more work for the software control side of PMDs
> > because
> > they have to adapt to the API instead of the reverse. I think helpers can be
> > added to EAL to assist with this.
> >
> > HTML version:
> >
> > https://rawgit.com/6WIND/rte_flow/master/rte_flow.html
> >
> > PDF version:
> >
> > https://rawgit.com/6WIND/rte_flow/master/rte_flow.pdf
> >
> > Related draft header file (for reference while reading the specification):
> >
> > https://raw.githubusercontent.com/6WIND/rte_flow/master/rte_flow.h
> >
> > Git tree for completeness (latest .rst version can be retrieved from here):
> >
> > https://github.com/6WIND/rte_flow
> >
> > What follows is the ReST source of the above, for inline comments and
> > discussion. I intend to update that specification accordingly.
> >
> > ========================
> > Generic filter interface
> > ========================
> >
> > .. footer::
> >
> > v0.6
> >
> > .. contents::
> > .. sectnum::
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Overview
> > ========
> >
> > DPDK provides several competing interfaces added over time to perform
> > packet
> > matching and related actions such as filtering and classification.
> >
> > They must be extended to implement the features supported by newer
> > devices
> > in order to expose them to applications, however the current design has
> > several drawbacks:
> >
> > - Complicated filter combinations which have not been hard-coded cannot be
> > expressed.
> > - Prone to API/ABI breakage when new features must be added to an
> > existing
> > filter type, which frequently happens.
> >
> > From an application point of view:
> >
> > - Having disparate interfaces, all optional and lacking in features, does not
> > make this API easy to use.
> > - Seemingly arbitrary built-in limitations of filter types based on the
> > device they were initially designed for.
> > - Undefined relationship between different filter types.
> > - High complexity, considerable undocumented and/or undefined behavior.
> >
> > Considering the growing number of devices supported by DPDK, adding a
> > new
> > filter type each time a new feature must be implemented is not sustainable
> > in the long term. Applications not written to target a specific device
> > cannot really benefit from such an API.
> >
> > For these reasons, this document defines an extensible unified API that
> > encompasses and supersedes these legacy filter types.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Current API
> > ===========
> >
> > Rationale
> > ---------
> >
> > The reason several competing (and mostly overlapping) filtering APIs are
> > present in DPDK is due to its nature as a thin layer between hardware and
> > software.
> >
> > Each subsequent interface has been added to better match the capabilities
> > and limitations of the latest supported device, which usually happened to
> > need an incompatible configuration approach. Because of this, many ended
> > up
> > device-centric and not usable by applications that were not written for that
> > particular device.
> >
> > This document is not the first attempt to address this proliferation issue,
> > in fact a lot of work has already been done both to create a more generic
> > interface while somewhat keeping compatibility with legacy ones through a
> > common call interface (``rte_eth_dev_filter_ctrl()`` with the
> > ``.filter_ctrl`` PMD callback in ``rte_ethdev.h``).
> >
> > Today, these previously incompatible interfaces are known as filter types
> > (``RTE_ETH_FILTER_*`` from ``enum rte_filter_type`` in ``rte_eth_ctrl.h``).
> >
> > However while trivial to extend with new types, it only shifted the
> > underlying problem as applications still need to be written for one kind of
> > filter type, which, as described in the following sections, is not
> > necessarily implemented by all PMDs that support filtering.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Filter types
> > ------------
> >
> > This section summarizes the capabilities of each filter type.
> >
> > Although the following list is exhaustive, the description of individual
> > types may contain inaccuracies due to the lack of documentation or usage
> > examples.
> >
> > Note: names are prefixed with ``RTE_ETH_FILTER_``.
> >
> > ``MACVLAN``
> > ~~~~~~~~~~~
> >
> > Matching:
> >
> > - L2 source/destination addresses.
> > - Optional 802.1Q VLAN ID.
> > - Masking individual fields on a rule basis is not supported.
> >
> > Action:
> >
> > - Packets are redirected either to a given VF device using its ID or to the
> > PF.
> >
> > ``ETHERTYPE``
> > ~~~~~~~~~~~~~
> >
> > Matching:
> >
> > - L2 source/destination addresses (optional).
> > - Ethertype (no VLAN ID?).
> > - Masking individual fields on a rule basis is not supported.
> >
> > Action:
> >
> > - Receive packets on a given queue.
> > - Drop packets.
> >
> > ``FLEXIBLE``
> > ~~~~~~~~~~~~
> >
> > Matching:
> >
> > - At most 128 consecutive bytes anywhere in packets.
> > - Masking is supported with byte granularity.
> > - Priorities are supported (relative to this filter type, undefined
> > otherwise).
> >
> > Action:
> >
> > - Receive packets on a given queue.
> >
> > ``SYN``
> > ~~~~~~~
> >
> > Matching:
> >
> > - TCP SYN packets only.
> > - One high priority bit can be set to give the highest possible priority to
> > this type when other filters with different types are configured.
> >
> > Action:
> >
> > - Receive packets on a given queue.
> >
> > ``NTUPLE``
> > ~~~~~~~~~~
> >
> > Matching:
> >
> > - Source/destination IPv4 addresses (optional in 2-tuple mode).
> > - Source/destination TCP/UDP port (mandatory in 2 and 5-tuple modes).
> > - L4 protocol (2 and 5-tuple modes).
> > - Masking individual fields is supported.
> > - TCP flags.
> > - Up to 7 levels of priority relative to this filter type, undefined
> > otherwise.
> > - No IPv6.
> >
> > Action:
> >
> > - Receive packets on a given queue.
> >
> > ``TUNNEL``
> > ~~~~~~~~~~
> >
> > Matching:
> >
> > - Outer L2 source/destination addresses.
> > - Inner L2 source/destination addresses.
> > - Inner VLAN ID.
> > - IPv4/IPv6 source (destination?) address.
> > - Tunnel type to match (VXLAN, GENEVE, TEREDO, NVGRE, IP over GRE,
> > 802.1BR
> > E-Tag).
> > - Tenant ID for tunneling protocols that have one.
> > - Any combination of the above can be specified.
> > - Masking individual fields on a rule basis is not supported.
> >
> > Action:
> >
> > - Receive packets on a given queue.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > ``FDIR``
> > ~~~~~~~~
> >
> > Queries:
> >
> > - Device capabilities and limitations.
> > - Device statistics about configured filters (resource usage, collisions).
> > - Device configuration (matching input set and masks)
> >
> > Matching:
> >
> > - Device mode of operation: none (to disable filtering), signature
> > (hash-based dispatching from masked fields) or perfect (either MAC VLAN
> > or
> > tunnel).
> > - L2 Ethertype.
> > - Outer L2 destination address (MAC VLAN mode).
> > - Inner L2 destination address, tunnel type (NVGRE, VXLAN) and tunnel ID
> > (tunnel mode).
> > - IPv4 source/destination addresses, ToS, TTL and protocol fields.
> > - IPv6 source/destination addresses, TC, protocol and hop limits fields.
> > - UDP source/destination IPv4/IPv6 and ports.
> > - TCP source/destination IPv4/IPv6 and ports.
> > - SCTP source/destination IPv4/IPv6, ports and verification tag field.
> > - Note, only one protocol type at once (either only L2 Ethertype, basic
> > IPv6, IPv4+UDP, IPv4+TCP and so on).
> > - VLAN TCI (extended API).
> > - At most 16 bytes to match in payload (extended API). A global device
> > look-up table specifies for each possible protocol layer (unknown, raw,
> > L2, L3, L4) the offset to use for each byte (they do not need to be
> > contiguous) and the related bitmask.
> > - Whether packet is addressed to PF or VF, in that case its ID can be
> > matched as well (extended API).
> > - Masking most of the above fields is supported, but simultaneously affects
> > all filters configured on a device.
> > - Input set can be modified in a similar fashion for a given device to
> > ignore individual fields of filters (i.e. do not match the destination
> > address in a IPv4 filter, refer to **RTE_ETH_INPUT_SET_**
> > macros). Configuring this also affects RSS processing on **i40e**.
> > - Filters can also provide 32 bits of arbitrary data to return as part of
> > matched packets.
> >
> > Action:
> >
> > - **RTE_ETH_FDIR_ACCEPT**: receive (accept) packet on a given queue.
> > - **RTE_ETH_FDIR_REJECT**: drop packet immediately.
> > - **RTE_ETH_FDIR_PASSTHRU**: similar to accept for the last filter in list,
> > otherwise process it with subsequent filters.
> > - For accepted packets and if requested by filter, either 32 bits of
> > arbitrary data and four bytes of matched payload (only in case of flex
> > bytes matching), or eight bytes of matched payload (flex also) are added
> > to meta data.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > ``HASH``
> > ~~~~~~~~
> >
> > Not an actual filter type. Provides and retrieves the global device
> > configuration (per port or entire NIC) for hash functions and their
> > properties.
> >
> > Hash function selection: "default" (keep current), XOR or Toeplitz.
> >
> > This function can be configured per flow type (**RTE_ETH_FLOW_**
> > definitions), supported types are:
> >
> > - Unknown.
> > - Raw.
> > - Fragmented or non-fragmented IPv4.
> > - Non-fragmented IPv4 with L4 (TCP, UDP, SCTP or other).
> > - Fragmented or non-fragmented IPv6.
> > - Non-fragmented IPv6 with L4 (TCP, UDP, SCTP or other).
> > - L2 payload.
> > - IPv6 with extensions.
> > - IPv6 with L4 (TCP, UDP) and extensions.
> >
> > ``L2_TUNNEL``
> > ~~~~~~~~~~~~~
> >
> > Matching:
> >
> > - All packets received on a given port.
> >
> > Action:
> >
> > - Add tunnel encapsulation (VXLAN, GENEVE, TEREDO, NVGRE, IP over GRE,
> > 802.1BR E-Tag) using the provided Ethertype and tunnel ID (only E-Tag
> > is implemented at the moment).
> > - VF ID to use for tag insertion (currently unused).
> > - Destination pool for tag based forwarding (pools are IDs that can be
> > assigned to ports, duplication occurs if the same ID is shared by several
> > ports of the same NIC).
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Driver support
> > --------------
> >
> > ======== ======= ========= ======== === ====== ====== ==== ==== =========
> > Driver   MACVLAN ETHERTYPE FLEXIBLE SYN NTUPLE TUNNEL FDIR HASH L2_TUNNEL
> > ======== ======= ========= ======== === ====== ====== ==== ==== =========
> > bnx2x
> > cxgbe
> > e1000            yes       yes      yes yes
> > ena
> > enic                                                  yes
> > fm10k
> > i40e     yes     yes                        yes    yes  yes
> > ixgbe            yes                yes yes           yes       yes
> > mlx4
> > mlx5                                                  yes
> > szedata2
> > ======== ======= ========= ======== === ====== ====== ==== ==== =========
> >
> > Flow director
> > -------------
> >
> > Flow director (FDIR) is the name of the most capable filter type, which
> > covers most features offered by others. As such, it is the most widespread
> > in PMDs that support filtering (i.e. all of them besides **e1000**).
> >
> > It is also the only type that allows an arbitrary 32 bits value provided by
> > applications to be attached to a filter and returned with matching packets
> > instead of relying on the destination queue to recognize flows.
> >
> > Unfortunately, even FDIR requires applications to be aware of low-level
> > capabilities and limitations (most of which come directly from **ixgbe** and
> > **i40e**):
> >
> > - Bitmasks are set globally per device (port?), not per filter.
> [Sugesh] This means an application cannot define filters that match on arbitrarily different offsets?
> If that’s the case, I assume the application has to program the bitmask in advance. Otherwise, how
> does the API framework deduce this bitmask information from the rules? It’s not very clear to me
> how an application passes down the bitmask information for multiple filters on the same port.
This is my understanding of how flow director currently works; perhaps
someone more familiar with it can answer this question better than I could.
Let me take an example: if a particular device can only handle a single IPv4
mask common to all flow rules (say, only to match destination addresses),
updating that mask to also match the source address affects all defined and
future flow rules simultaneously.
That is how FDIR currently works and I think it is wrong, as it penalizes
devices that do support individual bit-masks per rule, and is a little
awkward from an application point of view.
What I suggest for the new API instead is the ability to specify one
bit-mask per rule, and let the PMD deal with HW limitations by automatically
configuring global bitmasks from the first added rule, then refusing to add
subsequent rules if they specify a conflicting bit-mask. Existing rules
remain unaffected that way, and applications do not have to be extra
cautious.
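
To illustrate the intended PMD behavior, here is a minimal C sketch; the
names and layout are invented for illustration only, not part of the
proposed API:

    #include <errno.h>
    #include <stdint.h>

    struct ipv4_masks { uint32_t src; uint32_t dst; };

    static struct ipv4_masks hw_mask; /* device-wide, set by first rule */
    static int hw_mask_valid;

    static int
    pmd_check_ipv4_mask(const struct ipv4_masks *rule_mask)
    {
        if (!hw_mask_valid) {
            hw_mask = *rule_mask; /* first rule configures HW */
            hw_mask_valid = 1;
            return 0;
        }
        if (hw_mask.src != rule_mask->src ||
            hw_mask.dst != rule_mask->dst)
            return -EEXIST; /* conflicting mask, rule refused */
        return 0; /* same mask, rule accepted */
    }

Existing rules keep working because the device-wide mask is never changed
after the first rule latches it.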
> > - Configuration state is not expected to be saved by the driver, and
> > stopping/restarting a port requires the application to perform it again
> > (API documentation is also unclear about this).
> > - Monolithic approach with ABI issues as soon as a new kind of flow or
> > combination needs to be supported.
> > - Cryptic global statistics/counters.
> > - Unclear about how priorities are managed; filters seem to be arranged as a
> > linked list in hardware (possibly related to configuration order).
> >
> > Packet alteration
> > -----------------
> >
> > One interesting feature is that the L2 tunnel filter type implements the
> > ability to alter incoming packets through a filter (in this case to
> > encapsulate them), thus the **mlx5** flow encap/decap features are not a
> > foreign concept.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Proposed API
> > ============
> >
> > Terminology
> > -----------
> >
> > - **Filtering API**: overall framework affecting the fate of selected
> > packets, covers everything described in this document.
> > - **Matching pattern**: properties to look for in received packets, a
> > combination of any number of items.
> > - **Pattern item**: part of a pattern that either matches packet data
> > (protocol header, payload or derived information), or specifies properties
> > of the pattern itself.
> > - **Actions**: what needs to be done when a packet matches a pattern.
> > - **Flow rule**: this is the result of combining a *matching pattern* with
> > *actions*.
> > - **Filter rule**: a less generic term than *flow rule*, can otherwise be
> > used interchangeably.
> > - **Hit**: a flow rule is said to be *hit* when processing a matching
> > packet.
> >
> > Requirements
> > ------------
> >
> > As described in the previous section, there is a growing need for a common
> > method to configure filtering and related actions in a hardware independent
> > fashion.
> >
> > The filtering API should not disallow any filter combination by design and
> > must remain as simple as possible to use. It can simply be defined as a
> > method to perform one or several actions on selected packets.
> >
> > PMDs are aware of the capabilities of the device they manage and should be
> > responsible for preventing unsupported or conflicting combinations.
> >
> > This approach is fundamentally different as it places most of the burden on
> > the software side of the PMD instead of having device capabilities directly
> > mapped to API functions, then expecting applications to work around
> > ensuing
> > compatibility issues.
> >
> > Requirements for a new API:
> >
> > - Flexible and extensible without causing API/ABI problems for existing
> > applications.
> > - Should be unambiguous and easy to use.
> > - Support existing filtering features and actions listed in `Filter types`_.
> > - Support packet alteration.
> > - In case of overlapping filters, their priority should be well documented.
> > - Support filter queries (for example to retrieve counters).
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > High level design
> > -----------------
> >
> > The chosen approach to make filtering as generic as possible is by
> > expressing matching patterns through lists of items instead of the flat
> > structures used in DPDK today, enabling combinations that are not
> > predefined
> > and thus being more versatile.
> >
> > Flow rules can have several distinct actions (such as counting,
> > encapsulating, decapsulating before redirecting packets to a particular
> > queue, etc.), instead of relying on several rules to achieve this and having
> > applications deal with hardware implementation details regarding their
> > order.
> >
> > Support for different priority levels on a rule basis is provided, for
> > example in order to force a more specific rule come before a more generic
> > one for packets matched by both, however hardware support for more than
> > a
> > single priority level cannot be guaranteed. When supported, the number of
> > available priority levels is usually low, which is why they can also be
> > implemented in software by PMDs (e.g. to simulate missing priority levels by
> > reordering rules).
> >
> > In order to remain as hardware agnostic as possible, by default all rules
> > are considered to have the same priority, which means that the order
> > between
> > overlapping rules (when a packet is matched by several filters) is
> > undefined, packet duplication may even occur as a result.
> >
> > PMDs may refuse to create overlapping rules at a given priority level when
> > they can be detected (e.g. if a pattern matches an existing filter).
> >
> > Thus predictable results for a given priority level can only be achieved
> > with non-overlapping rules, using perfect matching on all protocol layers.
> >
> > Support for multiple actions per rule may be implemented internally on top
> > of non-default hardware priorities, as a result both features may not be
> > simultaneously available to applications.
> >
> > Considering that allowed pattern/actions combinations cannot be known in
> > advance and would result in an impractically large number of capabilities to
> > expose, a method is provided to validate a given rule from the current
> > device configuration state without actually adding it (akin to a "dry run"
> > mode).
> >
> > This enables applications to check if the rule types they need are supported
> > at initialization time, before starting their data path. This method can be
> > used anytime, its only requirement being that the resources needed by a
> > rule
> > must exist (e.g. a target RX queue must be configured first).
> >
> > Each defined rule is associated with an opaque handle managed by the PMD,
> > applications are responsible for keeping it. These can be used for queries
> > and rules management, such as retrieving counters or other data and
> > destroying them.
> >
> > Handles must be destroyed before releasing associated resources such as
> > queues.
> >
> > Integration
> > -----------
> >
> > To avoid ABI breakage, this new interface will be implemented through the
> > existing filtering control framework (``rte_eth_dev_filter_ctrl()``) using
> > **RTE_ETH_FILTER_GENERIC** as a new filter type.
> >
> > However a public front-end API described in `Rules management`_ will
> > be added as the preferred method to use it.
> >
> > Once discussions with the community have converged to a definite API,
> > legacy
> > filter types should be deprecated and a deadline defined to remove their
> > support entirely.
> >
> > PMDs will have to be gradually converted to **RTE_ETH_FILTER_GENERIC**
> > or
> > drop filtering support entirely. Less maintained PMDs for older hardware
> > may
> > lose support at this point.
> >
> > The notion of filter type will then be deprecated and subsequently dropped
> > to avoid confusion between both frameworks.
> >
> > Implementation details
> > ======================
> >
> > Flow rule
> > ---------
> >
> > A flow rule is the combination of a matching pattern with a list of actions,
> > and is the basis of this API.
> >
> > Priorities
> > ~~~~~~~~~~
> >
> > A priority can be assigned to a matching pattern.
> >
> > The default priority level is 0 and is also the highest. Support for more
> > than a single priority level in hardware is not guaranteed.
> >
> > If a packet is matched by several filters at a given priority level, the
> > outcome is undefined. It can take any path and can even be duplicated.
> >
> > Matching pattern
> > ~~~~~~~~~~~~~~~~
> >
> > A matching pattern comprises any number of items of various types.
> >
> > Items are arranged in a list to form a matching pattern for packets. They
> > fall in two categories:
> >
> > - Protocol matching (ANY, RAW, ETH, IPV4, IPV6, ICMP, UDP, TCP, VXLAN and
> > so
> > on), usually associated with a specification structure. These must be
> > stacked in the same order as the protocol layers to match, starting from
> > L2.
> >
> > - Affecting how the pattern is processed (END, VOID, INVERT, PF, VF,
> > SIGNATURE and so on), often without a specification structure. Since they
> > are meta data that does not match packet contents, these can be specified
> > anywhere within item lists without affecting the protocol matching items.
> >
> > Most item specifications can be optionally paired with a mask to narrow the
> > specific fields or bits to be matched.
> >
> > - Items are defined with ``struct rte_flow_item``.
> > - Patterns are defined with ``struct rte_flow_pattern``.
> >
> > Example of an item specification matching an Ethernet header:
> >
> > +-----------------------------------------+
> > | Ethernet |
> > +==========+=========+====================+
> > | ``spec`` | ``src`` | ``00:01:02:03:04`` |
> > | +---------+--------------------+
> > | | ``dst`` | ``00:2a:66:00:01`` |
> > +----------+---------+--------------------+
> > | ``mask`` | ``src`` | ``00:ff:ff:ff:00`` |
> > | +---------+--------------------+
> > | | ``dst`` | ``00:00:00:00:ff`` |
> > +----------+---------+--------------------+
> >
> > Non-masked bits stand for any value, Ethernet headers with the following
> > properties are thus matched:
> >
> > - ``src``: ``??:01:02:03:??``
> > - ``dst``: ``??:??:??:??:01``
> >
> > Except for meta types that do not need one, ``spec`` must be a valid pointer
> > to a structure of the related item type. A ``mask`` of the same type can be
> > provided to tell which bits in ``spec`` are to be matched.
> >
> > A mask is normally only needed for ``spec`` fields matching packet data,
> > ignored otherwise. See individual item types for more information.
> >
> > A ``NULL`` mask pointer is allowed and is similar to matching with a full
> > mask (all ones) ``spec`` fields supported by hardware, the remaining fields
> > are ignored (all zeroes), there is thus no error checking for unsupported
> > fields.
> >
> > Matching pattern items for packet data must be naturally stacked (ordered
> > from lowest to highest protocol layer), as in the following examples:
> >
> > +--------------+
> > | TCPv4 as L4 |
> > +===+==========+
> > | 0 | Ethernet |
> > +---+----------+
> > | 1 | IPv4 |
> > +---+----------+
> > | 2 | TCP |
> > +---+----------+
> >
> > +----------------+
> > | TCPv6 in VXLAN |
> > +===+============+
> > | 0 | Ethernet |
> > +---+------------+
> > | 1 | IPv4 |
> > +---+------------+
> > | 2 | UDP |
> > +---+------------+
> > | 3 | VXLAN |
> > +---+------------+
> > | 4 | Ethernet |
> > +---+------------+
> > | 5 | IPv6 |
> > +---+------------+
> > | 6 | TCP |
> > +---+------------+
> >
> > +-----------------------------+
> > | TCPv4 as L4 with meta items |
> > +===+=========================+
> > | 0 | VOID |
> > +---+-------------------------+
> > | 1 | Ethernet |
> > +---+-------------------------+
> > | 2 | VOID |
> > +---+-------------------------+
> > | 3 | IPv4 |
> > +---+-------------------------+
> > | 4 | TCP |
> > +---+-------------------------+
> > | 5 | VOID |
> > +---+-------------------------+
> > | 6 | VOID |
> > +---+-------------------------+
> >
> > The above example shows how meta items do not affect packet data
> > matching
> > items, as long as those remain stacked properly. The resulting matching
> > pattern is identical to "TCPv4 as L4".
> >
> > +----------------+
> > | UDPv6 anywhere |
> > +===+============+
> > | 0 | IPv6 |
> > +---+------------+
> > | 1 | UDP |
> > +---+------------+
> >
> > If supported by the PMD, omitting one or several protocol layers at the
> > bottom of the stack as in the above example (missing an Ethernet
> > specification) enables hardware to look anywhere in packets.
> >
> > It is unspecified whether the payload of supported encapsulations
> > (e.g. VXLAN inner packet) is matched by such a pattern, which may apply to
> > inner, outer or both packets.
> >
> > +---------------------+
> > | Invalid, missing L3 |
> > +===+=================+
> > | 0 | Ethernet |
> > +---+-----------------+
> > | 1 | UDP |
> > +---+-----------------+
> >
> > The above pattern is invalid due to a missing L3 specification between L2
> > and L4. It is only allowed at the bottom and at the top of the stack.
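
To make the valid stacking examples above concrete, a minimal C sketch
follows; the item layout (a type plus spec/mask pointers) and the names are
assumptions for illustration, not the draft header verbatim:

    #include <stddef.h>
    #include <stdint.h>

    /* Assumed stand-ins for the draft definitions. */
    enum item_type { ITEM_END = 0, ITEM_ETH, ITEM_IPV4, ITEM_TCP };

    struct item {
        enum item_type type;
        const void *spec; /* values to match */
        const void *mask; /* which bits of spec matter, NULL = default */
    };

    struct item_eth { uint8_t dst[6]; uint8_t src[6]; uint16_t type; };

    static const struct item_eth eth_spec = {
        .src = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x00 },
    };
    static const struct item_eth eth_mask = {
        .src = { 0x00, 0xff, 0xff, 0xff, 0x00, 0x00 },
    };

    /* "TCPv4 as L4": stacked from lowest to highest layer, END-terminated.
     * A real rule would also provide IPv4 and TCP specs. */
    static const struct item pattern[] = {
        { ITEM_ETH,  &eth_spec, &eth_mask },
        { ITEM_IPV4, NULL, NULL },
        { ITEM_TCP,  NULL, NULL },
        { ITEM_END,  NULL, NULL },
    };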
> >
> > Meta item types
> > ~~~~~~~~~~~~~~~
> >
> > These do not match packet data but affect how the pattern is processed,
> > most
> > of them do not need a specification structure. This particularity allows
> > them to be specified anywhere without affecting other item types.
> >
> > ``END``
> > ^^^^^^^
> >
> > End marker for item lists. Prevents further processing of items, thereby
> > ending the pattern.
> >
> > - Its numeric value is **0** for convenience.
> > - PMD support is mandatory.
> > - Both ``spec`` and ``mask`` are ignored.
> >
> > +--------------------+
> > | END |
> > +==========+=========+
> > | ``spec`` | ignored |
> > +----------+---------+
> > | ``mask`` | ignored |
> > +----------+---------+
> >
> > ``VOID``
> > ^^^^^^^^
> >
> > Used as a placeholder for convenience. It is ignored and simply discarded by
> > PMDs.
> >
> > - PMD support is mandatory.
> > - Both ``spec`` and ``mask`` are ignored.
> >
> > +--------------------+
> > | VOID |
> > +==========+=========+
> > | ``spec`` | ignored |
> > +----------+---------+
> > | ``mask`` | ignored |
> > +----------+---------+
> >
> > One usage example for this type is generating rules that share a common
> > prefix quickly without reallocating memory, only by updating item types:
> >
> > +------------------------+
> > | TCP, UDP or ICMP as L4 |
> > +===+====================+
> > | 0 | Ethernet |
> > +---+--------------------+
> > | 1 | IPv4 |
> > +---+------+------+------+
> > | 2 | UDP | VOID | VOID |
> > +---+------+------+------+
> > | 3 | VOID | TCP | VOID |
> > +---+------+------+------+
> > | 4 | VOID | VOID | ICMP |
> > +---+------+------+------+
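
A sketch of this trick in C (same caveat: stand-in names, not the draft
header):

    #include <stddef.h>

    enum item_type { END = 0, VOID, ETH, IPV4, UDP, TCP, ICMP };
    struct item { enum item_type type; const void *spec; const void *mask; };

    /* Shared prefix: slots 0-1 never change, slots 2-4 select the L4. */
    static struct item rule[] = {
        { ETH,  NULL, NULL }, { IPV4, NULL, NULL },
        { UDP,  NULL, NULL }, { VOID, NULL, NULL }, { VOID, NULL, NULL },
        { END,  NULL, NULL },
    };

    static void
    select_tcp(const void *tcp_spec)
    {
        rule[2].type = VOID; /* was UDP */
        rule[3].type = TCP;  /* only types change, no reallocation */
        rule[3].spec = tcp_spec;
    }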
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > ``INVERT``
> > ^^^^^^^^^^
> >
> > Inverted matching, i.e. process packets that do not match the pattern.
> >
> > - Both ``spec`` and ``mask`` are ignored.
> >
> > +--------------------+
> > | INVERT |
> > +==========+=========+
> > | ``spec`` | ignored |
> > +----------+---------+
> > | ``mask`` | ignored |
> > +----------+---------+
> >
> > Usage example in order to match non-TCPv4 packets only:
> >
> > +--------------------+
> > | Anything but TCPv4 |
> > +===+================+
> > | 0 | INVERT |
> > +---+----------------+
> > | 1 | Ethernet |
> > +---+----------------+
> > | 2 | IPv4 |
> > +---+----------------+
> > | 3 | TCP |
> > +---+----------------+
> >
> > ``PF``
> > ^^^^^^
> >
> > Matches packets addressed to the physical function of the device.
> >
> > - Both ``spec`` and ``mask`` are ignored.
> >
> > +--------------------+
> > | PF |
> > +==========+=========+
> > | ``spec`` | ignored |
> > +----------+---------+
> > | ``mask`` | ignored |
> > +----------+---------+
> >
> > ``VF``
> > ^^^^^^
> >
> > Matches packets addressed to the given virtual function ID of the device.
> >
> > - Only ``spec`` needs to be defined, ``mask`` is ignored.
> >
> > +----------------------------------------+
> > | VF |
> > +==========+=========+===================+
> > | ``spec`` | ``vf`` | destination VF ID |
> > +----------+---------+-------------------+
> > | ``mask`` | ignored |
> > +----------+-----------------------------+
> >
> > ``SIGNATURE``
> > ^^^^^^^^^^^^^
> >
> > Requests hash-based signature dispatching for this rule.
> >
> > Considering this is a global setting on devices that support it, all
> > subsequent filter rules may have to be created with it as well.
> >
> > - Only ``spec`` needs to be defined, ``mask`` is ignored.
> >
> > +--------------------+
> > | SIGNATURE |
> > +==========+=========+
> > | ``spec`` | TBD |
> > +----------+---------+
> > | ``mask`` | ignored |
> > +----------+---------+
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Data matching item types
> > ~~~~~~~~~~~~~~~~~~~~~~~~
> >
> > Most of these are basically protocol header definitions with associated
> > bitmasks. They must be specified (stacked) from lowest to highest protocol
> > layer.
> >
> > The following list is not exhaustive as new protocols will be added in the
> > future.
> >
> > ``ANY``
> > ^^^^^^^
> >
> > Matches any protocol in place of the current layer, a single ANY may also
> > stand for several protocol layers.
> >
> > This is usually specified as the first pattern item when looking for a
> > protocol anywhere in a packet.
> >
> > - A maximum value of **0** requests matching any number of protocol
> > layers
> > above or equal to the minimum value, a maximum value lower than the
> > minimum one is otherwise invalid.
> > - Only ``spec`` needs to be defined, ``mask`` is ignored.
> >
> > +-----------------------------------------------------------------------+
> > | ANY                                                                   |
> > +==========+=========+==================================================+
> > | ``spec`` | ``min`` | minimum number of layers covered                 |
> > |          +---------+--------------------------------------------------+
> > |          | ``max`` | maximum number of layers covered, 0 for infinity |
> > +----------+---------+--------------------------------------------------+
> > | ``mask`` | ignored                                                    |
> > +----------+------------------------------------------------------------+
> >
> > Example for VXLAN TCP payload matching regardless of outer L3 (IPv4 or
> > IPv6)
> > and L4 (UDP) both matched by the first ANY specification, and inner L3 (IPv4
> > or IPv6) matched by the second ANY specification:
> >
> > +----------------------------------+
> > | TCP in VXLAN with wildcards |
> > +===+==============================+
> > | 0 | Ethernet |
> > +---+-----+----------+---------+---+
> > | 1 | ANY | ``spec`` | ``min`` | 2 |
> > | | | +---------+---+
> > | | | | ``max`` | 2 |
> > +---+-----+----------+---------+---+
> > | 2 | VXLAN |
> > +---+------------------------------+
> > | 3 | Ethernet |
> > +---+-----+----------+---------+---+
> > | 4 | ANY | ``spec`` | ``min`` | 1 |
> > | | | +---------+---+
> > | | | | ``max`` | 1 |
> > +---+-----+----------+---------+---+
> > | 5 | TCP |
> > +---+------------------------------+
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > ``RAW``
> > ^^^^^^^
> >
> > Matches a string of a given length at a given offset (in bytes), or anywhere
> > in the payload of the current protocol layer (including L2 header if used as
> > the first item in the stack).
> >
> > This does not increment the protocol layer count as it is not a protocol
> > definition. Subsequent RAW items modulate the first absolute one with
> > relative offsets.
> >
> > - Using **-1** as the ``offset`` of the first RAW item makes its absolute
> > offset not fixed, i.e. the pattern is searched everywhere.
> > - ``mask`` only affects the pattern.
> >
> > +--------------------------------------------------------------+
> > | RAW                                                          |
> > +==========+=============+=====================================+
> > | ``spec`` | ``offset``  | absolute or relative pattern offset |
> > |          +-------------+-------------------------------------+
> > |          | ``length``  | pattern length                      |
> > |          +-------------+-------------------------------------+
> > |          | ``pattern`` | byte string of the above length     |
> > +----------+-------------+-------------------------------------+
> > | ``mask`` | ``offset``  | ignored                             |
> > |          +-------------+-------------------------------------+
> > |          | ``length``  | ignored                             |
> > |          +-------------+-------------------------------------+
> > |          | ``pattern`` | bitmask with the same byte length   |
> > +----------+-------------+-------------------------------------+
> >
> > Example pattern looking for several strings at various offsets of a UDP
> > payload, using combined RAW items:
> >
> > +------------------------------------------+
> > | UDP payload matching |
> > +===+======================================+
> > | 0 | Ethernet |
> > +---+--------------------------------------+
> > | 1 | IPv4 |
> > +---+--------------------------------------+
> > | 2 | UDP |
> > +---+-----+----------+-------------+-------+
> > | 3 | RAW | ``spec`` | ``offset`` | -1 |
> > | | | +-------------+-------+
> > | | | | ``length`` | 3 |
> > | | | +-------------+-------+
> > | | | | ``pattern`` | "foo" |
> > +---+-----+----------+-------------+-------+
> > | 4 | RAW | ``spec`` | ``offset`` | 20 |
> > | | | +-------------+-------+
> > | | | | ``length`` | 3 |
> > | | | +-------------+-------+
> > | | | | ``pattern`` | "bar" |
> > +---+-----+----------+-------------+-------+
> > | 5 | RAW | ``spec`` | ``offset`` | -30 |
> > | | | +-------------+-------+
> > | | | | ``length`` | 3 |
> > | | | +-------------+-------+
> > | | | | ``pattern`` | "baz" |
> > +---+-----+----------+-------------+-------+
> >
> > This translates to:
> >
> > - Locate "foo" in UDP payload, remember its offset.
> > - Check "bar" at "foo"'s offset plus 20 bytes.
> > - Check "baz" at "foo"'s offset minus 30 bytes.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > ``ETH``
> > ^^^^^^^
> >
> > Matches an Ethernet header.
> >
> > - ``dst``: destination MAC.
> > - ``src``: source MAC.
> > - ``type``: EtherType.
> > - ``tags``: number of 802.1Q/ad tags defined.
> > - ``tag[]``: 802.1Q/ad tag definitions, innermost first. For each one:
> >
> > - ``tpid``: Tag protocol identifier.
> > - ``tci``: Tag control information.
> >
> > ``IPV4``
> > ^^^^^^^^
> >
> > Matches an IPv4 header.
> >
> > - ``src``: source IP address.
> > - ``dst``: destination IP address.
> > - ``tos``: ToS/DSCP field.
> > - ``ttl``: TTL field.
> > - ``proto``: protocol number for the next layer.
> >
> > ``IPV6``
> > ^^^^^^^^
> >
> > Matches an IPv6 header.
> >
> > - ``src``: source IP address.
> > - ``dst``: destination IP address.
> > - ``tc``: traffic class field.
> > - ``nh``: Next header field (protocol).
> > - ``hop_limit``: hop limit field (TTL).
> >
> > ``ICMP``
> > ^^^^^^^^
> >
> > Matches an ICMP header.
> >
> > - TBD.
> >
> > ``UDP``
> > ^^^^^^^
> >
> > Matches a UDP header.
> >
> > - ``sport``: source port.
> > - ``dport``: destination port.
> > - ``length``: UDP length.
> > - ``checksum``: UDP checksum.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > ``TCP``
> > ^^^^^^^
> >
> > Matches a TCP header.
> >
> > - ``sport``: source port.
> > - ``dport``: destination port.
> > - All other TCP fields and bits.
> >
> > ``VXLAN``
> > ^^^^^^^^^
> >
> > Matches a VXLAN header.
> >
> > - TBD.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Actions
> > ~~~~~~~
> >
> > Each possible action is represented by a type. Some have associated
> > configuration structures. Several actions combined in a list can be assigned
> > to a flow rule. That list is not ordered.
> >
> > At least one action must be defined in a filter rule in order to do
> > something with matched packets.
> >
> > - Actions are defined with ``struct rte_flow_action``.
> > - A list of actions is defined with ``struct rte_flow_actions``.
> >
> > They fall in three categories:
> >
> > - Terminating actions (such as QUEUE, DROP, RSS, PF, VF) that prevent
> > processing matched packets by subsequent flow rules, unless overridden
> > with PASSTHRU.
> >
> > - Non terminating actions (PASSTHRU, DUP) that leave matched packets up
> > for
> > additional processing by subsequent flow rules.
> >
> > - Other non terminating meta actions that do not affect the fate of packets
> > (END, VOID, ID, COUNT).
> >
> > When several actions are combined in a flow rule, they should all have
> > different types (e.g. dropping a packet twice is not possible). However
> > considering the VOID type is an exception to this rule, the defined behavior
> > is for PMDs to only take into account the last action of a given type found
> > in the list. PMDs still perform error checking on the entire list.
> >
> > *Note that PASSTHRU is the only action able to override a terminating rule.*
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Example of an action that redirects packets to queue index 10:
> >
> > +----------------+
> > | QUEUE |
> > +===========+====+
> > | ``queue`` | 10 |
> > +-----------+----+
> >
> > Action lists examples, their order is not significant, applications must
> > consider all actions to be performed simultaneously:
> >
> > +----------------+
> > | Count and drop |
> > +=======+========+
> > | COUNT | |
> > +-------+--------+
> > | DROP | |
> > +-------+--------+
> >
> > +--------------------------+
> > | Tag, count and redirect |
> > +=======+===========+======+
> > | ID | ``id`` | 0x2a |
> > +-------+-----------+------+
> > | COUNT | |
> > +-------+-----------+------+
> > | QUEUE | ``queue`` | 10 |
> > +-------+-----------+------+
> >
> > +-----------------------+
> > | Redirect to queue 5 |
> > +=======+===============+
> > | DROP | |
> > +-------+-----------+---+
> > | QUEUE | ``queue`` | 5 |
> > +-------+-----------+---+
> >
> > In the above example, considering both actions are performed simultaneously,
> > the end result is that only QUEUE has any effect.
> >
> > +-----------------------+
> > | Redirect to queue 3 |
> > +=======+===========+===+
> > | QUEUE | ``queue`` | 5 |
> > +-------+-----------+---+
> > | VOID | |
> > +-------+-----------+---+
> > | QUEUE | ``queue`` | 3 |
> > +-------+-----------+---+
> >
> > As previously described, only the last action of a given type found in the
> > list is taken into account. The above example also shows that VOID is
> > ignored.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Action types
> > ~~~~~~~~~~~~
> >
> > Common action types are described in this section. Like pattern item types,
> > this list is not exhaustive as new actions will be added in the future.
> >
> > ``END`` (action)
> > ^^^^^^^^^^^^^^^^
> >
> > End marker for action lists. Prevents further processing of actions, thereby
> > ending the list.
> >
> > - Its numeric value is **0** for convenience.
> > - PMD support is mandatory.
> > - No configurable property.
> >
> > +---------------+
> > | END |
> > +===============+
> > | no properties |
> > +---------------+
> >
> > ``VOID`` (action)
> > ^^^^^^^^^^^^^^^^^
> >
> > Used as a placeholder for convenience. It is ignored and simply discarded by
> > PMDs.
> >
> > - PMD support is mandatory.
> > - No configurable property.
> >
> > +---------------+
> > | VOID |
> > +===============+
> > | no properties |
> > +---------------+
> >
> > ``PASSTHRU``
> > ^^^^^^^^^^^^
> >
> > Leaves packets up for additional processing by subsequent flow rules. This
> > is the default when a rule does not contain a terminating action, but can be
> > specified to force a rule to become non-terminating.
> >
> > - No configurable property.
> >
> > +---------------+
> > | PASSTHRU |
> > +===============+
> > | no properties |
> > +---------------+
> >
> > Example to copy a packet to a queue and continue processing by subsequent
> > flow rules:
> [Sugesh] If a packet gets copied to a queue, it’s a terminating action.
> How is it possible to perform subsequent actions after the packet has
> already been moved to the queue? How does it differ from the DUP action?
> Am I missing anything here?
Devices may not support the combination of QUEUE + PASSTHRU (i.e. making
QUEUE non-terminating). However these same devices may expose the ability to
copy a packet to another (sniffer) queue all while keeping the rule
terminating (QUEUE + DUP but no PASSTHRU).
DUP with two rules, assuming priorities and PASSTHRU are supported:
- pattern X, priority 0; actions: QUEUE 5, PASSTHRU (non-terminating)
- pattern X, priority 1; actions: QUEUE 6 (terminating)
DUP with two actions on a single rule and a single priority:
- pattern X, priority 0; actions: DUP 5, QUEUE 6 (terminating)
If supported, from an application point of view the end result is similar in
both cases (note the second case may be implemented by the PMD using two HW
rules internally).
However the second case does not waste a priority level and clearly states
the intent to the PMD which is more likely to be supported. If HW supports
DUP directly it is even faster since there is a single rule. That is why I
thought having DUP as an action would be useful.
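
Roughly, in (stand-in) C, the two variants look like this; the action layout
is assumed similar to pattern items and is not the draft header verbatim:

    #include <stddef.h>
    #include <stdint.h>

    enum action_type { ACT_END = 0, ACT_QUEUE, ACT_DUP, ACT_PASSTHRU };
    struct action { enum action_type type; const void *conf; };
    struct action_queue { uint16_t queue; };

    static const struct action_queue q5 = { 5 }, q6 = { 6 };

    /* Variant 1: two rules, two priority levels; PASSTHRU makes the
     * first QUEUE non-terminating. */
    static const struct action prio0[] = {
        { ACT_QUEUE, &q5 }, { ACT_PASSTHRU, NULL }, { ACT_END, NULL },
    };
    static const struct action prio1[] = {
        { ACT_QUEUE, &q6 }, { ACT_END, NULL },
    };

    /* Variant 2: one terminating rule, a single priority level. */
    static const struct action single[] = {
        { ACT_DUP, &q5 }, { ACT_QUEUE, &q6 }, { ACT_END, NULL },
    };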
> > +--------------------------+
> > | Copy to queue 8 |
> > +==========+===============+
> > | PASSTHRU | |
> > +----------+-----------+---+
> > | QUEUE | ``queue`` | 8 |
> > +----------+-----------+---+
> >
> > ``ID``
> > ^^^^^^
> >
> > Attaches a 32 bit value to packets.
> >
> > +----------------------------------------------+
> > | ID |
> > +========+=====================================+
> > | ``id`` | 32 bit value to return with packets |
> > +--------+-------------------------------------+
> >
> [Sugesh] I assume the application has to program the flow
> with a unique ID and matching packets are stamped with this ID
> when reporting to the software. The uniqueness of ID is NOT
> guaranteed by the API framework. Correct me if I am wrong here.
You are right, if the way I wrote it is not clear enough, I'm open to
suggestions to improve it.
> [Sugesh] Is it a limitation to use only a 32 bit ID? Is it possible to have a
> 64 bit ID, so that the application can use the control plane flow pointer
> itself as an ID? Does it make sense?
I've specified a 32 bit ID for now because this is what FDIR supports and
also what existing devices can report today AFAIK (i40e and mlx5).
We could use 64 bits for future-proofing in a separate action like "ID64"
when at least one device supports it.
To PMD maintainers: please comment if you know devices that support tagging
matching packets with more than 32 bits of user-provided data!
> > .. raw:: pdf
> >
> > PageBreak
> >
> > ``QUEUE``
> > ^^^^^^^^^
> >
> > Assigns packets to a given queue index.
> >
> > - Terminating by default.
> >
> > +--------------------------------+
> > | QUEUE |
> > +===========+====================+
> > | ``queue`` | queue index to use |
> > +-----------+--------------------+
> >
> > ``DROP``
> > ^^^^^^^^
> >
> > Drop packets.
> >
> > - No configurable property.
> > - Terminating by default.
> > - PASSTHRU overrides this action if both are specified.
> >
> > +---------------+
> > | DROP |
> > +===============+
> > | no properties |
> > +---------------+
> >
> > ``COUNT``
> > ^^^^^^^^^
> >
> [Sugesh] Should we really have to set the COUNT action explicitly for every rule?
> IMHO it would be great for it to be an implicit action. Most applications would
> be interested in the stats of almost all the filters/flows.
I can see why, but no, it must be explicitly requested because you may want
to know in advance when it is not supported. Also considering it is
something else to be done by HW (a separate action), we can assume enabling
this may slow things down a bit.
HW limitations may also prevent you from having as many flow counters as you
want, in which case you probably want to carefully pick which rules have
them.
I think this action is most useful with DROP, VF and PF actions since
those are currently the only ones where SW may not see the related packets.
> > Enables hits counter for this rule.
> >
> > This counter can be retrieved and reset through ``rte_flow_query()``, see
> > ``struct rte_flow_query_count``.
> >
> > - Counters can be retrieved with ``rte_flow_query()``.
> > - No configurable property.
> >
> > +---------------+
> > | COUNT |
> > +===============+
> > | no properties |
> > +---------------+
> >
> > Query structure to retrieve and reset the flow rule hits counter:
> >
> > +------------------------------------------------+
> > | COUNT query |
> > +===========+=====+==============================+
> > | ``reset`` | in | reset counter after query |
> > +-----------+-----+------------------------------+
> > | ``hits`` | out | number of hits for this flow |
> > +-----------+-----+------------------------------+
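
As a usage sketch: ``rte_flow_query()`` comes from the `Query`_ section
below, while the structure layout and the enum value name here are assumed
from the table above:

    #include <stdint.h>

    struct flow_query_count { int reset; uint64_t hits; }; /* assumed layout */

    static uint64_t
    get_and_reset_hits(uint8_t port_id, struct rte_flow *flow)
    {
        struct flow_query_count cnt = { .reset = 1 };

        /* RTE_FLOW_ACTION_TYPE_COUNT is an assumed enum name. */
        if (rte_flow_query(port_id, flow, RTE_FLOW_ACTION_TYPE_COUNT, &cnt))
            return 0; /* rte_errno holds the cause */
        return cnt.hits;
    }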
> >
> > ``DUP``
> > ^^^^^^^
> >
> > Duplicates packets to a given queue index.
> >
> > This is normally combined with QUEUE, however when used alone, it is
> > actually similar to QUEUE + PASSTHRU.
> >
> > - Non-terminating by default.
> >
> > +------------------------------------------------+
> > | DUP |
> > +===========+====================================+
> > | ``queue`` | queue index to duplicate packet to |
> > +-----------+------------------------------------+
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > ``RSS``
> > ^^^^^^^
> >
> > Similar to QUEUE, except RSS is additionally performed on packets to spread
> > them among several queues according to the provided parameters.
> >
> > - Terminating by default.
> >
> > +---------------------------------------------+
> > | RSS |
> > +==============+==============================+
> > | ``rss_conf`` | RSS parameters |
> > +--------------+------------------------------+
> > | ``queues`` | number of entries in queue[] |
> > +--------------+------------------------------+
> > | ``queue[]`` | queue indices to use |
> > +--------------+------------------------------+
> >
> > ``PF`` (action)
> > ^^^^^^^^^^^^^^^
> >
> > Redirects packets to the physical function (PF) of the current device.
> >
> > - No configurable property.
> > - Terminating by default.
> >
> > +---------------+
> > | PF |
> > +===============+
> > | no properties |
> > +---------------+
> >
> > ``VF`` (action)
> > ^^^^^^^^^^^^^^^
> >
> > Redirects packets to the virtual function (VF) of the current device with
> > the specified ID.
> >
> > - Terminating by default.
> >
> > +---------------------------------------+
> > | VF |
> > +========+==============================+
> > | ``id`` | VF ID to redirect packets to |
> > +--------+------------------------------+
> >
> > Planned types
> > ~~~~~~~~~~~~~
> >
> > Other action types are planned but not defined yet. These actions will add
> > the ability to alter matching packets in several ways, such as performing
> > encapsulation/decapsulation of tunnel headers on specific flows.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Rules management
> > ----------------
> >
> > A simple API with only four functions is provided to fully manage flows.
> >
> > Each created flow rule is associated with an opaque, PMD-specific handle
> > pointer. The application is responsible for keeping it until the rule is
> > destroyed.
> >
> > Flow rules are defined with ``struct rte_flow``.
> >
> > Validation
> > ~~~~~~~~~~
> >
> > Given that expressing a definite set of device capabilities with this API is
> > not practical, a dedicated function is provided to check if a flow rule is
> > supported and can be created.
> >
> > ::
> >
> >     int
> >     rte_flow_validate(uint8_t port_id,
> >                       const struct rte_flow_pattern *pattern,
> >                       const struct rte_flow_actions *actions);
> >
> > While this function has no effect on the target device, the flow rule is
> > validated against its current configuration state and the returned value
> > should be considered valid by the caller for that state only.
> >
> > The returned value is guaranteed to remain valid only as long as no
> > successful calls to rte_flow_create() or rte_flow_destroy() are made in the
> > meantime and no device parameters affecting flow rules in any way are
> > modified, due to possible collisions or resource limitations (although in
> > such cases ``EINVAL`` should not be returned).
> >
> > Arguments:
> >
> > - ``port_id``: port identifier of Ethernet device.
> > - ``pattern``: pattern specification to check.
> > - ``actions``: actions associated with the flow definition.
> >
> > Return value:
> >
> > - **0** if flow rule is valid and can be created. A negative errno value
> > otherwise (``rte_errno`` is also set), the following errors are defined.
> > - ``-EINVAL``: unknown or invalid rule specification.
> > - ``-ENOTSUP``: valid but unsupported rule specification (e.g. partial masks
> > are unsupported).
> > - ``-EEXIST``: collision with an existing rule.
> > - ``-ENOMEM``: not enough resources.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Creation
> > ~~~~~~~~
> >
> > Creating a flow rule is similar to validating one, except the rule is
> > actually created.
> >
> > ::
> >
> >     struct rte_flow *
> >     rte_flow_create(uint8_t port_id,
> >                     const struct rte_flow_pattern *pattern,
> >                     const struct rte_flow_actions *actions);
> >
> > Arguments:
> >
> > - ``port_id``: port identifier of Ethernet device.
> > - ``pattern``: pattern specification to add.
> > - ``actions``: actions associated with the flow definition.
> >
> > Return value:
> >
> > A valid flow pointer in case of success, NULL otherwise and ``rte_errno`` is
> > set to the positive version of one of the error codes defined for
> > ``rte_flow_validate()``.
> [Sugesh] A kind of implementation-specific query: what if an application
> tries to add duplicate rules? Does the API create a new flow entry for
> every API call?
If an application adds duplicate rules at a given priority level, the second
one may return an error depending on the PMD. Collisions are sometimes
trivial to detect (such as the same pattern twice), others not so much (one
matching an Ethernet header only, the other one matching an IP header only).
Either way if a packet is matched by two rules at a given priority level,
what happens is described in 3.3 (High level design) and 4.4.1 (Priorities).
Applications are responsible for not relying on the PMD to detect these, or
should use a single priority level for each rule to make things clear.
However since the number of HW priority levels is finite and possibly small,
they must also make sure not to waste them. My advice is to only use
priority levels when it cannot be proven that rules do not collide.
If all you have is perfect matching rules without wildcards and all of them
match the same number of layers, a single priority level is fine.
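
In code, the suggested approach is to validate at init time and handle
collisions explicitly; a sketch against the draft prototypes quoted below:

    #include <errno.h>
    #include <stddef.h>
    #include <stdint.h>

    static struct rte_flow *
    try_install(uint8_t port_id, const struct rte_flow_pattern *pattern,
                const struct rte_flow_actions *actions)
    {
        int err = rte_flow_validate(port_id, pattern, actions);

        if (err == -EEXIST)
            return NULL; /* collision with an existing rule */
        if (err)
            return NULL; /* -EINVAL, -ENOTSUP, -ENOMEM, ... */
        return rte_flow_create(port_id, pattern, actions);
    }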
> [Sugesh] Another concern is the cost and time of installing these rules
> in the hardware. Can we make these APIs time-bound (or at least provide an
> option to set a time limit for executing these APIs), so that the
> application doesn’t have to wait so long when installing and deleting flows
> on slow hardware/NICs? What do you think? Most datapath flow installations
> are dynamic and triggered only when there is ingress traffic. Delays in
> flow insertion/deletion have unpredictable consequences.
This API is (currently) aimed at the control path only, and must indeed be
assumed to be slow. Creating million of rules may take quite long as it may
involve syscalls and other time-consuming synchronization things on the PMD
side.
So currently there is no plan to have rules added from the data path with
time constraints. I think it would be implemented through a different set of
functions anyway.
I do not think adding time limits is practical; even specifying in the API
that creating a single flow rule must take less than a maximum number of
seconds in order to be effective is too much of a constraint (applications
that create all flows during init may not care, after all).
You should consider in any case that modifying flow rules will always be
slower than receiving packets, there is no way around that. Applications
have to live with it and provide a software fallback for incoming packets
while managing flow rules.
Moreover, think about what happens when you hit the maximum number of flow
rules and cannot create any more. Applications need to implement some kind
of fallback in their data path.
Offloading flows in HW is also only useful if they live much longer than the
time taken to create and delete them. Perhaps applications may choose to do
so after detecting long lived flows such as TCP sessions.
You may have one separate control thread dedicated to manage flows and
keep your normal control thread unaffected by delays. Several threads can
even be dedicated, one per device.
> [Sugesh] Another query is on the synchronization part. What if the same rules
> are handled from different threads? Is the application responsible for
> handling concurrent hardware programming?
Like most (if not all) DPDK APIs, applications are responsible for managing
locking issues as described in 4.3 (Behavior). Since this is a control path
API and applications usually have a single control thread, locking should
not be necessary in most cases.
Regarding my above comment about using several control threads to manage
different devices, section 4.3 says:
"There is no provision for reentrancy/multi-thread safety, although nothing
should prevent different devices from being configured at the same
time. PMDs may protect their control path functions accordingly."
I'd like to emphasize it is not "per port" but "per device", since in a few
cases a configurable resource is shared by several ports. It may be
difficult for applications to determine which ports are shared by a given
device but this falls outside the scope of this API.
Do you think adding the guarantee that it is always safe to configure two
different ports simultaneously without locking from the application side is
necessary? In which case the PMD would be responsible for locking shared
resources.
> > Destruction
> > ~~~~~~~~~~~
> >
> > Flow rule destruction is not automatic, and a queue should not be released
> > if any are still attached to it. Applications must take care of performing
> > this step before releasing resources.
> >
> > ::
> >
> >     int
> >     rte_flow_destroy(uint8_t port_id,
> >                      struct rte_flow *flow);
> >
> >
> [Sugesh] I would suggest that having a clean-up API is really useful, as
> releasing a queue (is it applicable to releasing a port too?) does not
> guarantee automatic flow destruction.
Would something like rte_flow_flush(port_id) do the trick? I wanted to
emphasize in this first draft that applications should really keep the flow
pointers around in order to manage/destroy them. It is their responsibility,
not PMD's.
> This way application can initialize the port,
> clean-up all the existing rules and create new rules on a clean slate.
No resource can be released as long as a flow rule is using it (bad things
may happen otherwise), all flow rules must be destroyed first, thus none can
possibly remain after initializing a port. It is assumed that PMDs do
automatic clean up during init if necessary to ensure this.
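
Meanwhile, applications can implement the equivalent themselves. A sketch
(note ``rte_flow_flush()`` does not exist in this draft), destroying rules
in reverse creation order since the text below only guarantees success in
that order:

    #include <stddef.h>
    #include <stdint.h>

    /* flows[] holds the rule pointers in creation order. */
    static void
    app_flush_flows(uint8_t port_id, struct rte_flow **flows, size_t n)
    {
        while (n--)
            if (rte_flow_destroy(port_id, flows[n]))
                break; /* unexpected in reverse order; check rte_errno */
    }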
> > Failure to destroy a flow rule may occur when other flow rules depend on it,
> > and destroying it would result in an inconsistent state.
> >
> > This function is only guaranteed to succeed if flow rules are destroyed in
> > reverse order of their creation.
> >
> > Arguments:
> >
> > - ``port_id``: port identifier of Ethernet device.
> > - ``flow``: flow rule to destroy.
> >
> > Return value:
> >
> > - **0** on success, a negative errno value otherwise and ``rte_errno`` is
> > set.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Query
> > ~~~~~
> >
> > Query an existing flow rule.
> >
> > This function allows retrieving flow-specific data such as counters. Data
> > is gathered by special actions which must be present in the flow rule
> > definition.
> >
> > ::
> >
> >     int
> >     rte_flow_query(uint8_t port_id,
> >                    struct rte_flow *flow,
> >                    enum rte_flow_action_type action,
> >                    void *data);
> >
> > Arguments:
> >
> > - ``port_id``: port identifier of Ethernet device.
> > - ``flow``: flow rule to query.
> > - ``action``: action type to query.
> > - ``data``: pointer to storage for the associated query data type.
> >
> > Return value:
> >
> > - **0** on success, a negative errno value otherwise and ``rte_errno`` is
> > set.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Behavior
> > --------
> >
> > - API operations are synchronous and blocking (``EAGAIN`` cannot be
> > returned).
> >
> > - There is no provision for reentrancy/multi-thread safety, although nothing
> > should prevent different devices from being configured at the same
> > time. PMDs may protect their control path functions accordingly.
> >
> > - Stopping the data path (TX/RX) should not be necessary when managing flow
> > rules. If this cannot be achieved naturally or with workarounds (such as
> > temporarily replacing the burst function pointers), an appropriate error
> > code must be returned (``EBUSY``).
> >
> > - PMDs, not applications, are responsible for maintaining flow rules
> > configuration when stopping and restarting a port or performing other
> > actions which may affect them. Flow rules can only be destroyed explicitly.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> [Sugesh] Query all the rules for a specific port/queue? Useful when adding
> and deleting ports and queues dynamically according to need. I am not sure
> what the other use cases for these APIs are, but I feel it would make it much
> easier to manage flows from the application. What do you think?
Not sure, that seems to fall outside the scope of this API. As described,
applications already store the related rte_flow pointers. Accordingly, they
know how many rules are associated with a given port, and they need both a
port ID and a flow rule pointer to destroy them after all.
Now perhaps something could be added to convert an existing rte_flow back to
a pattern and a list of actions, however I cannot see an immediate use case
for it.
What you describe seems to be doable through a front-end API; I think
keeping this one as low-level as possible, with only basic actions, is
better right now. I'll keep your suggestion in mind.
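To illustrate the kind of application-side bookkeeping meant above, a
minimal sketch (the container is hypothetical, this API does not provide
one)::

    struct app_rule {
        uint8_t port_id;
        struct rte_flow *flow;
    };

    /* Destroy in reverse order of creation, the only ordering this
     * API guarantees to succeed. */
    static void
    app_destroy_all(struct app_rule *rules, unsigned int n)
    {
        while (n--)
            rte_flow_destroy(rules[n].port_id, rules[n].flow);
    }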
> > Compatibility
> > -------------
> >
> > No known hardware implementation supports all the features described in
> > this document.
> >
> > Unsupported features or combinations are not expected to be fully emulated
> > in software by PMDs for performance reasons. Partially supported features
> > may be completed in software as long as hardware performs most of the work
> > (such as queue redirection and packet recognition).
> >
> > However PMDs are expected to do their best to satisfy application requests
> > by working around hardware limitations as long as doing so does not affect
> > the behavior of existing flow rules.
> >
> > The following sections provide a few examples of such cases, they are based
> > on limitations built into the previous APIs.
> >
> > Global bitmasks
> > ~~~~~~~~~~~~~~~
> >
> > Each flow rule comes with its own, per-layer bitmasks, while hardware may
> > support only a single, device-wide bitmask for a given layer type, so that
> > two IPv4 rules cannot use different bitmasks.
> >
> > The expected behavior in this case is that PMDs automatically configure
> > global bitmasks according to the needs of the first created flow rule.
> >
> > Subsequent rules are allowed only if their bitmasks match those; the
> > ``EEXIST`` error code should be returned otherwise.
> >
> > Unsupported layer types
> > ~~~~~~~~~~~~~~~~~~~~~~~
> >
> > Many protocols can be simulated by crafting patterns with the `RAW`_ type.
> >
> > PMDs can rely on this capability to simulate support for protocols with
> > fixed headers not directly recognized by hardware.
> >
> > ``ANY`` pattern item
> > ~~~~~~~~~~~~~~~~~~~~
> >
> > This pattern item stands for anything, which can be difficult to translate
> > to something hardware would understand, particularly if followed by more
> > specific types.
> >
> > Consider the following pattern:
> >
> > +---+--------------------------------+
> > | 0 | ETHER |
> > +---+--------------------------------+
> > | 1 | ANY (``min`` = 1, ``max`` = 1) |
> > +---+--------------------------------+
> > | 2 | TCP |
> > +---+--------------------------------+
> >
> > Knowing that TCP does not make sense with something other than IPv4 and
> > IPv6 as L3, such a pattern may be translated to two flow rules instead:
> >
> > +---+--------------------+
> > | 0 | ETHER |
> > +---+--------------------+
> > | 1 | IPV4 (zeroed mask) |
> > +---+--------------------+
> > | 2 | TCP |
> > +---+--------------------+
> >
> > +---+--------------------+
> > | 0 | ETHER |
> > +---+--------------------+
> > | 1 | IPV6 (zeroed mask) |
> > +---+--------------------+
> > | 2 | TCP |
> > +---+--------------------+
> >
> > Note that as soon as an ANY rule covers several layers, this approach may
> > yield a large number of hidden flow rules. It is thus suggested to only
> > support the most common scenarios (anything as L2 and/or L3).
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > Unsupported actions
> > ~~~~~~~~~~~~~~~~~~~
> >
> > - When combined with a `QUEUE`_ action, packet counting (`COUNT`_) and
> > tagging (`ID`_) may be implemented in software as long as the target queue
> > is used by a single rule.
> >
> > - A rule specifying both `DUP`_ + `QUEUE`_ may be translated to two hidden
> > rules combining `QUEUE`_ and `PASSTHRU`_.
> >
> > - When a single target queue is provided, `RSS`_ can also be implemented
> > through `QUEUE`_.
> >
> > Flow rules priority
> > ~~~~~~~~~~~~~~~~~~~
> >
> > While it would naturally make sense, flow rules cannot be assumed to be
> > processed by hardware in the same order as their creation for several
> > reasons:
> >
> > - They may be managed internally as a tree or a hash table instead of a
> > list.
> > - Removing a flow rule before adding another one can either put the new
> > rule at the end of the list or reuse a freed entry.
> > - Duplication may occur when packets are matched by several rules.
> >
> > For overlapping rules (particularly in order to use the `PASSTHRU`_ action)
> > predictable behavior is only guaranteed by using different priority levels.
> >
> > Priority levels are not necessarily implemented in hardware, or may be
> > severely limited (e.g. a single priority bit).
> >
> > For these reasons, priority levels may be implemented purely in software by
> > PMDs.
> >
> > - For devices expecting flow rules to be added in the correct order, PMDs
> > may destroy and re-create existing rules after adding a new one with
> > a higher priority.
> >
> > - A configurable number of dummy or empty rules can be created at
> > initialization time to save high priority slots for later.
> >
> > - In order to save priority levels, PMDs may evaluate whether rules are
> > likely to collide and adjust their priority accordingly.
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > API migration
> > =============
> >
> > Exhaustive list of deprecated filter types and how to convert them to
> > generic flow rules.
> >
> > ``MACVLAN`` to ``ETH`` → ``VF``, ``PF``
> > ---------------------------------------
> >
> > `MACVLAN`_ can be translated to a basic `ETH`_ flow rule with a `VF
> > (action)`_ or `PF (action)`_ terminating action.
> >
> > +------------------------------------+
> > | MACVLAN |
> > +--------------------------+---------+
> > | Pattern | Actions |
> > +===+=====+==========+=====+=========+
> > | 0 | ETH | ``spec`` | any | VF, |
> > | | +----------+-----+ PF |
> > | | | ``mask`` | any | |
> > +---+-----+----------+-----+---------+
> >
> > ``ETHERTYPE`` to ``ETH`` → ``QUEUE``, ``DROP``
> > ----------------------------------------------
> >
> > `ETHERTYPE`_ is basically an `ETH`_ flow rule with `QUEUE`_ or `DROP`_ as
> > a terminating action.
> >
> > +------------------------------------+
> > | ETHERTYPE |
> > +--------------------------+---------+
> > | Pattern | Actions |
> > +===+=====+==========+=====+=========+
> > | 0 | ETH | ``spec`` | any | QUEUE, |
> > | | +----------+-----+ DROP |
> > | | | ``mask`` | any | |
> > +---+-----+----------+-----+---------+
> >
> > ``FLEXIBLE`` to ``RAW`` → ``QUEUE``
> > -----------------------------------
> >
> > `FLEXIBLE`_ can be translated to one `RAW`_ pattern with `QUEUE`_ as the
> > terminating action and a defined priority level.
> >
> > +------------------------------------+
> > | FLEXIBLE |
> > +--------------------------+---------+
> > | Pattern | Actions |
> > +===+=====+==========+=====+=========+
> > | 0 | RAW | ``spec`` | any | QUEUE |
> > | | +----------+-----+ |
> > | | | ``mask`` | any | |
> > +---+-----+----------+-----+---------+
> >
> > ``SYN`` to ``TCP`` → ``QUEUE``
> > ------------------------------
> >
> > `SYN`_ is a `TCP`_ rule with only the ``syn`` bit enabled and masked, and
> > `QUEUE`_ as the terminating action.
> >
> > Priority level can be set to simulate the high priority bit.
> >
> > +---------------------------------------------+
> > | SYN |
> > +-----------------------------------+---------+
> > | Pattern | Actions |
> > +===+======+==========+=============+=========+
> > | 0 | ETH | ``spec`` | N/A | QUEUE |
> > | | +----------+-------------+ |
> > | | | ``mask`` | empty | |
> > +---+------+----------+-------------+ |
> > | 1 | IPV4 | ``spec`` | N/A | |
> > | | +----------+-------------+ |
> > | | | ``mask`` | empty | |
> > +---+------+----------+-------------+ |
> > | 2 | TCP | ``spec`` | ``syn`` = 1 | |
> > | | +----------+-------------+ |
> > | | | ``mask`` | ``syn`` = 1 | |
> > +---+------+----------+-------------+---------+
> >
> > ``NTUPLE`` to ``IPV4``, ``TCP``, ``UDP`` → ``QUEUE``
> > ----------------------------------------------------
> >
> > `NTUPLE`_ is similar to specifying an empty L2, `IPV4`_ as L3 with `TCP`_ or
> > `UDP`_ as L4 and `QUEUE`_ as the terminating action.
> >
> > A priority level can be specified as well.
> >
> > +---------------------------------------+
> > | NTUPLE |
> > +-----------------------------+---------+
> > | Pattern | Actions |
> > +===+======+==========+=======+=========+
> > | 0 | ETH | ``spec`` | N/A | QUEUE |
> > | | +----------+-------+ |
> > | | | ``mask`` | empty | |
> > +---+------+----------+-------+ |
> > | 1 | IPV4 | ``spec`` | any | |
> > | | +----------+-------+ |
> > | | | ``mask`` | any | |
> > +---+------+----------+-------+ |
> > | 2 | TCP, | ``spec`` | any | |
> > | | UDP +----------+-------+ |
> > | | | ``mask`` | any | |
> > +---+------+----------+-------+---------+
> >
> > ``TUNNEL`` to ``ETH``, ``IPV4``, ``IPV6``, ``VXLAN`` (or other) → ``QUEUE``
> > ---------------------------------------------------------------------------
> >
> > `TUNNEL`_ matches common IPv4 and IPv6 L3/L4-based tunnel types.
> >
> > In the following table, `ANY`_ is used to cover the optional L4.
> >
> > +------------------------------------------------+
> > | TUNNEL |
> > +--------------------------------------+---------+
> > | Pattern | Actions |
> > +===+=========+==========+=============+=========+
> > | 0 | ETH | ``spec`` | any | QUEUE |
> > | | +----------+-------------+ |
> > | | | ``mask`` | any | |
> > +---+---------+----------+-------------+ |
> > | 1 | IPV4, | ``spec`` | any | |
> > | | IPV6 +----------+-------------+ |
> > | | | ``mask`` | any | |
> > +---+---------+----------+-------------+ |
> > | 2 | ANY | ``spec`` | ``min`` = 0 | |
> > | | | +-------------+ |
> > | | | | ``max`` = 0 | |
> > | | +----------+-------------+ |
> > | | | ``mask`` | N/A | |
> > +---+---------+----------+-------------+ |
> > | 3 | VXLAN, | ``spec`` | any | |
> > | | GENEVE, +----------+-------------+ |
> > | | TEREDO, | ``mask`` | any | |
> > | | NVGRE, | | | |
> > | | GRE, | | | |
> > | | ... | | | |
> > +---+---------+----------+-------------+---------+
> >
> > .. raw:: pdf
> >
> > PageBreak
> >
> > ``FDIR`` to most item types → ``QUEUE``, ``DROP``, ``PASSTHRU``
> > ---------------------------------------------------------------
> >
> > `FDIR`_ is more complex than any other type; there are several methods to
> > emulate its functionality. It is summarized for the most part in the table
> > below.
> >
> > A few features are intentionally not supported:
> >
> > - The ability to configure the matching input set and masks for the entire
> > device; PMDs should take care of it automatically according to flow rules.
> >
> > - Returning four or eight bytes of matched data when using flex bytes
> > filtering. Although a specific action could implement it, it conflicts
> > with the much more useful 32 bits tagging on devices that support it.
> >
> > - Side effects on RSS processing of the entire device. Flow rules that
> > conflict with the current device configuration should not be
> > allowed. Similarly, device configuration should not be allowed when it
> > affects existing flow rules.
> >
> > - Device modes of operation. "none" is unsupported since filtering cannot be
> > disabled as long as a flow rule is present.
> >
> > - "MAC VLAN" or "tunnel" perfect matching modes should be automatically
> > set
> > according to the created flow rules.
> >
> > +----------------------------------------------+
> > | FDIR |
> > +---------------------------------+------------+
> > | Pattern | Actions |
> > +===+============+==========+=====+============+
> > | 0 | ETH, | ``spec`` | any | QUEUE, |
> > | | RAW +----------+-----+ DROP, |
> > | | | ``mask`` | any | PASSTHRU |
> > +---+------------+----------+-----+------------+
> > | 1 | IPV4, | ``spec`` | any | ID |
> > | | IPV6 +----------+-----+ (optional) |
> > | | | ``mask`` | any | |
> > +---+------------+----------+-----+ |
> > | 2 | TCP, | ``spec`` | any | |
> > | | UDP, +----------+-----+ |
> > | | SCTP | ``mask`` | any | |
> > +---+------------+----------+-----+ |
> > | 3 | VF, | ``spec`` | any | |
> > | | PF, +----------+-----+ |
> > | | SIGNATURE | ``mask`` | any | |
> > | | (optional) | | | |
> > +---+------------+----------+-----+------------+
> >
> > ``HASH``
> > ~~~~~~~~
> >
> > Hashing configuration is set per rule through the `SIGNATURE`_ item.
> >
> > Since it is usually a global device setting, all flow rules created with
> > this item may have to share the same specification.
> >
> > ``L2_TUNNEL`` to ``VOID`` → ``VXLAN`` (or others)
> > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >
> > All packets are matched. This type alters incoming packets to encapsulate
> > them in a chosen tunnel type, optionally redirecting them to a VF as well.
> >
> > The destination pool for tag based forwarding can be emulated with other
> > flow rules using `DUP`_ as the action.
> >
> > +----------------------------------------+
> > | L2_TUNNEL |
> > +---------------------------+------------+
> > | Pattern | Actions |
> > +===+======+==========+=====+============+
> > | 0 | VOID | ``spec`` | N/A | VXLAN, |
> > | | | | | GENEVE, |
> > | | | | | ... |
> > | | +----------+-----+------------+
> > | | | ``mask`` | N/A | VF |
> > | | | | | (optional) |
> > +---+------+----------+-----+------------+
> >
> > --
> > Adrien Mazarguil
> > 6WIND
--
Adrien Mazarguil
6WIND
* [dpdk-dev] [PATCH v3 10/10] maintainers: add section for pmdinfo
@ 2016-07-08 14:42 4% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-07-08 14:42 UTC (permalink / raw)
To: Neil Horman; +Cc: dev
The author of this feature is Neil Horman.
Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
MAINTAINERS | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..f996c2e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -68,6 +68,10 @@ F: lib/librte_compat/
F: doc/guides/rel_notes/deprecation.rst
F: scripts/validate-abi.sh
+Driver information
+F: buildtools/pmdinfogen/
+F: tools/pmdinfo.py
+
Environment Abstraction Layer
-----------------------------
--
2.7.0
* Re: [dpdk-dev] [PATCH] cryptodev: move new cryptodev type to bottom of enum
2016-07-06 14:05 3% [dpdk-dev] [PATCH] cryptodev: move new cryptodev type to bottom of enum Pablo de Lara
@ 2016-07-08 17:52 0% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-07-08 17:52 UTC (permalink / raw)
To: Pablo de Lara; +Cc: dev, declan.doherty
2016-07-06 15:05, Pablo de Lara:
> New cryptodev type for the new KASUMI PMD was added
> in the cryptodev type enum, but not at the end of it,
> causing an ABI breakage.
>
> Fixes: 2773c86d061a ("crypto/kasumi: add driver for KASUMI library")
>
> Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> Reported-by: Ferruh Yigit <ferruh.yigit@intel.com>
Applied, thanks
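For readers unfamiliar with the failure mode: enumerators take consecutive
values, so inserting a new one in the middle renumbers every later
enumerator. A minimal sketch with hypothetical names::

    enum dev_type_old { DEV_A, DEV_B };          /* DEV_B == 1 */
    enum dev_type_new { DEV_A, DEV_NEW, DEV_B }; /* DEV_B == 2 */

    /* An application built against the old enum still passes 1, which
     * the new library now interprets as DEV_NEW. Appending new
     * enumerators at the end keeps existing values stable. */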
* [dpdk-dev] [PATCH v2] librte_pmd_bond: fix exported symbol versioning
2016-07-06 11:39 3% [dpdk-dev] [PATCH] librte_pmd_bond: fix exported symbol versioning Christian Ehrhardt
@ 2016-07-11 11:27 3% ` Christian Ehrhardt
2016-07-11 12:58 0% ` Thomas Monjalon
0 siblings, 1 reply; 200+ results
From: Christian Ehrhardt @ 2016-07-11 11:27 UTC (permalink / raw)
To: Eric Kinzie, christian.ehrhardt, thomas.monjalon, dev
*update in v2*
- add missing changes in rte_eth_bond_8023ad.h
The older versions of rte_eth_bond_8023ad_conf_get and
rte_eth_bond_8023ad_setup were available in the old way since 2.0 - at
least according to the map file.
But versioning in the code was set to 16.04.
That breaks compatibility checks for 2.0 on that library.
For example with the dpdk abi checker:
http://people.canonical.com/~paelzer/compat_report.html
To fix, version the old symbols on the 2.0 version as they were
initially added to the map file.
See http://people.canonical.com/~paelzer/compat_report.html
Fixes: dc40f17a ("net/bonding: allow external state machine in mode 4")
Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
---
drivers/net/bonding/rte_eth_bond_8023ad.c | 12 ++++++------
drivers/net/bonding/rte_eth_bond_8023ad.h | 4 ++--
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c b/drivers/net/bonding/rte_eth_bond_8023ad.c
index 48a50e4..2f7ae70 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad.c
+++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
@@ -1068,7 +1068,7 @@ bond_mode_8023ad_conf_assign(struct mode8023ad_private *mode4,
}
static void
-bond_mode_8023ad_setup_v1604(struct rte_eth_dev *dev,
+bond_mode_8023ad_setup_v20(struct rte_eth_dev *dev,
struct rte_eth_bond_8023ad_conf *conf)
{
struct rte_eth_bond_8023ad_conf def_conf;
@@ -1214,7 +1214,7 @@ free_out:
}
int
-rte_eth_bond_8023ad_conf_get_v1604(uint8_t port_id,
+rte_eth_bond_8023ad_conf_get_v20(uint8_t port_id,
struct rte_eth_bond_8023ad_conf *conf)
{
struct rte_eth_dev *bond_dev;
@@ -1229,7 +1229,7 @@ rte_eth_bond_8023ad_conf_get_v1604(uint8_t port_id,
bond_mode_8023ad_conf_get(bond_dev, conf);
return 0;
}
-VERSION_SYMBOL(rte_eth_bond_8023ad_conf_get, _v1604, 16.04);
+VERSION_SYMBOL(rte_eth_bond_8023ad_conf_get, _v20, 2.0);
int
rte_eth_bond_8023ad_conf_get_v1607(uint8_t port_id,
@@ -1278,7 +1278,7 @@ bond_8023ad_setup_validate(uint8_t port_id,
}
int
-rte_eth_bond_8023ad_setup_v1604(uint8_t port_id,
+rte_eth_bond_8023ad_setup_v20(uint8_t port_id,
struct rte_eth_bond_8023ad_conf *conf)
{
struct rte_eth_dev *bond_dev;
@@ -1289,11 +1289,11 @@ rte_eth_bond_8023ad_setup_v1604(uint8_t port_id,
return err;
bond_dev = &rte_eth_devices[port_id];
- bond_mode_8023ad_setup_v1604(bond_dev, conf);
+ bond_mode_8023ad_setup_v20(bond_dev, conf);
return 0;
}
-VERSION_SYMBOL(rte_eth_bond_8023ad_setup, _v1604, 16.04);
+VERSION_SYMBOL(rte_eth_bond_8023ad_setup, _v20, 2.0);
int
rte_eth_bond_8023ad_setup_v1607(uint8_t port_id,
diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.h b/drivers/net/bonding/rte_eth_bond_8023ad.h
index 1de34bc..6b8ff57 100644
--- a/drivers/net/bonding/rte_eth_bond_8023ad.h
+++ b/drivers/net/bonding/rte_eth_bond_8023ad.h
@@ -188,7 +188,7 @@ int
rte_eth_bond_8023ad_conf_get(uint8_t port_id,
struct rte_eth_bond_8023ad_conf *conf);
int
-rte_eth_bond_8023ad_conf_get_v1604(uint8_t port_id,
+rte_eth_bond_8023ad_conf_get_v20(uint8_t port_id,
struct rte_eth_bond_8023ad_conf *conf);
int
rte_eth_bond_8023ad_conf_get_v1607(uint8_t port_id,
@@ -209,7 +209,7 @@ int
rte_eth_bond_8023ad_setup(uint8_t port_id,
struct rte_eth_bond_8023ad_conf *conf);
int
-rte_eth_bond_8023ad_setup_v1604(uint8_t port_id,
+rte_eth_bond_8023ad_setup_v20(uint8_t port_id,
struct rte_eth_bond_8023ad_conf *conf);
int
rte_eth_bond_8023ad_setup_v1607(uint8_t port_id,
--
2.7.4
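For context, a sketch of how these macros tie implementations to map-file
version nodes (VERSION_SYMBOL comes from rte_compat.h; the pairing with
BIND_DEFAULT_SYMBOL is assumed here, it is not visible in the diff above)::

    /* Binaries linked against DPDK 2.0 keep resolving the _v20
     * implementation through the symbol's 2.0 version node... */
    VERSION_SYMBOL(rte_eth_bond_8023ad_setup, _v20, 2.0);

    /* ...while newly built applications bind to the 16.07 default. */
    BIND_DEFAULT_SYMBOL(rte_eth_bond_8023ad_setup, _v1607, 16.07);

The version given to VERSION_SYMBOL() must match a node in the library's
.map file; the mismatch fixed above (code said 16.04, map file said 2.0) is
exactly what made the ABI checker complain.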
* Re: [dpdk-dev] [PATCH v2] librte_pmd_bond: fix exported symbol versioning
2016-07-11 11:27 3% ` [dpdk-dev] [PATCH v2] " Christian Ehrhardt
@ 2016-07-11 12:58 0% ` Thomas Monjalon
0 siblings, 0 replies; 200+ results
From: Thomas Monjalon @ 2016-07-11 12:58 UTC (permalink / raw)
To: Christian Ehrhardt; +Cc: Eric Kinzie, dev
2016-07-11 13:27, Christian Ehrhardt:
> *update in v2*
> - add missing changes in rte_eth_bond_8023ad.h
>
> The older versions of rte_eth_bond_8023ad_conf_get and
> rte_eth_bond_8023ad_setup were available in the old way since 2.0 - at
> least according to the map file.
>
> But versioning in the code was set to 16.04.
> That breaks compatibility checks for 2.0 on that library.
>
> For example with the dpdk abi checker:
> http://people.canonical.com/~paelzer/compat_report.html
>
> To fix, version the old symbols on the 2.0 version as they were
> initially added to the map file.
>
> See http://people.canonical.com/~paelzer/compat_report.html
>
> Fixes: dc40f17a ("net/bonding: allow external state machine in mode 4")
>
> Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
Applied, thanks
* [dpdk-dev] [PATCH v6 04/17] eal: remove duplicate function declaration
@ 2016-07-12 6:01 3% ` Shreyansh Jain
2016-07-14 17:13 0% ` viktorin
0 siblings, 1 reply; 200+ results
From: Shreyansh Jain @ 2016-07-12 6:01 UTC (permalink / raw)
To: dev; +Cc: viktorin, thomas.monjalon, david.marchand
rte_eal_dev_init has been declared in both eal_private.h and rte_dev.h since
its introduction.
This function has been exported in ABI, so remove it from eal_private.h
Fixes: e57f20e05177 ("eal: make vdev init path generic for both virtual and pci devices")
Signed-off-by: David Marchand <david.marchand@6wind.com>
Signed-off-by: Shreyansh Jain <shreyansh.jain@nxp.com>
---
lib/librte_eal/common/eal_private.h | 7 -------
lib/librte_eal/linuxapp/eal/eal.c | 1 +
2 files changed, 1 insertion(+), 7 deletions(-)
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 857dc3e..06a68f6 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -259,13 +259,6 @@ int rte_eal_intr_init(void);
int rte_eal_alarm_init(void);
/**
- * This function initialises any virtual devices
- *
- * This function is private to the EAL.
- */
-int rte_eal_dev_init(void);
-
-/**
* Function is to check if the kernel module(like, vfio, vfio_iommu_type1,
* etc.) loaded.
*
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 3fb2188..fe9c704 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -70,6 +70,7 @@
#include <rte_cpuflags.h>
#include <rte_interrupts.h>
#include <rte_pci.h>
+#include <rte_dev.h>
#include <rte_devargs.h>
#include <rte_common.h>
#include <rte_version.h>
--
2.7.4
* Re: [dpdk-dev] rte_ether: Driver-specific stats getting overwritten
@ 2016-07-14 15:50 3% ` Remy Horton
0 siblings, 0 replies; 200+ results
From: Remy Horton @ 2016-07-14 15:50 UTC (permalink / raw)
To: Igor Ryzhov, Thomas Monjalon; +Cc: dev
On 14/07/2016 14:51, Igor Ryzhov wrote:
[..]
> How about deleting rx_nombuf from rte_eth_stats? Do you think this
> counter is necessary? It just shows enormous numbers in case of a
> lack of processing speed. But we already have the imissed counter, which
> shows the real number of packets dropped for the same reason.
Deleting it has API/ABI breakage issues. There is also a lack of
consistency between drivers as to what imissed includes, as some do not
implement it at all whereas others include filtered packets as well.
>> On 14 July 2016, at 16:37, Thomas Monjalon
>> <thomas.monjalon@6wind.com> wrote:
>>
[..]
>> Yes it is strange and has always been like that. Why not move the
>> assignment before calling the driver callback?
Think I'll do that. Easier than updating all the drivers that don't fill
it in.
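A sketch of that change (simplified; structure and callback names are
assumed from the 16.07-era rte_ethdev.c, error handling omitted)::

    int
    rte_eth_stats_get(uint8_t port_id, struct rte_eth_stats *stats)
    {
        struct rte_eth_dev *dev = &rte_eth_devices[port_id];

        memset(stats, 0, sizeof(*stats));
        /* Fill the EAL-maintained counter before the driver callback:
         * drivers that implement it can still override the value,
         * while drivers that ignore it no longer clobber it. */
        stats->rx_nombuf = dev->data->rx_mbuf_alloc_failed;
        (*dev->dev_ops->stats_get)(dev, stats);
        return 0;
    }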
* Re: [dpdk-dev] [PATCH v6 04/17] eal: remove duplicate function declaration
2016-07-12 6:01 3% ` [dpdk-dev] [PATCH v6 04/17] eal: remove duplicate function declaration Shreyansh Jain
@ 2016-07-14 17:13 0% ` viktorin
0 siblings, 0 replies; 200+ results
From: viktorin @ 2016-07-14 17:13 UTC (permalink / raw)
To: Shreyansh Jain; +Cc: dev, thomas.monjalon, david.marchand
On Tue, 12 Jul 2016 11:31:09 +0530
Shreyansh Jain <shreyansh.jain@nxp.com> wrote:
> rte_eal_dev_init has been declared in both eal_private.h and rte_dev.h since
> its introduction.
> This function has been exported in ABI, so remove it from eal_private.h
>
> Fixes: e57f20e05177 ("eal: make vdev init path generic for both virtual and pci devices")
> Signed-off-by: David Marchand <david.marchand@6wind.com>
> Signed-off-by: Shreyansh Jain <shreyansh.jain@nxp.com>
> ---
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>