DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH] cryptodev: add a simple mapping cache to speed up ops pool create
@ 2024-01-19 16:41 Andrew Boyer
  2024-02-06 14:24 ` [EXT] " Akhil Goyal
  2024-02-16 17:03 ` [PATCH v2] cryptodev: " Andrew Boyer
  0 siblings, 2 replies; 7+ messages in thread
From: Andrew Boyer @ 2024-01-19 16:41 UTC (permalink / raw)
  To: dev; +Cc: Akhil Goyal, Andrew Boyer

Cache the most recent VA -> PA mapping found so that we can skip
most of the system calls. With 4K pages this reduces pool create
time by about 90%.

Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>
---
 lib/cryptodev/rte_crypto.h    |  5 +++++
 lib/cryptodev/rte_cryptodev.c | 23 ++++++++++++++++++++++-
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/lib/cryptodev/rte_crypto.h b/lib/cryptodev/rte_crypto.h
index dbc2700da5..ee6aa1e40e 100644
--- a/lib/cryptodev/rte_crypto.h
+++ b/lib/cryptodev/rte_crypto.h
@@ -220,6 +220,11 @@ struct rte_crypto_op_pool_private {
 	/**< Crypto op pool type operation. */
 	uint16_t priv_size;
 	/**< Size of private area in each crypto operation. */
+
+	unsigned long vp_cache;
+	/* Virtual page address of previous op. */
+	rte_iova_t iovp_cache;
+	/* I/O virtual page address of previous op. */
 };
 
 
diff --git a/lib/cryptodev/rte_cryptodev.c b/lib/cryptodev/rte_cryptodev.c
index b233c0ecd7..d596f85a57 100644
--- a/lib/cryptodev/rte_cryptodev.c
+++ b/lib/cryptodev/rte_cryptodev.c
@@ -10,6 +10,7 @@
 #include <errno.h>
 #include <stdint.h>
 #include <inttypes.h>
+#include <unistd.h>
 
 #include <rte_log.h>
 #include <rte_debug.h>
@@ -2568,12 +2569,32 @@ rte_crypto_op_init(struct rte_mempool *mempool,
 {
 	struct rte_crypto_op *op = _op_data;
 	enum rte_crypto_op_type type = *(enum rte_crypto_op_type *)opaque_arg;
+	struct rte_crypto_op_pool_private *priv;
+	unsigned long virt_addr = (unsigned long)(uintptr_t)_op_data;
+#ifdef RTE_EXEC_ENV_WINDOWS
+	unsigned long page_mask = 4095;
+#else
+	unsigned long page_mask = sysconf(_SC_PAGESIZE) - 1;
+#endif
+	unsigned long virt_page = virt_addr & ~page_mask;
 
 	memset(_op_data, 0, mempool->elt_size);
 
 	__rte_crypto_op_reset(op, type);
 
-	op->phys_addr = rte_mem_virt2iova(_op_data);
+	priv = (struct rte_crypto_op_pool_private *)
+		rte_mempool_get_priv(mempool);
+
+	if (virt_page == priv->vp_cache) {
+		op->phys_addr = priv->iovp_cache + (virt_addr & page_mask);
+	} else {
+		op->phys_addr = rte_mem_virt2iova(_op_data);
+
+		/* Update cached values */
+		priv->vp_cache = virt_page;
+		priv->iovp_cache = op->phys_addr & ~page_mask;
+	}
+
 	op->mempool = mempool;
 }
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [EXT] [PATCH] cryptodev: add a simple mapping cache to speed up ops pool create
  2024-01-19 16:41 [PATCH] cryptodev: add a simple mapping cache to speed up ops pool create Andrew Boyer
@ 2024-02-06 14:24 ` Akhil Goyal
  2024-02-07  2:24   ` Morten Brørup
  2024-02-16 17:03 ` [PATCH v2] cryptodev: " Andrew Boyer
  1 sibling, 1 reply; 7+ messages in thread
From: Akhil Goyal @ 2024-02-06 14:24 UTC (permalink / raw)
  To: Andrew Boyer, dev, Morten Brørup, Andrew Rybchenko; +Cc: thomas


> Cache the most recent VA -> PA mapping found so that we can skip
> most of the system calls. With 4K pages this reduces pool create
> time by about 90%.
> 
> Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>

I believe there should be a generic solution for this in mempool
 if it is not there already.
Here, you are adding cache in mempool priv
which does not seem to be a correct place.
This optimization would be needed across all types of mempools.
Adding more people for comments.


> ---
>  lib/cryptodev/rte_crypto.h    |  5 +++++
>  lib/cryptodev/rte_cryptodev.c | 23 ++++++++++++++++++++++-
>  2 files changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/cryptodev/rte_crypto.h b/lib/cryptodev/rte_crypto.h
> index dbc2700da5..ee6aa1e40e 100644
> --- a/lib/cryptodev/rte_crypto.h
> +++ b/lib/cryptodev/rte_crypto.h
> @@ -220,6 +220,11 @@ struct rte_crypto_op_pool_private {
>  	/**< Crypto op pool type operation. */
>  	uint16_t priv_size;
>  	/**< Size of private area in each crypto operation. */
> +
> +	unsigned long vp_cache;
> +	/* Virtual page address of previous op. */
> +	rte_iova_t iovp_cache;
> +	/* I/O virtual page address of previous op. */
>  };
> 
> 
> diff --git a/lib/cryptodev/rte_cryptodev.c b/lib/cryptodev/rte_cryptodev.c
> index b233c0ecd7..d596f85a57 100644
> --- a/lib/cryptodev/rte_cryptodev.c
> +++ b/lib/cryptodev/rte_cryptodev.c
> @@ -10,6 +10,7 @@
>  #include <errno.h>
>  #include <stdint.h>
>  #include <inttypes.h>
> +#include <unistd.h>
> 
>  #include <rte_log.h>
>  #include <rte_debug.h>
> @@ -2568,12 +2569,32 @@ rte_crypto_op_init(struct rte_mempool *mempool,
>  {
>  	struct rte_crypto_op *op = _op_data;
>  	enum rte_crypto_op_type type = *(enum rte_crypto_op_type
> *)opaque_arg;
> +	struct rte_crypto_op_pool_private *priv;
> +	unsigned long virt_addr = (unsigned long)(uintptr_t)_op_data;
> +#ifdef RTE_EXEC_ENV_WINDOWS
> +	unsigned long page_mask = 4095;
> +#else
> +	unsigned long page_mask = sysconf(_SC_PAGESIZE) - 1;
> +#endif
> +	unsigned long virt_page = virt_addr & ~page_mask;
> 
>  	memset(_op_data, 0, mempool->elt_size);
> 
>  	__rte_crypto_op_reset(op, type);
> 
> -	op->phys_addr = rte_mem_virt2iova(_op_data);
> +	priv = (struct rte_crypto_op_pool_private *)
> +		rte_mempool_get_priv(mempool);
> +
> +	if (virt_page == priv->vp_cache) {
> +		op->phys_addr = priv->iovp_cache + (virt_addr & page_mask);
> +	} else {
> +		op->phys_addr = rte_mem_virt2iova(_op_data);
> +
> +		/* Update cached values */
> +		priv->vp_cache = virt_page;
> +		priv->iovp_cache = op->phys_addr & ~page_mask;
> +	}
> +
>  	op->mempool = mempool;
>  }
> 
> --
> 2.17.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [EXT] [PATCH] cryptodev: add a simple mapping cache to speed up ops pool create
  2024-02-06 14:24 ` [EXT] " Akhil Goyal
@ 2024-02-07  2:24   ` Morten Brørup
  2024-02-07  3:02     ` Boyer, Andrew
  0 siblings, 1 reply; 7+ messages in thread
From: Morten Brørup @ 2024-02-07  2:24 UTC (permalink / raw)
  To: Akhil Goyal, Andrew Boyer, dev, Andrew Rybchenko; +Cc: thomas, Fan Zhang

> From: Akhil Goyal [mailto:gakhil@marvell.com]
> Sent: Tuesday, 6 February 2024 15.25
> 
> > Cache the most recent VA -> PA mapping found so that we can skip
> > most of the system calls. With 4K pages this reduces pool create
> > time by about 90%.
> >
> > Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>
> 
> I believe there should be a generic solution for this in mempool
>  if it is not there already.
> Here, you are adding cache in mempool priv
> which does not seem to be a correct place.
> This optimization would be needed across all types of mempools.
> Adding more people for comments.
> 
> 
> > ---
> >  lib/cryptodev/rte_crypto.h    |  5 +++++
> >  lib/cryptodev/rte_cryptodev.c | 23 ++++++++++++++++++++++-
> >  2 files changed, 27 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/cryptodev/rte_crypto.h b/lib/cryptodev/rte_crypto.h
> > index dbc2700da5..ee6aa1e40e 100644
> > --- a/lib/cryptodev/rte_crypto.h
> > +++ b/lib/cryptodev/rte_crypto.h
> > @@ -220,6 +220,11 @@ struct rte_crypto_op_pool_private {
> >  	/**< Crypto op pool type operation. */
> >  	uint16_t priv_size;
> >  	/**< Size of private area in each crypto operation. */
> > +
> > +	unsigned long vp_cache;
> > +	/* Virtual page address of previous op. */
> > +	rte_iova_t iovp_cache;
> > +	/* I/O virtual page address of previous op. */
> >  };
> >
> >
> > diff --git a/lib/cryptodev/rte_cryptodev.c
> b/lib/cryptodev/rte_cryptodev.c
> > index b233c0ecd7..d596f85a57 100644
> > --- a/lib/cryptodev/rte_cryptodev.c
> > +++ b/lib/cryptodev/rte_cryptodev.c
> > @@ -10,6 +10,7 @@
> >  #include <errno.h>
> >  #include <stdint.h>
> >  #include <inttypes.h>
> > +#include <unistd.h>
> >
> >  #include <rte_log.h>
> >  #include <rte_debug.h>
> > @@ -2568,12 +2569,32 @@ rte_crypto_op_init(struct rte_mempool
> *mempool,
> >  {
> >  	struct rte_crypto_op *op = _op_data;
> >  	enum rte_crypto_op_type type = *(enum rte_crypto_op_type
> > *)opaque_arg;
> > +	struct rte_crypto_op_pool_private *priv;
> > +	unsigned long virt_addr = (unsigned long)(uintptr_t)_op_data;
> > +#ifdef RTE_EXEC_ENV_WINDOWS
> > +	unsigned long page_mask = 4095;
> > +#else
> > +	unsigned long page_mask = sysconf(_SC_PAGESIZE) - 1;
> > +#endif
> > +	unsigned long virt_page = virt_addr & ~page_mask;
> >
> >  	memset(_op_data, 0, mempool->elt_size);
> >
> >  	__rte_crypto_op_reset(op, type);
> >
> > -	op->phys_addr = rte_mem_virt2iova(_op_data);

This optimization is for rte_mem_virt2iova(_op_data) being slow.

If I'm not mistaken, _op_data is an object in a mempool, where the mempool object headers have already been initialized.

In this case, it could simply be optimized as this:
-	op->phys_addr = rte_mem_virt2iova(_op_data);
+	op->phys_addr = rte_mempool_virt2iova(_op_data);

Now going down a rat hole...

If the above is true, I wonder if struct rte_crypto_op is only instantiated as objects in mempools... If it is, the op->mempool and op->phys_addr fields are fundamentally redundant, and can be retrieved from the mempool object header instead:
op->phys_addr === rte_mempool_virt2iova(op)
op->mempool === rte_mempool_from_obj(op)

Having these shadow variables in struct rte_crypto_op may provide performance benefits when RTE_MEMPOOL_F_NO_CACHE_ALIGN is not set on the mempool, the mempool object header is in the preceding cache line of the mempool object (containing the struct rte_crypto_op op).

A better solution than these shadow variables would be to introduce another mempool flag to cache align the mempool object header, but let the mempool object itself directly follow the mempool object header, so the mempool object header and the mempool object itself (if small enough) reside in the same cache line. This would also use less memory.

> > +	priv = (struct rte_crypto_op_pool_private *)
> > +		rte_mempool_get_priv(mempool);
> > +
> > +	if (virt_page == priv->vp_cache) {
> > +		op->phys_addr = priv->iovp_cache + (virt_addr & page_mask);
> > +	} else {
> > +		op->phys_addr = rte_mem_virt2iova(_op_data);
> > +
> > +		/* Update cached values */
> > +		priv->vp_cache = virt_page;
> > +		priv->iovp_cache = op->phys_addr & ~page_mask;
> > +	}
> > +
> >  	op->mempool = mempool;
> >  }
> >
> > --
> > 2.17.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [EXT] [PATCH] cryptodev: add a simple mapping cache to speed up ops pool create
  2024-02-07  2:24   ` Morten Brørup
@ 2024-02-07  3:02     ` Boyer, Andrew
  0 siblings, 0 replies; 7+ messages in thread
From: Boyer, Andrew @ 2024-02-07  3:02 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Akhil Goyal, Boyer, Andrew, dev, Andrew Rybchenko, thomas, Fan Zhang

[-- Attachment #1: Type: text/plain, Size: 2916 bytes --]



On Feb 6, 2024, at 9:24 PM, Morten Brørup <mb@smartsharesystems.com> wrote:

Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.


From: Akhil Goyal [mailto:gakhil@marvell.com]
Sent: Tuesday, 6 February 2024 15.25

Cache the most recent VA -> PA mapping found so that we can skip
most of the system calls. With 4K pages this reduces pool create
time by about 90%.

Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>

I believe there should be a generic solution for this in mempool
if it is not there already.
Here, you are adding cache in mempool priv
which does not seem to be a correct place.
This optimization would be needed across all types of mempools.
Adding more people for comments.


---
lib/cryptodev/rte_crypto.h    |  5 +++++
lib/cryptodev/rte_cryptodev.c | 23 ++++++++++++++++++++++-
2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/lib/cryptodev/rte_crypto.h b/lib/cryptodev/rte_crypto.h
index dbc2700da5..ee6aa1e40e 100644
--- a/lib/cryptodev/rte_crypto.h
+++ b/lib/cryptodev/rte_crypto.h
@@ -220,6 +220,11 @@ struct rte_crypto_op_pool_private {
   /**< Crypto op pool type operation. */
   uint16_t priv_size;
   /**< Size of private area in each crypto operation. */
+
+   unsigned long vp_cache;
+   /* Virtual page address of previous op. */
+   rte_iova_t iovp_cache;
+   /* I/O virtual page address of previous op. */
};


diff --git a/lib/cryptodev/rte_cryptodev.c
b/lib/cryptodev/rte_cryptodev.c
index b233c0ecd7..d596f85a57 100644
--- a/lib/cryptodev/rte_cryptodev.c
+++ b/lib/cryptodev/rte_cryptodev.c
@@ -10,6 +10,7 @@
#include <errno.h>
#include <stdint.h>
#include <inttypes.h>
+#include <unistd.h>

#include <rte_log.h>
#include <rte_debug.h>
@@ -2568,12 +2569,32 @@ rte_crypto_op_init(struct rte_mempool
*mempool,
{
   struct rte_crypto_op *op = _op_data;
   enum rte_crypto_op_type type = *(enum rte_crypto_op_type
*)opaque_arg;
+   struct rte_crypto_op_pool_private *priv;
+   unsigned long virt_addr = (unsigned long)(uintptr_t)_op_data;
+#ifdef RTE_EXEC_ENV_WINDOWS
+   unsigned long page_mask = 4095;
+#else
+   unsigned long page_mask = sysconf(_SC_PAGESIZE) - 1;
+#endif
+   unsigned long virt_page = virt_addr & ~page_mask;

   memset(_op_data, 0, mempool->elt_size);

   __rte_crypto_op_reset(op, type);

-   op->phys_addr = rte_mem_virt2iova(_op_data);

This optimization is for rte_mem_virt2iova(_op_data) being slow.

If I'm not mistaken, _op_data is an object in a mempool, where the mempool object headers have already been initialized.

In this case, it could simply be optimized as this:
-       op->phys_addr = rte_mem_virt2iova(_op_data);
+       op->phys_addr = rte_mempool_virt2iova(_op_data);


That certainly is shorter! Thanks, I was not aware of this function.

-Andrew

[-- Attachment #2: Type: text/html, Size: 10621 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2] cryptodev: speed up ops pool create
  2024-01-19 16:41 [PATCH] cryptodev: add a simple mapping cache to speed up ops pool create Andrew Boyer
  2024-02-06 14:24 ` [EXT] " Akhil Goyal
@ 2024-02-16 17:03 ` Andrew Boyer
  2024-02-16 17:18   ` Morten Brørup
  2024-02-22  7:07   ` [EXT] " Akhil Goyal
  1 sibling, 2 replies; 7+ messages in thread
From: Andrew Boyer @ 2024-02-16 17:03 UTC (permalink / raw)
  To: dev; +Cc: Akhil Goyal, Andrew Boyer

Use rte_mempool_virt2iova(), which uses arithmetic based on the mempool
state, rather than rte_mem_virt2iova(), which uses syscalls to look at
the proc filesystem. This speeds up pool create by more than 90%.

Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>
---
 lib/cryptodev/rte_cryptodev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/cryptodev/rte_cryptodev.c b/lib/cryptodev/rte_cryptodev.c
index b233c0ecd7..886eb7adc4 100644
--- a/lib/cryptodev/rte_cryptodev.c
+++ b/lib/cryptodev/rte_cryptodev.c
@@ -2573,7 +2573,7 @@ rte_crypto_op_init(struct rte_mempool *mempool,
 
 	__rte_crypto_op_reset(op, type);
 
-	op->phys_addr = rte_mem_virt2iova(_op_data);
+	op->phys_addr = rte_mempool_virt2iova(_op_data);
 	op->mempool = mempool;
 }
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [PATCH v2] cryptodev: speed up ops pool create
  2024-02-16 17:03 ` [PATCH v2] cryptodev: " Andrew Boyer
@ 2024-02-16 17:18   ` Morten Brørup
  2024-02-22  7:07   ` [EXT] " Akhil Goyal
  1 sibling, 0 replies; 7+ messages in thread
From: Morten Brørup @ 2024-02-16 17:18 UTC (permalink / raw)
  To: Andrew Boyer, dev; +Cc: Akhil Goyal

> From: Andrew Boyer [mailto:andrew.boyer@amd.com]
> Sent: Friday, 16 February 2024 18.04
> 
> Use rte_mempool_virt2iova(), which uses arithmetic based on the mempool
> state, rather than rte_mem_virt2iova(), which uses syscalls to look at
> the proc filesystem. This speeds up pool create by more than 90%.
> 
> Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>
> ---
>  lib/cryptodev/rte_cryptodev.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/lib/cryptodev/rte_cryptodev.c
> b/lib/cryptodev/rte_cryptodev.c
> index b233c0ecd7..886eb7adc4 100644
> --- a/lib/cryptodev/rte_cryptodev.c
> +++ b/lib/cryptodev/rte_cryptodev.c
> @@ -2573,7 +2573,7 @@ rte_crypto_op_init(struct rte_mempool *mempool,
> 
>  	__rte_crypto_op_reset(op, type);
> 
> -	op->phys_addr = rte_mem_virt2iova(_op_data);
> +	op->phys_addr = rte_mempool_virt2iova(_op_data);
>  	op->mempool = mempool;
>  }
> 
> --
> 2.17.1

Reviewed-by: Morten Brørup <mb@smartsharesystems.com>


^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [EXT] [PATCH v2] cryptodev: speed up ops pool create
  2024-02-16 17:03 ` [PATCH v2] cryptodev: " Andrew Boyer
  2024-02-16 17:18   ` Morten Brørup
@ 2024-02-22  7:07   ` Akhil Goyal
  1 sibling, 0 replies; 7+ messages in thread
From: Akhil Goyal @ 2024-02-22  7:07 UTC (permalink / raw)
  To: Andrew Boyer, dev

> Use rte_mempool_virt2iova(), which uses arithmetic based on the mempool
> state, rather than rte_mem_virt2iova(), which uses syscalls to look at
> the proc filesystem. This speeds up pool create by more than 90%.
> 
> Signed-off-by: Andrew Boyer <andrew.boyer@amd.com>
> ---
Acked-by: Akhil Goyal <gakhil@marvell.com>

Applied to dpdk-next-crypto
Thanks.


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-02-22  7:08 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-19 16:41 [PATCH] cryptodev: add a simple mapping cache to speed up ops pool create Andrew Boyer
2024-02-06 14:24 ` [EXT] " Akhil Goyal
2024-02-07  2:24   ` Morten Brørup
2024-02-07  3:02     ` Boyer, Andrew
2024-02-16 17:03 ` [PATCH v2] cryptodev: " Andrew Boyer
2024-02-16 17:18   ` Morten Brørup
2024-02-22  7:07   ` [EXT] " Akhil Goyal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).