Acked-by: Kai Ji <kai.ji@intel.com>

From: Jack Bond-Preston <jack.bond-preston@foss.arm.com>
Sent: 07 June 2024 13:47
To: Ji, Kai <kai.ji@intel.com>
Cc: dev@dpdk.org <dev@dpdk.org>; Wathsala Vithanage <wathsala.vithanage@arm.com>
Subject: [PATCH v4 4/5] crypto/openssl: per-qp auth context clones
 
Currently EVP auth ctxs (e.g. EVP_MD_CTX, EVP_MAC_CTX) are allocated,
copied into (from the openssl_session's context), and then freed for every
auth operation (i.e. per packet). This is very inefficient, and avoidable.

Make each openssl_session hold an array of structures, containing
pointers to per-queue-pair cipher and auth context copies. These are
populated on first use by allocating a new context and copying from the
main context. These copies can then be used in a thread-safe manner by
different worker lcores simultaneously. Consequently the auth context
allocation and copy only has to happen once - the first time a given qp
uses an openssl_session. This brings about a large performance boost.

Throughput performance uplift measurements for HMAC-SHA1 generate
operations on an Ampere Altra Max platform:
1 worker lcore
|   buffer sz (B) |   prev (Gbps) |   optimised (Gbps) |   uplift |
|-----------------+---------------+--------------------+----------|
|              64 |          0.63 |               1.42 |   123.5% |
|             256 |          2.24 |               4.40 |    96.4% |
|            1024 |          6.15 |               9.26 |    50.6% |
|            2048 |          8.68 |              11.38 |    31.1% |
|            4096 |         10.92 |              12.84 |    17.6% |

8 worker lcores
|   buffer sz (B) |   prev (Gbps) |   optimised (Gbps) |   uplift |
|-----------------+---------------+--------------------+----------|
|              64 |          0.93 |              11.35 |  1122.5% |
|             256 |          3.70 |              35.30 |   853.7% |
|            1024 |         15.22 |              74.27 |   387.8% |
|            2048 |         30.20 |              91.08 |   201.6% |
|            4096 |         56.92 |             102.76 |    80.5% |

Signed-off-by: Jack Bond-Preston <jack.bond-preston@foss.arm.com>
Reviewed-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
---
 drivers/crypto/openssl/compat.h              |  26 +++
 drivers/crypto/openssl/openssl_pmd_private.h |  25 ++-
 drivers/crypto/openssl/rte_openssl_pmd.c     | 176 +++++++++++++++----
 drivers/crypto/openssl/rte_openssl_pmd_ops.c |   7 +-
 4 files changed, 193 insertions(+), 41 deletions(-)

diff --git a/drivers/crypto/openssl/compat.h b/drivers/crypto/openssl/compat.h
index 9f9167c4f1..4c5ddfbf3a 100644
--- a/drivers/crypto/openssl/compat.h
+++ b/drivers/crypto/openssl/compat.h
@@ -5,6 +5,32 @@
 #ifndef __RTA_COMPAT_H__
 #define __RTA_COMPAT_H__
 
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+static __rte_always_inline void
+free_hmac_ctx(EVP_MAC_CTX *ctx)
+{
+       EVP_MAC_CTX_free(ctx);
+}
+
+static __rte_always_inline void
+free_cmac_ctx(EVP_MAC_CTX *ctx)
+{
+       EVP_MAC_CTX_free(ctx);
+}
+#else
+static __rte_always_inline void
+free_hmac_ctx(HMAC_CTX *ctx)
+{
+       HMAC_CTX_free(ctx);
+}
+
+static __rte_always_inline void
+free_cmac_ctx(CMAC_CTX *ctx)
+{
+       CMAC_CTX_free(ctx);
+}
+#endif
+
 #if (OPENSSL_VERSION_NUMBER < 0x10100000L)
 
 static __rte_always_inline int
diff --git a/drivers/crypto/openssl/openssl_pmd_private.h b/drivers/crypto/openssl/openssl_pmd_private.h
index bad7dcf2f5..a50e4d4918 100644
--- a/drivers/crypto/openssl/openssl_pmd_private.h
+++ b/drivers/crypto/openssl/openssl_pmd_private.h
@@ -80,6 +80,20 @@ struct __rte_cache_aligned openssl_qp {
          */
 };
 
+struct evp_ctx_pair {
+       EVP_CIPHER_CTX *cipher;
+       union {
+               EVP_MD_CTX *auth;
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+               EVP_MAC_CTX *hmac;
+               EVP_MAC_CTX *cmac;
+#else
+               HMAC_CTX *hmac;
+               CMAC_CTX *cmac;
+#endif
+       };
+};
+
 /** OPENSSL crypto private session structure */
 struct __rte_cache_aligned openssl_session {
         enum openssl_chain_order chain_order;
@@ -168,11 +182,12 @@ struct __rte_cache_aligned openssl_session {
 
         uint16_t ctx_copies_len;
         /* < number of entries in ctx_copies */
-       EVP_CIPHER_CTX *qp_ctx[];
-       /**< Flexible array member of per-queue-pair pointers to copies of EVP
-        * context structure. Cipher contexts are not safe to use from multiple
-        * cores simultaneously, so maintaining these copies allows avoiding
-        * per-buffer copying into a temporary context.
+       struct evp_ctx_pair qp_ctx[];
+       /**< Flexible array member of per-queue-pair structures, each containing
+        * pointers to copies of the cipher and auth EVP contexts. Cipher
+        * contexts are not safe to use from multiple cores simultaneously, so
+        * maintaining these copies allows avoiding per-buffer copying into a
+        * temporary context.
          */
 };
 
diff --git a/drivers/crypto/openssl/rte_openssl_pmd.c b/drivers/crypto/openssl/rte_openssl_pmd.c
index df44cc097e..7e2e505222 100644
--- a/drivers/crypto/openssl/rte_openssl_pmd.c
+++ b/drivers/crypto/openssl/rte_openssl_pmd.c
@@ -892,40 +892,45 @@ openssl_set_session_parameters(struct openssl_session *sess,
 void
 openssl_reset_session(struct openssl_session *sess)
 {
+       /* Free all the qp_ctx entries. */
         for (uint16_t i = 0; i < sess->ctx_copies_len; i++) {
-               if (sess->qp_ctx[i] != NULL) {
-                       EVP_CIPHER_CTX_free(sess->qp_ctx[i]);
-                       sess->qp_ctx[i] = NULL;
+               if (sess->qp_ctx[i].cipher != NULL) {
+                       EVP_CIPHER_CTX_free(sess->qp_ctx[i].cipher);
+                       sess->qp_ctx[i].cipher = NULL;
+               }
+
+               switch (sess->auth.mode) {
+               case OPENSSL_AUTH_AS_AUTH:
+                       EVP_MD_CTX_destroy(sess->qp_ctx[i].auth);
+                       sess->qp_ctx[i].auth = NULL;
+                       break;
+               case OPENSSL_AUTH_AS_HMAC:
+                       free_hmac_ctx(sess->qp_ctx[i].hmac);
+                       sess->qp_ctx[i].hmac = NULL;
+                       break;
+               case OPENSSL_AUTH_AS_CMAC:
+                       free_cmac_ctx(sess->qp_ctx[i].cmac);
+                       sess->qp_ctx[i].cmac = NULL;
+                       break;
                 }
         }
 
         EVP_CIPHER_CTX_free(sess->cipher.ctx);
 
-       if (sess->chain_order == OPENSSL_CHAIN_CIPHER_BPI)
-               EVP_CIPHER_CTX_free(sess->cipher.bpi_ctx);
-
         switch (sess->auth.mode) {
         case OPENSSL_AUTH_AS_AUTH:
                 EVP_MD_CTX_destroy(sess->auth.auth.ctx);
                 break;
         case OPENSSL_AUTH_AS_HMAC:
-               EVP_PKEY_free(sess->auth.hmac.pkey);
-# if OPENSSL_VERSION_NUMBER >= 0x30000000L
-               EVP_MAC_CTX_free(sess->auth.hmac.ctx);
-# else
-               HMAC_CTX_free(sess->auth.hmac.ctx);
-# endif
+               free_hmac_ctx(sess->auth.hmac.ctx);
                 break;
         case OPENSSL_AUTH_AS_CMAC:
-# if OPENSSL_VERSION_NUMBER >= 0x30000000L
-               EVP_MAC_CTX_free(sess->auth.cmac.ctx);
-# else
-               CMAC_CTX_free(sess->auth.cmac.ctx);
-# endif
-               break;
-       default:
+               free_cmac_ctx(sess->auth.cmac.ctx);
                 break;
         }
+
+       if (sess->chain_order == OPENSSL_CHAIN_CIPHER_BPI)
+               EVP_CIPHER_CTX_free(sess->cipher.bpi_ctx);
 }
 
 /** Provide session for operation */
@@ -1471,6 +1476,9 @@ process_openssl_auth_mac(struct rte_mbuf *mbuf_src, uint8_t *dst, int offset,
         if (m == 0)
                 goto process_auth_err;
 
+       if (EVP_MAC_init(ctx, NULL, 0, NULL) <= 0)
+               goto process_auth_err;
+
         src = rte_pktmbuf_mtod_offset(m, uint8_t *, offset);
 
         l = rte_pktmbuf_data_len(m) - offset;
@@ -1497,11 +1505,9 @@ process_openssl_auth_mac(struct rte_mbuf *mbuf_src, uint8_t *dst, int offset,
         if (EVP_MAC_final(ctx, dst, &dstlen, DIGEST_LENGTH_MAX) != 1)
                 goto process_auth_err;
 
-       EVP_MAC_CTX_free(ctx);
         return 0;
 
 process_auth_err:
-       EVP_MAC_CTX_free(ctx);
         OPENSSL_LOG(ERR, "Process openssl auth failed");
         return -EINVAL;
 }
@@ -1620,7 +1626,7 @@ get_local_cipher_ctx(struct openssl_session *sess, struct openssl_qp *qp)
         if (sess->ctx_copies_len == 0)
                 return sess->cipher.ctx;
 
-       EVP_CIPHER_CTX **lctx = &sess->qp_ctx[qp->id];
+       EVP_CIPHER_CTX **lctx = &sess->qp_ctx[qp->id].cipher;
 
         if (unlikely(*lctx == NULL)) {
 #if OPENSSL_VERSION_NUMBER >= 0x30200000L
@@ -1647,6 +1653,112 @@ get_local_cipher_ctx(struct openssl_session *sess, struct openssl_qp *qp)
         return *lctx;
 }
 
+static inline EVP_MD_CTX *
+get_local_auth_ctx(struct openssl_session *sess, struct openssl_qp *qp)
+{
+       /* If the array is not being used, just return the main context. */
+       if (sess->ctx_copies_len == 0)
+               return sess->auth.auth.ctx;
+
+       EVP_MD_CTX **lctx = &sess->qp_ctx[qp->id].auth;
+
+       if (unlikely(*lctx == NULL)) {
+#if OPENSSL_VERSION_NUMBER >= 0x30100000L
+               /* EVP_MD_CTX_dup() added in OSSL 3.1 */
+               *lctx = EVP_MD_CTX_dup(sess->auth.auth.ctx);
+#else
+               *lctx = EVP_MD_CTX_new();
+               EVP_MD_CTX_copy(*lctx, sess->auth.auth.ctx);
+#endif
+       }
+
+       return *lctx;
+}
+
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+static inline EVP_MAC_CTX *
+#else
+static inline HMAC_CTX *
+#endif
+get_local_hmac_ctx(struct openssl_session *sess, struct openssl_qp *qp)
+{
+#if (OPENSSL_VERSION_NUMBER >= 0x30000000L && OPENSSL_VERSION_NUMBER < 0x30003000L)
+       /* For OpenSSL versions 3.0.0 <= v < 3.0.3, re-initing of
+        * EVP_MAC_CTXs is broken, and doesn't actually reset their
+        * state. This was fixed in OSSL commit c9ddc5af5199 ("Avoid
+        * undefined behavior of provided macs on EVP_MAC
+        * reinitialization"). In cases where the fix is not present,
+        * fall back to duplicating the context every buffer as a
+        * workaround, at the cost of performance.
+        */
+       RTE_SET_USED(qp);
+       return EVP_MAC_CTX_dup(sess->auth.hmac.ctx);
+#else
+       if (sess->ctx_copies_len == 0)
+               return sess->auth.hmac.ctx;
+
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+       EVP_MAC_CTX **lctx =
+#else
+       HMAC_CTX **lctx =
+#endif
+               &sess->qp_ctx[qp->id].hmac;
+
+       if (unlikely(*lctx == NULL)) {
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+               *lctx = EVP_MAC_CTX_dup(sess->auth.hmac.ctx);
+#else
+               *lctx = HMAC_CTX_new();
+               HMAC_CTX_copy(*lctx, sess->auth.hmac.ctx);
+#endif
+       }
+
+       return *lctx;
+#endif
+}
+
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+static inline EVP_MAC_CTX *
+#else
+static inline CMAC_CTX *
+#endif
+get_local_cmac_ctx(struct openssl_session *sess, struct openssl_qp *qp)
+{
+#if (OPENSSL_VERSION_NUMBER >= 0x30000000L && OPENSSL_VERSION_NUMBER < 0x30003000L)
+       /* For OpenSSL versions 3.0.0 <= v < 3.0.3, re-initing of
+        * EVP_MAC_CTXs is broken, and doesn't actually reset their
+        * state. This was fixed in OSSL commit c9ddc5af5199 ("Avoid
+        * undefined behavior of provided macs on EVP_MAC
+        * reinitialization"). In cases where the fix is not present,
+        * fall back to duplicating the context every buffer as a
+        * workaround, at the cost of performance.
+        */
+       RTE_SET_USED(qp);
+       return EVP_MAC_CTX_dup(sess->auth.cmac.ctx);
+#else
+       if (sess->ctx_copies_len == 0)
+               return sess->auth.cmac.ctx;
+
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+       EVP_MAC_CTX **lctx =
+#else
+       CMAC_CTX **lctx =
+#endif
+               &sess->qp_ctx[qp->id].cmac;
+
+       if (unlikely(*lctx == NULL)) {
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+               *lctx = EVP_MAC_CTX_dup(sess->auth.cmac.ctx);
+#else
+               *lctx = CMAC_CTX_new();
+               CMAC_CTX_copy(*lctx, sess->auth.cmac.ctx);
+#endif
+       }
+
+       return *lctx;
+#endif
+}
+
 /** Process auth/cipher combined operation */
 static void
 process_openssl_combined_op(struct openssl_qp *qp, struct rte_crypto_op *op,
@@ -1895,42 +2007,40 @@ process_openssl_auth_op(struct openssl_qp *qp, struct rte_crypto_op *op,
 
         switch (sess->auth.mode) {
         case OPENSSL_AUTH_AS_AUTH:
-               ctx_a = EVP_MD_CTX_create();
-               EVP_MD_CTX_copy_ex(ctx_a, sess->auth.auth.ctx);
+               ctx_a = get_local_auth_ctx(sess, qp);
                 status = process_openssl_auth(mbuf_src, dst,
                                 op->sym->auth.data.offset, NULL, NULL, srclen,
                                 ctx_a, sess->auth.auth.evp_algo);
-               EVP_MD_CTX_destroy(ctx_a);
                 break;
         case OPENSSL_AUTH_AS_HMAC:
+               ctx_h = get_local_hmac_ctx(sess, qp);
 # if OPENSSL_VERSION_NUMBER >= 0x30000000L
-               ctx_h = EVP_MAC_CTX_dup(sess->auth.hmac.ctx);
                 status = process_openssl_auth_mac(mbuf_src, dst,
                                 op->sym->auth.data.offset, srclen,
                                 ctx_h);
 # else
-               ctx_h = HMAC_CTX_new();
-               HMAC_CTX_copy(ctx_h, sess->auth.hmac.ctx);
                 status = process_openssl_auth_hmac(mbuf_src, dst,
                                 op->sym->auth.data.offset, srclen,
                                 ctx_h);
-               HMAC_CTX_free(ctx_h);
 # endif
+#if (OPENSSL_VERSION_NUMBER >= 0x30000000L && OPENSSL_VERSION_NUMBER < 0x30003000L)
+               EVP_MAC_CTX_free(ctx_h);
+#endif
                 break;
         case OPENSSL_AUTH_AS_CMAC:
+               ctx_c = get_local_cmac_ctx(sess, qp);
 # if OPENSSL_VERSION_NUMBER >= 0x30000000L
-               ctx_c = EVP_MAC_CTX_dup(sess->auth.cmac.ctx);
                 status = process_openssl_auth_mac(mbuf_src, dst,
                                 op->sym->auth.data.offset, srclen,
                                 ctx_c);
 # else
-               ctx_c = CMAC_CTX_new();
-               CMAC_CTX_copy(ctx_c, sess->auth.cmac.ctx);
                 status = process_openssl_auth_cmac(mbuf_src, dst,
                                 op->sym->auth.data.offset, srclen,
                                 ctx_c);
-               CMAC_CTX_free(ctx_c);
 # endif
+#if (OPENSSL_VERSION_NUMBER >= 0x30000000L && OPENSSL_VERSION_NUMBER < 0x30003000L)
+               EVP_MAC_CTX_free(ctx_c);
+#endif
                 break;
         default:
                 status = -1;
diff --git a/drivers/crypto/openssl/rte_openssl_pmd_ops.c b/drivers/crypto/openssl/rte_openssl_pmd_ops.c
index 4209c6ab6f..1bbb855a59 100644
--- a/drivers/crypto/openssl/rte_openssl_pmd_ops.c
+++ b/drivers/crypto/openssl/rte_openssl_pmd_ops.c
@@ -805,7 +805,7 @@ openssl_pmd_sym_session_get_size(struct rte_cryptodev *dev)
                 unsigned int max_nb_qps = ((struct openssl_private *)
                                 dev->data->dev_private)->max_nb_qpairs;
                 return sizeof(struct openssl_session) +
-                               (sizeof(void *) * max_nb_qps);
+                               (sizeof(struct evp_ctx_pair) * max_nb_qps);
         }
 
         /*
@@ -818,10 +818,11 @@ openssl_pmd_sym_session_get_size(struct rte_cryptodev *dev)
 
         /*
          * Otherwise, the size of the flexible array member should be enough to
-        * fit pointers to per-qp contexts.
+        * fit pointers to per-qp contexts. This is twice the number of queue
+        * pairs, to allow for auth and cipher contexts.
          */
         return sizeof(struct openssl_session) +
-               (sizeof(void *) * dev->data->nb_queue_pairs);
+               (sizeof(struct evp_ctx_pair) * dev->data->nb_queue_pairs);
 }
 
 /** Returns the size of the asymmetric session structure */
--
2.34.1