From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from dpdk.org (dpdk.org [92.243.14.124]) by dpdk.space (Postfix) with ESMTP id C641DA045E for ; Mon, 27 May 2019 20:45:23 +0200 (CEST) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id 9CB4E1B946; Mon, 27 May 2019 20:45:19 +0200 (CEST) Received: from mga17.intel.com (mga17.intel.com [192.55.52.151]) by dpdk.org (Postfix) with ESMTP id 344571B945 for ; Mon, 27 May 2019 20:45:17 +0200 (CEST) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga005.fm.intel.com ([10.253.24.32]) by fmsmga107.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 27 May 2019 11:45:16 -0700 X-ExtLoop1: 1 Received: from sivswdev08.ir.intel.com ([10.237.217.47]) by fmsmga005.fm.intel.com with ESMTP; 27 May 2019 11:45:15 -0700 From: Konstantin Ananyev To: dev@dpdk.org Cc: akhil.goyal@nxp.com, Konstantin Ananyev Date: Mon, 27 May 2019 19:44:47 +0100 Message-Id: <20190527184448.21264-3-konstantin.ananyev@intel.com> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20190527184448.21264-1-konstantin.ananyev@intel.com> References: <20190527184448.21264-1-konstantin.ananyev@intel.com> Subject: [dpdk-dev] [PATCH 2/3] examples/ipsec-secgw: support packet fragmentation and reassembly X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Add optional ability to fragment packet bigger then mtu, and reassemble fragmented packet. To minimize possible performance effect, reassembly is implemented as RX callback. To support these features ipsec-secgw relies on librte_ipsec ability to handle multi-segment packets. Also when reassemble/fragmentation support is enabled, attached crypto devices have to support 'In Place SGL' offload capability. To enable/disable this functionality, two new optional command-line options are introduced: --reassemble - number of entries in reassemble table --mtu - MTU value for all attached ports As separate '--mtu' option is introduced, '-j ' option is now used to specify mbuf data buffer size only. Signed-off-by: Konstantin Ananyev --- examples/ipsec-secgw/ipsec-secgw.c | 344 ++++++++++++++++++++++++++--- examples/ipsec-secgw/ipsec.h | 1 + examples/ipsec-secgw/meson.build | 2 +- 3 files changed, 316 insertions(+), 31 deletions(-) diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c index 1d1855f50..87c417ca8 100644 --- a/examples/ipsec-secgw/ipsec-secgw.c +++ b/examples/ipsec-secgw/ipsec-secgw.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "ipsec.h" #include "parser.h" @@ -109,6 +110,11 @@ static uint16_t nb_txd = IPSEC_SECGW_TX_DESC_DEFAULT; (addr)->addr_bytes[4], (addr)->addr_bytes[5], \ 0, 0) +#define FRAG_TBL_BUCKET_ENTRIES 4 +#define FRAG_TTL_MS (10 * MS_PER_S) + +#define MTU_TO_FRAMELEN(x) ((x) + ETHER_HDR_LEN + ETHER_CRC_LEN) + /* port/source ethernet addr and destination ethernet addr */ struct ethaddr_info { uint64_t src, dst; @@ -126,6 +132,8 @@ struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS] = { #define CMD_LINE_OPT_CRYPTODEV_MASK "cryptodev_mask" #define CMD_LINE_OPT_RX_OFFLOAD "rxoffload" #define CMD_LINE_OPT_TX_OFFLOAD "txoffload" +#define CMD_LINE_OPT_REASSEMBLE "reassemble" +#define CMD_LINE_OPT_MTU "mtu" enum { /* long options mapped to a short option */ @@ -139,6 +147,8 @@ enum { CMD_LINE_OPT_CRYPTODEV_MASK_NUM, CMD_LINE_OPT_RX_OFFLOAD_NUM, CMD_LINE_OPT_TX_OFFLOAD_NUM, + CMD_LINE_OPT_REASSEMBLE_NUM, + CMD_LINE_OPT_MTU_NUM, }; static const struct option lgopts[] = { @@ -147,6 +157,7 @@ static const struct option lgopts[] = { {CMD_LINE_OPT_CRYPTODEV_MASK, 1, 0, CMD_LINE_OPT_CRYPTODEV_MASK_NUM}, {CMD_LINE_OPT_RX_OFFLOAD, 1, 0, CMD_LINE_OPT_RX_OFFLOAD_NUM}, {CMD_LINE_OPT_TX_OFFLOAD, 1, 0, CMD_LINE_OPT_TX_OFFLOAD_NUM}, + {CMD_LINE_OPT_REASSEMBLE, 1, 0, CMD_LINE_OPT_REASSEMBLE_NUM}, {NULL, 0, 0, 0} }; @@ -159,7 +170,6 @@ static int32_t numa_on = 1; /**< NUMA is enabled by default. */ static uint32_t nb_lcores; static uint32_t single_sa; static uint32_t single_sa_idx; -static uint32_t frame_size; /* * RX/TX HW offload capabilities to enable/use on ethernet ports. @@ -168,6 +178,13 @@ static uint32_t frame_size; static uint64_t dev_rx_offload = UINT64_MAX; static uint64_t dev_tx_offload = UINT64_MAX; +/* + * global values that determine multi-seg policy + */ +static uint32_t frag_tbl_sz; +static uint32_t frame_buf_size = RTE_MBUF_DEFAULT_BUF_SIZE; +static uint32_t mtu_size = ETHER_MTU; + /* application wide librte_ipsec/SA parameters */ struct app_sa_prm app_sa_prm = {.enable = 0}; @@ -204,6 +221,12 @@ struct lcore_conf { struct ipsec_ctx outbound; struct rt_ctx *rt4_ctx; struct rt_ctx *rt6_ctx; + struct { + struct rte_ip_frag_tbl *tbl; + struct rte_mempool *pool_dir; + struct rte_mempool *pool_indir; + struct rte_ip_frag_death_row dr; + } frag; } __rte_cache_aligned; static struct lcore_conf lcore_conf[RTE_MAX_LCORE]; @@ -229,6 +252,18 @@ static struct rte_eth_conf port_conf = { static struct socket_ctx socket_ctx[NB_SOCKETS]; +/* + * Determine is multi-segment support required: + * - either frame buffer size is smaller then mtu + * - or reassmeble support is requested + */ +static int +multi_seg_required(void) +{ + return (MTU_TO_FRAMELEN(mtu_size) + RTE_PKTMBUF_HEADROOM > + frame_buf_size || frag_tbl_sz != 0); +} + static inline void adjust_ipv4_pktlen(struct rte_mbuf *m, const struct ipv4_hdr *iph, uint32_t l2_len) @@ -429,9 +464,52 @@ send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port) return 0; } +/* + * Helper function to fragment and queue for TX one packet. + */ +static inline uint32_t +send_fragment_packet(struct lcore_conf *qconf, struct rte_mbuf *m, + uint16_t port, uint8_t proto) +{ + struct buffer *tbl; + uint32_t len, n; + int32_t rc; + + tbl = qconf->tx_mbufs + port; + len = tbl->len; + + /* free space for new fragments */ + if (len + RTE_LIBRTE_IP_FRAG_MAX_FRAG >= RTE_DIM(tbl->m_table)) { + send_burst(qconf, len, port); + len = 0; + } + + n = RTE_DIM(tbl->m_table) - len; + + if (proto == IPPROTO_IP) + rc = rte_ipv4_fragment_packet(m, tbl->m_table + len, + n, mtu_size, qconf->frag.pool_dir, + qconf->frag.pool_indir); + else + rc = rte_ipv6_fragment_packet(m, tbl->m_table + len, + n, mtu_size, qconf->frag.pool_dir, + qconf->frag.pool_indir); + + if (rc >= 0) + len += rc; + else + RTE_LOG(ERR, IPSEC, + "%s: failed to fragment packet with size %u, " + "error code: %d\n", + __func__, m->pkt_len, rte_errno); + + rte_pktmbuf_free(m); + return len; +} + /* Enqueue a single packet, and send burst if queue is filled */ static inline int32_t -send_single_packet(struct rte_mbuf *m, uint16_t port) +send_single_packet(struct rte_mbuf *m, uint16_t port, uint8_t proto) { uint32_t lcore_id; uint16_t len; @@ -441,8 +519,14 @@ send_single_packet(struct rte_mbuf *m, uint16_t port) qconf = &lcore_conf[lcore_id]; len = qconf->tx_mbufs[port].len; - qconf->tx_mbufs[port].m_table[len] = m; - len++; + + if (m->pkt_len <= mtu_size) { + qconf->tx_mbufs[port].m_table[len] = m; + len++; + + /* need to fragment the packet */ + } else + len = send_fragment_packet(qconf, m, port, proto); /* enough pkts to be sent */ if (unlikely(len == MAX_PKT_BURST)) { @@ -796,7 +880,7 @@ route4_pkts(struct rt_ctx *rt_ctx, struct rte_mbuf *pkts[], uint8_t nb_pkts) rte_pktmbuf_free(pkts[i]); continue; } - send_single_packet(pkts[i], pkt_hop & 0xff); + send_single_packet(pkts[i], pkt_hop & 0xff, IPPROTO_IP); } } @@ -848,7 +932,7 @@ route6_pkts(struct rt_ctx *rt_ctx, struct rte_mbuf *pkts[], uint8_t nb_pkts) rte_pktmbuf_free(pkts[i]); continue; } - send_single_packet(pkts[i], pkt_hop & 0xff); + send_single_packet(pkts[i], pkt_hop & 0xff, IPPROTO_IPV6); } } @@ -1014,6 +1098,8 @@ main_loop(__attribute__((unused)) void *dummy) qconf->outbound.session_pool = socket_ctx[socket_id].session_pool; qconf->outbound.session_priv_pool = socket_ctx[socket_id].session_priv_pool; + qconf->frag.pool_dir = socket_ctx[socket_id].mbuf_pool; + qconf->frag.pool_indir = socket_ctx[socket_id].mbuf_pool_indir; if (qconf->nb_rx_queue == 0) { RTE_LOG(DEBUG, IPSEC, "lcore %u has nothing to do\n", @@ -1160,12 +1246,14 @@ print_usage(const char *prgname) " [--cryptodev_mask MASK]" " [--" CMD_LINE_OPT_RX_OFFLOAD " RX_OFFLOAD_MASK]" " [--" CMD_LINE_OPT_TX_OFFLOAD " TX_OFFLOAD_MASK]" + " [--" CMD_LINE_OPT_REASSEMBLE " REASSEMBLE_TABLE_SIZE]" + " [--" CMD_LINE_OPT_MTU " MTU]" "\n\n" " -p PORTMASK: Hexadecimal bitmask of ports to configure\n" " -P : Enable promiscuous mode\n" " -u PORTMASK: Hexadecimal bitmask of unprotected ports\n" - " -j FRAMESIZE: Enable jumbo frame with 'FRAMESIZE' as maximum\n" - " packet size\n" + " -j FRAMESIZE: Data buffer size, minimum (and default)\n" + " value: RTE_MBUF_DEFAULT_BUF_SIZE\n" " -l enables code-path that uses librte_ipsec\n" " -w REPLAY_WINDOW_SIZE specifies IPsec SQN replay window\n" " size for each SA\n" @@ -1183,6 +1271,13 @@ print_usage(const char *prgname) " --" CMD_LINE_OPT_TX_OFFLOAD ": bitmask of the TX HW offload capabilities to enable/use\n" " (DEV_TX_OFFLOAD_*)\n" + " --" CMD_LINE_OPT_REASSEMBLE " NUM" + ": max number of entries in reassemble(fragment) table\n" + " (zero (default value) disables reassembly)\n" + " --" CMD_LINE_OPT_MTU " MTU" + ": MTU value on all ports (default value: 1500)\n" + " outgoing packets with bigger size will be fragmented\n" + " inicoming packets with bigger size will be discarded\n" "\n", prgname); } @@ -1353,21 +1448,16 @@ parse_args(int32_t argc, char **argv) f_present = 1; break; case 'j': - { - int32_t size = parse_decimal(optarg); - if (size <= 1518) { - printf("Invalid jumbo frame size\n"); - if (size < 0) { - print_usage(prgname); - return -1; - } - printf("Using default value 9000\n"); - frame_size = 9000; - } else { - frame_size = size; - } + ret = parse_decimal(optarg); + if (ret < RTE_MBUF_DEFAULT_BUF_SIZE || + ret > UINT16_MAX) { + printf("Invalid frame buffer size value: %s\n", + optarg); + print_usage(prgname); + return -1; } - printf("Enabled jumbo frames size %u\n", frame_size); + frame_buf_size = ret; + printf("Custom frame buffer size %u\n", frame_buf_size); break; case 'l': app_sa_prm.enable = 1; @@ -1435,6 +1525,26 @@ parse_args(int32_t argc, char **argv) return -1; } break; + case CMD_LINE_OPT_REASSEMBLE_NUM: + ret = parse_decimal(optarg); + if (ret < 0) { + printf("Invalid argument for \'%s\': %s\n", + CMD_LINE_OPT_REASSEMBLE, optarg); + print_usage(prgname); + return -1; + } + frag_tbl_sz = ret; + break; + case CMD_LINE_OPT_MTU_NUM: + ret = parse_decimal(optarg); + if (ret < 0 || ret > IPV4_MAX_PKT_LEN) { + printf("Invalid argument for \'%s\': %s\n", + CMD_LINE_OPT_MTU, optarg); + print_usage(prgname); + return -1; + } + mtu_size = ret; + break; default: print_usage(prgname); return -1; @@ -1446,6 +1556,16 @@ parse_args(int32_t argc, char **argv) return -1; } + /* check do we need to enable multi-seg support */ + if (multi_seg_required()) { + /* legacy mode doesn't support multi-seg */ + app_sa_prm.enable = 1; + printf("frame buf size: %u, mtu: %u, " + "number of reassemble entries: %u\n" + "multi-segment support is required\n", + frame_buf_size, mtu_size, frag_tbl_sz); + } + print_app_sa_prm(&app_sa_prm); if (optind >= 0) @@ -1663,6 +1783,9 @@ cryptodevs_init(void) int16_t cdev_id, port_id; struct rte_hash_parameters params = { 0 }; + const uint64_t mseg_flag = multi_seg_required() ? + RTE_CRYPTODEV_FF_IN_PLACE_SGL : 0; + params.entries = CDEV_MAP_ENTRIES; params.key_len = sizeof(struct cdev_key); params.hash_func = rte_jhash; @@ -1731,6 +1854,12 @@ cryptodevs_init(void) rte_cryptodev_info_get(cdev_id, &cdev_info); + if ((mseg_flag & cdev_info.feature_flags) != mseg_flag) + rte_exit(EXIT_FAILURE, + "Device %hd does not support \'%s\' feature\n", + cdev_id, + rte_cryptodev_get_feature_name(mseg_flag)); + if (nb_lcore_params > cdev_info.max_nb_queue_pairs) max_nb_qps = cdev_info.max_nb_queue_pairs; else @@ -1859,6 +1988,7 @@ cryptodevs_init(void) static void port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads) { + uint32_t frame_size; struct rte_eth_dev_info dev_info; struct rte_eth_txconf *txconf; uint16_t nb_tx_queue, nb_rx_queue; @@ -1897,9 +2027,14 @@ port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads) printf("Creating queues: nb_rx_queue=%d nb_tx_queue=%u...\n", nb_rx_queue, nb_tx_queue); - if (frame_size) { - local_port_conf.rxmode.max_rx_pkt_len = frame_size; + frame_size = MTU_TO_FRAMELEN(mtu_size); + if (frame_size > local_port_conf.rxmode.max_rx_pkt_len) local_port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME; + local_port_conf.rxmode.max_rx_pkt_len = frame_size; + + if (multi_seg_required()) { + local_port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_SCATTER; + local_port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS; } local_port_conf.rxmode.offloads |= req_rx_offloads; @@ -2020,16 +2155,25 @@ static void pool_init(struct socket_ctx *ctx, int32_t socket_id, uint32_t nb_mbuf) { char s[64]; - uint32_t buff_size = frame_size ? (frame_size + RTE_PKTMBUF_HEADROOM) : - RTE_MBUF_DEFAULT_BUF_SIZE; - + int32_t ms; snprintf(s, sizeof(s), "mbuf_pool_%d", socket_id); ctx->mbuf_pool = rte_pktmbuf_pool_create(s, nb_mbuf, MEMPOOL_CACHE_SIZE, ipsec_metadata_size(), - buff_size, - socket_id); - if (ctx->mbuf_pool == NULL) + frame_buf_size, socket_id); + + /* + * if multi-segment support is enabled, then create a pool + * for indirect mbufs. + */ + ms = multi_seg_required(); + if (ms != 0) { + snprintf(s, sizeof(s), "mbuf_pool_indir_%d", socket_id); + ctx->mbuf_pool_indir = rte_pktmbuf_pool_create(s, nb_mbuf, + MEMPOOL_CACHE_SIZE, 0, 0, socket_id); + } + + if (ctx->mbuf_pool == NULL || (ms != 0 && ctx->mbuf_pool_indir == NULL)) rte_exit(EXIT_FAILURE, "Cannot init mbuf pool on socket %d\n", socket_id); else @@ -2091,6 +2235,139 @@ inline_ipsec_event_callback(uint16_t port_id, enum rte_eth_event_type type, return -1; } +static uint16_t +rx_callback(__rte_unused uint16_t port, __rte_unused uint16_t queue, + struct rte_mbuf *pkt[], uint16_t nb_pkts, + __rte_unused uint16_t max_pkts, void *user_param) +{ + uint64_t tm; + uint32_t i, k; + struct lcore_conf *lc; + struct rte_mbuf *mb; + struct ether_hdr *eth; + + lc = user_param; + k = 0; + tm = 0; + + for (i = 0; i != nb_pkts; i++) { + + mb = pkt[i]; + eth = rte_pktmbuf_mtod(mb, struct ether_hdr *); + if (eth->ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) { + + struct ipv4_hdr *iph; + + iph = (struct ipv4_hdr *)(eth + 1); + if (rte_ipv4_frag_pkt_is_fragmented(iph)) { + + mb->l2_len = sizeof(*eth); + mb->l3_len = sizeof(*iph); + tm = (tm != 0) ? tm : rte_rdtsc(); + mb = rte_ipv4_frag_reassemble_packet( + lc->frag.tbl, &lc->frag.dr, + mb, tm, iph); + + if (mb != NULL) { + /* fix ip cksum after reassemble. */ + iph = rte_pktmbuf_mtod_offset(mb, + struct ipv4_hdr *, mb->l2_len); + iph->hdr_checksum = 0; + iph->hdr_checksum = rte_ipv4_cksum(iph); + } + } + } else if (eth->ether_type == + rte_cpu_to_be_16(ETHER_TYPE_IPv6)) { + + struct ipv6_hdr *iph; + struct ipv6_extension_fragment *fh; + + iph = (struct ipv6_hdr *)(eth + 1); + fh = rte_ipv6_frag_get_ipv6_fragment_header(iph); + if (fh != NULL) { + mb->l2_len = sizeof(*eth); + mb->l3_len = (uintptr_t)fh - (uintptr_t)iph + + sizeof(*fh); + tm = (tm != 0) ? tm : rte_rdtsc(); + mb = rte_ipv6_frag_reassemble_packet( + lc->frag.tbl, &lc->frag.dr, + mb, tm, iph, fh); + if (mb != NULL) + /* fix l3_len after reassemble. */ + mb->l3_len = mb->l3_len - sizeof(*fh); + } + } + + pkt[k] = mb; + k += (mb != NULL); + } + + /* some fragments were encountered, drain death row */ + if (tm != 0) + rte_ip_frag_free_death_row(&lc->frag.dr, 0); + + return k; +} + + +static int +reassemble_lcore_init(struct lcore_conf *lc, uint32_t cid) +{ + int32_t sid; + uint32_t i; + uint64_t frag_cycles; + const struct lcore_rx_queue *rxq; + const struct rte_eth_rxtx_callback *cb; + + /* create fragment table */ + sid = rte_lcore_to_socket_id(cid); + frag_cycles = (rte_get_tsc_hz() + MS_PER_S - 1) / + MS_PER_S * FRAG_TTL_MS; + + lc->frag.tbl = rte_ip_frag_table_create(frag_tbl_sz, + FRAG_TBL_BUCKET_ENTRIES, frag_tbl_sz, frag_cycles, sid); + if (lc->frag.tbl == NULL) { + printf("%s(%u): failed to create fragment table of size: %u, " + "error code: %d\n", + __func__, cid, frag_tbl_sz, rte_errno); + return -ENOMEM; + } + + /* setup reassemble RX callbacks for all queues */ + for (i = 0; i != lc->nb_rx_queue; i++) { + + rxq = lc->rx_queue_list + i; + cb = rte_eth_add_rx_callback(rxq->port_id, rxq->queue_id, + rx_callback, lc); + if (cb == NULL) { + printf("%s(%u): failed to install RX callback for " + "portid=%u, queueid=%u, error code: %d\n", + __func__, cid, + rxq->port_id, rxq->queue_id, rte_errno); + return -ENOMEM; + } + } + + return 0; +} + +static int +reassemble_init(void) +{ + int32_t rc; + uint32_t i, lc; + + rc = 0; + for (i = 0; i != nb_lcore_params; i++) { + lc = lcore_params[i].lcore_id; + rc = reassemble_lcore_init(lcore_conf + lc, lc); + if (rc != 0) + break; + } + + return rc; +} + int32_t main(int32_t argc, char **argv) { @@ -2185,6 +2462,13 @@ main(int32_t argc, char **argv) RTE_ETH_EVENT_IPSEC, inline_ipsec_event_callback, NULL); } + /* fragment reassemble is enabled */ + if (frag_tbl_sz != 0) { + ret = reassemble_init(); + if (ret != 0) + rte_exit(EXIT_FAILURE, "failed at reassemble init"); + } + check_all_ports_link_status(enabled_port_mask); /* launch per-lcore init on every lcore */ diff --git a/examples/ipsec-secgw/ipsec.h b/examples/ipsec-secgw/ipsec.h index 589398f6f..de7fd7d84 100644 --- a/examples/ipsec-secgw/ipsec.h +++ b/examples/ipsec-secgw/ipsec.h @@ -180,6 +180,7 @@ struct socket_ctx { struct rt_ctx *rt_ip4; struct rt_ctx *rt_ip6; struct rte_mempool *mbuf_pool; + struct rte_mempool *mbuf_pool_indir; struct rte_mempool *session_pool; struct rte_mempool *session_priv_pool; }; diff --git a/examples/ipsec-secgw/meson.build b/examples/ipsec-secgw/meson.build index 81c146ebc..9ece345cf 100644 --- a/examples/ipsec-secgw/meson.build +++ b/examples/ipsec-secgw/meson.build @@ -6,7 +6,7 @@ # To build this example as a standalone application with an already-installed # DPDK instance, use 'make' -deps += ['security', 'lpm', 'acl', 'hash', 'ipsec'] +deps += ['security', 'lpm', 'acl', 'hash', 'ip_frag', 'ipsec'] allow_experimental_apis = true sources = files( 'esp.c', 'ipsec.c', 'ipsec_process.c', 'ipsec-secgw.c', -- 2.17.1