DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation
@ 2014-05-22 16:55 Konstantin Ananyev
  2014-05-22 16:55 ` [dpdk-dev] [PATCH 1/2] lpm: Introduce rte_lpm_lookupx4 Konstantin Ananyev
                   ` (4 more replies)
  0 siblings, 5 replies; 8+ messages in thread
From: Konstantin Ananyev @ 2014-05-22 16:55 UTC (permalink / raw)
  To: dev, dev

With latest HW and optimised RX/TX path there is a huge gap between
testpmd iofwd and l3fwd performance results.
So there is an attempt to optimise l3fwd LPM code path and reduce the gap:
 - Instead of processing each input packet up to completion -      
 divide packet processing into several stages and perform      
 stage by stage for the whole burst.
 - Unroll things by the factor of 4 whenever possible.
 - Use SSE intrinsics for some operations (bswap, replace MAC addresses, etc).
 - Avoid TX packet buffering whenever possible.
 - Move some checks from RX/TX into setup phase.

 app/test/test_lpm.c                             |   70 ++++
 examples/l3fwd/main.c                           |  467 +++++++++++++++++++++-
 lib/librte_eal/common/Makefile                  |    1 +
 lib/librte_eal/common/include/rte_common_vect.h |   93 +++++
 lib/librte_lpm/rte_lpm.h                        |  117 ++++++
 5 files changed, 726 insertions(+), 22 deletions(-)
 create mode 100644 lib/librte_eal/common/include/rte_common_vect.h

-- 
1.7.7.6

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [dpdk-dev] [PATCH 1/2] lpm: Introduce rte_lpm_lookupx4
  2014-05-22 16:55 [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation Konstantin Ananyev
@ 2014-05-22 16:55 ` Konstantin Ananyev
  2014-05-22 16:55 ` [dpdk-dev] [PATCH 2/2] l3fwd: reorganise and optimize l3fwd LPM code path Konstantin Ananyev
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 8+ messages in thread
From: Konstantin Ananyev @ 2014-05-22 16:55 UTC (permalink / raw)
  To: dev, dev

Introduce rte_lpm_lookupx4():
 - Allows looking up four IP addresses in an LPM table with a single call.
 - Uses SSE intrinsics.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test/test_lpm.c                             |   70 ++++++++++++++
 lib/librte_eal/common/Makefile                  |    1 +
 lib/librte_eal/common/include/rte_common_vect.h |   93 ++++++++++++++++++
 lib/librte_lpm/rte_lpm.h                        |  117 +++++++++++++++++++++++
 4 files changed, 281 insertions(+), 0 deletions(-)
 create mode 100644 lib/librte_eal/common/include/rte_common_vect.h

diff --git a/app/test/test_lpm.c b/app/test/test_lpm.c
index ffed766..1b2f32a 100644
--- a/app/test/test_lpm.c
+++ b/app/test/test_lpm.c
@@ -310,6 +310,8 @@ test6(void)
 int32_t
 test7(void)
 {
+	__m128i ipx4;
+	uint16_t hop[4];
 	struct rte_lpm *lpm = NULL;
 	uint32_t ip = IPv4(0, 0, 0, 0);
 	uint8_t depth = 32, next_hop_add = 100, next_hop_return = 0;
@@ -324,6 +326,13 @@ test7(void)
 	status = rte_lpm_lookup(lpm, ip, &next_hop_return);
 	TEST_LPM_ASSERT((status == 0) && (next_hop_return == next_hop_add));
 
+	ipx4 = _mm_set_epi32(ip, ip + 0x100, ip - 0x100, ip);
+	rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
+	TEST_LPM_ASSERT(hop[0] == next_hop_add);
+	TEST_LPM_ASSERT(hop[1] == UINT16_MAX);
+	TEST_LPM_ASSERT(hop[2] == UINT16_MAX);
+	TEST_LPM_ASSERT(hop[3] == next_hop_add);
+
 	status = rte_lpm_delete(lpm, ip, depth);
 	TEST_LPM_ASSERT(status == 0);
 
@@ -347,6 +356,8 @@ test7(void)
 int32_t
 test8(void)
 {
+	__m128i ipx4;
+	uint16_t hop[4];
 	struct rte_lpm *lpm = NULL;
 	uint32_t ip1 = IPv4(127, 255, 255, 255), ip2 = IPv4(128, 0, 0, 0);
 	uint8_t depth, next_hop_add, next_hop_return;
@@ -370,6 +381,13 @@ test8(void)
 		status = rte_lpm_lookup(lpm, ip2, &next_hop_return);
 		TEST_LPM_ASSERT((status == 0) &&
 			(next_hop_return == next_hop_add));
+
+		ipx4 = _mm_set_epi32(ip2, ip1, ip2, ip1);
+		rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
+		TEST_LPM_ASSERT(hop[0] == UINT16_MAX);
+		TEST_LPM_ASSERT(hop[1] == next_hop_add);
+		TEST_LPM_ASSERT(hop[2] == UINT16_MAX);
+		TEST_LPM_ASSERT(hop[3] == next_hop_add);
 	}
 
 	/* Loop with rte_lpm_delete. */
@@ -391,6 +409,18 @@ test8(void)
 
 		status = rte_lpm_lookup(lpm, ip1, &next_hop_return);
 		TEST_LPM_ASSERT(status == -ENOENT);
+
+		ipx4 = _mm_set_epi32(ip1, ip1, ip2, ip2);
+		rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
+		if (depth != 1) {
+			TEST_LPM_ASSERT(hop[0] == next_hop_add);
+			TEST_LPM_ASSERT(hop[1] == next_hop_add);
+		} else {
+			TEST_LPM_ASSERT(hop[0] == UINT16_MAX);
+			TEST_LPM_ASSERT(hop[1] == UINT16_MAX);
+		}
+		TEST_LPM_ASSERT(hop[2] == UINT16_MAX);
+		TEST_LPM_ASSERT(hop[3] == UINT16_MAX);
 	}
 
 	rte_lpm_free(lpm);
@@ -822,6 +852,8 @@ test11(void)
 int32_t
 test12(void)
 {
+	__m128i ipx4;
+	uint16_t hop[4];
 	struct rte_lpm *lpm = NULL;
 	uint32_t ip, i;
 	uint8_t depth, next_hop_add, next_hop_return;
@@ -842,6 +874,13 @@ test12(void)
 		TEST_LPM_ASSERT((status == 0) &&
 				(next_hop_return == next_hop_add));
 
+		ipx4 = _mm_set_epi32(ip, ip + 1, ip, ip - 1);
+		rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
+		TEST_LPM_ASSERT(hop[0] == UINT16_MAX);
+		TEST_LPM_ASSERT(hop[1] == next_hop_add);
+		TEST_LPM_ASSERT(hop[2] == UINT16_MAX);
+		TEST_LPM_ASSERT(hop[3] == next_hop_add);
+
 		status = rte_lpm_delete(lpm, ip, depth);
 		TEST_LPM_ASSERT(status == 0);
 
@@ -1237,6 +1276,37 @@ perf_test(void)
 			(double)total_time / ((double)ITERATIONS * BATCH_SIZE),
 			(count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
 
+	/* Measure LookupX4 */
+	total_time = 0;
+	count = 0;
+	for (i = 0; i < ITERATIONS; i ++) {
+		static uint32_t ip_batch[BATCH_SIZE];
+		uint16_t next_hops[4];
+
+		/* Create array of random IP addresses */
+		for (j = 0; j < BATCH_SIZE; j ++)
+			ip_batch[j] = rte_rand();
+
+		/* Lookup per batch */
+		begin = rte_rdtsc();
+		for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
+			unsigned k;
+			__m128i ipx4;
+
+			ipx4 = _mm_loadu_si128((__m128i *)(ip_batch + j));
+			ipx4 = *(__m128i *)(ip_batch + j);
+			rte_lpm_lookupx4(lpm, ipx4, next_hops, UINT16_MAX);
+			for (k = 0; k < RTE_DIM(next_hops); k++)
+				if (unlikely (next_hops[k] == UINT16_MAX))
+					count++;
+		}
+
+		total_time += rte_rdtsc() - begin;
+	}
+	printf("LPM LookupX4: %.1f cycles (fails = %.1f%%)\n",
+			(double)total_time / ((double)ITERATIONS * BATCH_SIZE),
+			(count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
+
 	/* Delete */
 	status = 0;
 	begin = rte_rdtsc();
diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile
index 0016fc5..3103019 100644
--- a/lib/librte_eal/common/Makefile
+++ b/lib/librte_eal/common/Makefile
@@ -39,6 +39,7 @@ INC += rte_rwlock.h rte_spinlock.h rte_tailq.h rte_interrupts.h rte_alarm.h
 INC += rte_string_fns.h rte_cpuflags.h rte_version.h rte_tailq_elem.h
 INC += rte_eal_memconfig.h rte_malloc_heap.h
 INC += rte_hexdump.h rte_devargs.h rte_dev.h
+INC += rte_common_vect.h
 
 ifeq ($(CONFIG_RTE_INSECURE_FUNCTION_WARNING),y)
 INC += rte_warnings.h
diff --git a/lib/librte_eal/common/include/rte_common_vect.h b/lib/librte_eal/common/include/rte_common_vect.h
new file mode 100644
index 0000000..065fc52
--- /dev/null
+++ b/lib/librte_eal/common/include/rte_common_vect.h
@@ -0,0 +1,93 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_COMMON_VECT_H_
+#define _RTE_COMMON_VECT_H_
+
+/**
+ * @file
+ *
+ * RTE SSE/AVX related header.
+ */
+
+#if (defined(__ICC) || (__GNUC__ == 4 &&  __GNUC_MINOR__ < 4))
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#if defined (__SSE4_2__) || defined (__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+#else
+
+#include <x86intrin.h>
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef __m128i xmm_t;
+
+#define	XMM_SIZE	(sizeof (xmm_t))
+#define	XMM_MASK	(XMM_SIZE - 1)
+
+typedef union rte_xmm {
+    xmm_t    m;
+    uint8_t  u8[XMM_SIZE / sizeof (uint8_t)];
+    uint16_t u16[XMM_SIZE / sizeof (uint16_t)];
+    uint32_t u32[XMM_SIZE / sizeof (uint32_t)];
+    uint64_t u64[XMM_SIZE / sizeof (uint64_t)];
+    double   pd[XMM_SIZE / sizeof (double)];
+} rte_xmm_t;
+
+#ifdef RTE_ARCH_I686
+#define _mm_cvtsi128_si64(a) ({ \
+	rte_xmm_t m;            \
+	m.m = (a);              \
+	(m.u64[0]);             \
+})
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_COMMON_VECT_H_ */
diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index 033f542..be8aa1d 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -45,6 +45,8 @@
 #include <stdlib.h>
 #include <rte_branch_prediction.h>
 #include <rte_memory.h>
+#include <rte_common.h>
+#include <rte_common_vect.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -331,6 +333,121 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t * ips,
 	return 0;
 }
 
+/* Mask four results. */
+#define	 RTE_LPM_MASKX4_RES	UINT64_C(0x00ff00ff00ff00ff)
+
+/**
+ * Lookup four IP addresses in an LPM table.
+ *
+ * @param lpm
+ *   LPM object handle
+ * @param ip
+ *   Four IPs to be looked up in the LPM table
+ * @param hop
+ *   Next hop of the most specific rule found for IP (valid on lookup hit only).
+ *   This is a 4-element array of two-byte values.
+ *   If the lookup was successful for the given IP, then the least significant
+ *   byte of the corresponding element is the actual next hop and the most
+ *   significant byte is zero.
+ *   If the lookup for the given IP failed, then the corresponding element will
+ *   contain the default value; see the description of the next parameter.
+ * @param defv
+ *   Default value to populate into corresponding element of hop[] array,
+ *   if lookup would fail.
+ */
+static inline void
+rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
+	uint16_t defv)
+{
+	__m128i i24;
+	rte_xmm_t i8;
+	uint16_t tbl[4];
+	uint64_t idx, pt;
+
+	const __m128i mask8 =
+		_mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX);
+
+	/*
+	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 4 LPM entries
+	 * as one 64-bit value (0x0300030003000300).
+	 */
+	const uint64_t mask_xv =
+		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 16 |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32 |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 48);
+
+	/*
+	 * RTE_LPM_LOOKUP_SUCCESS for 4 LPM entries
+	 * as one 64-bit value (0x0100010001000100).
+	 */
+	const uint64_t mask_v =
+		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 16 |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32 |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 48);
+
+	/* get 4 indexes for tbl24[]. */
+	i24 = _mm_srli_epi32(ip, CHAR_BIT);
+
+	/* extract values from tbl24[] */
+	idx = _mm_cvtsi128_si64(i24);
+	i24 = _mm_srli_si128(i24, sizeof (uint64_t));
+
+	tbl[0] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
+	tbl[1] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
+
+	idx = _mm_cvtsi128_si64(i24);
+
+	tbl[2] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
+	tbl[3] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
+
+	/* get 4 indexes for tbl8[]. */
+	i8.m = _mm_and_si128(ip, mask8);
+
+	pt = (uint64_t)tbl[0] |
+		(uint64_t)tbl[1] << 16 |
+		(uint64_t)tbl[2] << 32 |
+		(uint64_t)tbl[3] << 48;
+
+	/* search successfully finished for all 4 IP addresses. */
+	if (likely((pt & mask_xv) == mask_v)) {
+		uintptr_t ph = (uintptr_t)hop;
+		*(uint64_t *)ph = pt & RTE_LPM_MASKX4_RES;
+		return;
+	}
+
+	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[0] = i8.u32[0] +
+			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[0] = *(const uint16_t *)&lpm->tbl8[i8.u32[0]];
+	}
+	if (unlikely((pt >> 16 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[1] = i8.u32[1] +
+			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[1] = *(const uint16_t *)&lpm->tbl8[i8.u32[1]];
+	}
+	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[2] = i8.u32[2] +
+			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[2] = *(const uint16_t *)&lpm->tbl8[i8.u32[2]];
+	}
+	if (unlikely((pt >> 48 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[3] = i8.u32[3] +
+			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[3] = *(const uint16_t *)&lpm->tbl8[i8.u32[3]];
+	}
+
+	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[0] : defv;
+	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[1] : defv;
+	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv;
+	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv;
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.7.7.6

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [dpdk-dev] [PATCH 2/2] l3fwd: reorganise and optimize l3fwd LPM code path.
  2014-05-22 16:55 [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation Konstantin Ananyev
  2014-05-22 16:55 ` [dpdk-dev] [PATCH 1/2] lpm: Introduce rte_lpm_lookupx4 Konstantin Ananyev
@ 2014-05-22 16:55 ` Konstantin Ananyev
  2014-05-23  8:05 ` [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation Thomas Monjalon
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 8+ messages in thread
From: Konstantin Ananyev @ 2014-05-22 16:55 UTC (permalink / raw)
  To: dev, dev

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 examples/l3fwd/main.c |  467 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 445 insertions(+), 22 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 8ee1af9..29f115a 100755
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -42,8 +42,8 @@
 #include <errno.h>
 #include <getopt.h>
 
-#include <tmmintrin.h>
 #include <rte_common.h>
+#include <rte_common_vect.h>
 #include <rte_byteorder.h>
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -83,7 +83,16 @@
 #define APP_LOOKUP_METHOD             APP_LOOKUP_LPM
 #endif
 
+/*
+ *  When set to zero, simple forwarding path is enabled.
+ *  When set to one, optimized forwarding path is enabled.
+ *  Note that LPM optimisation path uses SSE4.1 instructions.
+ */
+#if ((APP_LOOKUP_METHOD == APP_LOOKUP_LPM) && !defined (__SSE4_1__))
+#define ENABLE_MULTI_BUFFER_OPTIMIZE	0
+#else
 #define ENABLE_MULTI_BUFFER_OPTIMIZE	1
+#endif
 
 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
 #include <rte_hash.h>
@@ -150,11 +159,21 @@
 #define MAX_PKT_BURST     32
 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
 
+/*
+ * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
+ */
+#define	MAX_TX_BURST	(MAX_PKT_BURST / 2)
+
 #define NB_SOCKETS 8
 
 /* Configure how many packets ahead to prefetch, when reading packets */
 #define PREFETCH_OFFSET	3
 
+/* Used to mark destination port as 'invalid'. */
+#define	BAD_PORT	((uint16_t)-1)
+
+#define FWDSTEP	4
+
 /*
  * Configurable number of RX/TX ring descriptors
  */
@@ -166,6 +185,11 @@ static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
 /* ethernet addresses of ports */
 static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
 
+static __m128i val_eth[RTE_MAX_ETHPORTS];
+
+/* replace first 12B of the ethernet header. */
+#define	MASK_ETH	0x3f
+
 /* mask of enabled ports */
 static uint32_t enabled_port_mask = 0;
 static int promiscuous_on = 0; /**< Ports set in promiscuous mode off by default. */
@@ -562,6 +586,84 @@ send_single_packet(struct rte_mbuf *m, uint8_t port)
 	return 0;
 }
 
+static inline __attribute__((always_inline)) void
+send_packetsx4(struct lcore_conf *qconf, uint8_t port,
+	struct rte_mbuf *m[], uint32_t num)
+{
+	uint32_t len, j, n;
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straight away.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+	case 0:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+	case 3:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+	case 2:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+	case 1:
+		qconf->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+	}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+		case 3:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+		case 2:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+		case 1:
+			qconf->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+		}
+		}
+	}
+
+	qconf->tx_mbufs[port].len = len;
+}
+
 #ifdef DO_RFC_1812_CHECKS
 static inline int
 is_valid_ipv4_pkt(struct ipv4_hdr *pkt, uint32_t link_len)
@@ -647,14 +749,15 @@ get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, lookup_struct_t * ipv6_l3fwd_
 #endif
 
 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
+
 static inline uint8_t
 get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, lookup_struct_t * ipv4_l3fwd_lookup_struct)
 {
 	uint8_t next_hop;
 
 	return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
-			rte_be_to_cpu_32(((struct ipv4_hdr*)ipv4_hdr)->dst_addr), &next_hop) == 0)?
-			next_hop : portid);
+		rte_be_to_cpu_32(((struct ipv4_hdr*)ipv4_hdr)->dst_addr),
+		&next_hop) == 0) ? next_hop : portid);
 }
 
 static inline uint8_t
@@ -667,7 +770,8 @@ get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, lookup6_struct_t * ipv6_l3fwd
 }
 #endif
 
-#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) & (ENABLE_MULTI_BUFFER_OPTIMIZE == 1)
+#if ((APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) && \
+	(ENABLE_MULTI_BUFFER_OPTIMIZE == 1))
 static inline void l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid, struct lcore_conf *qconf);
 
 #define MASK_ALL_PKTS    0xf
@@ -886,7 +990,7 @@ simple_ipv6_fwd_4pkts(struct rte_mbuf* m[4], uint8_t portid, struct lcore_conf *
 	send_single_packet(m[3], (uint8_t)dst_port[3]);
 
 }
-#endif // End of #if(APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)&(ENABLE_MULTI_BUFFER_OPTIMIZE == 1)
+#endif /* APP_LOOKUP_METHOD */
 
 static inline __attribute__((always_inline)) void
 l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid, struct lcore_conf *qconf)
@@ -911,13 +1015,16 @@ l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid, struct lcore_conf *qcon
 		}
 #endif
 
-		dst_port = get_ipv4_dst_port(ipv4_hdr, portid, qconf->ipv4_lookup_struct);
-		if (dst_port >= RTE_MAX_ETHPORTS || (enabled_port_mask & 1 << dst_port) == 0)
+		 dst_port = get_ipv4_dst_port(ipv4_hdr, portid,
+			qconf->ipv4_lookup_struct);
+		if (dst_port >= RTE_MAX_ETHPORTS ||
+				(enabled_port_mask & 1 << dst_port) == 0)
 			dst_port = portid;
 
 		/* 02:00:00:00:00:xx */
 		d_addr_bytes = &eth_hdr->d_addr.addr_bytes[0];
-		*((uint64_t *)d_addr_bytes) = 0x000000000002 + ((uint64_t)dst_port << 40);
+		*((uint64_t *)d_addr_bytes) = ETHER_LOCAL_ADMIN_ADDR +
+			((uint64_t)dst_port << 40);
 
 #ifdef DO_RFC_1812_CHECKS
 		/* Update time to live and header checksum */
@@ -944,7 +1051,8 @@ l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid, struct lcore_conf *qcon
 
 		/* 02:00:00:00:00:xx */
 		d_addr_bytes = &eth_hdr->d_addr.addr_bytes[0];
-		*((uint64_t *)d_addr_bytes) = 0x000000000002 + ((uint64_t)dst_port << 40);
+		*((uint64_t *)d_addr_bytes) = ETHER_LOCAL_ADMIN_ADDR +
+			((uint64_t)dst_port << 40);
 
 		/* src addr */
 		ether_addr_copy(&ports_eth_addr[dst_port], &eth_hdr->s_addr);
@@ -954,6 +1062,217 @@ l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid, struct lcore_conf *qcon
 
 }
 
+#ifdef DO_RFC_1812_CHECKS
+
+#define	IPV4_MIN_VER_IHL	0x45
+#define	IPV4_MAX_VER_IHL	0x4f
+#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#define	IPV4_MIN_LEN_BE	(sizeof (struct ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *    minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static inline __attribute__((always_inline)) void
+rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t flags)
+{
+	uint8_t ihl;
+
+	if ((flags & PKT_RX_IPV4_HDR) != 0) {
+
+		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+		ipv4_hdr->time_to_live--;
+		ipv4_hdr->hdr_checksum++;
+
+		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+				((uint8_t)ipv4_hdr->total_length == 0 &&
+				ipv4_hdr->total_length < IPV4_MIN_LEN_BE)) {
+			dp[0] = BAD_PORT;
+		}
+	}
+}
+
+#else
+#define	rfc1812_process(mb, dp)	do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+
+#if ((APP_LOOKUP_METHOD == APP_LOOKUP_LPM) && \
+	(ENABLE_MULTI_BUFFER_OPTIMIZE == 1))
+
+static inline __attribute__((always_inline)) uint16_t
+get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+	uint32_t dst_ipv4, uint8_t portid)
+{
+	uint8_t next_hop;
+	struct ipv6_hdr *ipv6_hdr;
+	struct ether_hdr *eth_hdr;
+
+	if (pkt->ol_flags & PKT_RX_IPV4_HDR) {
+		if (rte_lpm_lookup(qconf->ipv4_lookup_struct, dst_ipv4,
+				&next_hop) != 0)
+			next_hop = portid;
+	} else if (pkt->ol_flags & PKT_RX_IPV6_HDR) {
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+		ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+		if (rte_lpm6_lookup(qconf->ipv6_lookup_struct,
+				ipv6_hdr->dst_addr, &next_hop) != 0)
+			next_hop = portid;
+	} else {
+		next_hop = portid;
+	}
+
+	return (next_hop);
+}
+
+static inline void
+process_packet(struct lcore_conf *qconf, struct rte_mbuf *pkt,
+	uint16_t *dst_port, uint8_t portid)
+{
+	struct ether_hdr *eth_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	uint32_t dst_ipv4;
+	uint16_t dp;
+	__m128i te, ve;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+
+	dst_ipv4 = ipv4_hdr->dst_addr;
+	dst_ipv4 = rte_be_to_cpu_32(dst_ipv4);
+	dp = get_dst_port(qconf, pkt, dst_ipv4, portid);
+
+	te = _mm_load_si128((__m128i*)eth_hdr);
+	ve = val_eth[dp];
+
+	dst_port[0] = dp;
+	rfc1812_process(ipv4_hdr, dst_port, pkt->ol_flags);
+
+	te =  _mm_blend_epi16(te, ve, MASK_ETH);
+	_mm_store_si128((__m128i*)eth_hdr, te);
+}
+
+/*
+ * Read ol_flags and destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP], __m128i *dip, uint32_t *flag)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+	uint32_t x0, x1, x2, x3;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	x0 = ipv4_hdr->dst_addr;
+	flag[0] = pkt[0]->ol_flags & PKT_RX_IPV4_HDR;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	x1 = ipv4_hdr->dst_addr;
+	flag[0] &= pkt[1]->ol_flags;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	x2 = ipv4_hdr->dst_addr;
+	flag[0] &= pkt[2]->ol_flags;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	x3 = ipv4_hdr->dst_addr;
+	flag[0] &= pkt[3]->ol_flags;
+
+	dip[0] = _mm_set_epi32(x3, x2, x1, x0);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ */
+static inline void
+processx4_step2(const struct lcore_conf *qconf, __m128i dip, uint32_t flag,
+	uint8_t portid, struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
+{
+	rte_xmm_t dst;
+	const  __m128i bswap_mask = _mm_set_epi8(12,13,14,15,8,9,10,11,
+						4,5,6,7,0,1,2,3);
+
+	/* Byte swap 4 IPV4 addresses. */
+	dip = _mm_shuffle_epi8(dip, bswap_mask);
+
+	/* if all 4 packets are IPV4. */
+	if (likely(flag != 0)) {
+		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dprt, portid);
+	} else {
+		dst.m = dip;
+		dprt[0] = get_dst_port(qconf, pkt[0], dst.u32[0], portid);
+		dprt[1] = get_dst_port(qconf, pkt[1], dst.u32[1], portid);
+		dprt[2] = get_dst_port(qconf, pkt[2], dst.u32[2], portid);
+		dprt[3] = get_dst_port(qconf, pkt[3], dst.u32[3], portid);
+	}
+}
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+	__m128i te[FWDSTEP];
+	__m128i ve[FWDSTEP];
+	__m128i *p[FWDSTEP];
+
+	p[0] = (rte_pktmbuf_mtod(pkt[0], __m128i *));
+	p[1] = (rte_pktmbuf_mtod(pkt[1], __m128i *));
+	p[2] = (rte_pktmbuf_mtod(pkt[2], __m128i *));
+	p[3] = (rte_pktmbuf_mtod(pkt[3], __m128i *));
+
+	ve[0] = val_eth[dst_port[0]];
+	te[0] = _mm_load_si128(p[0]);
+
+	ve[1] = val_eth[dst_port[1]];
+	te[1] = _mm_load_si128(p[1]);
+
+	ve[2] = val_eth[dst_port[2]];
+	te[2] = _mm_load_si128(p[2]);
+
+	ve[3] = val_eth[dst_port[3]];
+	te[3] = _mm_load_si128(p[3]);
+
+	/* Update the first 12 bytes, keep the remaining bytes intact. */
+	te[0] =  _mm_blend_epi16(te[0], ve[0], MASK_ETH);
+	te[1] =  _mm_blend_epi16(te[1], ve[1], MASK_ETH);
+	te[2] =  _mm_blend_epi16(te[2], ve[2], MASK_ETH);
+	te[3] =  _mm_blend_epi16(te[3], ve[3], MASK_ETH);
+
+	_mm_store_si128(p[0], te[0]);
+	_mm_store_si128(p[1], te[1]);
+	_mm_store_si128(p[2], te[2]);
+	_mm_store_si128(p[3], te[3]);
+
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0] + 1),
+		&dst_port[0], pkt[0]->ol_flags);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1] + 1),
+		&dst_port[1], pkt[1]->ol_flags);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2] + 1),
+		&dst_port[2], pkt[2]->ol_flags);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3] + 1),
+		&dst_port[3], pkt[3]->ol_flags);
+}
+
+#endif /* APP_LOOKUP_METHOD */
+
 /* main processing loop */
 static int
 main_loop(__attribute__((unused)) void *dummy)
@@ -964,7 +1283,16 @@ main_loop(__attribute__((unused)) void *dummy)
 	int i, j, nb_rx;
 	uint8_t portid, queueid;
 	struct lcore_conf *qconf;
-	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
+	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
+		US_PER_S * BURST_TX_DRAIN_US;
+
+#if ((APP_LOOKUP_METHOD == APP_LOOKUP_LPM) && \
+	(ENABLE_MULTI_BUFFER_OPTIMIZE == 1))
+	int32_t k;
+	uint16_t dst_port[MAX_PKT_BURST];
+	__m128i dip[MAX_PKT_BURST / FWDSTEP];
+	uint32_t flag[MAX_PKT_BURST / FWDSTEP];
+#endif
 
 	prev_tsc = 0;
 
@@ -1003,7 +1331,7 @@ main_loop(__attribute__((unused)) void *dummy)
 			for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++) {
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
-				send_burst(&lcore_conf[lcore_id],
+				send_burst(qconf,
 					qconf->tx_mbufs[portid].len,
 					portid);
 				qconf->tx_mbufs[portid].len = 0;
@@ -1018,10 +1346,18 @@ main_loop(__attribute__((unused)) void *dummy)
 		for (i = 0; i < qconf->n_rx_queue; ++i) {
 			portid = qconf->rx_queue_list[i].port_id;
 			queueid = qconf->rx_queue_list[i].queue_id;
-			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, MAX_PKT_BURST);
-#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) & (ENABLE_MULTI_BUFFER_OPTIMIZE == 1)
+			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
+				MAX_PKT_BURST);
+			if (nb_rx == 0)
+				continue;
+
+#if (ENABLE_MULTI_BUFFER_OPTIMIZE == 1)
+#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
 			{
-				/* Send nb_rx - nb_rx%4 packets in groups of 4.*/
+				/*
+				 * Send nb_rx - nb_rx%4 packets
+				 * in groups of 4.
+				 */
 				int32_t n = RTE_ALIGN_FLOOR(nb_rx, 4);
 				for (j = 0; j < n ; j+=4) {
 					uint32_t ol_flag = pkts_burst[j]->ol_flags 
@@ -1050,7 +1386,71 @@ main_loop(__attribute__((unused)) void *dummy)
 								portid, qconf);
 				}
 			}
-#else			 
+#elif (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
+
+			k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+			for (j = 0; j != k; j += FWDSTEP) {
+				processx4_step1(&pkts_burst[j],
+					&dip[j / FWDSTEP],
+					&flag[j / FWDSTEP]);
+			}
+
+			k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+			for (j = 0; j != k; j += FWDSTEP) {
+				processx4_step2(qconf, dip[j / FWDSTEP],
+					flag[j / FWDSTEP], portid,
+					&pkts_burst[j], &dst_port[j]);
+			}
+
+			k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+			for (j = 0; j != k; j += FWDSTEP) {
+				processx4_step3(&pkts_burst[j], &dst_port[j]);
+			}
+
+			/* Process up to last 3 packets one by one. */
+			switch(nb_rx % FWDSTEP) {
+			case 3:
+				process_packet(qconf, pkts_burst[j],
+					dst_port + j, portid);
+				j++;
+			case 2:
+				process_packet(qconf, pkts_burst[j],
+					dst_port + j, portid);
+				j++;
+			case 1:
+				process_packet(qconf, pkts_burst[j],
+					dst_port + j, portid);
+				j++;
+			}
+
+			/*
+			 * Send packets out, through destination port.
+			 * Try to group packets with the same destination port.
+			 * If destination port for the packet equals BAD_PORT,
+			 * then free the packet without sending it out.
+			 */
+			for (j = 0; j < nb_rx; j = k) {
+
+				uint16_t cn, pn = dst_port[j];
+
+				k = j;
+				do {
+					cn = dst_port[k];
+				} while (cn != BAD_PORT && pn == cn &&
+						++k < nb_rx);
+
+				send_packetsx4(qconf, pn, pkts_burst + j,
+					k - j);
+
+				if (cn == BAD_PORT) {
+					rte_pktmbuf_free(pkts_burst[k]);
+					k += 1;
+				}
+			}
+
+#endif /* APP_LOOKUP_METHOD */
+#else /* ENABLE_MULTI_BUFFER_OPTIMIZE == 0 */
+
 			/* Prefetch first packets */
 			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
 				rte_prefetch0(rte_pktmbuf_mtod(
@@ -1061,14 +1461,17 @@ main_loop(__attribute__((unused)) void *dummy)
 			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
 				rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
 						j + PREFETCH_OFFSET], void *));
-				l3fwd_simple_forward(pkts_burst[j], portid, qconf);
+				l3fwd_simple_forward(pkts_burst[j], portid,
+					qconf);
 			}
 
 			/* Forward remaining prefetched packets */
 			for (; j < nb_rx; j++) {
-				l3fwd_simple_forward(pkts_burst[j], portid, qconf);
+				l3fwd_simple_forward(pkts_burst[j], portid,
+					qconf);
 			}
-#endif // End of #if((ENABLE_MULTI_BUFFER_OPTIMIZE == 1)&(APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH))
+#endif /* ENABLE_MULTI_BUFFER_OPTIMIZE */
+
 		}
 	}
 }
@@ -1459,12 +1862,12 @@ populate_ipv4_few_flow_into_table(const struct rte_hash* h)
 		convert_ipv4_5tuple(&entry.key, &newkey);
 		ret = rte_hash_add_key (h,(void *) &newkey);
 		if (ret < 0) {
-			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the"
+			rte_exit(EXIT_FAILURE, "Unable to add entry %" PRIu32 " to the"
                                 "l3fwd hash.\n", i);
 		}
 		ipv4_l3fwd_out_if[ret] = entry.if_out;
 	}
-	printf("Hash: Adding 0x%x keys\n", array_len);
+	printf("Hash: Adding 0x%" PRIx32 " keys\n", array_len);
 }
 
 #define BIT_16_TO_23 0x00ff0000
@@ -1484,12 +1887,12 @@ populate_ipv6_few_flow_into_table(const struct rte_hash* h)
 		convert_ipv6_5tuple(&entry.key, &newkey);
 		ret = rte_hash_add_key (h, (void *) &newkey);
 		if (ret < 0) {
-			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the"
+			rte_exit(EXIT_FAILURE, "Unable to add entry %" PRIu32 " to the"
                                 "l3fwd hash.\n", i);
 		}
 		ipv6_l3fwd_out_if[ret] = entry.if_out;
 	}
-	printf("Hash: Adding 0x%xkeys\n", array_len);
+	printf("Hash: Adding 0x%" PRIx32 "keys\n", array_len);
 }
 
 #define NUMBER_PORT_USED 4
@@ -1657,6 +2060,12 @@ setup_lpm(int socketid)
 
 	/* populate the LPM table */
 	for (i = 0; i < IPV4_L3FWD_NUM_ROUTES; i++) {
+
+		/* skip unused ports */
+		if ((1 << ipv4_l3fwd_route_array[i].if_out &
+				enabled_port_mask) == 0)
+			continue;
+
 		ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid],
 			ipv4_l3fwd_route_array[i].ip,
 			ipv4_l3fwd_route_array[i].depth,
@@ -1688,6 +2097,12 @@ setup_lpm(int socketid)
 
 	/* populate the LPM table */
 	for (i = 0; i < IPV6_L3FWD_NUM_ROUTES; i++) {
+
+		/* skip unused ports */
+		if ((1 << ipv6_l3fwd_route_array[i].if_out &
+				enabled_port_mask) == 0)
+			continue;
+
 		ret = rte_lpm6_add(ipv6_l3fwd_lookup_struct[socketid],
 			ipv6_l3fwd_route_array[i].ip,
 			ipv6_l3fwd_route_array[i].depth,
@@ -1881,6 +2296,14 @@ MAIN(int argc, char **argv)
 		print_ethaddr(" Address:", &ports_eth_addr[portid]);
 		printf(", ");
 
+		/*
+		 * prepare dst and src MACs for each port.
+		 */
+		*(uint64_t *)(val_eth + portid) =
+			ETHER_LOCAL_ADMIN_ADDR + ((uint64_t)portid << 40);
+		ether_addr_copy(&ports_eth_addr[portid],
+			(struct ether_addr *)(val_eth + portid) + 1);
+
 		/* init memory */
 		ret = init_mem(NB_MBUF);
 		if (ret < 0)
-- 
1.7.7.6

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation
  2014-05-22 16:55 [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation Konstantin Ananyev
  2014-05-22 16:55 ` [dpdk-dev] [PATCH 1/2] lpm: Introduce rte_lpm_lookupx4 Konstantin Ananyev
  2014-05-22 16:55 ` [dpdk-dev] [PATCH 2/2] l3fwd: reorganise and optimize l3fwd LPM code path Konstantin Ananyev
@ 2014-05-23  8:05 ` Thomas Monjalon
  2014-06-04 13:47 ` Cao, Waterman
  2014-06-06  8:26 ` De Lara Guarch, Pablo
  4 siblings, 0 replies; 8+ messages in thread
From: Thomas Monjalon @ 2014-05-23  8:05 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev

Hi Konstantin,

2014-05-22 17:55, Konstantin Ananyev:
> With latest HW and optimised RX/TX path there is a huge gap between
> testpmd iofwd and l3fwd performance results.
> So there is an attempt to optimise l3fwd LPM code path and reduce the gap:
>  - Instead of processing each input packet up to completion -
>  divide packet processing into several stages and perform
>  stage by stage for the whole burst.
>  - Unroll things by the factor of 4 whenever possible.
>  - Use SSE intrinsics for some operations (bswap, replace MAC addresses, etc).
>  - Avoid TX packet buffering whenever possible.
>  - Move some checks from RX/TX into setup phase.

As you are doing optimizations, it's important to know the performance gain.
It could help to mitigate future reworks.
So please, could you provide some benchmarking numbers in the commit log?

Thanks
-- 
Thomas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation
  2014-05-22 16:55 [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation Konstantin Ananyev
                   ` (2 preceding siblings ...)
  2014-05-23  8:05 ` [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation Thomas Monjalon
@ 2014-06-04 13:47 ` Cao, Waterman
  2014-06-06  8:26 ` De Lara Guarch, Pablo
  4 siblings, 0 replies; 8+ messages in thread
From: Cao, Waterman @ 2014-06-04 13:47 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev, Thomas Monjalon

Tested-by: Waterman Cao <waterman.cao@intel.com>

This patch has been tested by Intel. We performed l3fwd performance test. 
Test result shows that l3fwd performance with this ‘lpm optimization’ patch is much higher than that without this patch. 
Test environment: Fedora 20, Linux Kernel 3.11.10, GCC 4.8.2, Intel Xeon processor E5-2680 v2, with 2 ports on 2 Niantic (all at socket 0)
Please refer to the performance data in the separate email:
http://dpdk.org/ml/archives/dev/2014-May/002703.html  

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation
  2014-05-22 16:55 [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation Konstantin Ananyev
                   ` (3 preceding siblings ...)
  2014-06-04 13:47 ` Cao, Waterman
@ 2014-06-06  8:26 ` De Lara Guarch, Pablo
  4 siblings, 0 replies; 8+ messages in thread
From: De Lara Guarch, Pablo @ 2014-06-06  8:26 UTC (permalink / raw)
  To: Ananyev, Konstantin, dev

Acked-by: Pablo de Lara Guarch <pablo.de.lara.guarch@intel.com>

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Konstantin Ananyev
> Sent: Thursday, May 22, 2014 5:56 PM
> To: dev@dpdk.org; dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation
> 
> With latest HW and optimised RX/TX path there is a huge gap between
> testpmd iofwd and l3fwd performance results.
> So there is an attempt to optimise l3fwd LPM code path and reduce the gap:
>  - Instead of processing each input packet up to completion -
>  divide packet processing into several stages and perform
>  stage by stage for the whole burst.
>  - Unroll things by the factor of 4 whenever possible.
>  - Use SSE intrinsics for some operations (bswap, replace MAC addresses, etc).
>  - Avoid TX packet buffering whenever possible.
>  - Move some checks from RX/TX into setup phase.
> 
>  app/test/test_lpm.c                             |   70 ++++
>  examples/l3fwd/main.c                           |  467 +++++++++++++++++++++-
>  lib/librte_eal/common/Makefile                  |    1 +
>  lib/librte_eal/common/include/rte_common_vect.h |   93 +++++
>  lib/librte_lpm/rte_lpm.h                        |  117 ++++++
>  5 files changed, 726 insertions(+), 22 deletions(-)
>  create mode 100644 lib/librte_eal/common/include/rte_common_vect.h
> 
> --
> 1.7.7.6

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation
  2014-05-28  9:17 Ananyev, Konstantin
@ 2014-06-10 22:44 ` Thomas Monjalon
  0 siblings, 0 replies; 8+ messages in thread
From: Thomas Monjalon @ 2014-06-10 22:44 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: dev

Hi Konstantin,

2014-05-28 09:17, Ananyev, Konstantin:
> Hi Thomas,
> 
> >As you are doing optimizations, it's important to know the performance gain.
> >It could help to mitigate future reworks.
> >So please, could you provide some benchmarking numbers in the commit log?
> 
> Some performance data below.
> Also, forgot to mention that new code path can be switched on/off by setting
> ENABLE_MULTI_BUFFER_OPTIMIZE macro to 1/0.
> Do I need to resubmit the whole patch series, or just a cover letter, or ...?

I think you should resubmit the whole series after having checked it with checkpatch.pl.
Please keep Acked-by and Tested-by lines from previous mails.

Thanks
-- 
Thomas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation
@ 2014-05-28  9:17 Ananyev, Konstantin
  2014-06-10 22:44 ` Thomas Monjalon
  0 siblings, 1 reply; 8+ messages in thread
From: Ananyev, Konstantin @ 2014-05-28  9:17 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Hi Thomas,

>As you are doing optimizations, it's important to know the performance gain.
>It could help to mitigate future reworks.
>So please, could you provide some benchmarking numbers in the commit log?

Some performance data below.
Also, forgot to mention that new code path can be switched on/off by setting
ENABLE_MULTI_BUFFER_OPTIMIZE macro to 1/0.
Do I need to resubmit the whole patch series, or just a cover letter, or ...?

Konstantin

SUT:   dual-socket board IVB 2.8 GHz  with 4 ports on 4 NIC (all at socket 0) connected to the traffic generator.
2x1GB pages, kernel: 3.11.3-201.fc19.x86_64, gcc 4.8.2.
64B packets, using the packet flooding method.
All 4 ports are managed by one logical core:
Optimised scalar PMD RX/TX was used.

                                                           DIFF % (NEW-OLD)
IPV4-CONT-BURST:                              +23%
IPV6-CONT-BURST :                             +13% 
IPV4/IPV6-CONT-BURST:                   +8%
IPV4-4STREAMSX8:                              +7%
IPV4-4STREAMSX1:                              -2%

Test cases description:
IPV4-CONT-BURST - IPV4 packets; all packets from one input port are destined for the same output port.
IPV6-CONT-BURST - IPV6 packets; all packets from one input port are destined for the same output port.
IPV4/IPV6-CONT-BURST - mix of the first 2 with interleave=1 (e.g: IPV4,IPV6,IPV4,IPV6, ...)
IPV4-4STREAMSX1 - 4 streams of IPV4 packets, where all packets from same stream are destined for the same output port
(e.g: IPV4_DST_P0, IPV4_DST_P1,  IPV4_DST_P2, IPV4_DST_P3, IPV4_DST_P0, ...)
IPV4-4STREAMSX8 - same as above but packets for each stream are coming in groups of 8
(e.g:  IPV4_DST_P0 X 8, IPV4_DST_P1 X 8,  IPV4_DST_P2 X 8, IPV4_DST_P3 X 8, IPV4_DST_P0 X 8, ...)        

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2014-06-10 22:44 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-05-22 16:55 [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation Konstantin Ananyev
2014-05-22 16:55 ` [dpdk-dev] [PATCH 1/2] lpm: Introduce rte_lpm_lookupx4 Konstantin Ananyev
2014-05-22 16:55 ` [dpdk-dev] [PATCH 2/2] l3fwd: reorganise and optimize l3fwd LPM code path Konstantin Ananyev
2014-05-23  8:05 ` [dpdk-dev] [PATCH 0/2] L3FWD sample optimisation Thomas Monjalon
2014-06-04 13:47 ` Cao, Waterman
2014-06-06  8:26 ` De Lara Guarch, Pablo
2014-05-28  9:17 Ananyev, Konstantin
2014-06-10 22:44 ` Thomas Monjalon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).