From: Konstantin Ananyev <konstantin.ananyev@intel.com>
To: dev@dpdk.org
To: dev@dpdk.org
Subject: [dpdk-dev] [PATCH] l3fwd improve grouping by destination port a bit
Date: Tue, 22 Jul 2014 17:04:47 +0100 [thread overview]
Message-ID: <1406045087-9625-1-git-send-email-konstantin.ananyev@intel.com> (raw)
Latest changes introduced a small degradation for the corner case
when each input packet is destined for a different port.
For the test-case when 1 core manages 4 ports and packet stream looks like:
IPV4_DSTPORT0, IPV4_DSTPORT1, IPV4_DSTPORT3, IPV4_DSTPORT4, IPV4_DSTPORT0, ...
non-optimised code path outperforms optimised one by 2-3%.
These changes are supposed to close that gap.
From my testing: now for the case described above optimised code path
produces same numbers as non-optimised one.
For other test-cases numbers remain about the same.
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
examples/l3fwd/main.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 235 insertions(+), 16 deletions(-)
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 73a039e..bef409a 100755
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -1271,6 +1271,168 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
&dst_port[3], pkt[3]->ol_flags);
}
+/*
+ * We group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define GRPSZ (1 << FWDSTEP)
+#define GRPMSK (GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx) do { /* scalar grouping step for packet idx */ \
+ if (likely((dlp) == (dcp)[(idx)])) { /* dlp and lp are assigned to: pass lvalues */ \
+ (lp)[0]++; /* same port as the current run: extend it */ \
+ } else { \
+ (dlp) = (dcp)[idx]; /* remember the dest port of the new run */ \
+ (lp) = (pn) + (idx); /* the new run's counter lives at pn[idx] */ \
+ (lp)[0] = 1; /* and so far it holds one packet */ \
+ } \
+} while (0)
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have an array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We do 4 comparisons at once and the result is a 4-bit mask.
+ * This mask is used as an index into a prebuilt array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, __m128i dp1, __m128i dp2)
+{
+ static const struct {
+ uint64_t pnum; /* prebuilt 4 values for pnum[]. */
+ int32_t idx; /* index for new last updated element. */
+ uint16_t lpv; /* value to add to the last updated element. */
+ } gptbl[GRPSZ] = {
+ {
+ /* 0: a != b, b != c, c != d, d != e */
+ .pnum = UINT64_C(0x0001000100010001),
+ .idx = 4,
+ .lpv = 0,
+ },
+ {
+ /* 1: a == b, b != c, c != d, d != e */
+ .pnum = UINT64_C(0x0001000100010002),
+ .idx = 4,
+ .lpv = 1,
+ },
+ {
+ /* 2: a != b, b == c, c != d, d != e */
+ .pnum = UINT64_C(0x0001000100020001),
+ .idx = 4,
+ .lpv = 0,
+ },
+ {
+ /* 3: a == b, b == c, c != d, d != e */
+ .pnum = UINT64_C(0x0001000100020003),
+ .idx = 4,
+ .lpv = 2,
+ },
+ {
+ /* 4: a != b, b != c, c == d, d != e */
+ .pnum = UINT64_C(0x0001000200010001),
+ .idx = 4,
+ .lpv = 0,
+ },
+ {
+ /* 5: a == b, b != c, c == d, d != e */
+ .pnum = UINT64_C(0x0001000200010002),
+ .idx = 4,
+ .lpv = 1,
+ },
+ {
+ /* 6: a != b, b == c, c == d, d != e */
+ .pnum = UINT64_C(0x0001000200030001),
+ .idx = 4,
+ .lpv = 0,
+ },
+ {
+ /* 7: a == b, b == c, c == d, d != e */
+ .pnum = UINT64_C(0x0001000200030004),
+ .idx = 4,
+ .lpv = 3,
+ },
+ {
+ /* 8: a != b, b != c, c != d, d == e */
+ .pnum = UINT64_C(0x0002000100010001),
+ .idx = 3,
+ .lpv = 0,
+ },
+ {
+ /* 9: a == b, b != c, c != d, d == e */
+ .pnum = UINT64_C(0x0002000100010002),
+ .idx = 3,
+ .lpv = 1,
+ },
+ {
+ /* 0xa: a != b, b == c, c != d, d == e */
+ .pnum = UINT64_C(0x0002000100020001),
+ .idx = 3,
+ .lpv = 0,
+ },
+ {
+ /* 0xb: a == b, b == c, c != d, d == e */
+ .pnum = UINT64_C(0x0002000100020003),
+ .idx = 3,
+ .lpv = 2,
+ },
+ {
+ /* 0xc: a != b, b != c, c == d, d == e */
+ .pnum = UINT64_C(0x0002000300010001),
+ .idx = 2,
+ .lpv = 0,
+ },
+ {
+ /* 0xd: a == b, b != c, c == d, d == e */
+ .pnum = UINT64_C(0x0002000300010002),
+ .idx = 2,
+ .lpv = 1,
+ },
+ {
+ /* 0xe: a != b, b == c, c == d, d == e */
+ .pnum = UINT64_C(0x0002000300040001),
+ .idx = 1,
+ .lpv = 0,
+ },
+ {
+ /* 0xf: a == b, b == c, c == d, d == e */
+ .pnum = UINT64_C(0x0002000300040005),
+ .idx = 0,
+ .lpv = 4,
+ },
+ };
+
+ union {
+ uint16_t u16[FWDSTEP + 1];
+ uint64_t u64; /* overlays u16[0..3], so four counters can be stored at once */
+ } *pnum = (void *)pn; /* NOTE(review): 64-bit access through pn assumes unaligned stores are acceptable here - confirm */
+
+ int32_t v;
+
+ dp1 = _mm_cmpeq_epi16(dp1, dp2); /* per 16-bit lane: all-ones where dp1 == dp2 */
+ dp1 = _mm_unpacklo_epi16(dp1, dp1); /* widen the low 4 results to 32 bits each */
+ v = _mm_movemask_ps((__m128)dp1); /* top bit of each 32-bit lane -> 4-bit mask */
+
+ /* update last port counter: add the packets that continue the previous run. */
+ lp[0] += gptbl[v].lpv;
+
+ /* if dest port value has changed somewhere within this group of packets. */
+ if (v != GRPMSK) {
+ lp = pnum->u16 + gptbl[v].idx; /* counter of the last run seen so far */
+ lp[0] = 1; /* kept only if lp is outside the 4 elements overwritten below */
+ pnum->u64 = gptbl[v].pnum; /* write the 4 prebuilt counters in one store */
+ }
+
+ return lp; /* caller passes this back in as lp for the next group */
+}
+
#endif /* APP_LOOKUP_METHOD */
/* main processing loop */
@@ -1289,9 +1451,12 @@ main_loop(__attribute__((unused)) void *dummy)
#if ((APP_LOOKUP_METHOD == APP_LOOKUP_LPM) && \
(ENABLE_MULTI_BUFFER_OPTIMIZE == 1))
int32_t k;
+ uint16_t dlp;
+ uint16_t *lp;
uint16_t dst_port[MAX_PKT_BURST];
__m128i dip[MAX_PKT_BURST / FWDSTEP];
uint32_t flag[MAX_PKT_BURST / FWDSTEP];
+ uint16_t pnum[MAX_PKT_BURST + 1];
#endif
prev_tsc = 0;
@@ -1402,9 +1567,61 @@ main_loop(__attribute__((unused)) void *dummy)
&pkts_burst[j], &dst_port[j]);
}
+ /*
+ * Finish packet processing and group consecutive
+ * packets with the same destination port.
+ */
k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
- for (j = 0; j != k; j += FWDSTEP) {
- processx4_step3(&pkts_burst[j], &dst_port[j]);
+ if (k != 0) {
+ __m128i dp1, dp2;
+
+ lp = pnum;
+ lp[0] = 1;
+
+ processx4_step3(pkts_burst, dst_port);
+
+ /* dp1: <d[0], d[1], d[2], d[3], ... > */
+ dp1 = _mm_loadu_si128((__m128i *)dst_port);
+
+ for (j = FWDSTEP; j != k; j += FWDSTEP) {
+ processx4_step3(&pkts_burst[j],
+ &dst_port[j]);
+
+ /*
+ * dp2:
+ * <d[j-3], d[j-2], d[j-1], d[j], ... >
+ */
+ dp2 = _mm_loadu_si128((__m128i *)
+ &dst_port[j - FWDSTEP + 1]);
+ lp = port_groupx4(&pnum[j - FWDSTEP],
+ lp, dp1, dp2);
+
+ /*
+ * dp1:
+ * <d[j], d[j+1], d[j+2], d[j+3], ... >
+ */
+ dp1 = _mm_srli_si128(dp2,
+ (FWDSTEP - 1) *
+ sizeof(dst_port[0]));
+ }
+
+ /*
+ * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+ */
+ dp2 = _mm_shufflelo_epi16(dp1, 0xf9);
+ lp = port_groupx4(&pnum[j - FWDSTEP], lp,
+ dp1, dp2);
+
+ /*
+ * remove values added by the last repeated
+ * dst port.
+ */
+ lp[0]--;
+ dlp = dst_port[j - 1];
+ } else {
+ /* set dlp and lp to the never used values. */
+ dlp = BAD_PORT - 1;
+ lp = pnum + MAX_PKT_BURST;
}
/* Process up to last 3 packets one by one. */
@@ -1412,39 +1629,41 @@ main_loop(__attribute__((unused)) void *dummy)
case 3:
process_packet(qconf, pkts_burst[j],
dst_port + j, portid);
+ GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
j++;
case 2:
process_packet(qconf, pkts_burst[j],
dst_port + j, portid);
+ GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
j++;
case 1:
process_packet(qconf, pkts_burst[j],
dst_port + j, portid);
+ GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
j++;
}
/*
* Send packets out, through destination port.
- * Try to group packets with the same destination port.
+ * Consecutive packets with the same destination port
+ * are already grouped together.
* If destination port for the packet equals BAD_PORT,
* then free the packet without sending it out.
*/
- for (j = 0; j < nb_rx; j = k) {
-
- uint16_t cn, pn = dst_port[j];
+ for (j = 0; j < nb_rx; j += k) {
- k = j;
- do {
- cn = dst_port[k];
- } while (cn != BAD_PORT && pn == cn &&
- ++k < nb_rx);
+ int32_t m;
+ uint16_t pn;
- send_packetsx4(qconf, pn, pkts_burst + j,
- k - j);
+ pn = dst_port[j];
+ k = pnum[j];
- if (cn == BAD_PORT) {
- rte_pktmbuf_free(pkts_burst[k]);
- k += 1;
+ if (likely(pn != BAD_PORT)) {
+ send_packetsx4(qconf, pn,
+ pkts_burst + j, k);
+ } else {
+ for (m = j; m != j + k; m++)
+ rte_pktmbuf_free(pkts_burst[m]);
}
}
--
1.8.3.1
next reply other threads:[~2014-07-22 16:04 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-07-22 16:04 Konstantin Ananyev [this message]
2014-08-01 16:27 ` Thomas Monjalon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1406045087-9625-1-git-send-email-konstantin.ananyev@intel.com \
--to=konstantin.ananyev@intel.com \
--cc=dev@dpdk.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).