From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from dpdk.org (dpdk.org [92.243.14.124]) by dpdk.space (Postfix) with ESMTP id 2A0AEA05D3 for ; Fri, 24 May 2019 15:33:02 +0200 (CEST) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id 0A9792C16; Fri, 24 May 2019 15:33:01 +0200 (CEST) Received: from mx1.redhat.com (mx1.redhat.com [209.132.183.28]) by dpdk.org (Postfix) with ESMTP id 6B2541DB9 for ; Fri, 24 May 2019 15:32:59 +0200 (CEST) Received: from smtp.corp.redhat.com (int-mx06.intmail.prod.int.phx2.redhat.com [10.5.11.16]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id 6B25C3078AAC; Fri, 24 May 2019 13:32:48 +0000 (UTC) Received: from dhcp-25.97.bos.redhat.com (unknown [10.18.25.61]) by smtp.corp.redhat.com (Postfix) with ESMTPS id 3AF0917243; Fri, 24 May 2019 13:32:45 +0000 (UTC) From: Aaron Conole To: Cc: , , Pavan Nikhilesh , "Olivier Matz" , "Vamsi Attunuru" References: <20190523081339.56348-1-jerinj@marvell.com> <20190523081339.56348-26-jerinj@marvell.com> Date: Fri, 24 May 2019 09:32:44 -0400 In-Reply-To: <20190523081339.56348-26-jerinj@marvell.com> (jerinj@marvell.com's message of "Thu, 23 May 2019 13:43:37 +0530") Message-ID: User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/26.2 (gnu/linux) MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 2.79 on 10.5.11.16 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.5.110.48]); Fri, 24 May 2019 13:32:53 +0000 (UTC) Subject: Re: [dpdk-dev] [PATCH v1 25/27] mempool/octeontx2: add optimized dequeue operation for arm64 X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" writes: > From: Pavan 
Nikhilesh > > This patch adds an optimized arm64 instruction based routine to leverage > CPU pipeline characteristics of octeontx2. The theme is to fill the > pipeline with CASP operations as much HW can do so that HW can do alloc() > HW ops in full throttle. > > Cc: Olivier Matz > > Signed-off-by: Pavan Nikhilesh > Signed-off-by: Jerin Jacob > Signed-off-by: Vamsi Attunuru > --- > drivers/mempool/octeontx2/otx2_mempool_ops.c | 291 +++++++++++++++++++ > 1 file changed, 291 insertions(+) > > diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempo= ol/octeontx2/otx2_mempool_ops.c > index c59bd73c0..ebe90d122 100644 > --- a/drivers/mempool/octeontx2/otx2_mempool_ops.c > +++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c > @@ -37,6 +37,293 @@ npa_lf_aura_op_alloc_one(const int64_t wdata, int64_t= * const addr, > return -ENOENT; > } >=20=20 > +#if defined(RTE_ARCH_ARM64) > +static __rte_noinline int > +npa_lf_aura_op_search_alloc(const int64_t wdata, int64_t * const addr, > + void **obj_table, unsigned int n) > +{ > + uint8_t i; > + > + for (i =3D 0; i < n; i++) { > + if (obj_table[i] !=3D NULL) > + continue; > + if (npa_lf_aura_op_alloc_one(wdata, addr, obj_table, i)) > + return -ENOENT; > + } > + > + return 0; > +} > + > +static __attribute__((optimize("-O3"))) __rte_noinline int __hot > +npa_lf_aura_op_alloc_bulk(const int64_t wdata, int64_t * const addr, > + unsigned int n, void **obj_table) > +{ > + const __uint128_t wdata128 =3D ((__uint128_t)wdata << 64) | wdata; > + uint64x2_t failed =3D vdupq_n_u64(~0); > + > + switch (n) { > + case 32: > + { > + __uint128_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; > + __uint128_t t10, t11; > + > + asm volatile ( > + ".cpu generic+lse\n" > + "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t4], %H[t4], %[wdata], %H[wdata], 
[%[loc]]\n" > + "casp %[t5], %H[t5], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t6], %H[t6], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t7], %H[t7], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t8], %H[t8], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t9], %H[t9], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t10], %H[t10], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t11], %H[t11], %[wdata], %H[wdata], [%[loc]]\n" > + "fmov d16, %[t0]\n" > + "fmov v16.D[1], %H[t0]\n" > + "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n" > + "fmov d17, %[t1]\n" > + "fmov v17.D[1], %H[t1]\n" > + "casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n" > + "fmov d18, %[t2]\n" > + "fmov v18.D[1], %H[t2]\n" > + "casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n" > + "fmov d19, %[t3]\n" > + "fmov v19.D[1], %H[t3]\n" > + "casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n" > + "and %[failed].16B, %[failed].16B, v16.16B\n" > + "and %[failed].16B, %[failed].16B, v17.16B\n" > + "and %[failed].16B, %[failed].16B, v18.16B\n" > + "and %[failed].16B, %[failed].16B, v19.16B\n" > + "fmov d20, %[t4]\n" > + "fmov v20.D[1], %H[t4]\n" > + "fmov d21, %[t5]\n" > + "fmov v21.D[1], %H[t5]\n" > + "fmov d22, %[t6]\n" > + "fmov v22.D[1], %H[t6]\n" > + "fmov d23, %[t7]\n" > + "fmov v23.D[1], %H[t7]\n" > + "and %[failed].16B, %[failed].16B, v20.16B\n" > + "and %[failed].16B, %[failed].16B, v21.16B\n" > + "and %[failed].16B, %[failed].16B, v22.16B\n" > + "and %[failed].16B, %[failed].16B, v23.16B\n" > + "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n" > + "st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n" > + "fmov d16, %[t8]\n" > + "fmov v16.D[1], %H[t8]\n" > + "fmov d17, %[t9]\n" > + "fmov v17.D[1], %H[t9]\n" > + "fmov d18, %[t10]\n" > + "fmov v18.D[1], %H[t10]\n" > + "fmov d19, %[t11]\n" > + "fmov v19.D[1], %H[t11]\n" > + "and %[failed].16B, %[failed].16B, v16.16B\n" > + "and %[failed].16B, %[failed].16B, v17.16B\n" > + "and %[failed].16B, %[failed].16B, v18.16B\n" > + "and %[failed].16B, 
%[failed].16B, v19.16B\n" > + "fmov d20, %[t0]\n" > + "fmov v20.D[1], %H[t0]\n" > + "fmov d21, %[t1]\n" > + "fmov v21.D[1], %H[t1]\n" > + "fmov d22, %[t2]\n" > + "fmov v22.D[1], %H[t2]\n" > + "fmov d23, %[t3]\n" > + "fmov v23.D[1], %H[t3]\n" > + "and %[failed].16B, %[failed].16B, v20.16B\n" > + "and %[failed].16B, %[failed].16B, v21.16B\n" > + "and %[failed].16B, %[failed].16B, v22.16B\n" > + "and %[failed].16B, %[failed].16B, v23.16B\n" > + "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n" > + "st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n" > + : "+Q" (*addr), [failed] "=3D&w" (failed), > + [t0] "=3D&r" (t0), [t1] "=3D&r" (t1), [t2] "=3D&r" (t2), > + [t3] "=3D&r" (t3), [t4] "=3D&r" (t4), [t5] "=3D&r" (t5), > + [t6] "=3D&r" (t6), [t7] "=3D&r" (t7), [t8] "=3D&r" (t8), > + [t9] "=3D&r" (t9), [t10] "=3D&r" (t10), [t11] "=3D&r" (t11) > + : [wdata] "r" (wdata128), [dst] "r" (obj_table), > + [loc] "r" (addr) > + : "memory", "v16", "v17", "v18", > + "v19", "v20", "v21", "v22", "v23" > + ); > + break; > + } > + case 16: > + { > + __uint128_t t0, t1, t2, t3, t4, t5, t6, t7; > + > + asm volatile ( > + ".cpu generic+lse\n" > + "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t4], %H[t4], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t5], %H[t5], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t6], %H[t6], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t7], %H[t7], %[wdata], %H[wdata], [%[loc]]\n" > + "fmov d16, %[t0]\n" > + "fmov v16.D[1], %H[t0]\n" > + "fmov d17, %[t1]\n" > + "fmov v17.D[1], %H[t1]\n" > + "fmov d18, %[t2]\n" > + "fmov v18.D[1], %H[t2]\n" > + "fmov d19, %[t3]\n" > + "fmov v19.D[1], %H[t3]\n" > + "and %[failed].16B, %[failed].16B, v16.16B\n" > + "and %[failed].16B, %[failed].16B, v17.16B\n" > + "and %[failed].16B, %[failed].16B, v18.16B\n" > + "and %[failed].16B, 
%[failed].16B, v19.16B\n" > + "fmov d20, %[t4]\n" > + "fmov v20.D[1], %H[t4]\n" > + "fmov d21, %[t5]\n" > + "fmov v21.D[1], %H[t5]\n" > + "fmov d22, %[t6]\n" > + "fmov v22.D[1], %H[t6]\n" > + "fmov d23, %[t7]\n" > + "fmov v23.D[1], %H[t7]\n" > + "and %[failed].16B, %[failed].16B, v20.16B\n" > + "and %[failed].16B, %[failed].16B, v21.16B\n" > + "and %[failed].16B, %[failed].16B, v22.16B\n" > + "and %[failed].16B, %[failed].16B, v23.16B\n" > + "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n" > + "st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n" > + : "+Q" (*addr), [failed] "=3D&w" (failed), > + [t0] "=3D&r" (t0), [t1] "=3D&r" (t1), [t2] "=3D&r" (t2), > + [t3] "=3D&r" (t3), [t4] "=3D&r" (t4), [t5] "=3D&r" (t5), > + [t6] "=3D&r" (t6), [t7] "=3D&r" (t7) > + : [wdata] "r" (wdata128), [dst] "r" (obj_table), > + [loc] "r" (addr) > + : "memory", "v16", "v17", "v18", "v19", > + "v20", "v21", "v22", "v23" > + ); > + break; > + } > + case 8: > + { > + __uint128_t t0, t1, t2, t3; > + > + asm volatile ( > + ".cpu generic+lse\n" > + "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n" > + "fmov d16, %[t0]\n" > + "fmov v16.D[1], %H[t0]\n" > + "fmov d17, %[t1]\n" > + "fmov v17.D[1], %H[t1]\n" > + "fmov d18, %[t2]\n" > + "fmov v18.D[1], %H[t2]\n" > + "fmov d19, %[t3]\n" > + "fmov v19.D[1], %H[t3]\n" > + "and %[failed].16B, %[failed].16B, v16.16B\n" > + "and %[failed].16B, %[failed].16B, v17.16B\n" > + "and %[failed].16B, %[failed].16B, v18.16B\n" > + "and %[failed].16B, %[failed].16B, v19.16B\n" > + "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n" > + : "+Q" (*addr), [failed] "=3D&w" (failed), > + [t0] "=3D&r" (t0), [t1] "=3D&r" (t1), [t2] "=3D&r" (t2), > + [t3] "=3D&r" (t3) > + : [wdata] "r" (wdata128), [dst] "r" (obj_table), > + [loc] "r" (addr) > + : "memory", "v16", "v17", "v18", "v19" > 
+ ); > + break; > + } > + case 4: > + { > + __uint128_t t0, t1; > + > + asm volatile ( > + ".cpu generic+lse\n" > + "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n" > + "casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n" > + "fmov d16, %[t0]\n" > + "fmov v16.D[1], %H[t0]\n" > + "fmov d17, %[t1]\n" > + "fmov v17.D[1], %H[t1]\n" > + "and %[failed].16B, %[failed].16B, v16.16B\n" > + "and %[failed].16B, %[failed].16B, v17.16B\n" > + "st1 { v16.2d, v17.2d}, [%[dst]], 32\n" > + : "+Q" (*addr), [failed] "=3D&w" (failed), > + [t0] "=3D&r" (t0), [t1] "=3D&r" (t1) > + : [wdata] "r" (wdata128), [dst] "r" (obj_table), > + [loc] "r" (addr) > + : "memory", "v16", "v17" > + ); > + break; > + } > + case 2: > + { > + __uint128_t t0; > + > + asm volatile ( > + ".cpu generic+lse\n" > + "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n" > + "fmov d16, %[t0]\n" > + "fmov v16.D[1], %H[t0]\n" > + "and %[failed].16B, %[failed].16B, v16.16B\n" > + "st1 { v16.2d}, [%[dst]], 16\n" > + : "+Q" (*addr), [failed] "=3D&w" (failed), > + [t0] "=3D&r" (t0) > + : [wdata] "r" (wdata128), [dst] "r" (obj_table), > + [loc] "r" (addr) > + : "memory", "v16" > + ); > + break; > + } > + case 1: > + return npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0); > + } > + > + if (unlikely(!(((uint64_t *) &failed)[0] & ((uint64_t *) &failed)[1]))) > + return npa_lf_aura_op_search_alloc(wdata, addr, (void **) > + ((char *)obj_table - (sizeof(uint64_t) * n)), n); I think this is causing a build error on some arm64 builds, with the following warning: ../drivers/mempool/octeontx2/otx2_mempool_ops.c: In function ‘npa_lf_aura_op_alloc_bulk’: ../drivers/mempool/octeontx2/otx2_mempool_ops.c:281:2: error: dereferencing type-punned pointer will break strict-aliasing rules [-Werror=strict-aliasing] if (unlikely(!(((uint64_t *) &failed)[0] & ((uint64_t *) &failed)[1]))) ^ I have only tested this with gcc, though. 
See example travis build here: https://travis-ci.com/ovsrobot/dpdk/builds/112894377 Thanks! > + > + return 0; > +} > + > +static __rte_noinline void > +otx2_npa_clear_alloc(struct rte_mempool *mp, void **obj_table, unsigned = int n) > +{ > + unsigned int i; > + > + for (i =3D 0; i < n; i++) { > + if (obj_table[i] !=3D NULL) { > + otx2_npa_enq(mp, &obj_table[i], 1); > + obj_table[i] =3D NULL; > + } > + } > +} > + > +static inline int __hot > +otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned in= t n) > +{ > + const int64_t wdata =3D npa_lf_aura_handle_to_aura(mp->pool_id); > + void **obj_table_bak =3D obj_table; > + const unsigned int nfree =3D n; > + unsigned int parts; > + > + int64_t * const addr =3D (int64_t * const) > + (npa_lf_aura_handle_to_base(mp->pool_id) + > + NPA_LF_AURA_OP_ALLOCX(0)); > + while (n) { > + parts =3D n > 31 ? 32 : rte_align32prevpow2(n); > + n -=3D parts; > + if (unlikely(npa_lf_aura_op_alloc_bulk(wdata, addr, > + parts, obj_table))) { > + otx2_npa_clear_alloc(mp, obj_table_bak, nfree - n); > + return -ENOENT; > + } > + obj_table +=3D parts; > + } > + > + return 0; > +} > +#endif > + > static inline int __hot > otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n) > { > @@ -463,7 +750,11 @@ static struct rte_mempool_ops otx2_npa_ops =3D { > .get_count =3D otx2_npa_get_count, > .calc_mem_size =3D otx2_npa_calc_mem_size, > .populate =3D otx2_npa_populate, > +#if defined(RTE_ARCH_ARM64) > + .dequeue =3D otx2_npa_deq_arm64, > +#else > .dequeue =3D otx2_npa_deq, > +#endif > }; >=20=20 > MEMPOOL_REGISTER_OPS(otx2_npa_ops);