DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH v2] mempool/octeontx2: fix arm64 clang build failure
@ 2019-07-05  4:26 pbhagavatula
  2019-07-05  9:02 ` Thomas Monjalon
  0 siblings, 1 reply; 2+ messages in thread
From: pbhagavatula @ 2019-07-05  4:26 UTC (permalink / raw)
  To: gavin.hu, jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

The ARMv8.1 CASP instruction works with even register pairs and since
there is no register constraint in older versions of GCC/Clang, use
explicit register allocation to satisfy CASP requirements.
Remove function level optimization specification as Clang doesn't have
support for it and explicit register allocation removes the need for it.

Fixes build issue with arm64-armv8a-linux-clang.

Fixes: ee338015e7a9 ("mempool/octeontx2: add optimized dequeue operation for arm64")

Reported-by: Gavin Hu <gavin.hu@arm.com>
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
--- 
 v2 Changes:
 - Redo git commit title and description.

 drivers/mempool/octeontx2/otx2_mempool_ops.c | 278 +++++++++----------
 1 file changed, 127 insertions(+), 151 deletions(-)

diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
index 97146d1fe..e1764b030 100644
--- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
+++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
@@ -54,233 +54,206 @@ npa_lf_aura_op_search_alloc(const int64_t wdata, int64_t * const addr,
 	return 0;
 }
 
-/*
- * Some versions of the compiler don't have support for __int128_t for
- * CASP inline-asm. i.e. if the optimization level is reduced to -O0 the
- * CASP restrictions aren't followed and the compiler might end up violation the
- * CASP rules. Fix it by explicitly providing ((optimize("-O3"))).
- *
- * Example:
- * ccSPMGzq.s:1648: Error: reg pair must start from even reg at
- * operand 1 - `casp x21,x22,x0,x1,[x19]'
- */
-static  __attribute__((optimize("-O3"))) __rte_noinline int __hot
+static __rte_always_inline int
 npa_lf_aura_op_alloc_bulk(const int64_t wdata, int64_t * const addr,
 			  unsigned int n, void **obj_table)
 {
-	const __uint128_t wdata128 = ((__uint128_t)wdata << 64) | wdata;
+	register const uint64_t wdata64 __asm("x26") = wdata;
+	register const uint64_t wdata128 __asm("x27") = wdata;
 	uint64x2_t failed = vdupq_n_u64(~0);
 
 	switch (n) {
 	case 32:
 	{
-		__uint128_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
-		__uint128_t t10, t11;
-
 		asm volatile (
 		".cpu  generic+lse\n"
-		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t4], %H[t4], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t5], %H[t5], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t6], %H[t6], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t7], %H[t7], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t8], %H[t8], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t9], %H[t9], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t10], %H[t10], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t11], %H[t11], %[wdata], %H[wdata], [%[loc]]\n"
-		"fmov d16, %[t0]\n"
-		"fmov v16.D[1], %H[t0]\n"
-		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-		"fmov d17, %[t1]\n"
-		"fmov v17.D[1], %H[t1]\n"
-		"casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
-		"fmov d18, %[t2]\n"
-		"fmov v18.D[1], %H[t2]\n"
-		"casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
-		"fmov d19, %[t3]\n"
-		"fmov v19.D[1], %H[t3]\n"
-		"casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x16, x17, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x18, x19, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x20, x21, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x22, x23, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d17, x2\n"
+		"fmov v17.D[1], x3\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d18, x4\n"
+		"fmov v18.D[1], x5\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d19, x6\n"
+		"fmov v19.D[1], x7\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
 		"and %[failed].16B, %[failed].16B, v16.16B\n"
 		"and %[failed].16B, %[failed].16B, v17.16B\n"
 		"and %[failed].16B, %[failed].16B, v18.16B\n"
 		"and %[failed].16B, %[failed].16B, v19.16B\n"
-		"fmov d20, %[t4]\n"
-		"fmov v20.D[1], %H[t4]\n"
-		"fmov d21, %[t5]\n"
-		"fmov v21.D[1], %H[t5]\n"
-		"fmov d22, %[t6]\n"
-		"fmov v22.D[1], %H[t6]\n"
-		"fmov d23, %[t7]\n"
-		"fmov v23.D[1], %H[t7]\n"
+		"fmov d20, x8\n"
+		"fmov v20.D[1], x9\n"
+		"fmov d21, x10\n"
+		"fmov v21.D[1], x11\n"
+		"fmov d22, x12\n"
+		"fmov v22.D[1], x13\n"
+		"fmov d23, x14\n"
+		"fmov v23.D[1], x15\n"
 		"and %[failed].16B, %[failed].16B, v20.16B\n"
 		"and %[failed].16B, %[failed].16B, v21.16B\n"
 		"and %[failed].16B, %[failed].16B, v22.16B\n"
 		"and %[failed].16B, %[failed].16B, v23.16B\n"
 		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
 		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
-		"fmov d16, %[t8]\n"
-		"fmov v16.D[1], %H[t8]\n"
-		"fmov d17, %[t9]\n"
-		"fmov v17.D[1], %H[t9]\n"
-		"fmov d18, %[t10]\n"
-		"fmov v18.D[1], %H[t10]\n"
-		"fmov d19, %[t11]\n"
-		"fmov v19.D[1], %H[t11]\n"
+		"fmov d16, x16\n"
+		"fmov v16.D[1], x17\n"
+		"fmov d17, x18\n"
+		"fmov v17.D[1], x19\n"
+		"fmov d18, x20\n"
+		"fmov v18.D[1], x21\n"
+		"fmov d19, x22\n"
+		"fmov v19.D[1], x23\n"
 		"and %[failed].16B, %[failed].16B, v16.16B\n"
 		"and %[failed].16B, %[failed].16B, v17.16B\n"
 		"and %[failed].16B, %[failed].16B, v18.16B\n"
 		"and %[failed].16B, %[failed].16B, v19.16B\n"
-		"fmov d20, %[t0]\n"
-		"fmov v20.D[1], %H[t0]\n"
-		"fmov d21, %[t1]\n"
-		"fmov v21.D[1], %H[t1]\n"
-		"fmov d22, %[t2]\n"
-		"fmov v22.D[1], %H[t2]\n"
-		"fmov d23, %[t3]\n"
-		"fmov v23.D[1], %H[t3]\n"
+		"fmov d20, x0\n"
+		"fmov v20.D[1], x1\n"
+		"fmov d21, x2\n"
+		"fmov v21.D[1], x3\n"
+		"fmov d22, x4\n"
+		"fmov v22.D[1], x5\n"
+		"fmov d23, x6\n"
+		"fmov v23.D[1], x7\n"
 		"and %[failed].16B, %[failed].16B, v20.16B\n"
 		"and %[failed].16B, %[failed].16B, v21.16B\n"
 		"and %[failed].16B, %[failed].16B, v22.16B\n"
 		"and %[failed].16B, %[failed].16B, v23.16B\n"
 		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
 		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
-		: "+Q" (*addr), [failed] "=&w" (failed),
-		[t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
-		[t3] "=&r" (t3), [t4] "=&r" (t4), [t5] "=&r" (t5),
-		[t6] "=&r" (t6), [t7] "=&r" (t7), [t8] "=&r" (t8),
-		[t9] "=&r" (t9), [t10] "=&r" (t10), [t11] "=&r" (t11)
-		: [wdata] "r" (wdata128), [dst] "r" (obj_table),
-		[loc] "r" (addr)
-		: "memory", "v16", "v17", "v18",
-		"v19", "v20", "v21", "v22", "v23"
+		: "+Q" (*addr), [failed] "=&w" (failed)
+		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+		[dst] "r" (obj_table), [loc] "r" (addr)
+		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+		"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+		"x17", "x18", "x19", "x20", "x21", "x22", "x23", "v16", "v17",
+		"v18", "v19", "v20", "v21", "v22", "v23"
 		);
 		break;
 	}
 	case 16:
 	{
-		__uint128_t t0, t1, t2, t3, t4, t5, t6, t7;
-
 		asm volatile (
 		".cpu  generic+lse\n"
-		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t4], %H[t4], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t5], %H[t5], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t6], %H[t6], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t7], %H[t7], %[wdata], %H[wdata], [%[loc]]\n"
-		"fmov d16, %[t0]\n"
-		"fmov v16.D[1], %H[t0]\n"
-		"fmov d17, %[t1]\n"
-		"fmov v17.D[1], %H[t1]\n"
-		"fmov d18, %[t2]\n"
-		"fmov v18.D[1], %H[t2]\n"
-		"fmov d19, %[t3]\n"
-		"fmov v19.D[1], %H[t3]\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
+		"fmov d17, x2\n"
+		"fmov v17.D[1], x3\n"
+		"fmov d18, x4\n"
+		"fmov v18.D[1], x5\n"
+		"fmov d19, x6\n"
+		"fmov v19.D[1], x7\n"
 		"and %[failed].16B, %[failed].16B, v16.16B\n"
 		"and %[failed].16B, %[failed].16B, v17.16B\n"
 		"and %[failed].16B, %[failed].16B, v18.16B\n"
 		"and %[failed].16B, %[failed].16B, v19.16B\n"
-		"fmov d20, %[t4]\n"
-		"fmov v20.D[1], %H[t4]\n"
-		"fmov d21, %[t5]\n"
-		"fmov v21.D[1], %H[t5]\n"
-		"fmov d22, %[t6]\n"
-		"fmov v22.D[1], %H[t6]\n"
-		"fmov d23, %[t7]\n"
-		"fmov v23.D[1], %H[t7]\n"
+		"fmov d20, x8\n"
+		"fmov v20.D[1], x9\n"
+		"fmov d21, x10\n"
+		"fmov v21.D[1], x11\n"
+		"fmov d22, x12\n"
+		"fmov v22.D[1], x13\n"
+		"fmov d23, x14\n"
+		"fmov v23.D[1], x15\n"
 		"and %[failed].16B, %[failed].16B, v20.16B\n"
 		"and %[failed].16B, %[failed].16B, v21.16B\n"
 		"and %[failed].16B, %[failed].16B, v22.16B\n"
 		"and %[failed].16B, %[failed].16B, v23.16B\n"
 		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
 		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
-		: "+Q" (*addr), [failed] "=&w" (failed),
-		[t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
-		[t3] "=&r" (t3), [t4] "=&r" (t4), [t5] "=&r" (t5),
-		[t6] "=&r" (t6), [t7] "=&r" (t7)
-		: [wdata] "r" (wdata128), [dst] "r" (obj_table),
-		[loc] "r" (addr)
-		: "memory", "v16", "v17", "v18", "v19",
-		  "v20", "v21", "v22", "v23"
+		: "+Q" (*addr), [failed] "=&w" (failed)
+		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+		[dst] "r" (obj_table), [loc] "r" (addr)
+		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+		"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "v16",
+		"v17", "v18", "v19", "v20", "v21", "v22", "v23"
 		);
 		break;
 	}
 	case 8:
 	{
-		__uint128_t t0, t1, t2, t3;
-
 		asm volatile (
 		".cpu  generic+lse\n"
-		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
-		"fmov d16, %[t0]\n"
-		"fmov v16.D[1], %H[t0]\n"
-		"fmov d17, %[t1]\n"
-		"fmov v17.D[1], %H[t1]\n"
-		"fmov d18, %[t2]\n"
-		"fmov v18.D[1], %H[t2]\n"
-		"fmov d19, %[t3]\n"
-		"fmov v19.D[1], %H[t3]\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
+		"fmov d17, x2\n"
+		"fmov v17.D[1], x3\n"
+		"fmov d18, x4\n"
+		"fmov v18.D[1], x5\n"
+		"fmov d19, x6\n"
+		"fmov v19.D[1], x7\n"
 		"and %[failed].16B, %[failed].16B, v16.16B\n"
 		"and %[failed].16B, %[failed].16B, v17.16B\n"
 		"and %[failed].16B, %[failed].16B, v18.16B\n"
 		"and %[failed].16B, %[failed].16B, v19.16B\n"
 		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
-		: "+Q" (*addr), [failed] "=&w" (failed),
-		[t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
-		[t3] "=&r" (t3)
-		: [wdata] "r" (wdata128), [dst] "r" (obj_table),
-		[loc] "r" (addr)
-		: "memory", "v16", "v17", "v18", "v19"
+		: "+Q" (*addr), [failed] "=&w" (failed)
+		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+		[dst] "r" (obj_table), [loc] "r" (addr)
+		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+		"v16", "v17", "v18", "v19"
 		);
 		break;
 	}
 	case 4:
 	{
-		__uint128_t t0, t1;
-
 		asm volatile (
 		".cpu  generic+lse\n"
-		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-		"casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
-		"fmov d16, %[t0]\n"
-		"fmov v16.D[1], %H[t0]\n"
-		"fmov d17, %[t1]\n"
-		"fmov v17.D[1], %H[t1]\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
+		"fmov d17, x2\n"
+		"fmov v17.D[1], x3\n"
 		"and %[failed].16B, %[failed].16B, v16.16B\n"
 		"and %[failed].16B, %[failed].16B, v17.16B\n"
 		"st1 { v16.2d, v17.2d}, [%[dst]], 32\n"
-		: "+Q" (*addr), [failed] "=&w" (failed),
-		[t0] "=&r" (t0), [t1] "=&r" (t1)
-		: [wdata] "r" (wdata128), [dst] "r" (obj_table),
-		[loc] "r" (addr)
-		: "memory", "v16", "v17"
+		: "+Q" (*addr), [failed] "=&w" (failed)
+		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+		[dst] "r" (obj_table), [loc] "r" (addr)
+		: "memory", "x0", "x1", "x2", "x3", "v16", "v17"
 		);
 		break;
 	}
 	case 2:
 	{
-		__uint128_t t0;
-
 		asm volatile (
 		".cpu  generic+lse\n"
-		"casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-		"fmov d16, %[t0]\n"
-		"fmov v16.D[1], %H[t0]\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
 		"and %[failed].16B, %[failed].16B, v16.16B\n"
 		"st1 { v16.2d}, [%[dst]], 16\n"
-		: "+Q" (*addr), [failed] "=&w" (failed),
-		[t0] "=&r" (t0)
-		: [wdata] "r" (wdata128), [dst] "r" (obj_table),
-		[loc] "r" (addr)
-		: "memory", "v16"
+		: "+Q" (*addr), [failed] "=&w" (failed)
+		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+		[dst] "r" (obj_table), [loc] "r" (addr)
+		: "memory", "x0", "x1", "v16"
 		);
 		break;
 	}
@@ -308,7 +281,7 @@ otx2_npa_clear_alloc(struct rte_mempool *mp, void **obj_table, unsigned int n)
 	}
 }
 
-static inline int __hot
+static __rte_noinline int __hot
 otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned int n)
 {
 	const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id);
@@ -332,7 +305,8 @@ otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned int n)
 
 	return 0;
 }
-#endif
+
+#else
 
 static inline int __hot
 otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
@@ -359,6 +333,8 @@ otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
 	return 0;
 }
 
+#endif
+
 static unsigned int
 otx2_npa_get_count(const struct rte_mempool *mp)
 {
-- 
2.22.0


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-07-05  9:02 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-05  4:26 [dpdk-dev] [PATCH v2] mempool/octeontx2: fix arm64 clang build failure pbhagavatula
2019-07-05  9:02 ` Thomas Monjalon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).