* [dpdk-dev] [PATCH 1/4] baseband: enhancement of offload cost test
@ 2018-12-07 14:31 Kamil Chalupnik
2018-12-07 14:31 ` [dpdk-dev] [PATCH 2/4] baseband: enhancement of throughput test Kamil Chalupnik
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Kamil Chalupnik @ 2018-12-07 14:31 UTC (permalink / raw)
To: dev; +Cc: amr.mokhtar, akhil.goyal, Kamil Chalupnik
The offload cost test was improved in order to collect
more accurate results.
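For context, the measurement splits the enqueue cost into a driver
part and an accelerator part: the PMD accumulates the cycles spent in
the accelerator (or, for the SW turbo driver, in the SDK processing
functions) into stats.acc_offload_cycles, and the test subtracts that
from the total enqueue time to isolate the driver overhead. A minimal
sketch of the pattern, assuming dev_id, queue_id, ops and burst_sz are
set up as in the test (the test itself reads per-queue stats through a
local helper rather than rte_bbdev_stats_get()):

    struct rte_bbdev_stats stats;
    uint64_t enq_start_time = rte_rdtsc_precise();

    rte_bbdev_enqueue_dec_ops(dev_id, queue_id, ops, burst_sz);
    rte_bbdev_stats_get(dev_id, &stats);

    /* Driver-only cost: total elapsed cycles minus the cycles the
     * PMD accumulated in the accelerator for this enqueue.
     */
    uint64_t enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
            stats.acc_offload_cycles;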
Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
---
app/test-bbdev/test_bbdev_perf.c | 152 +++++++++++------------
config/common_base | 2 +-
drivers/baseband/turbo_sw/bbdev_turbo_software.c | 70 ++++++++---
lib/librte_bbdev/rte_bbdev.h | 9 +-
4 files changed, 135 insertions(+), 98 deletions(-)
diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index fbe6cc9..bf97edb 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -88,19 +88,19 @@ struct thread_params {
/* Stores time statistics */
struct test_time_stats {
/* Stores software enqueue total working time */
- uint64_t enq_sw_tot_time;
+ uint64_t enq_sw_total_time;
/* Stores minimum value of software enqueue working time */
uint64_t enq_sw_min_time;
/* Stores maximum value of software enqueue working time */
uint64_t enq_sw_max_time;
/* Stores turbo enqueue total working time */
- uint64_t enq_tur_tot_time;
- /* Stores minimum value of turbo enqueue working time */
- uint64_t enq_tur_min_time;
- /* Stores maximum value of turbo enqueue working time */
- uint64_t enq_tur_max_time;
+ uint64_t enq_acc_total_time;
+ /* Stores minimum value of accelerator enqueue working time */
+ uint64_t enq_acc_min_time;
+ /* Stores maximum value of accelerator enqueue working time */
+ uint64_t enq_acc_max_time;
/* Stores dequeue total working time */
- uint64_t deq_tot_time;
+ uint64_t deq_total_time;
/* Stores minimum value of dequeue working time */
uint64_t deq_min_time;
/* Stores maximum value of dequeue working time */
@@ -1200,12 +1200,15 @@ typedef int (test_case_function)(struct active_device *ad,
burst_sz = tp->op_params->burst_sz;
num_to_process = tp->op_params->num_to_process;
- if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
+ if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id, dec_ops,
burst_sz);
- else
+ rte_bbdev_dec_op_free_bulk(dec_ops, deq);
+ } else {
deq = rte_bbdev_dequeue_enc_ops(dev_id, queue_id, enc_ops,
burst_sz);
+ rte_bbdev_enc_op_free_bulk(enc_ops, deq);
+ }
if (deq < burst_sz) {
printf(
@@ -1316,8 +1319,6 @@ typedef int (test_case_function)(struct active_device *ad,
enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id, queue_id, ops,
num_to_enq);
-
- rte_bbdev_dec_op_free_bulk(ops, num_to_enq);
}
if (allocs_failed > 0)
@@ -1380,8 +1381,6 @@ typedef int (test_case_function)(struct active_device *ad,
enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id, queue_id, ops,
num_to_enq);
-
- rte_bbdev_enc_op_free_bulk(ops, num_to_enq);
}
if (allocs_failed > 0)
@@ -1575,13 +1574,14 @@ typedef int (test_case_function)(struct active_device *ad,
RTE_LCORE_FOREACH(lcore_id) {
if (iter++ >= used_cores)
break;
- printf("\tlcore_id: %u, throughput: %.8lg MOPS, %.8lg Mbps\n",
- lcore_id, t_params[lcore_id].mops, t_params[lcore_id].mbps);
+ printf("Throughput for core (%u): %.8lg MOPS, %.8lg Mbps\n",
+ lcore_id, t_params[lcore_id].mops,
+ t_params[lcore_id].mbps);
total_mops += t_params[lcore_id].mops;
total_mbps += t_params[lcore_id].mbps;
}
printf(
- "\n\tTotal stats for %u cores: throughput: %.8lg MOPS, %.8lg Mbps\n",
+ "\nTotal throughput for %u cores: %.8lg MOPS, %.8lg Mbps\n",
used_cores, total_mops, total_mbps);
}
@@ -1882,7 +1882,7 @@ typedef int (test_case_function)(struct active_device *ad,
TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
printf(
- "Validation/Latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+ "\nValidation/Latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
info.dev_name, burst_sz, num_to_process, op_type_str);
if (op_type == RTE_BBDEV_OP_TURBO_DEC)
@@ -1899,10 +1899,10 @@ typedef int (test_case_function)(struct active_device *ad,
if (iter <= 0)
return TEST_FAILED;
- printf("\toperation latency:\n"
- "\t\tavg latency: %lg cycles, %lg us\n"
- "\t\tmin latency: %lg cycles, %lg us\n"
- "\t\tmax latency: %lg cycles, %lg us\n",
+ printf("Operation latency:\n"
+ "\tavg latency: %lg cycles, %lg us\n"
+ "\tmin latency: %lg cycles, %lg us\n"
+ "\tmax latency: %lg cycles, %lg us\n",
(double)total_time / (double)iter,
(double)(total_time * 1000000) / (double)iter /
(double)rte_get_tsc_hz(), (double)min_time,
@@ -1930,7 +1930,7 @@ typedef int (test_case_function)(struct active_device *ad,
stats->dequeued_count = q_stats->dequeued_count;
stats->enqueue_err_count = q_stats->enqueue_err_count;
stats->dequeue_err_count = q_stats->dequeue_err_count;
- stats->offload_time = q_stats->offload_time;
+ stats->acc_offload_cycles = q_stats->acc_offload_cycles;
return 0;
}
@@ -1974,18 +1974,18 @@ typedef int (test_case_function)(struct active_device *ad,
queue_id, dev_id);
enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
- stats.offload_time;
+ stats.acc_offload_cycles;
time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
enq_sw_last_time);
time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
enq_sw_last_time);
- time_st->enq_sw_tot_time += enq_sw_last_time;
+ time_st->enq_sw_total_time += enq_sw_last_time;
- time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
- stats.offload_time);
- time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
- stats.offload_time);
- time_st->enq_tur_tot_time += stats.offload_time;
+ time_st->enq_acc_max_time = RTE_MAX(time_st->enq_acc_max_time,
+ stats.acc_offload_cycles);
+ time_st->enq_acc_min_time = RTE_MIN(time_st->enq_acc_min_time,
+ stats.acc_offload_cycles);
+ time_st->enq_acc_total_time += stats.acc_offload_cycles;
/* ensure enqueue has been completed */
rte_delay_ms(10);
@@ -2003,7 +2003,7 @@ typedef int (test_case_function)(struct active_device *ad,
deq_last_time);
time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
deq_last_time);
- time_st->deq_tot_time += deq_last_time;
+ time_st->deq_total_time += deq_last_time;
/* Dequeue remaining operations if needed*/
while (burst_sz != deq)
@@ -2055,18 +2055,18 @@ typedef int (test_case_function)(struct active_device *ad,
queue_id, dev_id);
enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
- stats.offload_time;
+ stats.acc_offload_cycles;
time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
enq_sw_last_time);
time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
enq_sw_last_time);
- time_st->enq_sw_tot_time += enq_sw_last_time;
+ time_st->enq_sw_total_time += enq_sw_last_time;
- time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
- stats.offload_time);
- time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
- stats.offload_time);
- time_st->enq_tur_tot_time += stats.offload_time;
+ time_st->enq_acc_max_time = RTE_MAX(time_st->enq_acc_max_time,
+ stats.acc_offload_cycles);
+ time_st->enq_acc_min_time = RTE_MIN(time_st->enq_acc_min_time,
+ stats.acc_offload_cycles);
+ time_st->enq_acc_total_time += stats.acc_offload_cycles;
/* ensure enqueue has been completed */
rte_delay_ms(10);
@@ -2084,7 +2084,7 @@ typedef int (test_case_function)(struct active_device *ad,
deq_last_time);
time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
deq_last_time);
- time_st->deq_tot_time += deq_last_time;
+ time_st->deq_total_time += deq_last_time;
while (burst_sz != deq)
deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
@@ -2121,7 +2121,7 @@ typedef int (test_case_function)(struct active_device *ad,
memset(&time_st, 0, sizeof(struct test_time_stats));
time_st.enq_sw_min_time = UINT64_MAX;
- time_st.enq_tur_min_time = UINT64_MAX;
+ time_st.enq_acc_min_time = UINT64_MAX;
time_st.deq_min_time = UINT64_MAX;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
@@ -2134,7 +2134,7 @@ typedef int (test_case_function)(struct active_device *ad,
TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
printf(
- "Offload latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+ "\nOffload latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
info.dev_name, burst_sz, num_to_process, op_type_str);
if (op_type == RTE_BBDEV_OP_TURBO_DEC)
@@ -2149,36 +2149,36 @@ typedef int (test_case_function)(struct active_device *ad,
if (iter <= 0)
return TEST_FAILED;
- printf("\tenq offload cost latency:\n"
- "\t\tsoftware avg %lg cycles, %lg us\n"
- "\t\tsoftware min %lg cycles, %lg us\n"
- "\t\tsoftware max %lg cycles, %lg us\n"
- "\t\tturbo avg %lg cycles, %lg us\n"
- "\t\tturbo min %lg cycles, %lg us\n"
- "\t\tturbo max %lg cycles, %lg us\n",
- (double)time_st.enq_sw_tot_time / (double)iter,
- (double)(time_st.enq_sw_tot_time * 1000000) /
+ printf("Enqueue offload cost latency:\n"
+ "\tDriver offload avg %lg cycles, %lg us\n"
+ "\tDriver offload min %lg cycles, %lg us\n"
+ "\tDriver offload max %lg cycles, %lg us\n"
+ "\tAccelerator offload avg %lg cycles, %lg us\n"
+ "\tAccelerator offload min %lg cycles, %lg us\n"
+ "\tAccelerator offload max %lg cycles, %lg us\n",
+ (double)time_st.enq_sw_total_time / (double)iter,
+ (double)(time_st.enq_sw_total_time * 1000000) /
(double)iter / (double)rte_get_tsc_hz(),
(double)time_st.enq_sw_min_time,
(double)(time_st.enq_sw_min_time * 1000000) /
rte_get_tsc_hz(), (double)time_st.enq_sw_max_time,
(double)(time_st.enq_sw_max_time * 1000000) /
- rte_get_tsc_hz(), (double)time_st.enq_tur_tot_time /
+ rte_get_tsc_hz(), (double)time_st.enq_acc_total_time /
(double)iter,
- (double)(time_st.enq_tur_tot_time * 1000000) /
+ (double)(time_st.enq_acc_total_time * 1000000) /
(double)iter / (double)rte_get_tsc_hz(),
- (double)time_st.enq_tur_min_time,
- (double)(time_st.enq_tur_min_time * 1000000) /
- rte_get_tsc_hz(), (double)time_st.enq_tur_max_time,
- (double)(time_st.enq_tur_max_time * 1000000) /
+ (double)time_st.enq_acc_min_time,
+ (double)(time_st.enq_acc_min_time * 1000000) /
+ rte_get_tsc_hz(), (double)time_st.enq_acc_max_time,
+ (double)(time_st.enq_acc_max_time * 1000000) /
rte_get_tsc_hz());
- printf("\tdeq offload cost latency - one op:\n"
- "\t\tavg %lg cycles, %lg us\n"
- "\t\tmin %lg cycles, %lg us\n"
- "\t\tmax %lg cycles, %lg us\n",
- (double)time_st.deq_tot_time / (double)iter,
- (double)(time_st.deq_tot_time * 1000000) /
+ printf("Dequeue offload cost latency - one op:\n"
+ "\tavg %lg cycles, %lg us\n"
+ "\tmin %lg cycles, %lg us\n"
+ "\tmax %lg cycles, %lg us\n",
+ (double)time_st.deq_total_time / (double)iter,
+ (double)(time_st.deq_total_time * 1000000) /
(double)iter / (double)rte_get_tsc_hz(),
(double)time_st.deq_min_time,
(double)(time_st.deq_min_time * 1000000) /
@@ -2194,7 +2194,7 @@ typedef int (test_case_function)(struct active_device *ad,
static int
offload_latency_empty_q_test_dec(uint16_t dev_id, uint16_t queue_id,
const uint16_t num_to_process, uint16_t burst_sz,
- uint64_t *deq_tot_time, uint64_t *deq_min_time,
+ uint64_t *deq_total_time, uint64_t *deq_min_time,
uint64_t *deq_max_time)
{
int i, deq_total;
@@ -2214,7 +2214,7 @@ typedef int (test_case_function)(struct active_device *ad,
deq_last_time = rte_rdtsc_precise() - deq_start_time;
*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
- *deq_tot_time += deq_last_time;
+ *deq_total_time += deq_last_time;
}
return i;
@@ -2223,7 +2223,7 @@ typedef int (test_case_function)(struct active_device *ad,
static int
offload_latency_empty_q_test_enc(uint16_t dev_id, uint16_t queue_id,
const uint16_t num_to_process, uint16_t burst_sz,
- uint64_t *deq_tot_time, uint64_t *deq_min_time,
+ uint64_t *deq_total_time, uint64_t *deq_min_time,
uint64_t *deq_max_time)
{
int i, deq_total;
@@ -2242,7 +2242,7 @@ typedef int (test_case_function)(struct active_device *ad,
deq_last_time = rte_rdtsc_precise() - deq_start_time;
*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
- *deq_tot_time += deq_last_time;
+ *deq_total_time += deq_last_time;
}
return i;
@@ -2261,7 +2261,7 @@ typedef int (test_case_function)(struct active_device *ad,
return TEST_SKIPPED;
#else
int iter;
- uint64_t deq_tot_time, deq_min_time, deq_max_time;
+ uint64_t deq_total_time, deq_min_time, deq_max_time;
uint16_t burst_sz = op_params->burst_sz;
const uint16_t num_to_process = op_params->num_to_process;
const enum rte_bbdev_op_type op_type = test_vector.op_type;
@@ -2269,7 +2269,7 @@ typedef int (test_case_function)(struct active_device *ad,
struct rte_bbdev_info info;
const char *op_type_str;
- deq_tot_time = deq_max_time = 0;
+ deq_total_time = deq_max_time = 0;
deq_min_time = UINT64_MAX;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
@@ -2281,27 +2281,27 @@ typedef int (test_case_function)(struct active_device *ad,
TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
printf(
- "Offload latency empty dequeue test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+ "\nOffload latency empty dequeue test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
info.dev_name, burst_sz, num_to_process, op_type_str);
if (op_type == RTE_BBDEV_OP_TURBO_DEC)
iter = offload_latency_empty_q_test_dec(ad->dev_id, queue_id,
- num_to_process, burst_sz, &deq_tot_time,
+ num_to_process, burst_sz, &deq_total_time,
&deq_min_time, &deq_max_time);
else
iter = offload_latency_empty_q_test_enc(ad->dev_id, queue_id,
- num_to_process, burst_sz, &deq_tot_time,
+ num_to_process, burst_sz, &deq_total_time,
&deq_min_time, &deq_max_time);
if (iter <= 0)
return TEST_FAILED;
- printf("\tempty deq offload\n"
- "\t\tavg. latency: %lg cycles, %lg us\n"
- "\t\tmin. latency: %lg cycles, %lg us\n"
- "\t\tmax. latency: %lg cycles, %lg us\n",
- (double)deq_tot_time / (double)iter,
- (double)(deq_tot_time * 1000000) / (double)iter /
+ printf("Empty dequeue offload\n"
+ "\tavg. latency: %lg cycles, %lg us\n"
+ "\tmin. latency: %lg cycles, %lg us\n"
+ "\tmax. latency: %lg cycles, %lg us\n",
+ (double)deq_total_time / (double)iter,
+ (double)(deq_total_time * 1000000) / (double)iter /
(double)rte_get_tsc_hz(), (double)deq_min_time,
(double)(deq_min_time * 1000000) / rte_get_tsc_hz(),
(double)deq_max_time, (double)(deq_max_time * 1000000) /
diff --git a/config/common_base b/config/common_base
index d12ae98..3ff98bb 100644
--- a/config/common_base
+++ b/config/common_base
@@ -481,7 +481,7 @@ CONFIG_RTE_PMD_PACKET_PREFETCH=y
#
CONFIG_RTE_LIBRTE_BBDEV=y
CONFIG_RTE_BBDEV_MAX_DEVS=128
-CONFIG_RTE_BBDEV_OFFLOAD_COST=n
+CONFIG_RTE_BBDEV_OFFLOAD_COST=y
#
# Compile PMD for NULL bbdev device
diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index 8ceb276..57f6ba1 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -510,9 +510,10 @@ struct turbo_sw_queue {
#ifdef RTE_BBDEV_OFFLOAD_COST
start_time = rte_rdtsc_precise();
#endif
+ /* CRC24A generation */
bblib_lte_crc24a_gen(&crc_req, &crc_resp);
#ifdef RTE_BBDEV_OFFLOAD_COST
- q_stats->offload_time += rte_rdtsc_precise() - start_time;
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
#endif
} else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) {
/* CRC24B */
@@ -542,9 +543,10 @@ struct turbo_sw_queue {
#ifdef RTE_BBDEV_OFFLOAD_COST
start_time = rte_rdtsc_precise();
#endif
+ /* CRC24B generation */
bblib_lte_crc24b_gen(&crc_req, &crc_resp);
#ifdef RTE_BBDEV_OFFLOAD_COST
- q_stats->offload_time += rte_rdtsc_precise() - start_time;
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
#endif
} else {
ret = is_enc_input_valid(k, k_idx, total_left);
@@ -596,15 +598,14 @@ struct turbo_sw_queue {
#ifdef RTE_BBDEV_OFFLOAD_COST
start_time = rte_rdtsc_precise();
#endif
-
+ /* Turbo encoding */
if (bblib_turbo_encoder(&turbo_req, &turbo_resp) != 0) {
op->status |= 1 << RTE_BBDEV_DRV_ERROR;
rte_bbdev_log(ERR, "Turbo Encoder failed");
return;
}
-
#ifdef RTE_BBDEV_OFFLOAD_COST
- q_stats->offload_time += rte_rdtsc_precise() - start_time;
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
#endif
/* Restore 3 first bytes of next CB if they were overwritten by CRC*/
@@ -671,23 +672,21 @@ struct turbo_sw_queue {
#ifdef RTE_BBDEV_OFFLOAD_COST
start_time = rte_rdtsc_precise();
#endif
-
+ /* Rate-Matching */
if (bblib_rate_match_dl(&rm_req, &rm_resp) != 0) {
op->status |= 1 << RTE_BBDEV_DRV_ERROR;
rte_bbdev_log(ERR, "Rate matching failed");
return;
}
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
/* SW fills an entire last byte even if E%8 != 0. Clear the
* superfluous data bits for consistency with HW device.
*/
mask_id = (e & 7) >> 1;
rm_out[out_len - 1] &= mask_out[mask_id];
-
-#ifdef RTE_BBDEV_OFFLOAD_COST
- q_stats->offload_time += rte_rdtsc_precise() - start_time;
-#endif
-
enc->output.length += rm_resp.OutputLen;
} else {
/* Rate matching is bypassed */
@@ -798,7 +797,7 @@ struct turbo_sw_queue {
{
uint16_t i;
#ifdef RTE_BBDEV_OFFLOAD_COST
- queue_stats->offload_time = 0;
+ queue_stats->acc_offload_cycles = 0;
#endif
for (i = 0; i < nb_ops; ++i)
@@ -905,7 +904,8 @@ struct turbo_sw_queue {
process_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
uint8_t c, uint16_t k, uint16_t kw, struct rte_mbuf *m_in,
struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset,
- bool check_crc_24b, uint16_t crc24_overlap, uint16_t total_left)
+ bool check_crc_24b, uint16_t crc24_overlap, uint16_t total_left,
+ struct rte_bbdev_stats *q_stats)
{
int ret;
int32_t k_idx;
@@ -917,6 +917,11 @@ struct turbo_sw_queue {
struct bblib_turbo_decoder_request turbo_req;
struct bblib_turbo_decoder_response turbo_resp;
struct rte_bbdev_op_turbo_dec *dec = &op->turbo_dec;
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ uint64_t start_time;
+#else
+ RTE_SET_USED(q_stats);
+#endif
k_idx = compute_idx(k);
@@ -942,7 +947,14 @@ struct turbo_sw_queue {
deint_req.pharqbuffer = q->deint_input;
deint_req.ncb = ncb_without_null;
deint_resp.pinteleavebuffer = q->deint_output;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ start_time = rte_rdtsc_precise();
+#endif
bblib_deinterleave_ul(&deint_req, &deint_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
} else
move_padding_bytes(in, q->deint_output, k, ncb);
@@ -961,7 +973,15 @@ struct turbo_sw_queue {
adapter_req.ncb = ncb_without_null;
adapter_req.pinteleavebuffer = adapter_input;
adapter_resp.pharqout = q->adapter_output;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ start_time = rte_rdtsc_precise();
+#endif
+ /* Turbo decode adaptation */
bblib_turbo_adapter_ul(&adapter_req, &adapter_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
out = (uint8_t *)rte_pktmbuf_append(m_out, ((k - crc24_overlap) >> 3));
if (out == NULL) {
@@ -986,12 +1006,20 @@ struct turbo_sw_queue {
turbo_resp.ag_buf = q->ag;
turbo_resp.cb_buf = q->code_block;
turbo_resp.output = out;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ start_time = rte_rdtsc_precise();
+#endif
+ /* Turbo decode */
iter_cnt = bblib_turbo_decoder(&turbo_req, &turbo_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
dec->hard_output.length += (k >> 3);
if (iter_cnt > 0) {
/* Temporary solution for returned iter_count from SDK */
- iter_cnt = (iter_cnt - 1) / 2;
+ iter_cnt = (iter_cnt - 1) >> 1;
dec->iter_count = RTE_MAX(iter_cnt, dec->iter_count);
} else {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
@@ -1001,7 +1029,8 @@ struct turbo_sw_queue {
}
static inline void
-enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op)
+enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
+ struct rte_bbdev_stats *queue_stats)
{
uint8_t c, r = 0;
uint16_t kw, k = 0;
@@ -1053,7 +1082,7 @@ struct turbo_sw_queue {
process_dec_cb(q, op, c, k, kw, m_in, m_out, in_offset,
out_offset, check_bit(dec->op_flags,
RTE_BBDEV_TURBO_CRC_TYPE_24B), crc24_overlap,
- total_left);
+ total_left, queue_stats);
/* To keep CRC24 attached to end of Code block, use
* RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP flag as it
* removed by default once verified.
@@ -1075,12 +1104,15 @@ struct turbo_sw_queue {
static inline uint16_t
enqueue_dec_all_ops(struct turbo_sw_queue *q, struct rte_bbdev_dec_op **ops,
- uint16_t nb_ops)
+ uint16_t nb_ops, struct rte_bbdev_stats *queue_stats)
{
uint16_t i;
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ queue_stats->acc_offload_cycles = 0;
+#endif
for (i = 0; i < nb_ops; ++i)
- enqueue_dec_one_op(q, ops[i]);
+ enqueue_dec_one_op(q, ops[i], queue_stats);
return rte_ring_enqueue_burst(q->processed_pkts, (void **)ops, nb_ops,
NULL);
@@ -1112,7 +1144,7 @@ struct turbo_sw_queue {
struct turbo_sw_queue *q = queue;
uint16_t nb_enqueued = 0;
- nb_enqueued = enqueue_dec_all_ops(q, ops, nb_ops);
+ nb_enqueued = enqueue_dec_all_ops(q, ops, nb_ops, &q_data->queue_stats);
q_data->queue_stats.enqueue_err_count += nb_ops - nb_enqueued;
q_data->queue_stats.enqueued_count += nb_enqueued;
diff --git a/lib/librte_bbdev/rte_bbdev.h b/lib/librte_bbdev/rte_bbdev.h
index 25ef409..da8cf07 100644
--- a/lib/librte_bbdev/rte_bbdev.h
+++ b/lib/librte_bbdev/rte_bbdev.h
@@ -239,8 +239,13 @@ struct rte_bbdev_stats {
uint64_t enqueue_err_count;
/** Total error count on operations dequeued */
uint64_t dequeue_err_count;
- /** Offload time */
- uint64_t offload_time;
+ /** CPU cycles consumed by the (HW/SW) accelerator device to offload
+ * the enqueue request to its internal queues.
+ * - For a HW device this is the cycles consumed in MMIO write
+ * - For a SW (vdev) device, this is the processing time of the
+ * bbdev operation
+ */
+ uint64_t acc_offload_cycles;
};
/**
--
1.8.3.1
* [dpdk-dev] [PATCH 2/4] baseband: enhancement of throughput test
2018-12-07 14:31 [dpdk-dev] [PATCH 1/4] baseband: enhancement of offload cost test Kamil Chalupnik
@ 2018-12-07 14:31 ` Kamil Chalupnik
2018-12-07 14:31 ` [dpdk-dev] [PATCH 4/4] baseband: enhancement of interrupt test Kamil Chalupnik
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test Kamil Chalupnik
2 siblings, 0 replies; 13+ messages in thread
From: Kamil Chalupnik @ 2018-12-07 14:31 UTC (permalink / raw)
To: dev; +Cc: amr.mokhtar, akhil.goyal, Kamil Chalupnik
Improvements added to the throughput test:
- the test is run in a loop (the number of iterations is specified by
the TEST_REPETITIONS define), which ensures more accurate results
- the length of the input data is calculated based on the number of
CBs in the TB
- the maximum number of decoding iterations is gathered from the results
- new functions responsible for printing the results were added
- small fixes for memory management
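The resulting main loop, condensed from the decode path in the diff
below (a sketch only; allocation, validation and error handling are
omitted):

    /* Ops are allocated and populated once; the same batch is then
     * pushed through the queue TEST_REPETITIONS times while the
     * elapsed TSC cycles are accumulated.
     */
    for (i = 0; i < TEST_REPETITIONS; ++i) {
        start_time = rte_rdtsc_precise();

        for (enq = 0, deq = 0; enq < num_ops;) {
            num_to_enq = RTE_MIN(burst_sz, num_ops - enq);
            enq += rte_bbdev_enqueue_dec_ops(dev_id, queue_id,
                    &ops_enq[enq], num_to_enq);
            deq += rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
                    &ops_deq[deq], enq - deq);
        }
        /* drain the operations still in flight */
        while (deq < enq)
            deq += rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
                    &ops_deq[deq], enq - deq);

        total_time += rte_rdtsc_precise() - start_time;
    }

Throughput is then num_ops * TEST_REPETITIONS operations over
total_time cycles, and Mbps is derived from the TB size in bits
(calc_dec_TB_size/calc_enc_TB_size) instead of the raw input length.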
Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
---
app/test-bbdev/main.c | 2 -
app/test-bbdev/test_bbdev_perf.c | 471 ++++++++++++++++++++-----------------
app/test-bbdev/test_bbdev_vector.c | 7 +
lib/librte_bbdev/rte_bbdev_op.h | 2 +
4 files changed, 263 insertions(+), 219 deletions(-)
diff --git a/app/test-bbdev/main.c b/app/test-bbdev/main.c
index 41b54bb..7af2522 100644
--- a/app/test-bbdev/main.c
+++ b/app/test-bbdev/main.c
@@ -316,8 +316,6 @@
return 1;
}
- rte_log_set_global_level(RTE_LOG_INFO);
-
/* If no argument provided - run all tests */
if (test_params.num_tests == 0)
return run_all_tests();
diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index bf97edb..a25e3a7 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -23,6 +23,7 @@
#define GET_SOCKET(socket_id) (((socket_id) == SOCKET_ID_ANY) ? 0 : (socket_id))
#define MAX_QUEUES RTE_MAX_LCORE
+#define TEST_REPETITIONS 1000
#define OPS_CACHE_SIZE 256U
#define OPS_POOL_SIZE_MIN 511U /* 0.5K per queue */
@@ -77,8 +78,9 @@ struct thread_params {
uint8_t dev_id;
uint16_t queue_id;
uint64_t start_time;
- double mops;
+ double ops_per_sec;
double mbps;
+ uint8_t iter_count;
rte_atomic16_t nb_dequeued;
rte_atomic16_t processing_status;
struct test_op_params *op_params;
@@ -757,6 +759,8 @@ typedef int (test_case_function)(struct active_device *ad,
turbo_dec->tb_params.c_neg;
ops[i]->turbo_dec.tb_params.cab =
turbo_dec->tb_params.cab;
+ ops[i]->turbo_dec.tb_params.r =
+ turbo_dec->tb_params.r;
} else {
ops[i]->turbo_dec.cb_params.e = turbo_dec->cb_params.e;
ops[i]->turbo_dec.cb_params.k = turbo_dec->cb_params.k;
@@ -884,47 +888,6 @@ typedef int (test_case_function)(struct active_device *ad,
}
static int
-validate_dec_buffers(struct rte_bbdev_dec_op *ref_op, struct test_buffers *bufs,
- const uint16_t num_to_process)
-{
- int i;
-
- struct op_data_entries *hard_data_orig =
- &test_vector.entries[DATA_HARD_OUTPUT];
- struct op_data_entries *soft_data_orig =
- &test_vector.entries[DATA_SOFT_OUTPUT];
-
- for (i = 0; i < num_to_process; i++) {
- TEST_ASSERT_SUCCESS(validate_op_chain(&bufs->hard_outputs[i],
- hard_data_orig),
- "Hard output buffers are not equal");
- if (ref_op->turbo_dec.op_flags &
- RTE_BBDEV_TURBO_SOFT_OUTPUT)
- TEST_ASSERT_SUCCESS(validate_op_chain(
- &bufs->soft_outputs[i],
- soft_data_orig),
- "Soft output buffers are not equal");
- }
-
- return TEST_SUCCESS;
-}
-
-static int
-validate_enc_buffers(struct test_buffers *bufs, const uint16_t num_to_process)
-{
- int i;
-
- struct op_data_entries *hard_data_orig =
- &test_vector.entries[DATA_HARD_OUTPUT];
-
- for (i = 0; i < num_to_process; i++)
- TEST_ASSERT_SUCCESS(validate_op_chain(&bufs->hard_outputs[i],
- hard_data_orig), "");
-
- return TEST_SUCCESS;
-}
-
-static int
validate_dec_op(struct rte_bbdev_dec_op **ops, const uint16_t n,
struct rte_bbdev_dec_op *ref_op, const int vector_mask)
{
@@ -1016,6 +979,44 @@ typedef int (test_case_function)(struct active_device *ad,
entry->segments[i].length;
}
+static uint32_t
+calc_dec_TB_size(struct rte_bbdev_dec_op *op)
+{
+ uint8_t i;
+ uint32_t c, r, tb_size = 0;
+
+ if (op->turbo_dec.code_block_mode) {
+ tb_size = op->turbo_dec.tb_params.k_neg;
+ } else {
+ c = op->turbo_dec.tb_params.c;
+ r = op->turbo_dec.tb_params.r;
+ for (i = 0; i < c-r; i++)
+ tb_size += (r < op->turbo_dec.tb_params.c_neg) ?
+ op->turbo_dec.tb_params.k_neg :
+ op->turbo_dec.tb_params.k_pos;
+ }
+ return tb_size;
+}
+
+static uint32_t
+calc_enc_TB_size(struct rte_bbdev_enc_op *op)
+{
+ uint8_t i;
+ uint32_t c, r, tb_size = 0;
+
+ if (op->turbo_enc.code_block_mode) {
+ tb_size = op->turbo_enc.tb_params.k_neg;
+ } else {
+ c = op->turbo_enc.tb_params.c;
+ r = op->turbo_enc.tb_params.r;
+ for (i = 0; i < c-r; i++)
+ tb_size += (r < op->turbo_enc.tb_params.c_neg) ?
+ op->turbo_enc.tb_params.k_neg :
+ op->turbo_enc.tb_params.k_pos;
+ }
+ return tb_size;
+}
+
static int
init_test_op_params(struct test_op_params *op_params,
enum rte_bbdev_op_type op_type, const int expected_status,
@@ -1163,17 +1164,13 @@ typedef int (test_case_function)(struct active_device *ad,
int ret;
uint16_t i;
uint64_t total_time;
- uint16_t deq, burst_sz, num_to_process;
+ uint16_t deq, burst_sz, num_ops;
uint16_t queue_id = INVALID_QUEUE_ID;
struct rte_bbdev_dec_op *dec_ops[MAX_BURST];
struct rte_bbdev_enc_op *enc_ops[MAX_BURST];
- struct test_buffers *bufs;
struct rte_bbdev_info info;
- /* Input length in bytes, million operations per second,
- * million bits per second.
- */
- double in_len;
+ double tb_len_bits;
struct thread_params *tp = cb_arg;
RTE_SET_USED(ret_param);
@@ -1198,7 +1195,7 @@ typedef int (test_case_function)(struct active_device *ad,
}
burst_sz = tp->op_params->burst_sz;
- num_to_process = tp->op_params->num_to_process;
+ num_ops = tp->op_params->num_to_process;
if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id, dec_ops,
@@ -1218,7 +1215,7 @@ typedef int (test_case_function)(struct active_device *ad,
return;
}
- if (rte_atomic16_read(&tp->nb_dequeued) + deq < num_to_process) {
+ if (rte_atomic16_read(&tp->nb_dequeued) + deq < num_ops) {
rte_atomic16_add(&tp->nb_dequeued, deq);
return;
}
@@ -1227,14 +1224,18 @@ typedef int (test_case_function)(struct active_device *ad,
rte_bbdev_info_get(dev_id, &info);
- bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
-
ret = TEST_SUCCESS;
- if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
- ret = validate_dec_buffers(tp->op_params->ref_dec_op, bufs,
- num_to_process);
- else if (test_vector.op_type == RTE_BBDEV_OP_TURBO_ENC)
- ret = validate_enc_buffers(bufs, num_to_process);
+
+ if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
+ struct rte_bbdev_dec_op *ref_op = tp->op_params->ref_dec_op;
+ ret = validate_dec_op(dec_ops, num_ops, ref_op,
+ tp->op_params->vector_mask);
+ rte_bbdev_dec_op_free_bulk(dec_ops, deq);
+ } else if (test_vector.op_type == RTE_BBDEV_OP_TURBO_ENC) {
+ struct rte_bbdev_enc_op *ref_op = tp->op_params->ref_enc_op;
+ ret = validate_enc_op(enc_ops, num_ops, ref_op);
+ rte_bbdev_enc_op_free_bulk(enc_ops, deq);
+ }
if (ret) {
printf("Buffers validation failed\n");
@@ -1243,13 +1244,13 @@ typedef int (test_case_function)(struct active_device *ad,
switch (test_vector.op_type) {
case RTE_BBDEV_OP_TURBO_DEC:
- in_len = tp->op_params->ref_dec_op->turbo_dec.input.length;
+ tb_len_bits = calc_dec_TB_size(tp->op_params->ref_dec_op);
break;
case RTE_BBDEV_OP_TURBO_ENC:
- in_len = tp->op_params->ref_enc_op->turbo_enc.input.length;
+ tb_len_bits = calc_enc_TB_size(tp->op_params->ref_enc_op);
break;
case RTE_BBDEV_OP_NONE:
- in_len = 0.0;
+ tb_len_bits = 0.0;
break;
default:
printf("Unknown op type: %d\n", test_vector.op_type);
@@ -1257,9 +1258,9 @@ typedef int (test_case_function)(struct active_device *ad,
return;
}
- tp->mops = ((double)num_to_process / 1000000.0) /
+ tp->ops_per_sec = ((double)num_ops) /
((double)total_time / (double)rte_get_tsc_hz());
- tp->mbps = ((double)num_to_process * in_len * 8 / 1000000.0) /
+ tp->mbps = (((double)(num_ops * tb_len_bits)) / 1000000.0) /
((double)total_time / (double)rte_get_tsc_hz());
rte_atomic16_add(&tp->nb_dequeued, deq);
@@ -1270,14 +1271,14 @@ typedef int (test_case_function)(struct active_device *ad,
{
struct thread_params *tp = arg;
unsigned int enqueued;
- struct rte_bbdev_dec_op *ops[MAX_BURST];
const uint16_t queue_id = tp->queue_id;
const uint16_t burst_sz = tp->op_params->burst_sz;
const uint16_t num_to_process = tp->op_params->num_to_process;
+ struct rte_bbdev_dec_op *ops[num_to_process];
struct test_buffers *bufs = NULL;
- unsigned int allocs_failed = 0;
struct rte_bbdev_info info;
int ret;
+ uint16_t num_to_enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
@@ -1287,6 +1288,11 @@ typedef int (test_case_function)(struct active_device *ad,
tp->dev_id, queue_id);
rte_bbdev_info_get(tp->dev_id, &info);
+
+ TEST_ASSERT_SUCCESS((num_to_process > info.drv.queue_size_lim),
+ "NUM_OPS cannot exceed %u for this device",
+ info.drv.queue_size_lim);
+
bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
rte_atomic16_clear(&tp->processing_status);
@@ -1295,36 +1301,27 @@ typedef int (test_case_function)(struct active_device *ad,
while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
rte_pause();
+ ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops,
+ num_to_process);
+ TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops",
+ num_to_process);
+ if (test_vector.op_type != RTE_BBDEV_OP_NONE)
+ copy_reference_dec_op(ops, num_to_process, 0, bufs->inputs,
+ bufs->hard_outputs, bufs->soft_outputs,
+ tp->op_params->ref_dec_op);
+
tp->start_time = rte_rdtsc_precise();
for (enqueued = 0; enqueued < num_to_process;) {
- uint16_t num_to_enq = burst_sz;
+ num_to_enq = burst_sz;
if (unlikely(num_to_process - enqueued < num_to_enq))
num_to_enq = num_to_process - enqueued;
- ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops,
- num_to_enq);
- if (ret != 0) {
- allocs_failed++;
- continue;
- }
-
- if (test_vector.op_type != RTE_BBDEV_OP_NONE)
- copy_reference_dec_op(ops, num_to_enq, enqueued,
- bufs->inputs,
- bufs->hard_outputs,
- bufs->soft_outputs,
- tp->op_params->ref_dec_op);
-
- enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id, queue_id, ops,
- num_to_enq);
+ enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id, queue_id,
+ &ops[enqueued], num_to_enq);
}
- if (allocs_failed > 0)
- printf("WARNING: op allocations failed: %u times\n",
- allocs_failed);
-
return TEST_SUCCESS;
}
@@ -1333,14 +1330,14 @@ typedef int (test_case_function)(struct active_device *ad,
{
struct thread_params *tp = arg;
unsigned int enqueued;
- struct rte_bbdev_enc_op *ops[MAX_BURST];
const uint16_t queue_id = tp->queue_id;
const uint16_t burst_sz = tp->op_params->burst_sz;
const uint16_t num_to_process = tp->op_params->num_to_process;
+ struct rte_bbdev_enc_op *ops[num_to_process];
struct test_buffers *bufs = NULL;
- unsigned int allocs_failed = 0;
struct rte_bbdev_info info;
int ret;
+ uint16_t num_to_enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
@@ -1350,6 +1347,11 @@ typedef int (test_case_function)(struct active_device *ad,
tp->dev_id, queue_id);
rte_bbdev_info_get(tp->dev_id, &info);
+
+ TEST_ASSERT_SUCCESS((num_to_process > info.drv.queue_size_lim),
+ "NUM_OPS cannot exceed %u for this device",
+ info.drv.queue_size_lim);
+
bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
rte_atomic16_clear(&tp->processing_status);
@@ -1358,35 +1360,26 @@ typedef int (test_case_function)(struct active_device *ad,
while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
rte_pause();
+ ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp, ops,
+ num_to_process);
+ TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops",
+ num_to_process);
+ if (test_vector.op_type != RTE_BBDEV_OP_NONE)
+ copy_reference_enc_op(ops, num_to_process, 0, bufs->inputs,
+ bufs->hard_outputs, tp->op_params->ref_enc_op);
+
tp->start_time = rte_rdtsc_precise();
for (enqueued = 0; enqueued < num_to_process;) {
- uint16_t num_to_enq = burst_sz;
+ num_to_enq = burst_sz;
if (unlikely(num_to_process - enqueued < num_to_enq))
num_to_enq = num_to_process - enqueued;
- ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp, ops,
- num_to_enq);
- if (ret != 0) {
- allocs_failed++;
- continue;
- }
-
- if (test_vector.op_type != RTE_BBDEV_OP_NONE)
- copy_reference_enc_op(ops, num_to_enq, enqueued,
- bufs->inputs,
- bufs->hard_outputs,
- tp->op_params->ref_enc_op);
-
- enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id, queue_id, ops,
- num_to_enq);
+ enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id, queue_id,
+ &ops[enqueued], num_to_enq);
}
- if (allocs_failed > 0)
- printf("WARNING: op allocations failed: %u times\n",
- allocs_failed);
-
return TEST_SUCCESS;
}
@@ -1394,86 +1387,97 @@ typedef int (test_case_function)(struct active_device *ad,
throughput_pmd_lcore_dec(void *arg)
{
struct thread_params *tp = arg;
- unsigned int enqueued, dequeued;
- struct rte_bbdev_dec_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
- uint64_t total_time, start_time;
+ uint16_t enq, deq;
+ uint64_t total_time = 0, start_time;
const uint16_t queue_id = tp->queue_id;
const uint16_t burst_sz = tp->op_params->burst_sz;
- const uint16_t num_to_process = tp->op_params->num_to_process;
+ const uint16_t num_ops = tp->op_params->num_to_process;
+ struct rte_bbdev_dec_op *ops_enq[num_ops];
+ struct rte_bbdev_dec_op *ops_deq[num_ops];
struct rte_bbdev_dec_op *ref_op = tp->op_params->ref_dec_op;
struct test_buffers *bufs = NULL;
- unsigned int allocs_failed = 0;
- int ret;
+ int i, j, ret;
struct rte_bbdev_info info;
-
- /* Input length in bytes, million operations per second, million bits
- * per second.
- */
- double in_len;
+ uint16_t num_to_enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
rte_bbdev_info_get(tp->dev_id, &info);
+
+ TEST_ASSERT_SUCCESS((num_ops > info.drv.queue_size_lim),
+ "NUM_OPS cannot exceed %u for this device",
+ info.drv.queue_size_lim);
+
bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
rte_pause();
- start_time = rte_rdtsc_precise();
- for (enqueued = 0, dequeued = 0; dequeued < num_to_process;) {
- uint16_t deq;
+ ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops_enq, num_ops);
+ TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops", num_ops);
- if (likely(enqueued < num_to_process)) {
+ if (test_vector.op_type != RTE_BBDEV_OP_NONE)
+ copy_reference_dec_op(ops_enq, num_ops, 0, bufs->inputs,
+ bufs->hard_outputs, bufs->soft_outputs, ref_op);
- uint16_t num_to_enq = burst_sz;
+ /* Set counter to validate the ordering */
+ for (j = 0; j < num_ops; ++j)
+ ops_enq[j]->opaque_data = (void *)(uintptr_t)j;
- if (unlikely(num_to_process - enqueued < num_to_enq))
- num_to_enq = num_to_process - enqueued;
+ for (i = 0; i < TEST_REPETITIONS; ++i) {
- ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp,
- ops_enq, num_to_enq);
- if (ret != 0) {
- allocs_failed++;
- goto do_dequeue;
- }
+ for (j = 0; j < num_ops; ++j) {
+ struct rte_bbdev_dec_op *op = ops_enq[j];
+ rte_pktmbuf_reset(op->turbo_dec.hard_output.data);
+ }
+
+ start_time = rte_rdtsc_precise();
+
+ for (enq = 0, deq = 0; enq < num_ops;) {
+ num_to_enq = burst_sz;
+
+ if (unlikely(num_ops - enq < num_to_enq))
+ num_to_enq = num_ops - enq;
- if (test_vector.op_type != RTE_BBDEV_OP_NONE)
- copy_reference_dec_op(ops_enq, num_to_enq,
- enqueued,
- bufs->inputs,
- bufs->hard_outputs,
- bufs->soft_outputs,
- ref_op);
+ enq += rte_bbdev_enqueue_dec_ops(tp->dev_id,
+ queue_id, &ops_enq[enq], num_to_enq);
- enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id,
- queue_id, ops_enq, num_to_enq);
+ deq += rte_bbdev_dequeue_dec_ops(tp->dev_id,
+ queue_id, &ops_deq[deq], enq - deq);
}
-do_dequeue:
- deq = rte_bbdev_dequeue_dec_ops(tp->dev_id, queue_id, ops_deq,
- burst_sz);
- dequeued += deq;
- rte_bbdev_dec_op_free_bulk(ops_enq, deq);
- }
- total_time = rte_rdtsc_precise() - start_time;
- if (allocs_failed > 0)
- printf("WARNING: op allocations failed: %u times\n",
- allocs_failed);
+ /* dequeue the remaining */
+ while (deq < enq) {
+ deq += rte_bbdev_dequeue_dec_ops(tp->dev_id,
+ queue_id, &ops_deq[deq], enq - deq);
+ }
- TEST_ASSERT(enqueued == dequeued, "enqueued (%u) != dequeued (%u)",
- enqueued, dequeued);
+ total_time += rte_rdtsc_precise() - start_time;
+ }
+
+ tp->iter_count = 0;
+ /* get the max of iter_count for all dequeued ops */
+ for (i = 0; i < num_ops; ++i) {
+ tp->iter_count = RTE_MAX(ops_enq[i]->turbo_dec.iter_count,
+ tp->iter_count);
+ }
if (test_vector.op_type != RTE_BBDEV_OP_NONE) {
- ret = validate_dec_buffers(ref_op, bufs, num_to_process);
- TEST_ASSERT_SUCCESS(ret, "Buffers validation failed");
+ ret = validate_dec_op(ops_deq, num_ops, ref_op,
+ tp->op_params->vector_mask);
+ TEST_ASSERT_SUCCESS(ret, "Validation failed!");
}
- in_len = ref_op->turbo_dec.input.length;
- tp->mops = ((double)num_to_process / 1000000.0) /
- ((double)total_time / (double)rte_get_tsc_hz());
- tp->mbps = ((double)num_to_process * in_len * 8 / 1000000.0) /
+ rte_bbdev_dec_op_free_bulk(ops_enq, num_ops);
+
+ double tb_len_bits = calc_dec_TB_size(ref_op);
+
+ tp->ops_per_sec = ((double)num_ops * TEST_REPETITIONS) /
((double)total_time / (double)rte_get_tsc_hz());
+ tp->mbps = (((double)(num_ops * TEST_REPETITIONS * tb_len_bits)) /
+ 1000000.0) / ((double)total_time /
+ (double)rte_get_tsc_hz());
return TEST_SUCCESS;
}
@@ -1482,91 +1486,94 @@ typedef int (test_case_function)(struct active_device *ad,
throughput_pmd_lcore_enc(void *arg)
{
struct thread_params *tp = arg;
- unsigned int enqueued, dequeued;
- struct rte_bbdev_enc_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
- uint64_t total_time, start_time;
+ uint16_t enq, deq;
+ uint64_t total_time = 0, start_time;
const uint16_t queue_id = tp->queue_id;
const uint16_t burst_sz = tp->op_params->burst_sz;
- const uint16_t num_to_process = tp->op_params->num_to_process;
+ const uint16_t num_ops = tp->op_params->num_to_process;
+ struct rte_bbdev_enc_op *ops_enq[num_ops];
+ struct rte_bbdev_enc_op *ops_deq[num_ops];
struct rte_bbdev_enc_op *ref_op = tp->op_params->ref_enc_op;
struct test_buffers *bufs = NULL;
- unsigned int allocs_failed = 0;
- int ret;
+ int i, j, ret;
struct rte_bbdev_info info;
-
- /* Input length in bytes, million operations per second, million bits
- * per second.
- */
- double in_len;
+ uint16_t num_to_enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
rte_bbdev_info_get(tp->dev_id, &info);
+
+ TEST_ASSERT_SUCCESS((num_ops > info.drv.queue_size_lim),
+ "NUM_OPS cannot exceed %u for this device",
+ info.drv.queue_size_lim);
+
bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
rte_pause();
- start_time = rte_rdtsc_precise();
- for (enqueued = 0, dequeued = 0; dequeued < num_to_process;) {
- uint16_t deq;
+ ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp, ops_enq,
+ num_ops);
+ TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops",
+ num_ops);
+ if (test_vector.op_type != RTE_BBDEV_OP_NONE)
+ copy_reference_enc_op(ops_enq, num_ops, 0, bufs->inputs,
+ bufs->hard_outputs, ref_op);
- if (likely(enqueued < num_to_process)) {
+ /* Set counter to validate the ordering */
+ for (j = 0; j < num_ops; ++j)
+ ops_enq[j]->opaque_data = (void *)(uintptr_t)j;
- uint16_t num_to_enq = burst_sz;
+ for (i = 0; i < TEST_REPETITIONS; ++i) {
- if (unlikely(num_to_process - enqueued < num_to_enq))
- num_to_enq = num_to_process - enqueued;
+ if (test_vector.op_type != RTE_BBDEV_OP_NONE)
+ for (j = 0; j < num_ops; ++j)
+ rte_pktmbuf_reset(
+ ops_enq[j]->turbo_enc.output.data);
- ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp,
- ops_enq, num_to_enq);
- if (ret != 0) {
- allocs_failed++;
- goto do_dequeue;
- }
+ start_time = rte_rdtsc_precise();
+
+ for (enq = 0, deq = 0; enq < num_ops;) {
+ num_to_enq = burst_sz;
- if (test_vector.op_type != RTE_BBDEV_OP_NONE)
- copy_reference_enc_op(ops_enq, num_to_enq,
- enqueued,
- bufs->inputs,
- bufs->hard_outputs,
- ref_op);
+ if (unlikely(num_ops - enq < num_to_enq))
+ num_to_enq = num_ops - enq;
- enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id,
- queue_id, ops_enq, num_to_enq);
+ enq += rte_bbdev_enqueue_enc_ops(tp->dev_id,
+ queue_id, &ops_enq[enq], num_to_enq);
+
+ deq += rte_bbdev_dequeue_enc_ops(tp->dev_id,
+ queue_id, &ops_deq[deq], enq - deq);
}
-do_dequeue:
- deq = rte_bbdev_dequeue_enc_ops(tp->dev_id, queue_id, ops_deq,
- burst_sz);
- dequeued += deq;
- rte_bbdev_enc_op_free_bulk(ops_enq, deq);
- }
- total_time = rte_rdtsc_precise() - start_time;
- if (allocs_failed > 0)
- printf("WARNING: op allocations failed: %u times\n",
- allocs_failed);
+ /* dequeue the remaining */
+ while (deq < enq) {
+ deq += rte_bbdev_dequeue_enc_ops(tp->dev_id,
+ queue_id, &ops_deq[deq], enq - deq);
+ }
- TEST_ASSERT(enqueued == dequeued, "enqueued (%u) != dequeued (%u)",
- enqueued, dequeued);
+ total_time += rte_rdtsc_precise() - start_time;
+ }
if (test_vector.op_type != RTE_BBDEV_OP_NONE) {
- ret = validate_enc_buffers(bufs, num_to_process);
- TEST_ASSERT_SUCCESS(ret, "Buffers validation failed");
+ ret = validate_enc_op(ops_deq, num_ops, ref_op);
+ TEST_ASSERT_SUCCESS(ret, "Validation failed!");
}
- in_len = ref_op->turbo_enc.input.length;
+ double tb_len_bits = calc_enc_TB_size(ref_op);
- tp->mops = ((double)num_to_process / 1000000.0) /
- ((double)total_time / (double)rte_get_tsc_hz());
- tp->mbps = ((double)num_to_process * in_len * 8 / 1000000.0) /
+ tp->ops_per_sec = ((double)num_ops * TEST_REPETITIONS) /
((double)total_time / (double)rte_get_tsc_hz());
+ tp->mbps = (((double)(num_ops * TEST_REPETITIONS * tb_len_bits))
+ / 1000000.0) / ((double)total_time /
+ (double)rte_get_tsc_hz());
return TEST_SUCCESS;
}
+
static void
-print_throughput(struct thread_params *t_params, unsigned int used_cores)
+print_enc_throughput(struct thread_params *t_params, unsigned int used_cores)
{
unsigned int lcore_id, iter = 0;
double total_mops = 0, total_mbps = 0;
@@ -1574,10 +1581,11 @@ typedef int (test_case_function)(struct active_device *ad,
RTE_LCORE_FOREACH(lcore_id) {
if (iter++ >= used_cores)
break;
- printf("Throughput for core (%u): %.8lg MOPS, %.8lg Mbps\n",
- lcore_id, t_params[lcore_id].mops,
+ printf(
+ "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps\n",
+ lcore_id, t_params[lcore_id].ops_per_sec,
t_params[lcore_id].mbps);
- total_mops += t_params[lcore_id].mops;
+ total_mops += t_params[lcore_id].ops_per_sec;
total_mbps += t_params[lcore_id].mbps;
}
printf(
@@ -1585,6 +1593,30 @@ typedef int (test_case_function)(struct active_device *ad,
used_cores, total_mops, total_mbps);
}
+static void
+print_dec_throughput(struct thread_params *t_params, unsigned int used_cores)
+{
+ unsigned int lcore_id, iter = 0;
+ double total_mops = 0, total_mbps = 0;
+ uint8_t iter_count = 0;
+
+ RTE_LCORE_FOREACH(lcore_id) {
+ if (iter++ >= used_cores)
+ break;
+ printf(
+ "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps @ max %u iterations\n",
+ lcore_id, t_params[lcore_id].ops_per_sec,
+ t_params[lcore_id].mbps,
+ t_params[lcore_id].iter_count);
+ total_mops += t_params[lcore_id].ops_per_sec;
+ total_mbps += t_params[lcore_id].mbps;
+ iter_count = RTE_MAX(iter_count, t_params[lcore_id].iter_count);
+ }
+ printf(
+ "\nTotal throughput for %u cores: %.8lg MOPS, %.8lg Mbps @ max %u iterations\n",
+ used_cores, total_mops, total_mbps, iter_count);
+}
+
/*
* Test function that determines how long an enqueue + dequeue of a burst
* takes on available lcores.
@@ -1677,8 +1709,10 @@ typedef int (test_case_function)(struct active_device *ad,
/* Print throughput if interrupts are disabled and test passed */
if (!intr_enabled) {
- if (test_vector.op_type != RTE_BBDEV_OP_NONE)
- print_throughput(t_params, num_lcores);
+ if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
+ print_dec_throughput(t_params, num_lcores);
+ else
+ print_enc_throughput(t_params, num_lcores);
return ret;
}
@@ -1713,9 +1747,12 @@ typedef int (test_case_function)(struct active_device *ad,
}
/* Print throughput if test passed */
- if (!ret && test_vector.op_type != RTE_BBDEV_OP_NONE)
- print_throughput(t_params, num_lcores);
-
+ if (!ret) {
+ if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
+ print_dec_throughput(t_params, num_lcores);
+ else if (test_vector.op_type == RTE_BBDEV_OP_TURBO_ENC)
+ print_enc_throughput(t_params, num_lcores);
+ }
return ret;
}
diff --git a/app/test-bbdev/test_bbdev_vector.c b/app/test-bbdev/test_bbdev_vector.c
index 81b8ee7..45fe999 100644
--- a/app/test-bbdev/test_bbdev_vector.c
+++ b/app/test-bbdev/test_bbdev_vector.c
@@ -412,6 +412,10 @@
vector->mask |= TEST_BBDEV_VF_NUM_MAPS;
turbo_dec->num_maps = (uint8_t) strtoul(token, &err, 0);
ret = ((err == NULL) || (*err != '\0')) ? -1 : 0;
+ } else if (!strcmp(key_token, "r")) {
+ vector->mask |= TEST_BBDEV_VF_R;
+ turbo_dec->tb_params.r = (uint8_t) strtoul(token, &err, 0);
+ ret = ((err == NULL) || (*err != '\0')) ? -1 : 0;
} else if (!strcmp(key_token, "code_block_mode")) {
vector->mask |= TEST_BBDEV_VF_CODE_BLOCK_MODE;
turbo_dec->code_block_mode = (uint8_t) strtoul(token, &err, 0);
@@ -714,6 +718,9 @@
if (!(mask & TEST_BBDEV_VF_CAB))
printf(
"WARNING: cab was not specified in vector file and will be set to 0\n");
+ if (!(mask & TEST_BBDEV_VF_R))
+ printf(
+ "WARNING: r was not specified in vector file and will be set to 0\n");
} else {
if (!(mask & TEST_BBDEV_VF_E))
printf(
diff --git a/lib/librte_bbdev/rte_bbdev_op.h b/lib/librte_bbdev/rte_bbdev_op.h
index 83f62c2..962e2ed 100644
--- a/lib/librte_bbdev/rte_bbdev_op.h
+++ b/lib/librte_bbdev/rte_bbdev_op.h
@@ -216,6 +216,8 @@ struct rte_bbdev_op_dec_tb_params {
* operation when r >= cab
*/
uint32_t eb;
+ /**< The index of the first CB in the inbound mbuf data, default is 0 */
+ uint8_t r;
};
/**< Operation structure for Turbo decode.
--
1.8.3.1
* [dpdk-dev] [PATCH 4/4] baseband: enhancement of interrupt test
2018-12-07 14:31 [dpdk-dev] [PATCH 1/4] baseband: enhancement of offload cost test Kamil Chalupnik
2018-12-07 14:31 ` [dpdk-dev] [PATCH 2/4] baseband: enhancement of throughput test Kamil Chalupnik
@ 2018-12-07 14:31 ` Kamil Chalupnik
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test Kamil Chalupnik
2 siblings, 0 replies; 13+ messages in thread
From: Kamil Chalupnik @ 2018-12-07 14:31 UTC (permalink / raw)
To: dev; +Cc: amr.mokhtar, akhil.goyal, Kamil Chalupnik
Improvements added to the interrupt test:
- the test is run in a loop (the number of iterations is specified by
the TEST_REPETITIONS define), which ensures more accurate results
- the mapping of cores to thread parameters was put in order:
the master core is always set at the first index, which fixes a
problem with running the test for only one core
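The handshake between the enqueue thread and the dequeue callback,
condensed from the diff below (a sketch): the thread publishes the
size of each batch it enqueues and busy-waits until the callback has
dequeued it before issuing the next batch.

    /* Enqueue thread: publish the batch size for the callback */
    rte_atomic16_set(&tp->burst_sz, num_to_enq);

    /* Wait until processing of the previous batch has completed */
    while (rte_atomic16_read(&tp->nb_dequeued) != (int16_t) enqueued)
        rte_pause();

    /* Dequeue callback: pull exactly the published batch size */
    burst_sz = rte_atomic16_read(&tp->burst_sz);
    deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
            &tp->dec_ops[rte_atomic16_read(&tp->nb_dequeued)],
            burst_sz);
    rte_atomic16_add(&tp->nb_dequeued, deq);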
Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
---
app/test-bbdev/test_bbdev_perf.c | 246 +++++++++++++++++++++++++--------------
1 file changed, 161 insertions(+), 85 deletions(-)
diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index 5bec70d..1c4a645 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -77,13 +77,17 @@ struct test_op_params {
struct thread_params {
uint8_t dev_id;
uint16_t queue_id;
+ uint32_t lcore_id;
uint64_t start_time;
double ops_per_sec;
double mbps;
uint8_t iter_count;
rte_atomic16_t nb_dequeued;
rte_atomic16_t processing_status;
+ rte_atomic16_t burst_sz;
struct test_op_params *op_params;
+ struct rte_bbdev_dec_op *dec_ops[MAX_BURST];
+ struct rte_bbdev_enc_op *enc_ops[MAX_BURST];
};
#ifdef RTE_BBDEV_OFFLOAD_COST
@@ -1206,16 +1210,12 @@ typedef int (test_case_function)(struct active_device *ad,
uint16_t i;
uint64_t total_time;
uint16_t deq, burst_sz, num_ops;
- uint16_t queue_id = INVALID_QUEUE_ID;
- struct rte_bbdev_dec_op *dec_ops[MAX_BURST];
- struct rte_bbdev_enc_op *enc_ops[MAX_BURST];
+ uint16_t queue_id = *(uint16_t *) ret_param;
struct rte_bbdev_info info;
double tb_len_bits;
struct thread_params *tp = cb_arg;
- RTE_SET_USED(ret_param);
- queue_id = tp->queue_id;
/* Find matching thread params using queue_id */
for (i = 0; i < MAX_QUEUES; ++i, ++tp)
@@ -1235,18 +1235,19 @@ typedef int (test_case_function)(struct active_device *ad,
return;
}
- burst_sz = tp->op_params->burst_sz;
+ burst_sz = rte_atomic16_read(&tp->burst_sz);
num_ops = tp->op_params->num_to_process;
- if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
- deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id, dec_ops,
+ if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
+ deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
+ &tp->dec_ops[
+ rte_atomic16_read(&tp->nb_dequeued)],
burst_sz);
- rte_bbdev_dec_op_free_bulk(dec_ops, deq);
- } else {
- deq = rte_bbdev_dequeue_enc_ops(dev_id, queue_id, enc_ops,
+ else
+ deq = rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
+ &tp->enc_ops[
+ rte_atomic16_read(&tp->nb_dequeued)],
burst_sz);
- rte_bbdev_enc_op_free_bulk(enc_ops, deq);
- }
if (deq < burst_sz) {
printf(
@@ -1269,13 +1270,18 @@ typedef int (test_case_function)(struct active_device *ad,
if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
struct rte_bbdev_dec_op *ref_op = tp->op_params->ref_dec_op;
- ret = validate_dec_op(dec_ops, num_ops, ref_op,
+ ret = validate_dec_op(tp->dec_ops, num_ops, ref_op,
tp->op_params->vector_mask);
- rte_bbdev_dec_op_free_bulk(dec_ops, deq);
+ /* get the max of iter_count for all dequeued ops */
+ for (i = 0; i < num_ops; ++i)
+ tp->iter_count = RTE_MAX(
+ tp->dec_ops[i]->turbo_dec.iter_count,
+ tp->iter_count);
+ rte_bbdev_dec_op_free_bulk(tp->dec_ops, deq);
} else if (test_vector.op_type == RTE_BBDEV_OP_TURBO_ENC) {
struct rte_bbdev_enc_op *ref_op = tp->op_params->ref_enc_op;
- ret = validate_enc_op(enc_ops, num_ops, ref_op);
- rte_bbdev_enc_op_free_bulk(enc_ops, deq);
+ ret = validate_enc_op(tp->enc_ops, num_ops, ref_op);
+ rte_bbdev_enc_op_free_bulk(tp->enc_ops, deq);
}
if (ret) {
@@ -1299,9 +1305,9 @@ typedef int (test_case_function)(struct active_device *ad,
return;
}
- tp->ops_per_sec = ((double)num_ops) /
+ tp->ops_per_sec += ((double)num_ops) /
((double)total_time / (double)rte_get_tsc_hz());
- tp->mbps = (((double)(num_ops * tb_len_bits)) / 1000000.0) /
+ tp->mbps += (((double)(num_ops * tb_len_bits)) / 1000000.0) /
((double)total_time / (double)rte_get_tsc_hz());
rte_atomic16_add(&tp->nb_dequeued, deq);
@@ -1318,8 +1324,8 @@ typedef int (test_case_function)(struct active_device *ad,
struct rte_bbdev_dec_op *ops[num_to_process];
struct test_buffers *bufs = NULL;
struct rte_bbdev_info info;
- int ret;
- uint16_t num_to_enq;
+ int ret, i, j;
+ uint16_t num_to_enq, enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
@@ -1351,16 +1357,47 @@ typedef int (test_case_function)(struct active_device *ad,
bufs->hard_outputs, bufs->soft_outputs,
tp->op_params->ref_dec_op);
- tp->start_time = rte_rdtsc_precise();
- for (enqueued = 0; enqueued < num_to_process;) {
+ /* Set counter to validate the ordering */
+ for (j = 0; j < num_to_process; ++j)
+ ops[j]->opaque_data = (void *)(uintptr_t)j;
- num_to_enq = burst_sz;
+ for (j = 0; j < TEST_REPETITIONS; ++j) {
+ for (i = 0; i < num_to_process; ++i)
+ rte_pktmbuf_reset(ops[i]->turbo_dec.hard_output.data);
- if (unlikely(num_to_process - enqueued < num_to_enq))
- num_to_enq = num_to_process - enqueued;
+ tp->start_time = rte_rdtsc_precise();
+ for (enqueued = 0; enqueued < num_to_process;) {
+ num_to_enq = burst_sz;
+
+ if (unlikely(num_to_process - enqueued < num_to_enq))
+ num_to_enq = num_to_process - enqueued;
+
+ enq = 0;
+ do {
+ enq += rte_bbdev_enqueue_dec_ops(tp->dev_id,
+ queue_id, &ops[enqueued],
+ num_to_enq);
+ } while (unlikely(num_to_enq != enq));
+ enqueued += enq;
+
+ /* Write to the thread's burst_sz the number of descriptors
+ * just enqueued. This ensures that the proper number of
+ * descriptors will be dequeued in the callback
+ * function - needed for the last batch when the
+ * number of operations is not a multiple of the
+ * burst size.
+ */
+ rte_atomic16_set(&tp->burst_sz, num_to_enq);
- enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id, queue_id,
- &ops[enqueued], num_to_enq);
+ /* Wait until processing of previous batch is
+ * completed.
+ */
+ while (rte_atomic16_read(&tp->nb_dequeued) !=
+ (int16_t) enqueued)
+ rte_pause();
+ }
+ if (j != TEST_REPETITIONS - 1)
+ rte_atomic16_clear(&tp->nb_dequeued);
}
return TEST_SUCCESS;
@@ -1377,8 +1414,8 @@ typedef int (test_case_function)(struct active_device *ad,
struct rte_bbdev_enc_op *ops[num_to_process];
struct test_buffers *bufs = NULL;
struct rte_bbdev_info info;
- int ret;
- uint16_t num_to_enq;
+ int ret, i, j;
+ uint16_t num_to_enq, enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
@@ -1409,16 +1446,47 @@ typedef int (test_case_function)(struct active_device *ad,
copy_reference_enc_op(ops, num_to_process, 0, bufs->inputs,
bufs->hard_outputs, tp->op_params->ref_enc_op);
- tp->start_time = rte_rdtsc_precise();
- for (enqueued = 0; enqueued < num_to_process;) {
+ /* Set counter to validate the ordering */
+ for (j = 0; j < num_to_process; ++j)
+ ops[j]->opaque_data = (void *)(uintptr_t)j;
+
+ for (j = 0; j < TEST_REPETITIONS; ++j) {
+ for (i = 0; i < num_to_process; ++i)
+ rte_pktmbuf_reset(ops[i]->turbo_enc.output.data);
- num_to_enq = burst_sz;
+ tp->start_time = rte_rdtsc_precise();
+ for (enqueued = 0; enqueued < num_to_process;) {
+ num_to_enq = burst_sz;
- if (unlikely(num_to_process - enqueued < num_to_enq))
- num_to_enq = num_to_process - enqueued;
+ if (unlikely(num_to_process - enqueued < num_to_enq))
+ num_to_enq = num_to_process - enqueued;
+
+ enq = 0;
+ do {
+ enq += rte_bbdev_enqueue_enc_ops(tp->dev_id,
+ queue_id, &ops[enqueued],
+ num_to_enq);
+ } while (unlikely(enq != num_to_enq));
+ enqueued += enq;
+
+ /* Write to the thread's burst_sz the number of descriptors
+ * just enqueued. This ensures that the proper number of
+ * descriptors will be dequeued in the callback
+ * function - needed for the last batch when the
+ * number of operations is not a multiple of the
+ * burst size.
+ */
+ rte_atomic16_set(&tp->burst_sz, num_to_enq);
- enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id, queue_id,
- &ops[enqueued], num_to_enq);
+ /* Wait until processing of previous batch is
+ * completed.
+ */
+ while (rte_atomic16_read(&tp->nb_dequeued) !=
+ (int16_t) enqueued)
+ rte_pause();
+ }
+ if (j != TEST_REPETITIONS - 1)
+ rte_atomic16_clear(&tp->nb_dequeued);
}
return TEST_SUCCESS;
@@ -1613,18 +1681,16 @@ typedef int (test_case_function)(struct active_device *ad,
static void
print_enc_throughput(struct thread_params *t_params, unsigned int used_cores)
{
- unsigned int lcore_id, iter = 0;
+ unsigned int iter = 0;
double total_mops = 0, total_mbps = 0;
- RTE_LCORE_FOREACH(lcore_id) {
- if (iter++ >= used_cores)
- break;
+ for (iter = 0; iter < used_cores; iter++) {
printf(
- "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps\n",
- lcore_id, t_params[lcore_id].ops_per_sec,
- t_params[lcore_id].mbps);
- total_mops += t_params[lcore_id].ops_per_sec;
- total_mbps += t_params[lcore_id].mbps;
+ "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps\n",
+ t_params[iter].lcore_id, t_params[iter].ops_per_sec,
+ t_params[iter].mbps);
+ total_mops += t_params[iter].ops_per_sec;
+ total_mbps += t_params[iter].mbps;
}
printf(
"\nTotal throughput for %u cores: %.8lg MOPS, %.8lg Mbps\n",
@@ -1634,21 +1700,18 @@ typedef int (test_case_function)(struct active_device *ad,
static void
print_dec_throughput(struct thread_params *t_params, unsigned int used_cores)
{
- unsigned int lcore_id, iter = 0;
+ unsigned int iter = 0;
double total_mops = 0, total_mbps = 0;
uint8_t iter_count = 0;
- RTE_LCORE_FOREACH(lcore_id) {
- if (iter++ >= used_cores)
- break;
+ for (iter = 0; iter < used_cores; iter++) {
printf(
- "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps @ max %u iterations\n",
- lcore_id, t_params[lcore_id].ops_per_sec,
- t_params[lcore_id].mbps,
- t_params[lcore_id].iter_count);
- total_mops += t_params[lcore_id].ops_per_sec;
- total_mbps += t_params[lcore_id].mbps;
- iter_count = RTE_MAX(iter_count, t_params[lcore_id].iter_count);
+ "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps @ max %u iterations\n",
+ t_params[iter].lcore_id, t_params[iter].ops_per_sec,
+ t_params[iter].mbps, t_params[iter].iter_count);
+ total_mops += t_params[iter].ops_per_sec;
+ total_mbps += t_params[iter].mbps;
+ iter_count = RTE_MAX(iter_count, t_params[iter].iter_count);
}
printf(
"\nTotal throughput for %u cores: %.8lg MOPS, %.8lg Mbps @ max %u iterations\n",
@@ -1665,10 +1728,9 @@ typedef int (test_case_function)(struct active_device *ad,
{
int ret;
unsigned int lcore_id, used_cores = 0;
- struct thread_params t_params[MAX_QUEUES];
+ struct thread_params *t_params, *tp;
struct rte_bbdev_info info;
lcore_function_t *throughput_function;
- struct thread_params *tp;
uint16_t num_lcores;
const char *op_type_str;
@@ -1691,6 +1753,13 @@ typedef int (test_case_function)(struct active_device *ad,
? ad->nb_queues
: op_params->num_lcores;
+ /* Allocate memory for thread parameters structure */
+ t_params = rte_zmalloc(NULL, num_lcores * sizeof(struct thread_params),
+ RTE_CACHE_LINE_SIZE);
+ TEST_ASSERT_NOT_NULL(t_params, "Failed to alloc %zuB for t_params",
+ RTE_ALIGN(sizeof(struct thread_params) * num_lcores,
+ RTE_CACHE_LINE_SIZE));
+
if (intr_enabled) {
if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
throughput_function = throughput_intr_lcore_dec;
@@ -1700,9 +1769,11 @@ typedef int (test_case_function)(struct active_device *ad,
/* Dequeue interrupt callback registration */
ret = rte_bbdev_callback_register(ad->dev_id,
RTE_BBDEV_EVENT_DEQUEUE, dequeue_event_callback,
- &t_params);
- if (ret < 0)
+ t_params);
+ if (ret < 0) {
+ rte_free(t_params);
return ret;
+ }
} else {
if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
throughput_function = throughput_pmd_lcore_dec;
@@ -1712,38 +1783,39 @@ typedef int (test_case_function)(struct active_device *ad,
rte_atomic16_set(&op_params->sync, SYNC_WAIT);
- t_params[rte_lcore_id()].dev_id = ad->dev_id;
- t_params[rte_lcore_id()].op_params = op_params;
- t_params[rte_lcore_id()].queue_id =
- ad->queue_ids[used_cores++];
+ /* Master core is set at first entry */
+ t_params[0].dev_id = ad->dev_id;
+ t_params[0].lcore_id = rte_lcore_id();
+ t_params[0].op_params = op_params;
+ t_params[0].queue_id = ad->queue_ids[used_cores++];
+ t_params[0].iter_count = 0;
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
if (used_cores >= num_lcores)
break;
- t_params[lcore_id].dev_id = ad->dev_id;
- t_params[lcore_id].op_params = op_params;
- t_params[lcore_id].queue_id = ad->queue_ids[used_cores++];
+ t_params[used_cores].dev_id = ad->dev_id;
+ t_params[used_cores].lcore_id = lcore_id;
+ t_params[used_cores].op_params = op_params;
+ t_params[used_cores].queue_id = ad->queue_ids[used_cores];
+ t_params[used_cores].iter_count = 0;
- rte_eal_remote_launch(throughput_function, &t_params[lcore_id],
- lcore_id);
+ rte_eal_remote_launch(throughput_function,
+ &t_params[used_cores++], lcore_id);
}
rte_atomic16_set(&op_params->sync, SYNC_START);
- ret = throughput_function(&t_params[rte_lcore_id()]);
+ ret = throughput_function(&t_params[0]);
/* Master core is always used */
- used_cores = 1;
- RTE_LCORE_FOREACH_SLAVE(lcore_id) {
- if (used_cores++ >= num_lcores)
- break;
-
- ret |= rte_eal_wait_lcore(lcore_id);
- }
+ for (used_cores = 1; used_cores < num_lcores; used_cores++)
+ ret |= rte_eal_wait_lcore(t_params[used_cores].lcore_id);
/* Return if test failed */
- if (ret)
+ if (ret) {
+ rte_free(t_params);
return ret;
+ }
/* Print throughput if interrupts are disabled and test passed */
if (!intr_enabled) {
@@ -1751,6 +1823,7 @@ typedef int (test_case_function)(struct active_device *ad,
print_dec_throughput(t_params, num_lcores);
else
print_enc_throughput(t_params, num_lcores);
+ rte_free(t_params);
return ret;
}
@@ -1759,21 +1832,20 @@ typedef int (test_case_function)(struct active_device *ad,
* error using processing_status variable.
* Wait for master lcore operations.
*/
- tp = &t_params[rte_lcore_id()];
+ tp = &t_params[0];
while ((rte_atomic16_read(&tp->nb_dequeued) <
op_params->num_to_process) &&
(rte_atomic16_read(&tp->processing_status) !=
TEST_FAILED))
rte_pause();
+ tp->ops_per_sec /= TEST_REPETITIONS;
+ tp->mbps /= TEST_REPETITIONS;
ret |= rte_atomic16_read(&tp->processing_status);
/* Wait for slave lcores operations */
- used_cores = 1;
- RTE_LCORE_FOREACH_SLAVE(lcore_id) {
- tp = &t_params[lcore_id];
- if (used_cores++ >= num_lcores)
- break;
+ for (used_cores = 1; used_cores < num_lcores; used_cores++) {
+ tp = &t_params[used_cores];
while ((rte_atomic16_read(&tp->nb_dequeued) <
op_params->num_to_process) &&
@@ -1781,6 +1853,8 @@ typedef int (test_case_function)(struct active_device *ad,
TEST_FAILED))
rte_pause();
+ tp->ops_per_sec /= TEST_REPETITIONS;
+ tp->mbps /= TEST_REPETITIONS;
ret |= rte_atomic16_read(&tp->processing_status);
}
@@ -1791,6 +1865,8 @@ typedef int (test_case_function)(struct active_device *ad,
else if (test_vector.op_type == RTE_BBDEV_OP_TURBO_ENC)
print_enc_throughput(t_params, num_lcores);
}
+
+ rte_free(t_params);
return ret;
}
--
1.8.3.1
* [dpdk-dev] [PATCH v2 3/4] baseband: support bigger Transport Block
2018-12-07 14:31 [dpdk-dev] [PATCH 3/4] baseband: support bigger Transport Block Kamil Chalupnik
@ 2018-12-07 15:07 ` Kamil Chalupnik
0 siblings, 0 replies; 13+ messages in thread
From: Kamil Chalupnik @ 2018-12-07 15:07 UTC (permalink / raw)
To: dev; +Cc: amr.mokhtar, akhil.goyal, Kamil Chalupnik
Test application and Turbo Software driver were adapted
to support chained mbufs for bigger TB sizes.
Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
---
app/test-bbdev/test_bbdev_perf.c | 60 +++++++++---
drivers/baseband/turbo_sw/bbdev_turbo_software.c | 111 ++++++++++++++++-------
2 files changed, 126 insertions(+), 45 deletions(-)
diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index a25e3a7..5bec70d 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -114,6 +114,17 @@ typedef int (test_case_function)(struct active_device *ad,
struct test_op_params *op_params);
static inline void
+mbuf_reset(struct rte_mbuf *m)
+{
+ m->pkt_len = 0;
+
+ do {
+ m->data_len = 0;
+ m = m->next;
+ } while (m != NULL);
+}
+
+static inline void
set_avail_op(struct active_device *ad, enum rte_bbdev_op_type op_type)
{
ad->supported_ops |= (1 << op_type);
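The mbuf_reset() helper above differs from rte_pktmbuf_reset() in that it
walks the entire chain: pkt_len is stored only in the head mbuf, while
every segment carries its own data_len. A self-contained model of the
walk, using a simplified stand-in struct instead of the real rte_mbuf:

#include <assert.h>
#include <stddef.h>

struct seg {                    /* stand-in for struct rte_mbuf */
        unsigned int pkt_len;   /* total length, head only      */
        unsigned int data_len;  /* length of this segment       */
        struct seg *next;
};

static void seg_chain_reset(struct seg *m)
{
        m->pkt_len = 0;                 /* reset once, on the head */
        do {
                m->data_len = 0;        /* reset in every segment  */
                m = m->next;
        } while (m != NULL);
}

int main(void)
{
        struct seg tail = { 0, 100, NULL };
        struct seg head = { 300, 200, &tail };

        seg_chain_reset(&head);
        assert(head.pkt_len == 0 && head.data_len == 0);
        assert(tail.data_len == 0);
        return 0;
}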
@@ -573,6 +584,10 @@ typedef int (test_case_function)(struct active_device *ad,
op_type, n * ref_entries->nb_segments,
mbuf_pool->size);
+ TEST_ASSERT_SUCCESS(((seg->length + RTE_PKTMBUF_HEADROOM) >
+ (uint32_t)UINT16_MAX),
+ "Given data is bigger than allowed mbuf segment size");
+
bufs[i].data = m_head;
bufs[i].offset = 0;
bufs[i].length = 0;
@@ -589,7 +604,6 @@ typedef int (test_case_function)(struct active_device *ad,
rte_memcpy(data, seg->addr, seg->length);
bufs[i].length += seg->length;
-
for (j = 1; j < ref_entries->nb_segments; ++j) {
struct rte_mbuf *m_tail =
rte_pktmbuf_alloc(mbuf_pool);
@@ -617,6 +631,24 @@ typedef int (test_case_function)(struct active_device *ad,
"Couldn't chain mbufs from %d data type mbuf pool",
op_type);
}
+
+ } else {
+
+ /* allocate chained mbuf for output buffer */
+ for (j = 1; j < ref_entries->nb_segments; ++j) {
+ struct rte_mbuf *m_tail =
+ rte_pktmbuf_alloc(mbuf_pool);
+ TEST_ASSERT_NOT_NULL(m_tail,
+ "Not enough mbufs in %d data type mbuf pool (needed %u, available %u)",
+ op_type,
+ n * ref_entries->nb_segments,
+ mbuf_pool->size);
+
+ ret = rte_pktmbuf_chain(m_head, m_tail);
+ TEST_ASSERT_SUCCESS(ret,
+ "Couldn't chain mbufs from %d data type mbuf pool",
+ op_type);
+ }
}
}
@@ -655,7 +687,7 @@ typedef int (test_case_function)(struct active_device *ad,
while (m != NULL) {
int8_t *llr = rte_pktmbuf_mtod_offset(m, int8_t *,
input_ops[i].offset);
- for (byte_idx = 0; byte_idx < input_ops[i].length;
+ for (byte_idx = 0; byte_idx < rte_pktmbuf_data_len(m);
++byte_idx)
llr[byte_idx] = round((double)max_llr_modulus *
llr[byte_idx] / INT8_MAX);
@@ -864,15 +896,18 @@ typedef int (test_case_function)(struct active_device *ad,
uint8_t i;
struct rte_mbuf *m = op->data;
uint8_t nb_dst_segments = orig_op->nb_segments;
+ uint32_t total_data_size = 0;
TEST_ASSERT(nb_dst_segments == m->nb_segs,
"Number of segments differ in original (%u) and filled (%u) op",
nb_dst_segments, m->nb_segs);
+ /* Validate each mbuf segment length */
for (i = 0; i < nb_dst_segments; ++i) {
/* Apply offset to the first mbuf segment */
uint16_t offset = (i == 0) ? op->offset : 0;
- uint16_t data_len = m->data_len - offset;
+ uint16_t data_len = rte_pktmbuf_data_len(m) - offset;
+ total_data_size += orig_op->segments[i].length;
TEST_ASSERT(orig_op->segments[i].length == data_len,
"Length of segment differ in original (%u) and filled (%u) op",
@@ -884,6 +919,12 @@ typedef int (test_case_function)(struct active_device *ad,
m = m->next;
}
+ /* Validate total mbuf pkt length */
+ uint32_t pkt_len = rte_pktmbuf_pkt_len(op->data) - op->offset;
+ TEST_ASSERT(total_data_size == pkt_len,
+ "Length of data differ in original (%u) and filled (%u) op",
+ total_data_size, pkt_len);
+
return TEST_SUCCESS;
}
@@ -1427,10 +1468,8 @@ typedef int (test_case_function)(struct active_device *ad,
for (i = 0; i < TEST_REPETITIONS; ++i) {
- for (j = 0; j < num_ops; ++j) {
- struct rte_bbdev_dec_op *op = ops_enq[j];
- rte_pktmbuf_reset(op->turbo_dec.hard_output.data);
- }
+ for (j = 0; j < num_ops; ++j)
+ mbuf_reset(ops_enq[j]->turbo_dec.hard_output.data);
start_time = rte_rdtsc_precise();
@@ -1529,8 +1568,7 @@ typedef int (test_case_function)(struct active_device *ad,
if (test_vector.op_type != RTE_BBDEV_OP_NONE)
for (j = 0; j < num_ops; ++j)
- rte_pktmbuf_reset(
- ops_enq[j]->turbo_enc.output.data);
+ mbuf_reset(ops_enq[j]->turbo_enc.output.data);
start_time = rte_rdtsc_precise();
@@ -2025,7 +2063,7 @@ typedef int (test_case_function)(struct active_device *ad,
time_st->enq_acc_total_time += stats.acc_offload_cycles;
/* ensure enqueue has been completed */
- rte_delay_ms(10);
+ rte_delay_us(200);
/* Start time meas for dequeue function offload latency */
deq_start_time = rte_rdtsc_precise();
@@ -2106,7 +2144,7 @@ typedef int (test_case_function)(struct active_device *ad,
time_st->enq_acc_total_time += stats.acc_offload_cycles;
/* ensure enqueue has been completed */
- rte_delay_ms(10);
+ rte_delay_us(200);
/* Start time meas for dequeue function offload latency */
deq_start_time = rte_rdtsc_precise();
diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index 57f6ba1..19fbb55 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -83,6 +83,18 @@ struct turbo_sw_queue {
enum rte_bbdev_op_type type;
} __rte_cache_aligned;
+static inline char *
+mbuf_append(struct rte_mbuf *m_head, struct rte_mbuf *m, uint16_t len)
+{
+ if (unlikely(len > rte_pktmbuf_tailroom(m)))
+ return NULL;
+
+ char *tail = (char *)m->buf_addr + m->data_off + m->data_len;
+ m->data_len = (uint16_t)(m->data_len + len);
+ m_head->pkt_len = (m_head->pkt_len + len);
+ return tail;
+}
+
/* Calculate index based on Table 5.1.3-3 from TS36.212 */
static inline int32_t
compute_idx(uint16_t k)
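mbuf_append() above grows a specific segment m while keeping the invariant
that the head's pkt_len equals the sum of data_len over all segments -
unlike rte_pktmbuf_append(), which always writes at the tail of the chain.
A self-contained model of that accounting, with a simplified stand-in
struct and an illustrative 64-byte buffer per segment:

#include <assert.h>
#include <stddef.h>

struct seg {                    /* stand-in for struct rte_mbuf */
        char buf[64];
        unsigned int data_len;  /* bytes used in this segment   */
        unsigned int pkt_len;   /* total bytes, head only       */
        struct seg *next;
};

/* Grow segment 'm' by 'len' bytes and account for it in the head. */
static char *seg_append(struct seg *head, struct seg *m, unsigned int len)
{
        if (len > sizeof(m->buf) - m->data_len) /* tailroom check */
                return NULL;
        char *tail = m->buf + m->data_len;
        m->data_len += len;
        head->pkt_len += len;
        return tail;
}

int main(void)
{
        struct seg tail = { {0}, 0, 0, NULL };
        struct seg head = { {0}, 0, 0, &tail };

        assert(seg_append(&head, &tail, 16) != NULL);
        assert(head.pkt_len == 16 && tail.data_len == 16);
        assert(seg_append(&head, &tail, 64) == NULL); /* exceeds tailroom */
        return 0;
}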
@@ -437,7 +449,7 @@ struct turbo_sw_queue {
return -1;
}
- if (in_length - kw < 0) {
+ if (in_length < kw) {
rte_bbdev_log(ERR,
"Mismatch between input length (%u) and kw (%u)",
in_length, kw);
@@ -456,9 +468,9 @@ struct turbo_sw_queue {
static inline void
process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
uint8_t r, uint8_t c, uint16_t k, uint16_t ncb,
- uint32_t e, struct rte_mbuf *m_in, struct rte_mbuf *m_out,
- uint16_t in_offset, uint16_t out_offset, uint16_t total_left,
- struct rte_bbdev_stats *q_stats)
+ uint32_t e, struct rte_mbuf *m_in, struct rte_mbuf *m_out_head,
+ struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset,
+ uint16_t in_length, struct rte_bbdev_stats *q_stats)
{
int ret;
int16_t k_idx;
@@ -484,7 +496,7 @@ struct turbo_sw_queue {
/* CRC24A (for TB) */
if ((enc->op_flags & RTE_BBDEV_TURBO_CRC_24A_ATTACH) &&
(enc->code_block_mode == 1)) {
- ret = is_enc_input_valid(k - 24, k_idx, total_left);
+ ret = is_enc_input_valid(k - 24, k_idx, in_length);
if (ret != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
return;
@@ -494,7 +506,7 @@ struct turbo_sw_queue {
/* Check if there is a room for CRC bits if not use
* the temporary buffer.
*/
- if (rte_pktmbuf_append(m_in, 3) == NULL) {
+ if (mbuf_append(m_in, m_in, 3) == NULL) {
rte_memcpy(q->enc_in, in, (k - 24) >> 3);
in = q->enc_in;
} else {
@@ -517,7 +529,7 @@ struct turbo_sw_queue {
#endif
} else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) {
/* CRC24B */
- ret = is_enc_input_valid(k - 24, k_idx, total_left);
+ ret = is_enc_input_valid(k - 24, k_idx, in_length);
if (ret != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
return;
@@ -527,7 +539,7 @@ struct turbo_sw_queue {
/* Check if there is a room for CRC bits if this is the last
* CB in TB. If not use temporary buffer.
*/
- if ((c - r == 1) && (rte_pktmbuf_append(m_in, 3) == NULL)) {
+ if ((c - r == 1) && (mbuf_append(m_in, m_in, 3) == NULL)) {
rte_memcpy(q->enc_in, in, (k - 24) >> 3);
in = q->enc_in;
} else if (c - r > 1) {
@@ -549,7 +561,7 @@ struct turbo_sw_queue {
q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
#endif
} else {
- ret = is_enc_input_valid(k, k_idx, total_left);
+ ret = is_enc_input_valid(k, k_idx, in_length);
if (ret != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
return;
@@ -570,7 +582,8 @@ struct turbo_sw_queue {
out1 = RTE_PTR_ADD(out0, (k >> 3) + 1);
out2 = RTE_PTR_ADD(out1, (k >> 3) + 1);
} else {
- out0 = (uint8_t *)rte_pktmbuf_append(m_out, (k >> 3) * 3 + 2);
+ out0 = (uint8_t *)mbuf_append(m_out_head, m_out,
+ (k >> 3) * 3 + 2);
if (out0 == NULL) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
rte_bbdev_log(ERR,
@@ -623,7 +636,7 @@ struct turbo_sw_queue {
const uint8_t mask_out[] = {0xFF, 0xC0, 0xF0, 0xFC};
/* get output data starting address */
- rm_out = (uint8_t *)rte_pktmbuf_append(m_out, out_len);
+ rm_out = (uint8_t *)mbuf_append(m_out_head, m_out, out_len);
if (rm_out == NULL) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
rte_bbdev_log(ERR,
@@ -725,14 +738,16 @@ struct turbo_sw_queue {
uint16_t out_offset = enc->output.offset;
struct rte_mbuf *m_in = enc->input.data;
struct rte_mbuf *m_out = enc->output.data;
- uint16_t total_left = enc->input.length;
+ struct rte_mbuf *m_out_head = enc->output.data;
+ uint32_t in_length, mbuf_total_left = enc->input.length;
+ uint16_t seg_total_left;
/* Clear op status */
op->status = 0;
- if (total_left > RTE_BBDEV_MAX_TB_SIZE >> 3) {
+ if (mbuf_total_left > RTE_BBDEV_MAX_TB_SIZE >> 3) {
rte_bbdev_log(ERR, "TB size (%u) is too big, max: %d",
- total_left, RTE_BBDEV_MAX_TB_SIZE);
+ mbuf_total_left, RTE_BBDEV_MAX_TB_SIZE);
op->status = 1 << RTE_BBDEV_DATA_ERROR;
return;
}
@@ -755,7 +770,10 @@ struct turbo_sw_queue {
r = 0;
}
- while (total_left > 0 && r < c) {
+ while (mbuf_total_left > 0 && r < c) {
+
+ seg_total_left = rte_pktmbuf_data_len(m_in) - in_offset;
+
if (enc->code_block_mode == 0) {
k = (r < enc->tb_params.c_neg) ?
enc->tb_params.k_neg : enc->tb_params.k_pos;
@@ -769,22 +787,32 @@ struct turbo_sw_queue {
e = enc->cb_params.e;
}
- process_enc_cb(q, op, r, c, k, ncb, e, m_in,
- m_out, in_offset, out_offset, total_left,
+ process_enc_cb(q, op, r, c, k, ncb, e, m_in, m_out_head,
+ m_out, in_offset, out_offset, seg_total_left,
queue_stats);
/* Update total_left */
- total_left -= (k - crc24_bits) >> 3;
+ in_length = ((k - crc24_bits) >> 3);
+ mbuf_total_left -= in_length;
/* Update offsets for next CBs (if exist) */
in_offset += (k - crc24_bits) >> 3;
if (enc->op_flags & RTE_BBDEV_TURBO_RATE_MATCH)
out_offset += e >> 3;
else
out_offset += (k >> 3) * 3 + 2;
+
+ /* Update offsets */
+ if (seg_total_left == in_length) {
+ /* Go to the next mbuf */
+ m_in = m_in->next;
+ m_out = m_out->next;
+ in_offset = 0;
+ out_offset = 0;
+ }
r++;
}
/* check if all input data was processed */
- if (total_left != 0) {
+ if (mbuf_total_left != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
rte_bbdev_log(ERR,
"Mismatch between mbuf length and included CBs sizes");
@@ -903,8 +931,9 @@ struct turbo_sw_queue {
static inline void
process_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
uint8_t c, uint16_t k, uint16_t kw, struct rte_mbuf *m_in,
- struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset,
- bool check_crc_24b, uint16_t crc24_overlap, uint16_t total_left,
+ struct rte_mbuf *m_out_head, struct rte_mbuf *m_out,
+ uint16_t in_offset, uint16_t out_offset, bool check_crc_24b,
+ uint16_t crc24_overlap, uint16_t in_length,
struct rte_bbdev_stats *q_stats)
{
int ret;
@@ -925,7 +954,7 @@ struct turbo_sw_queue {
k_idx = compute_idx(k);
- ret = is_dec_input_valid(k_idx, kw, total_left);
+ ret = is_dec_input_valid(k_idx, kw, in_length);
if (ret != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
return;
@@ -983,7 +1012,8 @@ struct turbo_sw_queue {
q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
#endif
- out = (uint8_t *)rte_pktmbuf_append(m_out, ((k - crc24_overlap) >> 3));
+ out = (uint8_t *)mbuf_append(m_out_head, m_out,
+ ((k - crc24_overlap) >> 3));
if (out == NULL) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
rte_bbdev_log(ERR, "Too little space in output mbuf");
@@ -1038,9 +1068,11 @@ struct turbo_sw_queue {
struct rte_bbdev_op_turbo_dec *dec = &op->turbo_dec;
struct rte_mbuf *m_in = dec->input.data;
struct rte_mbuf *m_out = dec->hard_output.data;
+ struct rte_mbuf *m_out_head = dec->hard_output.data;
uint16_t in_offset = dec->input.offset;
- uint16_t total_left = dec->input.length;
uint16_t out_offset = dec->hard_output.offset;
+ uint32_t mbuf_total_left = dec->input.length;
+ uint16_t seg_total_left;
/* Clear op status */
op->status = 0;
@@ -1062,11 +1094,13 @@ struct turbo_sw_queue {
RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP))
crc24_overlap = 24;
- while (total_left > 0) {
+ while (mbuf_total_left > 0) {
if (dec->code_block_mode == 0)
k = (r < dec->tb_params.c_neg) ?
dec->tb_params.k_neg : dec->tb_params.k_pos;
+ seg_total_left = rte_pktmbuf_data_len(m_in) - in_offset;
+
/* Calculates circular buffer size (Kw).
* According to 3gpp 36.212 section 5.1.4.2
* Kw = 3 * Kpi,
@@ -1079,23 +1113,32 @@ struct turbo_sw_queue {
*/
kw = RTE_ALIGN_CEIL(k + 4, RTE_BBDEV_C_SUBBLOCK) * 3;
- process_dec_cb(q, op, c, k, kw, m_in, m_out, in_offset,
- out_offset, check_bit(dec->op_flags,
+ process_dec_cb(q, op, c, k, kw, m_in, m_out_head, m_out,
+ in_offset, out_offset, check_bit(dec->op_flags,
RTE_BBDEV_TURBO_CRC_TYPE_24B), crc24_overlap,
- total_left, queue_stats);
+ seg_total_left, queue_stats);
/* To keep CRC24 attached to end of Code block, use
* RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP flag, as it is
* removed by default once verified.
*/
- /* Update total_left */
- total_left -= kw;
- /* Update offsets for next CBs (if exist) */
- in_offset += kw;
- out_offset += ((k - crc24_overlap) >> 3);
+ mbuf_total_left -= kw;
+
+ /* Update offsets */
+ if (seg_total_left == kw) {
+ /* Go to the next mbuf */
+ m_in = m_in->next;
+ m_out = m_out->next;
+ in_offset = 0;
+ out_offset = 0;
+ } else {
+ /* Update offsets for next CBs (if exist) */
+ in_offset += kw;
+ out_offset += ((k - crc24_overlap) >> 3);
+ }
r++;
}
- if (total_left != 0) {
+ if (mbuf_total_left != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
rte_bbdev_log(ERR,
"Mismatch between mbuf length and included Circular buffer sizes");
--
1.8.3.1
* [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test
2018-12-07 14:31 [dpdk-dev] [PATCH 1/4] baseband: enhancement of offload cost test Kamil Chalupnik
2018-12-07 14:31 ` [dpdk-dev] [PATCH 2/4] baseband: enhancement of throughput test Kamil Chalupnik
2018-12-07 14:31 ` [dpdk-dev] [PATCH 4/4] baseband: enhancement of interrupt test Kamil Chalupnik
@ 2018-12-07 15:15 ` Kamil Chalupnik
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 2/4] baseband: enhancement of throughput test Kamil Chalupnik
` (3 more replies)
2 siblings, 4 replies; 13+ messages in thread
From: Kamil Chalupnik @ 2018-12-07 15:15 UTC (permalink / raw)
To: dev; +Cc: amr.mokhtar, akhil.goyal, Kamil Chalupnik
Offload cost test was improved in order to collect
more accurate results.
Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
---
app/test-bbdev/test_bbdev_perf.c | 152 +++++++++++------------
config/common_base | 2 +-
drivers/baseband/turbo_sw/bbdev_turbo_software.c | 70 ++++++++---
lib/librte_bbdev/rte_bbdev.h | 9 +-
4 files changed, 135 insertions(+), 98 deletions(-)
diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index fbe6cc9..bf97edb 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -88,19 +88,19 @@ struct thread_params {
/* Stores time statistics */
struct test_time_stats {
/* Stores software enqueue total working time */
- uint64_t enq_sw_tot_time;
+ uint64_t enq_sw_total_time;
/* Stores minimum value of software enqueue working time */
uint64_t enq_sw_min_time;
/* Stores maximum value of software enqueue working time */
uint64_t enq_sw_max_time;
/* Stores turbo enqueue total working time */
- uint64_t enq_tur_tot_time;
- /* Stores minimum value of turbo enqueue working time */
- uint64_t enq_tur_min_time;
- /* Stores maximum value of turbo enqueue working time */
- uint64_t enq_tur_max_time;
+ uint64_t enq_acc_total_time;
+ /* Stores minimum value of accelerator enqueue working time */
+ uint64_t enq_acc_min_time;
+ /* Stores maximum value of accelerator enqueue working time */
+ uint64_t enq_acc_max_time;
/* Stores dequeue total working time */
- uint64_t deq_tot_time;
+ uint64_t deq_total_time;
/* Stores minimum value of dequeue working time */
uint64_t deq_min_time;
/* Stores maximum value of dequeue working time */
@@ -1200,12 +1200,15 @@ typedef int (test_case_function)(struct active_device *ad,
burst_sz = tp->op_params->burst_sz;
num_to_process = tp->op_params->num_to_process;
- if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
+ if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id, dec_ops,
burst_sz);
- else
+ rte_bbdev_dec_op_free_bulk(dec_ops, deq);
+ } else {
deq = rte_bbdev_dequeue_enc_ops(dev_id, queue_id, enc_ops,
burst_sz);
+ rte_bbdev_enc_op_free_bulk(enc_ops, deq);
+ }
if (deq < burst_sz) {
printf(
@@ -1316,8 +1319,6 @@ typedef int (test_case_function)(struct active_device *ad,
enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id, queue_id, ops,
num_to_enq);
-
- rte_bbdev_dec_op_free_bulk(ops, num_to_enq);
}
if (allocs_failed > 0)
@@ -1380,8 +1381,6 @@ typedef int (test_case_function)(struct active_device *ad,
enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id, queue_id, ops,
num_to_enq);
-
- rte_bbdev_enc_op_free_bulk(ops, num_to_enq);
}
if (allocs_failed > 0)
@@ -1575,13 +1574,14 @@ typedef int (test_case_function)(struct active_device *ad,
RTE_LCORE_FOREACH(lcore_id) {
if (iter++ >= used_cores)
break;
- printf("\tlcore_id: %u, throughput: %.8lg MOPS, %.8lg Mbps\n",
- lcore_id, t_params[lcore_id].mops, t_params[lcore_id].mbps);
+ printf("Throughput for core (%u): %.8lg MOPS, %.8lg Mbps\n",
+ lcore_id, t_params[lcore_id].mops,
+ t_params[lcore_id].mbps);
total_mops += t_params[lcore_id].mops;
total_mbps += t_params[lcore_id].mbps;
}
printf(
- "\n\tTotal stats for %u cores: throughput: %.8lg MOPS, %.8lg Mbps\n",
+ "\nTotal throughput for %u cores: %.8lg MOPS, %.8lg Mbps\n",
used_cores, total_mops, total_mbps);
}
@@ -1882,7 +1882,7 @@ typedef int (test_case_function)(struct active_device *ad,
TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
printf(
- "Validation/Latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+ "\nValidation/Latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
info.dev_name, burst_sz, num_to_process, op_type_str);
if (op_type == RTE_BBDEV_OP_TURBO_DEC)
@@ -1899,10 +1899,10 @@ typedef int (test_case_function)(struct active_device *ad,
if (iter <= 0)
return TEST_FAILED;
- printf("\toperation latency:\n"
- "\t\tavg latency: %lg cycles, %lg us\n"
- "\t\tmin latency: %lg cycles, %lg us\n"
- "\t\tmax latency: %lg cycles, %lg us\n",
+ printf("Operation latency:\n"
+ "\tavg latency: %lg cycles, %lg us\n"
+ "\tmin latency: %lg cycles, %lg us\n"
+ "\tmax latency: %lg cycles, %lg us\n",
(double)total_time / (double)iter,
(double)(total_time * 1000000) / (double)iter /
(double)rte_get_tsc_hz(), (double)min_time,
@@ -1930,7 +1930,7 @@ typedef int (test_case_function)(struct active_device *ad,
stats->dequeued_count = q_stats->dequeued_count;
stats->enqueue_err_count = q_stats->enqueue_err_count;
stats->dequeue_err_count = q_stats->dequeue_err_count;
- stats->offload_time = q_stats->offload_time;
+ stats->acc_offload_cycles = q_stats->acc_offload_cycles;
return 0;
}
@@ -1974,18 +1974,18 @@ typedef int (test_case_function)(struct active_device *ad,
queue_id, dev_id);
enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
- stats.offload_time;
+ stats.acc_offload_cycles;
time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
enq_sw_last_time);
time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
enq_sw_last_time);
- time_st->enq_sw_tot_time += enq_sw_last_time;
+ time_st->enq_sw_total_time += enq_sw_last_time;
- time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
- stats.offload_time);
- time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
- stats.offload_time);
- time_st->enq_tur_tot_time += stats.offload_time;
+ time_st->enq_acc_max_time = RTE_MAX(time_st->enq_acc_max_time,
+ stats.acc_offload_cycles);
+ time_st->enq_acc_min_time = RTE_MIN(time_st->enq_acc_min_time,
+ stats.acc_offload_cycles);
+ time_st->enq_acc_total_time += stats.acc_offload_cycles;
/* ensure enqueue has been completed */
rte_delay_ms(10);
@@ -2003,7 +2003,7 @@ typedef int (test_case_function)(struct active_device *ad,
deq_last_time);
time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
deq_last_time);
- time_st->deq_tot_time += deq_last_time;
+ time_st->deq_total_time += deq_last_time;
/* Dequeue remaining operations if needed*/
while (burst_sz != deq)
@@ -2055,18 +2055,18 @@ typedef int (test_case_function)(struct active_device *ad,
queue_id, dev_id);
enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
- stats.offload_time;
+ stats.acc_offload_cycles;
time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
enq_sw_last_time);
time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
enq_sw_last_time);
- time_st->enq_sw_tot_time += enq_sw_last_time;
+ time_st->enq_sw_total_time += enq_sw_last_time;
- time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
- stats.offload_time);
- time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
- stats.offload_time);
- time_st->enq_tur_tot_time += stats.offload_time;
+ time_st->enq_acc_max_time = RTE_MAX(time_st->enq_acc_max_time,
+ stats.acc_offload_cycles);
+ time_st->enq_acc_min_time = RTE_MIN(time_st->enq_acc_min_time,
+ stats.acc_offload_cycles);
+ time_st->enq_acc_total_time += stats.acc_offload_cycles;
/* ensure enqueue has been completed */
rte_delay_ms(10);
@@ -2084,7 +2084,7 @@ typedef int (test_case_function)(struct active_device *ad,
deq_last_time);
time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
deq_last_time);
- time_st->deq_tot_time += deq_last_time;
+ time_st->deq_total_time += deq_last_time;
while (burst_sz != deq)
deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
@@ -2121,7 +2121,7 @@ typedef int (test_case_function)(struct active_device *ad,
memset(&time_st, 0, sizeof(struct test_time_stats));
time_st.enq_sw_min_time = UINT64_MAX;
- time_st.enq_tur_min_time = UINT64_MAX;
+ time_st.enq_acc_min_time = UINT64_MAX;
time_st.deq_min_time = UINT64_MAX;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
@@ -2134,7 +2134,7 @@ typedef int (test_case_function)(struct active_device *ad,
TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
printf(
- "Offload latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+ "\nOffload latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
info.dev_name, burst_sz, num_to_process, op_type_str);
if (op_type == RTE_BBDEV_OP_TURBO_DEC)
@@ -2149,36 +2149,36 @@ typedef int (test_case_function)(struct active_device *ad,
if (iter <= 0)
return TEST_FAILED;
- printf("\tenq offload cost latency:\n"
- "\t\tsoftware avg %lg cycles, %lg us\n"
- "\t\tsoftware min %lg cycles, %lg us\n"
- "\t\tsoftware max %lg cycles, %lg us\n"
- "\t\tturbo avg %lg cycles, %lg us\n"
- "\t\tturbo min %lg cycles, %lg us\n"
- "\t\tturbo max %lg cycles, %lg us\n",
- (double)time_st.enq_sw_tot_time / (double)iter,
- (double)(time_st.enq_sw_tot_time * 1000000) /
+ printf("Enqueue offload cost latency:\n"
+ "\tDriver offload avg %lg cycles, %lg us\n"
+ "\tDriver offload min %lg cycles, %lg us\n"
+ "\tDriver offload max %lg cycles, %lg us\n"
+ "\tAccelerator offload avg %lg cycles, %lg us\n"
+ "\tAccelerator offload min %lg cycles, %lg us\n"
+ "\tAccelerator offload max %lg cycles, %lg us\n",
+ (double)time_st.enq_sw_total_time / (double)iter,
+ (double)(time_st.enq_sw_total_time * 1000000) /
(double)iter / (double)rte_get_tsc_hz(),
(double)time_st.enq_sw_min_time,
(double)(time_st.enq_sw_min_time * 1000000) /
rte_get_tsc_hz(), (double)time_st.enq_sw_max_time,
(double)(time_st.enq_sw_max_time * 1000000) /
- rte_get_tsc_hz(), (double)time_st.enq_tur_tot_time /
+ rte_get_tsc_hz(), (double)time_st.enq_acc_total_time /
(double)iter,
- (double)(time_st.enq_tur_tot_time * 1000000) /
+ (double)(time_st.enq_acc_total_time * 1000000) /
(double)iter / (double)rte_get_tsc_hz(),
- (double)time_st.enq_tur_min_time,
- (double)(time_st.enq_tur_min_time * 1000000) /
- rte_get_tsc_hz(), (double)time_st.enq_tur_max_time,
- (double)(time_st.enq_tur_max_time * 1000000) /
+ (double)time_st.enq_acc_min_time,
+ (double)(time_st.enq_acc_min_time * 1000000) /
+ rte_get_tsc_hz(), (double)time_st.enq_acc_max_time,
+ (double)(time_st.enq_acc_max_time * 1000000) /
rte_get_tsc_hz());
- printf("\tdeq offload cost latency - one op:\n"
- "\t\tavg %lg cycles, %lg us\n"
- "\t\tmin %lg cycles, %lg us\n"
- "\t\tmax %lg cycles, %lg us\n",
- (double)time_st.deq_tot_time / (double)iter,
- (double)(time_st.deq_tot_time * 1000000) /
+ printf("Dequeue offload cost latency - one op:\n"
+ "\tavg %lg cycles, %lg us\n"
+ "\tmin %lg cycles, %lg us\n"
+ "\tmax %lg cycles, %lg us\n",
+ (double)time_st.deq_total_time / (double)iter,
+ (double)(time_st.deq_total_time * 1000000) /
(double)iter / (double)rte_get_tsc_hz(),
(double)time_st.deq_min_time,
(double)(time_st.deq_min_time * 1000000) /
@@ -2194,7 +2194,7 @@ typedef int (test_case_function)(struct active_device *ad,
static int
offload_latency_empty_q_test_dec(uint16_t dev_id, uint16_t queue_id,
const uint16_t num_to_process, uint16_t burst_sz,
- uint64_t *deq_tot_time, uint64_t *deq_min_time,
+ uint64_t *deq_total_time, uint64_t *deq_min_time,
uint64_t *deq_max_time)
{
int i, deq_total;
@@ -2214,7 +2214,7 @@ typedef int (test_case_function)(struct active_device *ad,
deq_last_time = rte_rdtsc_precise() - deq_start_time;
*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
- *deq_tot_time += deq_last_time;
+ *deq_total_time += deq_last_time;
}
return i;
@@ -2223,7 +2223,7 @@ typedef int (test_case_function)(struct active_device *ad,
static int
offload_latency_empty_q_test_enc(uint16_t dev_id, uint16_t queue_id,
const uint16_t num_to_process, uint16_t burst_sz,
- uint64_t *deq_tot_time, uint64_t *deq_min_time,
+ uint64_t *deq_total_time, uint64_t *deq_min_time,
uint64_t *deq_max_time)
{
int i, deq_total;
@@ -2242,7 +2242,7 @@ typedef int (test_case_function)(struct active_device *ad,
deq_last_time = rte_rdtsc_precise() - deq_start_time;
*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
- *deq_tot_time += deq_last_time;
+ *deq_total_time += deq_last_time;
}
return i;
@@ -2261,7 +2261,7 @@ typedef int (test_case_function)(struct active_device *ad,
return TEST_SKIPPED;
#else
int iter;
- uint64_t deq_tot_time, deq_min_time, deq_max_time;
+ uint64_t deq_total_time, deq_min_time, deq_max_time;
uint16_t burst_sz = op_params->burst_sz;
const uint16_t num_to_process = op_params->num_to_process;
const enum rte_bbdev_op_type op_type = test_vector.op_type;
@@ -2269,7 +2269,7 @@ typedef int (test_case_function)(struct active_device *ad,
struct rte_bbdev_info info;
const char *op_type_str;
- deq_tot_time = deq_max_time = 0;
+ deq_total_time = deq_max_time = 0;
deq_min_time = UINT64_MAX;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
@@ -2281,27 +2281,27 @@ typedef int (test_case_function)(struct active_device *ad,
TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
printf(
- "Offload latency empty dequeue test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+ "\nOffload latency empty dequeue test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
info.dev_name, burst_sz, num_to_process, op_type_str);
if (op_type == RTE_BBDEV_OP_TURBO_DEC)
iter = offload_latency_empty_q_test_dec(ad->dev_id, queue_id,
- num_to_process, burst_sz, &deq_tot_time,
+ num_to_process, burst_sz, &deq_total_time,
&deq_min_time, &deq_max_time);
else
iter = offload_latency_empty_q_test_enc(ad->dev_id, queue_id,
- num_to_process, burst_sz, &deq_tot_time,
+ num_to_process, burst_sz, &deq_total_time,
&deq_min_time, &deq_max_time);
if (iter <= 0)
return TEST_FAILED;
- printf("\tempty deq offload\n"
- "\t\tavg. latency: %lg cycles, %lg us\n"
- "\t\tmin. latency: %lg cycles, %lg us\n"
- "\t\tmax. latency: %lg cycles, %lg us\n",
- (double)deq_tot_time / (double)iter,
- (double)(deq_tot_time * 1000000) / (double)iter /
+ printf("Empty dequeue offload\n"
+ "\tavg. latency: %lg cycles, %lg us\n"
+ "\tmin. latency: %lg cycles, %lg us\n"
+ "\tmax. latency: %lg cycles, %lg us\n",
+ (double)deq_total_time / (double)iter,
+ (double)(deq_total_time * 1000000) / (double)iter /
(double)rte_get_tsc_hz(), (double)deq_min_time,
(double)(deq_min_time * 1000000) / rte_get_tsc_hz(),
(double)deq_max_time, (double)(deq_max_time * 1000000) /
diff --git a/config/common_base b/config/common_base
index d12ae98..3ff98bb 100644
--- a/config/common_base
+++ b/config/common_base
@@ -481,7 +481,7 @@ CONFIG_RTE_PMD_PACKET_PREFETCH=y
#
CONFIG_RTE_LIBRTE_BBDEV=y
CONFIG_RTE_BBDEV_MAX_DEVS=128
-CONFIG_RTE_BBDEV_OFFLOAD_COST=n
+CONFIG_RTE_BBDEV_OFFLOAD_COST=y
#
# Compile PMD for NULL bbdev device
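Enabling CONFIG_RTE_BBDEV_OFFLOAD_COST compiles in the per-call cycle
accounting that the driver hunks below add around each SDK call: read the
TSC before the call, read it again afterwards, and accumulate the
difference into acc_offload_cycles. A minimal standalone sketch of that
pattern, with a fake counter standing in for rte_rdtsc_precise():

#include <stdint.h>
#include <stdio.h>

#define RTE_BBDEV_OFFLOAD_COST  /* normally set by the config option */

/* Stand-in for rte_rdtsc_precise(): advances 100 "cycles" per read. */
static uint64_t fake_tsc;
static uint64_t rdtsc(void) { return fake_tsc += 100; }

int main(void)
{
        uint64_t acc_offload_cycles = 0;
#ifdef RTE_BBDEV_OFFLOAD_COST
        uint64_t start_time = rdtsc();
#endif
        /* ... the accelerated SDK call would run here ... */
#ifdef RTE_BBDEV_OFFLOAD_COST
        acc_offload_cycles += rdtsc() - start_time;
#endif
        printf("accumulated %llu cycles\n",
                        (unsigned long long)acc_offload_cycles);
        return 0;
}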
diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index 8ceb276..57f6ba1 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -510,9 +510,10 @@ struct turbo_sw_queue {
#ifdef RTE_BBDEV_OFFLOAD_COST
start_time = rte_rdtsc_precise();
#endif
+ /* CRC24A generation */
bblib_lte_crc24a_gen(&crc_req, &crc_resp);
#ifdef RTE_BBDEV_OFFLOAD_COST
- q_stats->offload_time += rte_rdtsc_precise() - start_time;
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
#endif
} else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) {
/* CRC24B */
@@ -542,9 +543,10 @@ struct turbo_sw_queue {
#ifdef RTE_BBDEV_OFFLOAD_COST
start_time = rte_rdtsc_precise();
#endif
+ /* CRC24B generation */
bblib_lte_crc24b_gen(&crc_req, &crc_resp);
#ifdef RTE_BBDEV_OFFLOAD_COST
- q_stats->offload_time += rte_rdtsc_precise() - start_time;
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
#endif
} else {
ret = is_enc_input_valid(k, k_idx, total_left);
@@ -596,15 +598,14 @@ struct turbo_sw_queue {
#ifdef RTE_BBDEV_OFFLOAD_COST
start_time = rte_rdtsc_precise();
#endif
-
+ /* Turbo encoding */
if (bblib_turbo_encoder(&turbo_req, &turbo_resp) != 0) {
op->status |= 1 << RTE_BBDEV_DRV_ERROR;
rte_bbdev_log(ERR, "Turbo Encoder failed");
return;
}
-
#ifdef RTE_BBDEV_OFFLOAD_COST
- q_stats->offload_time += rte_rdtsc_precise() - start_time;
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
#endif
/* Restore 3 first bytes of next CB if they were overwritten by CRC*/
@@ -671,23 +672,21 @@ struct turbo_sw_queue {
#ifdef RTE_BBDEV_OFFLOAD_COST
start_time = rte_rdtsc_precise();
#endif
-
+ /* Rate-Matching */
if (bblib_rate_match_dl(&rm_req, &rm_resp) != 0) {
op->status |= 1 << RTE_BBDEV_DRV_ERROR;
rte_bbdev_log(ERR, "Rate matching failed");
return;
}
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
/* SW fills an entire last byte even if E%8 != 0. Clear the
* superfluous data bits for consistency with HW device.
*/
mask_id = (e & 7) >> 1;
rm_out[out_len - 1] &= mask_out[mask_id];
-
-#ifdef RTE_BBDEV_OFFLOAD_COST
- q_stats->offload_time += rte_rdtsc_precise() - start_time;
-#endif
-
enc->output.length += rm_resp.OutputLen;
} else {
/* Rate matching is bypassed */
@@ -798,7 +797,7 @@ struct turbo_sw_queue {
{
uint16_t i;
#ifdef RTE_BBDEV_OFFLOAD_COST
- queue_stats->offload_time = 0;
+ queue_stats->acc_offload_cycles = 0;
#endif
for (i = 0; i < nb_ops; ++i)
@@ -905,7 +904,8 @@ struct turbo_sw_queue {
process_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
uint8_t c, uint16_t k, uint16_t kw, struct rte_mbuf *m_in,
struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset,
- bool check_crc_24b, uint16_t crc24_overlap, uint16_t total_left)
+ bool check_crc_24b, uint16_t crc24_overlap, uint16_t total_left,
+ struct rte_bbdev_stats *q_stats)
{
int ret;
int32_t k_idx;
@@ -917,6 +917,11 @@ struct turbo_sw_queue {
struct bblib_turbo_decoder_request turbo_req;
struct bblib_turbo_decoder_response turbo_resp;
struct rte_bbdev_op_turbo_dec *dec = &op->turbo_dec;
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ uint64_t start_time;
+#else
+ RTE_SET_USED(q_stats);
+#endif
k_idx = compute_idx(k);
@@ -942,7 +947,14 @@ struct turbo_sw_queue {
deint_req.pharqbuffer = q->deint_input;
deint_req.ncb = ncb_without_null;
deint_resp.pinteleavebuffer = q->deint_output;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ start_time = rte_rdtsc_precise();
+#endif
bblib_deinterleave_ul(&deint_req, &deint_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
} else
move_padding_bytes(in, q->deint_output, k, ncb);
@@ -961,7 +973,15 @@ struct turbo_sw_queue {
adapter_req.ncb = ncb_without_null;
adapter_req.pinteleavebuffer = adapter_input;
adapter_resp.pharqout = q->adapter_output;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ start_time = rte_rdtsc_precise();
+#endif
+ /* Turbo decode adaptation */
bblib_turbo_adapter_ul(&adapter_req, &adapter_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
out = (uint8_t *)rte_pktmbuf_append(m_out, ((k - crc24_overlap) >> 3));
if (out == NULL) {
@@ -986,12 +1006,20 @@ struct turbo_sw_queue {
turbo_resp.ag_buf = q->ag;
turbo_resp.cb_buf = q->code_block;
turbo_resp.output = out;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ start_time = rte_rdtsc_precise();
+#endif
+ /* Turbo decode */
iter_cnt = bblib_turbo_decoder(&turbo_req, &turbo_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
dec->hard_output.length += (k >> 3);
if (iter_cnt > 0) {
/* Temporary solution for returned iter_count from SDK */
- iter_cnt = (iter_cnt - 1) / 2;
+ iter_cnt = (iter_cnt - 1) >> 1;
dec->iter_count = RTE_MAX(iter_cnt, dec->iter_count);
} else {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
@@ -1001,7 +1029,8 @@ struct turbo_sw_queue {
}
static inline void
-enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op)
+enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
+ struct rte_bbdev_stats *queue_stats)
{
uint8_t c, r = 0;
uint16_t kw, k = 0;
@@ -1053,7 +1082,7 @@ struct turbo_sw_queue {
process_dec_cb(q, op, c, k, kw, m_in, m_out, in_offset,
out_offset, check_bit(dec->op_flags,
RTE_BBDEV_TURBO_CRC_TYPE_24B), crc24_overlap,
- total_left);
+ total_left, queue_stats);
/* To keep CRC24 attached to end of Code block, use
* RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP flag, as it is
* removed by default once verified.
@@ -1075,12 +1104,15 @@ struct turbo_sw_queue {
static inline uint16_t
enqueue_dec_all_ops(struct turbo_sw_queue *q, struct rte_bbdev_dec_op **ops,
- uint16_t nb_ops)
+ uint16_t nb_ops, struct rte_bbdev_stats *queue_stats)
{
uint16_t i;
+#ifdef RTE_BBDEV_OFFLOAD_COST
+ queue_stats->acc_offload_cycles = 0;
+#endif
for (i = 0; i < nb_ops; ++i)
- enqueue_dec_one_op(q, ops[i]);
+ enqueue_dec_one_op(q, ops[i], queue_stats);
return rte_ring_enqueue_burst(q->processed_pkts, (void **)ops, nb_ops,
NULL);
@@ -1112,7 +1144,7 @@ struct turbo_sw_queue {
struct turbo_sw_queue *q = queue;
uint16_t nb_enqueued = 0;
- nb_enqueued = enqueue_dec_all_ops(q, ops, nb_ops);
+ nb_enqueued = enqueue_dec_all_ops(q, ops, nb_ops, &q_data->queue_stats);
q_data->queue_stats.enqueue_err_count += nb_ops - nb_enqueued;
q_data->queue_stats.enqueued_count += nb_enqueued;
diff --git a/lib/librte_bbdev/rte_bbdev.h b/lib/librte_bbdev/rte_bbdev.h
index 25ef409..da8cf07 100644
--- a/lib/librte_bbdev/rte_bbdev.h
+++ b/lib/librte_bbdev/rte_bbdev.h
@@ -239,8 +239,13 @@ struct rte_bbdev_stats {
uint64_t enqueue_err_count;
/** Total error count on operations dequeued */
uint64_t dequeue_err_count;
- /** Offload time */
- uint64_t offload_time;
+ /** CPU cycles consumed by the (HW/SW) accelerator device to offload
+ * the enqueue request to its internal queues.
+ * - For a HW device these are the cycles consumed in the MMIO write
+ * - For a SW (vdev) device, this is the processing time of the
+ * bbdev operation
+ */
+ uint64_t acc_offload_cycles;
};
/**
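Because acc_offload_cycles is expressed in TSC cycles, converting it to
wall-clock time needs the TSC frequency, which is exactly how the test
application derives its microsecond columns. A minimal sketch
(rte_get_tsc_hz() is the standard DPDK call; EAL must be initialized
before the frequency is meaningful):

#include <rte_cycles.h>
#include <rte_bbdev.h>

/* Convert accumulated accelerator offload cycles to microseconds. */
static double acc_offload_us(const struct rte_bbdev_stats *stats)
{
        return (double)stats->acc_offload_cycles * 1E6 /
                        (double)rte_get_tsc_hz();
}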
--
1.8.3.1
* [dpdk-dev] [PATCH v2 2/4] baseband: enhancement of throughput test
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test Kamil Chalupnik
@ 2018-12-07 15:15 ` Kamil Chalupnik
2018-12-13 20:09 ` Mokhtar, Amr
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 3/4] baseband: support bigger Transport Block Kamil Chalupnik
` (2 subsequent siblings)
3 siblings, 1 reply; 13+ messages in thread
From: Kamil Chalupnik @ 2018-12-07 15:15 UTC (permalink / raw)
To: dev; +Cc: amr.mokhtar, akhil.goyal, Kamil Chalupnik
Improvements added to throughput test:
- the test is run in a loop (the number of iterations is specified by
the TEST_REPETITIONS define), which ensures more accurate results
- the length of input data is calculated based on the number of CBs in the TB
- the maximum number of decoding iterations is gathered from the results
- new functions responsible for printing results were added
- small fixes for memory management
Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
---
app/test-bbdev/main.c | 2 -
app/test-bbdev/test_bbdev_perf.c | 471 ++++++++++++++++++++-----------------
app/test-bbdev/test_bbdev_vector.c | 7 +
lib/librte_bbdev/rte_bbdev_op.h | 2 +
4 files changed, 263 insertions(+), 219 deletions(-)
diff --git a/app/test-bbdev/main.c b/app/test-bbdev/main.c
index 41b54bb..7af2522 100644
--- a/app/test-bbdev/main.c
+++ b/app/test-bbdev/main.c
@@ -316,8 +316,6 @@
return 1;
}
- rte_log_set_global_level(RTE_LOG_INFO);
-
/* If no argument provided - run all tests */
if (test_params.num_tests == 0)
return run_all_tests();
diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index bf97edb..a25e3a7 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -23,6 +23,7 @@
#define GET_SOCKET(socket_id) (((socket_id) == SOCKET_ID_ANY) ? 0 : (socket_id))
#define MAX_QUEUES RTE_MAX_LCORE
+#define TEST_REPETITIONS 1000
#define OPS_CACHE_SIZE 256U
#define OPS_POOL_SIZE_MIN 511U /* 0.5K per queue */
@@ -77,8 +78,9 @@ struct thread_params {
uint8_t dev_id;
uint16_t queue_id;
uint64_t start_time;
- double mops;
+ double ops_per_sec;
double mbps;
+ uint8_t iter_count;
rte_atomic16_t nb_dequeued;
rte_atomic16_t processing_status;
struct test_op_params *op_params;
@@ -757,6 +759,8 @@ typedef int (test_case_function)(struct active_device *ad,
turbo_dec->tb_params.c_neg;
ops[i]->turbo_dec.tb_params.cab =
turbo_dec->tb_params.cab;
+ ops[i]->turbo_dec.tb_params.r =
+ turbo_dec->tb_params.r;
} else {
ops[i]->turbo_dec.cb_params.e = turbo_dec->cb_params.e;
ops[i]->turbo_dec.cb_params.k = turbo_dec->cb_params.k;
@@ -884,47 +888,6 @@ typedef int (test_case_function)(struct active_device *ad,
}
static int
-validate_dec_buffers(struct rte_bbdev_dec_op *ref_op, struct test_buffers *bufs,
- const uint16_t num_to_process)
-{
- int i;
-
- struct op_data_entries *hard_data_orig =
- &test_vector.entries[DATA_HARD_OUTPUT];
- struct op_data_entries *soft_data_orig =
- &test_vector.entries[DATA_SOFT_OUTPUT];
-
- for (i = 0; i < num_to_process; i++) {
- TEST_ASSERT_SUCCESS(validate_op_chain(&bufs->hard_outputs[i],
- hard_data_orig),
- "Hard output buffers are not equal");
- if (ref_op->turbo_dec.op_flags &
- RTE_BBDEV_TURBO_SOFT_OUTPUT)
- TEST_ASSERT_SUCCESS(validate_op_chain(
- &bufs->soft_outputs[i],
- soft_data_orig),
- "Soft output buffers are not equal");
- }
-
- return TEST_SUCCESS;
-}
-
-static int
-validate_enc_buffers(struct test_buffers *bufs, const uint16_t num_to_process)
-{
- int i;
-
- struct op_data_entries *hard_data_orig =
- &test_vector.entries[DATA_HARD_OUTPUT];
-
- for (i = 0; i < num_to_process; i++)
- TEST_ASSERT_SUCCESS(validate_op_chain(&bufs->hard_outputs[i],
- hard_data_orig), "");
-
- return TEST_SUCCESS;
-}
-
-static int
validate_dec_op(struct rte_bbdev_dec_op **ops, const uint16_t n,
struct rte_bbdev_dec_op *ref_op, const int vector_mask)
{
@@ -1016,6 +979,44 @@ typedef int (test_case_function)(struct active_device *ad,
entry->segments[i].length;
}
+static uint32_t
+calc_dec_TB_size(struct rte_bbdev_dec_op *op)
+{
+ uint8_t i;
+ uint32_t c, r, tb_size = 0;
+
+ if (op->turbo_dec.code_block_mode) {
+ tb_size = op->turbo_dec.tb_params.k_neg;
+ } else {
+ c = op->turbo_dec.tb_params.c;
+ r = op->turbo_dec.tb_params.r;
+ for (i = 0; i < c-r; i++)
+ tb_size += (r < op->turbo_dec.tb_params.c_neg) ?
+ op->turbo_dec.tb_params.k_neg :
+ op->turbo_dec.tb_params.k_pos;
+ }
+ return tb_size;
+}
+
+static uint32_t
+calc_enc_TB_size(struct rte_bbdev_enc_op *op)
+{
+ uint8_t i;
+ uint32_t c, r, tb_size = 0;
+
+ if (op->turbo_enc.code_block_mode) {
+ tb_size = op->turbo_enc.tb_params.k_neg;
+ } else {
+ c = op->turbo_enc.tb_params.c;
+ r = op->turbo_enc.tb_params.r;
+ for (i = 0; i < c-r; i++)
+ tb_size += (r < op->turbo_enc.tb_params.c_neg) ?
+ op->turbo_enc.tb_params.k_neg :
+ op->turbo_enc.tb_params.k_pos;
+ }
+ return tb_size;
+}
+
static int
init_test_op_params(struct test_op_params *op_params,
enum rte_bbdev_op_type op_type, const int expected_status,
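A worked example of the TB size computation above, with hypothetical
parameters: c = 3 code blocks, r = 0, c_neg = 1, k_neg = 512,
k_pos = 528. The loop runs c - r = 3 times and, since r is not modified
inside the loop, the ternary evaluates r < c_neg identically on every
pass, so tb_size = 3 * 512 = 1536 bits; in code-block mode the function
returns k_neg directly. The same arithmetic as a standalone sketch:

#include <stdio.h>

int main(void)
{
        unsigned int c = 3, r = 0, c_neg = 1;
        unsigned int k_neg = 512, k_pos = 528;
        unsigned int i, tb_size = 0;

        /* Mirrors calc_enc_TB_size(): r stays fixed for the loop. */
        for (i = 0; i < c - r; i++)
                tb_size += (r < c_neg) ? k_neg : k_pos;

        printf("tb_size = %u bits\n", tb_size); /* prints 1536 */
        return 0;
}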
@@ -1163,17 +1164,13 @@ typedef int (test_case_function)(struct active_device *ad,
int ret;
uint16_t i;
uint64_t total_time;
- uint16_t deq, burst_sz, num_to_process;
+ uint16_t deq, burst_sz, num_ops;
uint16_t queue_id = INVALID_QUEUE_ID;
struct rte_bbdev_dec_op *dec_ops[MAX_BURST];
struct rte_bbdev_enc_op *enc_ops[MAX_BURST];
- struct test_buffers *bufs;
struct rte_bbdev_info info;
- /* Input length in bytes, million operations per second,
- * million bits per second.
- */
- double in_len;
+ double tb_len_bits;
struct thread_params *tp = cb_arg;
RTE_SET_USED(ret_param);
@@ -1198,7 +1195,7 @@ typedef int (test_case_function)(struct active_device *ad,
}
burst_sz = tp->op_params->burst_sz;
- num_to_process = tp->op_params->num_to_process;
+ num_ops = tp->op_params->num_to_process;
if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id, dec_ops,
@@ -1218,7 +1215,7 @@ typedef int (test_case_function)(struct active_device *ad,
return;
}
- if (rte_atomic16_read(&tp->nb_dequeued) + deq < num_to_process) {
+ if (rte_atomic16_read(&tp->nb_dequeued) + deq < num_ops) {
rte_atomic16_add(&tp->nb_dequeued, deq);
return;
}
@@ -1227,14 +1224,18 @@ typedef int (test_case_function)(struct active_device *ad,
rte_bbdev_info_get(dev_id, &info);
- bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
-
ret = TEST_SUCCESS;
- if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
- ret = validate_dec_buffers(tp->op_params->ref_dec_op, bufs,
- num_to_process);
- else if (test_vector.op_type == RTE_BBDEV_OP_TURBO_ENC)
- ret = validate_enc_buffers(bufs, num_to_process);
+
+ if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
+ struct rte_bbdev_dec_op *ref_op = tp->op_params->ref_dec_op;
+ ret = validate_dec_op(dec_ops, num_ops, ref_op,
+ tp->op_params->vector_mask);
+ rte_bbdev_dec_op_free_bulk(dec_ops, deq);
+ } else if (test_vector.op_type == RTE_BBDEV_OP_TURBO_ENC) {
+ struct rte_bbdev_enc_op *ref_op = tp->op_params->ref_enc_op;
+ ret = validate_enc_op(enc_ops, num_ops, ref_op);
+ rte_bbdev_enc_op_free_bulk(enc_ops, deq);
+ }
if (ret) {
printf("Buffers validation failed\n");
@@ -1243,13 +1244,13 @@ typedef int (test_case_function)(struct active_device *ad,
switch (test_vector.op_type) {
case RTE_BBDEV_OP_TURBO_DEC:
- in_len = tp->op_params->ref_dec_op->turbo_dec.input.length;
+ tb_len_bits = calc_dec_TB_size(tp->op_params->ref_dec_op);
break;
case RTE_BBDEV_OP_TURBO_ENC:
- in_len = tp->op_params->ref_enc_op->turbo_enc.input.length;
+ tb_len_bits = calc_enc_TB_size(tp->op_params->ref_enc_op);
break;
case RTE_BBDEV_OP_NONE:
- in_len = 0.0;
+ tb_len_bits = 0.0;
break;
default:
printf("Unknown op type: %d\n", test_vector.op_type);
@@ -1257,9 +1258,9 @@ typedef int (test_case_function)(struct active_device *ad,
return;
}
- tp->mops = ((double)num_to_process / 1000000.0) /
+ tp->ops_per_sec = ((double)num_ops) /
((double)total_time / (double)rte_get_tsc_hz());
- tp->mbps = ((double)num_to_process * in_len * 8 / 1000000.0) /
+ tp->mbps = (((double)(num_ops * tb_len_bits)) / 1000000.0) /
((double)total_time / (double)rte_get_tsc_hz());
rte_atomic16_add(&tp->nb_dequeued, deq);
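To make the units of the two expressions concrete (hypothetical numbers):
with num_ops = 1000, tb_len_bits = 6144, total_time = 2,000,000 cycles and
a 2 GHz TSC, the elapsed time is 1 ms, so ops_per_sec = 1000 / 0.001 =
1,000,000 and mbps = (1000 * 6144 / 1e6) / 0.001 = 6144. The same
arithmetic as a standalone sketch:

#include <stdio.h>

int main(void)
{
        double num_ops = 1000, tb_len_bits = 6144;
        double total_time = 2e6, tsc_hz = 2e9;  /* cycles, Hz */
        double secs = total_time / tsc_hz;      /* 0.001 s    */

        double ops_per_sec = num_ops / secs;                /* 1e6  */
        double mbps = (num_ops * tb_len_bits / 1e6) / secs; /* 6144 */

        printf("%.0f ops/s, %.0f Mbps\n", ops_per_sec, mbps);
        return 0;
}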
@@ -1270,14 +1271,14 @@ typedef int (test_case_function)(struct active_device *ad,
{
struct thread_params *tp = arg;
unsigned int enqueued;
- struct rte_bbdev_dec_op *ops[MAX_BURST];
const uint16_t queue_id = tp->queue_id;
const uint16_t burst_sz = tp->op_params->burst_sz;
const uint16_t num_to_process = tp->op_params->num_to_process;
+ struct rte_bbdev_dec_op *ops[num_to_process];
struct test_buffers *bufs = NULL;
- unsigned int allocs_failed = 0;
struct rte_bbdev_info info;
int ret;
+ uint16_t num_to_enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
@@ -1287,6 +1288,11 @@ typedef int (test_case_function)(struct active_device *ad,
tp->dev_id, queue_id);
rte_bbdev_info_get(tp->dev_id, &info);
+
+ TEST_ASSERT_SUCCESS((num_to_process > info.drv.queue_size_lim),
+ "NUM_OPS cannot exceed %u for this device",
+ info.drv.queue_size_lim);
+
bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
rte_atomic16_clear(&tp->processing_status);
@@ -1295,36 +1301,27 @@ typedef int (test_case_function)(struct active_device *ad,
while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
rte_pause();
+ ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops,
+ num_to_process);
+ TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops",
+ num_to_process);
+ if (test_vector.op_type != RTE_BBDEV_OP_NONE)
+ copy_reference_dec_op(ops, num_to_process, 0, bufs->inputs,
+ bufs->hard_outputs, bufs->soft_outputs,
+ tp->op_params->ref_dec_op);
+
tp->start_time = rte_rdtsc_precise();
for (enqueued = 0; enqueued < num_to_process;) {
- uint16_t num_to_enq = burst_sz;
+ num_to_enq = burst_sz;
if (unlikely(num_to_process - enqueued < num_to_enq))
num_to_enq = num_to_process - enqueued;
- ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops,
- num_to_enq);
- if (ret != 0) {
- allocs_failed++;
- continue;
- }
-
- if (test_vector.op_type != RTE_BBDEV_OP_NONE)
- copy_reference_dec_op(ops, num_to_enq, enqueued,
- bufs->inputs,
- bufs->hard_outputs,
- bufs->soft_outputs,
- tp->op_params->ref_dec_op);
-
- enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id, queue_id, ops,
- num_to_enq);
+ enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id, queue_id,
+ &ops[enqueued], num_to_enq);
}
- if (allocs_failed > 0)
- printf("WARNING: op allocations failed: %u times\n",
- allocs_failed);
-
return TEST_SUCCESS;
}
@@ -1333,14 +1330,14 @@ typedef int (test_case_function)(struct active_device *ad,
{
struct thread_params *tp = arg;
unsigned int enqueued;
- struct rte_bbdev_enc_op *ops[MAX_BURST];
const uint16_t queue_id = tp->queue_id;
const uint16_t burst_sz = tp->op_params->burst_sz;
const uint16_t num_to_process = tp->op_params->num_to_process;
+ struct rte_bbdev_enc_op *ops[num_to_process];
struct test_buffers *bufs = NULL;
- unsigned int allocs_failed = 0;
struct rte_bbdev_info info;
int ret;
+ uint16_t num_to_enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
@@ -1350,6 +1347,11 @@ typedef int (test_case_function)(struct active_device *ad,
tp->dev_id, queue_id);
rte_bbdev_info_get(tp->dev_id, &info);
+
+ TEST_ASSERT_SUCCESS((num_to_process > info.drv.queue_size_lim),
+ "NUM_OPS cannot exceed %u for this device",
+ info.drv.queue_size_lim);
+
bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
rte_atomic16_clear(&tp->processing_status);
@@ -1358,35 +1360,26 @@ typedef int (test_case_function)(struct active_device *ad,
while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
rte_pause();
+ ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp, ops,
+ num_to_process);
+ TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops",
+ num_to_process);
+ if (test_vector.op_type != RTE_BBDEV_OP_NONE)
+ copy_reference_enc_op(ops, num_to_process, 0, bufs->inputs,
+ bufs->hard_outputs, tp->op_params->ref_enc_op);
+
tp->start_time = rte_rdtsc_precise();
for (enqueued = 0; enqueued < num_to_process;) {
- uint16_t num_to_enq = burst_sz;
+ num_to_enq = burst_sz;
if (unlikely(num_to_process - enqueued < num_to_enq))
num_to_enq = num_to_process - enqueued;
- ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp, ops,
- num_to_enq);
- if (ret != 0) {
- allocs_failed++;
- continue;
- }
-
- if (test_vector.op_type != RTE_BBDEV_OP_NONE)
- copy_reference_enc_op(ops, num_to_enq, enqueued,
- bufs->inputs,
- bufs->hard_outputs,
- tp->op_params->ref_enc_op);
-
- enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id, queue_id, ops,
- num_to_enq);
+ enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id, queue_id,
+ &ops[enqueued], num_to_enq);
}
- if (allocs_failed > 0)
- printf("WARNING: op allocations failed: %u times\n",
- allocs_failed);
-
return TEST_SUCCESS;
}
@@ -1394,86 +1387,97 @@ typedef int (test_case_function)(struct active_device *ad,
throughput_pmd_lcore_dec(void *arg)
{
struct thread_params *tp = arg;
- unsigned int enqueued, dequeued;
- struct rte_bbdev_dec_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
- uint64_t total_time, start_time;
+ uint16_t enq, deq;
+ uint64_t total_time = 0, start_time;
const uint16_t queue_id = tp->queue_id;
const uint16_t burst_sz = tp->op_params->burst_sz;
- const uint16_t num_to_process = tp->op_params->num_to_process;
+ const uint16_t num_ops = tp->op_params->num_to_process;
+ struct rte_bbdev_dec_op *ops_enq[num_ops];
+ struct rte_bbdev_dec_op *ops_deq[num_ops];
struct rte_bbdev_dec_op *ref_op = tp->op_params->ref_dec_op;
struct test_buffers *bufs = NULL;
- unsigned int allocs_failed = 0;
- int ret;
+ int i, j, ret;
struct rte_bbdev_info info;
-
- /* Input length in bytes, million operations per second, million bits
- * per second.
- */
- double in_len;
+ uint16_t num_to_enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
rte_bbdev_info_get(tp->dev_id, &info);
+
+ TEST_ASSERT_SUCCESS((num_ops > info.drv.queue_size_lim),
+ "NUM_OPS cannot exceed %u for this device",
+ info.drv.queue_size_lim);
+
bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
rte_pause();
- start_time = rte_rdtsc_precise();
- for (enqueued = 0, dequeued = 0; dequeued < num_to_process;) {
- uint16_t deq;
+ ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops_enq, num_ops);
+ TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops", num_ops);
- if (likely(enqueued < num_to_process)) {
+ if (test_vector.op_type != RTE_BBDEV_OP_NONE)
+ copy_reference_dec_op(ops_enq, num_ops, 0, bufs->inputs,
+ bufs->hard_outputs, bufs->soft_outputs, ref_op);
- uint16_t num_to_enq = burst_sz;
+ /* Set counter to validate the ordering */
+ for (j = 0; j < num_ops; ++j)
+ ops_enq[j]->opaque_data = (void *)(uintptr_t)j;
- if (unlikely(num_to_process - enqueued < num_to_enq))
- num_to_enq = num_to_process - enqueued;
+ for (i = 0; i < TEST_REPETITIONS; ++i) {
- ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp,
- ops_enq, num_to_enq);
- if (ret != 0) {
- allocs_failed++;
- goto do_dequeue;
- }
+ for (j = 0; j < num_ops; ++j) {
+ struct rte_bbdev_dec_op *op = ops_enq[j];
+ rte_pktmbuf_reset(op->turbo_dec.hard_output.data);
+ }
+
+ start_time = rte_rdtsc_precise();
+
+ for (enq = 0, deq = 0; enq < num_ops;) {
+ num_to_enq = burst_sz;
+
+ if (unlikely(num_ops - enq < num_to_enq))
+ num_to_enq = num_ops - enq;
- if (test_vector.op_type != RTE_BBDEV_OP_NONE)
- copy_reference_dec_op(ops_enq, num_to_enq,
- enqueued,
- bufs->inputs,
- bufs->hard_outputs,
- bufs->soft_outputs,
- ref_op);
+ enq += rte_bbdev_enqueue_dec_ops(tp->dev_id,
+ queue_id, &ops_enq[enq], num_to_enq);
- enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id,
- queue_id, ops_enq, num_to_enq);
+ deq += rte_bbdev_dequeue_dec_ops(tp->dev_id,
+ queue_id, &ops_deq[deq], enq - deq);
}
-do_dequeue:
- deq = rte_bbdev_dequeue_dec_ops(tp->dev_id, queue_id, ops_deq,
- burst_sz);
- dequeued += deq;
- rte_bbdev_dec_op_free_bulk(ops_enq, deq);
- }
- total_time = rte_rdtsc_precise() - start_time;
- if (allocs_failed > 0)
- printf("WARNING: op allocations failed: %u times\n",
- allocs_failed);
+ /* dequeue the remaining */
+ while (deq < enq) {
+ deq += rte_bbdev_dequeue_dec_ops(tp->dev_id,
+ queue_id, &ops_deq[deq], enq - deq);
+ }
- TEST_ASSERT(enqueued == dequeued, "enqueued (%u) != dequeued (%u)",
- enqueued, dequeued);
+ total_time += rte_rdtsc_precise() - start_time;
+ }
+
+ tp->iter_count = 0;
+ /* get the max of iter_count for all dequeued ops */
+ for (i = 0; i < num_ops; ++i) {
+ tp->iter_count = RTE_MAX(ops_enq[i]->turbo_dec.iter_count,
+ tp->iter_count);
+ }
if (test_vector.op_type != RTE_BBDEV_OP_NONE) {
- ret = validate_dec_buffers(ref_op, bufs, num_to_process);
- TEST_ASSERT_SUCCESS(ret, "Buffers validation failed");
+ ret = validate_dec_op(ops_deq, num_ops, ref_op,
+ tp->op_params->vector_mask);
+ TEST_ASSERT_SUCCESS(ret, "Validation failed!");
}
- in_len = ref_op->turbo_dec.input.length;
- tp->mops = ((double)num_to_process / 1000000.0) /
- ((double)total_time / (double)rte_get_tsc_hz());
- tp->mbps = ((double)num_to_process * in_len * 8 / 1000000.0) /
+ rte_bbdev_dec_op_free_bulk(ops_enq, num_ops);
+
+ double tb_len_bits = calc_dec_TB_size(ref_op);
+
+ tp->ops_per_sec = ((double)num_ops * TEST_REPETITIONS) /
((double)total_time / (double)rte_get_tsc_hz());
+ tp->mbps = (((double)(num_ops * TEST_REPETITIONS * tb_len_bits)) /
+ 1000000.0) / ((double)total_time /
+ (double)rte_get_tsc_hz());
return TEST_SUCCESS;
}
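
Spelled out, the two figures computed above are

	Ops/s = (num_ops * TEST_REPETITIONS) / (total_time / tsc_hz)
	Mbps  = (num_ops * TEST_REPETITIONS * tb_len_bits / 10^6)
		/ (total_time / tsc_hz)

where tb_len_bits is now the full Transport Block size rather than the
raw input length. A self-contained sketch with hypothetical numbers,
not taken from a real run:

	#include <stdio.h>

	int main(void)
	{
		/* hypothetical values, for illustration only */
		const double num_ops = 512, repetitions = 1000;
		const double tb_len_bits = 6144;  /* calc_dec_TB_size() */
		const double total_cycles = 4e9, tsc_hz = 2e9;

		double secs = total_cycles / tsc_hz;         /* 2.0 s */
		printf("%.0f Ops/s\n", num_ops * repetitions / secs);
		printf("%.3f Mbps\n",
			num_ops * repetitions * tb_len_bits / 1e6 / secs);
		return 0;
	}

giving 256000 Ops/s and 1572.864 Mbps for these inputs.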
@@ -1482,91 +1486,94 @@ typedef int (test_case_function)(struct active_device *ad,
throughput_pmd_lcore_enc(void *arg)
{
struct thread_params *tp = arg;
- unsigned int enqueued, dequeued;
- struct rte_bbdev_enc_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
- uint64_t total_time, start_time;
+ uint16_t enq, deq;
+ uint64_t total_time = 0, start_time;
const uint16_t queue_id = tp->queue_id;
const uint16_t burst_sz = tp->op_params->burst_sz;
- const uint16_t num_to_process = tp->op_params->num_to_process;
+ const uint16_t num_ops = tp->op_params->num_to_process;
+ struct rte_bbdev_enc_op *ops_enq[num_ops];
+ struct rte_bbdev_enc_op *ops_deq[num_ops];
struct rte_bbdev_enc_op *ref_op = tp->op_params->ref_enc_op;
struct test_buffers *bufs = NULL;
- unsigned int allocs_failed = 0;
- int ret;
+ int i, j, ret;
struct rte_bbdev_info info;
-
- /* Input length in bytes, million operations per second, million bits
- * per second.
- */
- double in_len;
+ uint16_t num_to_enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
rte_bbdev_info_get(tp->dev_id, &info);
+
+ TEST_ASSERT_SUCCESS((num_ops > info.drv.queue_size_lim),
+ "NUM_OPS cannot exceed %u for this device",
+ info.drv.queue_size_lim);
+
bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
rte_pause();
- start_time = rte_rdtsc_precise();
- for (enqueued = 0, dequeued = 0; dequeued < num_to_process;) {
- uint16_t deq;
+ ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp, ops_enq,
+ num_ops);
+ TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops",
+ num_ops);
+ if (test_vector.op_type != RTE_BBDEV_OP_NONE)
+ copy_reference_enc_op(ops_enq, num_ops, 0, bufs->inputs,
+ bufs->hard_outputs, ref_op);
- if (likely(enqueued < num_to_process)) {
+ /* Set counter to validate the ordering */
+ for (j = 0; j < num_ops; ++j)
+ ops_enq[j]->opaque_data = (void *)(uintptr_t)j;
- uint16_t num_to_enq = burst_sz;
+ for (i = 0; i < TEST_REPETITIONS; ++i) {
- if (unlikely(num_to_process - enqueued < num_to_enq))
- num_to_enq = num_to_process - enqueued;
+ if (test_vector.op_type != RTE_BBDEV_OP_NONE)
+ for (j = 0; j < num_ops; ++j)
+ rte_pktmbuf_reset(
+ ops_enq[j]->turbo_enc.output.data);
- ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp,
- ops_enq, num_to_enq);
- if (ret != 0) {
- allocs_failed++;
- goto do_dequeue;
- }
+ start_time = rte_rdtsc_precise();
+
+ for (enq = 0, deq = 0; enq < num_ops;) {
+ num_to_enq = burst_sz;
- if (test_vector.op_type != RTE_BBDEV_OP_NONE)
- copy_reference_enc_op(ops_enq, num_to_enq,
- enqueued,
- bufs->inputs,
- bufs->hard_outputs,
- ref_op);
+ if (unlikely(num_ops - enq < num_to_enq))
+ num_to_enq = num_ops - enq;
- enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id,
- queue_id, ops_enq, num_to_enq);
+ enq += rte_bbdev_enqueue_enc_ops(tp->dev_id,
+ queue_id, &ops_enq[enq], num_to_enq);
+
+ deq += rte_bbdev_dequeue_enc_ops(tp->dev_id,
+ queue_id, &ops_deq[deq], enq - deq);
}
-do_dequeue:
- deq = rte_bbdev_dequeue_enc_ops(tp->dev_id, queue_id, ops_deq,
- burst_sz);
- dequeued += deq;
- rte_bbdev_enc_op_free_bulk(ops_enq, deq);
- }
- total_time = rte_rdtsc_precise() - start_time;
- if (allocs_failed > 0)
- printf("WARNING: op allocations failed: %u times\n",
- allocs_failed);
+ /* dequeue the remaining */
+ while (deq < enq) {
+ deq += rte_bbdev_dequeue_enc_ops(tp->dev_id,
+ queue_id, &ops_deq[deq], enq - deq);
+ }
- TEST_ASSERT(enqueued == dequeued, "enqueued (%u) != dequeued (%u)",
- enqueued, dequeued);
+ total_time += rte_rdtsc_precise() - start_time;
+ }
if (test_vector.op_type != RTE_BBDEV_OP_NONE) {
- ret = validate_enc_buffers(bufs, num_to_process);
- TEST_ASSERT_SUCCESS(ret, "Buffers validation failed");
+ ret = validate_enc_op(ops_deq, num_ops, ref_op);
+ TEST_ASSERT_SUCCESS(ret, "Validation failed!");
}
- in_len = ref_op->turbo_enc.input.length;
+ double tb_len_bits = calc_enc_TB_size(ref_op);
- tp->mops = ((double)num_to_process / 1000000.0) /
- ((double)total_time / (double)rte_get_tsc_hz());
- tp->mbps = ((double)num_to_process * in_len * 8 / 1000000.0) /
+ tp->ops_per_sec = ((double)num_ops * TEST_REPETITIONS) /
((double)total_time / (double)rte_get_tsc_hz());
+ tp->mbps = (((double)(num_ops * TEST_REPETITIONS * tb_len_bits))
+ / 1000000.0) / ((double)total_time /
+ (double)rte_get_tsc_hz());
return TEST_SUCCESS;
}
+
static void
-print_throughput(struct thread_params *t_params, unsigned int used_cores)
+print_enc_throughput(struct thread_params *t_params, unsigned int used_cores)
{
unsigned int lcore_id, iter = 0;
double total_mops = 0, total_mbps = 0;
@@ -1574,10 +1581,11 @@ typedef int (test_case_function)(struct active_device *ad,
RTE_LCORE_FOREACH(lcore_id) {
if (iter++ >= used_cores)
break;
- printf("Throughput for core (%u): %.8lg MOPS, %.8lg Mbps\n",
- lcore_id, t_params[lcore_id].mops,
+ printf(
+ "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps\n",
+ lcore_id, t_params[lcore_id].ops_per_sec,
t_params[lcore_id].mbps);
- total_mops += t_params[lcore_id].mops;
+ total_mops += t_params[lcore_id].ops_per_sec;
total_mbps += t_params[lcore_id].mbps;
}
printf(
@@ -1585,6 +1593,30 @@ typedef int (test_case_function)(struct active_device *ad,
used_cores, total_mops, total_mbps);
}
+static void
+print_dec_throughput(struct thread_params *t_params, unsigned int used_cores)
+{
+ unsigned int lcore_id, iter = 0;
+ double total_mops = 0, total_mbps = 0;
+ uint8_t iter_count = 0;
+
+ RTE_LCORE_FOREACH(lcore_id) {
+ if (iter++ >= used_cores)
+ break;
+ printf(
+ "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps @ max %u iterations\n",
+ lcore_id, t_params[lcore_id].ops_per_sec,
+ t_params[lcore_id].mbps,
+ t_params[lcore_id].iter_count);
+ total_mops += t_params[lcore_id].ops_per_sec;
+ total_mbps += t_params[lcore_id].mbps;
+ iter_count = RTE_MAX(iter_count, t_params[lcore_id].iter_count);
+ }
+ printf(
+ "\nTotal throughput for %u cores: %.8lg MOPS, %.8lg Mbps @ max %u iterations\n",
+ used_cores, total_mops, total_mbps, iter_count);
+}
+
/*
* Test function that determines how long an enqueue + dequeue of a burst
* takes on available lcores.
@@ -1677,8 +1709,10 @@ typedef int (test_case_function)(struct active_device *ad,
/* Print throughput if interrupts are disabled and test passed */
if (!intr_enabled) {
- if (test_vector.op_type != RTE_BBDEV_OP_NONE)
- print_throughput(t_params, num_lcores);
+ if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
+ print_dec_throughput(t_params, num_lcores);
+ else
+ print_enc_throughput(t_params, num_lcores);
return ret;
}
@@ -1713,9 +1747,12 @@ typedef int (test_case_function)(struct active_device *ad,
}
/* Print throughput if test passed */
- if (!ret && test_vector.op_type != RTE_BBDEV_OP_NONE)
- print_throughput(t_params, num_lcores);
-
+ if (!ret) {
+ if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
+ print_dec_throughput(t_params, num_lcores);
+ else if (test_vector.op_type == RTE_BBDEV_OP_TURBO_ENC)
+ print_enc_throughput(t_params, num_lcores);
+ }
return ret;
}
diff --git a/app/test-bbdev/test_bbdev_vector.c b/app/test-bbdev/test_bbdev_vector.c
index 81b8ee7..45fe999 100644
--- a/app/test-bbdev/test_bbdev_vector.c
+++ b/app/test-bbdev/test_bbdev_vector.c
@@ -412,6 +412,10 @@
vector->mask |= TEST_BBDEV_VF_NUM_MAPS;
turbo_dec->num_maps = (uint8_t) strtoul(token, &err, 0);
ret = ((err == NULL) || (*err != '\0')) ? -1 : 0;
+ } else if (!strcmp(key_token, "r")) {
+ vector->mask |= TEST_BBDEV_VF_R;
+ turbo_dec->tb_params.r = (uint8_t) strtoul(token, &err, 0);
+ ret = ((err == NULL) || (*err != '\0')) ? -1 : 0;
} else if (!strcmp(key_token, "code_block_mode")) {
vector->mask |= TEST_BBDEV_VF_CODE_BLOCK_MODE;
turbo_dec->code_block_mode = (uint8_t) strtoul(token, &err, 0);
@@ -714,6 +718,9 @@
if (!(mask & TEST_BBDEV_VF_CAB))
printf(
"WARNING: cab was not specified in vector file and will be set to 0\n");
+ if (!(mask & TEST_BBDEV_VF_R))
+ printf(
+ "WARNING: r was not specified in vector file and will be set to 0\n");
} else {
if (!(mask & TEST_BBDEV_VF_E))
printf(
diff --git a/lib/librte_bbdev/rte_bbdev_op.h b/lib/librte_bbdev/rte_bbdev_op.h
index 83f62c2..962e2ed 100644
--- a/lib/librte_bbdev/rte_bbdev_op.h
+++ b/lib/librte_bbdev/rte_bbdev_op.h
@@ -216,6 +216,8 @@ struct rte_bbdev_op_dec_tb_params {
* operation when r >= cab
*/
uint32_t eb;
+ /**< The index of the first CB in the inbound mbuf data, default is 0 */
+ uint8_t r;
};
/**< Operation structure for Turbo decode.
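
For illustration, a decoder vector can now carry the starting CB index
as a plain key = value entry next to the other TB parameters already
parsed above (hypothetical excerpt of a turbo_dec vector file):

	code_block_mode = 0
	cab = 0
	r = 1

With r = 1 decoding starts from the second CB of the TB; vectors that
omit the entry fall back to r = 0, as the new warning notes.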
--
1.8.3.1
* [dpdk-dev] [PATCH v2 3/4] baseband: support bigger Transport Block
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test Kamil Chalupnik
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 2/4] baseband: enhancement of throughput test Kamil Chalupnik
@ 2018-12-07 15:15 ` Kamil Chalupnik
2018-12-13 20:09 ` Mokhtar, Amr
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 4/4] baseband: enhancement of interrupt test Kamil Chalupnik
2018-12-13 20:08 ` [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test Mokhtar, Amr
3 siblings, 1 reply; 13+ messages in thread
From: Kamil Chalupnik @ 2018-12-07 15:15 UTC (permalink / raw)
To: dev; +Cc: amr.mokhtar, akhil.goyal, Kamil Chalupnik
The test application and the Turbo Software driver were adapted
to support chained mbufs for bigger TB sizes.
Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
---
app/test-bbdev/test_bbdev_perf.c | 60 +++++++++---
drivers/baseband/turbo_sw/bbdev_turbo_software.c | 111 ++++++++++++++++-------
2 files changed, 126 insertions(+), 45 deletions(-)
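
For context, a chained mbuf is simply a singly-linked list of segments
behind one head, which is how a TB larger than a single mbuf segment is
carried through the test and the driver. A minimal sketch of extending
a buffer by one segment, assuming an initialised pktmbuf pool mp:

	struct rte_mbuf *head = rte_pktmbuf_alloc(mp);
	struct rte_mbuf *tail = rte_pktmbuf_alloc(mp);

	/* append 'tail' as the next segment of 'head' */
	if (rte_pktmbuf_chain(head, tail) != 0) {
		/* chaining failed (would exceed max segment count) */
		rte_pktmbuf_free(tail);
	}

head->nb_segs and head->pkt_len then account for the whole chain, while
each segment keeps its own data_len.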
diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index a25e3a7..5bec70d 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -114,6 +114,17 @@ typedef int (test_case_function)(struct active_device *ad,
struct test_op_params *op_params);
static inline void
+mbuf_reset(struct rte_mbuf *m)
+{
+ m->pkt_len = 0;
+
+ do {
+ m->data_len = 0;
+ m = m->next;
+ } while (m != NULL);
+}
+
+static inline void
set_avail_op(struct active_device *ad, enum rte_bbdev_op_type op_type)
{
ad->supported_ops |= (1 << op_type);
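
rte_pktmbuf_reset() reinitialises a single mbuf header, including its
chaining fields, so calling it on the head of a chain would detach (and
leak) the remaining segments between repetitions. The mbuf_reset()
helper above instead walks the chain, zeroing data_len on every segment
and pkt_len on the head, so the same chained output buffer can be
refilled on the next iteration:

	/* sketch: reuse a chained output buffer between runs;
	 * 'm_head' is assumed to be an already-built chain
	 */
	mbuf_reset(m_head);	/* lengths cleared, chain left intact */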
@@ -573,6 +584,10 @@ typedef int (test_case_function)(struct active_device *ad,
op_type, n * ref_entries->nb_segments,
mbuf_pool->size);
+ TEST_ASSERT_SUCCESS(((seg->length + RTE_PKTMBUF_HEADROOM) >
+ (uint32_t)UINT16_MAX),
+ "Given data is bigger than allowed mbuf segment size");
+
bufs[i].data = m_head;
bufs[i].offset = 0;
bufs[i].length = 0;
@@ -589,7 +604,6 @@ typedef int (test_case_function)(struct active_device *ad,
rte_memcpy(data, seg->addr, seg->length);
bufs[i].length += seg->length;
-
for (j = 1; j < ref_entries->nb_segments; ++j) {
struct rte_mbuf *m_tail =
rte_pktmbuf_alloc(mbuf_pool);
@@ -617,6 +631,24 @@ typedef int (test_case_function)(struct active_device *ad,
"Couldn't chain mbufs from %d data type mbuf pool",
op_type);
}
+
+ } else {
+
+ /* allocate chained-mbuf for output buffer */
+ for (j = 1; j < ref_entries->nb_segments; ++j) {
+ struct rte_mbuf *m_tail =
+ rte_pktmbuf_alloc(mbuf_pool);
+ TEST_ASSERT_NOT_NULL(m_tail,
+ "Not enough mbufs in %d data type mbuf pool (needed %u, available %u)",
+ op_type,
+ n * ref_entries->nb_segments,
+ mbuf_pool->size);
+
+ ret = rte_pktmbuf_chain(m_head, m_tail);
+ TEST_ASSERT_SUCCESS(ret,
+ "Couldn't chain mbufs from %d data type mbuf pool",
+ op_type);
+ }
}
}
@@ -655,7 +687,7 @@ typedef int (test_case_function)(struct active_device *ad,
while (m != NULL) {
int8_t *llr = rte_pktmbuf_mtod_offset(m, int8_t *,
input_ops[i].offset);
- for (byte_idx = 0; byte_idx < input_ops[i].length;
+ for (byte_idx = 0; byte_idx < rte_pktmbuf_data_len(m);
++byte_idx)
llr[byte_idx] = round((double)max_llr_modulus *
llr[byte_idx] / INT8_MAX);
@@ -864,15 +896,18 @@ typedef int (test_case_function)(struct active_device *ad,
uint8_t i;
struct rte_mbuf *m = op->data;
uint8_t nb_dst_segments = orig_op->nb_segments;
+ uint32_t total_data_size = 0;
TEST_ASSERT(nb_dst_segments == m->nb_segs,
"Number of segments differ in original (%u) and filled (%u) op",
nb_dst_segments, m->nb_segs);
+ /* Validate each mbuf segment length */
for (i = 0; i < nb_dst_segments; ++i) {
/* Apply offset to the first mbuf segment */
uint16_t offset = (i == 0) ? op->offset : 0;
- uint16_t data_len = m->data_len - offset;
+ uint16_t data_len = rte_pktmbuf_data_len(m) - offset;
+ total_data_size += orig_op->segments[i].length;
TEST_ASSERT(orig_op->segments[i].length == data_len,
"Length of segment differ in original (%u) and filled (%u) op",
@@ -884,6 +919,12 @@ typedef int (test_case_function)(struct active_device *ad,
m = m->next;
}
+ /* Validate total mbuf pkt length */
+ uint32_t pkt_len = rte_pktmbuf_pkt_len(op->data) - op->offset;
+ TEST_ASSERT(total_data_size == pkt_len,
+ "Length of data differ in original (%u) and filled (%u) op",
+ total_data_size, pkt_len);
+
return TEST_SUCCESS;
}
@@ -1427,10 +1468,8 @@ typedef int (test_case_function)(struct active_device *ad,
for (i = 0; i < TEST_REPETITIONS; ++i) {
- for (j = 0; j < num_ops; ++j) {
- struct rte_bbdev_dec_op *op = ops_enq[j];
- rte_pktmbuf_reset(op->turbo_dec.hard_output.data);
- }
+ for (j = 0; j < num_ops; ++j)
+ mbuf_reset(ops_enq[j]->turbo_dec.hard_output.data);
start_time = rte_rdtsc_precise();
@@ -1529,8 +1568,7 @@ typedef int (test_case_function)(struct active_device *ad,
if (test_vector.op_type != RTE_BBDEV_OP_NONE)
for (j = 0; j < num_ops; ++j)
- rte_pktmbuf_reset(
- ops_enq[j]->turbo_enc.output.data);
+ mbuf_reset(ops_enq[j]->turbo_enc.output.data);
start_time = rte_rdtsc_precise();
@@ -2025,7 +2063,7 @@ typedef int (test_case_function)(struct active_device *ad,
time_st->enq_acc_total_time += stats.acc_offload_cycles;
/* ensure enqueue has been completed */
- rte_delay_ms(10);
+ rte_delay_us(200);
/* Start time meas for dequeue function offload latency */
deq_start_time = rte_rdtsc_precise();
@@ -2106,7 +2144,7 @@ typedef int (test_case_function)(struct active_device *ad,
time_st->enq_acc_total_time += stats.acc_offload_cycles;
/* ensure enqueue has been completed */
- rte_delay_ms(10);
+ rte_delay_us(200);
/* Start time meas for dequeue function offload latency */
deq_start_time = rte_rdtsc_precise();
diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index 57f6ba1..19fbb55 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -83,6 +83,18 @@ struct turbo_sw_queue {
enum rte_bbdev_op_type type;
} __rte_cache_aligned;
+static inline char *
+mbuf_append(struct rte_mbuf *m_head, struct rte_mbuf *m, uint16_t len)
+{
+ if (unlikely(len > rte_pktmbuf_tailroom(m)))
+ return NULL;
+
+ char *tail = (char *)m->buf_addr + m->data_off + m->data_len;
+ m->data_len = (uint16_t)(m->data_len + len);
+ m_head->pkt_len = (m_head->pkt_len + len);
+ return tail;
+}
+
/* Calculate index based on Table 5.1.3-3 from TS34.212 */
static inline int32_t
compute_idx(uint16_t k)
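
Unlike rte_pktmbuf_append(), which always writes at the tail of the
whole chain, mbuf_append() lets the caller pick the segment to write
into while the packet length stays accounted on the head. A sketch of
filling two segments of an already-built chain in turn (lengths are
hypothetical):

	char *p0 = mbuf_append(head, head, len0);
	char *p1 = mbuf_append(head, head->next, len1);
	/* each call grows only its segment's data_len, but both
	 * grow head->pkt_len; NULL means no tailroom in that segment
	 */

This is what allows the driver to emit each code block into the correct
output segment of a chained mbuf.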
@@ -437,7 +449,7 @@ struct turbo_sw_queue {
return -1;
}
- if (in_length - kw < 0) {
+ if (in_length < kw) {
rte_bbdev_log(ERR,
"Mismatch between input length (%u) and kw (%u)",
in_length, kw);
@@ -456,9 +468,9 @@ struct turbo_sw_queue {
static inline void
process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
uint8_t r, uint8_t c, uint16_t k, uint16_t ncb,
- uint32_t e, struct rte_mbuf *m_in, struct rte_mbuf *m_out,
- uint16_t in_offset, uint16_t out_offset, uint16_t total_left,
- struct rte_bbdev_stats *q_stats)
+ uint32_t e, struct rte_mbuf *m_in, struct rte_mbuf *m_out_head,
+ struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset,
+ uint16_t in_length, struct rte_bbdev_stats *q_stats)
{
int ret;
int16_t k_idx;
@@ -484,7 +496,7 @@ struct turbo_sw_queue {
/* CRC24A (for TB) */
if ((enc->op_flags & RTE_BBDEV_TURBO_CRC_24A_ATTACH) &&
(enc->code_block_mode == 1)) {
- ret = is_enc_input_valid(k - 24, k_idx, total_left);
+ ret = is_enc_input_valid(k - 24, k_idx, in_length);
if (ret != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
return;
@@ -494,7 +506,7 @@ struct turbo_sw_queue {
/* Check if there is a room for CRC bits if not use
* the temporary buffer.
*/
- if (rte_pktmbuf_append(m_in, 3) == NULL) {
+ if (mbuf_append(m_in, m_in, 3) == NULL) {
rte_memcpy(q->enc_in, in, (k - 24) >> 3);
in = q->enc_in;
} else {
@@ -517,7 +529,7 @@ struct turbo_sw_queue {
#endif
} else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) {
/* CRC24B */
- ret = is_enc_input_valid(k - 24, k_idx, total_left);
+ ret = is_enc_input_valid(k - 24, k_idx, in_length);
if (ret != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
return;
@@ -527,7 +539,7 @@ struct turbo_sw_queue {
/* Check if there is a room for CRC bits if this is the last
* CB in TB. If not use temporary buffer.
*/
- if ((c - r == 1) && (rte_pktmbuf_append(m_in, 3) == NULL)) {
+ if ((c - r == 1) && (mbuf_append(m_in, m_in, 3) == NULL)) {
rte_memcpy(q->enc_in, in, (k - 24) >> 3);
in = q->enc_in;
} else if (c - r > 1) {
@@ -549,7 +561,7 @@ struct turbo_sw_queue {
q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
#endif
} else {
- ret = is_enc_input_valid(k, k_idx, total_left);
+ ret = is_enc_input_valid(k, k_idx, in_length);
if (ret != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
return;
@@ -570,7 +582,8 @@ struct turbo_sw_queue {
out1 = RTE_PTR_ADD(out0, (k >> 3) + 1);
out2 = RTE_PTR_ADD(out1, (k >> 3) + 1);
} else {
- out0 = (uint8_t *)rte_pktmbuf_append(m_out, (k >> 3) * 3 + 2);
+ out0 = (uint8_t *)mbuf_append(m_out_head, m_out,
+ (k >> 3) * 3 + 2);
if (out0 == NULL) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
rte_bbdev_log(ERR,
@@ -623,7 +636,7 @@ struct turbo_sw_queue {
const uint8_t mask_out[] = {0xFF, 0xC0, 0xF0, 0xFC};
/* get output data starting address */
- rm_out = (uint8_t *)rte_pktmbuf_append(m_out, out_len);
+ rm_out = (uint8_t *)mbuf_append(m_out_head, m_out, out_len);
if (rm_out == NULL) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
rte_bbdev_log(ERR,
@@ -725,14 +738,16 @@ struct turbo_sw_queue {
uint16_t out_offset = enc->output.offset;
struct rte_mbuf *m_in = enc->input.data;
struct rte_mbuf *m_out = enc->output.data;
- uint16_t total_left = enc->input.length;
+ struct rte_mbuf *m_out_head = enc->output.data;
+ uint32_t in_length, mbuf_total_left = enc->input.length;
+ uint16_t seg_total_left;
/* Clear op status */
op->status = 0;
- if (total_left > RTE_BBDEV_MAX_TB_SIZE >> 3) {
+ if (mbuf_total_left > RTE_BBDEV_MAX_TB_SIZE >> 3) {
rte_bbdev_log(ERR, "TB size (%u) is too big, max: %d",
- total_left, RTE_BBDEV_MAX_TB_SIZE);
+ mbuf_total_left, RTE_BBDEV_MAX_TB_SIZE);
op->status = 1 << RTE_BBDEV_DATA_ERROR;
return;
}
@@ -755,7 +770,10 @@ struct turbo_sw_queue {
r = 0;
}
- while (total_left > 0 && r < c) {
+ while (mbuf_total_left > 0 && r < c) {
+
+ seg_total_left = rte_pktmbuf_data_len(m_in) - in_offset;
+
if (enc->code_block_mode == 0) {
k = (r < enc->tb_params.c_neg) ?
enc->tb_params.k_neg : enc->tb_params.k_pos;
@@ -769,22 +787,32 @@ struct turbo_sw_queue {
e = enc->cb_params.e;
}
- process_enc_cb(q, op, r, c, k, ncb, e, m_in,
- m_out, in_offset, out_offset, total_left,
+ process_enc_cb(q, op, r, c, k, ncb, e, m_in, m_out_head,
+ m_out, in_offset, out_offset, seg_total_left,
queue_stats);
/* Update total_left */
- total_left -= (k - crc24_bits) >> 3;
+ in_length = ((k - crc24_bits) >> 3);
+ mbuf_total_left -= in_length;
/* Update offsets for next CBs (if exist) */
in_offset += (k - crc24_bits) >> 3;
if (enc->op_flags & RTE_BBDEV_TURBO_RATE_MATCH)
out_offset += e >> 3;
else
out_offset += (k >> 3) * 3 + 2;
+
+ /* Update offsets */
+ if (seg_total_left == in_length) {
+ /* Go to the next mbuf */
+ m_in = m_in->next;
+ m_out = m_out->next;
+ in_offset = 0;
+ out_offset = 0;
+ }
r++;
}
/* check if all input data was processed */
- if (total_left != 0) {
+ if (mbuf_total_left != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
rte_bbdev_log(ERR,
"Mismatch between mbuf length and included CBs sizes");
@@ -903,8 +931,9 @@ struct turbo_sw_queue {
static inline void
process_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
uint8_t c, uint16_t k, uint16_t kw, struct rte_mbuf *m_in,
- struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset,
- bool check_crc_24b, uint16_t crc24_overlap, uint16_t total_left,
+ struct rte_mbuf *m_out_head, struct rte_mbuf *m_out,
+ uint16_t in_offset, uint16_t out_offset, bool check_crc_24b,
+ uint16_t crc24_overlap, uint16_t in_length,
struct rte_bbdev_stats *q_stats)
{
int ret;
@@ -925,7 +954,7 @@ struct turbo_sw_queue {
k_idx = compute_idx(k);
- ret = is_dec_input_valid(k_idx, kw, total_left);
+ ret = is_dec_input_valid(k_idx, kw, in_length);
if (ret != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
return;
@@ -983,7 +1012,8 @@ struct turbo_sw_queue {
q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
#endif
- out = (uint8_t *)rte_pktmbuf_append(m_out, ((k - crc24_overlap) >> 3));
+ out = (uint8_t *)mbuf_append(m_out_head, m_out,
+ ((k - crc24_overlap) >> 3));
if (out == NULL) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
rte_bbdev_log(ERR, "Too little space in output mbuf");
@@ -1038,9 +1068,11 @@ struct turbo_sw_queue {
struct rte_bbdev_op_turbo_dec *dec = &op->turbo_dec;
struct rte_mbuf *m_in = dec->input.data;
struct rte_mbuf *m_out = dec->hard_output.data;
+ struct rte_mbuf *m_out_head = dec->hard_output.data;
uint16_t in_offset = dec->input.offset;
- uint16_t total_left = dec->input.length;
uint16_t out_offset = dec->hard_output.offset;
+ uint32_t mbuf_total_left = dec->input.length;
+ uint16_t seg_total_left;
/* Clear op status */
op->status = 0;
@@ -1062,11 +1094,13 @@ struct turbo_sw_queue {
RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP))
crc24_overlap = 24;
- while (total_left > 0) {
+ while (mbuf_total_left > 0) {
if (dec->code_block_mode == 0)
k = (r < dec->tb_params.c_neg) ?
dec->tb_params.k_neg : dec->tb_params.k_pos;
+ seg_total_left = rte_pktmbuf_data_len(m_in) - in_offset;
+
/* Calculates circular buffer size (Kw).
* According to 3gpp 36.212 section 5.1.4.2
* Kw = 3 * Kpi,
@@ -1079,23 +1113,32 @@ struct turbo_sw_queue {
*/
kw = RTE_ALIGN_CEIL(k + 4, RTE_BBDEV_C_SUBBLOCK) * 3;
- process_dec_cb(q, op, c, k, kw, m_in, m_out, in_offset,
- out_offset, check_bit(dec->op_flags,
+ process_dec_cb(q, op, c, k, kw, m_in, m_out_head, m_out,
+ in_offset, out_offset, check_bit(dec->op_flags,
RTE_BBDEV_TURBO_CRC_TYPE_24B), crc24_overlap,
- total_left, queue_stats);
+ seg_total_left, queue_stats);
/* To keep CRC24 attached to end of Code block, use
* RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP flag as it
* removed by default once verified.
*/
- /* Update total_left */
- total_left -= kw;
- /* Update offsets for next CBs (if exist) */
- in_offset += kw;
- out_offset += ((k - crc24_overlap) >> 3);
+ mbuf_total_left -= kw;
+
+ /* Update offsets */
+ if (seg_total_left == kw) {
+ /* Go to the next mbuf */
+ m_in = m_in->next;
+ m_out = m_out->next;
+ in_offset = 0;
+ out_offset = 0;
+ } else {
+ /* Update offsets for next CBs (if exist) */
+ in_offset += kw;
+ out_offset += ((k - crc24_overlap) >> 3);
+ }
r++;
}
- if (total_left != 0) {
+ if (mbuf_total_left != 0) {
op->status |= 1 << RTE_BBDEV_DATA_ERROR;
rte_bbdev_log(ERR,
"Mismatch between mbuf length and included Circular buffer sizes");
--
1.8.3.1
* [dpdk-dev] [PATCH v2 4/4] baseband: enhancement of interrupt test
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test Kamil Chalupnik
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 2/4] baseband: enhancement of throughput test Kamil Chalupnik
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 3/4] baseband: support bigger Transport Block Kamil Chalupnik
@ 2018-12-07 15:15 ` Kamil Chalupnik
2018-12-13 20:10 ` Mokhtar, Amr
2018-12-13 20:08 ` [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test Mokhtar, Amr
3 siblings, 1 reply; 13+ messages in thread
From: Kamil Chalupnik @ 2018-12-07 15:15 UTC (permalink / raw)
To: dev; +Cc: amr.mokhtar, akhil.goyal, Kamil Chalupnik
Improvements added to interrupt test:
- test is run in a loop (number of iterations is specified by
the TEST_REPETITIONS define), which ensures more accurate results
- mapping of cores to thread parameters was put in order.
The master core is always set at the first index. This fixes a
problem with running the test for only one core
Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
---
app/test-bbdev/test_bbdev_perf.c | 246 +++++++++++++++++++++++++--------------
1 file changed, 161 insertions(+), 85 deletions(-)
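
The loop rework hinges on a small handshake between the enqueuing lcore
and the dequeue interrupt callback: the lcore publishes the size of the
batch it has just enqueued through an atomic, then spins until the
callback has dequeued that far. A condensed sketch of the idea, using
the thread_params fields introduced below:

	/* enqueuing lcore, per batch */
	rte_atomic16_set(&tp->burst_sz, num_to_enq);  /* publish size */
	while (rte_atomic16_read(&tp->nb_dequeued) != (int16_t)enqueued)
		rte_pause();                          /* wait for cb */

	/* dequeue callback, interrupt context */
	burst_sz = rte_atomic16_read(&tp->burst_sz);
	deq = rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
			&tp->enc_ops[rte_atomic16_read(&tp->nb_dequeued)],
			burst_sz);
	rte_atomic16_add(&tp->nb_dequeued, deq);

Publishing the real batch size matters for the last batch, where the
number of operations is not necessarily a multiple of the burst size.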
diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index 5bec70d..1c4a645 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -77,13 +77,17 @@ struct test_op_params {
struct thread_params {
uint8_t dev_id;
uint16_t queue_id;
+ uint32_t lcore_id;
uint64_t start_time;
double ops_per_sec;
double mbps;
uint8_t iter_count;
rte_atomic16_t nb_dequeued;
rte_atomic16_t processing_status;
+ rte_atomic16_t burst_sz;
struct test_op_params *op_params;
+ struct rte_bbdev_dec_op *dec_ops[MAX_BURST];
+ struct rte_bbdev_enc_op *enc_ops[MAX_BURST];
};
#ifdef RTE_BBDEV_OFFLOAD_COST
@@ -1206,16 +1210,12 @@ typedef int (test_case_function)(struct active_device *ad,
uint16_t i;
uint64_t total_time;
uint16_t deq, burst_sz, num_ops;
- uint16_t queue_id = INVALID_QUEUE_ID;
- struct rte_bbdev_dec_op *dec_ops[MAX_BURST];
- struct rte_bbdev_enc_op *enc_ops[MAX_BURST];
+ uint16_t queue_id = *(uint16_t *) ret_param;
struct rte_bbdev_info info;
double tb_len_bits;
struct thread_params *tp = cb_arg;
- RTE_SET_USED(ret_param);
- queue_id = tp->queue_id;
/* Find matching thread params using queue_id */
for (i = 0; i < MAX_QUEUES; ++i, ++tp)
@@ -1235,18 +1235,19 @@ typedef int (test_case_function)(struct active_device *ad,
return;
}
- burst_sz = tp->op_params->burst_sz;
+ burst_sz = rte_atomic16_read(&tp->burst_sz);
num_ops = tp->op_params->num_to_process;
- if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
- deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id, dec_ops,
+ if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
+ deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
+ &tp->dec_ops[
+ rte_atomic16_read(&tp->nb_dequeued)],
burst_sz);
- rte_bbdev_dec_op_free_bulk(dec_ops, deq);
- } else {
- deq = rte_bbdev_dequeue_enc_ops(dev_id, queue_id, enc_ops,
+ else
+ deq = rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
+ &tp->enc_ops[
+ rte_atomic16_read(&tp->nb_dequeued)],
burst_sz);
- rte_bbdev_enc_op_free_bulk(enc_ops, deq);
- }
if (deq < burst_sz) {
printf(
@@ -1269,13 +1270,18 @@ typedef int (test_case_function)(struct active_device *ad,
if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
struct rte_bbdev_dec_op *ref_op = tp->op_params->ref_dec_op;
- ret = validate_dec_op(dec_ops, num_ops, ref_op,
+ ret = validate_dec_op(tp->dec_ops, num_ops, ref_op,
tp->op_params->vector_mask);
- rte_bbdev_dec_op_free_bulk(dec_ops, deq);
+ /* get the max of iter_count for all dequeued ops */
+ for (i = 0; i < num_ops; ++i)
+ tp->iter_count = RTE_MAX(
+ tp->dec_ops[i]->turbo_dec.iter_count,
+ tp->iter_count);
+ rte_bbdev_dec_op_free_bulk(tp->dec_ops, deq);
} else if (test_vector.op_type == RTE_BBDEV_OP_TURBO_ENC) {
struct rte_bbdev_enc_op *ref_op = tp->op_params->ref_enc_op;
- ret = validate_enc_op(enc_ops, num_ops, ref_op);
- rte_bbdev_enc_op_free_bulk(enc_ops, deq);
+ ret = validate_enc_op(tp->enc_ops, num_ops, ref_op);
+ rte_bbdev_enc_op_free_bulk(tp->enc_ops, deq);
}
if (ret) {
@@ -1299,9 +1305,9 @@ typedef int (test_case_function)(struct active_device *ad,
return;
}
- tp->ops_per_sec = ((double)num_ops) /
+ tp->ops_per_sec += ((double)num_ops) /
((double)total_time / (double)rte_get_tsc_hz());
- tp->mbps = (((double)(num_ops * tb_len_bits)) / 1000000.0) /
+ tp->mbps += (((double)(num_ops * tb_len_bits)) / 1000000.0) /
((double)total_time / (double)rte_get_tsc_hz());
rte_atomic16_add(&tp->nb_dequeued, deq);
@@ -1318,8 +1324,8 @@ typedef int (test_case_function)(struct active_device *ad,
struct rte_bbdev_dec_op *ops[num_to_process];
struct test_buffers *bufs = NULL;
struct rte_bbdev_info info;
- int ret;
- uint16_t num_to_enq;
+ int ret, i, j;
+ uint16_t num_to_enq, enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
@@ -1351,16 +1357,47 @@ typedef int (test_case_function)(struct active_device *ad,
bufs->hard_outputs, bufs->soft_outputs,
tp->op_params->ref_dec_op);
- tp->start_time = rte_rdtsc_precise();
- for (enqueued = 0; enqueued < num_to_process;) {
+ /* Set counter to validate the ordering */
+ for (j = 0; j < num_to_process; ++j)
+ ops[j]->opaque_data = (void *)(uintptr_t)j;
- num_to_enq = burst_sz;
+ for (j = 0; j < TEST_REPETITIONS; ++j) {
+ for (i = 0; i < num_to_process; ++i)
+ rte_pktmbuf_reset(ops[i]->turbo_dec.hard_output.data);
- if (unlikely(num_to_process - enqueued < num_to_enq))
- num_to_enq = num_to_process - enqueued;
+ tp->start_time = rte_rdtsc_precise();
+ for (enqueued = 0; enqueued < num_to_process;) {
+ num_to_enq = burst_sz;
+
+ if (unlikely(num_to_process - enqueued < num_to_enq))
+ num_to_enq = num_to_process - enqueued;
+
+ enq = 0;
+ do {
+ enq += rte_bbdev_enqueue_dec_ops(tp->dev_id,
+ queue_id, &ops[enqueued],
+ num_to_enq);
+ } while (unlikely(num_to_enq != enq));
+ enqueued += enq;
+
+ /* Write to thread burst_sz current number of enqueued
+ * descriptors. It ensures that proper number of
+ * descriptors will be dequeued in callback
+ * function - needed for last batch in case where
+ * the number of operations is not a multiple of
+ * burst size.
+ */
+ rte_atomic16_set(&tp->burst_sz, num_to_enq);
- enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id, queue_id,
- &ops[enqueued], num_to_enq);
+ /* Wait until processing of previous batch is
+ * completed.
+ */
+ while (rte_atomic16_read(&tp->nb_dequeued) !=
+ (int16_t) enqueued)
+ rte_pause();
+ }
+ if (j != TEST_REPETITIONS - 1)
+ rte_atomic16_clear(&tp->nb_dequeued);
}
return TEST_SUCCESS;
@@ -1377,8 +1414,8 @@ typedef int (test_case_function)(struct active_device *ad,
struct rte_bbdev_enc_op *ops[num_to_process];
struct test_buffers *bufs = NULL;
struct rte_bbdev_info info;
- int ret;
- uint16_t num_to_enq;
+ int ret, i, j;
+ uint16_t num_to_enq, enq;
TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
"BURST_SIZE should be <= %u", MAX_BURST);
@@ -1409,16 +1446,47 @@ typedef int (test_case_function)(struct active_device *ad,
copy_reference_enc_op(ops, num_to_process, 0, bufs->inputs,
bufs->hard_outputs, tp->op_params->ref_enc_op);
- tp->start_time = rte_rdtsc_precise();
- for (enqueued = 0; enqueued < num_to_process;) {
+ /* Set counter to validate the ordering */
+ for (j = 0; j < num_to_process; ++j)
+ ops[j]->opaque_data = (void *)(uintptr_t)j;
+
+ for (j = 0; j < TEST_REPETITIONS; ++j) {
+ for (i = 0; i < num_to_process; ++i)
+ rte_pktmbuf_reset(ops[i]->turbo_enc.output.data);
- num_to_enq = burst_sz;
+ tp->start_time = rte_rdtsc_precise();
+ for (enqueued = 0; enqueued < num_to_process;) {
+ num_to_enq = burst_sz;
- if (unlikely(num_to_process - enqueued < num_to_enq))
- num_to_enq = num_to_process - enqueued;
+ if (unlikely(num_to_process - enqueued < num_to_enq))
+ num_to_enq = num_to_process - enqueued;
+
+ enq = 0;
+ do {
+ enq += rte_bbdev_enqueue_enc_ops(tp->dev_id,
+ queue_id, &ops[enqueued],
+ num_to_enq);
+ } while (unlikely(enq != num_to_enq));
+ enqueued += enq;
+
+ /* Write to thread burst_sz current number of enqueued
+ * descriptors. It ensures that proper number of
+ * descriptors will be dequeued in callback
+ * function - needed for last batch in case where
+ * the number of operations is not a multiple of
+ * burst size.
+ */
+ rte_atomic16_set(&tp->burst_sz, num_to_enq);
- enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id, queue_id,
- &ops[enqueued], num_to_enq);
+ /* Wait until processing of previous batch is
+ * completed.
+ */
+ while (rte_atomic16_read(&tp->nb_dequeued) !=
+ (int16_t) enqueued)
+ rte_pause();
+ }
+ if (j != TEST_REPETITIONS - 1)
+ rte_atomic16_clear(&tp->nb_dequeued);
}
return TEST_SUCCESS;
@@ -1613,18 +1681,16 @@ typedef int (test_case_function)(struct active_device *ad,
static void
print_enc_throughput(struct thread_params *t_params, unsigned int used_cores)
{
- unsigned int lcore_id, iter = 0;
+ unsigned int iter = 0;
double total_mops = 0, total_mbps = 0;
- RTE_LCORE_FOREACH(lcore_id) {
- if (iter++ >= used_cores)
- break;
+ for (iter = 0; iter < used_cores; iter++) {
printf(
- "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps\n",
- lcore_id, t_params[lcore_id].ops_per_sec,
- t_params[lcore_id].mbps);
- total_mops += t_params[lcore_id].ops_per_sec;
- total_mbps += t_params[lcore_id].mbps;
+ "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps\n",
+ t_params[iter].lcore_id, t_params[iter].ops_per_sec,
+ t_params[iter].mbps);
+ total_mops += t_params[iter].ops_per_sec;
+ total_mbps += t_params[iter].mbps;
}
printf(
"\nTotal throughput for %u cores: %.8lg MOPS, %.8lg Mbps\n",
@@ -1634,21 +1700,18 @@ typedef int (test_case_function)(struct active_device *ad,
static void
print_dec_throughput(struct thread_params *t_params, unsigned int used_cores)
{
- unsigned int lcore_id, iter = 0;
+ unsigned int iter = 0;
double total_mops = 0, total_mbps = 0;
uint8_t iter_count = 0;
- RTE_LCORE_FOREACH(lcore_id) {
- if (iter++ >= used_cores)
- break;
+ for (iter = 0; iter < used_cores; iter++) {
printf(
- "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps @ max %u iterations\n",
- lcore_id, t_params[lcore_id].ops_per_sec,
- t_params[lcore_id].mbps,
- t_params[lcore_id].iter_count);
- total_mops += t_params[lcore_id].ops_per_sec;
- total_mbps += t_params[lcore_id].mbps;
- iter_count = RTE_MAX(iter_count, t_params[lcore_id].iter_count);
+ "Throughput for core (%u): %.8lg Ops/s, %.8lg Mbps @ max %u iterations\n",
+ t_params[iter].lcore_id, t_params[iter].ops_per_sec,
+ t_params[iter].mbps, t_params[iter].iter_count);
+ total_mops += t_params[iter].ops_per_sec;
+ total_mbps += t_params[iter].mbps;
+ iter_count = RTE_MAX(iter_count, t_params[iter].iter_count);
}
printf(
"\nTotal throughput for %u cores: %.8lg MOPS, %.8lg Mbps @ max %u iterations\n",
@@ -1665,10 +1728,9 @@ typedef int (test_case_function)(struct active_device *ad,
{
int ret;
unsigned int lcore_id, used_cores = 0;
- struct thread_params t_params[MAX_QUEUES];
+ struct thread_params *t_params, *tp;
struct rte_bbdev_info info;
lcore_function_t *throughput_function;
- struct thread_params *tp;
uint16_t num_lcores;
const char *op_type_str;
@@ -1691,6 +1753,13 @@ typedef int (test_case_function)(struct active_device *ad,
? ad->nb_queues
: op_params->num_lcores;
+ /* Allocate memory for thread parameters structure */
+ t_params = rte_zmalloc(NULL, num_lcores * sizeof(struct thread_params),
+ RTE_CACHE_LINE_SIZE);
+ TEST_ASSERT_NOT_NULL(t_params, "Failed to alloc %zuB for t_params",
+ RTE_ALIGN(sizeof(struct thread_params) * num_lcores,
+ RTE_CACHE_LINE_SIZE));
+
if (intr_enabled) {
if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
throughput_function = throughput_intr_lcore_dec;
@@ -1700,9 +1769,11 @@ typedef int (test_case_function)(struct active_device *ad,
/* Dequeue interrupt callback registration */
ret = rte_bbdev_callback_register(ad->dev_id,
RTE_BBDEV_EVENT_DEQUEUE, dequeue_event_callback,
- &t_params);
- if (ret < 0)
+ t_params);
+ if (ret < 0) {
+ rte_free(t_params);
return ret;
+ }
} else {
if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
throughput_function = throughput_pmd_lcore_dec;
@@ -1712,38 +1783,39 @@ typedef int (test_case_function)(struct active_device *ad,
rte_atomic16_set(&op_params->sync, SYNC_WAIT);
- t_params[rte_lcore_id()].dev_id = ad->dev_id;
- t_params[rte_lcore_id()].op_params = op_params;
- t_params[rte_lcore_id()].queue_id =
- ad->queue_ids[used_cores++];
+ /* Master core is set at first entry */
+ t_params[0].dev_id = ad->dev_id;
+ t_params[0].lcore_id = rte_lcore_id();
+ t_params[0].op_params = op_params;
+ t_params[0].queue_id = ad->queue_ids[used_cores++];
+ t_params[0].iter_count = 0;
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
if (used_cores >= num_lcores)
break;
- t_params[lcore_id].dev_id = ad->dev_id;
- t_params[lcore_id].op_params = op_params;
- t_params[lcore_id].queue_id = ad->queue_ids[used_cores++];
+ t_params[used_cores].dev_id = ad->dev_id;
+ t_params[used_cores].lcore_id = lcore_id;
+ t_params[used_cores].op_params = op_params;
+ t_params[used_cores].queue_id = ad->queue_ids[used_cores];
+ t_params[used_cores].iter_count = 0;
- rte_eal_remote_launch(throughput_function, &t_params[lcore_id],
- lcore_id);
+ rte_eal_remote_launch(throughput_function,
+ &t_params[used_cores++], lcore_id);
}
rte_atomic16_set(&op_params->sync, SYNC_START);
- ret = throughput_function(&t_params[rte_lcore_id()]);
+ ret = throughput_function(&t_params[0]);
/* Master core is always used */
- used_cores = 1;
- RTE_LCORE_FOREACH_SLAVE(lcore_id) {
- if (used_cores++ >= num_lcores)
- break;
-
- ret |= rte_eal_wait_lcore(lcore_id);
- }
+ for (used_cores = 1; used_cores < num_lcores; used_cores++)
+ ret |= rte_eal_wait_lcore(t_params[used_cores].lcore_id);
/* Return if test failed */
- if (ret)
+ if (ret) {
+ rte_free(t_params);
return ret;
+ }
/* Print throughput if interrupts are disabled and test passed */
if (!intr_enabled) {
@@ -1751,6 +1823,7 @@ typedef int (test_case_function)(struct active_device *ad,
print_dec_throughput(t_params, num_lcores);
else
print_enc_throughput(t_params, num_lcores);
+ rte_free(t_params);
return ret;
}
@@ -1759,21 +1832,20 @@ typedef int (test_case_function)(struct active_device *ad,
* error using processing_status variable.
* Wait for master lcore operations.
*/
- tp = &t_params[rte_lcore_id()];
+ tp = &t_params[0];
while ((rte_atomic16_read(&tp->nb_dequeued) <
op_params->num_to_process) &&
(rte_atomic16_read(&tp->processing_status) !=
TEST_FAILED))
rte_pause();
+ tp->ops_per_sec /= TEST_REPETITIONS;
+ tp->mbps /= TEST_REPETITIONS;
ret |= rte_atomic16_read(&tp->processing_status);
/* Wait for slave lcores operations */
- used_cores = 1;
- RTE_LCORE_FOREACH_SLAVE(lcore_id) {
- tp = &t_params[lcore_id];
- if (used_cores++ >= num_lcores)
- break;
+ for (used_cores = 1; used_cores < num_lcores; used_cores++) {
+ tp = &t_params[used_cores];
while ((rte_atomic16_read(&tp->nb_dequeued) <
op_params->num_to_process) &&
@@ -1781,6 +1853,8 @@ typedef int (test_case_function)(struct active_device *ad,
TEST_FAILED))
rte_pause();
+ tp->ops_per_sec /= TEST_REPETITIONS;
+ tp->mbps /= TEST_REPETITIONS;
ret |= rte_atomic16_read(&tp->processing_status);
}
@@ -1791,6 +1865,8 @@ typedef int (test_case_function)(struct active_device *ad,
else if (test_vector.op_type == RTE_BBDEV_OP_TURBO_ENC)
print_enc_throughput(t_params, num_lcores);
}
+
+ rte_free(t_params);
return ret;
}
--
1.8.3.1
* Re: [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test Kamil Chalupnik
` (2 preceding siblings ...)
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 4/4] baseband: enhancement of interrupt test Kamil Chalupnik
@ 2018-12-13 20:08 ` Mokhtar, Amr
2018-12-18 10:25 ` Akhil Goyal
3 siblings, 1 reply; 13+ messages in thread
From: Mokhtar, Amr @ 2018-12-13 20:08 UTC (permalink / raw)
To: Chalupnik, KamilX, dev; +Cc: akhil.goyal
> -----Original Message-----
> From: Chalupnik, KamilX
> Sent: Friday 7 December 2018 15:16
> To: dev@dpdk.org
> Cc: Mokhtar, Amr <amr.mokhtar@intel.com>; akhil.goyal@nxp.com;
> Chalupnik, KamilX <kamilx.chalupnik@intel.com>
> Subject: [PATCH v2 1/4] baseband: enhancement of offload cost test
>
> Offload cost test was improved in order to collect
> more accurate results.
>
> Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
> ---
Acked-by: Amr Mokhtar <amr.mokhtar@intel.com>
* Re: [dpdk-dev] [PATCH v2 2/4] baseband: enhancement of throughput test
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 2/4] baseband: enhancement of throughput test Kamil Chalupnik
@ 2018-12-13 20:09 ` Mokhtar, Amr
0 siblings, 0 replies; 13+ messages in thread
From: Mokhtar, Amr @ 2018-12-13 20:09 UTC (permalink / raw)
To: Chalupnik, KamilX, dev; +Cc: akhil.goyal
> -----Original Message-----
> From: Chalupnik, KamilX
> Sent: Friday 7 December 2018 15:16
> To: dev@dpdk.org
> Cc: Mokhtar, Amr <amr.mokhtar@intel.com>; akhil.goyal@nxp.com;
> Chalupnik, KamilX <kamilx.chalupnik@intel.com>
> Subject: [PATCH v2 2/4] baseband: enhancement of throughput test
>
> Improvements added to throughput test:
> - test is run in a loop (number of iterations is specified by
> the TEST_REPETITIONS define), which ensures more accurate results
> - length of input data is calculated based on the number of CBs in the TB
> - maximum number of decoding iterations is gathered from results
> - added new functions responsible for printing results
> - small fixes for memory management
>
> Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
> ---
Acked-by: Amr Mokhtar <amr.mokhtar@intel.com>
* Re: [dpdk-dev] [PATCH v2 3/4] baseband: support bigger Transport Block
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 3/4] baseband: support bigger Transport Block Kamil Chalupnik
@ 2018-12-13 20:09 ` Mokhtar, Amr
0 siblings, 0 replies; 13+ messages in thread
From: Mokhtar, Amr @ 2018-12-13 20:09 UTC (permalink / raw)
To: Chalupnik, KamilX, dev; +Cc: akhil.goyal
> -----Original Message-----
> From: Chalupnik, KamilX
> Sent: Friday 7 December 2018 15:16
> To: dev@dpdk.org
> Cc: Mokhtar, Amr <amr.mokhtar@intel.com>; akhil.goyal@nxp.com;
> Chalupnik, KamilX <kamilx.chalupnik@intel.com>
> Subject: [PATCH v2 3/4] baseband: support bigger Transport Block
>
> The test application and the Turbo Software driver were adapted
> to support chained mbufs for bigger TB sizes.
>
> Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
> ---
Acked-by: Amr Mokhtar <amr.mokhtar@intel.com>
* Re: [dpdk-dev] [PATCH v2 4/4] baseband: enhancement of interrupt test
2018-12-07 15:15 ` [dpdk-dev] [PATCH v2 4/4] baseband: enhancement of interrupt test Kamil Chalupnik
@ 2018-12-13 20:10 ` Mokhtar, Amr
0 siblings, 0 replies; 13+ messages in thread
From: Mokhtar, Amr @ 2018-12-13 20:10 UTC (permalink / raw)
To: Chalupnik, KamilX, dev; +Cc: akhil.goyal
> -----Original Message-----
> From: Chalupnik, KamilX
> Sent: Friday 7 December 2018 15:16
> To: dev@dpdk.org
> Cc: Mokhtar, Amr <amr.mokhtar@intel.com>; akhil.goyal@nxp.com;
> Chalupnik, KamilX <kamilx.chalupnik@intel.com>
> Subject: [PATCH v2 4/4] baseband: enhancement of interrupt test
>
> Improvements added to interrupt test:
> - test is run in a loop (number of iterations is specified by
> the TEST_REPETITIONS define), which ensures more accurate results
> - mapping of cores to thread parameters was put in order.
> The master core is always set at the first index. This fixes a
> problem with running the test for only one core
>
> Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
> ---
Acked-by: Amr Mokhtar <amr.mokhtar@intel.com>
* Re: [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test
2018-12-13 20:08 ` [dpdk-dev] [PATCH v2 1/4] baseband: enhancement of offload cost test Mokhtar, Amr
@ 2018-12-18 10:25 ` Akhil Goyal
0 siblings, 0 replies; 13+ messages in thread
From: Akhil Goyal @ 2018-12-18 10:25 UTC (permalink / raw)
To: Mokhtar, Amr, Chalupnik, KamilX, dev
On 12/14/2018 1:38 AM, Mokhtar, Amr wrote:
>> -----Original Message-----
>> From: Chalupnik, KamilX
>> Sent: Friday 7 December 2018 15:16
>> To: dev@dpdk.org
>> Cc: Mokhtar, Amr <amr.mokhtar@intel.com>; akhil.goyal@nxp.com;
>> Chalupnik, KamilX <kamilx.chalupnik@intel.com>
>> Subject: [PATCH v2 1/4] baseband: enhancement of offload cost test
>>
>> Offload cost test was improved in order to collect
>> more accurate results.
>>
>> Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
>> ---
> Acked-by: Amr Mokhtar <amr.mokhtar@intel.com>
patchset Applied to dpdk-next-crypto
Thanks