v4: rebased on main latest v3: apologize again for typo and not double checking with check-git-log v2: typos missed in commit messages Serie updating and extending the app running the bbdev-test for the existing bbdev PMDs. Nicolas Chautru (7): app/bbdev: add explicit ut for latency vs validation app/bbdev: add explicit check for counters app/bbdev: include explict HARQ preloading app/bbdev: define wait for offload app/bbdev: skip bler ut when compression is used app/bbdev: reduce duration of throughput test app/bbdev: update offload test to dequeue full ring app/test-bbdev/main.h | 1 + app/test-bbdev/test_bbdev_perf.c | 193 ++++++++++++++++++++++++++++++--------- 2 files changed, 152 insertions(+), 42 deletions(-) -- 1.8.3.1
Adding explicit different ut when testing for validation or latency (early termination enabled or not). Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com> Acked-by: Aidan Goddard <aidan.goddard@accelercomm.com> Acked-by: Dave Burley <dave.burley@accelercomm.com> --- app/test-bbdev/test_bbdev_perf.c | 92 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 4 deletions(-) diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c index 6e5535d..3554a77 100644 --- a/app/test-bbdev/test_bbdev_perf.c +++ b/app/test-bbdev/test_bbdev_perf.c @@ -3999,12 +3999,14 @@ typedef int (test_case_function)(struct active_device *ad, return i; } +/* Test case for latency/validation for LDPC Decoder */ static int latency_test_ldpc_dec(struct rte_mempool *mempool, struct test_buffers *bufs, struct rte_bbdev_dec_op *ref_op, int vector_mask, uint16_t dev_id, uint16_t queue_id, const uint16_t num_to_process, uint16_t burst_sz, - uint64_t *total_time, uint64_t *min_time, uint64_t *max_time) + uint64_t *total_time, uint64_t *min_time, uint64_t *max_time, + bool disable_et) { int ret = TEST_SUCCESS; uint16_t i, j, dequeued; @@ -4026,7 +4028,7 @@ typedef int (test_case_function)(struct active_device *ad, "rte_bbdev_dec_op_alloc_bulk() failed"); /* For latency tests we need to disable early termination */ - if (check_bit(ref_op->ldpc_dec.op_flags, + if (disable_et && check_bit(ref_op->ldpc_dec.op_flags, RTE_BBDEV_LDPC_ITERATION_STOP_ENABLE)) ref_op->ldpc_dec.op_flags -= RTE_BBDEV_LDPC_ITERATION_STOP_ENABLE; @@ -4248,7 +4250,7 @@ typedef int (test_case_function)(struct active_device *ad, TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type); printf("+ ------------------------------------------------------- +\n"); - printf("== test: validation/latency\ndev: %s, burst size: %u, num ops: %u, op type: %s\n", + printf("== test: latency\ndev: %s, burst size: %u, num ops: %u, op type: %s\n", info.dev_name, burst_sz, num_to_process, op_type_str); if (op_type == RTE_BBDEV_OP_TURBO_DEC) @@ -4270,7 +4272,83 @@ typedef int (test_case_function)(struct active_device *ad, iter = latency_test_ldpc_dec(op_params->mp, bufs, op_params->ref_dec_op, op_params->vector_mask, ad->dev_id, queue_id, num_to_process, + burst_sz, &total_time, &min_time, &max_time, + true); + else + iter = latency_test_enc(op_params->mp, bufs, + op_params->ref_enc_op, + ad->dev_id, queue_id, + num_to_process, burst_sz, &total_time, + &min_time, &max_time); + + if (iter <= 0) + return TEST_FAILED; + + printf("Operation latency:\n" + "\tavg: %lg cycles, %lg us\n" + "\tmin: %lg cycles, %lg us\n" + "\tmax: %lg cycles, %lg us\n", + (double)total_time / (double)iter, + (double)(total_time * 1000000) / (double)iter / + (double)rte_get_tsc_hz(), (double)min_time, + (double)(min_time * 1000000) / (double)rte_get_tsc_hz(), + (double)max_time, (double)(max_time * 1000000) / + (double)rte_get_tsc_hz()); + + return TEST_SUCCESS; +} + +static int +validation_test(struct active_device *ad, + struct test_op_params *op_params) +{ + int iter; + uint16_t burst_sz = op_params->burst_sz; + const uint16_t num_to_process = op_params->num_to_process; + const enum rte_bbdev_op_type op_type = test_vector.op_type; + const uint16_t queue_id = ad->queue_ids[0]; + struct test_buffers *bufs = NULL; + struct rte_bbdev_info info; + uint64_t total_time, min_time, max_time; + const char *op_type_str; + + total_time = max_time = 0; + min_time = UINT64_MAX; + + TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST), + "BURST_SIZE should be <= %u", MAX_BURST); + + rte_bbdev_info_get(ad->dev_id, &info); + bufs = &op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id]; + + op_type_str = rte_bbdev_op_type_str(op_type); + TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type); + + printf("+ ------------------------------------------------------- +\n"); + printf("== test: validation\ndev: %s, burst size: %u, num ops: %u, op type: %s\n", + info.dev_name, burst_sz, num_to_process, op_type_str); + + if (op_type == RTE_BBDEV_OP_TURBO_DEC) + iter = latency_test_dec(op_params->mp, bufs, + op_params->ref_dec_op, op_params->vector_mask, + ad->dev_id, queue_id, num_to_process, burst_sz, &total_time, &min_time, &max_time); + else if (op_type == RTE_BBDEV_OP_TURBO_ENC) + iter = latency_test_enc(op_params->mp, bufs, + op_params->ref_enc_op, ad->dev_id, queue_id, + num_to_process, burst_sz, &total_time, + &min_time, &max_time); + else if (op_type == RTE_BBDEV_OP_LDPC_ENC) + iter = latency_test_ldpc_enc(op_params->mp, bufs, + op_params->ref_enc_op, ad->dev_id, queue_id, + num_to_process, burst_sz, &total_time, + &min_time, &max_time); + else if (op_type == RTE_BBDEV_OP_LDPC_DEC) + iter = latency_test_ldpc_dec(op_params->mp, bufs, + op_params->ref_dec_op, op_params->vector_mask, + ad->dev_id, queue_id, num_to_process, + burst_sz, &total_time, &min_time, &max_time, + false); else iter = latency_test_enc(op_params->mp, bufs, op_params->ref_enc_op, @@ -4930,6 +5008,12 @@ typedef int (test_case_function)(struct active_device *ad, } static int +validation_tc(void) +{ + return run_test_case(validation_test); +} + +static int interrupt_tc(void) { return run_test_case(throughput_test); @@ -4960,7 +5044,7 @@ typedef int (test_case_function)(struct active_device *ad, .setup = testsuite_setup, .teardown = testsuite_teardown, .unit_test_cases = { - TEST_CASE_ST(ut_setup, ut_teardown, latency_tc), + TEST_CASE_ST(ut_setup, ut_teardown, validation_tc), TEST_CASES_END() /**< NULL terminate unit test array */ } }; -- 1.8.3.1
Adding explict check in ut that the stats counters have the expect values. Was missing for coverage. Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com> Acked-by: Aidan Goddard <aidan.goddard@accelercomm.com> Acked-by: Dave Burley <dave.burley@accelercomm.com> --- app/test-bbdev/test_bbdev_perf.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c index 3554a77..b62848e 100644 --- a/app/test-bbdev/test_bbdev_perf.c +++ b/app/test-bbdev/test_bbdev_perf.c @@ -4840,6 +4840,23 @@ typedef int (test_case_function)(struct active_device *ad, (double)(time_st.deq_max_time * 1000000) / rte_get_tsc_hz()); + struct rte_bbdev_stats stats = {0}; + get_bbdev_queue_stats(ad->dev_id, queue_id, &stats); + if (op_type != RTE_BBDEV_OP_LDPC_DEC) { + TEST_ASSERT_SUCCESS(stats.enqueued_count != num_to_process, + "Mismatch in enqueue count %10"PRIu64" %d", + stats.enqueued_count, num_to_process); + TEST_ASSERT_SUCCESS(stats.dequeued_count != num_to_process, + "Mismatch in dequeue count %10"PRIu64" %d", + stats.dequeued_count, num_to_process); + } + TEST_ASSERT_SUCCESS(stats.enqueue_err_count != 0, + "Enqueue count Error %10"PRIu64"", + stats.enqueue_err_count); + TEST_ASSERT_SUCCESS(stats.dequeue_err_count != 0, + "Dequeue count Error (%10"PRIu64"", + stats.dequeue_err_count); + return TEST_SUCCESS; #endif } -- 1.8.3.1
Run preloading explictly for unit tests. Load each code block by reusing existing input op then restore for the actual test. Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com> Acked-by: Liu Tianjiao <tianjiao.liu@intel.com> --- app/test-bbdev/main.h | 1 + app/test-bbdev/test_bbdev_perf.c | 51 +++++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/app/test-bbdev/main.h b/app/test-bbdev/main.h index fb3dec8..dc10a50 100644 --- a/app/test-bbdev/main.h +++ b/app/test-bbdev/main.h @@ -17,6 +17,7 @@ #define TEST_SKIPPED 1 #define MAX_BURST 512U +#define MAX_OPS 1024U #define DEFAULT_BURST 32U #define DEFAULT_OPS 64U #define DEFAULT_ITER 6U diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c index b62848e..f30cbdb 100644 --- a/app/test-bbdev/test_bbdev_perf.c +++ b/app/test-bbdev/test_bbdev_perf.c @@ -2513,20 +2513,20 @@ typedef int (test_case_function)(struct active_device *ad, bool preload) { uint16_t j; - int ret; - uint32_t harq_offset = (uint32_t) queue_id * HARQ_INCR * 1024; - struct rte_bbdev_op_data save_hc_in, save_hc_out; - struct rte_bbdev_dec_op *ops_deq[MAX_BURST]; + int deq; + uint32_t harq_offset = (uint32_t) queue_id * HARQ_INCR * MAX_OPS; + struct rte_bbdev_op_data save_hc_in[MAX_OPS], save_hc_out[MAX_OPS]; + struct rte_bbdev_dec_op *ops_deq[MAX_OPS]; uint32_t flags = ops[0]->ldpc_dec.op_flags; bool mem_in = flags & RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_IN_ENABLE; bool hc_in = flags & RTE_BBDEV_LDPC_HQ_COMBINE_IN_ENABLE; bool mem_out = flags & RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_OUT_ENABLE; bool hc_out = flags & RTE_BBDEV_LDPC_HQ_COMBINE_OUT_ENABLE; bool h_comp = flags & RTE_BBDEV_LDPC_HARQ_6BIT_COMPRESSION; - for (j = 0; j < n; ++j) { - if ((mem_in || hc_in) && preload) { - save_hc_in = ops[j]->ldpc_dec.harq_combined_input; - save_hc_out = ops[j]->ldpc_dec.harq_combined_output; + if ((mem_in || hc_in) && preload) { + for (j = 0; j < n; ++j) { + save_hc_in[j] = ops[j]->ldpc_dec.harq_combined_input; + save_hc_out[j] = ops[j]->ldpc_dec.harq_combined_output; ops[j]->ldpc_dec.op_flags = RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_LOOPBACK + RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_OUT_ENABLE; @@ -2536,16 +2536,23 @@ typedef int (test_case_function)(struct active_device *ad, ops[j]->ldpc_dec.harq_combined_output.offset = harq_offset; ops[j]->ldpc_dec.harq_combined_input.offset = 0; - rte_bbdev_enqueue_ldpc_dec_ops(dev_id, queue_id, - &ops[j], 1); - ret = 0; - while (ret == 0) - ret = rte_bbdev_dequeue_ldpc_dec_ops( - dev_id, queue_id, &ops_deq[j], 1); + harq_offset += HARQ_INCR; + } + rte_bbdev_enqueue_ldpc_dec_ops(dev_id, queue_id, &ops[0], n); + deq = 0; + while (deq != n) + deq += rte_bbdev_dequeue_ldpc_dec_ops( + dev_id, queue_id, &ops_deq[deq], + n - deq); + /* Restore the operations */ + for (j = 0; j < n; ++j) { ops[j]->ldpc_dec.op_flags = flags; - ops[j]->ldpc_dec.harq_combined_input = save_hc_in; - ops[j]->ldpc_dec.harq_combined_output = save_hc_out; + ops[j]->ldpc_dec.harq_combined_input = save_hc_in[j]; + ops[j]->ldpc_dec.harq_combined_output = save_hc_out[j]; } + } + harq_offset = (uint32_t) queue_id * HARQ_INCR * MAX_OPS; + for (j = 0; j < n; ++j) { /* Adjust HARQ offset when we reach external DDR */ if (mem_in || hc_in) ops[j]->ldpc_dec.harq_combined_input.offset @@ -3231,11 +3238,9 @@ typedef int (test_case_function)(struct active_device *ad, mbuf_reset( ops_enq[j]->ldpc_dec.harq_combined_output.data); } - if (extDdr) { - bool preload = i == (TEST_REPETITIONS - 1); + if (extDdr) preload_harq_ddr(tp->dev_id, queue_id, ops_enq, - num_ops, preload); - } + num_ops, true); start_time = rte_rdtsc_precise(); for (enq = 0, deq = 0; enq < num_ops;) { @@ -3362,11 +3367,9 @@ typedef int (test_case_function)(struct active_device *ad, mbuf_reset( ops_enq[j]->ldpc_dec.harq_combined_output.data); } - if (extDdr) { - bool preload = i == (TEST_REPETITIONS - 1); + if (extDdr) preload_harq_ddr(tp->dev_id, queue_id, ops_enq, - num_ops, preload); - } + num_ops, true); start_time = rte_rdtsc_precise(); for (enq = 0, deq = 0; enq < num_ops;) { -- 1.8.3.1
Replacing magic number for default wait time for hw offload. Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com> Acked-by: Liu Tianjiao <tianjiao.liu@intel.com> --- app/test-bbdev/test_bbdev_perf.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c index f30cbdb..39f06db 100644 --- a/app/test-bbdev/test_bbdev_perf.c +++ b/app/test-bbdev/test_bbdev_perf.c @@ -25,6 +25,7 @@ #define MAX_QUEUES RTE_MAX_LCORE #define TEST_REPETITIONS 1000 +#define WAIT_OFFLOAD_US 1000 #ifdef RTE_LIBRTE_PMD_BBDEV_FPGA_LTE_FEC #include <fpga_lte_fec.h> @@ -4451,7 +4452,7 @@ typedef int (test_case_function)(struct active_device *ad, time_st->enq_acc_total_time += stats.acc_offload_cycles; /* give time for device to process ops */ - rte_delay_us(200); + rte_delay_us(WAIT_OFFLOAD_US); /* Start time meas for dequeue function offload latency */ deq_start_time = rte_rdtsc_precise(); @@ -4542,7 +4543,7 @@ typedef int (test_case_function)(struct active_device *ad, time_st->enq_acc_total_time += stats.acc_offload_cycles; /* give time for device to process ops */ - rte_delay_us(200); + rte_delay_us(WAIT_OFFLOAD_US); /* Start time meas for dequeue function offload latency */ deq_start_time = rte_rdtsc_precise(); @@ -4630,7 +4631,7 @@ typedef int (test_case_function)(struct active_device *ad, time_st->enq_acc_total_time += stats.acc_offload_cycles; /* give time for device to process ops */ - rte_delay_us(200); + rte_delay_us(WAIT_OFFLOAD_US); /* Start time meas for dequeue function offload latency */ deq_start_time = rte_rdtsc_precise(); @@ -4713,7 +4714,7 @@ typedef int (test_case_function)(struct active_device *ad, time_st->enq_acc_total_time += stats.acc_offload_cycles; /* give time for device to process ops */ - rte_delay_us(200); + rte_delay_us(WAIT_OFFLOAD_US); /* Start time meas for dequeue function offload latency */ deq_start_time = rte_rdtsc_precise(); -- 1.8.3.1
bler test results are not valid when LLR compression is used or for loopback scenarios. Skipping these. Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com> Acked-by: Aidan Goddard <aidan.goddard@accelercomm.com> Acked-by: Dave Burley <dave.burley@accelercomm.com> --- app/test-bbdev/test_bbdev_perf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c index 39f06db..a15ea69 100644 --- a/app/test-bbdev/test_bbdev_perf.c +++ b/app/test-bbdev/test_bbdev_perf.c @@ -3719,7 +3719,11 @@ typedef int (test_case_function)(struct active_device *ad, RTE_ALIGN(sizeof(struct thread_params) * num_lcores, RTE_CACHE_LINE_SIZE)); - if (test_vector.op_type == RTE_BBDEV_OP_LDPC_DEC) + if ((test_vector.op_type == RTE_BBDEV_OP_LDPC_DEC) && + !check_bit(test_vector.ldpc_dec.op_flags, + RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_LOOPBACK) + && !check_bit(test_vector.ldpc_dec.op_flags, + RTE_BBDEV_LDPC_LLR_COMPRESSION)) bler_function = bler_pmd_lcore_ldpc_dec; else return TEST_SKIPPED; -- 1.8.3.1
Reducing number of repetitions from 1000 to 100 to save time. Results are accurate enough with 100 loops. Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com> Acked-by: Liu Tianjiao <tianjiao.liu@intel.com> --- app/test-bbdev/test_bbdev_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c index a15ea69..b5dc536 100644 --- a/app/test-bbdev/test_bbdev_perf.c +++ b/app/test-bbdev/test_bbdev_perf.c @@ -24,7 +24,7 @@ #define GET_SOCKET(socket_id) (((socket_id) == SOCKET_ID_ANY) ? 0 : (socket_id)) #define MAX_QUEUES RTE_MAX_LCORE -#define TEST_REPETITIONS 1000 +#define TEST_REPETITIONS 100 #define WAIT_OFFLOAD_US 1000 #ifdef RTE_LIBRTE_PMD_BBDEV_FPGA_LTE_FEC -- 1.8.3.1
update offload dequeue to retrieve the full ring to be agnostic of implementation. Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com> Acked-by: Aidan Goddard <aidan.goddard@accelercomm.com> Acked-by: Dave Burley <dave.burley@accelercomm.com> --- app/test-bbdev/test_bbdev_perf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c index b5dc536..a6884c5 100644 --- a/app/test-bbdev/test_bbdev_perf.c +++ b/app/test-bbdev/test_bbdev_perf.c @@ -4463,8 +4463,8 @@ typedef int (test_case_function)(struct active_device *ad, /* Dequeue one operation */ do { deq += rte_bbdev_dequeue_dec_ops(dev_id, queue_id, - &ops_deq[deq], 1); - } while (unlikely(deq != 1)); + &ops_deq[deq], enq); + } while (unlikely(deq == 0)); deq_last_time = rte_rdtsc_precise() - deq_start_time; time_st->deq_max_time = RTE_MAX(time_st->deq_max_time, @@ -4554,8 +4554,8 @@ typedef int (test_case_function)(struct active_device *ad, /* Dequeue one operation */ do { deq += rte_bbdev_dequeue_ldpc_dec_ops(dev_id, queue_id, - &ops_deq[deq], 1); - } while (unlikely(deq != 1)); + &ops_deq[deq], enq); + } while (unlikely(deq == 0)); deq_last_time = rte_rdtsc_precise() - deq_start_time; time_st->deq_max_time = RTE_MAX(time_st->deq_max_time, @@ -4642,8 +4642,8 @@ typedef int (test_case_function)(struct active_device *ad, /* Dequeue one operation */ do { deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id, - &ops_deq[deq], 1); - } while (unlikely(deq != 1)); + &ops_deq[deq], enq); + } while (unlikely(deq == 0)); deq_last_time = rte_rdtsc_precise() - deq_start_time; time_st->deq_max_time = RTE_MAX(time_st->deq_max_time, @@ -4725,8 +4725,8 @@ typedef int (test_case_function)(struct active_device *ad, /* Dequeue one operation */ do { deq += rte_bbdev_dequeue_ldpc_enc_ops(dev_id, queue_id, - &ops_deq[deq], 1); - } while (unlikely(deq != 1)); + &ops_deq[deq], enq); + } while (unlikely(deq == 0)); deq_last_time = rte_rdtsc_precise() - deq_start_time; time_st->deq_max_time = RTE_MAX(time_st->deq_max_time, -- 1.8.3.1