From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga05.intel.com (mga05.intel.com [192.55.52.43]) by dpdk.org (Postfix) with ESMTP id 1BFD5AACC for ; Tue, 17 Apr 2018 16:35:41 +0200 (CEST) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga001.jf.intel.com ([10.7.209.18]) by fmsmga105.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 17 Apr 2018 07:35:40 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.48,463,1517904000"; d="scan'208";a="48585916" Received: from kchalupx-mobl.ger.corp.intel.com ([10.103.104.151]) by orsmga001.jf.intel.com with ESMTP; 17 Apr 2018 07:35:38 -0700 From: KamilX Chalupnik To: dev@dpdk.org Cc: amr.mokhtar@intel.com, KamilX Chalupnik Date: Tue, 17 Apr 2018 16:34:08 +0200 Message-Id: <20180417143408.3520-1-kamilx.chalupnik@intel.com> X-Mailer: git-send-email 2.9.0.windows.1 In-Reply-To: <20180404140602.9344-3-kamilx.chalupnik@intel.com> References: <20180404140602.9344-3-kamilx.chalupnik@intel.com> Subject: [dpdk-dev] [PATCH v2] baseband/turbo_sw: optimization of turbo software driver X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 17 Apr 2018 14:35:42 -0000 Optimization of Turbo Software driver: - resource-hungry pieces of code removed or optimized - validation of decoder/encoder parameters put under debug flag Signed-off-by: KamilX Chalupnik v2: - logging macros fixed --- drivers/baseband/turbo_sw/bbdev_turbo_software.c | 170 ++++++++++++++--------- lib/librte_bbdev/rte_bbdev_op.h | 18 ++- 2 files changed, 122 insertions(+), 66 deletions(-) diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c index 2a65d46..8c41ed5 100644 --- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c +++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c @@ -33,12 +33,6 @@ static 
int bbdev_turbo_sw_logtype; rte_bbdev_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \ ##__VA_ARGS__) -/* Number of columns in sub-block interleaver (36.212, section 5.1.4.1.1) */ -#define C_SUBBLOCK (32) -#define MAX_TB_SIZE (391656) -#define MAX_CB_SIZE (6144) -#define MAX_KW (18528) - /* private data structure */ struct bbdev_private { unsigned int max_nb_queues; /**< Max number of queues */ @@ -91,7 +85,7 @@ compute_idx(uint16_t k) { int32_t result = 0; - if (k < 40 || k > MAX_CB_SIZE) + if (k < RTE_BBDEV_MIN_CB_SIZE || k > RTE_BBDEV_MAX_CB_SIZE) return -1; if (k > 2048) { @@ -235,7 +229,8 @@ q_setup(struct rte_bbdev *dev, uint16_t q_id, return -ENAMETOOLONG; } q->enc_out = rte_zmalloc_socket(name, - ((MAX_TB_SIZE >> 3) + 3) * sizeof(*q->enc_out) * 3, + ((RTE_BBDEV_MAX_TB_SIZE >> 3) + 3) * + sizeof(*q->enc_out) * 3, RTE_CACHE_LINE_SIZE, queue_conf->socket); if (q->enc_out == NULL) { rte_bbdev_log(ERR, @@ -254,7 +249,7 @@ q_setup(struct rte_bbdev *dev, uint16_t q_id, return -ENAMETOOLONG; } q->enc_in = rte_zmalloc_socket(name, - (MAX_CB_SIZE >> 3) * sizeof(*q->enc_in), + (RTE_BBDEV_MAX_CB_SIZE >> 3) * sizeof(*q->enc_in), RTE_CACHE_LINE_SIZE, queue_conf->socket); if (q->enc_in == NULL) { rte_bbdev_log(ERR, @@ -272,7 +267,7 @@ q_setup(struct rte_bbdev *dev, uint16_t q_id, return -ENAMETOOLONG; } q->ag = rte_zmalloc_socket(name, - MAX_CB_SIZE * 10 * sizeof(*q->ag), + RTE_BBDEV_MAX_CB_SIZE * 10 * sizeof(*q->ag), RTE_CACHE_LINE_SIZE, queue_conf->socket); if (q->ag == NULL) { rte_bbdev_log(ERR, @@ -309,7 +304,7 @@ q_setup(struct rte_bbdev *dev, uint16_t q_id, return -ENAMETOOLONG; } q->deint_input = rte_zmalloc_socket(name, - MAX_KW * sizeof(*q->deint_input), + RTE_BBDEV_MAX_KW * sizeof(*q->deint_input), RTE_CACHE_LINE_SIZE, queue_conf->socket); if (q->deint_input == NULL) { rte_bbdev_log(ERR, @@ -328,7 +323,7 @@ q_setup(struct rte_bbdev *dev, uint16_t q_id, return -ENAMETOOLONG; } q->deint_output = rte_zmalloc_socket(NULL, - MAX_KW * sizeof(*q->deint_output), + 
RTE_BBDEV_MAX_KW * sizeof(*q->deint_output), RTE_CACHE_LINE_SIZE, queue_conf->socket); if (q->deint_output == NULL) { rte_bbdev_log(ERR, @@ -347,7 +342,7 @@ q_setup(struct rte_bbdev *dev, uint16_t q_id, return -ENAMETOOLONG; } q->adapter_output = rte_zmalloc_socket(NULL, - MAX_CB_SIZE * 6 * sizeof(*q->adapter_output), + RTE_BBDEV_MAX_CB_SIZE * 6 * sizeof(*q->adapter_output), RTE_CACHE_LINE_SIZE, queue_conf->socket); if (q->adapter_output == NULL) { rte_bbdev_log(ERR, @@ -396,6 +391,7 @@ static const struct rte_bbdev_ops pmd_ops = { .queue_release = q_release }; +#ifdef RTE_LIBRTE_BBDEV_DEBUG /* Checks if the encoder input buffer is correct. * Returns 0 if it's valid, -1 otherwise. */ @@ -415,15 +411,17 @@ is_enc_input_valid(const uint16_t k, const int32_t k_idx, return -1; } - if (k > MAX_CB_SIZE) { + if (k > RTE_BBDEV_MAX_CB_SIZE) { rte_bbdev_log(ERR, "CB size (%u) is too big, max: %d", - k, MAX_CB_SIZE); + k, RTE_BBDEV_MAX_CB_SIZE); return -1; } return 0; } +#endif +#ifdef RTE_LIBRTE_BBDEV_DEBUG /* Checks if the decoder input buffer is correct. * Returns 0 if it's valid, -1 otherwise. 
*/ @@ -442,26 +440,32 @@ is_dec_input_valid(int32_t k_idx, int16_t kw, int16_t in_length) return -1; } - if (kw > MAX_KW) { + if (kw > RTE_BBDEV_MAX_KW) { rte_bbdev_log(ERR, "Input length (%u) is too big, max: %d", - kw, MAX_KW); + kw, RTE_BBDEV_MAX_KW); return -1; } return 0; } +#endif static inline void process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op, - uint8_t cb_idx, uint8_t c, uint16_t k, uint16_t ncb, + uint8_t r, uint8_t c, uint16_t k, uint16_t ncb, uint32_t e, struct rte_mbuf *m_in, struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset, uint16_t total_left, struct rte_bbdev_stats *q_stats) { +#ifdef RTE_LIBRTE_BBDEV_DEBUG int ret; +#else + RTE_SET_USED(total_left); +#endif int16_t k_idx; uint16_t m; uint8_t *in, *out0, *out1, *out2, *tmp_out, *rm_out; + uint64_t first_3_bytes = 0; struct rte_bbdev_op_turbo_enc *enc = &op->turbo_enc; struct bblib_crc_request crc_req; struct bblib_crc_response crc_resp; @@ -481,73 +485,116 @@ process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op, /* CRC24A (for TB) */ if ((enc->op_flags & RTE_BBDEV_TURBO_CRC_24A_ATTACH) && (enc->code_block_mode == 1)) { +#ifdef RTE_LIBRTE_BBDEV_DEBUG ret = is_enc_input_valid(k - 24, k_idx, total_left); if (ret != 0) { op->status |= 1 << RTE_BBDEV_DATA_ERROR; return; } - /* copy the input to the temporary buffer to be able to extend - * it by 3 CRC bytes - */ - rte_memcpy(q->enc_in, in, (k - 24) >> 3); +#endif + crc_req.data = in; crc_req.len = (k - 24) >> 3; - crc_resp.data = q->enc_in; + /* Check if there is a room for CRC bits if not use + * the temporary buffer. + */ + if (rte_pktmbuf_append(m_in, 3) == NULL) { + rte_memcpy(q->enc_in, in, (k - 24) >> 3); + in = q->enc_in; + } else { + /* Store 3 first bytes of next CB as they will be + * overwritten by CRC bytes. If it is the last CB then + * there is no point to store 3 next bytes and this + * if..else branch will be omitted. 
+ */ + first_3_bytes = *((uint64_t *)&in[(k - 32) >> 3]); + } + crc_resp.data = in; #ifdef RTE_TEST_BBDEV start_time = rte_rdtsc_precise(); #endif - bblib_lte_crc24a_gen(&crc_req, &crc_resp); - #ifdef RTE_TEST_BBDEV q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time; #endif - - in = q->enc_in; } else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) { /* CRC24B */ +#ifdef RTE_LIBRTE_BBDEV_DEBUG ret = is_enc_input_valid(k - 24, k_idx, total_left); if (ret != 0) { op->status |= 1 << RTE_BBDEV_DATA_ERROR; return; } - /* copy the input to the temporary buffer to be able to extend - * it by 3 CRC bytes - */ - rte_memcpy(q->enc_in, in, (k - 24) >> 3); +#endif + crc_req.data = in; crc_req.len = (k - 24) >> 3; - crc_resp.data = q->enc_in; + /* Check if there is a room for CRC bits if this is the last + * CB in TB. If not use temporary buffer. + */ + if ((c - r == 1) && (rte_pktmbuf_append(m_in, 3) == NULL)) { + rte_memcpy(q->enc_in, in, (k - 24) >> 3); + in = q->enc_in; + } else if (c - r > 1) { + /* Store 3 first bytes of next CB as they will be + * overwritten by CRC bytes. If it is the last CB then + * there is no point to store 3 next bytes and this + * if..else branch will be omitted. + */ + first_3_bytes = *((uint64_t *)&in[(k - 32) >> 3]); + } + crc_resp.data = in; #ifdef RTE_TEST_BBDEV start_time = rte_rdtsc_precise(); #endif - bblib_lte_crc24b_gen(&crc_req, &crc_resp); - #ifdef RTE_TEST_BBDEV q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time; #endif - - in = q->enc_in; - } else { + } +#ifdef RTE_LIBRTE_BBDEV_DEBUG + else { ret = is_enc_input_valid(k, k_idx, total_left); if (ret != 0) { op->status |= 1 << RTE_BBDEV_DATA_ERROR; return; } } +#endif /* Turbo encoder */ /* Each bit layer output from turbo encoder is (k+4) bits long, i.e. * input length + 4 tail bits. That's (k/8) + 1 bytes after rounding up. * So dst_data's length should be 3*(k/8) + 3 bytes. 
+ * In Rate-matching bypass case outputs pointers passed to encoder + * (out0, out1 and out2) can directly point to addresses of output from + * turbo_enc entity. */ - out0 = q->enc_out; - out1 = RTE_PTR_ADD(out0, (k >> 3) + 1); - out2 = RTE_PTR_ADD(out1, (k >> 3) + 1); + if (enc->op_flags & RTE_BBDEV_TURBO_RATE_MATCH) { + out0 = q->enc_out; + out1 = RTE_PTR_ADD(out0, (k >> 3) + 1); + out2 = RTE_PTR_ADD(out1, (k >> 3) + 1); + } else { + out0 = (uint8_t *)rte_pktmbuf_append(m_out, (k >> 3) * 3 + 2); + if (out0 == NULL) { + op->status |= 1 << RTE_BBDEV_DATA_ERROR; + rte_bbdev_log(ERR, + "Too little space in output mbuf"); + return; + } + enc->output.length += (k >> 3) * 3 + 2; + /* rte_bbdev_op_data.offset can be different than the + * offset of the appended bytes + */ + out0 = rte_pktmbuf_mtod_offset(m_out, uint8_t *, out_offset); + out1 = rte_pktmbuf_mtod_offset(m_out, uint8_t *, + out_offset + (k >> 3) + 1); + out2 = rte_pktmbuf_mtod_offset(m_out, uint8_t *, + out_offset + 2 * ((k >> 3) + 1)); + } turbo_req.case_id = k_idx; turbo_req.input_win = in; @@ -570,6 +617,10 @@ process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op, q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time; #endif + /* Restore 3 first bytes of next CB if they were overwritten by CRC*/ + if (first_3_bytes != 0) + *((uint64_t *)&in[(k - 32) >> 3]) = first_3_bytes; + /* Rate-matching */ if (enc->op_flags & RTE_BBDEV_TURBO_RATE_MATCH) { /* get output data starting address */ @@ -586,7 +637,7 @@ process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op, rm_out = rte_pktmbuf_mtod_offset(m_out, uint8_t *, out_offset); /* index of current code block */ - rm_req.r = cb_idx; + rm_req.r = r; /* total number of code block */ rm_req.C = c; /* For DL - 1, UL - 0 */ @@ -656,23 +707,6 @@ process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op, tmp_out++; } *tmp_out = 0; - - /* copy shifted output to turbo_enc entity */ - out0 = (uint8_t *)rte_pktmbuf_append(m_out, 
- (k >> 3) * 3 + 2); - if (out0 == NULL) { - op->status |= 1 << RTE_BBDEV_DATA_ERROR; - rte_bbdev_log(ERR, - "Too little space in output mbuf"); - return; - } - enc->output.length += (k >> 3) * 3 + 2; - /* rte_bbdev_op_data.offset can be different than the - * offset of the appended bytes - */ - out0 = rte_pktmbuf_mtod_offset(m_out, uint8_t *, - out_offset); - rte_memcpy(out0, q->enc_out, (k >> 3) * 3 + 2); } } @@ -693,9 +727,9 @@ enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op, /* Clear op status */ op->status = 0; - if (total_left > MAX_TB_SIZE >> 3) { + if (total_left > RTE_BBDEV_MAX_TB_SIZE >> 3) { rte_bbdev_log(ERR, "TB size (%u) is too big, max: %d", - total_left, MAX_TB_SIZE); + total_left, RTE_BBDEV_MAX_TB_SIZE); op->status = 1 << RTE_BBDEV_DATA_ERROR; return; } @@ -784,11 +818,11 @@ remove_nulls_from_circular_buf(const uint8_t *in, uint8_t *out, uint16_t k, const uint32_t d = k + 4; const uint32_t kw = (ncb / 3); const uint32_t nd = kw - d; - const uint32_t r_subblock = kw / C_SUBBLOCK; + const uint32_t r_subblock = kw / RTE_BBDEV_C_SUBBLOCK; /* Inter-column permutation pattern */ - const uint32_t P[C_SUBBLOCK] = {0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, - 26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, - 11, 27, 7, 23, 15, 31}; + const uint32_t P[RTE_BBDEV_C_SUBBLOCK] = {0, 16, 8, 24, 4, 20, 12, 28, + 2, 18, 10, 26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13, + 29, 3, 19, 11, 27, 7, 23, 15, 31}; in_idx = 0; out_idx = 0; @@ -869,7 +903,11 @@ process_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op, struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset, bool check_crc_24b, uint16_t total_left) { +#ifdef RTE_LIBRTE_BBDEV_DEBUG int ret; +#else + RTE_SET_USED(total_left); +#endif int32_t k_idx; int32_t iter_cnt; uint8_t *in, *out, *adapter_input; @@ -882,11 +920,13 @@ process_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op, k_idx = compute_idx(k); +#ifdef RTE_LIBRTE_BBDEV_DEBUG ret = 
is_dec_input_valid(k_idx, kw, total_left); if (ret != 0) { op->status |= 1 << RTE_BBDEV_DATA_ERROR; return; } +#endif in = rte_pktmbuf_mtod_offset(m_in, uint8_t *, in_offset); ncb = kw; @@ -1005,7 +1045,7 @@ enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op) * where D is the size of each output from turbo encoder block * (k + 4). */ - kw = RTE_ALIGN_CEIL(k + 4, C_SUBBLOCK) * 3; + kw = RTE_ALIGN_CEIL(k + 4, RTE_BBDEV_C_SUBBLOCK) * 3; process_dec_cb(q, op, c, k, kw, m_in, m_out, in_offset, out_offset, check_bit(dec->op_flags, diff --git a/lib/librte_bbdev/rte_bbdev_op.h b/lib/librte_bbdev/rte_bbdev_op.h index 9a80c64..1a80588 100644 --- a/lib/librte_bbdev/rte_bbdev_op.h +++ b/lib/librte_bbdev/rte_bbdev_op.h @@ -25,7 +25,23 @@ extern "C" { #include #include -#define RTE_BBDEV_MAX_CODE_BLOCKS 64 +/* Number of columns in sub-block interleaver (36.212, section 5.1.4.1.1) */ +#define RTE_BBDEV_C_SUBBLOCK (32) +/* Maximum size of Transport Block (36.213, Table 7.1.7.2.5-1) */ +#define RTE_BBDEV_MAX_TB_SIZE (391656) +/* Maximum size of Code Block (36.212, Table 5.1.3-3) */ +#define RTE_BBDEV_MAX_CB_SIZE (6144) +/* Minimum size of Code Block (36.212, Table 5.1.3-3) */ +#define RTE_BBDEV_MIN_CB_SIZE (40) +/* Maximum size of circular buffer */ +#define RTE_BBDEV_MAX_KW (18528) +/* + * Maximum number of Code Blocks in Transport Block. 
It is calculated based on + * maximum size of one Code Block and one Transport Block (considering CRC24A + * and CRC24B): + * (391656 + 24) / (6144 - 24) = 64 + */ +#define RTE_BBDEV_MAX_CODE_BLOCKS (64) /** Flags for turbo decoder operation and capability structure */ enum rte_bbdev_op_td_flag_bitmasks { -- 2.5.5 -------------------------------------------------------------- Intel Research and Development Ireland Limited Registered in Ireland Registered Office: Collinstown Industrial Park, Leixlip, County Kildare Registered Number: 308263 This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies.