Subject: Re: [dpdk-dev] [PATCH v2 5/6] app/test: test dmadev instance failure handling
From: Mattias Rönnblom
To: Bruce Richardson, dev@dpdk.org
Cc: conor.walsh@intel.com, kevin.laatz@intel.com, fengchengwen@huawei.com, jerinj@marvell.com
Date: Wed, 1 Sep 2021 21:53:54 +0200
References: <20210826183301.333442-1-bruce.richardson@intel.com> <20210901163216.120087-1-bruce.richardson@intel.com> <20210901163216.120087-6-bruce.richardson@intel.com>
In-Reply-To: <20210901163216.120087-6-bruce.richardson@intel.com>

On 2021-09-01 18:32, Bruce Richardson wrote:
> Add a series of tests to inject bad copy operations into a dmadev to
> test the error handling and reporting capabilities. Various combinations
> of errors in various positions in a burst are tested, as are errors in
> bursts with fence flag set, and multiple errors in a single burst.
>
> Signed-off-by: Bruce Richardson
> ---
>  app/test/test_dmadev.c | 427 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 427 insertions(+)
>
> diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
> index 7a808a9cba..5d7b6ddd87 100644
> --- a/app/test/test_dmadev.c
> +++ b/app/test/test_dmadev.c
> @@ -302,6 +302,414 @@ test_enqueue_copies(int dev_id, uint16_t vchan)
>          || do_multi_copies(dev_id, vchan, 0, 0, 1);
>  }
>
> +/* Failure handling test cases - global macros and variables for those tests*/
> +#define COMP_BURST_SZ 16
> +#define OPT_FENCE(idx) ((fence && idx == 8) ? RTE_DMA_OP_FLAG_FENCE : 0)
> +
> +static int
> +test_failure_in_full_burst(int dev_id, uint16_t vchan, bool fence,
> +        struct rte_mbuf **srcs, struct rte_mbuf **dsts, unsigned int fail_idx)
> +{
> +    /* Test single full batch statuses with failures */
> +    enum rte_dma_status_code status[COMP_BURST_SZ];
> +    struct rte_dmadev_stats baseline, stats;
> +    uint16_t invalid_addr_id = 0;
> +    uint16_t idx;
> +    uint16_t count, status_count;
> +    unsigned int i;
> +    bool error = 0;

error = false;

> +    int err_count = 0;
> +
> +    rte_dmadev_stats_get(dev_id, vchan, &baseline); /* get a baseline set of stats */
> +    for (i = 0; i < COMP_BURST_SZ; i++) {
> +        int id = rte_dmadev_copy(dev_id, vchan,
> +                (i == fail_idx ? 0 : (srcs[i]->buf_iova + srcs[i]->data_off)),
> +                dsts[i]->buf_iova + dsts[i]->data_off,
> +                COPY_LEN, OPT_FENCE(i));
> +        if (id < 0) {
> +            PRINT_ERR("Error with rte_dmadev_copy for buffer %u\n", i);
> +            return -1;
> +        }
> +        if (i == fail_idx)
> +            invalid_addr_id = id;
> +    }
> +    rte_dmadev_submit(dev_id, vchan);
> +    rte_dmadev_stats_get(dev_id, vchan, &stats);
> +    if (stats.submitted != baseline.submitted + COMP_BURST_SZ) {
> +        PRINT_ERR("Submitted stats value not as expected, %"PRIu64" not %"PRIu64"\n",
> +                stats.submitted, baseline.submitted + COMP_BURST_SZ);
> +        return -1;
> +    }
> +
> +    await_hw(dev_id, vchan);
> +
> +    count = rte_dmadev_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error);
> +    if (count != fail_idx) {
> +        PRINT_ERR("Error with rte_dmadev_completed for failure test. Got returned %u not %u.\n",
> +                count, fail_idx);
> +        rte_dmadev_dump(dev_id, stdout);
> +        return -1;
> +    }
> +    if (error == false) {

if (!error)

> +        PRINT_ERR("Error, missing expected failed copy, %u. has_error is not set\n",
> +                fail_idx);
> +        return -1;
> +    }
> +    if (idx != invalid_addr_id - 1) {
> +        PRINT_ERR("Error, missing expected failed copy, %u. Got last idx %u, not %u\n",
> +                fail_idx, idx, invalid_addr_id - 1);
> +        return -1;
> +    }
> +
> +    /* all checks ok, now verify calling completed() again always returns 0 */
> +    for (i = 0; i < 10; i++) {
> +        if (rte_dmadev_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error) != 0
> +                || error == false || idx != (invalid_addr_id - 1)) {
> +            PRINT_ERR("Error with follow-up completed calls for fail idx %u\n",
> +                    fail_idx);
> +            return -1;
> +        }
> +    }
> +
> +    status_count = rte_dmadev_completed_status(dev_id, vchan, COMP_BURST_SZ,
> +            &idx, status);
> +    /* some HW may stop on error and be restarted after getting error status for single value
> +     * To handle this case, if we get just one error back, wait for more completions and get
> +     * status for rest of the burst
> +     */
> +    if (status_count == 1) {
> +        await_hw(dev_id, vchan);
> +        status_count += rte_dmadev_completed_status(dev_id, vchan, COMP_BURST_SZ - 1,
> +                &idx, &status[1]);
> +    }
> +    /* check that at this point we have all status values */
> +    if (status_count != COMP_BURST_SZ - count) {
> +        PRINT_ERR("Error with completed_status calls for fail idx %u. Got %u not %u\n",
> +                fail_idx, status_count, COMP_BURST_SZ - count);
> +        return -1;
> +    }
> +    /* now verify just one failure followed by multiple successful or skipped entries */
> +    if (status[0] == RTE_DMA_STATUS_SUCCESSFUL) {
> +        PRINT_ERR("Error with status returned for fail idx %u. First status was not failure\n",
> +                fail_idx);
> +        return -1;
> +    }
> +    for (i = 1; i < status_count; i++) {
> +        /* after a failure in a burst, depending on ordering/fencing,
> +         * operations may be successful or skipped because of previous error.
> +         */
> +        if (status[i] != RTE_DMA_STATUS_SUCCESSFUL
> +                && status[i] != RTE_DMA_STATUS_NOT_ATTEMPTED) {
> +            PRINT_ERR("Error with status calls for fail idx %u. Status for job %u (of %u) is not successful\n",
> +                    fail_idx, count + i, COMP_BURST_SZ);
> +            return -1;
> +        }
> +    }
> +
> +    /* check the completed + errors stats are as expected */
> +    rte_dmadev_stats_get(dev_id, vchan, &stats);
> +    if (stats.completed != baseline.completed + COMP_BURST_SZ) {
> +        PRINT_ERR("Completed stats value not as expected, %"PRIu64" not %"PRIu64"\n",
> +                stats.completed, baseline.completed + COMP_BURST_SZ);
> +        return -1;
> +    }
> +    for (i = 0; i < status_count; i++)
> +        err_count += (status[i] != RTE_DMA_STATUS_SUCCESSFUL);
> +    if (stats.errors != baseline.errors + err_count) {
> +        PRINT_ERR("'Errors' stats value not as expected, %"PRIu64" not %"PRIu64"\n",
> +                stats.errors, baseline.errors + err_count);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static int
> +test_individual_status_query_with_failure(int dev_id, uint16_t vchan, bool fence,
> +        struct rte_mbuf **srcs, struct rte_mbuf **dsts, unsigned int fail_idx)
> +{
> +    /* Test gathering batch statuses one at a time */
> +    enum rte_dma_status_code status[COMP_BURST_SZ];
> +    uint16_t invalid_addr_id = 0;
> +    uint16_t idx;
> +    uint16_t count = 0, status_count = 0;
> +    unsigned int j;
> +    bool error = false;
> +
> +    for (j = 0; j < COMP_BURST_SZ; j++) {
> +        int id = rte_dmadev_copy(dev_id, vchan,
> +                (j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
> +                dsts[j]->buf_iova + dsts[j]->data_off,
> +                COPY_LEN, OPT_FENCE(j));
> +        if (id < 0) {
> +            PRINT_ERR("Error with rte_dmadev_copy for buffer %u\n", j);
> +            return -1;
> +        }
> +        if (j == fail_idx)
> +            invalid_addr_id = id;
> +    }
> +    rte_dmadev_submit(dev_id, vchan);
> +    await_hw(dev_id, vchan);
> +
> +    /* use regular "completed" until we hit error */
> +    while (!error) {
> +        uint16_t n = rte_dmadev_completed(dev_id, vchan, 1, &idx, &error);
> +        count += n;
> +        if (n > 1 || count >= COMP_BURST_SZ) {
> +            PRINT_ERR("Error - too many completions got\n");
> +            return -1;
> +        }
> +        if (n == 0 && !error) {
> +            PRINT_ERR("Error, unexpectedly got zero completions after %u completed\n",
> +                    count);
> +            return -1;
> +        }
> +    }
> +    if (idx != invalid_addr_id - 1) {
> +        PRINT_ERR("Error, last successful index not as expected, got %u, expected %u\n",
> +                idx, invalid_addr_id - 1);
> +        return -1;
> +    }
> +
> +    /* use completed_status until we hit end of burst */
> +    while (count + status_count < COMP_BURST_SZ) {
> +        uint16_t n = rte_dmadev_completed_status(dev_id, vchan, 1, &idx,
> +                &status[status_count]);
> +        await_hw(dev_id, vchan); /* allow delay to ensure jobs are completed */
> +        status_count += n;
> +        if (n != 1) {
> +            PRINT_ERR("Error: unexpected number of completions received, %u, not 1\n",
> +                    n);
> +            return -1;
> +        }
> +    }
> +
> +    /* check for single failure */
> +    if (status[0] == RTE_DMA_STATUS_SUCCESSFUL) {
> +        PRINT_ERR("Error, unexpected successful DMA transaction\n");
> +        return -1;
> +    }
> +    for (j = 1; j < status_count; j++) {
> +        if (status[j] != RTE_DMA_STATUS_SUCCESSFUL
> +                && status[j] != RTE_DMA_STATUS_NOT_ATTEMPTED) {
> +            PRINT_ERR("Error, unexpected DMA error reported\n");
> +            return -1;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int
> +test_single_item_status_query_with_failure(int dev_id, uint16_t vchan,
> +        struct rte_mbuf **srcs, struct rte_mbuf **dsts, unsigned int fail_idx)
> +{
> +    /* When error occurs just collect a single error using "completed_status()"
> +     * before going to back to completed() calls
> +     */
> +    enum rte_dma_status_code status;
> +    uint16_t invalid_addr_id = 0;
> +    uint16_t idx;
> +    uint16_t count, status_count, count2;
> +    unsigned int j;
> +    bool error = 0;

Same here.

> +
> +    for (j = 0; j < COMP_BURST_SZ; j++) {
> +        int id = rte_dmadev_copy(dev_id, vchan,
> +                (j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
> +                dsts[j]->buf_iova + dsts[j]->data_off,
> +                COPY_LEN, 0);
> +        if (id < 0) {
> +            PRINT_ERR("Error with rte_dmadev_copy for buffer %u\n", j);
> +            return -1;
> +        }
> +        if (j == fail_idx)
> +            invalid_addr_id = id;
> +    }
> +    rte_dmadev_submit(dev_id, vchan);
> +    await_hw(dev_id, vchan);
> +
> +    /* get up to the error point */
> +    count = rte_dmadev_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error);
> +    if (count != fail_idx) {
> +        PRINT_ERR("Error with rte_dmadev_completed for failure test. Got returned %u not %u.\n",
> +                count, fail_idx);
> +        rte_dmadev_dump(dev_id, stdout);
> +        return -1;
> +    }
> +    if (error == false) {

And here.

> +        PRINT_ERR("Error, missing expected failed copy, %u. has_error is not set\n",
> +                fail_idx);
> +        return -1;
> +    }
> +    if (idx != invalid_addr_id - 1) {
> +        PRINT_ERR("Error, missing expected failed copy, %u. Got last idx %u, not %u\n",
> +                fail_idx, idx, invalid_addr_id - 1);
> +        return -1;
> +    }
> +
> +    /* get the error code */
> +    status_count = rte_dmadev_completed_status(dev_id, vchan, 1, &idx, &status);
> +    if (status_count != 1) {
> +        PRINT_ERR("Error with completed_status calls for fail idx %u. Got %u not %u\n",
> +                fail_idx, status_count, COMP_BURST_SZ - count);
> +        return -1;
> +    }
> +    if (status == RTE_DMA_STATUS_SUCCESSFUL) {
> +        PRINT_ERR("Error with status returned for fail idx %u. First status was not failure\n",
> +                fail_idx);
> +        return -1;
> +    }
> +    /* delay in case time needed after err handled to complete other jobs */
> +    await_hw(dev_id, vchan);
> +
> +    /* get the rest of the completions without status */
> +    count2 = rte_dmadev_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error);
> +    if (error == true) {

if (error)

> +        PRINT_ERR("Error, got further errors post completed_status() call, for failure case %u.\n",
> +                fail_idx);
> +        return -1;
> +    }
> +    if (count + status_count + count2 != COMP_BURST_SZ) {
> +        PRINT_ERR("Error, incorrect number of completions received, got %u not %u\n",
> +                count + status_count + count2, COMP_BURST_SZ);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static int
> +test_multi_failure(int dev_id, uint16_t vchan, struct rte_mbuf **srcs, struct rte_mbuf **dsts,
> +        const unsigned int *fail, size_t num_fail)
> +{
> +    /* test having multiple errors in one go */
> +    enum rte_dma_status_code status[COMP_BURST_SZ];
> +    unsigned int i, j;
> +    uint16_t count, err_count = 0;
> +    bool error = 0;

false

> +
> +    /* enqueue and gather completions in one go */
> +    for (j = 0; j < COMP_BURST_SZ; j++) {
> +        uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
> +        /* set up for failure if the current index is anywhere is the fails array */
> +        for (i = 0; i < num_fail; i++)
> +            if (j == fail[i])
> +                src = 0;
> +
> +        int id = rte_dmadev_copy(dev_id, vchan,
> +                src, dsts[j]->buf_iova + dsts[j]->data_off,
> +                COPY_LEN, 0);
> +        if (id < 0) {
> +            PRINT_ERR("Error with rte_dmadev_copy for buffer %u\n", j);
> +            return -1;
> +        }
> +    }
> +    rte_dmadev_submit(dev_id, vchan);
> +    await_hw(dev_id, vchan);
> +
> +    count = rte_dmadev_completed_status(dev_id, vchan, COMP_BURST_SZ, NULL, status);
> +    while (count < COMP_BURST_SZ) {
> +        await_hw(dev_id, vchan);
> +
> +        uint16_t ret = rte_dmadev_completed_status(dev_id, vchan, COMP_BURST_SZ - count,
> +                NULL, &status[count]);
> +        if (ret == 0) {
> +            PRINT_ERR("Error getting all completions for jobs. Got %u of %u\n",
> +                    count, COMP_BURST_SZ);
> +            return -1;
> +        }
> +        count += ret;
> +    }
> +    for (i = 0; i < count; i++) {
> +        if (status[i] != RTE_DMA_STATUS_SUCCESSFUL)
> +            err_count++;
> +    }

Remove {} around the loop?

> +    if (err_count != num_fail) {
> +        PRINT_ERR("Error: Invalid number of failed completions returned, %u; expected %zu\n",
> +                err_count, num_fail);
> +        return -1;
> +    }
> +
> +    /* enqueue and gather completions in bursts, but getting errors one at a time */
> +    for (j = 0; j < COMP_BURST_SZ; j++) {
> +        uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
> +        /* set up for failure if the current index is anywhere is the fails array */
> +        for (i = 0; i < num_fail; i++)
> +            if (j == fail[i])
> +                src = 0;
> +
> +        int id = rte_dmadev_copy(dev_id, vchan,
> +                src, dsts[j]->buf_iova + dsts[j]->data_off,
> +                COPY_LEN, 0);
> +        if (id < 0) {
> +            PRINT_ERR("Error with rte_dmadev_copy for buffer %u\n", j);
> +            return -1;
> +        }
> +    }
> +    rte_dmadev_submit(dev_id, vchan);
> +    await_hw(dev_id, vchan);
> +
> +    count = 0;
> +    err_count = 0;
> +    while (count + err_count < COMP_BURST_SZ) {
> +        count += rte_dmadev_completed(dev_id, vchan, COMP_BURST_SZ, NULL, &error);
> +        if (error) {
> +            uint16_t ret = rte_dmadev_completed_status(dev_id, vchan, 1,
> +                    NULL, status);
> +            if (ret != 1) {
> +                PRINT_ERR("Error getting error-status for completions\n");
> +                return -1;
> +            }
> +            err_count += ret;
> +            await_hw(dev_id, vchan);
> +        }
> +    }
> +    if (err_count != num_fail) {
> +        PRINT_ERR("Error: Incorrect number of failed completions received, got %u not %zu\n",
> +                err_count, num_fail);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static int
> +test_completion_status(int dev_id, uint16_t vchan, bool fence)
> +{
> +    const unsigned int fail[] = {0, 7, 14, 15};
> +    struct rte_mbuf *srcs[COMP_BURST_SZ], *dsts[COMP_BURST_SZ];
> +    unsigned int i;
> +
> +    for (i = 0; i < COMP_BURST_SZ; i++) {
> +        srcs[i] = rte_pktmbuf_alloc(pool);
> +        dsts[i] = rte_pktmbuf_alloc(pool);
> +    }
> +
> +    for (i = 0; i < RTE_DIM(fail); i++) {
> +        if (test_failure_in_full_burst(dev_id, vchan, fence, srcs, dsts, fail[i]) < 0)
> +            return -1;
> +
> +        if (test_individual_status_query_with_failure(dev_id, vchan, fence,
> +                srcs, dsts, fail[i]) < 0)
> +            return -1;
> +
> +        /* test is run the same fenced, or unfenced, but no harm in running it twice */
> +        if (test_single_item_status_query_with_failure(dev_id, vchan,
> +                srcs, dsts, fail[i]) < 0)
> +            return -1;
> +    }
> +
> +    if (test_multi_failure(dev_id, vchan, srcs, dsts, fail, RTE_DIM(fail)) < 0)
> +        return -1;
> +
> +    for (i = 0; i < COMP_BURST_SZ; i++) {
> +        rte_pktmbuf_free(srcs[i]);
> +        rte_pktmbuf_free(dsts[i]);
> +    }
> +    return 0;
> +}
> +
>  static int
>  test_dmadev_instance(uint16_t dev_id)
>  {
> @@ -386,6 +794,25 @@ test_dmadev_instance(uint16_t dev_id)
>      if (check_stats(&stats, true) < 0)
>          goto err;
>
> +    /* to test error handling we can provide null pointers for source or dest in copies. This
> +     * requires VA mode in DPDK, since NULL(0) is a valid physical address.
> +     */
> +    if (rte_eal_iova_mode() == RTE_IOVA_VA) {
> +        rte_dmadev_stats_reset(dev_id, vchan);
> +        printf("DMA Dev: %u, Running Completion Handling Tests (errors expected)\n",
> +                dev_id);
> +        if (test_completion_status(dev_id, vchan, false) != 0) /* without fences */
> +            goto err;
> +        if (test_completion_status(dev_id, vchan, true) != 0) /* with fences */
> +            goto err;
> +        rte_dmadev_stats_get(dev_id, 0, &stats);
> +        printf("Ops submitted: %"PRIu64"\t", stats.submitted);
> +        printf("Ops completed: %"PRIu64"\t", stats.completed);
> +        printf("Errors: %"PRIu64"\n", stats.errors);
> +        if (check_stats(&stats, false) < 0) /* don't check stats.errors this time */
> +            goto err;
> +    }
> +
>      rte_mempool_free(pool);
>      rte_dmadev_stop(dev_id);
>      rte_dmadev_stats_reset(dev_id, vchan);
>
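
To be concrete about the nits above, something along these lines is what I had in
mind (just a sketch against this patch, using the same PRINT_ERR()/status code as
in the functions above; not compiled):

    bool error = false; /* initialize a C99 bool with a boolean literal, not 0 */

    /* ... */

    /* a single-statement loop body doesn't need braces */
    for (i = 0; i < count; i++)
        if (status[i] != RTE_DMA_STATUS_SUCCESSFUL)
            err_count++;

    /* test the flag directly instead of comparing against false/true */
    if (!error) {
        PRINT_ERR("Error, missing expected failed copy, %u. has_error is not set\n",
                fail_idx);
        return -1;
    }

Only style issues; the test logic itself reads fine to me.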