DPDK patches and discussions
 help / color / mirror / Atom feed
From: Bruce Richardson <bruce.richardson@intel.com>
To: Alex Kiselev <alex@therouter.net>
Cc: "dev@dpdk.org" <dev@dpdk.org>
Subject: Re: [dpdk-dev] [PATCH v2] librte_lpm: Improve performance of the delete and add functions
Date: Mon, 9 Jul 2018 12:24:40 +0100	[thread overview]
Message-ID: <20180709112439.GA18644@bricha3-MOBL.ger.corp.intel.com> (raw)
In-Reply-To: <c6068a65-bee2-4f34-944a-6cd46ac6a188@orsmsx104.amr.corp.intel.com>

On Mon, Jul 02, 2018 at 07:42:11PM +0300, Alex Kiselev wrote:
> There are two major problems with the library:
> first, there is no need to rebuild the whole LPM tree
> when a rule is deleted and second, due to the current
> rules algorithm with complexity O(n) it's almost
> impossible to deal with large rule sets (50k or so rules).
> This patch addresses those two issues.
> 
> Signed-off-by: Alex Kiselev <alex@therouter.net>
> ---
>  lib/librte_lpm/rte_lpm6.c | 1073 ++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 816 insertions(+), 257 deletions(-)
> 

<snip code block previously commented on>

> @@ -806,156 +1188,333 @@ MAP_STATIC_SYMBOL(int rte_lpm6_is_rule_present(struct rte_lpm6 *lpm,
>  /*
>   * Delete a rule from the rule table.
>   * NOTE: Valid range for depth parameter is 1 .. 128 inclusive.
> + * return
> + *		0 if successful delete
> + *   <0 if failure

whitespace in comment.

>   */
> -static inline void
> -rule_delete(struct rte_lpm6 *lpm, int32_t rule_index)
> +static inline int
> +rule_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth)
>  {
> -	/*
> -	 * Overwrite redundant rule with last rule in group and decrement rule
> -	 * counter.
> -	 */
> -	lpm->rules_tbl[rule_index] = lpm->rules_tbl[lpm->used_rules-1];
> -	lpm->used_rules--;
> +	/* init a rule key */
> +	struct rte_lpm6_rule_key rule_key;
> +	rule_key_init(&rule_key, ip, depth);
> +
> +	/* Look for a rule */
> +	struct rte_lpm6_rule	*rule;

nit: whitespace

> +	int ret = rte_hash_lookup_data(lpm->rules_tbl, (void *) &rule_key,
> +		(void **) &rule);
> +	if (ret >= 0) {
> +		/* delete the rule */
> +		rte_hash_del_key(lpm->rules_tbl, (void *) &rule_key);
> +		lpm->used_rules--;
> +		rte_mempool_put(lpm->rules_pool, rule);
> +	}

Rather than doing a lookup and then delete, why not just try the delete
straight off. If you want to check for the key not being present, it can be
detected from the output of the delete call. From rte_hash.h:

 * @return
 *   - -EINVAL if the parameters are invalid.
 *   - -ENOENT if the key is not found.


> +
> +	return ret;
>  }
>  
>  /*
> - * Deletes a rule
> + * Deletes a group of rules

Include a comment that this bulk function will rebuild the lpm table,
rather than doing incremental updates like the regular delete function.

>   */
>  int
> -rte_lpm6_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth)
> +rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm,
> +		uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths,
> +		unsigned n)
>  {
> -	int32_t rule_to_delete_index;
> -	uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE];
> -	unsigned i;
> -
> -	/*
> -	 * Check input arguments.
> -	 */
> -	if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) {
> +	/* Check input arguments. */
> +	if ((lpm == NULL) || (ips == NULL) || (depths == NULL))
>  		return -EINVAL;
> -	}
> -
> -	/* Copy the IP and mask it to avoid modifying user's input data. */
> -	memcpy(ip_masked, ip, RTE_LPM6_IPV6_ADDR_SIZE);
> -	mask_ip(ip_masked, depth);
> -
> -	/*
> -	 * Find the index of the input rule, that needs to be deleted, in the
> -	 * rule table.
> -	 */
> -	rule_to_delete_index = rule_find(lpm, ip_masked, depth);
> -
> -	/*
> -	 * Check if rule_to_delete_index was found. If no rule was found the
> -	 * function rule_find returns -ENOENT.
> -	 */
> -	if (rule_to_delete_index < 0)
> -		return rule_to_delete_index;
>  
> -	/* Delete the rule from the rule table. */
> -	rule_delete(lpm, rule_to_delete_index);
> +	unsigned i;
> +	for (i = 0; i < n; i++)
> +		rule_delete(lpm, ips[i], depths[i]);
>  
>  	/*
>  	 * Set all the table entries to 0 (ie delete every rule
>  	 * from the data structure.
>  	 */
> -	lpm->next_tbl8 = 0;
>  	memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
>  	memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0])
>  			* RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
> +	tbl8_pool_init(lpm);
>  
>  	/*
> -	 * Add every rule again (except for the one that was removed from
> +	 * Add every rule again (except for the ones that were removed from
>  	 * the rules table).
>  	 */
> -	for (i = 0; i < lpm->used_rules; i++) {
> -		rte_lpm6_add(lpm, lpm->rules_tbl[i].ip, lpm->rules_tbl[i].depth,
> -				lpm->rules_tbl[i].next_hop);
> -	}
> +	recreate_lpm(lpm);
>  
>  	return 0;
>  }
>  
>  /*
> - * Deletes a group of rules
> + * Delete all rules from the LPM table.
>   */
> -int
> -rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm,
> -		uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, unsigned n)
> +void
> +rte_lpm6_delete_all(struct rte_lpm6 *lpm)
>  {
> -	int32_t rule_to_delete_index;
> -	uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE];
> -	unsigned i;
> +	/* Zero used rules counter. */
> +	lpm->used_rules = 0;
>  
> -	/*
> -	 * Check input arguments.
> +	/* Zero tbl24. */
> +	memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
> +
> +	/* Zero tbl8. */
> +	memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) *
> +			RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
> +
> +	/* init pool of free tbl8 indexes */
> +	tbl8_pool_init(lpm);
> +
> +	/* put all rules back to the mempool */
> +	rules_free(lpm);
> +
> +	/* Delete all rules form the rules table. */
> +	rte_hash_reset(lpm->rules_tbl);
> +}
> +
> +/*
> + * Convert a depth to a one byte long mask
> + */
> +static uint8_t __attribute__((pure))
> +depth_to_mask_1b(uint8_t depth)
> +{
> +	/* To calculate a mask start with a 1 on the left hand side and right
> +	 * shift while populating the left hand side with 1's
>  	 */
> -	if ((lpm == NULL) || (ips == NULL) || (depths == NULL)) {
> -		return -EINVAL;
> +	return (signed char)0x80 >> (depth - 1);

I'd make the comment on the function a little clearer e.g. using an
example: "4 => 0xF0", which should remove the need to have the second comment
above the return statement.

An alternative that might be a little clearer for the calculation would be:
"(uint8_t)(~(0xFF >> depth))".

> +}
> +
> +/*
> + * Find a less specific rule
> + */
> +static struct rte_lpm6_rule*
> +rule_find_less_specific(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth)
> +{
> +	if (depth == 1)
> +		return NULL;
> +
> +	struct rte_lpm6_rule *rule;
> +	struct rte_lpm6_rule_key rule_key;
> +	rule_key_init(&rule_key, ip, depth);
> +	uint8_t mask;
> +
> +	while (depth > 1) {
> +		depth--;
> +
> +		/* each iteration zero one more bit of the key */
> +		mask = depth & 7; /* depth % 8 */
> +		if (mask > 0)
> +			mask = depth_to_mask_1b(mask);
> +
> +		rule_key.depth = depth;
> +		rule_key.ip[depth >> 3] &= mask;
> +

It seems strange that when you adjust the depth, you also need to mask out
bits of the key which should be ignored. Can you make the masking part of
the hash calculation, which would simplify the logic here a lot, and if so,
does it affect performance much?

> +		rule = rule_find_with_key(lpm, &rule_key);
> +		if (rule != NULL)
> +			return rule;
>  	}
>  
> -	for (i = 0; i < n; i++) {
> -		/* Copy the IP and mask it to avoid modifying user's input data. */
> -		memcpy(ip_masked, ips[i], RTE_LPM6_IPV6_ADDR_SIZE);
> -		mask_ip(ip_masked, depths[i]);
> +	return NULL;
> +}
>  
> -		/*
> -		 * Find the index of the input rule, that needs to be deleted, in the
> -		 * rule table.
> +/*
> + * Find range of tbl8 cells occupied by a rule
> + */
> +static void
> +rule_find_range(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
> +		  struct rte_lpm6_tbl_entry **from,
> +		  struct rte_lpm6_tbl_entry **to,
> +		  uint32_t *out_tbl_ind)
> +{
> +	uint32_t ind;
> +	uint32_t first_3bytes = (uint32_t)ip[0] << 16 | ip[1] << 8 | ip[2];
> +
> +	if (depth <= 24) {
> +		/* rule is within the top level */
> +		ind = first_3bytes;
> +		*from = &lpm->tbl24[ind];
> +		ind += (1 << (24 - depth)) - 1;
> +		*to = &lpm->tbl24[ind];
> +		*out_tbl_ind = TBL24_IND;
> +	} else {
> +		/* top level entry */
> +		struct rte_lpm6_tbl_entry *tbl = &lpm->tbl24[first_3bytes];
> +		assert(tbl->ext_entry == 1);
> +		/* first tbl8 */
> +		uint32_t tbl_ind = tbl->lpm6_tbl8_gindex;
> +		tbl = &lpm->tbl8[tbl_ind *
> +				RTE_LPM6_TBL8_GROUP_NUM_ENTRIES];
> +		/* current ip byte, the top level is already behind */
> +		uint8_t byte = 3;
> +		/* minus top level */
> +		depth -= 24;
> +
> +		/* interate through levels (tbl8s)
> +		 * until we reach the last one
>  		 */
> -		rule_to_delete_index = rule_find(lpm, ip_masked, depths[i]);
> +		while (depth > 8) {
> +			tbl += ip[byte];
> +			assert(tbl->ext_entry == 1);
> +			/* go to the next level/tbl8 */
> +			tbl_ind = tbl->lpm6_tbl8_gindex;
> +			tbl = &lpm->tbl8[tbl_ind *
> +					RTE_LPM6_TBL8_GROUP_NUM_ENTRIES];
> +			byte += 1;
> +			depth -= 8;
> +		}
>  
> -		/*
> -		 * Check if rule_to_delete_index was found. If no rule was found the
> -		 * function rule_find returns -ENOENT.
> -		 */
> -		if (rule_to_delete_index < 0)
> -			continue;
> +		/* last level/tbl8 */
> +		ind = ip[byte] & depth_to_mask_1b(depth);
> +		*from = &tbl[ind];
> +		ind += (1 << (8 - depth)) - 1;
> +		*to = &tbl[ind];
> +		*out_tbl_ind = tbl_ind;
> +	}
> +}
>  
> -		/* Delete the rule from the rule table. */
> -		rule_delete(lpm, rule_to_delete_index);
> +/*
> + * Remove a table from the LPM tree
> + */
> +static void
> +remove_tbl(struct rte_lpm6 *lpm, struct rte_lpm_tbl8_hdr *tbl_hdr,
> +		  uint32_t tbl_ind, struct rte_lpm6_rule *lsp_rule)
> +{
> +	struct rte_lpm6_tbl_entry *owner_entry;
> +
> +	if (tbl_hdr->owner_tbl_ind == TBL24_IND)
> +		owner_entry = &lpm->tbl24[tbl_hdr->owner_entry_ind];
> +	else {
> +		uint32_t owner_tbl_ind = tbl_hdr->owner_tbl_ind;
> +		owner_entry = &lpm->tbl8[
> +			owner_tbl_ind * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES +
> +			tbl_hdr->owner_entry_ind];
> +
> +		struct rte_lpm_tbl8_hdr *owner_tbl_hdr =
> +			&lpm->tbl8_hdrs[owner_tbl_ind];
> +		if (--owner_tbl_hdr->ref_cnt == 0)
> +			remove_tbl(lpm, owner_tbl_hdr, owner_tbl_ind, lsp_rule);
>  	}
>  
> -	/*
> -	 * Set all the table entries to 0 (ie delete every rule
> -	 * from the data structure.
> -	 */
> -	lpm->next_tbl8 = 0;
> -	memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
> -	memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0])
> -			* RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
> +	assert(owner_entry->ext_entry == 1);
>  
> -	/*
> -	 * Add every rule again (except for the ones that were removed from
> -	 * the rules table).
> -	 */
> -	for (i = 0; i < lpm->used_rules; i++) {
> -		rte_lpm6_add(lpm, lpm->rules_tbl[i].ip, lpm->rules_tbl[i].depth,
> -				lpm->rules_tbl[i].next_hop);
> +	/* unlink the table */
> +	if (lsp_rule != NULL) {
> +		struct rte_lpm6_tbl_entry new_tbl_entry = {
> +			.next_hop = lsp_rule->next_hop,
> +			.depth = lsp_rule->depth,
> +			.valid = VALID,
> +			.valid_group = VALID,
> +			.ext_entry = 0
> +		};
> +
> +		*owner_entry = new_tbl_entry;
> +	} else {
> +		struct rte_lpm6_tbl_entry new_tbl_entry = {
> +			.next_hop = 0,
> +			.depth = 0,
> +			.valid = INVALID,
> +			.valid_group = INVALID,
> +			.ext_entry = 0
> +		};
> +
> +		*owner_entry = new_tbl_entry;
>  	}
>  
> -	return 0;
> +	/* return the table to the pool */
> +	tbl8_put(lpm, tbl_ind);
>  }
>  
>  /*
> - * Delete all rules from the LPM table.
> + * Deletes a rule
>   */
> -void
> -rte_lpm6_delete_all(struct rte_lpm6 *lpm)
> +int
> +rte_lpm6_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth)
>  {
> -	/* Zero used rules counter. */
> -	lpm->used_rules = 0;
> +	/* Check input arguments. */
> +	if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH))
> +		return -EINVAL;
>  
> -	/* Zero next tbl8 index. */
> -	lpm->next_tbl8 = 0;
> +	/* Copy the IP and mask it to avoid modifying user's input data. */
> +	uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
> +	ip6_copy_addr(masked_ip, ip);
> +	ip6_mask_addr(masked_ip, depth);
>  
> -	/* Zero tbl24. */
> -	memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
> +	/* Delete the rule from the rule table. */
> +	int ret = rule_delete(lpm, masked_ip, depth);
> +	if (ret < 0)
> +		return -ENOENT;
>  
> -	/* Zero tbl8. */
> -	memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) *
> -			RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
> +	/* find rule cells */
> +	struct rte_lpm6_tbl_entry *from, *to;
> +	uint32_t tbl_ind;
> +	rule_find_range(lpm, masked_ip, depth, &from, &to, &tbl_ind);
>  
> -	/* Delete all rules form the rules table. */
> -	memset(lpm->rules_tbl, 0, sizeof(struct rte_lpm6_rule) * lpm->max_rules);
> +	/* find a less specific rule (a rule with smaller depth)
> +	 * note: masked_ip will be modified, don't use it anymore
> +	 */
> +	struct rte_lpm6_rule *lsp_rule = rule_find_less_specific(lpm,
> +			masked_ip, depth);
> +
> +	/* decrement the table rule counter,
> +	 * note that tbl24 doesn't have a header
> +	 */
> +	if (tbl_ind != TBL24_IND) {
> +		struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind];
> +		if (--tbl_hdr->ref_cnt == 0) {
> +			/* remove the table */
> +			remove_tbl(lpm, tbl_hdr, tbl_ind, lsp_rule);
> +			return 0;
> +		}
> +	}
> +
> +	/* iterate rule cells */
> +	for (; from <= to; from++)
> +		if (from->ext_entry == 1) {
> +			/* reference to a more specific space
> +			 * of the prefix/rule. Entries in a more
> +			 * specific space that are not used by
> +			 * a more specific prefix must be occupied
> +			 * by the prefix
> +			 */
> +			if (lsp_rule != NULL)
> +				expand_rule(lpm,
> +					from->lpm6_tbl8_gindex *
> +					RTE_LPM6_TBL8_GROUP_NUM_ENTRIES,
> +					depth, lsp_rule->depth,
> +					lsp_rule->next_hop, VALID);
> +			else
> +				/* since the prefix has no less specific prefix,
> +				 * its more specific space must be invalidated
> +				 */
> +				expand_rule(lpm,
> +					from->lpm6_tbl8_gindex *
> +					RTE_LPM6_TBL8_GROUP_NUM_ENTRIES,
> +					depth, 0, 0, INVALID);
> +		} else if (from->depth == depth) {
> +			/* entry is not a reference and belongs to the prefix */
> +			if (lsp_rule != NULL) {
> +				struct rte_lpm6_tbl_entry new_tbl_entry = {
> +					.next_hop = lsp_rule->next_hop,
> +					.depth = lsp_rule->depth,
> +					.valid = VALID,
> +					.valid_group = VALID,
> +					.ext_entry = 0
> +				};
> +
> +				*from = new_tbl_entry;
> +			} else {
> +				struct rte_lpm6_tbl_entry new_tbl_entry = {
> +					.next_hop = 0,
> +					.depth = 0,
> +					.valid = INVALID,
> +					.valid_group = INVALID,
> +					.ext_entry = 0
> +				};
> +
> +				*from = new_tbl_entry;
> +			}
> +		}
> +
> +	return 0;
>  }
> -- 
Rest of the patch looks fine to me, though I can't say I've followed all
the logic paths in full detail.

Main concern I have about the patch is the size. Is there any way this
patch could be split up into a few smaller ones with more gradual changes?

Regards,
/Bruce

  parent reply	other threads:[~2018-07-09 11:24 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <c6068a65-bee2-4f34-944a-6cd46ac6a188@orsmsx104.amr.corp.intel.com>
2018-07-06 10:13 ` Bruce Richardson
2018-07-06 10:25   ` Bruce Richardson
2018-07-06 10:23 ` Bruce Richardson
2018-07-06 10:56 ` Bruce Richardson
2018-07-06 12:00   ` Alex Kiselev
2018-07-06 16:16 ` Bruce Richardson
2018-07-06 16:59   ` Alex Kiselev
2018-07-09  9:07     ` Bruce Richardson
2018-07-09 11:24 ` Bruce Richardson [this message]
2018-07-09 12:33   ` Alex Kiselev
2018-07-09 13:35     ` Bruce Richardson
2018-07-02 16:42 Alex Kiselev

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180709112439.GA18644@bricha3-MOBL.ger.corp.intel.com \
    --to=bruce.richardson@intel.com \
    --cc=alex@therouter.net \
    --cc=dev@dpdk.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).