From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga12.intel.com (mga12.intel.com [192.55.52.136]) by dpdk.org (Postfix) with ESMTP id C7D551BE41 for ; Fri, 6 Jul 2018 18:16:46 +0200 (CEST) X-Amp-Result: UNKNOWN X-Amp-Original-Verdict: FILE UNKNOWN X-Amp-File-Uploaded: False Received: from fmsmga001.fm.intel.com ([10.253.24.23]) by fmsmga106.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 06 Jul 2018 09:16:44 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.51,316,1526367600"; d="scan'208";a="70195251" Received: from bricha3-mobl.ger.corp.intel.com ([10.237.221.107]) by fmsmga001.fm.intel.com with SMTP; 06 Jul 2018 09:16:42 -0700 Received: by (sSMTP sendmail emulation); Fri, 06 Jul 2018 17:16:41 +0100 Date: Fri, 6 Jul 2018 17:16:40 +0100 From: Bruce Richardson To: Alex Kiselev Cc: "dev@dpdk.org" Message-ID: <20180706161640.GA24764@bricha3-MOBL.ger.corp.intel.com> References: MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: Organization: Intel Research and Development Ireland Ltd. User-Agent: Mutt/1.10.0 (2018-05-17) Subject: Re: [dpdk-dev] [PATCH v2] librte_lpm: Improve performance of the delete and add functions X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 06 Jul 2018 16:16:47 -0000 On Mon, Jul 02, 2018 at 07:42:11PM +0300, Alex Kiselev wrote: > There are two major problems with the library: > first, there is no need to rebuild the whole LPM tree > when a rule is deleted and second, due to the current > rules algorithm with complexity O(n) it's almost > impossible to deal with large rule sets (50k or so rules). > This patch addresses those two issues. > > Signed-off-by: Alex Kiselev Hi, Some initial review comments inline below /Bruce > --- > lib/librte_lpm/rte_lpm6.c | 1073 ++++++++++++++++++++++++++++++++++----------- > 1 file changed, 816 insertions(+), 257 deletions(-) > > diff --git a/lib/librte_lpm/rte_lpm6.c b/lib/librte_lpm/rte_lpm6.c > index 149677eb1..438db0831 100644 > --- a/lib/librte_lpm/rte_lpm6.c > +++ b/lib/librte_lpm/rte_lpm6.c > @@ -21,6 +21,10 @@ > #include > #include > #include > +#include > +#include > +#include > +#include > > #include "rte_lpm6.h" > > @@ -37,6 +41,9 @@ > #define BYTE_SIZE 8 > #define BYTES2_SIZE 16 > > +#define RULE_HASH_TABLE_EXTRA_SPACE 256 > +#define TBL24_IND UINT32_MAX > + > #define lpm6_tbl8_gindex next_hop > > /** Flags for setting an entry as valid/invalid. */ > @@ -70,6 +77,23 @@ struct rte_lpm6_rule { > uint8_t depth; /**< Rule depth. */ > }; > > +/** Rules tbl entry key. */ > +struct rte_lpm6_rule_key { > + uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */ > + uint8_t depth; /**< Rule depth. */ > +}; > + > +/* Header of tbl8 */ > +struct rte_lpm_tbl8_hdr { > + uint32_t owner_tbl_ind; /**< owner table: TBL24_IND if owner is tbl24, > + * otherwise index of tbl8 > + */ > + uint32_t owner_entry_ind; /**< index of the owner table entry where > + * pointer to the tbl8 is stored > + */ > + uint32_t ref_cnt; /**< table reference counter */ > +}; > + > /** LPM6 structure. */ > struct rte_lpm6 { > /* LPM metadata. */ > @@ -77,12 +101,18 @@ struct rte_lpm6 { > uint32_t max_rules; /**< Max number of rules. */ > uint32_t used_rules; /**< Used rules so far. */ > uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ > - uint32_t next_tbl8; /**< Next tbl8 to be used. */ > > /* LPM Tables. 
*/ > - struct rte_lpm6_rule *rules_tbl; /**< LPM rules. */ > + struct rte_mempool *rules_pool; /**< LPM rules mempool. */ > + struct rte_hash *rules_tbl; /**< LPM rules. */ > struct rte_lpm6_tbl_entry tbl24[RTE_LPM6_TBL24_NUM_ENTRIES] > __rte_cache_aligned; /**< LPM tbl24 table. */ > + > + uint32_t *tbl8_pool; /**< pool of indexes of free tbl8s */ > + uint32_t tbl8_pool_pos; /**< current position in the tbl8 pool */ > + > + struct rte_lpm_tbl8_hdr *tbl8_hdrs; /* array of tbl8 headers */ > + > struct rte_lpm6_tbl_entry tbl8[0] > __rte_cache_aligned; /**< LPM tbl8 table. */ > }; > @@ -93,22 +123,130 @@ struct rte_lpm6 { > * and set the rest to 0. > */ > static inline void > -mask_ip(uint8_t *ip, uint8_t depth) > +ip6_mask_addr(uint8_t *ip, uint8_t depth) > { > - int16_t part_depth, mask; > - int i; > + int16_t part_depth, mask; > + int i; > > - part_depth = depth; > + part_depth = depth; > > - for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) { > - if (part_depth < BYTE_SIZE && part_depth >= 0) { > - mask = (uint16_t)(~(UINT8_MAX >> part_depth)); > - ip[i] = (uint8_t)(ip[i] & mask); > - } else if (part_depth < 0) { > - ip[i] = 0; > - } > - part_depth -= BYTE_SIZE; > - } > + for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) { > + if (part_depth < BYTE_SIZE && part_depth >= 0) { > + mask = (uint16_t)(~(UINT8_MAX >> part_depth)); > + ip[i] = (uint8_t)(ip[i] & mask); > + } else if (part_depth < 0) > + ip[i] = 0; > + > + part_depth -= BYTE_SIZE; > + } > +} This block is just a whitespace cleanup, as far as I can see. If there are other little cleanups like that as part of this set, it would be nice to have them as a separate initial patch, to make it easier to review the more complicated changes. > + > +/* copy ipv6 address */ > +static inline void > +ip6_copy_addr(uint8_t *dst, const uint8_t *src) > +{ > + rte_memcpy(dst, src, RTE_LPM6_IPV6_ADDR_SIZE); > +} > + > +/* > + * LPM6 rule hash function > + */ > +static inline uint32_t > +rule_hash_crc(const void *data, __rte_unused uint32_t data_len, > + uint32_t init_val) > +{ > + return rte_hash_crc(data, sizeof(struct rte_lpm6_rule_key), init_val); > +} Why bother passing in the length and making the data a void pointer. Why not just have the prototype: rule_hash_crc(struct rte_lpm6_rule_key *data, uint32_t init_val) > + > +/* > + * Init pool of free tbl8 indexes > + */ > +static void > +tbl8_pool_init(struct rte_lpm6 *lpm) > +{ > + /* put entire range of indexes to the tbl8 pool */ > + uint32_t i; > + for (i = 0; i < lpm->number_tbl8s; i++) > + lpm->tbl8_pool[i] = i; > + > + lpm->tbl8_pool_pos = 0; > +} > + > +/* > + * Get an index of a free tbl8 from the pool > + */ > +static inline uint32_t > +tbl8_get(struct rte_lpm6 *lpm, uint32_t *tbl8_ind) > +{ > + if (lpm->tbl8_pool_pos == lpm->number_tbl8s) > + /* no more free tbl8 */ > + return -ENOSPC; > + > + /* next index */ > + *tbl8_ind = lpm->tbl8_pool[lpm->tbl8_pool_pos++]; > + return 0; > +} > + > +/* > + * Put an index of a free tbl8 back to the pool > + */ > +static inline uint32_t > +tbl8_put(struct rte_lpm6 *lpm, uint32_t tbl8_ind) > +{ > + if (lpm->tbl8_pool_pos == 0) > + /* pool is full */ > + return -ENOSPC; > + > + lpm->tbl8_pool[--lpm->tbl8_pool_pos] = tbl8_ind; > + return 0; > +} > + > +/* > + * Returns number of tbl8s awailable in the pool Typo: available > + */ > +static inline uint32_t > +tbl8_available(struct rte_lpm6 *lpm) > +{ > + return lpm->number_tbl8s - lpm->tbl8_pool_pos; > +} > + > +/* > + * Init a rule key. 
> + * note that ip must be already masked > + */ > +static inline void > +rule_key_init(struct rte_lpm6_rule_key *key, uint8_t *ip, uint8_t depth) > +{ > + ip6_copy_addr(key->ip, ip); > + key->depth = depth; > +} > + > +/* > + * Recreate the entire LPM tree by reinserting all rules > + */ > +static void > +recreate_lpm(struct rte_lpm6 *lpm) > +{ > + struct rte_lpm6_rule *rule; > + struct rte_lpm6_rule *rule_key; > + uint32_t iter = 0; > + while (rte_hash_iterate(lpm->rules_tbl, (const void **) &rule_key, > + (void **) &rule, &iter) >= 0) > + rte_lpm6_add(lpm, rule->ip, rule->depth, rule->next_hop); > +} > + > +/* > + * Free all rules > + */ > +static void > +rules_free(struct rte_lpm6 *lpm) > +{ > + struct rte_lpm6_rule *rule; > + struct rte_lpm6_rule *rule_key; > + uint32_t iter = 0; > + while (rte_hash_iterate(lpm->rules_tbl, (const void **) &rule_key, > + (void **) &rule, &iter) >= 0) > + rte_mempool_put(lpm->rules_pool, rule); > } > > /* > @@ -121,7 +259,7 @@ rte_lpm6_create(const char *name, int socket_id, > char mem_name[RTE_LPM6_NAMESIZE]; > struct rte_lpm6 *lpm = NULL; > struct rte_tailq_entry *te; > - uint64_t mem_size, rules_size; > + uint64_t mem_size; > struct rte_lpm6_list *lpm_list; > > lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); > @@ -136,12 +274,72 @@ rte_lpm6_create(const char *name, int socket_id, > return NULL; > } > > + struct rte_mempool *rules_mempool = NULL; > + struct rte_hash *rules_tbl = NULL; > + uint32_t *tbl8_pool = NULL; > + struct rte_lpm_tbl8_hdr *tbl8_hdrs = NULL; > + > + /* allocate rules mempool */ > + snprintf(mem_name, sizeof(mem_name), "LRM_%s", name); LRM == "LPM Rules Mempool" I assume? [Not that it actually matters] > + rules_mempool = rte_mempool_create(mem_name, > + config->max_rules, sizeof(struct rte_lpm6_rule), 0, 0, > + NULL, NULL, NULL, NULL, socket_id, > + MEMPOOL_F_NO_CACHE_ALIGN); > + if (rules_mempool == NULL) { > + RTE_LOG(ERR, LPM, "LPM rules mempool allocation failed: %s (%d)", > + rte_strerror(rte_errno), rte_errno); > + rte_errno = ENOMEM; > + goto fail_wo_unlock; > + } > + > + /* create rules hash table */ > + snprintf(mem_name, sizeof(mem_name), "LRH_%s", name); > + > + struct rte_hash_parameters rule_hash_tbl_params = { > + .entries = config->max_rules + RULE_HASH_TABLE_EXTRA_SPACE, Are 256 extra slots going to be enough here? Would it be safer to multiply max_rules by e.g. 1.2 or similar? > + .key_len = sizeof(struct rte_lpm6_rule_key), > + .hash_func = rule_hash_crc, Given that this is not for data plane use, it might be better to use jhash as the hash function. It's slower, but it does give a better rule distribution so allows more compact tables. 
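Something like the below is roughly what I have in mind - completely untested, and the extra ~20% headroom is just an arbitrary example figure. rte_jhash() already matches the rte_hash_function signature, so it can be dropped in directly (rte_jhash.h needed):

	struct rte_hash_parameters rule_hash_tbl_params = {
		/* ~20% headroom rather than a fixed 256 extra slots */
		.entries = config->max_rules + config->max_rules / 5,
		.key_len = sizeof(struct rte_lpm6_rule_key),
		.hash_func = rte_jhash,
		.hash_func_init_val = 0,
		.name = mem_name,
		.reserved = 0,
		.socket_id = socket_id,
		.extra_flag = 0
	};
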
> + .hash_func_init_val = 0, > + .name = mem_name, > + .reserved = 0, > + .socket_id = socket_id, > + .extra_flag = 0 > + }; > + > + rules_tbl = rte_hash_create(&rule_hash_tbl_params); > + if (rules_tbl == NULL) { > + RTE_LOG(ERR, LPM, "LPM rules hash table allocation failed: %s (%d)", > + rte_strerror(rte_errno), rte_errno); > + goto fail_wo_unlock; > + } > + > + /* allocate tbl8 indexes pool */ > + tbl8_pool = rte_malloc(NULL, > + sizeof(uint32_t) * config->number_tbl8s, > + RTE_CACHE_LINE_SIZE); > + if (tbl8_pool == NULL) { > + RTE_LOG(ERR, LPM, "LPM tbl8 pool allocation failed: %s (%d)", > + rte_strerror(rte_errno), rte_errno); > + rte_errno = ENOMEM; > + goto fail_wo_unlock; > + } > + > + /* allocate tbl8 headers */ > + tbl8_hdrs = rte_malloc(NULL, > + sizeof(struct rte_lpm_tbl8_hdr) * config->number_tbl8s, > + RTE_CACHE_LINE_SIZE); > + if (tbl8_hdrs == NULL) { > + RTE_LOG(ERR, LPM, "LPM tbl8 headers allocation failed: %s (%d)", > + rte_strerror(rte_errno), rte_errno); > + rte_errno = ENOMEM; > + goto fail_wo_unlock; > + } > + > snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); > > /* Determine the amount of memory to allocate. */ > mem_size = sizeof(*lpm) + (sizeof(lpm->tbl8[0]) * > RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); > - rules_size = sizeof(struct rte_lpm6_rule) * config->max_rules; > > rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); > > @@ -154,7 +352,7 @@ rte_lpm6_create(const char *name, int socket_id, > lpm = NULL; > if (te != NULL) { > rte_errno = EEXIST; > - goto exit; > + goto fail; > } > > /* allocate tailq entry */ > @@ -162,30 +360,18 @@ rte_lpm6_create(const char *name, int socket_id, > if (te == NULL) { > RTE_LOG(ERR, LPM, "Failed to allocate tailq entry!\n"); > rte_errno = ENOMEM; > - goto exit; > + goto fail; > } > > /* Allocate memory to store the LPM data structures. */ > - lpm = rte_zmalloc_socket(mem_name, (size_t)mem_size, > + lpm = (struct rte_lpm6 *)rte_zmalloc_socket(mem_name, (size_t)mem_size, > RTE_CACHE_LINE_SIZE, socket_id); > > if (lpm == NULL) { > RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); > rte_free(te); > rte_errno = ENOMEM; > - goto exit; > - } > - > - lpm->rules_tbl = rte_zmalloc_socket(NULL, > - (size_t)rules_size, RTE_CACHE_LINE_SIZE, socket_id); > - > - if (lpm->rules_tbl == NULL) { > - RTE_LOG(ERR, LPM, "LPM rules_tbl allocation failed\n"); > - rte_free(lpm); > - lpm = NULL; > - rte_free(te); > - rte_errno = ENOMEM; > - goto exit; > + goto fail; > } > > /* Save user arguments. */ > @@ -193,14 +379,37 @@ rte_lpm6_create(const char *name, int socket_id, > lpm->number_tbl8s = config->number_tbl8s; > snprintf(lpm->name, sizeof(lpm->name), "%s", name); > > + lpm->rules_tbl = rules_tbl; > + lpm->tbl8_pool = tbl8_pool; > + lpm->tbl8_hdrs = tbl8_hdrs; > + lpm->rules_pool = rules_mempool; > + > + /* init the stack */ > + tbl8_pool_init(lpm); > + > te->data = (void *) lpm; > > TAILQ_INSERT_TAIL(lpm_list, te, next); > + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); > + return lpm; > > -exit: > +fail: > rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); > > - return lpm; > +fail_wo_unlock: > + if (rules_mempool != NULL) > + rte_mempool_free(rules_mempool); > + > + if (tbl8_hdrs != NULL) > + rte_free(tbl8_hdrs); > + > + if (tbl8_pool != NULL) > + rte_free(tbl8_pool); > + > + if (rules_tbl != NULL) > + rte_hash_free(rules_tbl); > + rte_free doesn't have an issue with taking NULL as parameter, so you can omit the various NULL checks and just have the series of free calls. 
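For example, the whole error path could then collapse to just the following (untested - I believe rte_mempool_free() and rte_hash_free() are likewise no-ops on NULL, but worth double-checking):

fail_wo_unlock:
	/* each of these safely ignores a NULL argument */
	rte_mempool_free(rules_mempool);
	rte_free(tbl8_hdrs);
	rte_free(tbl8_pool);
	rte_hash_free(rules_tbl);

	return NULL;
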
> + return NULL; > } > > /* > @@ -259,50 +468,93 @@ rte_lpm6_free(struct rte_lpm6 *lpm) > > rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); > > - rte_free(lpm->rules_tbl); > + rte_mempool_free(lpm->rules_pool); > + rte_free(lpm->tbl8_hdrs); > + rte_free(lpm->tbl8_pool); > + rte_hash_free(lpm->rules_tbl); > rte_free(lpm); > rte_free(te); > } > > +/* Find a rule */ > +static inline struct rte_lpm6_rule* > +rule_find_with_key(struct rte_lpm6 *lpm, > + const struct rte_lpm6_rule_key *rule_key) > +{ > + /* look for a rule */ > + struct rte_lpm6_rule *rule; > + int ret = rte_hash_lookup_data(lpm->rules_tbl, > + (const void *) rule_key, (void **) &rule); > + return (ret >= 0) ? rule : NULL; > +} > + > +/* Find a rule */ > +static struct rte_lpm6_rule* > +rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) > +{ > + /* init a rule key */ > + struct rte_lpm6_rule_key rule_key; > + rule_key_init(&rule_key, ip, depth); > + > + return rule_find_with_key(lpm, &rule_key); > +} > + > /* > * Checks if a rule already exists in the rules table and updates > * the nexthop if so. Otherwise it adds a new rule if enough space is available. > + * > + * Returns: > + * 0 - next hop of existed rule is updated > + * 1 - new rule successfuly added > + * <0 - error > */ > -static inline int32_t > -rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint32_t next_hop, uint8_t depth) > +static inline int > +rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, uint32_t next_hop) > { > - uint32_t rule_index; > + /* init a rule key */ > + struct rte_lpm6_rule_key rule_key; > + rule_key_init(&rule_key, ip, depth); > > /* Scan through rule list to see if rule already exists. */ > - for (rule_index = 0; rule_index < lpm->used_rules; rule_index++) { > - > - /* If rule already exists update its next_hop and return. */ > - if ((memcmp (lpm->rules_tbl[rule_index].ip, ip, > - RTE_LPM6_IPV6_ADDR_SIZE) == 0) && > - lpm->rules_tbl[rule_index].depth == depth) { > - lpm->rules_tbl[rule_index].next_hop = next_hop; > + struct rte_lpm6_rule *rule = rule_find_with_key(lpm, &rule_key); > > - return rule_index; > - } > + /* If rule already exists update its next_hop and return. */ > + if (rule != NULL) { > + rule->next_hop = next_hop; > + return 0; > } > > /* > * If rule does not exist check if there is space to add a new rule to > * this rule group. If there is no space return error. > */ > - if (lpm->used_rules == lpm->max_rules) { > + if (lpm->used_rules == lpm->max_rules) > return -ENOSPC; > - } > > - /* If there is space for the new rule add it. */ > - rte_memcpy(lpm->rules_tbl[rule_index].ip, ip, RTE_LPM6_IPV6_ADDR_SIZE); > - lpm->rules_tbl[rule_index].next_hop = next_hop; > - lpm->rules_tbl[rule_index].depth = depth; > + /* > + * If there is space for the new rule add it. > + */ > + > + /* get a new rule */ > + int ret = rte_mempool_get(lpm->rules_pool, (void **) &rule); > + if (ret < 0) > + return ret; > + > + /* init the rule */ > + rule->depth = depth; > + ip6_copy_addr(rule->ip, ip); > + rule->next_hop = next_hop; > + > + /* add the rule */ > + ret = rte_hash_add_key_data(lpm->rules_tbl, &rule_key, rule); > + if (ret < 0) { > + rte_mempool_put(lpm->rules_pool, rule); > + return ret; > + } > > /* Increment the used rules counter for this rule group. */ > lpm->used_rules++; > - > - return rule_index; > + return 1; > } > > /* > @@ -311,24 +563,24 @@ rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint32_t next_hop, uint8_t depth) > * in the IP address returns a match. 
> */ > static void > -expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth, > - uint32_t next_hop) > +expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t old_depth, > + uint8_t new_depth, uint32_t next_hop, uint8_t valid) Is this change to have separate old depths and new depths a bug fix for an existing issue, or just a change needed due to the rework below? > { > uint32_t tbl8_group_end, tbl8_gindex_next, j; > > tbl8_group_end = tbl8_gindex + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; > > struct rte_lpm6_tbl_entry new_tbl8_entry = { > - .valid = VALID, > - .valid_group = VALID, > - .depth = depth, > + .valid = valid, > + .valid_group = valid, > + .depth = new_depth, > .next_hop = next_hop, > .ext_entry = 0, > }; > > for (j = tbl8_gindex; j < tbl8_group_end; j++) { > if (!lpm->tbl8[j].valid || (lpm->tbl8[j].ext_entry == 0 > - && lpm->tbl8[j].depth <= depth)) { > + && lpm->tbl8[j].depth <= old_depth)) { > > lpm->tbl8[j] = new_tbl8_entry; > > @@ -336,11 +588,101 @@ expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth, > > tbl8_gindex_next = lpm->tbl8[j].lpm6_tbl8_gindex > * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; > - expand_rule(lpm, tbl8_gindex_next, depth, next_hop); > + expand_rule(lpm, tbl8_gindex_next, old_depth, new_depth, > + next_hop, valid); > } > } > } > > +/* > + * Init a tbl8 header > + */ > +static inline void > +init_tbl8_header(struct rte_lpm6 *lpm, uint32_t tbl_ind, > + uint32_t owner_tbl_ind, uint32_t owner_entry_ind) > +{ > + struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind]; > + tbl_hdr->owner_tbl_ind = owner_tbl_ind; > + tbl_hdr->owner_entry_ind = owner_entry_ind; > + tbl_hdr->ref_cnt = 0; > +} > + > +/* > + * Calculate index to the table based on the number and position > + * of the bytes being inspected in this step. > + */ > +static uint32_t > +get_bitshift(const uint8_t *ip, uint8_t first_byte, uint8_t bytes) > +{ > + uint32_t entry_ind, i; > + int8_t bitshift; > + > + entry_ind = 0; > + for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) { > + bitshift = (int8_t)((bytes - i)*BYTE_SIZE); > + > + if (bitshift < 0) > + bitshift = 0; > + entry_ind = entry_ind | ip[i-1] << bitshift; > + } > + > + return entry_ind; > +} > + > +/* > + * Simulate adding a new route to the LPM counting number > + * of new tables that will be needed > + * > + * It returns 0 on success, or 1 if > + * the process needs to be continued by calling the function again. > + */ > +static inline int > +simulate_add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, > + struct rte_lpm6_tbl_entry **next_tbl, const uint8_t *ip, > + uint8_t bytes, uint8_t first_byte, uint8_t depth, > + uint32_t *need_tbl_nb) > +{ > + uint32_t entry_ind; > + uint8_t bits_covered; > + uint32_t next_tbl_ind; > + > + /* > + * Calculate index to the table based on the number and position > + * of the bytes being inspected in this step. > + */ > + entry_ind = get_bitshift(ip, first_byte, bytes); > + > + /* Number of bits covered in this step */ > + bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); > + > + if (depth <= bits_covered) { > + *need_tbl_nb = 0; > + return 0; > + } > + > + if (tbl[entry_ind].valid == 0 || tbl[entry_ind].ext_entry == 0) { > + /* from this point on a new table is needed on each level > + * that is not covered yet > + */ > + depth -= bits_covered; > + uint32_t cnt = depth >> 3; /* depth / 3 */ depth / 8, not / 3. Perhaps "/ BYTE_SIZE" might be best. 
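i.e. something like:

	uint32_t cnt = depth >> 3; /* depth / BYTE_SIZE */
	if (depth & 7) /* depth % BYTE_SIZE != 0 */
		cnt++;

or, untested, the two statements could even become a single ceiling division: cnt = (depth + BYTE_SIZE - 1) / BYTE_SIZE;
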
> + if (depth & 7) /* 0b00000111 */ > + /* if depth % 8 > 0 then one more table is needed > + * for those last bits > + */ > + cnt++; > + > + *need_tbl_nb = cnt; > + return 0; > + } > + > + next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; > + *next_tbl = &(lpm->tbl8[next_tbl_ind * > + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); > + *need_tbl_nb = 0; > + return 1; > +} > + > /* > * Partially adds a new route to the data structure (tbl24+tbl8s). > * It returns 0 on success, a negative number on failure, or 1 if > @@ -348,25 +690,21 @@ expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth, > */ > static inline int > add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, > - struct rte_lpm6_tbl_entry **tbl_next, uint8_t *ip, uint8_t bytes, > - uint8_t first_byte, uint8_t depth, uint32_t next_hop) > + uint32_t tbl_ind, struct rte_lpm6_tbl_entry **next_tbl, > + uint32_t *next_tbl_ind, uint8_t *ip, uint8_t bytes, > + uint8_t first_byte, uint8_t depth, uint32_t next_hop, > + uint8_t is_new_rule) > { > - uint32_t tbl_index, tbl_range, tbl8_group_start, tbl8_group_end, i; > - int32_t tbl8_gindex; > - int8_t bitshift; > + uint32_t entry_ind, tbl_range, tbl8_group_start, tbl8_group_end, i; > + uint32_t tbl8_gindex; > uint8_t bits_covered; > + int ret; > > /* > * Calculate index to the table based on the number and position > * of the bytes being inspected in this step. > */ > - tbl_index = 0; > - for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) { > - bitshift = (int8_t)((bytes - i)*BYTE_SIZE); > - > - if (bitshift < 0) bitshift = 0; > - tbl_index = tbl_index | ip[i-1] << bitshift; > - } > + entry_ind = get_bitshift(ip, first_byte, bytes); > > /* Number of bits covered in this step */ > bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); > @@ -378,7 +716,7 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, > if (depth <= bits_covered) { > tbl_range = 1 << (bits_covered - depth); > > - for (i = tbl_index; i < (tbl_index + tbl_range); i++) { > + for (i = entry_ind; i < (entry_ind + tbl_range); i++) { > if (!tbl[i].valid || (tbl[i].ext_entry == 0 && > tbl[i].depth <= depth)) { > > @@ -400,10 +738,15 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, > */ > tbl8_gindex = tbl[i].lpm6_tbl8_gindex * > RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; > - expand_rule(lpm, tbl8_gindex, depth, next_hop); > + expand_rule(lpm, tbl8_gindex, depth, depth, > + next_hop, VALID); > } > } > > + /* update tbl8 rule reference counter */ > + if (tbl_ind != TBL24_IND && is_new_rule) > + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; > + > return 0; > } > /* > @@ -412,12 +755,24 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, > */ > else { > /* If it's invalid a new tbl8 is needed */ > - if (!tbl[tbl_index].valid) { > - if (lpm->next_tbl8 < lpm->number_tbl8s) > - tbl8_gindex = (lpm->next_tbl8)++; > - else > + if (!tbl[entry_ind].valid) { > + /* get a new table */ > + ret = tbl8_get(lpm, &tbl8_gindex); > + if (ret != 0) > return -ENOSPC; > > + /* invalidate all new tbl8 entries */ > + tbl8_group_start = tbl8_gindex * > + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; > + memset(&lpm->tbl8[tbl8_group_start], 0, > + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES); > + > + /* init the new table's header: > + * save the reference to the owner table > + */ > + init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind); > + > + /* reference to a new tbl8 */ > struct rte_lpm6_tbl_entry new_tbl_entry = { > .lpm6_tbl8_gindex = tbl8_gindex, > .depth = 0, > @@ -426,17 +781,20 @@ add_step(struct rte_lpm6 *lpm, struct 
rte_lpm6_tbl_entry *tbl, > .ext_entry = 1, > }; > > - tbl[tbl_index] = new_tbl_entry; > + tbl[entry_ind] = new_tbl_entry; > + > + /* update the current table's reference counter */ > + if (tbl_ind != TBL24_IND) > + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; > } > /* > - * If it's valid but not extended the rule that was stored * > + * If it's valid but not extended the rule that was stored > * here needs to be moved to the next table. > */ > - else if (tbl[tbl_index].ext_entry == 0) { > - /* Search for free tbl8 group. */ > - if (lpm->next_tbl8 < lpm->number_tbl8s) > - tbl8_gindex = (lpm->next_tbl8)++; > - else > + else if (tbl[entry_ind].ext_entry == 0) { > + /* get a new tbl8 index */ > + ret = tbl8_get(lpm, &tbl8_gindex); > + if (ret != 0) > return -ENOSPC; > > tbl8_group_start = tbl8_gindex * > @@ -444,13 +802,22 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, > tbl8_group_end = tbl8_group_start + > RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; > > + struct rte_lpm6_tbl_entry tbl_entry = { > + .next_hop = tbl[entry_ind].next_hop, > + .depth = tbl[entry_ind].depth, > + .valid = VALID, > + .valid_group = VALID, > + .ext_entry = 0 > + }; > + > /* Populate new tbl8 with tbl value. */ > - for (i = tbl8_group_start; i < tbl8_group_end; i++) { > - lpm->tbl8[i].valid = VALID; > - lpm->tbl8[i].depth = tbl[tbl_index].depth; > - lpm->tbl8[i].next_hop = tbl[tbl_index].next_hop; > - lpm->tbl8[i].ext_entry = 0; > - } > + for (i = tbl8_group_start; i < tbl8_group_end; i++) > + lpm->tbl8[i] = tbl_entry; > + > + /* init the new table's header: > + * save the reference to the owner table > + */ > + init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind); > > /* > * Update tbl entry to point to new tbl8 entry. Note: The > @@ -465,11 +832,16 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, > .ext_entry = 1, > }; > > - tbl[tbl_index] = new_tbl_entry; > + tbl[entry_ind] = new_tbl_entry; > + > + /* update the current table's reference counter */ > + if (tbl_ind != TBL24_IND) > + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; > } > > - *tbl_next = &(lpm->tbl8[tbl[tbl_index].lpm6_tbl8_gindex * > - RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); > + *next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; > + *next_tbl = &(lpm->tbl8[*next_tbl_ind * > + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); > } > > return 1; > @@ -486,13 +858,55 @@ rte_lpm6_add_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, > } > VERSION_SYMBOL(rte_lpm6_add, _v20, 2.0); > > + > +/* > + * Simulate adding a route to LPM > + * > + * Returns: > + * 0 if success > + * -ENOSPC not enought tbl8 left > + */ > +static int > +simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth) > +{ > + struct rte_lpm6_tbl_entry *tbl; > + struct rte_lpm6_tbl_entry *tbl_next = NULL; > + int ret, i; > + > + /* number of new tables needed for a step */ > + uint32_t need_tbl_nb; > + /* total number of new tables needed */ > + uint32_t total_need_tbl_nb; > + > + /* Inspect the first three bytes through tbl24 on the first step. */ > + ret = simulate_add_step(lpm, lpm->tbl24, &tbl_next, masked_ip, > + ADD_FIRST_BYTE, 1, depth, &need_tbl_nb); > + total_need_tbl_nb = need_tbl_nb; > + /* > + * Inspect one by one the rest of the bytes until > + * the process is completed. 
> + */ > + for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && ret == 1; i++) { > + tbl = tbl_next; > + ret = simulate_add_step(lpm, tbl, &tbl_next, masked_ip, 1, > + (uint8_t)(i+1), depth, &need_tbl_nb); > + total_need_tbl_nb += need_tbl_nb; > + } > + > + if (tbl8_available(lpm) < total_need_tbl_nb) > + /* not enought tbl8 to add a rule */ > + return -ENOSPC; > + > + return 0; > +} > + > int > rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, > uint32_t next_hop) > { > struct rte_lpm6_tbl_entry *tbl; > struct rte_lpm6_tbl_entry *tbl_next = NULL; > - int32_t rule_index; > + uint32_t tbl_next_num; > int status; > uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; > int i; > @@ -502,26 +916,26 @@ rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, > return -EINVAL; > > /* Copy the IP and mask it to avoid modifying user's input data. */ > - memcpy(masked_ip, ip, RTE_LPM6_IPV6_ADDR_SIZE); > - mask_ip(masked_ip, depth); > + ip6_copy_addr(masked_ip, ip); > + ip6_mask_addr(masked_ip, depth); > > /* Add the rule to the rule table. */ > - rule_index = rule_add(lpm, masked_ip, next_hop, depth); > - > + int is_new_rule = rule_add(lpm, masked_ip, depth, next_hop); > /* If there is no space available for new rule return error. */ > - if (rule_index < 0) { > - return rule_index; > - } > + if (is_new_rule < 0) > + return is_new_rule; > + > + /* Simulate adding a new route */ > + int ret = simulate_add(lpm, masked_ip, depth); > + if (ret < 0) > + return ret; Do we need any rollback here for the rule that was just added? > > /* Inspect the first three bytes through tbl24 on the first step. */ > tbl = lpm->tbl24; > - status = add_step (lpm, tbl, &tbl_next, masked_ip, ADD_FIRST_BYTE, 1, > - depth, next_hop); > - if (status < 0) { > - rte_lpm6_delete(lpm, masked_ip, depth); > - > - return status; > - } > + status = add_step(lpm, tbl, TBL24_IND, &tbl_next, &tbl_next_num, > + masked_ip, ADD_FIRST_BYTE, 1, depth, next_hop, > + is_new_rule); > + assert(status >= 0); > > /* > * Inspect one by one the rest of the bytes until > @@ -529,13 +943,10 @@ rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, > */ > for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && status == 1; i++) { > tbl = tbl_next; > - status = add_step (lpm, tbl, &tbl_next, masked_ip, 1, (uint8_t)(i+1), > - depth, next_hop); > - if (status < 0) { > - rte_lpm6_delete(lpm, masked_ip, depth); > - > - return status; > - } > + status = add_step(lpm, tbl, tbl_next_num, &tbl_next, > + &tbl_next_num, masked_ip, 1, (uint8_t)(i+1), > + depth, next_hop, is_new_rule); > + assert(status >= 0); > } > > return status; > @@ -610,9 +1021,8 @@ rte_lpm6_lookup_v1705(const struct rte_lpm6 *lpm, uint8_t *ip, > uint32_t tbl24_index; > > /* DEBUG: Check user input arguments. */ > - if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL)) { > + if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL)) > return -EINVAL; > - } > > first_byte = LOOKUP_FIRST_BYTE; > tbl24_index = (ip[0] << BYTES2_SIZE) | (ip[1] << BYTE_SIZE) | ip[2]; > @@ -648,9 +1058,8 @@ rte_lpm6_lookup_bulk_func_v20(const struct rte_lpm6 *lpm, > int status; > > /* DEBUG: Check user input arguments. 
*/ > - if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) { > + if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) > return -EINVAL; > - } > > for (i = 0; i < n; i++) { > first_byte = LOOKUP_FIRST_BYTE; > @@ -724,30 +1133,6 @@ MAP_STATIC_SYMBOL(int rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm, > int32_t *next_hops, unsigned int n), > rte_lpm6_lookup_bulk_func_v1705); > > -/* > - * Finds a rule in rule table. > - * NOTE: Valid range for depth parameter is 1 .. 128 inclusive. > - */ > -static inline int32_t > -rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) > -{ > - uint32_t rule_index; > - > - /* Scan used rules at given depth to find rule. */ > - for (rule_index = 0; rule_index < lpm->used_rules; rule_index++) { > - /* If rule is found return the rule index. */ > - if ((memcmp (lpm->rules_tbl[rule_index].ip, ip, > - RTE_LPM6_IPV6_ADDR_SIZE) == 0) && > - lpm->rules_tbl[rule_index].depth == depth) { > - > - return rule_index; > - } > - } > - > - /* If rule is not found return -ENOENT. */ > - return -ENOENT; > -} > - > /* > * Look for a rule in the high-level rules table > */ > @@ -775,23 +1160,20 @@ int > rte_lpm6_is_rule_present_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, > uint32_t *next_hop) > { > - uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE]; > - int32_t rule_index; > - > /* Check user arguments. */ > if ((lpm == NULL) || next_hop == NULL || ip == NULL || > (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) > return -EINVAL; > > /* Copy the IP and mask it to avoid modifying user's input data. */ > - memcpy(ip_masked, ip, RTE_LPM6_IPV6_ADDR_SIZE); > - mask_ip(ip_masked, depth); > + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; The line above doesn't strictly need to be moved. It's still recommended to have variables at the top of the block. > + ip6_copy_addr(masked_ip, ip); > + ip6_mask_addr(masked_ip, depth); > > - /* Look for the rule using rule_find. */ > - rule_index = rule_find(lpm, ip_masked, depth); > + struct rte_lpm6_rule *rule = rule_find(lpm, masked_ip, depth); > Can mark this as const, which can help justify having it declared mid-block. :-)
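i.e. something like the following, assuming the rule is only read after the lookup:

	const struct rte_lpm6_rule *rule = rule_find(lpm, masked_ip, depth);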