From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from relay-out6.mail.masterhost.ru (relay-out6.mail.masterhost.ru [83.222.12.16]) by dpdk.org (Postfix) with ESMTP id 245631BE7A for ; Fri, 6 Jul 2018 18:59:32 +0200 (CEST) Received: from [37.139.80.50] (helo=nw) by relay6.mail.masterhost.ru with esmtpa envelope from authenticated with alex@therouter.net message id 1fbU4p-0005Ii-TV; Fri, 06 Jul 2018 19:59:24 +0300 Date: Fri, 6 Jul 2018 19:59:22 +0300 From: Alex Kiselev Message-ID: <710674507.20180706195922@therouter.net> To: Bruce Richardson CC: "dev@dpdk.org" In-Reply-To: <20180706161640.GA24764@bricha3-MOBL.ger.corp.intel.com> References: <20180706161640.GA24764@bricha3-MOBL.ger.corp.intel.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Transfer-Encoding: 7bit X-KLMS-Rule-ID: 1 X-KLMS-Message-Action: clean X-KLMS-AntiSpam-Lua-Profiles: 126623 [Jul 06 2018] X-KLMS-AntiSpam-Version: 5.8.3.0 X-KLMS-AntiSpam-Envelope-From: alex@therouter.net X-KLMS-AntiSpam-Rate: 0 X-KLMS-AntiSpam-Status: not_detected X-KLMS-AntiSpam-Method: none X-KLMS-AntiSpam-Info: LuaCore: 149 149 93a1592153427f08a0384b1e4f099aed80803c7b, {rep_avail}, DmarcAF: none X-MS-Exchange-Organization-SCL: -1 X-KLMS-AntiSpam-Interceptor-Info: scan successful X-KLMS-AntiPhishing: not scanned, disabled by settings X-KLMS-AntiVirus: Kaspersky Security for Linux Mail Server, version 8.0.2.16, not scanned, license restriction Subject: Re: [dpdk-dev] [PATCH v2] librte_lpm: Improve performance of the delete and add functions X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 06 Jul 2018 16:59:32 -0000 Please see inline replies > On Mon, Jul 02, 2018 at 07:42:11PM +0300, Alex Kiselev wrote: >> There are two major problems with the library: >> first, there is no need to rebuild the whole LPM tree >> when a rule is deleted and second, due to the current >> rules algorithm with complexity O(n) it's almost >> impossible to deal with large rule sets (50k or so rules). >> This patch addresses those two issues. >> Signed-off-by: Alex Kiselev > Hi, > Some initial review comments inline below > /Bruce >> --- >> lib/librte_lpm/rte_lpm6.c | 1073 ++++++++++++++++++++++++++++++++++----------- >> 1 file changed, 816 insertions(+), 257 deletions(-) >> diff --git a/lib/librte_lpm/rte_lpm6.c b/lib/librte_lpm/rte_lpm6.c >> index 149677eb1..438db0831 100644 >> --- a/lib/librte_lpm/rte_lpm6.c >> +++ b/lib/librte_lpm/rte_lpm6.c >> @@ -21,6 +21,10 @@ >> #include >> #include >> #include >> +#include >> +#include >> +#include >> +#include >> >> #include "rte_lpm6.h" >> >> @@ -37,6 +41,9 @@ >> #define BYTE_SIZE 8 >> #define BYTES2_SIZE 16 >> >> +#define RULE_HASH_TABLE_EXTRA_SPACE 256 >> +#define TBL24_IND UINT32_MAX >> + >> #define lpm6_tbl8_gindex next_hop >> >> /** Flags for setting an entry as valid/invalid. */ >> @@ -70,6 +77,23 @@ struct rte_lpm6_rule { >> uint8_t depth; /**< Rule depth. */ >> }; >> >> +/** Rules tbl entry key. */ >> +struct rte_lpm6_rule_key { >> + uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */ >> + uint8_t depth; /**< Rule depth. 
*/ >> +}; >> + >> +/* Header of tbl8 */ >> +struct rte_lpm_tbl8_hdr { >> + uint32_t owner_tbl_ind; /**< owner table: TBL24_IND if owner is tbl24, >> + * otherwise index of tbl8 >> + */ >> + uint32_t owner_entry_ind; /**< index of the owner table entry where >> + * pointer to the tbl8 is stored >> + */ >> + uint32_t ref_cnt; /**< table reference counter */ >> +}; >> + >> /** LPM6 structure. */ >> struct rte_lpm6 { >> /* LPM metadata. */ >> @@ -77,12 +101,18 @@ struct rte_lpm6 { >> uint32_t max_rules; /**< Max number of rules. */ >> uint32_t used_rules; /**< Used rules so far. */ >> uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */ >> - uint32_t next_tbl8; /**< Next tbl8 to be used. */ >> >> /* LPM Tables. */ >> - struct rte_lpm6_rule *rules_tbl; /**< LPM rules. */ >> + struct rte_mempool *rules_pool; /**< LPM rules mempool. */ >> + struct rte_hash *rules_tbl; /**< LPM rules. */ >> struct rte_lpm6_tbl_entry tbl24[RTE_LPM6_TBL24_NUM_ENTRIES] >> __rte_cache_aligned; /**< LPM tbl24 table. */ >> + >> + uint32_t *tbl8_pool; /**< pool of indexes of free tbl8s */ >> + uint32_t tbl8_pool_pos; /**< current position in the tbl8 pool */ >> + >> + struct rte_lpm_tbl8_hdr *tbl8_hdrs; /* array of tbl8 headers */ >> + >> struct rte_lpm6_tbl_entry tbl8[0] >> __rte_cache_aligned; /**< LPM tbl8 table. */ >> }; >> @@ -93,22 +123,130 @@ struct rte_lpm6 { >> * and set the rest to 0. >> */ >> static inline void >> -mask_ip(uint8_t *ip, uint8_t depth) >> +ip6_mask_addr(uint8_t *ip, uint8_t depth) >> { >> - int16_t part_depth, mask; >> - int i; >> + int16_t part_depth, mask; >> + int i; >> >> - part_depth = depth; >> + part_depth = depth; >> >> - for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) { >> - if (part_depth < BYTE_SIZE && part_depth >= 0) { >> - mask = (uint16_t)(~(UINT8_MAX >> part_depth)); >> - ip[i] = (uint8_t)(ip[i] & mask); >> - } else if (part_depth < 0) { >> - ip[i] = 0; >> - } >> - part_depth -= BYTE_SIZE; >> - } >> + for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) { >> + if (part_depth < BYTE_SIZE && part_depth >= 0) { >> + mask = (uint16_t)(~(UINT8_MAX >> part_depth)); >> + ip[i] = (uint8_t)(ip[i] & mask); >> + } else if (part_depth < 0) >> + ip[i] = 0; >> + >> + part_depth -= BYTE_SIZE; >> + } >> +} > This block is just a whitespace cleanup, as far as I can see. If there are > other little cleanups like that as part of this set, it would be nice to > have them as a separate initial patch, to make it easier to review the more > complicated changes. If I am not mistaken there was only one place that needed cleanup. >> + >> +/* copy ipv6 address */ >> +static inline void >> +ip6_copy_addr(uint8_t *dst, const uint8_t *src) >> +{ >> + rte_memcpy(dst, src, RTE_LPM6_IPV6_ADDR_SIZE); >> +} >> + >> +/* >> + * LPM6 rule hash function >> + */ >> +static inline uint32_t >> +rule_hash_crc(const void *data, __rte_unused uint32_t data_len, >> + uint32_t init_val) >> +{ >> + return rte_hash_crc(data, sizeof(struct rte_lpm6_rule_key), init_val); >> +} > Why bother passing in the length and making the data a void pointer. I believe it should be compatible with the rte_hash_function prototype.
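To be concrete, something along these lines is what I mean (a standalone sketch, not the patch code itself; the key struct is copied from the patch, and the table parameters are only illustrative values):

#include <stdint.h>
#include <rte_common.h>
#include <rte_hash.h>
#include <rte_hash_crc.h>

#define RTE_LPM6_IPV6_ADDR_SIZE 16 /* same value as in rte_lpm6.h */

/* same layout as the rule key introduced by the patch */
struct rte_lpm6_rule_key {
	uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE];
	uint8_t depth;
};

/*
 * rte_hash_parameters.hash_func must match rte_hash_function from
 * rte_hash.h:
 *     uint32_t (*)(const void *key, uint32_t key_len, uint32_t init_val)
 * which is why the wrapper keeps the void pointer and the (unused) length
 * argument instead of taking struct rte_lpm6_rule_key * directly.
 */
static uint32_t
rule_hash_crc(const void *data, __rte_unused uint32_t data_len,
		uint32_t init_val)
{
	return rte_hash_crc(data, sizeof(struct rte_lpm6_rule_key), init_val);
}

/* illustrative table parameters plugging in the wrapper */
static const struct rte_hash_parameters rule_hash_params_example = {
	.name = "LRH_example",
	.entries = 1024,
	.key_len = sizeof(struct rte_lpm6_rule_key),
	.hash_func = rule_hash_crc,
	.hash_func_init_val = 0,
};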
> Why > not just have the prototype: > rule_hash_crc(struct rte_lpm6_rule_key *data, uint32_t init_val) >> + >> +/* >> + * Init pool of free tbl8 indexes >> + */ >> +static void >> +tbl8_pool_init(struct rte_lpm6 *lpm) >> +{ >> + /* put entire range of indexes to the tbl8 pool */ >> + uint32_t i; >> + for (i = 0; i < lpm->number_tbl8s; i++) >> + lpm->tbl8_pool[i] = i; >> + >> + lpm->tbl8_pool_pos = 0; >> +} >> + >> +/* >> + * Get an index of a free tbl8 from the pool >> + */ >> +static inline uint32_t >> +tbl8_get(struct rte_lpm6 *lpm, uint32_t *tbl8_ind) >> +{ >> + if (lpm->tbl8_pool_pos == lpm->number_tbl8s) >> + /* no more free tbl8 */ >> + return -ENOSPC; >> + >> + /* next index */ >> + *tbl8_ind = lpm->tbl8_pool[lpm->tbl8_pool_pos++]; >> + return 0; >> +} >> + >> +/* >> + * Put an index of a free tbl8 back to the pool >> + */ >> +static inline uint32_t >> +tbl8_put(struct rte_lpm6 *lpm, uint32_t tbl8_ind) >> +{ >> + if (lpm->tbl8_pool_pos == 0) >> + /* pool is full */ >> + return -ENOSPC; >> + >> + lpm->tbl8_pool[--lpm->tbl8_pool_pos] = tbl8_ind; >> + return 0; >> +} >> + >> +/* >> + * Returns number of tbl8s awailable in the pool > Typo: available ok, I'll fix it. >> + */ >> +static inline uint32_t >> +tbl8_available(struct rte_lpm6 *lpm) >> +{ >> + return lpm->number_tbl8s - lpm->tbl8_pool_pos; >> +} >> + >> +/* >> + * Init a rule key. >> + * note that ip must be already masked >> + */ >> +static inline void >> +rule_key_init(struct rte_lpm6_rule_key *key, uint8_t *ip, uint8_t depth) >> +{ >> + ip6_copy_addr(key->ip, ip); >> + key->depth = depth; >> +} >> + >> +/* >> + * Recreate the entire LPM tree by reinserting all rules >> + */ >> +static void >> +recreate_lpm(struct rte_lpm6 *lpm) >> +{ >> + struct rte_lpm6_rule *rule; >> + struct rte_lpm6_rule *rule_key; >> + uint32_t iter = 0; >> + while (rte_hash_iterate(lpm->rules_tbl, (const void **) &rule_key, >> + (void **) &rule, &iter) >= 0) >> + rte_lpm6_add(lpm, rule->ip, rule->depth, rule->next_hop); >> +} >> + >> +/* >> + * Free all rules >> + */ >> +static void >> +rules_free(struct rte_lpm6 *lpm) >> +{ >> + struct rte_lpm6_rule *rule; >> + struct rte_lpm6_rule *rule_key; >> + uint32_t iter = 0; >> + while (rte_hash_iterate(lpm->rules_tbl, (const void **) &rule_key, >> + (void **) &rule, &iter) >= 0) >> + rte_mempool_put(lpm->rules_pool, rule); >> } >> >> /* >> @@ -121,7 +259,7 @@ rte_lpm6_create(const char *name, int socket_id, >> char mem_name[RTE_LPM6_NAMESIZE]; >> struct rte_lpm6 *lpm = NULL; >> struct rte_tailq_entry *te; >> - uint64_t mem_size, rules_size; >> + uint64_t mem_size; >> struct rte_lpm6_list *lpm_list; >> >> lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list); >> @@ -136,12 +274,72 @@ rte_lpm6_create(const char *name, int socket_id, >> return NULL; >> } >> >> + struct rte_mempool *rules_mempool = NULL; >> + struct rte_hash *rules_tbl = NULL; >> + uint32_t *tbl8_pool = NULL; >> + struct rte_lpm_tbl8_hdr *tbl8_hdrs = NULL; >> + >> + /* allocate rules mempool */ >> + snprintf(mem_name, sizeof(mem_name), "LRM_%s", name); > LRM == "LPM Rules Mempool" I assume? 
[Not that it actually matters] yes >> + rules_mempool = rte_mempool_create(mem_name, >> + config->max_rules, sizeof(struct rte_lpm6_rule), 0, 0, >> + NULL, NULL, NULL, NULL, socket_id, >> + MEMPOOL_F_NO_CACHE_ALIGN); >> + if (rules_mempool == NULL) { >> + RTE_LOG(ERR, LPM, "LPM rules mempool allocation failed: %s (%d)", >> + rte_strerror(rte_errno), rte_errno); >> + rte_errno = ENOMEM; >> + goto fail_wo_unlock; >> + } >> + >> + /* create rules hash table */ >> + snprintf(mem_name, sizeof(mem_name), "LRH_%s", name); >> + >> + struct rte_hash_parameters rule_hash_tbl_params = { >> + .entries = config->max_rules + RULE_HASH_TABLE_EXTRA_SPACE, > Are 256 extra slots going to be enough here? Would it be safer to multiply > max_rules by e.g. 1.2 or similar? yes. it would be safer. >> + .key_len = sizeof(struct rte_lpm6_rule_key), >> + .hash_func = rule_hash_crc, > Given that this is not for data plane use, it might be better to use jhash > as the hash function. It's slower, but it does give a better rule > distribution so allows more compact tables. yeah, good idea >> + .hash_func_init_val = 0, >> + .name = mem_name, >> + .reserved = 0, >> + .socket_id = socket_id, >> + .extra_flag = 0 >> + }; >> + >> + rules_tbl = rte_hash_create(&rule_hash_tbl_params); >> + if (rules_tbl == NULL) { >> + RTE_LOG(ERR, LPM, "LPM rules hash table allocation failed: %s (%d)", >> + rte_strerror(rte_errno), rte_errno); >> + goto fail_wo_unlock; >> + } >> + >> + /* allocate tbl8 indexes pool */ >> + tbl8_pool = rte_malloc(NULL, >> + sizeof(uint32_t) * config->number_tbl8s, >> + RTE_CACHE_LINE_SIZE); >> + if (tbl8_pool == NULL) { >> + RTE_LOG(ERR, LPM, "LPM tbl8 pool allocation failed: %s (%d)", >> + rte_strerror(rte_errno), rte_errno); >> + rte_errno = ENOMEM; >> + goto fail_wo_unlock; >> + } >> + >> + /* allocate tbl8 headers */ >> + tbl8_hdrs = rte_malloc(NULL, >> + sizeof(struct rte_lpm_tbl8_hdr) * config->number_tbl8s, >> + RTE_CACHE_LINE_SIZE); >> + if (tbl8_hdrs == NULL) { >> + RTE_LOG(ERR, LPM, "LPM tbl8 headers allocation failed: %s (%d)", >> + rte_strerror(rte_errno), rte_errno); >> + rte_errno = ENOMEM; >> + goto fail_wo_unlock; >> + } >> + >> snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); >> >> /* Determine the amount of memory to allocate. */ >> mem_size = sizeof(*lpm) + (sizeof(lpm->tbl8[0]) * >> RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s); >> - rules_size = sizeof(struct rte_lpm6_rule) * config->max_rules; >> >> rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); >> >> @@ -154,7 +352,7 @@ rte_lpm6_create(const char *name, int socket_id, >> lpm = NULL; >> if (te != NULL) { >> rte_errno = EEXIST; >> - goto exit; >> + goto fail; >> } >> >> /* allocate tailq entry */ >> @@ -162,30 +360,18 @@ rte_lpm6_create(const char *name, int socket_id, >> if (te == NULL) { >> RTE_LOG(ERR, LPM, "Failed to allocate tailq entry!\n"); >> rte_errno = ENOMEM; >> - goto exit; >> + goto fail; >> } >> >> /* Allocate memory to store the LPM data structures. 
*/ >> - lpm = rte_zmalloc_socket(mem_name, (size_t)mem_size, >> + lpm = (struct rte_lpm6 *)rte_zmalloc_socket(mem_name, (size_t)mem_size, >> RTE_CACHE_LINE_SIZE, socket_id); >> >> if (lpm == NULL) { >> RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); >> rte_free(te); >> rte_errno = ENOMEM; >> - goto exit; >> - } >> - >> - lpm->rules_tbl = rte_zmalloc_socket(NULL, >> - (size_t)rules_size, RTE_CACHE_LINE_SIZE, socket_id); >> - >> - if (lpm->rules_tbl == NULL) { >> - RTE_LOG(ERR, LPM, "LPM rules_tbl allocation failed\n"); >> - rte_free(lpm); >> - lpm = NULL; >> - rte_free(te); >> - rte_errno = ENOMEM; >> - goto exit; >> + goto fail; >> } >> >> /* Save user arguments. */ >> @@ -193,14 +379,37 @@ rte_lpm6_create(const char *name, int socket_id, >> lpm->number_tbl8s = config->number_tbl8s; >> snprintf(lpm->name, sizeof(lpm->name), "%s", name); >> >> + lpm->rules_tbl = rules_tbl; >> + lpm->tbl8_pool = tbl8_pool; >> + lpm->tbl8_hdrs = tbl8_hdrs; >> + lpm->rules_pool = rules_mempool; >> + >> + /* init the stack */ >> + tbl8_pool_init(lpm); >> + >> te->data = (void *) lpm; >> >> TAILQ_INSERT_TAIL(lpm_list, te, next); >> + rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); >> + return lpm; >> >> -exit: >> +fail: >> rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); >> >> - return lpm; >> +fail_wo_unlock: >> + if (rules_mempool != NULL) >> + rte_mempool_free(rules_mempool); >> + >> + if (tbl8_hdrs != NULL) >> + rte_free(tbl8_hdrs); >> + >> + if (tbl8_pool != NULL) >> + rte_free(tbl8_pool); >> + >> + if (rules_tbl != NULL) >> + rte_hash_free(rules_tbl); >> + > rte_free doesn't have an issue with taking NULL as parameter, so you can > omit the various NULL checks and just have the series of free calls. got it >> + return NULL; >> } >> >> /* >> @@ -259,50 +468,93 @@ rte_lpm6_free(struct rte_lpm6 *lpm) >> >> rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); >> >> - rte_free(lpm->rules_tbl); >> + rte_mempool_free(lpm->rules_pool); >> + rte_free(lpm->tbl8_hdrs); >> + rte_free(lpm->tbl8_pool); >> + rte_hash_free(lpm->rules_tbl); >> rte_free(lpm); >> rte_free(te); >> } >> >> +/* Find a rule */ >> +static inline struct rte_lpm6_rule* >> +rule_find_with_key(struct rte_lpm6 *lpm, >> + const struct rte_lpm6_rule_key *rule_key) >> +{ >> + /* look for a rule */ >> + struct rte_lpm6_rule *rule; >> + int ret = rte_hash_lookup_data(lpm->rules_tbl, >> + (const void *) rule_key, (void **) &rule); >> + return (ret >= 0) ? rule : NULL; >> +} >> + >> +/* Find a rule */ >> +static struct rte_lpm6_rule* >> +rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) >> +{ >> + /* init a rule key */ >> + struct rte_lpm6_rule_key rule_key; >> + rule_key_init(&rule_key, ip, depth); >> + >> + return rule_find_with_key(lpm, &rule_key); >> +} >> + >> /* >> * Checks if a rule already exists in the rules table and updates >> * the nexthop if so. Otherwise it adds a new rule if enough space is available. >> + * >> + * Returns: >> + * 0 - next hop of existed rule is updated >> + * 1 - new rule successfuly added >> + * <0 - error >> */ >> -static inline int32_t >> -rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint32_t next_hop, uint8_t depth) >> +static inline int >> +rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, uint32_t next_hop) >> { >> - uint32_t rule_index; >> + /* init a rule key */ >> + struct rte_lpm6_rule_key rule_key; >> + rule_key_init(&rule_key, ip, depth); >> >> /* Scan through rule list to see if rule already exists. 
*/ >> - for (rule_index = 0; rule_index < lpm->used_rules; rule_index++) { >> - >> - /* If rule already exists update its next_hop and return. */ >> - if ((memcmp (lpm->rules_tbl[rule_index].ip, ip, >> - RTE_LPM6_IPV6_ADDR_SIZE) == 0) && >> - lpm->rules_tbl[rule_index].depth == depth) { >> - lpm->rules_tbl[rule_index].next_hop = next_hop; >> + struct rte_lpm6_rule *rule = rule_find_with_key(lpm, &rule_key); >> >> - return rule_index; >> - } >> + /* If rule already exists update its next_hop and return. */ >> + if (rule != NULL) { >> + rule->next_hop = next_hop; >> + return 0; >> } >> >> /* >> * If rule does not exist check if there is space to add a new rule to >> * this rule group. If there is no space return error. >> */ >> - if (lpm->used_rules == lpm->max_rules) { >> + if (lpm->used_rules == lpm->max_rules) >> return -ENOSPC; >> - } >> >> - /* If there is space for the new rule add it. */ >> - rte_memcpy(lpm->rules_tbl[rule_index].ip, ip, RTE_LPM6_IPV6_ADDR_SIZE); >> - lpm->rules_tbl[rule_index].next_hop = next_hop; >> - lpm->rules_tbl[rule_index].depth = depth; >> + /* >> + * If there is space for the new rule add it. >> + */ >> + >> + /* get a new rule */ >> + int ret = rte_mempool_get(lpm->rules_pool, (void **) &rule); >> + if (ret < 0) >> + return ret; >> + >> + /* init the rule */ >> + rule->depth = depth; >> + ip6_copy_addr(rule->ip, ip); >> + rule->next_hop = next_hop; >> + >> + /* add the rule */ >> + ret = rte_hash_add_key_data(lpm->rules_tbl, &rule_key, rule); >> + if (ret < 0) { >> + rte_mempool_put(lpm->rules_pool, rule); >> + return ret; >> + } >> >> /* Increment the used rules counter for this rule group. */ >> lpm->used_rules++; >> - >> - return rule_index; >> + return 1; >> } >> >> /* >> @@ -311,24 +563,24 @@ rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint32_t next_hop, uint8_t depth) >> * in the IP address returns a match. >> */ >> static void >> -expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth, >> - uint32_t next_hop) >> +expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t old_depth, >> + uint8_t new_depth, uint32_t next_hop, uint8_t valid) > Is this change to have separate old depths and new depths a bug fix for an > existing issue, or just a change needed due to the rework below? It's not a bug fix. The function is now used not only in the add operation but also in the delete operation, which requires slightly more complicated logic.
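To make the add/delete distinction concrete, here is a toy model of the rewrite that expand_rule() performs after this change (a standalone sketch with simplified fields, not the patch code; the delete-path call below describes the intent, since that hunk is not in this excerpt):

#include <stdint.h>
#include <stdio.h>

struct toy_entry { uint8_t valid; uint8_t depth; uint32_t next_hop; };

/*
 * Add path: old_depth == new_depth, valid == 1 (as in the add_step hunks
 * further down).
 * Delete path (intent): old_depth is the removed rule's depth,
 * new_depth/next_hop come from the next less specific covering rule, and
 * valid == 0 when no covering rule exists.
 */
static void
toy_expand(struct toy_entry *tbl, int n, uint8_t old_depth,
		uint8_t new_depth, uint32_t next_hop, uint8_t valid)
{
	int j;

	for (j = 0; j < n; j++)
		if (!tbl[j].valid || tbl[j].depth <= old_depth) {
			tbl[j].valid = valid;
			tbl[j].depth = new_depth;
			tbl[j].next_hop = next_hop;
		}
}

int main(void)
{
	struct toy_entry tbl[4] = { {1, 48, 7}, {1, 64, 9}, {0, 0, 0}, {1, 40, 7} };
	int j;

	/* delete a /48 rule: entries at /48 or less specific fall back to a
	 * covering /40, while the more specific /64 entry is left untouched */
	toy_expand(tbl, 4, 48, 40, 7, 1);

	for (j = 0; j < 4; j++)
		printf("valid=%u depth=%u nh=%u\n", (unsigned)tbl[j].valid,
			(unsigned)tbl[j].depth, (unsigned)tbl[j].next_hop);
	return 0;
}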
>> { >> uint32_t tbl8_group_end, tbl8_gindex_next, j; >> >> tbl8_group_end = tbl8_gindex + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; >> >> struct rte_lpm6_tbl_entry new_tbl8_entry = { >> - .valid = VALID, >> - .valid_group = VALID, >> - .depth = depth, >> + .valid = valid, >> + .valid_group = valid, >> + .depth = new_depth, >> .next_hop = next_hop, >> .ext_entry = 0, >> }; >> >> for (j = tbl8_gindex; j < tbl8_group_end; j++) { >> if (!lpm->tbl8[j].valid || (lpm->tbl8[j].ext_entry == 0 >> - && lpm->tbl8[j].depth <= depth)) { >> + && lpm->tbl8[j].depth <= old_depth)) { >> >> lpm->tbl8[j] = new_tbl8_entry; >> >> @@ -336,11 +588,101 @@ expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth, >> >> tbl8_gindex_next = lpm->tbl8[j].lpm6_tbl8_gindex >> * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; >> - expand_rule(lpm, tbl8_gindex_next, depth, next_hop); >> + expand_rule(lpm, tbl8_gindex_next, old_depth, new_depth, >> + next_hop, valid); >> } >> } >> } >> >> +/* >> + * Init a tbl8 header >> + */ >> +static inline void >> +init_tbl8_header(struct rte_lpm6 *lpm, uint32_t tbl_ind, >> + uint32_t owner_tbl_ind, uint32_t owner_entry_ind) >> +{ >> + struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind]; >> + tbl_hdr->owner_tbl_ind = owner_tbl_ind; >> + tbl_hdr->owner_entry_ind = owner_entry_ind; >> + tbl_hdr->ref_cnt = 0; >> +} >> + >> +/* >> + * Calculate index to the table based on the number and position >> + * of the bytes being inspected in this step. >> + */ >> +static uint32_t >> +get_bitshift(const uint8_t *ip, uint8_t first_byte, uint8_t bytes) >> +{ >> + uint32_t entry_ind, i; >> + int8_t bitshift; >> + >> + entry_ind = 0; >> + for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) { >> + bitshift = (int8_t)((bytes - i)*BYTE_SIZE); >> + >> + if (bitshift < 0) >> + bitshift = 0; >> + entry_ind = entry_ind | ip[i-1] << bitshift; >> + } >> + >> + return entry_ind; >> +} >> + >> +/* >> + * Simulate adding a new route to the LPM counting number >> + * of new tables that will be needed >> + * >> + * It returns 0 on success, or 1 if >> + * the process needs to be continued by calling the function again. >> + */ >> +static inline int >> +simulate_add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, >> + struct rte_lpm6_tbl_entry **next_tbl, const uint8_t *ip, >> + uint8_t bytes, uint8_t first_byte, uint8_t depth, >> + uint32_t *need_tbl_nb) >> +{ >> + uint32_t entry_ind; >> + uint8_t bits_covered; >> + uint32_t next_tbl_ind; >> + >> + /* >> + * Calculate index to the table based on the number and position >> + * of the bytes being inspected in this step. >> + */ >> + entry_ind = get_bitshift(ip, first_byte, bytes); >> + >> + /* Number of bits covered in this step */ >> + bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); >> + >> + if (depth <= bits_covered) { >> + *need_tbl_nb = 0; >> + return 0; >> + } >> + >> + if (tbl[entry_ind].valid == 0 || tbl[entry_ind].ext_entry == 0) { >> + /* from this point on a new table is needed on each level >> + * that is not covered yet >> + */ >> + depth -= bits_covered; >> + uint32_t cnt = depth >> 3; /* depth / 3 */ > depth / 8, not / 3. Perhaps "/ BYTE_SIZE" might be best. 
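For anyone following along, the arithmetic here is just a byte count of the remaining depth bits; a standalone check of the equivalence (not patch code; the local BYTE_SIZE mirrors the one in rte_lpm6.c):

#include <stdint.h>
#include <stdio.h>

#define BYTE_SIZE 8 /* same value as in rte_lpm6.c */

/* number of additional tbl8 levels needed to cover the remaining bits */
static uint32_t
levels_needed(uint8_t depth_remaining)
{
	uint32_t cnt = depth_remaining / BYTE_SIZE; /* same as depth >> 3 */

	if (depth_remaining % BYTE_SIZE) /* same as depth & 7 */
		cnt++;
	return cnt;
}

int main(void)
{
	/* e.g. a /61 rule stopping at tbl24 (24 bits covered) leaves 37 bits:
	 * four full 8-bit levels plus one partial level -> 5 tbl8s */
	printf("%u\n", (unsigned)levels_needed(61 - 24));
	return 0;
}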
>> + if (depth & 7) /* 0b00000111 */ >> + /* if depth % 8 > 0 then one more table is needed >> + * for those last bits >> + */ >> + cnt++; >> + >> + *need_tbl_nb = cnt; >> + return 0; >> + } >> + >> + next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; >> + *next_tbl = &(lpm->tbl8[next_tbl_ind * >> + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); >> + *need_tbl_nb = 0; >> + return 1; >> +} >> + >> /* >> * Partially adds a new route to the data structure (tbl24+tbl8s). >> * It returns 0 on success, a negative number on failure, or 1 if >> @@ -348,25 +690,21 @@ expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t depth, >> */ >> static inline int >> add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, >> - struct rte_lpm6_tbl_entry **tbl_next, uint8_t *ip, uint8_t bytes, >> - uint8_t first_byte, uint8_t depth, uint32_t next_hop) >> + uint32_t tbl_ind, struct rte_lpm6_tbl_entry **next_tbl, >> + uint32_t *next_tbl_ind, uint8_t *ip, uint8_t bytes, >> + uint8_t first_byte, uint8_t depth, uint32_t next_hop, >> + uint8_t is_new_rule) >> { >> - uint32_t tbl_index, tbl_range, tbl8_group_start, tbl8_group_end, i; >> - int32_t tbl8_gindex; >> - int8_t bitshift; >> + uint32_t entry_ind, tbl_range, tbl8_group_start, tbl8_group_end, i; >> + uint32_t tbl8_gindex; >> uint8_t bits_covered; >> + int ret; >> >> /* >> * Calculate index to the table based on the number and position >> * of the bytes being inspected in this step. >> */ >> - tbl_index = 0; >> - for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) { >> - bitshift = (int8_t)((bytes - i)*BYTE_SIZE); >> - >> - if (bitshift < 0) bitshift = 0; >> - tbl_index = tbl_index | ip[i-1] << bitshift; >> - } >> + entry_ind = get_bitshift(ip, first_byte, bytes); >> >> /* Number of bits covered in this step */ >> bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE); >> @@ -378,7 +716,7 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, >> if (depth <= bits_covered) { >> tbl_range = 1 << (bits_covered - depth); >> >> - for (i = tbl_index; i < (tbl_index + tbl_range); i++) { >> + for (i = entry_ind; i < (entry_ind + tbl_range); i++) { >> if (!tbl[i].valid || (tbl[i].ext_entry == 0 && >> tbl[i].depth <= depth)) { >> >> @@ -400,10 +738,15 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, >> */ >> tbl8_gindex = tbl[i].lpm6_tbl8_gindex * >> RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; >> - expand_rule(lpm, tbl8_gindex, depth, next_hop); >> + expand_rule(lpm, tbl8_gindex, depth, depth, >> + next_hop, VALID); >> } >> } >> >> + /* update tbl8 rule reference counter */ >> + if (tbl_ind != TBL24_IND && is_new_rule) >> + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; >> + >> return 0; >> } >> /* >> @@ -412,12 +755,24 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, >> */ >> else { >> /* If it's invalid a new tbl8 is needed */ >> - if (!tbl[tbl_index].valid) { >> - if (lpm->next_tbl8 < lpm->number_tbl8s) >> - tbl8_gindex = (lpm->next_tbl8)++; >> - else >> + if (!tbl[entry_ind].valid) { >> + /* get a new table */ >> + ret = tbl8_get(lpm, &tbl8_gindex); >> + if (ret != 0) >> return -ENOSPC; >> >> + /* invalidate all new tbl8 entries */ >> + tbl8_group_start = tbl8_gindex * >> + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; >> + memset(&lpm->tbl8[tbl8_group_start], 0, >> + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES); >> + >> + /* init the new table's header: >> + * save the reference to the owner table >> + */ >> + init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind); >> + >> + /* reference to a new tbl8 */ >> struct rte_lpm6_tbl_entry new_tbl_entry = { >> 
.lpm6_tbl8_gindex = tbl8_gindex, >> .depth = 0, >> @@ -426,17 +781,20 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, >> .ext_entry = 1, >> }; >> >> - tbl[tbl_index] = new_tbl_entry; >> + tbl[entry_ind] = new_tbl_entry; >> + >> + /* update the current table's reference counter */ >> + if (tbl_ind != TBL24_IND) >> + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; >> } >> /* >> - * If it's valid but not extended the rule that was stored * >> + * If it's valid but not extended the rule that was stored >> * here needs to be moved to the next table. >> */ >> - else if (tbl[tbl_index].ext_entry == 0) { >> - /* Search for free tbl8 group. */ >> - if (lpm->next_tbl8 < lpm->number_tbl8s) >> - tbl8_gindex = (lpm->next_tbl8)++; >> - else >> + else if (tbl[entry_ind].ext_entry == 0) { >> + /* get a new tbl8 index */ >> + ret = tbl8_get(lpm, &tbl8_gindex); >> + if (ret != 0) >> return -ENOSPC; >> >> tbl8_group_start = tbl8_gindex * >> @@ -444,13 +802,22 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, >> tbl8_group_end = tbl8_group_start + >> RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; >> >> + struct rte_lpm6_tbl_entry tbl_entry = { >> + .next_hop = tbl[entry_ind].next_hop, >> + .depth = tbl[entry_ind].depth, >> + .valid = VALID, >> + .valid_group = VALID, >> + .ext_entry = 0 >> + }; >> + >> /* Populate new tbl8 with tbl value. */ >> - for (i = tbl8_group_start; i < tbl8_group_end; i++) { >> - lpm->tbl8[i].valid = VALID; >> - lpm->tbl8[i].depth = tbl[tbl_index].depth; >> - lpm->tbl8[i].next_hop = tbl[tbl_index].next_hop; >> - lpm->tbl8[i].ext_entry = 0; >> - } >> + for (i = tbl8_group_start; i < tbl8_group_end; i++) >> + lpm->tbl8[i] = tbl_entry; >> + >> + /* init the new table's header: >> + * save the reference to the owner table >> + */ >> + init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind); >> >> /* >> * Update tbl entry to point to new tbl8 entry. Note: The >> @@ -465,11 +832,16 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, >> .ext_entry = 1, >> }; >> >> - tbl[tbl_index] = new_tbl_entry; >> + tbl[entry_ind] = new_tbl_entry; >> + >> + /* update the current table's reference counter */ >> + if (tbl_ind != TBL24_IND) >> + lpm->tbl8_hdrs[tbl_ind].ref_cnt++; >> } >> >> - *tbl_next = &(lpm->tbl8[tbl[tbl_index].lpm6_tbl8_gindex * >> - RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); >> + *next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex; >> + *next_tbl = &(lpm->tbl8[*next_tbl_ind * >> + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]); >> } >> >> return 1; >> @@ -486,13 +858,55 @@ rte_lpm6_add_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, >> } >> VERSION_SYMBOL(rte_lpm6_add, _v20, 2.0); >> >> + >> +/* >> + * Simulate adding a route to LPM >> + * >> + * Returns: >> + * 0 if success >> + * -ENOSPC not enought tbl8 left >> + */ >> +static int >> +simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth) >> +{ >> + struct rte_lpm6_tbl_entry *tbl; >> + struct rte_lpm6_tbl_entry *tbl_next = NULL; >> + int ret, i; >> + >> + /* number of new tables needed for a step */ >> + uint32_t need_tbl_nb; >> + /* total number of new tables needed */ >> + uint32_t total_need_tbl_nb; >> + >> + /* Inspect the first three bytes through tbl24 on the first step. */ >> + ret = simulate_add_step(lpm, lpm->tbl24, &tbl_next, masked_ip, >> + ADD_FIRST_BYTE, 1, depth, &need_tbl_nb); >> + total_need_tbl_nb = need_tbl_nb; >> + /* >> + * Inspect one by one the rest of the bytes until >> + * the process is completed. 
>> + */ >> + for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && ret == 1; i++) { >> + tbl = tbl_next; >> + ret = simulate_add_step(lpm, tbl, &tbl_next, masked_ip, 1, >> + (uint8_t)(i+1), depth, &need_tbl_nb); >> + total_need_tbl_nb += need_tbl_nb; >> + } >> + >> + if (tbl8_available(lpm) < total_need_tbl_nb) >> + /* not enought tbl8 to add a rule */ >> + return -ENOSPC; >> + >> + return 0; >> +} >> + >> int >> rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, >> uint32_t next_hop) >> { >> struct rte_lpm6_tbl_entry *tbl; >> struct rte_lpm6_tbl_entry *tbl_next = NULL; >> - int32_t rule_index; >> + uint32_t tbl_next_num; >> int status; >> uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; >> int i; >> @@ -502,26 +916,26 @@ rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, >> return -EINVAL; >> >> /* Copy the IP and mask it to avoid modifying user's input data. */ >> - memcpy(masked_ip, ip, RTE_LPM6_IPV6_ADDR_SIZE); >> - mask_ip(masked_ip, depth); >> + ip6_copy_addr(masked_ip, ip); >> + ip6_mask_addr(masked_ip, depth); >> >> /* Add the rule to the rule table. */ >> - rule_index = rule_add(lpm, masked_ip, next_hop, depth); >> - >> + int is_new_rule = rule_add(lpm, masked_ip, depth, next_hop); >> /* If there is no space available for new rule return error. */ >> - if (rule_index < 0) { >> - return rule_index; >> - } >> + if (is_new_rule < 0) >> + return is_new_rule; >> + >> + /* Simulate adding a new route */ >> + int ret = simulate_add(lpm, masked_ip, depth); >> + if (ret < 0) >> + return ret; > Do we need any rollback here for the rule that was just added? yeah, my mistake. definitely, a rollback must be used. >> >> /* Inspect the first three bytes through tbl24 on the first step. */ >> tbl = lpm->tbl24; >> - status = add_step (lpm, tbl, &tbl_next, masked_ip, ADD_FIRST_BYTE, 1, >> - depth, next_hop); >> - if (status < 0) { >> - rte_lpm6_delete(lpm, masked_ip, depth); >> - >> - return status; >> - } >> + status = add_step(lpm, tbl, TBL24_IND, &tbl_next, &tbl_next_num, >> + masked_ip, ADD_FIRST_BYTE, 1, depth, next_hop, >> + is_new_rule); >> + assert(status >= 0); >> >> /* >> * Inspect one by one the rest of the bytes until >> @@ -529,13 +943,10 @@ rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, >> */ >> for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && status == 1; i++) { >> tbl = tbl_next; >> - status = add_step (lpm, tbl, &tbl_next, masked_ip, 1, (uint8_t)(i+1), >> - depth, next_hop); >> - if (status < 0) { >> - rte_lpm6_delete(lpm, masked_ip, depth); >> - >> - return status; >> - } >> + status = add_step(lpm, tbl, tbl_next_num, &tbl_next, >> + &tbl_next_num, masked_ip, 1, (uint8_t)(i+1), >> + depth, next_hop, is_new_rule); >> + assert(status >= 0); >> } >> >> return status; >> @@ -610,9 +1021,8 @@ rte_lpm6_lookup_v1705(const struct rte_lpm6 *lpm, uint8_t *ip, >> uint32_t tbl24_index; >> >> /* DEBUG: Check user input arguments. */ >> - if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL)) { >> + if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL)) >> return -EINVAL; >> - } >> >> first_byte = LOOKUP_FIRST_BYTE; >> tbl24_index = (ip[0] << BYTES2_SIZE) | (ip[1] << BYTE_SIZE) | ip[2]; >> @@ -648,9 +1058,8 @@ rte_lpm6_lookup_bulk_func_v20(const struct rte_lpm6 *lpm, >> int status; >> >> /* DEBUG: Check user input arguments. 
*/ >> - if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) { >> + if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL)) >> return -EINVAL; >> - } >> >> for (i = 0; i < n; i++) { >> first_byte = LOOKUP_FIRST_BYTE; >> @@ -724,30 +1133,6 @@ MAP_STATIC_SYMBOL(int rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm, >> int32_t *next_hops, unsigned int n), >> rte_lpm6_lookup_bulk_func_v1705); >> >> -/* >> - * Finds a rule in rule table. >> - * NOTE: Valid range for depth parameter is 1 .. 128 inclusive. >> - */ >> -static inline int32_t >> -rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth) >> -{ >> - uint32_t rule_index; >> - >> - /* Scan used rules at given depth to find rule. */ >> - for (rule_index = 0; rule_index < lpm->used_rules; rule_index++) { >> - /* If rule is found return the rule index. */ >> - if ((memcmp (lpm->rules_tbl[rule_index].ip, ip, >> - RTE_LPM6_IPV6_ADDR_SIZE) == 0) && >> - lpm->rules_tbl[rule_index].depth == depth) { >> - >> - return rule_index; >> - } >> - } >> - >> - /* If rule is not found return -ENOENT. */ >> - return -ENOENT; >> -} >> - >> /* >> * Look for a rule in the high-level rules table >> */ >> @@ -775,23 +1160,20 @@ int >> rte_lpm6_is_rule_present_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, >> uint32_t *next_hop) >> { >> - uint8_t ip_masked[RTE_LPM6_IPV6_ADDR_SIZE]; >> - int32_t rule_index; >> - >> /* Check user arguments. */ >> if ((lpm == NULL) || next_hop == NULL || ip == NULL || >> (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH)) >> return -EINVAL; >> >> /* Copy the IP and mask it to avoid modifying user's input data. */ >> - memcpy(ip_masked, ip, RTE_LPM6_IPV6_ADDR_SIZE); >> - mask_ip(ip_masked, depth); >> + uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE]; > The line above doesn't strictly need to be moved. It's still recommended to > have variables at the top of the block. ok, I'll move it to the top of the block >> + ip6_copy_addr(masked_ip, ip); >> + ip6_mask_addr(masked_ip, depth); >> >> - /* Look for the rule using rule_find. */ >> - rule_index = rule_find(lpm, ip_masked, depth); >> + struct rte_lpm6_rule *rule = rule_find(lpm, masked_ip, depth); >> > Can mark this as const, which can help justify having it declared > mid-block. :-) ok, I'll move it to the top of the block > -- Alex