Date: Fri, 23 Oct 2015 12:59:06 -0700
From: Stephen Hemminger
To: Matthew Hall
Cc: dev@dpdk.org
Message-ID: <20151023125906.36fd3856@xeon-e3>
In-Reply-To: <20151023183811.GA11859@mhcomputing.net>
References: <1445608311-8092-1-git-send-email-michalx.k.jastrzebski@intel.com>
 <20151023162033.GA10036@mhcomputing.net>
 <20151023093305.2e971298@xeon-e3>
 <20151023183811.GA11859@mhcomputing.net>
Subject: Re: [dpdk-dev] [PATCH v1 0/3] lpm: increase number of next hops for lpm (ipv4)

From 9efec4571eec4db455a29773b95cf9264c046a03 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger
Date: Fri, 23 Oct 2015 12:55:05 -0700
Subject: [PATCH] lpm: brocade extensions

This is a brute-force merge of the Brocade extensions to LPM into the
current DPDK source tree. No API/ABI compatibility is expected.

1. Allow an arbitrary number of rules
2. Get rid of the O(N^2) search for rule add/delete
3. Add route scope
4. Extend next hop to 16 bits
5. Extend delete to return more info (callback and next hop)
6. Dynamically grow the /8 table (requires RCU)
7. Support full /0 and /32 rules
---
 lib/librte_lpm/rte_lpm.c | 814 ++++++++++++++++++++++++++---------------------
 lib/librte_lpm/rte_lpm.h | 381 +++++++---------------
 2 files changed, 567 insertions(+), 628 deletions(-)

diff --git a/lib/librte_lpm/rte_lpm.c b/lib/librte_lpm/rte_lpm.c
index 163ba3c..ef1f0bf 100644
--- a/lib/librte_lpm/rte_lpm.c
+++ b/lib/librte_lpm/rte_lpm.c
@@ -2,6 +2,7 @@
  * BSD LICENSE
  *
  * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * Copyright(c) 2012-2015 Brocade Communications Systems
  * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -38,13 +39,15 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
 #include
-#include	/* for definition of RTE_CACHE_LINE_SIZE */
+#include	/* for definition of RTE_CACHE_LINE_SIZE */
 #include
 #include
+#include
 #include
 #include
 #include
@@ -52,9 +55,25 @@
 #include
 #include
 #include
+#include
 
 #include "rte_lpm.h"
 
+#include
+
+/** Auto-growth of tbl8 */
+#define RTE_LPM_TBL8_INIT_GROUPS	256	/* power of 2 */
+#define RTE_LPM_TBL8_INIT_ENTRIES	(RTE_LPM_TBL8_INIT_GROUPS * \
+					 RTE_LPM_TBL8_GROUP_NUM_ENTRIES)
+/** Rule structure. */
+struct rte_lpm_rule {
+	uint32_t ip;		/**< Rule IP address. */
+	uint16_t next_hop;	/**< Rule next hop. */
+	uint8_t	 scope;		/**< Rule scope */
+	uint8_t	 reserved;
+	RB_ENTRY(rte_lpm_rule) link;
+};
+
 TAILQ_HEAD(rte_lpm_list, rte_tailq_entry);
 
 static struct rte_tailq_elem rte_lpm_tailq = {
@@ -71,31 +90,55 @@ enum valid_flag {
 
 /* Macro to enable/disable run-time checks. */
 #if defined(RTE_LIBRTE_LPM_DEBUG)
-#include
-#define VERIFY_DEPTH(depth) do {				\
-	if ((depth == 0) || (depth > RTE_LPM_MAX_DEPTH))	\
+#define VERIFY_DEPTH(depth) do {				\
+	if (depth > RTE_LPM_MAX_DEPTH)				\
 		rte_panic("LPM: Invalid depth (%u) at line %d", \
-			(unsigned)(depth), __LINE__);		\
+			  (unsigned)(depth), __LINE__);		\
 } while (0)
 #else
 #define VERIFY_DEPTH(depth)
 #endif
 
+/* Comparison function for red-black tree nodes.
+   "If the first argument is smaller than the second, the function
+    returns a value smaller than zero.  If they are equal, the function
+    returns zero.  Otherwise, it should return a value greater than zero."
+*/
+static inline int rules_cmp(const struct rte_lpm_rule *r1,
+			    const struct rte_lpm_rule *r2)
+{
+	if (r1->ip < r2->ip)
+		return -1;
+	else if (r1->ip > r2->ip)
+		return 1;
+	else
+		return r1->scope - r2->scope;
+}
+
+/* Satisfy old style attribute in tree.h header */
+#ifndef __unused
+#define __unused __attribute__ ((unused))
+#endif
+
+/* Generate internal functions and make them static. */
+RB_GENERATE_STATIC(rte_lpm_rules_tree, rte_lpm_rule, link, rules_cmp)
+
 /*
  * Converts a given depth value to its corresponding mask value.
  *
  * depth  (IN)	: range = 1 - 32
- * mask   (OUT)	: 32bit mask
+ * mask   (OUT)	: 32bit mask
  */
 static uint32_t __attribute__((pure))
 depth_to_mask(uint8_t depth)
{
 	VERIFY_DEPTH(depth);
 
-	/* To calculate a mask start with a 1 on the left hand side and right
-	 * shift while populating the left hand side with 1's
-	 */
-	return (int)0x80000000 >> (depth - 1);
+	/* per C std.
shift of 32 bits is undefined */ + if (depth =3D=3D 0) + return 0; + + return ~0u << (32 - depth); } =20 /* @@ -113,7 +156,7 @@ depth_to_range(uint8_t depth) return 1 << (MAX_DEPTH_TBL24 - depth); =20 /* Else if depth is greater than 24 */ - return (1 << (RTE_LPM_MAX_DEPTH - depth)); + return 1 << (32 - depth); } =20 /* @@ -148,31 +191,28 @@ rte_lpm_find_existing(const char *name) * Allocates memory for LPM object */ struct rte_lpm * -rte_lpm_create(const char *name, int socket_id, int max_rules, - __rte_unused int flags) +rte_lpm_create(const char *name, int socket_id) { char mem_name[RTE_LPM_NAMESIZE]; struct rte_lpm *lpm =3D NULL; struct rte_tailq_entry *te; - uint32_t mem_size; + unsigned int depth; struct rte_lpm_list *lpm_list; =20 + /* check that we have an initialized tail queue */ lpm_list =3D RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); =20 - RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl24_entry) !=3D 2); - RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl8_entry) !=3D 2); + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl24_entry) !=3D 4); + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl8_entry) !=3D 4); =20 /* Check user arguments. */ - if ((name =3D=3D NULL) || (socket_id < -1) || (max_rules =3D=3D 0)){ + if ((name =3D=3D NULL) || (socket_id < -1)) { rte_errno =3D EINVAL; return NULL; } =20 snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); =20 - /* Determine the amount of memory to allocate. */ - mem_size =3D sizeof(*lpm) + (sizeof(lpm->rules_tbl[0]) * max_rules); - rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); =20 /* guarantee there's no existing */ @@ -192,17 +232,33 @@ rte_lpm_create(const char *name, int socket_id, int m= ax_rules, } =20 /* Allocate memory to store the LPM data structures. */ - lpm =3D (struct rte_lpm *)rte_zmalloc_socket(mem_name, mem_size, - RTE_CACHE_LINE_SIZE, socket_id); + lpm =3D rte_zmalloc_socket(mem_name, sizeof(*lpm), RTE_CACHE_LINE_SIZE, + socket_id); if (lpm =3D=3D NULL) { RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); - rte_free(te); goto exit; } =20 /* Save user arguments. */ - lpm->max_rules =3D max_rules; snprintf(lpm->name, sizeof(lpm->name), "%s", name); + lpm->socket_id =3D socket_id; + + /* Vyatta change to use red-black tree */ + for (depth =3D 0; depth < RTE_LPM_MAX_DEPTH; ++depth) + RB_INIT(&lpm->rules[depth]); + + /* Vyatta change to dynamically grow tbl8 */ + lpm->tbl8_num_groups =3D RTE_LPM_TBL8_INIT_GROUPS; + lpm->tbl8_rover =3D RTE_LPM_TBL8_INIT_GROUPS - 1; + lpm->tbl8 =3D rte_calloc_socket(NULL, RTE_LPM_TBL8_INIT_ENTRIES, + sizeof(struct rte_lpm_tbl8_entry), + RTE_CACHE_LINE_SIZE, socket_id); + if (lpm->tbl8 =3D=3D NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 group allocation failed\n"); + rte_free(lpm); + lpm =3D NULL; + goto exit; + } =20 te->data =3D (void *) lpm; =20 @@ -245,248 +301,237 @@ rte_lpm_free(struct rte_lpm *lpm) =20 rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); =20 + rte_free(lpm->tbl8); rte_free(lpm); rte_free(te); } =20 + /* - * Adds a rule to the rule table. - * - * NOTE: The rule table is split into 32 groups. Each group contains rules= that - * apply to a specific prefix depth (i.e. group 1 contains rules that appl= y to - * prefixes with a depth of 1 etc.). In the following code (depth - 1) is = used - * to refer to depth 1 because even though the depth range is 1 - 32, dept= hs - * are stored in the rule table from 0 - 31. - * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + * Finds a rule in rule table. 
*/ -static inline int32_t -rule_add(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, - uint8_t next_hop) +static struct rte_lpm_rule * +rule_find(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, uint8_t = scope) { - uint32_t rule_gindex, rule_index, last_rule; - int i; - - VERIFY_DEPTH(depth); - - /* Scan through rule group to see if rule already exists. */ - if (lpm->rule_info[depth - 1].used_rules > 0) { - - /* rule_gindex stands for rule group index. */ - rule_gindex =3D lpm->rule_info[depth - 1].first_rule; - /* Initialise rule_index to point to start of rule group. */ - rule_index =3D rule_gindex; - /* Last rule =3D Last used rule in this rule group. */ - last_rule =3D rule_gindex + lpm->rule_info[depth - 1].used_rules; - - for (; rule_index < last_rule; rule_index++) { + struct rte_lpm_rules_tree *head =3D &lpm->rules[depth]; + struct rte_lpm_rule k =3D { + .ip =3D ip_masked, + .scope =3D scope, + }; =20 - /* If rule already exists update its next_hop and return. */ - if (lpm->rules_tbl[rule_index].ip =3D=3D ip_masked) { - lpm->rules_tbl[rule_index].next_hop =3D next_hop; - - return rule_index; - } - } - - if (rule_index =3D=3D lpm->max_rules) - return -ENOSPC; - } else { - /* Calculate the position in which the rule will be stored. */ - rule_index =3D 0; + return RB_FIND(rte_lpm_rules_tree, head, &k); +} =20 - for (i =3D depth - 1; i > 0; i--) { - if (lpm->rule_info[i - 1].used_rules > 0) { - rule_index =3D lpm->rule_info[i - 1].first_rule + lpm->rule_info[i - 1= ].used_rules; - break; - } - } - if (rule_index =3D=3D lpm->max_rules) - return -ENOSPC; +/* Finds rule in table in scope order */ +static struct rte_lpm_rule * +rule_find_any(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth) +{ + struct rte_lpm_rule *r; + int scope; =20 - lpm->rule_info[depth - 1].first_rule =3D rule_index; + for (scope =3D 255; scope >=3D 0; --scope) { + r =3D rule_find(lpm, ip_masked, depth, scope); + if (r) + return r; } =20 - /* Make room for the new rule in the array. */ - for (i =3D RTE_LPM_MAX_DEPTH; i > depth; i--) { - if (lpm->rule_info[i - 1].first_rule + lpm->rule_info[i - 1].used_rules = =3D=3D lpm->max_rules) - return -ENOSPC; + return NULL; +} =20 - if (lpm->rule_info[i - 1].used_rules > 0) { - lpm->rules_tbl[lpm->rule_info[i - 1].first_rule + lpm->rule_info[i - 1]= .used_rules] - =3D lpm->rules_tbl[lpm->rule_info[i - 1].first_rule]; - lpm->rule_info[i - 1].first_rule++; - } - } +/* + * Adds a rule to the rule table. + * + * NOTE: The rule table is split into 32 groups. Each group contains rules= that + * apply to a specific prefix depth (i.e. group 1 contains rules that appl= y to + * prefixes with a depth of 1 etc.). + * NOTE: Valid range for depth parameter is 0 .. 32 inclusive. + */ +static struct rte_lpm_rule * +rule_add(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + uint16_t next_hop, uint8_t scope) +{ + struct rte_lpm_rules_tree *head =3D &lpm->rules[depth]; + struct rte_lpm_rule *r, *old; =20 - /* Add the new rule. */ - lpm->rules_tbl[rule_index].ip =3D ip_masked; - lpm->rules_tbl[rule_index].next_hop =3D next_hop; + /* + * NB: uses regular malloc to avoid chewing up precious + * memory pool space for rules. + */ + r =3D malloc(sizeof(*r)); + if (!r) + return NULL; =20 - /* Increment the used rules counter for this rule group. 
*/ - lpm->rule_info[depth - 1].used_rules++; + r->ip =3D ip_masked; + r->next_hop =3D next_hop; + r->scope =3D scope; =20 - return rule_index; + old =3D RB_INSERT(rte_lpm_rules_tree, head, r); + if (!old) + return r; + + /* collision with existing rule */ + free(r); + return old; } =20 /* * Delete a rule from the rule table. * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. */ -static inline void -rule_delete(struct rte_lpm *lpm, int32_t rule_index, uint8_t depth) +static void +rule_delete(struct rte_lpm *lpm, struct rte_lpm_rule *r, uint8_t depth) { - int i; + struct rte_lpm_rules_tree *head =3D &lpm->rules[depth]; =20 - VERIFY_DEPTH(depth); - - lpm->rules_tbl[rule_index] =3D lpm->rules_tbl[lpm->rule_info[depth - 1].f= irst_rule - + lpm->rule_info[depth - 1].used_rules - 1]; + RB_REMOVE(rte_lpm_rules_tree, head, r); =20 - for (i =3D depth; i < RTE_LPM_MAX_DEPTH; i++) { - if (lpm->rule_info[i].used_rules > 0) { - lpm->rules_tbl[lpm->rule_info[i].first_rule - 1] =3D - lpm->rules_tbl[lpm->rule_info[i].first_rule + lpm->rule_info[i].used_= rules - 1]; - lpm->rule_info[i].first_rule--; - } - } - - lpm->rule_info[depth - 1].used_rules--; + free(r); } =20 /* - * Finds a rule in rule table. - * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + * Dynamically increase size of tbl8 */ -static inline int32_t -rule_find(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth) +static int +tbl8_grow(struct rte_lpm *lpm) { - uint32_t rule_gindex, last_rule, rule_index; - - VERIFY_DEPTH(depth); + size_t old_size, new_size; + struct rte_lpm_tbl8_entry *new_tbl8; + + /* This should not happen, + * worst case is each /24 can point to one tbl8 */ + if (lpm->tbl8_num_groups >=3D RTE_LPM_TBL24_NUM_ENTRIES) + rte_panic("LPM: tbl8 grow already at %u", + lpm->tbl8_num_groups); + + old_size =3D lpm->tbl8_num_groups; + new_size =3D old_size << 1; + new_tbl8 =3D rte_calloc_socket(NULL, + new_size * RTE_LPM_TBL8_GROUP_NUM_ENTRIES, + sizeof(struct rte_lpm_tbl8_entry), + RTE_CACHE_LINE_SIZE, + lpm->socket_id); + if (new_tbl8 =3D=3D NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 group expand allocation failed\n"); + return -ENOMEM; + } =20 - rule_gindex =3D lpm->rule_info[depth - 1].first_rule; - last_rule =3D rule_gindex + lpm->rule_info[depth - 1].used_rules; + memcpy(new_tbl8, lpm->tbl8, + old_size * RTE_LPM_TBL8_GROUP_NUM_ENTRIES + * sizeof(struct rte_lpm_tbl8_entry)); =20 - /* Scan used rules at given depth to find rule. */ - for (rule_index =3D rule_gindex; rule_index < last_rule; rule_index++) { - /* If rule is found return the rule index. */ - if (lpm->rules_tbl[rule_index].ip =3D=3D ip_masked) - return rule_index; - } + /* swap in new table */ + defer_rcu(rte_free, lpm->tbl8); + rcu_assign_pointer(lpm->tbl8, new_tbl8); + lpm->tbl8_num_groups =3D new_size; =20 - /* If rule is not found return -EINVAL. */ - return -EINVAL; + return 0; } =20 /* * Find, clean and allocate a tbl8. */ -static inline int32_t -tbl8_alloc(struct rte_lpm_tbl8_entry *tbl8) +static int32_t +tbl8_alloc(struct rte_lpm *lpm) { uint32_t tbl8_gindex; /* tbl8 group index. */ struct rte_lpm_tbl8_entry *tbl8_entry; =20 /* Scan through tbl8 to find a free (i.e. INVALID) tbl8 group. 
*/ - for (tbl8_gindex =3D 0; tbl8_gindex < RTE_LPM_TBL8_NUM_GROUPS; - tbl8_gindex++) { - tbl8_entry =3D &tbl8[tbl8_gindex * - RTE_LPM_TBL8_GROUP_NUM_ENTRIES]; + for (tbl8_gindex =3D (lpm->tbl8_rover + 1) & (lpm->tbl8_num_groups - 1); + tbl8_gindex !=3D lpm->tbl8_rover; + tbl8_gindex =3D (tbl8_gindex + 1) & (lpm->tbl8_num_groups - 1)) { + tbl8_entry =3D lpm->tbl8 + + tbl8_gindex * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + /* If a free tbl8 group is found clean it and set as VALID. */ - if (!tbl8_entry->valid_group) { - memset(&tbl8_entry[0], 0, - RTE_LPM_TBL8_GROUP_NUM_ENTRIES * - sizeof(tbl8_entry[0])); + if (likely(!tbl8_entry->valid_group)) + goto found; + } =20 - tbl8_entry->valid_group =3D VALID; + /* Out of space expand */ + tbl8_gindex =3D lpm->tbl8_num_groups; + if (tbl8_grow(lpm) < 0) + return -ENOSPC; =20 - /* Return group index for allocated tbl8 group. */ - return tbl8_gindex; - } - } + tbl8_entry =3D lpm->tbl8 + + tbl8_gindex * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + found: + memset(tbl8_entry, 0, + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * sizeof(tbl8_entry[0])); + + tbl8_entry->valid_group =3D VALID; =20 - /* If there are no tbl8 groups free then return error. */ - return -ENOSPC; + /* Remember last slot to start looking there */ + lpm->tbl8_rover =3D tbl8_gindex; + + /* Return group index for allocated tbl8 group. */ + return tbl8_gindex; } =20 static inline void -tbl8_free(struct rte_lpm_tbl8_entry *tbl8, uint32_t tbl8_group_start) +tbl8_free(struct rte_lpm *lpm, uint32_t tbl8_group_start) { /* Set tbl8 group invalid*/ - tbl8[tbl8_group_start].valid_group =3D INVALID; + lpm->tbl8[tbl8_group_start].valid_group =3D INVALID; } =20 -static inline int32_t +static void add_depth_small(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, - uint8_t next_hop) + uint16_t next_hop) { uint32_t tbl24_index, tbl24_range, tbl8_index, tbl8_group_end, i, j; + struct rte_lpm_tbl24_entry new_tbl24_entry =3D { + .valid =3D VALID, + .ext_entry =3D 0, + .depth =3D depth, + { .next_hop =3D next_hop, } + }; + struct rte_lpm_tbl8_entry new_tbl8_entry =3D { + .valid_group =3D VALID, + .valid =3D VALID, + .depth =3D depth, + .next_hop =3D next_hop, + }; + + /* Force compiler to initialize before assignment */ + rte_barrier(); =20 /* Calculate the index into Table24. */ tbl24_index =3D ip >> 8; tbl24_range =3D depth_to_range(depth); - for (i =3D tbl24_index; i < (tbl24_index + tbl24_range); i++) { /* * For invalid OR valid and non-extended tbl 24 entries set * entry. */ - if (!lpm->tbl24[i].valid || (lpm->tbl24[i].ext_entry =3D=3D 0 && - lpm->tbl24[i].depth <=3D depth)) { - - struct rte_lpm_tbl24_entry new_tbl24_entry =3D { - { .next_hop =3D next_hop, }, - .valid =3D VALID, - .ext_entry =3D 0, - .depth =3D depth, - }; - - /* Setting tbl24 entry in one go to avoid race - * conditions - */ - lpm->tbl24[i] =3D new_tbl24_entry; - + if (!lpm->tbl24[i].valid || lpm->tbl24[i].ext_entry =3D=3D 0) { + if (!lpm->tbl24[i].valid || + lpm->tbl24[i].depth <=3D depth) + lpm->tbl24[i] =3D new_tbl24_entry; continue; } =20 - if (lpm->tbl24[i].ext_entry =3D=3D 1) { - /* If tbl24 entry is valid and extended calculate the - * index into tbl8. 
- */ - tbl8_index =3D lpm->tbl24[i].tbl8_gindex * - RTE_LPM_TBL8_GROUP_NUM_ENTRIES; - tbl8_group_end =3D tbl8_index + - RTE_LPM_TBL8_GROUP_NUM_ENTRIES; - - for (j =3D tbl8_index; j < tbl8_group_end; j++) { - if (!lpm->tbl8[j].valid || - lpm->tbl8[j].depth <=3D depth) { - struct rte_lpm_tbl8_entry - new_tbl8_entry =3D { - .valid =3D VALID, - .valid_group =3D VALID, - .depth =3D depth, - .next_hop =3D next_hop, - }; - - /* - * Setting tbl8 entry in one go to avoid - * race conditions - */ - lpm->tbl8[j] =3D new_tbl8_entry; - - continue; - } + /* If tbl24 entry is valid and extended calculate the index + * into tbl8. */ + tbl8_index =3D lpm->tbl24[i].tbl8_gindex + * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end =3D tbl8_index + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + for (j =3D tbl8_index; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || + lpm->tbl8[j].depth <=3D depth) { + /* + * Setting tbl8 entry in one go to avoid race + * conditions + */ + lpm->tbl8[j] =3D new_tbl8_entry; } } } - - return 0; } =20 -static inline int32_t +static int32_t add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, - uint8_t next_hop) + uint16_t next_hop) { uint32_t tbl24_index; int32_t tbl8_group_index, tbl8_group_start, tbl8_group_end, tbl8_index, @@ -497,12 +542,11 @@ add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked= , uint8_t depth, =20 if (!lpm->tbl24[tbl24_index].valid) { /* Search for a free tbl8 group. */ - tbl8_group_index =3D tbl8_alloc(lpm->tbl8); + tbl8_group_index =3D tbl8_alloc(lpm); =20 - /* Check tbl8 allocation was successful. */ - if (tbl8_group_index < 0) { + /* Check tbl8 allocation was unsuccessful. */ + if (tbl8_group_index < 0) return tbl8_group_index; - } =20 /* Find index into tbl8 and range. */ tbl8_index =3D (tbl8_group_index * @@ -510,35 +554,38 @@ add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked= , uint8_t depth, (ip_masked & 0xFF); =20 /* Set tbl8 entry. */ - for (i =3D tbl8_index; i < (tbl8_index + tbl8_range); i++) { - lpm->tbl8[i].depth =3D depth; - lpm->tbl8[i].next_hop =3D next_hop; - lpm->tbl8[i].valid =3D VALID; - } + struct rte_lpm_tbl8_entry new_tbl8_entry =3D { + .valid_group =3D VALID, + .valid =3D VALID, + .depth =3D depth, + .next_hop =3D next_hop, + }; + + for (i =3D tbl8_index; i < (tbl8_index + tbl8_range); i++) + lpm->tbl8[i] =3D new_tbl8_entry; =20 /* * Update tbl24 entry to point to new tbl8 entry. Note: The * ext_flag and tbl8_index need to be updated simultaneously, * so assign whole structure in one go */ - struct rte_lpm_tbl24_entry new_tbl24_entry =3D { - { .tbl8_gindex =3D (uint8_t)tbl8_group_index, }, .valid =3D VALID, .ext_entry =3D 1, .depth =3D 0, + { .tbl8_gindex =3D tbl8_group_index, } }; =20 + rte_barrier(); lpm->tbl24[tbl24_index] =3D new_tbl24_entry; - - }/* If valid entry but not extended calculate the index into Table8. */ + } + /* If valid entry but not extended calculate the index into Table8. */ else if (lpm->tbl24[tbl24_index].ext_entry =3D=3D 0) { /* Search for free tbl8 group. */ - tbl8_group_index =3D tbl8_alloc(lpm->tbl8); + tbl8_group_index =3D tbl8_alloc(lpm); =20 - if (tbl8_group_index < 0) { + if (tbl8_group_index < 0) return tbl8_group_index; - } =20 tbl8_group_start =3D tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; @@ -546,69 +593,68 @@ add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked= , uint8_t depth, RTE_LPM_TBL8_GROUP_NUM_ENTRIES; =20 /* Populate new tbl8 with tbl24 value. 
*/ - for (i =3D tbl8_group_start; i < tbl8_group_end; i++) { - lpm->tbl8[i].valid =3D VALID; - lpm->tbl8[i].depth =3D lpm->tbl24[tbl24_index].depth; - lpm->tbl8[i].next_hop =3D - lpm->tbl24[tbl24_index].next_hop; - } + struct rte_lpm_tbl8_entry new_tbl8_entry =3D { + .valid_group =3D VALID, + .valid =3D VALID, + .depth =3D lpm->tbl24[tbl24_index].depth, + .next_hop =3D lpm->tbl24[tbl24_index].next_hop, + }; + + for (i =3D tbl8_group_start; i < tbl8_group_end; i++) + lpm->tbl8[i] =3D new_tbl8_entry; =20 tbl8_index =3D tbl8_group_start + (ip_masked & 0xFF); =20 - /* Insert new rule into the tbl8 entry. */ - for (i =3D tbl8_index; i < tbl8_index + tbl8_range; i++) { - if (!lpm->tbl8[i].valid || - lpm->tbl8[i].depth <=3D depth) { - lpm->tbl8[i].valid =3D VALID; - lpm->tbl8[i].depth =3D depth; - lpm->tbl8[i].next_hop =3D next_hop; - - continue; - } - } + /* Insert new specific rule into the tbl8 entry. */ + new_tbl8_entry.depth =3D depth; + new_tbl8_entry.next_hop =3D next_hop; + for (i =3D tbl8_index; i < tbl8_index + tbl8_range; i++) + lpm->tbl8[i] =3D new_tbl8_entry; =20 /* * Update tbl24 entry to point to new tbl8 entry. Note: The * ext_flag and tbl8_index need to be updated simultaneously, * so assign whole structure in one go. */ - struct rte_lpm_tbl24_entry new_tbl24_entry =3D { - { .tbl8_gindex =3D (uint8_t)tbl8_group_index, }, .valid =3D VALID, .ext_entry =3D 1, .depth =3D 0, + { .tbl8_gindex =3D tbl8_group_index, } }; =20 + /* + * Ensure compiler isn't doing something completely odd + * like updating tbl24 before tbl8. + */ + rte_barrier(); lpm->tbl24[tbl24_index] =3D new_tbl24_entry; =20 - } - else { /* - * If it is valid, extended entry calculate the index into tbl8. - */ + } else { + /* + * If it is valid, extended entry calculate the index into tbl8. + */ + struct rte_lpm_tbl8_entry new_tbl8_entry =3D { + .valid_group =3D VALID, + .valid =3D VALID, + .depth =3D depth, + .next_hop =3D next_hop, + }; + rte_barrier(); + tbl8_group_index =3D lpm->tbl24[tbl24_index].tbl8_gindex; tbl8_group_start =3D tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; tbl8_index =3D tbl8_group_start + (ip_masked & 0xFF); =20 for (i =3D tbl8_index; i < (tbl8_index + tbl8_range); i++) { - if (!lpm->tbl8[i].valid || - lpm->tbl8[i].depth <=3D depth) { - struct rte_lpm_tbl8_entry new_tbl8_entry =3D { - .valid =3D VALID, - .depth =3D depth, - .next_hop =3D next_hop, - .valid_group =3D lpm->tbl8[i].valid_group, - }; - + lpm->tbl8[i].depth <=3D depth) { /* * Setting tbl8 entry in one go to avoid race * condition */ lpm->tbl8[i] =3D new_tbl8_entry; - - continue; } } } @@ -621,38 +667,32 @@ add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked= , uint8_t depth, */ int rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, - uint8_t next_hop) + uint16_t next_hop, uint8_t scope) { - int32_t rule_index, status =3D 0; - uint32_t ip_masked; + struct rte_lpm_rule *rule; + uint32_t ip_masked =3D (ip & depth_to_mask(depth)); =20 /* Check user arguments. */ - if ((lpm =3D=3D NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + if ((lpm =3D=3D NULL) || (depth >=3D RTE_LPM_MAX_DEPTH)) return -EINVAL; =20 - ip_masked =3D ip & depth_to_mask(depth); - /* Add the rule to the rule table. */ - rule_index =3D rule_add(lpm, ip_masked, depth, next_hop); + rule =3D rule_add(lpm, ip_masked, depth, next_hop, scope); =20 /* If the is no space available for new rule return error. 
*/ - if (rule_index < 0) { - return rule_index; - } - - if (depth <=3D MAX_DEPTH_TBL24) { - status =3D add_depth_small(lpm, ip_masked, depth, next_hop); - } - else { /* If depth > RTE_LPM_MAX_DEPTH_TBL24 */ - status =3D add_depth_big(lpm, ip_masked, depth, next_hop); + if (rule =3D=3D NULL) + return -ENOSPC; =20 + if (depth <=3D MAX_DEPTH_TBL24) + add_depth_small(lpm, ip_masked, depth, next_hop); + else { /* * If add fails due to exhaustion of tbl8 extensions delete * rule that was added to rule table. */ + int status =3D add_depth_big(lpm, ip_masked, depth, next_hop); if (status < 0) { - rule_delete(lpm, rule_index, depth); - + rule_delete(lpm, rule, depth); return status; } } @@ -665,10 +705,10 @@ rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t= depth, */ int rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, -uint8_t *next_hop) + uint16_t *next_hop, uint8_t scope) { uint32_t ip_masked; - int32_t rule_index; + struct rte_lpm_rule *rule; =20 /* Check user arguments. */ if ((lpm =3D=3D NULL) || @@ -678,10 +718,10 @@ uint8_t *next_hop) =20 /* Look for the rule using rule_find. */ ip_masked =3D ip & depth_to_mask(depth); - rule_index =3D rule_find(lpm, ip_masked, depth); + rule =3D rule_find(lpm, ip_masked, depth, scope); =20 - if (rule_index >=3D 0) { - *next_hop =3D lpm->rules_tbl[rule_index].next_hop; + if (rule !=3D NULL) { + *next_hop =3D rule->next_hop; return 1; } =20 @@ -689,30 +729,29 @@ uint8_t *next_hop) return 0; } =20 -static inline int32_t -find_previous_rule(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint8_= t *sub_rule_depth) +static struct rte_lpm_rule * +find_previous_rule(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t *sub_rule_depth) { - int32_t rule_index; + struct rte_lpm_rule *rule; uint32_t ip_masked; - uint8_t prev_depth; + int prev_depth; =20 - for (prev_depth =3D (uint8_t)(depth - 1); prev_depth > 0; prev_depth--) { + for (prev_depth =3D depth - 1; prev_depth >=3D 0; prev_depth--) { ip_masked =3D ip & depth_to_mask(prev_depth); - - rule_index =3D rule_find(lpm, ip_masked, prev_depth); - - if (rule_index >=3D 0) { + rule =3D rule_find_any(lpm, ip_masked, prev_depth); + if (rule) { *sub_rule_depth =3D prev_depth; - return rule_index; + return rule; } } =20 - return -1; + return NULL; } =20 -static inline int32_t -delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked, - uint8_t depth, int32_t sub_rule_index, uint8_t sub_rule_depth) +static void +delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + struct rte_lpm_rule *sub_rule, uint8_t new_depth) { uint32_t tbl24_range, tbl24_index, tbl8_group_index, tbl8_index, i, j; =20 @@ -720,28 +759,22 @@ delete_depth_small(struct rte_lpm *lpm, uint32_t ip_m= asked, tbl24_range =3D depth_to_range(depth); tbl24_index =3D (ip_masked >> 8); =20 - /* - * Firstly check the sub_rule_index. A -1 indicates no replacement rule - * and a positive number indicates a sub_rule_index. - */ - if (sub_rule_index < 0) { + /* Firstly check the sub_rule. */ + if (sub_rule =3D=3D NULL) { /* * If no replacement rule exists then invalidate entries * associated with this rule. 
*/ for (i =3D tbl24_index; i < (tbl24_index + tbl24_range); i++) { - - if (lpm->tbl24[i].ext_entry =3D=3D 0 && - lpm->tbl24[i].depth <=3D depth ) { - lpm->tbl24[i].valid =3D INVALID; - } - else { + if (lpm->tbl24[i].ext_entry =3D=3D 0) { + if (lpm->tbl24[i].depth <=3D depth) + lpm->tbl24[i].valid =3D INVALID; + } else { /* * If TBL24 entry is extended, then there has * to be a rule with depth >=3D 25 in the * associated TBL8 group. */ - tbl8_group_index =3D lpm->tbl24[i].tbl8_gindex; tbl8_index =3D tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; @@ -749,60 +782,54 @@ delete_depth_small(struct rte_lpm *lpm, uint32_t ip_m= asked, for (j =3D tbl8_index; j < (tbl8_index + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { =20 - if (lpm->tbl8[j].depth <=3D depth) + if (lpm->tbl8[j].valid && + lpm->tbl8[j].depth <=3D depth) lpm->tbl8[j].valid =3D INVALID; } } } - } - else { + } else { /* * If a replacement rule exists then modify entries * associated with this rule. */ - struct rte_lpm_tbl24_entry new_tbl24_entry =3D { - {.next_hop =3D lpm->rules_tbl[sub_rule_index].next_hop,}, .valid =3D VALID, .ext_entry =3D 0, - .depth =3D sub_rule_depth, + .depth =3D new_depth, + { .next_hop =3D sub_rule->next_hop, } }; =20 struct rte_lpm_tbl8_entry new_tbl8_entry =3D { + .valid_group =3D VALID, .valid =3D VALID, - .depth =3D sub_rule_depth, - .next_hop =3D lpm->rules_tbl - [sub_rule_index].next_hop, + .depth =3D new_depth, + .next_hop =3D sub_rule->next_hop, }; =20 for (i =3D tbl24_index; i < (tbl24_index + tbl24_range); i++) { - - if (lpm->tbl24[i].ext_entry =3D=3D 0 && - lpm->tbl24[i].depth <=3D depth ) { - lpm->tbl24[i] =3D new_tbl24_entry; - } - else { + if (lpm->tbl24[i].ext_entry =3D=3D 0) { + if (lpm->tbl24[i].depth <=3D depth) + lpm->tbl24[i] =3D new_tbl24_entry; + } else { /* * If TBL24 entry is extended, then there has * to be a rule with depth >=3D 25 in the * associated TBL8 group. */ - tbl8_group_index =3D lpm->tbl24[i].tbl8_gindex; tbl8_index =3D tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; =20 for (j =3D tbl8_index; j < (tbl8_index + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { - - if (lpm->tbl8[j].depth <=3D depth) + if (!lpm->tbl8[j].valid || + lpm->tbl8[j].depth <=3D depth) lpm->tbl8[j] =3D new_tbl8_entry; } } } } - - return 0; } =20 /* @@ -813,8 +840,9 @@ delete_depth_small(struct rte_lpm *lpm, uint32_t ip_mas= ked, * Return of value > -1 means tbl8 is in use but has all the same values a= nd * thus can be recycled */ -static inline int32_t -tbl8_recycle_check(struct rte_lpm_tbl8_entry *tbl8, uint32_t tbl8_group_st= art) +static int32_t +tbl8_recycle_check(const struct rte_lpm_tbl8_entry *tbl8, + uint32_t tbl8_group_start) { uint32_t tbl8_group_end, i; tbl8_group_end =3D tbl8_group_start + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; @@ -855,13 +883,14 @@ tbl8_recycle_check(struct rte_lpm_tbl8_entry *tbl8, u= int32_t tbl8_group_start) if (tbl8[i].valid) return -EEXIST; } + /* If no valid entries are found then return -EINVAL. 
*/ return -EINVAL; } =20 -static inline int32_t -delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, - uint8_t depth, int32_t sub_rule_index, uint8_t sub_rule_depth) +static void +delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + struct rte_lpm_rule *sub_rule, uint8_t new_depth) { uint32_t tbl24_index, tbl8_group_index, tbl8_group_start, tbl8_index, tbl8_range, i; @@ -879,23 +908,22 @@ delete_depth_big(struct rte_lpm *lpm, uint32_t ip_mas= ked, tbl8_index =3D tbl8_group_start + (ip_masked & 0xFF); tbl8_range =3D depth_to_range(depth); =20 - if (sub_rule_index < 0) { + if (sub_rule =3D=3D NULL) { /* * Loop through the range of entries on tbl8 for which the * rule_to_delete must be removed or modified. */ for (i =3D tbl8_index; i < (tbl8_index + tbl8_range); i++) { - if (lpm->tbl8[i].depth <=3D depth) + if (lpm->tbl8[i].valid && lpm->tbl8[i].depth <=3D depth) lpm->tbl8[i].valid =3D INVALID; } - } - else { + } else { /* Set new tbl8 entry. */ struct rte_lpm_tbl8_entry new_tbl8_entry =3D { + .valid_group =3D VALID, .valid =3D VALID, - .depth =3D sub_rule_depth, - .valid_group =3D lpm->tbl8[tbl8_group_start].valid_group, - .next_hop =3D lpm->rules_tbl[sub_rule_index].next_hop, + .depth =3D new_depth, + .next_hop =3D sub_rule->next_hop, }; =20 /* @@ -903,7 +931,7 @@ delete_depth_big(struct rte_lpm *lpm, uint32_t ip_maske= d, * rule_to_delete must be modified. */ for (i =3D tbl8_index; i < (tbl8_index + tbl8_range); i++) { - if (lpm->tbl8[i].depth <=3D depth) + if (!lpm->tbl8[i].valid || lpm->tbl8[i].depth <=3D depth) lpm->tbl8[i] =3D new_tbl8_entry; } } @@ -915,100 +943,158 @@ delete_depth_big(struct rte_lpm *lpm, uint32_t ip_m= asked, */ =20 tbl8_recycle_index =3D tbl8_recycle_check(lpm->tbl8, tbl8_group_start); - - if (tbl8_recycle_index =3D=3D -EINVAL){ + if (tbl8_recycle_index =3D=3D -EINVAL) { /* Set tbl24 before freeing tbl8 to avoid race condition. */ lpm->tbl24[tbl24_index].valid =3D 0; - tbl8_free(lpm->tbl8, tbl8_group_start); - } - else if (tbl8_recycle_index > -1) { + rte_barrier(); + tbl8_free(lpm, tbl8_group_start); + } else if (tbl8_recycle_index > -1) { /* Update tbl24 entry. */ struct rte_lpm_tbl24_entry new_tbl24_entry =3D { - { .next_hop =3D lpm->tbl8[tbl8_recycle_index].next_hop, }, .valid =3D VALID, .ext_entry =3D 0, .depth =3D lpm->tbl8[tbl8_recycle_index].depth, + { .next_hop =3D lpm->tbl8[tbl8_recycle_index].next_hop, } }; =20 /* Set tbl24 before freeing tbl8 to avoid race condition. */ lpm->tbl24[tbl24_index] =3D new_tbl24_entry; - tbl8_free(lpm->tbl8, tbl8_group_start); + rte_barrier(); + tbl8_free(lpm, tbl8_group_start); } +} =20 - return 0; +/* + * Find rule to replace the just deleted. If there is no rule to + * replace the rule_to_delete we return NULL and invalidate the table + * entries associated with this rule. + */ +static void rule_replace(struct rte_lpm *lpm, uint32_t ip, uint8_t depth) +{ + uint32_t ip_masked; + struct rte_lpm_rule *sub_rule; + uint8_t sub_depth =3D 0; + + ip_masked =3D ip & depth_to_mask(depth); + sub_rule =3D find_previous_rule(lpm, ip, depth, &sub_depth); + + /* + * If the input depth value is less than 25 use function + * delete_depth_small otherwise use delete_depth_big. 
+ */ + if (depth <=3D MAX_DEPTH_TBL24) + delete_depth_small(lpm, ip_masked, depth, sub_rule, sub_depth); + else + delete_depth_big(lpm, ip_masked, depth, sub_rule, sub_depth); } =20 /* * Deletes a rule */ int -rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth) +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint16_t *next_hop, uint8_t scope) { - int32_t rule_to_delete_index, sub_rule_index; + struct rte_lpm_rule *rule; uint32_t ip_masked; - uint8_t sub_rule_depth; + /* * Check input arguments. Note: IP must be a positive integer of 32 * bits in length therefore it need not be checked. */ - if ((lpm =3D=3D NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) { + if ((lpm =3D=3D NULL) || (depth >=3D RTE_LPM_MAX_DEPTH)) return -EINVAL; - } =20 ip_masked =3D ip & depth_to_mask(depth); =20 /* - * Find the index of the input rule, that needs to be deleted, in the + * Find the input rule, that needs to be deleted, in the * rule table. */ - rule_to_delete_index =3D rule_find(lpm, ip_masked, depth); + rule =3D rule_find(lpm, ip_masked, depth, scope); =20 /* * Check if rule_to_delete_index was found. If no rule was found the - * function rule_find returns -EINVAL. + * function rule_find returns -E_RTE_NO_TAILQ. */ - if (rule_to_delete_index < 0) + if (rule =3D=3D NULL) return -EINVAL; =20 - /* Delete the rule from the rule table. */ - rule_delete(lpm, rule_to_delete_index, depth); - /* - * Find rule to replace the rule_to_delete. If there is no rule to - * replace the rule_to_delete we return -1 and invalidate the table - * entries associated with this rule. + * Return next hop so caller can avoid lookup. */ - sub_rule_depth =3D 0; - sub_rule_index =3D find_previous_rule(lpm, ip, depth, &sub_rule_depth); + if (next_hop) + *next_hop =3D rule->next_hop; =20 - /* - * If the input depth value is less than 25 use function - * delete_depth_small otherwise use delete_depth_big. - */ - if (depth <=3D MAX_DEPTH_TBL24) { - return delete_depth_small(lpm, ip_masked, depth, - sub_rule_index, sub_rule_depth); - } - else { /* If depth > MAX_DEPTH_TBL24 */ - return delete_depth_big(lpm, ip_masked, depth, sub_rule_index, sub_rule_= depth); - } + /* Delete the rule from the rule table. */ + rule_delete(lpm, rule, depth); + + /* Replace with next level up rule */ + rule_replace(lpm, ip, depth); + + return 0; } =20 /* * Delete all rules from the LPM table. */ void -rte_lpm_delete_all(struct rte_lpm *lpm) +rte_lpm_delete_all(struct rte_lpm *lpm, rte_lpm_walk_func_t func, void *ar= g) { - /* Zero rule information. */ - memset(lpm->rule_info, 0, sizeof(lpm->rule_info)); + uint8_t depth; =20 /* Zero tbl24. */ memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); =20 /* Zero tbl8. */ - memset(lpm->tbl8, 0, sizeof(lpm->tbl8)); + memset(lpm->tbl8, 0, + lpm->tbl8_num_groups * RTE_LPM_TBL8_GROUP_NUM_ENTRIES + * sizeof(struct rte_lpm_tbl8_entry)); + lpm->tbl8_rover =3D lpm->tbl8_num_groups - 1; =20 /* Delete all rules form the rules table. 
*/ - memset(lpm->rules_tbl, 0, sizeof(lpm->rules_tbl[0]) * lpm->max_rules); + for (depth =3D 0; depth < RTE_LPM_MAX_DEPTH; ++depth) { + struct rte_lpm_rules_tree *head =3D &lpm->rules[depth]; + struct rte_lpm_rule *r, *n; + + RB_FOREACH_SAFE(r, rte_lpm_rules_tree, head, n) { + if (func) + func(lpm, r->ip, depth, r->scope, + r->next_hop, arg); + rule_delete(lpm, r, depth); + } + } +} + +/* + * Iterate over LPM rules + */ +void +rte_lpm_walk(struct rte_lpm *lpm, rte_lpm_walk_func_t func, void *arg) +{ + uint8_t depth; + + for (depth =3D 0; depth < RTE_LPM_MAX_DEPTH; depth++) { + struct rte_lpm_rules_tree *head =3D &lpm->rules[depth]; + struct rte_lpm_rule *r, *n; + + RB_FOREACH_SAFE(r, rte_lpm_rules_tree, head, n) { + func(lpm, r->ip, depth, r->scope, r->next_hop, arg); + } + } +} + +/* Count usage of tbl8 */ +unsigned +rte_lpm_tbl8_count(const struct rte_lpm *lpm) +{ + unsigned i, count =3D 0; + + for (i =3D 0; i < lpm->tbl8_num_groups; i++) { + const struct rte_lpm_tbl8_entry *tbl8_entry + =3D lpm->tbl8 + i * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + if (tbl8_entry->valid_group) + ++count; + } + return count; } diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h index c299ce2..a39e3b5 100644 --- a/lib/librte_lpm/rte_lpm.h +++ b/lib/librte_lpm/rte_lpm.h @@ -2,6 +2,7 @@ * BSD LICENSE * * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * Copyright(c) 2012-2015 Brocade Communications Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -43,11 +44,9 @@ #include #include #include +#include #include -#include #include -#include -#include =20 #ifdef __cplusplus extern "C" { @@ -55,130 +54,89 @@ extern "C" { =20 /** Max number of characters in LPM name. */ #define RTE_LPM_NAMESIZE 32 +=20 + /** Maximum depth value possible for IPv4 LPM. */ +#define RTE_LPM_MAX_DEPTH 33 +=20 +/** Total number of tbl24 entries. */ +#define RTE_LPM_TBL24_NUM_ENTRIES (1 << 24) =20 -/** Maximum depth value possible for IPv4 LPM. */ -#define RTE_LPM_MAX_DEPTH 32 +/** Number of entries in a tbl8 group. */ +#define RTE_LPM_TBL8_GROUP_NUM_ENTRIES 256 =20 -/** @internal Total number of tbl24 entries. */ -#define RTE_LPM_TBL24_NUM_ENTRIES (1 << 24) - -/** @internal Number of entries in a tbl8 group. */ -#define RTE_LPM_TBL8_GROUP_NUM_ENTRIES 256 - -/** @internal Total number of tbl8 groups in the tbl8. */ -#define RTE_LPM_TBL8_NUM_GROUPS 256 - -/** @internal Total number of tbl8 entries. */ -#define RTE_LPM_TBL8_NUM_ENTRIES (RTE_LPM_TBL8_NUM_GROUPS * \ - RTE_LPM_TBL8_GROUP_NUM_ENTRIES) - -/** @internal Macro to enable/disable run-time checks. */ -#if defined(RTE_LIBRTE_LPM_DEBUG) -#define RTE_LPM_RETURN_IF_TRUE(cond, retval) do { \ - if (cond) return (retval); \ -} while (0) -#else -#define RTE_LPM_RETURN_IF_TRUE(cond, retval) -#endif - -/** @internal bitmask with valid and ext_entry/valid_group fields set */ -#define RTE_LPM_VALID_EXT_ENTRY_BITMASK 0x0300 - -/** Bitmask used to indicate successful lookup */ -#define RTE_LPM_LOOKUP_SUCCESS 0x0100 - -#if RTE_BYTE_ORDER =3D=3D RTE_LITTLE_ENDIAN -/** @internal Tbl24 entry structure. */ +/** Tbl24 entry structure. */ struct rte_lpm_tbl24_entry { + /* Using single uint8_t to store 3 values. */ + uint8_t valid :1; /**< Validation flag. */ + uint8_t ext_entry :1; /**< external entry? */ + uint8_t depth; /**< Rule depth. */ /* Stores Next hop or group index (i.e. gindex)into tbl8. 
*/ union { - uint8_t next_hop; - uint8_t tbl8_gindex; + uint16_t next_hop; + uint16_t tbl8_gindex; }; - /* Using single uint8_t to store 3 values. */ - uint8_t valid :1; /**< Validation flag. */ - uint8_t ext_entry :1; /**< External entry. */ - uint8_t depth :6; /**< Rule depth. */ }; =20 -/** @internal Tbl8 entry structure. */ +/** Tbl8 entry structure. */ struct rte_lpm_tbl8_entry { - uint8_t next_hop; /**< next hop. */ - /* Using single uint8_t to store 3 values. */ + uint16_t next_hop; /**< next hop. */ + uint8_t depth; /**< Rule depth. */ uint8_t valid :1; /**< Validation flag. */ uint8_t valid_group :1; /**< Group validation flag. */ - uint8_t depth :6; /**< Rule depth. */ -}; -#else -struct rte_lpm_tbl24_entry { - uint8_t depth :6; - uint8_t ext_entry :1; - uint8_t valid :1; - union { - uint8_t tbl8_gindex; - uint8_t next_hop; - }; -}; - -struct rte_lpm_tbl8_entry { - uint8_t depth :6; - uint8_t valid_group :1; - uint8_t valid :1; - uint8_t next_hop; -}; -#endif - -/** @internal Rule structure. */ -struct rte_lpm_rule { - uint32_t ip; /**< Rule IP address. */ - uint8_t next_hop; /**< Rule next hop. */ -}; - -/** @internal Contains metadata about the rules table. */ -struct rte_lpm_rule_info { - uint32_t used_rules; /**< Used rules so far. */ - uint32_t first_rule; /**< Indexes the first rule of a given depth. */ }; =20 /** @internal LPM structure. */ struct rte_lpm { + TAILQ_ENTRY(rte_lpm) next; /**< Next in list. */ + /* LPM metadata. */ - char name[RTE_LPM_NAMESIZE]; /**< Name of the lpm. */ - uint32_t max_rules; /**< Max. balanced rules per lpm. */ - struct rte_lpm_rule_info rule_info[RTE_LPM_MAX_DEPTH]; /**< Rule info tab= le. */ + char name[RTE_LPM_NAMESIZE]; /**< Name of the lpm. */ + + /**< LPM rules. */ + int socket_id; /**< socket to allocate rules on */ + RB_HEAD(rte_lpm_rules_tree, rte_lpm_rule) rules[RTE_LPM_MAX_DEPTH]; =20 /* LPM Tables. */ - struct rte_lpm_tbl24_entry tbl24[RTE_LPM_TBL24_NUM_ENTRIES] \ + uint32_t tbl8_num_groups; /* Number of slots */ + uint32_t tbl8_rover; /* Next slot to check */ + struct rte_lpm_tbl8_entry *tbl8; /* Actual table */ + + struct rte_lpm_tbl24_entry tbl24[RTE_LPM_TBL24_NUM_ENTRIES] __rte_cache_aligned; /**< LPM tbl24 table. */ - struct rte_lpm_tbl8_entry tbl8[RTE_LPM_TBL8_NUM_ENTRIES] \ - __rte_cache_aligned; /**< LPM tbl8 table. */ - struct rte_lpm_rule rules_tbl[0] \ - __rte_cache_aligned; /**< LPM rules. */ }; =20 /** + * Compiler memory barrier. + * + * Protects against compiler optimization of ordered operations. + */ +#ifdef __GNUC__ +#define rte_barrier() asm volatile("": : :"memory") +#else +/* Intel compiler has intrinsic for this. */ +#define rte_barrier() __memory_barrier() +#endif + +/** * Create an LPM object. * * @param name * LPM object name * @param socket_id * NUMA socket ID for LPM table memory allocation - * @param max_rules - * Maximum number of LPM rules that can be added - * @param flags - * This parameter is currently unused * @return * Handle to LPM object on success, NULL otherwise with rte_errno set * to an appropriate values. 
Possible rte_errno values include: * - E_RTE_NO_CONFIG - function could not get pointer to rte_config str= ucture * - E_RTE_SECONDARY - function was called from a secondary process ins= tance + * - E_RTE_NO_TAILQ - no tailq list could be got for the lpm object list * - EINVAL - invalid parameter passed to function * - ENOSPC - the maximum number of memzones has already been allocated * - EEXIST - a memzone with the same name already exists * - ENOMEM - no appropriate memory area found in which to create memzo= ne */ struct rte_lpm * -rte_lpm_create(const char *name, int socket_id, int max_rules, int flags); +rte_lpm_create(const char *name, int socket_id); =20 /** * Find an existing LPM object and return a pointer to it. @@ -215,11 +173,14 @@ rte_lpm_free(struct rte_lpm *lpm); * Depth of the rule to be added to the LPM table * @param next_hop * Next hop of the rule to be added to the LPM table + * @param scope + * Priority scope of this route rule * @return * 0 on success, negative value otherwise */ int -rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint8_t next_= hop); +rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint16_t next_hop, uint8_t scope); =20 /** * Check if a rule is present in the LPM table, @@ -231,6 +192,8 @@ rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t d= epth, uint8_t next_hop); * IP of the rule to be searched * @param depth * Depth of the rule to searched + * @param scope + * Priority scope of the rule * @param next_hop * Next hop of the rule (valid only if it is found) * @return @@ -238,7 +201,7 @@ rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t d= epth, uint8_t next_hop); */ int rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, -uint8_t *next_hop); + uint16_t *next_hop, uint8_t scope); =20 /** * Delete a rule from the LPM table. @@ -249,20 +212,30 @@ uint8_t *next_hop); * IP of the rule to be deleted from the LPM table * @param depth * Depth of the rule to be deleted from the LPM table + * @param scope + * Priority scope of this route rule * @return * 0 on success, negative value otherwise */ int -rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth); +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint16_t *next_hop, uint8_t scope); + +/** iterator function for LPM rule */ +typedef void (*rte_lpm_walk_func_t)(struct rte_lpm *lpm, + uint32_t ip, uint8_t depth, uint8_t scope, + uint16_t next_hop, void *arg); =20 /** * Delete all rules from the LPM table. * * @param lpm * LPM object handle + * @param func + * Optional callback for each entry */ void -rte_lpm_delete_all(struct rte_lpm *lpm); +rte_lpm_delete_all(struct rte_lpm *lpm, rte_lpm_walk_func_t func, void *ar= g); =20 /** * Lookup an IP into the LPM table. @@ -277,200 +250,80 @@ rte_lpm_delete_all(struct rte_lpm *lpm); * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup = hit */ static inline int -rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint8_t *next_hop) +rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint16_t *next_hop) { - unsigned tbl24_index =3D (ip >> 8); - uint16_t tbl_entry; + struct rte_lpm_tbl24_entry tbl24; + struct rte_lpm_tbl8_entry tbl8; =20 - /* DEBUG: Check user input arguments. 
*/ - RTE_LPM_RETURN_IF_TRUE(((lpm =3D=3D NULL) || (next_hop =3D=3D NULL)), -EI= NVAL); + /* Copy tbl24 entry (to avoid conconcurrency issues) */ + tbl24 =3D lpm->tbl24[ip >> 8]; + rte_barrier(); =20 - /* Copy tbl24 entry */ - tbl_entry =3D *(const uint16_t *)&lpm->tbl24[tbl24_index]; + /* + * Use the tbl24_index to access the required tbl24 entry then check if + * the tbl24 entry is INVALID, if so return -ENOENT. + */ + if (unlikely(!tbl24.valid)) + return -ENOENT; /* Lookup miss. */ =20 - /* Copy tbl8 entry (only if needed) */ - if (unlikely((tbl_entry & RTE_LPM_VALID_EXT_ENTRY_BITMASK) =3D=3D - RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + /* + * If tbl24 entry is valid check if it is NOT extended (i.e. it does + * not use a tbl8 extension) if so return the next hop. + */ + if (tbl24.ext_entry =3D=3D 0) { + *next_hop =3D tbl24.next_hop; + return 0; /* Lookup hit. */ + } =20 - unsigned tbl8_index =3D (uint8_t)ip + - ((uint8_t)tbl_entry * RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + /* + * If tbl24 entry is valid and extended calculate the index into the + * tbl8 entry. + */ + tbl8 =3D lpm->tbl8[tbl24.tbl8_gindex * RTE_LPM_TBL8_GROUP_NUM_ENTRIES + + (ip & 0xFF)]; + rte_barrier(); =20 - tbl_entry =3D *(const uint16_t *)&lpm->tbl8[tbl8_index]; - } + /* Check if the tbl8 entry is invalid and if so return -ENOENT. */ + if (unlikely(!tbl8.valid)) + return -ENOENT; /* Lookup miss. */ =20 - *next_hop =3D (uint8_t)tbl_entry; - return (tbl_entry & RTE_LPM_LOOKUP_SUCCESS) ? 0 : -ENOENT; + /* If the tbl8 entry is valid return return the next_hop. */ + *next_hop =3D tbl8.next_hop; + return 0; /* Lookup hit. */ } =20 /** - * Lookup multiple IP addresses in an LPM table. This may be implemented a= s a - * macro, so the address of the function should not be used. + * Iterate over all rules in the LPM table. * * @param lpm * LPM object handle - * @param ips - * Array of IPs to be looked up in the LPM table - * @param next_hops - * Next hop of the most specific rule found for IP (valid on lookup hit = only). - * This is an array of two byte values. The most significant byte in each - * value says whether the lookup was successful (bitmask - * RTE_LPM_LOOKUP_SUCCESS is set). The least significant byte is the - * actual next hop. - * @param n - * Number of elements in ips (and next_hops) array to lookup. This shoul= d be a - * compile time constant, and divisible by 8 for best performance. - * @return - * -EINVAL for incorrect arguments, otherwise 0 + * @param func + * Callback to display + * @param arg + * Argument passed to iterator */ -#define rte_lpm_lookup_bulk(lpm, ips, next_hops, n) \ - rte_lpm_lookup_bulk_func(lpm, ips, next_hops, n) - -static inline int -rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t * ips, - uint16_t * next_hops, const unsigned n) -{ - unsigned i; - unsigned tbl24_indexes[n]; - - /* DEBUG: Check user input arguments. 
*/ - RTE_LPM_RETURN_IF_TRUE(((lpm =3D=3D NULL) || (ips =3D=3D NULL) || - (next_hops =3D=3D NULL)), -EINVAL); - - for (i =3D 0; i < n; i++) { - tbl24_indexes[i] =3D ips[i] >> 8; - } - - for (i =3D 0; i < n; i++) { - /* Simply copy tbl24 entry to output */ - next_hops[i] =3D *(const uint16_t *)&lpm->tbl24[tbl24_indexes[i]]; - - /* Overwrite output with tbl8 entry if needed */ - if (unlikely((next_hops[i] & RTE_LPM_VALID_EXT_ENTRY_BITMASK) =3D=3D - RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { - - unsigned tbl8_index =3D (uint8_t)ips[i] + - ((uint8_t)next_hops[i] * - RTE_LPM_TBL8_GROUP_NUM_ENTRIES); - - next_hops[i] =3D *(const uint16_t *)&lpm->tbl8[tbl8_index]; - } - } - return 0; -} +void +rte_lpm_walk(struct rte_lpm *lpm, rte_lpm_walk_func_t func, void *arg); =20 -/* Mask four results. */ -#define RTE_LPM_MASKX4_RES UINT64_C(0x00ff00ff00ff00ff) +/** + * Return the number of entries in the Tbl8 array + * + * @param lpm + * LPM object handle + */ +unsigned +rte_lpm_tbl8_count(const struct rte_lpm *lpm); =20 /** - * Lookup four IP addresses in an LPM table. + * Return the number of free entries in the Tbl8 array * * @param lpm * LPM object handle - * @param ip - * Four IPs to be looked up in the LPM table - * @param hop - * Next hop of the most specific rule found for IP (valid on lookup hit = only). - * This is an 4 elements array of two byte values. - * If the lookup was succesfull for the given IP, then least significant= byte - * of the corresponding element is the actual next hop and the most - * significant byte is zero. - * If the lookup for the given IP failed, then corresponding element wou= ld - * contain default value, see description of then next parameter. - * @param defv - * Default value to populate into corresponding element of hop[] array, - * if lookup would fail. */ -static inline void -rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4], - uint16_t defv) +static inline unsigned +rte_lpm_tbl8_free_count(const struct rte_lpm *lpm) { - __m128i i24; - rte_xmm_t i8; - uint16_t tbl[4]; - uint64_t idx, pt; - - const __m128i mask8 =3D - _mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX); - - /* - * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 4 LPM entries - * as one 64-bit value (0x0300030003000300). - */ - const uint64_t mask_xv =3D - ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | - (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 16 | - (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32 | - (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 48); - - /* - * RTE_LPM_LOOKUP_SUCCESS for 4 LPM entries - * as one 64-bit value (0x0100010001000100). - */ - const uint64_t mask_v =3D - ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | - (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 16 | - (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32 | - (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 48); - - /* get 4 indexes for tbl24[]. */ - i24 =3D _mm_srli_epi32(ip, CHAR_BIT); - - /* extract values from tbl24[] */ - idx =3D _mm_cvtsi128_si64(i24); - i24 =3D _mm_srli_si128(i24, sizeof(uint64_t)); - - tbl[0] =3D *(const uint16_t *)&lpm->tbl24[(uint32_t)idx]; - tbl[1] =3D *(const uint16_t *)&lpm->tbl24[idx >> 32]; - - idx =3D _mm_cvtsi128_si64(i24); - - tbl[2] =3D *(const uint16_t *)&lpm->tbl24[(uint32_t)idx]; - tbl[3] =3D *(const uint16_t *)&lpm->tbl24[idx >> 32]; - - /* get 4 indexes for tbl8[]. */ - i8.x =3D _mm_and_si128(ip, mask8); - - pt =3D (uint64_t)tbl[0] | - (uint64_t)tbl[1] << 16 | - (uint64_t)tbl[2] << 32 | - (uint64_t)tbl[3] << 48; - - /* search successfully finished for all 4 IP addresses. 
*/
-	if (likely((pt & mask_xv) == mask_v)) {
-		uintptr_t ph = (uintptr_t)hop;
-		*(uint64_t *)ph = pt & RTE_LPM_MASKX4_RES;
-		return;
-	}
-
-	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
-			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
-		i8.u32[0] = i8.u32[0] +
-			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
-		tbl[0] = *(const uint16_t *)&lpm->tbl8[i8.u32[0]];
-	}
-	if (unlikely((pt >> 16 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
-			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
-		i8.u32[1] = i8.u32[1] +
-			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
-		tbl[1] = *(const uint16_t *)&lpm->tbl8[i8.u32[1]];
-	}
-	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
-			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
-		i8.u32[2] = i8.u32[2] +
-			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
-		tbl[2] = *(const uint16_t *)&lpm->tbl8[i8.u32[2]];
-	}
-	if (unlikely((pt >> 48 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
-			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
-		i8.u32[3] = i8.u32[3] +
-			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
-		tbl[3] = *(const uint16_t *)&lpm->tbl8[i8.u32[3]];
-	}
-
-	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[0] : defv;
-	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[1] : defv;
-	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv;
-	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv;
+	return lpm->tbl8_num_groups - rte_lpm_tbl8_count(lpm);
 }
 
 #ifdef __cplusplus
-- 
2.1.4
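
[Editorial note appended to the archived message: the following is a minimal,
hypothetical caller-side sketch of the interface as it looks after this patch
(rte_lpm_create() without max_rules/flags, 16-bit next hops, a per-rule scope
argument, and the new walk callback). The table name, prefix, next-hop value,
scope 0 and the print_rule() helper are invented for illustration; they are
not part of the patch itself.]

#include <stdio.h>
#include <stdint.h>

#include <rte_lpm.h>

/* Walk callback matching rte_lpm_walk_func_t; the name is ours. */
static void
print_rule(struct rte_lpm *lpm, uint32_t ip, uint8_t depth,
	   uint8_t scope, uint16_t next_hop, void *arg)
{
	(void)lpm;
	(void)arg;
	printf("%u.%u.%u.%u/%u scope %u -> nh %u\n",
	       ip >> 24, (ip >> 16) & 0xffu, (ip >> 8) & 0xffu, ip & 0xffu,
	       (unsigned)depth, (unsigned)scope, (unsigned)next_hop);
}

static int
lpm_example(void)
{
	/* No more max_rules/flags arguments; -1 means any NUMA socket
	 * (the create path only rejects socket_id < -1). */
	struct rte_lpm *lpm = rte_lpm_create("example", -1);
	uint32_t net = (192u << 24) | (168u << 16) | (1u << 8); /* 192.168.1.0 */
	uint16_t nh;

	if (lpm == NULL)
		return -1;

	/* Next hop is 16 bits now; scope 0 is an arbitrary choice here. */
	if (rte_lpm_add(lpm, net, 24, 1000, 0) < 0) {
		rte_lpm_free(lpm);
		return -1;
	}

	if (rte_lpm_lookup(lpm, net | 42, &nh) == 0)	/* 192.168.1.42 */
		printf("hit, next hop %u\n", nh);

	rte_lpm_walk(lpm, print_rule, NULL);

	/* Delete also reports the removed rule's next hop via the out
	 * parameter, so the caller can skip a second lookup. */
	rte_lpm_delete(lpm, net, 24, &nh, 0);

	rte_lpm_free(lpm);
	return 0;
}

Note that scope only breaks ties between rules with the same prefix and
depth: rule_find_any() in the patch scans scopes from 255 down to 0, so when
a rule is deleted, the highest-scope rule on the longest remaining prefix is
the one that backfills the tbl24/tbl8 entries.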