From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <dev-bounces@dpdk.org>
Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124])
	by inbox.dpdk.org (Postfix) with ESMTP id 8EB0D4550E;
	Thu, 27 Jun 2024 20:06:00 +0200 (CEST)
Received: from mails.dpdk.org (localhost [127.0.0.1])
	by mails.dpdk.org (Postfix) with ESMTP id 7BD3040ED0;
	Thu, 27 Jun 2024 20:06:00 +0200 (CEST)
Received: from forward500a.mail.yandex.net (forward500a.mail.yandex.net
 [178.154.239.80])
 by mails.dpdk.org (Postfix) with ESMTP id A8E8540E8A;
 Thu, 27 Jun 2024 20:05:59 +0200 (CEST)
Received: from mail-nwsmtp-smtp-production-main-45.myt.yp-c.yandex.net
 (mail-nwsmtp-smtp-production-main-45.myt.yp-c.yandex.net
 [IPv6:2a02:6b8:c12:28e:0:640:4e0e:0])
 by forward500a.mail.yandex.net (Yandex) with ESMTPS id 561A261FA1;
 Thu, 27 Jun 2024 21:05:59 +0300 (MSK)
Received: by mail-nwsmtp-smtp-production-main-45.myt.yp-c.yandex.net
 (smtp/Yandex) with ESMTPSA id 55WIPZ5OoSw0-M3SkRLm9; 
 Thu, 27 Jun 2024 21:05:58 +0300
X-Yandex-Fwd: 1
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=yandex.ru; s=mail;
 t=1719511558; bh=Rlabw5j544t8qp1XlS2whHqj4Iw92JjRr8JsCOuV5y8=;
 h=Cc:Message-Id:References:Date:In-Reply-To:Subject:To:From;
 b=bNJsaur1HeZCDSQTA1GWLspVHIGhDqTftDvJ2PmJWnNqL+PlXSt0EQ1wWHUyJQUfx
 9Rr9LsgpLGMlBlSCqQcSHkibJ7By01fhQXDY7wrD3fzUjt0g5/4ZLHM9BiwImLnvM0
 q/j/61nCzijBumEWCMLbN4kVCOaPgvHDKr8veg1Q=
Authentication-Results: mail-nwsmtp-smtp-production-main-45.myt.yp-c.yandex.net;
 dkim=pass header.i=@yandex.ru
From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
To: dev@dpdk.org
Cc: Konstantin Ananyev <konstantin.ananyev@huawei.com>, stable@dpdk.org,
 Isaac Boukris <iboukris@gmail.com>,
 =?UTF-8?q?Morten=20Br=C3=B8rup?= <mb@smartsharesystems.com>,
 Stephen Hemminger <stephen@networkplumber.org>
Subject: [PATCH v2 2/3] bfp: fix load hangs with six IPv6 addresses
Date: Thu, 27 Jun 2024 19:04:41 +0100
Message-Id: <20240627180442.1602-3-konstantin.v.ananyev@yandex.ru>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20240627180442.1602-1-konstantin.v.ananyev@yandex.ru>
References: <20240627115531.1440-1-konstantin.v.ananyev@yandex.ru>
 <20240627180442.1602-1-konstantin.v.ananyev@yandex.ru>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: DPDK patches and discussions <dev.dpdk.org>
List-Unsubscribe: <https://mails.dpdk.org/options/dev>,
 <mailto:dev-request@dpdk.org?subject=unsubscribe>
List-Archive: <http://mails.dpdk.org/archives/dev/>
List-Post: <mailto:dev@dpdk.org>
List-Help: <mailto:dev-request@dpdk.org?subject=help>
List-Subscribe: <https://mails.dpdk.org/listinfo/dev>,
 <mailto:dev-request@dpdk.org?subject=subscribe>
Errors-To: dev-bounces@dpdk.org

From: Konstantin Ananyev <konstantin.ananyev@huawei.com>

As described in: https://bugs.dpdk.org/show_bug.cgi?id=1465
converting from following cBPF filter:
"host 1::1 or host 1::1 or host 1::1 or host 1::1 or host 1::1 or host 1::1"
taking too long for BPF verifier ito complete (up to 25 seconds).
Looking at it I didn't find any actual functional bug.
In fact, it does what is expected: goes through each possible path of
BPF program and evaluates register/stack state for each instruction.
The problem is that for program with a lot of conditional branches number
of possible paths starts to grow exponentially and such walk becomes very
excessive.
So  to minimize number of evaluations, this patch implements heuristic
similar to what Linux kernel does - state pruning:
If from given instruction for given program state we explore all possible
paths and for each of them reach bpf_exit() without any complaints and a
valid R0 value, then for that instruction this program state can be
marked as 'safe'. When we later arrive at the same instruction with a state
equivalent to an earlier instruction 'safe' state, we can prune the search.
For now, only states for JCC  targets are saved/examined.
Plus added few extra logging for DEBUG level.

Bugzilla ID: 1465
Fixes: 8021917293d0 ("bpf: add extra validation for input BPF program")
Cc: stable@dpdk.org

Reported-by: Isaac Boukris <iboukris@gmail.com>
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/bpf/bpf_validate.c | 305 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 255 insertions(+), 50 deletions(-)

diff --git a/lib/bpf/bpf_validate.c b/lib/bpf/bpf_validate.c
index 11344fff4d..3cfdc9ddf4 100644
--- a/lib/bpf/bpf_validate.c
+++ b/lib/bpf/bpf_validate.c
@@ -29,10 +29,13 @@ struct bpf_reg_val {
 };
 
 struct bpf_eval_state {
+	SLIST_ENTRY(bpf_eval_state) next; /* for @safe list traversal */
 	struct bpf_reg_val rv[EBPF_REG_NUM];
 	struct bpf_reg_val sv[MAX_BPF_STACK_SIZE / sizeof(uint64_t)];
 };
 
+SLIST_HEAD(bpf_evst_head, bpf_eval_state);
+
 /* possible instruction node colour */
 enum {
 	WHITE,
@@ -52,6 +55,9 @@ enum {
 
 #define	MAX_EDGES	2
 
+/* max number of 'safe' evaluated states to track per node */
+#define NODE_EVST_MAX	32
+
 struct inst_node {
 	uint8_t colour;
 	uint8_t nb_edge:4;
@@ -59,7 +65,18 @@ struct inst_node {
 	uint8_t edge_type[MAX_EDGES];
 	uint32_t edge_dest[MAX_EDGES];
 	uint32_t prev_node;
-	struct bpf_eval_state *evst;
+	struct {
+		struct bpf_eval_state *cur;   /* save/restore for jcc targets */
+		struct bpf_eval_state *start;
+		struct bpf_evst_head safe;    /* safe states for track/prune */
+		uint32_t nb_safe;
+	} evst;
+};
+
+struct evst_pool {
+	uint32_t num;
+	uint32_t cur;
+	struct bpf_eval_state *ent;
 };
 
 struct bpf_verifier {
@@ -73,11 +90,8 @@ struct bpf_verifier {
 	uint32_t edge_type[MAX_EDGE_TYPE];
 	struct bpf_eval_state *evst;
 	struct inst_node *evin;
-	struct {
-		uint32_t num;
-		uint32_t cur;
-		struct bpf_eval_state *ent;
-	} evst_pool;
+	struct evst_pool evst_sr_pool; /* for evst save/restore */
+	struct evst_pool evst_tp_pool; /* for evst track/prune */
 };
 
 struct bpf_ins_check {
@@ -1085,7 +1099,7 @@ eval_jcc(struct bpf_verifier *bvf, const struct ebpf_insn *ins)
 	struct bpf_reg_val rvf, rvt;
 
 	tst = bvf->evst;
-	fst = bvf->evin->evst;
+	fst = bvf->evin->evst.cur;
 
 	frd = fst->rv + ins->dst_reg;
 	trd = tst->rv + ins->dst_reg;
@@ -1814,8 +1828,8 @@ add_edge(struct bpf_verifier *bvf, struct inst_node *node, uint32_t nidx)
 	uint32_t ne;
 
 	if (nidx > bvf->prm->nb_ins) {
-		RTE_BPF_LOG_LINE(ERR, "%s: program boundary violation at pc: %u, "
-			"next pc: %u",
+		RTE_BPF_LOG_LINE(ERR,
+			"%s: program boundary violation at pc: %u, next pc: %u",
 			__func__, get_node_idx(bvf, node), nidx);
 		return -EINVAL;
 	}
@@ -2091,60 +2105,114 @@ validate(struct bpf_verifier *bvf)
  * helper functions get/free eval states.
  */
 static struct bpf_eval_state *
-pull_eval_state(struct bpf_verifier *bvf)
+pull_eval_state(struct evst_pool *pool)
 {
 	uint32_t n;
 
-	n = bvf->evst_pool.cur;
-	if (n == bvf->evst_pool.num)
+	n = pool->cur;
+	if (n == pool->num)
 		return NULL;
 
-	bvf->evst_pool.cur = n + 1;
-	return bvf->evst_pool.ent + n;
+	pool->cur = n + 1;
+	return pool->ent + n;
 }
 
 static void
-push_eval_state(struct bpf_verifier *bvf)
+push_eval_state(struct evst_pool *pool)
 {
-	bvf->evst_pool.cur--;
+	RTE_ASSERT(pool->cur != 0);
+	pool->cur--;
 }
 
 static void
 evst_pool_fini(struct bpf_verifier *bvf)
 {
 	bvf->evst = NULL;
-	free(bvf->evst_pool.ent);
-	memset(&bvf->evst_pool, 0, sizeof(bvf->evst_pool));
+	free(bvf->evst_sr_pool.ent);
+	memset(&bvf->evst_sr_pool, 0, sizeof(bvf->evst_sr_pool));
+	memset(&bvf->evst_tp_pool, 0, sizeof(bvf->evst_tp_pool));
 }
 
 static int
 evst_pool_init(struct bpf_verifier *bvf)
 {
-	uint32_t n;
+	uint32_t k, n;
 
-	n = bvf->nb_jcc_nodes + 1;
+	/*
+	 * We need nb_jcc_nodes + 1 for save_cur/restore_cur
+	 * remaining ones will be used for state tracking/pruning.
+	 */
+	k = bvf->nb_jcc_nodes + 1;
+	n = k * 3;
 
-	bvf->evst_pool.ent = calloc(n, sizeof(bvf->evst_pool.ent[0]));
-	if (bvf->evst_pool.ent == NULL)
+	bvf->evst_sr_pool.ent = calloc(n, sizeof(bvf->evst_sr_pool.ent[0]));
+	if (bvf->evst_sr_pool.ent == NULL)
 		return -ENOMEM;
 
-	bvf->evst_pool.num = n;
-	bvf->evst_pool.cur = 0;
+	bvf->evst_sr_pool.num = k;
+	bvf->evst_sr_pool.cur = 0;
 
-	bvf->evst = pull_eval_state(bvf);
+	bvf->evst_tp_pool.ent = bvf->evst_sr_pool.ent + k;
+	bvf->evst_tp_pool.num = n - k;
+	bvf->evst_tp_pool.cur = 0;
+
+	bvf->evst = pull_eval_state(&bvf->evst_sr_pool);
 	return 0;
 }
 
+/*
+ * try to allocate and initialise new eval state for given node.
+ * later if no errors will be encountered, this state will be accepted as
+ * one of the possible 'safe' states for that node.
+ */
+static void
+save_start_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
+{
+	RTE_ASSERT(node->evst.start == NULL);
+
+	/* limit number of states for one node with some reasonable value */
+	if (node->evst.nb_safe >= NODE_EVST_MAX)
+		return;
+
+	/* try to get new eval_state */
+	node->evst.start = pull_eval_state(&bvf->evst_tp_pool);
+
+	/* make a copy of current state */
+	if (node->evst.start != NULL) {
+		memcpy(node->evst.start, bvf->evst, sizeof(*node->evst.start));
+		SLIST_NEXT(node->evst.start, next) = NULL;
+	}
+}
+
+/*
+ * add @start state to the list of @safe states.
+ */
+static void
+save_safe_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
+{
+	if (node->evst.start == NULL)
+		return;
+
+	SLIST_INSERT_HEAD(&node->evst.safe, node->evst.start, next);
+	node->evst.nb_safe++;
+
+	RTE_BPF_LOG_LINE(DEBUG, "%s(bvf=%p,node=%u,state=%p): nb_safe=%u;",
+		__func__, bvf, get_node_idx(bvf, node), node->evst.start,
+		node->evst.nb_safe);
+
+	node->evst.start = NULL;
+}
+
 /*
  * Save current eval state.
  */
 static int
-save_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
+save_cur_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
 {
 	struct bpf_eval_state *st;
 
 	/* get new eval_state for this node */
-	st = pull_eval_state(bvf);
+	st = pull_eval_state(&bvf->evst_sr_pool);
 	if (st == NULL) {
 		RTE_BPF_LOG_LINE(ERR,
 			"%s: internal error (out of space) at pc: %u",
@@ -2156,11 +2224,13 @@ save_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
 	memcpy(st, bvf->evst, sizeof(*st));
 
 	/* swap current state with new one */
-	node->evst = bvf->evst;
+	RTE_ASSERT(node->evst.cur == NULL);
+	node->evst.cur = bvf->evst;
 	bvf->evst = st;
 
 	RTE_BPF_LOG_LINE(DEBUG, "%s(bvf=%p,node=%u) old/new states: %p/%p;",
-		__func__, bvf, get_node_idx(bvf, node), node->evst, bvf->evst);
+		__func__, bvf, get_node_idx(bvf, node), node->evst.cur,
+		bvf->evst);
 
 	return 0;
 }
@@ -2169,14 +2239,15 @@ save_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
  * Restore previous eval state and mark current eval state as free.
  */
 static void
-restore_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
+restore_cur_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
 {
 	RTE_BPF_LOG_LINE(DEBUG, "%s(bvf=%p,node=%u) old/new states: %p/%p;",
-		__func__, bvf, get_node_idx(bvf, node), bvf->evst, node->evst);
+		__func__, bvf, get_node_idx(bvf, node), bvf->evst,
+		node->evst.cur);
 
-	bvf->evst = node->evst;
-	node->evst = NULL;
-	push_eval_state(bvf);
+	bvf->evst = node->evst.cur;
+	node->evst.cur = NULL;
+	push_eval_state(&bvf->evst_sr_pool);
 }
 
 static void
@@ -2193,26 +2264,124 @@ log_dbg_eval_state(const struct bpf_verifier *bvf, const struct ebpf_insn *ins,
 
 	RTE_LOG(DEBUG, BPF,
 		"r%u={\n"
-		"\tv={type=%u, size=%zu},\n"
+		"\tv={type=%u, size=%zu, buf_size=%zu},\n"
 		"\tmask=0x%" PRIx64 ",\n"
 		"\tu={min=0x%" PRIx64 ", max=0x%" PRIx64 "},\n"
 		"\ts={min=%" PRId64 ", max=%" PRId64 "},\n"
 		"};\n",
 		ins->dst_reg,
-		rv->v.type, rv->v.size,
+		rv->v.type, rv->v.size, rv->v.buf_size,
 		rv->mask,
 		rv->u.min, rv->u.max,
 		rv->s.min, rv->s.max);
 }
 
 /*
- * Do second pass through CFG and try to evaluate instructions
- * via each possible path.
- * Right now evaluation functionality is quite limited.
- * Still need to add extra checks for:
- * - use/return uninitialized registers.
- * - use uninitialized data from the stack.
- * - memory boundaries violation.
+ * compare two evaluation states.
+ * returns zero if @lv is more conservative (safer) then @rv.
+ * returns non-zero value otherwise.
+ */
+static int
+cmp_reg_val_within(const struct bpf_reg_val *lv, const struct bpf_reg_val *rv)
+{
+	/* expect @v and @mask to be identical */
+	if (memcmp(&lv->v, &rv->v, sizeof(lv->v)) != 0 || lv->mask != rv->mask)
+		return -1;
+
+	/* exact match only for mbuf and stack pointers */
+	if (lv->v.type == RTE_BPF_ARG_PTR_MBUF ||
+			lv->v.type == BPF_ARG_PTR_STACK)
+		return -1;
+
+	if (lv->u.min <= rv->u.min && lv->u.max >= rv->u.max &&
+			lv->s.min <= rv->s.min && lv->s.max >= rv->s.max)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * compare two evaluation states.
+ * returns zero if they are identical.
+ * returns positive value if @lv is more conservative (safer) then @rv.
+ * returns negative value otherwise.
+ */
+static int
+cmp_eval_state(const struct bpf_eval_state *lv, const struct bpf_eval_state *rv)
+{
+	int32_t rc;
+	uint32_t i, k;
+
+	/* for stack expect identical values */
+	rc = memcmp(lv->sv, rv->sv, sizeof(lv->sv));
+	if (rc != 0)
+		return -(2 * EBPF_REG_NUM);
+
+	k = 0;
+	/* check register values */
+	for (i = 0; i != RTE_DIM(lv->rv); i++) {
+		rc = memcmp(&lv->rv[i], &rv->rv[i], sizeof(lv->rv[i]));
+		if (rc != 0 && cmp_reg_val_within(&lv->rv[i], &rv->rv[i]) != 0)
+			return -(i + 1);
+		k += (rc != 0);
+	}
+
+	return k;
+}
+
+/*
+ * check did we already evaluated that path and can it be pruned that time.
+ */
+static int
+prune_eval_state(struct bpf_verifier *bvf, const struct inst_node *node,
+	struct inst_node *next)
+{
+	int32_t rc;
+	struct bpf_eval_state *safe;
+
+	rc = INT32_MIN;
+	SLIST_FOREACH(safe, &next->evst.safe, next) {
+		rc = cmp_eval_state(safe, bvf->evst);
+		if (rc >= 0)
+			break;
+	}
+
+	rc = (rc >= 0) ? 0 : -1;
+
+	/*
+	 * current state doesn't match any safe states,
+	 * so no prunning is possible right now,
+	 * track current state for future references.
+	 */
+	if (rc != 0)
+		save_start_eval_state(bvf, next);
+
+	RTE_BPF_LOG_LINE(DEBUG, "%s(bvf=%p,node=%u,next=%u) returns %d, "
+		"next->evst.start=%p, next->evst.nb_safe=%u",
+		__func__, bvf, get_node_idx(bvf, node),
+		get_node_idx(bvf, next), rc,
+		next->evst.start, next->evst.nb_safe);
+	return rc;
+}
+
+/* Do second pass through CFG and try to evaluate instructions
+ * via each possible path. The verifier will try all paths, tracking types of
+ * registers used as input to instructions, and updating resulting type via
+ * register state values. Plus for each register and possible stack value it
+ * tries to estimate possible max/min value.
+ * For conditional jumps, a stack is used to save evaluation state, so one
+ * path is explored while the state for the other path is pushed onto the stack.
+ * Then later, we backtrack to the first pushed instruction and repeat the cycle
+ * until the stack is empty and we're done.
+ * For program with many conditional branches walking through all possible path
+ * could be very excessive. So to minimize number of evaluations we use
+ * heuristic similar to what Linux kernel does - state pruning:
+ * If from given instruction for given program state we explore all possible
+ * paths and for each of them reach _exit() without any complaints and a valid
+ * R0 value, then for that instruction, that program state can be marked as
+ * 'safe'. When we later arrive at the same instruction with a state
+ * equivalent to an earlier instruction's 'safe' state, we can prune the search.
+ * For now, only states for JCC  targets are saved/examined.
  */
 static int
 evaluate(struct bpf_verifier *bvf)
@@ -2223,6 +2392,13 @@ evaluate(struct bpf_verifier *bvf)
 	const struct ebpf_insn *ins;
 	struct inst_node *next, *node;
 
+	struct {
+		uint32_t nb_eval;
+		uint32_t nb_prune;
+		uint32_t nb_save;
+		uint32_t nb_restore;
+	} stats;
+
 	/* initial state of frame pointer */
 	static const struct bpf_reg_val rvfp = {
 		.v = {
@@ -2246,6 +2422,8 @@ evaluate(struct bpf_verifier *bvf)
 	next = node;
 	rc = 0;
 
+	memset(&stats, 0, sizeof(stats));
+
 	while (node != NULL && rc == 0) {
 
 		/*
@@ -2259,11 +2437,14 @@ evaluate(struct bpf_verifier *bvf)
 			op = ins[idx].code;
 
 			/* for jcc node make a copy of evaluation state */
-			if (node->nb_edge > 1)
-				rc |= save_eval_state(bvf, node);
+			if (node->nb_edge > 1) {
+				rc |= save_cur_eval_state(bvf, node);
+				stats.nb_save++;
+			}
 
 			if (ins_chk[op].eval != NULL && rc == 0) {
 				err = ins_chk[op].eval(bvf, ins + idx);
+				stats.nb_eval++;
 				if (err != NULL) {
 					RTE_BPF_LOG_LINE(ERR, "%s: %s at pc: %u",
 						__func__, err, idx);
@@ -2277,21 +2458,37 @@ evaluate(struct bpf_verifier *bvf)
 
 		/* proceed through CFG */
 		next = get_next_node(bvf, node);
+
 		if (next != NULL) {
 
 			/* proceed with next child */
 			if (node->cur_edge == node->nb_edge &&
-					node->evst != NULL)
-				restore_eval_state(bvf, node);
+					node->evst.cur != NULL) {
+				restore_cur_eval_state(bvf, node);
+				stats.nb_restore++;
+			}
 
-			next->prev_node = get_node_idx(bvf, node);
-			node = next;
+			/*
+			 * for jcc targets: check did we already evaluated
+			 * that path and can it's evaluation be skipped that
+			 * time.
+			 */
+			if (node->nb_edge > 1 && prune_eval_state(bvf, node,
+					next) == 0) {
+				next = NULL;
+				stats.nb_prune++;
+			} else {
+				next->prev_node = get_node_idx(bvf, node);
+				node = next;
+			}
 		} else {
 			/*
 			 * finished with current node and all it's kids,
-			 * proceed with parent
+			 * mark it's @start state as safe for future references,
+			 * and proceed with parent.
 			 */
 			node->cur_edge = 0;
+			save_safe_eval_state(bvf, node);
 			node = get_prev_node(bvf, node);
 
 			/* finished */
@@ -2300,6 +2497,14 @@ evaluate(struct bpf_verifier *bvf)
 		}
 	}
 
+	RTE_LOG(DEBUG, BPF, "%s(%p) returns %d, stats:\n"
+		"node evaluations=%u;\n"
+		"state pruned=%u;\n"
+		"state saves=%u;\n"
+		"state restores=%u;\n",
+		__func__, bvf, rc,
+		stats.nb_eval, stats.nb_prune, stats.nb_save, stats.nb_restore);
+
 	return rc;
 }
 
-- 
2.35.3