DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [RFC 0/8] Packet Capture enhancements
@ 2019-10-07 16:52 Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 1/8] pdump: use new pktmbuf copy function Stephen Hemminger
                   ` (7 more replies)
  0 siblings, 8 replies; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 16:52 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

This is a set of patches that provides enhanced capabilities
for packet capture. It adds new features to the base pdump
library (with API versioning), a new PCAPNG output formatter,
and a new application which works like dumpcap from Wireshark.

Stephen Hemminger (8):
  pdump: use new pktmbuf copy function
  pdump: use dynamic logtype
  pdump: tag copied mbuf with port
  pdump: stamp packets with current timestamp
  pdump: add classic BPF filtering
  pdump: add packet header truncation
  pcapng: add new library for writing pcapng files
  app/capture: add packet capture using pcapng

 app/Makefile                             |   1 +
 app/capture/Makefile                     |  19 +
 app/capture/main.c                       | 675 +++++++++++++++++++++++
 app/capture/meson.build                  |  22 +
 app/meson.build                          |   1 +
 app/pdump/main.c                         |  32 +-
 app/test/test_pdump.c                    |  10 +-
 config/common_base                       |  11 +
 lib/Makefile                             |   2 +
 lib/librte_pcapng/Makefile               |  22 +
 lib/librte_pcapng/meson.build            |  10 +
 lib/librte_pcapng/pcapng_proto.h         | 112 ++++
 lib/librte_pcapng/rte_pcapng.c           | 449 +++++++++++++++
 lib/librte_pcapng/rte_pcapng.h           | 132 +++++
 lib/librte_pcapng/rte_pcapng_version.map |  12 +
 lib/librte_pdump/Makefile                |   2 +-
 lib/librte_pdump/pdump_bpf.h             | 168 ++++++
 lib/librte_pdump/rte_pcap_filter.c       | 462 ++++++++++++++++
 lib/librte_pdump/rte_pdump.c             | 326 ++++++-----
 lib/librte_pdump/rte_pdump.h             |  65 ++-
 lib/librte_pdump/rte_pdump_version.map   |   7 +
 lib/meson.build                          |   2 +-
 mk/rte.app.mk                            |   1 +
 23 files changed, 2367 insertions(+), 176 deletions(-)
 create mode 100644 app/capture/Makefile
 create mode 100644 app/capture/main.c
 create mode 100644 app/capture/meson.build
 create mode 100644 lib/librte_pcapng/Makefile
 create mode 100644 lib/librte_pcapng/meson.build
 create mode 100644 lib/librte_pcapng/pcapng_proto.h
 create mode 100644 lib/librte_pcapng/rte_pcapng.c
 create mode 100644 lib/librte_pcapng/rte_pcapng.h
 create mode 100644 lib/librte_pcapng/rte_pcapng_version.map
 create mode 100644 lib/librte_pdump/pdump_bpf.h
 create mode 100644 lib/librte_pdump/rte_pcap_filter.c

-- 
2.20.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [RFC 1/8] pdump: use new pktmbuf copy function
  2019-10-07 16:52 [dpdk-dev] [RFC 0/8] Packet Capture enhancements Stephen Hemminger
@ 2019-10-07 16:52 ` Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 2/8] pdump: use dynamic logtype Stephen Hemminger
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 16:52 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

The rte_pktmbuf_copy() function handles mbuf pools of varying size correctly.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/librte_pdump/rte_pdump.c | 69 +-----------------------------------
 1 file changed, 1 insertion(+), 68 deletions(-)

diff --git a/lib/librte_pdump/rte_pdump.c b/lib/librte_pdump/rte_pdump.c
index cd24dd010951..c665cf237f65 100644
--- a/lib/librte_pdump/rte_pdump.c
+++ b/lib/librte_pdump/rte_pdump.c
@@ -64,73 +64,6 @@ static struct pdump_rxtx_cbs {
 } rx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT],
 tx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];
 
-static inline int
-pdump_pktmbuf_copy_data(struct rte_mbuf *seg, const struct rte_mbuf *m)
-{
-	if (rte_pktmbuf_tailroom(seg) < m->data_len) {
-		RTE_LOG(ERR, PDUMP,
-			"User mempool: insufficient data_len of mbuf\n");
-		return -EINVAL;
-	}
-
-	seg->port = m->port;
-	seg->vlan_tci = m->vlan_tci;
-	seg->hash = m->hash;
-	seg->tx_offload = m->tx_offload;
-	seg->ol_flags = m->ol_flags;
-	seg->packet_type = m->packet_type;
-	seg->vlan_tci_outer = m->vlan_tci_outer;
-	seg->data_len = m->data_len;
-	seg->pkt_len = seg->data_len;
-	rte_memcpy(rte_pktmbuf_mtod(seg, void *),
-			rte_pktmbuf_mtod(m, void *),
-			rte_pktmbuf_data_len(seg));
-
-	return 0;
-}
-
-static inline struct rte_mbuf *
-pdump_pktmbuf_copy(struct rte_mbuf *m, struct rte_mempool *mp)
-{
-	struct rte_mbuf *m_dup, *seg, **prev;
-	uint32_t pktlen;
-	uint16_t nseg;
-
-	m_dup = rte_pktmbuf_alloc(mp);
-	if (unlikely(m_dup == NULL))
-		return NULL;
-
-	seg = m_dup;
-	prev = &seg->next;
-	pktlen = m->pkt_len;
-	nseg = 0;
-
-	do {
-		nseg++;
-		if (pdump_pktmbuf_copy_data(seg, m) < 0) {
-			if (seg != m_dup)
-				rte_pktmbuf_free_seg(seg);
-			rte_pktmbuf_free(m_dup);
-			return NULL;
-		}
-		*prev = seg;
-		prev = &seg->next;
-	} while ((m = m->next) != NULL &&
-			(seg = rte_pktmbuf_alloc(mp)) != NULL);
-
-	*prev = NULL;
-	m_dup->nb_segs = nseg;
-	m_dup->pkt_len = pktlen;
-
-	/* Allocation of new indirect segment failed */
-	if (unlikely(seg == NULL)) {
-		rte_pktmbuf_free(m_dup);
-		return NULL;
-	}
-
-	__rte_mbuf_sanity_check(m_dup, 1);
-	return m_dup;
-}
 
 static inline void
 pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params)
@@ -148,7 +81,7 @@ pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params)
 	ring = cbs->ring;
 	mp = cbs->mp;
 	for (i = 0; i < nb_pkts; i++) {
-		p = pdump_pktmbuf_copy(pkts[i], mp);
+		p = rte_pktmbuf_copy(pkts[i], mp, 0, UINT32_MAX);
 		if (p)
 			dup_bufs[d_pkts++] = p;
 	}
-- 
2.20.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [RFC 2/8] pdump: use dynamic logtype
  2019-10-07 16:52 [dpdk-dev] [RFC 0/8] Packet Capture enhancements Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 1/8] pdump: use new pktmbuf copy function Stephen Hemminger
@ 2019-10-07 16:52 ` Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 3/8] pdump: tag copied mbuf with port Stephen Hemminger
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 16:52 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

The logtype USER1 should not be overloaded for library functions.
Instead, use a dynamic log type.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/librte_pdump/rte_pdump.c | 68 ++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/lib/librte_pdump/rte_pdump.c b/lib/librte_pdump/rte_pdump.c
index c665cf237f65..cfb8be7ca227 100644
--- a/lib/librte_pdump/rte_pdump.c
+++ b/lib/librte_pdump/rte_pdump.c
@@ -13,8 +13,12 @@
 #include "rte_pdump.h"
 
 #define DEVICE_ID_SIZE 64
-/* Macros for printing using RTE_LOG */
-#define RTE_LOGTYPE_PDUMP RTE_LOGTYPE_USER1
+
+/* Macro for printing using RTE_LOG */
+static int pdump_logtype;
+#define PDUMP_LOG(level, fmt, args...)				\
+	rte_log(RTE_LOG_ ## level, pdump_logtype, "%s(): " fmt,	\
+		__func__, ## args)
 
 /* Used for the multi-process communication */
 #define PDUMP_MP	"mp_pdump"
@@ -88,7 +92,7 @@ pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params)
 
 	ring_enq = rte_ring_enqueue_burst(ring, (void *)dup_bufs, d_pkts, NULL);
 	if (unlikely(ring_enq < d_pkts)) {
-		RTE_LOG(DEBUG, PDUMP,
+		PDUMP_LOG(DEBUG,
 			"only %d of packets enqueued to ring\n", ring_enq);
 		do {
 			rte_pktmbuf_free(dup_bufs[ring_enq]);
@@ -127,7 +131,7 @@ pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 		cbs = &rx_cbs[port][qid];
 		if (cbs && operation == ENABLE) {
 			if (cbs->cb) {
-				RTE_LOG(ERR, PDUMP,
+				PDUMP_LOG(ERR,
 					"failed to add rx callback for port=%d "
 					"and queue=%d, callback already exists\n",
 					port, qid);
@@ -138,7 +142,7 @@ pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			cbs->cb = rte_eth_add_first_rx_callback(port, qid,
 								pdump_rx, cbs);
 			if (cbs->cb == NULL) {
-				RTE_LOG(ERR, PDUMP,
+				PDUMP_LOG(ERR,
 					"failed to add rx callback, errno=%d\n",
 					rte_errno);
 				return rte_errno;
@@ -148,7 +152,7 @@ pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			int ret;
 
 			if (cbs->cb == NULL) {
-				RTE_LOG(ERR, PDUMP,
+				PDUMP_LOG(ERR,
 					"failed to delete non existing rx "
 					"callback for port=%d and queue=%d\n",
 					port, qid);
@@ -156,7 +160,7 @@ pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			}
 			ret = rte_eth_remove_rx_callback(port, qid, cbs->cb);
 			if (ret < 0) {
-				RTE_LOG(ERR, PDUMP,
+				PDUMP_LOG(ERR,
 					"failed to remove rx callback, errno=%d\n",
 					-ret);
 				return ret;
@@ -182,7 +186,7 @@ pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 		cbs = &tx_cbs[port][qid];
 		if (cbs && operation == ENABLE) {
 			if (cbs->cb) {
-				RTE_LOG(ERR, PDUMP,
+				PDUMP_LOG(ERR,
 					"failed to add tx callback for port=%d "
 					"and queue=%d, callback already exists\n",
 					port, qid);
@@ -193,7 +197,7 @@ pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			cbs->cb = rte_eth_add_tx_callback(port, qid, pdump_tx,
 								cbs);
 			if (cbs->cb == NULL) {
-				RTE_LOG(ERR, PDUMP,
+				PDUMP_LOG(ERR,
 					"failed to add tx callback, errno=%d\n",
 					rte_errno);
 				return rte_errno;
@@ -203,7 +207,7 @@ pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			int ret;
 
 			if (cbs->cb == NULL) {
-				RTE_LOG(ERR, PDUMP,
+				PDUMP_LOG(ERR,
 					"failed to delete non existing tx "
 					"callback for port=%d and queue=%d\n",
 					port, qid);
@@ -211,7 +215,7 @@ pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			}
 			ret = rte_eth_remove_tx_callback(port, qid, cbs->cb);
 			if (ret < 0) {
-				RTE_LOG(ERR, PDUMP,
+				PDUMP_LOG(ERR,
 					"failed to remove tx callback, errno=%d\n",
 					-ret);
 				return ret;
@@ -240,7 +244,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p)
 		ret = rte_eth_dev_get_port_by_name(p->data.en_v1.device,
 				&port);
 		if (ret < 0) {
-			RTE_LOG(ERR, PDUMP,
+			PDUMP_LOG(ERR,
 				"failed to get port id for device id=%s\n",
 				p->data.en_v1.device);
 			return -EINVAL;
@@ -252,7 +256,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p)
 		ret = rte_eth_dev_get_port_by_name(p->data.dis_v1.device,
 				&port);
 		if (ret < 0) {
-			RTE_LOG(ERR, PDUMP,
+			PDUMP_LOG(ERR,
 				"failed to get port id for device id=%s\n",
 				p->data.dis_v1.device);
 			return -EINVAL;
@@ -270,18 +274,18 @@ set_pdump_rxtx_cbs(const struct pdump_request *p)
 		nb_rx_q = dev_info.nb_rx_queues;
 		nb_tx_q = dev_info.nb_tx_queues;
 		if (nb_rx_q == 0 && flags & RTE_PDUMP_FLAG_RX) {
-			RTE_LOG(ERR, PDUMP,
+			PDUMP_LOG(ERR,
 				"number of rx queues cannot be 0\n");
 			return -EINVAL;
 		}
 		if (nb_tx_q == 0 && flags & RTE_PDUMP_FLAG_TX) {
-			RTE_LOG(ERR, PDUMP,
+			PDUMP_LOG(ERR,
 				"number of tx queues cannot be 0\n");
 			return -EINVAL;
 		}
 		if ((nb_tx_q == 0 || nb_rx_q == 0) &&
 			flags == RTE_PDUMP_FLAG_RXTX) {
-			RTE_LOG(ERR, PDUMP,
+			PDUMP_LOG(ERR,
 				"both tx&rx queues must be non zero\n");
 			return -EINVAL;
 		}
@@ -317,7 +321,7 @@ pdump_server(const struct rte_mp_msg *mp_msg, const void *peer)
 
 	/* recv client requests */
 	if (mp_msg->len_param != sizeof(*cli_req)) {
-		RTE_LOG(ERR, PDUMP, "failed to recv from client\n");
+		PDUMP_LOG(ERR, "failed to recv from client\n");
 		resp->err_value = -EINVAL;
 	} else {
 		cli_req = (const struct pdump_request *)mp_msg->param;
@@ -330,8 +334,8 @@ pdump_server(const struct rte_mp_msg *mp_msg, const void *peer)
 	mp_resp.len_param = sizeof(*resp);
 	mp_resp.num_fds = 0;
 	if (rte_mp_reply(&mp_resp, peer) < 0) {
-		RTE_LOG(ERR, PDUMP, "failed to send to client:%s, %s:%d\n",
-			strerror(rte_errno), __func__, __LINE__);
+		PDUMP_LOG(ERR, "failed to send to client:%s\n",
+			  strerror(rte_errno));
 		return -1;
 	}
 
@@ -359,19 +363,18 @@ static int
 pdump_validate_ring_mp(struct rte_ring *ring, struct rte_mempool *mp)
 {
 	if (ring == NULL || mp == NULL) {
-		RTE_LOG(ERR, PDUMP, "NULL ring or mempool are passed %s:%d\n",
-			__func__, __LINE__);
+		PDUMP_LOG(ERR, "NULL ring or mempool\n");
 		rte_errno = EINVAL;
 		return -1;
 	}
 	if (mp->flags & MEMPOOL_F_SP_PUT || mp->flags & MEMPOOL_F_SC_GET) {
-		RTE_LOG(ERR, PDUMP, "mempool with either SP or SC settings"
+		PDUMP_LOG(ERR, "mempool with either SP or SC settings"
 		" is not valid for pdump, should have MP and MC settings\n");
 		rte_errno = EINVAL;
 		return -1;
 	}
 	if (ring->prod.single || ring->cons.single) {
-		RTE_LOG(ERR, PDUMP, "ring with either SP or SC settings"
+		PDUMP_LOG(ERR, "ring with either SP or SC settings"
 		" is not valid for pdump, should have MP and MC settings\n");
 		rte_errno = EINVAL;
 		return -1;
@@ -385,7 +388,7 @@ pdump_validate_flags(uint32_t flags)
 {
 	if (flags != RTE_PDUMP_FLAG_RX && flags != RTE_PDUMP_FLAG_TX &&
 		flags != RTE_PDUMP_FLAG_RXTX) {
-		RTE_LOG(ERR, PDUMP,
+		PDUMP_LOG(ERR,
 			"invalid flags, should be either rx/tx/rxtx\n");
 		rte_errno = EINVAL;
 		return -1;
@@ -400,17 +403,15 @@ pdump_validate_port(uint16_t port, char *name)
 	int ret = 0;
 
 	if (port >= RTE_MAX_ETHPORTS) {
-		RTE_LOG(ERR, PDUMP, "Invalid port id %u, %s:%d\n", port,
-			__func__, __LINE__);
+		PDUMP_LOG(ERR, "Invalid port id %u\n", port);
 		rte_errno = EINVAL;
 		return -1;
 	}
 
 	ret = rte_eth_dev_get_name_by_port(port, name);
 	if (ret < 0) {
-		RTE_LOG(ERR, PDUMP,
-			"port id to name mapping failed for port id=%u, %s:%d\n",
-			port, __func__, __LINE__);
+		PDUMP_LOG(ERR, "port %u to name mapping failed\n",
+			  port);
 		rte_errno = EINVAL;
 		return -1;
 	}
@@ -465,7 +466,7 @@ pdump_prepare_client_request(char *device, uint16_t queue,
 	}
 
 	if (ret < 0)
-		RTE_LOG(ERR, PDUMP,
+		PDUMP_LOG(ERR,
 			"client request for pdump enable/disable failed\n");
 	return ret;
 }
@@ -552,3 +553,10 @@ rte_pdump_disable_by_deviceid(char *device_id, uint16_t queue,
 
 	return ret;
 }
+
+RTE_INIT(pdump_log)
+{
+	pdump_logtype = rte_log_register("lib.pdump");
+	if (pdump_logtype >= 0)
+		rte_log_set_level(pdump_logtype, RTE_LOG_NOTICE);
+}
-- 
2.20.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [RFC 3/8] pdump: tag copied mbuf with port
  2019-10-07 16:52 [dpdk-dev] [RFC 0/8] Packet Capture enhancements Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 1/8] pdump: use new pktmbuf copy function Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 2/8] pdump: use dynamic logtype Stephen Hemminger
@ 2019-10-07 16:52 ` Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 4/8] pdump: stamp packets with current timestamp Stephen Hemminger
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 16:52 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

Set the port when packets go into the ring, so the same ring can be
used for multiple ports.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/librte_pdump/rte_pdump.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/lib/librte_pdump/rte_pdump.c b/lib/librte_pdump/rte_pdump.c
index cfb8be7ca227..830decef91e2 100644
--- a/lib/librte_pdump/rte_pdump.c
+++ b/lib/librte_pdump/rte_pdump.c
@@ -68,9 +68,9 @@ static struct pdump_rxtx_cbs {
 } rx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT],
 tx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];
 
-
-static inline void
-pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params)
+static void
+pdump_copy(uint16_t port, struct rte_mbuf **pkts,
+	   uint16_t nb_pkts, void *user_params)
 {
 	unsigned i;
 	int ring_enq;
@@ -86,8 +86,10 @@ pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params)
 	mp = cbs->mp;
 	for (i = 0; i < nb_pkts; i++) {
 		p = rte_pktmbuf_copy(pkts[i], mp, 0, UINT32_MAX);
-		if (p)
+		if (p) {
+			p->port = port;
 			dup_bufs[d_pkts++] = p;
+		}
 	}
 
 	ring_enq = rte_ring_enqueue_burst(ring, (void *)dup_bufs, d_pkts, NULL);
@@ -101,20 +103,20 @@ pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params)
 }
 
 static uint16_t
-pdump_rx(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
+pdump_rx(uint16_t port, uint16_t qidx __rte_unused,
 	struct rte_mbuf **pkts, uint16_t nb_pkts,
 	uint16_t max_pkts __rte_unused,
 	void *user_params)
 {
-	pdump_copy(pkts, nb_pkts, user_params);
+	pdump_copy(port, pkts, nb_pkts, user_params);
 	return nb_pkts;
 }
 
 static uint16_t
-pdump_tx(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
+pdump_tx(uint16_t port, uint16_t qidx __rte_unused,
 		struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params)
 {
-	pdump_copy(pkts, nb_pkts, user_params);
+	pdump_copy(port, pkts, nb_pkts, user_params);
 	return nb_pkts;
 }
 
-- 
2.20.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [RFC 4/8] pdump: stamp packets with current timestamp
  2019-10-07 16:52 [dpdk-dev] [RFC 0/8] Packet Capture enhancements Stephen Hemminger
                   ` (2 preceding siblings ...)
  2019-10-07 16:52 ` [dpdk-dev] [RFC 3/8] pdump: tag copied mbuf with port Stephen Hemminger
@ 2019-10-07 16:52 ` Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering Stephen Hemminger
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 16:52 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

Put the current cycle count in as the timestamp when
packets are placed in the ring for packet capture.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/librte_pdump/rte_pdump.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/librte_pdump/rte_pdump.c b/lib/librte_pdump/rte_pdump.c
index 830decef91e2..41f2ec17a26b 100644
--- a/lib/librte_pdump/rte_pdump.c
+++ b/lib/librte_pdump/rte_pdump.c
@@ -75,6 +75,7 @@ pdump_copy(uint16_t port, struct rte_mbuf **pkts,
 	unsigned i;
 	int ring_enq;
 	uint16_t d_pkts = 0;
+	uint64_t now = rte_get_tsc_cycles();
 	struct rte_mbuf *dup_bufs[nb_pkts];
 	struct pdump_rxtx_cbs *cbs;
 	struct rte_ring *ring;
@@ -88,6 +89,7 @@ pdump_copy(uint16_t port, struct rte_mbuf **pkts,
 		p = rte_pktmbuf_copy(pkts[i], mp, 0, UINT32_MAX);
 		if (p) {
 			p->port = port;
+			p->timestamp = now;
 			dup_bufs[d_pkts++] = p;
 		}
 	}
-- 
2.20.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-07 16:52 [dpdk-dev] [RFC 0/8] Packet Capture enhancements Stephen Hemminger
                   ` (3 preceding siblings ...)
  2019-10-07 16:52 ` [dpdk-dev] [RFC 4/8] pdump: stamp packets with current timestamp Stephen Hemminger
@ 2019-10-07 16:52 ` Stephen Hemminger
  2019-10-07 17:07   ` Jerin Jacob
  2019-10-07 16:52 ` [dpdk-dev] [RFC 6/8] pdump: add packet header truncation Stephen Hemminger
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 16:52 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

Simple classic BPF interpreter based off of libpcap.

This is a copy of the BPF interpreter from libpcap which is
modified to handle mbuf metadata. The existing pcap_offline_filter
does not expose a way to match VLAN tags. Copying the BPF interpreter
also means that rte_pdump still does not have a hard dependency
on libpcap.

The API for pdump is versioned because the filter needs to
know both the byte code and the length of the program to validate it.

This patch does cause a small checkpatch warning because
it keeps the original variable names from the pcap code.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/pdump/main.c                       |  16 +-
 app/test/test_pdump.c                  |   4 +-
 lib/librte_pdump/Makefile              |   2 +-
 lib/librte_pdump/pdump_bpf.h           | 168 +++++++++
 lib/librte_pdump/rte_pcap_filter.c     | 462 +++++++++++++++++++++++++
 lib/librte_pdump/rte_pdump.c           | 145 ++++++--
 lib/librte_pdump/rte_pdump.h           |  54 ++-
 lib/librte_pdump/rte_pdump_version.map |   7 +
 8 files changed, 806 insertions(+), 52 deletions(-)
 create mode 100644 lib/librte_pdump/pdump_bpf.h
 create mode 100644 lib/librte_pdump/rte_pcap_filter.c

diff --git a/app/pdump/main.c b/app/pdump/main.c
index c1b901279f4b..c3eb554ef28b 100644
--- a/app/pdump/main.c
+++ b/app/pdump/main.c
@@ -828,20 +828,20 @@ enable_pdump(void)
 						pt->queue,
 						RTE_PDUMP_FLAG_RX,
 						pt->rx_ring,
-						pt->mp, NULL);
+						pt->mp, NULL, 0);
 				ret1 = rte_pdump_enable_by_deviceid(
 						pt->device_id,
 						pt->queue,
 						RTE_PDUMP_FLAG_TX,
 						pt->tx_ring,
-						pt->mp, NULL);
+						pt->mp, NULL, 0);
 			} else if (pt->dump_by_type == PORT_ID) {
 				ret = rte_pdump_enable(pt->port, pt->queue,
 						RTE_PDUMP_FLAG_RX,
-						pt->rx_ring, pt->mp, NULL);
+						pt->rx_ring, pt->mp, NULL, 0);
 				ret1 = rte_pdump_enable(pt->port, pt->queue,
 						RTE_PDUMP_FLAG_TX,
-						pt->tx_ring, pt->mp, NULL);
+						pt->tx_ring, pt->mp, NULL, 0);
 			}
 		} else if (pt->dir == RTE_PDUMP_FLAG_RX) {
 			if (pt->dump_by_type == DEVICE_ID)
@@ -849,22 +849,22 @@ enable_pdump(void)
 						pt->device_id,
 						pt->queue,
 						pt->dir, pt->rx_ring,
-						pt->mp, NULL);
+						pt->mp, NULL, 0);
 			else if (pt->dump_by_type == PORT_ID)
 				ret = rte_pdump_enable(pt->port, pt->queue,
 						pt->dir,
-						pt->rx_ring, pt->mp, NULL);
+						pt->rx_ring, pt->mp, NULL, 0);
 		} else if (pt->dir == RTE_PDUMP_FLAG_TX) {
 			if (pt->dump_by_type == DEVICE_ID)
 				ret = rte_pdump_enable_by_deviceid(
 						pt->device_id,
 						pt->queue,
 						pt->dir,
-						pt->tx_ring, pt->mp, NULL);
+						pt->tx_ring, pt->mp, NULL, 0);
 			else if (pt->dump_by_type == PORT_ID)
 				ret = rte_pdump_enable(pt->port, pt->queue,
 						pt->dir,
-						pt->tx_ring, pt->mp, NULL);
+						pt->tx_ring, pt->mp, NULL, 0);
 		}
 		if (ret < 0 || ret1 < 0) {
 			cleanup_pdump_resources();
diff --git a/app/test/test_pdump.c b/app/test/test_pdump.c
index af206968b38d..f0187a4cd279 100644
--- a/app/test/test_pdump.c
+++ b/app/test/test_pdump.c
@@ -79,7 +79,7 @@ run_pdump_client_tests(void)
 
 	for (itr = 0; itr < NUM_ITR; itr++) {
 		ret = rte_pdump_enable(portid, QUEUE_ID, flags, ring_client,
-				       mp, NULL);
+				       mp, NULL, 0);
 		if (ret < 0) {
 			printf("rte_pdump_enable failed\n");
 			return -1;
@@ -94,7 +94,7 @@ run_pdump_client_tests(void)
 		printf("pdump_disable success\n");
 
 		ret = rte_pdump_enable_by_deviceid(deviceid, QUEUE_ID, flags,
-						   ring_client, mp, NULL);
+						   ring_client, mp, NULL, 0);
 		if (ret < 0) {
 			printf("rte_pdump_enable_by_deviceid failed\n");
 			return -1;
diff --git a/lib/librte_pdump/Makefile b/lib/librte_pdump/Makefile
index 89593689a7d5..4a631c06a0ec 100644
--- a/lib/librte_pdump/Makefile
+++ b/lib/librte_pdump/Makefile
@@ -15,7 +15,7 @@ EXPORT_MAP := rte_pdump_version.map
 LIBABIVER := 3
 
 # all source are stored in SRCS-y
-SRCS-$(CONFIG_RTE_LIBRTE_PDUMP) := rte_pdump.c
+SRCS-$(CONFIG_RTE_LIBRTE_PDUMP) := rte_pdump.c rte_pcap_filter.c
 
 # install this header file
 SYMLINK-$(CONFIG_RTE_LIBRTE_PDUMP)-include := rte_pdump.h
diff --git a/lib/librte_pdump/pdump_bpf.h b/lib/librte_pdump/pdump_bpf.h
new file mode 100644
index 000000000000..8f6d00f3cee2
--- /dev/null
+++ b/lib/librte_pdump/pdump_bpf.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from the libpcap bpf_filter which
+ * in turn is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ */
+
+#ifndef _PDUMP_BPF_H_
+#define _PDUMP_BPF_H__
+
+/*
+ * This is based off of libpcap's cut-down version of bpf.h;
+ * it includes only the stuff needed for the BPF interpreter.
+ *
+ * Note: this is the original classic BPF generated by libpcap
+ *  not the new eBPF used elsewhere.
+ */
+
+typedef	int bpf_int32;
+typedef	unsigned int bpf_u_int32;
+
+/*
+ * Alignment macros.  BPF_WORDALIGN rounds up to the next
+ * even multiple of BPF_ALIGNMENT.
+ */
+#define BPF_ALIGNMENT sizeof(bpf_int32)
+#define BPF_WORDALIGN(x) (((x)+(BPF_ALIGNMENT-1))&~(BPF_ALIGNMENT-1))
+
+/*
+ * Number of scratch memory words (for BPF_LD|BPF_MEM and BPF_ST).
+ */
+#define BPF_MEMWORDS 16
+
+/*
+ * The instruction encodings.
+ *
+ * Please inform tcpdump-workers@lists.tcpdump.org if you use any
+ * of the reserved values, so that we can note that they're used
+ * (and perhaps implement it in the reference BPF implementation
+ * and encourage its implementation elsewhere).
+ */
+
+/*
+ * The upper 8 bits of the opcode aren't used. BSD/OS used 0x8000.
+ */
+
+/* instruction classes */
+#define BPF_CLASS(code) ((code) & 0x07)
+#define		BPF_LD		0x00
+#define		BPF_LDX		0x01
+#define		BPF_ST		0x02
+#define		BPF_STX		0x03
+#define		BPF_ALU		0x04
+#define		BPF_JMP		0x05
+#define		BPF_RET		0x06
+#define		BPF_MISC	0x07
+
+/* ld/ldx fields */
+#define BPF_SIZE(code)	((code) & 0x18)
+#define		BPF_W		0x00
+#define		BPF_H		0x08
+#define		BPF_B		0x10
+/*				0x18	reserved; used by BSD/OS */
+#define BPF_MODE(code)	((code) & 0xe0)
+#define		BPF_IMM		0x00
+#define		BPF_ABS		0x20
+#define		BPF_IND		0x40
+#define		BPF_MEM		0x60
+#define		BPF_LEN		0x80
+#define		BPF_MSH		0xa0
+/*				0xc0	reserved; used by BSD/OS */
+/*				0xe0	reserved; used by BSD/OS */
+
+/* alu/jmp fields */
+#define BPF_OP(code)	((code) & 0xf0)
+#define		BPF_ADD		0x00
+#define		BPF_SUB		0x10
+#define		BPF_MUL		0x20
+#define		BPF_DIV		0x30
+#define		BPF_OR		0x40
+#define		BPF_AND		0x50
+#define		BPF_LSH		0x60
+#define		BPF_RSH		0x70
+#define		BPF_NEG		0x80
+#define		BPF_MOD		0x90
+#define		BPF_XOR		0xa0
+/*				0xb0	reserved */
+/*				0xc0	reserved */
+/*				0xd0	reserved */
+/*				0xe0	reserved */
+/*				0xf0	reserved */
+
+#define		BPF_JA		0x00
+#define		BPF_JEQ		0x10
+#define		BPF_JGT		0x20
+#define		BPF_JGE		0x30
+#define		BPF_JSET	0x40
+/*				0x50	reserved; used on BSD/OS */
+/*				0x60	reserved */
+/*				0x70	reserved */
+/*				0x80	reserved */
+/*				0x90	reserved */
+/*				0xa0	reserved */
+/*				0xb0	reserved */
+/*				0xc0	reserved */
+/*				0xd0	reserved */
+/*				0xe0	reserved */
+/*				0xf0	reserved */
+#define BPF_SRC(code)	((code) & 0x08)
+#define		BPF_K		0x00
+#define		BPF_X		0x08
+
+/* ret - BPF_K and BPF_X also apply */
+#define BPF_RVAL(code)	((code) & 0x18)
+#define		BPF_A		0x10
+/*				0x18	reserved */
+
+/* misc */
+#define BPF_MISCOP(code) ((code) & 0xf8)
+#define		BPF_TAX		0x00
+/*				0x08	reserved */
+/*				0x10	reserved */
+/*				0x18	reserved */
+/* #define	BPF_COP		0x20	NetBSD "coprocessor" extensions */
+/*				0x28	reserved */
+/*				0x30	reserved */
+/*				0x38	reserved */
+/* #define	BPF_COPX	0x40	NetBSD "coprocessor" extensions */
+/*					also used on BSD/OS */
+/*				0x48	reserved */
+/*				0x50	reserved */
+/*				0x58	reserved */
+/*				0x60	reserved */
+/*				0x68	reserved */
+/*				0x70	reserved */
+/*				0x78	reserved */
+#define		BPF_TXA		0x80
+/*				0x88	reserved */
+/*				0x90	reserved */
+/*				0x98	reserved */
+/*				0xa0	reserved */
+/*				0xa8	reserved */
+/*				0xb0	reserved */
+/*				0xb8	reserved */
+/*				0xc0	reserved; used on BSD/OS */
+/*				0xc8	reserved */
+/*				0xd0	reserved */
+/*				0xd8	reserved */
+/*				0xe0	reserved */
+/*				0xe8	reserved */
+/*				0xf0	reserved */
+/*				0xf8	reserved */
+
+/*
+ * The instruction data structure.
+ */
+struct bpf_insn {
+	u_short	code;
+	u_char	jt;
+	u_char	jf;
+	bpf_u_int32 k;
+};
+
+#endif /* _PDUMP_BPF_H_ */
diff --git a/lib/librte_pdump/rte_pcap_filter.c b/lib/librte_pdump/rte_pcap_filter.c
new file mode 100644
index 000000000000..1d8caeee6628
--- /dev/null
+++ b/lib/librte_pdump/rte_pcap_filter.c
@@ -0,0 +1,462 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from the libpcap bpf_filter which
+ * in turn is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_pdump.h>
+
+#include "pdump_bpf.h"
+
+/* These magic values are used to do negative offset to find vlan */
+#define SKF_AD_OFF    (-0x1000)
+#define SKF_AD_VLAN_TAG	44
+#define SKF_AD_VLAN_TAG_PRESENT 48
+
+#define EXTRACT32(p) rte_be_to_cpu_32(*(const unaligned_uint32_t *)(p))
+#define EXTRACT16(p) rte_be_to_cpu_16(*(const unaligned_uint16_t *)(p))
+
+static inline u_short vlan_present(const struct rte_mbuf *m)
+{
+	return !!(m->ol_flags & (PKT_RX_VLAN_STRIPPED | PKT_TX_VLAN));
+}
+
+/*
+ * Run the classic BPF program 'filter' on the first segment of mbuf 'm'.
+ * buflen is that segment's data length, wirelen the full packet length;
+ * a load reaching past buflen ends the program with 0.
+ * Loads from SKF_AD_OFF + SKF_AD_VLAN_TAG / SKF_AD_VLAN_TAG_PRESENT read
+ * the mbuf's vlan_tci / VLAN offload flags instead of packet data.
+ * Returns the BPF_RET value: 0 drops the packet, non-zero accepts it.
+ * A NULL filter accepts everything (-1).  The program must already have
+ * passed rte_pcap_validate_filter(); an unknown opcode panics.
+ *
+ * Thanks to Ani Sinha <ani@arista.com> for providing initial implementation
+ */
+int
+rte_pcap_filter(const void *filter, const struct rte_mbuf *m)
+{
+	const struct bpf_insn *pc = filter;
+	uint32_t buflen = rte_pktmbuf_data_len(m);
+	uint32_t wirelen = rte_pktmbuf_pkt_len(m);
+	const uint8_t *p = rte_pktmbuf_mtod(m, const uint8_t *);
+	uint32_t A, X;
+	bpf_u_int32 k;
+	uint32_t mem[BPF_MEMWORDS] = { 0 };	/* never read stale stack */
+
+	/* No filter means accept all. */
+	if (pc == NULL)
+		return -1;
+
+	A = 0;
+	X = 0;
+	--pc;
+	for (;;) {
+		++pc;
+
+		switch (pc->code) {
+		default:
+			/* this must be caught by validation */
+			rte_panic("invalid BPF opcode\n");
+			return 0;
+
+		case BPF_RET|BPF_K:
+			return pc->k;
+
+		case BPF_RET|BPF_A:
+			return A;
+
+		case BPF_LD|BPF_W|BPF_ABS:
+			k = pc->k;
+			if (k > buflen || sizeof(int32_t) > buflen - k)
+				return 0;
+
+			A = EXTRACT32(&p[k]);
+			continue;
+
+		case BPF_LD|BPF_H|BPF_ABS:
+			k = pc->k;
+			if (k > buflen || sizeof(int16_t) > buflen - k)
+				return 0;
+
+			A = EXTRACT16(&p[k]);
+			continue;
+
+		case BPF_LD|BPF_B|BPF_ABS:
+			switch (pc->k) {
+			case SKF_AD_OFF + SKF_AD_VLAN_TAG:
+				A = m->vlan_tci;
+				break;
+			case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
+				A = vlan_present(m);
+				break;
+			default:
+				k = pc->k;
+				if (k >= buflen)
+					return 0;
+
+				A = p[k];
+				break;
+			}
+			continue;
+
+		case BPF_LD|BPF_W|BPF_LEN:
+			A = wirelen;
+			continue;
+
+		case BPF_LDX|BPF_W|BPF_LEN:
+			X = wirelen;
+			continue;
+
+		case BPF_LD|BPF_W|BPF_IND:
+			k = X + pc->k;
+			if (pc->k > buflen || X > buflen - pc->k ||
+			    sizeof(int32_t) > buflen - k) {
+				return 0;
+			}
+			A = EXTRACT32(&p[k]);
+			continue;
+
+		case BPF_LD|BPF_H|BPF_IND:
+			k = X + pc->k;
+			if (X > buflen ||
+			    pc->k > buflen - X ||
+			    sizeof(int16_t) > buflen - k)
+				return 0;
+
+			A = EXTRACT16(&p[k]);
+			continue;
+
+		case BPF_LD|BPF_B|BPF_IND:
+			k = X + pc->k;
+			if (pc->k >= buflen || X >= buflen - pc->k)
+				return 0;
+
+			A = p[k];
+			continue;
+
+		case BPF_LDX|BPF_MSH|BPF_B:
+			k = pc->k;
+			if (k >= buflen)
+				return 0;
+
+			X = (p[pc->k] & 0xf) << 2;
+			continue;
+
+		case BPF_LD|BPF_IMM:
+			A = pc->k;
+			continue;
+
+		case BPF_LDX|BPF_IMM:
+			X = pc->k;
+			continue;
+
+		case BPF_LD|BPF_MEM:
+			A = mem[pc->k];
+			continue;
+
+		case BPF_LDX|BPF_MEM:
+			X = mem[pc->k];
+			continue;
+
+		case BPF_ST:
+			mem[pc->k] = A;
+			continue;
+
+		case BPF_STX:
+			mem[pc->k] = X;
+			continue;
+
+		case BPF_JMP|BPF_JA:
+			/*
+			 * XXX - we currently implement "ip6 protochain"
+			 * with backward jumps, so sign-extend pc->k.
+			 */
+			pc += (bpf_int32)pc->k;
+			continue;
+
+		case BPF_JMP|BPF_JGT|BPF_K:
+			pc += (pc->k < A) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JGE|BPF_K:
+			pc += (pc->k <= A) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JEQ|BPF_K:
+			pc += (pc->k == A) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JSET|BPF_K:
+			pc += (A & pc->k) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JGT|BPF_X:
+			pc += (A > X) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JGE|BPF_X:
+			pc += (A >= X) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JEQ|BPF_X:
+			pc += (A == X) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JSET|BPF_X:
+			pc += (A & X) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_ALU|BPF_ADD|BPF_X:
+			A += X;
+			continue;
+
+		case BPF_ALU|BPF_SUB|BPF_X:
+			A -= X;
+			continue;
+
+		case BPF_ALU|BPF_MUL|BPF_X:
+			A *= X;
+			continue;
+
+		case BPF_ALU|BPF_DIV|BPF_X:
+			if (X == 0)
+				return 0;
+			A /= X;
+			continue;
+
+		case BPF_ALU|BPF_MOD|BPF_X:
+			if (X == 0)
+				return 0;
+			A %= X;
+			continue;
+
+		case BPF_ALU|BPF_AND|BPF_X:
+			A &= X;
+			continue;
+
+		case BPF_ALU|BPF_OR|BPF_X:
+			A |= X;
+			continue;
+
+		case BPF_ALU|BPF_XOR|BPF_X:
+			A ^= X;
+			continue;
+
+		case BPF_ALU|BPF_LSH|BPF_X:
+			if (X < 32)
+				A <<= X;
+			else
+				A = 0;
+			continue;
+
+		case BPF_ALU|BPF_RSH|BPF_X:
+			if (X < 32)
+				A >>= X;
+			else
+				A = 0;
+			continue;
+
+		case BPF_ALU|BPF_ADD|BPF_K:
+			A += pc->k;
+			continue;
+
+		case BPF_ALU|BPF_SUB|BPF_K:
+			A -= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_MUL|BPF_K:
+			A *= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_DIV|BPF_K:
+			A /= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_MOD|BPF_K:
+			A %= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_AND|BPF_K:
+			A &= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_OR|BPF_K:
+			A |= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_XOR|BPF_K:
+			A ^= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_LSH|BPF_K:
+			A = (pc->k < 32) ? A << pc->k : 0; /* no UB shift */
+			continue;
+
+		case BPF_ALU|BPF_RSH|BPF_K:
+			A = (pc->k < 32) ? A >> pc->k : 0; /* no UB shift */
+			continue;
+
+		case BPF_ALU|BPF_NEG:
+			/*
+			 * Most BPF arithmetic is unsigned, but negation
+			 * can't be unsigned; respecify it as subtracting
+			 * the accumulator from 0U, so that 1) we don't
+			 * get compiler warnings about negating an unsigned
+			 * value and 2) don't get UBSan warnings about
+			 * the result of negating 0x80000000 being undefined.
+			 */
+			A = (0U - A);
+			continue;
+
+		case BPF_MISC|BPF_TAX:
+			X = A;
+			continue;
+
+		case BPF_MISC|BPF_TXA:
+			A = X;
+			continue;
+		}
+	}
+}
+
+/*
+ * Return true if 'filter' (len instructions) is a valid filter program.
+ * The constraints are that each jump lands on an instruction inside
+ * the program, that scratch memory accesses are within valid ranges (to
+ * the extent that this can be checked statically; loads of packet
+ * data have to be, and are, also checked at run time), and that
+ * the last instruction is a return.
+ */
+int
+rte_pcap_validate_filter(const void *filter, uint32_t len)
+{
+	const struct bpf_insn *f = filter;
+	unsigned int i, from;
+
+	if (len < 1)
+		return 0;
+
+	for (i = 0; i < len; ++i) {
+		const struct bpf_insn *p = &f[i];
+
+		switch (BPF_CLASS(p->code)) {
+		/*
+		 * Check that memory operations use valid addresses.
+		 */
+		case BPF_LD:
+		case BPF_LDX:
+			switch (BPF_MODE(p->code)) {
+			case BPF_IMM:
+				break;
+			case BPF_ABS:
+			case BPF_IND:
+			case BPF_MSH:
+				/*
+				 * There's no maximum packet data size
+				 * in userland.  The runtime packet length
+				 * check suffices.
+				 */
+				break;
+			case BPF_MEM:
+				if (p->k >= BPF_MEMWORDS)
+					return 0;
+				break;
+			case BPF_LEN:
+				break;
+			default:
+				return 0;
+			}
+			break;
+		case BPF_ST:
+		case BPF_STX:
+			if (p->k >= BPF_MEMWORDS)
+				return 0;
+			break;
+		case BPF_ALU:
+			switch (BPF_OP(p->code)) {
+			case BPF_ADD:
+			case BPF_SUB:
+			case BPF_MUL:
+			case BPF_OR:
+			case BPF_AND:
+			case BPF_XOR:
+			case BPF_LSH:
+			case BPF_RSH:	/* NOTE(review): k not range-checked */
+			case BPF_NEG:
+				break;
+			case BPF_DIV:
+			case BPF_MOD:
+				/*
+				 * Check for constant division or modulus
+				 * by 0.
+				 */
+				if (BPF_SRC(p->code) == BPF_K && p->k == 0)
+					return 0;
+				break;
+			default:
+				return 0;
+			}
+			break;
+		case BPF_JMP:
+			/*
+			 * Check that jumps are within the code block,
+			 * and that unconditional branches don't go
+			 * backwards as a result of an overflow.
+			 * Unconditional branches have a 32-bit offset,
+			 * so they could overflow; we check to make
+			 * sure they don't.  Conditional branches have
+			 * an 8-bit offset, and the from address is <=
+			 * BPF_MAXINSNS, and we assume that BPF_MAXINSNS
+			 * is sufficiently small that adding 255 to it
+			 * won't overflow.
+			 *
+			 * We know that len is <= BPF_MAXINSNS, and we
+			 * assume that BPF_MAXINSNS is < the maximum size
+			 * of a unsigned int, so that i + 1 doesn't overflow.
+			 *
+			 * For userland, we don't know that the from
+			 * or len are <= BPF_MAXINSNS, but we know that
+			 * from <= len, and, except on a 64-bit system,
+			 * it's unlikely that len, if it truly reflects
+			 * the size of the program we've been handed,
+			 * will be anywhere near the maximum size of
+			 * a unsigned int.  We also don't check for backward
+			 * branches, as we currently support them in
+			 * userland for the protochain operation.
+			 */
+			from = i + 1;	/* offsets count from the next insn */
+			switch (BPF_OP(p->code)) {
+			case BPF_JA:
+				if (from + p->k >= (unsigned int)len)
+					return 0;
+				break;
+			case BPF_JEQ:
+			case BPF_JGT:
+			case BPF_JGE:
+			case BPF_JSET:
+				if (from + p->jt >= (unsigned int)len ||
+				    from + p->jf >= (unsigned int)len)
+					return 0;
+				break;
+			default:
+				return 0;
+			}
+			break;
+		case BPF_RET:
+			break;
+		case BPF_MISC:
+			break;
+		default:
+			return 0;
+		}
+	}
+
+	return BPF_CLASS(f[len - 1].code) == BPF_RET;
+}
diff --git a/lib/librte_pdump/rte_pdump.c b/lib/librte_pdump/rte_pdump.c
index 41f2ec17a26b..1206671c6f60 100644
--- a/lib/librte_pdump/rte_pdump.c
+++ b/lib/librte_pdump/rte_pdump.c
@@ -8,11 +8,13 @@
 #include <rte_lcore.h>
 #include <rte_log.h>
 #include <rte_errno.h>
+#include <rte_malloc.h>
 #include <rte_string_fns.h>
 
 #include "rte_pdump.h"
 
 #define DEVICE_ID_SIZE 64
+#define BPF_INS_SIZE sizeof(uint64_t)
 
 /* Macro for printing using RTE_LOG */
 static int pdump_logtype;
@@ -32,6 +34,8 @@ enum pdump_version {
 	V1 = 1
 };
 
+#define PDUMP_FILTER_V1	0x7064756d7066696c
+
 struct pdump_request {
 	uint16_t ver;
 	uint16_t op;
@@ -42,14 +46,14 @@ struct pdump_request {
 			uint16_t queue;
 			struct rte_ring *ring;
 			struct rte_mempool *mp;
-			void *filter;
+			const void *filter;
 		} en_v1;
 		struct disable_v1 {
 			char device[DEVICE_ID_SIZE];
 			uint16_t queue;
 			struct rte_ring *ring;
 			struct rte_mempool *mp;
-			void *filter;
+			const void *filter;
 		} dis_v1;
 	} data;
 };
@@ -64,7 +68,7 @@ static struct pdump_rxtx_cbs {
 	struct rte_ring *ring;
 	struct rte_mempool *mp;
 	const struct rte_eth_rxtx_callback *cb;
-	void *filter;
+	const void *filter;
 } rx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT],
 tx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];
 
@@ -86,6 +90,9 @@ pdump_copy(uint16_t port, struct rte_mbuf **pkts,
 	ring = cbs->ring;
 	mp = cbs->mp;
 	for (i = 0; i < nb_pkts; i++) {
+		if (rte_pcap_filter(cbs->filter, pkts[i]) == 0)
+			continue;
+
 		p = rte_pktmbuf_copy(pkts[i], mp, 0, UINT32_MAX);
 		if (p) {
 			p->port = port;
@@ -124,8 +131,8 @@ pdump_tx(uint16_t port, uint16_t qidx __rte_unused,
 
 static int
 pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
-				struct rte_ring *ring, struct rte_mempool *mp,
-				uint16_t operation)
+			    struct rte_ring *ring, struct rte_mempool *mp,
+			    uint16_t operation, const void *filter)
 {
 	uint16_t qid;
 	struct pdump_rxtx_cbs *cbs = NULL;
@@ -143,6 +150,7 @@ pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			}
 			cbs->ring = ring;
 			cbs->mp = mp;
+			cbs->filter = filter;
 			cbs->cb = rte_eth_add_first_rx_callback(port, qid,
 								pdump_rx, cbs);
 			if (cbs->cb == NULL) {
@@ -178,8 +186,8 @@ pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 
 static int
 pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
-				struct rte_ring *ring, struct rte_mempool *mp,
-				uint16_t operation)
+			    struct rte_ring *ring, struct rte_mempool *mp,
+			    uint16_t operation, const void *filter)
 {
 
 	uint16_t qid;
@@ -198,6 +206,7 @@ pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			}
 			cbs->ring = ring;
 			cbs->mp = mp;
+			cbs->filter = filter;
 			cbs->cb = rte_eth_add_tx_callback(port, qid, pdump_tx,
 								cbs);
 			if (cbs->cb == NULL) {
@@ -241,6 +250,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p)
 	uint16_t operation;
 	struct rte_ring *ring;
 	struct rte_mempool *mp;
+	const void *filter = NULL;
 
 	flags = p->flags;
 	operation = p->op;
@@ -256,6 +266,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p)
 		queue = p->data.en_v1.queue;
 		ring = p->data.en_v1.ring;
 		mp = p->data.en_v1.mp;
+		filter = p->data.en_v1.filter;
 	} else {
 		ret = rte_eth_dev_get_port_by_name(p->data.dis_v1.device,
 				&port);
@@ -299,7 +310,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p)
 	if (flags & RTE_PDUMP_FLAG_RX) {
 		end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_rx_q : queue + 1;
 		ret = pdump_register_rx_callbacks(end_q, port, queue, ring, mp,
-							operation);
+						  operation, filter);
 		if (ret < 0)
 			return ret;
 	}
@@ -308,7 +319,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p)
 	if (flags & RTE_PDUMP_FLAG_TX) {
 		end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_tx_q : queue + 1;
 		ret = pdump_register_tx_callbacks(end_q, port, queue, ring, mp,
-							operation);
+						  operation, filter);
 		if (ret < 0)
 			return ret;
 	}
@@ -424,12 +435,41 @@ pdump_validate_port(uint16_t port, char *name)
 }
 
 static int
-pdump_prepare_client_request(char *device, uint16_t queue,
-				uint32_t flags,
-				uint16_t operation,
-				struct rte_ring *ring,
-				struct rte_mempool *mp,
-				void *filter)
+pdump_validate_filter(const void *filter, unsigned int len)
+{
+	size_t alloc_len;
+	/* checked before the request is sent; no filter needs no checks */
+	if (filter == NULL)
+		return 0;
+
+	/* must be in malloc memory to be accessible in primary */
+	if (rte_malloc_validate(filter, &alloc_len) != 0) {
+		PDUMP_LOG(ERR, "filter is not in rte_malloc memory\n");
+		rte_errno = EINVAL;
+		return -1;
+	}
+	/* divide rather than multiply: len * size could wrap a 32-bit size_t */
+	if (len > alloc_len / BPF_INS_SIZE) {
+		PDUMP_LOG(ERR, "filter length error\n");
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	if (!rte_pcap_validate_filter(filter, len)) {
+		PDUMP_LOG(ERR, "filter is not valid BPF code\n");
+		rte_errno = EINVAL;
+		return -1;
+	}
+	return 0;
+}
+
+static int
+pdump_prepare_client_request(const char *device, uint16_t queue,
+			     uint32_t flags,
+			     uint16_t operation,
+			     struct rte_ring *ring,
+			     struct rte_mempool *mp,
+			     const void *filter)
 {
 	int ret = -1;
 	struct rte_mp_msg mp_req, *mp_rep;
@@ -476,14 +516,13 @@ pdump_prepare_client_request(char *device, uint16_t queue,
 }
 
 int
-rte_pdump_enable(uint16_t port, uint16_t queue, uint32_t flags,
-			struct rte_ring *ring,
-			struct rte_mempool *mp,
-			void *filter)
+rte_pdump_enable_v1911(uint16_t port, uint16_t queue, uint32_t flags,
+		       struct rte_ring *ring, struct rte_mempool *mp,
+		       const void *filter, uint32_t filter_len)
 {
 
-	int ret = 0;
 	char name[DEVICE_ID_SIZE];
+	int ret;
 
 	ret = pdump_validate_port(port, name);
 	if (ret < 0)
@@ -492,36 +531,86 @@ rte_pdump_enable(uint16_t port, uint16_t queue, uint32_t flags,
 	if (ret < 0)
 		return ret;
 	ret = pdump_validate_flags(flags);
+	if (ret < 0)
+		return ret;
+	ret = pdump_validate_filter(filter, filter_len);
 	if (ret < 0)
 		return ret;
 
 	ret = pdump_prepare_client_request(name, queue, flags,
-						ENABLE, ring, mp, filter);
+					   ENABLE, ring, mp, filter);
 
 	return ret;
 }
+BIND_DEFAULT_SYMBOL(rte_pdump_enable, _v1911, 19.11);
+MAP_STATIC_SYMBOL(int rte_pdump_enable(uint16_t port, uint16_t queue,
+				       uint32_t flags, struct rte_ring *ring,
+				       struct rte_mempool *mp,
+				       const void *filter, uint32_t len),
+		  rte_pdump_enable_v1911);
 
 int
-rte_pdump_enable_by_deviceid(char *device_id, uint16_t queue,
-				uint32_t flags,
-				struct rte_ring *ring,
-				struct rte_mempool *mp,
-				void *filter)
+rte_pdump_enable_v1607(uint16_t port, uint16_t queue, uint32_t flags,
+		       struct rte_ring *ring,
+		       struct rte_mempool *mp,
+		       void *filter)
 {
-	int ret = 0;
+	if (filter != NULL)
+		PDUMP_LOG(WARNING, "filter not supported in this version\n");
+
+	return rte_pdump_enable_v1911(port, queue, flags, ring, mp,
+				      NULL, 0);
+}
+VERSION_SYMBOL(rte_pdump_enable, _v1607, 16.07);
+
+int
+rte_pdump_enable_by_deviceid_v1911(const char *device_id, uint16_t queue,
+				   uint32_t flags,
+				   struct rte_ring *ring,
+				   struct rte_mempool *mp,
+				   const void *filter, uint32_t filter_len)
+{
+	int ret;
 
 	ret = pdump_validate_ring_mp(ring, mp);
 	if (ret < 0)
 		return ret;
 	ret = pdump_validate_flags(flags);
+	if (ret < 0)
+		return ret;
+	ret = pdump_validate_filter(filter, filter_len);
 	if (ret < 0)
 		return ret;
 
 	ret = pdump_prepare_client_request(device_id, queue, flags,
-						ENABLE, ring, mp, filter);
+					   ENABLE, ring, mp, filter);
 
 	return ret;
 }
+BIND_DEFAULT_SYMBOL(rte_pdump_enable_by_deviceid, _v1911, 19.11);
+MAP_STATIC_SYMBOL(int rte_pdump_enable_by_deviceid(const char *device_id,
+						   uint16_t queue,
+						   uint32_t flags,
+						   struct rte_ring *ring,
+						   struct rte_mempool *mp,
+						   const void *filter,
+						   uint32_t len),
+		  rte_pdump_enable_by_deviceid_v1911);
+
+int
+rte_pdump_enable_by_deviceid_v1607(char *device_id, uint16_t queue,
+				   uint32_t flags,
+				   struct rte_ring *ring,
+				   struct rte_mempool *mp,
+				   void *filter)
+{
+	if (filter != NULL)
+		PDUMP_LOG(WARNING, "filter not supported in this version\n");
+
+	return rte_pdump_enable_by_deviceid_v1911(device_id, queue, flags,
+						  ring, mp, NULL, 0);
+}
+VERSION_SYMBOL(rte_pdump_enable_by_deviceid, _v1607, 16.07);
 
 int
 rte_pdump_disable(uint16_t port, uint16_t queue, uint32_t flags)
diff --git a/lib/librte_pdump/rte_pdump.h b/lib/librte_pdump/rte_pdump.h
index 6b00fc17aeb2..12cb46f8b0e9 100644
--- a/lib/librte_pdump/rte_pdump.h
+++ b/lib/librte_pdump/rte_pdump.h
@@ -68,17 +68,25 @@ rte_pdump_uninit(void);
  * @param mp
  *  mempool on to which original packets will be mirrored or duplicated.
  * @param filter
- *  place holder for packet filtering.
+ *  filter to apply to incoming packet (classic BPF)
+ * @param len
+ *  length of filter (in BPF instructions)
  *
  * @return
  *    0 on success, -1 on error, rte_errno is set accordingly.
  */
-
 int
 rte_pdump_enable(uint16_t port, uint16_t queue, uint32_t flags,
-		struct rte_ring *ring,
-		struct rte_mempool *mp,
-		void *filter);
+		 struct rte_ring *ring, struct rte_mempool *mp,
+		 const void *filter, uint32_t len);
+int
+rte_pdump_enable_v1607(uint16_t port, uint16_t queue, uint32_t flags,
+		       struct rte_ring *ring, struct rte_mempool *mp,
+		       void *filter);
+int
+rte_pdump_enable_v1911(uint16_t port, uint16_t queue, uint32_t flags,
+		       struct rte_ring *ring, struct rte_mempool *mp,
+		       const void *filter, uint32_t len);
 
 /**
  * Disables packet capturing on given port and queue.
@@ -118,18 +126,29 @@ rte_pdump_disable(uint16_t port, uint16_t queue, uint32_t flags);
  * @param mp
  *  mempool on to which original packets will be mirrored or duplicated.
  * @param filter
- *  place holder for packet filtering.
+ *  filter to apply to incoming packet (classic BPF)
+ * @param len
+ *  length of filter (in BPF instructions)
  *
  * @return
  *    0 on success, -1 on error, rte_errno is set accordingly.
  */
-
 int
-rte_pdump_enable_by_deviceid(char *device_id, uint16_t queue,
-				uint32_t flags,
-				struct rte_ring *ring,
-				struct rte_mempool *mp,
-				void *filter);
+rte_pdump_enable_by_deviceid(const char *device_id, uint16_t queue,
+			     uint32_t flags,
+			     struct rte_ring *ring,
+			     struct rte_mempool *mp,
+			     const void *filter, uint32_t len);
+int
+rte_pdump_enable_by_deviceid_v1607(char *device_id, uint16_t queue,
+				   uint32_t flags, struct rte_ring *ring,
+				   struct rte_mempool *mp,
+				   void *filter);
+int
+rte_pdump_enable_by_deviceid_v1911(const char *device_id, uint16_t queue,
+				   uint32_t flags, struct rte_ring *ring,
+				   struct rte_mempool *mp,
+				   const void *filter, uint32_t len);
 
 /**
  * Disables packet capturing on given device_id and queue.
@@ -151,7 +170,16 @@ rte_pdump_enable_by_deviceid(char *device_id, uint16_t queue,
  */
 int
 rte_pdump_disable_by_deviceid(char *device_id, uint16_t queue,
-				uint32_t flags);
+			      uint32_t flags);
+
+
+/* internal */
+int
+rte_pcap_filter(const void *filter, const struct rte_mbuf *m);
+
+/* internal */
+int
+rte_pcap_validate_filter(const void *filter, uint32_t len);
 
 #ifdef __cplusplus
 }
diff --git a/lib/librte_pdump/rte_pdump_version.map b/lib/librte_pdump/rte_pdump_version.map
index 3e744f30123c..e78ba5a8350a 100644
--- a/lib/librte_pdump/rte_pdump_version.map
+++ b/lib/librte_pdump/rte_pdump_version.map
@@ -10,3 +10,10 @@ DPDK_16.07 {
 
 	local: *;
 };
+
+DPDK_19.11 {
+	global:
+
+	rte_pdump_enable;
+	rte_pdump_enable_by_deviceid;
+} DPDK_16.07;
-- 
2.20.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [RFC 6/8] pdump: add packet header truncation
  2019-10-07 16:52 [dpdk-dev] [RFC 0/8] Packet Capture enhancements Stephen Hemminger
                   ` (4 preceding siblings ...)
  2019-10-07 16:52 ` [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering Stephen Hemminger
@ 2019-10-07 16:52 ` Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 7/8] pcapng: add new library for writing pcapng files Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 8/8] app/capture: add packet capture using pcapng Stephen Hemminger
  7 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 16:52 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

Add support for pcap style header truncation (called snap length).
This optimizes packet capture by not copying the whole packet.

Since API is already versioned (in previous patch),
it is ok to add one new argument for the length.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/pdump/main.c             | 16 ++++++-------
 app/test/test_pdump.c        |  6 +++--
 lib/librte_pdump/rte_pdump.c | 46 +++++++++++++++++++++++-------------
 lib/librte_pdump/rte_pdump.h | 29 ++++++++++++++---------
 4 files changed, 60 insertions(+), 37 deletions(-)

diff --git a/app/pdump/main.c b/app/pdump/main.c
index c3eb554ef28b..e55a94c0f10e 100644
--- a/app/pdump/main.c
+++ b/app/pdump/main.c
@@ -825,45 +825,45 @@ enable_pdump(void)
 			if (pt->dump_by_type == DEVICE_ID) {
 				ret = rte_pdump_enable_by_deviceid(
 						pt->device_id,
 						pt->queue,
-						RTE_PDUMP_FLAG_RX,
+						RTE_PDUMP_FLAG_RX, UINT16_MAX,
 						pt->rx_ring,
 						pt->mp, NULL, 0);
 				ret1 = rte_pdump_enable_by_deviceid(
 						pt->device_id,
 						pt->queue,
-						RTE_PDUMP_FLAG_TX,
+						RTE_PDUMP_FLAG_TX, UINT16_MAX,
 						pt->tx_ring,
 						pt->mp, NULL, 0);
 			} else if (pt->dump_by_type == PORT_ID) {
 				ret = rte_pdump_enable(pt->port, pt->queue,
-						RTE_PDUMP_FLAG_RX,
+						RTE_PDUMP_FLAG_RX, UINT16_MAX,
 						pt->rx_ring, pt->mp, NULL, 0);
 				ret1 = rte_pdump_enable(pt->port, pt->queue,
-						RTE_PDUMP_FLAG_TX,
+						RTE_PDUMP_FLAG_TX, UINT16_MAX,
 						pt->tx_ring, pt->mp, NULL, 0);
 			}
 		} else if (pt->dir == RTE_PDUMP_FLAG_RX) {
 			if (pt->dump_by_type == DEVICE_ID)
 				ret = rte_pdump_enable_by_deviceid(
 						pt->device_id,
 						pt->queue,
-						pt->dir, pt->rx_ring,
+						pt->dir, UINT16_MAX, pt->rx_ring,
 						pt->mp, NULL, 0);
 			else if (pt->dump_by_type == PORT_ID)
 				ret = rte_pdump_enable(pt->port, pt->queue,
-						pt->dir,
+						pt->dir, UINT16_MAX,
 						pt->rx_ring, pt->mp, NULL, 0);
 		} else if (pt->dir == RTE_PDUMP_FLAG_TX) {
 			if (pt->dump_by_type == DEVICE_ID)
 				ret = rte_pdump_enable_by_deviceid(
 						pt->device_id,
 						pt->queue,
-						pt->dir,
+						pt->dir, UINT16_MAX,
 						pt->tx_ring, pt->mp, NULL, 0);
 			else if (pt->dump_by_type == PORT_ID)
 				ret = rte_pdump_enable(pt->port, pt->queue,
-						pt->dir,
+						pt->dir, UINT16_MAX,
 						pt->tx_ring, pt->mp, NULL, 0);
 		}
 		if (ret < 0 || ret1 < 0) {
diff --git a/app/test/test_pdump.c b/app/test/test_pdump.c
index f0187a4cd279..c2468b968247 100644
--- a/app/test/test_pdump.c
+++ b/app/test/test_pdump.c
@@ -78,7 +78,8 @@ run_pdump_client_tests(void)
 	printf("\n***** flags = RTE_PDUMP_FLAG_TX *****\n");
 
 	for (itr = 0; itr < NUM_ITR; itr++) {
-		ret = rte_pdump_enable(portid, QUEUE_ID, flags, ring_client,
+		ret = rte_pdump_enable(portid, QUEUE_ID, flags,
+				       UINT16_MAX, ring_client,
 				       mp, NULL, 0);
 		if (ret < 0) {
 			printf("rte_pdump_enable failed\n");
@@ -93,7 +94,8 @@ run_pdump_client_tests(void)
 		}
 		printf("pdump_disable success\n");
 
-		ret = rte_pdump_enable_by_deviceid(deviceid, QUEUE_ID, flags,
+		ret = rte_pdump_enable_by_deviceid(deviceid, QUEUE_ID,
+						   flags, UINT16_MAX,
 						   ring_client, mp, NULL, 0);
 		if (ret < 0) {
 			printf("rte_pdump_enable_by_deviceid failed\n");
diff --git a/lib/librte_pdump/rte_pdump.c b/lib/librte_pdump/rte_pdump.c
index 1206671c6f60..c3636bcdd029 100644
--- a/lib/librte_pdump/rte_pdump.c
+++ b/lib/librte_pdump/rte_pdump.c
@@ -44,6 +44,7 @@ struct pdump_request {
 		struct enable_v1 {
 			char device[DEVICE_ID_SIZE];
 			uint16_t queue;
+			uint16_t snaplen;
 			struct rte_ring *ring;
 			struct rte_mempool *mp;
 			const void *filter;
@@ -51,6 +52,7 @@ struct pdump_request {
 		struct disable_v1 {
 			char device[DEVICE_ID_SIZE];
 			uint16_t queue;
+			uint16_t snaplen;
 			struct rte_ring *ring;
 			struct rte_mempool *mp;
 			const void *filter;
@@ -69,6 +71,7 @@ static struct pdump_rxtx_cbs {
 	struct rte_mempool *mp;
 	const struct rte_eth_rxtx_callback *cb;
 	const void *filter;
+	uint16_t snaplen;
 } rx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT],
 tx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];
 
@@ -93,7 +96,7 @@ pdump_copy(uint16_t port, struct rte_mbuf **pkts,
 		if (rte_pcap_filter(cbs->filter, pkts[i]) == 0)
 			continue;
 
-		p = rte_pktmbuf_copy(pkts[i], mp, 0, UINT32_MAX);
+		p = rte_pktmbuf_copy(pkts[i], mp, 0, cbs->snaplen);
 		if (p) {
 			p->port = port;
 			p->timestamp = now;
@@ -132,7 +135,8 @@ pdump_tx(uint16_t port, uint16_t qidx __rte_unused,
 static int
 pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			    struct rte_ring *ring, struct rte_mempool *mp,
-			    uint16_t operation, const void *filter)
+			    uint16_t operation, uint16_t snaplen,
+			    const void *filter)
 {
 	uint16_t qid;
 	struct pdump_rxtx_cbs *cbs = NULL;
@@ -151,6 +155,7 @@ pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			cbs->ring = ring;
 			cbs->mp = mp;
 			cbs->filter = filter;
+			cbs->snaplen = snaplen;
 			cbs->cb = rte_eth_add_first_rx_callback(port, qid,
 								pdump_rx, cbs);
 			if (cbs->cb == NULL) {
@@ -187,7 +192,8 @@ pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 static int
 pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			    struct rte_ring *ring, struct rte_mempool *mp,
-			    uint16_t operation, const void *filter)
+			    uint16_t operation, uint16_t snaplen,
+			    const void *filter)
 {
 
 	uint16_t qid;
@@ -207,6 +213,7 @@ pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue,
 			cbs->ring = ring;
 			cbs->mp = mp;
 			cbs->filter = filter;
+			cbs->snaplen = snaplen;
 			cbs->cb = rte_eth_add_tx_callback(port, qid, pdump_tx,
 								cbs);
 			if (cbs->cb == NULL) {
@@ -244,7 +251,7 @@ static int
 set_pdump_rxtx_cbs(const struct pdump_request *p)
 {
 	uint16_t nb_rx_q = 0, nb_tx_q = 0, end_q, queue;
-	uint16_t port;
+	uint16_t port, snaplen = 0;
 	int ret = 0;
 	uint32_t flags;
 	uint16_t operation;
@@ -266,6 +273,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p)
 		queue = p->data.en_v1.queue;
 		ring = p->data.en_v1.ring;
 		mp = p->data.en_v1.mp;
+		snaplen = p->data.en_v1.snaplen;
 		filter = p->data.en_v1.filter;
 	} else {
 		ret = rte_eth_dev_get_port_by_name(p->data.dis_v1.device,
@@ -310,7 +318,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p)
 	if (flags & RTE_PDUMP_FLAG_RX) {
 		end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_rx_q : queue + 1;
 		ret = pdump_register_rx_callbacks(end_q, port, queue, ring, mp,
-						  operation, filter);
+						  operation, snaplen, filter);
 		if (ret < 0)
 			return ret;
 	}
@@ -319,7 +327,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p)
 	if (flags & RTE_PDUMP_FLAG_TX) {
 		end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_tx_q : queue + 1;
 		ret = pdump_register_tx_callbacks(end_q, port, queue, ring, mp,
-						  operation, filter);
+						  operation, snaplen, filter);
 		if (ret < 0)
 			return ret;
 	}
@@ -465,7 +473,7 @@ pdump_validate_filter(const void *filter, unsigned int len)
 
 static int
 pdump_prepare_client_request(const char *device, uint16_t queue,
-			     uint32_t flags,
+			     uint32_t flags, uint16_t snaplen,
 			     uint16_t operation,
 			     struct rte_ring *ring,
 			     struct rte_mempool *mp,
@@ -485,6 +493,7 @@ pdump_prepare_client_request(const char *device, uint16_t queue,
 		strlcpy(req->data.en_v1.device, device,
 			sizeof(req->data.en_v1.device));
 		req->data.en_v1.queue = queue;
+		req->data.en_v1.snaplen = snaplen;
 		req->data.en_v1.ring = ring;
 		req->data.en_v1.mp = mp;
 		req->data.en_v1.filter = filter;
@@ -492,6 +501,7 @@ pdump_prepare_client_request(const char *device, uint16_t queue,
 		strlcpy(req->data.dis_v1.device, device,
 			sizeof(req->data.dis_v1.device));
 		req->data.dis_v1.queue = queue;
+		req->data.dis_v1.snaplen = snaplen;
 		req->data.dis_v1.ring = NULL;
 		req->data.dis_v1.mp = NULL;
 		req->data.dis_v1.filter = NULL;
@@ -517,6 +527,7 @@ pdump_prepare_client_request(const char *device, uint16_t queue,
 
 int
 rte_pdump_enable_v1911(uint16_t port, uint16_t queue, uint32_t flags,
+		       uint16_t snap_len,
 		       struct rte_ring *ring, struct rte_mempool *mp,
 		       const void *filter, uint32_t filter_len)
 {
@@ -537,14 +548,15 @@ rte_pdump_enable_v1911(uint16_t port, uint16_t queue, uint32_t flags,
 	if (ret < 0)
 		return ret;
 
-	ret = pdump_prepare_client_request(name, queue, flags,
+	ret = pdump_prepare_client_request(name, queue, flags, snap_len,
 					   ENABLE, ring, mp, filter);
 
 	return ret;
 }
 BIND_DEFAULT_SYMBOL(rte_pdump_enable, _v1911, 19.11);
 MAP_STATIC_SYMBOL(int rte_pdump_enable(uint16_t port, uint16_t queue,
-				       uint32_t flags, struct rte_ring *ring,
+				       uint32_t flags, uint16_t snap_len,
+				       struct rte_ring *ring,
 				       struct rte_mempool *mp,
 				       const void *filter, uint32_t len),
 		  rte_pdump_enable_v1911);
@@ -558,14 +570,14 @@ rte_pdump_enable_v1607(uint16_t port, uint16_t queue, uint32_t flags,
 	if (filter != NULL)
 		PDUMP_LOG(WARNING, "filter not supported in this version\n");
 
-	return rte_pdump_enable_v1911(port, queue, flags, ring, mp,
-				      NULL, 0);
+	return rte_pdump_enable_v1911(port, queue, flags, UINT16_MAX,
+				      ring, mp, NULL, 0);
 }
 VERSION_SYMBOL(rte_pdump_enable, _v1607, 16.07);
 
 int
 rte_pdump_enable_by_deviceid_v1911(const char *device_id, uint16_t queue,
-				   uint32_t flags,
+				   uint32_t flags, uint16_t snap_len,
 				   struct rte_ring *ring,
 				   struct rte_mempool *mp,
 				   const void *filter, uint32_t filter_len)
@@ -582,7 +594,7 @@ rte_pdump_enable_by_deviceid_v1911(const char *device_id, uint16_t queue,
 	if (ret < 0)
 		return ret;
 
-	ret = pdump_prepare_client_request(device_id, queue, flags,
+	ret = pdump_prepare_client_request(device_id, queue, flags, snap_len,
 					   ENABLE, ring, mp, filter);
 
 	return ret;
@@ -591,6 +603,7 @@ BIND_DEFAULT_SYMBOL(rte_pdump_enable_by_deviceid, _v1911, 19.11);
 MAP_STATIC_SYMBOL(int rte_pdump_enable_by_deviceid(const char *device_id,
 						   uint16_t queue,
 						   uint32_t flags,
+						   uint16_t snap_len,
 						   struct rte_ring *ring,
 						   struct rte_mempool *mp,
 						   const void *filter,
@@ -607,7 +620,8 @@ rte_pdump_enable_by_deviceid_v1607(char *device_id, uint16_t queue,
 	if (filter != NULL)
 		PDUMP_LOG(WARNING, "filter not supported in this version\n");
 
-	return rte_pdump_enable_by_deviceid_v1911(device_id, queue, flags,
+	return rte_pdump_enable_by_deviceid_v1911(device_id, queue,
+						  flags, UINT16_MAX,
 						  ring, mp, NULL, 0);
 }
 VERSION_SYMBOL(rte_pdump_enable_by_deviceid, _v1607, 16.07);
@@ -625,7 +639,7 @@ rte_pdump_disable(uint16_t port, uint16_t queue, uint32_t flags)
 	if (ret < 0)
 		return ret;
 
-	ret = pdump_prepare_client_request(name, queue, flags,
+	ret = pdump_prepare_client_request(name, queue, flags, 0,
 						DISABLE, NULL, NULL, NULL);
 
 	return ret;
@@ -641,7 +655,7 @@ rte_pdump_disable_by_deviceid(char *device_id, uint16_t queue,
 	if (ret < 0)
 		return ret;
 
-	ret = pdump_prepare_client_request(device_id, queue, flags,
+	ret = pdump_prepare_client_request(device_id, queue, flags, 0,
 						DISABLE, NULL, NULL, NULL);
 
 	return ret;
diff --git a/lib/librte_pdump/rte_pdump.h b/lib/librte_pdump/rte_pdump.h
index 12cb46f8b0e9..e04ef4c1933b 100644
--- a/lib/librte_pdump/rte_pdump.h
+++ b/lib/librte_pdump/rte_pdump.h
@@ -63,13 +63,15 @@ rte_pdump_uninit(void);
  * @param flags
  *  flags specifies RTE_PDUMP_FLAG_RX/RTE_PDUMP_FLAG_TX/RTE_PDUMP_FLAG_RXTX
  *  on which packet capturing should be enabled for a given port and queue.
+ * @param snap_len
+ *  only the first snap_len bytes of packet will be captured.
  * @param ring
  *  ring on which captured packets will be enqueued for user.
  * @param mp
- *  mempool on to which original packets will be mirrored or duplicated.
+ *  mempool on to which original packets will be duplicated.
  * @param filter
  *  filter to apply to incoming packet (classic BPF)
- * @param len
+ * @param filter_len
  *  length of filter (in BPF instructions)
  *
  * @return
@@ -77,15 +79,17 @@ rte_pdump_uninit(void);
  */
 int
 rte_pdump_enable(uint16_t port, uint16_t queue, uint32_t flags,
-		 struct rte_ring *ring, struct rte_mempool *mp,
-		 const void *filter, uint32_t len);
+		 uint16_t snap_len, struct rte_ring *ring,
+		 struct rte_mempool *mp,
+		 const void *filter, uint32_t filter_len);
 int
 rte_pdump_enable_v1607(uint16_t port, uint16_t queue, uint32_t flags,
 		       struct rte_ring *ring, struct rte_mempool *mp,
 		       void *filter);
 int
 rte_pdump_enable_v1911(uint16_t port, uint16_t queue, uint32_t flags,
-		       struct rte_ring *ring, struct rte_mempool *mp,
+		       uint16_t snap_len, struct rte_ring *ring,
+		       struct rte_mempool *mp,
 		       const void *filter, uint32_t len);
 
 /**
@@ -121,13 +125,15 @@ rte_pdump_disable(uint16_t port, uint16_t queue, uint32_t flags);
  * @param flags
  *  flags specifies RTE_PDUMP_FLAG_RX/RTE_PDUMP_FLAG_TX/RTE_PDUMP_FLAG_RXTX
  *  on which packet capturing should be enabled for a given port and queue.
+ * @param snap_len
+ *  only the first snap_len bytes of packet will be captured.
  * @param ring
  *  ring on which captured packets will be enqueued for user.
  * @param mp
- *  mempool on to which original packets will be mirrored or duplicated.
+ *  mempool on to which original packets will be duplicated.
  * @param filter
  *  filter to apply to incoming packet (classic BPF)
- * @param len
+ * @param filter_len
  *  length of filter (in BPF instructions)
  *
  * @return
@@ -135,10 +141,10 @@ rte_pdump_disable(uint16_t port, uint16_t queue, uint32_t flags);
  */
 int
 rte_pdump_enable_by_deviceid(const char *device_id, uint16_t queue,
-			     uint32_t flags,
+			     uint32_t flags, uint16_t snap_len,
 			     struct rte_ring *ring,
 			     struct rte_mempool *mp,
-			     const void *filter, uint32_t len);
+			     const void *filter, uint32_t filter_len);
 int
 rte_pdump_enable_by_deviceid_v1607(char *device_id, uint16_t queue,
 				   uint32_t flags, struct rte_ring *ring,
@@ -146,9 +152,10 @@ rte_pdump_enable_by_deviceid_v1607(char *device_id, uint16_t queue,
 				   void *filter);
 int
 rte_pdump_enable_by_deviceid_v1911(const char *device_id, uint16_t queue,
-				   uint32_t flags, struct rte_ring *ring,
+				   uint32_t flags, uint16_t snap_len,
+				   struct rte_ring *ring,
 				   struct rte_mempool *mp,
-				   const void *filter, uint32_t len);
+				   const void *filter, uint32_t filter_len);
 
 /**
  * Disables packet capturing on given device_id and queue.
-- 
2.20.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [RFC 7/8] pcapng: add new library for writing pcapng files
  2019-10-07 16:52 [dpdk-dev] [RFC 0/8] Packet Capture enhancements Stephen Hemminger
                   ` (5 preceding siblings ...)
  2019-10-07 16:52 ` [dpdk-dev] [RFC 6/8] pdump: add packet header truncation Stephen Hemminger
@ 2019-10-07 16:52 ` Stephen Hemminger
  2019-10-07 16:52 ` [dpdk-dev] [RFC 8/8] app/capture: add packet capture using pcapng Stephen Hemminger
  7 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 16:52 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

Simple library for formatting pcapng files used by wireshark
and other programs.

See PCAP next generation file format specification
https://github.com/pcapng/pcapng

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 config/common_base                       |   6 +
 lib/Makefile                             |   2 +
 lib/librte_pcapng/Makefile               |  22 ++
 lib/librte_pcapng/meson.build            |  10 +
 lib/librte_pcapng/pcapng_proto.h         | 112 ++++++
 lib/librte_pcapng/rte_pcapng.c           | 449 +++++++++++++++++++++++
 lib/librte_pcapng/rte_pcapng.h           | 132 +++++++
 lib/librte_pcapng/rte_pcapng_version.map |  12 +
 lib/meson.build                          |   2 +-
 mk/rte.app.mk                            |   1 +
 10 files changed, 747 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_pcapng/Makefile
 create mode 100644 lib/librte_pcapng/meson.build
 create mode 100644 lib/librte_pcapng/pcapng_proto.h
 create mode 100644 lib/librte_pcapng/rte_pcapng.c
 create mode 100644 lib/librte_pcapng/rte_pcapng.h
 create mode 100644 lib/librte_pcapng/rte_pcapng_version.map

diff --git a/config/common_base b/config/common_base
index 8ef75c2039a2..0ccfcfae377d 100644
--- a/config/common_base
+++ b/config/common_base
@@ -998,6 +998,12 @@ CONFIG_RTE_KNI_PREEMPT_DEFAULT=y
 #
 CONFIG_RTE_LIBRTE_PDUMP=y
 
+#
+# Compile the pcapng library
+#
+CONFIG_RTE_LIBRTE_PCAPNG=y
+
+#
 #
 # Compile vhost user library
 #
diff --git a/lib/Makefile b/lib/Makefile
index 41c463d92139..47786030fade 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -102,6 +102,8 @@ DIRS-$(CONFIG_RTE_LIBRTE_REORDER) += librte_reorder
 DEPDIRS-librte_reorder := librte_eal librte_mempool librte_mbuf
 DIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += librte_pdump
 DEPDIRS-librte_pdump := librte_eal librte_mempool librte_mbuf librte_ethdev
+DIRS-$(CONFIG_RTE_LIBRTE_PCAPNG) += librte_pcapng
+DEPDIRS-librte_pcapng := librte_eal librte_mbuf librte_ethdev librte_net
 DIRS-$(CONFIG_RTE_LIBRTE_GSO) += librte_gso
 DEPDIRS-librte_gso := librte_eal librte_mbuf librte_ethdev librte_net
 DEPDIRS-librte_gso += librte_mempool
diff --git a/lib/librte_pcapng/Makefile b/lib/librte_pcapng/Makefile
new file mode 100644
index 000000000000..eaf3e85e3ad2
--- /dev/null
+++ b/lib/librte_pcapng/Makefile
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019 Microsoft Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_pcapng.a
+
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
+EXPORT_MAP := rte_pcapng_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_PCAPNG) := rte_pcapng.c
+
+# install includes
+SYMLINK-$(CONFIG_RTE_LIBRTE_PCAPNG)-include := rte_pcapng.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_pcapng/meson.build b/lib/librte_pcapng/meson.build
new file mode 100644
index 000000000000..7904b4759b00
--- /dev/null
+++ b/lib/librte_pcapng/meson.build
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019 Microsoft Corporation
+
+version = 1
+sources = files('rte_pcapng.c')
+headers = files('rte_pcapng.h')
+
+allow_experimental_apis = true
+
+deps += ['ethdev']
diff --git a/lib/librte_pcapng/pcapng_proto.h b/lib/librte_pcapng/pcapng_proto.h
new file mode 100644
index 000000000000..8f35bc31d979
--- /dev/null
+++ b/lib/librte_pcapng/pcapng_proto.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Microsoft Corporation
+ *
+ * Pcapng protocol data structures
+ *  from Draft RFC
+ */
+
+enum pcapng_block_types {
+	PCAPNG_INTERFACE_BLOCK		= 1,
+	PCAPNG_PACKET_BLOCK,		/* Obsolete */
+	PCAPNG_SIMPLE_PACKET_BLOCK,
+	PCAPNG_NAME_RESOLUTION_BLOCK,
+	PCAPNG_INTERFACE_STATS_BLOCK,
+	PCAPNG_ENHANCED_PACKET_BLOCK,
+
+	PCAPNG_SECTION_BLOCK		= 0x0A0D0D0A,
+};
+
+struct pcapng_option {
+	uint16_t code;
+	uint16_t length;
+	uint8_t data[];
+};
+
+#define PCAPNG_BYTE_ORDER_MAGIC 0x1A2B3C4D
+#define PCAPNG_MAJOR_VERS 1
+#define PCAPNG_MINOR_VERS 0
+
+enum pcapng_opt {
+	PCAPNG_OPT_END	= 0,
+	PCAPNG_OPT_COMMENT = 1,
+};
+
+struct pcapng_section_header {
+	uint32_t block_type;
+	uint32_t block_length;
+	uint32_t byte_order_magic;
+	uint16_t major_version;
+	uint16_t minor_version;
+	uint64_t section_length;
+};
+
+enum pcapng_section_opt {
+	PCAPNG_SHB_HARDWARE = 2,
+	PCAPNG_SHB_OS	    = 3,
+	PCAPNG_SHB_USERAPPL = 4,
+};
+
+struct pcapng_interface_block {
+	uint32_t block_type;	/* 1 */
+	uint32_t block_length;
+	uint16_t link_type;
+	uint16_t reserved;
+	uint32_t snap_len;
+};
+
+enum pcapng_interface_options {
+	PCAPNG_IFB_NAME	 = 2,
+	PCAPNG_IFB_DESCRIPTION,
+	PCAPNG_IFB_IPV4ADDR,
+	PCAPNG_IFB_IPV6ADDR,
+	PCAPNG_IFB_MACADDR,
+	PCAPNG_IFB_EUIADDR,
+	PCAPNG_IFB_SPEED,
+	PCAPNG_IFB_TSRESOL,
+	PCAPNG_IFB_TZONE,
+	PCAPNG_IFB_FILTER,
+	PCAPNG_IFB_OS,
+	PCAPNG_IFB_FCSLEN,
+	PCAPNG_IFB_TSOFFSET,
+	PCAPNG_IFB_HARDWARE,
+};
+
+struct pcapng_enhance_packet_block {
+	uint32_t block_type;	/* 6 */
+	uint32_t block_length;
+	uint32_t interface_id;
+	uint32_t timestamp_hi;
+	uint32_t timestamp_lo;
+	uint32_t capture_length;
+	uint32_t original_length;
+};
+
+enum pcapng_epb_options {
+	PCAPNG_EPB_FLAGS = 2,
+	PCAPNG_EPB_HASH,
+	PCAPNG_EPB_DROPS
+};
+
+struct pcapng_simple_packet {
+	uint32_t block_type;	/* 3 */
+	uint32_t block_length;
+	uint32_t packet_length;
+};
+
+struct pcapng_statistics {
+	uint32_t block_type;	/* 5 */
+	uint32_t block_length;
+	uint32_t interface_id;
+	uint32_t timestamp_hi;
+	uint32_t timestamp_lo;
+};
+
+enum pcapng_isb_options {
+	PCAPNG_ISB_STARTTIME = 2,
+	PCAPNG_ISB_ENDTIME,
+	PCAPNG_ISB_IFRECV,
+	PCAPNG_ISB_IFDROP,
+	PCAPNG_ISB_FILTERACCEPT,
+	PCAPNG_ISB_OSDROP,
+	PCAPNG_ISB_USRDELIV
+};
diff --git a/lib/librte_pcapng/rte_pcapng.c b/lib/librte_pcapng/rte_pcapng.c
new file mode 100644
index 000000000000..2beb3c24f882
--- /dev/null
+++ b/lib/librte_pcapng/rte_pcapng.c
@@ -0,0 +1,449 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Microsoft Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <net/if.h>
+
+#include <rte_config.h>
+#include <rte_ethdev.h>
+#include <rte_mbuf.h>
+#include <rte_cycles.h>
+#include <rte_version.h>
+#include <rte_pcapng.h>
+#include <rte_malloc.h>
+
+#include "pcapng_proto.h"
+
+/* conversion from DPDK speed to PCAPNG */
+#define PCAPNG_MBPS_SPEED 1000000ull
+
+#define PCAPNG_MAX_COMMENT 256
+
+#define NS_PER_US	1000ul		/* nanoseconds per microsecond */
+#define US_PER_SEC	1000000ul	/* microseconds per second */
+#define NS_PER_SEC	(NS_PER_US * US_PER_SEC)
+
+/* Private state for the library */
+struct rte_pcapng {
+	int  outfd;		/* output file */
+
+	uint64_t tsc_t0;	/* TSC cycles when opened */
+	uint32_t tsc_per_us;	/* TSC cycles per microsecond */
+	uint64_t tsc_offset;	/* nanosecond since 1970 */
+
+	uint32_t next_index;
+	uint32_t port_index[RTE_MAX_ETHPORTS];
+};
+
+/* length of option including padding */
+static size_t pcapng_optlen(uint16_t len)
+{
+	return RTE_ALIGN(sizeof(struct pcapng_option) + len,
+			 sizeof(uint32_t));
+}
+
+/* build TLV option and return location of next */
+static struct pcapng_option *
+pcapng_add_option(struct pcapng_option *popt, uint16_t code,
+		  const void *data, uint16_t len)
+{
+	popt->code = code;
+	popt->length = len;
+	memcpy(popt->data, data, len);
+
+	return (struct pcapng_option *)((uint8_t *)popt + pcapng_optlen(len));
+}
+
+/*
+ * Convert from DPDK timestamp (tsc cycles)
+ * to what Pcapng uses (nanosecond since 1 Jan 1970 UTC)
+ */
+static inline uint64_t tsc_to_timestamp(const rte_pcapng_t *self,
+					uint64_t tsc)
+{
+	uint64_t ticks = tsc - self->tsc_t0;
+
+	return (ticks * NS_PER_US) / self->tsc_per_us + self->tsc_offset;
+}
+
+/*
+ * Write required initial section header describing the capture
+ */
+static int
+write_section_block(rte_pcapng_t *self,
+		    const char *os, const char *hw,
+		    const char *app, const char *comment)
+{
+	struct pcapng_section_header *hdr;
+	struct pcapng_option *opt;
+	void *buf;
+	size_t len;
+	ssize_t cc;
+
+	len = sizeof(*hdr);
+	if (hw)
+		len += pcapng_optlen(strlen(hw));
+	if (os)
+		len += pcapng_optlen(strlen(os));
+	if (app)
+		len += pcapng_optlen(strlen(app));
+	if (comment)
+		len += pcapng_optlen(strlen(comment));
+
+	len += pcapng_optlen(0);
+	len += sizeof(uint32_t);
+
+	buf = calloc(1, len);
+	if (!buf)
+		return -1;
+
+	hdr = (struct pcapng_section_header *)buf;
+	*hdr = (struct pcapng_section_header) {
+		.block_type = PCAPNG_SECTION_BLOCK,
+		.block_length = len,
+		.byte_order_magic = PCAPNG_BYTE_ORDER_MAGIC,
+		.major_version = PCAPNG_MAJOR_VERS,
+		.minor_version = PCAPNG_MINOR_VERS,
+		.section_length = UINT64_MAX,
+	};
+	hdr->block_length = len;
+
+	opt = (struct pcapng_option *)(hdr + 1);
+	if (comment)
+		opt = pcapng_add_option(opt, PCAPNG_OPT_COMMENT,
+					comment, strlen(comment));
+	if (hw)
+		opt = pcapng_add_option(opt, PCAPNG_SHB_HARDWARE,
+					hw, strlen(hw));
+	if (os)
+		opt = pcapng_add_option(opt, PCAPNG_SHB_OS,
+					os, strlen(os));
+	if (app)
+		opt = pcapng_add_option(opt, PCAPNG_SHB_USERAPPL,
+					app, strlen(app));
+
+	opt = pcapng_add_option(opt, PCAPNG_OPT_END, NULL, 0);
+	/* clone block_length after option */
+	memcpy(opt, &hdr->block_length, sizeof(uint32_t));
+
+	cc = write(self->outfd, buf, len);
+	free(buf);
+
+	return cc;
+}
+
+static int
+capture_header(rte_pcapng_t *self,
+	       const char *os, const char *hw,
+	       const char *app, const char *comment)
+{
+	char osbuf[256];
+
+	if (app == NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (os == NULL) {
+		struct utsname uts;
+
+		uname(&uts);
+		snprintf(osbuf, sizeof(osbuf),
+			 "%s %s", uts.sysname, uts.release);
+		os = osbuf;
+	}
+
+	if (hw == NULL)
+		hw = rte_version();
+
+	if (write_section_block(self, os, hw, app, comment) > 0)
+		return 0;
+	else
+		return -1;
+}
+
+static ssize_t
+write_interface_block(rte_pcapng_t *self, const char *if_name,
+		      uint64_t if_speed, const uint8_t *mac_addr,
+		      const char *if_hw, const char *comment)
+{
+	struct pcapng_interface_block *hdr;
+	struct pcapng_option *opt;
+	const uint8_t tsresol = 9;	/* nanosecond resolution */
+	size_t len = sizeof(*hdr);
+	ssize_t cc;
+	void *buf;
+
+	len += pcapng_optlen(sizeof(tsresol));
+	if (if_name)
+		len += pcapng_optlen(strlen(if_name));
+	if (mac_addr)
+		len += pcapng_optlen(6);
+	if (if_speed)
+		len += pcapng_optlen(sizeof(uint64_t));
+	if (if_hw)
+		len += pcapng_optlen(strlen(if_hw));
+	if (comment)
+		len += pcapng_optlen(strlen(comment));
+
+	len += pcapng_optlen(0);
+	len += sizeof(uint32_t);
+	buf = calloc(1, len);
+	if (!buf)
+		return -ENOMEM;
+
+	hdr = (struct pcapng_interface_block *)buf;
+	hdr->block_type = PCAPNG_INTERFACE_BLOCK;
+	hdr->link_type = 1;	/* Ethernet */
+	hdr->block_length = len;
+
+	opt = (struct pcapng_option *)(hdr + 1);
+	if (if_name)
+		opt = pcapng_add_option(opt, PCAPNG_IFB_NAME,
+					 if_name, strlen(if_name));
+	if (mac_addr)
+		opt = pcapng_add_option(opt, PCAPNG_IFB_MACADDR,
+					mac_addr, RTE_ETHER_ADDR_LEN);
+	if (if_speed)
+		opt = pcapng_add_option(opt, PCAPNG_IFB_SPEED,
+					 &if_speed, sizeof(uint64_t));
+	opt = pcapng_add_option(opt, PCAPNG_IFB_TSRESOL,
+				&tsresol, sizeof(tsresol));
+	if (if_hw)
+		opt = pcapng_add_option(opt, PCAPNG_IFB_HARDWARE,
+					 if_hw, strlen(if_hw));
+	if (comment)
+		opt = pcapng_add_option(opt, PCAPNG_OPT_COMMENT,
+					comment, strlen(comment));
+
+	opt = pcapng_add_option(opt, PCAPNG_OPT_END, NULL, 0);
+
+	memcpy(opt, &hdr->block_length, sizeof(uint32_t));
+	cc = write(self->outfd, buf, len);
+	free(buf);
+
+	return cc;
+}
+
+int
+rte_pcapng_add_interface(rte_pcapng_t *self, uint16_t port,
+			 const char *comment, uint32_t flags)
+{
+	struct rte_eth_dev_info dev_info;
+	struct rte_ether_addr macaddr;
+	const struct rte_device *dev;
+	struct rte_eth_link link;
+	char ifname[IF_NAMESIZE];
+	char ifhw[256];
+	uint64_t speed = 0;
+	int cc;
+
+	if (flags != 0 || port >= RTE_MAX_ETHPORTS) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	rte_eth_dev_info_get(port, &dev_info);
+
+	/* use the kernel interface name if there is one, else "dpdk:<port>" */
+	if (if_indextoname(dev_info.if_index, ifname) == NULL)
+		snprintf(ifname, IF_NAMESIZE, "dpdk:%u", port);
+
+	/* hardware string is "<bus>-<device>"; ifhw is unset when dev is NULL */
+	dev = dev_info.device;
+	if (dev)
+		snprintf(ifhw, sizeof(ifhw),
+			 "%s-%s", dev->bus->name, dev->name);
+
+	/* DPDK reports Mbps; speed stays 0 (option omitted) when link is down */
+	rte_eth_link_get(port, &link);
+	if (link.link_status == ETH_LINK_UP)
+		speed = link.link_speed * PCAPNG_MBPS_SPEED;
+
+	rte_eth_macaddr_get(port, &macaddr);
+
+	cc = write_interface_block(self, ifname, speed,
+				   macaddr.addr_bytes,
+				   dev ? ifhw : NULL,
+				   comment);
+	if (cc < 0) {
+		rte_errno = errno;
+		return -1;
+	}
+
+	self->port_index[port] = self->next_index++;
+	return 0;
+}
+
+/* Create new pcapng writer handle; on failure fd is closed, NULL returned */
+rte_pcapng_t *
+rte_pcapng_fdopen(int fd,
+		  const char *osname, const char *hardware,
+		  const char *appname, const char *comment,
+		  uint32_t flags)
+{
+	rte_pcapng_t *self;
+	struct timeval tv;
+
+	if (flags != 0) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	self = rte_zmalloc("pcapng", sizeof(*self), 0);
+	if (!self) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+
+	self->outfd = fd;
+
+	/* compute clock offsets */
+	self->tsc_t0 = rte_rdtsc();
+	self->tsc_per_us = rte_get_tsc_hz() / US_PER_SEC;
+	gettimeofday(&tv, NULL);
+	self->tsc_offset = (tv.tv_sec * US_PER_SEC + tv.tv_usec) * NS_PER_US;
+
+	if (capture_header(self, osname, hardware,
+			   appname, comment) < 0)
+		goto error;
+
+	return self;
+
+error:
+	rte_errno = errno;	/* save before close() can overwrite errno */
+	close(self->outfd);
+	rte_free(self);
+	return NULL;
+}
+
+void
+rte_pcapng_close(rte_pcapng_t *self)
+{
+	close(self->outfd);
+	rte_free(self);
+}
+
+int
+rte_pcapng_dump_packet(rte_pcapng_t *self, uint16_t port,
+		       const struct rte_mbuf *m,
+		       enum rte_pcapng_direction dir,
+		       const char *comment)
+{
+	struct pcapng_enhance_packet_block hdr;
+	uint32_t snap_len, padded_len, pad_bytes;
+	uint64_t ts = tsc_to_timestamp(self, m->timestamp);
+	static uint64_t zero_pad;
+	struct pcapng_option *opt;
+	uint32_t flags = dir;
+	uint32_t options[512];
+	uint16_t i;
+	struct iovec iov[m->nb_segs + 4];
+	size_t len, optlen;
+
+#ifdef RTE_LIBRTE_ETHDEV_DEBUG
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port, -EINVAL);
+#endif
+	/* every segment is written below, so count the whole chain */
+	snap_len = rte_pktmbuf_pkt_len(m);
+	padded_len = RTE_ALIGN(snap_len, sizeof(uint32_t));
+	pad_bytes = padded_len - snap_len;
+
+	len = sizeof(hdr) + padded_len + sizeof(uint32_t);
+	opt = (struct pcapng_option *)options;
+
+	opt = pcapng_add_option(opt, PCAPNG_EPB_FLAGS,
+				&flags, sizeof(flags));
+
+	if (comment)
+		opt = pcapng_add_option(opt, PCAPNG_OPT_COMMENT, comment,
+					strnlen(comment, PCAPNG_MAX_COMMENT));
+
+	optlen = (char *)opt - (char *)options;
+	len += optlen;
+
+	hdr.block_type = PCAPNG_ENHANCED_PACKET_BLOCK;
+	hdr.block_length = len;
+
+	hdr.interface_id = self->port_index[port];
+	hdr.timestamp_hi = ts >> 32;
+	hdr.timestamp_lo = (uint32_t)ts;
+	hdr.capture_length = snap_len;
+	hdr.original_length = rte_pktmbuf_pkt_len(m);
+
+	iov[0].iov_base = &hdr;
+	iov[0].iov_len = sizeof(hdr);
+
+	for (i = 1; m; ++i, m = m->next) {
+		iov[i].iov_base = rte_pktmbuf_mtod(m, void *);
+		iov[i].iov_len = rte_pktmbuf_data_len(m);
+	}
+
+	if (pad_bytes > 0) {
+		iov[i].iov_base = &zero_pad;
+		iov[i].iov_len = pad_bytes;
+		++i;
+	}
+
+	iov[i].iov_base = options;
+	iov[i].iov_len = optlen;
+	++i;
+
+	iov[i].iov_base = &hdr.block_length;
+	iov[i].iov_len = sizeof(uint32_t);
+	++i;
+
+	if (unlikely(writev(self->outfd, iov, i) < 0)) {
+		rte_errno = errno;
+		return -1;
+	}
+
+	return 0;
+}
+
+int
+rte_pcapng_dump_tx(rte_pcapng_t *out, uint16_t port,
+		   struct rte_mbuf *pkts[], uint32_t nb_pkts)
+{
+	uint32_t i;
+	int r = 0;
+
+	for (i = 0; i < nb_pkts; i++) {
+		struct rte_mbuf *m = pkts[i];
+
+		r = rte_pcapng_dump_packet(out, port, m,
+					   RTE_PCAPNG_DIR_OUTBOUND, NULL);
+		if (unlikely(r < 0))
+			break;
+	}
+
+	return r;
+}
+
+int
+rte_pcapng_dump_rx(rte_pcapng_t *out,
+		   struct rte_mbuf *pkts[], uint32_t nb_pkts)
+{
+	uint32_t i;
+	int r = 0;
+
+	for (i = 0; i < nb_pkts; i++) {
+		struct rte_mbuf *m = pkts[i];
+
+		r = rte_pcapng_dump_packet(out, m->port, m,
+					   RTE_PCAPNG_DIR_INBOUND, NULL);
+		if (unlikely(r < 0))
+			break;
+	}
+	return r;
+}
diff --git a/lib/librte_pcapng/rte_pcapng.h b/lib/librte_pcapng/rte_pcapng.h
new file mode 100644
index 000000000000..23763c6882c8
--- /dev/null
+++ b/lib/librte_pcapng/rte_pcapng.h
@@ -0,0 +1,132 @@
+/*
+ * Simple library to write files in Pcap-ng format.
+ * Copyright(c) 2019 Microsoft Corporation
+ * All rights reserved.
+ */
+
+
+/* Handle used for functions in this library. */
+typedef struct rte_pcapng rte_pcapng_t;
+
+/**
+ * Open new packet capture file
+ *
+ * @param fd
+ *   file descriptor
+ * @param osname
+ *   Operating system name.
+ *   If NULL will be filled in based on result of uname().
+ * @param hardware
+ *   Hardware information.
+ *   If NULL will be filled in with DPDK version.
+ * @param appname
+ *   Application name.
+ * @param comment
+ *   Comment for the file (optional can be NULL)
+ * @param flags
+ *   Options for capture (reserved must be 0)
+ * @return
+ *   handle to library, or NULL in case of error (and rte_errno is set).
+ */
+__rte_experimental
+rte_pcapng_t *
+rte_pcapng_fdopen(int fd,
+		  const char *osname, const char *hardware,
+		  const char *appname, const char *comment,
+		  uint32_t flags);
+
+
+enum rte_pcapng_direction {
+	RTE_PCAPNG_DIR_UNKNOWN = 0,
+	RTE_PCAPNG_DIR_INBOUND = 1,
+	RTE_PCAPNG_DIR_OUTBOUND = 2,
+};
+
+/**
+ * Add interface to capture file
+ * This must be done after opening and before dumping any packets.
+ * Call once for each port being followed.
+ *
+ * @param self
+ *  The handle to the packet capture file
+ * @param port
+ *  The Ethernet port being captured.
+ * @param comment
+ *   Comment for the interface (optional can be NULL)
+ * @param flags
+ *   Options for capture (reserved must be 0)
+ * @return
+ *  0 on success, -1 on failure (and rte_errno is set).
+ */
+__rte_experimental
+int
+rte_pcapng_add_interface(rte_pcapng_t *self, uint16_t port,
+			 const char *comment, uint32_t flags);
+
+/**
+ * Record packet in capture file
+ *
+ * @param self
+ *  The handle to the packet capture file
+ * @param port
+ *  The Ethernet port the packet was received or sent on.
+ * @param m
+ *  The packet to record
+ * @param dir
+ *  Direction in/out or unknown
+ * @param comment
+ *  Comment to add to record (optional can be NULL)
+ * @return
+ *  0 on success, -1 on failure (and rte_errno is set).
+ */
+__rte_experimental
+int
+rte_pcapng_dump_packet(rte_pcapng_t *self, uint16_t port,
+		       const struct rte_mbuf *m,
+		       enum rte_pcapng_direction dir,
+		       const char *comment);
+/**
+ * Dump packets to be transmitted.
+ *
+ * @param out
+ *  The handle to the packet capture file
+ * @param port
+ *  The Ethernet port on which the packet is being sent.
+ * @param pkts
+ *  The packets to be recorded.
+ * @param nb_pkts
+ *  The number of packets in the burst pointed to by "pkts".
+ * @return
+ *  0 on success, -1 on failure (and rte_errno is set).
+ */
+__rte_experimental
+int
+rte_pcapng_dump_tx(rte_pcapng_t *out, uint16_t port,
+		   struct rte_mbuf *pkts[], uint32_t nb_pkts);
+
+/**
+ * Dump received packets.
+ *
+ * @param out
+ *  The handle to the packet capture file
+ * @param pkts
+ *  The packets to be recorded; the port is taken from each mbuf.
+ * @param nb_pkts
+ *  The number of packets in the burst pointed to by "pkts".
+ * @return
+ *  0 on success, -1 on failure (and rte_errno is set).
+ */
+__rte_experimental
+int
+rte_pcapng_dump_rx(rte_pcapng_t *out,
+		   struct rte_mbuf *pkts[], uint32_t nb_pkts);
+
+/**
+ * Close capture file
+ *
+ * @param self
+ *  handle to library
+ */
+__rte_experimental
+void
+rte_pcapng_close(rte_pcapng_t *self);
diff --git a/lib/librte_pcapng/rte_pcapng_version.map b/lib/librte_pcapng/rte_pcapng_version.map
new file mode 100644
index 000000000000..eb8ae2b50984
--- /dev/null
+++ b/lib/librte_pcapng/rte_pcapng_version.map
@@ -0,0 +1,12 @@
+EXPERIMENTAL {
+	global:
+
+	rte_pcapng_fdopen;
+	rte_pcapng_add_interface;
+	rte_pcapng_dump_packet;
+	rte_pcapng_dump_tx;
+	rte_pcapng_dump_rx;
+	rte_pcapng_close;
+
+	local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index e5ff83893489..cb07e38d8b58 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -21,7 +21,7 @@ libraries = [
 	'distributor', 'efd', 'eventdev',
 	'gro', 'gso', 'ip_frag', 'jobstats',
 	'kni', 'latencystats', 'lpm', 'member',
-	'power', 'pdump', 'rawdev',
+	'power', 'pdump', 'rawdev', 'pcapng',
 	'rcu', 'reorder', 'sched', 'security', 'stack', 'vhost',
 	# ipsec lib depends on net, crypto and security
 	'ipsec',
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index ba5c39e01957..b50b74ed6c99 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -42,6 +42,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT)           += -lrte_port
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT)           += --no-whole-archive
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP)          += -lrte_pdump
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PCAPNG)         += -lrte_pcapng
 _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR)    += -lrte_distributor
 _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG)        += -lrte_ip_frag
 _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lrte_meter
-- 
2.20.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [RFC 8/8] app/capture: add packet capture using pcapng
  2019-10-07 16:52 [dpdk-dev] [RFC 0/8] Packet Capture enhancements Stephen Hemminger
                   ` (6 preceding siblings ...)
  2019-10-07 16:52 ` [dpdk-dev] [RFC 7/8] pcapng: add new library for writing pcapng files Stephen Hemminger
@ 2019-10-07 16:52 ` Stephen Hemminger
  7 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 16:52 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

New application (dpdk-capture) with syntax analogous to tshark's
dumpcap command. It runs as a secondary process and produces
capture output in pcapng format.

It does not use DPDK style EAL arguments; instead the flags
are meant to be the same as dumpcap.

The program depends on libpcap since it uses the pcap_compile()
function to compile a string into a BPF program.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/Makefile            |   1 +
 app/capture/Makefile    |  19 ++
 app/capture/main.c      | 675 ++++++++++++++++++++++++++++++++++++++++
 app/capture/meson.build |  22 ++
 app/meson.build         |   1 +
 config/common_base      |   5 +
 6 files changed, 723 insertions(+)
 create mode 100644 app/capture/Makefile
 create mode 100644 app/capture/main.c
 create mode 100644 app/capture/meson.build

diff --git a/app/Makefile b/app/Makefile
index 28acbceca904..509cd7f4de13 100644
--- a/app/Makefile
+++ b/app/Makefile
@@ -7,6 +7,7 @@ DIRS-$(CONFIG_RTE_APP_TEST) += test
 DIRS-$(CONFIG_RTE_TEST_PMD) += test-pmd
 DIRS-$(CONFIG_RTE_PROC_INFO) += proc-info
 DIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += pdump
+DIRS-$(CONFIG_RTE_APP_CAPTURE) += capture
 DIRS-$(CONFIG_RTE_LIBRTE_ACL) += test-acl
 DIRS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test-cmdline
 DIRS-$(CONFIG_RTE_LIBRTE_PIPELINE) += test-pipeline
diff --git a/app/capture/Makefile b/app/capture/Makefile
new file mode 100644
index 000000000000..78ff7d2e97bf
--- /dev/null
+++ b/app/capture/Makefile
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019 Microsoft Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+ifeq ($(CONFIG_RTE_LIBRTE_PCAPNG),y)
+
+APP = dpdk-capture
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+LDLIBS += -lpcap
+
+SRCS-y := main.c
+
+include $(RTE_SDK)/mk/rte.app.mk
+
+endif
diff --git a/app/capture/main.c b/app/capture/main.c
new file mode 100644
index 000000000000..394c1edcc01b
--- /dev/null
+++ b/app/capture/main.c
@@ -0,0 +1,675 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Microsoft Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/utsname.h>
+#include <fcntl.h>
+#include <sys/queue.h>
+#include <net/if.h>
+
+#include <rte_eal.h>
+#include <rte_version.h>
+#include <rte_alarm.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_mempool.h>
+#include <rte_pdump.h>
+#include <rte_string_fns.h>
+#include <rte_malloc.h>
+#include <rte_pcapng.h>
+
+#include <pcap/pcap.h>
+
+/* Name used by create_ring() to look up or create the packet ring */
+#define RING_NAME "capture-ring"
+/* Delay passed to rte_eal_alarm_set() between checks of the primary process */
+#define MONITOR_INTERVAL  (500 * 1000)
+/* Cache size argument used when creating the mbuf pool */
+#define MBUF_POOL_CACHE_SIZE 32
+/* Maximum number of mbufs dequeued per process_ring() call */
+#define BURST_SIZE 32
+/* Number of consecutive empty polls before process_ring() starts sleeping */
+#define SLEEP_THRESHOLD 1000
+
+/* Program name (basename of argv[0]), set in main() */
+static const char *prog;
+/* Set by signal_handler() and monitor_primary(); ends the main loop */
+static volatile bool quit_signal;
+/* -g: create the output file with mode 0640 instead of 0600 */
+static bool group_read;
+/* -q: do not print the running packet count */
+static bool quiet;
+/* Cleared by -p; when set, ports are put in promiscuous mode */
+static bool promiscuous_mode = true;
+/* Set by -n, cleared by -P; NOTE(review): not read anywhere else in this file */
+static bool use_pcapng = true;
+/* -w: output file name ("-" means stdout); generated if not given */
+static char *output_name;
+/* -f: capture filter expression handed to pcap_compile() */
+static const char *filter_str;
+/* -N: requested ring size; create_ring() rounds it up to a power of two */
+static unsigned int ring_size = 2048;
+/* -c stop threshold (0 = no limit) and running total of dequeued packets */
+static uint64_t packet_count, packets_received;
+/* --capture-comment: comment string passed to rte_pcapng_fdopen() */
+static const char *capture_comment;
+/* -s: snapshot length passed to pcap_open_dead() and rte_pdump_enable() */
+static uint16_t snaplen = UINT16_MAX;
+/* -d: print the compiled BPF program and exit */
+static bool dump_bpf;
+
+/* One entry per port selected for capture (see add_interface()). */
+struct interface {
+	/* Packets written to the capture file for this port (process_ring()) */
+	uint64_t received;
+	/* Printed by print_pdump_stats(); NOTE(review): never updated in this file */
+	uint64_t dropped;
+	/* Port id used for the pdump, promiscuous and pcapng interface calls */
+	uint16_t port;
+	/* Device name as shown to the user */
+	char name[RTE_ETH_NAME_MAX_LEN];
+
+	/* NOTE(review): not referenced anywhere else in this file */
+	struct rte_rxtx_callback *rx_cb[RTE_MAX_QUEUES_PER_PORT];
+
+	/* Linkage in the global 'interfaces' list */
+	TAILQ_ENTRY(interface) next;
+};
+
+/* List of all interfaces selected for capture, in selection order */
+TAILQ_HEAD(interface_list, interface);
+struct interface_list interfaces = TAILQ_HEAD_INITIALIZER(interfaces);
+
+/* Lookup table from port id to its entry; NULL for ports not being captured */
+static struct interface *port2intf[RTE_MAX_ETHPORTS];
+
+/*
+ * Print the command line help text to stdout.
+ *
+ * Only options that parse_opts() actually accepts are listed; the "-S"
+ * entry was dropped because the getopt string has no 'S' and the option
+ * is rejected as invalid.
+ */
+static void usage(void)
+{
+	printf("Usage: %s [options] ...\n\n", prog);
+	printf("Interface:\n"
+	       "  -i <interface>         name or port index of interface\n"
+	       "  -f <capture filter>    packet filter in libpcap filter syntax\n"
+	       "  -s <snaplen>           packet snapshot length (default: infinite)\n"
+	       "  -p                     don't put interface in promiscuous mode\n"
+	       "  -D                     print list of interfaces and exit\n"
+	       "  -d                     print generated BPF code for capture filter\n\n");
+	printf("Stop condition:\n"
+	       "  -c <packet count>      stop after N packets (default: infinite)\n\n");
+	printf("Output file:\n"
+	       "  -w <filename>          name of file to save (default: tempfile)\n"
+	       "  -g                     enable group read access of output file\n"
+	       "  -n                     use pcapng format instead of pcap (default)\n"
+	       "  -P                     use libpcap format instead of pcapng\n"
+	       "  --capture-comment <comment>\n"
+	       "                         add capture comment to output file\n");
+	/* The default shown for -N is the current value of ring_size */
+	printf("Miscellaneous\n"
+	       "  -N <packet limit>      maximum number of packets buffered (default: %u)\n",
+	       ring_size);
+	printf("  -q                     don't report packet capture counts\n"
+	       "  -v                     print version information and exit\n"
+	       "  -h                     display this help and exit\n");
+}
+
+/* Print the program name and version together with the DPDK version string. */
+static void version(void)
+{
+	const char *dpdk_version = rte_version();
+
+	printf("%s 1.0 (DPDK %s)\n", prog, dpdk_version);
+}
+
+/*
+ * Parse an unsigned numeric argument from the command line.
+ *
+ * arg:   text to convert; decimal, octal (0...) or hex (0x...) is accepted
+ * name:  option description used in error messages
+ * limit: largest acceptable value, or 0 to allow the full unsigned int range
+ *
+ * Returns the converted value. Exits the program with an error message when
+ * the text is not a plain unsigned number or the value does not fit.
+ */
+static unsigned int get_uint(const char *arg, const char *name,
+			     unsigned int limit)
+{
+	/* With no explicit limit the result must still fit the return type */
+	const unsigned int max = limit ? limit : ~0u;
+	unsigned long u;
+	char *endp;
+
+	errno = 0;
+	u = strtoul(arg, &endp, 0);
+
+	/*
+	 * strtoul() accepts a leading '-' and returns the negated value;
+	 * a valid unsigned number never contains one, so reject it.
+	 */
+	if (*arg == '\0' || *endp != '\0' || strchr(arg, '-') != NULL)
+		rte_exit(EXIT_FAILURE,
+			 "Specified %s \"%s\" is not a valid number\n",
+			 name, arg);
+
+	/* ERANGE: the text does not even fit in unsigned long */
+	if (errno == ERANGE || u > max)
+		rte_exit(EXIT_FAILURE,
+			 "Specified %s \"%s\" is too large (greater than %u)\n",
+			 name, arg, max);
+
+	return u;
+}
+
+/*
+ * Add an interface to the list of interfaces to capture.
+ *
+ * port: DPDK port id of the device
+ * name: device name shown to the user and stored in the entry
+ *
+ * The new entry is appended to 'interfaces' and registered in port2intf[].
+ * Selecting the same port more than once is ignored. Exits the program if
+ * the port id is out of range or memory cannot be allocated.
+ */
+static void add_interface(uint16_t port, const char *name)
+{
+	struct interface *intf;
+
+	/* port2intf[] only has RTE_MAX_ETHPORTS slots */
+	if (port >= RTE_MAX_ETHPORTS)
+		rte_exit(EXIT_FAILURE, "Invalid port number %u\n", port);
+
+	/* Already selected, e.g. "-i 0 -i 0": keep a single entry */
+	if (port2intf[port] != NULL)
+		return;
+
+	intf = calloc(1, sizeof(*intf));
+	if (!intf)
+		rte_exit(EXIT_FAILURE, "no memory for interface\n");
+
+	/*
+	 * Record the port id: it is what the pdump, promiscuous mode and
+	 * pcapng interface calls use to address this device.
+	 */
+	intf->port = port;
+	strlcpy(intf->name, name, sizeof(intf->name));
+
+	printf("Capturing on '%s'\n", name);
+
+	port2intf[port] = intf;
+	TAILQ_INSERT_TAIL(&interfaces, intf, next);
+}
+
+/* Add every DPDK port whose name can be resolved to the capture list. */
+static void select_all_interfaces(void)
+{
+	uint16_t port_id;
+	char ifname[RTE_ETH_NAME_MAX_LEN];
+
+	RTE_ETH_FOREACH_DEV(port_id) {
+		if (rte_eth_dev_get_name_by_port(port_id, ifname) >= 0)
+			add_interface(port_id, ifname);
+	}
+}
+
+/*
+ * Pick the interface to capture when no -i option was given:
+ * the first DPDK port whose name can be resolved (this matches
+ * what dumpcap does). Exits the program if there is none.
+ */
+static void set_default_interface(void)
+{
+	uint16_t port_id;
+	char ifname[RTE_ETH_NAME_MAX_LEN];
+
+	RTE_ETH_FOREACH_DEV(port_id) {
+		if (rte_eth_dev_get_name_by_port(port_id, ifname) >= 0) {
+			add_interface(port_id, ifname);
+			return;
+		}
+	}
+
+	rte_exit(EXIT_FAILURE, "No usable interfaces found\n");
+}
+
+/*
+ * Handle one -i argument: look up the interface and add it to the list.
+ *
+ * arg is tried, in order, as:
+ *   "*"          - select every available interface
+ *   device name  - resolved with rte_eth_dev_get_port_by_name()
+ *   port number  - parsed with get_uint() and validated against ethdev
+ *
+ * Exits the program if the argument matches none of these.
+ */
+static void select_interface(const char *arg)
+{
+	uint16_t port;
+
+	/* strcmp() returns 0 on a match, so compare explicitly */
+	if (strcmp(arg, "*") == 0)
+		select_all_interfaces();
+	else if (rte_eth_dev_get_port_by_name(arg, &port) == 0)
+		add_interface(port, arg);
+	else {
+		char name[RTE_ETH_NAME_MAX_LEN];
+
+		port = get_uint(arg, "port_number", UINT16_MAX);
+		if (rte_eth_dev_get_name_by_port(port, name) < 0)
+			rte_exit(EXIT_FAILURE, "Invalid port number %u\n",
+				 port);
+		add_interface(port, name);
+	}
+}
+
+/* Print "<port>. <name>" for every DPDK port whose name can be resolved. */
+static void show_interfaces(void)
+{
+	uint16_t port_id;
+	char ifname[RTE_ETH_NAME_MAX_LEN];
+
+	RTE_ETH_FOREACH_DEV(port_id) {
+		if (rte_eth_dev_get_name_by_port(port_id, ifname) >= 0)
+			printf("%u. %s\n", port_id, ifname);
+	}
+}
+
+/*
+ * Compile the filter expression in filter_str into a BPF program.
+ *
+ * len: set to the number of instructions in the compiled program
+ *
+ * Returns a copy of the instructions allocated with rte_malloc(); the
+ * caller releases it with rte_free(). Exits the program if the expression
+ * does not compile or memory cannot be allocated.
+ */
+static struct bpf_insn *compile_filter(uint32_t *len)
+{
+	struct bpf_program program;
+	struct bpf_insn *copy;
+	pcap_t *handle;
+	size_t nbytes;
+	int rc;
+
+	/* A "dead" handle is enough: it is only used for compiling */
+	handle = pcap_open_dead(DLT_EN10MB, snaplen);
+	if (handle == NULL)
+		rte_exit(EXIT_FAILURE, "can not open pcap\n");
+
+	rc = pcap_compile(handle, &program, filter_str,
+			  1, PCAP_NETMASK_UNKNOWN);
+	if (rc != 0)
+		rte_exit(EXIT_FAILURE, "pcap filter string not valid (%s)\n",
+			 pcap_geterr(handle));
+
+	nbytes = program.bf_len * sizeof(struct bpf_insn);
+	*len = program.bf_len;
+
+	/*
+	 * The filter has to live in shared memory so that the
+	 * primary process can read it.
+	 */
+	copy = rte_malloc("pcap_filter", nbytes, 0);
+	if (copy == NULL)
+		rte_exit(EXIT_FAILURE, "rte_malloc for filter failed\n");
+
+	rte_memcpy(copy, program.bf_insns, nbytes);
+
+	pcap_freecode(&program);
+	pcap_close(handle);
+
+	return copy;
+}
+
+/*
+ * Print the compiled BPF program, one instruction per line, then exit
+ * with status 0. Exits with an error if no filter was compiled.
+ */
+static void dump_filter(const struct bpf_insn *insn, uint32_t len)
+{
+	unsigned int pc;
+
+	if (!insn)
+		rte_exit(EXIT_FAILURE, "no filter specified\n");
+
+	for (pc = 0; pc < len; pc++)
+		printf("%s\n", bpf_image(&insn[pc], pc));
+
+	exit(0);
+}
+
+/*
+ * Parse command line options.
+ * These are chosen to be similar to the dumpcap command.
+ *
+ * Results are stored in the file-level option variables. Options that only
+ * print information (-D, -v, -h) exit with status 0; an unknown option
+ * prints the usage text and exits with status 1.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	static const struct option long_options[] = {
+		{ "capture-comment", required_argument, NULL, 0 },
+		{ "help",    no_argument, NULL, 'h' },
+		{ "version", no_argument, NULL, 'v' },
+		{ NULL },
+	};
+	int option_index, c;
+
+	for (;;) {
+		c = getopt_long(argc, argv, "i:f:ds:c:w:gN:pqvhDnP",
+				long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 0:
+			/* Long option without a short equivalent */
+			switch (option_index) {
+			case 0:
+				capture_comment = optarg;
+				break;
+			default:
+				usage();
+				exit(1);
+			}
+			break;
+		case 'i':
+			select_interface(optarg);
+			break;
+		case 'f':
+			filter_str = optarg;
+			break;
+		case 'd':
+			dump_bpf = true;
+			break;
+		case 's':
+			/*
+			 * snaplen is 16 bits wide: bound the value here so a
+			 * larger number is rejected instead of truncated.
+			 */
+			snaplen = get_uint(optarg, "snap_len", UINT16_MAX);
+			break;
+		case 'D':
+			show_interfaces();
+			exit(0);
+		case 'c':
+			packet_count = get_uint(optarg, "packet_count", 0);
+			break;
+		case 'w':
+			output_name = optarg;
+			break;
+		case 'g':
+			group_read = true;
+			break;
+		case 'N':
+			ring_size = get_uint(optarg, "packet_limit", 0);
+			break;
+		case 'p':
+			promiscuous_mode = false;
+			break;
+		case 'q':
+			quiet = true;
+			break;
+		case 'n':
+			use_pcapng = true;
+			break;
+		case 'P':
+			use_pcapng = false;
+			break;
+		case 'v':
+			version();
+			exit(0);
+		case 'h':
+			usage();
+			exit(0);
+		default:
+			/* Terminate the line so the usage text starts cleanly */
+			fprintf(stderr, "Invalid option: %s\n",
+				argv[optind - 1]);
+			usage();
+			exit(1);
+		}
+	}
+}
+
+/* Signal handler: ask the main loop to stop. */
+static void
+signal_handler(int sig_num __rte_unused)
+{
+	quit_signal = true;
+}
+
+/*
+ * Undo enable_pdump(): disable packet dump on every selected interface
+ * and, if promiscuous mode was requested, turn it off again.
+ */
+static void
+cleanup_pdump_resources(void)
+{
+	struct interface *cur;
+
+	for (cur = TAILQ_FIRST(&interfaces); cur != NULL;
+	     cur = TAILQ_NEXT(cur, next)) {
+		rte_pdump_disable(cur->port,
+				  RTE_PDUMP_ALL_QUEUES, RTE_PDUMP_FLAG_RXTX);
+
+		if (!promiscuous_mode)
+			continue;
+
+		rte_eth_promiscuous_disable(cur->port);
+	}
+}
+
+/*
+ * Alarm callback that checks whether the primary process is still alive.
+ * While it is, the alarm is re-armed; once it is gone the main loop is
+ * asked to stop. Does nothing if a stop was already requested.
+ */
+static void
+monitor_primary(void *arg __rte_unused)
+{
+	if (quit_signal)
+		return;
+
+	if (!rte_eal_primary_proc_alive(NULL)) {
+		fprintf(stderr,
+			"Primary process is no longer active, exiting...\n");
+		quit_signal = 1;
+		return;
+	}
+
+	rte_eal_alarm_set(MONITOR_INTERVAL, monitor_primary, NULL);
+}
+
+/*
+ * Arm the alarm that watches the primary process, so that this program
+ * exits once the primary does. A failure is reported but not fatal.
+ */
+static void
+enable_primary_monitor(void)
+{
+	int rc = rte_eal_alarm_set(MONITOR_INTERVAL, monitor_primary, NULL);
+
+	if (rc < 0)
+		fprintf(stderr, "Fail to enable monitor:%d\n", rc);
+}
+
+/* Cancel the primary process monitor alarm; a failure is only reported. */
+static void
+disable_primary_monitor(void)
+{
+	int rc = rte_eal_alarm_cancel(monitor_primary, NULL);
+
+	if (rc < 0)
+		fprintf(stderr, "Fail to disable monitor:%d\n", rc);
+}
+
+/* Print the received/dropped counters of every selected interface to stderr. */
+static void
+print_pdump_stats(void)
+{
+	struct interface *cur;
+
+	fputc('\n', stderr);
+
+	for (cur = TAILQ_FIRST(&interfaces); cur != NULL;
+	     cur = TAILQ_NEXT(cur, next))
+		fprintf(stderr, "Packets received/dropped on interface '%s': "
+			"%"PRIu64 "/%" PRIu64 "\n", cur->name,
+			cur->received, cur->dropped);
+}
+
+/*
+ * Start DPDK EAL with a fixed argument list.
+ * Unlike most DPDK programs, for usability,
+ * the arguments to EAL do not come from the user command line.
+ *
+ * Exits the program if the argument vector cannot be built, EAL
+ * initialization fails, or no Ethernet port is available.
+ */
+static void dpdk_init(void)
+{
+	const char *args[] = {
+		prog,
+		"--log-level", "error",
+		"--proc-type", "secondary",
+	};
+	int eal_argc = RTE_DIM(args);
+	char **eal_argv;
+	size_t i;
+
+	/*
+	 * rte_eal_init() takes a non-const argv, so hand it a mutable,
+	 * NULL terminated copy of the constant strings above.
+	 */
+	eal_argv = calloc(RTE_DIM(args) + 1, sizeof(*eal_argv));
+	if (!eal_argv)
+		rte_exit(EXIT_FAILURE, "EAL arg alloc failed\n");
+
+	for (i = 0; i < RTE_DIM(args); i++) {
+		eal_argv[i] = strdup(args[i]);
+		/* Never pass a NULL argument string on to EAL */
+		if (!eal_argv[i])
+			rte_exit(EXIT_FAILURE, "EAL arg alloc failed\n");
+	}
+
+	if (rte_eal_init(eal_argc, eal_argv) < 0)
+		rte_panic("EAL init failed\n");
+
+	if (rte_eth_dev_count_avail() == 0)
+		rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
+}
+
+/*
+ * Create the packet ring shared between the capture callbacks and this
+ * process, or reuse an existing ring with the same name.
+ *
+ * ring_size is first rounded up to a power of two (and to at least 2);
+ * the rounded value is stored back in ring_size because create_mempool()
+ * sizes the mbuf pool from it. Exits the program if the requested size
+ * cannot be rounded up or the ring cannot be created.
+ */
+static struct rte_ring *create_ring(void)
+{
+	/* Largest power of two an unsigned int can hold */
+	const unsigned int max_size = ~(~0u >> 1);
+	struct rte_ring *pring;
+	unsigned int size;
+
+	if (ring_size > max_size)
+		rte_exit(EXIT_FAILURE, "Ring size %u is too large\n",
+			 ring_size);
+
+	/*
+	 * Find next power of 2 >= ring_size. A plain doubling loop is well
+	 * defined for every input, including 0 and 1. Start at 2:
+	 * NOTE(review): a one slot rte_ring is assumed to hold no entries
+	 * (usable size is count - 1) - confirm against the rte_ring docs.
+	 */
+	size = 2;
+	while (size < ring_size)
+		size <<= 1;
+
+	if (size != ring_size) {
+		fprintf(stderr, "Ring size %u rounded up to %u\n",
+			ring_size, size);
+		ring_size = size;
+	}
+
+	pring = rte_ring_lookup(RING_NAME);
+	if (pring == NULL) {
+		pring = rte_ring_create(RING_NAME, ring_size,
+					rte_socket_id(), 0);
+		if (pring == NULL)
+			rte_exit(EXIT_FAILURE, "Could not create ring :%s\n",
+				 rte_strerror(rte_errno));
+	}
+	return pring;
+}
+
+/*
+ * Return the mbuf pool used for captured packet copies, creating it on
+ * first use. The pool holds twice as many mbufs as the ring has slots.
+ * Exits the program if the pool cannot be created.
+ */
+static struct rte_mempool *create_mempool(void)
+{
+	static const char pool_name[] = "capture_mbufs";
+	struct rte_mempool *pool;
+	size_t num_mbufs;
+
+	/* Reuse a pool that already exists under this name */
+	pool = rte_mempool_lookup(pool_name);
+	if (pool != NULL)
+		return pool;
+
+	num_mbufs = 2 * ring_size;
+	pool = rte_pktmbuf_pool_create_by_ops(pool_name, num_mbufs,
+					      MBUF_POOL_CACHE_SIZE, 0,
+					      RTE_MIN(snaplen,
+						      RTE_MBUF_DEFAULT_BUF_SIZE),
+					      rte_socket_id(), "ring_mp_sc");
+	if (pool == NULL)
+		rte_exit(EXIT_FAILURE,
+			 "Mempool (%s) creation failed: %s\n", pool_name,
+			 rte_strerror(rte_errno));
+
+	return pool;
+}
+
+/*
+ * Open the capture output and wrap it in a pcapng writer.
+ *
+ * Without -w a name of the form /tmp/<prog>_<port>_<name>_<timestamp>.pcapng
+ * is generated from the first selected interface. The name "-" selects
+ * stdout; any other name is opened for appending and created if needed,
+ * with mode 0640 when group read was requested and 0600 otherwise.
+ *
+ * Returns the result of rte_pcapng_fdopen(); the caller checks it.
+ */
+static rte_pcapng_t *create_output(void)
+{
+	int fd;
+
+	if (output_name == NULL) {
+		struct interface *first = TAILQ_FIRST(&interfaces);
+		time_t now = time(NULL);
+		struct tm *tm = localtime(&now);
+		char stamp[32];
+
+		if (tm == NULL)
+			rte_panic("localtime failed\n");
+
+		strftime(stamp, sizeof(stamp), "%Y%m%d%H%M%S", tm);
+		if (asprintf(&output_name, "/tmp/%s_%u_%s_%s.pcapng",
+			     prog, first->port, first->name, stamp) < 0)
+			rte_panic("asprintf failed\n");
+	}
+
+	if (strcmp(output_name, "-") != 0) {
+		mode_t mode;
+
+		if (group_read)
+			mode = 0640;
+		else
+			mode = 0600;
+
+		fd = open(output_name, O_WRONLY | O_APPEND | O_CREAT, mode);
+		if (fd < 0)
+			rte_exit(EXIT_FAILURE, "Can not open \"%s\": %s\n",
+				 output_name, strerror(errno));
+	} else {
+		fd = STDOUT_FILENO;
+	}
+
+	return rte_pcapng_fdopen(fd, NULL, NULL, prog, capture_comment, 0);
+}
+
+/*
+ * Write one interface record per selected interface (in command line
+ * order) to the capture file, ahead of any packet data.
+ */
+static void dump_interfaces(rte_pcapng_t *out)
+{
+	struct interface *cur;
+
+	for (cur = TAILQ_FIRST(&interfaces); cur != NULL;
+	     cur = TAILQ_NEXT(cur, next)) {
+		rte_pcapng_add_interface(out, cur->port, NULL, 0);
+	}
+}
+
+/*
+ * Turn on packet capture for every selected interface.
+ *
+ * r, mp:              ring and mbuf pool handed to rte_pdump_enable()
+ * filter, filter_len: compiled BPF program (NULL/0 when no filter is set)
+ *
+ * Promiscuous mode is enabled first unless it was disabled with -p.
+ * Exits the program if capture cannot be enabled on an interface.
+ */
+static void enable_pdump(struct rte_ring *r, struct rte_mempool *mp,
+			 const struct bpf_insn *filter, uint32_t filter_len)
+{
+	struct interface *cur;
+
+	for (cur = TAILQ_FIRST(&interfaces); cur != NULL;
+	     cur = TAILQ_NEXT(cur, next)) {
+		int rc;
+
+		if (promiscuous_mode)
+			rte_eth_promiscuous_enable(cur->port);
+
+		/*
+		 * NOTE(review): snaplen is passed twice, once before and
+		 * once after the flags - confirm against the
+		 * rte_pdump_enable() prototype.
+		 */
+		rc = rte_pdump_enable(cur->port,
+				      RTE_PDUMP_ALL_QUEUES,
+				      snaplen,
+				      RTE_PDUMP_FLAG_RXTX,
+				      snaplen,
+				      r, mp, filter, filter_len);
+		if (rc >= 0)
+			continue;
+
+		rte_exit(EXIT_FAILURE,
+			 "Packet dump enable failed: %s\n",
+			 rte_strerror(rte_errno));
+	}
+}
+
+/*
+ * Show the running count of captured packets on stderr.
+ * The previous value is overwritten by first emitting one backspace
+ * for every character printed by the last call.
+ */
+static void show_count(uint64_t count)
+{
+	/* Number of characters printed by the previous call */
+	static unsigned int bt;
+	unsigned int erase = bt;
+
+	while (erase > 0) {
+		fputc('\b', stderr);
+		erase--;
+	}
+
+	bt = fprintf(stderr, "%"PRIu64" ", count);
+}
+
+/*
+ * Drain one burst of packets from the ring into the capture file.
+ *
+ * out: capture file writer
+ * r:   ring filled by the capture callbacks
+ *
+ * Up to BURST_SIZE mbufs are dequeued. Each mbuf whose port belongs to a
+ * selected interface is written to the capture file and counted for that
+ * interface; every mbuf is freed afterwards. Once the ring has been found
+ * empty SLEEP_THRESHOLD times in a row, each further empty call sleeps
+ * briefly instead of spinning.
+ */
+static void process_ring(rte_pcapng_t *out, struct rte_ring *r)
+{
+	struct rte_mbuf *pkts[BURST_SIZE];
+	unsigned int i, avail, n;
+	static unsigned int empty_count;
+
+	n = rte_ring_sc_dequeue_burst(r, (void **) pkts, BURST_SIZE,
+				      &avail);
+	if (n == 0) {
+		/* don't consume endless amounts of cpu if idle */
+		if (empty_count < SLEEP_THRESHOLD)
+			++empty_count;
+		else
+			usleep(10);
+		return;
+	}
+
+	/* Restart the idle count; a fully drained ring counts as one miss */
+	empty_count = (avail == 0);
+
+	for (i = 0; i < n; i++) {
+		struct rte_mbuf *m = pkts[i];
+		struct interface *intf = NULL;
+
+		/*
+		 * m->port is a 16 bit value taken from the packet; only use
+		 * it as an index when it is inside port2intf[].
+		 */
+		if (m->port < RTE_MAX_ETHPORTS)
+			intf = port2intf[m->port];
+
+		if (likely(intf)) {
+			rte_pcapng_dump_packet(out, m->port, m,
+					       RTE_PCAPNG_DIR_UNKNOWN, NULL);
+			++intf->received;
+		}
+		rte_pktmbuf_free(m);
+	}
+
+	packets_received += n;
+
+	if (!quiet)
+		show_count(packets_received);
+}
+
+/*
+ * Program entry point.
+ *
+ * Starts EAL as a secondary process, parses the dumpcap style options,
+ * sets up the ring, mbuf pool and output file, enables capture on the
+ * selected interfaces and then copies packets from the ring to the file
+ * until interrupted, the primary process exits, or the requested packet
+ * count is reached.
+ *
+ * Returns 0 on success, EXIT_FAILURE if EAL cleanup fails.
+ */
+int main(int argc, char **argv)
+{
+	struct rte_ring *r;
+	struct rte_mempool *mp;
+	rte_pcapng_t *out;
+	struct bpf_insn *bpf_filter = NULL;
+	uint32_t bpf_len = 0;
+
+	prog = basename(argv[0]);
+
+	/* EAL must be up before options are parsed: -i and -D query ethdev */
+	dpdk_init();
+
+	parse_opts(argc, argv);
+
+	if (filter_str)
+		bpf_filter = compile_filter(&bpf_len);
+
+	/* -d: print the compiled filter and exit (does not return) */
+	if (dump_bpf)
+		dump_filter(bpf_filter, bpf_len);
+
+	if (TAILQ_EMPTY(&interfaces))
+		set_default_interface();
+
+	r = create_ring();
+	mp = create_mempool();
+	out = create_output();
+	if (out == NULL)
+		rte_exit(EXIT_FAILURE, "can not open output file: %s\n",
+			 rte_strerror(rte_errno));
+
+	dump_interfaces(out);
+
+	/*
+	 * Install the handlers before capture is enabled, so that an
+	 * interrupt arriving right after enable_pdump() still leaves
+	 * through cleanup_pdump_resources() below instead of killing the
+	 * process with capture left enabled.
+	 */
+	signal(SIGINT, signal_handler);
+	signal(SIGPIPE, SIG_IGN);
+
+	enable_pdump(r, mp, bpf_filter, bpf_len);
+
+	enable_primary_monitor();
+
+	if (!quiet) {
+		fprintf(stderr, "Packets captured: ");
+		show_count(0);
+	}
+
+	while (!quit_signal) {
+		process_ring(out, r);
+
+		/* -c: stop once the requested number of packets was seen */
+		if (packet_count != 0 && packets_received >= packet_count)
+			break;
+	}
+
+	disable_primary_monitor();
+
+	print_pdump_stats();
+
+	rte_pcapng_close(out);
+
+	cleanup_pdump_resources();
+	rte_free(bpf_filter);
+	rte_ring_free(r);
+	rte_mempool_free(mp);
+
+	return rte_eal_cleanup() ? EXIT_FAILURE : 0;
+}
diff --git a/app/capture/meson.build b/app/capture/meson.build
new file mode 100644
index 000000000000..9558f10562bd
--- /dev/null
+++ b/app/capture/meson.build
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019 Microsoft Corporation
+pcap_dep = dependency('pcap', required: false)
+if pcap_dep.found()
+	build = true
+else
+	# pcap got a pkg-config file only in 1.9.0 and before that meson uses
+	# an internal pcap-config finder, which is not compatible with
+	# cross-compilation, so try to fallback to find_library
+	pcap_dep = cc.find_library('pcap', required: false)
+	if pcap_dep.found() and cc.has_header('pcap.h', dependencies: pcap_dep)
+		build = true
+		pkgconfig_extra_libs += '-lpcap'
+	else
+		build = false
+		reason = 'missing dependency, "libpcap"'
+	endif
+endif
+
+sources = files('main.c')
+ext_deps += pcap_dep
+deps += ['ethdev', 'pdump', 'pcapng']
diff --git a/app/meson.build b/app/meson.build
index b0e6afbbe9d9..a33198182133 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -6,6 +6,7 @@ if is_windows
 endif
 
 apps = [
+	'capture',
 	'pdump',
 	'proc-info',
 	'test-acl',
diff --git a/config/common_base b/config/common_base
index 0ccfcfae377d..e1bab8e77988 100644
--- a/config/common_base
+++ b/config/common_base
@@ -1073,3 +1073,8 @@ CONFIG_RTE_APP_CRYPTO_PERF=y
 # Compile the eventdev application
 #
 CONFIG_RTE_APP_EVENTDEV=y
+
+#
+# Compile the capture application
+#
+CONFIG_RTE_APP_CAPTURE=n
-- 
2.20.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-07 16:52 ` [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering Stephen Hemminger
@ 2019-10-07 17:07   ` Jerin Jacob
  2019-10-07 17:33     ` Stephen Hemminger
  0 siblings, 1 reply; 20+ messages in thread
From: Jerin Jacob @ 2019-10-07 17:07 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dpdk-dev

On Mon, 7 Oct, 2019, 10:23 PM Stephen Hemminger, <stephen@networkplumber.org>
wrote:

> Simple classic BPF interpreter based off of libpcap.
>
> This is a copy of the BPF interpreter from libpcap which is
> modified to handle mbuf meta data. The existing pcap_offline_filter
> does not expose a way to match VLAN tags. Copying the BPF interpreter
> also means that rte_pdump still does not have a hard dependency
> on libpcap.
>

Why not use DPDK's librte_bpf library rather than implementing a cBPF
interpreter? Currently it supports eBPF, which is a superset of cBPF. If
this feature is very specific to cBPF, we could simply implement cBPF using
eBPF or implement a new cBPF program type. That scheme could also leverage
the existing JIT infrastructure. Using JIT will improve filtering
performance.

>
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-07 17:07   ` Jerin Jacob
@ 2019-10-07 17:33     ` Stephen Hemminger
  2019-10-07 19:33       ` Jerin Jacob
  0 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 17:33 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dpdk-dev

On Mon, 7 Oct 2019 22:37:43 +0530
Jerin Jacob <jerinjacobk@gmail.com> wrote:

> On Mon, 7 Oct, 2019, 10:23 PM Stephen Hemminger, <stephen@networkplumber.org>
> wrote:
> 
> > Simple classic BPF interpreter based off of libpcap.
> >
> > This is a copy of the BPF interpreter from libpcap which is
> > modified to handle mbuf meta data. The existing pcap_offline_filter
> > does not expose a way to match VLAN tags. Copying the BPF interpreter
> > also means that rte_pdump still does not have a hard dependency
> > on libpcap.
> >  
> 
> Why not use DPDK's librte_bpf library? Rather implementing cBPF
> interpreter. Currently it supports eBPF which is super set of cBPF.if is
> this features very specific to cBPF, we clould simply implement cBPF using
> eBPF or implement a new cBPF program type. That scheme could leverage
> existing JIT infrastructure also. Using JIT will improve filtering
> performance.
> 
> >
> >  

Because pcap library generates cBPF in its string to BPF compiler.
Translating cBPF to eBPF is non trivial.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-07 17:33     ` Stephen Hemminger
@ 2019-10-07 19:33       ` Jerin Jacob
  2019-10-07 21:45         ` Stephen Hemminger
  0 siblings, 1 reply; 20+ messages in thread
From: Jerin Jacob @ 2019-10-07 19:33 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dpdk-dev

On Mon, 7 Oct, 2019, 11:03 PM Stephen Hemminger, <stephen@networkplumber.org>
wrote:

> On Mon, 7 Oct 2019 22:37:43 +0530
> Jerin Jacob <jerinjacobk@gmail.com> wrote:
>
> > On Mon, 7 Oct, 2019, 10:23 PM Stephen Hemminger, <
> stephen@networkplumber.org>
> > wrote:
> >
> > > Simple classic BPF interpreter based off of libpcap.
> > >
> > > This is a copy of the BPF interpreter from libpcap which is
> > > modified to handle mbuf meta data. The existing pcap_offline_filter
> > > does not expose a way to match VLAN tags. Copying the BPF interpreter
> > > also means that rte_pdump still does not have a hard dependency
> > > on libpcap.
> > >
> >
> > Why not use DPDK's librte_bpf library? Rather implementing cBPF
> > interpreter. Currently it supports eBPF which is super set of cBPF.if is
> > this features very specific to cBPF, we clould simply implement cBPF
> using
> > eBPF or implement a new cBPF program type. That scheme could leverage
> > existing JIT infrastructure also. Using JIT will improve filtering
> > performance.
> >
> > >
> > >
>
> Because pcap library generates cBPF in its string to BPF compiler.
> Translating cBPF to eBPF is non trivial.
>

Then at least cBPF interpreter should move to librte_bpf. We can hook to
JIT if required in future.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-07 19:33       ` Jerin Jacob
@ 2019-10-07 21:45         ` Stephen Hemminger
  2019-10-08  3:47           ` Jerin Jacob
  0 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-07 21:45 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dpdk-dev

On Tue, 8 Oct 2019 01:03:17 +0530
Jerin Jacob <jerinjacobk@gmail.com> wrote:

> On Mon, 7 Oct, 2019, 11:03 PM Stephen Hemminger, <stephen@networkplumber.org>
> wrote:
> 
> > On Mon, 7 Oct 2019 22:37:43 +0530
> > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> >  
> > > On Mon, 7 Oct, 2019, 10:23 PM Stephen Hemminger, <
> > stephen@networkplumber.org>
> > > wrote:
> > >  
> > > > Simple classic BPF interpreter based off of libpcap.
> > > >
> > > > This is a copy of the BPF interpreter from libpcap which is
> > > > modified to handle mbuf meta data. The existing pcap_offline_filter
> > > > does not expose a way to match VLAN tags. Copying the BPF interpreter
> > > > also means that rte_pdump still does not have a hard dependency
> > > > on libpcap.
> > > >  
> > >
> > > Why not use DPDK's librte_bpf library? Rather implementing cBPF
> > > interpreter. Currently it supports eBPF which is super set of cBPF.if is
> > > this features very specific to cBPF, we clould simply implement cBPF  
> > using  
> > > eBPF or implement a new cBPF program type. That scheme could leverage
> > > existing JIT infrastructure also. Using JIT will improve filtering
> > > performance.
> > >  
> > > >
> > > >  
> >
> > Because pcap library generates cBPF in its string to BPF compiler.
> > Translating cBPF to eBPF is non trivial.
> >  
> 
> Then at least cBPF interpreter should move to librte_bpf. We can hook to
> JIT if required in future.

The opcodes for cBPF and eBPF are not compatible.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-07 21:45         ` Stephen Hemminger
@ 2019-10-08  3:47           ` Jerin Jacob
  2019-10-08  4:01             ` Stephen Hemminger
  0 siblings, 1 reply; 20+ messages in thread
From: Jerin Jacob @ 2019-10-08  3:47 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dpdk-dev

On Tue, 8 Oct, 2019, 3:15 AM Stephen Hemminger, <stephen@networkplumber.org>
wrote:

> On Tue, 8 Oct 2019 01:03:17 +0530
> Jerin Jacob <jerinjacobk@gmail.com> wrote:
>
> > On Mon, 7 Oct, 2019, 11:03 PM Stephen Hemminger, <
> stephen@networkplumber.org>
> > wrote:
> >
> > > On Mon, 7 Oct 2019 22:37:43 +0530
> > > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > >
> > > > On Mon, 7 Oct, 2019, 10:23 PM Stephen Hemminger, <
> > > stephen@networkplumber.org>
> > > > wrote:
> > > >
> > > > > Simple classic BPF interpreter based off of libpcap.
> > > > >
> > > > > This is a copy of the BPF interpreter from libpcap which is
> > > > > modified to handle mbuf meta data. The existing pcap_offline_filter
> > > > > does not expose a way to match VLAN tags. Copying the BPF
> interpreter
> > > > > also means that rte_pdump still does not have a hard dependency
> > > > > on libpcap.
> > > > >
> > > >
> > > > Why not use DPDK's librte_bpf library? Rather implementing cBPF
> > > > interpreter. Currently it supports eBPF which is super set of
> cBPF.if is
> > > > this features very specific to cBPF, we clould simply implement
> cBPF
> > > using
> > > > eBPF or implement a new cBPF program type. That scheme could leverage
> > > > existing JIT infrastructure also. Using JIT will improve filtering
> > > > performance.
> > > >
> > > > >
> > > > >
> > >
> > > Because pcap library generates cBPF in its string to BPF compiler.
> > > Translating cBPF to eBPF is non trivial.
> > >
> >
> > Then at least cBPF interpreter should move to librte_bpf. We can hook to
> > JIT if required in future.
>
> The opcodes for cBPF and eBPF are not compatiable.
>

Yeah. I am saying to add new program type in bpf library of cBPF. Obviously
pdump is not the correct place for cBPF interpreter. Moving to rte_libbpf
library would help to enable other applications or libraries to use cBPF
bpf program class.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-08  3:47           ` Jerin Jacob
@ 2019-10-08  4:01             ` Stephen Hemminger
  2019-10-08  4:15               ` Jerin Jacob
  0 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-08  4:01 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dpdk-dev

On Tue, 8 Oct 2019 09:17:08 +0530
Jerin Jacob <jerinjacobk@gmail.com> wrote:

> On Tue, 8 Oct, 2019, 3:15 AM Stephen Hemminger, <stephen@networkplumber.org>
> wrote:
> 
> > On Tue, 8 Oct 2019 01:03:17 +0530
> > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> >  
> > > On Mon, 7 Oct, 2019, 11:03 PM Stephen Hemminger, <
> > stephen@networkplumber.org>
> > > wrote:
> > >  
> > > > On Mon, 7 Oct 2019 22:37:43 +0530
> > > > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > > >  
> > > > > On Mon, 7 Oct, 2019, 10:23 PM Stephen Hemminger, <
> > > > stephen@networkplumber.org>
> > > > > wrote:
> > > > >  
> > > > > > Simple classic BPF interpreter based off of libpcap.
> > > > > >
> > > > > > This is a copy of the BPF interpreter from libpcap which is
> > > > > > modified to handle mbuf meta data. The existing pcap_offline_filter
> > > > > > does not expose a way to match VLAN tags. Copying the BPF  
> > interpreter  
> > > > > > also means that rte_pdump still does not have a hard dependency
> > > > > > on libpcap.
> > > > > >  
> > > > >
> > > > > Why not use DPDK's librte_bpf library? Rather implementing cBPF
> > > > > interpreter. Currently it supports eBPF which is super set of  
> > cBPF.if is  
> > > > > this features very specific to cBPF, we clould simply implement  
> > cBPF  
> > > > using  
> > > > > eBPF or implement a new cBPF program type. That scheme could leverage
> > > > > existing JIT infrastructure also. Using JIT will improve filtering
> > > > > performance.
> > > > >  
> > > > > >
> > > > > >  
> > > >
> > > > Because pcap library generates cBPF in its string to BPF compiler.
> > > > Translating cBPF to eBPF is non trivial.
> > > >  
> > >
> > > Then at least cBPF interpreter should move to librte_bpf. We can hook to
> > > JIT if required in future.  
> >
> > The opcodes for cBPF and eBPF are not compatiable.
> >  
> 
> Yeah. I am saying to add new program type in bpf library of cBPF. Obviously
> pdump is not the correct place for cBPF interpreter. Moving to rte_libbpf
> library would help to enable other applications or libraries to use cBPF
> bpf program class.

The problem is you need a version of string to BPF program which is what
the libpcap pcap_compile() function does for you. eBPF as used now is all
about having a full language (CLANG or GCC) and that is not what is needed
here at all.  The problem is not the interpreter, the problem is on the
userspace BPF side. Until/unless that is fixed, cBPF is a better solution.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-08  4:01             ` Stephen Hemminger
@ 2019-10-08  4:15               ` Jerin Jacob
  2019-10-08  4:22                 ` Stephen Hemminger
  0 siblings, 1 reply; 20+ messages in thread
From: Jerin Jacob @ 2019-10-08  4:15 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dpdk-dev

On Tue, 8 Oct, 2019, 9:31 AM Stephen Hemminger, <stephen@networkplumber.org>
wrote:

> On Tue, 8 Oct 2019 09:17:08 +0530
> Jerin Jacob <jerinjacobk@gmail.com> wrote:
>
> > On Tue, 8 Oct, 2019, 3:15 AM Stephen Hemminger, <
> stephen@networkplumber.org>
> > wrote:
> >
> > > On Tue, 8 Oct 2019 01:03:17 +0530
> > > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > >
> > > > On Mon, 7 Oct, 2019, 11:03 PM Stephen Hemminger, <
> > > stephen@networkplumber.org>
> > > > wrote:
> > > >
> > > > > On Mon, 7 Oct 2019 22:37:43 +0530
> > > > > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > > > >
> > > > > > On Mon, 7 Oct, 2019, 10:23 PM Stephen Hemminger, <
> > > > > stephen@networkplumber.org>
> > > > > > wrote:
> > > > > >
> > > > > > > Simple classic BPF interpreter based off of libpcap.
> > > > > > >
> > > > > > > This is a copy of the BPF interpreter from libpcap which is
> > > > > > > modified to handle mbuf meta data. The existing
> pcap_offline_filter
> > > > > > > does not expose a way to match VLAN tags. Copying the BPF
> > > interpreter
> > > > > > > also means that rte_pdump still does not have a hard dependency
> > > > > > > on libpcap.
> > > > > > >
> > > > > >
> > > > > > Why not use DPDK's librte_bpf library? Rather implementing cBPF
> > > > > > interpreter. Currently it supports eBPF which is super set of
> > > cBPF.if is
> > > > > > this features very specific to cBPF, we clould simply implement
> > > cBPF
> > > > > using
> > > > > > eBPF or implement a new cBPF program type. That scheme could
> leverage
> > > > > > existing JIT infrastructure also. Using JIT will improve
> filtering
> > > > > > performance.
> > > > > >
> > > > > > >
> > > > > > >
> > > > >
> > > > > Because pcap library generates cBPF in its string to BPF compiler.
> > > > > Translating cBPF to eBPF is non trivial.
> > > > >
> > > >
> > > > Then at least cBPF interpreter should move to librte_bpf. We can
> hook to
> > > > JIT if required in future.
> > >
> > > The opcodes for cBPF and eBPF are not compatiable.
> > >
> >
> > Yeah. I am saying to add new program type in bpf library of cBPF.
> Obviously
> > pdump is not the correct place for cBPF interpreter. Moving to rte_libbpf
> > library would help to enable other applications or libraries to use cBPF
> > bpf program class.
>
> The problem is you need a version of string to BPF program which is what
> the libpcap pcap_compile() function does for you. eBPF as used now is all
> about having a full language (CLANG or GCC) and that is not what is needed
> here at all.  The problem is not the interpreter, the problem is on the
> userspace BPF side. Until/unless that is fixed, cBPF is a better solution.
>


I am not saying to use eBPF with libpcap. All I am saying is to move the cBPF
interpreter code (this patch) to librte_bpf, as that is the correct place for
that code from a DPDK point of view, so that it can be used by other
applications or libraries.

>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-08  4:15               ` Jerin Jacob
@ 2019-10-08  4:22                 ` Stephen Hemminger
  2019-10-08 21:08                   ` Morten Brørup
  0 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-08  4:22 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dpdk-dev

On Tue, 8 Oct 2019 09:45:45 +0530
Jerin Jacob <jerinjacobk@gmail.com> wrote:

> On Tue, 8 Oct, 2019, 9:31 AM Stephen Hemminger, <stephen@networkplumber.org>
> wrote:
> 
> > On Tue, 8 Oct 2019 09:17:08 +0530
> > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> >  
> > > On Tue, 8 Oct, 2019, 3:15 AM Stephen Hemminger, <
> > stephen@networkplumber.org>
> > > wrote:
> > >  
> > > > On Tue, 8 Oct 2019 01:03:17 +0530
> > > > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > > >  
> > > > > On Mon, 7 Oct, 2019, 11:03 PM Stephen Hemminger, <
> > > > stephen@networkplumber.org>
> > > > > wrote:
> > > > >  
> > > > > > On Mon, 7 Oct 2019 22:37:43 +0530
> > > > > > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > > > > >  
> > > > > > > On Mon, 7 Oct, 2019, 10:23 PM Stephen Hemminger, <
> > > > > > stephen@networkplumber.org>
> > > > > > > wrote:
> > > > > > >  
> > > > > > > > Simple classic BPF interpreter based off of libpcap.
> > > > > > > >
> > > > > > > > This is a copy of the BPF interpreter from libpcap which is
> > > > > > > > modified to handle mbuf meta data. The existing  
> > pcap_offline_filter  
> > > > > > > > does not expose a way to match VLAN tags. Copying the BPF  
> > > > interpreter  
> > > > > > > > also means that rte_pdump still does not have a hard dependency
> > > > > > > > on libpcap.
> > > > > > > >  
> > > > > > >
> > > > > > > Why not use DPDK's librte_bpf library? Rather implementing cBPF
> > > > > > > interpreter. Currently it supports eBPF which is super set of  
> > > > cBPF.if is  
> > > > > > > this features very specific to cBPF, we clould simply implement  
> > > > cBPF  
> > > > > > using  
> > > > > > > eBPF or implement a new cBPF program type. That scheme could  
> > leverage  
> > > > > > > existing JIT infrastructure also. Using JIT will improve  
> > filtering  
> > > > > > > performance.
> > > > > > >  
> > > > > > > >
> > > > > > > >  
> > > > > >
> > > > > > Because pcap library generates cBPF in its string to BPF compiler.
> > > > > > Translating cBPF to eBPF is non trivial.
> > > > > >  
> > > > >
> > > > > Then at least cBPF interpreter should move to librte_bpf. We can  
> > hook to  
> > > > > JIT if required in future.  
> > > >
> > > > The opcodes for cBPF and eBPF are not compatiable.
> > > >  
> > >
> > > Yeah. I am saying to add new program type in bpf library of cBPF.  
> > Obviously  
> > > pdump is not the correct place for cBPF interpreter. Moving to rte_libbpf
> > > library would help to enable other applications or libraries to use cBPF
> > > bpf program class.  
> >
> > The problem is you need a version of string to BPF program which is what
> > the libpcap pcap_compile() function does for you. eBPF as used now is all
> > about having a full language (CLANG or GCC) and that is not what is needed
> > here at all.  The problem is not the interpreter, the problem is on the
> > userspace BPF side. Until/unless that is fixed, cBPF is a better solution.
> >  
> 
> 
> I am not saying to use eBPF with libpcap. All I am saying to move the cBPF
> interpreter code(this patch) to rte_libbpf as it is the correct place of
> that code in DPDK PoV. So that it can be used by another applications or
> library.
> 
> >  

Sure that make sense?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-08  4:22                 ` Stephen Hemminger
@ 2019-10-08 21:08                   ` Morten Brørup
  2019-10-09  8:21                     ` Ananyev, Konstantin
  0 siblings, 1 reply; 20+ messages in thread
From: Morten Brørup @ 2019-10-08 21:08 UTC (permalink / raw)
  To: Stephen Hemminger, Jerin Jacob; +Cc: dpdk-dev

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Stephen Hemminger
> Sent: Tuesday, October 8, 2019 6:23 AM
> 
> On Tue, 8 Oct 2019 09:45:45 +0530
> Jerin Jacob <jerinjacobk@gmail.com> wrote:
> 
> > On Tue, 8 Oct, 2019, 9:31 AM Stephen Hemminger,
> <stephen@networkplumber.org>
> > wrote:
> >
> > > On Tue, 8 Oct 2019 09:17:08 +0530
> > > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > >
> > > > On Tue, 8 Oct, 2019, 3:15 AM Stephen Hemminger, <
> > > stephen@networkplumber.org>
> > > > wrote:
> > > >
> > > > > On Tue, 8 Oct 2019 01:03:17 +0530
> > > > > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > > > >
> > > > > > On Mon, 7 Oct, 2019, 11:03 PM Stephen Hemminger, <
> > > > > stephen@networkplumber.org>
> > > > > > wrote:
> > > > > >
> > > > > > > On Mon, 7 Oct 2019 22:37:43 +0530
> > > > > > > Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > > > > > >
> > > > > > > > On Mon, 7 Oct, 2019, 10:23 PM Stephen Hemminger, <
> > > > > > > stephen@networkplumber.org>
> > > > > > > > wrote:
> > > > > > > >
> > > > > > > > > Simple classic BPF interpreter based off of libpcap.
> > > > > > > > >
> > > > > > > > > This is a copy of the BPF interpreter from libpcap which is
> > > > > > > > > modified to handle mbuf meta data. The existing
> > > pcap_offline_filter
> > > > > > > > > does not expose a way to match VLAN tags. Copying the BPF
> > > > > interpreter
> > > > > > > > > also means that rte_pdump still does not have a hard
> dependency
> > > > > > > > > on libpcap.
> > > > > > > > >
> > > > > > > >
> > > > > > > > Why not use DPDK's librte_bpf library? Rather implementing
> cBPF
> > > > > > > > interpreter. Currently it supports eBPF which is super set of
> > > > > cBPF.if is
> > > > > > > > this features very specific to cBPF, we clould simply
> implement
> > > > > cBPF
> > > > > > > using
> > > > > > > > eBPF or implement a new cBPF program type. That scheme could
> > > leverage
> > > > > > > > existing JIT infrastructure also. Using JIT will improve
> > > filtering
> > > > > > > > performance.
> > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > >
> > > > > > > Because pcap library generates cBPF in its string to BPF
> compiler.
> > > > > > > Translating cBPF to eBPF is non trivial.
> > > > > > >
> > > > > >
> > > > > > Then at least cBPF interpreter should move to librte_bpf. We can
> > > hook to
> > > > > > JIT if required in future.
> > > > >
> > > > > The opcodes for cBPF and eBPF are not compatiable.
> > > > >
> > > >
> > > > Yeah. I am saying to add new program type in bpf library of cBPF.
> > > Obviously
> > > > pdump is not the correct place for cBPF interpreter. Moving to
> rte_libbpf
> > > > library would help to enable other applications or libraries to use
> cBPF
> > > > bpf program class.
> > >
> > > The problem is you need a version of string to BPF program which is
> what
> > > the libpcap pcap_compile() function does for you. eBPF as used now is
> all
> > > about having a full language (CLANG or GCC) and that is not what is
> needed
> > > here at all.  The problem is not the interpreter, the problem is on the
> > > userspace BPF side. Until/unless that is fixed, cBPF is a better
> solution.
> > >
> >
> >
> > I am not saying to use eBPF with libpcap. All I am saying to move the
> cBPF
> > interpreter code(this patch) to rte_libbpf as it is the correct place of
> > that code in DPDK PoV. So that it can be used by another applications or
> > library.
> >
> > >
> 
> Sure that make sense?

Initially, I would have said yes, because we already implemented our own cBPF interpreter that way. However, we are using it for packet capture only, and I cannot see any other use for it - except perhaps filtered port mirroring, but that is just another form of packet capturing. So it might as well stay with the packet capture library.


And here goes my rant against eBPF:

In my opinion, eBPF and cBPF are two completely different things... If only rte_libbpf was named rte_libebpf. Then we could have the cBPF interpreter as rte_libbpf or rte_libcbpf.

I would like to elaborate on Stephen's comment about the main thing being the integration with userspace:
cBPF has a range of easily accessible tools readily available for use by network operators, such as tcpdump. I consider eBPF for programmers only.

A real life example: Our network appliance provides a GUI. The packet capture feature has a filter field where you can provide a cBPF program in the form of a hex string, which a network operator basically can create by using tcpdump with the right parameters on his laptop. I cannot imagine any network operator sitting down to write an eBPF program for capturing e.g. packets with UDP source port 53 and IP source address 1.1.1.1.


Med venlig hilsen / kind regards
- Morten Brørup


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-08 21:08                   ` Morten Brørup
@ 2019-10-09  8:21                     ` Ananyev, Konstantin
  2019-10-09 14:59                       ` Stephen Hemminger
  0 siblings, 1 reply; 20+ messages in thread
From: Ananyev, Konstantin @ 2019-10-09  8:21 UTC (permalink / raw)
  To: 'Morten Brørup', Stephen Hemminger, Jerin Jacob; +Cc: dpdk-dev



Hi everyone,

> > > > > > > > > > Simple classic BPF interpreter based off of libpcap.
> > > > > > > > > >
> > > > > > > > > > This is a copy of the BPF interpreter from libpcap which is
> > > > > > > > > > modified to handle mbuf meta data. The existing
> > > > pcap_offline_filter
> > > > > > > > > > does not expose a way to match VLAN tags. Copying the BPF
> > > > > > interpreter
> > > > > > > > > > also means that rte_pdump still does not have a hard
> > dependency
> > > > > > > > > > on libpcap.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > Why not use DPDK's librte_bpf library? Rather implementing
> > cBPF
> > > > > > > > > interpreter. Currently it supports eBPF which is super set of
> > > > > > cBPF.if is
> > > > > > > > > this features very specific to cBPF, we clould simply
> > implement
> > > > > > cBPF
> > > > > > > > using
> > > > > > > > > eBPF or implement a new cBPF program type. That scheme could
> > > > leverage
> > > > > > > > > existing JIT infrastructure also. Using JIT will improve
> > > > filtering
> > > > > > > > > performance.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > >
> > > > > > > > Because pcap library generates cBPF in its string to BPF
> > compiler.
> > > > > > > > Translating cBPF to eBPF is non trivial.
> > > > > > > >
> > > > > > >
> > > > > > > Then at least cBPF interpreter should move to librte_bpf. We can
> > > > hook to
> > > > > > > JIT if required in future.
> > > > > >
> > > > > > The opcodes for cBPF and eBPF are not compatiable.
> > > > > >
> > > > >
> > > > > Yeah. I am saying to add new program type in bpf library of cBPF.
> > > > Obviously
> > > > > pdump is not the correct place for cBPF interpreter. Moving to
> > rte_libbpf
> > > > > library would help to enable other applications or libraries to use
> > cBPF
> > > > > bpf program class.
> > > >
> > > > The problem is you need a version of string to BPF program which is
> > what
> > > > the libpcap pcap_compile() function does for you. eBPF as used now is
> > all
> > > > about having a full language (CLANG or GCC) and that is not what is
> > needed
> > > > here at all.  The problem is not the interpreter, the problem is on the
> > > > userspace BPF side. Until/unless that is fixed, cBPF is a better
> > solution.
> > > >
> > >
> > >
> > > I am not saying to use eBPF with libpcap. All I am saying to move the
> > cBPF
> > > interpreter code(this patch) to rte_libbpf as it is the correct place of
> > > that code in DPDK PoV. So that it can be used by another applications or
> > > library.
> > >
> > > >
> >
> > Sure that make sense?

For me yes, what Jerin suggests does make sense.
We can probably extend rte_bpf_load to accept both eBPF and cBPF bytecode.
Or create a new function, cbpf_load(), and make bpf_exec() able to execute both ISAs.
Then the pdump library can support both flavors (eBPF and cBPF).
Stephen, I am not sure I understand - what is your concern with such an approach?

> 
> Initially, I would have said yes, because we already implemented our own cBPF interpreter that way. However, we are using it for packet
> capture only, and I cannot see any other use for it - except perhaps filtered port mirroring, but that is just another form of packet capturing.
> So it might as well stay with the packet capture library.
> 
> 
> And here goes my rant against eBPF:
> 
> In my opinion, eBPF and cBPF are two completely different things... If only rte_libbpf was named rte_libebpf. Then we could have the cBPF
> interpreter as rte_libbpf or rte_libcbpf.

I think we still can have it, see above.

> 
> I would like to elaborate Stephen's comment about the main thing being the integration with userspace:
> cBPF has a range of easily accessible tools readily available for use by network operators, such as tcpdump. I consider eBPF for
> programmers only.
> A real life example: Our network appliance provides a GUI. The packet capture feature has a filter field where you can provide a cBPF
> program in the form of a hex string, which a network operator basically can create by using tcpdump with the right parameters on his
> laptop. I cannot imagine any network operator sitting down to write an eBPF program for capturing e.g. packets with UDP source port 53
> and IP source address 1.1.1.1.

As far as I can tell, your main complaint is not about eBPF itself, but about the lack of eBPF code generation tools...
AFAIK for kernel guys it is not a problem, as in the kernel cBPF bytecode is always converted to eBPF before execution/JIT.
We probably just need the same ability in user-space.

> 
> Med venlig hilsen / kind regards
> - Morten Brørup


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering
  2019-10-09  8:21                     ` Ananyev, Konstantin
@ 2019-10-09 14:59                       ` Stephen Hemminger
  0 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2019-10-09 14:59 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: 'Morten Brørup', Jerin Jacob, dpdk-dev

On Wed, 9 Oct 2019 08:21:42 +0000
"Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:

> Hi everyone,
> 
> > > > > > > > > > > Simple classic BPF interpreter based off of libpcap.
> > > > > > > > > > >
> > > > > > > > > > > This is a copy of the BPF interpreter from libpcap which is
> > > > > > > > > > > modified to handle mbuf meta data. The existing  
> > > > > pcap_offline_filter  
> > > > > > > > > > > does not expose a way to match VLAN tags. Copying the BPF  
> > > > > > > interpreter  
> > > > > > > > > > > also means that rte_pdump still does not have a hard  
> > > dependency  
> > > > > > > > > > > on libpcap.
> > > > > > > > > > >  
> > > > > > > > > >
> > > > > > > > > > Why not use DPDK's librte_bpf library? Rather implementing  
> > > cBPF  
> > > > > > > > > > interpreter. Currently it supports eBPF which is super set of  
> > > > > > > cBPF.if is  
> > > > > > > > > > this features very specific to cBPF, we clould simply  
> > > implement  
> > > > > > > cBPF  
> > > > > > > > > using  
> > > > > > > > > > eBPF or implement a new cBPF program type. That scheme could  
> > > > > leverage  
> > > > > > > > > > existing JIT infrastructure also. Using JIT will improve  
> > > > > filtering  
> > > > > > > > > > performance.
> > > > > > > > > >  
> > > > > > > > > > >
> > > > > > > > > > >  
> > > > > > > > >
> > > > > > > > > Because pcap library generates cBPF in its string to BPF  
> > > compiler.  
> > > > > > > > > Translating cBPF to eBPF is non trivial.
> > > > > > > > >  
> > > > > > > >
> > > > > > > > Then at least cBPF interpreter should move to librte_bpf. We can  
> > > > > hook to  
> > > > > > > > JIT if required in future.  
> > > > > > >
> > > > > > > The opcodes for cBPF and eBPF are not compatiable.
> > > > > > >  
> > > > > >
> > > > > > Yeah. I am saying to add new program type in bpf library of cBPF.  
> > > > > Obviously  
> > > > > > pdump is not the correct place for cBPF interpreter. Moving to  
> > > rte_libbpf  
> > > > > > library would help to enable other applications or libraries to use  
> > > cBPF  
> > > > > > bpf program class.  
> > > > >
> > > > > The problem is you need a version of string to BPF program which is  
> > > what  
> > > > > the libpcap pcap_compile() function does for you. eBPF as used now is  
> > > all  
> > > > > about having a full language (CLANG or GCC) and that is not what is  
> > > needed  
> > > > > here at all.  The problem is not the interpreter, the problem is on the
> > > > > userspace BPF side. Until/unless that is fixed, cBPF is a better  
> > > solution.  
> > > > >  
> > > >
> > > >
> > > > I am not saying to use eBPF with libpcap. All I am saying to move the  
> > > cBPF  
> > > > interpreter code(this patch) to rte_libbpf as it is the correct place of
> > > > that code in DPDK PoV. So that it can be used by another applications or
> > > > library.
> > > >  
> > > > >  
> > >
> > > Sure that make sense?  
> 
> For me yes, what Jerin suggests does make sense.
> We probably can extend rte_bpf_load to accept both ebpf and cbpf bytecode.
> Or create a new function: cbpf_load() and make bpf_exec() to be able to execute both ISA.
> Then pdump library can support both flavors (eBPF and cBPF).
> Stephen, not sure I understand - what is your concern with such approach?
> 
> > 
> > Initially, I would have said yes, because we already implemented our own cBPF interpreter that way. However, we are using it for packet
> > capture only, and I cannot see any other use for it - except perhaps filtered port mirroring, but that is just another form of packet capturing.
> > So it might as well stay with the packet capture library.
> > 
> > 
> > And here goes my rant against eBPF:
> > 
> > In my opinion, eBPF and cBPF are two completely different things... If only rte_libbpf was named rte_libebpf. Then we could have the cBPF
> > interpreter as rte_libbpf or rte_libcbpf.  
> 
> I think we still can have it, see above.
> 
> > 
> > I would like to elaborate Stephen's comment about the main thing being the integration with userspace:
> > cBPF has a range of easily accessible tools readily available for use by network operators, such as tcpdump. I consider eBPF for
> > programmers only.
> > A real life example: Our network appliance provides a GUI. The packet capture feature has a filter field where you can provide a cBPF
> > program in the form of a hex string, which a network operator basically can create by using tcpdump with the right parameters on his
> > laptop. I cannot imagine any network operator sitting down to write an eBPF program for capturing e.g. packets with UDP source port 53
> > and IP source address 1.1.1.1.  
> 
> As I can read your main complaint is not about eBPF  itself, but about luck of eBPF code generation tools...
> AFAIK for  kernel guys it is not a problem, as in kernel cBPF bytecode always converted to eBPF one before execute/JIT.
> Probably we just need the same ability in user-space.

Since the DPDK API needs to copy (to rte_malloc memory) and validate the capture filter,
let's investigate something like net/core/filter.c:bpf_convert_filter in Linux.


^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2019-10-09 14:59 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-07 16:52 [dpdk-dev] [RFC 0/8] Packet Capture enhancements Stephen Hemminger
2019-10-07 16:52 ` [dpdk-dev] [RFC 1/8] pdump: use new pktmbuf copy function Stephen Hemminger
2019-10-07 16:52 ` [dpdk-dev] [RFC 2/8] pdump: use dynamic logtype Stephen Hemminger
2019-10-07 16:52 ` [dpdk-dev] [RFC 3/8] pdump: tag copied mbuf with port Stephen Hemminger
2019-10-07 16:52 ` [dpdk-dev] [RFC 4/8] pdump: stamp packets with current timestamp Stephen Hemminger
2019-10-07 16:52 ` [dpdk-dev] [RFC 5/8] pdump: add classic BPF filtering Stephen Hemminger
2019-10-07 17:07   ` Jerin Jacob
2019-10-07 17:33     ` Stephen Hemminger
2019-10-07 19:33       ` Jerin Jacob
2019-10-07 21:45         ` Stephen Hemminger
2019-10-08  3:47           ` Jerin Jacob
2019-10-08  4:01             ` Stephen Hemminger
2019-10-08  4:15               ` Jerin Jacob
2019-10-08  4:22                 ` Stephen Hemminger
2019-10-08 21:08                   ` Morten Brørup
2019-10-09  8:21                     ` Ananyev, Konstantin
2019-10-09 14:59                       ` Stephen Hemminger
2019-10-07 16:52 ` [dpdk-dev] [RFC 6/8] pdump: add packet header truncation Stephen Hemminger
2019-10-07 16:52 ` [dpdk-dev] [RFC 7/8] pcapng: add new library for writing pcapng files Stephen Hemminger
2019-10-07 16:52 ` [dpdk-dev] [RFC 8/8] app/capture: add packet capture using pcapng Stephen Hemminger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).