From: "Fu, JingguoX" <jingguox.fu@intel.com>
To: "dev@dpdk.org" <dev@dpdk.org>
Subject: Re: [dpdk-dev] [PATCH v3] virtio: Support mergeable buffer in virtio pmd
Date: Wed, 24 Sep 2014 09:22:09 +0000 [thread overview]
Message-ID: <6BD6202160B55B409D423293115822625483B6@SHSMSX101.ccr.corp.intel.com> (raw)
In-Reply-To: <1408006475-17606-1-git-send-email-changchun.ouyang@intel.com>
Tested-by: Jingguo Fu <jingguox.fu at intel.com>
This patch includes 1 file, and has been tested by Intel.
Please see the test information below:
Host:
Fedora 19 x86_64, Linux Kernel 3.9.0, GCC 4.8.2 Intel Xeon CPU E5-2680 v2 @ 2.80GHz
NIC: Intel Niantic 82599, Intel i350, Intel 82580 and Intel 82576
Guest:
Fedora 16 x86_64, Linux Kernel 3.4.2, GCC 4.6.3 Qemu emulator 1.4.2
This patch was tested together with the user space vhost driver library patch.
We verified zero-copy and one-copy test cases for both functionality and performance.
This patch depends on these two patches:
[dpdk-dev] [PATCH] virtio: Update max RX packet length http://www.dpdk.org/ml/archives/dev/2014-September/005107.html
[dpdk-dev] [PATCH] virtio: Fix vring entry number issue http://www.dpdk.org/ml/archives/dev/2014-September/005170.html
-----Original Message-----
From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ouyang Changchun
Sent: Thursday, August 14, 2014 16:55
To: dev@dpdk.org
Subject: [dpdk-dev] [PATCH v3] virtio: Support mergeable buffer in virtio pmd
v3 change:
- Address the comments from Huawei and fix one potential issue of a wrong offset to
the number of descriptors in the buffer; also address other minor comments.
v2 change:
- Resolve conflicts with the tip code;
- And resolve 2 issues:
-- fix mbuf leak when discarding an incomplete packet.
-- refine pkt.data to point to actual payload data start point.
v1 change:
- This patch supports the mergeable buffer feature in the DPDK-based virtio PMD, which can
receive jumbo frames with larger sizes, like 3K, 4K or even 9K.
Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com>
Acked-by: Huawei Xie <huawei.xie@intel.com>
---
lib/librte_pmd_virtio/virtio_ethdev.c | 20 +--
lib/librte_pmd_virtio/virtio_ethdev.h | 3 +
lib/librte_pmd_virtio/virtio_rxtx.c | 221 +++++++++++++++++++++++++++++-----
3 files changed, 207 insertions(+), 37 deletions(-)
diff --git a/lib/librte_pmd_virtio/virtio_ethdev.c b/lib/librte_pmd_virtio/virtio_ethdev.c
index b9f5529..535d798 100644
--- a/lib/librte_pmd_virtio/virtio_ethdev.c
+++ b/lib/librte_pmd_virtio/virtio_ethdev.c
@@ -337,7 +337,7 @@ int virtio_dev_queue_setup(struct rte_eth_dev *dev,
snprintf(vq_name, sizeof(vq_name), "port%d_tvq%d_hdrzone",
dev->data->port_id, queue_idx);
vq->virtio_net_hdr_mz = rte_memzone_reserve_aligned(vq_name,
- vq_size * sizeof(struct virtio_net_hdr),
+ vq_size * hw->vtnet_hdr_size,
socket_id, 0, CACHE_LINE_SIZE);
if (vq->virtio_net_hdr_mz == NULL) {
rte_free(vq);
@@ -346,7 +346,7 @@ int virtio_dev_queue_setup(struct rte_eth_dev *dev,
vq->virtio_net_hdr_mem =
vq->virtio_net_hdr_mz->phys_addr;
memset(vq->virtio_net_hdr_mz->addr, 0,
- vq_size * sizeof(struct virtio_net_hdr));
+ vq_size * hw->vtnet_hdr_size);
} else if (queue_type == VTNET_CQ) {
/* Allocate a page for control vq command, data and status */
snprintf(vq_name, sizeof(vq_name), "port%d_cvq_hdrzone",
@@ -571,9 +571,6 @@ virtio_negotiate_features(struct virtio_hw *hw)
mask |= VIRTIO_NET_F_GUEST_TSO4 | VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN;
mask |= VTNET_LRO_FEATURES;
- /* rx_mbuf should not be in multiple merged segments */
- mask |= VIRTIO_NET_F_MRG_RXBUF;
-
/* not negotiating INDIRECT descriptor table support */
mask |= VIRTIO_RING_F_INDIRECT_DESC;
@@ -746,7 +743,6 @@ eth_virtio_dev_init(__rte_unused struct eth_driver *eth_drv,
}
eth_dev->dev_ops = &virtio_eth_dev_ops;
- eth_dev->rx_pkt_burst = &virtio_recv_pkts;
eth_dev->tx_pkt_burst = &virtio_xmit_pkts;
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
@@ -801,10 +797,13 @@ eth_virtio_dev_init(__rte_unused struct eth_driver *eth_drv,
virtio_negotiate_features(hw);
/* Setting up rx_header size for the device */
- if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF))
+ if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
+ eth_dev->rx_pkt_burst = &virtio_recv_mergeable_pkts;
hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
- else
+ } else {
+ eth_dev->rx_pkt_burst = &virtio_recv_pkts;
hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
+ }
/* Allocate memory for storing MAC addresses */
eth_dev->data->mac_addrs = rte_zmalloc("virtio", ETHER_ADDR_LEN, 0);
@@ -1009,7 +1008,7 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
while ((buf = (struct rte_mbuf *)virtqueue_detatch_unused(
dev->data->rx_queues[i])) != NULL) {
- rte_pktmbuf_free_seg(buf);
+ rte_pktmbuf_free(buf);
mbuf_num++;
}
@@ -1028,7 +1027,8 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
mbuf_num = 0;
while ((buf = (struct rte_mbuf *)virtqueue_detatch_unused(
dev->data->tx_queues[i])) != NULL) {
- rte_pktmbuf_free_seg(buf);
+ rte_pktmbuf_free(buf);
+
mbuf_num++;
}
diff --git a/lib/librte_pmd_virtio/virtio_ethdev.h b/lib/librte_pmd_virtio/virtio_ethdev.h
index 858e644..d2e1eed 100644
--- a/lib/librte_pmd_virtio/virtio_ethdev.h
+++ b/lib/librte_pmd_virtio/virtio_ethdev.h
@@ -104,6 +104,9 @@ int virtio_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
uint16_t virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+
uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
diff --git a/lib/librte_pmd_virtio/virtio_rxtx.c b/lib/librte_pmd_virtio/virtio_rxtx.c
index fcd8bd1..0b10108 100644
--- a/lib/librte_pmd_virtio/virtio_rxtx.c
+++ b/lib/librte_pmd_virtio/virtio_rxtx.c
@@ -146,6 +146,7 @@ static inline int
virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
{
struct vq_desc_extra *dxp;
+ struct virtio_hw *hw = vq->hw;
struct vring_desc *start_dp;
uint16_t needed = 1;
uint16_t head_idx, idx;
@@ -165,9 +166,11 @@ virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
dxp->ndescs = needed;
start_dp = vq->vq_ring.desc;
- start_dp[idx].addr =
- (uint64_t) (cookie->buf_physaddr + RTE_PKTMBUF_HEADROOM - sizeof(struct virtio_net_hdr));
- start_dp[idx].len = cookie->buf_len - RTE_PKTMBUF_HEADROOM + sizeof(struct virtio_net_hdr);
+ start_dp[idx].addr =
+ (uint64_t)(cookie->buf_physaddr + RTE_PKTMBUF_HEADROOM
+ - hw->vtnet_hdr_size);
+ start_dp[idx].len =
+ cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
start_dp[idx].flags = VRING_DESC_F_WRITE;
idx = start_dp[idx].next;
vq->vq_desc_head_idx = idx;
@@ -184,8 +187,10 @@ virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
{
struct vq_desc_extra *dxp;
struct vring_desc *start_dp;
- uint16_t needed = 2;
+ uint16_t seg_num = cookie->pkt.nb_segs;
+ uint16_t needed = 1 + seg_num;
uint16_t head_idx, idx;
+ uint16_t head_size = txvq->hw->vtnet_hdr_size;
if (unlikely(txvq->vq_free_cnt == 0))
return -ENOSPC;
@@ -198,19 +203,25 @@ virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
idx = head_idx;
dxp = &txvq->vq_descx[idx];
if (dxp->cookie != NULL)
- rte_pktmbuf_free_seg(dxp->cookie);
+ rte_pktmbuf_free(dxp->cookie);
dxp->cookie = (void *)cookie;
dxp->ndescs = needed;
start_dp = txvq->vq_ring.desc;
- start_dp[idx].addr =
- txvq->virtio_net_hdr_mem + idx * sizeof(struct virtio_net_hdr);
- start_dp[idx].len = sizeof(struct virtio_net_hdr);
+ start_dp[idx].addr =
+ txvq->virtio_net_hdr_mem + idx * head_size;
+ start_dp[idx].len = (uint32_t)head_size;
start_dp[idx].flags = VRING_DESC_F_NEXT;
- idx = start_dp[idx].next;
- start_dp[idx].addr = RTE_MBUF_DATA_DMA_ADDR(cookie);
- start_dp[idx].len = cookie->pkt.data_len;
- start_dp[idx].flags = 0;
+
+ for (; ((seg_num > 0) && (cookie != NULL)); seg_num--) {
+ idx = start_dp[idx].next;
+ start_dp[idx].addr = RTE_MBUF_DATA_DMA_ADDR(cookie);
+ start_dp[idx].len = cookie->pkt.data_len;
+ start_dp[idx].flags = VRING_DESC_F_NEXT;
+ cookie = cookie->pkt.next;
+ }
+
+ start_dp[idx].flags &= ~VRING_DESC_F_NEXT;
idx = start_dp[idx].next;
txvq->vq_desc_head_idx = idx;
if (txvq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
@@ -284,7 +295,7 @@ virtio_dev_vring_start(struct virtqueue *vq, int queue_type)
error = virtqueue_enqueue_recv_refill(vq, m);
if (error) {
- rte_pktmbuf_free_seg(m);
+ rte_pktmbuf_free(m);
break;
}
nbufs++;
@@ -423,7 +434,7 @@ virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m)
error = virtqueue_enqueue_recv_refill(vq, m);
if (unlikely(error)) {
RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf");
- rte_pktmbuf_free_seg(m);
+ rte_pktmbuf_free(m);
}
}
@@ -433,13 +444,13 @@ uint16_t
virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
struct virtqueue *rxvq = rx_queue;
- struct virtio_hw *hw = rxvq->hw;
struct rte_mbuf *rxm, *new_mbuf;
uint16_t nb_used, num, nb_rx = 0;
uint32_t len[VIRTIO_MBUF_BURST_SZ];
struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
int error;
uint32_t i, nb_enqueued = 0;
+ const uint32_t hdr_size = sizeof(struct virtio_net_hdr);
nb_used = VIRTQUEUE_NUSED(rxvq);
@@ -460,8 +471,7 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
PMD_RX_LOG(DEBUG, "packet len:%d", len[i]);
- if (unlikely(len[i]
- < (uint32_t)hw->vtnet_hdr_size + ETHER_HDR_LEN)) {
+ if (unlikely(len[i] < hdr_size + ETHER_HDR_LEN)) {
PMD_RX_LOG(ERR, "Packet drop");
nb_enqueued++;
virtio_discard_rxbuf(rxvq, rxm);
@@ -471,17 +481,16 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
rxm->pkt.in_port = rxvq->port_id;
rxm->pkt.data = (char *)rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
+
rxm->pkt.nb_segs = 1;
rxm->pkt.next = NULL;
- rxm->pkt.pkt_len = (uint32_t)(len[i]
- - sizeof(struct virtio_net_hdr));
- rxm->pkt.data_len = (uint16_t)(len[i]
- - sizeof(struct virtio_net_hdr));
+ rxm->pkt.pkt_len = (uint32_t)(len[i] - hdr_size);
+ rxm->pkt.data_len = (uint16_t)(len[i] - hdr_size);
VIRTIO_DUMP_PACKET(rxm, rxm->pkt.data_len);
rx_pkts[nb_rx++] = rxm;
- rxvq->bytes += len[i] - sizeof(struct virtio_net_hdr);
+ rxvq->bytes += rx_pkts[nb_rx - 1]->pkt.pkt_len;
}
rxvq->packets += nb_rx;
@@ -498,11 +507,165 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
}
error = virtqueue_enqueue_recv_refill(rxvq, new_mbuf);
if (unlikely(error)) {
- rte_pktmbuf_free_seg(new_mbuf);
+ rte_pktmbuf_free(new_mbuf);
break;
}
nb_enqueued++;
}
+
+ if (likely(nb_enqueued)) {
+ if (unlikely(virtqueue_kick_prepare(rxvq))) {
+ virtqueue_notify(rxvq);
+ PMD_RX_LOG(DEBUG, "Notified\n");
+ }
+ }
+
+ vq_update_avail_idx(rxvq);
+
+ return nb_rx;
+}
+
+uint16_t
+virtio_recv_mergeable_pkts(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct virtqueue *rxvq = rx_queue;
+ struct rte_mbuf *rxm, *new_mbuf;
+ uint16_t nb_used, num, nb_rx = 0;
+ uint32_t len[VIRTIO_MBUF_BURST_SZ];
+ struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
+ struct rte_mbuf *prev;
+ int error;
+ uint32_t i = 0, nb_enqueued = 0;
+ uint32_t seg_num = 0;
+ uint16_t extra_idx = 0;
+ uint32_t seg_res = 0;
+ const uint32_t hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+
+ nb_used = VIRTQUEUE_NUSED(rxvq);
+
+ rmb();
+
+ if (nb_used == 0)
+ return 0;
+
+ PMD_RX_LOG(DEBUG, "used:%d\n", nb_used);
+
+ while (i < nb_used) {
+ struct virtio_net_hdr_mrg_rxbuf *header;
+
+ if (nb_rx == nb_pkts)
+ break;
+
+ num = virtqueue_dequeue_burst_rx(rxvq, rcv_pkts, len, 1);
+ if (num != 1)
+ continue;
+
+ i++;
+
+ PMD_RX_LOG(DEBUG, "dequeue:%d\n", num);
+ PMD_RX_LOG(DEBUG, "packet len:%d\n", len[0]);
+
+ rxm = rcv_pkts[0];
+
+ if (unlikely(len[0] < hdr_size + ETHER_HDR_LEN)) {
+ PMD_RX_LOG(ERR, "Packet drop\n");
+ nb_enqueued++;
+ virtio_discard_rxbuf(rxvq, rxm);
+ rxvq->errors++;
+ continue;
+ }
+
+ header = (struct virtio_net_hdr_mrg_rxbuf *)((char *)rxm->buf_addr +
+ RTE_PKTMBUF_HEADROOM - hdr_size);
+ seg_num = header->num_buffers;
+
+ if (seg_num == 0)
+ seg_num = 1;
+
+ rxm->pkt.data = (char *)rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
+ rxm->pkt.nb_segs = seg_num;
+ rxm->pkt.next = NULL;
+ rxm->pkt.pkt_len = (uint32_t)(len[0] - hdr_size);
+ rxm->pkt.data_len = (uint16_t)(len[0] - hdr_size);
+
+ rxm->pkt.in_port = rxvq->port_id;
+ rx_pkts[nb_rx] = rxm;
+ prev = rxm;
+
+ seg_res = seg_num - 1;
+
+ while (seg_res != 0) {
+ /*
+ * Get extra segments for current uncompleted packet.
+ */
+ uint32_t rcv_cnt =
+ RTE_MIN(seg_res, RTE_DIM(rcv_pkts));
+ if (likely(VIRTQUEUE_NUSED(rxvq) >= rcv_cnt)) {
+ uint32_t rx_num =
+ virtqueue_dequeue_burst_rx(rxvq,
+ rcv_pkts, len, rcv_cnt);
+ i += rx_num;
+ rcv_cnt = rx_num;
+ } else {
+ PMD_RX_LOG(ERR,
+ "No enough segments for packet.\n");
+ nb_enqueued++;
+ virtio_discard_rxbuf(rxvq, rxm);
+ rxvq->errors++;
+ break;
+ }
+
+ extra_idx = 0;
+
+ while (extra_idx < rcv_cnt) {
+ rxm = rcv_pkts[extra_idx];
+
+ rxm->pkt.data =
+ (char *)rxm->buf_addr +
+ RTE_PKTMBUF_HEADROOM - hdr_size;
+ rxm->pkt.next = NULL;
+ rxm->pkt.pkt_len = (uint32_t)(len[extra_idx]);
+ rxm->pkt.data_len = (uint16_t)(len[extra_idx]);
+
+ if (prev)
+ prev->pkt.next = rxm;
+
+ prev = rxm;
+ rx_pkts[nb_rx]->pkt.pkt_len += rxm->pkt.pkt_len;
+ extra_idx++;
+ };
+ seg_res -= rcv_cnt;
+ }
+
+ VIRTIO_DUMP_PACKET(rx_pkts[nb_rx],
+ rx_pkts[nb_rx]->pkt.data_len);
+
+ rxvq->bytes += rx_pkts[nb_rx]->pkt.pkt_len;
+ nb_rx++;
+ }
+
+ rxvq->packets += nb_rx;
+
+ /* Allocate new mbuf for the used descriptor */
+ error = ENOSPC;
+ while (likely(!virtqueue_full(rxvq))) {
+ new_mbuf = rte_rxmbuf_alloc(rxvq->mpool);
+ if (unlikely(new_mbuf == NULL)) {
+ struct rte_eth_dev *dev
+ = &rte_eth_devices[rxvq->port_id];
+ dev->data->rx_mbuf_alloc_failed++;
+ break;
+ }
+ error = virtqueue_enqueue_recv_refill(rxvq, new_mbuf);
+ if (unlikely(error)) {
+ rte_pktmbuf_free(new_mbuf);
+ break;
+ }
+ nb_enqueued++;
+ }
+
if (likely(nb_enqueued)) {
if (unlikely(virtqueue_kick_prepare(rxvq))) {
virtqueue_notify(rxvq);
@@ -536,12 +699,16 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
num = (uint16_t)(likely(nb_used < VIRTIO_MBUF_BURST_SZ) ? nb_used : VIRTIO_MBUF_BURST_SZ);
while (nb_tx < nb_pkts) {
- if (virtqueue_full(txvq) && num) {
+ int need = tx_pkts[nb_tx]->pkt.nb_segs - txvq->vq_free_cnt;
+ int deq_cnt = RTE_MIN(need, (int)num);
+
+ num -= (deq_cnt > 0) ? deq_cnt : 0;
+ while (deq_cnt > 0) {
virtqueue_dequeue_pkt_tx(txvq);
- num--;
+ deq_cnt--;
}
- if (!virtqueue_full(txvq)) {
+ if (tx_pkts[nb_tx]->pkt.nb_segs <= txvq->vq_free_cnt) {
txm = tx_pkts[nb_tx];
/* Enqueue Packet buffers */
error = virtqueue_enqueue_xmit(txvq, txm);
@@ -555,7 +722,7 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
break;
}
nb_tx++;
- txvq->bytes += txm->pkt.data_len;
+ txvq->bytes += txm->pkt.pkt_len;
} else {
PMD_TX_LOG(ERR, "No free tx descriptors to transmit");
break;
--
1.8.4.2
next prev parent reply other threads:[~2014-09-24 9:16 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-08-14 8:54 Ouyang Changchun
2014-08-21 1:23 ` Ouyang, Changchun
2014-08-25 15:16 ` Thomas Monjalon
2014-09-24 9:22 ` Fu, JingguoX [this message]
2014-09-24 9:31 ` Thomas Monjalon
2014-09-24 14:26 ` Ouyang, Changchun
2014-09-25 0:47 ` Fu, JingguoX
2014-09-25 7:12 ` Thomas Monjalon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=6BD6202160B55B409D423293115822625483B6@SHSMSX101.ccr.corp.intel.com \
--to=jingguox.fu@intel.com \
--cc=dev@dpdk.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).