Hi guys, OvS-DPDK has recently had small a change that changed the data room available in an mbuf (commit dfaf00e in OvS). This seems to have had the consequence of breaking the initialisation of eth_af_packets interfaces, when using default values ("options:dpdk- devargs=eth_af_packet0,iface=enp61s0f3"). After investigating, what seems to be happening is that the eth_af_packet dev expects an available space of "2048B - TPACKET2_HDRLEN + sizeof(struct sockaddr_ll) = 2016B" to be available in the data room of each mbuf. Previous to the above commit, OvS would allocate some extra space, and this would mean there would be enough room for the checks performed in eth_rx_queue_setup() and eth_dev_mtu_set() in rte_eth_af_packet.c. However, with the recent commit that isn't the case anymore, and without that extra space the first check in eth_rx_queue_setup() will now be hit and setup of a eth_af_packet interface fails. What I'm trying to understand here is, the logic behind setting a default 'framesz' of 2048B and it being hardcoded (instead of being based on the underlying MTU of the interface, or the mbuf data room directly). The documentation in [1] for mmap() and setting up buffer rings mentions the exact same values (tp_block_size=4096,tp_frame_size=2048), which seem to have been introduced on the first commit, back in 2014. The only constraint for the framesize, it seems, its that it fits inside the blocksize (i.e. doesn't span multiple blocksizes), and is aligned to TPACKET_ALIGNMENT. Thus, given those constraints, could we instead base the setting of the framesize like on the below patch? It basically tries to set the framesize to the MTU of the underlying interface + TPACKET2_HDRLEN + sizeof(structsockaddr_ll), aligned to TPACKET_ALIGNMENT. This would allow applications such as OvS to use this interface by default, based on a framesize set dynamically, instead of relying on a default of 2048B; If one is setting up an eth_af_packet interface with MTU 9000B and the underlying MTU is 1500B, for example, this will still fail, and that's fine as they can always supply the "framesz" argument ("options:dpdk-devargs=eth_af_packet0,iface=enp61s0f3,framesz=9000"). However, on the default case, or when one has configured manually the underlying interface with the expected MTU already, this would work out of the box. Any thoughts on this? Thanks, Tiago. diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c index 95a98c6..451cec3 100644 --- a/drivers/net/af_packet/rte_eth_af_packet.c +++ b/drivers/net/af_packet/rte_eth_af_packet.c @@ -24,6 +24,8 @@ #include <unistd.h> #include <poll.h> +#define RTE_ROUNDUP(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) + #define ETH_AF_PACKET_IFACE_ARG "iface" #define ETH_AF_PACKET_NUM_Q_ARG "qpairs" #define ETH_AF_PACKET_BLOCKSIZE_ARG "blocksz" @@ -535,7 +537,6 @@ rte_pmd_init_internals(struct rte_vdev_device *dev, const int sockfd, const unsigned nb_queues, unsigned int blocksize, - unsigned int blockcnt, unsigned int framesize, unsigned int framecnt, unsigned int qdisc_bypass, @@ -557,6 +558,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev, int rc, tpver, discard; int qsockfd = -1; unsigned int i, q, rdsize; + unsigned int blockcnt; #if defined(PACKET_FANOUT) int fanout_arg; #endif @@ -587,13 +589,6 @@ rte_pmd_init_internals(struct rte_vdev_device *dev, (*internals)->tx_queue[q].map = MAP_FAILED; } - req = &((*internals)->req); - - req->tp_block_size = blocksize; - req->tp_block_nr = blockcnt; - req->tp_frame_size = framesize; - req->tp_frame_nr = framecnt; - ifnamelen = strlen(pair->value); if (ifnamelen < sizeof(ifr.ifr_name)) { memcpy(ifr.ifr_name, pair->value, ifnamelen); @@ -622,6 +617,38 @@ rte_pmd_init_internals(struct rte_vdev_device *dev, return -1; } memcpy(&(*internals)->eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN); + if (!framesize) { + if (ioctl(sockfd, SIOCGIFMTU, &ifr) == -1) { + RTE_LOG(ERR, PMD, + "%s: ioctl failed (SIOCGIFMTU)", + name); + framesize = DFLT_FRAME_SIZE; + } else { + framesize = ifr.ifr_mtu; + framesize += TPACKET2_HDRLEN + sizeof(struct sockaddr_ll); + framesize = RTE_ROUNDUP(framesize, TPACKET_ALIGNMENT); + } + } + + blockcnt = framecnt / (blocksize / framesize); + if (!blockcnt) { + RTE_LOG(ERR, PMD, + "%s: invalid AF_PACKET MMAP parameters\n", name); + return -1; + } + + RTE_LOG(INFO, PMD, "%s: AF_PACKET MMAP parameters:\n", name); + RTE_LOG(INFO, PMD, "%s:\tblock size %d\n", name, blocksize); + RTE_LOG(INFO, PMD, "%s:\tblock count %d\n", name, blockcnt); + RTE_LOG(INFO, PMD, "%s:\tframe size %d\n", name, framesize); + RTE_LOG(INFO, PMD, "%s:\tframe count %d\n", name, framecnt); + + req = &((*internals)->req); + + req->tp_block_size = blocksize; + req->tp_block_nr = blockcnt; + req->tp_frame_nr = framecnt; + req->tp_frame_size = framesize; memset(&sockaddr, 0, sizeof(sockaddr)); sockaddr.sll_family = AF_PACKET; @@ -811,9 +838,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev, struct rte_eth_dev *eth_dev = NULL; struct rte_kvargs_pair *pair = NULL; unsigned k_idx; - unsigned int blockcount; unsigned int blocksize = DFLT_BLOCK_SIZE; - unsigned int framesize = DFLT_FRAME_SIZE; + unsigned int framesize = 0; unsigned int framecount = DFLT_FRAME_COUNT; unsigned int qpairs = 1; unsigned int qdisc_bypass = 1; @@ -887,21 +913,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev, return -1; } - blockcount = framecount / (blocksize / framesize); - if (!blockcount) { - PMD_LOG(ERR, - "%s: invalid AF_PACKET MMAP parameters", name); - return -1; - } - - PMD_LOG(INFO, "%s: AF_PACKET MMAP parameters:", name); - PMD_LOG(INFO, "%s:\tblock size %d", name, blocksize); - PMD_LOG(INFO, "%s:\tblock count %d", name, blockcount); - PMD_LOG(INFO, "%s:\tframe size %d", name, framesize); - PMD_LOG(INFO, "%s:\tframe count %d", name, framecount); - if (rte_pmd_init_internals(dev, *sockfd, qpairs, - blocksize, blockcount, + blocksize, framesize, framecount, qdisc_bypass, &internals, ð_dev,
On 15-Nov-18 7:02 PM, Lam, Tiago wrote:
> Hi guys,
>
> OvS-DPDK has recently had small a change that changed the data room
> available in an mbuf (commit dfaf00e in OvS). This seems to have had the
> consequence of breaking the initialisation of eth_af_packets interfaces,
> when using default values ("options:dpdk-
> devargs=eth_af_packet0,iface=enp61s0f3").
>
> After investigating, what seems to be happening is that the
> eth_af_packet dev expects an available space of "2048B - TPACKET2_HDRLEN
> + sizeof(struct sockaddr_ll) = 2016B" to be available in the data room
> of each mbuf. Previous to the above commit, OvS would allocate some
> extra space, and this would mean there would be enough room for the
> checks performed in eth_rx_queue_setup() and eth_dev_mtu_set() in
> rte_eth_af_packet.c. However, with the recent commit that isn't the case
> anymore, and without that extra space the first check in
> eth_rx_queue_setup() will now be hit and setup of a eth_af_packet
> interface fails.
>
> What I'm trying to understand here is, the logic behind setting a
> default 'framesz' of 2048B and it being hardcoded (instead of being
> based on the underlying MTU of the interface, or the mbuf data room
> directly). The documentation in [1] for mmap() and setting up buffer
> rings mentions the exact same values
> (tp_block_size=4096,tp_frame_size=2048), which seem to have been
> introduced on the first commit, back in 2014. The only constraint
> for the framesize, it seems, its that it fits inside the blocksize (i.e.
> doesn't span multiple blocksizes), and is aligned to TPACKET_ALIGNMENT.
I think the reason is no one bothered to do it properly (same with
detecting NUMA node locality for AF_PACKET interfaces - they all report
as being on socket 0, even though it's possible to get this info from
sysfs and set it correctly).
--
Thanks,
Anatoly
On Thu, Nov 15, 2018 at 07:02:37PM +0000, Lam, Tiago wrote: > Hi guys, > > OvS-DPDK has recently had small a change that changed the data room > available in an mbuf (commit dfaf00e in OvS). This seems to have had the > consequence of breaking the initialisation of eth_af_packets interfaces, > when using default values ("options:dpdk- > devargs=eth_af_packet0,iface=enp61s0f3"). > > After investigating, what seems to be happening is that the > eth_af_packet dev expects an available space of "2048B - TPACKET2_HDRLEN > + sizeof(struct sockaddr_ll) = 2016B" to be available in the data room > of each mbuf. Previous to the above commit, OvS would allocate some > extra space, and this would mean there would be enough room for the > checks performed in eth_rx_queue_setup() and eth_dev_mtu_set() in > rte_eth_af_packet.c. However, with the recent commit that isn't the case > anymore, and without that extra space the first check in > eth_rx_queue_setup() will now be hit and setup of a eth_af_packet > interface fails. > > What I'm trying to understand here is, the logic behind setting a > default 'framesz' of 2048B and it being hardcoded (instead of being > based on the underlying MTU of the interface, or the mbuf data room > directly). The documentation in [1] for mmap() and setting up buffer > rings mentions the exact same values > (tp_block_size=4096,tp_frame_size=2048), which seem to have been > introduced on the first commit, back in 2014. The only constraint > for the framesize, it seems, its that it fits inside the blocksize (i.e. > doesn't span multiple blocksizes), and is aligned to TPACKET_ALIGNMENT. > While I can't comment on the af_packet driver, the reason why in DPDK you need to set your packet buffer size to 2k + headroom, is because the buffers sizes specified to individual NICs can sometimes only be specified in a course-grained manner. For example, if you check https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82599-10-gbe-controller-datasheet.pdf for the SRRCTL register, you will see that the buffer size can only be specified in units of 1k. Therefore, when you give the driver a buffer of exactly 2k, and the driver subtracts the headroom space, the actual space writeable by the NIC is below 2k - meaning that the NIC gets told it only has a 1k buffer. This then would lead to 1500-byte packets getting split unnecessarily into two buffers. So the upshot is that any DPDK application needs to allocate buffers of size 2k + HEADROOM + sizeof(struct rte_mbuf). Using buffers of only 2k will not work as expected for some NICs. If OVS is now using 2k buffers, rather than 2k + 256bytes, it will have problems, I think, and should be changed back to using the slightly larger buffer size. Regards, /Bruce
On 11/15/2018 7:02 PM, Lam, Tiago wrote: > Hi guys, > > OvS-DPDK has recently had small a change that changed the data room > available in an mbuf (commit dfaf00e in OvS). This seems to have had the > consequence of breaking the initialisation of eth_af_packets interfaces, > when using default values ("options:dpdk- > devargs=eth_af_packet0,iface=enp61s0f3"). > > After investigating, what seems to be happening is that the > eth_af_packet dev expects an available space of "2048B - TPACKET2_HDRLEN > + sizeof(struct sockaddr_ll) = 2016B" to be available in the data room > of each mbuf. Previous to the above commit, OvS would allocate some > extra space, and this would mean there would be enough room for the > checks performed in eth_rx_queue_setup() and eth_dev_mtu_set() in > rte_eth_af_packet.c. However, with the recent commit that isn't the case > anymore, and without that extra space the first check in > eth_rx_queue_setup() will now be hit and setup of a eth_af_packet > interface fails. > > What I'm trying to understand here is, the logic behind setting a > default 'framesz' of 2048B and it being hardcoded (instead of being > based on the underlying MTU of the interface, or the mbuf data room > directly). The documentation in [1] for mmap() and setting up buffer > rings mentions the exact same values > (tp_block_size=4096,tp_frame_size=2048), which seem to have been > introduced on the first commit, back in 2014. The only constraint > for the framesize, it seems, its that it fits inside the blocksize (i.e. > doesn't span multiple blocksizes), and is aligned to TPACKET_ALIGNMENT. Independent from Bruce's comment to not use smaller mbuf size, I believe af_packet can be updated to be more flexible. Related to 'framesize', it has default value 2048 but can be updated via 'framesz=' devarg, so not exactly hardcoded. Your change looks good there, use devarg value if exist, get from underlying device and set default if it fails. A few comment below. > > Thus, given those constraints, could we instead base the setting of the > framesize like on the below patch? It basically tries to set the > framesize to the MTU of the underlying interface + TPACKET2_HDRLEN + > sizeof(structsockaddr_ll), aligned to TPACKET_ALIGNMENT. This would > allow applications such as OvS to use this interface by default, based > on a framesize set dynamically, instead of relying on a default of > 2048B; If one is setting up an eth_af_packet interface with MTU 9000B > and the underlying MTU is 1500B, for example, this will still fail, and > that's fine as they can always supply the "framesz" argument > ("options:dpdk-devargs=eth_af_packet0,iface=enp61s0f3,framesz=9000"). > However, on the default case, or when one has configured manually the > underlying interface with the expected MTU already, this would work out > of the box. > > Any thoughts on this? > > Thanks, > Tiago. > > diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c > index 95a98c6..451cec3 100644 > --- a/drivers/net/af_packet/rte_eth_af_packet.c > +++ b/drivers/net/af_packet/rte_eth_af_packet.c > @@ -24,6 +24,8 @@ > #include <unistd.h> > #include <poll.h> > > +#define RTE_ROUNDUP(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) This can go into more generic header <...> > @@ -622,6 +617,38 @@ rte_pmd_init_internals(struct rte_vdev_device *dev, > return -1; > } > memcpy(&(*internals)->eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN); > + if (!framesize) { > + if (ioctl(sockfd, SIOCGIFMTU, &ifr) == -1) { > + RTE_LOG(ERR, PMD, > + "%s: ioctl failed (SIOCGIFMTU)", > + name); > + framesize = DFLT_FRAME_SIZE; > + } else { > + framesize = ifr.ifr_mtu; > + framesize += TPACKET2_HDRLEN + sizeof(struct sockaddr_ll); > + framesize = RTE_ROUNDUP(framesize, TPACKET_ALIGNMENT); > + } > + } > + > + blockcnt = framecnt / (blocksize / framesize); If returned framesize > blocksize, result will be 0 since both are uint, which will end framecnt/0 and crash. <...> > @@ -887,21 +913,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev, > return -1; > } > > - blockcount = framecount / (blocksize / framesize); > - if (!blockcount) { > - PMD_LOG(ERR, > - "%s: invalid AF_PACKET MMAP parameters", name); > - return -1; > - } > - > - PMD_LOG(INFO, "%s: AF_PACKET MMAP parameters:", name); > - PMD_LOG(INFO, "%s:\tblock size %d", name, blocksize); > - PMD_LOG(INFO, "%s:\tblock count %d", name, blockcount); > - PMD_LOG(INFO, "%s:\tframe size %d", name, framesize); > - PMD_LOG(INFO, "%s:\tframe count %d", name, framecount); > - Why move this block or change rte_pmd_init_internals(), adding SIOCGIFMTU above this block looks simpler.
On 16/11/2018 17:20, Ferruh Yigit wrote: > On 11/15/2018 7:02 PM, Lam, Tiago wrote: >> Hi guys, >> >> OvS-DPDK has recently had small a change that changed the data room >> available in an mbuf (commit dfaf00e in OvS). This seems to have had the >> consequence of breaking the initialisation of eth_af_packets interfaces, >> when using default values ("options:dpdk- >> devargs=eth_af_packet0,iface=enp61s0f3"). >> >> After investigating, what seems to be happening is that the >> eth_af_packet dev expects an available space of "2048B - TPACKET2_HDRLEN >> + sizeof(struct sockaddr_ll) = 2016B" to be available in the data room >> of each mbuf. Previous to the above commit, OvS would allocate some >> extra space, and this would mean there would be enough room for the >> checks performed in eth_rx_queue_setup() and eth_dev_mtu_set() in >> rte_eth_af_packet.c. However, with the recent commit that isn't the case >> anymore, and without that extra space the first check in >> eth_rx_queue_setup() will now be hit and setup of a eth_af_packet >> interface fails. >> >> What I'm trying to understand here is, the logic behind setting a >> default 'framesz' of 2048B and it being hardcoded (instead of being >> based on the underlying MTU of the interface, or the mbuf data room >> directly). The documentation in [1] for mmap() and setting up buffer >> rings mentions the exact same values >> (tp_block_size=4096,tp_frame_size=2048), which seem to have been >> introduced on the first commit, back in 2014. The only constraint >> for the framesize, it seems, its that it fits inside the blocksize (i.e. >> doesn't span multiple blocksizes), and is aligned to TPACKET_ALIGNMENT. > > Independent from Bruce's comment to not use smaller mbuf size, I believe > af_packet can be updated to be more flexible. > > Related to 'framesize', it has default value 2048 but can be updated via > 'framesz=' devarg, so not exactly hardcoded. Your change looks good there, > use devarg value if exist, get from underlying device and set default if it fails. > > A few comment below. Thanks for the review, Ferruh. This was only a proposal, but since you reviewed it, I've addressed the comments and sent a formal patch: http://patchwork.dpdk.org/patch/48200/ Thanks, Tiago. > >> >> Thus, given those constraints, could we instead base the setting of the >> framesize like on the below patch? It basically tries to set the >> framesize to the MTU of the underlying interface + TPACKET2_HDRLEN + >> sizeof(structsockaddr_ll), aligned to TPACKET_ALIGNMENT. This would >> allow applications such as OvS to use this interface by default, based >> on a framesize set dynamically, instead of relying on a default of >> 2048B; If one is setting up an eth_af_packet interface with MTU 9000B >> and the underlying MTU is 1500B, for example, this will still fail, and >> that's fine as they can always supply the "framesz" argument >> ("options:dpdk-devargs=eth_af_packet0,iface=enp61s0f3,framesz=9000"). >> However, on the default case, or when one has configured manually the >> underlying interface with the expected MTU already, this would work out >> of the box. >> >> Any thoughts on this? >> >> Thanks, >> Tiago. >> >> diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c >> index 95a98c6..451cec3 100644 >> --- a/drivers/net/af_packet/rte_eth_af_packet.c >> +++ b/drivers/net/af_packet/rte_eth_af_packet.c >> @@ -24,6 +24,8 @@ >> #include <unistd.h> >> #include <poll.h> >> >> +#define RTE_ROUNDUP(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) > > This can go into more generic header > > <...> > >> @@ -622,6 +617,38 @@ rte_pmd_init_internals(struct rte_vdev_device *dev, >> return -1; >> } >> memcpy(&(*internals)->eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN); >> + if (!framesize) { >> + if (ioctl(sockfd, SIOCGIFMTU, &ifr) == -1) { >> + RTE_LOG(ERR, PMD, >> + "%s: ioctl failed (SIOCGIFMTU)", >> + name); >> + framesize = DFLT_FRAME_SIZE; >> + } else { >> + framesize = ifr.ifr_mtu; >> + framesize += TPACKET2_HDRLEN + sizeof(struct sockaddr_ll); >> + framesize = RTE_ROUNDUP(framesize, TPACKET_ALIGNMENT); >> + } >> + } >> + >> + blockcnt = framecnt / (blocksize / framesize); > > If returned framesize > blocksize, result will be 0 since both are uint, which > will end framecnt/0 and crash. > > <...> > >> @@ -887,21 +913,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev, >> return -1; >> } >> >> - blockcount = framecount / (blocksize / framesize); >> - if (!blockcount) { >> - PMD_LOG(ERR, >> - "%s: invalid AF_PACKET MMAP parameters", name); >> - return -1; >> - } >> - >> - PMD_LOG(INFO, "%s: AF_PACKET MMAP parameters:", name); >> - PMD_LOG(INFO, "%s:\tblock size %d", name, blocksize); >> - PMD_LOG(INFO, "%s:\tblock count %d", name, blockcount); >> - PMD_LOG(INFO, "%s:\tframe size %d", name, framesize); >> - PMD_LOG(INFO, "%s:\tframe count %d", name, framecount); >> - > > Why move this block or change rte_pmd_init_internals(), adding SIOCGIFMTU above > this block looks simpler. >