DPDK patches and discussions
* [dpdk-dev] [PATCH] ring: empty and count optimizations
@ 2020-05-13 15:31 Morten Brørup
  2020-05-13 15:35 ` Morten Brørup
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Morten Brørup @ 2020-05-13 15:31 UTC (permalink / raw)
  To: olivier.matz, konstantin.ananyev, Honnappa.Nagarahalli, nd
  Cc: dev, Morten Brørup

Testing if the ring is empty is as simple as comparing the producer and
consumer pointers.
In theory, this optimization reduces the number of potential cache misses
from 3 to 2 by not having to read r->mask in rte_ring_count().

It is not possible to enqueue more elements than the capacity of a ring,
so the capacity comparison is a safeguard for observer threads only.
Instead of completely removing the comparison, I have reorganized it to
resemble the other trigraphs in the ring library and added a likely().

The modification of these two functions was discussed in the RFC here:
https://mails.dpdk.org/archives/dev/2020-April/165752.html

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/librte_ring/rte_ring.h | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
index 86faede81..74a8fcdc8 100644
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -55,7 +55,7 @@ extern "C" {
  *   - The memory size needed for the ring on success.
  *   - -EINVAL if count is not a power of 2.
  */
-ssize_t rte_ring_get_memsize(unsigned count);
+ssize_t rte_ring_get_memsize(unsigned int count);
 
 /**
  * Initialize a ring structure.
@@ -109,8 +109,8 @@ ssize_t rte_ring_get_memsize(unsigned count);
  * @return
  *   0 on success, or a negative value on error.
  */
-int rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
-	unsigned flags);
+int rte_ring_init(struct rte_ring *r, const char *name, unsigned int count,
+	unsigned int flags);
 
 /**
  * Create a new ring named *name* in memory.
@@ -169,8 +169,8 @@ int rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
  *    - EEXIST - a memzone with the same name already exists
  *    - ENOMEM - no appropriate memory area found in which to create memzone
  */
-struct rte_ring *rte_ring_create(const char *name, unsigned count,
-				 int socket_id, unsigned flags);
+struct rte_ring *rte_ring_create(const char *name, unsigned int count,
+				 int socket_id, unsigned int flags);
 
 /**
  * De-allocate all memory used by the ring.
@@ -199,7 +199,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
 	uint32_t idx = prod_head & (r)->mask; \
 	obj_type *ring = (obj_type *)ring_start; \
 	if (likely(idx + n < size)) { \
-		for (i = 0; i < (n & ((~(unsigned)0x3))); i+=4, idx+=4) { \
+		for (i = 0; i < (n & ((~(unsigned int)0x3))); i+=4, idx+=4) { \
 			ring[idx] = obj_table[i]; \
 			ring[idx+1] = obj_table[i+1]; \
 			ring[idx+2] = obj_table[i+2]; \
@@ -230,7 +230,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
 	const uint32_t size = (r)->size; \
 	obj_type *ring = (obj_type *)ring_start; \
 	if (likely(idx + n < size)) { \
-		for (i = 0; i < (n & (~(unsigned)0x3)); i+=4, idx+=4) {\
+		for (i = 0; i < (n & (~(unsigned int)0x3)); i+=4, idx+=4) {\
 			obj_table[i] = ring[idx]; \
 			obj_table[i+1] = ring[idx+1]; \
 			obj_table[i+2] = ring[idx+2]; \
@@ -683,13 +683,13 @@ rte_ring_reset(struct rte_ring *r);
  * @return
  *   The number of entries in the ring.
  */
-static inline unsigned
+static inline unsigned int
 rte_ring_count(const struct rte_ring *r)
 {
 	uint32_t prod_tail = r->prod.tail;
 	uint32_t cons_tail = r->cons.tail;
 	uint32_t count = (prod_tail - cons_tail) & r->mask;
-	return (count > r->capacity) ? r->capacity : count;
+	return likely(count <= r->capacity) ? count : r->capacity;
 }
 
 /**
@@ -700,7 +700,7 @@ rte_ring_count(const struct rte_ring *r)
  * @return
  *   The number of free entries in the ring.
  */
-static inline unsigned
+static inline unsigned int
 rte_ring_free_count(const struct rte_ring *r)
 {
 	return r->capacity - rte_ring_count(r);
@@ -733,7 +733,9 @@ rte_ring_full(const struct rte_ring *r)
 static inline int
 rte_ring_empty(const struct rte_ring *r)
 {
-	return rte_ring_count(r) == 0;
+	uint32_t prod_tail = r->prod.tail;
+	uint32_t cons_tail = r->cons.tail;
+	return cons_tail == prod_tail;
 }
 
 /**
@@ -860,7 +862,7 @@ struct rte_ring *rte_ring_lookup(const char *name);
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -883,7 +885,7 @@ rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -910,7 +912,7 @@ rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 		      unsigned int n, unsigned int *free_space)
 {
@@ -954,7 +956,7 @@ rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -979,7 +981,7 @@ rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -1006,7 +1008,7 @@ rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
  * @return
  *   - Number of objects dequeued
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
-- 
2.17.1



* [dpdk-dev] [PATCH] ring: empty and count optimizations
  2020-05-13 15:31 [dpdk-dev] [PATCH] ring: empty and count optimizations Morten Brørup
@ 2020-05-13 15:35 ` Morten Brørup
  2020-05-13 17:08 ` Morten Brørup
  2020-05-19 15:27 ` [dpdk-dev] [PATCH 0/2] ring: empty optimization Morten Brørup
  2 siblings, 0 replies; 16+ messages in thread
From: Morten Brørup @ 2020-05-13 15:35 UTC (permalink / raw)
  To: olivier.matz, konstantin.ananyev, Honnappa.Nagarahalli, nd
  Cc: dev, Morten Brørup

Testing if the ring is empty is as simple as comparing the producer and
consumer pointers.
In theory, this optimization reduces the number of potential cache misses
from 3 to 2 by not having to read r->mask in rte_ring_count().

It is not possible to enqueue more elements than the capacity of a ring,
so the capacity comparison is a safeguard for observer threads only.
Instead of completely removing the comparison, I have reorganized it to
resemble the other trigraphs in the ring library and added a likely().

The modification of these two functions was discussed in the RFC here:
https://mails.dpdk.org/archives/dev/2020-April/165752.html

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/librte_ring/rte_ring.h | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
index 86faede81..74a8fcdc8 100644
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -55,7 +55,7 @@ extern "C" {
  *   - The memory size needed for the ring on success.
  *   - -EINVAL if count is not a power of 2.
  */
-ssize_t rte_ring_get_memsize(unsigned count);
+ssize_t rte_ring_get_memsize(unsigned int count);
 
 /**
  * Initialize a ring structure.
@@ -109,8 +109,8 @@ ssize_t rte_ring_get_memsize(unsigned count);
  * @return
  *   0 on success, or a negative value on error.
  */
-int rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
-	unsigned flags);
+int rte_ring_init(struct rte_ring *r, const char *name, unsigned int count,
+	unsigned int flags);
 
 /**
  * Create a new ring named *name* in memory.
@@ -169,8 +169,8 @@ int rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
  *    - EEXIST - a memzone with the same name already exists
  *    - ENOMEM - no appropriate memory area found in which to create memzone
  */
-struct rte_ring *rte_ring_create(const char *name, unsigned count,
-				 int socket_id, unsigned flags);
+struct rte_ring *rte_ring_create(const char *name, unsigned int count,
+				 int socket_id, unsigned int flags);
 
 /**
  * De-allocate all memory used by the ring.
@@ -199,7 +199,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
 	uint32_t idx = prod_head & (r)->mask; \
 	obj_type *ring = (obj_type *)ring_start; \
 	if (likely(idx + n < size)) { \
-		for (i = 0; i < (n & ((~(unsigned)0x3))); i+=4, idx+=4) { \
+		for (i = 0; i < (n & ((~(unsigned int)0x3))); i+=4, idx+=4) { \
 			ring[idx] = obj_table[i]; \
 			ring[idx+1] = obj_table[i+1]; \
 			ring[idx+2] = obj_table[i+2]; \
@@ -230,7 +230,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
 	const uint32_t size = (r)->size; \
 	obj_type *ring = (obj_type *)ring_start; \
 	if (likely(idx + n < size)) { \
-		for (i = 0; i < (n & (~(unsigned)0x3)); i+=4, idx+=4) {\
+		for (i = 0; i < (n & (~(unsigned int)0x3)); i+=4, idx+=4) {\
 			obj_table[i] = ring[idx]; \
 			obj_table[i+1] = ring[idx+1]; \
 			obj_table[i+2] = ring[idx+2]; \
@@ -683,13 +683,13 @@ rte_ring_reset(struct rte_ring *r);
  * @return
  *   The number of entries in the ring.
  */
-static inline unsigned
+static inline unsigned int
 rte_ring_count(const struct rte_ring *r)
 {
 	uint32_t prod_tail = r->prod.tail;
 	uint32_t cons_tail = r->cons.tail;
 	uint32_t count = (prod_tail - cons_tail) & r->mask;
-	return (count > r->capacity) ? r->capacity : count;
+	return likely(count <= r->capacity) ? count : r->capacity;
 }
 
 /**
@@ -700,7 +700,7 @@ rte_ring_count(const struct rte_ring *r)
  * @return
  *   The number of free entries in the ring.
  */
-static inline unsigned
+static inline unsigned int
 rte_ring_free_count(const struct rte_ring *r)
 {
 	return r->capacity - rte_ring_count(r);
@@ -733,7 +733,9 @@ rte_ring_full(const struct rte_ring *r)
 static inline int
 rte_ring_empty(const struct rte_ring *r)
 {
-	return rte_ring_count(r) == 0;
+	uint32_t prod_tail = r->prod.tail;
+	uint32_t cons_tail = r->cons.tail;
+	return cons_tail == prod_tail;
 }
 
 /**
@@ -860,7 +862,7 @@ struct rte_ring *rte_ring_lookup(const char *name);
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -883,7 +885,7 @@ rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -910,7 +912,7 @@ rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 		      unsigned int n, unsigned int *free_space)
 {
@@ -954,7 +956,7 @@ rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -979,7 +981,7 @@ rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -1006,7 +1008,7 @@ rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
  * @return
  *   - Number of objects dequeued
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
-- 
2.17.1



* [dpdk-dev] [PATCH] ring: empty and count optimizations
  2020-05-13 15:31 [dpdk-dev] [PATCH] ring: empty and count optimizations Morten Brørup
  2020-05-13 15:35 ` Morten Brørup
@ 2020-05-13 17:08 ` Morten Brørup
  2020-05-14 12:23   ` Ananyev, Konstantin
  2020-05-19 15:27 ` [dpdk-dev] [PATCH 0/2] ring: empty optimization Morten Brørup
  2 siblings, 1 reply; 16+ messages in thread
From: Morten Brørup @ 2020-05-13 17:08 UTC (permalink / raw)
  To: olivier.matz, konstantin.ananyev, Honnappa.Nagarahalli, nd
  Cc: dev, Morten Brørup

Testing if the ring is empty is as simple as comparing the producer and
consumer pointers.
In theory, this optimization reduces the number of potential cache misses
from 3 to 2 by not having to read r->mask in rte_ring_count().

It is not possible to enqueue more elements than the capacity of a ring,
so the capacity comparison is a safeguard for observer threads only.
Instead of completely removing the comparison, I have reorganized it to
resemble the other trigraphs in the ring library and added a likely().

The modification of these two functions was discussed in the RFC here:
https://mails.dpdk.org/archives/dev/2020-April/165752.html

Also fixed some existing code not passing checkpatch.

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/librte_ring/rte_ring.h | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
index 86faede81..36438d9cd 100644
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -55,7 +55,7 @@ extern "C" {
  *   - The memory size needed for the ring on success.
  *   - -EINVAL if count is not a power of 2.
  */
-ssize_t rte_ring_get_memsize(unsigned count);
+ssize_t rte_ring_get_memsize(unsigned int count);
 
 /**
  * Initialize a ring structure.
@@ -109,8 +109,8 @@ ssize_t rte_ring_get_memsize(unsigned count);
  * @return
  *   0 on success, or a negative value on error.
  */
-int rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
-	unsigned flags);
+int rte_ring_init(struct rte_ring *r, const char *name, unsigned int count,
+	unsigned int flags);
 
 /**
  * Create a new ring named *name* in memory.
@@ -169,8 +169,8 @@ int rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
  *    - EEXIST - a memzone with the same name already exists
  *    - ENOMEM - no appropriate memory area found in which to create memzone
  */
-struct rte_ring *rte_ring_create(const char *name, unsigned count,
-				 int socket_id, unsigned flags);
+struct rte_ring *rte_ring_create(const char *name, unsigned int count,
+				 int socket_id, unsigned int flags);
 
 /**
  * De-allocate all memory used by the ring.
@@ -199,7 +199,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
 	uint32_t idx = prod_head & (r)->mask; \
 	obj_type *ring = (obj_type *)ring_start; \
 	if (likely(idx + n < size)) { \
-		for (i = 0; i < (n & ((~(unsigned)0x3))); i+=4, idx+=4) { \
+		for (i = 0; i < (n & ~0x3); i += 4, idx += 4) { \
 			ring[idx] = obj_table[i]; \
 			ring[idx+1] = obj_table[i+1]; \
 			ring[idx+2] = obj_table[i+2]; \
@@ -230,7 +230,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
 	const uint32_t size = (r)->size; \
 	obj_type *ring = (obj_type *)ring_start; \
 	if (likely(idx + n < size)) { \
-		for (i = 0; i < (n & (~(unsigned)0x3)); i+=4, idx+=4) {\
+		for (i = 0; i < (n & ~0x3); i += 4, idx += 4) {\
 			obj_table[i] = ring[idx]; \
 			obj_table[i+1] = ring[idx+1]; \
 			obj_table[i+2] = ring[idx+2]; \
@@ -683,13 +683,13 @@ rte_ring_reset(struct rte_ring *r);
  * @return
  *   The number of entries in the ring.
  */
-static inline unsigned
+static inline unsigned int
 rte_ring_count(const struct rte_ring *r)
 {
 	uint32_t prod_tail = r->prod.tail;
 	uint32_t cons_tail = r->cons.tail;
 	uint32_t count = (prod_tail - cons_tail) & r->mask;
-	return (count > r->capacity) ? r->capacity : count;
+	return likely(count <= r->capacity) ? count : r->capacity;
 }
 
 /**
@@ -700,7 +700,7 @@ rte_ring_count(const struct rte_ring *r)
  * @return
  *   The number of free entries in the ring.
  */
-static inline unsigned
+static inline unsigned int
 rte_ring_free_count(const struct rte_ring *r)
 {
 	return r->capacity - rte_ring_count(r);
@@ -733,7 +733,9 @@ rte_ring_full(const struct rte_ring *r)
 static inline int
 rte_ring_empty(const struct rte_ring *r)
 {
-	return rte_ring_count(r) == 0;
+	uint32_t prod_tail = r->prod.tail;
+	uint32_t cons_tail = r->cons.tail;
+	return cons_tail == prod_tail;
 }
 
 /**
@@ -860,7 +862,7 @@ struct rte_ring *rte_ring_lookup(const char *name);
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -883,7 +885,7 @@ rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -910,7 +912,7 @@ rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 		      unsigned int n, unsigned int *free_space)
 {
@@ -954,7 +956,7 @@ rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -979,7 +981,7 @@ rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -1006,7 +1008,7 @@ rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
  * @return
  *   - Number of objects dequeued
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
-- 
2.17.1



* Re: [dpdk-dev] [PATCH] ring: empty and count optimizations
  2020-05-13 17:08 ` Morten Brørup
@ 2020-05-14 12:23   ` Ananyev, Konstantin
  2020-05-14 13:45     ` Morten Brørup
  0 siblings, 1 reply; 16+ messages in thread
From: Ananyev, Konstantin @ 2020-05-14 12:23 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, Honnappa.Nagarahalli, nd; +Cc: dev


Hi Morten,

> Testing if the ring is empty is as simple as comparing the producer and
> consumer pointers.
> In theory, this optimization reduces the number of potential cache misses
> from 3 to 2 by not having to read r->mask in rte_ring_count().
> 
> It is not possible to enqueue more elements than the capacity of a ring,
> so the capacity comparison is a safeguard for observer threads only.
> Instead of completely removing the comparison, I have reorganized it to
> resemble the other trigraphs in the ring library and added a likely().
> 
> The modification of these two functions was discussed in the RFC here:
> https://mails.dpdk.org/archives/dev/2020-April/165752.html
> 
> Also fixed some existing code not passing checkpatch.
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  lib/librte_ring/rte_ring.h | 36 +++++++++++++++++++-----------------
>  1 file changed, 19 insertions(+), 17 deletions(-)
> 
> diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
> index 86faede81..36438d9cd 100644
> --- a/lib/librte_ring/rte_ring.h
> +++ b/lib/librte_ring/rte_ring.h
> @@ -55,7 +55,7 @@ extern "C" {
>   *   - The memory size needed for the ring on success.
>   *   - -EINVAL if count is not a power of 2.
>   */
> -ssize_t rte_ring_get_memsize(unsigned count);
> +ssize_t rte_ring_get_memsize(unsigned int count);

All these changes to replace 'unsigned' with 'unsigned int' -
seem to be irrelevant to the patch subject, so can you
put them in a separate patch in the series. 
 
>  /**
>   * Initialize a ring structure.
> @@ -109,8 +109,8 @@ ssize_t rte_ring_get_memsize(unsigned count);
>   * @return
>   *   0 on success, or a negative value on error.
>   */
> -int rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
> -	unsigned flags);
> +int rte_ring_init(struct rte_ring *r, const char *name, unsigned int count,
> +	unsigned int flags);
> 
>  /**
>   * Create a new ring named *name* in memory.
> @@ -169,8 +169,8 @@ int rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
>   *    - EEXIST - a memzone with the same name already exists
>   *    - ENOMEM - no appropriate memory area found in which to create memzone
>   */
> -struct rte_ring *rte_ring_create(const char *name, unsigned count,
> -				 int socket_id, unsigned flags);
> +struct rte_ring *rte_ring_create(const char *name, unsigned int count,
> +				 int socket_id, unsigned int flags);
> 
>  /**
>   * De-allocate all memory used by the ring.
> @@ -199,7 +199,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
>  	uint32_t idx = prod_head & (r)->mask; \
>  	obj_type *ring = (obj_type *)ring_start; \
>  	if (likely(idx + n < size)) { \
> -		for (i = 0; i < (n & ((~(unsigned)0x3))); i+=4, idx+=4) { \
> +		for (i = 0; i < (n & ~0x3); i += 4, idx += 4) { \
>  			ring[idx] = obj_table[i]; \
>  			ring[idx+1] = obj_table[i+1]; \
>  			ring[idx+2] = obj_table[i+2]; \
> @@ -230,7 +230,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
>  	const uint32_t size = (r)->size; \
>  	obj_type *ring = (obj_type *)ring_start; \
>  	if (likely(idx + n < size)) { \
> -		for (i = 0; i < (n & (~(unsigned)0x3)); i+=4, idx+=4) {\
> +		for (i = 0; i < (n & ~0x3); i += 4, idx += 4) {\
>  			obj_table[i] = ring[idx]; \
>  			obj_table[i+1] = ring[idx+1]; \
>  			obj_table[i+2] = ring[idx+2]; \
> @@ -683,13 +683,13 @@ rte_ring_reset(struct rte_ring *r);
>   * @return
>   *   The number of entries in the ring.
>   */
> -static inline unsigned
> +static inline unsigned int
>  rte_ring_count(const struct rte_ring *r)
>  {
>  	uint32_t prod_tail = r->prod.tail;
>  	uint32_t cons_tail = r->cons.tail;
>  	uint32_t count = (prod_tail - cons_tail) & r->mask;
> -	return (count > r->capacity) ? r->capacity : count;
> +	return likely(count <= r->capacity) ? count : r->capacity;

Honestly, I don't see any point in that change:
I think it wouldn't change anything in terms of functionality
or performance. 

>  }
> 
>  /**
> @@ -700,7 +700,7 @@ rte_ring_count(const struct rte_ring *r)
>   * @return
>   *   The number of free entries in the ring.
>   */
> -static inline unsigned
> +static inline unsigned int
>  rte_ring_free_count(const struct rte_ring *r)
>  {
>  	return r->capacity - rte_ring_count(r);
> @@ -733,7 +733,9 @@ rte_ring_full(const struct rte_ring *r)
>  static inline int
>  rte_ring_empty(const struct rte_ring *r)
>  {
> -	return rte_ring_count(r) == 0;
> +	uint32_t prod_tail = r->prod.tail;
> +	uint32_t cons_tail = r->cons.tail;
> +	return cons_tail == prod_tail;
>  }
> 
>  /**
> @@ -860,7 +862,7 @@ struct rte_ring *rte_ring_lookup(const char *name);
>   * @return
>   *   - n: Actual number of objects enqueued.
>   */
> -static __rte_always_inline unsigned
> +static __rte_always_inline unsigned int
>  rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
>  			 unsigned int n, unsigned int *free_space)
>  {
> @@ -883,7 +885,7 @@ rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
>   * @return
>   *   - n: Actual number of objects enqueued.
>   */
> -static __rte_always_inline unsigned
> +static __rte_always_inline unsigned int
>  rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
>  			 unsigned int n, unsigned int *free_space)
>  {
> @@ -910,7 +912,7 @@ rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
>   * @return
>   *   - n: Actual number of objects enqueued.
>   */
> -static __rte_always_inline unsigned
> +static __rte_always_inline unsigned int
>  rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
>  		      unsigned int n, unsigned int *free_space)
>  {
> @@ -954,7 +956,7 @@ rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
>   * @return
>   *   - n: Actual number of objects dequeued, 0 if ring is empty
>   */
> -static __rte_always_inline unsigned
> +static __rte_always_inline unsigned int
>  rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
>  		unsigned int n, unsigned int *available)
>  {
> @@ -979,7 +981,7 @@ rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
>   * @return
>   *   - n: Actual number of objects dequeued, 0 if ring is empty
>   */
> -static __rte_always_inline unsigned
> +static __rte_always_inline unsigned int
>  rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
>  		unsigned int n, unsigned int *available)
>  {
> @@ -1006,7 +1008,7 @@ rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
>   * @return
>   *   - Number of objects dequeued
>   */
> -static __rte_always_inline unsigned
> +static __rte_always_inline unsigned int
>  rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table,
>  		unsigned int n, unsigned int *available)
>  {
> --
> 2.17.1



* Re: [dpdk-dev] [PATCH] ring: empty and count optimizations
  2020-05-14 12:23   ` Ananyev, Konstantin
@ 2020-05-14 13:45     ` Morten Brørup
  2020-05-14 16:46       ` Ananyev, Konstantin
  0 siblings, 1 reply; 16+ messages in thread
From: Morten Brørup @ 2020-05-14 13:45 UTC (permalink / raw)
  To: Ananyev, Konstantin, olivier.matz, Honnappa.Nagarahalli, nd; +Cc: dev

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ananyev,
> Konstantin
> Sent: Thursday, May 14, 2020 2:24 PM
> 
> 
> Hi Morten,
> 
> > Testing if the ring is empty is as simple as comparing the producer
> and
> > consumer pointers.
> > In theory, this optimization reduces the number of potential cache
> misses
> > from 3 to 2 by not having to read r->mask in rte_ring_count().
> >
> > It is not possible to enqueue more elements than the capacity of a
> ring,
> > so the capacity comparison is a safeguard for observer threads only.
> > Instead of completely removing the comparison, I have reorganized it
> to
> > resemble the other trigraphs in the ring library and added a
> likely().
> >
> > The modification of these two functions was discussed in the RFC
> here:
> > https://mails.dpdk.org/archives/dev/2020-April/165752.html
> >
> > Also fixed some existing code not passing checkpatch.
> >
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> >  lib/librte_ring/rte_ring.h | 36 +++++++++++++++++++-----------------
> >  1 file changed, 19 insertions(+), 17 deletions(-)
> >
> > diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
> > index 86faede81..36438d9cd 100644
> > --- a/lib/librte_ring/rte_ring.h
> > +++ b/lib/librte_ring/rte_ring.h
> > @@ -55,7 +55,7 @@ extern "C" {
> >   *   - The memory size needed for the ring on success.
> >   *   - -EINVAL if count is not a power of 2.
> >   */
> > -ssize_t rte_ring_get_memsize(unsigned count);
> > +ssize_t rte_ring_get_memsize(unsigned int count);
> 
> All these changes to replace 'unsigned' with 'unsigned int' -
> seem to be irrelevant to the patch subject, so can you
> put them in a separate patch in the series.

Will do.

Perhaps you could find an intern at Intel to fix all these quirks in ancient DPDK code that checkpatch complains about in one big patch, so we don't run into this over and over again when submitting patches. :-)

> 
> >  /**
> >   * Initialize a ring structure.
> > @@ -109,8 +109,8 @@ ssize_t rte_ring_get_memsize(unsigned count);
> >   * @return
> >   *   0 on success, or a negative value on error.
> >   */
> > -int rte_ring_init(struct rte_ring *r, const char *name, unsigned
> count,
> > -	unsigned flags);
> > +int rte_ring_init(struct rte_ring *r, const char *name, unsigned int
> count,
> > +	unsigned int flags);
> >
> >  /**
> >   * Create a new ring named *name* in memory.
> > @@ -169,8 +169,8 @@ int rte_ring_init(struct rte_ring *r, const char
> *name, unsigned count,
> >   *    - EEXIST - a memzone with the same name already exists
> >   *    - ENOMEM - no appropriate memory area found in which to create
> memzone
> >   */
> > -struct rte_ring *rte_ring_create(const char *name, unsigned count,
> > -				 int socket_id, unsigned flags);
> > +struct rte_ring *rte_ring_create(const char *name, unsigned int
> count,
> > +				 int socket_id, unsigned int flags);
> >
> >  /**
> >   * De-allocate all memory used by the ring.
> > @@ -199,7 +199,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring
> *r);
> >  	uint32_t idx = prod_head & (r)->mask; \
> >  	obj_type *ring = (obj_type *)ring_start; \
> >  	if (likely(idx + n < size)) { \
> > -		for (i = 0; i < (n & ((~(unsigned)0x3))); i+=4, idx+=4) { \
> > +		for (i = 0; i < (n & ~0x3); i += 4, idx += 4) { \
> >  			ring[idx] = obj_table[i]; \
> >  			ring[idx+1] = obj_table[i+1]; \
> >  			ring[idx+2] = obj_table[i+2]; \
> > @@ -230,7 +230,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring
> *r);
> >  	const uint32_t size = (r)->size; \
> >  	obj_type *ring = (obj_type *)ring_start; \
> >  	if (likely(idx + n < size)) { \
> > -		for (i = 0; i < (n & (~(unsigned)0x3)); i+=4, idx+=4) {\
> > +		for (i = 0; i < (n & ~0x3); i += 4, idx += 4) {\
> >  			obj_table[i] = ring[idx]; \
> >  			obj_table[i+1] = ring[idx+1]; \
> >  			obj_table[i+2] = ring[idx+2]; \
> > @@ -683,13 +683,13 @@ rte_ring_reset(struct rte_ring *r);
> >   * @return
> >   *   The number of entries in the ring.
> >   */
> > -static inline unsigned
> > +static inline unsigned int
> >  rte_ring_count(const struct rte_ring *r)
> >  {
> >  	uint32_t prod_tail = r->prod.tail;
> >  	uint32_t cons_tail = r->cons.tail;
> >  	uint32_t count = (prod_tail - cons_tail) & r->mask;
> > -	return (count > r->capacity) ? r->capacity : count;
> > +	return likely(count <= r->capacity) ? count : r->capacity;
> 
> Honestly, I don't see there is any point of that change:
> I think it wouldn't change anything in terms of functionality
> or performance.

Chapter 3.4.1 "Branch Prediction Optimization" in the Intel 64 and IA-32 Architectures Optimization Reference Manual recommends this kind of optimization as Assembly/Compiler Coding Rule 3, which is why I rearranged the trigraph. Essentially, there is a limit to the number of BTB (Branch Target Buffer) entries, so they should be conserved if possible.

In addition to that, I have added the likely() because I consider it nearly impossible that the count will exceed the capacity.
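
For reference, the likely()/unlikely() macros in DPDK boil down to GCC's
__builtin_expect() hint; a minimal sketch of the usual definition, which is
essentially what rte_branch_prediction.h provides:

/* Tell the compiler which outcome to treat as the common case. */
#ifndef likely
#define likely(x)	__builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x)	__builtin_expect(!!(x), 0)
#endif

The hint mainly influences static prediction and code layout (keeping the
expected path as the fall-through), so whether it helps in a given spot
still depends on the code the compiler emits.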

However, it's not the first time I see this kind of response to a suggested branch optimization on the DPDK mailing list. Everyone seems to think that branch prediction is infinite and always works. It may seem infinite in trivial applications, but BTB entries can be a scarce resource in complex applications. I assume Intel's recommendations are not just for the fun of it.

Konstantin, please note that I'm letting out my frustration about the general misconception about branch prediction here. You are doing a great job, so I feel bad about responding like this to you.

> 
> >  }
> >
> >  /**
> > @@ -700,7 +700,7 @@ rte_ring_count(const struct rte_ring *r)
> >   * @return
> >   *   The number of free entries in the ring.
> >   */
> > -static inline unsigned
> > +static inline unsigned int
> >  rte_ring_free_count(const struct rte_ring *r)
> >  {
> >  	return r->capacity - rte_ring_count(r);
> > @@ -733,7 +733,9 @@ rte_ring_full(const struct rte_ring *r)
> >  static inline int
> >  rte_ring_empty(const struct rte_ring *r)
> >  {
> > -	return rte_ring_count(r) == 0;
> > +	uint32_t prod_tail = r->prod.tail;
> > +	uint32_t cons_tail = r->cons.tail;
> > +	return cons_tail == prod_tail;
> >  }
> >
> >  /**
> > @@ -860,7 +862,7 @@ struct rte_ring *rte_ring_lookup(const char
> *name);
> >   * @return
> >   *   - n: Actual number of objects enqueued.
> >   */
> > -static __rte_always_inline unsigned
> > +static __rte_always_inline unsigned int
> >  rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const
> *obj_table,
> >  			 unsigned int n, unsigned int *free_space)
> >  {
> > @@ -883,7 +885,7 @@ rte_ring_mp_enqueue_burst(struct rte_ring *r,
> void * const *obj_table,
> >   * @return
> >   *   - n: Actual number of objects enqueued.
> >   */
> > -static __rte_always_inline unsigned
> > +static __rte_always_inline unsigned int
> >  rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const
> *obj_table,
> >  			 unsigned int n, unsigned int *free_space)
> >  {
> > @@ -910,7 +912,7 @@ rte_ring_sp_enqueue_burst(struct rte_ring *r,
> void * const *obj_table,
> >   * @return
> >   *   - n: Actual number of objects enqueued.
> >   */
> > -static __rte_always_inline unsigned
> > +static __rte_always_inline unsigned int
> >  rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
> >  		      unsigned int n, unsigned int *free_space)
> >  {
> > @@ -954,7 +956,7 @@ rte_ring_enqueue_burst(struct rte_ring *r, void *
> const *obj_table,
> >   * @return
> >   *   - n: Actual number of objects dequeued, 0 if ring is empty
> >   */
> > -static __rte_always_inline unsigned
> > +static __rte_always_inline unsigned int
> >  rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
> >  		unsigned int n, unsigned int *available)
> >  {
> > @@ -979,7 +981,7 @@ rte_ring_mc_dequeue_burst(struct rte_ring *r,
> void **obj_table,
> >   * @return
> >   *   - n: Actual number of objects dequeued, 0 if ring is empty
> >   */
> > -static __rte_always_inline unsigned
> > +static __rte_always_inline unsigned int
> >  rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
> >  		unsigned int n, unsigned int *available)
> >  {
> > @@ -1006,7 +1008,7 @@ rte_ring_sc_dequeue_burst(struct rte_ring *r,
> void **obj_table,
> >   * @return
> >   *   - Number of objects dequeued
> >   */
> > -static __rte_always_inline unsigned
> > +static __rte_always_inline unsigned int
> >  rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table,
> >  		unsigned int n, unsigned int *available)
> >  {
> > --
> > 2.17.1



* Re: [dpdk-dev] [PATCH] ring: empty and count optimizations
  2020-05-14 13:45     ` Morten Brørup
@ 2020-05-14 16:46       ` Ananyev, Konstantin
  2020-05-14 18:00         ` Morten Brørup
  0 siblings, 1 reply; 16+ messages in thread
From: Ananyev, Konstantin @ 2020-05-14 16:46 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, Honnappa.Nagarahalli, nd; +Cc: dev



>
> > > -static inline unsigned
> > > +static inline unsigned int
> > >  rte_ring_count(const struct rte_ring *r)
> > >  {
> > >  	uint32_t prod_tail = r->prod.tail;
> > >  	uint32_t cons_tail = r->cons.tail;
> > >  	uint32_t count = (prod_tail - cons_tail) & r->mask;
> > > -	return (count > r->capacity) ? r->capacity : count;
> > > +	return likely(count <= r->capacity) ? count : r->capacity;
> >
> > Honestly, I don't see there is any point of that change:
> > I think it wouldn't change anything in terms of functionality
> > or performance.
> 
> Chapter 3.4.1 "Branch Prediction Optimization" in the Intel 64 and IA-32 Architectures Optimization Reference Manual recommends this
> kind of optimization as Assembly/Compiler Coding Rule 3, which is why I rearranged the trigraph. Essentially, there is a limit to the number
> of BTB (Branch Target Buffer) entries, so they should be conserved if possible.
> 
> In addition to that, I have added the likely() because I consider it nearly impossible that the count will exceed the capacity.
> 
> However, it's not the first time I see this kind of response to a suggested branch optimization on the DPDK mailing list. Everyone seems to
> think that branch prediction is infinite and always works. It may seem as if infinite on trivial applications, but BTB entries may be a scarce
> resource on complex applications. I assume Intel's recommendations are not just for the fun of it.

I think it is better to leave this level of micro-optimization to the compiler.
BTW, in that particular case, the compiler will most likely generate code
without any branches at all (at least for IA).
Let say on my box with gcc 7.3:

$ cat trc1.c
#include <stdint.h>
#include <rte_config.h>
#include <rte_ring.h>

uint32_t
fffx1(const struct rte_ring *r)
{
        uint32_t prod_tail = r->prod.tail;
        uint32_t cons_tail = r->cons.tail;
        uint32_t count = (prod_tail - cons_tail) & r->mask;
        return (count > r->capacity) ? r->capacity : count;
}

uint32_t
fffx2(const struct rte_ring *r)
{
        uint32_t prod_tail = r->prod.tail;
        uint32_t cons_tail = r->cons.tail;
        uint32_t count = (prod_tail - cons_tail) & r->mask;
        return likely(count <= r->capacity) ? count : r->capacity;
}
   
$ gcc -m64 -O3 -march=native -I${RTE_SDK}/x86_64-native-linuxapp-gcc/include -c trc1.c

$ objdump -d trc1.o

0000000000000000 <fffx1>:
   0:   8b 87 84 00 00 00       mov    0x84(%rdi),%eax
   6:   8b 97 04 01 00 00       mov    0x104(%rdi),%edx
   c:   29 d0                   sub    %edx,%eax
   e:   8b 57 38                mov    0x38(%rdi),%edx
  11:   23 47 34                and    0x34(%rdi),%eax
  14:   39 d0                   cmp    %edx,%eax
  16:   0f 47 c2                cmova  %edx,%eax
  19:   c3                      retq
  1a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

0000000000000020 <fffx2>:
  20:   8b 87 84 00 00 00       mov    0x84(%rdi),%eax
  26:   8b 97 04 01 00 00       mov    0x104(%rdi),%edx
  2c:   29 d0                   sub    %edx,%eax
  2e:   8b 57 38                mov    0x38(%rdi),%edx
  31:   23 47 34                and    0x34(%rdi),%eax
  34:   39 d0                   cmp    %edx,%eax
  36:   0f 47 c2                cmova  %edx,%eax
  39:   c3                      retq

As you can see, there is no difference.
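
(For completeness: cmova is a conditional move, so neither variant contains
a conditional jump at all; roughly, the generated tail behaves like the
following pseudo-C - a sketch of the instruction semantics, not actual
compiler output:

        uint32_t cap = r->capacity;
        if (count > cap)        /* flags test + cmova, no taken/not-taken branch */
                count = cap;
        return count;

so there is simply no branch left for a likely() hint or a BTB entry to
influence.)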

> 
> Konstantin, please note that I'm letting out my frustration about the general misconception about branch prediction here. You are doing a
> great job, so I feel bad about responding like this to you.

No worries, in fact I am glad to know that DPDK contributors
read the IA optimization manual that thoughtfully 😊

Konstantin


* Re: [dpdk-dev] [PATCH] ring: empty and count optimizations
  2020-05-14 16:46       ` Ananyev, Konstantin
@ 2020-05-14 18:00         ` Morten Brørup
  0 siblings, 0 replies; 16+ messages in thread
From: Morten Brørup @ 2020-05-14 18:00 UTC (permalink / raw)
  To: Ananyev, Konstantin, olivier.matz, Honnappa.Nagarahalli, nd; +Cc: dev

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ananyev,
> Konstantin
> Sent: Thursday, May 14, 2020 6:47 PM
> >
> > > > -static inline unsigned
> > > > +static inline unsigned int
> > > >  rte_ring_count(const struct rte_ring *r)
> > > >  {
> > > >  	uint32_t prod_tail = r->prod.tail;
> > > >  	uint32_t cons_tail = r->cons.tail;
> > > >  	uint32_t count = (prod_tail - cons_tail) & r->mask;
> > > > -	return (count > r->capacity) ? r->capacity : count;
> > > > +	return likely(count <= r->capacity) ? count : r->capacity;
> > >
> > > Honestly, I don't see there is any point of that change:
> > > I think it wouldn't change anything in terms of functionality
> > > or performance.
> >
> > Chapter 3.4.1 "Branch Prediction Optimization" in the Intel 64 and
> IA-32 Architectures Optimization Reference Manual recommends this
> > kind of optimization as Assembly/Compiler Coding Rule 3, which is why
> I rearranged the trigraph. Essentially, there is a limit to the number
> > of BTB (Branch Target Buffer) entries, so they should be conserved if
> possible.
> >
> > In addition to that, I have added the likely() because I consider it
> nearly impossible that the count will exceed the capacity.
> >
> > However, it's not the first time I see this kind of response to a
> suggested branch optimization on the DPDK mailing list. Everyone seems
> to
> > think that branch prediction is infinite and always works. It may
> seem as if infinite on trivial applications, but BTB entries may be a
> scarce
> > resource on complex applications. I assume Intel's recommendations
> are not just for the fun of it.
> 
> I think it is better to leave such level of micro-optimizations to the
> compiler.
> BTW, in that particular case, compiler most likely will generate a code
> without any branches at all (at least for IA).
> Let say on my box with gcc 7.3:
> 
> $ cat trc1.c
> #include <stdint.h>
> #include <rte_config.h>
> #include <rte_ring.h>
> 
> uint32_t
> fffx1(const struct rte_ring *r)
> {
>         uint32_t prod_tail = r->prod.tail;
>         uint32_t cons_tail = r->cons.tail;
>         uint32_t count = (prod_tail - cons_tail) & r->mask;
>         return (count > r->capacity) ? r->capacity : count;
> }
> 
> uint32_t
> fffx2(const struct rte_ring *r)
> {
>         uint32_t prod_tail = r->prod.tail;
>         uint32_t cons_tail = r->cons.tail;
>         uint32_t count = (prod_tail - cons_tail) & r->mask;
>         return likely(count <= r->capacity) ? count : r->capacity;
> }
> 
> $ gcc -m64 -O3 -march=native -I${RTE_SDK}/x86_64-native-linuxapp-
> gcc/include -c trc1.c
> 
> $ objdump -d trc1.o
> 
> 0000000000000000 <fffx1>:
>    0:   8b 87 84 00 00 00       mov    0x84(%rdi),%eax
>    6:   8b 97 04 01 00 00       mov    0x104(%rdi),%edx
>    c:   29 d0                   sub    %edx,%eax
>    e:   8b 57 38                mov    0x38(%rdi),%edx
>   11:   23 47 34                and    0x34(%rdi),%eax
>   14:   39 d0                   cmp    %edx,%eax
>   16:   0f 47 c2                cmova  %edx,%eax
>   19:   c3                      retq
>   1a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
> 
> 0000000000000020 <fffx2>:
>   20:   8b 87 84 00 00 00       mov    0x84(%rdi),%eax
>   26:   8b 97 04 01 00 00       mov    0x104(%rdi),%edx
>   2c:   29 d0                   sub    %edx,%eax
>   2e:   8b 57 38                mov    0x38(%rdi),%edx
>   31:   23 47 34                and    0x34(%rdi),%eax
>   34:   39 d0                   cmp    %edx,%eax
>   36:   0f 47 c2                cmova  %edx,%eax
>   39:   c3                      retq
> 
> As you can see, there is no difference.
> 

Thank you for the detailed feedback.

Reality trumps theory, so I will leave the count function as is. :-)


> >
> > Konstantin, please note that I'm letting out my frustration about the
> general misconception about branch prediction here. You are doing a
> > great job, so I feel bad about responding like this to you.
> 
> No worries, in fact I am glad to know that DPDK contributors
> read IA optimization manual that thoughtfully 😊
> 
> Konstantin


* [dpdk-dev] [PATCH 0/2] ring: empty optimization
  2020-05-13 15:31 [dpdk-dev] [PATCH] ring: empty and count optimizations Morten Brørup
  2020-05-13 15:35 ` Morten Brørup
  2020-05-13 17:08 ` Morten Brørup
@ 2020-05-19 15:27 ` Morten Brørup
  2020-05-19 15:27   ` [dpdk-dev] [PATCH 1/2] ring: coding style cleanup Morten Brørup
                     ` (2 more replies)
  2 siblings, 3 replies; 16+ messages in thread
From: Morten Brørup @ 2020-05-19 15:27 UTC (permalink / raw)
  To: olivier.matz, konstantin.ananyev, Honnappa.Nagarahalli, nd
  Cc: dev, Morten Brørup

Testing if the ring is empty is as simple as comparing the producer and
consumer pointers.

Checkpatch complains about existing coding style violations, so the first
patch in the series fixes those and contains no functional changes.

Morten Brørup (2):
  ring: coding style cleanup
  ring: empty optimization

 lib/librte_ring/rte_ring.c      |  4 +--
 lib/librte_ring/rte_ring.h      | 46 +++++++++++++++++----------------
 lib/librte_ring/rte_ring_elem.h | 10 +++----
 lib/librte_ring/rte_ring_hts.h  |  8 +++---
 lib/librte_ring/rte_ring_rts.h  |  8 +++---
 5 files changed, 39 insertions(+), 37 deletions(-)

-- 
2.17.1



* [dpdk-dev] [PATCH 1/2] ring: coding style cleanup
  2020-05-19 15:27 ` [dpdk-dev] [PATCH 0/2] ring: empty optimization Morten Brørup
@ 2020-05-19 15:27   ` Morten Brørup
  2020-05-22 12:34     ` Ananyev, Konstantin
  2020-05-19 15:27   ` [dpdk-dev] [PATCH 2/2] ring: empty optimization Morten Brørup
  2020-07-01  9:20   ` [dpdk-dev] [PATCH 0/2] " David Marchand
  2 siblings, 1 reply; 16+ messages in thread
From: Morten Brørup @ 2020-05-19 15:27 UTC (permalink / raw)
  To: olivier.matz, konstantin.ananyev, Honnappa.Nagarahalli, nd
  Cc: dev, Morten Brørup

Fix coding style violations that checkpatch will complain about.

Add missing "int" after "unsigned".
Add missing spaces around "+=" and "+".
Remove superfluous type cast of numerical constant.

Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/librte_ring/rte_ring.c      |  4 ++--
 lib/librte_ring/rte_ring.h      | 42 ++++++++++++++++-----------------
 lib/librte_ring/rte_ring_elem.h | 10 ++++----
 lib/librte_ring/rte_ring_hts.h  |  8 +++----
 lib/librte_ring/rte_ring_rts.h  |  8 +++----
 5 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/lib/librte_ring/rte_ring.c b/lib/librte_ring/rte_ring.c
index ebe5ccf0d..6e0ac5097 100644
--- a/lib/librte_ring/rte_ring.c
+++ b/lib/librte_ring/rte_ring.c
@@ -174,8 +174,8 @@ get_sync_type(uint32_t flags, enum rte_ring_sync_type *prod_st,
 }
 
 int
-rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
-	unsigned flags)
+rte_ring_init(struct rte_ring *r, const char *name, unsigned int count,
+	unsigned int flags)
 {
 	int ret;
 
diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
index 86faede81..9078e7c24 100644
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -55,7 +55,7 @@ extern "C" {
  *   - The memory size needed for the ring on success.
  *   - -EINVAL if count is not a power of 2.
  */
-ssize_t rte_ring_get_memsize(unsigned count);
+ssize_t rte_ring_get_memsize(unsigned int count);
 
 /**
  * Initialize a ring structure.
@@ -109,8 +109,8 @@ ssize_t rte_ring_get_memsize(unsigned count);
  * @return
  *   0 on success, or a negative value on error.
  */
-int rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
-	unsigned flags);
+int rte_ring_init(struct rte_ring *r, const char *name, unsigned int count,
+	unsigned int flags);
 
 /**
  * Create a new ring named *name* in memory.
@@ -169,8 +169,8 @@ int rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
  *    - EEXIST - a memzone with the same name already exists
  *    - ENOMEM - no appropriate memory area found in which to create memzone
  */
-struct rte_ring *rte_ring_create(const char *name, unsigned count,
-				 int socket_id, unsigned flags);
+struct rte_ring *rte_ring_create(const char *name, unsigned int count,
+				 int socket_id, unsigned int flags);
 
 /**
  * De-allocate all memory used by the ring.
@@ -199,11 +199,11 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
 	uint32_t idx = prod_head & (r)->mask; \
 	obj_type *ring = (obj_type *)ring_start; \
 	if (likely(idx + n < size)) { \
-		for (i = 0; i < (n & ((~(unsigned)0x3))); i+=4, idx+=4) { \
+		for (i = 0; i < (n & ~0x3); i += 4, idx += 4) { \
 			ring[idx] = obj_table[i]; \
-			ring[idx+1] = obj_table[i+1]; \
-			ring[idx+2] = obj_table[i+2]; \
-			ring[idx+3] = obj_table[i+3]; \
+			ring[idx + 1] = obj_table[i + 1]; \
+			ring[idx + 2] = obj_table[i + 2]; \
+			ring[idx + 3] = obj_table[i + 3]; \
 		} \
 		switch (n & 0x3) { \
 		case 3: \
@@ -230,11 +230,11 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
 	const uint32_t size = (r)->size; \
 	obj_type *ring = (obj_type *)ring_start; \
 	if (likely(idx + n < size)) { \
-		for (i = 0; i < (n & (~(unsigned)0x3)); i+=4, idx+=4) {\
+		for (i = 0; i < (n & ~0x3); i += 4, idx += 4) {\
 			obj_table[i] = ring[idx]; \
-			obj_table[i+1] = ring[idx+1]; \
-			obj_table[i+2] = ring[idx+2]; \
-			obj_table[i+3] = ring[idx+3]; \
+			obj_table[i + 1] = ring[idx + 1]; \
+			obj_table[i + 2] = ring[idx + 2]; \
+			obj_table[i + 3] = ring[idx + 3]; \
 		} \
 		switch (n & 0x3) { \
 		case 3: \
@@ -683,7 +683,7 @@ rte_ring_reset(struct rte_ring *r);
  * @return
  *   The number of entries in the ring.
  */
-static inline unsigned
+static inline unsigned int
 rte_ring_count(const struct rte_ring *r)
 {
 	uint32_t prod_tail = r->prod.tail;
@@ -700,7 +700,7 @@ rte_ring_count(const struct rte_ring *r)
  * @return
  *   The number of free entries in the ring.
  */
-static inline unsigned
+static inline unsigned int
 rte_ring_free_count(const struct rte_ring *r)
 {
 	return r->capacity - rte_ring_count(r);
@@ -860,7 +860,7 @@ struct rte_ring *rte_ring_lookup(const char *name);
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -883,7 +883,7 @@ rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -910,7 +910,7 @@ rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 		      unsigned int n, unsigned int *free_space)
 {
@@ -954,7 +954,7 @@ rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -979,7 +979,7 @@ rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -1006,7 +1006,7 @@ rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
  * @return
  *   - Number of objects dequeued
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
diff --git a/lib/librte_ring/rte_ring_elem.h b/lib/librte_ring/rte_ring_elem.h
index a5a4c46f9..6b4643ef2 100644
--- a/lib/librte_ring/rte_ring_elem.h
+++ b/lib/librte_ring/rte_ring_elem.h
@@ -889,7 +889,7 @@ rte_ring_dequeue_elem(struct rte_ring *r, void *obj_p, unsigned int esize)
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mp_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -918,7 +918,7 @@ rte_ring_mp_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_sp_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -949,7 +949,7 @@ rte_ring_sp_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -1001,7 +1001,7 @@ rte_ring_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mc_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *available)
 {
@@ -1030,7 +1030,7 @@ rte_ring_mc_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_sc_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *available)
 {
diff --git a/lib/librte_ring/rte_ring_hts.h b/lib/librte_ring/rte_ring_hts.h
index c7701defc..359b15771 100644
--- a/lib/librte_ring/rte_ring_hts.h
+++ b/lib/librte_ring/rte_ring_hts.h
@@ -189,7 +189,7 @@ rte_ring_mc_hts_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
  *   - n: Actual number of objects enqueued.
  */
 __rte_experimental
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mp_hts_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
 	unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -219,7 +219,7 @@ rte_ring_mp_hts_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
 __rte_experimental
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mc_hts_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
 	unsigned int esize, unsigned int n, unsigned int *available)
 {
@@ -291,7 +291,7 @@ rte_ring_mc_hts_dequeue_bulk(struct rte_ring *r, void **obj_table,
  *   - n: Actual number of objects enqueued.
  */
 __rte_experimental
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mp_hts_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -317,7 +317,7 @@ rte_ring_mp_hts_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
 __rte_experimental
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mc_hts_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
diff --git a/lib/librte_ring/rte_ring_rts.h b/lib/librte_ring/rte_ring_rts.h
index d40e9994f..afc12abe2 100644
--- a/lib/librte_ring/rte_ring_rts.h
+++ b/lib/librte_ring/rte_ring_rts.h
@@ -216,7 +216,7 @@ rte_ring_mc_rts_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
  *   - n: Actual number of objects enqueued.
  */
 __rte_experimental
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mp_rts_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
 	unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -246,7 +246,7 @@ rte_ring_mp_rts_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
 __rte_experimental
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mc_rts_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
 	unsigned int esize, unsigned int n, unsigned int *available)
 {
@@ -318,7 +318,7 @@ rte_ring_mc_rts_dequeue_bulk(struct rte_ring *r, void **obj_table,
  *   - n: Actual number of objects enqueued.
  */
 __rte_experimental
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mp_rts_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -344,7 +344,7 @@ rte_ring_mp_rts_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
 __rte_experimental
-static __rte_always_inline unsigned
+static __rte_always_inline unsigned int
 rte_ring_mc_rts_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
-- 
2.17.1



* [dpdk-dev] [PATCH 2/2] ring: empty optimization
  2020-05-19 15:27 ` [dpdk-dev] [PATCH 0/2] ring: empty optimization Morten Brørup
  2020-05-19 15:27   ` [dpdk-dev] [PATCH 1/2] ring: coding style cleanup Morten Brørup
@ 2020-05-19 15:27   ` Morten Brørup
  2020-05-19 15:52     ` Stephen Hemminger
  2020-05-22 12:32     ` Ananyev, Konstantin
  2020-07-01  9:20   ` [dpdk-dev] [PATCH 0/2] " David Marchand
  2 siblings, 2 replies; 16+ messages in thread
From: Morten Brørup @ 2020-05-19 15:27 UTC (permalink / raw)
  To: olivier.matz, konstantin.ananyev, Honnappa.Nagarahalli, nd
  Cc: dev, Morten Brørup

Testing if the ring is empty is as simple as comparing the producer and
consumer pointers.

In theory, this optimization reduces the number of potential cache misses
from 3 to 2 by not having to read r->mask in rte_ring_count().
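For reference, rte_ring_count() at this point reads roughly as follows (a
paraphrase, not an exact excerpt); the producer and consumer head/tail
structures are cache aligned, so the mask and capacity in the ring header
sit on a third, potentially cold, cache line:

	static inline unsigned int
	rte_ring_count(const struct rte_ring *r)
	{
		uint32_t prod_tail = r->prod.tail;	/* producer cache line */
		uint32_t cons_tail = r->cons.tail;	/* consumer cache line */
		uint32_t count = (prod_tail - cons_tail) & r->mask; /* ring header */

		return (count > r->capacity) ? r->capacity : count;
	}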

The modification of this function was also discussed in the RFC here:
https://mails.dpdk.org/archives/dev/2020-April/165752.html

Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/librte_ring/rte_ring.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
index 9078e7c24..f67141482 100644
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -733,7 +733,9 @@ rte_ring_full(const struct rte_ring *r)
 static inline int
 rte_ring_empty(const struct rte_ring *r)
 {
-	return rte_ring_count(r) == 0;
+	uint32_t prod_tail = r->prod.tail;
+	uint32_t cons_tail = r->cons.tail;
+	return cons_tail == prod_tail;
 }
 
 /**
-- 
2.17.1
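As a usage illustration (hypothetical code, not part of the series): an
observer or drain loop that only needs an empty/non-empty answer can poll the
cheaper predicate and leave the full count computation to callers that
actually need it. handle() and the burst size of 32 are placeholders:

	#include <rte_ring.h>

	void handle(void *obj);	/* placeholder consumer callback */

	static void
	drain_ring(struct rte_ring *r)
	{
		void *objs[32];
		unsigned int i, n;

		/* rte_ring_empty() now touches only prod.tail and cons.tail */
		while (!rte_ring_empty(r)) {
			n = rte_ring_dequeue_burst(r, objs, 32, NULL);
			for (i = 0; i < n; i++)
				handle(objs[i]);
		}
	}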



* Re: [dpdk-dev] [PATCH 2/2] ring: empty optimization
  2020-05-19 15:27   ` [dpdk-dev] [PATCH 2/2] ring: empty optimization Morten Brørup
@ 2020-05-19 15:52     ` Stephen Hemminger
  2020-05-19 16:02       ` Morten Brørup
  2020-05-22 12:32     ` Ananyev, Konstantin
  1 sibling, 1 reply; 16+ messages in thread
From: Stephen Hemminger @ 2020-05-19 15:52 UTC (permalink / raw)
  To: Morten Brørup
  Cc: olivier.matz, konstantin.ananyev, Honnappa.Nagarahalli, nd, dev

On Tue, 19 May 2020 15:27:25 +0000
Morten Brørup <mb@smartsharesystems.com> wrote:

> diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
> index 9078e7c24..f67141482 100644
> --- a/lib/librte_ring/rte_ring.h
> +++ b/lib/librte_ring/rte_ring.h
> @@ -733,7 +733,9 @@ rte_ring_full(const struct rte_ring *r)
>  static inline int
>  rte_ring_empty(const struct rte_ring *r)
>  {
> -	return rte_ring_count(r) == 0;
> +	uint32_t prod_tail = r->prod.tail;
> +	uint32_t cons_tail = r->cons.tail;
> +	return cons_tail == prod_tail;
>  }

Blank line after declarations?

Are the temporary variables even needed?


* Re: [dpdk-dev] [PATCH 2/2] ring: empty optimization
  2020-05-19 15:52     ` Stephen Hemminger
@ 2020-05-19 16:02       ` Morten Brørup
  2020-07-01  9:19         ` David Marchand
  0 siblings, 1 reply; 16+ messages in thread
From: Morten Brørup @ 2020-05-19 16:02 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: olivier.matz, konstantin.ananyev, Honnappa.Nagarahalli, nd, dev

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Stephen Hemminger
> Sent: Tuesday, May 19, 2020 5:52 PM
> 
> On Tue, 19 May 2020 15:27:25 +0000
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
> > index 9078e7c24..f67141482 100644
> > --- a/lib/librte_ring/rte_ring.h
> > +++ b/lib/librte_ring/rte_ring.h
> > @@ -733,7 +733,9 @@ rte_ring_full(const struct rte_ring *r)
> >  static inline int
> >  rte_ring_empty(const struct rte_ring *r)
> >  {
> > -	return rte_ring_count(r) == 0;
> > +	uint32_t prod_tail = r->prod.tail;
> > +	uint32_t cons_tail = r->cons.tail;
> > +	return cons_tail == prod_tail;
> >  }
> 
> Blank line after declarations?
> 
> Are the temporary variables even needed?

Personally, I agree with you, but I was trying to match the existing coding style of the closely related rte_ring_count() function - only to avoid this kind of feedback.

Damn if you do, damn if you don't. :-)
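For completeness, the variant without the temporaries that the question points
at would look like the following (illustration only; the version with the
temporaries, mirroring rte_ring_count(), is what the patch proposes):

	static inline int
	rte_ring_empty(const struct rte_ring *r)
	{
		return r->cons.tail == r->prod.tail;
	}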



* Re: [dpdk-dev] [PATCH 2/2] ring: empty optimization
  2020-05-19 15:27   ` [dpdk-dev] [PATCH 2/2] ring: empty optimization Morten Brørup
  2020-05-19 15:52     ` Stephen Hemminger
@ 2020-05-22 12:32     ` Ananyev, Konstantin
  1 sibling, 0 replies; 16+ messages in thread
From: Ananyev, Konstantin @ 2020-05-22 12:32 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, Honnappa.Nagarahalli, nd; +Cc: dev

> 
> Testing if the ring is empty is as simple as comparing the producer and
> consumer pointers.
> 
> In theory, this optimization reduces the number of potential cache misses
> from 3 to 2 by not having to read r->mask in rte_ring_count().
> 
> The modification of this function was also discussed in the RFC here:
> https://mails.dpdk.org/archives/dev/2020-April/165752.html
> 
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>  lib/librte_ring/rte_ring.h | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
> index 9078e7c24..f67141482 100644
> --- a/lib/librte_ring/rte_ring.h
> +++ b/lib/librte_ring/rte_ring.h
> @@ -733,7 +733,9 @@ rte_ring_full(const struct rte_ring *r)
>  static inline int
>  rte_ring_empty(const struct rte_ring *r)
>  {
> -	return rte_ring_count(r) == 0;
> +	uint32_t prod_tail = r->prod.tail;
> +	uint32_t cons_tail = r->cons.tail;
> +	return cons_tail == prod_tail;
>  }
> 
>  /**
> --

Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

> 2.17.1



* Re: [dpdk-dev] [PATCH 1/2] ring: coding style cleanup
  2020-05-19 15:27   ` [dpdk-dev] [PATCH 1/2] ring: coding style cleanup Morten Brørup
@ 2020-05-22 12:34     ` Ananyev, Konstantin
  0 siblings, 0 replies; 16+ messages in thread
From: Ananyev, Konstantin @ 2020-05-22 12:34 UTC (permalink / raw)
  To: Morten Brørup, olivier.matz, Honnappa.Nagarahalli, nd; +Cc: dev

> 
> Fix coding style violations that checkpatch will complain about.
> 
> Add missing "int" after "unsigned".
> Add missing spaces around "+=" and "+".
> Remove superfluous type cast of numerical constant.
> 
> Suggested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---

Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

> 2.17.1
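To make the three items above concrete, a made-up before/after in the same
spirit as the cleanup (not an excerpt from the patch; only the style differs
between the two functions):

	/* before: the style checkpatch complains about */
	static inline unsigned
	sum4_old(const unsigned *v, unsigned n)
	{
		unsigned i, s = 0;

		for (i = 0; i < (n & (~(unsigned)0x3)); i+=4)
			s += v[i] + v[i+1] + v[i+2] + v[i+3];
		return s;
	}

	/* after: "unsigned int", spaces around "+=" and "+", no cast on 0x3 */
	static inline unsigned int
	sum4_new(const unsigned int *v, unsigned int n)
	{
		unsigned int i, s = 0;

		for (i = 0; i < (n & ~0x3); i += 4)
			s += v[i] + v[i + 1] + v[i + 2] + v[i + 3];
		return s;
	}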



* Re: [dpdk-dev] [PATCH 2/2] ring: empty optimization
  2020-05-19 16:02       ` Morten Brørup
@ 2020-07-01  9:19         ` David Marchand
  0 siblings, 0 replies; 16+ messages in thread
From: David Marchand @ 2020-07-01  9:19 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Stephen Hemminger, Olivier Matz, Ananyev, Konstantin,
	Honnappa Nagarahalli, nd, dev

On Tue, May 19, 2020 at 6:02 PM Morten Brørup <mb@smartsharesystems.com> wrote:
> > Blank line after declarations?
> >
> > Are the temporary variables even needed?
>
> Personally, I agree with you, but I was trying to match the existing coding style of the closely related rte_ring_count() function - only to avoid this kind of feedback.
>
> Damn if you do, damn if you don't. :-)

Yes, looking at the code, it seems fair to take this patch as is.


-- 
David Marchand



* Re: [dpdk-dev] [PATCH 0/2] ring: empty optimization
  2020-05-19 15:27 ` [dpdk-dev] [PATCH 0/2] ring: empty optimization Morten Brørup
  2020-05-19 15:27   ` [dpdk-dev] [PATCH 1/2] ring: coding style cleanup Morten Brørup
  2020-05-19 15:27   ` [dpdk-dev] [PATCH 2/2] ring: empty optimization Morten Brørup
@ 2020-07-01  9:20   ` David Marchand
  2 siblings, 0 replies; 16+ messages in thread
From: David Marchand @ 2020-07-01  9:20 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Olivier Matz, Ananyev, Konstantin, Honnappa Nagarahalli, nd, dev

On Tue, May 19, 2020 at 5:27 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> Testing if the ring is empty is as simple as comparing the producer and
> consumer pointers.
>
> Checkpatch complains about existing coding style violations, so the first
> part of the patch fixes those, and contains no functional changes.
>
> Morten Brørup (2):
>   ring: coding style cleanup
>   ring: empty optimization
>
>  lib/librte_ring/rte_ring.c      |  4 +--
>  lib/librte_ring/rte_ring.h      | 46 +++++++++++++++++----------------
>  lib/librte_ring/rte_ring_elem.h | 10 +++----
>  lib/librte_ring/rte_ring_hts.h  |  8 +++---
>  lib/librte_ring/rte_ring_rts.h  |  8 +++---
>  5 files changed, 39 insertions(+), 37 deletions(-)
>
> --
> 2.17.1
>

Series applied, thanks.

-- 
David Marchand



