* [RFC 1/2] eal: add llc aware functions
2024-08-27 15:10 [RFC 0/2] introduce LLC aware functions Vipin Varghese
@ 2024-08-27 15:10 ` Vipin Varghese
2024-08-27 17:36 ` Stephen Hemminger
2024-08-27 20:56 ` Wathsala Wathawana Vithanage
2024-08-27 15:10 ` [RFC 2/2] eal/lcore: add llc aware for each macro Vipin Varghese
` (3 subsequent siblings)
4 siblings, 2 replies; 56+ messages in thread
From: Vipin Varghese @ 2024-08-27 15:10 UTC (permalink / raw)
To: ferruh.yigit, dev
Introduce lcore functions which operate on the Last Level Cache (LLC)
for core complexes or chiplet cores. On non-chiplet core complexes,
the functions iterate over all available DPDK lcores.
Functions added:
- rte_get_llc_first_lcores
- rte_get_llc_lcore
- rte_get_llc_n_lcore
Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
---
lib/eal/common/eal_common_lcore.c | 279 ++++++++++++++++++++++++++++--
1 file changed, 267 insertions(+), 12 deletions(-)
diff --git a/lib/eal/common/eal_common_lcore.c b/lib/eal/common/eal_common_lcore.c
index 2ff9252c52..4ff8b9e116 100644
--- a/lib/eal/common/eal_common_lcore.c
+++ b/lib/eal/common/eal_common_lcore.c
@@ -14,6 +14,7 @@
#ifndef RTE_EXEC_ENV_WINDOWS
#include <rte_telemetry.h>
#endif
+#include <rte_string_fns.h>
#include "eal_private.h"
#include "eal_thread.h"
@@ -93,25 +94,279 @@ int rte_lcore_is_enabled(unsigned int lcore_id)
return cfg->lcore_role[lcore_id] == ROLE_RTE;
}
-unsigned int rte_get_next_lcore(unsigned int i, int skip_main, int wrap)
+#define LCORE_GET_LLC \
+ "ls -d /sys/bus/cpu/devices/cpu%u/cache/index[0-9] | sort -r | grep -m1 index[0-9] | awk -F '[x]' '{print $2}' "
+#define LCORE_GET_SHAREDLLC \
+ "grep [0-9] /sys/bus/cpu/devices/cpu%u/cache/index%u/shared_cpu_list"
+
+unsigned int rte_get_llc_first_lcores (rte_cpuset_t *llc_cpu)
{
- i++;
- if (wrap)
- i %= RTE_MAX_LCORE;
+ CPU_ZERO((rte_cpuset_t *)llc_cpu);
- while (i < RTE_MAX_LCORE) {
- if (!rte_lcore_is_enabled(i) ||
- (skip_main && (i == rte_get_main_lcore()))) {
- i++;
- if (wrap)
- i %= RTE_MAX_LCORE;
+ char cmdline[2048] = {'\0'};
+ char output_llc[8] = {'\0'};
+ char output_threads[16] = {'\0'};
+
+ for (unsigned int lcore =0; lcore < RTE_MAX_LCORE; lcore++)
+ {
+ if (!rte_lcore_is_enabled (lcore))
continue;
+
+ /* get sysfs llc index */
+ snprintf(cmdline, 2047, LCORE_GET_LLC, lcore);
+ FILE *fp = popen (cmdline, "r");
+ if (fp == NULL) {
+ return -1;
}
- break;
+ if (fgets(output_llc, sizeof(output_llc) - 1, fp) == NULL) {
+ pclose(fp);
+ return -1;
+ }
+ pclose(fp);
+ int llc_index = atoi (output_llc);
+
+ /* get sysfs core group of the same core index*/
+ snprintf(cmdline, 2047, LCORE_GET_SHAREDLLC, lcore, llc_index);
+ fp = popen (cmdline, "r");
+ if (fp == NULL) {
+ return -1;
+ }
+ if (fgets(output_threads, sizeof(output_threads) - 1, fp) == NULL) {
+ pclose(fp);
+ return -1;
+ }
+ pclose(fp);
+
+ output_threads [strlen(output_threads) - 1] = '\0';
+ char *smt_thrds[2];
+ int smt_threads = rte_strsplit(output_threads, sizeof(output_threads), smt_thrds, 2, ',');
+
+ for (int index = 0; index < smt_threads; index++) {
+ char *llc[2] = {'\0'};
+ int smt_cpu = rte_strsplit(smt_thrds[index], sizeof(smt_thrds[index]), llc, 2, '-');
+ RTE_SET_USED(smt_cpu);
+
+ unsigned int first_cpu = atoi (llc[0]);
+ unsigned int last_cpu = (NULL == llc[1]) ? atoi (llc[0]) : atoi (llc[1]);
+
+
+ for (unsigned int temp_cpu = first_cpu; temp_cpu <= last_cpu; temp_cpu++) {
+ if (rte_lcore_is_enabled(temp_cpu)) {
+ CPU_SET (temp_cpu, (rte_cpuset_t *) llc_cpu);
+ lcore = last_cpu;
+ break;
+ }
+ }
+ }
+ }
+
+ return CPU_COUNT((rte_cpuset_t *)llc_cpu);
+}
+
+unsigned int
+rte_get_llc_lcore (unsigned int lcore, rte_cpuset_t *llc_cpu,
+ unsigned int *first_cpu, unsigned int * last_cpu)
+{
+ CPU_ZERO((rte_cpuset_t *)llc_cpu);
+
+ char cmdline[2048] = {'\0'};
+ char output_llc[8] = {'\0'};
+ char output_threads[16] = {'\0'};
+
+ *first_cpu = *last_cpu = RTE_MAX_LCORE;
+
+ /* get sysfs llc index */
+ snprintf(cmdline, 2047, LCORE_GET_LLC, lcore);
+ FILE *fp = popen (cmdline, "r");
+ if (fp == NULL) {
+ return -1;
+ }
+ if (fgets(output_llc, sizeof(output_llc) - 1, fp) == NULL) {
+ pclose(fp);
+ return -1;
+ }
+ pclose(fp);
+ int llc_index = atoi (output_llc);
+
+ /* get sysfs core group of the same core index*/
+ snprintf(cmdline, 2047, LCORE_GET_SHAREDLLC, lcore, llc_index);
+ fp = popen (cmdline, "r");
+ if (fp == NULL) {
+ return -1;
+ }
+
+ if (fgets(output_threads, sizeof(output_threads) - 1, fp) == NULL) {
+ pclose(fp);
+ return -1;
}
- return i;
+ pclose(fp);
+
+ output_threads [strlen(output_threads) - 1] = '\0';
+ char *smt_thrds[2];
+ int smt_threads = rte_strsplit(output_threads, sizeof(output_threads), smt_thrds, 2, ',');
+
+ bool found_first_cpu = false;
+ unsigned int first_lcore_cpu = RTE_MAX_LCORE;
+ unsigned int last_lcore_cpu = RTE_MAX_LCORE;
+
+ for (int index = 0; index < smt_threads; index++) {
+ char *llc[2] = {'\0'};
+ int smt_cpu = rte_strsplit(smt_thrds[index], sizeof(smt_thrds[index]), llc, 2, '-');
+ RTE_SET_USED(smt_cpu);
+
+ char *end = NULL;
+ *first_cpu = strtoul (llc[0], end, 10);
+ *last_cpu = (1 == smt_cpu) ? strtoul (llc[0], end, 10) : strtoul (llc[1], end, 10);
+
+ unsigned int temp_cpu = RTE_MAX_LCORE;
+ RTE_LCORE_FOREACH(temp_cpu) {
+ if ((temp_cpu >= *first_cpu) && (temp_cpu <= *last_cpu)) {
+ CPU_SET (temp_cpu, (rte_cpuset_t *) llc_cpu);
+ //printf ("rte_get_llc_lcore: temp_cpu %u count %u \n", temp_cpu, CPU_COUNT(llc_cpu));
+
+ if (false == found_first_cpu) {
+ first_lcore_cpu = temp_cpu;
+ found_first_cpu = true;
+ }
+ last_lcore_cpu = temp_cpu;
+ }
+ //printf ("rte_get_llc_lcore: first %u last %u \n", first_lcore_cpu, last_lcore_cpu);
+ }
+ }
+
+ *first_cpu = first_lcore_cpu;
+ *last_cpu = last_lcore_cpu;
+
+ //printf ("rte_get_llc_lcore: first %u last %u count %u \n", *first_cpu, *last_cpu, CPU_COUNT(llc_cpu));
+ return CPU_COUNT((rte_cpuset_t *)llc_cpu);
+}
+
+unsigned int
+rte_get_llc_n_lcore (unsigned int lcore, rte_cpuset_t *llc_cpu,
+ unsigned int *first_cpu, unsigned int * last_cpu,
+ unsigned int n, bool skip)
+{
+ bool found_first_cpu = false;
+ bool found_last_cpu = false;
+ unsigned int first_lcore_cpu = RTE_MAX_LCORE;
+ unsigned int last_lcore_cpu = RTE_MAX_LCORE;
+
+ unsigned int temp_count = n;
+ unsigned int count = rte_get_llc_lcore (lcore, llc_cpu, first_cpu, last_cpu);
+
+ //printf ("rte_get_llc_n_lcore: first %u last %u count %u \n", *first_cpu, *last_cpu, CPU_COUNT(llc_cpu));
+
+ unsigned int temp_cpu = RTE_MAX_LCORE;
+ unsigned int temp_last_cpu = RTE_MAX_LCORE;
+ if (false == skip) {
+ if (count < n)
+ return 0;
+
+ RTE_LCORE_FOREACH(temp_cpu) {
+ if ((temp_cpu >= *first_cpu) && (temp_cpu <= *last_cpu)) {
+ if (CPU_ISSET(temp_cpu, llc_cpu) && (temp_count)) {
+ //printf ("rte_get_llc_n_lcore: temp - count %d cpu %u skip %u first %u last %u \n", temp_count, temp_cpu, skip, *first_cpu, *last_cpu);
+ if (false == found_first_cpu) {
+ *first_cpu = temp_cpu;
+ found_first_cpu = true;
+ }
+ temp_last_cpu = temp_cpu;
+
+ temp_count -= 1;
+ continue;
+ }
+ }
+ CPU_CLR(temp_cpu, llc_cpu);
+ }
+ *last_cpu = temp_last_cpu;
+ //printf ("rte_get_llc_n_lcore: start %u last %u count %u\n", *first_cpu, *last_cpu, CPU_COUNT(llc_cpu));
+ return n;
+ }
+
+ int total_core = CPU_COUNT(llc_cpu) - n;
+ if (total_core <= 0)
+ return 0;
+
+ RTE_LCORE_FOREACH(temp_cpu) {
+ if ((temp_cpu >= *first_cpu) && (temp_cpu <= *last_cpu)) {
+ if (CPU_ISSET(temp_cpu, llc_cpu) && (temp_count)) {
+ if (temp_count) {
+ CPU_CLR(temp_cpu, llc_cpu);
+ temp_count -= 1;
+ continue;
+ }
+
+ if (false == found_first_cpu) {
+ *first_cpu = temp_cpu;
+ found_first_cpu = true;
+ }
+ *last_cpu = temp_cpu;
+ }
+ }
+ }
+
+ //printf ("rte_get_llc_n_lcore: start %u last %u count %u\n", *first_cpu, *last_cpu, total_core);
+ return total_core;
+#if 0
+ if (false == skip) {
+ unsigned int start = *first_cpu, end = *last_cpu, temp_last_cpu = *last_cpu;
+ for (; (start <= end); start++)
+ {
+ if (CPU_ISSET(start, llc_cpu) && (temp_count)) {
+ temp_count -= 1;
+ continue;
+ } else if (CPU_ISSET(start, llc_cpu)) {
+ temp_last_cpu = (false == is_last_cpu) ? (start - 1) : temp_last_cpu;
+ is_last_cpu = true;
+
+ CPU_CLR(start, llc_cpu);
+ }
+ }
+ *last_cpu = temp_last_cpu;
+ return n;
+ }
+
+ int total_core = CPU_COUNT(llc_cpu) - n;
+ if (total_core <= 0)
+ return 0;
+
+ bool is_first_cpu = false;
+ unsigned int temp_last_cpu = *last_cpu;
+ for (unsigned int start = *first_cpu, end = *last_cpu; (start <= end) && (temp_count); start++)
+ {
+ if (CPU_ISSET(start, llc_cpu) && (temp_count)) {
+ *first_cpu = (is_first_cpu == false) ? start : *first_cpu;
+ temp_last_cpu = start;
+ CPU_CLR(start, llc_cpu);
+ temp_count -= 1;
+ }
+ }
+
+ *last_cpu = temp_last_cpu;
+ return total_core;
+#endif
+}
+
+unsigned int rte_get_next_lcore(unsigned int i, int skip_main, int wrap)
+{
+ i++;
+ if (wrap)
+ i %= RTE_MAX_LCORE;
+
+ while (i < RTE_MAX_LCORE) {
+ if (!rte_lcore_is_enabled(i) ||
+ (skip_main && (i == rte_get_main_lcore()))) {
+ i++;
+ if (wrap)
+ i %= RTE_MAX_LCORE;
+ continue;
+ }
+ break;
+ }
+ return i;
}
+
unsigned int
rte_lcore_to_socket_id(unsigned int lcore_id)
{
--
2.34.1
* Re: [RFC 1/2] eal: add llc aware functions
2024-08-27 15:10 ` [RFC 1/2] eal: add llc " Vipin Varghese
@ 2024-08-27 17:36 ` Stephen Hemminger
2024-09-02 0:27 ` Varghese, Vipin
2024-08-27 20:56 ` Wathsala Wathawana Vithanage
1 sibling, 1 reply; 56+ messages in thread
From: Stephen Hemminger @ 2024-08-27 17:36 UTC (permalink / raw)
To: Vipin Varghese; +Cc: ferruh.yigit, dev
On Tue, 27 Aug 2024 20:40:13 +0530
Vipin Varghese <vipin.varghese@amd.com> wrote:
> + "ls -d /sys/bus/cpu/devices/cpu%u/cache/index[0-9] | sort -r | grep -m1 index[0-9] | awk -F '[x]' '{print $2}' "
NAK
Running shell commands from EAL is non-portable and likely to be flagged by security scanning tools.
Do it in C please.
* RE: [RFC 1/2] eal: add llc aware functions
2024-08-27 15:10 ` [RFC 1/2] eal: add llc " Vipin Varghese
2024-08-27 17:36 ` Stephen Hemminger
@ 2024-08-27 20:56 ` Wathsala Wathawana Vithanage
2024-08-29 3:21 ` Re: " Feifei Wang
2024-09-02 1:20 ` Varghese, Vipin
1 sibling, 2 replies; 56+ messages in thread
From: Wathsala Wathawana Vithanage @ 2024-08-27 20:56 UTC (permalink / raw)
To: Vipin Varghese, ferruh.yigit, dev; +Cc: nd, nd
> -unsigned int rte_get_next_lcore(unsigned int i, int skip_main, int wrap)
> +#define LCORE_GET_LLC \
> + "ls -d /sys/bus/cpu/devices/cpu%u/cache/index[0-9] | sort -r
> | grep -m1 index[0-9] | awk -F '[x]' '{print $2}' "
>
This won't work for some SOCs.
How do you ensure the index you got is for an LLC? Some SoCs may only show upper-level caches here, so it cannot be used blindly without knowing the SoC.
Also, it is unacceptable to execute a shell script; consider implementing this in C.
--wathsala
* Re: [RFC 1/2] eal: add llc aware functions
2024-08-27 20:56 ` Wathsala Wathawana Vithanage
@ 2024-08-29 3:21 ` Feifei Wang
2024-09-02 1:20 ` Varghese, Vipin
1 sibling, 0 replies; 56+ messages in thread
From: Feifei Wang @ 2024-08-29 3:21 UTC (permalink / raw)
To: Wathsala Wathawana Vithanage, Vipin Varghese, ferruh.yigit, dev
Cc: nd, nd, Jing Li, Liangxing Wang, Jianyong Wu
Hi,
> -----Original Message-----
> From: Wathsala Wathawana Vithanage <wathsala.vithanage@arm.com>
> Sent: August 28, 2024 4:56
> To: Vipin Varghese <vipin.varghese@amd.com>; ferruh.yigit@amd.com;
> dev@dpdk.org
> Cc: nd <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [RFC 1/2] eal: add llc aware functions
>
> > -unsigned int rte_get_next_lcore(unsigned int i, int skip_main, int wrap)
> > +#define LCORE_GET_LLC \
> > + "ls -d /sys/bus/cpu/devices/cpu%u/cache/index[0-9] | sort -r
> > | grep -m1 index[0-9] | awk -F '[x]' '{print $2}' "
> >
>
> This won't work for some SOCs.
> How to ensure the index you got is for an LLC? Some SOCs may only show
> upper-level caches here, therefore cannot be use blindly without knowing the
> SOC.
> Also, unacceptable to execute a shell script, consider implementing in C.
Maybe:
For Arm, we could read the MPIDR_EL1 register to obtain the CPU cluster topology.
MPIDR_EL1 affinity fields:
[39:32] AFF3 (Level 3 affinity)
[23:16] AFF2 (Level 2 affinity)
[15:8] AFF1 (Level 1 affinity)
[7:0] AFF0 (Level 0 affinity)
For x86, we can use the APIC ID:
the APIC ID encodes the cluster ID, die ID, SMT ID and core ID.
This avoids executing a shell script, and for Arm and x86 we take different paths to implement it.
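A rough aarch64 sketch of that direction (illustrative only; MPIDR_EL1 is
normally readable only at EL1, so in practice the kernel or firmware reads
it and exposes the result, and how AFF0..AFF3 map to SMT/core/cluster is
implementation defined):

#include <stdint.h>

/* Read MPIDR_EL1 on the current CPU (aarch64) and split the affinity
 * fields; field positions follow the Arm architecture definition. */
static inline uint64_t read_mpidr_el1(void)
{
	uint64_t mpidr;

	__asm__ volatile("mrs %0, MPIDR_EL1" : "=r"(mpidr));
	return mpidr;
}

static inline void mpidr_to_affinity(uint64_t mpidr, unsigned int aff[4])
{
	aff[0] = mpidr & 0xff;         /* AFF0, bits [7:0]   */
	aff[1] = (mpidr >> 8) & 0xff;  /* AFF1, bits [15:8]  */
	aff[2] = (mpidr >> 16) & 0xff; /* AFF2, bits [23:16] */
	aff[3] = (mpidr >> 32) & 0xff; /* AFF3, bits [39:32] */
}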
Best Regards
Feifei
> --wathsala
>
* Re: [RFC 1/2] eal: add llc aware functions
2024-08-27 20:56 ` Wathsala Wathawana Vithanage
2024-08-29 3:21 ` Re: " Feifei Wang
@ 2024-09-02 1:20 ` Varghese, Vipin
2024-09-03 17:54 ` Wathsala Wathawana Vithanage
1 sibling, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-02 1:20 UTC (permalink / raw)
To: Wathsala Wathawana Vithanage, ferruh.yigit, dev; +Cc: nd
<Snipped>
>> -unsigned int rte_get_next_lcore(unsigned int i, int skip_main, int wrap)
>> +#define LCORE_GET_LLC \
>> + "ls -d /sys/bus/cpu/devices/cpu%u/cache/index[0-9] | sort -r
>> | grep -m1 index[0-9] | awk -F '[x]' '{print $2}' "
>>
> This won't work for some SOCs.
Thank you for your response. Please find our responses and queries below.
> How to ensure the index you got is for an LLC?
We referred to "How CPU topology info is exported via sysfs" in the Linux
Kernel documentation
<https://www.kernel.org/doc/html/latest/admin-guide/cputopology.html>,
Documentation/ABI/stable/sysfs-devices-system-cpu in the kernel tree
<https://github.com/torvalds/linux/blob/master/Documentation/ABI/stable/sysfs-devices-system-cpu>
and
"Get Cache Info in Linux on ARMv8 64-bit Platform"
<https://zhiyisun.github.io/2016/06/25/Get-Cache-Info-in-Linux-on-ARMv8-64-bit-Platform.html>.
Based on my current understanding, on a bare-metal 64-bit Linux OS (as
shipped by most distros) the cache topology is populated in sysfs.
> Some SOCs may only show upper-level caches here, therefore cannot be use blindly without knowing the SOC.
Can you please help us understand
1. are there specific SoCs which do not populate this information at
all? If yes, are they described in DTS?
2. are there specific SoCs which do not export it to hypervisors such
as QEMU or Xen?
We can work together to make it compatible.
> Also, unacceptable to execute a shell script, consider implementing in C.
As the intention of the RFC is to share the possible APIs and macros, we
welcome suggestions on the implementation, as agreed with Stephen.
>
> --wathsala
>
>
* RE: [RFC 1/2] eal: add llc aware functions
2024-09-02 1:20 ` Varghese, Vipin
@ 2024-09-03 17:54 ` Wathsala Wathawana Vithanage
2024-09-04 8:18 ` Bruce Richardson
2024-09-06 11:59 ` Varghese, Vipin
0 siblings, 2 replies; 56+ messages in thread
From: Wathsala Wathawana Vithanage @ 2024-09-03 17:54 UTC (permalink / raw)
To: Varghese, Vipin, ferruh.yigit, dev; +Cc: nd, nd
> Some SOCs may only show upper-level caches here, therefore cannot
> be use blindly without knowing the SOC.
>
> Can you please help us understand
>
For instance, Neoverse N1 can disable the use of SLC as LLC (a BIOS setting).
If SLC is not used as LLC, then your script would report the unified L2 as the LLC.
I don't think that's what you are interested in.
> 1. if there are specific SoC which do not populate the information at all? If yes
> are they in DTS?
This information is populated correctly for all SoCs; the comment was on the script.
* Re: [RFC 1/2] eal: add llc aware functions
2024-09-03 17:54 ` Wathsala Wathawana Vithanage
@ 2024-09-04 8:18 ` Bruce Richardson
2024-09-06 11:59 ` Varghese, Vipin
1 sibling, 0 replies; 56+ messages in thread
From: Bruce Richardson @ 2024-09-04 8:18 UTC (permalink / raw)
To: Wathsala Wathawana Vithanage; +Cc: Varghese, Vipin, ferruh.yigit, dev, nd
On Tue, Sep 03, 2024 at 05:54:22PM +0000, Wathsala Wathawana Vithanage wrote:
> > Some SOCs may only show upper-level caches here, therefore cannot
> > be use blindly without knowing the SOC.
> >
> > Can you please help us understand
> >
>
> For instance, in Neoverse N1 can disable the use of SLC as LLC (a BIOS setting)
> If SLC is not used as LLC, then your script would report the unified L2 as an LLC.
> I don't think that's what you are interested in.
>
> > 1. if there are specific SoC which do not populate the information at all? If yes
> > are they in DTS?
>
> This information is populated correctly for all SOCs, comment was on the script.
>
Given all the complexities around topologies, do we want this covered by
DPDK at all? Are we better to just recommend, to any applications that
need it, that they get the info straight from the kernel via sysfs? Why
have DPDK play the middle-man here, proxying the info from sysfs to the
app?
/Bruce
* RE: [RFC 1/2] eal: add llc aware functions
2024-09-03 17:54 ` Wathsala Wathawana Vithanage
2024-09-04 8:18 ` Bruce Richardson
@ 2024-09-06 11:59 ` Varghese, Vipin
2024-09-12 16:58 ` Wathsala Wathawana Vithanage
1 sibling, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-06 11:59 UTC (permalink / raw)
To: Wathsala Wathawana Vithanage, Yigit, Ferruh, dev; +Cc: nd, nd
<snipped>
> > Some SOCs may only show upper-level caches here, therefore
> > cannot be use blindly without knowing the SOC.
> >
> > Can you please help us understand
> >
>
> For instance, in Neoverse N1 can disable the use of SLC as LLC (a BIOS setting)
> If SLC is not used as LLC, then your script would report the unified L2 as an LLC.
Does `disabling SLC as LLC` disable L3? I think not; what you are implying is that `ls -d /sys/bus/cpu/devices/cpu%u/cache/index[0-9] | sort -r ...` will return index2 and not index3. Is that the understanding?
> I don't think that's what you are interested in.
My intention, as shared, is: whether or not the BIOS setting for CPU NUMA is enabled, allow the end customer to get the core complexes (tiles) which are under one group.
So, whether the Last Level Cache seen by the OS is L3 or L2, the API allows the end user to get the DPDK lcores sharing that last-level cache.
But as per the earlier communication, a specific SoC may behave differently when some settings are changed. For the AMD SoC case we are trying to help end users with the right settings via tuning guides, as pointed to by `12. How to get best performance on AMD platform — Data Plane Development Kit 24.11.0-rc0 documentation (dpdk.org) <https://doc.dpdk.org/guides/linux_gsg/amd_platform.html>`.
Can you please confirm whether such tuning guides or recommended settings are shared for Arm? If not, can you please allow me to set up a technical call to sync on the same?
>
> > 1. if there are specific SoC which do not populate the information at
> > all? If yes are they in DTS?
>
> This information is populated correctly for all SOCs, comment was on the
> script.
Please note, I am not running any script. The LCORE_GET_LLC command is executed using the C function `popen`. As per Stephen's suggestion we have replied that we will change to pure C logic to get the details; a rough sketch of what that could look like is below.
I hope there is no longer any confusion on this?
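For reference, a minimal C sketch of the same lookup (mirroring the shell
pipeline: take the highest sysfs cache index for a CPU, then read its
shared_cpu_list) without spawning a shell; the helper name is illustrative
only:

#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

/* Illustrative helper: copy the shared_cpu_list of the highest cache
 * index reported by sysfs for the given CPU into buf. Returns 0 on
 * success, -1 on failure. Note this blindly takes the highest index,
 * which may not be the LLC on every SoC (see the discussion above). */
static int
get_llc_shared_cpu_list(unsigned int cpu, char *buf, size_t len)
{
	char path[PATH_MAX];
	struct dirent *ent;
	int max_index = -1;

	snprintf(path, sizeof(path), "/sys/bus/cpu/devices/cpu%u/cache", cpu);
	DIR *dir = opendir(path);
	if (dir == NULL)
		return -1;
	while ((ent = readdir(dir)) != NULL) {
		int idx;

		if (sscanf(ent->d_name, "index%d", &idx) == 1 && idx > max_index)
			max_index = idx;
	}
	closedir(dir);
	if (max_index < 0)
		return -1;

	snprintf(path, sizeof(path),
		 "/sys/bus/cpu/devices/cpu%u/cache/index%d/shared_cpu_list",
		 cpu, max_index);
	FILE *f = fopen(path, "r");
	if (f == NULL)
		return -1;
	if (fgets(buf, (int)len, f) == NULL) {
		fclose(f);
		return -1;
	}
	fclose(f);
	buf[strcspn(buf, "\n")] = '\0';
	return 0;
}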
* RE: [RFC 1/2] eal: add llc aware functions
2024-09-06 11:59 ` Varghese, Vipin
@ 2024-09-12 16:58 ` Wathsala Wathawana Vithanage
2024-10-21 8:20 ` Varghese, Vipin
0 siblings, 1 reply; 56+ messages in thread
From: Wathsala Wathawana Vithanage @ 2024-09-12 16:58 UTC (permalink / raw)
To: Varghese, Vipin, Yigit, Ferruh, dev; +Cc: nd, Honnappa Nagarahalli, nd
<snipped>
> >
> > For instance, in Neoverse N1 can disable the use of SLC as LLC (a BIOS
> > setting) If SLC is not used as LLC, then your script would report the unified L2
> as an LLC.
>
> Does `disabling SLC as LLC` disable L3? I think not, and what you are implying is
> the ` ls -d /sys/bus/cpu/devices/cpu%u/cache/index[0-9] | sort -r …… ` will
> return index2 and not index3. Is this the understanding?
>
It disables the use of SLC as an LLC for the CPUs, and the command will return index2.
Disabling SLC as L3 is a feature of Arm CMN interconnects (SFONLY mode).
When SLC is disabled as L3, firmware sets up the ACPI PPTT to reflect this change.
Using the PPTT, the kernel correctly enumerates cache IDs without showing an L3.
>
> > I don't think that's what you are interested in.
> My intention as shared is to `whether BIOS setting for CPU NUMA is enabled
> or not, I would like to allow the end customer get the core complexes (tile)
> which are under one group`.
> So, if the `Last Level Cache` is L3 or L2 seen by OS, API allows the end user to
> get DPDK lcores sharing the last level cache.
>
> But as per the earlier communication, specific SoC does not behave when
> some setting are done different. For AMD SoC case we are trying to help end
> user with right setting with tuning guides as pointed by ` 12. How to get best
> performance on AMD platform — Data Plane Development Kit 24.11.0-rc0
> documentation (dpdk.org)
> <https://doc.dpdk.org/guides/linux_gsg/amd_platform.html> `
>
> Can you please confirm if such tuning guides or recommended settings are
> shared ? If not, can you please allow me to setup a technical call to sync on the
> same?
>
Currently there is no such document for Arm, but we would like to have one. There are
some complexities too; not all SoC vendors use Arm's CMN interconnect.
I would be happy to sync over a call.
> >
> > > 1. if there are specific SoC which do not populate the information
> > > at all? If yes are they in DTS?
> >
> > This information is populated correctly for all SOCs, comment was on
> > the script.
>
> Please note, I am not running any script. The command LCORE_GET_LLC is
> executed using C function `open`. As per suggestion of Stephen we have
> replied we will change to C function logic to get details.
> Hope there is no longer confusion on this?
>
If this is implemented using sysfs, then it needs to handle caveats like SFONLY mode.
Perhaps consulting /sys/bus/cpu/devices/cpu%u/cache/index[0-9]/type would help.
However, I prefer using hwloc to get this information accurately.
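For example, a minimal hwloc-based sketch (assuming hwloc >= 2.0, where L3
is a first-class object type) that lists the PUs under each L3; on a system
where no L3 is reported (e.g. SLC-as-LLC disabled) the object count is zero
and a caller could fall back to L2:

#include <hwloc.h>
#include <stdio.h>

int main(void)
{
	hwloc_topology_t topo;
	int i, n;

	hwloc_topology_init(&topo);
	hwloc_topology_load(topo);

	/* one object per L3 cache; its cpuset holds the PUs sharing it */
	n = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_L3CACHE);
	for (i = 0; i < n; i++) {
		hwloc_obj_t l3 = hwloc_get_obj_by_type(topo, HWLOC_OBJ_L3CACHE, i);
		char cpus[256];

		hwloc_bitmap_list_snprintf(cpus, sizeof(cpus), l3->cpuset);
		printf("L3 #%d: PUs %s\n", i, cpus);
	}

	hwloc_topology_destroy(topo);
	return 0;
}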
Thanks
--wathsala
* RE: [RFC 1/2] eal: add llc aware functions
2024-09-12 16:58 ` Wathsala Wathawana Vithanage
@ 2024-10-21 8:20 ` Varghese, Vipin
0 siblings, 0 replies; 56+ messages in thread
From: Varghese, Vipin @ 2024-10-21 8:20 UTC (permalink / raw)
To: Wathsala Wathawana Vithanage, Yigit, Ferruh, dev
Cc: nd, Honnappa Nagarahalli, nd
> > >
> > > > 1. if there are specific SoC which do not populate the information
> > > > at all? If yes are they in DTS?
> > >
> > > This information is populated correctly for all SOCs, comment was on
> > > the script.
> >
> > Please note, I am not running any script. The command LCORE_GET_LLC is
> > executed using C function `open`. As per suggestion of Stephen we have
> > replied we will change to C function logic to get details.
> > Hope there is no longer confusion on this?
> >
> If this is implemented using sysfs, then it needs to handle caveats like SFONLY
> mode.
> Perhaps consulting /sys/bus/cpu/devices/cpu%u/cache/index[0-9]/type would help.
> However, I prefer using hwloc to get this information accurately.
A new version (rfc-v2) will be shared soon, implemented using the hwloc library rather than `sysfs`.
I hope that with the distro hwloc packages it will also handle the case where `Neoverse N1 can disable the use of SLC as LLC (a BIOS setting)` and SLC is not used as LLC.
>
> Thanks
>
> --wathsala
* [RFC 2/2] eal/lcore: add llc aware for each macro
2024-08-27 15:10 [RFC 0/2] introduce LLC aware functions Vipin Varghese
2024-08-27 15:10 ` [RFC 1/2] eal: add llc " Vipin Varghese
@ 2024-08-27 15:10 ` Vipin Varghese
2024-08-27 21:23 ` [RFC 0/2] introduce LLC aware functions Mattias Rönnblom
` (2 subsequent siblings)
4 siblings, 0 replies; 56+ messages in thread
From: Vipin Varghese @ 2024-08-27 15:10 UTC (permalink / raw)
To: ferruh.yigit, dev
Add RTE_LCORE_FOREACH macros for DPDK lcores sharing the Last Level Cache.
For core complexes with a shared LLC, the macros iterate over the lcores
in the same LLC. For cores within a single LLC, the macros iterate over
all available lcores.
Macros added:
- RTE_LCORE_FOREACH_LLC_FIRST
- RTE_LCORE_FOREACH_LLC_FIRST_WORKER
- RTE_LCORE_FOREACH_LLC_WORKER
- RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER
- RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER
- RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER
Signed-off-by: Vipin Varghese <vipin.varghese@amd.com>
---
lib/eal/include/rte_lcore.h | 89 +++++++++++++++++++++++++++++++++++++
1 file changed, 89 insertions(+)
diff --git a/lib/eal/include/rte_lcore.h b/lib/eal/include/rte_lcore.h
index 7deae47af3..7c1a240bde 100644
--- a/lib/eal/include/rte_lcore.h
+++ b/lib/eal/include/rte_lcore.h
@@ -18,6 +18,7 @@
#include <rte_eal.h>
#include <rte_launch.h>
#include <rte_thread.h>
+#include <rte_os.h>
#ifdef __cplusplus
extern "C" {
@@ -196,6 +197,21 @@ rte_cpuset_t rte_lcore_cpuset(unsigned int lcore_id);
*/
int rte_lcore_is_enabled(unsigned int lcore_id);
+/**
+ * Get the next enabled lcore ID within same llc.
+ *
+ * @param i
+ * The current lcore (reference).
+ * @param skip_main
+ * If true, do not return the ID of the main lcore.
+ * @param wrap
+ * If true, go back to 0 when RTE_MAX_LCORE is reached; otherwise,
+ * return RTE_MAX_LCORE.
+ * @return
+ * The next lcore_id or RTE_MAX_LCORE if not found.
+ */
+unsigned int rte_get_next_llc_lcore(unsigned int i, int skip_main, int wrap);
+
/**
* Get the next enabled lcore ID.
*
@@ -211,6 +227,11 @@ int rte_lcore_is_enabled(unsigned int lcore_id);
*/
unsigned int rte_get_next_lcore(unsigned int i, int skip_main, int wrap);
+unsigned int rte_get_llc_lcore (unsigned int i, rte_cpuset_t *llc_cpu, unsigned int *start, unsigned int *end);
+unsigned int rte_get_llc_first_lcores (rte_cpuset_t *llc_cpu);
+unsigned int rte_get_llc_n_lcore (unsigned int i, rte_cpuset_t *llc_cpu, unsigned int *start, unsigned int *end, unsigned int n, bool skip);
+
+
/**
* Macro to browse all running lcores.
*/
@@ -219,6 +240,7 @@ unsigned int rte_get_next_lcore(unsigned int i, int skip_main, int wrap);
i < RTE_MAX_LCORE; \
i = rte_get_next_lcore(i, 0, 0))
+
/**
* Macro to browse all running lcores except the main lcore.
*/
@@ -227,6 +249,73 @@ unsigned int rte_get_next_lcore(unsigned int i, int skip_main, int wrap);
i < RTE_MAX_LCORE; \
i = rte_get_next_lcore(i, 1, 0))
+/** Browse all the cores in the provided llc domain **/
+
+#define RTE_LCORE_FOREACH_LLC_FIRST(i) \
+ rte_cpuset_t llc_foreach_first_lcores; \
+ CPU_ZERO(&llc_foreach_first_lcores); i = 0; \
+ unsigned int llc_foreach_num_iter = rte_get_llc_first_lcores(&llc_foreach_first_lcores); \
+ i = (0 == llc_foreach_num_iter) ? RTE_MAX_LCORE : i; \
+ for (; i < RTE_MAX_LCORE; i++) \
+ if (CPU_ISSET(i, &llc_foreach_first_lcores))
+
+#define RTE_LCORE_FOREACH_LLC_FIRST_WORKER(i) \
+ rte_cpuset_t llc_foreach_first_lcores; \
+ CPU_ZERO(&llc_foreach_first_lcores); i = 0; \
+ unsigned int llc_foreach_num_iter = rte_get_llc_first_lcores(&llc_foreach_first_lcores); \
+ CPU_CLR(rte_get_main_lcore(), &llc_foreach_first_lcores); \
+ i = (0 == llc_foreach_num_iter) ? RTE_MAX_LCORE : i; \
+ for (; i < RTE_MAX_LCORE; i++) \
+ if (CPU_ISSET(i, &llc_foreach_first_lcores))
+
+#define RTE_LCORE_FOREACH_LLC_WORKER(i) \
+ rte_cpuset_t llc_foreach_first_lcores; \
+ rte_cpuset_t llc_foreach_lcore; \
+ unsigned int start,end; \
+ CPU_ZERO(&llc_foreach_first_lcores); i = 0; \
+ unsigned int llc_foreach_num_iter = rte_get_llc_first_lcores(&llc_foreach_first_lcores); \
+ i = (0 == llc_foreach_num_iter) ? RTE_MAX_LCORE : i; \
+ for (unsigned int llc_i = i; llc_i < RTE_MAX_LCORE; llc_i++) \
+ if (CPU_ISSET(llc_i, &llc_foreach_first_lcores) && rte_get_llc_lcore (llc_i, &llc_foreach_lcore, &start, &end)) \
+ for (i = start; (i <= end); i++) \
+ if (CPU_ISSET(i, &llc_foreach_lcore) && (i != rte_get_main_lcore()))
+
+#define RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER(i) \
+ rte_cpuset_t llc_foreach_first_lcores; \
+ rte_cpuset_t llc_foreach_lcore; \
+ unsigned int start,end; \
+ CPU_ZERO(&llc_foreach_first_lcores); i = 0; \
+ unsigned int llc_foreach_num_iter = rte_get_llc_first_lcores(&llc_foreach_first_lcores); \
+ i = (0 == llc_foreach_num_iter) ? RTE_MAX_LCORE : i; \
+ for (unsigned int llc_i = i; llc_i < RTE_MAX_LCORE; llc_i++) \
+ if (CPU_ISSET(llc_i, &llc_foreach_first_lcores) && rte_get_llc_lcore (llc_i, &llc_foreach_lcore, &start, &end)) \
+ for (i = start + 1; (i <= end); i++) \
+ if (CPU_ISSET(i, &llc_foreach_lcore) && (i != rte_get_main_lcore()))
+
+#define RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER(i,n) \
+ rte_cpuset_t llc_foreach_first_lcores; \
+ rte_cpuset_t llc_foreach_lcore; \
+ unsigned int start,end, temp_count; \
+ CPU_ZERO(&llc_foreach_first_lcores); \
+ unsigned int llc_foreach_num_iter = rte_get_llc_first_lcores(&llc_foreach_first_lcores); \
+ i = (0 == llc_foreach_num_iter) ? RTE_MAX_LCORE : 0; \
+ for (unsigned int llc_i = i; llc_i < RTE_MAX_LCORE; llc_i++) \
+ if (CPU_ISSET(llc_i, &llc_foreach_first_lcores) && (rte_get_llc_n_lcore (llc_i, &llc_foreach_lcore, &start, &end, n, false) >= n)) \
+ for (i = start, temp_count = n; (i <= end) && (temp_count); i++) \
+ if (CPU_ISSET(i, &llc_foreach_lcore) && (i != rte_get_main_lcore()) && (temp_count--))
+
+#define RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER(i,n) \
+ rte_cpuset_t llc_foreach_skip_first_lcores; \
+ rte_cpuset_t llc_foreach_skip_lcore; \
+ unsigned int start_skip,end_skip,llc_skip_i; \
+ CPU_ZERO(&llc_foreach_skip_first_lcores); \
+ unsigned int llc_foreach_skip_num_iter = rte_get_llc_first_lcores(&llc_foreach_skip_first_lcores); \
+ i = (0 == llc_foreach_skip_num_iter) ? RTE_MAX_LCORE : 0; \
+ for (llc_skip_i = i; llc_skip_i < RTE_MAX_LCORE; llc_skip_i++) \
+ if (CPU_ISSET(llc_skip_i, &llc_foreach_skip_first_lcores) && (rte_get_llc_n_lcore (llc_skip_i, &llc_foreach_skip_lcore, &start_skip, &end_skip, n, true) > 0)) \
+ for (i = start_skip; (i <= end_skip); i++) \
+ if (CPU_ISSET(i, &llc_foreach_skip_lcore) && (i != rte_get_main_lcore()))
+
/**
* Callback prototype for initializing lcores.
*
--
2.34.1
* Re: [RFC 0/2] introduce LLC aware functions
2024-08-27 15:10 [RFC 0/2] introduce LLC aware functions Vipin Varghese
2024-08-27 15:10 ` [RFC 1/2] eal: add llc " Vipin Varghese
2024-08-27 15:10 ` [RFC 2/2] eal/lcore: add llc aware for each macro Vipin Varghese
@ 2024-08-27 21:23 ` Mattias Rönnblom
2024-09-02 0:39 ` Varghese, Vipin
2024-08-28 8:38 ` Burakov, Anatoly
2024-10-07 21:28 ` Stephen Hemminger
4 siblings, 1 reply; 56+ messages in thread
From: Mattias Rönnblom @ 2024-08-27 21:23 UTC (permalink / raw)
To: Vipin Varghese, ferruh.yigit, dev
On 2024-08-27 17:10, Vipin Varghese wrote:
> As core density continues to increase, chiplet-based
> core packing has become a key trend. In AMD SoC EPYC
> architectures, core complexes within the same chiplet
> share a Last-Level Cache (LLC). By packing logical cores
> within the same LLC, we can enhance pipeline processing
> stages due to reduced latency and improved data locality.
>
> To leverage these benefits, DPDK libraries and examples
> can utilize localized lcores. This approach ensures more
> consistent latencies by minimizing the dispersion of lcores
> across different chiplet complexes and enhances packet
> processing by ensuring that data for subsequent pipeline
> stages is likely to reside within the LLC.
>
We shouldn't have a separate CPU/cache hierarchy API instead?
Could potentially be built on the 'hwloc' library.
I much agree cache/core topology may be of interest of the application
(or a work scheduler, like a DPDK event device), but it's not limited to
LLC. It may well be worthwhile to care about which cores shares L2
cache, for example. Not sure the RTE_LCORE_FOREACH_* approach scales.
> < Function: Purpose >
> ---------------------
> - rte_get_llc_first_lcores: Retrieves all the first lcores in the shared LLC.
> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
> - rte_get_llc_n_lcore: Retrieves the first n or skips the first n lcores in the shared LLC.
>
> < MACRO: Purpose >
> ------------------
> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from each LLC.
> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first worker lcore from each LLC.
> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on hint (lcore id).
> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from LLC while skipping first worker.
> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n` lcores from each LLC.
> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then iterates through reaming lcores in each LLC.
>
> Vipin Varghese (2):
> eal: add llc aware functions
> eal/lcore: add llc aware for each macro
>
> lib/eal/common/eal_common_lcore.c | 279 ++++++++++++++++++++++++++++--
> lib/eal/include/rte_lcore.h | 89 ++++++++++
> 2 files changed, 356 insertions(+), 12 deletions(-)
>
* Re: [RFC 0/2] introduce LLC aware functions
2024-08-27 21:23 ` [RFC 0/2] introduce LLC aware functions Mattias Rönnblom
@ 2024-09-02 0:39 ` Varghese, Vipin
2024-09-04 9:30 ` Mattias Rönnblom
0 siblings, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-02 0:39 UTC (permalink / raw)
To: Mattias Rönnblom, ferruh.yigit, dev
<snipped>
Thank you Mattias for the comments and questions; please let me try to
explain below.
> We shouldn't have a separate CPU/cache hierarchy API instead?
Based on the intention to bring in CPU lcores which share the same L3 (for
better cache hits and a less noisy neighbour), the current API focuses on
using the Last Level Cache. But if the suggestion is `there are SoCs where
the L2 cache is also shared, and the new API should provision for that`, I
am also comfortable with the thought.
>
> Could potentially be built on the 'hwloc' library.
There are 3 reasons we did not explore this path on AMD SoCs:
1. depending on the hwloc version and kernel version, certain SoC
hierarchies are not available
2. CPU NUMA and IO (memory & PCIe) NUMA are independent on AMD EPYC SoCs.
3. it adds an extra library dependency layer that has to be made
available to work.
Hence we have tried to use the Linux documented generic layer of sysfs
CPU cache.
I will try to explore hwloc more and check if other libraries within
DPDK leverage the same.
>
> I much agree cache/core topology may be of interest of the application
> (or a work scheduler, like a DPDK event device), but it's not limited to
> LLC. It may well be worthwhile to care about which cores shares L2
> cache, for example. Not sure the RTE_LCORE_FOREACH_* approach scales.
Yes, totally understood; on some SoCs multiple lcores share the same L2 cache.
Can we rework the API to be rte_get_cache_<function>, where the user
argument is the desired cache-level index? A rough sketch of that shape
follows the list.
1. index-1: SMT threads
2. index-2: threads sharing the same L2 cache
3. index-3: threads sharing the same L3 cache
4. index-MAX: threads sharing the last level cache
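(All names below are illustrative only, not part of the RFC.)

#include <rte_lcore.h>

/* Illustrative only: one possible shape for the reworked API. */
enum rte_lcore_cache_level {
	RTE_LCORE_CACHE_L1 = 1,    /* SMT siblings                        */
	RTE_LCORE_CACHE_L2 = 2,    /* lcores sharing the same L2          */
	RTE_LCORE_CACHE_L3 = 3,    /* lcores sharing the same L3          */
	RTE_LCORE_CACHE_MAX = 255, /* lcores sharing the last level cache */
};

/* Fill 'set' with the enabled lcores that share the selected cache
 * level with 'lcore'; return the number of lcores placed in the set. */
unsigned int rte_get_cache_lcores(unsigned int lcore,
				  enum rte_lcore_cache_level level,
				  rte_cpuset_t *set);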
>
>> < Function: Purpose >
>> ---------------------
>> - rte_get_llc_first_lcores: Retrieves all the first lcores in the
>> shared LLC.
>> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
>> - rte_get_llc_n_lcore: Retrieves the first n or skips the first n
>> lcores in the shared LLC.
>>
>> < MACRO: Purpose >
>> ------------------
>> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from
>> each LLC.
>> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first worker
>> lcore from each LLC.
>> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on hint
>> (lcore id).
>> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from LLC
>> while skipping first worker.
>> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n` lcores
>> from each LLC.
>> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then
>> iterates through reaming lcores in each LLC.
>>
While the macros are simple wrappers invoking the appropriate API, can this
be worked out in this fashion?
<snipped>
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-02 0:39 ` Varghese, Vipin
@ 2024-09-04 9:30 ` Mattias Rönnblom
2024-09-04 14:37 ` Stephen Hemminger
2024-09-09 14:22 ` Varghese, Vipin
0 siblings, 2 replies; 56+ messages in thread
From: Mattias Rönnblom @ 2024-09-04 9:30 UTC (permalink / raw)
To: Varghese, Vipin, ferruh.yigit, dev
On 2024-09-02 02:39, Varghese, Vipin wrote:
> <snipped>
>
> Thank you Mattias for the comments and question, please let me try to
> explain the same below
>
>> We shouldn't have a separate CPU/cache hierarchy API instead?
>
> Based on the intention to bring in CPU lcores which share same L3 (for
> better cache hits and less noisy neighbor) current API focuses on using
>
> Last Level Cache. But if the suggestion is `there are SoC where L2 cache
> are also shared, and the new API should be provisioned`, I am also
>
> comfortable with the thought.
>
Rather than some AMD special case API hacked into <rte_lcore.h>, I think
we are better off with no DPDK API at all for this kind of functionality.
A DPDK CPU/memory hierarchy topology API very much makes sense, but it
should be reasonably generic and complete from the start.
>>
>> Could potentially be built on the 'hwloc' library.
>
> There are 3 reason on AMD SoC we did not explore this path, reasons are
>
> 1. depending n hwloc version and kernel version certain SoC hierarchies
> are not available
>
> 2. CPU NUMA and IO (memory & PCIe) NUMA are independent on AMD Epyc Soc.
>
> 3. adds the extra dependency layer of library layer to be made available
> to work.
>
>
> hence we have tried to use Linux Documented generic layer of `sysfs CPU
> cache`.
>
> I will try to explore more on hwloc and check if other libraries within
> DPDK leverages the same.
>
>>
>> I much agree cache/core topology may be of interest of the application
>> (or a work scheduler, like a DPDK event device), but it's not limited to
>> LLC. It may well be worthwhile to care about which cores shares L2
>> cache, for example. Not sure the RTE_LCORE_FOREACH_* approach scales.
>
> yes, totally understand as some SoC, multiple lcores shares same L2 cache.
>
>
> Can we rework the API to be rte_get_cache_<function> where user argument
> is desired lcore index.
>
> 1. index-1: SMT threads
>
> 2. index-2: threads sharing same L2 cache
>
> 3. index-3: threads sharing same L3 cache
>
> 4. index-MAX: identify the threads sharing last level cache.
>
>>
>>> < Function: Purpose >
>>> ---------------------
>>> - rte_get_llc_first_lcores: Retrieves all the first lcores in the
>>> shared LLC.
>>> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
>>> - rte_get_llc_n_lcore: Retrieves the first n or skips the first n
>>> lcores in the shared LLC.
>>>
>>> < MACRO: Purpose >
>>> ------------------
>>> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from
>>> each LLC.
>>> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first worker
>>> lcore from each LLC.
>>> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on hint
>>> (lcore id).
>>> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from LLC
>>> while skipping first worker.
>>> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n` lcores
>>> from each LLC.
>>> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then
>>> iterates through reaming lcores in each LLC.
>>>
> While the MACRO are simple wrapper invoking appropriate API. can this be
> worked out in this fashion?
>
> <snipped>
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-04 9:30 ` Mattias Rönnblom
@ 2024-09-04 14:37 ` Stephen Hemminger
2024-09-11 3:13 ` Varghese, Vipin
2024-09-09 14:22 ` Varghese, Vipin
1 sibling, 1 reply; 56+ messages in thread
From: Stephen Hemminger @ 2024-09-04 14:37 UTC (permalink / raw)
To: Mattias Rönnblom; +Cc: Varghese, Vipin, ferruh.yigit, dev
On Wed, 4 Sep 2024 11:30:59 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> On 2024-09-02 02:39, Varghese, Vipin wrote:
> > <snipped>
> >
> > Thank you Mattias for the comments and question, please let me try to
> > explain the same below
> >
> >> We shouldn't have a separate CPU/cache hierarchy API instead?
> >
> > Based on the intention to bring in CPU lcores which share same L3 (for
> > better cache hits and less noisy neighbor) current API focuses on using
> >
> > Last Level Cache. But if the suggestion is `there are SoC where L2 cache
> > are also shared, and the new API should be provisioned`, I am also
> >
> > comfortable with the thought.
> >
>
> Rather than some AMD special case API hacked into <rte_lcore.h>, I think
> we are better off with no DPDK API at all for this kind of functionality.
>
> A DPDK CPU/memory hierarchy topology API very much makes sense, but it
> should be reasonably generic and complete from the start.
Agreed. This is one of those cases where the existing hwloc project, which
is part of Open MPI, is more complete and well supported. It supports
multiple OSes and can deal with more quirks.
https://github.com/open-mpi/hwloc
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-04 14:37 ` Stephen Hemminger
@ 2024-09-11 3:13 ` Varghese, Vipin
2024-09-11 3:53 ` Stephen Hemminger
0 siblings, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-11 3:13 UTC (permalink / raw)
To: Stephen Hemminger, Mattias Rönnblom; +Cc: Yigit, Ferruh, dev
<snipped>
> > >
> > > Thank you Mattias for the comments and question, please let me try
> > > to explain the same below
> > >
> > >> We shouldn't have a separate CPU/cache hierarchy API instead?
> > >
> > > Based on the intention to bring in CPU lcores which share same L3
> > > (for better cache hits and less noisy neighbor) current API focuses
> > > on using
> > >
> > > Last Level Cache. But if the suggestion is `there are SoC where L2
> > > cache are also shared, and the new API should be provisioned`, I am
> > > also
> > >
> > > comfortable with the thought.
> > >
> >
> > Rather than some AMD special case API hacked into <rte_lcore.h>, I
> > think we are better off with no DPDK API at all for this kind of functionality.
> >
> > A DPDK CPU/memory hierarchy topology API very much makes sense, but it
> > should be reasonably generic and complete from the start.
>
> Agreed. This one of those cases where the existing project hwloc which is part
> of open-mpi is more complete and well supported. It supports multiple OS's
> and can deal with more quirks.
Thank you Stephen for the inputs. Last year, when we checked the distro hwloc packages, there were anomalies in NUMA and physical socket identification on AMD EPYC SoCs.
I will recheck the distro versions of hwloc; if these work out fine I will rework this with the hwloc library, making it OS independent too.
>
> https://github.com/open-mpi/hwloc
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-11 3:13 ` Varghese, Vipin
@ 2024-09-11 3:53 ` Stephen Hemminger
2024-09-12 1:11 ` Varghese, Vipin
0 siblings, 1 reply; 56+ messages in thread
From: Stephen Hemminger @ 2024-09-11 3:53 UTC (permalink / raw)
To: Varghese, Vipin; +Cc: Mattias Rönnblom, Yigit, Ferruh, dev
On Wed, 11 Sep 2024 03:13:14 +0000
"Varghese, Vipin" <Vipin.Varghese@amd.com> wrote:
> > Agreed. This one of those cases where the existing project hwloc which is part
> > of open-mpi is more complete and well supported. It supports multiple OS's
> > and can deal with more quirks.
>
> Thank you Stephen for the inputs, last year when checked hwloc for distros there were anomalies for NUMA and Physical socket Identification on AMD EPYC Soc.
> I will recheck the distros version of hwloc, if these work out fine I will re-work with hwloc libraries making it OS independent too.
DPDK doesn't exist to resolve problems with upstreaming hardware support
in other packages. If DPDK supports something only because it is harder, slower, or more painful
to deal with in another project, then you create long-term technical debt.
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-11 3:53 ` Stephen Hemminger
@ 2024-09-12 1:11 ` Varghese, Vipin
0 siblings, 0 replies; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-12 1:11 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Mattias Rönnblom, Yigit, Ferruh, dev
<snipped>
>
> On Wed, 11 Sep 2024 03:13:14 +0000
> "Varghese, Vipin" <Vipin.Varghese@amd.com> wrote:
>
> > > Agreed. This one of those cases where the existing project hwloc
> > > which is part of open-mpi is more complete and well supported. It
> > > supports multiple OS's and can deal with more quirks.
> >
> > Thank you Stephen for the inputs, last year when checked hwloc for distros
> there were anomalies for NUMA and Physical socket Identification on AMD
> EPYC Soc.
> > I will recheck the distros version of hwloc, if these work out fine I will re-
> work with hwloc libraries making it OS independent too.
>
> DPDK doesn't exist to resolve problems with upstreaming hardware support in
> other packages.
Stephen, thank you for your comment. You asked in an earlier request to try using the hwloc library.
I mentioned that, at least until last year, popular distros were still shipping an older version of hwloc; I will recheck whether that has changed.
I assumed this would mitigate the `sysfs` dependency and bring in `OS` portability.
> If DPDK does supports something only because it is harder, slower, more painful
> to deal with another project; then you create long term technical debt.
Now I am really confused: for any application or end user that wants to use the right set of lcores for performance, DPDK provides abstraction and ease of use.
Maybe I am misreading the above idea, so let me back off a bit and share version 2 of the RFC.
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-04 9:30 ` Mattias Rönnblom
2024-09-04 14:37 ` Stephen Hemminger
@ 2024-09-09 14:22 ` Varghese, Vipin
2024-09-09 14:52 ` Mattias Rönnblom
1 sibling, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-09 14:22 UTC (permalink / raw)
To: Mattias Rönnblom, Yigit, Ferruh, dev
<snipped>
> > <snipped>
> >
> > Thank you Mattias for the comments and question, please let me try to
> > explain the same below
> >
> >> We shouldn't have a separate CPU/cache hierarchy API instead?
> >
> > Based on the intention to bring in CPU lcores which share same L3 (for
> > better cache hits and less noisy neighbor) current API focuses on
> > using
> >
> > Last Level Cache. But if the suggestion is `there are SoC where L2
> > cache are also shared, and the new API should be provisioned`, I am
> > also
> >
> > comfortable with the thought.
> >
>
> Rather than some AMD special case API hacked into <rte_lcore.h>, I think we
> are better off with no DPDK API at all for this kind of functionality.
Hi Mattias, as shared in the earlier email thread, this is not an AMD special case at all. Let me try to explain this one more time. One of the techniques used to increase core count cost-effectively is to go for tiles of compute complexes.
This introduces a bunch of cores sharing the same Last Level Cache (namely L2, L3 or even L4), depending on the cache topology architecture.
The API suggested in the RFC is to help end users selectively use cores under the same Last Level Cache hierarchy, as advertised by the OS (irrespective of the BIOS settings used). This is useful in both bare-metal and container environments.
As shared in the response to the cover letter, +1 to expand it to more than just LLC cores. We have also confirmed the same in https://patchwork.dpdk.org/project/dpdk/cover/20240827151014.201-1-vipin.varghese@amd.com/
>
> A DPDK CPU/memory hierarchy topology API very much makes sense, but it
> should be reasonably generic and complete from the start.
>
> >>
> >> Could potentially be built on the 'hwloc' library.
> >
> > There are 3 reason on AMD SoC we did not explore this path, reasons
> > are
> >
> > 1. depending n hwloc version and kernel version certain SoC
> > hierarchies are not available
> >
> > 2. CPU NUMA and IO (memory & PCIe) NUMA are independent on AMD
> Epyc Soc.
> >
> > 3. adds the extra dependency layer of library layer to be made
> > available to work.
> >
> >
> > hence we have tried to use Linux Documented generic layer of `sysfs
> > CPU cache`.
> >
> > I will try to explore more on hwloc and check if other libraries
> > within DPDK leverages the same.
> >
> >>
> >> I much agree cache/core topology may be of interest of the
> >> application (or a work scheduler, like a DPDK event device), but it's
> >> not limited to LLC. It may well be worthwhile to care about which
> >> cores shares L2 cache, for example. Not sure the RTE_LCORE_FOREACH_*
> approach scales.
> >
> > yes, totally understand as some SoC, multiple lcores shares same L2 cache.
> >
> >
> > Can we rework the API to be rte_get_cache_<function> where user
> > argument is desired lcore index.
> >
> > 1. index-1: SMT threads
> >
> > 2. index-2: threads sharing same L2 cache
> >
> > 3. index-3: threads sharing same L3 cache
> >
> > 4. index-MAX: identify the threads sharing last level cache.
> >
> >>
> >>> < Function: Purpose >
> >>> ---------------------
> >>> - rte_get_llc_first_lcores: Retrieves all the first lcores in the
> >>> shared LLC.
> >>> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
> >>> - rte_get_llc_n_lcore: Retrieves the first n or skips the first n
> >>> lcores in the shared LLC.
> >>>
> >>> < MACRO: Purpose >
> >>> ------------------
> >>> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from
> >>> each LLC.
> >>> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first
> >>> worker lcore from each LLC.
> >>> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on
> hint
> >>> (lcore id).
> >>> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from LLC
> >>> while skipping first worker.
> >>> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n` lcores
> >>> from each LLC.
> >>> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then
> >>> iterates through reaming lcores in each LLC.
> >>>
> > While the MACRO are simple wrapper invoking appropriate API. can this
> > be worked out in this fashion?
> >
> > <snipped>
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-09 14:22 ` Varghese, Vipin
@ 2024-09-09 14:52 ` Mattias Rönnblom
2024-09-11 3:26 ` Varghese, Vipin
0 siblings, 1 reply; 56+ messages in thread
From: Mattias Rönnblom @ 2024-09-09 14:52 UTC (permalink / raw)
To: Varghese, Vipin, Yigit, Ferruh, dev
On 2024-09-09 16:22, Varghese, Vipin wrote:
>
> <snipped>
>
>>> <snipped>
>>>
>>> Thank you Mattias for the comments and question, please let me try to
>>> explain the same below
>>>
>>>> We shouldn't have a separate CPU/cache hierarchy API instead?
>>>
>>> Based on the intention to bring in CPU lcores which share same L3 (for
>>> better cache hits and less noisy neighbor) current API focuses on
>>> using
>>>
>>> Last Level Cache. But if the suggestion is `there are SoC where L2
>>> cache are also shared, and the new API should be provisioned`, I am
>>> also
>>>
>>> comfortable with the thought.
>>>
>>
>> Rather than some AMD special case API hacked into <rte_lcore.h>, I think we
>> are better off with no DPDK API at all for this kind of functionality.
>
> Hi Mattias, as shared in the earlier email thread, this is not a AMD special case at all. Let me try to explain this one more time. One of techniques used to increase cores cost effective way to go for tiles of compute complexes.
> This introduces a bunch of cores in sharing same Last Level Cache (namely L2, L3 or even L4) depending upon cache topology architecture.
>
> The API suggested in RFC is to help end users to selectively use cores under same Last Level Cache Hierarchy as advertised by OS (irrespective of the BIOS settings used). This is useful in both bare-metal and container environment.
>
I'm pretty familiar with AMD CPUs and the use of tiles (including the
challenges these kinds of non-uniformities pose for work scheduling).
To maximize performance, caring about core<->LLC relationship may well
not be enough, and more HT/core/cache/memory topology information is
required. That's what I meant by special case. A proper API should allow
access to information about which lcores are SMT siblings, cores on the
same L2, and cores on the same L3, to name a few things. Probably you
want to fit NUMA into the same API as well, although that is available
already in <rte_lcore.h>.
One can have a look at how scheduling domains work in the Linux kernel.
They model this kind of thing.
> As shared in response for cover letter +1 to expand it to more than just LLC cores. We have also confirmed the same to https://patchwork.dpdk.org/project/dpdk/cover/20240827151014.201-1-vipin.varghese@amd.com/
>
>>
>> A DPDK CPU/memory hierarchy topology API very much makes sense, but it
>> should be reasonably generic and complete from the start.
>>
>>>>
>>>> Could potentially be built on the 'hwloc' library.
>>>
>>> There are 3 reason on AMD SoC we did not explore this path, reasons
>>> are
>>>
>>> 1. depending n hwloc version and kernel version certain SoC
>>> hierarchies are not available
>>>
>>> 2. CPU NUMA and IO (memory & PCIe) NUMA are independent on AMD
>> Epyc Soc.
>>>
>>> 3. adds the extra dependency layer of library layer to be made
>>> available to work.
>>>
>>>
>>> hence we have tried to use Linux Documented generic layer of `sysfs
>>> CPU cache`.
>>>
>>> I will try to explore more on hwloc and check if other libraries
>>> within DPDK leverages the same.
>>>
>>>>
>>>> I much agree cache/core topology may be of interest of the
>>>> application (or a work scheduler, like a DPDK event device), but it's
>>>> not limited to LLC. It may well be worthwhile to care about which
>>>> cores shares L2 cache, for example. Not sure the RTE_LCORE_FOREACH_*
>> approach scales.
>>>
>>> yes, totally understand as some SoC, multiple lcores shares same L2 cache.
>>>
>>>
>>> Can we rework the API to be rte_get_cache_<function> where user
>>> argument is desired lcore index.
>>>
>>> 1. index-1: SMT threads
>>>
>>> 2. index-2: threads sharing same L2 cache
>>>
>>> 3. index-3: threads sharing same L3 cache
>>>
>>> 4. index-MAX: identify the threads sharing last level cache.
>>>
>>>>
>>>>> < Function: Purpose >
>>>>> ---------------------
>>>>> - rte_get_llc_first_lcores: Retrieves all the first lcores in the
>>>>> shared LLC.
>>>>> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
>>>>> - rte_get_llc_n_lcore: Retrieves the first n or skips the first n
>>>>> lcores in the shared LLC.
>>>>>
>>>>> < MACRO: Purpose >
>>>>> ------------------
>>>>> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from
>>>>> each LLC.
>>>>> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first
>>>>> worker lcore from each LLC.
>>>>> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on
>> hint
>>>>> (lcore id).
>>>>> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from LLC
>>>>> while skipping first worker.
>>>>> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n` lcores
>>>>> from each LLC.
>>>>> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then
>>>>> iterates through reaming lcores in each LLC.
>>>>>
>>> While the MACRO are simple wrapper invoking appropriate API. can this
>>> be worked out in this fashion?
>>>
>>> <snipped>
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-09 14:52 ` Mattias Rönnblom
@ 2024-09-11 3:26 ` Varghese, Vipin
2024-09-11 15:55 ` Mattias Rönnblom
2024-09-11 16:01 ` Bruce Richardson
0 siblings, 2 replies; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-11 3:26 UTC (permalink / raw)
To: Mattias Rönnblom, Yigit, Ferruh, dev
[AMD Official Use Only - AMD Internal Distribution Only]
<snipped>
>
> On 2024-09-09 16:22, Varghese, Vipin wrote:
> > [AMD Official Use Only - AMD Internal Distribution Only]
> >
> > <snipped>
> >
> >>> <snipped>
> >>>
> >>> Thank you Mattias for the comments and question, please let me try
> >>> to explain the same below
> >>>
> >>>> We shouldn't have a separate CPU/cache hierarchy API instead?
> >>>
> >>> Based on the intention to bring in CPU lcores which share same L3
> >>> (for better cache hits and less noisy neighbor) current API focuses
> >>> on using
> >>>
> >>> Last Level Cache. But if the suggestion is `there are SoC where L2
> >>> cache are also shared, and the new API should be provisioned`, I am
> >>> also
> >>>
> >>> comfortable with the thought.
> >>>
> >>
> >> Rather than some AMD special case API hacked into <rte_lcore.h>, I
> >> think we are better off with no DPDK API at all for this kind of functionality.
> >
> > Hi Mattias, as shared in the earlier email thread, this is not a AMD special
> case at all. Let me try to explain this one more time. One of techniques used to
> increase cores cost effective way to go for tiles of compute complexes.
> > This introduces a bunch of cores in sharing same Last Level Cache (namely
> L2, L3 or even L4) depending upon cache topology architecture.
> >
> > The API suggested in RFC is to help end users to selectively use cores under
> same Last Level Cache Hierarchy as advertised by OS (irrespective of the BIOS
> settings used). This is useful in both bare-metal and container environment.
> >
>
> I'm pretty familiar with AMD CPUs and the use of tiles (including the
> challenges these kinds of non-uniformities pose for work scheduling).
>
> To maximize performance, caring about core<->LLC relationship may well not
> be enough, and more HT/core/cache/memory topology information is
> required. That's what I meant by special case. A proper API should allow
> access to information about which lcores are SMT siblings, cores on the same
> L2, and cores on the same L3, to name a few things. Probably you want to fit
> NUMA into the same API as well, although that is available already in
> <rte_lcore.h>.
Thank you Mattias for the information. As shared in the reply to Anatoly, we want to expose a new API `rte_get_next_lcore_ex` which takes an extra argument `u32 flags`.
The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2, RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED, RTE_GET_LCORE_BOOST_DISABLED.
This is AMD EPYC SoC agnostic and tries to address all generic cases.
Please do let us know if we (Ferruh and myself) can sync up via a call.
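A minimal usage sketch of the proposed iterator is given below. The four-argument signature and the RTE_GET_LCORE_* flag names are only the proposal under discussion in this thread, not an existing DPDK API, and worker_fn is a placeholder application function:
/*
 * Hypothetical sketch: launch a worker on every enabled lcore that the
 * proposed flag-based iterator reports as being in the same L3 domain.
 * The signature and flag names are assumptions based on this proposal.
 */
static int worker_fn(void *arg);	/* application-defined */

static void
launch_l3_domain_workers(void)
{
	unsigned int lcore;

	for (lcore = rte_get_next_lcore_ex(-1, 1 /* skip_main */, 0 /* wrap */,
					   RTE_GET_LCORE_L3);
	     lcore < RTE_MAX_LCORE;
	     lcore = rte_get_next_lcore_ex(lcore, 1, 0, RTE_GET_LCORE_L3))
		rte_eal_remote_launch(worker_fn, NULL, lcore);
}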
>
> One can have a look at how scheduling domains work in the Linux kernel.
> They model this kind of thing.
>
> > As shared in response for cover letter +1 to expand it to more than
> > just LLC cores. We have also confirmed the same to
> > https://patchwork.dpdk.org/project/dpdk/cover/20240827151014.201-
> 1-vip
> > in.varghese@amd.com/
> >
> >>
> >> A DPDK CPU/memory hierarchy topology API very much makes sense, but
> >> it should be reasonably generic and complete from the start.
> >>
> >>>>
> >>>> Could potentially be built on the 'hwloc' library.
> >>>
> >>> There are 3 reason on AMD SoC we did not explore this path, reasons
> >>> are
> >>>
> >>> 1. depending n hwloc version and kernel version certain SoC
> >>> hierarchies are not available
> >>>
> >>> 2. CPU NUMA and IO (memory & PCIe) NUMA are independent on AMD
> >> Epyc Soc.
> >>>
> >>> 3. adds the extra dependency layer of library layer to be made
> >>> available to work.
> >>>
> >>>
> >>> hence we have tried to use Linux Documented generic layer of `sysfs
> >>> CPU cache`.
> >>>
> >>> I will try to explore more on hwloc and check if other libraries
> >>> within DPDK leverages the same.
> >>>
> >>>>
> >>>> I much agree cache/core topology may be of interest of the
> >>>> application (or a work scheduler, like a DPDK event device), but
> >>>> it's not limited to LLC. It may well be worthwhile to care about
> >>>> which cores shares L2 cache, for example. Not sure the
> >>>> RTE_LCORE_FOREACH_*
> >> approach scales.
> >>>
> >>> yes, totally understand as some SoC, multiple lcores shares same L2 cache.
> >>>
> >>>
> >>> Can we rework the API to be rte_get_cache_<function> where user
> >>> argument is desired lcore index.
> >>>
> >>> 1. index-1: SMT threads
> >>>
> >>> 2. index-2: threads sharing same L2 cache
> >>>
> >>> 3. index-3: threads sharing same L3 cache
> >>>
> >>> 4. index-MAX: identify the threads sharing last level cache.
> >>>
> >>>>
> >>>>> < Function: Purpose >
> >>>>> ---------------------
> >>>>> - rte_get_llc_first_lcores: Retrieves all the first lcores in
> >>>>> the shared LLC.
> >>>>> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
> >>>>> - rte_get_llc_n_lcore: Retrieves the first n or skips the first
> >>>>> n lcores in the shared LLC.
> >>>>>
> >>>>> < MACRO: Purpose >
> >>>>> ------------------
> >>>>> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from
> >>>>> each LLC.
> >>>>> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first
> >>>>> worker lcore from each LLC.
> >>>>> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on
> >> hint
> >>>>> (lcore id).
> >>>>> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from
> LLC
> >>>>> while skipping first worker.
> >>>>> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n`
> lcores
> >>>>> from each LLC.
> >>>>> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then
> >>>>> iterates through reaming lcores in each LLC.
> >>>>>
> >>> While the MACRO are simple wrapper invoking appropriate API. can
> >>> this be worked out in this fashion?
> >>>
> >>> <snipped>
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-11 3:26 ` Varghese, Vipin
@ 2024-09-11 15:55 ` Mattias Rönnblom
2024-09-11 17:04 ` Honnappa Nagarahalli
2024-09-11 16:01 ` Bruce Richardson
1 sibling, 1 reply; 56+ messages in thread
From: Mattias Rönnblom @ 2024-09-11 15:55 UTC (permalink / raw)
To: Varghese, Vipin, Yigit, Ferruh, dev
On 2024-09-11 05:26, Varghese, Vipin wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> <snipped>
>
>>
>> On 2024-09-09 16:22, Varghese, Vipin wrote:
>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>
>>> <snipped>
>>>
>>>>> <snipped>
>>>>>
>>>>> Thank you Mattias for the comments and question, please let me try
>>>>> to explain the same below
>>>>>
>>>>>> We shouldn't have a separate CPU/cache hierarchy API instead?
>>>>>
>>>>> Based on the intention to bring in CPU lcores which share same L3
>>>>> (for better cache hits and less noisy neighbor) current API focuses
>>>>> on using
>>>>>
>>>>> Last Level Cache. But if the suggestion is `there are SoC where L2
>>>>> cache are also shared, and the new API should be provisioned`, I am
>>>>> also
>>>>>
>>>>> comfortable with the thought.
>>>>>
>>>>
>>>> Rather than some AMD special case API hacked into <rte_lcore.h>, I
>>>> think we are better off with no DPDK API at all for this kind of functionality.
>>>
>>> Hi Mattias, as shared in the earlier email thread, this is not a AMD special
>> case at all. Let me try to explain this one more time. One of techniques used to
>> increase cores cost effective way to go for tiles of compute complexes.
>>> This introduces a bunch of cores in sharing same Last Level Cache (namely
>> L2, L3 or even L4) depending upon cache topology architecture.
>>>
>>> The API suggested in RFC is to help end users to selectively use cores under
>> same Last Level Cache Hierarchy as advertised by OS (irrespective of the BIOS
>> settings used). This is useful in both bare-metal and container environment.
>>>
>>
>> I'm pretty familiar with AMD CPUs and the use of tiles (including the
>> challenges these kinds of non-uniformities pose for work scheduling).
>>
>> To maximize performance, caring about core<->LLC relationship may well not
>> be enough, and more HT/core/cache/memory topology information is
>> required. That's what I meant by special case. A proper API should allow
>> access to information about which lcores are SMT siblings, cores on the same
>> L2, and cores on the same L3, to name a few things. Probably you want to fit
>> NUMA into the same API as well, although that is available already in
>> <rte_lcore.h>.
>
> Thank you Mattias for the information, as shared by in the reply with Anatoly we want expose a new API `rte_get_next_lcore_ex` which intakes a extra argument `u32 flags`.
> The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2, RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED, RTE_GET_LCORE_BOOST_DISABLED.
>
Wouldn't that API be pretty awkward to use?
I mean, what you have is a topology, with nodes of different types and
with different properties, and you want to present it to the user.
In a sense, it's similar to XML and DOM versus SAX. The above is
SAX-style, and what I have in mind is something DOM-like.
What use case do you have in mind? What's on top of my list is a
scenario where a DPDK app gets a bunch of cores (e.g., -l <cores>) and
tries to figure out how best to make use of them. It's not going to "skip"
(ignore, leave unused) SMT siblings, or skip non-boosted cores; it would
just try to be clever in regards to which cores to use for what purpose.
> This is AMD EPYC SoC agnostic and trying to address for all generic cases.
>
> Please do let us know if we (Ferruh & myself) can sync up via call?
>
Sure, I can do that.
>>
>> One can have a look at how scheduling domains work in the Linux kernel.
>> They model this kind of thing.
>>
>>> As shared in response for cover letter +1 to expand it to more than
>>> just LLC cores. We have also confirmed the same to
>>> https://patchwork.dpdk.org/project/dpdk/cover/20240827151014.201-
>> 1-vip
>>> in.varghese@amd.com/
>>>
>>>>
>>>> A DPDK CPU/memory hierarchy topology API very much makes sense, but
>>>> it should be reasonably generic and complete from the start.
>>>>
>>>>>>
>>>>>> Could potentially be built on the 'hwloc' library.
>>>>>
>>>>> There are 3 reason on AMD SoC we did not explore this path, reasons
>>>>> are
>>>>>
>>>>> 1. depending n hwloc version and kernel version certain SoC
>>>>> hierarchies are not available
>>>>>
>>>>> 2. CPU NUMA and IO (memory & PCIe) NUMA are independent on AMD
>>>> Epyc Soc.
>>>>>
>>>>> 3. adds the extra dependency layer of library layer to be made
>>>>> available to work.
>>>>>
>>>>>
>>>>> hence we have tried to use Linux Documented generic layer of `sysfs
>>>>> CPU cache`.
>>>>>
>>>>> I will try to explore more on hwloc and check if other libraries
>>>>> within DPDK leverages the same.
>>>>>
>>>>>>
>>>>>> I much agree cache/core topology may be of interest of the
>>>>>> application (or a work scheduler, like a DPDK event device), but
>>>>>> it's not limited to LLC. It may well be worthwhile to care about
>>>>>> which cores shares L2 cache, for example. Not sure the
>>>>>> RTE_LCORE_FOREACH_*
>>>> approach scales.
>>>>>
>>>>> yes, totally understand as some SoC, multiple lcores shares same L2 cache.
>>>>>
>>>>>
>>>>> Can we rework the API to be rte_get_cache_<function> where user
>>>>> argument is desired lcore index.
>>>>>
>>>>> 1. index-1: SMT threads
>>>>>
>>>>> 2. index-2: threads sharing same L2 cache
>>>>>
>>>>> 3. index-3: threads sharing same L3 cache
>>>>>
>>>>> 4. index-MAX: identify the threads sharing last level cache.
>>>>>
>>>>>>
>>>>>>> < Function: Purpose >
>>>>>>> ---------------------
>>>>>>> - rte_get_llc_first_lcores: Retrieves all the first lcores in
>>>>>>> the shared LLC.
>>>>>>> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
>>>>>>> - rte_get_llc_n_lcore: Retrieves the first n or skips the first
>>>>>>> n lcores in the shared LLC.
>>>>>>>
>>>>>>> < MACRO: Purpose >
>>>>>>> ------------------
>>>>>>> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from
>>>>>>> each LLC.
>>>>>>> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first
>>>>>>> worker lcore from each LLC.
>>>>>>> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on
>>>> hint
>>>>>>> (lcore id).
>>>>>>> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from
>> LLC
>>>>>>> while skipping first worker.
>>>>>>> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n`
>> lcores
>>>>>>> from each LLC.
>>>>>>> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then
>>>>>>> iterates through reaming lcores in each LLC.
>>>>>>>
>>>>> While the MACRO are simple wrapper invoking appropriate API. can
>>>>> this be worked out in this fashion?
>>>>>
>>>>> <snipped>
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-11 15:55 ` Mattias Rönnblom
@ 2024-09-11 17:04 ` Honnappa Nagarahalli
2024-09-12 1:33 ` Varghese, Vipin
2024-09-12 2:28 ` Varghese, Vipin
0 siblings, 2 replies; 56+ messages in thread
From: Honnappa Nagarahalli @ 2024-09-11 17:04 UTC (permalink / raw)
To: Mattias Rönnblom; +Cc: Varghese, Vipin, Yigit, Ferruh, dev, nd
> On Sep 11, 2024, at 10:55 AM, Mattias Rönnblom <hofors@lysator.liu.se> wrote:
>
> On 2024-09-11 05:26, Varghese, Vipin wrote:
>> [AMD Official Use Only - AMD Internal Distribution Only]
>> <snipped>
>>>
>>> On 2024-09-09 16:22, Varghese, Vipin wrote:
>>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>>
>>>> <snipped>
>>>>
>>>>>> <snipped>
>>>>>>
>>>>>> Thank you Mattias for the comments and question, please let me try
>>>>>> to explain the same below
>>>>>>
>>>>>>> We shouldn't have a separate CPU/cache hierarchy API instead?
>>>>>>
>>>>>> Based on the intention to bring in CPU lcores which share same L3
>>>>>> (for better cache hits and less noisy neighbor) current API focuses
>>>>>> on using
>>>>>>
>>>>>> Last Level Cache. But if the suggestion is `there are SoC where L2
>>>>>> cache are also shared, and the new API should be provisioned`, I am
>>>>>> also
>>>>>>
>>>>>> comfortable with the thought.
>>>>>>
>>>>>
>>>>> Rather than some AMD special case API hacked into <rte_lcore.h>, I
>>>>> think we are better off with no DPDK API at all for this kind of functionality.
>>>>
>>>> Hi Mattias, as shared in the earlier email thread, this is not a AMD special
>>> case at all. Let me try to explain this one more time. One of techniques used to
>>> increase cores cost effective way to go for tiles of compute complexes.
>>>> This introduces a bunch of cores in sharing same Last Level Cache (namely
>>> L2, L3 or even L4) depending upon cache topology architecture.
>>>>
>>>> The API suggested in RFC is to help end users to selectively use cores under
>>> same Last Level Cache Hierarchy as advertised by OS (irrespective of the BIOS
>>> settings used). This is useful in both bare-metal and container environment.
>>>>
>>>
>>> I'm pretty familiar with AMD CPUs and the use of tiles (including the
>>> challenges these kinds of non-uniformities pose for work scheduling).
>>>
>>> To maximize performance, caring about core<->LLC relationship may well not
>>> be enough, and more HT/core/cache/memory topology information is
>>> required. That's what I meant by special case. A proper API should allow
>>> access to information about which lcores are SMT siblings, cores on the same
>>> L2, and cores on the same L3, to name a few things. Probably you want to fit
>>> NUMA into the same API as well, although that is available already in
>>> <rte_lcore.h>.
>> Thank you Mattias for the information, as shared by in the reply with Anatoly we want expose a new API `rte_get_next_lcore_ex` which intakes a extra argument `u32 flags`.
>> The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2, RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED, RTE_GET_LCORE_BOOST_DISABLED.
>
> Wouldn't using that API be pretty awkward to use?
>
> I mean, what you have is a topology, with nodes of different types and with different properties, and you want to present it to the user.
>
> In a sense, it's similar to XCM and DOM versus SAX. The above is SAX-style, and what I have in mind is something DOM-like.
>
> What use case do you have in mind? What's on top of my list is a scenario where a DPDK app gets a bunch of cores (e.g., -l <cores>) and tries to figure out how best make use of them. It's not going to "skip" (ignore, leave unused) SMT siblings, or skip non-boosted cores, it would just try to be clever in regards to which cores to use for what purpose.
>
>> This is AMD EPYC SoC agnostic and trying to address for all generic cases.
>> Please do let us know if we (Ferruh & myself) can sync up via call?
>
> Sure, I can do that.
>
Can this be opened to the rest of the community? This is a common problem that needs to be solved for multiple architectures. I would be interested in attending.
>>>
>>> One can have a look at how scheduling domains work in the Linux kernel.
>>> They model this kind of thing.
>>>
>>>> As shared in response for cover letter +1 to expand it to more than
>>>> just LLC cores. We have also confirmed the same to
>>>> https://patchwork.dpdk.org/project/dpdk/cover/20240827151014.201-
>>> 1-vip
>>>> in.varghese@amd.com/
>>>>
>>>>>
>>>>> A DPDK CPU/memory hierarchy topology API very much makes sense, but
>>>>> it should be reasonably generic and complete from the start.
>>>>>
>>>>>>>
>>>>>>> Could potentially be built on the 'hwloc' library.
>>>>>>
>>>>>> There are 3 reason on AMD SoC we did not explore this path, reasons
>>>>>> are
>>>>>>
>>>>>> 1. depending n hwloc version and kernel version certain SoC
>>>>>> hierarchies are not available
>>>>>>
>>>>>> 2. CPU NUMA and IO (memory & PCIe) NUMA are independent on AMD
>>>>> Epyc Soc.
>>>>>>
>>>>>> 3. adds the extra dependency layer of library layer to be made
>>>>>> available to work.
>>>>>>
>>>>>>
>>>>>> hence we have tried to use Linux Documented generic layer of `sysfs
>>>>>> CPU cache`.
>>>>>>
>>>>>> I will try to explore more on hwloc and check if other libraries
>>>>>> within DPDK leverages the same.
>>>>>>
>>>>>>>
>>>>>>> I much agree cache/core topology may be of interest of the
>>>>>>> application (or a work scheduler, like a DPDK event device), but
>>>>>>> it's not limited to LLC. It may well be worthwhile to care about
>>>>>>> which cores shares L2 cache, for example. Not sure the
>>>>>>> RTE_LCORE_FOREACH_*
>>>>> approach scales.
>>>>>>
>>>>>> yes, totally understand as some SoC, multiple lcores shares same L2 cache.
>>>>>>
>>>>>>
>>>>>> Can we rework the API to be rte_get_cache_<function> where user
>>>>>> argument is desired lcore index.
>>>>>>
>>>>>> 1. index-1: SMT threads
>>>>>>
>>>>>> 2. index-2: threads sharing same L2 cache
>>>>>>
>>>>>> 3. index-3: threads sharing same L3 cache
>>>>>>
>>>>>> 4. index-MAX: identify the threads sharing last level cache.
>>>>>>
>>>>>>>
>>>>>>>> < Function: Purpose >
>>>>>>>> ---------------------
>>>>>>>> - rte_get_llc_first_lcores: Retrieves all the first lcores in
>>>>>>>> the shared LLC.
>>>>>>>> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
>>>>>>>> - rte_get_llc_n_lcore: Retrieves the first n or skips the first
>>>>>>>> n lcores in the shared LLC.
>>>>>>>>
>>>>>>>> < MACRO: Purpose >
>>>>>>>> ------------------
>>>>>>>> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from
>>>>>>>> each LLC.
>>>>>>>> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first
>>>>>>>> worker lcore from each LLC.
>>>>>>>> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on
>>>>> hint
>>>>>>>> (lcore id).
>>>>>>>> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from
>>> LLC
>>>>>>>> while skipping first worker.
>>>>>>>> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n`
>>> lcores
>>>>>>>> from each LLC.
>>>>>>>> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then
>>>>>>>> iterates through reaming lcores in each LLC.
>>>>>>>>
>>>>>> While the MACRO are simple wrapper invoking appropriate API. can
>>>>>> this be worked out in this fashion?
>>>>>>
>>>>>> <snipped>
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-11 17:04 ` Honnappa Nagarahalli
@ 2024-09-12 1:33 ` Varghese, Vipin
2024-09-12 6:38 ` Mattias Rönnblom
2024-09-12 2:28 ` Varghese, Vipin
1 sibling, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-12 1:33 UTC (permalink / raw)
To: Honnappa Nagarahalli, Mattias Rönnblom; +Cc: Yigit, Ferruh, dev, nd
[Public]
Snipped
> >>>>
> >>>> <snipped>
> >>>>
> >>>>>> <snipped>
> >>>>>>
> >>>>>> Thank you Mattias for the comments and question, please let me
> >>>>>> try to explain the same below
> >>>>>>
> >>>>>>> We shouldn't have a separate CPU/cache hierarchy API instead?
> >>>>>>
> >>>>>> Based on the intention to bring in CPU lcores which share same L3
> >>>>>> (for better cache hits and less noisy neighbor) current API
> >>>>>> focuses on using
> >>>>>>
> >>>>>> Last Level Cache. But if the suggestion is `there are SoC where
> >>>>>> L2 cache are also shared, and the new API should be provisioned`,
> >>>>>> I am also
> >>>>>>
> >>>>>> comfortable with the thought.
> >>>>>>
> >>>>>
> >>>>> Rather than some AMD special case API hacked into <rte_lcore.h>, I
> >>>>> think we are better off with no DPDK API at all for this kind of
> functionality.
> >>>>
> >>>> Hi Mattias, as shared in the earlier email thread, this is not a
> >>>> AMD special
> >>> case at all. Let me try to explain this one more time. One of
> >>> techniques used to increase cores cost effective way to go for tiles of
> compute complexes.
> >>>> This introduces a bunch of cores in sharing same Last Level Cache
> >>>> (namely
> >>> L2, L3 or even L4) depending upon cache topology architecture.
> >>>>
> >>>> The API suggested in RFC is to help end users to selectively use
> >>>> cores under
> >>> same Last Level Cache Hierarchy as advertised by OS (irrespective of
> >>> the BIOS settings used). This is useful in both bare-metal and container
> environment.
> >>>>
> >>>
> >>> I'm pretty familiar with AMD CPUs and the use of tiles (including
> >>> the challenges these kinds of non-uniformities pose for work scheduling).
> >>>
> >>> To maximize performance, caring about core<->LLC relationship may
> >>> well not be enough, and more HT/core/cache/memory topology
> >>> information is required. That's what I meant by special case. A
> >>> proper API should allow access to information about which lcores are
> >>> SMT siblings, cores on the same L2, and cores on the same L3, to
> >>> name a few things. Probably you want to fit NUMA into the same API
> >>> as well, although that is available already in <rte_lcore.h>.
> >> Thank you Mattias for the information, as shared by in the reply with
> Anatoly we want expose a new API `rte_get_next_lcore_ex` which intakes a
> extra argument `u32 flags`.
> >> The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2,
> RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
> RTE_GET_LCORE_BOOST_DISABLED.
> >
> > Wouldn't using that API be pretty awkward to use?
The current API available in DPDK is `rte_get_next_lcore`, which is used within DPDK examples and in customer solutions.
Based on the comments from others we responded with the idea of changing the new API from `rte_get_next_lcore_llc` to `rte_get_next_lcore_extnd`.
Can you please help us understand what is `awkward` about it?
> >
> > I mean, what you have is a topology, with nodes of different types and with
> different properties, and you want to present it to the user.
Let me be clear: what we want via DPDK is to help customers use a unified API which works across multiple platforms.
Example - let a vendor have 2 products, namely A and B. CPU-A has all cores within the same sub-NUMA domain and CPU-B has cores split into 2 sub-NUMA domains based on a split LLC.
When `rte_get_next_lcore_extnd` is invoked for `LLC` on
1. CPU-A: it returns all cores, as there is no split
2. CPU-B: it returns cores from the specific sub-NUMA domain which is partitioned by L3
> >
> > In a sense, it's similar to XCM and DOM versus SAX. The above is SAX-style,
> and what I have in mind is something DOM-like.
> >
> > What use case do you have in mind? What's on top of my list is a scenario
> where a DPDK app gets a bunch of cores (e.g., -l <cores>) and tries to figure
> out how best make use of them.
Exactly.
> It's not going to "skip" (ignore, leave unused)
> SMT siblings, or skip non-boosted cores, it would just try to be clever in
> regards to which cores to use for what purpose.
Let me try to share my idea on SMT siblings. When `rte_get_next_lcore_extnd` is invoked with the `L1 | SMT` flag and an `lcore`, the API first identifies whether the given lcore is part of the enabled core list.
If yes, it programmatically identifies the sibling thread, either using `sysfs` or the `hwloc` library (I shared the version concern on distros; will recheck again), and returns it.
If there is no sibling thread available under DPDK it will fetch the next lcore (probably lcore + 1).
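For reference, the sysfs-based SMT-sibling lookup described above could be sketched as below; it relies only on the documented Linux topology file thread_siblings_list, and the simplified parsing (only the "a-b" or "a,b" forms) is an assumption made for brevity:
#include <stdio.h>

/* Sketch only: return the first SMT sibling of os_cpu other than itself,
 * or -1 if none is listed. Reads the standard Linux sysfs topology file
 * thread_siblings_list, which holds a comma/range separated CPU list. */
static int
get_smt_sibling(unsigned int os_cpu)
{
	char path[128];
	unsigned int first, last;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%u/topology/thread_siblings_list",
		 os_cpu);
	f = fopen(path, "r");
	if (f == NULL)
		return -1;
	/* Handles only the simple "a-b" or "a,b" forms for brevity. */
	if (fscanf(f, "%u%*[-,]%u", &first, &last) != 2) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return (first == os_cpu) ? (int)last : (int)first;
}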
> >
> >> This is AMD EPYC SoC agnostic and trying to address for all generic cases.
> >> Please do let us know if we (Ferruh & myself) can sync up via call?
> >
> > Sure, I can do that.
Let me sync with Ferruh and get a time slot for internal sync.
> >
> Can this be opened to the rest of the community? This is a common problem
> that needs to be solved for multiple architectures. I would be interested in
> attending.
Thank you Mattias. At the DPDK Bangkok summit 2024 we did bring this up. As per the suggestion from Thomas and Jerrin we tried to bring in the RFC for discussion.
For DPDK Montreal 2024, Keesang and Ferruh (most likely) are travelling to the summit and presenting this as a talk to get things moving.
>
> >>>
<snipped>
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-12 1:33 ` Varghese, Vipin
@ 2024-09-12 6:38 ` Mattias Rönnblom
2024-09-12 7:02 ` Mattias Rönnblom
2024-09-12 11:17 ` Varghese, Vipin
0 siblings, 2 replies; 56+ messages in thread
From: Mattias Rönnblom @ 2024-09-12 6:38 UTC (permalink / raw)
To: Varghese, Vipin, Honnappa Nagarahalli; +Cc: Yigit, Ferruh, dev, nd
On 2024-09-12 03:33, Varghese, Vipin wrote:
> [Public]
>
> Snipped
>
>>>>>>
>>>>>> <snipped>
>>>>>>
>>>>>>>> <snipped>
>>>>>>>>
>>>>>>>> Thank you Mattias for the comments and question, please let me
>>>>>>>> try to explain the same below
>>>>>>>>
>>>>>>>>> We shouldn't have a separate CPU/cache hierarchy API instead?
>>>>>>>>
>>>>>>>> Based on the intention to bring in CPU lcores which share same L3
>>>>>>>> (for better cache hits and less noisy neighbor) current API
>>>>>>>> focuses on using
>>>>>>>>
>>>>>>>> Last Level Cache. But if the suggestion is `there are SoC where
>>>>>>>> L2 cache are also shared, and the new API should be provisioned`,
>>>>>>>> I am also
>>>>>>>>
>>>>>>>> comfortable with the thought.
>>>>>>>>
>>>>>>>
>>>>>>> Rather than some AMD special case API hacked into <rte_lcore.h>, I
>>>>>>> think we are better off with no DPDK API at all for this kind of
>> functionality.
>>>>>>
>>>>>> Hi Mattias, as shared in the earlier email thread, this is not a
>>>>>> AMD special
>>>>> case at all. Let me try to explain this one more time. One of
>>>>> techniques used to increase cores cost effective way to go for tiles of
>> compute complexes.
>>>>>> This introduces a bunch of cores in sharing same Last Level Cache
>>>>>> (namely
>>>>> L2, L3 or even L4) depending upon cache topology architecture.
>>>>>>
>>>>>> The API suggested in RFC is to help end users to selectively use
>>>>>> cores under
>>>>> same Last Level Cache Hierarchy as advertised by OS (irrespective of
>>>>> the BIOS settings used). This is useful in both bare-metal and container
>> environment.
>>>>>>
>>>>>
>>>>> I'm pretty familiar with AMD CPUs and the use of tiles (including
>>>>> the challenges these kinds of non-uniformities pose for work scheduling).
>>>>>
>>>>> To maximize performance, caring about core<->LLC relationship may
>>>>> well not be enough, and more HT/core/cache/memory topology
>>>>> information is required. That's what I meant by special case. A
>>>>> proper API should allow access to information about which lcores are
>>>>> SMT siblings, cores on the same L2, and cores on the same L3, to
>>>>> name a few things. Probably you want to fit NUMA into the same API
>>>>> as well, although that is available already in <rte_lcore.h>.
>>>> Thank you Mattias for the information, as shared by in the reply with
>> Anatoly we want expose a new API `rte_get_next_lcore_ex` which intakes a
>> extra argument `u32 flags`.
>>>> The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2,
>> RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
>> RTE_GET_LCORE_BOOST_DISABLED.
>>>
>>> Wouldn't using that API be pretty awkward to use?
> Current API available under DPDK is ` rte_get_next_lcore`, which is used within DPDK example and in customer solution.
> Based on the comments from others we responded to the idea of changing the new Api from ` rte_get_next_lcore_llc` to ` rte_get_next_lcore_exntd`.
>
> Can you please help us understand what is `awkward`.
>
The awkwardness starts when you are trying to provide hwloc-type
information over an API that was designed for iterating over lcores.
It seems to me that you should either have:
A) An API similar to that of hwloc (or any DOM-like API), which would
give a low-level description of the hardware in implementation terms.
The topology would consist of nodes, with attributes, etc, where nodes
are things like cores or instances of caches of some level, and
attributes are things like actual and nominal (and maybe max) CPU
frequency, cache size, or memory size.
or
B) An API to be directly useful for a work scheduler, in which case you
should abstract away things like "boost" (and fold them into some
abstract capacity notion, together with core "size" [in
big-little/heterogeneous systems]), and have an abstract notion of which
core is "close" to some other core. This would be something like Linux's
scheduling domains.
If you want B you probably need A as a part of its implementation, so
you may just as well start with A, I suppose.
What you could do to explore the API design is to add support for, for
example, boost core awareness or SMT affinity in the SW scheduler. You
could also do an "lstopo" equivalent, since that's needed for debugging
and exploration, if nothing else.
Questions that will have to be answered in a work scheduling scenario
are "are these two lcores SMT siblings?", "are these two cores on the
same LLC?", and "give me all lcores on a particular L2 cache".
>>>
>>> I mean, what you have is a topology, with nodes of different types and with
>> different properties, and you want to present it to the user.
> Let me be clear, what we want via DPDK to help customer to use an Unified API which works across multiple platforms.
> Example - let a vendor have 2 products namely A and B. CPU-A has all cores within same SUB-NUMA domain and CPU-B has cores split to 2 sub-NUMA domain based on split LLC.
> When `rte_get_next_lcore_extnd` is invoked for `LLC` on
> 1. CPU-A: it returns all cores as there is no split
> 2. CPU-B: it returns cores from specific sub-NUMA which is partitioned by L3
>
I think the function name rte_get_next_lcore_extnd() alone makes clear
this is an awkward API. :)
My gut feeling is to make it more explicit and forget about
<rte_lcore.h>. <rte_hwtopo.h>? Could and should still be EAL.
>>>
>>> In a sense, it's similar to XCM and DOM versus SAX. The above is SAX-style,
>> and what I have in mind is something DOM-like.
>>>
>>> What use case do you have in mind? What's on top of my list is a scenario
>> where a DPDK app gets a bunch of cores (e.g., -l <cores>) and tries to figure
>> out how best make use of them.
> Exactly.
>
> It's not going to "skip" (ignore, leave unused)
>> SMT siblings, or skip non-boosted cores, it would just try to be clever in
>> regards to which cores to use for what purpose.
> Let me try to share my idea on SMT sibling. When user invoked for rte_get_next_lcore_extnd` is invoked for `L1 | SMT` flag with `lcore`; the API identifies first whether given lcore is part of enabled core list.
> If yes, it programmatically either using `sysfs` or `hwloc library (shared the version concern on distros. Will recheck again)` identify the sibling thread and return.
> If there is no sibling thread available under DPDK it will fetch next lcore (probably lcore +1 ).
>
Distributions having old hwloc versions isn't an argument for a new DPDK
library or new API. If only that was the issue, then it would be better
to help the hwloc and/or distributions, rather than the DPDK project.
>>>
>>>> This is AMD EPYC SoC agnostic and trying to address for all generic cases.
>>>> Please do let us know if we (Ferruh & myself) can sync up via call?
>>>
>>> Sure, I can do that.
>
> Let me sync with Ferruh and get a time slot for internal sync.
>
>>>
>> Can this be opened to the rest of the community? This is a common problem
>> that needs to be solved for multiple architectures. I would be interested in
>> attending.
> Thank you Mattias, in DPDK Bangkok summit 2024 we did bring this up. As per the suggestion from Thomas and Jerrin we tried to bring the RFC for discussion.
> For DPDK Montreal 2024, Keesang and Ferruh (most likely) is travelling for the summit and presenting this as the talk to get things moving.
>
>>
>>>>>
> <snipped>
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-12 6:38 ` Mattias Rönnblom
@ 2024-09-12 7:02 ` Mattias Rönnblom
2024-09-12 11:23 ` Varghese, Vipin
2024-09-12 11:17 ` Varghese, Vipin
1 sibling, 1 reply; 56+ messages in thread
From: Mattias Rönnblom @ 2024-09-12 7:02 UTC (permalink / raw)
To: Varghese, Vipin, Honnappa Nagarahalli; +Cc: Yigit, Ferruh, dev, nd
On 2024-09-12 08:38, Mattias Rönnblom wrote:
> On 2024-09-12 03:33, Varghese, Vipin wrote:
>> [Public]
>>
>> Snipped
>>
>>>>>>>
>>>>>>> <snipped>
>>>>>>>
>>>>>>>>> <snipped>
>>>>>>>>>
>>>>>>>>> Thank you Mattias for the comments and question, please let me
>>>>>>>>> try to explain the same below
>>>>>>>>>
>>>>>>>>>> We shouldn't have a separate CPU/cache hierarchy API instead?
>>>>>>>>>
>>>>>>>>> Based on the intention to bring in CPU lcores which share same L3
>>>>>>>>> (for better cache hits and less noisy neighbor) current API
>>>>>>>>> focuses on using
>>>>>>>>>
>>>>>>>>> Last Level Cache. But if the suggestion is `there are SoC where
>>>>>>>>> L2 cache are also shared, and the new API should be provisioned`,
>>>>>>>>> I am also
>>>>>>>>>
>>>>>>>>> comfortable with the thought.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Rather than some AMD special case API hacked into <rte_lcore.h>, I
>>>>>>>> think we are better off with no DPDK API at all for this kind of
>>> functionality.
>>>>>>>
>>>>>>> Hi Mattias, as shared in the earlier email thread, this is not a
>>>>>>> AMD special
>>>>>> case at all. Let me try to explain this one more time. One of
>>>>>> techniques used to increase cores cost effective way to go for
>>>>>> tiles of
>>> compute complexes.
>>>>>>> This introduces a bunch of cores in sharing same Last Level Cache
>>>>>>> (namely
>>>>>> L2, L3 or even L4) depending upon cache topology architecture.
>>>>>>>
>>>>>>> The API suggested in RFC is to help end users to selectively use
>>>>>>> cores under
>>>>>> same Last Level Cache Hierarchy as advertised by OS (irrespective of
>>>>>> the BIOS settings used). This is useful in both bare-metal and
>>>>>> container
>>> environment.
>>>>>>>
>>>>>>
>>>>>> I'm pretty familiar with AMD CPUs and the use of tiles (including
>>>>>> the challenges these kinds of non-uniformities pose for work
>>>>>> scheduling).
>>>>>>
>>>>>> To maximize performance, caring about core<->LLC relationship may
>>>>>> well not be enough, and more HT/core/cache/memory topology
>>>>>> information is required. That's what I meant by special case. A
>>>>>> proper API should allow access to information about which lcores are
>>>>>> SMT siblings, cores on the same L2, and cores on the same L3, to
>>>>>> name a few things. Probably you want to fit NUMA into the same API
>>>>>> as well, although that is available already in <rte_lcore.h>.
>>>>> Thank you Mattias for the information, as shared by in the reply with
>>> Anatoly we want expose a new API `rte_get_next_lcore_ex` which intakes a
>>> extra argument `u32 flags`.
>>>>> The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2,
>>> RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
>>> RTE_GET_LCORE_BOOST_DISABLED.
>>>>
>>>> Wouldn't using that API be pretty awkward to use?
>> Current API available under DPDK is ` rte_get_next_lcore`, which is
>> used within DPDK example and in customer solution.
>> Based on the comments from others we responded to the idea of changing
>> the new Api from ` rte_get_next_lcore_llc` to `
>> rte_get_next_lcore_exntd`.
>>
>> Can you please help us understand what is `awkward`.
>>
>
> The awkwardness starts when you are trying to fit provide hwloc type
> information over an API that was designed for iterating over lcores.
>
> It seems to me that you should either have:
> A) An API in similar to that of hwloc (or any DOM-like API), which would
> give a low-level description of the hardware in implementation terms.
> The topology would consist of nodes, with attributes, etc, where nodes
> are things like cores or instances of caches of some level and
> attributes are things like CPU actual and nominal, and maybe max
> frequency, cache size, or memory size.
To be clear, it's something like this I think of when I say
"DOM-style" API.
#ifndef RTE_HWTOPO_H
#define RTE_HWTOPO_H

struct rte_hwtopo_node;

enum rte_hwtopo_node_type {
	RTE_HWTOPO_NODE_TYPE_CPU_CORE,
	RTE_HWTOPO_NODE_TYPE_CACHE,
	RTE_HWTOPO_NODE_TYPE_NUMA
};

int
rte_hwtopo_init(void);

struct rte_hwtopo_node *
rte_hwtopo_get_core_by_lcore(unsigned int lcore);

struct rte_hwtopo_node *
rte_hwtopo_get_core_by_id(unsigned int os_cpu_id);

struct rte_hwtopo_node *
rte_hwtopo_parent(struct rte_hwtopo_node *node);

struct rte_hwtopo_node *
rte_hwtopo_first_child(struct rte_hwtopo_node *node);

struct rte_hwtopo_node *
rte_hwtopo_next_child(struct rte_hwtopo_node *node,
		      struct rte_hwtopo_node *child);

struct rte_hwtopo_node *
rte_hwtopo_first_sibling(struct rte_hwtopo_node *node);

struct rte_hwtopo_node *
rte_hwtopo_next_sibling(struct rte_hwtopo_node *node,
			struct rte_hwtopo_node *child);

enum rte_hwtopo_node_type
rte_hwtopo_get_type(struct rte_hwtopo_node *node);

#define RTE_HWTOPO_NODE_ATTR_CORE_FREQUENCY_NOMINAL 0
#define RTE_HWTOPO_NODE_ATTR_CACHE_LEVEL 1
#define RTE_HWTOPO_NODE_ATTR_CACHE_SIZE 2

int
rte_hwtopo_get_attr_int64(struct rte_hwtopo_node *node, unsigned int attr_name,
			  int64_t *attr_value);

int
rte_hwtopo_get_attr_str(struct rte_hwtopo_node *node, unsigned int attr_name,
			char *attr_value, size_t capacity);

#endif
Surely, this too would be awkward (or should I say cumbersome) to use in
certain scenarios. You could have syntactic sugar/special case helpers
which address common use cases. You would also build abstractions on top
of this (like the B case below).
One could have node-type-specific functions instead of generic getters
and setters. Anyway, this is not a counter-proposal, but rather just to
make clear what I had in mind.
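To show how such a tree would be consumed, here is a small sketch built on the rte_hwtopo_* calls above; it assumes (as an illustration only) that cache nodes are ancestors of core nodes and that the cache ancestor nearest the root is the LLC:
#include <stdbool.h>
#include <stddef.h>

/* Sketch: do two lcores share the last-level cache? Illustrative use of
 * the hypothetical rte_hwtopo_* API sketched above. */
static struct rte_hwtopo_node *
llc_of(unsigned int lcore)
{
	struct rte_hwtopo_node *node = rte_hwtopo_get_core_by_lcore(lcore);
	struct rte_hwtopo_node *llc = NULL;

	/* The cache ancestor nearest the root is taken to be the LLC. */
	for (node = rte_hwtopo_parent(node); node != NULL;
	     node = rte_hwtopo_parent(node))
		if (rte_hwtopo_get_type(node) == RTE_HWTOPO_NODE_TYPE_CACHE)
			llc = node;

	return llc;
}

static bool
lcores_share_llc(unsigned int lcore_a, unsigned int lcore_b)
{
	struct rte_hwtopo_node *a = llc_of(lcore_a);

	return a != NULL && a == llc_of(lcore_b);
}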
> or
> B) An API to be directly useful for a work scheduler, in which case you
> should abstract away things like "boost" (and fold them into some
> abstract capacity notion, together with core "size" [in
> big-little/heterogeneous systems]), and have an abstract notion of what
> core is "close" to some other core. This would something like Linux'
> scheduling domains.
>
> If you want B you probably need A as a part of its implementation, so
> you may just as well start with A, I suppose.
>
> What you could do to explore the API design is to add support for, for
> example, boost core awareness or SMT affinity in the SW scheduler. You
> could also do an "lstopo" equivalent, since that's needed for debugging
> and exploration, if nothing else.
>
> One question that will have to be answered in a work scheduling scenario
> is "are these two lcores SMT siblings," or "are these two cores on the
> same LLC", or "give me all lcores on a particular L2 cache".
>
>>>>
>>>> I mean, what you have is a topology, with nodes of different types
>>>> and with
>>> different properties, and you want to present it to the user.
>> Let me be clear, what we want via DPDK to help customer to use an
>> Unified API which works across multiple platforms.
>> Example - let a vendor have 2 products namely A and B. CPU-A has all
>> cores within same SUB-NUMA domain and CPU-B has cores split to 2
>> sub-NUMA domain based on split LLC.
>> When `rte_get_next_lcore_extnd` is invoked for `LLC` on
>> 1. CPU-A: it returns all cores as there is no split
>> 2. CPU-B: it returns cores from specific sub-NUMA which is partitioned
>> by L3
>>
>
> I think the function name rte_get_next_lcore_extnd() alone makes clear
> this is an awkward API. :)
>
> My gut feeling is to make it more explicit and forget about
> <rte_lcore.h>. <rte_hwtopo.h>? Could and should still be EAL.
>
>>>>
>>>> In a sense, it's similar to XCM and DOM versus SAX. The above is
>>>> SAX-style,
>>> and what I have in mind is something DOM-like.
>>>>
>>>> What use case do you have in mind? What's on top of my list is a
>>>> scenario
>>> where a DPDK app gets a bunch of cores (e.g., -l <cores>) and tries
>>> to figure
>>> out how best make use of them.
>> Exactly.
>>
>> It's not going to "skip" (ignore, leave unused)
>>> SMT siblings, or skip non-boosted cores, it would just try to be
>>> clever in
>>> regards to which cores to use for what purpose.
>> Let me try to share my idea on SMT sibling. When user invoked for
>> rte_get_next_lcore_extnd` is invoked for `L1 | SMT` flag with `lcore`;
>> the API identifies first whether given lcore is part of enabled core
>> list.
>> If yes, it programmatically either using `sysfs` or `hwloc library
>> (shared the version concern on distros. Will recheck again)` identify
>> the sibling thread and return.
>> If there is no sibling thread available under DPDK it will fetch next
>> lcore (probably lcore +1 ).
>>
>
> Distributions having old hwloc versions isn't an argument for a new DPDK
> library or new API. If only that was the issue, then it would be better
> to help the hwloc and/or distributions, rather than the DPDK project.
>
>>>>
>>>>> This is AMD EPYC SoC agnostic and trying to address for all generic
>>>>> cases.
>>>>> Please do let us know if we (Ferruh & myself) can sync up via call?
>>>>
>>>> Sure, I can do that.
>>
>> Let me sync with Ferruh and get a time slot for internal sync.
>>
>>>>
>>> Can this be opened to the rest of the community? This is a common
>>> problem
>>> that needs to be solved for multiple architectures. I would be
>>> interested in
>>> attending.
>> Thank you Mattias, in DPDK Bangkok summit 2024 we did bring this up.
>> As per the suggestion from Thomas and Jerrin we tried to bring the RFC
>> for discussion.
>> For DPDK Montreal 2024, Keesang and Ferruh (most likely) is travelling
>> for the summit and presenting this as the talk to get things moving.
>>
>>>
>>>>>>
>> <snipped>
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-12 7:02 ` Mattias Rönnblom
@ 2024-09-12 11:23 ` Varghese, Vipin
2024-09-12 12:12 ` Mattias Rönnblom
0 siblings, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-12 11:23 UTC (permalink / raw)
To: Mattias Rönnblom, Honnappa Nagarahalli; +Cc: Yigit, Ferruh, dev, nd
[Public]
Snipped
>
>
> To to be clear; it's something like this I think of when I say "DOM-style" API.
>
> #ifndef RTE_HWTOPO_H
> #define RTE_HWTOPO_H
>
> struct rte_hwtopo_node;
>
> enum rte_hwtopo_node_type {
> RTE_HWTOPO_NODE_TYPE_CPU_CORE,
> RTE_HWTOPO_NODE_TYPE_CACHE,
> RTE_HWTOPO_NODE_TYPE_NUMA
> };
>
> int
> rte_hwtopo_init(void);
>
> struct rte_hwtopo_node *
> rte_hwtopo_get_core_by_lcore(unsigned int lcore);
>
> struct rte_hwtopo_node *
> rte_hwtopo_get_core_by_id(unsigned int os_cpu_id);
>
> struct rte_hwtopo_node *
> rte_hwtopo_parent(struct rte_hwtopo_node *node);
>
> struct rte_hwtopo_node *
> rte_hwtopo_first_child(struct rte_hwtopo_node *node);
>
> struct rte_hwtopo_node *
> rte_hwtopo_next_child(struct rte_hwtopo_node *node,
> struct rte_hwtopo_node *child);
>
> struct rte_hwtopo_node *
> rte_hwtopo_first_sibling(struct rte_hwtopo_node *node);
>
> struct rte_hwtopo_node *
> rte_hwtopo_next_sibling(struct rte_hwtopo_node *node,
> struct rte_hwtopo_node *child);
>
> enum rte_hwtopo_node_type
> rte_hwtopo_get_type(struct rte_hwtopo_node *node);
>
> #define RTE_HWTOPO_NODE_ATTR_CORE_FREQUENCY_NOMINAL 0 #define
> RTE_HWTOPO_NODE_ATTR_CACHE_LEVEL 1 #define
> RTE_HWTOPO_NODE_ATTR_CACHE_SIZE 2
>
> int
> rte_hwtopo_get_attr_int64(struct rte_hwtopo_node *node, unsigned int
> attr_name,
> int64_t *attr_value);
>
> int
> rte_hwtopo_get_attr_str(struct rte_hwtopo_node *node, unsigned int
> attr_name,
> char *attr_value, size_t capacity);
>
> #endif
>
> Surely, this too would be awkward (or should I say cumbersome) to use in certain scenarios.
This appears to be more like hwloc API calls. As shared in my earlier email, my intention with the API suggestion is not to introduce a new library.
I have certain reservations, and with my current understanding I am not able to map certain DPDK core mappings. Let us discuss this in the technical call.
Snipped
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-12 11:23 ` Varghese, Vipin
@ 2024-09-12 12:12 ` Mattias Rönnblom
2024-09-12 15:50 ` Stephen Hemminger
0 siblings, 1 reply; 56+ messages in thread
From: Mattias Rönnblom @ 2024-09-12 12:12 UTC (permalink / raw)
To: Varghese, Vipin, Honnappa Nagarahalli; +Cc: Yigit, Ferruh, dev, nd
On 2024-09-12 13:23, Varghese, Vipin wrote:
> [Public]
>
> Snipped
>>
>>
>> To to be clear; it's something like this I think of when I say "DOM-style" API.
>>
>> #ifndef RTE_HWTOPO_H
>> #define RTE_HWTOPO_H
>>
>> struct rte_hwtopo_node;
>>
>> enum rte_hwtopo_node_type {
>> RTE_HWTOPO_NODE_TYPE_CPU_CORE,
>> RTE_HWTOPO_NODE_TYPE_CACHE,
>> RTE_HWTOPO_NODE_TYPE_NUMA
>> };
>>
>> int
>> rte_hwtopo_init(void);
>>
>> struct rte_hwtopo_node *
>> rte_hwtopo_get_core_by_lcore(unsigned int lcore);
>>
>> struct rte_hwtopo_node *
>> rte_hwtopo_get_core_by_id(unsigned int os_cpu_id);
>>
>> struct rte_hwtopo_node *
>> rte_hwtopo_parent(struct rte_hwtopo_node *node);
>>
>> struct rte_hwtopo_node *
>> rte_hwtopo_first_child(struct rte_hwtopo_node *node);
>>
>> struct rte_hwtopo_node *
>> rte_hwtopo_next_child(struct rte_hwtopo_node *node,
>> struct rte_hwtopo_node *child);
>>
>> struct rte_hwtopo_node *
>> rte_hwtopo_first_sibling(struct rte_hwtopo_node *node);
>>
>> struct rte_hwtopo_node *
>> rte_hwtopo_next_sibling(struct rte_hwtopo_node *node,
>> struct rte_hwtopo_node *child);
>>
>> enum rte_hwtopo_node_type
>> rte_hwtopo_get_type(struct rte_hwtopo_node *node);
>>
>> #define RTE_HWTOPO_NODE_ATTR_CORE_FREQUENCY_NOMINAL 0 #define
>> RTE_HWTOPO_NODE_ATTR_CACHE_LEVEL 1 #define
>> RTE_HWTOPO_NODE_ATTR_CACHE_SIZE 2
>>
>> int
>> rte_hwtopo_get_attr_int64(struct rte_hwtopo_node *node, unsigned int
>> attr_name,
>> int64_t *attr_value);
>>
>> int
>> rte_hwtopo_get_attr_str(struct rte_hwtopo_node *node, unsigned int
>> attr_name,
>> char *attr_value, size_t capacity);
>>
>> #endif
>>
>> Surely, this too would be awkward (or should I say cumbersome) to use in certain scenarios.
> This appears to be more like hwloc api calls, as shared in my earlier
> email my intention with the API suggestion is not introduce new library.
> I have certain reservations and with my current understanding I am not
> able to map certain DPDK core mapping. Let discuss this in technical call.
> Snipped
It still would need to be a part of EAL (so not a new library), since
EAL surely would depend on it (sooner rather than later).
If this functionality should be a new library, or a new API in an
existing library, it doesn't really matter if your original intentions
were something else, does it?
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-12 12:12 ` Mattias Rönnblom
@ 2024-09-12 15:50 ` Stephen Hemminger
0 siblings, 0 replies; 56+ messages in thread
From: Stephen Hemminger @ 2024-09-12 15:50 UTC (permalink / raw)
To: Mattias Rönnblom
Cc: Varghese, Vipin, Honnappa Nagarahalli, Yigit, Ferruh, dev, nd
On Thu, 12 Sep 2024 14:12:55 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> On 2024-09-12 13:23, Varghese, Vipin wrote:
> > [Public]
> >
> > Snipped
> >>
> >>
> >> To to be clear; it's something like this I think of when I say "DOM-style" API.
> >>
> >> #ifndef RTE_HWTOPO_H
> >> #define RTE_HWTOPO_H
> >>
> >> struct rte_hwtopo_node;
> >>
> >> enum rte_hwtopo_node_type {
> >> RTE_HWTOPO_NODE_TYPE_CPU_CORE,
> >> RTE_HWTOPO_NODE_TYPE_CACHE,
> >> RTE_HWTOPO_NODE_TYPE_NUMA
> >> };
> >>
> >> int
> >> rte_hwtopo_init(void);
> >>
> >> struct rte_hwtopo_node *
> >> rte_hwtopo_get_core_by_lcore(unsigned int lcore);
> >>
> >> struct rte_hwtopo_node *
> >> rte_hwtopo_get_core_by_id(unsigned int os_cpu_id);
> >>
> >> struct rte_hwtopo_node *
> >> rte_hwtopo_parent(struct rte_hwtopo_node *node);
> >>
> >> struct rte_hwtopo_node *
> >> rte_hwtopo_first_child(struct rte_hwtopo_node *node);
> >>
> >> struct rte_hwtopo_node *
> >> rte_hwtopo_next_child(struct rte_hwtopo_node *node,
> >> struct rte_hwtopo_node *child);
> >>
> >> struct rte_hwtopo_node *
> >> rte_hwtopo_first_sibling(struct rte_hwtopo_node *node);
> >>
> >> struct rte_hwtopo_node *
> >> rte_hwtopo_next_sibling(struct rte_hwtopo_node *node,
> >> struct rte_hwtopo_node *child);
> >>
> >> enum rte_hwtopo_node_type
> >> rte_hwtopo_get_type(struct rte_hwtopo_node *node);
> >>
> >> #define RTE_HWTOPO_NODE_ATTR_CORE_FREQUENCY_NOMINAL 0 #define
> >> RTE_HWTOPO_NODE_ATTR_CACHE_LEVEL 1 #define
> >> RTE_HWTOPO_NODE_ATTR_CACHE_SIZE 2
> >>
> >> int
> >> rte_hwtopo_get_attr_int64(struct rte_hwtopo_node *node, unsigned int
> >> attr_name,
> >> int64_t *attr_value);
> >>
> >> int
> >> rte_hwtopo_get_attr_str(struct rte_hwtopo_node *node, unsigned int
> >> attr_name,
> >> char *attr_value, size_t capacity);
> >>
> >> #endif
> >>
> >> Surely, this too would be awkward (or should I say cumbersome) to use in certain scenarios.
> > This appears to be more like hwloc api calls, as shared in my earlier
> > email my intention with the API suggestion is not introduce new library.
> > I have certain reservations and with my current understanding I am not
> > able to map certain DPDK core mapping. Let discuss this in technical call.
> > Snipped
>
> It still would need to be a part of EAL (so not a new library), since
> EAL surely would depend on it (sooner rather than later).
>
> If this functionality should be a new library, or a new API in an
> existing library, it doesn't really matter if your original intentions
> where something else, does it.
>
Good discussion.
I wonder if the API would be cleaner if it just provided a tree representation
of the hardware in a data structure, instead of trying to provide FOREACH.
The other concern is that hardware will evolve and there are likely to be more
possibilities. It is impossible to totally future-proof APIs (YAGNI), but it is worth
thinking about it now.
There is also the issue of cores with different clock speeds, which should
be represented as well.
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-12 6:38 ` Mattias Rönnblom
2024-09-12 7:02 ` Mattias Rönnblom
@ 2024-09-12 11:17 ` Varghese, Vipin
2024-09-12 11:59 ` Mattias Rönnblom
1 sibling, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-12 11:17 UTC (permalink / raw)
To: Mattias Rönnblom, Honnappa Nagarahalli; +Cc: Yigit, Ferruh, dev, nd
[AMD Official Use Only - AMD Internal Distribution Only]
<snipped>
> >>>> Thank you Mattias for the information, as shared by in the reply
> >>>> with
> >> Anatoly we want expose a new API `rte_get_next_lcore_ex` which
> >> intakes a extra argument `u32 flags`.
> >>>> The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2,
> >> RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
> >> RTE_GET_LCORE_BOOST_DISABLED.
> >>>
> >>> Wouldn't using that API be pretty awkward to use?
> > Current API available under DPDK is ` rte_get_next_lcore`, which is used
> within DPDK example and in customer solution.
> > Based on the comments from others we responded to the idea of changing
> the new Api from ` rte_get_next_lcore_llc` to ` rte_get_next_lcore_exntd`.
> >
> > Can you please help us understand what is `awkward`.
> >
>
> The awkwardness starts when you are trying to fit provide hwloc type
> information over an API that was designed for iterating over lcores.
I disagree with this point. The current implementation of the lcore library is only focused on iterating through the list of enabled cores, the core-mask, and the lcore-map.
With ever increasing core count, memory, IO and accelerators on the SoC, sub-NUMA partitioning is common in various vendors' SoCs. Enhancing or augmenting the lcore API to extract or provision NUMA and cache topology is not awkward.
If memory, IO and accelerators can have sub-NUMA domains, why is it awkward to have lcores in domains? Hence I do not agree with the awkwardness argument.
>
> It seems to me that you should either have:
> A) An API in similar to that of hwloc (or any DOM-like API), which would give a
> low-level description of the hardware in implementation terms.
> The topology would consist of nodes, with attributes, etc, where nodes are
> things like cores or instances of caches of some level and attributes are things
> like CPU actual and nominal, and maybe max frequency, cache size, or memory
> size.
Here is the catch: `rte_eal_init` internally invokes `get_cpu|lcores` and populates the thread (lcore) to physical CPU mapping. But there is more than just CPU mapping, as we have seen in SoC architectures. The argument shared by many is `DPDK is not the place for such topology discovery`.
As per my current understanding, I have to disagree with the above because it
1. forces the user to use external libraries, for example hwloc
2. forces the user to create internal mappings for lcore, core-mask, and lcore-map with topology awareness code.
My intention is to enable the end user to leverage the API format, or a similar API format (rte_get_next_lcore), to get the best results on any SoC (vendor agnostic).
I fail to grasp why we are asking for CPU topology to be exported via external libraries like hwloc, while NIC, PCIe and accelerator topology is not asked to be exported that way.
Hence let us set up a tech call on Slack or Teams to understand this better.
> or
> B) An API to be directly useful for a work scheduler, in which case you should
> abstract away things like "boost"
Please note, as shared in an earlier reply to Bruce, I made a mistake of calling it boost (AMD SoC terminology). Instead it should be DPDK_TURBO.
There are use cases and DPDK examples where crypto and compression are run on cores where TURBO is enabled. This allows end users to boost when there is more work and disable boost when there is less or no work.
> (and fold them into some abstract capacity notion, together with core "size" [in big-little/heterogeneous systems]), and
> have an abstract notion of what core is "close" to some other core. This would
> something like Linux'
> scheduling domains.
We had a similar discussion with Jerrin on the last day of the Bangkok DPDK summit. This RFC was intended to help capture this relevant point. With my current understanding, on selected SoCs the little cores on an ARM SoC share the L2 cache, while this analogy does not cover all cases. But this would be a good start.
>
> If you want B you probably need A as a part of its implementation, so you may
> just as well start with A, I suppose.
>
> What you could do to explore the API design is to add support for, for
> example, boost core awareness or SMT affinity in the SW scheduler. You could
> also do an "lstopo" equivalent, since that's needed for debugging and
> exploration, if nothing else.
I am not following this analogy; we will discuss it in detail in the tech call.
>
> One question that will have to be answered in a work scheduling scenario is
> "are these two lcores SMT siblings," or "are these two cores on the same LLC",
> or "give me all lcores on a particular L2 cache".
>
Is that not what we have been trying to address, based on Anatoly's request to generalize beyond LLC? Hence we agreed on sharing version 2 of the RFC with `rte_get_next_lcore_extnd` with `flags`.
May I ask where the disconnect is?
> >>>
> >>> I mean, what you have is a topology, with nodes of different types
> >>> and with
> >> different properties, and you want to present it to the user.
> > Let me be clear, what we want via DPDK to help customer to use an Unified
> API which works across multiple platforms.
> > Example - let a vendor have 2 products namely A and B. CPU-A has all cores
> within same SUB-NUMA domain and CPU-B has cores split to 2 sub-NUMA
> domain based on split LLC.
> > When `rte_get_next_lcore_extnd` is invoked for `LLC` on 1. CPU-A: it
> > returns all cores as there is no split 2. CPU-B: it returns cores from
> > specific sub-NUMA which is partitioned by L3
> >
>
> I think the function name rte_get_next_lcore_extnd() alone makes clear this is an awkward API. :)
I humbly disagree with this statement, as explained above.
>
> My gut feeling is to make it more explicit and forget about <rte_lcore.h>.
> <rte_hwtopo.h>? Could and should still be EAL.
For me this is like adding a new level of library and more code, while the easiest way was to add an API similar to the existing `get_next_lcore` style, for easy adoption.
>
> >>>
> >>> In a sense, it's similar to XCM and DOM versus SAX. The above is
> >>> SAX-style,
> >> and what I have in mind is something DOM-like.
> >>>
> >>> What use case do you have in mind? What's on top of my list is a scenario where a DPDK app gets a bunch of cores (e.g., -l <cores>) and tries to figure out how best make use of them.
> > Exactly.
> >
> > It's not going to "skip" (ignore, leave unused)
> >> SMT siblings, or skip non-boosted cores, it would just try to be
> >> clever in regards to which cores to use for what purpose.
> > Let me try to share my idea on SMT sibling. When user invoked for
> rte_get_next_lcore_extnd` is invoked for `L1 | SMT` flag with `lcore`; the API
> identifies first whether given lcore is part of enabled core list.
> > If yes, it programmatically either using `sysfs` or `hwloc library (shared the
> version concern on distros. Will recheck again)` identify the sibling thread and
> return.
> > If there is no sibling thread available under DPDK it will fetch next lcore
> (probably lcore +1 ).
> >
>
> Distributions having old hwloc versions isn't an argument for a new DPDK library or new API. If only that was the issue, then it would be better to help the hwloc and/or distributions, rather than the DPDK project.
I do not agree with the statement `Distributions having old hwloc versions isn't an argument for a new DPDK library or new API`, because that is not my intention. Let me be clear: on Ampere & AMD there are 2 BIOS settings,
1. SLC or L3 as NUMA enabled
2. NUMA for IO|memory
When `NUMA for IO|memory` is set, the hwloc library works as expected. But when `L3 as NUMA` is set, it gives incorrect details. We have been fixing this and pushing it upstream. But as I clearly shared, the number of distros shipping the latest hwloc is almost nil.
Hence, to keep things simple, in the DPDK documentation we pointed to the AMD SoC tuning guide, where we have been recommending not to enable `L3 as NUMA`.
Now the end goal for me is to allow a vendor-agnostic API which is easy to understand and use, and which works irrespective of BIOS settings. I have enabled parsing of the OS `sysfs` in this RFC (a minimal sketch follows below). But if the comment is to use `hwloc`, as shared in the response to Stephen, I am open to trying this again.
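For reference, a self-contained sketch of reading the LLC sharing information straight from `sysfs` without shelling out; this is not the RFC code itself, and it assumes index3 is the last-level cache (a robust version would pick the highest `level` among the `index*` entries).
```
#include <limits.h>
#include <stdio.h>
#include <string.h>

/* Read the raw shared_cpu_list string (e.g. "0-7,128-135") for a CPU's
 * last-level cache. Returns 0 on success, -1 on failure. */
static int llc_shared_cpu_list(unsigned int cpu, char *buf, size_t len)
{
	char path[PATH_MAX];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%u/cache/index3/shared_cpu_list",
		 cpu);
	f = fopen(path, "r");
	if (f == NULL)
		return -1;
	if (fgets(buf, len, f) == NULL) {
		fclose(f);
		return -1;
	}
	fclose(f);
	buf[strcspn(buf, "\n")] = '\0';
	return 0;
}
```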
<snipped>
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-12 11:17 ` Varghese, Vipin
@ 2024-09-12 11:59 ` Mattias Rönnblom
2024-09-12 13:30 ` Bruce Richardson
0 siblings, 1 reply; 56+ messages in thread
From: Mattias Rönnblom @ 2024-09-12 11:59 UTC (permalink / raw)
To: Varghese, Vipin, Honnappa Nagarahalli; +Cc: Yigit, Ferruh, dev, nd
On 2024-09-12 13:17, Varghese, Vipin wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> <snipped>
>> >>>> Thank you Mattias for the information, as shared by in the reply
>> >>>> with
>> >> Anatoly we want expose a new API `rte_get_next_lcore_ex` which
>> >> intakes a extra argument `u32 flags`.
>> >>>> The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2,
>> >> RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
>> >> RTE_GET_LCORE_BOOST_DISABLED.
>> >>>
>> >>> Wouldn't using that API be pretty awkward to use?
>> > Current API available under DPDK is ` rte_get_next_lcore`, which is used
>> within DPDK example and in customer solution.
>> > Based on the comments from others we responded to the idea of changing
>> the new Api from ` rte_get_next_lcore_llc` to ` rte_get_next_lcore_exntd`.
>> >
>> > Can you please help us understand what is `awkward`.
>> >
>>
>> The awkwardness starts when you are trying to fit provide hwloc type
>> information over an API that was designed for iterating over lcores.
> I disagree to this point, current implementation of lcore libraries is
> only focused on iterating through list of enabled cores, core-mask, and
> lcore-map.
> With ever increasing core count, memory, io and accelerators on SoC,
> sub-numa partitioning is common in various vendor SoC. Enhancing or
> Augumenting lcore API to extract or provision NUMA, Cache Topology is
> not awkward.
DPDK providing an API for this information makes sense to me, as I've
mentioned before. What I questioned was the way it was done (i.e., the
API design) in your RFC, and the limited scope (which in part you have
addressed).
> If memory, IO and accelerator can have sub-NUMA domain, why is it
> awkward to have lcore in domains? Hence I do not agree on the
> awkwardness argument.
>>
>> It seems to me that you should either have:
>> A) An API in similar to that of hwloc (or any DOM-like API), which would give a
>> low-level description of the hardware in implementation terms.
>> The topology would consist of nodes, with attributes, etc, where nodes are
>> things like cores or instances of caches of some level and attributes are things
>> like CPU actual and nominal, and maybe max frequency, cache size, or memory
>> size.
> Here is the catch, `rte_eal_init` internally invokes `get_cpu|lcores`
> and populates thread (lcore) to physical CPU. But there is more than
> just CPU mapping, as we have seeing in SoC architecture. The argument
> shared by many is `DPDK is not the place for such topology discovery`.
> As per my current understanding, I have to disagree to the abive because
> 1. forces user to use external libraries example like hwloc
> 2. forces user to creating internal mapping for lcore, core-mask, and
> lcore-map with topology awareness code.
> My intention is to `enable end user to leverage the API format or
> similar API format (rte_get_next_lcore)` to get best results on any SoC
> (vendor agnostic).
> I fail to grasp why we are asking CPU topology to exported, while NIC,
> PCIe and accelerators are not asked to be exported via external
> libraries like hwloc.
> Hence let us setup tech call in slack or teams to understand this better.
>> or
>> B) An API to be directly useful for a work scheduler, in which case you should
>> abstract away things like "boost"
> Please note as shared in earlier reply to Bruce, I made a mistake of
> calling it boost (AMD SoC terminology). Instead it should DPDK_TURBO.
> There are use cases and DPDK examples, where cypto and compression are
> run on cores where TURBO is enabled. This allows end users to boost when
> there is more work and disable boost when there is less or no work.
>> (and fold them into some abstract capacity notion, together with core "size" [in big-little/heterogeneous systems]), and
>> have an abstract notion of what core is "close" to some other core. This would
>> something like Linux'
>> scheduling domains.
> We had similar discussion with Jerrin on the last day of Bangkok DPDK
> summit. This RFC was intended to help capture this relevant point. With
> my current understanding on selected SoC the little core on ARM Soc
> shares L2 cache, while this analogy does not cover all cases. But this
> would be good start.
>>
>> If you want B you probably need A as a part of its implementation, so you may
>> just as well start with A, I suppose.
>>
>> What you could do to explore the API design is to add support for, for
>> example, boost core awareness or SMT affinity in the SW scheduler. You could
>> also do an "lstopo" equivalent, since that's needed for debugging and
>> exploration, if nothing else.
> Not following on this analogy, will discuss in detail in tech talk
>>
>> One question that will have to be answered in a work scheduling scenario is
>> "are these two lcores SMT siblings," or "are these two cores on the same LLC",
>> or "give me all lcores on a particular L2 cache".
>>
> Is not that we have been trying to address based on Anatoly request to
> generalize than LLC. Hence we agreed on sharing version-2 of RFC with
> `rte_get_nex_lcore_extnd` with `flags`.
> May I ask where is the disconnect?
>> >>>
>> >>> I mean, what you have is a topology, with nodes of different types
>> >>> and with
>> >> different properties, and you want to present it to the user.
>> > Let me be clear, what we want via DPDK to help customer to use an Unified
>> API which works across multiple platforms.
>> > Example - let a vendor have 2 products namely A and B. CPU-A has all cores
>> within same SUB-NUMA domain and CPU-B has cores split to 2 sub-NUMA
>> domain based on split LLC.
>> > When `rte_get_next_lcore_extnd` is invoked for `LLC` on 1. CPU-A: it
>> > returns all cores as there is no split 2. CPU-B: it returns cores from
>> > specific sub-NUMA which is partitioned by L3
>> >
>>
>> I think the function name rte_get_next_lcore_extnd() alone makes clear this is an awkward API. :)
> I humbly disagree to this statement, as explained above.
>>
>> My gut feeling is to make it more explicit and forget about <rte_lcore.h>.
>> <rte_hwtopo.h>? Could and should still be EAL.
> For me this is like adding a new level of library and more code. While
> the easiest way was to add an API similar to existing `get_next_lcore`
> style for easy adoption.
A poorly designed, special-case API is not less work. It's just less
work for *you* *now*, and much more work for someone in the future to
clean it up.
>>
>> >>>
>> >>> In a sense, it's similar to XCM and DOM versus SAX. The above is
>> >>> SAX-style,
>> >> and what I have in mind is something DOM-like.
>> >>>
>> >>> What use case do you have in mind? What's on top of my list is a scenario where a DPDK app gets a bunch of cores (e.g., -l <cores>) and tries to figure out how best make use of them.
>> > Exactly.
>> >
>> > It's not going to "skip" (ignore, leave unused)
>> >> SMT siblings, or skip non-boosted cores, it would just try to be
>> >> clever in regards to which cores to use for what purpose.
>> > Let me try to share my idea on SMT sibling. When user invoked for
>> rte_get_next_lcore_extnd` is invoked for `L1 | SMT` flag with `lcore`; the API
>> identifies first whether given lcore is part of enabled core list.
>> > If yes, it programmatically either using `sysfs` or `hwloc library (shared the
>> version concern on distros. Will recheck again)` identify the sibling thread and
>> return.
>> > If there is no sibling thread available under DPDK it will fetch next lcore
>> (probably lcore +1 ).
>> >
>>
>> Distributions having old hwloc versions isn't an argument for a new DPDK library or new API. If only that was the issue, then it would be better to help the hwloc and/or distributions, rather than the DPDK project.
> I do not agree to terms of ` Distributions having old hwloc versions
> isn't an argument for a new DPDK library or new API.` Because this is
> not what my intention is. Let me be clear on Ampere & AMD Bios settings
> are 2
> 1. SLC or L3 as NUMA enable
> 2. Numa for IO|memory
> With `NUMA for IO|memory` is set hwloc library works as expected. But
> when `L3 as NUMA` is set gives incorrect details. We have been fixing
> this and pushing to upstream. But as I clearly shared, version of
> distros having latest hwloc is almost nil.
> Hence to keep things simple, in documentation of DPDK we pointed to AMD
> SoC tuning guide we have been recommending not to enable `L3 as NUMA`.
> Now end goal for me is to allow vendor agnostic API which is easy to
> understand and use, and works irrespective of BIOS settings. I have
> enabled parsing of OS `sysfs` as a RFC. But if the comment is to use
> `hwloc` as shared with response for Stephen I am open to try this again.
> <snipped>
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-12 11:59 ` Mattias Rönnblom
@ 2024-09-12 13:30 ` Bruce Richardson
2024-09-12 16:32 ` Mattias Rönnblom
0 siblings, 1 reply; 56+ messages in thread
From: Bruce Richardson @ 2024-09-12 13:30 UTC (permalink / raw)
To: Mattias Rönnblom
Cc: Varghese, Vipin, Honnappa Nagarahalli, Yigit, Ferruh, dev, nd
On Thu, Sep 12, 2024 at 01:59:34PM +0200, Mattias Rönnblom wrote:
> On 2024-09-12 13:17, Varghese, Vipin wrote:
> > [AMD Official Use Only - AMD Internal Distribution Only]
> >
> > <snipped>
> > > >>>> Thank you Mattias for the information, as shared by in the reply
> > > >>>> with
> > > >> Anatoly we want expose a new API `rte_get_next_lcore_ex` which
> > > >> intakes a extra argument `u32 flags`.
> > > >>>> The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2,
> > > >> RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
> > > >> RTE_GET_LCORE_BOOST_DISABLED.
> > > >>>
> > > >>> Wouldn't using that API be pretty awkward to use?
> > > > Current API available under DPDK is ` rte_get_next_lcore`, which is used
> > > within DPDK example and in customer solution.
> > > > Based on the comments from others we responded to the idea of changing
> > > the new Api from ` rte_get_next_lcore_llc` to ` rte_get_next_lcore_exntd`.
> > > >
> > > > Can you please help us understand what is `awkward`.
> > > >
> > >
> > > The awkwardness starts when you are trying to fit provide hwloc type
> > > information over an API that was designed for iterating over lcores.
> > I disagree to this point, current implementation of lcore libraries is
> > only focused on iterating through list of enabled cores, core-mask, and
> > lcore-map.
> > With ever increasing core count, memory, io and accelerators on SoC,
> > sub-numa partitioning is common in various vendor SoC. Enhancing or
> > Augumenting lcore API to extract or provision NUMA, Cache Topology is
> > not awkward.
>
> DPDK providing an API for this information makes sense to me, as I've
> mentioned before. What I questioned was the way it was done (i.e., the API
> design) in your RFC, and the limited scope (which in part you have
> addressed).
>
Actually, I'd like to touch on this first item a little bit. What is the
main benefit of providing this information in EAL? To me, it seems like
something that is for apps to try and be super-smart and select particular
cores out of a set of cores to run on. However, is that not taking work
that should really be the job of the person deploying the app? The deployer
- if I can use that term - has already selected a set of cores and NICs for
a DPDK application to use. Should they not also be the one selecting - via
app argument, via --lcores flag to map one core id to another, or otherwise
- which part of an application should run on what particular piece of
hardware?
In summary, what is the final real-world intended usecase for this work?
DPDK already tries to be smart about cores and NUMA, and in some cases we
have hit issues where users have - for their own valid reasons - wanted to
run DPDK in a sub-optimal way, and they end up having to fight DPDK's
smarts in order to do so! Ref: [1]
/Bruce
[1] https://git.dpdk.org/dpdk/commit/?id=ed34d87d9cfbae8b908159f60df2008e45e4c39f
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-12 13:30 ` Bruce Richardson
@ 2024-09-12 16:32 ` Mattias Rönnblom
0 siblings, 0 replies; 56+ messages in thread
From: Mattias Rönnblom @ 2024-09-12 16:32 UTC (permalink / raw)
To: Bruce Richardson
Cc: Varghese, Vipin, Honnappa Nagarahalli, Yigit, Ferruh, dev, nd
On 2024-09-12 15:30, Bruce Richardson wrote:
> On Thu, Sep 12, 2024 at 01:59:34PM +0200, Mattias Rönnblom wrote:
>> On 2024-09-12 13:17, Varghese, Vipin wrote:
>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>
>>> <snipped>
>>>>>>>> Thank you Mattias for the information, as shared by in the reply
>>>>>>>> with
>>>>>> Anatoly we want expose a new API `rte_get_next_lcore_ex` which
>>>>>> intakes a extra argument `u32 flags`.
>>>>>>>> The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2,
>>>>>> RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
>>>>>> RTE_GET_LCORE_BOOST_DISABLED.
>>>>>>>
>>>>>>> Wouldn't using that API be pretty awkward to use?
>>>>> Current API available under DPDK is ` rte_get_next_lcore`, which is used
>>>> within DPDK example and in customer solution.
>>>>> Based on the comments from others we responded to the idea of changing
>>>> the new Api from ` rte_get_next_lcore_llc` to ` rte_get_next_lcore_exntd`.
>>>>>
>>>>> Can you please help us understand what is `awkward`.
>>>>>
>>>>
>>>> The awkwardness starts when you are trying to fit provide hwloc type
>>>> information over an API that was designed for iterating over lcores.
>>> I disagree to this point, current implementation of lcore libraries is
>>> only focused on iterating through list of enabled cores, core-mask, and
>>> lcore-map.
>>> With ever increasing core count, memory, io and accelerators on SoC,
>>> sub-numa partitioning is common in various vendor SoC. Enhancing or
>>> Augumenting lcore API to extract or provision NUMA, Cache Topology is
>>> not awkward.
>>
>> DPDK providing an API for this information makes sense to me, as I've
>> mentioned before. What I questioned was the way it was done (i.e., the API
>> design) in your RFC, and the limited scope (which in part you have
>> addressed).
>>
>
> Actually, I'd like to touch on this first item a little bit. What is the
> main benefit of providing this information in EAL? To me, it seems like
> something that is for apps to try and be super-smart and select particular
> cores out of a set of cores to run on. However, is that not taking work
> that should really be the job of the person deploying the app? The deployer
> - if I can use that term - has already selected a set of cores and NICs for
> a DPDK application to use. Should they not also be the one selecting - via
> app argument, via --lcores flag to map one core id to another, or otherwise
> - which part of an application should run on what particular piece of
> hardware?
>
Scheduling in one form or another will happen on a number of levels. One
level is what you call the "deployer". Whether man or machine, it will
allocate a bunch of lcores to the application - either statically by
using -l <cores>, or dynamically, by giving a very large core mask,
combined with having an agent in the app responsible to scale up or down
the number of cores actually used (allowing coexistence with other
non-DPDK, Linux process scheduler-scheduled processes, on the same set
of cores, although not at the same time).
I think the "deployer" level should generally not be aware of the DPDK
app internals, including how to assign different tasks to different
cores. That is consistent with how things work in a general-purpose
operating system, where you allocate cores, memory and I/O devices to an
instance (e.g., a VM), but then OS' scheduler figures out how to best
use them.
The app internals may be complicated, change across software versions and
traffic mixes/patterns, and most of all, may not lend themselves to static
at-start configuration at all.
> In summary, what is the final real-world intended usecase for this work?
One real-world example is an Eventdev app with some atomic processing
stage, using DSW, and SMT. Hardware threading on Intel x86 generally
improves performance by ~25%, which seems to hold true for data plane
apps as well, in my experience. So that's a (not-so-)freebie you don't
want to miss out on. To max out single-flow performance, the work
scheduler may not only need to give 100% of an lcore to bottleneck stage
atomic processing for that elephant flow, but a *full* physical core
(i.e., assure that the SMT sibling is idle). But, DSW doesn't understand
the CPU topology, so you have to choose between max multi-flow
throughput or max single-flow throughput at the time of deployment. A
RTE hwtopo API would certainly help in the implementation of SMT-aware
scheduling.
Another example could be the use of bigger or turbo-capable cores to run
CPU-hungry, singleton services (e.g., an Eventdev RX timer adapter core),
or the use of a hardware thread to run the SW scheduler service (which
needs to react quickly to incoming scheduling events, but maybe does not
need all the cycles of a full physical core).
Yet another example would be an event device which understands how to
spread a particular flow across multiple cores, but uses only cores
sharing the same L2. Or, keep processing of a certain kind (e.g., a
certain Eventdev Queue) only on cores with the same L2, to improve L2 hit
rates for instructions and data related to that processing stage.
> DPDK already tries to be smart about cores and NUMA, and in some cases we
> have hit issues where users have - for their own valid reasons - wanted to
> run DPDK in a sub-optimal way, and they end up having to fight DPDK's
> smarts in order to do so! Ref: [1]
>
> /Bruce
>
> [1] https://git.dpdk.org/dpdk/commit/?id=ed34d87d9cfbae8b908159f60df2008e45e4c39f
^ permalink raw reply [flat|nested] 56+ messages in thread
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-11 17:04 ` Honnappa Nagarahalli
2024-09-12 1:33 ` Varghese, Vipin
@ 2024-09-12 2:28 ` Varghese, Vipin
1 sibling, 0 replies; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-12 2:28 UTC (permalink / raw)
To: Honnappa Nagarahalli, Mattias Rönnblom; +Cc: Yigit, Ferruh, dev, nd
[Public]
<snipped>
> > What use case do you have in mind? What's on top of my list is a scenario
> where a DPDK app gets a bunch of cores (e.g., -l <cores>) and tries to figure
> out how best make use of them. It's not going to "skip" (ignore, leave unused)
> SMT siblings, or skip non-boosted cores, it would just try to be clever in
> regards to which cores to use for what purpose.
> >
> >> This is AMD EPYC SoC agnostic and trying to address for all generic cases.
> >> Please do let us know if we (Ferruh & myself) can sync up via call?
> >
> > Sure, I can do that.
> >
> Can this be opened to the rest of the community? This is a common problem
> that needs to be solved for multiple architectures. I would be interested in
> attending.
Hi Honnappa, we can accommodate the same. Let me work with Ferruh on how to set it up as a tech discussion.
<snipped>
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-11 3:26 ` Varghese, Vipin
2024-09-11 15:55 ` Mattias Rönnblom
@ 2024-09-11 16:01 ` Bruce Richardson
2024-09-11 22:25 ` Konstantin Ananyev
2024-09-12 2:19 ` Varghese, Vipin
1 sibling, 2 replies; 56+ messages in thread
From: Bruce Richardson @ 2024-09-11 16:01 UTC (permalink / raw)
To: Varghese, Vipin; +Cc: Mattias Rönnblom, Yigit, Ferruh, dev
On Wed, Sep 11, 2024 at 03:26:20AM +0000, Varghese, Vipin wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> <snipped>
>
> >
> > On 2024-09-09 16:22, Varghese, Vipin wrote:
> > > [AMD Official Use Only - AMD Internal Distribution Only]
> > >
> > > <snipped>
> > >
> > >>> <snipped>
> > >>>
> > >>> Thank you Mattias for the comments and question, please let me try
> > >>> to explain the same below
> > >>>
> > >>>> We shouldn't have a separate CPU/cache hierarchy API instead?
> > >>>
> > >>> Based on the intention to bring in CPU lcores which share same L3
> > >>> (for better cache hits and less noisy neighbor) current API focuses
> > >>> on using
> > >>>
> > >>> Last Level Cache. But if the suggestion is `there are SoC where L2
> > >>> cache are also shared, and the new API should be provisioned`, I am
> > >>> also
> > >>>
> > >>> comfortable with the thought.
> > >>>
> > >>
> > >> Rather than some AMD special case API hacked into <rte_lcore.h>, I
> > >> think we are better off with no DPDK API at all for this kind of functionality.
> > >
> > > Hi Mattias, as shared in the earlier email thread, this is not a AMD special
> > case at all. Let me try to explain this one more time. One of techniques used to
> > increase cores cost effective way to go for tiles of compute complexes.
> > > This introduces a bunch of cores in sharing same Last Level Cache (namely
> > L2, L3 or even L4) depending upon cache topology architecture.
> > >
> > > The API suggested in RFC is to help end users to selectively use cores under
> > same Last Level Cache Hierarchy as advertised by OS (irrespective of the BIOS
> > settings used). This is useful in both bare-metal and container environment.
> > >
> >
> > I'm pretty familiar with AMD CPUs and the use of tiles (including the
> > challenges these kinds of non-uniformities pose for work scheduling).
> >
> > To maximize performance, caring about core<->LLC relationship may well not
> > be enough, and more HT/core/cache/memory topology information is
> > required. That's what I meant by special case. A proper API should allow
> > access to information about which lcores are SMT siblings, cores on the same
> > L2, and cores on the same L3, to name a few things. Probably you want to fit
> > NUMA into the same API as well, although that is available already in
> > <rte_lcore.h>.
>
> Thank you Mattias for the information, as shared by in the reply with Anatoly we want expose a new API `rte_get_next_lcore_ex` which intakes a extra argument `u32 flags`.
> The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2, RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED, RTE_GET_LCORE_BOOST_DISABLED.
>
For the naming, would "rte_get_next_sibling_core" (or lcore if you prefer)
be a clearer name than just adding "ex" on to the end of the existing
function?
Looking logically, I'm not sure about the BOOST_ENABLED and BOOST_DISABLED
flags you propose - in a system with multiple possible standard and boost
frequencies what would those correspond to? What's also missing is a define
for getting actual NUMA siblings i.e. those sharing common memory but not
an L3 or anything else.
My suggestion would be to have the function take just an integer-type e.g.
uint16_t parameter which defines the memory/cache hierarchy level to use, 0
being lowest, 1 next, and so on. Different systems may have different
numbers of cache levels so let's just make it a zero-based index of levels,
rather than giving explicit defines (except for memory which should
probably always be last). The zero-level will be for "closest neighbour"
whatever that happens to be, with as many levels as is necessary to express
the topology, e.g. without SMT, but with 3 cache levels, level 0 would be
an L2 neighbour, level 1 an L3 neighbour. If the L3 was split within a
memory NUMA node, then level 2 would give the NUMA siblings. We'd just need
an API to return the max number of levels along with the iterator.
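To make the shape of that concrete, something along these lines; the names `rte_lcore_topo_levels` and `rte_get_next_lcore_at_level` are invented here purely for illustration, nothing like them exists in DPDK today.
```
#include <rte_lcore.h>

/* Hypothetical API shape: level 0 = closest neighbours, higher levels =
 * further away, last level = plain NUMA siblings. */
unsigned int rte_lcore_topo_levels(void);
unsigned int rte_get_next_lcore_at_level(unsigned int i, unsigned int ref_lcore,
					 unsigned int level, int skip_main, int wrap);

/* Usage sketch: visit every enabled lcore sharing level 1 (e.g. the L3)
 * with the current lcore. */
static void walk_level1_neighbours(void)
{
	unsigned int self = rte_lcore_id();
	unsigned int lcore = rte_get_next_lcore_at_level(-1, self, 1, 1, 0);

	while (lcore < RTE_MAX_LCORE) {
		/* lcore is a level-1 neighbour of self */
		lcore = rte_get_next_lcore_at_level(lcore, self, 1, 1, 0);
	}
}
```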
Regards,
/Bruce
^ permalink raw reply [flat|nested] 56+ messages in thread
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-11 16:01 ` Bruce Richardson
@ 2024-09-11 22:25 ` Konstantin Ananyev
2024-09-12 2:38 ` Varghese, Vipin
2024-09-12 2:19 ` Varghese, Vipin
1 sibling, 1 reply; 56+ messages in thread
From: Konstantin Ananyev @ 2024-09-11 22:25 UTC (permalink / raw)
To: Bruce Richardson, Varghese, Vipin
Cc: Mattias Rönnblom, Yigit, Ferruh, dev
> > > >>> Thank you Mattias for the comments and question, please let me try
> > > >>> to explain the same below
> > > >>>
> > > >>>> We shouldn't have a separate CPU/cache hierarchy API instead?
> > > >>>
> > > >>> Based on the intention to bring in CPU lcores which share same L3
> > > >>> (for better cache hits and less noisy neighbor) current API focuses
> > > >>> on using
> > > >>>
> > > >>> Last Level Cache. But if the suggestion is `there are SoC where L2
> > > >>> cache are also shared, and the new API should be provisioned`, I am
> > > >>> also
> > > >>>
> > > >>> comfortable with the thought.
> > > >>>
> > > >>
> > > >> Rather than some AMD special case API hacked into <rte_lcore.h>, I
> > > >> think we are better off with no DPDK API at all for this kind of functionality.
> > > >
> > > > Hi Mattias, as shared in the earlier email thread, this is not a AMD special
> > > case at all. Let me try to explain this one more time. One of techniques used to
> > > increase cores cost effective way to go for tiles of compute complexes.
> > > > This introduces a bunch of cores in sharing same Last Level Cache (namely
> > > L2, L3 or even L4) depending upon cache topology architecture.
> > > >
> > > > The API suggested in RFC is to help end users to selectively use cores under
> > > same Last Level Cache Hierarchy as advertised by OS (irrespective of the BIOS
> > > settings used). This is useful in both bare-metal and container environment.
> > > >
> > >
> > > I'm pretty familiar with AMD CPUs and the use of tiles (including the
> > > challenges these kinds of non-uniformities pose for work scheduling).
> > >
> > > To maximize performance, caring about core<->LLC relationship may well not
> > > be enough, and more HT/core/cache/memory topology information is
> > > required. That's what I meant by special case. A proper API should allow
> > > access to information about which lcores are SMT siblings, cores on the same
> > > L2, and cores on the same L3, to name a few things. Probably you want to fit
> > > NUMA into the same API as well, although that is available already in
> > > <rte_lcore.h>.
> >
> > Thank you Mattias for the information, as shared by in the reply with Anatoly we want expose a new API `rte_get_next_lcore_ex`
> which intakes a extra argument `u32 flags`.
> > The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2, RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
> RTE_GET_LCORE_BOOST_DISABLED.
> >
>
> For the naming, would "rte_get_next_sibling_core" (or lcore if you prefer)
> be a clearer name than just adding "ex" on to the end of the existing
> function?
>
> Looking logically, I'm not sure about the BOOST_ENABLED and BOOST_DISABLED
> flags you propose - in a system with multiple possible standard and boost
> frequencies what would those correspond to? What's also missing is a define
> for getting actual NUMA siblings i.e. those sharing common memory but not
> an L3 or anything else.
>
> My suggestion would be to have the function take just an integer-type e.g.
> uint16_t parameter which defines the memory/cache hierarchy level to use, 0
> being lowest, 1 next, and so on. Different systems may have different
> numbers of cache levels so lets just make it a zero-based index of levels,
> rather than giving explicit defines (except for memory which should
> probably always be last). The zero-level will be for "closest neighbour"
> whatever that happens to be, with as many levels as is necessary to express
> the topology, e.g. without SMT, but with 3 cache levels, level 0 would be
> an L2 neighbour, level 1 an L3 neighbour. If the L3 was split within a
> memory NUMA node, then level 2 would give the NUMA siblings. We'd just need
> an API to return the max number of levels along with the iterator.
Sounds like a neat idea to me.
^ permalink raw reply [flat|nested] 56+ messages in thread
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-11 22:25 ` Konstantin Ananyev
@ 2024-09-12 2:38 ` Varghese, Vipin
0 siblings, 0 replies; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-12 2:38 UTC (permalink / raw)
To: Konstantin Ananyev, Bruce Richardson
Cc: Mattias Rönnblom, Yigit, Ferruh, dev
[AMD Official Use Only - AMD Internal Distribution Only]
<snipped>
> >
> > For the naming, would "rte_get_next_sibling_core" (or lcore if you
> > prefer) be a clearer name than just adding "ex" on to the end of the
> > existing function?
> >
> > Looking logically, I'm not sure about the BOOST_ENABLED and
> > BOOST_DISABLED flags you propose - in a system with multiple possible
> > standard and boost frequencies what would those correspond to? What's
> > also missing is a define for getting actual NUMA siblings i.e. those
> > sharing common memory but not an L3 or anything else.
> >
> > My suggestion would be to have the function take just an integer-type e.g.
> > uint16_t parameter which defines the memory/cache hierarchy level to
> > use, 0 being lowest, 1 next, and so on. Different systems may have
> > different numbers of cache levels so lets just make it a zero-based
> > index of levels, rather than giving explicit defines (except for
> > memory which should probably always be last). The zero-level will be for
> "closest neighbour"
> > whatever that happens to be, with as many levels as is necessary to
> > express the topology, e.g. without SMT, but with 3 cache levels, level
> > 0 would be an L2 neighbour, level 1 an L3 neighbour. If the L3 was
> > split within a memory NUMA node, then level 2 would give the NUMA
> > siblings. We'd just need an API to return the max number of levels along with
> the iterator.
>
> Sounds like a neat idea to me.
Hi Konstantin, I have tried my best to address Bruce's comment. Let me try to recap:
1. we want a vendor-agnostic API which allows end users to get the list of lcores
2. this can be based on L1 (SMT), L2, L3, NUMA, TURBO (as of now)
3. instead of creating multiple different APIs, we would like to add 1 API `rte_get_next_lcore_extnd` which can be controlled with `flags`
4. the flag can be single or a combination (like L3|TURBO_ENABLED or NUMA|TURBO_ENABLED); a usage sketch follows below
5. as per my current idea, we can expand ease of use via MACROs and not APIs.
I hope this justifies why we should have 1 extended API and wrap things with macros.
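A usage sketch of point 4, assuming the v2 RFC keeps a `rte_get_next_lcore_extnd(lcore, skip_main, wrap, flags)` shape with OR-able flags; both the flag names and the signature are placeholders until that RFC is posted.
```
/* Illustrative only: iterate enabled lcores that share ref_lcore's L3
 * and have turbo available, using combined flags. */
#ifdef ILLUSTRATIVE_ONLY
static void walk_l3_turbo(unsigned int ref_lcore)
{
	uint32_t flags = RTE_GET_LCORE_L3 | RTE_GET_LCORE_TURBO;
	unsigned int lcore = rte_get_next_lcore_extnd(ref_lcore, 1, 0, flags);

	while (lcore < RTE_MAX_LCORE) {
		/* lcore shares ref_lcore's L3 and is turbo-capable */
		lcore = rte_get_next_lcore_extnd(lcore, 1, 0, flags);
	}
}
#endif
```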
May I set up, or add to, a tech discussion call with Mattias and Honnappa too?
^ permalink raw reply [flat|nested] 56+ messages in thread
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-11 16:01 ` Bruce Richardson
2024-09-11 22:25 ` Konstantin Ananyev
@ 2024-09-12 2:19 ` Varghese, Vipin
2024-09-12 9:17 ` Bruce Richardson
1 sibling, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-12 2:19 UTC (permalink / raw)
To: Bruce Richardson; +Cc: Mattias Rönnblom, Yigit, Ferruh, dev
[Public]
<snipped>
> > > > <snipped>
> > > >
> > > >>> <snipped>
> > > >>>
> > > >>> Thank you Mattias for the comments and question, please let me
> > > >>> try to explain the same below
> > > >>>
> > > >>>> We shouldn't have a separate CPU/cache hierarchy API instead?
> > > >>>
> > > >>> Based on the intention to bring in CPU lcores which share same
> > > >>> L3 (for better cache hits and less noisy neighbor) current API
> > > >>> focuses on using
> > > >>>
> > > >>> Last Level Cache. But if the suggestion is `there are SoC where
> > > >>> L2 cache are also shared, and the new API should be
> > > >>> provisioned`, I am also
> > > >>>
> > > >>> comfortable with the thought.
> > > >>>
> > > >>
> > > >> Rather than some AMD special case API hacked into <rte_lcore.h>,
> > > >> I think we are better off with no DPDK API at all for this kind of
> functionality.
> > > >
> > > > Hi Mattias, as shared in the earlier email thread, this is not a
> > > > AMD special
> > > case at all. Let me try to explain this one more time. One of
> > > techniques used to increase cores cost effective way to go for tiles of
> compute complexes.
> > > > This introduces a bunch of cores in sharing same Last Level Cache
> > > > (namely
> > > L2, L3 or even L4) depending upon cache topology architecture.
> > > >
> > > > The API suggested in RFC is to help end users to selectively use
> > > > cores under
> > > same Last Level Cache Hierarchy as advertised by OS (irrespective of
> > > the BIOS settings used). This is useful in both bare-metal and container
> environment.
> > > >
> > >
> > > I'm pretty familiar with AMD CPUs and the use of tiles (including
> > > the challenges these kinds of non-uniformities pose for work scheduling).
> > >
> > > To maximize performance, caring about core<->LLC relationship may
> > > well not be enough, and more HT/core/cache/memory topology
> > > information is required. That's what I meant by special case. A
> > > proper API should allow access to information about which lcores are
> > > SMT siblings, cores on the same L2, and cores on the same L3, to
> > > name a few things. Probably you want to fit NUMA into the same API
> > > as well, although that is available already in <rte_lcore.h>.
> >
> > Thank you Mattias for the information, as shared by in the reply with
> Anatoly we want expose a new API `rte_get_next_lcore_ex` which intakes a
> extra argument `u32 flags`.
> > The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2,
> RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
> RTE_GET_LCORE_BOOST_DISABLED.
> >
>
> For the naming, would "rte_get_next_sibling_core" (or lcore if you prefer) be a
> clearer name than just adding "ex" on to the end of the existing function?
Thank you Bruce, please find my answer below.
Functions shared as per the RFC were
```
- rte_get_llc_first_lcores: Retrieves all the first lcores in the shared LLC.
- rte_get_llc_lcore: Retrieves all lcores that share the LLC.
- rte_get_llc_n_lcore: Retrieves the first n or skips the first n lcores in the shared LLC.
```
MACROs extending the usability were
```
RTE_LCORE_FOREACH_LLC_FIRST: iterates through the first lcore from each LLC.
RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through the first worker lcore from each LLC.
RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from the LLC based on a hint (lcore id).
RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from the LLC while skipping the first worker.
RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through the first `n` lcores from each LLC.
RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skips the first `n` lcores, then iterates through the remaining lcores in each LLC.
```
Based on the discussions we agreed on sharing a version-2 RFC extending the API as `rte_get_next_lcore_extnd` with an extra `flags` argument.
As per my ideation, the API `rte_get_next_sibling_core` can easily be covered by the above API with the flag `RTE_GET_LCORE_L1` (SMT). Is this the right understanding?
We can easily have simple MACROs like `RTE_LCORE_FOREACH_L1` which allow iterating over SMT sibling threads; a usage sketch of this macro style follows below.
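A usage sketch of this macro style, assuming the proposed macros follow the single-iterator-variable convention of `RTE_LCORE_FOREACH`; their final form depends on the v2 RFC.
```
#include <rte_launch.h>
#include <rte_lcore.h>

/* Illustrative only: launch one instance of a stage on the first worker
 * lcore of each LLC domain. */
#ifdef ILLUSTRATIVE_ONLY
static void launch_one_per_llc(lcore_function_t *stage, void *arg)
{
	unsigned int lcore;

	RTE_LCORE_FOREACH_LLC_FIRST_WORKER(lcore)
		rte_eal_remote_launch(stage, arg, lcore);
}
#endif
```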
>
> Looking logically, I'm not sure about the BOOST_ENABLED and
> BOOST_DISABLED flags you propose
The idea for the BOOST_ENABLED & BOOST_DISABLED flags is based on the DPDK power library, which allows enabling boost.
Allow the user to select lcores where BOOST is enabled|disabled using a MACRO or API.
- in a system with multiple possible
> standard and boost frequencies what would those correspond to?
I now understand the confusion; apologies for mixing the AMD EPYC SoC boost with Intel Turbo.
Thank you for pointing that out; we will use the terminology `RTE_GET_LCORE_TURBO`.
What's also
> missing is a define for getting actual NUMA siblings i.e. those sharing common
> memory but not an L3 or anything else.
This can be extended into `rte_get_next_lcore_extnd` with the flag `RTE_GET_LCORE_NUMA`. This will allow grabbing all lcores under the same sub-memory NUMA domain as the shared lcore.
If SMT siblings are enabled and the DPDK lcore mask covers the sibling threads, then `RTE_GET_LCORE_NUMA` gets all lcores and sibling threads under the same memory NUMA domain as the shared lcore.
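Note that a coarse, socket-level form of this is already possible with existing EAL calls; a minimal sketch, which of course does not capture sub-NUMA domains such as `L3 as NUMA` (that is what the proposed flag would add):
```
#include <rte_lcore.h>

/* Collect enabled lcores on the same NUMA socket as ref_lcore.
 * Returns the number of lcores written (bounded by 'max'). */
static unsigned int numa_siblings(unsigned int ref_lcore,
				  unsigned int *out, unsigned int max)
{
	unsigned int socket = rte_lcore_to_socket_id(ref_lcore);
	unsigned int lcore, n = 0;

	RTE_LCORE_FOREACH(lcore) {
		if (lcore != ref_lcore && n < max &&
		    rte_lcore_to_socket_id(lcore) == socket)
			out[n++] = lcore;
	}
	return n;
}
```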
>
> My suggestion would be to have the function take just an integer-type e.g.
> uint16_t parameter which defines the memory/cache hierarchy level to use, 0
> being lowest, 1 next, and so on. Different systems may have different numbers
> of cache levels so lets just make it a zero-based index of levels, rather than
> giving explicit defines (except for memory which should probably always be
> last). The zero-level will be for "closest neighbour"
Good idea, we did prototype this internally. But the issue is that it keeps adding to the number of APIs in the lcore library.
To keep the API count low, we are using the lcore id as a hint to the sub-NUMA domain.
> whatever that happens to be, with as many levels as is necessary to express
> the topology, e.g. without SMT, but with 3 cache levels, level 0 would be an L2
> neighbour, level 1 an L3 neighbour. If the L3 was split within a memory NUMA
> node, then level 2 would give the NUMA siblings. We'd just need an API to
> return the max number of levels along with the iterator.
We are using the lcore's NUMA as the hint.
>
> Regards,
> /Bruce
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-12 2:19 ` Varghese, Vipin
@ 2024-09-12 9:17 ` Bruce Richardson
2024-09-12 11:50 ` Varghese, Vipin
2024-09-12 13:18 ` Mattias Rönnblom
0 siblings, 2 replies; 56+ messages in thread
From: Bruce Richardson @ 2024-09-12 9:17 UTC (permalink / raw)
To: Varghese, Vipin; +Cc: Mattias Rönnblom, Yigit, Ferruh, dev
On Thu, Sep 12, 2024 at 02:19:07AM +0000, Varghese, Vipin wrote:
> [Public]
>
> <snipped>
>
>
>
> > > > > <snipped>
>
> > > > >
>
> > > > >>> <snipped>
>
> > > > >>>
>
> > > > >>> Thank you Mattias for the comments and question, please let
> me
>
> > > > >>> try to explain the same below
>
> > > > >>>
>
> > > > >>>> We shouldn't have a separate CPU/cache hierarchy API
> instead?
>
> > > > >>>
>
> > > > >>> Based on the intention to bring in CPU lcores which share
> same
>
> > > > >>> L3 (for better cache hits and less noisy neighbor) current
> API
>
> > > > >>> focuses on using
>
> > > > >>>
>
> > > > >>> Last Level Cache. But if the suggestion is `there are SoC
> where
>
> > > > >>> L2 cache are also shared, and the new API should be
>
> > > > >>> provisioned`, I am also
>
> > > > >>>
>
> > > > >>> comfortable with the thought.
>
> > > > >>>
>
> > > > >>
>
> > > > >> Rather than some AMD special case API hacked into
> <rte_lcore.h>,
>
> > > > >> I think we are better off with no DPDK API at all for this
> kind of
>
> > functionality.
>
> > > > >
>
> > > > > Hi Mattias, as shared in the earlier email thread, this is not
> a
>
> > > > > AMD special
>
> > > > case at all. Let me try to explain this one more time. One of
>
> > > > techniques used to increase cores cost effective way to go for
> tiles of
>
> > compute complexes.
>
> > > > > This introduces a bunch of cores in sharing same Last Level
> Cache
>
> > > > > (namely
>
> > > > L2, L3 or even L4) depending upon cache topology architecture.
>
> > > > >
>
> > > > > The API suggested in RFC is to help end users to selectively
> use
>
> > > > > cores under
>
> > > > same Last Level Cache Hierarchy as advertised by OS (irrespective
> of
>
> > > > the BIOS settings used). This is useful in both bare-metal and
> container
>
> > environment.
>
> > > > >
>
> > > >
>
> > > > I'm pretty familiar with AMD CPUs and the use of tiles (including
>
> > > > the challenges these kinds of non-uniformities pose for work
> scheduling).
>
> > > >
>
> > > > To maximize performance, caring about core<->LLC relationship may
>
> > > > well not be enough, and more HT/core/cache/memory topology
>
> > > > information is required. That's what I meant by special case. A
>
> > > > proper API should allow access to information about which lcores
> are
>
> > > > SMT siblings, cores on the same L2, and cores on the same L3, to
>
> > > > name a few things. Probably you want to fit NUMA into the same
> API
>
> > > > as well, although that is available already in <rte_lcore.h>.
>
> > >
>
> > > Thank you Mattias for the information, as shared by in the reply
> with
>
> > Anatoly we want expose a new API `rte_get_next_lcore_ex` which
> intakes a
>
> > extra argument `u32 flags`.
>
> > > The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2,
>
> > RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
>
> > RTE_GET_LCORE_BOOST_DISABLED.
>
> > >
>
> >
>
> > For the naming, would "rte_get_next_sibling_core" (or lcore if you
> prefer) be a
>
> > clearer name than just adding "ex" on to the end of the existing
> function?
>
> Thank you Bruce, Please find my answer below
>
>
>
> Functions shared as per the RFC were
>
> ```
>
> - rte_get_llc_first_lcores: Retrieves all the first lcores in the
> shared LLC.
>
> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
>
> - rte_get_llc_n_lcore: Retrieves the first n or skips the first n
> lcores in the shared LLC.
>
> ```
>
>
>
> MACRO’s extending the usability were
>
> ```
>
> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from each
> LLC.
>
> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first worker
> lcore from each LLC.
>
> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on hint
> (lcore id).
>
> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from LLC while
> skipping first worker.
>
> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n` lcores from
> each LLC.
>
> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then
> iterates through reaming lcores in each LLC.
>
> ```
>
>
>
> Based on the discussions we agreed on sharing version-2 FRC for
> extending API as `rte_get_next_lcore_extnd` with extra argument as
> `flags`.
>
> As per my ideation, for the API ` rte_get_next_sibling_core`, the above
> API can easily with flag ` RTE_GET_LCORE_L1 (SMT)`. Is this right
> understanding?
>
> We can easily have simple MACROs like `RTE_LCORE_FOREACH_L1` which
> allows to iterate SMT sibling threads.
>
>
This seems like a lot of new macro and API additions! I'd really like to
cut that back and simplify the amount of new things we are adding to DPDK
for this. I tend to agree with others that external libs would be better
for apps that really want to deal with all this.
>
> >
>
> > Looking logically, I'm not sure about the BOOST_ENABLED and
>
> > BOOST_DISABLED flags you propose
>
> The idea for the BOOST_ENABLED & BOOST_DISABLED is based on DPDK power
> library which allows to enable boost.
>
> Allow user to select lcores where BOOST is enabled|disabled using MACRO
> or API.
>
>
>
> - in a system with multiple possible
>
> > standard and boost frequencies what would those correspond to?
>
> I now understand the confusion, apologies for mixing the AMD EPYC SoC
> boost with Intel Turbo.
>
>
>
> Thank you for pointing out, we will use the terminology `
> RTE_GET_LCORE_TURBO`.
>
>
That still doesn't clarify it for me. If you start mixing in power
management related functions in with topology ones things will turn into a
real headache. What does boost or turbo correspond to? Is it for cores that
have the feature enabled - whether or not it's currently in use - or is it
for finding cores that are currently boosted? Do we need additions for
cores that are boosted by 100MHz vs say 300MHz? What about cores that are
running at lower frequencies for power-saving? Do we add macros for finding those?
>
> What's also
>
> > missing is a define for getting actual NUMA siblings i.e. those
> sharing common
>
> > memory but not an L3 or anything else.
>
> This can be extended into `rte_get_next_lcore_extnd` with flag `
> RTE_GET_LCORE_NUMA`. This will allow to grab all lcores under the same
> sub-memory NUMA as shared by LCORE.
>
> If SMT sibling is enabled and DPDK Lcore mask covers the sibling
> threads, then ` RTE_GET_LCORE_NUMA` get all lcore and sibling threads
> under same memory NUMA of lcore shared.
>
>
Yes. That can work. But it means we are basing the implementation on a
fixed idea of what topologies there are or can exist. My suggestion below
is just to ignore the whole idea of L1 vs L2 vs NUMA - just give the app a
way to find its nearest nodes.
After all, the app doesn't want to know the topology just for the sake of
knowing it - it wants it to ensure best placement of work on cores! To that
end, it just needs to know what cores are near to each other and what are
far away.
>
> >
>
> > My suggestion would be to have the function take just an integer-type
> e.g.
>
> > uint16_t parameter which defines the memory/cache hierarchy level to
> use, 0
>
> > being lowest, 1 next, and so on. Different systems may have different
> numbers
>
> > of cache levels so lets just make it a zero-based index of levels,
> rather than
>
> > giving explicit defines (except for memory which should probably
> always be
>
> > last). The zero-level will be for "closest neighbour"
>
> Good idea, we did prototype this internally. But issue it will keep on
> adding the number of API into lcore library.
>
> To keep the API count less, we are using lcore id as hint to sub-NUMA.
>
I'm unclear about this keeping the API count down - you are proposing a lot
of APIs and macros up above. My suggestion is basically to add two APIs and
no macros: one API to get the max number of topology-nearness levels, and a
second API to get the next sibling at a given nearness level, from
0(nearest)..N(furthest). If we want, we can also add a FOREACH macro too.
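For illustration, such a FOREACH wrapper could be a one-liner over the iterator; `rte_get_next_lcore_at_level()` here reuses the purely hypothetical name sketched earlier in the thread and is not an existing DPDK API.
```
/* Hypothetical convenience wrapper over a nearness-level iterator. */
#define RTE_LCORE_FOREACH_AT_LEVEL(i, ref, level)                          \
	for ((i) = rte_get_next_lcore_at_level(-1, (ref), (level), 1, 0);  \
	     (i) < RTE_MAX_LCORE;                                          \
	     (i) = rte_get_next_lcore_at_level((i), (ref), (level), 1, 0))
```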
Overall, though, as I say above, let's focus on the problem the app
actually wants these APIs for, not how we think we should solve it. Apps
don't want to know the topology for knowledge sake, they want to use that
knowledge to improve performance by pinning tasks to cores. What is the
minimum that we need to provide to enable the app to do that? For example,
if there are no lcores that share an L1, then from an app topology
viewpoint that L1 level may as well not exist, because it provides us no
details on how to place our work.
For the rare app that does have some esoteric use-case that does actually
want to know some intricate details of the topology, then having that app
use an external lib is probably a better solution than us trying to cover
all possible options in DPDK.
My 2c. on this at this stage anyway.
/Bruce
^ permalink raw reply [flat|nested] 56+ messages in thread
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-12 9:17 ` Bruce Richardson
@ 2024-09-12 11:50 ` Varghese, Vipin
2024-09-13 14:15 ` Burakov, Anatoly
2024-09-12 13:18 ` Mattias Rönnblom
1 sibling, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-12 11:50 UTC (permalink / raw)
To: Bruce Richardson; +Cc: Mattias Rönnblom, Yigit, Ferruh, dev
[Public]
Snipped
> >
> >
> > Based on the discussions we agreed on sharing version-2 FRC for
> > extending API as `rte_get_next_lcore_extnd` with extra argument as
> > `flags`.
> >
> > As per my ideation, for the API ` rte_get_next_sibling_core`, the above
> > API can easily with flag ` RTE_GET_LCORE_L1 (SMT)`. Is this right
> > understanding?
> >
> > We can easily have simple MACROs like `RTE_LCORE_FOREACH_L1` which
> > allows to iterate SMT sibling threads.
> >
> >
>
> This seems like a lot of new macro and API additions! I'd really like to cut that
> back and simplify the amount of new things we are adding to DPDK for this.
I disagree, Bruce. As per the new conversation with Anatoly and you, it has been shared that the new APIs are
```
1. rte_get_next_lcore_exntd
2. rte_get_next_n_lcore_exntd
```
While I mentioned that custom macros can augment these based on typical flag usage, similar to `RTE_LCORE_FOREACH` and `RTE_LCORE_FOREACH_WORKER`, such as
```
RTE_LCORE_FOREACH_FLAG
RTE_LCORE_FOREACH_WORKER_FLAG
Or
RTE_LCORE_FOREACH_LLC
RTE_LCORE_FOREACH_WORKER_LLC
```
Please note I have not even shared version-2 of the RFC yet.
> I tend to agree with others that external libs would be better for apps that really want to deal with all this.
I have covered why this is not a good idea in my response to Mattias' query.
>
> >
> > >
> >
> > > Looking logically, I'm not sure about the BOOST_ENABLED and BOOST_DISABLED flags you propose
> > The idea for the BOOST_ENABLED & BOOST_DISABLED is based on DPDK power library which allows to enable boost.
> > Allow user to select lcores where BOOST is enabled|disabled using MACRO or API.
Maybe there is confusion, so let me try to be explicit here. The intention of any `rte_get_next_lcore##` is to fetch lcores.
Hence the newly proposed API `rte_get_next_lcore_exntd` with the `flag set for Boost` is to fetch lcores where boost is enabled.
There is no intention to enable or disable boost on an lcore with a `get` API.
> >
> >
> >
> > - in a system with multiple possible
> >
> > > standard and boost frequencies what would those correspond to?
> >
> > I now understand the confusion, apologies for mixing the AMD EPYC SoC
> > boost with Intel Turbo.
> >
> >
> >
> > Thank you for pointing out, we will use the terminology `
> > RTE_GET_LCORE_TURBO`.
> >
> >
>
> That still doesn't clarify it for me. If you start mixing in power management related functions in with topology ones things will turn into a real headache.
Can you please tell me what is not clarified? DPDK lcores as of today have no notion of cache, NUMA, power, turbo or any other DPDK-supported feature.
The initial APIs introduced were to expose lcores sharing the same Last Level Cache. Based on the interaction with Anatoly, extending this to support multiple features turned out to be a possibility.
Hence, we said we can share a v2 of the RFC based on this idea.
But if the ask is not to include TURBO, I am also OK with this. Let us only keep the cache and NUMA-IO domains.
> What does boost or turbo correspond to? Is it for cores that have the feature enabled - whether or not it's currently in use - or is it for finding cores that are
> currently boosted? Do we need additions for cores that are boosted by 100Mhz vs say 300Mhz. What about cores that are in lower frequencies for
> power-saving. Do we add macros for finding those?
Why are we talking about freq-up and freq-down? This was not even discussed in this RFC patch at all.
> >
> > What's also
> >
> > > missing is a define for getting actual NUMA siblings i.e. those
> > sharing common memory but not an L3 or anything else.
> >
> > This can be extended into `rte_get_next_lcore_extnd` with flag `
> > RTE_GET_LCORE_NUMA`. This will allow to grab all lcores under the same
> > sub-memory NUMA as shared by LCORE.
> >
> > If SMT sibling is enabled and DPDK Lcore mask covers the sibling
> > threads, then ` RTE_GET_LCORE_NUMA` get all lcore and sibling threads
> > under same memory NUMA of lcore shared.
> >
> >
>
> Yes. That can work. But it means we are basing the implementation on a fixed idea of what topologies there are or can exist.
> My suggestion below is just to ignore the whole idea of L1 vs L2 vs NUMA - just give the app a way to find it's nearest nodes.
Bruce, for different vendor SoCs the architecture implementations differ. Let me share what I know:
1. using L1, we can fetch the SMT threads
2. using L2, we can get certain SoCs on Arm, Intel and PowerPC which have something like efficiency cores
3. using L3, we can get certain SoCs like AMD, AF64x and others which follow a chiplet or tile-split L3 domain.
>
> After all, the app doesn't want to know the topology just for the sake of knowing it - it wants it to ensure best placement of work on cores! To that end, it just needs to know what cores are near to each other and what are far away.
Exactly, that is why we want to minimize new libraries and keep to the format of the existing API `rte_get_next_lcore`. Otherwise the end user needs to deploy another, external library and then map its output onto the DPDK lcore mapping to identify what is where.
So as an end user, I prefer a simple API which gets my work done.
>
> >
> > >
> >
> > > My suggestion would be to have the function take just an integer-type
> > e.g.
> >
> > > uint16_t parameter which defines the memory/cache hierarchy level to
> > use, 0
> >
> > > being lowest, 1 next, and so on. Different systems may have different
> > numbers
> >
> > > of cache levels so lets just make it a zero-based index of levels,
> > rather than
> >
> > > giving explicit defines (except for memory which should probably
> > always be
> >
> > > last). The zero-level will be for "closest neighbour"
> >
> > Good idea, we did prototype this internally. But issue it will keep on
> > adding the number of API into lcore library.
> >
> > To keep the API count less, we are using lcore id as hint to sub-NUMA.
> >
>
> I'm unclear about this keeping the API count down - you are proposing a lot of APIs and macros up above.
No, I am not. As I have shared, based on the last discussion with Anatoly we will end up with only 2 APIs in lcore. This is explained in the above response.
> My suggestion is basically to add two APIs and no macros: one API to get the max number of topology-nearness levels, and a
> second API to get the next sibling a given nearness level from
> 0(nearest)..N(furthest). If we want, we can also add a FOREACH macro too.
>
> Overall, though, as I say above, let's focus on the problem the app actually
> wants these APIs for, not how we think we should solve it. Apps don't want to
> know the topology for knowledge sake, they want to use that knowledge to
> improve performance by pinning tasks to cores. What is the minimum that we
> need to provide to enable the app to do that? For example, if there are no
> lcores that share an L1, then from an app topology viewpoint that L1 level may
> as well not exist, because it provides us no details on how to place our work.
I have shared above why we need vendor agnostic L1, L2, L3 and sub-NUMA-IO.
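For readers following the thread, here is a minimal sketch of the shape Bruce describes above: one call returning how many nearness levels exist, one iterator over neighbours at a given level, plus an optional FOREACH built on top. All names and signatures below are hypothetical illustrations, not existing or agreed DPDK API.
```c
/* Hypothetical sketch only - none of these exist in DPDK today. */

/* Number of topology-nearness levels detected by EAL on this system.
 * Level 0 is "closest neighbour"; the last level corresponds to sharing
 * only memory (NUMA).
 */
unsigned int rte_lcore_nearness_levels(void);

/* Return the next enabled lcore that is within 'level' nearness of
 * 'lcore_id', starting the search after 'i' (pass -1 to start from the
 * beginning, following the rte_get_next_lcore() convention), or
 * RTE_MAX_LCORE when exhausted.
 */
unsigned int rte_get_next_lcore_near(unsigned int i, unsigned int lcore_id,
				     unsigned int level);

/* Optional convenience macro layered on the iterator. */
#define RTE_LCORE_FOREACH_NEAR(i, lcore_id, level)			\
	for ((i) = rte_get_next_lcore_near(-1, (lcore_id), (level));	\
	     (i) < RTE_MAX_LCORE;					\
	     (i) = rte_get_next_lcore_near((i), (lcore_id), (level)))
```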
Snipped
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-12 11:50 ` Varghese, Vipin
@ 2024-09-13 14:15 ` Burakov, Anatoly
0 siblings, 0 replies; 56+ messages in thread
From: Burakov, Anatoly @ 2024-09-13 14:15 UTC (permalink / raw)
To: Varghese, Vipin, Bruce Richardson
Cc: Mattias Rönnblom, Yigit, Ferruh, dev
On 9/12/2024 1:50 PM, Varghese, Vipin wrote:
> [Public]
>
> Snipped
>
>>>
>>>
>>> Based on the discussions we agreed on sharing version-2 FRC for
>>> extending API as `rte_get_next_lcore_extnd` with extra argument as
>>> `flags`.
>>>
>>> As per my ideation, for the API ` rte_get_next_sibling_core`, the above
>>> API can easily with flag ` RTE_GET_LCORE_L1 (SMT)`. Is this right
>>> understanding?
>>>
>>> We can easily have simple MACROs like `RTE_LCORE_FOREACH_L1` which
>>> allows to iterate SMT sibling threads.
>>>
>>>
>>
>> This seems like a lot of new macro and API additions! I'd really like to cut that
>> back and simplify the amount of new things we are adding to DPDK for this.
> I disagree Bruce, as per the new conversation with Anatoly and you it has been shared the new API are
> ```
> 1. rte_get_next_lcore_exntd
> 2. rte_get_next_n_lcore_exntd
> ```
>
> While I mentioned custom Macro can augment based on typical flag usage similar to ` RTE_LCORE_FOREACH and RTE_LCORE_FOREACH_WORKER` as
> ```
> RTE_LCORE_FOREACH_FLAG
> RTE_LCORE_FOREACH_WORKER_FLAG
>
> Or
>
> RTE_LCORE_FOREACH_LLC
> RTE_LCORE_FOREACH_WORKER_LLC
> ```
>
> Please note I have not even shared version-2 of RFC yet.
>
>> I tend to agree with others that external libs would be better for apps that really want to deal with all this.
> I have covered why this is not a good idea for Mattias query.
>
>>
>>>
>>> >
>>>
>>> > Looking logically, I'm not sure about the BOOST_ENABLED and BOOST_DISABLED flags you propose
>>> The idea for the BOOST_ENABLED & BOOST_DISABLED is based on DPDK power library which allows to enable boost.
>>> Allow user to select lcores where BOOST is enabled|disabled using MACRO or API.
> May be there is confusion, so let me try to be explicit here. The intention of any `rte_get_next_lcore##` is fetch lcores.
> Hence with new proposed API `rte_get_next_lcore_exntd` with `flag set for Boost` is to fetch lcores where boost is enabled.
> There is no intention to enable or disable boost on lcore with `get` API.
>
>>>
>>>
>>>
>>> - in a system with multiple possible
>>>
>>> > standard and boost frequencies what would those correspond to?
>>>
>>> I now understand the confusion, apologies for mixing the AMD EPYC SoC
>>> boost with Intel Turbo.
>>>
>>>
>>>
>>> Thank you for pointing out, we will use the terminology `
>>> RTE_GET_LCORE_TURBO`.
>>>
>>>
>>
>> That still doesn't clarify it for me. If you start mixing in power management related functions in with topology ones things will turn into a real headache.
> Can you please tell me what is not clarified. DPDK lcores as of today has no notion of Cache, Numa, Power, Turbo or any DPDK supported features.
> The initial API introduced were to expose lcore sharing the same Last Level Cache. Based on interaction with Anatoly, extending this to support multiple features turned out to be possibility.
> Hence, we said we can share v2 for RFC based on this idea.
>
> But if the claim is not to put TURBO I am also ok for this. Let only keep cache and NUMA-IO domain.
>
>> What does boost or turbo correspond to? Is it for cores that have the feature enabled - whether or not it's currently in use - or is it for finding cores that are
>> currently boosted? Do we need additions for cores that are boosted by 100Mhz vs say 300Mhz. What about cores that are in lower frequencies for
>> power-saving. Do we add macros for finding those?
> Why are we talking about feq-up and freq-down? This was not even discussed in this RFC patch at all.
>
>>>
>>> What's also
>>>
>>> > missing is a define for getting actual NUMA siblings i.e. those
>>> sharing common memory but not an L3 or anything else.
>>>
>>> This can be extended into `rte_get_next_lcore_extnd` with flag `
>>> RTE_GET_LCORE_NUMA`. This will allow to grab all lcores under the same
>>> sub-memory NUMA as shared by LCORE.
>>>
>>> If SMT sibling is enabled and DPDK Lcore mask covers the sibling
>>> threads, then ` RTE_GET_LCORE_NUMA` get all lcore and sibling threads
>>> under same memory NUMA of lcore shared.
>>>
>>>
>>
>> Yes. That can work. But it means we are basing the implementation on a fixed idea of what topologies there are or can exist.
>> My suggestion below is just to ignore the whole idea of L1 vs L2 vs NUMA - just give the app a way to find it's nearest nodes.
> Bruce, for different vendor SoC, the implementation of architecture is different. Let me share what I know
> 1. using L1, we can fetch SMT threads
> 2. using L2 we can get certain SoC on Arm, Intel and power PC which is like efficient cores
> 3. using L3 we can get certain SoC like AMD, AF64x and others which follow chiplet or tile split L3 domain.
>
>>
>> After all, the app doesn't want to know the topology just for the sake of knowing it - it wants it to ensure best placement of work on cores! To that end, it just needs to know what cores are near to each other and what are far away.
> Exactly, that is why we want to minimize new libraries and limit to format of existing API `rte_get_next_lcore`. The end user need to deploy another library or external library then map to DPDK lcore mapping to identify what is where.
> So as end user I prefer simple API which get my work done.
>
>>
>>>
>>> >
>>>
>>> > My suggestion would be to have the function take just an integer-type
>>> e.g.
>>>
>>> > uint16_t parameter which defines the memory/cache hierarchy level to
>>> use, 0
>>>
>>> > being lowest, 1 next, and so on. Different systems may have different
>>> numbers
>>>
>>> > of cache levels so lets just make it a zero-based index of levels,
>>> rather than
>>>
>>> > giving explicit defines (except for memory which should probably
>>> always be
>>>
>>> > last). The zero-level will be for "closest neighbour"
>>>
>>> Good idea, we did prototype this internally. But issue it will keep on
>>> adding the number of API into lcore library.
>>>
>>> To keep the API count less, we are using lcore id as hint to sub-NUMA.
>>>
>>
>> I'm unclear about this keeping the API count down - you are proposing a lot of APIs and macros up above.
> No, I am not. I have shared based on the last discussion with Anatoly we will end up with 2 API in lcore only. Explained in the above response
>
>> My suggestion is basically to add two APIs and no macros: one API to get the max number of topology-nearness levels, and a
>> second API to get the next sibling a given nearness level from
>> 0(nearest)..N(furthest). If we want, we can also add a FOREACH macro too.
>>
>> Overall, though, as I say above, let's focus on the problem the app actually
>> wants these APIs for, not how we think we should solve it. Apps don't want to
>> know the topology for knowledge sake, they want to use that knowledge to
>> improve performance by pinning tasks to cores. What is the minimum that we
>> need to provide to enable the app to do that? For example, if there are no
>> lcores that share an L1, then from an app topology viewpoint that L1 level may
>> as well not exist, because it provides us no details on how to place our work.
> I have shared above why we need vendor agnostic L1, L2, L3 and sub-NUMA-IO.
>
> Snipped
Just to add my 2c here, since my name is being thrown around a lot in
this discussion :)
I tend to agree with Bruce here in the sense that if we want this API be
used to group cores together, then ideally we shouldn't really
explicitly call out the principle by which we group them unless we have
to. My main contention with the initial RFC *was* the fact that it was
tied to specific HW arch stuff in the API.
Vipin has suggested using a "flags" value to discriminate between
L1/L2/L3/NUMA/whatever ways of grouping cores, and I agree that it's
better than what was initially proposed (at least from my vantage
point), but what's even better is not to have any flags at all! As in, I
think the thing we're presumably trying to achieve here just as well
could be achieved simply by returning a number of "levels" we have in
our hierarchy, and user then being able to iterate over nearest
neighbours sitting on that "level" without explicitly specifying what
that level is.
So, for some systems level 0 would be SMT, for others - L3, for some -
NUMA, for yet others - efficiency/performance cores, etc. Bruce's
suggestion is that we don't explicitly call out the thing we use to
group the cores by, and instead rely on EAL to parse that information
out for us into a set of "levels". I would agree that for anything more
complicated an external library would be the way to go, because, well,
we're DPDK, not Linux kernel.
But, just to be clear, this is not mutually exclusive with some kind of
topology-style API. If we do go down that route, then the point about
"attaching to specific architectural features" becomes moot, as by
necessity any DOM-style API would have to represent topology in some
way, which then gets used by DPDK.
The main question here (and other people have rightly asked this
question) would be, do we want a topology API, or do we want an API to
assist with scheduling. My impression so far has been that Vipin is
looking for the latter rather than the former, as no topology-related
use cases were mentioned in the discussion except as a proxy for scheduling.
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-12 9:17 ` Bruce Richardson
2024-09-12 11:50 ` Varghese, Vipin
@ 2024-09-12 13:18 ` Mattias Rönnblom
1 sibling, 0 replies; 56+ messages in thread
From: Mattias Rönnblom @ 2024-09-12 13:18 UTC (permalink / raw)
To: Bruce Richardson, Varghese, Vipin; +Cc: Yigit, Ferruh, dev
On 2024-09-12 11:17, Bruce Richardson wrote:
> On Thu, Sep 12, 2024 at 02:19:07AM +0000, Varghese, Vipin wrote:
>> [Public]
>>
>> <snipped>
>>
>>
>>
>> > > > > <snipped>
>>
>> > > > >
>>
>> > > > >>> <snipped>
>>
>> > > > >>>
>>
>> > > > >>> Thank you Mattias for the comments and question, please let
>> me
>>
>> > > > >>> try to explain the same below
>>
>> > > > >>>
>>
>> > > > >>>> We shouldn't have a separate CPU/cache hierarchy API
>> instead?
>>
>> > > > >>>
>>
>> > > > >>> Based on the intention to bring in CPU lcores which share
>> same
>>
>> > > > >>> L3 (for better cache hits and less noisy neighbor) current
>> API
>>
>> > > > >>> focuses on using
>>
>> > > > >>>
>>
>> > > > >>> Last Level Cache. But if the suggestion is `there are SoC
>> where
>>
>> > > > >>> L2 cache are also shared, and the new API should be
>>
>> > > > >>> provisioned`, I am also
>>
>> > > > >>>
>>
>> > > > >>> comfortable with the thought.
>>
>> > > > >>>
>>
>> > > > >>
>>
>> > > > >> Rather than some AMD special case API hacked into
>> <rte_lcore.h>,
>>
>> > > > >> I think we are better off with no DPDK API at all for this
>> kind of
>>
>> > functionality.
>>
>> > > > >
>>
>> > > > > Hi Mattias, as shared in the earlier email thread, this is not
>> a
>>
>> > > > > AMD special
>>
>> > > > case at all. Let me try to explain this one more time. One of
>>
>> > > > techniques used to increase cores cost effective way to go for
>> tiles of
>>
>> > compute complexes.
>>
>> > > > > This introduces a bunch of cores in sharing same Last Level
>> Cache
>>
>> > > > > (namely
>>
>> > > > L2, L3 or even L4) depending upon cache topology architecture.
>>
>> > > > >
>>
>> > > > > The API suggested in RFC is to help end users to selectively
>> use
>>
>> > > > > cores under
>>
>> > > > same Last Level Cache Hierarchy as advertised by OS (irrespective
>> of
>>
>> > > > the BIOS settings used). This is useful in both bare-metal and
>> container
>>
>> > environment.
>>
>> > > > >
>>
>> > > >
>>
>> > > > I'm pretty familiar with AMD CPUs and the use of tiles (including
>>
>> > > > the challenges these kinds of non-uniformities pose for work
>> scheduling).
>>
>> > > >
>>
>> > > > To maximize performance, caring about core<->LLC relationship may
>>
>> > > > well not be enough, and more HT/core/cache/memory topology
>>
>> > > > information is required. That's what I meant by special case. A
>>
>> > > > proper API should allow access to information about which lcores
>> are
>>
>> > > > SMT siblings, cores on the same L2, and cores on the same L3, to
>>
>> > > > name a few things. Probably you want to fit NUMA into the same
>> API
>>
>> > > > as well, although that is available already in <rte_lcore.h>.
>>
>> > >
>>
>> > > Thank you Mattias for the information, as shared by in the reply
>> with
>>
>> > Anatoly we want expose a new API `rte_get_next_lcore_ex` which
>> intakes a
>>
>> > extra argument `u32 flags`.
>>
>> > > The flags can be RTE_GET_LCORE_L1 (SMT), RTE_GET_LCORE_L2,
>>
>> > RTE_GET_LCORE_L3, RTE_GET_LCORE_BOOST_ENABLED,
>>
>> > RTE_GET_LCORE_BOOST_DISABLED.
>>
>> > >
>>
>> >
>>
>> > For the naming, would "rte_get_next_sibling_core" (or lcore if you
>> prefer) be a
>>
>> > clearer name than just adding "ex" on to the end of the existing
>> function?
>>
>> Thank you Bruce, Please find my answer below
>>
>>
>>
>> Functions shared as per the RFC were
>>
>> ```
>>
>> - rte_get_llc_first_lcores: Retrieves all the first lcores in the
>> shared LLC.
>>
>> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
>>
>> - rte_get_llc_n_lcore: Retrieves the first n or skips the first n
>> lcores in the shared LLC.
>>
>> ```
>>
>>
>>
>> MACRO’s extending the usability were
>>
>> ```
>>
>> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from each
>> LLC.
>>
>> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first worker
>> lcore from each LLC.
>>
>> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on hint
>> (lcore id).
>>
>> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from LLC while
>> skipping first worker.
>>
>> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n` lcores from
>> each LLC.
>>
>> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then
>> iterates through remaining lcores in each LLC.
>>
>> ```
>>
>>
>>
>> Based on the discussions we agreed on sharing version-2 FRC for
>> extending API as `rte_get_next_lcore_extnd` with extra argument as
>> `flags`.
>>
>> As per my ideation, for the API ` rte_get_next_sibling_core`, the above
>> API can easily with flag ` RTE_GET_LCORE_L1 (SMT)`. Is this right
>> understanding?
>>
>> We can easily have simple MACROs like `RTE_LCORE_FOREACH_L1` which
>> allows to iterate SMT sibling threads.
>>
>>
>
> This seems like a lot of new macro and API additions! I'd really like to
> cut that back and simplify the amount of new things we are adding to DPDK
> for this. I tend to agree with others that external libs would be better
> for apps that really want to deal with all this.
>
Conveying HW topology will require a fair bit of API verbiage. I think
there's no way around it, other than giving the API user half of the
story (or 1% of the story).
That's one of the reasons I think it should be in a separate header file
in EAL.
>>
>> >
>>
>> > Looking logically, I'm not sure about the BOOST_ENABLED and
>>
>> > BOOST_DISABLED flags you propose
>>
>> The idea for the BOOST_ENABLED & BOOST_DISABLED is based on DPDK power
>> library which allows to enable boost.
>>
>> Allow user to select lcores where BOOST is enabled|disabled using MACRO
>> or API.
>>
>>
>>
>> - in a system with multiple possible
>>
>> > standard and boost frequencies what would those correspond to?
>>
>> I now understand the confusion, apologies for mixing the AMD EPYC SoC
>> boost with Intel Turbo.
>>
>>
>>
>> Thank you for pointing out, we will use the terminology `
>> RTE_GET_LCORE_TURBO`.
>>
>>
>
> That still doesn't clarify it for me. If you start mixing in power
> management related functions in with topology ones things will turn into a
> real headache. What does boost or turbo correspond to? Is it for cores that
> have the feature enabled - whether or not it's currently in use - or is it
> for finding cores that are currently boosted? Do we need additions for
> cores that are boosted by 100Mhz vs say 300Mhz. What about cores that are
> in lower frequencies for power-saving. Do we add macros for finding those?
>
In my world, the operating frequency is a property of a CPU core node in
the hardware topology.
lcore discrimination (or classification) shouldn't be built as a myriad
of FOREACH macros, but rather generic iteration + app domain logic.
For example, the size of the L3 could be a factor. Should we have a
FOREACH_BIG_L3? No.
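To illustrate the "generic iteration + app domain logic" point, a rough sketch follows. The `lcore_shares_llc()` helper is hypothetical (standing in for whatever topology query is eventually exposed); `RTE_LCORE_FOREACH_WORKER` is the existing DPDK iterator. The classification criteria stay on the application side instead of DPDK growing one FOREACH macro per property.
```c
#include <stdbool.h>
#include <rte_lcore.h>

/* Hypothetical query: whatever topology information ends up being exposed
 * (shared LLC, shared L2, frequency, L3 size, ...), the application can
 * combine it with its own domain logic.
 */
extern bool lcore_shares_llc(unsigned int a, unsigned int b);

/* App-side selection using the existing generic iterator plus an
 * application-defined predicate - no FOREACH_<property> macro required.
 */
static unsigned int
pick_near_workers(unsigned int anchor, unsigned int *out, unsigned int max)
{
	unsigned int lcore, n = 0;

	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (n == max)
			break;
		if (lcore != anchor && lcore_shares_llc(anchor, lcore))
			out[n++] = lcore;
	}
	return n;
}
```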
>>
>> What's also
>>
>> > missing is a define for getting actual NUMA siblings i.e. those
>> sharing common
>>
>> > memory but not an L3 or anything else.
>>
>> This can be extended into `rte_get_next_lcore_extnd` with flag `
>> RTE_GET_LCORE_NUMA`. This will allow to grab all lcores under the same
>> sub-memory NUMA as shared by LCORE.
>>
>> If SMT sibling is enabled and DPDK Lcore mask covers the sibling
>> threads, then ` RTE_GET_LCORE_NUMA` get all lcore and sibling threads
>> under same memory NUMA of lcore shared.
>>
>>
>
> Yes. That can work. But it means we are basing the implementation on a
> fixed idea of what topologies there are or can exist. My suggestion below
> is just to ignore the whole idea of L1 vs L2 vs NUMA - just give the app a
> way to find it's nearest nodes.
>
I think we need to agree on what the purpose of this API is. Is it to
describe the hardware topology in some detail for general-purpose use
(including informing the operator, lstopo-style), or just some abstract,
simplified representation to be used purely for work scheduling?
> After all, the app doesn't want to know the topology just for the sake of
> knowing it - it wants it to ensure best placement of work on cores! To that
> end, it just needs to know what cores are near to each other and what are
> far away.
>
>>
>> >
>>
>> > My suggestion would be to have the function take just an integer-type
>> e.g.
>>
>> > uint16_t parameter which defines the memory/cache hierarchy level to
>> use, 0
>>
>> > being lowest, 1 next, and so on. Different systems may have different
>> numbers
>>
>> > of cache levels so lets just make it a zero-based index of levels,
>> rather than
>>
>> > giving explicit defines (except for memory which should probably
>> always be
>>
>> > last). The zero-level will be for "closest neighbour"
>>
>> Good idea, we did prototype this internally. But issue it will keep on
>> adding the number of API into lcore library.
>>
>> To keep the API count less, we are using lcore id as hint to sub-NUMA.
>>
>
> I'm unclear about this keeping the API count down - you are proposing a lot
> of APIs and macros up above. My suggestion is basically to add two APIs and
> no macros: one API to get the max number of topology-nearness levels, and a
> second API to get the next sibling a given nearness level from
> 0(nearest)..N(furthest). If we want, we can also add a FOREACH macro too.
>
> Overall, though, as I say above, let's focus on the problem the app
> actually wants these APIs for, not how we think we should solve it. Apps
> don't want to know the topology for knowledge sake, they want to use that
> knowledge to improve performance by pinning tasks to cores. What is the
> minimum that we need to provide to enable the app to do that? For example,
> if there are no lcores that share an L1, then from an app topology
> viewpoint that L1 level may as well not exist, because it provides us no
> details on how to place our work.
>
> For the rare app that does have some esoteric use-case that does actually
> want to know some intricate details of the topology, then having that app
> use an external lib is probably a better solution than us trying to cover
> all possible options in DPDK.
>
> My 2c. on this at this stage anyway.
>
> /Bruce
>
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-08-27 15:10 [RFC 0/2] introduce LLC aware functions Vipin Varghese
` (2 preceding siblings ...)
2024-08-27 21:23 ` [RFC 0/2] introduce LLC aware functions Mattias Rönnblom
@ 2024-08-28 8:38 ` Burakov, Anatoly
2024-09-02 1:08 ` Varghese, Vipin
2024-10-07 21:28 ` Stephen Hemminger
4 siblings, 1 reply; 56+ messages in thread
From: Burakov, Anatoly @ 2024-08-28 8:38 UTC (permalink / raw)
To: Vipin Varghese, ferruh.yigit, dev
On 8/27/2024 5:10 PM, Vipin Varghese wrote:
> As core density continues to increase, chiplet-based
> core packing has become a key trend. In AMD SoC EPYC
> architectures, core complexes within the same chiplet
> share a Last-Level Cache (LLC). By packing logical cores
> within the same LLC, we can enhance pipeline processing
> stages due to reduced latency and improved data locality.
>
> To leverage these benefits, DPDK libraries and examples
> can utilize localized lcores. This approach ensures more
> consistent latencies by minimizing the dispersion of lcores
> across different chiplet complexes and enhances packet
> processing by ensuring that data for subsequent pipeline
> stages is likely to reside within the LLC.
>
> < Function: Purpose >
> ---------------------
> - rte_get_llc_first_lcores: Retrieves all the first lcores in the shared LLC.
> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
> - rte_get_llc_n_lcore: Retrieves the first n or skips the first n lcores in the shared LLC.
>
> < MACRO: Purpose >
> ------------------
> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from each LLC.
> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first worker lcore from each LLC.
> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on hint (lcore id).
> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from LLC while skipping first worker.
> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n` lcores from each LLC.
> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skip first `n` lcores, then iterates through remaining lcores in each LLC.
>
Hi Vipin,
I recently looked into how Intel's Sub-NUMA Clustering would work within
DPDK, and found that I actually didn't have to do anything, because the
SNC "clusters" present themselves as NUMA nodes, which DPDK already
supports natively.
Does AMD's implementation of chiplets not report themselves as separate
NUMA nodes? Because if it does, I don't really think any changes are
required because NUMA nodes would give you the same thing, would it not?
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-08-28 8:38 ` Burakov, Anatoly
@ 2024-09-02 1:08 ` Varghese, Vipin
2024-09-02 14:17 ` Burakov, Anatoly
0 siblings, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-02 1:08 UTC (permalink / raw)
To: Burakov, Anatoly, ferruh.yigit, dev
<Snipped>
Thank you Anatoly for the response. Let me try to share my understanding.
> I recently looked into how Intel's Sub-NUMA Clustering would work within
> DPDK, and found that I actually didn't have to do anything, because the
> SNC "clusters" present themselves as NUMA nodes, which DPDK already
> supports natively.
Yes, this is correct. In the Intel Xeon Platinum BIOS one can enable
`Cluster per NUMA` as `1, 2 or 4`.
This divides the tiles into Sub-NUMA partitions, each having separate
lcores, memory controllers, PCIe
and accelerators.
>
> Does AMD's implementation of chiplets not report themselves as separate
> NUMA nodes?
In the AMD EPYC SoC this is different. There are 2 BIOS settings, namely
1. NPS: `NUMA Per Socket`, which allows the IO tile (memory, PCIe and
accelerators) to be partitioned into NUMA 0, 1, 2 or 4.
2. L3 as NUMA: `L3 cache of CPU tiles as individual NUMA`, which allows
all CPU tiles to be exposed as independent NUMA nodes.
The above settings are possible because the CPU tiles are independent
from the IO tile, allowing 4 combinations to be available for use.
These are covered in the tuning guide for the SoC: 12. How to get best
performance on AMD platform — Data Plane Development Kit 24.07.0
documentation (dpdk.org)
<https://doc.dpdk.org/guides/linux_gsg/amd_platform.html>.
> Because if it does, I don't really think any changes are
> required because NUMA nodes would give you the same thing, would it not?
I have a different opinion on this. An end user can
1. identify the lcores and their NUMA node using `usertools/cpu-layout.py`;
2. but it is the core mask in the EAL arguments which makes the threads
available to be used in a process;
3. and there is no API which distinguishes the L3 NUMA domain. The function
`rte_socket_id
<https://doc.dpdk.org/api/rte__lcore_8h.html#a7c8da4664df26a64cf05dc508a4f26df>`
for CPU-tile SoCs like the AMD SoC will return the physical socket.
Example: in AMD EPYC Genoa there are a total of 13 tiles: 12 CPU tiles
and 1 IO tile.
1. Setting NPS to 4 divides the memory, PCIe and accelerators into 4 domains,
while all CPUs appear as a single NUMA node even though each of the 12 tiles
has an independent L3 cache.
2. Setting `L3 as NUMA` allows each tile to appear as a separate L3 cluster.
Hence, adding an API which allows selecting the available lcores based on
the split L3 is essential irrespective of the BIOS setting.
>
> --
> Thanks,
> Anatoly
>
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-02 1:08 ` Varghese, Vipin
@ 2024-09-02 14:17 ` Burakov, Anatoly
2024-09-02 15:33 ` Varghese, Vipin
0 siblings, 1 reply; 56+ messages in thread
From: Burakov, Anatoly @ 2024-09-02 14:17 UTC (permalink / raw)
To: Varghese, Vipin, ferruh.yigit, dev
On 9/2/2024 3:08 AM, Varghese, Vipin wrote:
> <Snipped>
>
> Thank you Antaloy for the response. Let me try to share my understanding.
>
>> I recently looked into how Intel's Sub-NUMA Clustering would work within
>> DPDK, and found that I actually didn't have to do anything, because the
>> SNC "clusters" present themselves as NUMA nodes, which DPDK already
>> supports natively.
>
> yes, this is correct. In Intel Xeon Platinum BIOS one can enable
> `Cluster per NUMA` as `1,2 or4`.
>
> This divides the tiles into Sub-Numa parition, each having separate
> lcores,memory controllers, PCIe
>
> and accelerator.
>
>>
>> Does AMD's implementation of chiplets not report themselves as separate
>> NUMA nodes?
>
> In AMD EPYC Soc, this is different. There are 2 BIOS settings, namely
>
> 1. NPS: `Numa Per Socket` which allows the IO tile (memory, PCIe and
> Accelerator) to be partitioned as Numa 0, 1, 2 or 4.
>
> 2. L3 as NUMA: `L3 cache of CPU tiles as individual NUMA`. This allows
> all CPU tiles to be independent NUMA cores.
>
>
> The above settings are possible because CPU is independent from IO tile.
> Thus allowing 4 combinations be available for use.
Sure, but presumably if the user wants to distinguish this, they have to
configure their system appropriately. If user wants to take advantage of
L3 as NUMA (which is what your patch proposes), then they can enable the
BIOS knob and get that functionality for free. DPDK already supports this.
>
> These are covered in the tuning gudie for the SoC in 12. How to get best
> performance on AMD platform — Data Plane Development Kit 24.07.0
> documentation (dpdk.org)
> <https://doc.dpdk.org/guides/linux_gsg/amd_platform.html>.
>
>
>> Because if it does, I don't really think any changes are
>> required because NUMA nodes would give you the same thing, would it not?
>
> I have a different opinion to this outlook. An end user can
>
> 1. Identify the lcores and it's NUMA user `usertools/cpu-layout.py`
I recently submitted an enhancement for the CPU layout script to print out
NUMA separately from physical socket [1].
[1]
https://patches.dpdk.org/project/dpdk/patch/40cf4ee32f15952457ac5526cfce64728bd13d32.1724323106.git.anatoly.burakov@intel.com/
I believe when "L3 as NUMA" is enabled in BIOS, the script will display
both physical package ID as well as NUMA nodes reported by the system,
which will be different from physical package ID, and which will display
information you were looking for.
>
> 2. But it is core mask in eal arguments which makes the threads
> available to be used in a process.
See above: if the OS already reports NUMA information, this is not a
problem to be solved, CPU layout script can give this information to the
user.
>
> 3. there are no API which distinguish L3 numa domain. Function
> `rte_socket_id
> <https://doc.dpdk.org/api/rte__lcore_8h.html#a7c8da4664df26a64cf05dc508a4f26df>` for CPU tiles like AMD SoC will return physical socket.
Sure, but I would think the answer to that would be to introduce an API
to distinguish between NUMA (socket ID in DPDK parlance) and package
(physical socket ID in the "traditional NUMA" sense). Once we can
distinguish between those, DPDK can just rely on NUMA information
provided by the OS, while still being capable of identifying physical
sockets if the user so desires.
I am actually going to introduce API to get *physical socket* (as
opposed to NUMA node) in the next few days.
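As an aside for readers, the physical-package information referred to here is already exposed by the Linux topology sysfs nodes. A minimal sketch follows; the helper name is hypothetical and this is not the API Anatoly mentions, just an illustration of where the data lives.
```c
#include <stdio.h>

/* Hypothetical helper: read the physical package a CPU belongs to straight
 * from the Linux topology sysfs node, independently of how many NUMA nodes
 * the package is split into. Returns -1 on error.
 */
static int
cpu_physical_package(unsigned int cpu)
{
	char path[128];
	int pkg = -1;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
		 cpu);
	f = fopen(path, "r");
	if (f == NULL)
		return -1;
	if (fscanf(f, "%d", &pkg) != 1)
		pkg = -1;
	fclose(f);
	return pkg;
}
```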
>
>
> Example: In AMD EPYC Genoa, there are total of 13 tiles. 12 CPU tiles
> and 1 IO tile. Setting
>
> 1. NPS to 4 will divide the memory, PCIe and accelerator into 4 domain.
> While the all CPU will appear as single NUMA but each 12 tile having
> independent L3 caches.
>
> 2. Setting `L3 as NUMA` allows each tile to appear as separate L3 clusters.
>
>
> Hence, adding an API which allows to select available lcores based on
> Split L3 is essential irrespective of the BIOS setting.
>
I think the crucial issue here is the "irrespective of BIOS setting"
bit. If EAL is getting into the game of figuring out exact intricacies
of physical layout of the system, then there's a lot more work to be
done as there are lots of different topologies, as other people have
already commented, and such an API needs *a lot* of thought put into it.
If, on the other hand, we leave this issue to the kernel, and only
gather NUMA information provided by the kernel, then nothing has to be
done - DPDK already supports all of this natively, provided the user has
configured the system correctly.
Moreover, arguably DPDK already works that way: technically you can get
physical socket information even absent of NUMA support in BIOS, but
DPDK does not do that. Instead, if OS reports NUMA node as 0, that's
what we're going with (even if we could detect multiple sockets from
sysfs), and IMO it should stay that way unless there is a strong
argument otherwise. We force the user to configure their system
correctly as it is, and I see no reason to second-guess user's BIOS
configuration otherwise.
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-02 14:17 ` Burakov, Anatoly
@ 2024-09-02 15:33 ` Varghese, Vipin
2024-09-03 8:50 ` Burakov, Anatoly
0 siblings, 1 reply; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-02 15:33 UTC (permalink / raw)
To: Burakov, Anatoly, ferruh.yigit, dev
<snipped>
>>
>>> I recently looked into how Intel's Sub-NUMA Clustering would work
>>> within
>>> DPDK, and found that I actually didn't have to do anything, because the
>>> SNC "clusters" present themselves as NUMA nodes, which DPDK already
>>> supports natively.
>>
>> yes, this is correct. In Intel Xeon Platinum BIOS one can enable
>> `Cluster per NUMA` as `1,2 or4`.
>>
>> This divides the tiles into Sub-Numa parition, each having separate
>> lcores,memory controllers, PCIe
>>
>> and accelerator.
>>
>>>
>>> Does AMD's implementation of chiplets not report themselves as separate
>>> NUMA nodes?
>>
>> In AMD EPYC Soc, this is different. There are 2 BIOS settings, namely
>>
>> 1. NPS: `Numa Per Socket` which allows the IO tile (memory, PCIe and
>> Accelerator) to be partitioned as Numa 0, 1, 2 or 4.
>>
>> 2. L3 as NUMA: `L3 cache of CPU tiles as individual NUMA`. This allows
>> all CPU tiles to be independent NUMA cores.
>>
>>
>> The above settings are possible because CPU is independent from IO tile.
>> Thus allowing 4 combinations be available for use.
>
> Sure, but presumably if the user wants to distinguish this, they have to
> configure their system appropriately. If user wants to take advantage of
> L3 as NUMA (which is what your patch proposes), then they can enable the
> BIOS knob and get that functionality for free. DPDK already supports
> this.
>
The intent of the RFC is to introduce the ability to select lcores within
the same
L3 cache whether the BIOS setting `L3 as NUMA` is set or unset. This has
also been achieved
and tested on platforms where the OS kernel advertises the topology via
sysfs, thus eliminating
the dependency on hwloc and libnuma, which can be at different versions in
different distros.
>>
>> These are covered in the tuning gudie for the SoC in 12. How to get best
>> performance on AMD platform — Data Plane Development Kit 24.07.0
>> documentation (dpdk.org)
>> <https://doc.dpdk.org/guides/linux_gsg/amd_platform.html>.
>>
>>
>>> Because if it does, I don't really think any changes are
>>> required because NUMA nodes would give you the same thing, would it
>>> not?
>>
>> I have a different opinion to this outlook. An end user can
>>
>> 1. Identify the lcores and it's NUMA user `usertools/cpu-layout.py`
>
> I recently submitted an enhacement for CPU layout script to print out
> NUMA separately from physical socket [1].
>
> [1]
> https://patches.dpdk.org/project/dpdk/patch/40cf4ee32f15952457ac5526cfce64728bd13d32.1724323106.git.anatoly.burakov@intel.com/
>
>
> I believe when "L3 as NUMA" is enabled in BIOS, the script will display
> both physical package ID as well as NUMA nodes reported by the system,
> which will be different from physical package ID, and which will display
> information you were looking for.
As AMD we had submitted earlier work on the same via `usertools: enhance
logic to display NUMA` - Patchwork (dpdk.org)
<https://patchwork.dpdk.org/project/dpdk/patch/20220326073207.489694-1-vipin.varghese@amd.com/>.
This clearly distinguished NUMA and physical socket.
>
>>
>> 2. But it is core mask in eal arguments which makes the threads
>> available to be used in a process.
>
> See above: if the OS already reports NUMA information, this is not a
> problem to be solved, CPU layout script can give this information to the
> user.
Agreed, but as pointed out, in the case of Intel Xeon Platinum SPR the tile
consists of CPU, memory, PCIe and accelerator.
Hence, with the BIOS option `Cluster per NUMA` set, the OS kernel & libnuma
display the appropriate domain with memory, PCIe and CPU.
In the case of the AMD SoC, the libnuma view of the CPUs is different from
the memory NUMA per socket.
>
>>
>> 3. there are no API which distinguish L3 numa domain. Function
>> `rte_socket_id
>> <https://doc.dpdk.org/api/rte__lcore_8h.html#a7c8da4664df26a64cf05dc508a4f26df>`
>> for CPU tiles like AMD SoC will return physical socket.
>
> Sure, but I would think the answer to that would be to introduce an API
> to distinguish between NUMA (socket ID in DPDK parlance) and package
> (physical socket ID in the "traditional NUMA" sense). Once we can
> distinguish between those, DPDK can just rely on NUMA information
> provided by the OS, while still being capable of identifying physical
> sockets if the user so desires.
Agreed, +1 for the idea of a physical-socket API and changes in the library
to exploit the same.
>
> I am actually going to introduce API to get *physical socket* (as
> opposed to NUMA node) in the next few days.
>
But how does it solve the end-customer issues:
1. if there are multiple NICs or accelerators on multiple sockets, but the
IO tile is partitioned into sub-domains;
2. if RTE_FLOW steering is applied on a NIC whose traffic needs to be
processed under the same L3 - reducing noisy neighbours and giving better
cache hits;
3. for the packet-distributor library, which needs to run RX-Distributor-TX
within the same worker lcore set.
The current RFC addresses the above by helping the end users identify the
lcores within the same L3 domain under a NUMA or physical socket,
irrespective of the BIOS setting.
>>
>>
>> Example: In AMD EPYC Genoa, there are total of 13 tiles. 12 CPU tiles
>> and 1 IO tile. Setting
>>
>> 1. NPS to 4 will divide the memory, PCIe and accelerator into 4 domain.
>> While the all CPU will appear as single NUMA but each 12 tile having
>> independent L3 caches.
>>
>> 2. Setting `L3 as NUMA` allows each tile to appear as separate L3
>> clusters.
>>
>>
>> Hence, adding an API which allows to select available lcores based on
>> Split L3 is essential irrespective of the BIOS setting.
>>
>
> I think the crucial issue here is the "irrespective of BIOS setting"
> bit.
That is what the current RFC achieves.
> If EAL is getting into the game of figuring out exact intricacies
> of physical layout of the system, then there's a lot more work to be
> done as there are lots of different topologies, as other people have
> already commented, and such an API needs *a lot* of thought put into it.
There is a standard sysfs interface for CPU cache topology (in the OS
kernel); as mentioned earlier,
the problem with hwloc and libnuma is that different distros ship different
versions. There are solutions for
specific SoC architectures as per the latest comment.
But we can always limit the API to selected SoCs, while on all other SoCs
it falls back to invoking rte_get_next_lcore.
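For reference, a small sketch of reading the standard sysfs cache-topology nodes referred to here. The paths are the documented Linux ABI (Documentation/ABI/testing/sysfs-devices-system-cpu); the helper itself is illustrative only and is not part of the RFC.
```c
#include <stdio.h>
#include <string.h>

/* Sketch only: walk the Linux cache-topology sysfs nodes for one CPU,
 * with no dependency on hwloc or libnuma versions.
 */
static void
dump_cpu_cache_topology(unsigned int cpu)
{
	for (unsigned int idx = 0; ; idx++) {
		char path[160], level[8] = "", shared[256] = "";
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%u/cache/index%u/level",
			 cpu, idx);
		f = fopen(path, "r");
		if (f == NULL)
			break; /* no more cache indexes for this CPU */
		if (fgets(level, sizeof(level), f) != NULL)
			level[strcspn(level, "\n")] = '\0';
		fclose(f);

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%u/cache/index%u/shared_cpu_list",
			 cpu, idx);
		f = fopen(path, "r");
		if (f == NULL)
			continue;
		if (fgets(shared, sizeof(shared), f) != NULL)
			shared[strcspn(shared, "\n")] = '\0';
		fclose(f);

		printf("cpu%u cache index%u (L%s) shared with: %s\n",
		       cpu, idx, level, shared);
	}
}
```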
>
> If, on the other hand, we leave this issue to the kernel, and only
> gather NUMA information provided by the kernel, then nothing has to be
> done - DPDK already supports all of this natively, provided the user has
> configured the system correctly.
As shared above, we tried to bring this in with `usertools: enhance logic to
display NUMA` - Patchwork (dpdk.org)
<https://patchwork.dpdk.org/project/dpdk/patch/20220326073207.489694-1-vipin.varghese@amd.com/>.
DPDK support for lcores is being enhanced, allowing the user to use the more
favourable lcores within the same tile.
>
> Moreover, arguably DPDK already works that way: technically you can get
> physical socket information even absent of NUMA support in BIOS, but
> DPDK does not do that. Instead, if OS reports NUMA node as 0, that's
> what we're going with (even if we could detect multiple sockets from
> sysfs),
In the above argument, it is stated that DPDK uses the NUMA domains which
the OS kernel detects, right?
The suggested RFC also adheres to the same - what the OS sees. Can you
please explain, for better understanding,
what the RFC is doing differently?
> and IMO it should stay that way unless there is a strong
> argument otherwise.
Totally agree, that is what the RFC is also doing: we use what the OS sees
as NUMA.
The only addition is, within the NUMA node, if there are split LLCs, to
allow selection of those lcores rather than blindly choosing lcores using
rte_get_next_lcore.
> We force the user to configure their system
> correctly as it is, and I see no reason to second-guess user's BIOS
> configuration otherwise.
To reiterate, the changes suggested in the RFC are agnostic to which BIOS
options are used.
In response to the earlier question `is the AMD configuration the same as
the Intel tile`, I have explained that it does not rely on the BIOS setting.
>
> --
> Thanks,
> Anatoly
>
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-02 15:33 ` Varghese, Vipin
@ 2024-09-03 8:50 ` Burakov, Anatoly
2024-09-05 13:05 ` Ferruh Yigit
0 siblings, 1 reply; 56+ messages in thread
From: Burakov, Anatoly @ 2024-09-03 8:50 UTC (permalink / raw)
To: Varghese, Vipin, ferruh.yigit, dev
On 9/2/2024 5:33 PM, Varghese, Vipin wrote:
> <snipped>
>>>
>>>> I recently looked into how Intel's Sub-NUMA Clustering would work
>>>> within
>>>> DPDK, and found that I actually didn't have to do anything, because the
>>>> SNC "clusters" present themselves as NUMA nodes, which DPDK already
>>>> supports natively.
>>>
>>> yes, this is correct. In Intel Xeon Platinum BIOS one can enable
>>> `Cluster per NUMA` as `1,2 or4`.
>>>
>>> This divides the tiles into Sub-Numa parition, each having separate
>>> lcores,memory controllers, PCIe
>>>
>>> and accelerator.
>>>
>>>>
>>>> Does AMD's implementation of chiplets not report themselves as separate
>>>> NUMA nodes?
>>>
>>> In AMD EPYC Soc, this is different. There are 2 BIOS settings, namely
>>>
>>> 1. NPS: `Numa Per Socket` which allows the IO tile (memory, PCIe and
>>> Accelerator) to be partitioned as Numa 0, 1, 2 or 4.
>>>
>>> 2. L3 as NUMA: `L3 cache of CPU tiles as individual NUMA`. This allows
>>> all CPU tiles to be independent NUMA cores.
>>>
>>>
>>> The above settings are possible because CPU is independent from IO tile.
>>> Thus allowing 4 combinations be available for use.
>>
>> Sure, but presumably if the user wants to distinguish this, they have to
>> configure their system appropriately. If user wants to take advantage of
>> L3 as NUMA (which is what your patch proposes), then they can enable the
>> BIOS knob and get that functionality for free. DPDK already supports
>> this.
>>
> The intend of the RFC is to introduce the ability to select lcore within
> the same
>
> L3 cache whether the BIOS is set or unset for `L3 as NUMA`. This is also
> achieved
>
> and tested on platforms which advertises via sysfs by OS kernel. Thus
> eliminating
>
> the dependency on hwloc and libuma which can be different versions in
> different distros.
But we do depend on libnuma, so we might as well depend on it? Are there
different versions of libnuma that interfere with what you're trying to
do? You keep coming back to this "whether the BIOS is set or unset" for
L3 as NUMA, but I'm still unclear as to what issues your patch is
solving assuming "knob is set". When the system is configured correctly,
it already works and reports cores as part of NUMA nodes (as L3)
correctly. It is only when the system is configured *not* to do that
that issues arise, is it not? In which case IMO the easier solution
would be to just tell the user to enable that knob in BIOS?
>
>
>>>
>>> These are covered in the tuning gudie for the SoC in 12. How to get best
>>> performance on AMD platform — Data Plane Development Kit 24.07.0
>>> documentation (dpdk.org)
>>> <https://doc.dpdk.org/guides/linux_gsg/amd_platform.html>.
>>>
>>>
>>>> Because if it does, I don't really think any changes are
>>>> required because NUMA nodes would give you the same thing, would it
>>>> not?
>>>
>>> I have a different opinion to this outlook. An end user can
>>>
>>> 1. Identify the lcores and it's NUMA user `usertools/cpu-layout.py`
>>
>> I recently submitted an enhacement for CPU layout script to print out
>> NUMA separately from physical socket [1].
>>
>> [1]
>> https://patches.dpdk.org/project/dpdk/patch/40cf4ee32f15952457ac5526cfce64728bd13d32.1724323106.git.anatoly.burakov@intel.com/
>>
>> I believe when "L3 as NUMA" is enabled in BIOS, the script will display
>> both physical package ID as well as NUMA nodes reported by the system,
>> which will be different from physical package ID, and which will display
>> information you were looking for.
>
> As AMD we had submitted earlier work on the same via usertools: enhance
> logic to display NUMA - Patchwork (dpdk.org)
> <https://patchwork.dpdk.org/project/dpdk/patch/20220326073207.489694-1-vipin.varghese@amd.com/>.
>
> this clearly were distinguishing NUMA and Physical socket.
Oh, cool, I didn't see that patch. I would argue my visual format is
more readable though, so perhaps we can get that in :)
> Agreed, but as pointed out in case of Intel Xeon Platinum SPR, the tile
> consists of cpu, memory, pcie and accelerator.
>
> hence setting the BIOS option `Cluster per NUMA` the OS kernel & libnuma
> display appropriate Domain with memory, pcie and cpu.
>
>
> In case of AMD SoC, libnuma for CPU is different from memory NUMA per
> socket.
I'm curious how does the kernel handle this then, and what are you
getting from libnuma. You seem to be implying that there are two
different NUMA nodes on your SoC, and either kernel or libnuma are in
conflict as to what belongs to what NUMA node?
>
>>
>>>
>>> 3. there are no API which distinguish L3 numa domain. Function
>>> `rte_socket_id
>>> <https://doc.dpdk.org/api/rte__lcore_8h.html#a7c8da4664df26a64cf05dc508a4f26df>` for CPU tiles like AMD SoC will return physical socket.
>>
>> Sure, but I would think the answer to that would be to introduce an API
>> to distinguish between NUMA (socket ID in DPDK parlance) and package
>> (physical socket ID in the "traditional NUMA" sense). Once we can
>> distinguish between those, DPDK can just rely on NUMA information
>> provided by the OS, while still being capable of identifying physical
>> sockets if the user so desires.
> Agreed, +1 for the idea for physcial socket and changes in library to
> exploit the same.
>>
>> I am actually going to introduce API to get *physical socket* (as
>> opposed to NUMA node) in the next few days.
>>
> But how does it solve the end customer issues
>
> 1. if there are multiple NIC or Accelerator on multiple socket, but IO
> tile is partitioned to Sub Domain.
At least on Intel platforms, NUMA node gets assigned correctly - that
is, if my Xeon with SNC enabled has NUMA nodes 3,4 on socket 1, and
there's a NIC connected to socket 1, it's going to show up as being on
NUMA node 3 or 4 depending on where exactly I plugged it in. Everything
already works as expected, and there is no need for any changes for
Intel platforms (at least none that I can see).
My proposed API is really for those users who wish to explicitly allow
for reserving memory/cores on "the same physical socket", as "on the
same tile" is already taken care of by NUMA nodes.
>
> 2. If RTE_FLOW steering is applied on NIC which needs to processed under
> same L3 - reduces noisy neighbor and better cache hits
>
> 3, for PKT-distribute library which needs to run within same worker
> lcore set as RX-Distributor-TX.
>
Same as above: on Intel platforms, NUMA nodes already solve this.
<snip>
> Totally agree, that is what the RFC is also doing, based on what OS sees
> as NUMA we are using it.
>
> Only addition is within the NUMA if there are split LLC, allow selection
> of those lcores. Rather than blindly choosing lcore using
>
> rte_lcore_get_next.
It feels like we're working around a problem that shouldn't exist in the
first place, because kernel should already report this information.
Within NUMA subsystem, there is sysfs node "distance" that, at least on
Intel platforms and in certain BIOS configuration, reports distance
between NUMA nodes, from which one can make inferences about how far a
specific NUMA node is from any other NUMA node. This could have been
used to encode L3 cache information. Do AMD platforms not do that? In
that case, "lcore next" for a particular socket ID (NUMA node, in
reality) should already get us any cores that are close to each other,
because all of this information is already encoded in NUMA nodes by the
system.
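For reference, a minimal sketch of reading the sysfs "distance" node mentioned above. Illustrative only; the example values in the comment are hypothetical.
```c
#include <stdio.h>

/* Sketch only: each NUMA node exposes its distance to every node as a
 * space-separated list, e.g.
 * /sys/devices/system/node/node0/distance -> "10 12 32 32" (example values).
 */
static void
print_numa_distances(unsigned int node)
{
	char path[96], line[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/node/node%u/distance", node);
	f = fopen(path, "r");
	if (f == NULL)
		return;
	if (fgets(line, sizeof(line), f) != NULL)
		printf("node%u distances: %s", node, line);
	fclose(f);
}
```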
I feel like there's a disconnect between my understanding of the problem
space, and yours, so I'm going to ask a very basic question:
Assuming the user has configured their AMD system correctly (i.e.
enabled L3 as NUMA), are there any problem to be solved by adding a new
API? Does the system not report each L3 as a separate NUMA node?
>
>
>> We force the user to configure their system
>> correctly as it is, and I see no reason to second-guess user's BIOS
>> configuration otherwise.
>
> Again iterating, the changes suggested in RFC are agnostic to what BIOS
> options are used,
But that is exactly my contention: are we not effectively working around
users' misconfiguration of a system then?
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-03 8:50 ` Burakov, Anatoly
@ 2024-09-05 13:05 ` Ferruh Yigit
2024-09-05 14:45 ` Burakov, Anatoly
0 siblings, 1 reply; 56+ messages in thread
From: Ferruh Yigit @ 2024-09-05 13:05 UTC (permalink / raw)
To: Burakov, Anatoly, Varghese, Vipin, dev; +Cc: Mattias Rönnblom
On 9/3/2024 9:50 AM, Burakov, Anatoly wrote:
> On 9/2/2024 5:33 PM, Varghese, Vipin wrote:
>> <snipped>
>>>>
>>>>> I recently looked into how Intel's Sub-NUMA Clustering would work
>>>>> within
>>>>> DPDK, and found that I actually didn't have to do anything, because
>>>>> the
>>>>> SNC "clusters" present themselves as NUMA nodes, which DPDK already
>>>>> supports natively.
>>>>
>>>> yes, this is correct. In Intel Xeon Platinum BIOS one can enable
>>>> `Cluster per NUMA` as `1,2 or4`.
>>>>
>>>> This divides the tiles into Sub-Numa parition, each having separate
>>>> lcores,memory controllers, PCIe
>>>>
>>>> and accelerator.
>>>>
>>>>>
>>>>> Does AMD's implementation of chiplets not report themselves as
>>>>> separate
>>>>> NUMA nodes?
>>>>
>>>> In AMD EPYC Soc, this is different. There are 2 BIOS settings, namely
>>>>
>>>> 1. NPS: `Numa Per Socket` which allows the IO tile (memory, PCIe and
>>>> Accelerator) to be partitioned as Numa 0, 1, 2 or 4.
>>>>
>>>> 2. L3 as NUMA: `L3 cache of CPU tiles as individual NUMA`. This allows
>>>> all CPU tiles to be independent NUMA cores.
>>>>
>>>>
>>>> The above settings are possible because CPU is independent from IO
>>>> tile.
>>>> Thus allowing 4 combinations be available for use.
>>>
>>> Sure, but presumably if the user wants to distinguish this, they have to
>>> configure their system appropriately. If user wants to take advantage of
>>> L3 as NUMA (which is what your patch proposes), then they can enable the
>>> BIOS knob and get that functionality for free. DPDK already supports
>>> this.
>>>
>> The intend of the RFC is to introduce the ability to select lcore
>> within the same
>>
>> L3 cache whether the BIOS is set or unset for `L3 as NUMA`. This is
>> also achieved
>>
>> and tested on platforms which advertises via sysfs by OS kernel. Thus
>> eliminating
>>
>> the dependency on hwloc and libuma which can be different versions in
>> different distros.
>
> But we do depend on libnuma, so we might as well depend on it? Are there
> different versions of libnuma that interfere with what you're trying to
> do? You keep coming back to this "whether the BIOS is set or unset" for
> L3 as NUMA, but I'm still unclear as to what issues your patch is
> solving assuming "knob is set". When the system is configured correctly,
> it already works and reports cores as part of NUMA nodes (as L3)
> correctly. It is only when the system is configured *not* to do that
> that issues arise, is it not? In which case IMO the easier solution
> would be to just tell the user to enable that knob in BIOS?
>
>>
>>
>>>>
>>>> These are covered in the tuning gudie for the SoC in 12. How to get
>>>> best
>>>> performance on AMD platform — Data Plane Development Kit 24.07.0
>>>> documentation (dpdk.org)
>>>> <https://doc.dpdk.org/guides/linux_gsg/amd_platform.html>.
>>>>
>>>>
>>>>> Because if it does, I don't really think any changes are
>>>>> required because NUMA nodes would give you the same thing, would it
>>>>> not?
>>>>
>>>> I have a different opinion to this outlook. An end user can
>>>>
>>>> 1. Identify the lcores and it's NUMA user `usertools/cpu-layout.py`
>>>
>>> I recently submitted an enhacement for CPU layout script to print out
>>> NUMA separately from physical socket [1].
>>>
>>> [1]
>>> https://patches.dpdk.org/project/dpdk/
>>> patch/40cf4ee32f15952457ac5526cfce64728bd13d32.1724323106.git.anatoly.burakov@intel.com/
>>>
>>> I believe when "L3 as NUMA" is enabled in BIOS, the script will display
>>> both physical package ID as well as NUMA nodes reported by the system,
>>> which will be different from physical package ID, and which will display
>>> information you were looking for.
>>
>> As AMD we had submitted earlier work on the same via usertools:
>> enhance logic to display NUMA - Patchwork (dpdk.org) <https://
>> patchwork.dpdk.org/project/dpdk/patch/20220326073207.489694-1-
>> vipin.varghese@amd.com/>.
>>
>> this clearly were distinguishing NUMA and Physical socket.
>
> Oh, cool, I didn't see that patch. I would argue my visual format is
> more readable though, so perhaps we can get that in :)
>
>> Agreed, but as pointed out in case of Intel Xeon Platinum SPR, the
>> tile consists of cpu, memory, pcie and accelerator.
>>
>> hence setting the BIOS option `Cluster per NUMA` the OS kernel &
>> libnuma display appropriate Domain with memory, pcie and cpu.
>>
>>
>> In case of AMD SoC, libnuma for CPU is different from memory NUMA per
>> socket.
>
> I'm curious how does the kernel handle this then, and what are you
> getting from libnuma. You seem to be implying that there are two
> different NUMA nodes on your SoC, and either kernel or libnuma are in
> conflict as to what belongs to what NUMA node?
>
>>
>>>
>>>>
>>>> 3. there are no API which distinguish L3 numa domain. Function
>>>> `rte_socket_id
>>>> <https://doc.dpdk.org/api/
>>>> rte__lcore_8h.html#a7c8da4664df26a64cf05dc508a4f26df>` for CPU tiles
>>>> like AMD SoC will return physical socket.
>>>
>>> Sure, but I would think the answer to that would be to introduce an API
>>> to distinguish between NUMA (socket ID in DPDK parlance) and package
>>> (physical socket ID in the "traditional NUMA" sense). Once we can
>>> distinguish between those, DPDK can just rely on NUMA information
>>> provided by the OS, while still being capable of identifying physical
>>> sockets if the user so desires.
>> Agreed, +1 for the idea for physcial socket and changes in library to
>> exploit the same.
>>>
>>> I am actually going to introduce API to get *physical socket* (as
>>> opposed to NUMA node) in the next few days.
>>>
>> But how does it solve the end customer issues
>>
>> 1. if there are multiple NIC or Accelerator on multiple socket, but IO
>> tile is partitioned to Sub Domain.
>
> At least on Intel platforms, NUMA node gets assigned correctly - that
> is, if my Xeon with SNC enabled has NUMA nodes 3,4 on socket 1, and
> there's a NIC connected to socket 1, it's going to show up as being on
> NUMA node 3 or 4 depending on where exactly I plugged it in. Everything
> already works as expected, and there is no need for any changes for
> Intel platforms (at least none that I can see).
>
> My proposed API is really for those users who wish to explicitly allow
> for reserving memory/cores on "the same physical socket", as "on the
> same tile" is already taken care of by NUMA nodes.
>
>>
>> 2. If RTE_FLOW steering is applied on NIC which needs to processed
>> under same L3 - reduces noisy neighbor and better cache hits
>>
>> 3, for PKT-distribute library which needs to run within same worker
>> lcore set as RX-Distributor-TX.
>>
>
> Same as above: on Intel platforms, NUMA nodes already solve this.
>
> <snip>
>
>> Totally agree, that is what the RFC is also doing, based on what OS
>> sees as NUMA we are using it.
>>
>> Only addition is within the NUMA if there are split LLC, allow
>> selection of those lcores. Rather than blindly choosing lcore using
>>
>> rte_lcore_get_next.
>
> It feels like we're working around a problem that shouldn't exist in the
> first place, because kernel should already report this information.
> Within NUMA subsystem, there is sysfs node "distance" that, at least on
> Intel platforms and in certain BIOS configuration, reports distance
> between NUMA nodes, from which one can make inferences about how far a
> specific NUMA node is from any other NUMA node. This could have been
> used to encode L3 cache information. Do AMD platforms not do that? In
> that case, "lcore next" for a particular socket ID (NUMA node, in
> reality) should already get us any cores that are close to each other,
> because all of this information is already encoded in NUMA nodes by the
> system.
>
> I feel like there's a disconnect between my understanding of the problem
> space, and yours, so I'm going to ask a very basic question:
>
> Assuming the user has configured their AMD system correctly (i.e.
> enabled L3 as NUMA), are there any problem to be solved by adding a new
> API? Does the system not report each L3 as a separate NUMA node?
>
Hi Anatoly,
Let me try to answer.
To start with, Intel "Sub-NUMA Clustering" and AMD NUMA are different; as
far as I understand, SNC is closer to classic physical-socket-based NUMA.
The following is the AMD CPU layout:
┌─────┐┌─────┐┌──────────┐┌─────┐┌─────┐
│ ││ ││ ││ ││ │
│ ││ ││ ││ ││ │
│TILE1││TILE2││ ││TILE5││TILE6│
│ ││ ││ ││ ││ │
│ ││ ││ ││ ││ │
│ ││ ││ ││ ││ │
└─────┘└─────┘│ IO │└─────┘└─────┘
┌─────┐┌─────┐│ TILE │┌─────┐┌─────┐
│ ││ ││ ││ ││ │
│ ││ ││ ││ ││ │
│TILE3││TILE4││ ││TILE7││TILE8│
│ ││ ││ ││ ││ │
│ ││ ││ ││ ││ │
│ ││ ││ ││ ││ │
└─────┘└─────┘└──────────┘└─────┘└─────┘
Each 'Tile' has multiple cores, and the 'IO Tile' has the memory
controller, bus controllers, etc.
When NPS=x is configured in the BIOS, the IO tile resources are split and
each partition is seen as a NUMA node.
The following is NPS=4:
┌─────┐┌─────┐┌──────────┐┌─────┐┌─────┐
│ ││ ││ . ││ ││ │
│ ││ ││ . ││ ││ │
│TILE1││TILE2││ . ││TILE5││TILE6│
│ ││ ││NUMA .NUMA││ ││ │
│ ││ ││ 0 . 1 ││ ││ │
│ ││ ││ . ││ ││ │
└─────┘└─────┘│ . │└─────┘└─────┘
┌─────┐┌─────┐│..........│┌─────┐┌─────┐
│ ││ ││ . ││ ││ │
│ ││ ││NUMA .NUMA││ ││ │
│TILE3││TILE4││ 2 . 3 ││TILE7││TILE8│
│ ││ ││ . ││ ││ │
│ ││ ││ . ││ ││ │
│ ││ ││ . ││ ││ │
└─────┘└─────┘└─────.────┘└─────┘└─────┘
The benefit of this approach is that all cores can access all NUMA nodes
without any penalty. For example, a DPDK application can use cores from
'TILE1', 'TILE4' & 'TILE7' to access NUMA0 (or any other NUMA node)
resources at full performance.
This is different from SNC, where cross-NUMA accesses by a core incur a
performance penalty.
Now, although which tile the cores come from doesn't matter from a NUMA
perspective, it may matter (depending on the workload) to have them under
the same LLC.
One way to make sure all cores are under the same LLC is to enable the "L3
as NUMA" BIOS option, which makes each TILE show up as a separate NUMA
node, so the user can select cores from a single NUMA node.
This is sufficient up to a point, but not when the application needs more
cores than a single tile provides.
Assume each tile has 8 cores and the application needs 24 cores. When the
user provides all cores from TILE1, TILE2 & TILE3, DPDK currently gives the
application no way to figure out how to group/select these cores so that
they are used efficiently.
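(For context, a minimal sketch of what the existing EAL already offers once
each tile is exposed as a NUMA node, using only current DPDK APIs; the
helper name is just for illustration:)
```
#include <rte_lcore.h>

/* Status quo: with "L3 as NUMA" enabled, each tile is reported as a NUMA
 * node, so an application can already pick lcores per node with existing
 * EAL helpers. */
static unsigned int
count_lcores_on_node(unsigned int numa_node)
{
	unsigned int lcore, count = 0;

	RTE_LCORE_FOREACH(lcore) {
		if (rte_lcore_to_socket_id(lcore) == numa_node)
			count++;
	}
	return count;
}
```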
Indeed, this is what Vipin is enabling: starting from a core, he is
finding the list of cores that will work efficiently with that core. From
this perspective it is not really related to the NUMA configuration, and
not really specific to AMD either, since the standard Linux sysfs interface
is used for this.
There are other architectures with a similar NUMA configuration, and they
can use the same logic; at worst we can introduce architecture-specific
code so that every architecture has a way to find the cores that work most
efficiently with a given core. This is a useful feature for DPDK.
Let's look at another example: an application uses 24 cores in a
graph-library-like fashion, where we want each group of three cores to
process a graph node. The application needs a way to select which three
cores work most efficiently with each other, and that is what this patch
enables. In this case, enabling "L3 as NUMA" does not help at all. With
this patch both BIOS configurations work, but of course the user should
select the cores given to the application based on the configuration.
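As an illustration, here is a minimal sketch of the discovery step, using
the rte_get_llc_first_lcores() helper proposed in patch 1/2 of this RFC
(its exact signature may still change; the rest is existing EAL API, and
the function name below is just an example):
```
#include <stdio.h>
#include <rte_lcore.h>

/* Sketch only: rte_get_llc_first_lcores() is the helper proposed in this
 * RFC. It fills the cpuset with the first enabled lcore of each shared
 * LLC and returns how many it found. */
static void
print_llc_leaders(void)
{
	rte_cpuset_t llc_firsts;
	unsigned int lcore;
	unsigned int nb_llc = rte_get_llc_first_lcores(&llc_firsts);

	printf("found %u LLC domains among the enabled lcores\n", nb_llc);

	RTE_LCORE_FOREACH(lcore) {
		if (CPU_ISSET(lcore, &llc_firsts))
			printf("lcore %u is the first lcore in its LLC\n", lcore);
	}
}
```
From each of these "first" lcores, the macros in patch 2/2 (for example
RTE_LCORE_FOREACH_LLC_WORKER) can then walk the remaining lcores sharing
the same LLC, so each three-core graph-node group can be filled from a
single LLC.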
And we can improve this efficient core selection further; for example, as
Mattias suggested, we can select cores that share L2 caches by extending
this patch. This is unrelated to NUMA, and again it does not introduce
architecture details into DPDK, since the implementation already relies on
the Linux sysfs interface.
I hope it clarifies a little more.
Thanks,
ferruh
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-05 13:05 ` Ferruh Yigit
@ 2024-09-05 14:45 ` Burakov, Anatoly
2024-09-05 15:34 ` Ferruh Yigit
0 siblings, 1 reply; 56+ messages in thread
From: Burakov, Anatoly @ 2024-09-05 14:45 UTC (permalink / raw)
To: Ferruh Yigit, Varghese, Vipin, dev; +Cc: Mattias Rönnblom
On 9/5/2024 3:05 PM, Ferruh Yigit wrote:
> On 9/3/2024 9:50 AM, Burakov, Anatoly wrote:
>> On 9/2/2024 5:33 PM, Varghese, Vipin wrote:
>>> <snipped>
>>>>>
Hi Ferruh,
>>
>> I feel like there's a disconnect between my understanding of the problem
>> space, and yours, so I'm going to ask a very basic question:
>>
>> Assuming the user has configured their AMD system correctly (i.e.
>> enabled L3 as NUMA), are there any problem to be solved by adding a new
>> API? Does the system not report each L3 as a separate NUMA node?
>>
>
> <snip>
Yes, this does help clarify things a lot as to why current NUMA support
would be insufficient to express what you are describing.
However, in that case I would echo the sentiment others have already
expressed: this kind of deep sysfs parsing doesn't seem like it would be in
scope for EAL; it sounds more like something a sysadmin/orchestration layer
(or the application itself) would do.
I mean, in principle I'm not opposed to having such an API, it just
seems like the abstraction would perhaps need to be a bit more robust
than directly referencing cache structure? Maybe something that
degenerates into NUMA nodes would be better, so that applications
wouldn't have to *specifically* worry about cache locality but instead
have a more generic API they can use to group cores together?
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-05 14:45 ` Burakov, Anatoly
@ 2024-09-05 15:34 ` Ferruh Yigit
2024-09-06 8:44 ` Burakov, Anatoly
0 siblings, 1 reply; 56+ messages in thread
From: Ferruh Yigit @ 2024-09-05 15:34 UTC (permalink / raw)
To: Burakov, Anatoly, Varghese, Vipin, dev; +Cc: Mattias Rönnblom
On 9/5/2024 3:45 PM, Burakov, Anatoly wrote:
> On 9/5/2024 3:05 PM, Ferruh Yigit wrote:
>> On 9/3/2024 9:50 AM, Burakov, Anatoly wrote:
>>> On 9/2/2024 5:33 PM, Varghese, Vipin wrote:
>>>> <snipped>
>>>>>>
>
> Hi Ferruh,
>
>>>
>>> I feel like there's a disconnect between my understanding of the problem
>>> space, and yours, so I'm going to ask a very basic question:
>>>
>>> Assuming the user has configured their AMD system correctly (i.e.
>>> enabled L3 as NUMA), are there any problem to be solved by adding a new
>>> API? Does the system not report each L3 as a separate NUMA node?
>>>
>>
>> <snip>
>>
>
> Yes, this does help clarify things a lot as to why current NUMA support
> would be insufficient to express what you are describing.
>
> However, in that case I would echo sentiment others have expressed
> already as this kind of deep sysfs parsing doesn't seem like it would be
> in scope for EAL, it sounds more like something a sysadmin/orchestration
> (or the application itself) would do.
>
> I mean, in principle I'm not opposed to having such an API, it just
> seems like the abstraction would perhaps need to be a bit more robust
> than directly referencing cache structure? Maybe something that
> degenerates into NUMA nodes would be better, so that applications
> wouldn't have to *specifically* worry about cache locality but instead
> have a more generic API they can use to group cores together?
>
Unfortunately, sysadmin/orchestration can't cover all use cases (such as
the graph use case above), and it is definitely too much HW detail for the
application; that is why we need some programmatic way (APIs) for
applications.
And we are on the same page that the further the abstraction (APIs) stays
from architecture details, the better; the overall intention is to give
applications a way to find lcores that work efficiently with each other.
For this, what do you think about a slightly different API *, like:
```
rte_get_next_lcore_ex(uint i, u32 flag)
```
Based on the flag, we can grab the next eligible lcore; for this patch the
flag can be `RTE_LCORE_LLC`, but the options are wide, and different
architectures can use different groupings to benefit most from the HW in a
vendor-agnostic way.
I like the idea, what do you think about this abstraction?
* Kudos to Vipin 😉
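As an illustration, a hypothetical use of such a flag-based iterator could
look like the sketch below; note that neither rte_get_next_lcore_ex() nor
RTE_LCORE_LLC exists in DPDK yet, worker_main() is application-defined, and
the -1/RTE_MAX_LCORE convention simply mirrors rte_get_next_lcore():
```
#include <rte_launch.h>
#include <rte_lcore.h>

#define RTE_LCORE_LLC (1u << 0)	/* proposed: walk lcores grouped by shared LLC */

extern int worker_main(void *arg);	/* application-defined worker loop */

/* Launch workers so that consecutively returned lcores share an LLC. */
static void
launch_llc_grouped_workers(void)
{
	unsigned int lcore;

	for (lcore = rte_get_next_lcore_ex(-1, RTE_LCORE_LLC);
	     lcore < RTE_MAX_LCORE;
	     lcore = rte_get_next_lcore_ex(lcore, RTE_LCORE_LLC))
		rte_eal_remote_launch(worker_main, NULL, lcore);
}
```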
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-09-05 15:34 ` Ferruh Yigit
@ 2024-09-06 8:44 ` Burakov, Anatoly
2024-09-09 14:14 ` Varghese, Vipin
0 siblings, 1 reply; 56+ messages in thread
From: Burakov, Anatoly @ 2024-09-06 8:44 UTC (permalink / raw)
To: Ferruh Yigit, Varghese, Vipin, dev; +Cc: Mattias Rönnblom
>> Yes, this does help clarify things a lot as to why current NUMA support
>> would be insufficient to express what you are describing.
>>
>> However, in that case I would echo sentiment others have expressed
>> already as this kind of deep sysfs parsing doesn't seem like it would be
>> in scope for EAL, it sounds more like something a sysadmin/orchestration
>> (or the application itself) would do.
>>
>> I mean, in principle I'm not opposed to having such an API, it just
>> seems like the abstraction would perhaps need to be a bit more robust
>> than directly referencing cache structure? Maybe something that
>> degenerates into NUMA nodes would be better, so that applications
>> wouldn't have to *specifically* worry about cache locality but instead
>> have a more generic API they can use to group cores together?
>>
>
> Unfortunately can't cover all usecases by sysadmin/orchestration (as
> graph usecase one above), and definitely too much HW detail for the
> application, that is why we required some programmatic way (APIs) for
> applications.
>
> And we are on the same page that, the more we can get away from
> architecture details in the abstraction (APIs) better it is, overall
> intention is to provide ways to application to find lcores works
> efficiently with each other.
>
> For this what do you think about slightly different API *, like:
> ```
> rte_get_next_lcore_ex(uint i, u32 flag)
> ```
>
> Based on the flag, we can grab the next eligible lcore, for this patch
> the flag can be `RTE_LCORE_LLC`, but options are wide and different
> architectures can have different grouping to benefit most from HW in a
> vendor agnostic way.
> I like the idea, what do you think about this abstraction?
>
> * Kudos to Vipin 😉
>
Hi Ferruh,
In principle, having flags for this sort of thing sounds like a better
way to go. I do like this idea as well! It of course remains to be seen
how it can work in practice but to me it certainly looks like a path
worth exploring.
--
Thanks,
Anatoly
^ permalink raw reply [flat|nested] 56+ messages in thread
* RE: [RFC 0/2] introduce LLC aware functions
2024-09-06 8:44 ` Burakov, Anatoly
@ 2024-09-09 14:14 ` Varghese, Vipin
0 siblings, 0 replies; 56+ messages in thread
From: Varghese, Vipin @ 2024-09-09 14:14 UTC (permalink / raw)
To: Burakov, Anatoly, Yigit, Ferruh, dev; +Cc: Mattias Rönnblom
<snipped>
>
> >> Yes, this does help clarify things a lot as to why current NUMA
> >> support would be insufficient to express what you are describing.
> >>
> >> However, in that case I would echo sentiment others have expressed
> >> already as this kind of deep sysfs parsing doesn't seem like it would
> >> be in scope for EAL, it sounds more like something a
> >> sysadmin/orchestration (or the application itself) would do.
> >>
> >> I mean, in principle I'm not opposed to having such an API, it just
> >> seems like the abstraction would perhaps need to be a bit more robust
> >> than directly referencing cache structure? Maybe something that
> >> degenerates into NUMA nodes would be better, so that applications
> >> wouldn't have to *specifically* worry about cache locality but
> >> instead have a more generic API they can use to group cores together?
> >>
> >
> > Unfortunately can't cover all usecases by sysadmin/orchestration (as
> > graph usecase one above), and definitely too much HW detail for the
> > application, that is why we required some programmatic way (APIs) for
> > applications.
> >
> > And we are on the same page that, the more we can get away from
> > architecture details in the abstraction (APIs) better it is, overall
> > intention is to provide ways to application to find lcores works
> > efficiently with each other.
> >
> > For this what do you think about slightly different API *, like:
> > ```
> > rte_get_next_lcore_ex(uint i, u32 flag) ```
> >
> > Based on the flag, we can grab the next eligible lcore, for this patch
> > the flag can be `RTE_LCORE_LLC`, but options are wide and different
> > architectures can have different grouping to benefit most from HW in a
> > vendor agnostic way.
> > I like the idea, what do you think about this abstraction?
> >
> > * Kudos to Vipin 😉
> >
>
> Hi Ferruh,
>
> In principle, having flags for this sort of thing sounds like a better way to go. I
> do like this idea as well! It of course remains to be seen how it can work in
> practice but to me it certainly looks like a path worth exploring.
>
Sharing the new RFC shortly.
<snipped>
^ permalink raw reply [flat|nested] 56+ messages in thread
* Re: [RFC 0/2] introduce LLC aware functions
2024-08-27 15:10 [RFC 0/2] introduce LLC aware functions Vipin Varghese
` (3 preceding siblings ...)
2024-08-28 8:38 ` Burakov, Anatoly
@ 2024-10-07 21:28 ` Stephen Hemminger
2024-10-21 8:17 ` Varghese, Vipin
4 siblings, 1 reply; 56+ messages in thread
From: Stephen Hemminger @ 2024-10-07 21:28 UTC (permalink / raw)
To: Vipin Varghese; +Cc: ferruh.yigit, dev
On Tue, 27 Aug 2024 20:40:12 +0530
Vipin Varghese <vipin.varghese@amd.com> wrote:
> As core density continues to increase, chiplet-based
> core packing has become a key trend. In AMD SoC EPYC
> architectures, core complexes within the same chiplet
> share a Last-Level Cache (LLC). By packing logical cores
> within the same LLC, we can enhance pipeline processing
> stages due to reduced latency and improved data locality.
>
> To leverage these benefits, DPDK libraries and examples
> can utilize localized lcores. This approach ensures more
> consistent latencies by minimizing the dispersion of lcores
> across different chiplet complexes and enhances packet
> processing by ensuring that data for subsequent pipeline
> stages is likely to reside within the LLC.
>
> < Function: Purpose >
> ---------------------
> - rte_get_llc_first_lcores: Retrieves all the first lcores in the shared LLC.
> - rte_get_llc_lcore: Retrieves all lcores that share the LLC.
> - rte_get_llc_n_lcore: Retrieves the first n or skips the first n lcores in the shared LLC.
>
> < MACRO: Purpose >
> ------------------
> RTE_LCORE_FOREACH_LLC_FIRST: iterates through all first lcore from each LLC.
> RTE_LCORE_FOREACH_LLC_FIRST_WORKER: iterates through all first worker lcore from each LLC.
> RTE_LCORE_FOREACH_LLC_WORKER: iterates lcores from LLC based on hint (lcore id).
> RTE_LCORE_FOREACH_LLC_SKIP_FIRST_WORKER: iterates lcores from LLC while skipping first worker.
> RTE_LCORE_FOREACH_LLC_FIRST_N_WORKER: iterates through `n` lcores from each LLC.
> RTE_LCORE_FOREACH_LLC_SKIP_N_WORKER: skips the first `n` lcores, then iterates through the remaining lcores in each LLC.
>
> Vipin Varghese (2):
> eal: add llc aware functions
> eal/lcore: add llc aware for each macro
>
> lib/eal/common/eal_common_lcore.c | 279 ++++++++++++++++++++++++++++--
> lib/eal/include/rte_lcore.h | 89 ++++++++++
When are you going to send a new version?
Need:
- new functions need to be marked experimental
- has to build cleanly on all platforms.
- need functional tests
- address all the other review comments
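For reference, the experimental marking usually amounts to tagging the new
declarations and listing the symbols in the EXPERIMENTAL section of
lib/eal/version.map; a sketch of the header side, based on existing DPDK
conventions rather than this exact series:
```
/* lib/eal/include/rte_lcore.h (sketch) */
#include <rte_compat.h>		/* provides __rte_experimental */

__rte_experimental
unsigned int rte_get_llc_first_lcores(rte_cpuset_t *llc_cpu);
```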
^ permalink raw reply [flat|nested] 56+ messages in thread
* RE: [RFC 0/2] introduce LLC aware functions
2024-10-07 21:28 ` Stephen Hemminger
@ 2024-10-21 8:17 ` Varghese, Vipin
0 siblings, 0 replies; 56+ messages in thread
From: Varghese, Vipin @ 2024-10-21 8:17 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Yigit, Ferruh, dev
>
> When are you going to send a new version?
We have been testing this on various Intel and AMD platforms and have completed testing over sub-NUMA domains on both SoCs.
We will be sharing the new patch (rfc-v2) before 22 Oct 2024.
>
> Need:
> - new functions need to be marked experimental
> - has to build cleanly on all platforms.
> - need functional tests
> - address all the other review comments
^ permalink raw reply [flat|nested] 56+ messages in thread