From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id A642BA0540; Tue, 13 Dec 2022 12:52:13 +0100 (CET) Received: from mails.dpdk.org (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 97DC24021D; Tue, 13 Dec 2022 12:52:13 +0100 (CET) Received: from smartserver.smartsharesystems.com (smartserver.smartsharesystems.com [77.243.40.215]) by mails.dpdk.org (Postfix) with ESMTP id E7E8C40146 for ; Tue, 13 Dec 2022 12:52:12 +0100 (CET) Content-class: urn:content-classes:message MIME-Version: 1.0 Subject: RE: [PATCH v4 1/4] eal: add generic support for reading PMU events Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable Date: Tue, 13 Dec 2022 12:52:11 +0100 X-MimeOLE: Produced By Microsoft Exchange V6.5 Message-ID: <98CBD80474FA8B44BF855DF32C47DC35D8758C@smartserver.smartshare.dk> In-Reply-To: <20221213104350.3218167-2-tduszynski@marvell.com> X-MS-Has-Attach: X-MS-TNEF-Correlator: Thread-Topic: [PATCH v4 1/4] eal: add generic support for reading PMU events Thread-Index: AdkO39Uca+TwAyHaR+u53fJ5202lkwABM1/A References: <20221129092821.1304853-1-tduszynski@marvell.com> <20221213104350.3218167-1-tduszynski@marvell.com> <20221213104350.3218167-2-tduszynski@marvell.com> From: =?iso-8859-1?Q?Morten_Br=F8rup?= To: "Tomasz Duszynski" , Cc: , , X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org > From: Tomasz Duszynski [mailto:tduszynski@marvell.com] > Sent: Tuesday, 13 December 2022 11.44 >=20 > Add support for programming PMU counters and reading their values > in runtime bypassing kernel completely. >=20 > This is especially useful in cases where CPU cores are isolated > (nohz_full) i.e run dedicated tasks. 
In such cases one cannot use > standard perf utility without sacrificing latency and performance. >=20 > Signed-off-by: Tomasz Duszynski > --- > +++ b/lib/eal/common/rte_pmu.c > @@ -0,0 +1,456 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(C) 2022 Marvell International Ltd. > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include > +#include > +#include > + > +#include "pmu_private.h" > + > +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices" > + > +#ifndef GENMASK_ULL > +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> > ((64 - 1 - (h))))) > +#endif > + > +#ifndef FIELD_PREP > +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) > & (m)) > +#endif > + > +struct rte_pmu *rte_pmu; > + > +/* > + * Following __rte_weak functions provide default no-op. = Architectures > should override them if > + * necessary. > + */ > + > +int > +__rte_weak pmu_arch_init(void) > +{ > + return 0; > +} > + > +void > +__rte_weak pmu_arch_fini(void) > +{ > +} > + > +void > +__rte_weak pmu_arch_fixup_config(uint64_t config[3]) > +{ > + RTE_SET_USED(config); > +} > + > +static int > +get_term_format(const char *name, int *num, uint64_t *mask) > +{ > + char *config =3D NULL; > + char path[PATH_MAX]; > + int high, low, ret; > + FILE *fp; > + > + /* quiesce -Wmaybe-uninitialized warning */ > + *num =3D 0; > + *mask =3D 0; > + > + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH > "/%s/format/%s", rte_pmu->name, name); > + fp =3D fopen(path, "r"); > + if (!fp) > + return -errno; > + > + errno =3D 0; > + ret =3D fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high); > + if (ret < 2) { > + ret =3D -ENODATA; > + goto out; > + } > + if (errno) { > + ret =3D -errno; > + goto out; > + } > + > + if (ret =3D=3D 2) > + high =3D low; > + > + *mask =3D GENMASK_ULL(high, low); > + /* Last digit should be [012]. 
If last digit is missing 0 is > implied. */ > + *num =3D config[strlen(config) - 1]; > + *num =3D isdigit(*num) ? *num - '0' : 0; > + > + ret =3D 0; > +out: > + free(config); > + fclose(fp); > + > + return ret; > +} > + > +static int > +parse_event(char *buf, uint64_t config[3]) > +{ > + char *token, *term; > + int num, ret, val; > + uint64_t mask; > + > + config[0] =3D config[1] =3D config[2] =3D 0; > + > + token =3D strtok(buf, ","); > + while (token) { > + errno =3D 0; > + /* =3D */ > + ret =3D sscanf(token, "%m[^=3D]=3D%i", &term, &val); > + if (ret < 1) > + return -ENODATA; > + if (errno) > + return -errno; > + if (ret =3D=3D 1) > + val =3D 1; > + > + ret =3D get_term_format(term, &num, &mask); > + free(term); > + if (ret) > + return ret; > + > + config[num] |=3D FIELD_PREP(mask, val); > + token =3D strtok(NULL, ","); > + } > + > + return 0; > +} > + > +static int > +get_event_config(const char *name, uint64_t config[3]) > +{ > + char path[PATH_MAX], buf[BUFSIZ]; > + FILE *fp; > + int ret; > + > + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH > "/%s/events/%s", rte_pmu->name, name); > + fp =3D fopen(path, "r"); > + if (!fp) > + return -errno; > + > + ret =3D fread(buf, 1, sizeof(buf), fp); > + if (ret =3D=3D 0) { > + fclose(fp); > + > + return -EINVAL; > + } > + fclose(fp); > + buf[ret] =3D '\0'; > + > + return parse_event(buf, config); > +} > + > +static int > +do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd) > +{ > + struct perf_event_attr attr =3D { > + .size =3D sizeof(struct perf_event_attr), > + .type =3D PERF_TYPE_RAW, > + .exclude_kernel =3D 1, > + .exclude_hv =3D 1, > + .disabled =3D 1, > + }; > + > + pmu_arch_fixup_config(config); > + > + attr.config =3D config[0]; > + attr.config1 =3D config[1]; > + attr.config2 =3D config[2]; > + > + return syscall(SYS_perf_event_open, &attr, rte_gettid(), > rte_lcore_to_cpu_id(lcore_id), > + group_fd, 0); > +} > + > +static int > +open_events(int lcore_id) > +{ > + struct 
rte_pmu_event_group *group =3D &rte_pmu->group[lcore_id]; > + struct rte_pmu_event *event; > + uint64_t config[3]; > + int num =3D 0, ret; > + > + /* group leader gets created first, with fd =3D -1 */ > + group->fds[0] =3D -1; > + > + TAILQ_FOREACH(event, &rte_pmu->event_list, next) { > + ret =3D get_event_config(event->name, config); > + if (ret) { > + RTE_LOG(ERR, EAL, "failed to get %s event config\n", > event->name); > + continue; > + } > + > + ret =3D do_perf_event_open(config, lcore_id, group->fds[0]); > + if (ret =3D=3D -1) { > + if (errno =3D=3D EOPNOTSUPP) > + RTE_LOG(ERR, EAL, "64 bit counters not > supported\n"); > + > + ret =3D -errno; > + goto out; > + } > + > + group->fds[event->index] =3D ret; > + num++; > + } > + > + return 0; > +out: > + for (--num; num >=3D 0; num--) { > + close(group->fds[num]); > + group->fds[num] =3D -1; > + } > + > + > + return ret; > +} > + > +static int > +mmap_events(int lcore_id) > +{ > + struct rte_pmu_event_group *group =3D &rte_pmu->group[lcore_id]; > + void *addr; > + int ret, i; > + > + for (i =3D 0; i < rte_pmu->num_group_events; i++) { > + addr =3D mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, > group->fds[i], 0); > + if (addr =3D=3D MAP_FAILED) { > + ret =3D -errno; > + goto out; > + } > + > + group->mmap_pages[i] =3D addr; > + } > + > + return 0; > +out: > + for (; i; i--) { > + munmap(group->mmap_pages[i - 1], rte_mem_page_size()); > + group->mmap_pages[i - 1] =3D NULL; > + } > + > + return ret; > +} > + > +static void > +cleanup_events(int lcore_id) > +{ > + struct rte_pmu_event_group *group =3D &rte_pmu->group[lcore_id]; > + int i; > + > + if (!group->fds) > + return; > + > + if (group->fds[0] !=3D -1) > + ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, > PERF_IOC_FLAG_GROUP); > + > + for (i =3D 0; i < rte_pmu->num_group_events; i++) { > + if (group->mmap_pages[i]) { > + munmap(group->mmap_pages[i], rte_mem_page_size()); > + group->mmap_pages[i] =3D NULL; > + } > + > + if (group->fds[i] !=3D -1) { > + 
close(group->fds[i]); > + group->fds[i] =3D -1; > + } > + } > + > + free(group->mmap_pages); > + free(group->fds); > + > + group->mmap_pages =3D NULL; > + group->fds =3D NULL; > + group->enabled =3D false; > +} > + > +int __rte_noinline > +rte_pmu_enable_group(int lcore_id) > +{ > + struct rte_pmu_event_group *group =3D &rte_pmu->group[lcore_id]; > + int ret; > + > + if (rte_pmu->num_group_events =3D=3D 0) { > + RTE_LOG(DEBUG, EAL, "no matching PMU events\n"); > + > + return 0; > + } > + > + group->fds =3D calloc(rte_pmu->num_group_events, sizeof(*group- > >fds)); > + if (!group->fds) { > + RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n"); > + > + return -ENOMEM; > + } > + > + group->mmap_pages =3D calloc(rte_pmu->num_group_events, > sizeof(*group->mmap_pages)); > + if (!group->mmap_pages) { > + RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n"); > + > + ret =3D -ENOMEM; > + goto out; > + } > + > + ret =3D open_events(lcore_id); > + if (ret) { > + RTE_LOG(ERR, EAL, "failed to open events on lcore-worker- > %d\n", lcore_id); > + goto out; > + } > + > + ret =3D mmap_events(lcore_id); > + if (ret) { > + RTE_LOG(ERR, EAL, "failed to map events on lcore-worker- > %d\n", lcore_id); > + goto out; > + } > + > + if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, > PERF_IOC_FLAG_GROUP) =3D=3D -1) { > + RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker- > %d\n", lcore_id); > + > + ret =3D -errno; > + goto out; > + } > + > + return 0; > + > +out: > + cleanup_events(lcore_id); > + > + return ret; > +} > + > +static int > +scan_pmus(void) > +{ > + char path[PATH_MAX]; > + struct dirent *dent; > + const char *name; > + DIR *dirp; > + > + dirp =3D opendir(EVENT_SOURCE_DEVICES_PATH); > + if (!dirp) > + return -errno; > + > + while ((dent =3D readdir(dirp))) { > + name =3D dent->d_name; > + if (name[0] =3D=3D '.') > + continue; > + > + /* sysfs entry should either contain cpus or be a cpu */ > + if (!strcmp(name, "cpu")) > + break; > + > + snprintf(path, 
sizeof(path), EVENT_SOURCE_DEVICES_PATH > "/%s/cpus", name); > + if (access(path, F_OK) =3D=3D 0) > + break; > + } > + > + closedir(dirp); > + > + if (dent) { > + rte_pmu->name =3D strdup(name); > + if (!rte_pmu->name) > + return -ENOMEM; > + } > + > + return rte_pmu->name ? 0 : -ENODEV; > +} > + > +int > +rte_pmu_add_event(const char *name) > +{ > + struct rte_pmu_event *event; > + char path[PATH_MAX]; > + > + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH > "/%s/events/%s", rte_pmu->name, name); > + if (access(path, R_OK)) > + return -ENODEV; > + > + TAILQ_FOREACH(event, &rte_pmu->event_list, next) { > + if (!strcmp(event->name, name)) > + return event->index; > + continue; > + } > + > + event =3D calloc(1, sizeof(*event)); > + if (!event) > + return -ENOMEM; > + > + event->name =3D strdup(name); > + if (!event->name) { > + free(event); > + > + return -ENOMEM; > + } > + > + event->index =3D rte_pmu->num_group_events++; > + TAILQ_INSERT_TAIL(&rte_pmu->event_list, event, next); > + > + RTE_LOG(DEBUG, EAL, "%s even added at index %d\n", name, event- > >index); > + > + return event->index; > +} > + > +void > +eal_pmu_init(void) > +{ > + int ret; > + > + rte_pmu =3D calloc(1, sizeof(*rte_pmu)); > + if (!rte_pmu) { > + RTE_LOG(ERR, EAL, "failed to alloc PMU\n"); > + > + return; > + } > + > + TAILQ_INIT(&rte_pmu->event_list); > + > + ret =3D scan_pmus(); > + if (ret) { > + RTE_LOG(ERR, EAL, "failed to find core pmu\n"); > + goto out; > + } > + > + ret =3D pmu_arch_init(); > + if (ret) { > + RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n"); > + goto out; > + } > + > + return; > +out: > + free(rte_pmu->name); > + free(rte_pmu); > +} > + > +void > +eal_pmu_fini(void) > +{ > + struct rte_pmu_event *event, *tmp; > + int lcore_id; > + > + RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu->event_list, next, tmp) { > + TAILQ_REMOVE(&rte_pmu->event_list, event, next); > + free(event->name); > + free(event); > + } > + > + RTE_LCORE_FOREACH_WORKER(lcore_id) > + 
cleanup_events(lcore_id); > + > + pmu_arch_fini(); > + free(rte_pmu->name); > + free(rte_pmu); > +} > diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build > index cfcd40aaed..3bf830adee 100644 > --- a/lib/eal/include/meson.build > +++ b/lib/eal/include/meson.build > @@ -36,6 +36,7 @@ headers +=3D files( > 'rte_pci_dev_features.h', > 'rte_per_lcore.h', > 'rte_pflock.h', > + 'rte_pmu.h', > 'rte_random.h', > 'rte_reciprocal.h', > 'rte_seqcount.h', > diff --git a/lib/eal/include/rte_pmu.h b/lib/eal/include/rte_pmu.h > new file mode 100644 > index 0000000000..e4b4f6b052 > --- /dev/null > +++ b/lib/eal/include/rte_pmu.h > @@ -0,0 +1,204 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2022 Marvell > + */ > + > +#ifndef _RTE_PMU_H_ > +#define _RTE_PMU_H_ > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +#include > +#include > + > +#ifdef RTE_EXEC_ENV_LINUX > + > +#include > + > +#include > +#include > +#include > +#include > + > +/** > + * @file > + * > + * PMU event tracing operations > + * > + * This file defines generic API and types necessary to setup PMU and > + * read selected counters in runtime. > + */ > + > +/** > + * A structure describing a group of events. > + */ > +struct rte_pmu_event_group { > + int *fds; /**< array of event descriptors */ > + void **mmap_pages; /**< array of pointers to mmapped > perf_event_attr structures */ There seems to be a lot of indirection involved here. Why are these = arrays not statically sized, instead of dynamically allocated? Also, what is the reason for hiding the type struct perf_event_mmap_page = **mmap_pages opaque by using void **mmap_pages instead? > + bool enabled; /**< true if group was enabled on particular lcore > */ > +}; > + > +/** > + * A structure describing an event. 
> + */ > +struct rte_pmu_event { > + char *name; /** name of an event */ > + int index; /** event index into fds/mmap_pages */ > + TAILQ_ENTRY(rte_pmu_event) next; /** list entry */ > +}; > + > +/** > + * A PMU state container. > + */ > +struct rte_pmu { > + char *name; /** name of core PMU listed under > /sys/bus/event_source/devices */ > + struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore > event group data */ > + int num_group_events; /**< number of events in a group */ > + TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching > events */ > +}; > + > +/** Pointer to the PMU state container */ > +extern struct rte_pmu *rte_pmu; Again, why not just extern struct rte_pmu, instead of dynamic = allocation? > + > +/** Each architecture supporting PMU needs to provide its own version > */ > +#ifndef rte_pmu_pmc_read > +#define rte_pmu_pmc_read(index) ({ 0; }) > +#endif > + > +/** > + * @internal > + * > + * Read PMU counter. > + * > + * @param pc > + * Pointer to the mmapped user page. > + * @return > + * Counter value read from hardware. > + */ > +__rte_internal > +static __rte_always_inline uint64_t > +rte_pmu_read_userpage(struct perf_event_mmap_page *pc) > +{ > + uint64_t offset, width, pmc =3D 0; > + uint32_t seq, index; > + int tries =3D 100; > + > + for (;;) { > + seq =3D pc->lock; > + rte_compiler_barrier(); > + index =3D pc->index; > + offset =3D pc->offset; > + width =3D pc->pmc_width; > + > + if (likely(pc->cap_user_rdpmc && index)) { > + pmc =3D rte_pmu_pmc_read(index - 1); > + pmc <<=3D 64 - width; > + pmc >>=3D 64 - width; > + } > + > + rte_compiler_barrier(); > + > + if (likely(pc->lock =3D=3D seq)) > + return pmc + offset; > + > + if (--tries =3D=3D 0) { > + RTE_LOG(DEBUG, EAL, "failed to get > perf_event_mmap_page lock\n"); > + break; > + } > + } > + > + return 0; > +} > + > +/** > + * @internal > + * > + * Enable group of events for a given lcore. > + * > + * @param lcore_id > + * The identifier of the lcore. 
> + * @return > + * 0 in case of success, negative value otherwise. > + */ > +__rte_internal > +int > +rte_pmu_enable_group(int lcore_id); > + > +/** > + * @warning > + * @b EXPERIMENTAL: this API may change without prior notice > + * > + * Add event to the group of enabled events. > + * > + * @param name > + * Name of an event listed under > /sys/bus/event_source/devices/pmu/events. > + * @return > + * Event index in case of success, negative value otherwise. > + */ > +__rte_experimental > +int > +rte_pmu_add_event(const char *name); > + > +/** > + * @warning > + * @b EXPERIMENTAL: this API may change without prior notice > + * > + * Read hardware counter configured to count occurrences of an event. > + * > + * @param index > + * Index of an event to be read. > + * @return > + * Event value read from register. In case of errors or lack of > support > + * 0 is returned. In other words, stream of zeros in a trace file > + * indicates problem with reading particular PMU event register. > + */ > +__rte_experimental > +static __rte_always_inline uint64_t > +rte_pmu_read(int index) > +{ > + int lcore_id =3D rte_lcore_id(); > + struct rte_pmu_event_group *group; > + int ret; > + > + if (!rte_pmu) > + return 0; > + > + group =3D &rte_pmu->group[lcore_id]; > + if (!group->enabled) { > + ret =3D rte_pmu_enable_group(lcore_id); > + if (ret) > + return 0; > + > + group->enabled =3D true; > + } Why is the group not enabled in the setup function, rte_pmu_add_event(), = instead of here, in the hot path? > + > + if (index < 0 || index >=3D rte_pmu->num_group_events) > + return 0; > + > + return rte_pmu_read_userpage((struct perf_event_mmap_page > *)group->mmap_pages[index]); Using fixed size arrays instead of multiple indirections via pointers is = faster. It could be: return rte_pmu_read_userpage((struct perf_event_mmap_page = *)rte_pmu.group[lcore_id].mmap_pages[index]); With or without suggested performance improvements... Series-acked-by: Morten Br=F8rup