From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga14.intel.com (mga14.intel.com [192.55.52.115]) by dpdk.org (Postfix) with ESMTP id 16F249AD7 for ; Tue, 3 Feb 2015 09:19:07 +0100 (CET) Received: from fmsmga003.fm.intel.com ([10.253.24.29]) by fmsmga103.fm.intel.com with ESMTP; 03 Feb 2015 00:12:32 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="4.97,862,1389772800"; d="scan'208";a="449082370" Received: from shvmail01.sh.intel.com ([10.239.29.42]) by FMSMGA003.fm.intel.com with ESMTP; 03 Feb 2015 00:04:45 -0800 Received: from shecgisg004.sh.intel.com (shecgisg004.sh.intel.com [10.239.29.89]) by shvmail01.sh.intel.com with ESMTP id t138Iot3000886; Tue, 3 Feb 2015 16:18:50 +0800 Received: from shecgisg004.sh.intel.com (localhost [127.0.0.1]) by shecgisg004.sh.intel.com (8.13.6/8.13.6/SuSE Linux 0.8) with ESMTP id t138Ilk6028213; Tue, 3 Feb 2015 16:18:49 +0800 Received: (from dyzhou@localhost) by shecgisg004.sh.intel.com (8.13.6/8.13.6/Submit) id t138Ilxg028209; Tue, 3 Feb 2015 16:18:47 +0800 From: Zhou Danny To: dev@dpdk.org Date: Tue, 3 Feb 2015 16:18:31 +0800 Message-Id: <1422951511-28143-6-git-send-email-danny.zhou@intel.com> X-Mailer: git-send-email 1.7.4.1 In-Reply-To: <1422951511-28143-1-git-send-email-danny.zhou@intel.com> References: <1422951511-28143-1-git-send-email-danny.zhou@intel.com> Subject: [dpdk-dev] [PATCH v2 5/5] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: patches and discussions about DPDK List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 03 Feb 2015 08:19:09 -0000 v2 change - Remove unused function which is for debug purpose Demonstrate how to handle per rx queue interrupt in a NAPI-like implementation in userspace. The DPDK polling thread mainly works in polling mode and switches to interrupt mode only if no packet has been received in recent polls. 
Userspace interrupt notification generally takes a lot more cycles than in the kernel, so a one-shot interrupt is used here to guarantee minimum overhead, and the DPDK polling thread returns to polling mode immediately once it receives an interrupt notification for an incoming packet. Signed-off-by: Danny Zhou Tested-by: Yong Liu --- examples/l3fwd-power/main.c | 141 +++++++++++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 41 deletions(-) diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c index f6b55b9..15f0a5a 100644 --- a/examples/l3fwd-power/main.c +++ b/examples/l3fwd-power/main.c @@ -75,12 +75,13 @@ #include #include #include +#include #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1 #define MAX_PKT_BURST 32 -#define MIN_ZERO_POLL_COUNT 5 +#define MIN_ZERO_POLL_COUNT 10 /* around 100ms at 2 Ghz */ #define TIMER_RESOLUTION_CYCLES 200000000ULL @@ -188,6 +189,9 @@ struct lcore_rx_queue { #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS #define MAX_RX_QUEUE_PER_PORT 128 +#define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16 + + #define MAX_LCORE_PARAMS 1024 struct lcore_params { uint8_t port_id; @@ -214,7 +218,7 @@ static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) / static struct rte_eth_conf port_conf = { .rxmode = { - .mq_mode = ETH_MQ_RX_RSS, + .mq_mode = ETH_MQ_RX_RSS, .max_rx_pkt_len = ETHER_MAX_LEN, .split_hdr_size = 0, .header_split = 0, /**< Header Split disabled */ @@ -226,11 +230,14 @@ static struct rte_eth_conf port_conf = { .rx_adv_conf = { .rss_conf = { .rss_key = NULL, - .rss_hf = ETH_RSS_IP, + .rss_hf = ETH_RSS_UDP, }, }, .txmode = { - .mq_mode = ETH_DCB_NONE, + .mq_mode = ETH_MQ_TX_NONE, + }, + .intr_conf = { + .rxq = 1, /**< rxq interrupt feature enabled */ }, }; @@ -402,19 +409,22 @@ power_timer_cb(__attribute__((unused)) struct rte_timer *tim, /* accumulate total execution time in us when callback is invoked */ sleep_time_ratio = (float)(stats[lcore_id].sleep_time) / (float)SCALING_PERIOD; - /** * check whether 
need to scale down frequency a step if it sleep a lot. */ - if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) - rte_power_freq_down(lcore_id); + if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) { + if (rte_power_freq_down) + rte_power_freq_down(lcore_id); + } else if ( (unsigned)(stats[lcore_id].nb_rx_processed / - stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) + stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) { /** * scale down a step if average packet per iteration less * than expectation. */ - rte_power_freq_down(lcore_id); + if (rte_power_freq_down) + rte_power_freq_down(lcore_id); + } /** * initialize another timer according to current frequency to ensure @@ -707,22 +717,20 @@ l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid, } -#define SLEEP_GEAR1_THRESHOLD 100 -#define SLEEP_GEAR2_THRESHOLD 1000 +#define MINIMUM_SLEEP_TIME 1 +#define SUSPEND_THRESHOLD 300 static inline uint32_t power_idle_heuristic(uint32_t zero_rx_packet_count) { - /* If zero count is less than 100, use it as the sleep time in us */ - if (zero_rx_packet_count < SLEEP_GEAR1_THRESHOLD) - return zero_rx_packet_count; - /* If zero count is less than 1000, sleep time should be 100 us */ - else if ((zero_rx_packet_count >= SLEEP_GEAR1_THRESHOLD) && - (zero_rx_packet_count < SLEEP_GEAR2_THRESHOLD)) - return SLEEP_GEAR1_THRESHOLD; - /* If zero count is greater than 1000, sleep time should be 1000 us */ - else if (zero_rx_packet_count >= SLEEP_GEAR2_THRESHOLD) - return SLEEP_GEAR2_THRESHOLD; + /* If zero count is less than 100, sleep 1us */ + if (zero_rx_packet_count < SUSPEND_THRESHOLD) + return MINIMUM_SLEEP_TIME; + /* If zero count is less than 1000, sleep 100 us which is the minimum latency + switching from C3/C6 to C0 + */ + else + return SUSPEND_THRESHOLD; return 0; } @@ -762,6 +770,35 @@ power_freq_scaleup_heuristic(unsigned lcore_id, return FREQ_CURRENT; } +/** + * force polling thread sleep until one-shot rx interrupt triggers + * @param port_id + * 
Port id. + * @param queue_id + * Rx queue id. + * @return + * 0 on success + */ +static int +sleep_until_rx_interrupt(uint8_t port_id, uint8_t queue_id) +{ + /* Enable one-shot rx interrupt */ + rte_eth_dev_rx_queue_intr_enable(port_id, queue_id); + + RTE_LOG(INFO, L3FWD_POWER, + "lcore %u sleeps until interrupt on port%d,rxq%d triggers\n", + rte_lcore_id(), port_id, queue_id); + rte_eal_wait_rx_intr(port_id, queue_id); + RTE_LOG(INFO, L3FWD_POWER, + "lcore %u is waked up from rx interrupt on port%d,rxq%d\n", + rte_lcore_id(), port_id, queue_id); + + /* Disable one-shot rx interrupt */ + rte_eth_dev_rx_queue_intr_disable(port_id, queue_id); + + return 0; +} + /* main processing loop */ static int main_loop(__attribute__((unused)) void *dummy) @@ -775,7 +812,6 @@ main_loop(__attribute__((unused)) void *dummy) struct lcore_conf *qconf; struct lcore_rx_queue *rx_queue; enum freq_scale_hint_t lcore_scaleup_hint; - uint32_t lcore_rx_idle_count = 0; uint32_t lcore_idle_hint = 0; @@ -835,6 +871,8 @@ main_loop(__attribute__((unused)) void *dummy) prev_tsc_power = cur_tsc_power; } + +start_rx: /* * Read packet from RX queues */ @@ -848,6 +886,7 @@ main_loop(__attribute__((unused)) void *dummy) nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, MAX_PKT_BURST); + stats[lcore_id].nb_rx_processed += nb_rx; if (unlikely(nb_rx == 0)) { /** @@ -910,10 +949,13 @@ main_loop(__attribute__((unused)) void *dummy) rx_queue->freq_up_hint; } - if (lcore_scaleup_hint == FREQ_HIGHEST) - rte_power_freq_max(lcore_id); - else if (lcore_scaleup_hint == FREQ_HIGHER) - rte_power_freq_up(lcore_id); + if (lcore_scaleup_hint == FREQ_HIGHEST) { + if (rte_power_freq_max) + rte_power_freq_max(lcore_id); + } else if (lcore_scaleup_hint == FREQ_HIGHER) { + if (rte_power_freq_up) + rte_power_freq_up(lcore_id); + } } else { /** * All Rx queues empty in recent consecutive polls, @@ -928,16 +970,21 @@ main_loop(__attribute__((unused)) void *dummy) lcore_idle_hint = rx_queue->idle_hint; } - if ( 
lcore_idle_hint < SLEEP_GEAR1_THRESHOLD) + if (lcore_idle_hint < SUSPEND_THRESHOLD) /** - * execute "pause" instruction to avoid context - * switch for short sleep. - */ + * execute "pause" instruction to avoid context + * switch which generally take hundres of microsecond + * for short sleep. + */ rte_delay_us(lcore_idle_hint); - else - /* long sleep force runing thread to suspend */ - usleep(lcore_idle_hint); - + else { + /* suspend untill rx interrupt trigges */ + sleep_until_rx_interrupt( + qconf->rx_queue_list[0].port_id, + qconf->rx_queue_list[0].queue_id); + /* start receiving packets immediately */ + goto start_rx; + } stats[lcore_id].sleep_time += lcore_idle_hint; } } @@ -1270,7 +1317,7 @@ setup_hash(int socketid) char s[64]; /* create ipv4 hash */ - snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid); + rte_snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid); ipv4_l3fwd_hash_params.name = s; ipv4_l3fwd_hash_params.socket_id = socketid; ipv4_l3fwd_lookup_struct[socketid] = @@ -1280,7 +1327,7 @@ setup_hash(int socketid) "socket %d\n", socketid); /* create ipv6 hash */ - snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid); + rte_snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid); ipv6_l3fwd_hash_params.name = s; ipv6_l3fwd_hash_params.socket_id = socketid; ipv6_l3fwd_lookup_struct[socketid] = @@ -1476,6 +1523,7 @@ main(int argc, char **argv) unsigned lcore_id; uint64_t hz; uint32_t n_tx_queue, nb_lcores; + uint32_t dev_rxq_num, dev_txq_num; uint8_t portid, nb_rx_queue, queue, socketid; /* catch SIGINT and restore cpufreq governor to ondemand */ @@ -1525,10 +1573,18 @@ main(int argc, char **argv) printf("Initializing port %d ... 
", portid ); fflush(stdout); + rte_eth_dev_info_get(portid, &dev_info); + dev_rxq_num = dev_info.max_rx_queues; + dev_txq_num = dev_info.max_tx_queues; + nb_rx_queue = get_port_n_rx_queues(portid); + if (nb_rx_queue > dev_rxq_num) + rte_exit(EXIT_FAILURE, "Cannot configure not existed rxq: " + "port=%d\n", portid); + n_tx_queue = nb_lcores; - if (n_tx_queue > MAX_TX_QUEUE_PER_PORT) - n_tx_queue = MAX_TX_QUEUE_PER_PORT; + if (n_tx_queue > dev_txq_num) + n_tx_queue = dev_txq_num; printf("Creating queues: nb_rxq=%d nb_txq=%u... ", nb_rx_queue, (unsigned)n_tx_queue ); ret = rte_eth_dev_configure(portid, nb_rx_queue, @@ -1552,6 +1608,9 @@ main(int argc, char **argv) if (rte_lcore_is_enabled(lcore_id) == 0) continue; + if (queueid >= dev_txq_num) + continue; + if (numa_on) socketid = \ (uint8_t)rte_lcore_to_socket_id(lcore_id); @@ -1586,8 +1645,9 @@ main(int argc, char **argv) /* init power management library */ ret = rte_power_init(lcore_id); if (ret) - rte_exit(EXIT_FAILURE, "Power management library " - "initialization failed on core%u\n", lcore_id); + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_POWER, + "Power management library initialization " + "failed on core%u", lcore_id); /* init timer structures for each enabled lcore */ rte_timer_init(&power_timers[lcore_id]); @@ -1635,7 +1695,6 @@ main(int argc, char **argv) if (ret < 0) rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, " "port=%d\n", ret, portid); - /* * If enabled, put device in promiscuous mode. * This allows IO forwarding mode to forward packets -- 1.8.1.4