DPDK patches and discussions
 help / color / mirror / Atom feed
From: Aaron Conole <aconole@redhat.com>
To: Arnon Warshavsky <arnon@qwilt.com>
Cc: thomas@monjalon.net, anatoly.burakov@intel.com,
	wenzhuo.lu@intel.com, declan.doherty@intel.com,
	jerin.jacob@caviumnetworks.com, bruce.richardson@intel.com,
	ferruh.yigit@intel.com, dev@dpdk.org,
	Kevin Traynor <ktraynor@redhat.com>
Subject: Re: [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence
Date: Thu, 19 Apr 2018 13:48:22 -0400	[thread overview]
Message-ID: <f7ttvs7ytnd.fsf@dhcp-25.97.bos.redhat.com> (raw)
In-Reply-To: <1524117669-25729-11-git-send-email-arnon@qwilt.com> (Arnon Warshavsky's message of "Thu, 19 Apr 2018 09:01:08 +0300")

Arnon Warshavsky <arnon@qwilt.com> writes:

> Local functions to this file,
> changing from void to int are non-abi-breaking.
> For handling the single function that cannot
> change from void to int due to abi,
> where this is the only place it is called in,
> I added a state variable that is being checked
> right after the call to this function.
>
> --
>
> v4 - fix split literal strings in log messages
>
> Signed-off-by: Arnon Warshavsky <arnon@qwilt.com>
> ---

Hi Arnon,

Always happy to see panic calls get removed.  I have some comments inline.

>  lib/librte_eal/bsdapp/eal/eal.c           |  86 ++++++++++++++-------
>  lib/librte_eal/bsdapp/eal/eal_thread.c    |  65 +++++++++++-----
>  lib/librte_eal/common/eal_common_launch.c |  21 ++++++
>  lib/librte_eal/common/include/rte_debug.h |  12 +++
>  lib/librte_eal/linuxapp/eal/eal.c         | 120 ++++++++++++++++++++----------
>  lib/librte_eal/linuxapp/eal/eal_thread.c  |  65 +++++++++++-----
>  6 files changed, 270 insertions(+), 99 deletions(-)
>
> diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
> index d996190..9c2f6f1 100644
> --- a/lib/librte_eal/bsdapp/eal/eal.c
> +++ b/lib/librte_eal/bsdapp/eal/eal.c
> @@ -151,7 +151,7 @@ enum rte_iova_mode
>   * We also don't lock the whole file, so that in future we can use read-locks
>   * on other parts, e.g. memzones, to detect if there are running secondary
>   * processes. */
> -static void
> +static int
>  rte_eal_config_create(void)
>  {
>  	void *rte_mem_cfg_addr;
> @@ -160,60 +160,78 @@ enum rte_iova_mode
>  	const char *pathname = eal_runtime_config_path();
>  
>  	if (internal_config.no_shconf)
> -		return;
> +		return 0;
>  
>  	if (mem_cfg_fd < 0){
>  		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
> -		if (mem_cfg_fd < 0)
> -			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
> +		if (mem_cfg_fd < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
> +					__func__, pathname);
> +			return -1;
> +		}
>  	}
>  
>  	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
>  	if (retval < 0){
>  		close(mem_cfg_fd);
> -		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
> +				__func__, pathname);
> +		return -1;

Previously, it wasn't possible for mem_cfg_fd to be reused after a
failure.  Now it is - please reset it to -1 in these close conditions.

>  	}
>  
>  	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
>  	if (retval < 0){
>  		close(mem_cfg_fd);
> -		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
> -				"process running?\n", pathname);
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'. Is another primary process running?\n",
> +				__func__, pathname);
> +		return -1;
>  	}
>  
>  	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
>  				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
>  
>  	if (rte_mem_cfg_addr == MAP_FAILED){
> -		rte_panic("Cannot mmap memory for rte_config\n");
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
> +				__func__);
> +		return -1;
>  	}
>  	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
>  	rte_config.mem_config = rte_mem_cfg_addr;
> +
> +	return 0;
>  }
>  
>  /* attach to an existing shared memory config */
> -static void
> +static int
>  rte_eal_config_attach(void)
>  {
>  	void *rte_mem_cfg_addr;
>  	const char *pathname = eal_runtime_config_path();
>  
>  	if (internal_config.no_shconf)
> -		return;
> +		return 0;
>  
>  	if (mem_cfg_fd < 0){
>  		mem_cfg_fd = open(pathname, O_RDWR);
> -		if (mem_cfg_fd < 0)
> -			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
> +		if (mem_cfg_fd < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
> +					__func__, pathname);
> +			return -1;
> +		}
>  	}
>  
>  	rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
>  				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
>  	close(mem_cfg_fd);

Again, previously this would have aborted on a failure.  So mem_cfg_fd
needs to be reset to a value that allows retry.

> -	if (rte_mem_cfg_addr == MAP_FAILED)
> -		rte_panic("Cannot mmap memory for rte_config\n");
> +	if (rte_mem_cfg_addr == MAP_FAILED) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	rte_config.mem_config = rte_mem_cfg_addr;
> +
> +	return 0;
>  }
>  
>  /* Detect if we are a primary or a secondary process */
> @@ -237,23 +255,28 @@ enum rte_proc_type_t
>  }
>  
>  /* Sets up rte_config structure with the pointer to shared memory config.*/
> -static void
> +static int
>  rte_config_init(void)
>  {
>  	rte_config.process_type = internal_config.process_type;
>  
>  	switch (rte_config.process_type){
>  	case RTE_PROC_PRIMARY:
> -		rte_eal_config_create();
> +		if (rte_eal_config_create())
> +			return -1;
>  		break;
>  	case RTE_PROC_SECONDARY:
> -		rte_eal_config_attach();
> +		if (rte_eal_config_attach())
> +			return -1;
>  		rte_eal_mcfg_wait_complete(rte_config.mem_config);
>  		break;
>  	case RTE_PROC_AUTO:
>  	case RTE_PROC_INVALID:

Not for this patch, but I just noticed that this should probably use a
'default' case.

> -		rte_panic("Invalid process type\n");
> +		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
> +				__func__, rte_config.process_type);
> +		return -1;
>  	}
> +	return 0;
>  }
>  
>  /* display usage */
> @@ -595,7 +618,8 @@ static void rte_eal_init_alert(const char *msg)
>  
>  	rte_srand(rte_rdtsc());
>  
> -	rte_config_init();
> +	if (rte_config_init() != 0)
> +		return -1;

Use rte_eal_init_alert to indicate why you are failing the init.

>  	if (rte_mp_channel_init() < 0) {
>  		rte_eal_init_alert("failed to init mp channel\n");
> @@ -652,7 +676,8 @@ static void rte_eal_init_alert(const char *msg)
>  
>  	eal_check_mem_on_local_socket();
>  
> -	eal_thread_init_master(rte_config.master_lcore);
> +	if (eal_thread_init_master(rte_config.master_lcore) != 0)
> +		return -1;

Is it ever possible to recover from this?  This still needs an
rte_eal_init_alert() call.

>  
>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
>  
> @@ -666,18 +691,27 @@ static void rte_eal_init_alert(const char *msg)
>  		 * create communication pipes between master thread
>  		 * and children
>  		 */
> -		if (pipe(lcore_config[i].pipe_master2slave) < 0)
> -			rte_panic("Cannot create pipe\n");
> -		if (pipe(lcore_config[i].pipe_slave2master) < 0)
> -			rte_panic("Cannot create pipe\n");
> +		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
> +					__func__);
> +			return -1;
> +		}
> +		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
> +					__func__);
> +			return -1;
> +		}

How are you cleaning up the threads that were spawned?  Let's say this
loop will execute 5 times, and on the 3rd entry, these errors happen.
You now leave DPDK 'half-initialized' - you've spun up threads and
allocated memory.

Also, again use rte_eal_init_alert().  It was added for a reason :)

>  
>  		lcore_config[i].state = WAIT;
>  
>  		/* create a thread for each lcore */
>  		ret = pthread_create(&lcore_config[i].thread_id, NULL,
>  				     eal_thread_loop, NULL);
> -		if (ret != 0)
> -			rte_panic("Cannot create thread\n");
> +		if (ret != 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
> +					__func__);
> +			return -1;
> +		}

Same question as before.  If pthread_create is failing, there are worse
problems than aborting.

>  		/* Set thread_name for aid in debugging. */
>  		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
> diff --git a/lib/librte_eal/bsdapp/eal/eal_thread.c b/lib/librte_eal/bsdapp/eal/eal_thread.c
> index d602daf..5c3947c 100644
> --- a/lib/librte_eal/bsdapp/eal/eal_thread.c
> +++ b/lib/librte_eal/bsdapp/eal/eal_thread.c
> @@ -51,16 +51,22 @@
>  	n = 0;
>  	while (n == 0 || (n < 0 && errno == EINTR))
>  		n = write(m2s, &c, 1);
> -	if (n < 0)
> -		rte_panic("cannot write on configuration pipe\n");
> +	if (n < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	/* wait ack */
>  	do {
>  		n = read(s2m, &c, 1);
>  	} while (n < 0 && errno == EINTR);
>  
> -	if (n <= 0)
> -		rte_panic("cannot read on configuration pipe\n");
> +	if (n <= 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	return 0;
>  }
> @@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
>  	RTE_PER_LCORE(_lcore_id) = lcore_id;
>  
>  	/* set CPU affinity */
> -	if (eal_thread_set_affinity() < 0)
> -		rte_panic("cannot set affinity\n");
> +	if (eal_thread_set_affinity() < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
> +		rte_move_to_panic_state();
> +	}
> +}
> +
> +/* move to panic state and do not return */
> +static __attribute__((noreturn)) void
> +defunct_and_remain_in_endless_loop(void)
> +{
> +	rte_move_to_panic_state();
> +	while (1)
> +		sleep(1);
>  }

This is worse than a panic.  Users will blame applications for appearing
to freeze.  Please leave the panics in place rather than do this.

>  /* main loop of threads */
> @@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
>  		if (thread_id == lcore_config[lcore_id].thread_id)
>  			break;
>  	}
> -	if (lcore_id == RTE_MAX_LCORE)
> -		rte_panic("cannot retrieve lcore id\n");
> +	if (lcore_id == RTE_MAX_LCORE) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
> +				__func__);
> +		defunct_and_remain_in_endless_loop();
> +	}

I'm not even sure this check has merit, tbh.  Is there ever a chance for
an lcore thread to be spawned like this?  Probably a better patch would
just remove all the code you've inserted, but keep the check you
removed.

>  	m2s = lcore_config[lcore_id].pipe_master2slave[0];
>  	s2m = lcore_config[lcore_id].pipe_slave2master[1];
> @@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
>  	RTE_PER_LCORE(_lcore_id) = lcore_id;
>  
>  	/* set CPU affinity */
> -	if (eal_thread_set_affinity() < 0)
> -		rte_panic("cannot set affinity\n");
> +	if (eal_thread_set_affinity() < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
> +		defunct_and_remain_in_endless_loop();

How does this improve the user experience?

> +	}
>  
>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
>  
> @@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
>  			n = read(m2s, &c, 1);
>  		} while (n < 0 && errno == EINTR);
>  
> -		if (n <= 0)
> -			rte_panic("cannot read on configuration pipe\n");
> +		if (n <= 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();

Same question.  Actually this could happen on shutdown, I think?  If
there's a race where the pipe is torn down before the thread?  Not sure
if there are any ordering guarantees around that.

> +		}
>  
>  		lcore_config[lcore_id].state = RUNNING;
>  
> @@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
>  		n = 0;
>  		while (n == 0 || (n < 0 && errno == EINTR))
>  			n = write(s2m, &c, 1);
> -		if (n < 0)
> -			rte_panic("cannot write on configuration pipe\n");
> -
> -		if (lcore_config[lcore_id].f == NULL)
> -			rte_panic("NULL function pointer\n");
> +		if (n < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();
> +		}
> +
> +		if (lcore_config[lcore_id].f == NULL) {
> +			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();
> +		}

I don't see how any of this is better for the user.  In fact, I think
this is worse because it will make portions of the application stop
working without any way to move forward.  rte_panic() will at least give
the process a chance to recover from a potentially ephemeral condition.

>  		/* call the function and store the return value */
>  		fct_arg = lcore_config[lcore_id].arg;
> diff --git a/lib/librte_eal/common/eal_common_launch.c b/lib/librte_eal/common/eal_common_launch.c
> index fe0ba3f..6f8bd46 100644
> --- a/lib/librte_eal/common/eal_common_launch.c
> +++ b/lib/librte_eal/common/eal_common_launch.c
> @@ -14,6 +14,7 @@
>  #include <rte_pause.h>
>  #include <rte_per_lcore.h>
>  #include <rte_lcore.h>
> +#include <rte_debug.h>
>  
>  /*
>   * Wait until a lcore finished its job.
> @@ -88,3 +89,23 @@ enum rte_lcore_state_t
>  		rte_eal_wait_lcore(lcore_id);
>  	}
>  }
> +
> +/* panic state */
> +static int _panic_state;
> +
> +/**
> + * Check if the system is in panic state
> + * @return int
> + */
> +int rte_get_panic_state(void)
> +{
> +	return _panic_state;
> +}
> +
> +/**
> + * Move the system to be in panic state
> + */
> +void rte_move_to_panic_state(void)
> +{
> +	_panic_state = 1;
> +}
> diff --git a/lib/librte_eal/common/include/rte_debug.h b/lib/librte_eal/common/include/rte_debug.h
> index 272df49..b421d33 100644
> --- a/lib/librte_eal/common/include/rte_debug.h
> +++ b/lib/librte_eal/common/include/rte_debug.h
> @@ -79,4 +79,16 @@ void __rte_panic(const char *funcname , const char *format, ...)
>  }
>  #endif
>  
> +/**
> + * Check if the system is in panic state
> + * @return int
> + */
> +int rte_get_panic_state(void);
> +
> +/**
> + * Move the system to be in panic state
> + */
> +void rte_move_to_panic_state(void);

This seems to only exist as a way of triggering the run_once check in
the eal_init.  It doesn't add anything except one more state variable to
check against.  What is the purpose?

Further, it seems unrelated to removing panics.

> +
>  #endif /* _RTE_DEBUG_H_ */
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index 21afa73..393441a 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -160,7 +160,7 @@ enum rte_iova_mode
>   * We also don't lock the whole file, so that in future we can use read-locks
>   * on other parts, e.g. memzones, to detect if there are running secondary
>   * processes. */
> -static void
> +static int
>  rte_eal_config_create(void)
>  {
>  	void *rte_mem_cfg_addr;
> @@ -169,7 +169,7 @@ enum rte_iova_mode
>  	const char *pathname = eal_runtime_config_path();
>  
>  	if (internal_config.no_shconf)
> -		return;
> +		return 0;
>  
>  	/* map the config before hugepage address so that we don't waste a page */
>  	if (internal_config.base_virtaddr != 0)
> @@ -179,30 +179,39 @@ enum rte_iova_mode
>  	else
>  		rte_mem_cfg_addr = NULL;
>  
> -	if (mem_cfg_fd < 0){
> +	if (mem_cfg_fd < 0) {
>  		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
> -		if (mem_cfg_fd < 0)
> -			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
> +		if (mem_cfg_fd < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
> +				__func__, pathname);
> +			return -1;
> +		}
>  	}
>  
>  	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
> -	if (retval < 0){
> +	if (retval < 0) {
>  		close(mem_cfg_fd);
> -		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot resize '%s' for rte_mem_config\n",
> +				__func__, pathname);
> +		return -1;
>  	}
>  
>  	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
> -	if (retval < 0){
> +	if (retval < 0) {
>  		close(mem_cfg_fd);
> -		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
> -				"process running?\n", pathname);
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot create lock on '%s'."
> +				" Is another primary process running?\n",
> +				__func__, pathname);
> +		return -1;
>  	}
>  
>  	rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
>  				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
>  
> -	if (rte_mem_cfg_addr == MAP_FAILED){
> -		rte_panic("Cannot mmap memory for rte_config\n");
> +	if (rte_mem_cfg_addr == MAP_FAILED) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config\n",
> +			__func__);
> +		return -1;
>  	}
>  	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
>  	rte_config.mem_config = rte_mem_cfg_addr;
> @@ -211,10 +220,11 @@ enum rte_iova_mode
>  	 * processes could later map the config into this exact location */
>  	rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
>  
> +	return 0;
>  }
>  
>  /* attach to an existing shared memory config */
> -static void
> +static int
>  rte_eal_config_attach(void)
>  {
>  	struct rte_mem_config *mem_config;
> @@ -222,33 +232,40 @@ enum rte_iova_mode
>  	const char *pathname = eal_runtime_config_path();
>  
>  	if (internal_config.no_shconf)
> -		return;
> +		return 0;
>  
> -	if (mem_cfg_fd < 0){
> +	if (mem_cfg_fd < 0) {
>  		mem_cfg_fd = open(pathname, O_RDWR);
> -		if (mem_cfg_fd < 0)
> -			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
> +		if (mem_cfg_fd < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot open '%s' for rte_mem_config\n",
> +						__func__, pathname);
> +			return -1;
> +		}
>  	}
>  
>  	/* map it as read-only first */
>  	mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
>  			PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
> -	if (mem_config == MAP_FAILED)
> -		rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
> -			  errno, strerror(errno));
> +	if (mem_config == MAP_FAILED) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for rte_config! error %i (%s)\n",
> +				__func__, errno, strerror(errno));
> +		return -1;
> +	}
>  
>  	rte_config.mem_config = mem_config;
> +
> +	return 0;
>  }
>  
>  /* reattach the shared config at exact memory location primary process has it */
> -static void
> +static int
>  rte_eal_config_reattach(void)
>  {
>  	struct rte_mem_config *mem_config;
>  	void *rte_mem_cfg_addr;
>  
>  	if (internal_config.no_shconf)
> -		return;
> +		return 0;
>  
>  	/* save the address primary process has mapped shared config to */
>  	rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
> @@ -263,16 +280,21 @@ enum rte_iova_mode
>  	if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
>  		if (mem_config != MAP_FAILED)
>  			/* errno is stale, don't use */
> -			rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]"
> -				  " - please use '--base-virtaddr' option\n",
> -				  rte_mem_cfg_addr, mem_config);
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
> +					"rte_config at [%p], got [%p] - please use "
> +					"'--base-virtaddr' option\n",
> +					__func__, rte_mem_cfg_addr, mem_config);
>  		else
> -			rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
> -				  errno, strerror(errno));
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot mmap memory for "
> +					"rte_config! error %i (%s)\n",
> +					__func__, errno, strerror(errno));
> +		return -1;
>  	}
>  	close(mem_cfg_fd);
>  
>  	rte_config.mem_config = mem_config;
> +
> +	return 0;
>  }
>  
>  /* Detect if we are a primary or a secondary process */
> @@ -296,24 +318,31 @@ enum rte_proc_type_t
>  }
>  
>  /* Sets up rte_config structure with the pointer to shared memory config.*/
> -static void
> +static int
>  rte_config_init(void)
>  {
>  	rte_config.process_type = internal_config.process_type;
>  
>  	switch (rte_config.process_type){
>  	case RTE_PROC_PRIMARY:
> -		rte_eal_config_create();
> +		if (rte_eal_config_create() != 0)
> +			return -1;
>  		break;
>  	case RTE_PROC_SECONDARY:
> -		rte_eal_config_attach();
> +		if (rte_eal_config_attach() != 0)
> +			return -1;
>  		rte_eal_mcfg_wait_complete(rte_config.mem_config);
> -		rte_eal_config_reattach();
> +		if (rte_eal_config_reattach() != 0)
> +			return -1;
>  		break;
>  	case RTE_PROC_AUTO:
>  	case RTE_PROC_INVALID:
> -		rte_panic("Invalid process type\n");
> +		RTE_LOG(CRIT, EAL, "%s(): Invalid process type %d\n",
> +				__func__, rte_config.process_type);
> +		return -1;
>  	}
> +
> +	return 0;
>  }
>  
>  /* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
> @@ -820,7 +849,8 @@ static void rte_eal_init_alert(const char *msg)
>  
>  	rte_srand(rte_rdtsc());
>  
> -	rte_config_init();
> +	if (rte_config_init() != 0)
> +		return -1;
>  
>  	if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
>  		rte_eal_init_alert("Cannot init logging.");
> @@ -892,6 +922,9 @@ static void rte_eal_init_alert(const char *msg)
>  
>  	eal_thread_init_master(rte_config.master_lcore);
>  
> +	if (rte_get_panic_state())
> +		return -1;
> +

Please just use run_once.  That's a better way of preventing this.

>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
>  
>  	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
> @@ -909,18 +942,27 @@ static void rte_eal_init_alert(const char *msg)
>  		 * create communication pipes between master thread
>  		 * and children
>  		 */
> -		if (pipe(lcore_config[i].pipe_master2slave) < 0)
> -			rte_panic("Cannot create pipe\n");
> -		if (pipe(lcore_config[i].pipe_slave2master) < 0)
> -			rte_panic("Cannot create pipe\n");
> +		if (pipe(lcore_config[i].pipe_master2slave) < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
> +					__func__);
> +			return -1;
> +		}
> +		if (pipe(lcore_config[i].pipe_slave2master) < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create pipe\n",
> +					__func__);
> +			return -1;
> +		}
>  
>  		lcore_config[i].state = WAIT;
>  
>  		/* create a thread for each lcore */
>  		ret = pthread_create(&lcore_config[i].thread_id, NULL,
>  				     eal_thread_loop, NULL);
> -		if (ret != 0)
> -			rte_panic("Cannot create thread\n");
> +		if (ret != 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot create thread\n",
> +					__func__);
> +			return -1;
> +		}
>  
>  		/* Set thread_name for aid in debugging. */
>  		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
> diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
> index 08e150b..3afcee5 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_thread.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_thread.c

All of the comments from the bsd side apply here.

> @@ -51,16 +51,22 @@
>  	n = 0;
>  	while (n == 0 || (n < 0 && errno == EINTR))
>  		n = write(m2s, &c, 1);
> -	if (n < 0)
> -		rte_panic("cannot write on configuration pipe\n");
> +	if (n < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	/* wait ack */
>  	do {
>  		n = read(s2m, &c, 1);
>  	} while (n < 0 && errno == EINTR);
>  
> -	if (n <= 0)
> -		rte_panic("cannot read on configuration pipe\n");
> +	if (n <= 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
> +				__func__);
> +		return -1;
> +	}
>  
>  	return 0;
>  }
> @@ -84,8 +90,19 @@ void eal_thread_init_master(unsigned lcore_id)
>  	RTE_PER_LCORE(_lcore_id) = lcore_id;
>  
>  	/* set CPU affinity */
> -	if (eal_thread_set_affinity() < 0)
> -		rte_panic("cannot set affinity\n");
> +	if (eal_thread_set_affinity() < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
> +		rte_move_to_panic_state();
> +	}
> +}
> +
> +/* move to panic state and do not return */
> +static __attribute__((noreturn)) void
> +defunct_and_remain_in_endless_loop(void)
> +{
> +	rte_move_to_panic_state();
> +	while (1)
> +		sleep(1);
>  }
>  
>  /* main loop of threads */
> @@ -106,8 +123,11 @@ void eal_thread_init_master(unsigned lcore_id)
>  		if (thread_id == lcore_config[lcore_id].thread_id)
>  			break;
>  	}
> -	if (lcore_id == RTE_MAX_LCORE)
> -		rte_panic("cannot retrieve lcore id\n");
> +	if (lcore_id == RTE_MAX_LCORE) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot retrieve lcore id\n",
> +				__func__);
> +		defunct_and_remain_in_endless_loop();
> +	}
>  
>  	m2s = lcore_config[lcore_id].pipe_master2slave[0];
>  	s2m = lcore_config[lcore_id].pipe_slave2master[1];
> @@ -116,8 +136,10 @@ void eal_thread_init_master(unsigned lcore_id)
>  	RTE_PER_LCORE(_lcore_id) = lcore_id;
>  
>  	/* set CPU affinity */
> -	if (eal_thread_set_affinity() < 0)
> -		rte_panic("cannot set affinity\n");
> +	if (eal_thread_set_affinity() < 0) {
> +		RTE_LOG(CRIT, EAL, "%s(): Cannot set affinity\n", __func__);
> +		defunct_and_remain_in_endless_loop();
> +	}
>  
>  	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
>  
> @@ -133,8 +155,11 @@ void eal_thread_init_master(unsigned lcore_id)
>  			n = read(m2s, &c, 1);
>  		} while (n < 0 && errno == EINTR);
>  
> -		if (n <= 0)
> -			rte_panic("cannot read on configuration pipe\n");
> +		if (n <= 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot read on configuration pipe\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();
> +		}
>  
>  		lcore_config[lcore_id].state = RUNNING;
>  
> @@ -142,11 +167,17 @@ void eal_thread_init_master(unsigned lcore_id)
>  		n = 0;
>  		while (n == 0 || (n < 0 && errno == EINTR))
>  			n = write(s2m, &c, 1);
> -		if (n < 0)
> -			rte_panic("cannot write on configuration pipe\n");
> -
> -		if (lcore_config[lcore_id].f == NULL)
> -			rte_panic("NULL function pointer\n");
> +		if (n < 0) {
> +			RTE_LOG(CRIT, EAL, "%s(): Cannot write on configuration pipe\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();
> +		}
> +
> +		if (lcore_config[lcore_id].f == NULL) {
> +			RTE_LOG(CRIT, EAL, "%s(): NULL function pointer\n",
> +					__func__);
> +			defunct_and_remain_in_endless_loop();
> +		}
>  
>  		/* call the function and store the return value */
>  		fct_arg = lcore_config[lcore_id].arg;

  parent reply	other threads:[~2018-04-19 17:48 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-19  6:00 [dpdk-dev] [PATCH v4 00/11] eal: replace calls to rte_panic and refrain from new instances Arnon Warshavsky
2018-04-19  6:00 ` [dpdk-dev] [PATCH v4 01/11] crypto: replace rte_panic instances in crypto driver Arnon Warshavsky
2018-04-19 10:53   ` Trahe, Fiona
2018-04-19 13:49     ` Arnon Warshavsky
2018-04-19  6:01 ` [dpdk-dev] [PATCH v4 02/11] bond: replace rte_panic instances in bonding driver Arnon Warshavsky
2018-04-19 17:25   ` Kevin Traynor
2018-04-20 13:13     ` Arnon Warshavsky
2018-04-19  6:01 ` [dpdk-dev] [PATCH v4 03/11] e1000: replace rte_panic instances in e1000 driver Arnon Warshavsky
2018-04-19 17:25   ` Kevin Traynor
2018-04-20 13:14     ` Arnon Warshavsky
2018-04-19  6:01 ` [dpdk-dev] [PATCH v4 04/11] ixgbe: replace rte_panic instances in ixgbe driver Arnon Warshavsky
2018-04-19 17:26   ` Kevin Traynor
2018-04-20 13:16     ` Arnon Warshavsky
2018-04-19  6:01 ` [dpdk-dev] [PATCH v4 05/11] eal: replace rte_panic instances in eventdev Arnon Warshavsky
2018-04-19 17:26   ` Kevin Traynor
2018-04-20 13:17     ` Arnon Warshavsky
2018-04-19  6:01 ` [dpdk-dev] [PATCH v4 06/11] kni: replace rte_panic instances in kni Arnon Warshavsky
2018-04-19  6:01 ` [dpdk-dev] [PATCH v4 07/11] eal: replace rte_panic instances in hugepage_info Arnon Warshavsky
2018-04-19 14:03   ` Burakov, Anatoly
2018-04-19 14:09     ` Arnon Warshavsky
2018-04-19 14:45       ` Burakov, Anatoly
2018-04-19 14:50         ` Burakov, Anatoly
2018-04-20 13:11           ` Arnon Warshavsky
2018-04-19 14:36   ` Kevin Traynor
2018-04-20 13:12     ` Arnon Warshavsky
2018-04-19  6:01 ` [dpdk-dev] [PATCH v4 08/11] eal: replace rte_panic instances in interrupts thread Arnon Warshavsky
2018-04-19 17:27   ` Kevin Traynor
2018-04-20 13:18     ` Arnon Warshavsky
2018-04-19  6:01 ` [dpdk-dev] [PATCH v4 09/11] eal: replace rte_panic instances in ethdev Arnon Warshavsky
2018-04-19 17:27   ` Kevin Traynor
2018-04-20 13:23     ` Arnon Warshavsky
2018-04-20 13:56       ` Thomas Monjalon
2018-04-19  6:01 ` [dpdk-dev] [PATCH v4 10/11] eal: replace rte_panic instances in init sequence Arnon Warshavsky
2018-04-19 14:39   ` Burakov, Anatoly
2018-04-19 14:48     ` Arnon Warshavsky
2018-04-19 14:57       ` Burakov, Anatoly
2018-04-19 17:31         ` Kevin Traynor
2018-04-20 13:32           ` Arnon Warshavsky
2018-04-20 13:31         ` Arnon Warshavsky
2018-04-19 17:48   ` Aaron Conole [this message]
2018-04-20 13:55     ` Arnon Warshavsky
2018-04-20 14:53       ` Aaron Conole
2018-04-23  8:07         ` Arnon Warshavsky
2018-04-19  6:01 ` [dpdk-dev] [PATCH v4 11/11] devtools: prevent new instances of rte_panic and rte_exit Arnon Warshavsky
2018-04-19 17:52   ` Aaron Conole
2018-04-20 14:01     ` Arnon Warshavsky
2018-04-20 15:41       ` Burakov, Anatoly

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f7ttvs7ytnd.fsf@dhcp-25.97.bos.redhat.com \
    --to=aconole@redhat.com \
    --cc=anatoly.burakov@intel.com \
    --cc=arnon@qwilt.com \
    --cc=bruce.richardson@intel.com \
    --cc=declan.doherty@intel.com \
    --cc=dev@dpdk.org \
    --cc=ferruh.yigit@intel.com \
    --cc=jerin.jacob@caviumnetworks.com \
    --cc=ktraynor@redhat.com \
    --cc=thomas@monjalon.net \
    --cc=wenzhuo.lu@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).