Blame SOURCES/0255-RHBZ-1638651-marginal-path.patch

4ae388
---
4ae388
 libmultipath/Makefile      |    7 
4ae388
 libmultipath/config.h      |   12 
4ae388
 libmultipath/configure.c   |   18 -
4ae388
 libmultipath/configure.h   |    3 
4ae388
 libmultipath/defaults.h    |    1 
4ae388
 libmultipath/dict.c        |  410 ++++++++++++++++++++++++
4ae388
 libmultipath/io_err_stat.c |  763 +++++++++++++++++++++++++++++++++++++++++++++
4ae388
 libmultipath/io_err_stat.h |   15 
4ae388
 libmultipath/propsel.c     |   98 +++++
4ae388
 libmultipath/propsel.h     |    4 
4ae388
 libmultipath/structs.h     |   14 
4ae388
 libmultipath/time-util.c   |   42 ++
4ae388
 libmultipath/time-util.h   |   13 
4ae388
 libmultipath/uevent.c      |   38 ++
4ae388
 libmultipath/uevent.h      |    2 
4ae388
 multipath/multipath.conf.5 |  108 ++++++
4ae388
 multipathd/cli_handlers.c  |    2 
4ae388
 multipathd/main.c          |   64 +++
4ae388
 18 files changed, 1599 insertions(+), 15 deletions(-)
4ae388
4ae388
Index: multipath-tools-130222/libmultipath/Makefile
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/Makefile
4ae388
+++ multipath-tools-130222/libmultipath/Makefile
4ae388
@@ -7,16 +7,17 @@ include ../Makefile.inc
4ae388
 SONAME=0
4ae388
 DEVLIB = libmultipath.so
4ae388
 LIBS = $(DEVLIB).$(SONAME)
4ae388
-LIBDEPS = -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir) -lmpathcmd
4ae388
+LIBDEPS = -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir) -lmpathcmd -laio
4ae388
 CFLAGS += -fPIC -I$(mpathcmddir) -I$(mpathpersistdir)
4ae388
 
4ae388
 OBJS = memory.o parser.o vector.o devmapper.o \
4ae388
        hwtable.o blacklist.o util.o dmparser.o config.o \
4ae388
        structs.o discovery.o propsel.o dict.o \
4ae388
-       pgpolicies.o debug.o regex.o defaults.o uevent.o \
4ae388
+       pgpolicies.o debug.o regex.o defaults.o uevent.o time-util.o \
4ae388
        switchgroup.o uxsock.o print.o alias.o log_pthread.o \
4ae388
        log.o configure.o structs_vec.o sysfs.o prio.o checkers.o \
4ae388
-       lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o prkey.o
4ae388
+       lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o prkey.o \
4ae388
+       io_err_stat.o
4ae388
 
4ae388
 LIBDM_API_FLUSH = $(shell grep -Ecs '^[a-z]*[[:space:]]+dm_task_no_flush' /usr/include/libdevmapper.h)
4ae388
 
4ae388
Index: multipath-tools-130222/libmultipath/config.h
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/config.h
4ae388
+++ multipath-tools-130222/libmultipath/config.h
4ae388
@@ -67,6 +67,10 @@ struct hwentry {
4ae388
 	int deferred_remove;
4ae388
 	int delay_watch_checks;
4ae388
 	int delay_wait_checks;
4ae388
+	int marginal_path_err_sample_time;
4ae388
+	int marginal_path_err_rate_threshold;
4ae388
+	int marginal_path_err_recheck_gap_time;
4ae388
+	int marginal_path_double_failed_time;
4ae388
 	int skip_kpartx;
4ae388
 	int max_sectors_kb;
4ae388
 	int unpriv_sgio;
4ae388
@@ -100,6 +104,10 @@ struct mpentry {
4ae388
 	int deferred_remove;
4ae388
 	int delay_watch_checks;
4ae388
 	int delay_wait_checks;
4ae388
+	int marginal_path_err_sample_time;
4ae388
+	int marginal_path_err_rate_threshold;
4ae388
+	int marginal_path_err_recheck_gap_time;
4ae388
+	int marginal_path_double_failed_time;
4ae388
 	int skip_kpartx;
4ae388
 	int max_sectors_kb;
4ae388
 	int unpriv_sgio;
4ae388
@@ -153,6 +161,10 @@ struct config {
4ae388
 	int processed_main_config;
4ae388
 	int delay_watch_checks;
4ae388
 	int delay_wait_checks;
4ae388
+	int marginal_path_err_sample_time;
4ae388
+	int marginal_path_err_rate_threshold;
4ae388
+	int marginal_path_err_recheck_gap_time;
4ae388
+	int marginal_path_double_failed_time;
4ae388
 	int retrigger_tries;
4ae388
 	int retrigger_delay;
4ae388
 	int new_bindings_in_boot;
4ae388
Index: multipath-tools-130222/libmultipath/configure.c
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/configure.c
4ae388
+++ multipath-tools-130222/libmultipath/configure.c
4ae388
@@ -42,6 +42,7 @@
4ae388
 #include "uxsock.h"
4ae388
 #include "wwids.h"
4ae388
 #include "sysfs.h"
4ae388
+#include "io_err_stat.h"
4ae388
 
4ae388
 /* group paths in pg by host adapter
4ae388
  */
4ae388
@@ -257,7 +258,8 @@ int rr_optimize_path_order(struct pathgr
4ae388
 }
4ae388
 
4ae388
 extern int
4ae388
-setup_map (struct multipath * mpp, char * params, int params_size)
4ae388
+setup_map (struct multipath * mpp, char * params, int params_size,
4ae388
+	   struct vectors *vecs)
4ae388
 {
4ae388
 	struct pathgroup * pgp;
4ae388
 	int i, old_nr_active;
4ae388
@@ -297,11 +299,21 @@ setup_map (struct multipath * mpp, char
4ae388
 	select_deferred_remove(mpp);
4ae388
 	select_delay_watch_checks(mpp);
4ae388
 	select_delay_wait_checks(mpp);
4ae388
+	select_marginal_path_err_sample_time(mpp);
4ae388
+	select_marginal_path_err_rate_threshold(mpp);
4ae388
+	select_marginal_path_err_recheck_gap_time(mpp);
4ae388
+	select_marginal_path_double_failed_time(mpp);
4ae388
 	select_skip_kpartx(mpp);
4ae388
 	select_max_sectors_kb(mpp);
4ae388
 	select_unpriv_sgio(mpp);
4ae388
 
4ae388
 	sysfs_set_scsi_tmo(mpp);
4ae388
+
4ae388
+	if (mpp->marginal_path_double_failed_time > 0 &&
4ae388
+	    mpp->marginal_path_err_sample_time > 0 &&
4ae388
+	    mpp->marginal_path_err_recheck_gap_time > 0 &&
4ae388
+	    mpp->marginal_path_err_rate_threshold >= 0)
4ae388
+		start_io_err_stat_thread(vecs);
4ae388
 	/*
4ae388
 	 * assign paths to path groups -- start with no groups and all paths
4ae388
 	 * in mpp->paths
4ae388
@@ -867,7 +879,7 @@ coalesce_paths (struct vectors * vecs, v
4ae388
 		verify_paths(mpp, vecs, NULL);
4ae388
 
4ae388
 		params[0] = '\0';
4ae388
-		if (setup_map(mpp, params, PARAMS_SIZE)) {
4ae388
+		if (setup_map(mpp, params, PARAMS_SIZE, vecs)) {
4ae388
 			remove_map(mpp, vecs, 0);
4ae388
 			continue;
4ae388
 		}
4ae388
@@ -1118,7 +1130,7 @@ extern int reload_map(struct vectors *ve
4ae388
 		vector_foreach_slot (mpp->paths, pp, i)
4ae388
 			pathinfo(pp, conf->hwtable, DI_PRIO);
4ae388
 	}
4ae388
-	if (setup_map(mpp, params, PARAMS_SIZE)) {
4ae388
+	if (setup_map(mpp, params, PARAMS_SIZE, vecs)) {
4ae388
 		condlog(0, "%s: failed to setup map", mpp->alias);
4ae388
 		return 1;
4ae388
 	}
4ae388
Index: multipath-tools-130222/libmultipath/configure.h
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/configure.h
4ae388
+++ multipath-tools-130222/libmultipath/configure.h
4ae388
@@ -24,7 +24,8 @@ enum actions {
4ae388
 #define FLUSH_ONE 1
4ae388
 #define FLUSH_ALL 2
4ae388
 
4ae388
-int setup_map (struct multipath * mpp, char * params, int params_size );
4ae388
+int setup_map (struct multipath * mpp, char * params, int params_size,
4ae388
+	       struct vectors *vecs);
4ae388
 int domap (struct multipath * mpp, char * params);
4ae388
 int reinstate_paths (struct multipath *mpp);
4ae388
 int check_daemon(void);
4ae388
Index: multipath-tools-130222/libmultipath/defaults.h
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/defaults.h
4ae388
+++ multipath-tools-130222/libmultipath/defaults.h
4ae388
@@ -22,6 +22,7 @@
4ae388
 #define DEFAULT_DETECT_CHECKER DETECT_CHECKER_OFF
4ae388
 #define DEFAULT_DEFERRED_REMOVE DEFERRED_REMOVE_OFF
4ae388
 #define DEFAULT_DELAY_CHECKS DELAY_CHECKS_OFF
4ae388
+#define DEFAULT_MARGINAL_PATH MARGINAL_PATH_OFF
4ae388
 #define DEFAULT_RETRIGGER_DELAY 10
4ae388
 #define DEFAULT_RETRIGGER_TRIES 3
4ae388
 #define DEFAULT_UEV_WAIT_TIMEOUT 30
4ae388
Index: multipath-tools-130222/libmultipath/dict.c
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/dict.c
4ae388
+++ multipath-tools-130222/libmultipath/dict.c
4ae388
@@ -1077,6 +1077,81 @@ def_all_tg_pt_handler(vector strvec)
4ae388
         return 0;
4ae388
 }
4ae388
 
4ae388
+static int
4ae388
+def_marginal_path_err_sample_time_handler(vector strvec)
4ae388
+{
4ae388
+	char * buff;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		conf->marginal_path_err_sample_time = MARGINAL_PATH_OFF;
4ae388
+	else if ((conf->marginal_path_err_sample_time = atoi(buff)) < 1)
4ae388
+		conf->marginal_path_err_sample_time = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+def_marginal_path_err_rate_threshold_handler(vector strvec)
4ae388
+{
4ae388
+	char * buff;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		conf->marginal_path_err_rate_threshold = MARGINAL_PATH_OFF;
4ae388
+	else if ((conf->marginal_path_err_rate_threshold = atoi(buff)) < 1)
4ae388
+		conf->marginal_path_err_rate_threshold = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+def_marginal_path_err_recheck_gap_time_handler(vector strvec)
4ae388
+{
4ae388
+	char * buff;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		conf->marginal_path_err_recheck_gap_time = MARGINAL_PATH_OFF;
4ae388
+	else if ((conf->marginal_path_err_recheck_gap_time = atoi(buff)) < 1)
4ae388
+		conf->marginal_path_err_recheck_gap_time = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+def_marginal_path_double_failed_time_handler(vector strvec)
4ae388
+{
4ae388
+	char * buff;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		conf->marginal_path_double_failed_time = MARGINAL_PATH_OFF;
4ae388
+	else if ((conf->marginal_path_double_failed_time = atoi(buff)) < 1)
4ae388
+		conf->marginal_path_double_failed_time = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
 
4ae388
 /*
4ae388
  * blacklist block handlers
4ae388
@@ -2055,6 +2130,98 @@ hw_all_tg_pt_handler(vector strvec)
4ae388
 	return 0;
4ae388
 }
4ae388
 
4ae388
+static int
4ae388
+hw_marginal_path_err_sample_time_handler(vector strvec)
4ae388
+{
4ae388
+	struct hwentry *hwe = VECTOR_LAST_SLOT(conf->hwtable);
4ae388
+	char * buff;
4ae388
+
4ae388
+	if (!hwe)
4ae388
+		return 1;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		hwe->marginal_path_err_sample_time = MARGINAL_PATH_OFF;
4ae388
+	else if ((hwe->marginal_path_err_sample_time = atoi(buff)) < 1)
4ae388
+		hwe->marginal_path_err_sample_time = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+hw_marginal_path_err_rate_threshold_handler(vector strvec)
4ae388
+{
4ae388
+	struct hwentry *hwe = VECTOR_LAST_SLOT(conf->hwtable);
4ae388
+	char * buff;
4ae388
+
4ae388
+	if (!hwe)
4ae388
+		return 1;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		hwe->marginal_path_err_rate_threshold = MARGINAL_PATH_OFF;
4ae388
+	else if ((hwe->marginal_path_err_rate_threshold = atoi(buff)) < 1)
4ae388
+		hwe->marginal_path_err_rate_threshold = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+hw_marginal_path_err_recheck_gap_time_handler(vector strvec)
4ae388
+{
4ae388
+	struct hwentry *hwe = VECTOR_LAST_SLOT(conf->hwtable);
4ae388
+	char * buff;
4ae388
+
4ae388
+	if (!hwe)
4ae388
+		return 1;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		hwe->marginal_path_err_recheck_gap_time = MARGINAL_PATH_OFF;
4ae388
+	else if ((hwe->marginal_path_err_recheck_gap_time = atoi(buff)) < 1)
4ae388
+		hwe->marginal_path_err_recheck_gap_time = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+hw_marginal_path_double_failed_time_handler(vector strvec)
4ae388
+{
4ae388
+	struct hwentry *hwe = VECTOR_LAST_SLOT(conf->hwtable);
4ae388
+	char * buff;
4ae388
+
4ae388
+	if (!hwe)
4ae388
+		return 1;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		hwe->marginal_path_double_failed_time = MARGINAL_PATH_OFF;
4ae388
+	else if ((hwe->marginal_path_double_failed_time = atoi(buff)) < 1)
4ae388
+		hwe->marginal_path_double_failed_time = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
 /*
4ae388
  * multipaths block handlers
4ae388
  */
4ae388
@@ -2659,6 +2826,98 @@ mp_ghost_delay_handler(vector strvec)
4ae388
 	return 0;
4ae388
 }
4ae388
 
4ae388
+static int
4ae388
+mp_marginal_path_err_sample_time_handler(vector strvec)
4ae388
+{
4ae388
+	struct mpentry *mpe = VECTOR_LAST_SLOT(conf->mptable);
4ae388
+	char * buff;
4ae388
+
4ae388
+	if (!mpe)
4ae388
+		return 1;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		mpe->marginal_path_err_sample_time = MARGINAL_PATH_OFF;
4ae388
+	else if ((mpe->marginal_path_err_sample_time = atoi(buff)) < 1)
4ae388
+		mpe->marginal_path_err_sample_time = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+mp_marginal_path_err_rate_threshold_handler(vector strvec)
4ae388
+{
4ae388
+	struct mpentry *mpe = VECTOR_LAST_SLOT(conf->mptable);
4ae388
+	char * buff;
4ae388
+
4ae388
+	if (!mpe)
4ae388
+		return 1;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		mpe->marginal_path_err_rate_threshold = MARGINAL_PATH_OFF;
4ae388
+	else if ((mpe->marginal_path_err_rate_threshold = atoi(buff)) < 1)
4ae388
+		mpe->marginal_path_err_rate_threshold = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+mp_marginal_path_err_recheck_gap_time_handler(vector strvec)
4ae388
+{
4ae388
+	struct mpentry *mpe = VECTOR_LAST_SLOT(conf->mptable);
4ae388
+	char * buff;
4ae388
+
4ae388
+	if (!mpe)
4ae388
+		return 1;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		mpe->marginal_path_err_recheck_gap_time = MARGINAL_PATH_OFF;
4ae388
+	else if ((mpe->marginal_path_err_recheck_gap_time = atoi(buff)) < 1)
4ae388
+		mpe->marginal_path_err_recheck_gap_time = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+mp_marginal_path_double_failed_time_handler(vector strvec)
4ae388
+{
4ae388
+	struct mpentry *mpe = VECTOR_LAST_SLOT(conf->mptable);
4ae388
+	char * buff;
4ae388
+
4ae388
+	if (!mpe)
4ae388
+		return 1;
4ae388
+
4ae388
+	buff = set_value(strvec);
4ae388
+	if (!buff)
4ae388
+		return 1;
4ae388
+
4ae388
+	if ((strlen(buff) == 2 && !strcmp(buff, "no")) ||
4ae388
+	    (strlen(buff) == 1 && !strcmp(buff, "0")))
4ae388
+		mpe->marginal_path_double_failed_time = MARGINAL_PATH_OFF;
4ae388
+	else if ((mpe->marginal_path_double_failed_time = atoi(buff)) < 1)
4ae388
+		mpe->marginal_path_double_failed_time = MARGINAL_PATH_OFF;
4ae388
+
4ae388
+	FREE(buff);
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
 /*
4ae388
  * config file keywords printing
4ae388
  */
4ae388
@@ -2989,6 +3248,56 @@ snprint_mp_ghost_delay (char * buff, int
4ae388
 }
4ae388
 
4ae388
 static int
4ae388
+snprint_mp_marginal_path_err_sample_time (char * buff, int len, void * data)
4ae388
+{
4ae388
+	struct mpentry * mpe = (struct mpentry *)data;
4ae388
+
4ae388
+	if (mpe->marginal_path_err_sample_time == MARGINAL_PATH_UNDEF)
4ae388
+		return 0;
4ae388
+	if (mpe->marginal_path_err_sample_time == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d", mpe->marginal_path_err_sample_time);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+snprint_mp_marginal_path_err_rate_threshold (char * buff, int len, void * data)
4ae388
+{
4ae388
+	struct mpentry * mpe = (struct mpentry *)data;
4ae388
+
4ae388
+	if (mpe->marginal_path_err_rate_threshold == MARGINAL_PATH_UNDEF)
4ae388
+		return 0;
4ae388
+	if (mpe->marginal_path_err_rate_threshold == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d", mpe->marginal_path_err_rate_threshold);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+snprint_mp_marginal_path_err_recheck_gap_time (char * buff, int len,
4ae388
+					       void * data)
4ae388
+{
4ae388
+	struct mpentry * mpe = (struct mpentry *)data;
4ae388
+
4ae388
+	if (mpe->marginal_path_err_recheck_gap_time == MARGINAL_PATH_UNDEF)
4ae388
+		return 0;
4ae388
+	if (mpe->marginal_path_err_recheck_gap_time == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d",
4ae388
+			mpe->marginal_path_err_recheck_gap_time);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+snprint_mp_marginal_path_double_failed_time (char * buff, int len, void * data)
4ae388
+{
4ae388
+	struct mpentry * mpe = (struct mpentry *)data;
4ae388
+
4ae388
+	if (mpe->marginal_path_double_failed_time == MARGINAL_PATH_UNDEF)
4ae388
+		return 0;
4ae388
+	if (mpe->marginal_path_double_failed_time == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d", mpe->marginal_path_double_failed_time);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
 snprint_hw_fast_io_fail(char * buff, int len, void * data)
4ae388
 {
4ae388
 	struct hwentry * hwe = (struct hwentry *)data;
4ae388
@@ -3429,6 +3738,55 @@ snprint_hw_all_tg_pt(char * buff, int le
4ae388
 }
4ae388
 
4ae388
 static int
4ae388
+snprint_hw_marginal_path_err_sample_time(char * buff, int len, void * data)
4ae388
+{
4ae388
+	struct hwentry * hwe = (struct hwentry *)data;
4ae388
+
4ae388
+	if (hwe->marginal_path_err_sample_time == MARGINAL_PATH_UNDEF)
4ae388
+		return 0;
4ae388
+	if (hwe->marginal_path_err_sample_time == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d", hwe->marginal_path_err_sample_time);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+snprint_hw_marginal_path_err_rate_threshold(char * buff, int len, void * data)
4ae388
+{
4ae388
+	struct hwentry * hwe = (struct hwentry *)data;
4ae388
+
4ae388
+	if (hwe->marginal_path_err_rate_threshold == MARGINAL_PATH_UNDEF)
4ae388
+		return 0;
4ae388
+	if (hwe->marginal_path_err_rate_threshold == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d", hwe->marginal_path_err_rate_threshold);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+snprint_hw_marginal_path_err_recheck_gap_time(char * buff, int len, void * data)
4ae388
+{
4ae388
+	struct hwentry * hwe = (struct hwentry *)data;
4ae388
+
4ae388
+	if (hwe->marginal_path_err_recheck_gap_time == MARGINAL_PATH_UNDEF)
4ae388
+		return 0;
4ae388
+	if (hwe->marginal_path_err_recheck_gap_time == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d",
4ae388
+			hwe->marginal_path_err_recheck_gap_time);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+snprint_hw_marginal_path_double_failed_time(char * buff, int len, void * data)
4ae388
+{
4ae388
+	struct hwentry * hwe = (struct hwentry *)data;
4ae388
+
4ae388
+	if (hwe->marginal_path_double_failed_time == MARGINAL_PATH_UNDEF)
4ae388
+		return 0;
4ae388
+	if (hwe->marginal_path_double_failed_time == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d", hwe->marginal_path_double_failed_time);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
 snprint_def_polling_interval (char * buff, int len, void * data)
4ae388
 {
4ae388
 	return snprintf(buff, len, "%i", conf->checkint);
4ae388
@@ -3945,6 +4303,46 @@ snprint_def_all_tg_pt(char * buff, int l
4ae388
 }
4ae388
 
4ae388
 static int
4ae388
+snprint_def_marginal_path_err_sample_time(char * buff, int len, void * data)
4ae388
+{
4ae388
+	if (conf->marginal_path_err_sample_time == MARGINAL_PATH_UNDEF ||
4ae388
+	    conf->marginal_path_err_sample_time == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d", conf->marginal_path_err_sample_time);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+snprint_def_marginal_path_err_rate_threshold(char * buff, int len, void * data)
4ae388
+{
4ae388
+	if (conf->marginal_path_err_rate_threshold == MARGINAL_PATH_UNDEF ||
4ae388
+	    conf->marginal_path_err_rate_threshold == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d",
4ae388
+			conf->marginal_path_err_rate_threshold);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+snprint_def_marginal_path_err_recheck_gap_time(char * buff, int len,
4ae388
+					       void * data)
4ae388
+{
4ae388
+	if (conf->marginal_path_err_recheck_gap_time == MARGINAL_PATH_UNDEF ||
4ae388
+	    conf->marginal_path_err_recheck_gap_time == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d",
4ae388
+			conf->marginal_path_err_recheck_gap_time);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
+snprint_def_marginal_path_double_failed_time(char * buff, int len, void * data)
4ae388
+{
4ae388
+	if (conf->marginal_path_double_failed_time == MARGINAL_PATH_UNDEF ||
4ae388
+	    conf->marginal_path_double_failed_time == MARGINAL_PATH_OFF)
4ae388
+		return snprintf(buff, len, "no");
4ae388
+	return snprintf(buff, len, "%d",
4ae388
+			conf->marginal_path_double_failed_time);
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
 snprint_ble_simple (char * buff, int len, void * data)
4ae388
 {
4ae388
 	struct blentry * ble = (struct blentry *)data;
4ae388
@@ -4043,6 +4441,10 @@ init_keywords(void)
4ae388
 	install_keyword("unpriv_sgio", &def_unpriv_sgio_handler, &snprint_def_unpriv_sgio);
4ae388
 	install_keyword("ghost_delay", &def_ghost_delay_handler, &snprint_def_ghost_delay);
4ae388
 	install_keyword("all_tg_pt", &def_all_tg_pt_handler, &snprint_def_all_tg_pt);
4ae388
+	install_keyword("marginal_path_err_sample_time", &def_marginal_path_err_sample_time_handler, &snprint_def_marginal_path_err_sample_time);
4ae388
+	install_keyword("marginal_path_err_rate_threshold", &def_marginal_path_err_rate_threshold_handler, &snprint_def_marginal_path_err_rate_threshold);
4ae388
+	install_keyword("marginal_path_err_recheck_gap_time", &def_marginal_path_err_recheck_gap_time_handler, &snprint_def_marginal_path_err_recheck_gap_time);
4ae388
+	install_keyword("marginal_path_double_failed_time", &def_marginal_path_double_failed_time_handler, &snprint_def_marginal_path_double_failed_time);
4ae388
 	__deprecated install_keyword("default_selector", &def_selector_handler, NULL);
4ae388
 	__deprecated install_keyword("default_path_grouping_policy", &def_pgpolicy_handler, NULL);
4ae388
 	__deprecated install_keyword("default_uid_attribute", &def_uid_attribute_handler, NULL);
4ae388
@@ -4120,6 +4522,10 @@ init_keywords(void)
4ae388
 	install_keyword("unpriv_sgio", &hw_unpriv_sgio_handler, &snprint_hw_unpriv_sgio);
4ae388
 	install_keyword("ghost_delay", &hw_ghost_delay_handler, &snprint_hw_ghost_delay);
4ae388
 	install_keyword("all_tg_pt", &hw_all_tg_pt_handler, &snprint_hw_all_tg_pt);
4ae388
+	install_keyword("marginal_path_err_sample_time", &hw_marginal_path_err_sample_time_handler, &snprint_hw_marginal_path_err_sample_time);
4ae388
+	install_keyword("marginal_path_err_rate_threshold", &hw_marginal_path_err_rate_threshold_handler, &snprint_hw_marginal_path_err_rate_threshold);
4ae388
+	install_keyword("marginal_path_err_recheck_gap_time", &hw_marginal_path_err_recheck_gap_time_handler, &snprint_hw_marginal_path_err_recheck_gap_time);
4ae388
+	install_keyword("marginal_path_double_failed_time", &hw_marginal_path_double_failed_time_handler, &snprint_hw_marginal_path_double_failed_time);
4ae388
 	install_sublevel_end();
4ae388
 
4ae388
 	install_keyword_root("overrides", &nop_handler);
4ae388
@@ -4184,5 +4590,9 @@ init_keywords(void)
4ae388
 	install_keyword("max_sectors_kb", &mp_max_sectors_kb_handler, &snprint_mp_max_sectors_kb);
4ae388
 	install_keyword("unpriv_sgio", &mp_unpriv_sgio_handler, &snprint_mp_unpriv_sgio);
4ae388
 	install_keyword("ghost_delay", &mp_ghost_delay_handler, &snprint_mp_ghost_delay);
4ae388
+	install_keyword("marginal_path_err_sample_time", &mp_marginal_path_err_sample_time_handler, &snprint_mp_marginal_path_err_sample_time);
4ae388
+	install_keyword("marginal_path_err_rate_threshold", &mp_marginal_path_err_rate_threshold_handler, &snprint_mp_marginal_path_err_rate_threshold);
4ae388
+	install_keyword("marginal_path_err_recheck_gap_time", &mp_marginal_path_err_recheck_gap_time_handler, &snprint_mp_marginal_path_err_recheck_gap_time);
4ae388
+	install_keyword("marginal_path_double_failed_time", &mp_marginal_path_double_failed_time_handler, &snprint_mp_marginal_path_double_failed_time);
4ae388
 	install_sublevel_end();
4ae388
 }
4ae388
Index: multipath-tools-130222/libmultipath/io_err_stat.c
4ae388
===================================================================
4ae388
--- /dev/null
4ae388
+++ multipath-tools-130222/libmultipath/io_err_stat.c
4ae388
@@ -0,0 +1,763 @@
4ae388
+/*
4ae388
+ * (C) Copyright HUAWEI Technology Corp. 2017, All Rights Reserved.
4ae388
+ *
4ae388
+ * io_err_stat.c
4ae388
+ * version 1.0
4ae388
+ *
4ae388
+ * IO error stream statistic process for path failure event from kernel
4ae388
+ *
4ae388
+ * Author(s): Guan Junxiong 2017 <guanjunxiong@huawei.com>
4ae388
+ *
4ae388
+ * This file is released under the GPL version 2, or any later version.
4ae388
+ */
4ae388
+
4ae388
+#include <unistd.h>
4ae388
+#include <pthread.h>
4ae388
+#include <signal.h>
4ae388
+#include <fcntl.h>
4ae388
+#include <sys/stat.h>
4ae388
+#include <sys/ioctl.h>
4ae388
+#include <linux/fs.h>
4ae388
+#include <libaio.h>
4ae388
+#include <errno.h>
4ae388
+#include <sys/mman.h>
4ae388
+
4ae388
+#include "vector.h"
4ae388
+#include "memory.h"
4ae388
+#include "checkers.h"
4ae388
+#include "config.h"
4ae388
+#include "structs.h"
4ae388
+#include "structs_vec.h"
4ae388
+#include "devmapper.h"
4ae388
+#include "debug.h"
4ae388
+#include "lock.h"
4ae388
+#include "time-util.h"
4ae388
+#include "io_err_stat.h"
4ae388
+
4ae388
+#define IOTIMEOUT_SEC			60
4ae388
+#define TIMEOUT_NO_IO_NSEC		10000000 /*10ms = 10000000ns*/
4ae388
+#define FLAKY_PATHFAIL_THRESHOLD	2
4ae388
+#define CONCUR_NR_EVENT			32
4ae388
+
4ae388
+#define PATH_IO_ERR_IN_CHECKING		-1
4ae388
+#define PATH_IO_ERR_WAITING_TO_CHECK	-2
4ae388
+
4ae388
+#define io_err_stat_log(prio, fmt, args...) \
4ae388
+	condlog(prio, "io error statistic: " fmt, ##args)
4ae388
+
4ae388
+
4ae388
+struct io_err_stat_pathvec {
4ae388
+	pthread_mutex_t mutex;
4ae388
+	vector		pathvec;
4ae388
+};
4ae388
+
4ae388
+struct dio_ctx {
4ae388
+	struct timespec	io_starttime;
4ae388
+	int		blksize;
4ae388
+	void		*buf;
4ae388
+	struct iocb	io;
4ae388
+};
4ae388
+
4ae388
+struct io_err_stat_path {
4ae388
+	char		devname[FILE_NAME_SIZE];
4ae388
+	int		fd;
4ae388
+	struct dio_ctx	*dio_ctx_array;
4ae388
+	int		io_err_nr;
4ae388
+	int		io_nr;
4ae388
+	struct timespec	start_time;
4ae388
+
4ae388
+	int		total_time;
4ae388
+	int		err_rate_threshold;
4ae388
+};
4ae388
+
4ae388
+pthread_t		io_err_stat_thr;
4ae388
+pthread_attr_t		io_err_stat_attr;
4ae388
+
4ae388
+static pthread_mutex_t io_err_thread_lock = PTHREAD_MUTEX_INITIALIZER;
4ae388
+static pthread_cond_t io_err_thread_cond = PTHREAD_COND_INITIALIZER;
4ae388
+static int io_err_thread_running = 0;
4ae388
+
4ae388
+#define uatomic_read(ptr) __atomic_load_n((ptr), __ATOMIC_SEQ_CST)
4ae388
+#define uatomic_set(ptr, val) __atomic_store_n((ptr), (val), __ATOMIC_SEQ_CST)
4ae388
+
4ae388
+static struct io_err_stat_pathvec *paths;
4ae388
+struct vectors *vecs;
4ae388
+io_context_t	ioctx;
4ae388
+
4ae388
+static void cancel_inflight_io(struct io_err_stat_path *pp);
4ae388
+
4ae388
+struct io_err_stat_path *find_err_path_by_dev(vector pathvec, char *dev)
4ae388
+{
4ae388
+	int i;
4ae388
+	struct io_err_stat_path *pp;
4ae388
+
4ae388
+	if (!pathvec)
4ae388
+		return NULL;
4ae388
+	vector_foreach_slot(pathvec, pp, i)
4ae388
+		if (!strcmp(pp->devname, dev))
4ae388
+			return pp;
4ae388
+
4ae388
+	io_err_stat_log(4, "%s: not found in check queue", dev);
4ae388
+
4ae388
+	return NULL;
4ae388
+}
4ae388
+
4ae388
+static int init_each_dio_ctx(struct dio_ctx *ct, int blksize,
4ae388
+		unsigned long pgsize)
4ae388
+{
4ae388
+	ct->blksize = blksize;
4ae388
+	if (posix_memalign(&ct->buf, pgsize, blksize))
4ae388
+		return 1;
4ae388
+	memset(ct->buf, 0, blksize);
4ae388
+	ct->io_starttime.tv_sec = 0;
4ae388
+	ct->io_starttime.tv_nsec = 0;
4ae388
+
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static void deinit_each_dio_ctx(struct dio_ctx *ct)
4ae388
+{
4ae388
+	if (ct->buf)
4ae388
+		free(ct->buf);
4ae388
+}
4ae388
+
4ae388
+static int setup_directio_ctx(struct io_err_stat_path *p)
4ae388
+{
4ae388
+	unsigned long pgsize = getpagesize();
4ae388
+	char fpath[PATH_MAX];
4ae388
+	int blksize = 0;
4ae388
+	int i;
4ae388
+
4ae388
+	if (snprintf(fpath, PATH_MAX, "/dev/%s", p->devname) >= PATH_MAX)
4ae388
+		return 1;
4ae388
+	if (p->fd < 0)	/* open only when no descriptor is set yet */
4ae388
+		p->fd = open(fpath, O_RDONLY | O_DIRECT);
4ae388
+	if (p->fd < 0)
4ae388
+		return 1;
4ae388
+
4ae388
+	p->dio_ctx_array = MALLOC(sizeof(struct dio_ctx) * CONCUR_NR_EVENT);
4ae388
+	if (!p->dio_ctx_array)
4ae388
+		goto fail_close;
4ae388
+
4ae388
+	if (ioctl(p->fd, BLKBSZGET, &blksize) < 0) {
4ae388
+		io_err_stat_log(4, "%s:cannot get blocksize, set default 512",
4ae388
+				p->devname);
4ae388
+		blksize = 512;
4ae388
+	}
4ae388
+	if (!blksize)
4ae388
+		goto free_pdctx;
4ae388
+
4ae388
+	for (i = 0; i < CONCUR_NR_EVENT; i++) {
4ae388
+		if (init_each_dio_ctx(p->dio_ctx_array + i, blksize, pgsize))
4ae388
+			goto deinit;
4ae388
+	}
4ae388
+	return 0;	/* success: fd open, CONCUR_NR_EVENT buffers ready */
4ae388
+
4ae388
+deinit:
4ae388
+	for (i = 0; i < CONCUR_NR_EVENT; i++)
4ae388
+		deinit_each_dio_ctx(p->dio_ctx_array + i);
4ae388
+free_pdctx:
4ae388
+	FREE(p->dio_ctx_array);
4ae388
+fail_close:
4ae388
+	close(p->fd);
4ae388
+	p->fd = -1;	/* do not leave a closed descriptor number behind */
4ae388
+	return 1;
4ae388
+}
4ae388
+
4ae388
+static void destroy_directio_ctx(struct io_err_stat_path *p)
4ae388
+{
4ae388
+	int i;
4ae388
+
4ae388
+	if (!p || !p->dio_ctx_array)
4ae388
+		return;
4ae388
+	cancel_inflight_io(p);
4ae388
+
4ae388
+	for (i = 0; i < CONCUR_NR_EVENT; i++)
4ae388
+		deinit_each_dio_ctx(p->dio_ctx_array + i);
4ae388
+	FREE(p->dio_ctx_array);
4ae388
+	/* fd is -1 when not open (alloc_io_err_stat_path); 0 is a valid fd */
4ae388
+	if (p->fd >= 0)
4ae388
+		close(p->fd);
4ae388
+}
4ae388
+
4ae388
+static struct io_err_stat_path *alloc_io_err_stat_path(void)
4ae388
+{
4ae388
+	struct io_err_stat_path *p;
4ae388
+
4ae388
+	p = (struct io_err_stat_path *)MALLOC(sizeof(*p));
4ae388
+	if (!p)
4ae388
+		return NULL;
4ae388
+
4ae388
+	memset(p->devname, 0, sizeof(p->devname));
4ae388
+	p->io_err_nr = 0;
4ae388
+	p->io_nr = 0;
4ae388
+	p->total_time = 0;
4ae388
+	p->start_time.tv_sec = 0;
4ae388
+	p->start_time.tv_nsec = 0;
4ae388
+	p->err_rate_threshold = 0;
4ae388
+	p->fd = -1;
4ae388
+
4ae388
+	return p;
4ae388
+}
4ae388
+
4ae388
+static void free_io_err_stat_path(struct io_err_stat_path *p)
4ae388
+{
4ae388
+	FREE(p);
4ae388
+}
4ae388
+
4ae388
+static struct io_err_stat_pathvec *alloc_pathvec(void)
4ae388
+{
4ae388
+	struct io_err_stat_pathvec *p;
4ae388
+	int r;
4ae388
+
4ae388
+	p = (struct io_err_stat_pathvec *)MALLOC(sizeof(*p));
4ae388
+	if (!p)
4ae388
+		return NULL;
4ae388
+	p->pathvec = vector_alloc();
4ae388
+	if (!p->pathvec)
4ae388
+		goto out_free_struct_pathvec;
4ae388
+	r = pthread_mutex_init(&p->mutex, NULL);
4ae388
+	if (r)
4ae388
+		goto out_free_member_pathvec;
4ae388
+
4ae388
+	return p;
4ae388
+
4ae388
+out_free_member_pathvec:
4ae388
+	vector_free(p->pathvec);
4ae388
+out_free_struct_pathvec:
4ae388
+	FREE(p);
4ae388
+	return NULL;
4ae388
+}
4ae388
+
4ae388
+static void free_io_err_pathvec(struct io_err_stat_pathvec *p)
4ae388
+{
4ae388
+	struct io_err_stat_path *path;
4ae388
+	int i;
4ae388
+
4ae388
+	if (!p)		/* tolerate a pathvec that was never allocated */
4ae388
+		return;
4ae388
+	pthread_mutex_destroy(&p->mutex);
4ae388
+	if (p->pathvec) {	/* release every queued path, then the vector */
4ae388
+		vector_foreach_slot(p->pathvec, path, i) {
4ae388
+			destroy_directio_ctx(path);
4ae388
+			free_io_err_stat_path(path);
4ae388
+		}
4ae388
+		vector_free(p->pathvec);
4ae388
+	}
4ae388
+	FREE(p);
4ae388
+}
4ae388
+
4ae388
+/*
4ae388
+ * return value
4ae388
+ * 0: enqueue OK
4ae388
+ * 1: fails because of internal error
4ae388
+ */
4ae388
+static int enqueue_io_err_stat_by_path(struct path *path)
4ae388
+{
4ae388
+	struct io_err_stat_path *p;
4ae388
+
4ae388
+	pthread_mutex_lock(&paths->mutex);
4ae388
+	p = find_err_path_by_dev(paths->pathvec, path->dev);
4ae388
+	if (p) {
4ae388
+		pthread_mutex_unlock(&paths->mutex);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	pthread_mutex_unlock(&paths->mutex);
4ae388
+
4ae388
+	p = alloc_io_err_stat_path();
4ae388
+	if (!p)
4ae388
+		return 1;
4ae388
+
4ae388
+	memcpy(p->devname, path->dev, sizeof(p->devname));
4ae388
+	p->total_time = path->mpp->marginal_path_err_sample_time;
4ae388
+	p->err_rate_threshold = path->mpp->marginal_path_err_rate_threshold;
4ae388
+
4ae388
+	if (setup_directio_ctx(p))
4ae388
+		goto free_ioerr_path;
4ae388
+	pthread_mutex_lock(&paths->mutex);
4ae388
+	if (!vector_alloc_slot(paths->pathvec))
4ae388
+		goto unlock_destroy;
4ae388
+	vector_set_slot(paths->pathvec, p);
4ae388
+	pthread_mutex_unlock(&paths->mutex);
4ae388
+
4ae388
+	io_err_stat_log(2, "%s: enqueue path %s to check",
4ae388
+			path->mpp->alias, path->dev);
4ae388
+	return 0;
4ae388
+
4ae388
+unlock_destroy:
4ae388
+	pthread_mutex_unlock(&paths->mutex);
4ae388
+	destroy_directio_ctx(p);
4ae388
+free_ioerr_path:
4ae388
+	free_io_err_stat_path(p);
4ae388
+
4ae388
+	return 1;
4ae388
+}
4ae388
+
4ae388
+int io_err_stat_handle_pathfail(struct path *path)
4ae388
+{
4ae388
+	struct timespec curr_time;
4ae388
+
4ae388
+	if (uatomic_read(&io_err_thread_running) == 0)
4ae388
+		return 1;
4ae388
+
4ae388
+	if (path->io_err_disable_reinstate) {
4ae388
+		io_err_stat_log(3, "%s: reinstate is already disabled",
4ae388
+				path->dev);
4ae388
+		return 1;
4ae388
+	}
4ae388
+	if (path->io_err_pathfail_cnt < 0)
4ae388
+		return 1;
4ae388
+
4ae388
+	if (!path->mpp)
4ae388
+		return 1;
4ae388
+	if (path->mpp->marginal_path_double_failed_time <= 0 ||
4ae388
+		path->mpp->marginal_path_err_sample_time <= 0 ||
4ae388
+		path->mpp->marginal_path_err_recheck_gap_time <= 0 ||
4ae388
+		path->mpp->marginal_path_err_rate_threshold < 0) {
4ae388
+		io_err_stat_log(4, "%s: parameter not set", path->mpp->alias);
4ae388
+		return 1;
4ae388
+	}
4ae388
+	if (path->mpp->marginal_path_err_sample_time < (2 * IOTIMEOUT_SEC)) {
4ae388
+		io_err_stat_log(2, "%s: marginal_path_err_sample_time should not less than %d",
4ae388
+				path->mpp->alias, 2 * IOTIMEOUT_SEC);
4ae388
+		return 1;
4ae388
+	}
4ae388
+	/*
4ae388
+	 * The test should only be started for paths that have failed
4ae388
+	 * repeatedly in a certain time frame, so that we have reason
4ae388
+	 * to assume they're flaky. Without bothering the admin to configure
4ae388
+	 * the repeated count threshold and time frame, we assume a path
4ae388
+	 * which fails at least twice within 60 seconds is flaky.
4ae388
+	 */
4ae388
+	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
4ae388
+		return 1;
4ae388
+	if (path->io_err_pathfail_cnt == 0) {
4ae388
+		path->io_err_pathfail_cnt++;
4ae388
+		path->io_err_pathfail_starttime = curr_time.tv_sec;
4ae388
+		io_err_stat_log(5, "%s: start path flakiness pre-checking",
4ae388
+				path->dev);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	if ((curr_time.tv_sec - path->io_err_pathfail_starttime) >
4ae388
+			path->mpp->marginal_path_double_failed_time) {
4ae388
+		path->io_err_pathfail_cnt = 0;
4ae388
+		path->io_err_pathfail_starttime = curr_time.tv_sec;
4ae388
+		io_err_stat_log(5, "%s: restart path flakiness pre-checking",
4ae388
+				path->dev);
4ae388
+	}
4ae388
+	path->io_err_pathfail_cnt++;
4ae388
+	if (path->io_err_pathfail_cnt >= FLAKY_PATHFAIL_THRESHOLD) {
4ae388
+		path->io_err_disable_reinstate = 1;
4ae388
+		path->io_err_pathfail_cnt = PATH_IO_ERR_WAITING_TO_CHECK;
4ae388
+		/* enqueue path as soon as it comes up */
4ae388
+		path->io_err_dis_reinstate_time = 0;
4ae388
+		if (path->state != PATH_DOWN) {
4ae388
+			int oldstate = path->state;
4ae388
+			io_err_stat_log(2, "%s: mark as failed", path->dev);
4ae388
+			path->mpp->stat_path_failures++;
4ae388
+			path->state = PATH_DOWN;
4ae388
+			path->dmstate = PSTATE_FAILED;
4ae388
+			if (oldstate == PATH_UP || oldstate == PATH_GHOST)
4ae388
+				update_queue_mode_del_path(path->mpp);
4ae388
+			if (path->tick > conf->checkint)
4ae388
+				path->tick = conf->checkint;
4ae388
+		}
4ae388
+	}
4ae388
+
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+int need_io_err_check(struct path *pp)
4ae388
+{
4ae388
+	struct timespec curr_time;
4ae388
+	int r;
4ae388
+
4ae388
+	if (uatomic_read(&io_err_thread_running) == 0)
4ae388
+		return 0;
4ae388
+	if (pp->mpp->nr_active <= 0) {
4ae388
+		io_err_stat_log(2, "%s: recover path early", pp->dev);
4ae388
+		goto recover;
4ae388
+	}
4ae388
+	if (pp->io_err_pathfail_cnt != PATH_IO_ERR_WAITING_TO_CHECK)
4ae388
+		return 1;
4ae388
+	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0 ||
4ae388
+	    (curr_time.tv_sec - pp->io_err_dis_reinstate_time) >
4ae388
+			pp->mpp->marginal_path_err_recheck_gap_time) {
4ae388
+		io_err_stat_log(4, "%s: reschedule checking after %d seconds",
4ae388
+				pp->dev,
4ae388
+				pp->mpp->marginal_path_err_recheck_gap_time);
4ae388
+		r = enqueue_io_err_stat_by_path(pp);
4ae388
+		/*
4ae388
+		 * Enqueue fails because of internal error.
4ae388
+		 * In this case, we recover this path.
4ae388
+		 * Otherwise, return 1 to set path state to PATH_SHAKY.
4ae388
+		 */
4ae388
+		if (r == 1) {
4ae388
+			io_err_stat_log(3, "%s: enqueue fails, recovering",
4ae388
+					pp->dev);
4ae388
+			goto recover;
4ae388
+		} else
4ae388
+			pp->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING;
4ae388
+	}
4ae388
+
4ae388
+	return 1;
4ae388
+
4ae388
+recover:
4ae388
+	pp->io_err_pathfail_cnt = 0;
4ae388
+	pp->io_err_disable_reinstate = 0;
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int delete_io_err_stat_by_addr(struct io_err_stat_path *p)
4ae388
+{
4ae388
+	int i;
4ae388
+
4ae388
+	i = find_slot(paths->pathvec, p);
4ae388
+	if (i != -1)
4ae388
+		vector_del_slot(paths->pathvec, i);
4ae388
+
4ae388
+	destroy_directio_ctx(p);
4ae388
+	free_io_err_stat_path(p);
4ae388
+
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static void account_async_io_state(struct io_err_stat_path *pp, int rc)
4ae388
+{
4ae388
+	switch (rc) {
4ae388
+	case PATH_DOWN:
4ae388
+		pp->io_err_nr++;
4ae388
+		break;
4ae388
+	case PATH_UNCHECKED:
4ae388
+	case PATH_UP:
4ae388
+	case PATH_PENDING:
4ae388
+		break;
4ae388
+	default:
4ae388
+		break;
4ae388
+	}
4ae388
+}
4ae388
+
4ae388
+static int poll_io_err_stat(struct vectors *vecs, struct io_err_stat_path *pp)
4ae388
+{
4ae388
+	struct timespec currtime, difftime;
4ae388
+	struct path *path;
4ae388
+	double err_rate;
4ae388
+
4ae388
+	if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
4ae388
+		return 1;
4ae388
+	timespecsub(&currtime, &pp->start_time, &difftime);
4ae388
+	if (difftime.tv_sec < pp->total_time)
4ae388
+		return 0;
4ae388
+
4ae388
+	io_err_stat_log(4, "%s: check end", pp->devname);
4ae388
+
4ae388
+	err_rate = pp->io_nr == 0 ? 0 : (pp->io_err_nr * 1000.0f) / pp->io_nr;
4ae388
+	io_err_stat_log(3, "%s: IO error rate (%.1f/1000)",
4ae388
+			pp->devname, err_rate);
4ae388
+	pthread_cleanup_push(cleanup_lock, &vecs->lock);
4ae388
+	lock(vecs->lock);
4ae388
+	pthread_testcancel();
4ae388
+	path = find_path_by_dev(vecs->pathvec, pp->devname);
4ae388
+	if (!path) {
4ae388
+		io_err_stat_log(4, "path %s not found'", pp->devname);
4ae388
+	} else if (err_rate <= pp->err_rate_threshold) {
4ae388
+		path->io_err_pathfail_cnt = 0;
4ae388
+		path->io_err_disable_reinstate = 0;
4ae388
+		io_err_stat_log(3, "%s: (%d/%d) good to enable reinstating",
4ae388
+				pp->devname, pp->io_err_nr, pp->io_nr);
4ae388
+		/*
4ae388
+		 * schedule path check as soon as possible to
4ae388
+		 * update path state. Do NOT reinstate dm path here
4ae388
+		 */
4ae388
+		path->tick = 1;
4ae388
+
4ae388
+	} else if (path->mpp && path->mpp->nr_active > 0) {
4ae388
+		io_err_stat_log(3, "%s: keep failing the dm path %s",
4ae388
+				path->mpp->alias, path->dev);
4ae388
+		path->io_err_pathfail_cnt = PATH_IO_ERR_WAITING_TO_CHECK;
4ae388
+		path->io_err_disable_reinstate = 1;
4ae388
+		path->io_err_dis_reinstate_time = currtime.tv_sec;
4ae388
+		io_err_stat_log(3, "%s: disable reinstating of %s",
4ae388
+				path->mpp->alias, path->dev);
4ae388
+	} else {
4ae388
+		path->io_err_pathfail_cnt = 0;
4ae388
+		path->io_err_disable_reinstate = 0;
4ae388
+		io_err_stat_log(3, "%s: there is orphan path, enable reinstating",
4ae388
+				pp->devname);
4ae388
+	}
4ae388
+	lock_cleanup_pop(vecs->lock);
4ae388
+
4ae388
+	delete_io_err_stat_by_addr(pp);
4ae388
+
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+static int send_each_async_io(struct dio_ctx *ct, int fd, char *dev)
4ae388
+{
4ae388
+	int rc = -1;	/* -1: nothing submitted (slot busy or error); 0: submitted */
4ae388
+
4ae388
+	if (ct->io_starttime.tv_nsec == 0 &&
4ae388
+			ct->io_starttime.tv_sec == 0) {	/* zero start time: slot idle */
4ae388
+		struct iocb *ios[1] = { &ct->io };
4ae388
+
4ae388
+		if (clock_gettime(CLOCK_MONOTONIC, &ct->io_starttime) != 0) {
4ae388
+			ct->io_starttime.tv_sec = 0;
4ae388
+			ct->io_starttime.tv_nsec = 0;
4ae388
+			return rc;
4ae388
+		}
4ae388
+		io_prep_pread(&ct->io, fd, ct->buf, ct->blksize, 0);
4ae388
+		rc = io_submit(ioctx, 1, ios);	/* libaio: -errno on failure */
4ae388
+		if (rc != 1) {
4ae388
+			io_err_stat_log(5, "%s: io_submit error %i", dev, -rc);
4ae388
+			return -1;
4ae388
+		}
4ae388
+		rc = 0;
4ae388
+	}
4ae388
+
4ae388
+	return rc;
4ae388
+}
4ae388
+
4ae388
+static void send_batch_async_ios(struct io_err_stat_path *pp)
4ae388
+{
4ae388
+	int i;
4ae388
+	struct dio_ctx *ct;
4ae388
+	struct timespec currtime, difftime;
4ae388
+
4ae388
+	if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
4ae388
+		return;
4ae388
+	/*
4ae388
+	 * Give a free time for all IO to complete or timeout
4ae388
+	 */
4ae388
+	if (pp->start_time.tv_sec != 0) {
4ae388
+		timespecsub(&currtime, &pp->start_time, &difftime);
4ae388
+		if (difftime.tv_sec + IOTIMEOUT_SEC >= pp->total_time)
4ae388
+			return;
4ae388
+	}
4ae388
+
4ae388
+	for (i = 0; i < CONCUR_NR_EVENT; i++) {
4ae388
+		ct = pp->dio_ctx_array + i;
4ae388
+		if (!send_each_async_io(ct, pp->fd, pp->devname))
4ae388
+			pp->io_nr++;
4ae388
+	}
4ae388
+	if (pp->start_time.tv_sec == 0 && pp->start_time.tv_nsec == 0 &&
4ae388
+		clock_gettime(CLOCK_MONOTONIC, &pp->start_time)) {
4ae388
+		pp->start_time.tv_sec = 0;
4ae388
+		pp->start_time.tv_nsec = 0;
4ae388
+	}
4ae388
+}
4ae388
+
4ae388
+static int try_to_cancel_timeout_io(struct dio_ctx *ct, struct timespec *t,
4ae388
+		char *dev)
4ae388
+{
4ae388
+	struct timespec	difftime;
4ae388
+	struct io_event	event;
4ae388
+	int		rc = PATH_UNCHECKED;	/* returned as-is for an idle slot */
4ae388
+	int		r;
4ae388
+
4ae388
+	if (ct->io_starttime.tv_sec == 0 && ct->io_starttime.tv_nsec == 0)
4ae388
+		return rc;
4ae388
+	timespecsub(t, &ct->io_starttime, &difftime);
4ae388
+	if (difftime.tv_sec > IOTIMEOUT_SEC) {
4ae388
+		struct iocb *ios[1] = { &ct->io };
4ae388
+
4ae388
+		io_err_stat_log(5, "%s: abort check on timeout", dev);
4ae388
+		r = io_cancel(ioctx, ios[0], &event);
4ae388
+		if (r)	/* libaio returns -errno and leaves errno untouched */
4ae388
+			io_err_stat_log(5, "%s: io_cancel error %i",
4ae388
+					dev, -r);
4ae388
+		ct->io_starttime.tv_sec = 0;	/* mark the slot idle again */
4ae388
+		ct->io_starttime.tv_nsec = 0;
4ae388
+		rc = PATH_DOWN;
4ae388
+	} else {
4ae388
+		rc = PATH_PENDING;
4ae388
+	}
4ae388
+
4ae388
+	return rc;
4ae388
+}
4ae388
+
4ae388
+static void poll_async_io_timeout(void)
4ae388
+{
4ae388
+	struct io_err_stat_path *pp;
4ae388
+	struct timespec curr_time;
4ae388
+	int		rc = PATH_UNCHECKED;
4ae388
+	int		i, j;
4ae388
+
4ae388
+	if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
4ae388
+		return;
4ae388
+	vector_foreach_slot(paths->pathvec, pp, i) {
4ae388
+		for (j = 0; j < CONCUR_NR_EVENT; j++) {
4ae388
+			rc = try_to_cancel_timeout_io(pp->dio_ctx_array + j,
4ae388
+					&curr_time, pp->devname);
4ae388
+			account_async_io_state(pp, rc);
4ae388
+		}
4ae388
+	}
4ae388
+}
4ae388
+
4ae388
+static void cancel_inflight_io(struct io_err_stat_path *pp)
4ae388
+{
4ae388
+	struct io_event event;
4ae388
+	int i, r;
4ae388
+
4ae388
+	for (i = 0; i < CONCUR_NR_EVENT; i++) {
4ae388
+		struct dio_ctx *ct = pp->dio_ctx_array + i;
4ae388
+		struct iocb *ios[1] = { &ct->io };
4ae388
+
4ae388
+		if (ct->io_starttime.tv_sec == 0
4ae388
+				&& ct->io_starttime.tv_nsec == 0)
4ae388
+			continue;
4ae388
+		io_err_stat_log(5, "%s: abort infligh io",
4ae388
+				pp->devname);
4ae388
+		r = io_cancel(ioctx, ios[0], &event);
4ae388
+		if (r)
4ae388
+			io_err_stat_log(5, "%s: io_cancel error %d, %i",
4ae388
+					pp->devname, r, errno);
4ae388
+		ct->io_starttime.tv_sec = 0;
4ae388
+		ct->io_starttime.tv_nsec = 0;
4ae388
+	}
4ae388
+}
4ae388
+
4ae388
+static inline int handle_done_dio_ctx(struct dio_ctx *ct, struct io_event *ev)
4ae388
+{
4ae388
+	ct->io_starttime.tv_sec = 0;
4ae388
+	ct->io_starttime.tv_nsec = 0;
4ae388
+	return (ev->res == ct->blksize) ? PATH_UP : PATH_DOWN;
4ae388
+}
4ae388
+
4ae388
+static void handle_async_io_done_event(struct io_event *io_evt)
4ae388
+{
4ae388
+	struct io_err_stat_path *pp;
4ae388
+	struct dio_ctx *ct;
4ae388
+	int rc = PATH_UNCHECKED;
4ae388
+	int i, j;
4ae388
+
4ae388
+	vector_foreach_slot(paths->pathvec, pp, i) {
4ae388
+		for (j = 0; j < CONCUR_NR_EVENT; j++) {
4ae388
+			ct = pp->dio_ctx_array + j;
4ae388
+			if (&ct->io == io_evt->obj) {
4ae388
+				rc = handle_done_dio_ctx(ct, io_evt);
4ae388
+				account_async_io_state(pp, rc);
4ae388
+				return;
4ae388
+			}
4ae388
+		}
4ae388
+	}
4ae388
+}
4ae388
+
4ae388
+static void process_async_ios_event(int timeout_nsecs, char *dev)
4ae388
+{
4ae388
+	struct io_event events[CONCUR_NR_EVENT];
4ae388
+	int		i, n;
4ae388
+	struct timespec	timeout = { .tv_nsec = timeout_nsecs };
4ae388
+
4ae388
+	errno = 0;
4ae388
+	n = io_getevents(ioctx, 1L, CONCUR_NR_EVENT, events, &timeout);
4ae388
+	if (n < 0) {	/* libaio reports failure as -errno, errno is not set */
4ae388
+		io_err_stat_log(3, "%s: async io events returned %d (errno=%s)",
4ae388
+				dev, n, strerror(-n));
4ae388
+	} else {	/* n completed reads: account each one to its path */
4ae388
+		for (i = 0; i < n; i++)
4ae388
+			handle_async_io_done_event(&events[i]);
4ae388
+	}
4ae388
+}
4ae388
+
4ae388
+static void service_paths(void)
4ae388
+{
4ae388
+	struct io_err_stat_path *pp;
4ae388
+	int i;
4ae388
+
4ae388
+	pthread_mutex_lock(&paths->mutex);
4ae388
+	vector_foreach_slot(paths->pathvec, pp, i) {
4ae388
+		send_batch_async_ios(pp);
4ae388
+		process_async_ios_event(TIMEOUT_NO_IO_NSEC, pp->devname);
4ae388
+		poll_async_io_timeout();
4ae388
+		poll_io_err_stat(vecs, pp);
4ae388
+	}
4ae388
+	pthread_mutex_unlock(&paths->mutex);
4ae388
+}
4ae388
+
4ae388
+static void cleanup_unlock(void *arg)
4ae388
+{
4ae388
+	pthread_mutex_unlock((pthread_mutex_t*) arg);
4ae388
+}
4ae388
+
4ae388
+static void cleanup_exited(void *arg)
4ae388
+{
4ae388
+	uatomic_set(&io_err_thread_running, 0);
4ae388
+}
4ae388
+
4ae388
+static void *io_err_stat_loop(void *data)
4ae388
+{
4ae388
+	vecs = (struct vectors *)data;
4ae388
+
4ae388
+	pthread_cleanup_push(cleanup_exited, NULL);
4ae388
+
4ae388
+	mlockall(MCL_CURRENT | MCL_FUTURE);
4ae388
+
4ae388
+	pthread_mutex_lock(&io_err_thread_lock);
4ae388
+	uatomic_set(&io_err_thread_running, 1);
4ae388
+	pthread_cond_broadcast(&io_err_thread_cond);
4ae388
+	pthread_mutex_unlock(&io_err_thread_lock);
4ae388
+
4ae388
+	while (1) {
4ae388
+		service_paths();
4ae388
+		usleep(100000);
4ae388
+	}
4ae388
+
4ae388
+	pthread_cleanup_pop(1);
4ae388
+	return NULL;
4ae388
+}
4ae388
+
4ae388
+int start_io_err_stat_thread(void *data)
4ae388
+{
4ae388
+	int ret;
4ae388
+
4ae388
+	if (uatomic_read(&io_err_thread_running) == 1)
4ae388
+		return 0;
4ae388
+
4ae388
+	if (io_setup(CONCUR_NR_EVENT, &ioctx) != 0) {
4ae388
+		io_err_stat_log(4, "io_setup failed");
4ae388
+		return 1;
4ae388
+	}
4ae388
+	paths = alloc_pathvec();
4ae388
+	if (!paths)
4ae388
+		goto destroy_ctx;
4ae388
+
4ae388
+	pthread_mutex_lock(&io_err_thread_lock);
4ae388
+	pthread_cleanup_push(cleanup_unlock, &io_err_thread_lock);
4ae388
+
4ae388
+	ret = pthread_create(&io_err_stat_thr, &io_err_stat_attr,
4ae388
+			     io_err_stat_loop, data);
4ae388
+
4ae388
+	while (!ret && !uatomic_read(&io_err_thread_running) &&
4ae388
+	       pthread_cond_wait(&io_err_thread_cond,
4ae388
+				 &io_err_thread_lock) == 0);
4ae388
+
4ae388
+	pthread_cleanup_pop(1);
4ae388
+
4ae388
+	if (ret) {
4ae388
+		io_err_stat_log(0, "cannot create io_error statistic thread");
4ae388
+		goto out_free;
4ae388
+	}
4ae388
+
4ae388
+	io_err_stat_log(2, "io_error statistic thread started");
4ae388
+	return 0;
4ae388
+
4ae388
+out_free:
4ae388
+	free_io_err_pathvec(paths);
4ae388
+destroy_ctx:
4ae388
+	io_destroy(ioctx);
4ae388
+	io_err_stat_log(0, "failed to start io_error statistic thread");
4ae388
+	return 1;
4ae388
+}
4ae388
+
4ae388
+void stop_io_err_stat_thread(void)
4ae388
+{
4ae388
+	if (io_err_stat_thr == (pthread_t)0)
4ae388
+		return;
4ae388
+
4ae388
+	if (uatomic_read(&io_err_thread_running) == 1)
4ae388
+		pthread_cancel(io_err_stat_thr);
4ae388
+
4ae388
+	pthread_join(io_err_stat_thr, NULL);
4ae388
+	free_io_err_pathvec(paths);
4ae388
+	io_destroy(ioctx);
4ae388
+}
4ae388
Index: multipath-tools-130222/libmultipath/io_err_stat.h
4ae388
===================================================================
4ae388
--- /dev/null
4ae388
+++ multipath-tools-130222/libmultipath/io_err_stat.h
4ae388
@@ -0,0 +1,15 @@
4ae388
+#ifndef _IO_ERR_STAT_H
4ae388
+#define _IO_ERR_STAT_H
4ae388
+
4ae388
+#include "vector.h"
4ae388
+#include "lock.h"
4ae388
+
4ae388
+
4ae388
+extern pthread_attr_t io_err_stat_attr;
4ae388
+
4ae388
+int start_io_err_stat_thread(void *data);
4ae388
+void stop_io_err_stat_thread(void);
4ae388
+int io_err_stat_handle_pathfail(struct path *path);
4ae388
+int need_io_err_check(struct path *pp);
4ae388
+
4ae388
+#endif /* _IO_ERR_STAT_H */
4ae388
Index: multipath-tools-130222/libmultipath/propsel.c
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/propsel.c
4ae388
+++ multipath-tools-130222/libmultipath/propsel.c
4ae388
@@ -956,6 +956,104 @@ select_delay_wait_checks (struct multipa
4ae388
 }
4ae388
 
4ae388
 extern int
4ae388
+select_marginal_path_err_sample_time(struct multipath * mp)
4ae388
+{
4ae388
+	if (mp->mpe &&
4ae388
+	    mp->mpe->marginal_path_err_sample_time != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_err_sample_time = mp->mpe->marginal_path_err_sample_time;
4ae388
+		condlog(3, "marginal_path_err_sample_time = %i (multipath setting)", mp->marginal_path_err_sample_time);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	if (mp->hwe &&
4ae388
+	    mp->hwe->marginal_path_err_sample_time != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_err_sample_time = mp->hwe->marginal_path_err_sample_time;
4ae388
+		condlog(3, "marginal_path_err_sample_time = %i (controler setting)", mp->marginal_path_err_sample_time);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	if (conf->marginal_path_err_sample_time != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_err_sample_time = conf->marginal_path_err_sample_time;
4ae388
+		condlog(3, "marginal_path_err_sample_time = %i (config file default)", mp->marginal_path_err_sample_time);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	mp->marginal_path_err_sample_time = DEFAULT_DELAY_CHECKS;
4ae388
+	condlog(3, "marginal_path_err_sample_time = DISABLED (internal default)");
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+extern int
4ae388
+select_marginal_path_err_rate_threshold(struct multipath * mp)
4ae388
+{
4ae388
+	if (mp->mpe &&
4ae388
+	    mp->mpe->marginal_path_err_rate_threshold != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_err_rate_threshold = mp->mpe->marginal_path_err_rate_threshold;
4ae388
+		condlog(3, "marginal_path_err_rate_threshold = %i (multipath setting)", mp->marginal_path_err_rate_threshold);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	if (mp->hwe &&
4ae388
+	    mp->hwe->marginal_path_err_rate_threshold != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_err_rate_threshold = mp->hwe->marginal_path_err_rate_threshold;
4ae388
+		condlog(3, "marginal_path_err_rate_threshold = %i (controler setting)", mp->marginal_path_err_rate_threshold);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	if (conf->marginal_path_err_rate_threshold != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_err_rate_threshold = conf->marginal_path_err_rate_threshold;
4ae388
+		condlog(3, "marginal_path_err_rate_threshold = %i (config file default)", mp->marginal_path_err_rate_threshold);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	mp->marginal_path_err_rate_threshold = DEFAULT_DELAY_CHECKS;
4ae388
+	condlog(3, "marginal_path_err_rate_threshold = DISABLED (internal default)");
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+extern int
4ae388
+select_marginal_path_err_recheck_gap_time(struct multipath * mp)
4ae388
+{
4ae388
+	if (mp->mpe && mp->mpe->marginal_path_err_recheck_gap_time != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_err_recheck_gap_time = mp->mpe->marginal_path_err_recheck_gap_time;
4ae388
+		condlog(3, "marginal_path_err_recheck_gap_time = %i (multipath setting)", mp->marginal_path_err_recheck_gap_time);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	if (mp->hwe && mp->hwe->marginal_path_err_recheck_gap_time != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_err_recheck_gap_time = mp->hwe->marginal_path_err_recheck_gap_time;
4ae388
+		condlog(3, "marginal_path_err_recheck_gap_time = %i (controler setting)", mp->marginal_path_err_recheck_gap_time);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	if (conf->marginal_path_err_recheck_gap_time != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_err_recheck_gap_time = conf->marginal_path_err_recheck_gap_time;
4ae388
+		condlog(3, "marginal_path_err_recheck_gap_time = %i (config file default)", mp->marginal_path_err_recheck_gap_time);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	mp->marginal_path_err_recheck_gap_time = DEFAULT_DELAY_CHECKS;
4ae388
+	condlog(3, "marginal_path_err_recheck_gap_time = DISABLED (internal default)");
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+extern int
4ae388
+select_marginal_path_double_failed_time(struct multipath * mp)
4ae388
+{
4ae388
+	if (mp->mpe &&
4ae388
+	    mp->mpe->marginal_path_double_failed_time != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_double_failed_time = mp->mpe->marginal_path_double_failed_time;
4ae388
+		condlog(3, "marginal_path_double_failed_time = %i (multipath setting)", mp->marginal_path_double_failed_time);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	if (mp->hwe &&
4ae388
+	    mp->hwe->marginal_path_double_failed_time != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_double_failed_time = mp->hwe->marginal_path_double_failed_time;
4ae388
+		condlog(3, "marginal_path_double_failed_time = %i (controler setting)", mp->marginal_path_double_failed_time);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	if (conf->marginal_path_double_failed_time != MARGINAL_PATH_UNDEF) {
4ae388
+		mp->marginal_path_double_failed_time = conf->marginal_path_double_failed_time;
4ae388
+		condlog(3, "marginal_path_double_failed_time = %i (config file default)", mp->marginal_path_double_failed_time);
4ae388
+		return 0;
4ae388
+	}
4ae388
+	mp->marginal_path_double_failed_time = DEFAULT_DELAY_CHECKS;
4ae388
+	condlog(3, "marginal_path_double_failed_time = DISABLED (internal default)");
4ae388
+	return 0;
4ae388
+}
4ae388
+
4ae388
+extern int
4ae388
 select_skip_kpartx (struct multipath * mp)
4ae388
 {
4ae388
 	if (mp->mpe && mp->mpe->skip_kpartx != SKIP_KPARTX_UNDEF) {
4ae388
Index: multipath-tools-130222/libmultipath/propsel.h
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/propsel.h
4ae388
+++ multipath-tools-130222/libmultipath/propsel.h
4ae388
@@ -24,6 +24,10 @@ int select_detect_checker(struct path *
4ae388
 int select_deferred_remove(struct multipath *mp);
4ae388
 int select_delay_watch_checks (struct multipath * mp);
4ae388
 int select_delay_wait_checks (struct multipath * mp);
4ae388
+int select_marginal_path_err_sample_time(struct multipath *mp);
4ae388
+int select_marginal_path_err_rate_threshold(struct multipath *mp);
4ae388
+int select_marginal_path_err_recheck_gap_time(struct multipath *mp);
4ae388
+int select_marginal_path_double_failed_time(struct multipath *mp);
4ae388
 int select_skip_kpartx (struct multipath * mp);
4ae388
 int select_max_sectors_kb (struct multipath * mp);
4ae388
 int select_unpriv_sgio (struct multipath * mp);
4ae388
Index: multipath-tools-130222/libmultipath/structs.h
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/structs.h
4ae388
+++ multipath-tools-130222/libmultipath/structs.h
4ae388
@@ -3,6 +3,7 @@
4ae388
 
4ae388
 #include <sys/types.h>
4ae388
 #include <inttypes.h>
4ae388
+#include <time.h>
4ae388
 
4ae388
 #include "prio.h"
4ae388
 #include "byteorder.h"
4ae388
@@ -176,6 +177,11 @@ enum delay_checks_states {
4ae388
 	DELAY_CHECKS_UNDEF = 0,
4ae388
 };
4ae388
 
4ae388
+enum marginal_path_states {
4ae388
+	MARGINAL_PATH_OFF = -1,
4ae388
+	MARGINAL_PATH_UNDEF = 0,
4ae388
+};
4ae388
+
4ae388
 enum missing_udev_info_states {
4ae388
 	INFO_OK,
4ae388
 	INFO_MISSING,
4ae388
@@ -252,6 +258,10 @@ struct path {
4ae388
 	int missing_udev_info;
4ae388
 	int retriggers;
4ae388
 	int wwid_changed;
4ae388
+	time_t io_err_dis_reinstate_time;
4ae388
+	int io_err_disable_reinstate;
4ae388
+	int io_err_pathfail_cnt;
4ae388
+	int io_err_pathfail_starttime;
4ae388
 
4ae388
 	/* configlet pointers */
4ae388
 	struct hwentry * hwe;
4ae388
@@ -285,6 +295,10 @@ struct multipath {
4ae388
 	int deferred_remove;
4ae388
 	int delay_watch_checks;
4ae388
 	int delay_wait_checks;
4ae388
+	int marginal_path_err_sample_time;
4ae388
+	int marginal_path_err_rate_threshold;
4ae388
+	int marginal_path_err_recheck_gap_time;
4ae388
+	int marginal_path_double_failed_time;
4ae388
 	int force_udev_reload;
4ae388
 	int skip_kpartx;
4ae388
 	int max_sectors_kb;
4ae388
Index: multipath-tools-130222/libmultipath/time-util.c
4ae388
===================================================================
4ae388
--- /dev/null
4ae388
+++ multipath-tools-130222/libmultipath/time-util.c
4ae388
@@ -0,0 +1,42 @@
4ae388
+#include <assert.h>
4ae388
+#include <pthread.h>
4ae388
+#include <time.h>
4ae388
+#include "time-util.h"
4ae388
+
4ae388
+/* Initialize @cond as a condition variable that uses the monotonic clock */
4ae388
+void pthread_cond_init_mono(pthread_cond_t *cond)
4ae388
+{
4ae388
+	pthread_condattr_t attr;
4ae388
+	int res;
4ae388
+
4ae388
+	res = pthread_condattr_init(&attr);
4ae388
+	assert(res == 0);
4ae388
+	res = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
4ae388
+	assert(res == 0);
4ae388
+	res = pthread_cond_init(cond, &attr);
4ae388
+	assert(res == 0);
4ae388
+	res = pthread_condattr_destroy(&attr);
4ae388
+	assert(res == 0);
4ae388
+}
4ae388
+
4ae388
+/* Ensure that 0 <= ts->tv_nsec && ts->tv_nsec < 1000 * 1000 * 1000. */
4ae388
+void normalize_timespec(struct timespec *ts)
4ae388
+{
4ae388
+	while (ts->tv_nsec < 0) {
4ae388
+		ts->tv_nsec += 1000UL * 1000 * 1000;
4ae388
+		ts->tv_sec--;
4ae388
+	}
4ae388
+	while (ts->tv_nsec >= 1000UL * 1000 * 1000) {
4ae388
+		ts->tv_nsec -= 1000UL * 1000 * 1000;
4ae388
+		ts->tv_sec++;
4ae388
+	}
4ae388
+}
4ae388
+
4ae388
+/* Compute *res = *a - *b */
4ae388
+void timespecsub(const struct timespec *a, const struct timespec *b,
4ae388
+		 struct timespec *res)
4ae388
+{
4ae388
+	res->tv_sec = a->tv_sec - b->tv_sec;
4ae388
+	res->tv_nsec = a->tv_nsec - b->tv_nsec;
4ae388
+	normalize_timespec(res);
4ae388
+}
4ae388
Index: multipath-tools-130222/libmultipath/time-util.h
4ae388
===================================================================
4ae388
--- /dev/null
4ae388
+++ multipath-tools-130222/libmultipath/time-util.h
4ae388
@@ -0,0 +1,13 @@
4ae388
+#ifndef _TIME_UTIL_H_
4ae388
+#define _TIME_UTIL_H_
4ae388
+
4ae388
+#include <pthread.h>
4ae388
+
4ae388
+struct timespec;
4ae388
+
4ae388
+void pthread_cond_init_mono(pthread_cond_t *cond);
4ae388
+void normalize_timespec(struct timespec *ts);
4ae388
+void timespecsub(const struct timespec *a, const struct timespec *b,
4ae388
+		 struct timespec *res);
4ae388
+
4ae388
+#endif /* _TIME_UTIL_H_ */
4ae388
Index: multipath-tools-130222/libmultipath/uevent.c
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/uevent.c
4ae388
+++ multipath-tools-130222/libmultipath/uevent.c
4ae388
@@ -616,12 +616,46 @@ uevent_get_dm_name(struct uevent *uev)
4ae388
 	int i;
4ae388
 
4ae388
 	for (i = 0; uev->envp[i] != NULL; i++) {
4ae388
-		if (!strncmp(uev->envp[i], "DM_NAME", 6) &&
4ae388
-		    strlen(uev->envp[i]) > 7) {
4ae388
+		if (!strncmp(uev->envp[i], "DM_NAME", 7) &&
4ae388
+		    strlen(uev->envp[i]) > 8) {
4ae388
 			p = MALLOC(strlen(uev->envp[i] + 8) + 1);
4ae388
 			strcpy(p, uev->envp[i] + 8);
4ae388
 			break;
4ae388
 		}
4ae388
 	}
4ae388
+	return p;
4ae388
+}
4ae388
+
4ae388
+extern char *
4ae388
+uevent_get_dm_path(struct uevent *uev)	/* MALLOC'd copy of the value, or NULL */
4ae388
+{
4ae388
+	char *p = NULL;
4ae388
+	int i;
4ae388
+
4ae388
+	for (i = 0; uev->envp[i] != NULL; i++) {
4ae388
+		if (!strncmp(uev->envp[i], "DM_PATH=", 8) &&
4ae388
+		    strlen(uev->envp[i]) > 8) {	/* exact key, non-empty value */
4ae388
+			if ((p = MALLOC(strlen(uev->envp[i] + 8) + 1)))
4ae388
+				strcpy(p, uev->envp[i] + 8);
4ae388
+			break;
4ae388
+		}
4ae388
+	}
4ae388
+	return p;
4ae388
+}
4ae388
+
4ae388
+extern char *
4ae388
+uevent_get_dm_action(struct uevent *uev)	/* MALLOC'd copy of the value, or NULL */
4ae388
+{
4ae388
+	char *p = NULL;
4ae388
+	int i;
4ae388
+
4ae388
+	for (i = 0; uev->envp[i] != NULL; i++) {
4ae388
+		if (!strncmp(uev->envp[i], "DM_ACTION=", 10) &&
4ae388
+		    strlen(uev->envp[i]) > 10) {	/* exact key, non-empty value */
4ae388
+			if ((p = MALLOC(strlen(uev->envp[i] + 10) + 1)))
4ae388
+				strcpy(p, uev->envp[i] + 10);
4ae388
+			break;
4ae388
+		}
4ae388
+	}
4ae388
 	return p;
4ae388
 }
4ae388
Index: multipath-tools-130222/libmultipath/uevent.h
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/libmultipath/uevent.h
4ae388
+++ multipath-tools-130222/libmultipath/uevent.h
4ae388
@@ -36,5 +36,7 @@ int uevent_get_major(struct uevent *uev)
4ae388
 int uevent_get_minor(struct uevent *uev);
4ae388
 int uevent_get_disk_ro(struct uevent *uev);
4ae388
 char *uevent_get_dm_name(struct uevent *uev);
4ae388
+char *uevent_get_dm_path(struct uevent *uev);
4ae388
+char *uevent_get_dm_action(struct uevent *uev);
4ae388
 
4ae388
 #endif /* _UEVENT_H */
4ae388
Index: multipath-tools-130222/multipath/multipath.conf.5
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/multipath/multipath.conf.5
4ae388
+++ multipath-tools-130222/multipath/multipath.conf.5
4ae388
@@ -527,7 +527,7 @@ recently become valid for this many chec
4ae388
 being watched, when they next become valid, they will not be used until they
4ae388
 have stayed up for
4ae388
 .I delay_wait_checks
4ae388
-checks. Default is
4ae388
+checks. See "Shaky paths detection" below. Default is
4ae388
 .I no
4ae388
 .TP
4ae388
 .B delay_wait_checks
4ae388
@@ -537,9 +537,56 @@ online fails again within
4ae388
 checks, the next time it comes back online, it will marked and delayed, and not
4ae388
 used until it has passed
4ae388
 .I delay_wait_checks
4ae388
-checks. Default is
4ae388
+checks. See "Shaky paths detection" below. Default is
4ae388
 .I no
4ae388
 .TP
4ae388
+.B marginal_path_double_failed_time
4ae388
+One of the four parameters of supporting path check based on accounting IO
4ae388
+error such as intermittent error. When a path failed event occurs twice in
4ae388
+\fImarginal_path_double_failed_time\fR seconds due to an IO error and all the
4ae388
+other three parameters are set, multipathd will fail the path and enqueue
4ae388
+this path into a queue of which members are sent a couple of continuous
4ae388
+direct reading asynchronous IOs at a fixed sample rate of 10HZ to start IO
4ae388
+error accounting process. See "Shaky paths detection" below. Default is
4ae388
+\fIno\fR
4ae388
+.TP
4ae388
+.B marginal_path_err_sample_time
4ae388
+One of the four parameters of supporting path check based on accounting IO
4ae388
+error such as intermittent error. If it is set to a value no less than 120,
4ae388
+when a path fail event occurs twice in \fImarginal_path_double_failed_time\fR
4ae388
+seconds due to an IO error, multipathd will fail the path and enqueue this
4ae388
+path into a queue of which members are sent a couple of continuous direct
4ae388
+reading asynchronous IOs at a fixed sample rate of 10HZ to start the IO
4ae388
+accounting process for the path, which will last for
4ae388
+\fImarginal_path_err_sample_time\fR seconds.
4ae388
+If the rate of IO error on a particular path is greater than the
4ae388
+\fImarginal_path_err_rate_threshold\fR, then the path will not reinstate for
4ae388
+\fImarginal_path_err_recheck_gap_time\fR seconds unless there is only one
4ae388
+active path. After \fImarginal_path_err_recheck_gap_time\fR expires, the path
4ae388
+will be requeued for rechecking. If the checking result is good enough, the
4ae388
+path will be reinstated. See "Shaky paths detection" below. Default is
4ae388
+\fIno\fR
4ae388
+.TP
4ae388
+.B marginal_path_err_rate_threshold
4ae388
+The error rate threshold as a permillage (1/1000). One of the four parameters
4ae388
+of supporting path check based on accounting IO error such as intermittent
4ae388
+error. Refer to \fImarginal_path_err_sample_time\fR. If the rate of IO errors
4ae388
+on a particular path is greater than this parameter, then the path will not
4ae388
+reinstate for \fImarginal_path_err_recheck_gap_time\fR seconds unless there is
4ae388
+only one active path. See "Shaky paths detection" below. Default is \fIno\fR
4ae388
+.TP
4ae388
+.B marginal_path_err_recheck_gap_time
4ae388
+One of the four parameters of supporting path check based on accounting IO
4ae388
+error such as intermittent error. Refer to
4ae388
+\fImarginal_path_err_sample_time\fR. If this parameter is set to a positive
4ae388
+value, the failed path whose IO error rate is larger than
4ae388
+\fImarginal_path_err_rate_threshold\fR will be kept in failed state for
4ae388
+\fImarginal_path_err_recheck_gap_time\fR seconds. When
4ae388
+\fImarginal_path_err_recheck_gap_time\fR seconds expires, the path will be
4ae388
+requeued for checking. If the checking result is good enough, the path will be
4ae388
+reinstated, or else it will remain failed. See "Shaky paths detection" below.
4ae388
+Default is \fIno\fR
4ae388
+.TP
4ae388
 .B missing_uev_wait_timeout
4ae388
 Controls how many seconds multipathd will wait, after a new multipath device
4ae388
 is created, to receive a change event from udev for the device, before
4ae388
@@ -771,6 +818,14 @@ section:
4ae388
 .TP
4ae388
 .B delay_wait_checks
4ae388
 .TP
4ae388
+.B marginal_path_err_sample_time
4ae388
+.TP
4ae388
+.B marginal_path_err_rate_threshold
4ae388
+.TP
4ae388
+.B marginal_path_err_recheck_gap_time
4ae388
+.TP
4ae388
+.B marginal_path_double_failed_time
4ae388
+.TP
4ae388
 .B skip_kpartx
4ae388
 .TP
4ae388
 .B max_sectors_kb
4ae388
@@ -877,6 +932,14 @@ section:
4ae388
 .TP
4ae388
 .B delay_wait_checks
4ae388
 .TP
4ae388
+.B marginal_path_err_sample_time
4ae388
+.TP
4ae388
+.B marginal_path_err_rate_threshold
4ae388
+.TP
4ae388
+.B marginal_path_err_recheck_gap_time
4ae388
+.TP
4ae388
+.B marginal_path_double_failed_time
4ae388
+.TP
4ae388
 .B skip_kpartx
4ae388
 .TP
4ae388
 .B max_sectors_kb
4ae388
@@ -887,6 +950,47 @@ section:
4ae388
 .RE
4ae388
 .PD
4ae388
 .LP
4ae388
+.SH "Shaky paths detection"
4ae388
+A common problem in SAN setups is the occurrence of intermittent errors: a
4ae388
+path is unreachable, then reachable again for a short time, disappears again,
4ae388
+and so forth. This happens typically on unstable interconnects. It is
4ae388
+undesirable to switch pathgroups unnecessarily on such frequent, unreliable
4ae388
+events. \fImultipathd\fR supports two different methods for detecting this
4ae388
+situation and dealing with it. Both methods share the same basic mode of
4ae388
+operation: If a path is found to be \(dqshaky\(dq or \(dqflipping\(dq,
4ae388
+and appears to be in healthy status, it is not reinstated (put back to use)
4ae388
+immediately. Instead, it is watched for some time, and only reinstated
4ae388
+if the healthy state appears to be stable. The logic of determining
4ae388
+\(dqshaky\(dq condition, as well as the logic when to reinstate,
4ae388
+differs between the two methods.
4ae388
+.TP 8
4ae388
+.B \(dqdelay_checks\(dq failure tracking
4ae388
+If a path fails again within a
4ae388
+\fIdelay_watch_checks\fR interval after a failure, don't
4ae388
+reinstate it until it passes a \fIdelay_wait_checks\fR interval
4ae388
+in always good status.
4ae388
+The intervals are measured in \(dqticks\(dq, i.e. the
4ae388
+time between path checks by multipathd, which is variable and controlled by the
4ae388
+\fIpolling_interval\fR and \fImax_polling_interval\fR parameters.
4ae388
+.TP
4ae388
+.B \(dqmarginal_path\(dq failure tracking
4ae388
+If a second failure event (good->bad transition) occurs within
4ae388
+\fImarginal_path_double_failed_time\fR seconds after a failure, high-frequency
4ae388
+monitoring is started for the affected path: I/O is sent at a rate of 10 per
4ae388
+second. This is done for \fImarginal_path_err_sample_time\fR seconds. During
4ae388
+this period, the path is not reinstated. If the
4ae388
+rate of errors remains below \fImarginal_path_err_rate_threshold\fR during the
4ae388
+monitoring period, the path is reinstated. Otherwise, it
4ae388
+is kept in failed state for \fImarginal_path_err_recheck_gap_time\fR, and
4ae388
+after that, it is monitored again. For this method, time intervals are measured
4ae388
+in seconds.
4ae388
+.RE
4ae388
+.LP
4ae388
+See the documentation
4ae388
+of the individual options above for details.
4ae388
+It is \fBstrongly discouraged\fR to use more than one of these methods for any
4ae388
+given multipath map, because the two concurrent methods may interact in
4ae388
+unpredictable ways.
4ae388
 .SH "KNOWN ISSUES"
4ae388
 The usage of
4ae388
 .B queue_if_no_path
4ae388
Index: multipath-tools-130222/multipathd/cli_handlers.c
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/multipathd/cli_handlers.c
4ae388
+++ multipath-tools-130222/multipathd/cli_handlers.c
4ae388
@@ -721,7 +721,7 @@ int resize_map(struct multipath *mpp, un
4ae388
 
4ae388
 	mpp->size = size;
4ae388
 	update_mpp_paths(mpp, vecs->pathvec);
4ae388
-	setup_map(mpp, params, PARAMS_SIZE);
4ae388
+	setup_map(mpp, params, PARAMS_SIZE, vecs);
4ae388
 	mpp->action = ACT_RESIZE;
4ae388
 	if (domap(mpp, params) <= 0) {
4ae388
 		condlog(0, "%s: failed to resize map : %s", mpp->alias,
4ae388
Index: multipath-tools-130222/multipathd/main.c
4ae388
===================================================================
4ae388
--- multipath-tools-130222.orig/multipathd/main.c
4ae388
+++ multipath-tools-130222/multipathd/main.c
4ae388
@@ -56,6 +56,7 @@
4ae388
 #include <log.h>
4ae388
 #include <file.h>
4ae388
 #include <prkey.h>
4ae388
+#include <io_err_stat.h>
4ae388
 
4ae388
 #include "main.h"
4ae388
 #include "pidfile.h"
4ae388
@@ -274,7 +275,7 @@ retry:
4ae388
 	mpp->action = ACT_RELOAD;
4ae388
 
4ae388
 	extract_hwe_from_path(mpp);
4ae388
-	if (setup_map(mpp, params, PARAMS_SIZE)) {
4ae388
+	if (setup_map(mpp, params, PARAMS_SIZE, vecs)) {
4ae388
 		condlog(0, "%s: failed to setup new map in update", mpp->alias);
4ae388
 		retries = -1;
4ae388
 		goto fail;
4ae388
@@ -638,7 +639,7 @@ rescan:
4ae388
 	/*
4ae388
 	 * push the map to the device-mapper
4ae388
 	 */
4ae388
-	if (setup_map(mpp, params, PARAMS_SIZE)) {
4ae388
+	if (setup_map(mpp, params, PARAMS_SIZE, vecs)) {
4ae388
 		condlog(0, "%s: failed to setup map for addition of new "
4ae388
 			"path %s", mpp->alias, pp->dev);
4ae388
 		goto fail_map;
4ae388
@@ -771,7 +772,7 @@ ev_remove_path (struct path *pp, struct
4ae388
 			 */
4ae388
 		}
4ae388
 
4ae388
-		if (setup_map(mpp, params, PARAMS_SIZE)) {
4ae388
+		if (setup_map(mpp, params, PARAMS_SIZE, vecs)) {
4ae388
 			condlog(0, "%s: failed to setup map for"
4ae388
 				" removal of path %s", mpp->alias, pp->dev);
4ae388
 			goto fail;
4ae388
@@ -891,6 +892,41 @@ uev_update_path (struct uevent *uev, str
4ae388
 }
4ae388
 
4ae388
 static int
4ae388
+uev_pathfail_check(struct uevent *uev, struct vectors *vecs)
4ae388
+{
4ae388
+	char *action = NULL, *devt = NULL;
4ae388
+	struct path *pp;
4ae388
+	int r = 1;
4ae388
+
4ae388
+	action = uevent_get_dm_action(uev);
4ae388
+	if (!action)
4ae388
+		return 1;
4ae388
+	if (strncmp(action, "PATH_FAILED", 11))
4ae388
+		goto out;
4ae388
+	devt = uevent_get_dm_path(uev);
4ae388
+	if (!devt) {
4ae388
+		condlog(3, "%s: No DM_PATH in uevent", uev->kernel);
4ae388
+		goto out;
4ae388
+	}
4ae388
+
4ae388
+	pp = find_path_by_devt(vecs->pathvec, devt);
4ae388
+	if (!pp)
4ae388
+		goto out_devt;
4ae388
+	r = io_err_stat_handle_pathfail(pp);
4ae388
+
4ae388
+	if (r)
4ae388
+		condlog(3, "io_err_stat: %s: cannot handle pathfail uevent",
4ae388
+			pp->dev);
4ae388
+out_devt:
4ae388
+	FREE(devt);
4ae388
+	FREE(action);
4ae388
+	return r;
4ae388
+out:
4ae388
+	FREE(action);
4ae388
+	return 1;
4ae388
+}
4ae388
+
4ae388
+static int
4ae388
 map_discovery (struct vectors * vecs)
4ae388
 {
4ae388
 	struct multipath * mpp;
4ae388
@@ -974,6 +1010,14 @@ uev_trigger (struct uevent * uev, void *
4ae388
 	if (!strncmp(uev->kernel, "dm-", 3)) {
4ae388
 		if (!strncmp(uev->action, "change", 6)) {
4ae388
 			r = uev_add_map(uev, vecs);
4ae388
+
4ae388
+			/*
4ae388
+			 * the kernel-side dm-mpath issues a PATH_FAILED event
4ae388
+			 * when it encounters a path IO error. This makes it a
4ae388
+			 * reasonable entry point for the path IO error
4ae388
+			 * accounting process.
4ae388
+			 */
4ae388
+			uev_pathfail_check(uev, vecs);
4ae388
 			goto out;
4ae388
 		}
4ae388
 		if (!strncmp(uev->action, "remove", 6)) {
4ae388
@@ -1405,6 +1449,17 @@ check_path (struct vectors * vecs, struc
4ae388
 		return;
4ae388
 
4ae388
 	if ((newstate == PATH_UP || newstate == PATH_GHOST) &&
4ae388
+	    pp->io_err_disable_reinstate && need_io_err_check(pp)) {
4ae388
+		pp->state = PATH_SHAKY;
4ae388
+		/*
4ae388
+		 * to reschedule as soon as possible, so that this path can
4ae388
+		 * be recovered in time
4ae388
+		 */
4ae388
+		pp->tick = 1;
4ae388
+		return;
4ae388
+	}
4ae388
+
4ae388
+	if ((newstate == PATH_UP || newstate == PATH_GHOST) &&
4ae388
 	     pp->wait_checks > 0) {
4ae388
 		if (pp->mpp && pp->mpp->nr_active > 0) {
4ae388
 			pp->state = PATH_DELAYED;
4ae388
@@ -1955,6 +2010,7 @@ child (void * param)
4ae388
 	setup_thread_attr(&misc_attr, 64 * 1024, 1);
4ae388
 	setup_thread_attr(&uevent_attr, 128 * 1024, 1);
4ae388
 	setup_thread_attr(&waiter_attr, 32 * 1024, 1);
4ae388
+	setup_thread_attr(&io_err_stat_attr, 32 * 1024, 0);
4ae388
 
4ae388
 	if (logsink) {
4ae388
 		setup_thread_attr(&log_attr, 64 * 1024, 0);
4ae388
@@ -2097,6 +2153,8 @@ child (void * param)
4ae388
 	*/
4ae388
 	cleanup_checkers();
4ae388
 	cleanup_prio();
4ae388
+	stop_io_err_stat_thread();
4ae388
+	pthread_attr_destroy(&io_err_stat_attr);
4ae388
 
4ae388
 	dm_lib_release();
4ae388
 	dm_lib_exit();