|
|
4ae388 |
From e28c340ed961409700d46a1cb9a820a8b7a4d016 Mon Sep 17 00:00:00 2001
|
|
|
4ae388 |
From: Mike Christie <mchristi@redhat.com>
|
|
|
4ae388 |
Date: Thu, 11 Aug 2016 02:12:12 -0500
|
|
|
4ae388 |
Subject: [PATCH 04/11] multipath-tools: Add rbd checker.
|
|
|
4ae388 |
|
|
|
4ae388 |
For BZ 1348372 from upstream commit:
|
|
|
4ae388 |
|
|
|
4ae388 |
commit d1cad5649b6fcf9027d43ca0405c900080133e32
|
|
|
4ae388 |
Author: Mike Christie <mchristi@redhat.com>
|
|
|
4ae388 |
Date: Mon Aug 8 07:01:49 2016 -0500
|
|
|
4ae388 |
|
|
|
4ae388 |
multipath-tools: Add rbd checker.
|
|
|
4ae388 |
|
|
|
4ae388 |
This checker currently only handles the case where a path is failed
|
|
|
4ae388 |
due to it being blacklisted by the ceph cluster. The specific use
|
|
|
4ae388 |
case for me is when LIO exports rbd images through multiple LIO
|
|
|
4ae388 |
instances.
|
|
|
4ae388 |
|
|
|
4ae388 |
The problem it handles is when rbd instance1 has the exclusive lock,
|
|
|
4ae388 |
but becomes unreachable; another host in the cluster will then take over
|
|
|
4ae388 |
and blacklist the instance1. This prevents it from sending stale IO
|
|
|
4ae388 |
and corrupting data.
|
|
|
4ae388 |
|
|
|
4ae388 |
Later, when the host is reachable, we will want to failback to it.
|
|
|
4ae388 |
To do this, the checker will detect we were blacklisted, unmap the old
|
|
|
4ae388 |
image which will make sure old IO is failed, and then remap the
|
|
|
4ae388 |
image
|
|
|
4ae388 |
and unblacklist the host. multipathd will then handle this like a
|
|
|
4ae388 |
path being removed and re-added.
|
|
|
4ae388 |
|
|
|
4ae388 |
--------
|
|
|
4ae388 |
|
|
|
4ae388 |
Porting notes:
|
|
|
4ae388 |
Added rbd to multipath.conf.annotated.
|
|
|
4ae388 |
|
|
|
4ae388 |
Signed-off-by: Mike Christie <mchristi@redhat.com>
|
|
|
4ae388 |
---
|
|
|
4ae388 |
libmultipath/checkers/Makefile | 7
|
|
|
4ae388 |
libmultipath/checkers/rbd.c | 639 +++++++++++++++++++++++++++++++++++++++++
|
|
|
4ae388 |
multipath.conf.annotated | 4
|
|
|
4ae388 |
multipath/multipath.conf.5 | 3
|
|
|
4ae388 |
4 files changed, 651 insertions(+), 2 deletions(-)
|
|
|
4ae388 |
create mode 100644 libmultipath/checkers/rbd.c
|
|
|
4ae388 |
|
|
|
4ae388 |
Index: multipath-tools-130222/libmultipath/checkers/Makefile
|
|
|
4ae388 |
===================================================================
|
|
|
4ae388 |
--- multipath-tools-130222.orig/libmultipath/checkers/Makefile
|
|
|
4ae388 |
+++ multipath-tools-130222/libmultipath/checkers/Makefile
|
|
|
4ae388 |
@@ -14,10 +14,17 @@ LIBS= \
|
|
|
4ae388 |
libcheckhp_sw.so \
|
|
|
4ae388 |
libcheckrdac.so
|
|
|
4ae388 |
|
|
|
4ae388 |
+ifeq ($(shell test -r /usr/include/rados/librados.h && echo 1),1)
|
|
|
4ae388 |
+LIBS += libcheckrbd.so
|
|
|
4ae388 |
+endif
|
|
|
4ae388 |
+
|
|
|
4ae388 |
CFLAGS += -fPIC -I..
|
|
|
4ae388 |
|
|
|
4ae388 |
all: $(LIBS)
|
|
|
4ae388 |
|
|
|
4ae388 |
+libcheckrbd.so: rbd.o
|
|
|
4ae388 |
+ $(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lrados -ludev
|
|
|
4ae388 |
+
|
|
|
4ae388 |
libcheckdirectio.so: libsg.o directio.o
|
|
|
4ae388 |
$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -laio
|
|
|
4ae388 |
|
|
|
4ae388 |
Index: multipath-tools-130222/libmultipath/checkers/rbd.c
|
|
|
4ae388 |
===================================================================
|
|
|
4ae388 |
--- /dev/null
|
|
|
4ae388 |
+++ multipath-tools-130222/libmultipath/checkers/rbd.c
|
|
|
4ae388 |
@@ -0,0 +1,639 @@
|
|
|
4ae388 |
+/*
|
|
|
4ae388 |
+ * Copyright (c) 2016 Red Hat
|
|
|
4ae388 |
+ * Copyright (c) 2004 Christophe Varoqui
|
|
|
4ae388 |
+ *
|
|
|
4ae388 |
+ * Code based off of tur.c and ceph's krbd.cc
|
|
|
4ae388 |
+ */
|
|
|
4ae388 |
+#define _GNU_SOURCE
|
|
|
4ae388 |
+#include <stdio.h>
|
|
|
4ae388 |
+#include <stdlib.h>
|
|
|
4ae388 |
+#include <string.h>
|
|
|
4ae388 |
+#include <unistd.h>
|
|
|
4ae388 |
+#include <fcntl.h>
|
|
|
4ae388 |
+#include <errno.h>
|
|
|
4ae388 |
+#include <pthread.h>
|
|
|
4ae388 |
+#include <libudev.h>
|
|
|
4ae388 |
+#include <ifaddrs.h>
|
|
|
4ae388 |
+#include <sys/types.h>
|
|
|
4ae388 |
+#include <sys/stat.h>
|
|
|
4ae388 |
+#include <sys/ioctl.h>
|
|
|
4ae388 |
+#include <sys/time.h>
|
|
|
4ae388 |
+#include <sys/wait.h>
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+#include "rados/librados.h"
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+#include "structs.h"
|
|
|
4ae388 |
+#include "checkers.h"
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+#include "../libmultipath/debug.h"
|
|
|
4ae388 |
+#include "../libmultipath/uevent.h"
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+struct rbd_checker_context;
|
|
|
4ae388 |
+typedef int (thread_fn)(struct rbd_checker_context *ct, char *msg);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+#define RBD_MSG(msg, fmt, args...) snprintf(msg, CHECKER_MSG_LEN, fmt, ##args);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+struct rbd_checker_context {
|
|
|
4ae388 |
+ int rbd_bus_id;
|
|
|
4ae388 |
+ char *client_addr;
|
|
|
4ae388 |
+ char *config_info;
|
|
|
4ae388 |
+ char *snap;
|
|
|
4ae388 |
+ char *pool;
|
|
|
4ae388 |
+ char *image;
|
|
|
4ae388 |
+ char *username;
|
|
|
4ae388 |
+ int remapped;
|
|
|
4ae388 |
+ int blacklisted;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ rados_t cluster;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ int state;
|
|
|
4ae388 |
+ int running;
|
|
|
4ae388 |
+ time_t time;
|
|
|
4ae388 |
+ thread_fn *fn;
|
|
|
4ae388 |
+ pthread_t thread;
|
|
|
4ae388 |
+ pthread_mutex_t lock;
|
|
|
4ae388 |
+ pthread_cond_t active;
|
|
|
4ae388 |
+ pthread_spinlock_t hldr_lock;
|
|
|
4ae388 |
+ int holders;
|
|
|
4ae388 |
+ char message[CHECKER_MSG_LEN];
|
|
|
4ae388 |
+};
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+int libcheck_init(struct checker * c)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ struct rbd_checker_context *ct;
|
|
|
4ae388 |
+ struct udev_device *block_dev;
|
|
|
4ae388 |
+ struct udev_device *bus_dev;
|
|
|
4ae388 |
+ struct udev *udev;
|
|
|
4ae388 |
+ struct stat sb;
|
|
|
4ae388 |
+ const char *block_name, *addr, *config_info;
|
|
|
4ae388 |
+ const char *image, *pool, *snap, *username;
|
|
|
4ae388 |
+ char sysfs_path[PATH_SIZE];
|
|
|
4ae388 |
+ int ret;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ct = malloc(sizeof(struct rbd_checker_context));
|
|
|
4ae388 |
+ if (!ct)
|
|
|
4ae388 |
+ return 1;
|
|
|
4ae388 |
+ memset(ct, 0, sizeof(struct rbd_checker_context));
|
|
|
4ae388 |
+ ct->holders = 1;
|
|
|
4ae388 |
+ pthread_cond_init(&ct->active, NULL);
|
|
|
4ae388 |
+ pthread_mutex_init(&ct->lock, NULL);
|
|
|
4ae388 |
+ pthread_spin_init(&ct->hldr_lock, PTHREAD_PROCESS_PRIVATE);
|
|
|
4ae388 |
+ c->context = ct;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ /*
|
|
|
4ae388 |
+ * The rbd block layer sysfs device is not linked to the rbd bus
|
|
|
4ae388 |
+ * device that we interact with, so figure that out now.
|
|
|
4ae388 |
+ */
|
|
|
4ae388 |
+ if (fstat(c->fd, &sb) != 0)
|
|
|
4ae388 |
+ goto free_ct;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ udev = udev_new();
|
|
|
4ae388 |
+ if (!udev)
|
|
|
4ae388 |
+ goto free_ct;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ block_dev = udev_device_new_from_devnum(udev, 'b', sb.st_rdev);
|
|
|
4ae388 |
+ if (!block_dev)
|
|
|
4ae388 |
+ goto free_udev;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ block_name = udev_device_get_sysname(block_dev);
|
|
|
4ae388 |
+ ret = sscanf(block_name, "rbd%d", &ct->rbd_bus_id);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ udev_device_unref(block_dev);
|
|
|
4ae388 |
+ if (ret != 1)
|
|
|
4ae388 |
+ goto free_udev;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ snprintf(sysfs_path, sizeof(sysfs_path), "/sys/bus/rbd/devices/%d",
|
|
|
4ae388 |
+ ct->rbd_bus_id);
|
|
|
4ae388 |
+ bus_dev = udev_device_new_from_syspath(udev, sysfs_path);
|
|
|
4ae388 |
+ if (!bus_dev)
|
|
|
4ae388 |
+ goto free_udev;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ addr = udev_device_get_sysattr_value(bus_dev, "client_addr");
|
|
|
4ae388 |
+ if (!addr) {
|
|
|
4ae388 |
+ condlog(0, "Could not find client_addr in rbd sysfs. Try "
|
|
|
4ae388 |
+ "updating kernel");
|
|
|
4ae388 |
+ goto free_dev;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ct->client_addr = strdup(addr);
|
|
|
4ae388 |
+ if (!ct->client_addr)
|
|
|
4ae388 |
+ goto free_dev;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ config_info = udev_device_get_sysattr_value(bus_dev, "config_info");
|
|
|
4ae388 |
+ if (!config_info)
|
|
|
4ae388 |
+ goto free_addr;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ct->config_info = strdup(config_info);
|
|
|
4ae388 |
+ if (!ct->config_info)
|
|
|
4ae388 |
+ goto free_addr;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ username = strstr(config_info, "name=");
|
|
|
4ae388 |
+ if (username) {
|
|
|
4ae388 |
+ char *end;
|
|
|
4ae388 |
+ int len;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ username += 5;
|
|
|
4ae388 |
+ end = strchr(username, ',');
|
|
|
4ae388 |
+ if (!end)
|
|
|
4ae388 |
+ goto free_info;
|
|
|
4ae388 |
+ len = end - username;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ct->username = malloc(len + 1);
|
|
|
4ae388 |
+ if (!ct->username)
|
|
|
4ae388 |
+ goto free_info;
|
|
|
4ae388 |
+ strncpy(ct->username, username, len);
|
|
|
4ae388 |
+ ct->username[len] = '\0';
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ image = udev_device_get_sysattr_value(bus_dev, "name");
|
|
|
4ae388 |
+ if (!image)
|
|
|
4ae388 |
+ goto free_username;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ct->image = strdup(image);
|
|
|
4ae388 |
+ if (!ct->image)
|
|
|
4ae388 |
+ goto free_info;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ pool = udev_device_get_sysattr_value(bus_dev, "pool");
|
|
|
4ae388 |
+ if (!pool)
|
|
|
4ae388 |
+ goto free_image;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ct->pool = strdup(pool);
|
|
|
4ae388 |
+ if (!ct->pool)
|
|
|
4ae388 |
+ goto free_image;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ snap = udev_device_get_sysattr_value(bus_dev, "current_snap");
|
|
|
4ae388 |
+ if (!snap)
|
|
|
4ae388 |
+ goto free_pool;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (strcmp("-", snap)) {
|
|
|
4ae388 |
+ ct->snap = strdup(snap);
|
|
|
4ae388 |
+ if (!ct->snap)
|
|
|
4ae388 |
+ goto free_pool;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (rados_create(&ct->cluster, NULL) < 0) {
|
|
|
4ae388 |
+ condlog(0, "Could not create rados cluster");
|
|
|
4ae388 |
+ goto free_snap;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (rados_conf_read_file(ct->cluster, NULL) < 0) {
|
|
|
4ae388 |
+ condlog(0, "Could not read rados conf");
|
|
|
4ae388 |
+ goto shutdown_rados;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ret = rados_connect(ct->cluster);
|
|
|
4ae388 |
+ if (ret < 0) {
|
|
|
4ae388 |
+ condlog(0, "Could not connect to rados cluster");
|
|
|
4ae388 |
+ goto shutdown_rados;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ udev_device_unref(bus_dev);
|
|
|
4ae388 |
+ udev_unref(udev);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ condlog(3, "rbd%d checker init %s %s/%s@%s %s", ct->rbd_bus_id,
|
|
|
4ae388 |
+ ct->client_addr, ct->pool, ct->image, ct->snap ? ct->snap : "-",
|
|
|
4ae388 |
+ ct->username ? ct->username : "none");
|
|
|
4ae388 |
+ return 0;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+shutdown_rados:
|
|
|
4ae388 |
+ rados_shutdown(ct->cluster);
|
|
|
4ae388 |
+free_snap:
|
|
|
4ae388 |
+ if (ct->snap)
|
|
|
4ae388 |
+ free(ct->snap);
|
|
|
4ae388 |
+free_pool:
|
|
|
4ae388 |
+ free(ct->pool);
|
|
|
4ae388 |
+free_image:
|
|
|
4ae388 |
+ free(ct->image);
|
|
|
4ae388 |
+free_username:
|
|
|
4ae388 |
+ if (ct->username)
|
|
|
4ae388 |
+ free(ct->username);
|
|
|
4ae388 |
+free_info:
|
|
|
4ae388 |
+ free(ct->config_info);
|
|
|
4ae388 |
+free_addr:
|
|
|
4ae388 |
+ free(ct->client_addr);
|
|
|
4ae388 |
+free_dev:
|
|
|
4ae388 |
+ udev_device_unref(bus_dev);
|
|
|
4ae388 |
+free_udev:
|
|
|
4ae388 |
+ udev_unref(udev);
|
|
|
4ae388 |
+free_ct:
|
|
|
4ae388 |
+ free(ct);
|
|
|
4ae388 |
+ return 1;
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+void cleanup_context(struct rbd_checker_context *ct)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ pthread_mutex_destroy(&ct->lock);
|
|
|
4ae388 |
+ pthread_cond_destroy(&ct->active);
|
|
|
4ae388 |
+ pthread_spin_destroy(&ct->hldr_lock);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ rados_shutdown(ct->cluster);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (ct->username)
|
|
|
4ae388 |
+ free(ct->username);
|
|
|
4ae388 |
+ if (ct->snap)
|
|
|
4ae388 |
+ free(ct->snap);
|
|
|
4ae388 |
+ free(ct->pool);
|
|
|
4ae388 |
+ free(ct->image);
|
|
|
4ae388 |
+ free(ct->config_info);
|
|
|
4ae388 |
+ free(ct->client_addr);
|
|
|
4ae388 |
+ free(ct);
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+void libcheck_free(struct checker * c)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ if (c->context) {
|
|
|
4ae388 |
+ struct rbd_checker_context *ct = c->context;
|
|
|
4ae388 |
+ int holders;
|
|
|
4ae388 |
+ pthread_t thread;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ pthread_spin_lock(&ct->hldr_lock);
|
|
|
4ae388 |
+ ct->holders--;
|
|
|
4ae388 |
+ holders = ct->holders;
|
|
|
4ae388 |
+ thread = ct->thread;
|
|
|
4ae388 |
+ pthread_spin_unlock(&ct->hldr_lock);
|
|
|
4ae388 |
+ if (holders)
|
|
|
4ae388 |
+ pthread_cancel(thread);
|
|
|
4ae388 |
+ else
|
|
|
4ae388 |
+ cleanup_context(ct);
|
|
|
4ae388 |
+ c->context = NULL;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+static int rbd_is_blacklisted(struct rbd_checker_context *ct, char *msg)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ char *addr_tok, *start, *save;
|
|
|
4ae388 |
+ char *cmd[2];
|
|
|
4ae388 |
+ char *blklist, *stat;
|
|
|
4ae388 |
+ size_t blklist_len, stat_len;
|
|
|
4ae388 |
+ int ret;
|
|
|
4ae388 |
+ char *end;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ cmd[0] = "{\"prefix\": \"osd blacklist ls\"}";
|
|
|
4ae388 |
+ cmd[1] = NULL;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ret = rados_mon_command(ct->cluster, (const char **)cmd, 1, "", 0,
|
|
|
4ae388 |
+ &blklist, &blklist_len, &stat, &stat_len);
|
|
|
4ae388 |
+ if (ret < 0) {
|
|
|
4ae388 |
+ RBD_MSG(msg, "rbd checker failed: mon command failed %d",
|
|
|
4ae388 |
+ ret);
|
|
|
4ae388 |
+ return ret;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (!blklist || !blklist_len)
|
|
|
4ae388 |
+ goto free_bufs;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ /*
|
|
|
4ae388 |
+ * parse list of addrs with the format
|
|
|
4ae388 |
+ * ipv4:port/nonce date time\n
|
|
|
4ae388 |
+ * or
|
|
|
4ae388 |
+ * [ipv6]:port/nonce date time\n
|
|
|
4ae388 |
+ */
|
|
|
4ae388 |
+ ret = 0;
|
|
|
4ae388 |
+ for (start = blklist; ; start = NULL) {
|
|
|
4ae388 |
+ addr_tok = strtok_r(start, "\n", &save);
|
|
|
4ae388 |
+ if (!addr_tok || !strlen(addr_tok))
|
|
|
4ae388 |
+ break;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ end = strchr(addr_tok, ' ');
|
|
|
4ae388 |
+ if (!end) {
|
|
|
4ae388 |
+ RBD_MSG(msg, "rbd%d checker failed: invalid blacklist %s",
|
|
|
4ae388 |
+ ct->rbd_bus_id, addr_tok);
|
|
|
4ae388 |
+ break;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ *end = '\0';
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (!strcmp(addr_tok, ct->client_addr)) {
|
|
|
4ae388 |
+ ct->blacklisted = 1;
|
|
|
4ae388 |
+ RBD_MSG(msg, "rbd%d checker: %s is blacklisted",
|
|
|
4ae388 |
+ ct->rbd_bus_id, ct->client_addr);
|
|
|
4ae388 |
+ ret = 1;
|
|
|
4ae388 |
+ break;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+free_bufs:
|
|
|
4ae388 |
+ rados_buffer_free(blklist);
|
|
|
4ae388 |
+ rados_buffer_free(stat);
|
|
|
4ae388 |
+ return ret;
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+int rbd_check(struct rbd_checker_context *ct, char *msg)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ if (ct->blacklisted || rbd_is_blacklisted(ct, msg) == 1)
|
|
|
4ae388 |
+ return PATH_DOWN;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ RBD_MSG(msg, "rbd checker reports path is up");
|
|
|
4ae388 |
+ /*
|
|
|
4ae388 |
+ * Path may have issues, but the ceph cluster is at least
|
|
|
4ae388 |
+ * accepting IO, so we can attempt to do IO.
|
|
|
4ae388 |
+ *
|
|
|
4ae388 |
+ * TODO: in future versions, we can run other tests to
|
|
|
4ae388 |
+ * verify OSDs and networks.
|
|
|
4ae388 |
+ */
|
|
|
4ae388 |
+ return PATH_UP;
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+int safe_write(int fd, const void *buf, size_t count)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ while (count > 0) {
|
|
|
4ae388 |
+ ssize_t r = write(fd, buf, count);
|
|
|
4ae388 |
+ if (r < 0) {
|
|
|
4ae388 |
+ if (errno == EINTR)
|
|
|
4ae388 |
+ continue;
|
|
|
4ae388 |
+ return -errno;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ count -= r;
|
|
|
4ae388 |
+ buf = (char *)buf + r;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ return 0;
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+static int sysfs_write_rbd_bus(const char *which, const char *buf,
|
|
|
4ae388 |
+ size_t buf_len)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ char sysfs_path[PATH_SIZE];
|
|
|
4ae388 |
+ int fd;
|
|
|
4ae388 |
+ int r;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ /* we require newer kernels so single_major should always be there */
|
|
|
4ae388 |
+ snprintf(sysfs_path, sizeof(sysfs_path),
|
|
|
4ae388 |
+ "/sys/bus/rbd/%s_single_major", which);
|
|
|
4ae388 |
+ fd = open(sysfs_path, O_WRONLY);
|
|
|
4ae388 |
+ if (fd < 0)
|
|
|
4ae388 |
+ return -errno;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ r = safe_write(fd, buf, buf_len);
|
|
|
4ae388 |
+ close(fd);
|
|
|
4ae388 |
+ return r;
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+static int rbd_remap(struct rbd_checker_context *ct)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ char *argv[11];
|
|
|
4ae388 |
+ pid_t pid;
|
|
|
4ae388 |
+ int ret = 0, i = 0;
|
|
|
4ae388 |
+ int status;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ pid = fork();
|
|
|
4ae388 |
+ switch (pid) {
|
|
|
4ae388 |
+ case 0:
|
|
|
4ae388 |
+ argv[i++] = "rbd";
|
|
|
4ae388 |
+ argv[i++] = "map";
|
|
|
4ae388 |
+ argv[i++] = "-o noshare";
|
|
|
4ae388 |
+ if (ct->username) {
|
|
|
4ae388 |
+ argv[i++] = "--id";
|
|
|
4ae388 |
+ argv[i++] = ct->username;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ argv[i++] = "--pool";
|
|
|
4ae388 |
+ argv[i++] = ct->pool;
|
|
|
4ae388 |
+ if (ct->snap) {
|
|
|
4ae388 |
+ argv[i++] = "--snap";
|
|
|
4ae388 |
+ argv[i++] = ct->snap;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ argv[i++] = ct->image;
|
|
|
4ae388 |
+ argv[i] = NULL;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ret = execvp(argv[0], argv);
|
|
|
4ae388 |
+ condlog(0, "Error executing rbd: %s", strerror(errno));
|
|
|
4ae388 |
+ exit(-1);
|
|
|
4ae388 |
+ case -1:
|
|
|
4ae388 |
+ condlog(0, "fork failed: %s", strerror(errno));
|
|
|
4ae388 |
+ return -1;
|
|
|
4ae388 |
+ default:
|
|
|
4ae388 |
+ ret = -1;
|
|
|
4ae388 |
+ wait(&status);
|
|
|
4ae388 |
+ if (WIFEXITED(status)) {
|
|
|
4ae388 |
+ status = WEXITSTATUS(status);
|
|
|
4ae388 |
+ if (status == 0)
|
|
|
4ae388 |
+ ret = 0;
|
|
|
4ae388 |
+ else
|
|
|
4ae388 |
+ condlog(0, "rbd failed with %d", status);
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ return ret;
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+static int sysfs_write_rbd_remove(const char *buf, int buf_len)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ return sysfs_write_rbd_bus("remove", buf, buf_len);
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+static int rbd_rm_blacklist(struct rbd_checker_context *ct)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ char *cmd[2];
|
|
|
4ae388 |
+ char *stat, *cmd_str;
|
|
|
4ae388 |
+ size_t stat_len;
|
|
|
4ae388 |
+ int ret;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ret = asprintf(&cmd_str, "{\"prefix\": \"osd blacklist\", \"blacklistop\": \"rm\", \"addr\": \"%s\"}",
|
|
|
4ae388 |
+ ct->client_addr);
|
|
|
4ae388 |
+ if (ret == -1)
|
|
|
4ae388 |
+ return -ENOMEM;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ cmd[0] = cmd_str;
|
|
|
4ae388 |
+ cmd[1] = NULL;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ret = rados_mon_command(ct->cluster, (const char **)cmd, 1, "", 0,
|
|
|
4ae388 |
+ NULL, 0, &stat, &stat_len);
|
|
|
4ae388 |
+ if (ret < 0) {
|
|
|
4ae388 |
+ condlog(1, "rbd%d repair failed to remove blacklist for %s %d",
|
|
|
4ae388 |
+ ct->rbd_bus_id, ct->client_addr, ret);
|
|
|
4ae388 |
+ goto free_cmd;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ condlog(1, "rbd%d repair rm blacklist for %s",
|
|
|
4ae388 |
+ ct->rbd_bus_id, ct->client_addr);
|
|
|
4ae388 |
+ free(stat);
|
|
|
4ae388 |
+free_cmd:
|
|
|
4ae388 |
+ free(cmd_str);
|
|
|
4ae388 |
+ return ret;
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+static int rbd_repair(struct rbd_checker_context *ct, char *msg)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ char del[17];
|
|
|
4ae388 |
+ int ret;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (!ct->blacklisted)
|
|
|
4ae388 |
+ return PATH_UP;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (!ct->remapped) {
|
|
|
4ae388 |
+ ret = rbd_remap(ct);
|
|
|
4ae388 |
+ if (ret) {
|
|
|
4ae388 |
+ RBD_MSG(msg, "rbd%d repair failed to remap. Err %d",
|
|
|
4ae388 |
+ ct->rbd_bus_id, ret);
|
|
|
4ae388 |
+ return PATH_DOWN;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ ct->remapped = 1;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ snprintf(del, sizeof(del), "%d force", ct->rbd_bus_id);
|
|
|
4ae388 |
+ ret = sysfs_write_rbd_remove(del, strlen(del) + 1);
|
|
|
4ae388 |
+ if (ret) {
|
|
|
4ae388 |
+ RBD_MSG(msg, "rbd%d repair failed to clean up. Err %d",
|
|
|
4ae388 |
+ ct->rbd_bus_id, ret);
|
|
|
4ae388 |
+ return PATH_DOWN;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ret = rbd_rm_blacklist(ct);
|
|
|
4ae388 |
+ if (ret) {
|
|
|
4ae388 |
+ RBD_MSG(msg, "rbd%d repair could not remove blacklist entry. Err %d",
|
|
|
4ae388 |
+ ct->rbd_bus_id, ret);
|
|
|
4ae388 |
+ return PATH_DOWN;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ct->remapped = 0;
|
|
|
4ae388 |
+ ct->blacklisted = 0;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ RBD_MSG(msg, "rbd%d has been repaired", ct->rbd_bus_id);
|
|
|
4ae388 |
+ return PATH_UP;
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+#define rbd_thread_cleanup_push(ct) pthread_cleanup_push(cleanup_func, ct)
|
|
|
4ae388 |
+#define rbd_thread_cleanup_pop(ct) pthread_cleanup_pop(1)
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+void cleanup_func(void *data)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ int holders;
|
|
|
4ae388 |
+ struct rbd_checker_context *ct = data;
|
|
|
4ae388 |
+ pthread_spin_lock(&ct->hldr_lock);
|
|
|
4ae388 |
+ ct->holders--;
|
|
|
4ae388 |
+ holders = ct->holders;
|
|
|
4ae388 |
+ ct->thread = 0;
|
|
|
4ae388 |
+ pthread_spin_unlock(&ct->hldr_lock);
|
|
|
4ae388 |
+ if (!holders)
|
|
|
4ae388 |
+ cleanup_context(ct);
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+void *rbd_thread(void *ctx)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ struct rbd_checker_context *ct = ctx;
|
|
|
4ae388 |
+ int state;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ condlog(3, "rbd%d thread starting up", ct->rbd_bus_id);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ ct->message[0] = '\0';
|
|
|
4ae388 |
+ /* This thread can be canceled, so setup clean up */
|
|
|
4ae388 |
+ rbd_thread_cleanup_push(ct)
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ /* checker start up */
|
|
|
4ae388 |
+ pthread_mutex_lock(&ct->lock);
|
|
|
4ae388 |
+ ct->state = PATH_PENDING;
|
|
|
4ae388 |
+ pthread_mutex_unlock(&ct->lock);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ state = ct->fn(ct, ct->message);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ /* checker done */
|
|
|
4ae388 |
+ pthread_mutex_lock(&ct->lock);
|
|
|
4ae388 |
+ ct->state = state;
|
|
|
4ae388 |
+ pthread_mutex_unlock(&ct->lock);
|
|
|
4ae388 |
+ pthread_cond_signal(&ct->active);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ condlog(3, "rbd%d thead finished, state %s", ct->rbd_bus_id,
|
|
|
4ae388 |
+ checker_state_name(state));
|
|
|
4ae388 |
+ rbd_thread_cleanup_pop(ct);
|
|
|
4ae388 |
+ return ((void *)0);
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+static void rbd_timeout(struct timespec *tsp)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ struct timeval now;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ gettimeofday(&now, NULL);
|
|
|
4ae388 |
+ tsp->tv_sec = now.tv_sec;
|
|
|
4ae388 |
+ tsp->tv_nsec = now.tv_usec * 1000;
|
|
|
4ae388 |
+ tsp->tv_nsec += 1000000; /* 1 millisecond */
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+static int rbd_exec_fn(struct checker *c, thread_fn *fn)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ struct rbd_checker_context *ct = c->context;
|
|
|
4ae388 |
+ struct timespec tsp;
|
|
|
4ae388 |
+ pthread_attr_t attr;
|
|
|
4ae388 |
+ int rbd_status, r;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (c->sync)
|
|
|
4ae388 |
+ return rbd_check(ct, c->message);
|
|
|
4ae388 |
+ /*
|
|
|
4ae388 |
+ * Async mode
|
|
|
4ae388 |
+ */
|
|
|
4ae388 |
+ r = pthread_mutex_lock(&ct->lock);
|
|
|
4ae388 |
+ if (r != 0) {
|
|
|
4ae388 |
+ condlog(2, "rbd%d mutex lock failed with %d", ct->rbd_bus_id,
|
|
|
4ae388 |
+ r);
|
|
|
4ae388 |
+ MSG(c, "rbd%d thread failed to initialize", ct->rbd_bus_id);
|
|
|
4ae388 |
+ return PATH_WILD;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (ct->running) {
|
|
|
4ae388 |
+ /* Check if checker is still running */
|
|
|
4ae388 |
+ if (ct->thread) {
|
|
|
4ae388 |
+ condlog(3, "rbd%d thread not finished", ct->rbd_bus_id);
|
|
|
4ae388 |
+ rbd_status = PATH_PENDING;
|
|
|
4ae388 |
+ } else {
|
|
|
4ae388 |
+ /* checker done */
|
|
|
4ae388 |
+ ct->running = 0;
|
|
|
4ae388 |
+ rbd_status = ct->state;
|
|
|
4ae388 |
+ strncpy(c->message, ct->message, CHECKER_MSG_LEN);
|
|
|
4ae388 |
+ c->message[CHECKER_MSG_LEN - 1] = '\0';
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ pthread_mutex_unlock(&ct->lock);
|
|
|
4ae388 |
+ } else {
|
|
|
4ae388 |
+ /* Start new checker */
|
|
|
4ae388 |
+ ct->state = PATH_UNCHECKED;
|
|
|
4ae388 |
+ ct->fn = fn;
|
|
|
4ae388 |
+ pthread_spin_lock(&ct->hldr_lock);
|
|
|
4ae388 |
+ ct->holders++;
|
|
|
4ae388 |
+ pthread_spin_unlock(&ct->hldr_lock);
|
|
|
4ae388 |
+ setup_thread_attr(&attr, 32 * 1024, 1);
|
|
|
4ae388 |
+ r = pthread_create(&ct->thread, &attr, rbd_thread, ct);
|
|
|
4ae388 |
+ if (r) {
|
|
|
4ae388 |
+ pthread_mutex_unlock(&ct->lock);
|
|
|
4ae388 |
+ ct->thread = 0;
|
|
|
4ae388 |
+ ct->holders--;
|
|
|
4ae388 |
+ condlog(3, "rbd%d failed to start rbd thread, using sync mode",
|
|
|
4ae388 |
+ ct->rbd_bus_id);
|
|
|
4ae388 |
+ return fn(ct, c->message);
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ pthread_attr_destroy(&attr);
|
|
|
4ae388 |
+ rbd_timeout(&tsp);
|
|
|
4ae388 |
+ r = pthread_cond_timedwait(&ct->active, &ct->lock, &tsp);
|
|
|
4ae388 |
+ rbd_status = ct->state;
|
|
|
4ae388 |
+ strncpy(c->message, ct->message,CHECKER_MSG_LEN);
|
|
|
4ae388 |
+ c->message[CHECKER_MSG_LEN -1] = '\0';
|
|
|
4ae388 |
+ pthread_mutex_unlock(&ct->lock);
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (ct->thread &&
|
|
|
4ae388 |
+ (rbd_status == PATH_PENDING || rbd_status == PATH_UNCHECKED)) {
|
|
|
4ae388 |
+ condlog(3, "rbd%d thread still running",
|
|
|
4ae388 |
+ ct->rbd_bus_id);
|
|
|
4ae388 |
+ ct->running = 1;
|
|
|
4ae388 |
+ rbd_status = PATH_PENDING;
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+ }
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ return rbd_status;
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+void libcheck_repair(struct checker * c)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ struct rbd_checker_context *ct = c->context;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (!ct || !ct->blacklisted)
|
|
|
4ae388 |
+ return;
|
|
|
4ae388 |
+ rbd_exec_fn(c, rbd_repair);
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+int libcheck_check(struct checker * c)
|
|
|
4ae388 |
+{
|
|
|
4ae388 |
+ struct rbd_checker_context *ct = c->context;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (!ct)
|
|
|
4ae388 |
+ return PATH_UNCHECKED;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ if (ct->blacklisted)
|
|
|
4ae388 |
+ return PATH_DOWN;
|
|
|
4ae388 |
+
|
|
|
4ae388 |
+ return rbd_exec_fn(c, rbd_check);
|
|
|
4ae388 |
+}
|
|
|
4ae388 |
Index: multipath-tools-130222/multipath.conf.annotated
|
|
|
4ae388 |
===================================================================
|
|
|
4ae388 |
--- multipath-tools-130222.orig/multipath.conf.annotated
|
|
|
4ae388 |
+++ multipath-tools-130222/multipath.conf.annotated
|
|
|
4ae388 |
@@ -97,7 +97,7 @@
|
|
|
4ae388 |
# # scope : multipath & multipathd
|
|
|
4ae388 |
# # desc : the default method used to determine the paths' state
|
|
|
4ae388 |
# # values : readsector0|tur|emc_clariion|hp_sw|directio|rdac|
|
|
|
4ae388 |
-# cciss_tur|hp_tur
|
|
|
4ae388 |
+# cciss_tur|hp_tur|rbd
|
|
|
4ae388 |
# # default : directio
|
|
|
4ae388 |
# #
|
|
|
4ae388 |
# path_checker directio
|
|
|
4ae388 |
@@ -493,7 +493,7 @@
|
|
|
4ae388 |
# # scope : multipathd & multipathd
|
|
|
4ae388 |
# # desc : path checking algorithm to use to check path state
|
|
|
4ae388 |
# # values : readsector0|tur|emc_clariion|hp_sw|directio|rdac|
|
|
|
4ae388 |
-# # cciss_tur|hp_tur
|
|
|
4ae388 |
+# # cciss_tur|hp_tur|rbd
|
|
|
4ae388 |
# #
|
|
|
4ae388 |
# path_checker directio
|
|
|
4ae388 |
#
|
|
|
4ae388 |
Index: multipath-tools-130222/multipath/multipath.conf.5
|
|
|
4ae388 |
===================================================================
|
|
|
4ae388 |
--- multipath-tools-130222.orig/multipath/multipath.conf.5
|
|
|
4ae388 |
+++ multipath-tools-130222/multipath/multipath.conf.5
|
|
|
4ae388 |
@@ -284,6 +284,9 @@ Check the path state for LSI/Engenio/Net
|
|
|
4ae388 |
.B directio
|
|
|
4ae388 |
Read the first sector with direct I/O.
|
|
|
4ae388 |
.TP
|
|
|
4ae388 |
+.B rbd
|
|
|
4ae388 |
+Check if the path is in the Ceph blacklist.
|
|
|
4ae388 |
+.TP
|
|
|
4ae388 |
Default value is \fIdirectio\fR.
|
|
|
4ae388 |
.RE
|
|
|
4ae388 |
.TP
|