Blame SOURCES/0576-journald-never-block-when-sending-messages-on-NOTIFY.patch

17b0f1
From c67c643418b1df5b5705b3a72eba1e6755830dc5 Mon Sep 17 00:00:00 2001
17b0f1
From: Jan Synacek <jsynacek@redhat.com>
17b0f1
Date: Tue, 21 Nov 2017 12:46:28 +0100
17b0f1
Subject: [PATCH] journald: never block when sending messages on NOTIFY_SOCKET
17b0f1
 socket
17b0f1
17b0f1
Otherwise we might run into deadlocks, when journald blocks on the
17b0f1
notify socket on PID 1, and PID 1 blocks on IPC to dbus-daemon and
17b0f1
dbus-daemon blocks on logging to journald. Break this cycle by making
17b0f1
sure that journald never ever blocks on PID 1.
17b0f1
17b0f1
Note that this change disables support for event loop watchdog support,
17b0f1
as these messages are sent in blocking style by sd-event. That should
17b0f1
not be a big loss though, as people reported frequent problems with the
17b0f1
watchdog hitting journald on excessively slow IO.
17b0f1
17b0f1
Fixes: #1505.
17b0f1
(cherry-picked from commit e22aa3d3284709234f086ebebc13a905a295b7a7)
17b0f1
17b0f1
Resolves: #1511565
17b0f1
---
17b0f1
 src/journal/journald-server.c     | 130 +++++++++++++++++++++++++++++-
17b0f1
 src/journal/journald-server.h     |  13 ++-
17b0f1
 src/journal/journald-stream.c     |  68 ++++++++++++++--
17b0f1
 src/journal/journald-stream.h     |   3 +
17b0f1
 src/journal/journald.c            |   8 --
17b0f1
 units/systemd-journald.service.in |   1 -
17b0f1
 6 files changed, 201 insertions(+), 22 deletions(-)
17b0f1
17b0f1
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
17b0f1
index daeecd5191..a810829b24 100644
17b0f1
--- a/src/journal/journald-server.c
17b0f1
+++ b/src/journal/journald-server.c
17b0f1
@@ -67,6 +67,8 @@
17b0f1
 
17b0f1
 #define RECHECK_AVAILABLE_SPACE_USEC (30*USEC_PER_SEC)
17b0f1
 
17b0f1
+#define NOTIFY_SNDBUF_SIZE (8*1024*1024)
17b0f1
+
17b0f1
 /* Pick a good default that is likely to fit into AF_UNIX and AF_INET SOCK_DGRAM datagrams, and even leaves some room
17b0f1
 + * for a bit of additional metadata. */
17b0f1
 #define DEFAULT_LINE_MAX (48*1024)
17b0f1
@@ -1556,6 +1558,126 @@ static int server_open_hostname(Server *s) {
17b0f1
         return 0;
17b0f1
 }
17b0f1
 
17b0f1
+static int dispatch_notify_event(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
17b0f1
+        Server *s = userdata;
17b0f1
+        int r;
17b0f1
+
17b0f1
+        assert(s);
17b0f1
+        assert(s->notify_event_source == es);
17b0f1
+        assert(s->notify_fd == fd);
17b0f1
+
17b0f1
+        if (revents != EPOLLOUT) {
17b0f1
+                log_error("Invalid events on notify file descriptor.");
17b0f1
+                return -EINVAL;
17b0f1
+        }
17b0f1
+
17b0f1
+        /* The $NOTIFY_SOCKET is writable again, now send exactly one
17b0f1
+         * message on it. Either it's the initial READY=1 event or an
17b0f1
+         * stdout stream event. If there's nothing to write anymore,
17b0f1
+         * turn our event source off. The next time there's something
17b0f1
+         * to send it will be turned on again. */
17b0f1
+
17b0f1
+        if (!s->sent_notify_ready) {
17b0f1
+                static const char p[] =
17b0f1
+                        "READY=1\n"
17b0f1
+                        "STATUS=Processing requests...";
17b0f1
+                ssize_t l;
17b0f1
+
17b0f1
+                l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT);
17b0f1
+                if (l < 0) {
17b0f1
+                        if (errno == EAGAIN)
17b0f1
+                                return 0;
17b0f1
+
17b0f1
+                        return log_error_errno(errno, "Failed to send READY=1 notification message: %m");
17b0f1
+                }
17b0f1
+
17b0f1
+                s->sent_notify_ready = true;
17b0f1
+                log_debug("Sent READY=1 notification.");
17b0f1
+
17b0f1
+        } else if (s->stdout_streams_notify_queue)
17b0f1
+                /* Dispatch one stream notification event */
17b0f1
+                stdout_stream_send_notify(s->stdout_streams_notify_queue);
17b0f1
+
17b0f1
+        /* Leave us enabled if there's still more to to do. */
17b0f1
+        if (s->stdout_streams_notify_queue)
17b0f1
+                return 0;
17b0f1
+
17b0f1
+        /* There was nothing to do anymore, let's turn ourselves off. */
17b0f1
+        r = sd_event_source_set_enabled(es, SD_EVENT_OFF);
17b0f1
+        if (r < 0)
17b0f1
+                return log_error_errno(r, "Failed to turn off notify event source: %m");
17b0f1
+
17b0f1
+        return 0;
17b0f1
+}
17b0f1
+
17b0f1
+static int server_connect_notify(Server *s) {
17b0f1
+        union sockaddr_union sa = {
17b0f1
+                .un.sun_family = AF_UNIX,
17b0f1
+        };
17b0f1
+        const char *e;
17b0f1
+        int r;
17b0f1
+
17b0f1
+        assert(s);
17b0f1
+        assert(s->notify_fd < 0);
17b0f1
+        assert(!s->notify_event_source);
17b0f1
+
17b0f1
+        /*
17b0f1
+          So here's the problem: we'd like to send notification
17b0f1
+          messages to PID 1, but we cannot do that via sd_notify(),
17b0f1
+          since that's synchronous, and we might end up blocking on
17b0f1
+          it. Specifically: given that PID 1 might block on
17b0f1
+          dbus-daemon during IPC, and dbus-daemon is logging to us,
17b0f1
+          and might hence block on us, we might end up in a deadlock
17b0f1
+          if we block on sending PID 1 notification messages -- by
17b0f1
+          generating a full blocking circle. To avoid this, let's
17b0f1
+          create a non-blocking socket, and connect it to the
17b0f1
+          notification socket, and then wait for POLLOUT before we
17b0f1
+          send anything. This should efficiently avoid any deadlocks,
17b0f1
+          as we'll never block on PID 1, hence PID 1 can safely block
17b0f1
+          on dbus-daemon which can safely block on us again.
17b0f1
+
17b0f1
+          Don't think that this issue is real? It is, see:
17b0f1
+          https://github.com/systemd/systemd/issues/1505
17b0f1
+        */
17b0f1
+
17b0f1
+        e = getenv("NOTIFY_SOCKET");
17b0f1
+        if (!e)
17b0f1
+                return 0;
17b0f1
+
17b0f1
+        if ((e[0] != '@' && e[0] != '/') || e[1] == 0) {
17b0f1
+                log_error("NOTIFY_SOCKET set to an invalid value: %s", e);
17b0f1
+                return -EINVAL;
17b0f1
+        }
17b0f1
+
17b0f1
+        if (strlen(e) > sizeof(sa.un.sun_path)) {
17b0f1
+                log_error("NOTIFY_SOCKET path too long: %s", e);
17b0f1
+                return -EINVAL;
17b0f1
+        }
17b0f1
+
17b0f1
+        s->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
17b0f1
+        if (s->notify_fd < 0)
17b0f1
+                return log_error_errno(errno, "Failed to create notify socket: %m");
17b0f1
+
17b0f1
+        (void) fd_inc_sndbuf(s->notify_fd, NOTIFY_SNDBUF_SIZE);
17b0f1
+
17b0f1
+        strncpy(sa.un.sun_path, e, sizeof(sa.un.sun_path));
17b0f1
+        if (sa.un.sun_path[0] == '@')
17b0f1
+                sa.un.sun_path[0] = 0;
17b0f1
+
17b0f1
+        r = connect(s->notify_fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(e));
17b0f1
+        if (r < 0)
17b0f1
+                return log_error_errno(errno, "Failed to connect to notify socket: %m");
17b0f1
+
17b0f1
+        r = sd_event_add_io(s->event, &s->notify_event_source, s->notify_fd, EPOLLOUT, dispatch_notify_event, s);
17b0f1
+        if (r < 0)
17b0f1
+                return log_error_errno(r, "Failed to watch notification socket: %m");
17b0f1
+
17b0f1
+        /* This should fire pretty soon, which we'll use to send the
17b0f1
+         * READY=1 event. */
17b0f1
+
17b0f1
+        return 0;
17b0f1
+}
17b0f1
+
17b0f1
 int server_init(Server *s) {
17b0f1
         _cleanup_fdset_free_ FDSet *fds = NULL;
17b0f1
         int n, r, fd;
17b0f1
@@ -1563,7 +1685,7 @@ int server_init(Server *s) {
17b0f1
         assert(s);
17b0f1
 
17b0f1
         zero(*s);
17b0f1
-        s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = -1;
17b0f1
+        s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = s->notify_fd = -1;
17b0f1
         s->compress = true;
17b0f1
         s->seal = true;
17b0f1
 
17b0f1
@@ -1611,8 +1733,6 @@ int server_init(Server *s) {
17b0f1
         if (r < 0)
17b0f1
                 return log_error_errno(r, "Failed to create event loop: %m");
17b0f1
 
17b0f1
-        sd_event_set_watchdog(s->event, true);
17b0f1
-
17b0f1
         n = sd_listen_fds(true);
17b0f1
         if (n < 0)
17b0f1
                 return log_error_errno(n, "Failed to read listening file descriptors from environment: %m");
17b0f1
@@ -1718,6 +1838,8 @@ int server_init(Server *s) {
17b0f1
         server_cache_boot_id(s);
17b0f1
         server_cache_machine_id(s);
17b0f1
 
17b0f1
+        (void) server_connect_notify(s);
17b0f1
+
17b0f1
         r = system_journal_open(s, false);
17b0f1
         if (r < 0)
17b0f1
                 return r;
17b0f1
@@ -1770,6 +1892,7 @@ void server_done(Server *s) {
17b0f1
         sd_event_source_unref(s->sigterm_event_source);
17b0f1
         sd_event_source_unref(s->sigint_event_source);
17b0f1
         sd_event_source_unref(s->hostname_event_source);
17b0f1
+        sd_event_source_unref(s->notify_event_source);
17b0f1
         sd_event_unref(s->event);
17b0f1
 
17b0f1
         safe_close(s->syslog_fd);
17b0f1
@@ -1778,6 +1901,7 @@ void server_done(Server *s) {
17b0f1
         safe_close(s->dev_kmsg_fd);
17b0f1
         safe_close(s->audit_fd);
17b0f1
         safe_close(s->hostname_fd);
17b0f1
+        safe_close(s->notify_fd);
17b0f1
 
17b0f1
         if (s->rate_limit)
17b0f1
                 journal_rate_limit_free(s->rate_limit);
17b0f1
diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h
17b0f1
index b294107788..e59ff35e22 100644
17b0f1
--- a/src/journal/journald-server.h
17b0f1
+++ b/src/journal/journald-server.h
17b0f1
@@ -27,12 +27,15 @@
17b0f1
 #include <sys/types.h>
17b0f1
 #include <sys/socket.h>
17b0f1
 
17b0f1
+typedef struct Server Server;
17b0f1
+
17b0f1
 #include "sd-event.h"
17b0f1
 #include "journal-file.h"
17b0f1
 #include "hashmap.h"
17b0f1
 #include "util.h"
17b0f1
 #include "audit.h"
17b0f1
 #include "journald-rate-limit.h"
17b0f1
+#include "journald-stream.h"
17b0f1
 #include "list.h"
17b0f1
 
17b0f1
 typedef enum Storage {
17b0f1
@@ -52,15 +55,14 @@ typedef enum SplitMode {
17b0f1
         _SPLIT_INVALID = -1
17b0f1
 } SplitMode;
17b0f1
 
17b0f1
-typedef struct StdoutStream StdoutStream;
17b0f1
-
17b0f1
-typedef struct Server {
17b0f1
+struct Server {
17b0f1
         int syslog_fd;
17b0f1
         int native_fd;
17b0f1
         int stdout_fd;
17b0f1
         int dev_kmsg_fd;
17b0f1
         int audit_fd;
17b0f1
         int hostname_fd;
17b0f1
+        int notify_fd;
17b0f1
 
17b0f1
         sd_event *event;
17b0f1
 
17b0f1
@@ -75,6 +77,7 @@ typedef struct Server {
17b0f1
         sd_event_source *sigterm_event_source;
17b0f1
         sd_event_source *sigint_event_source;
17b0f1
         sd_event_source *hostname_event_source;
17b0f1
+        sd_event_source *notify_event_source;
17b0f1
 
17b0f1
         JournalFile *runtime_journal;
17b0f1
         JournalFile *system_journal;
17b0f1
@@ -114,6 +117,7 @@ typedef struct Server {
17b0f1
         usec_t oldest_file_usec;
17b0f1
 
17b0f1
         LIST_HEAD(StdoutStream, stdout_streams);
17b0f1
+        LIST_HEAD(StdoutStream, stdout_streams_notify_queue);
17b0f1
         unsigned n_stdout_streams;
17b0f1
 
17b0f1
         char *tty_path;
17b0f1
@@ -135,6 +139,7 @@ typedef struct Server {
17b0f1
 
17b0f1
         struct udev *udev;
17b0f1
 
17b0f1
+        bool sent_notify_ready;
17b0f1
         bool sync_scheduled;
17b0f1
 
17b0f1
         char machine_id_field[sizeof("_MACHINE_ID=") + 32];
17b0f1
@@ -145,7 +150,7 @@ typedef struct Server {
17b0f1
         char *cgroup_root;
17b0f1
 
17b0f1
         size_t line_max;
17b0f1
-} Server;
17b0f1
+};
17b0f1
 
17b0f1
 #define N_IOVEC_META_FIELDS 20
17b0f1
 #define N_IOVEC_KERNEL_FIELDS 64
17b0f1
diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c
17b0f1
index 4d6b7ad189..9118d1a318 100644
17b0f1
--- a/src/journal/journald-stream.c
17b0f1
+++ b/src/journal/journald-stream.c
17b0f1
@@ -80,6 +80,7 @@ struct StdoutStream {
17b0f1
         bool forward_to_console:1;
17b0f1
 
17b0f1
         bool fdstore:1;
17b0f1
+        bool in_notify_queue:1;
17b0f1
 
17b0f1
         char *buffer;
17b0f1
         size_t length;
17b0f1
@@ -90,6 +91,7 @@ struct StdoutStream {
17b0f1
         char *state_file;
17b0f1
 
17b0f1
         LIST_FIELDS(StdoutStream, stdout_stream);
17b0f1
+        LIST_FIELDS(StdoutStream, stdout_stream_notify_queue);
17b0f1
 
17b0f1
         char id_field[sizeof("_STREAM_ID=")-1 + SD_ID128_STRING_MAX];
17b0f1
 };
17b0f1
@@ -102,6 +104,9 @@ void stdout_stream_free(StdoutStream *s) {
17b0f1
                 assert(s->server->n_stdout_streams > 0);
17b0f1
                 s->server->n_stdout_streams --;
17b0f1
                 LIST_REMOVE(stdout_stream, s->server->stdout_streams, s);
17b0f1
+
17b0f1
+                if (s->in_notify_queue)
17b0f1
+                        LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
17b0f1
         }
17b0f1
 
17b0f1
         if (s->event_source) {
17b0f1
@@ -126,7 +131,7 @@ static void stdout_stream_destroy(StdoutStream *s) {
17b0f1
                 return;
17b0f1
 
17b0f1
         if (s->state_file)
17b0f1
-                unlink(s->state_file);
17b0f1
+                (void) unlink(s->state_file);
17b0f1
 
17b0f1
         stdout_stream_free(s);
17b0f1
 }
17b0f1
@@ -210,11 +215,15 @@ static int stdout_stream_save(StdoutStream *s) {
17b0f1
         free(temp_path);
17b0f1
         temp_path = NULL;
17b0f1
 
17b0f1
-        /* Store the connection fd in PID 1, so that we get it passed
17b0f1
-         * in again on next start */
17b0f1
-        if (!s->fdstore) {
17b0f1
-                sd_pid_notify_with_fds(0, false, "FDSTORE=1", &s->fd, 1);
17b0f1
-                s->fdstore = true;
17b0f1
+        if (!s->fdstore && !s->in_notify_queue) {
17b0f1
+                LIST_PREPEND(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
17b0f1
+                s->in_notify_queue = true;
17b0f1
+
17b0f1
+                if (s->server->notify_event_source) {
17b0f1
+                        r = sd_event_source_set_enabled(s->server->notify_event_source, SD_EVENT_ON);
17b0f1
+                        if (r < 0)
17b0f1
+                                log_warning_errno(r, "Failed to enable notify event source: %m");
17b0f1
+                }
17b0f1
         }
17b0f1
 
17b0f1
 finish:
17b0f1
@@ -801,3 +810,50 @@ int server_open_stdout_socket(Server *s, FDSet *fds) {
17b0f1
 
17b0f1
         return 0;
17b0f1
 }
17b0f1
+
17b0f1
+void stdout_stream_send_notify(StdoutStream *s) {
17b0f1
+        struct iovec iovec = {
17b0f1
+                .iov_base = (char*) "FDSTORE=1",
17b0f1
+                .iov_len = strlen("FDSTORE=1"),
17b0f1
+        };
17b0f1
+        struct msghdr msghdr = {
17b0f1
+                .msg_iov = &iovec,
17b0f1
+                .msg_iovlen = 1,
17b0f1
+        };
17b0f1
+        struct cmsghdr *cmsg;
17b0f1
+        ssize_t l;
17b0f1
+
17b0f1
+        assert(s);
17b0f1
+        assert(!s->fdstore);
17b0f1
+        assert(s->in_notify_queue);
17b0f1
+        assert(s->server);
17b0f1
+        assert(s->server->notify_fd >= 0);
17b0f1
+
17b0f1
+        /* Store the connection fd in PID 1, so that we get it passed
17b0f1
+         * in again on next start */
17b0f1
+
17b0f1
+        msghdr.msg_controllen = CMSG_SPACE(sizeof(int));
17b0f1
+        msghdr.msg_control = alloca0(msghdr.msg_controllen);
17b0f1
+
17b0f1
+        cmsg = CMSG_FIRSTHDR(&msghdr);
17b0f1
+        cmsg->cmsg_level = SOL_SOCKET;
17b0f1
+        cmsg->cmsg_type = SCM_RIGHTS;
17b0f1
+        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
17b0f1
+
17b0f1
+        memcpy(CMSG_DATA(cmsg), &s->fd, sizeof(int));
17b0f1
+
17b0f1
+        l = sendmsg(s->server->notify_fd, &msghdr, MSG_DONTWAIT|MSG_NOSIGNAL);
17b0f1
+        if (l < 0) {
17b0f1
+                if (errno == EAGAIN)
17b0f1
+                        return;
17b0f1
+
17b0f1
+                log_error_errno(errno, "Failed to send stream file descriptor to service manager: %m");
17b0f1
+        } else {
17b0f1
+                log_debug("Successfully sent stream file descriptor to service manager.");
17b0f1
+                s->fdstore = 1;
17b0f1
+        }
17b0f1
+
17b0f1
+        LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
17b0f1
+        s->in_notify_queue = false;
17b0f1
+
17b0f1
+}
17b0f1
diff --git a/src/journal/journald-stream.h b/src/journal/journald-stream.h
17b0f1
index 94bf955d78..fd8c94fb65 100644
17b0f1
--- a/src/journal/journald-stream.h
17b0f1
+++ b/src/journal/journald-stream.h
17b0f1
@@ -21,9 +21,12 @@
17b0f1
   along with systemd; If not, see <http://www.gnu.org/licenses/>.
17b0f1
 ***/
17b0f1
 
17b0f1
+typedef struct StdoutStream StdoutStream;
17b0f1
+
17b0f1
 #include "fdset.h"
17b0f1
 #include "journald-server.h"
17b0f1
 
17b0f1
 int server_open_stdout_socket(Server *s, FDSet *fds);
17b0f1
 
17b0f1
 void stdout_stream_free(StdoutStream *s);
17b0f1
+void stdout_stream_send_notify(StdoutStream *s);
17b0f1
diff --git a/src/journal/journald.c b/src/journal/journald.c
17b0f1
index 15bbcbe3de..b7ba2b6ec5 100644
17b0f1
--- a/src/journal/journald.c
17b0f1
+++ b/src/journal/journald.c
17b0f1
@@ -64,10 +64,6 @@ int main(int argc, char *argv[]) {
17b0f1
         log_debug("systemd-journald running as pid "PID_FMT, getpid());
17b0f1
         server_driver_message(&server, SD_MESSAGE_JOURNAL_START, "Journal started");
17b0f1
 
17b0f1
-        sd_notify(false,
17b0f1
-                  "READY=1\n"
17b0f1
-                  "STATUS=Processing requests...");
17b0f1
-
17b0f1
         for (;;) {
17b0f1
                 usec_t t = USEC_INFINITY, n;
17b0f1
 
17b0f1
@@ -120,10 +116,6 @@ int main(int argc, char *argv[]) {
17b0f1
         server_driver_message(&server, SD_MESSAGE_JOURNAL_STOP, "Journal stopped");
17b0f1
 
17b0f1
 finish:
17b0f1
-        sd_notify(false,
17b0f1
-                  "STOPPING=1\n"
17b0f1
-                  "STATUS=Shutting down...");
17b0f1
-
17b0f1
         server_done(&server);
17b0f1
 
17b0f1
         return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
17b0f1
diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in
17b0f1
index 0d1ea61fe8..c94c0bfba1 100644
17b0f1
--- a/units/systemd-journald.service.in
17b0f1
+++ b/units/systemd-journald.service.in
17b0f1
@@ -22,7 +22,6 @@ RestartSec=0
17b0f1
 StandardOutput=null
17b0f1
 FileDescriptorStoreMax=4224
17b0f1
 CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_AUDIT_READ CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID CAP_MAC_OVERRIDE
17b0f1
-WatchdogSec=3min
17b0f1
 
17b0f1
 # Increase the default a bit in order to allow many simultaneous
17b0f1
 # services being run since we keep one fd open per service. Also, when