Blame SOURCES/0009-net-Wait-for-dhclient-to-daemonize-before-reading-le.patch

936480
From c1c6d0b586e5556a37b9b813afbb4e6a24921adf Mon Sep 17 00:00:00 2001
936480
From: Eduardo Otubo <otubo@redhat.com>
936480
Date: Wed, 23 Jan 2019 12:30:21 +0100
936480
Subject: net: Wait for dhclient to daemonize before reading lease file
936480
936480
RH-Author: Eduardo Otubo <otubo@redhat.com>
936480
Message-id: <20190123123021.32708-1-otubo@redhat.com>
936480
Patchwork-id: 84095
936480
O-Subject: [RHEL-7.7 cloud-init PATCH] net: Wait for dhclient to daemonize before reading lease file
936480
Bugzilla: 1632967
936480
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
936480
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
936480
936480
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1632967
936480
Brew: https://bugzilla.redhat.com/show_bug.cgi?id=1632967
936480
Tested: Me and upstream
936480
936480
commit fdadcb5fae51f4e6799314ab98e3aec56c79b17c
936480
Author: Jason Zions <jasonzio@microsoft.com>
936480
Date:   Tue Jan 15 21:37:17 2019 +0000
936480
936480
    net: Wait for dhclient to daemonize before reading lease file
936480
936480
    cloud-init uses dhclient to fetch the DHCP lease so it can extract
936480
    DHCP options.  dhclient creates the leasefile, then writes to it;
936480
    simply waiting for the leasefile to appear creates a race between
936480
    dhclient and cloud-init. Instead, wait for dhclient to be parented by
936480
    init. At that point, we know it has written to the leasefile, so it's
936480
    safe to copy the file and kill the process.
936480
936480
    cloud-init creates a temporary directory in which to execute dhclient,
936480
    and deletes that directory after it has killed the process. If
936480
    cloud-init abandons waiting for dhclient to daemonize, it will still
936480
    attempt to delete the temporary directory, but will not report an
936480
    exception should that attempt fail.
936480
936480
    LP: #1794399
936480
936480
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
936480
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
936480
---
936480
 cloudinit/net/dhcp.py              | 44 +++++++++++++++++++++++++++-----------
936480
 cloudinit/net/tests/test_dhcp.py   | 15 ++++++++++---
936480
 cloudinit/temp_utils.py            |  4 ++--
936480
 cloudinit/tests/test_temp_utils.py | 18 +++++++++++++++-
936480
 cloudinit/util.py                  | 16 +++++++++++++-
936480
 tests/unittests/test_util.py       |  6 ++++++
936480
 6 files changed, 83 insertions(+), 20 deletions(-)
936480
936480
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py
936480
index 0db991d..c98a97c 100644
936480
--- a/cloudinit/net/dhcp.py
936480
+++ b/cloudinit/net/dhcp.py
936480
@@ -9,6 +9,7 @@ import logging
936480
 import os
936480
 import re
936480
 import signal
936480
+import time
936480
 
936480
 from cloudinit.net import (
936480
     EphemeralIPv4Network, find_fallback_nic, get_devicelist,
936480
@@ -127,7 +128,9 @@ def maybe_perform_dhcp_discovery(nic=None):
936480
     if not dhclient_path:
936480
         LOG.debug('Skip dhclient configuration: No dhclient command found.')
936480
         return []
936480
-    with temp_utils.tempdir(prefix='cloud-init-dhcp-', needs_exe=True) as tdir:
936480
+    with temp_utils.tempdir(rmtree_ignore_errors=True,
936480
+                            prefix='cloud-init-dhcp-',
936480
+                            needs_exe=True) as tdir:
936480
         # Use /var/tmp because /run/cloud-init/tmp is mounted noexec
936480
         return dhcp_discovery(dhclient_path, nic, tdir)
936480
 
936480
@@ -195,24 +198,39 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
936480
            '-pf', pid_file, interface, '-sf', '/bin/true']
936480
     util.subp(cmd, capture=True)
936480
 
936480
-    # dhclient doesn't write a pid file until after it forks when it gets a
936480
-    # proper lease response. Since cleandir is a temp directory that gets
936480
-    # removed, we need to wait for that pidfile creation before the
936480
-    # cleandir is removed, otherwise we get FileNotFound errors.
936480
+    # Wait for pid file and lease file to appear, and for the process
936480
+    # named by the pid file to daemonize (have pid 1 as its parent). If we
936480
+    # try to read the lease file before daemonization happens, we might try
936480
+    # to read it before the dhclient has actually written it. We also have
936480
+    # to wait until the dhclient has become a daemon so we can be sure to
936480
+    # kill the correct process, thus freeing cleandir to be deleted back
936480
+    # up the callstack.
936480
     missing = util.wait_for_files(
936480
         [pid_file, lease_file], maxwait=5, naplen=0.01)
936480
     if missing:
936480
         LOG.warning("dhclient did not produce expected files: %s",
936480
                     ', '.join(os.path.basename(f) for f in missing))
936480
         return []
936480
-    pid_content = util.load_file(pid_file).strip()
936480
-    try:
936480
-        pid = int(pid_content)
936480
-    except ValueError:
936480
-        LOG.debug(
936480
-            "pid file contains non-integer content '%s'", pid_content)
936480
-    else:
936480
-        os.kill(pid, signal.SIGKILL)
936480
+
936480
+    ppid = 'unknown'
936480
+    for _ in range(0, 1000):
936480
+        pid_content = util.load_file(pid_file).strip()
936480
+        try:
936480
+            pid = int(pid_content)
936480
+        except ValueError:
936480
+            pass
936480
+        else:
936480
+            ppid = util.get_proc_ppid(pid)
936480
+            if ppid == 1:
936480
+                LOG.debug('killing dhclient with pid=%s', pid)
936480
+                os.kill(pid, signal.SIGKILL)
936480
+                return parse_dhcp_lease_file(lease_file)
936480
+        time.sleep(0.01)
936480
+
936480
+    LOG.error(
936480
+        'dhclient(pid=%s, parentpid=%s) failed to daemonize after %s seconds',
936480
+        pid_content, ppid, 0.01 * 1000
936480
+    )
936480
     return parse_dhcp_lease_file(lease_file)
936480
 
936480
 
936480
diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py
936480
index cd3e732..79e8842 100644
936480
--- a/cloudinit/net/tests/test_dhcp.py
936480
+++ b/cloudinit/net/tests/test_dhcp.py
936480
@@ -145,16 +145,20 @@ class TestDHCPDiscoveryClean(CiTestCase):
936480
               'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}],
936480
             dhcp_discovery(dhclient_script, 'eth9', tmpdir))
936480
         self.assertIn(
936480
-            "pid file contains non-integer content ''", self.logs.getvalue())
936480
+            "dhclient(pid=, parentpid=unknown) failed "
936480
+            "to daemonize after 10.0 seconds",
936480
+            self.logs.getvalue())
936480
         m_kill.assert_not_called()
936480
 
936480
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
936480
     @mock.patch('cloudinit.net.dhcp.os.kill')
936480
     @mock.patch('cloudinit.net.dhcp.util.wait_for_files')
936480
     @mock.patch('cloudinit.net.dhcp.util.subp')
936480
     def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
936480
                                                                   m_subp,
936480
                                                                   m_wait,
936480
-                                                                  m_kill):
936480
+                                                                  m_kill,
936480
+                                                                  m_getppid):
936480
         """dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
936480
         tmpdir = self.tmp_dir()
936480
         dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
936480
@@ -164,6 +168,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
936480
         pidfile = self.tmp_path('dhclient.pid', tmpdir)
936480
         leasefile = self.tmp_path('dhcp.leases', tmpdir)
936480
         m_wait.return_value = [pidfile]  # Return the missing pidfile wait for
936480
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
936480
         self.assertEqual([], dhcp_discovery(dhclient_script, 'eth9', tmpdir))
936480
         self.assertEqual(
936480
             mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
936480
@@ -173,9 +178,10 @@ class TestDHCPDiscoveryClean(CiTestCase):
936480
             self.logs.getvalue())
936480
         m_kill.assert_not_called()
936480
 
936480
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
936480
     @mock.patch('cloudinit.net.dhcp.os.kill')
936480
     @mock.patch('cloudinit.net.dhcp.util.subp')
936480
-    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
936480
+    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill, m_getppid):
936480
         """dhcp_discovery brings up the interface and runs dhclient.
936480
 
936480
         It also returns the parsed dhcp.leases file generated in the sandbox.
936480
@@ -197,6 +203,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
936480
         pid_file = os.path.join(tmpdir, 'dhclient.pid')
936480
         my_pid = 1
936480
         write_file(pid_file, "%d\n" % my_pid)
936480
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
936480
 
936480
         self.assertItemsEqual(
936480
             [{'interface': 'eth9', 'fixed-address': '192.168.2.74',
936480
@@ -355,3 +362,5 @@ class TestEphemeralDhcpNoNetworkSetup(HttprettyTestCase):
936480
             self.assertEqual(fake_lease, lease)
936480
         # Ensure that dhcp discovery occurs
936480
         m_dhcp.called_once_with()
936480
+
936480
+# vi: ts=4 expandtab
936480
diff --git a/cloudinit/temp_utils.py b/cloudinit/temp_utils.py
936480
index c98a1b5..346276e 100644
936480
--- a/cloudinit/temp_utils.py
936480
+++ b/cloudinit/temp_utils.py
936480
@@ -81,7 +81,7 @@ def ExtendedTemporaryFile(**kwargs):
936480
 
936480
 
936480
 @contextlib.contextmanager
936480
-def tempdir(**kwargs):
936480
+def tempdir(rmtree_ignore_errors=False, **kwargs):
936480
     # This seems like it was only added in python 3.2
936480
     # Make it since its useful...
936480
     # See: http://bugs.python.org/file12970/tempdir.patch
936480
@@ -89,7 +89,7 @@ def tempdir(**kwargs):
936480
     try:
936480
         yield tdir
936480
     finally:
936480
-        shutil.rmtree(tdir)
936480
+        shutil.rmtree(tdir, ignore_errors=rmtree_ignore_errors)
936480
 
936480
 
936480
 def mkdtemp(**kwargs):
936480
diff --git a/cloudinit/tests/test_temp_utils.py b/cloudinit/tests/test_temp_utils.py
936480
index ffbb92c..4a52ef8 100644
936480
--- a/cloudinit/tests/test_temp_utils.py
936480
+++ b/cloudinit/tests/test_temp_utils.py
936480
@@ -2,8 +2,9 @@
936480
 
936480
 """Tests for cloudinit.temp_utils"""
936480
 
936480
-from cloudinit.temp_utils import mkdtemp, mkstemp
936480
+from cloudinit.temp_utils import mkdtemp, mkstemp, tempdir
936480
 from cloudinit.tests.helpers import CiTestCase, wrap_and_call
936480
+import os
936480
 
936480
 
936480
 class TestTempUtils(CiTestCase):
936480
@@ -98,4 +99,19 @@ class TestTempUtils(CiTestCase):
936480
         self.assertEqual('/fake/return/path', retval)
936480
         self.assertEqual([{'dir': '/run/cloud-init/tmp'}], calls)
936480
 
936480
+    def test_tempdir_error_suppression(self):
936480
+        """test tempdir suppresses errors during directory removal."""
936480
+
936480
+        with self.assertRaises(OSError):
936480
+            with tempdir(prefix='cloud-init-dhcp-') as tdir:
936480
+                os.rmdir(tdir)
936480
+                # As a result, the directory is already gone,
936480
+                # so shutil.rmtree should raise OSError
936480
+
936480
+        with tempdir(rmtree_ignore_errors=True,
936480
+                     prefix='cloud-init-dhcp-') as tdir:
936480
+            os.rmdir(tdir)
936480
+            # Since the directory is already gone, shutil.rmtree would raise
936480
+            # OSError, but we suppress that
936480
+
936480
 # vi: ts=4 expandtab
936480
diff --git a/cloudinit/util.py b/cloudinit/util.py
936480
index 7800f7b..a84112a 100644
936480
--- a/cloudinit/util.py
936480
+++ b/cloudinit/util.py
936480
@@ -2861,7 +2861,6 @@ def mount_is_read_write(mount_point):
936480
     mount_opts = result[-1].split(',')
936480
     return mount_opts[0] == 'rw'
936480
 
936480
-
936480
 def udevadm_settle(exists=None, timeout=None):
936480
     """Invoke udevadm settle with optional exists and timeout parameters"""
936480
     settle_cmd = ["udevadm", "settle"]
936480
@@ -2875,5 +2874,20 @@ def udevadm_settle(exists=None, timeout=None):
936480
 
936480
     return subp(settle_cmd)
936480
 
936480
+def get_proc_ppid(pid):
936480
+    """
936480
+    Return the parent pid of a process.
936480
+    """
936480
+    ppid = 0
936480
+    try:
936480
+        contents = load_file("/proc/%s/stat" % pid, quiet=True)
936480
+    except IOError as e:
936480
+        LOG.warning('Failed to load /proc/%s/stat. %s', pid, e)
936480
+    if contents:
936480
+        parts = contents.split(" ", 4)
936480
+        # man proc says
936480
+        #  ppid %d     (4) The PID of the parent.
936480
+        ppid = int(parts[3])
936480
+    return ppid
936480
 
936480
 # vi: ts=4 expandtab
936480
diff --git a/tests/unittests/test_util.py b/tests/unittests/test_util.py
936480
index 5a14479..8aebcd6 100644
936480
--- a/tests/unittests/test_util.py
936480
+++ b/tests/unittests/test_util.py
936480
@@ -1114,6 +1114,12 @@ class TestLoadShellContent(helpers.TestCase):
936480
                 'key3="val3 #tricky"',
936480
                 ''])))
936480
 
936480
+    def test_get_proc_ppid(self):
936480
+        """get_proc_ppid returns correct parent pid value."""
936480
+        my_pid = os.getpid()
936480
+        my_ppid = os.getppid()
936480
+        self.assertEqual(my_ppid, util.get_proc_ppid(my_pid))
936480
+
936480
 
936480
 class TestGetProcEnv(helpers.TestCase):
936480
     """test get_proc_env."""
936480
-- 
936480
1.8.3.1
936480