Implement basic automatic KVM postcopy migration (#1262)
author Calum Calder <calumcalder@users.noreply.github.com>
Tue, 1 Aug 2017 10:20:19 +0000 (11:20 +0100)
committer Morg <morg@google.com>
Tue, 1 Aug 2017 10:20:19 +0000 (12:20 +0200)
This commit adds postcopy functionality to ganeti by automatically
switching a migration to postcopy mode if the postcopy-ram or
x-postcopy-ram migration capability flag is set in an instance's
hypervisor parameters.

Signed-off-by: Calum Calder <calumcalder@google.com>
Reviewed-by: Federico Morg Pareschi <morg@google.com>

lib/hypervisor/hv_kvm/__init__.py
man/gnt-instance.rst
test/py/ganeti.hypervisor.hv_kvm_unittest.py

index 3569a20..9a83bcd 100644 (file)
@@ -538,12 +538,14 @@ class KVMHypervisor(hv_base.BaseHypervisor):
   _VIRTIO_NET_PCI = "virtio-net-pci"
   _VIRTIO_BLK_PCI = "virtio-blk-pci"
 
-  _MIGRATION_STATUS_RE = re.compile(r"Migration\s+status:\s+(\w+)",
+  _MIGRATION_STATUS_RE = re.compile(r"Migration\s+status:\s+([-\w]+)",
                                     re.M | re.I)
   _MIGRATION_PROGRESS_RE = \
     re.compile(r"\s*transferred\s+ram:\s+(?P<transferred>\d+)\s+kbytes\s*\n.*"
                r"\s*remaining\s+ram:\s+(?P<remaining>\d+)\s+kbytes\s*\n"
                r"\s*total\s+ram:\s+(?P<total>\d+)\s+kbytes\s*\n", re.I)
+  _MIGRATION_PRECOPY_PASSES_RE = \
+    re.compile(r"\s*dirty sync count:\s+(\d+)", re.I | re.M)
 
   _MIGRATION_INFO_MAX_BAD_ANSWERS = 5
   _MIGRATION_INFO_RETRY_DELAY = 2
@@ -2485,13 +2487,55 @@ class KVMHypervisor(hv_base.BaseHypervisor):
 
     migration_caps = instance.hvparams[constants.HV_KVM_MIGRATION_CAPS]
     if migration_caps:
-      for c in migration_caps.split(_MIGRATION_CAPS_DELIM):
+      capabilities = migration_caps.split(_MIGRATION_CAPS_DELIM)
+      postcopy_enabled = ('x-postcopy-ram' in capabilities
+                          or 'postcopy-ram' in capabilities)
+      for c in capabilities:
         migrate_command = ("migrate_set_capability %s on" % c)
         self._CallMonitorCommand(instance_name, migrate_command)
+    else:
+      postcopy_enabled = False
 
     migrate_command = "migrate -d tcp:%s:%s" % (target, port)
     self._CallMonitorCommand(instance_name, migrate_command)
 
+    if postcopy_enabled:
+      self._PostcopyAfterPrecopy(instance)
+
+  def _PostcopyAfterPrecopy(self, instance):
+    """Enable postcopying RAM after one precopy pass.
+
+    Requires that an instance is currently migrating, and that the
+    postcopy-ram (x-postcopy-ram on QEMU version 2.5 and below)
+    migration capability is enabled in the instance's hypervisor
+    parameters.
+
+    @type instance: L{objects.Instance}
+    @param instance: The instance being migrated.
+
+    """
+    precopy_passes = 0
+    while precopy_passes < 2:
+      migration_status = \
+          self._CallMonitorCommand(instance.name, 'info migrate')
+
+      status_match = self._MIGRATION_STATUS_RE.search(migration_status.stdout)
+      if status_match and status_match.group(1) != 'active':
+        logging.debug('Did not attempt postcopy, migration status: %s'
+          % status_match.group(1))
+        break
+      if migration_status.stderr:
+        logging.debug('Error polling for dirty sync count in '
+          'hv_kvm._PostcopyAfterPrecopy(): %s' % migration_status.stderr)
+        break
+
+      passes_match = \
+          self._MIGRATION_PRECOPY_PASSES_RE.search(migration_status.stdout)
+      if passes_match:
+        precopy_passes = int(passes_match.group(1))
+    else:
+      self._CallMonitorCommand(instance.name, 'migrate_start_postcopy')
+
   def FinalizeMigrationSource(self, instance, success, _):
     """Finalize the instance migration on the source node.
 
index 20d567e..d796684 100644 (file)
@@ -866,10 +866,14 @@ migration\_caps
 
     Enable specific migration capabilities by providing a ":" separated
     list of supported capabilities. QEMU version 1.7.0 defines
-    x-rdma-pin-all, auto-converge, zero-blocks, and xbzrle. Please note
-    that while a combination of xbzrle and auto-converge might speed up
-    the migration process significantly, the first may cause BSOD on
-    Windows8r2 instances running on drbd.
+    x-rdma-pin-all, auto-converge, zero-blocks, and xbzrle. QEMU version
+    2.5 defines x-postcopy-ram and 2.6 renames this to postcopy-ram.
+    If x-postcopy-ram or postcopy-ram is enabled, Ganeti will
+    automatically move a migration to postcopy mode after one iteration
+    of precopying the instance's RAM.
+    Please note that while a combination of xbzrle and auto-converge
+    might speed up the migration process significantly, the first may
+    cause BSOD on Windows8r2 instances running on drbd.
 
 kvm\_path
     Valid for the KVM hypervisor.
index 9d6e648..f59bd7a 100755 (executable)
@@ -704,5 +704,150 @@ class TestKvmCpuPinning(testutils.GanetiTestCase):
       self.assertEqual(mock_process.set_cpu_affinity.call_args_list[1],
                        mock.call([4]))
 
+class TestPostcopyAfterPrecopy(testutils.GanetiTestCase):
+  def setUp(self):
+    super(TestPostcopyAfterPrecopy, self).setUp()
+    kvm_class = 'ganeti.hypervisor.hv_kvm.KVMHypervisor'
+    self.MockOut('qmp', mock.patch('ganeti.hypervisor.hv_kvm.QmpConnection'))
+    self.MockOut('run_cmd', mock.patch('ganeti.utils.RunCmd'))
+    self.MockOut('ensure_dirs', mock.patch('ganeti.utils.EnsureDirs'))
+    self.MockOut('write_file', mock.patch('ganeti.utils.WriteFile'))
+    self.params = constants.HVC_DEFAULTS[constants.HT_KVM].copy()
+
+  def _TestPostcopyAfterPrecopy(self, runcmd, postcopy_started_goal):
+    hypervisor = hv_kvm.KVMHypervisor()
+    self.iteration = 0
+    self.postcopy_started = False
+
+    def runcmd_mock(cmd, env=None, output=None, cwd="/", reset_env=False,
+           interactive=False, timeout=None, noclose_fds=None,
+           input_fd=None, postfork_fn=None):
+      res = utils.RunResult(0, None, '', '', cmd, None, None)
+      if not self.postcopy_started and cmd.find('migrate_start_postcopy') != -1:
+        self.postcopy_started = True
+        res.stdout = ('migrate_postcopy_start\n'
+                      '(qemu) ')
+      return runcmd(cmd, res)
+
+    with mock.patch('ganeti.utils.RunCmd', runcmd_mock):
+      instance = mock.MagicMock()
+      instance.name = 'example.instance'
+      hypervisor._PostcopyAfterPrecopy(instance)
+      self.assertEqual(self.postcopy_started, postcopy_started_goal)
+
+  def testNormal(self):
+    def runcmd_normal(cmd, res):
+      res = utils.RunResult(0, None, '', '', cmd, None, None)
+      if cmd.find('info migrate') != -1:
+        self.iteration += 1
+        res.stdout = (
+            'QEMU 2.5.0 monitor - type \'help\' for more information\n'
+            '(qemu) info migrate\n'
+            'capabilities: xbzrle: off rdma-pin-all: off auto-converge: on'
+            'zero-blocks: off compress: off events: off x-postcopy-ram: on \n'
+            'Migration status: active\n'
+            'skipped: 0 pages\n'
+            'dirty sync count: %i\n'
+            '(qemu) ' % self.iteration
+          )
+      return res
+
+    self._TestPostcopyAfterPrecopy(runcmd_normal, True)
+
+  def testEmptyResponses(self):
+    def runcmd_empty_responses(cmd, res):
+      res = utils.RunResult(0, None, '', '', cmd, None, None)
+      if cmd.find('info migrate') != -1:
+        self.iteration += 1
+        if self.iteration < 3:
+          res.stdout = (
+              'QEMU 2.5.0 monitor - type \'help\' for more information\n'
+              '(qemu) info migrate\n'
+              '(qemu) '
+            )
+        else:
+          res.stdout = (
+              'QEMU 2.5.0 monitor - type \'help\' for more information\n'
+              '(qemu) info migrate\n'
+              'capabilities: xbzrle: off rdma-pin-all: off auto-converge: on'
+              'zero-blocks: off compress: off events: off x-postcopy-ram: on \n'
+              'Migration status: active\n'
+              'skipped: 0 pages\n'
+              'dirty sync count: %i\n'
+              '(qemu) ' % self.iteration
+            )
+      return res
+    self._TestPostcopyAfterPrecopy(runcmd_empty_responses, True)
+
+  def testMonitorRemoved(self):
+    def runcmd_monitor_removed(cmd, res):
+      res = utils.RunResult(0, None, '', '', cmd, None, None)
+      if cmd.find('info migrate') != -1:
+        self.iteration += 1
+        if self.iteration < 3:
+          res.stdout = (
+              'QEMU 2.5.0 monitor - type \'help\' for more information\n'
+              '(qemu) info migrate\n'
+              'capabilities: xbzrle: off rdma-pin-all: off auto-converge: on'
+              'zero-blocks: off compress: off events: off x-postcopy-ram: on \n'
+              'Migration status: active\n'
+              'skipped: 0 pages\n'
+              'dirty sync count: %i\n'
+              '(qemu) '
+            )
+        else:
+          res.stderr = ('2017/07/26 15:49:52 socat[105703] E connect(3, AF=1 '
+                        '"/var/run/ganeti/kvm-hypervisor/ctrl/example.instanc'
+                        'e.monitor", 85): No such file or directory')
+      return res
+    self._TestPostcopyAfterPrecopy(runcmd_monitor_removed, False)
+
+  def testMigrationFailed(self):
+    def runcmd_migration_failed(cmd, res):
+      res = utils.RunResult(0, None, '', '', cmd, None, None)
+      if cmd.find('info migrate') != -1:
+        self.iteration += 1
+        if self.iteration < 3:
+          res.stdout = (
+              'QEMU 2.5.0 monitor - type \'help\' for more information\n'
+              '(qemu) info migrate\n'
+              'capabilities: xbzrle: off rdma-pin-all: off auto-converge: on'
+              'zero-blocks: off compress: off events: off x-postcopy-ram: on \n'
+              'Migration status: active\n'
+              'skipped: 0 pages\n'
+              'dirty sync count: %i\n'
+              '(qemu) '
+            )
+        else:
+          res.stdout = (
+              'QEMU 2.5.0 monitor - type \'help\' for more information\n'
+              '(qemu) info migrate\n'
+              'capabilities: xbzrle: off rdma-pin-all: off auto-converge: on'
+              'zero-blocks: off compress: off events: off x-postcopy-ram: on \n'
+              'Migration status: failed\n'
+              'skipped: 0 pages\n'
+              'dirty sync count: %i\n'
+              '(qemu) '
+            )
+      return res
+    self._TestPostcopyAfterPrecopy(runcmd_migration_failed, False)
+
+  def testAlreadyInPostcopy(self):
+    def runcmd_already_in_postcopy(cmd, res):
+      res = utils.RunResult(0, None, '', '', cmd, None, None)
+      if cmd.find('info migrate') != -1:
+        res.stdout = (
+            'QEMU 2.5.0 monitor - type \'help\' for more information\n'
+            '(qemu) info migrate\n'
+            'capabilities: xbzrle: off rdma-pin-all: off auto-converge: on'
+            'zero-blocks: off compress: off events: off x-postcopy-ram: on \n'
+            'Migration status: postcopy-active\n'
+            'skipped: 0 pages\n'
+            'dirty sync count: %i\n'
+            '(qemu) '
+          )
+      return res
+    self._TestPostcopyAfterPrecopy(runcmd_already_in_postcopy, False)
+
 if __name__ == "__main__":
   testutils.GanetiTestProgram()