Improve error message for replace-disks
diff --git a/lib/cmdlib/instance_storage.py b/lib/cmdlib/instance_storage.py
index 6f0b089..b1fea8b 100644
--- a/lib/cmdlib/instance_storage.py
+++ b/lib/cmdlib/instance_storage.py
@@ -179,7 +179,7 @@ def IsExclusiveStorageEnabledNodeName(cfg, nodename):
   return IsExclusiveStorageEnabledNode(cfg, ni)
 
 
-def CreateBlockDev(lu, node, instance, device, force_create, info,
+def _CreateBlockDev(lu, node, instance, device, force_create, info,
                     force_open):
   """Wrapper around L{_CreateBlockDevInner}.
 
@@ -192,7 +192,26 @@ def CreateBlockDev(lu, node, instance, device, force_create, info,
                               force_open, excl_stor)
 
 
-def CreateDisks(lu, instance, to_skip=None, target_node=None):
+def _UndoCreateDisks(lu, disks_created):
+  """Undo the work performed by L{CreateDisks}.
+
+  This function is called in case of an error to undo the work of
+  L{CreateDisks}.
+
+  @type lu: L{LogicalUnit}
+  @param lu: the logical unit on whose behalf we execute
+  @param disks_created: the result returned by L{CreateDisks}
+
+  """
+  for (node, disk) in disks_created:
+    lu.cfg.SetDiskID(disk, node)
+    result = lu.rpc.call_blockdev_remove(node, disk)
+    if result.fail_msg:
+      logging.warning("Failed to remove newly-created disk %s on node %s:"
+                      " %s", disk, node, result.fail_msg)
+
+
+def CreateDisks(lu, instance, to_skip=None, target_node=None, disks=None):
   """Create all disks for an instance.
 
   This abstracts away some work from AddInstance.
@@ -205,8 +224,12 @@ def CreateDisks(lu, instance, to_skip=None, target_node=None):
   @param to_skip: list of indices to skip
   @type target_node: string
   @param target_node: if passed, overrides the target node for creation
-  @rtype: boolean
-  @return: the success of the creation
+  @type disks: list of L{objects.Disk}
+  @param disks: the disks to create; if not specified, all the disks of the
+      instance are created
+  @return: information about the created disks, to be used to call
+      L{_UndoCreateDisks}
+  @raise errors.OpExecError: in case of error
 
   """
   info = GetInstanceInfoText(instance)
@@ -217,6 +240,9 @@ def CreateDisks(lu, instance, to_skip=None, target_node=None):
     pnode = target_node
     all_nodes = [pnode]
 
+  if disks is None:
+    disks = instance.disks
+
   if instance.disk_template in constants.DTS_FILEBASED:
     file_storage_dir = os.path.dirname(instance.disks[0].logical_id[1])
     result = lu.rpc.call_file_storage_dir_create(pnode, file_storage_dir)
@@ -225,32 +251,22 @@ def CreateDisks(lu, instance, to_skip=None, target_node=None):
                  " node %s" % (file_storage_dir, pnode))
 
   disks_created = []
-  # Note: this needs to be kept in sync with adding of disks in
-  # LUInstanceSetParams
-  for idx, device in enumerate(instance.disks):
+  for idx, device in enumerate(disks):
     if to_skip and idx in to_skip:
       continue
     logging.info("Creating disk %s for instance '%s'", idx, instance.name)
-    #HARDCODE
     for node in all_nodes:
       f_create = node == pnode
       try:
-        CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
+        _CreateBlockDev(lu, node, instance, device, f_create, info, f_create)
         disks_created.append((node, device))
-      except errors.OpExecError:
-        logging.warning("Creating disk %s for instance '%s' failed",
-                        idx, instance.name)
       except errors.DeviceCreationError, e:
         logging.warning("Creating disk %s for instance '%s' failed",
                         idx, instance.name)
         disks_created.extend(e.created_devices)
-        for (node, disk) in disks_created:
-          lu.cfg.SetDiskID(disk, node)
-          result = lu.rpc.call_blockdev_remove(node, disk)
-          if result.fail_msg:
-            logging.warning("Failed to remove newly-created disk %s on node %s:"
-                            " %s", device, node, result.fail_msg)
+        _UndoCreateDisks(lu, disks_created)
         raise errors.OpExecError(e.message)
+  return disks_created
 
 
 def ComputeDiskSizePerVG(disk_template, disks):
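
The hunk above turns disk creation into a create/rollback pair: CreateDisks now returns the (node, disk) pairs it actually created, and _UndoCreateDisks removes exactly those and nothing else. A minimal sketch of the intended calling pattern (the follow-up step is hypothetical; only the CreateDisks/_UndoCreateDisks names come from this patch):

    disks_created = CreateDisks(lu, instance)
    try:
      _SomeFollowUpStep(lu, instance)     # hypothetical later step that may fail
    except errors.OpExecError:
      # roll back only what CreateDisks reported as created
      _UndoCreateDisks(lu, disks_created)
      raise

WipeOrCleanupDisks further down is exactly this pattern, packaged for the wipe step of LUInstanceRecreateDisks.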
@@ -800,7 +816,14 @@ class LUInstanceRecreateDisks(LogicalUnit):
     # All touched nodes must be locked
     mylocks = self.owned_locks(locking.LEVEL_NODE)
     assert mylocks.issuperset(frozenset(instance.all_nodes))
-    CreateDisks(self, instance, to_skip=to_skip)
+    new_disks = CreateDisks(self, instance, to_skip=to_skip)
+
+    # TODO: Release node locks before wiping, or explain why it's not possible
+    if self.cfg.GetClusterInfo().prealloc_wipe_disks:
+      wipedisks = [(idx, disk, 0)
+                   for (idx, disk) in enumerate(instance.disks)
+                   if idx not in to_skip]
+      WipeOrCleanupDisks(self, instance, disks=wipedisks, cleanup=new_disks)
 
 
 def _CheckNodesFreeDiskOnVG(lu, nodenames, vg, requested):
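
For reference, the third element of each wipedisks tuple is the offset at which wiping starts: the recreate path above wipes every newly created disk from offset 0, whereas the grow-disk path elsewhere in this file passes the previous disk size so that only the added space is wiped. A hedged comparison of the two call shapes (names taken from the hunk above; the grow-disk call is paraphrased from code outside this diff):

    # recreate-disks: wipe each recreated disk completely, and remove the
    # freshly created disks again if wiping fails
    WipeOrCleanupDisks(self, instance, disks=wipedisks, cleanup=new_disks)

    # grow-disk: start wiping at the old size, i.e. wipe only the new space
    WipeDisks(self, instance, disks=[(self.op.disk, disk, old_disk_size)])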
@@ -990,6 +1013,28 @@ def WipeDisks(lu, instance, disks=None):
                         " failed", idx, instance.name)
 
 
+def WipeOrCleanupDisks(lu, instance, disks=None, cleanup=None):
+  """Wrapper for L{WipeDisks} that handles errors.
+
+  @type lu: L{LogicalUnit}
+  @param lu: the logical unit on whose behalf we execute
+  @type instance: L{objects.Instance}
+  @param instance: the instance whose disks we should wipe
+  @param disks: see L{WipeDisks}
+  @param cleanup: the result returned by L{CreateDisks}, used for cleanup in
+      case of error
+  @raise errors.OpExecError: in case of failure
+
+  """
+  try:
+    WipeDisks(lu, instance, disks=disks)
+  except errors.OpExecError:
+    logging.warning("Wiping disks for instance '%s' failed",
+                    instance.name)
+    _UndoCreateDisks(lu, cleanup)
+    raise
+
+
 def ExpandCheckDisks(instance, disks):
   """Return the instance disks selected by the disks list
 
@@ -1004,7 +1049,8 @@ def ExpandCheckDisks(instance, disks):
   else:
     if not set(disks).issubset(instance.disks):
       raise errors.ProgrammerError("Can only act on disks belonging to the"
-                                   " target instance")
+                                   " target instance: expected a subset of %r,"
+                                   " got %r" % (instance.disks, disks))
     return disks
 
 
@@ -1093,6 +1139,7 @@ def ShutdownInstanceDisks(lu, instance, disks=None, ignore_primary=False):
   ignored.
 
   """
+  lu.cfg.MarkInstanceDisksInactive(instance.name)
   all_result = True
   disks = ExpandCheckDisks(instance, disks)
 
@@ -1159,6 +1206,10 @@ def AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
   # into any other network-connected state (Connected, SyncTarget,
   # SyncSource, etc.)
 
+  # mark instance disks as active before doing actual work, so watcher does
+  # not try to shut them down erroneously
+  lu.cfg.MarkInstanceDisksActive(iname)
+
   # 1st pass, assemble on all nodes in secondary mode
   for idx, inst_disk in enumerate(disks):
     for node, node_disk in inst_disk.ComputeNodeTree(instance.primary_node):
@@ -1210,6 +1261,9 @@ def AssembleInstanceDisks(lu, instance, disks=None, ignore_secondaries=False,
   for disk in disks:
     lu.cfg.SetDiskID(disk, instance.primary_node)
 
+  if not disks_ok:
+    lu.cfg.MarkInstanceDisksInactive(iname)
+
   return disks_ok, device_info
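
The MarkInstanceDisksActive/MarkInstanceDisksInactive calls added in the surrounding hunks keep a dedicated disks_active flag in the configuration in step with the real device state: an instance can have active disks while being administratively down (e.g. after gnt-instance activate-disks), which is why the hunks further down test disks_active instead of admin_state. The bookkeeping, summarised as a sketch (method names as used above):

    # shutdown path: clear the flag before tearing the devices down
    lu.cfg.MarkInstanceDisksInactive(instance.name)

    # assemble path: set the flag first so the watcher does not shut the
    # devices down while we work, and clear it again if assembly fails
    lu.cfg.MarkInstanceDisksActive(iname)
    if not disks_ok:
      lu.cfg.MarkInstanceDisksInactive(iname)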
 
 
@@ -1358,6 +1412,7 @@ class LUInstanceGrowDisk(LogicalUnit):
 
     if wipe_disks:
       # Get disk size from primary node for wiping
+      self.cfg.SetDiskID(disk, instance.primary_node)
       result = self.rpc.call_blockdev_getsize(instance.primary_node, [disk])
       result.Raise("Failed to retrieve disk size from node '%s'" %
                    instance.primary_node)
@@ -1414,9 +1469,9 @@ class LUInstanceGrowDisk(LogicalUnit):
       if disk_abort:
         self.LogWarning("Disk syncing has not returned a good status; check"
                         " the instance")
-      if instance.admin_state != constants.ADMINST_UP:
+      if not instance.disks_active:
         _SafeShutdownInstanceDisks(self, instance, disks=[disk])
-    elif instance.admin_state != constants.ADMINST_UP:
+    elif not instance.disks_active:
       self.LogWarning("Not shutting down the disk even if the instance is"
                       " not supposed to be running because no wait for"
                       " sync mode was requested")
@@ -1604,6 +1659,7 @@ class LUInstanceActivateDisks(NoHooksLU):
 
     if self.op.wait_for_sync:
       if not WaitForSync(self, self.instance):
+        self.cfg.MarkInstanceDisksInactive(self.instance.name)
         raise errors.OpExecError("Some disks of the instance are degraded!")
 
     return disks_info
@@ -1989,7 +2045,7 @@ class TLReplaceDisks(Tasklet):
     feedback_fn("Current secondary node: %s" %
                 utils.CommaJoin(self.instance.secondary_nodes))
 
-    activate_disks = (self.instance.admin_state != constants.ADMINST_UP)
+    activate_disks = not self.instance.disks_active
 
     # Activate the instance disks if we're replacing them on a down instance
     if activate_disks:
@@ -2055,8 +2111,14 @@ class TLReplaceDisks(Tasklet):
         if msg or not result.payload:
           if not msg:
             msg = "disk not found"
-          raise errors.OpExecError("Can't find disk/%d on node %s: %s" %
-                                   (idx, node, msg))
+          if not self._CheckDisksActivated(self.instance):
+            extra_hint = ("\nDisks seem to be not properly activated. Try"
+                          " running activate-disks on the instance before"
+                          " using replace-disks.")
+          else:
+            extra_hint = ""
+          raise errors.OpExecError("Can't find disk/%d on node %s: %s%s" %
+                                   (idx, node, msg, extra_hint))
 
   def _CheckDisksConsistency(self, node_name, on_primary, ldisk):
     for idx, dev in enumerate(self.instance.disks):
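
This is the change the commit title refers to: if a disk cannot be found on a node and the instance's disks do not appear to be activated, the error now carries a hint for the operator. The resulting message would look roughly like this (disk index and node name are illustrative):

    Can't find disk/0 on node node2.example.com: disk not found
    Disks seem to be not properly activated. Try running activate-disks on the
    instance before using replace-disks.

i.e. the suggested recovery is to run "gnt-instance activate-disks" on the instance and then retry the replace-disks operation.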
@@ -2111,9 +2173,12 @@ class TLReplaceDisks(Tasklet):
 
       # we pass force_create=True to force the LVM creation
       for new_lv in new_lvs:
-        _CreateBlockDevInner(self.lu, node_name, self.instance, new_lv, True,
-                             GetInstanceInfoText(self.instance), False,
-                             excl_stor)
+        try:
+          _CreateBlockDevInner(self.lu, node_name, self.instance, new_lv, True,
+                               GetInstanceInfoText(self.instance), False,
+                               excl_stor)
+        except errors.DeviceCreationError, e:
+          raise errors.OpExecError("Can't create block device: %s" % e.message)
 
     return iv_names
 
@@ -2328,9 +2393,12 @@ class TLReplaceDisks(Tasklet):
                       (self.new_node, idx))
       # we pass force_create=True to force LVM creation
       for new_lv in dev.children:
-        _CreateBlockDevInner(self.lu, self.new_node, self.instance, new_lv,
-                             True, GetInstanceInfoText(self.instance), False,
-                             excl_stor)
+        try:
+          _CreateBlockDevInner(self.lu, self.new_node, self.instance, new_lv,
+                               True, GetInstanceInfoText(self.instance), False,
+                               excl_stor)
+        except errors.DeviceCreationError, e:
+          raise errors.OpExecError("Can't create block device: %s" % e.message)
 
     # Step 4: drbd minors and drbd setup changes
     # after this, we must manually remove the drbd minors on both the