Merge branch 'stable-2.16' into stable-2.17 stable-2.17
author: Brian Foley <bpfoley@google.com>
Fri, 16 Dec 2016 17:23:46 +0000 (17:23 +0000)
committer: Brian Foley <bpfoley@google.com>
Fri, 16 Dec 2016 17:40:55 +0000 (17:40 +0000)
* stable-2.15
  Fix gnt-instance console instance unpausing for xl toolstack
  Disable pylint too-many-nested-blocks in _RunCmdPipe
  Reduce nesting in import-export ProcessChildIO
  Reduce nesting in LUOobCommand.Exec
  Reduce nesting in LUInstanceCreate.RunOsScripts
  Reduce nesting in RemoveNodeSshKeyBulk key calculation
  Reduce nesting in RemoveNodeSshKeyBulk ssh logic
  Reduce nesting in gnt-cluster VerifyDisks missing disk loop
  Reduce nesting in _CheckVLANArguments
  Reduce nesting in StartDaemon
  Disable pylint bad-continuation warning
  Disable pylint superfluous-parens warning
  Disable pylint redefined-variable-type warning
  Disable pylint too-many-branches warnings
  Disable pylint broad-except warnings
  Disable incorrect pylint assigning-non-slot warning
  Quell pylint unbalanced-tuple-unpacking warning
  Cleanup: Use new-style classes everywhere
  Quell pylint socket.timeout warning
  Quell the pylint wrong-import-order warnings
  Quell cell-var-from-loop warning
  Use default value lambda param to avoid cell-var-from-loop
  Quell too-many-boolean-expressions
  Remove pylint tests removed in pylint 2.0
  Quell trailing newline
  Quell bad-whitespace warning
  Quell consider-using-enumerate warning
  Disable pylint unsubscriptable-object warning
  Disable pylint bare-except warning
  Disable unwanted pylint wrong-import-position warnings
  Disable pylint unused-wildcard-import warning
  Disable incorrect pylint not-callable warning
  Disable pylint unpacking-non-sequence warning
  Disable pylint misplaced-comparison-constant warning
  Disable incorrect pylint simplify-if-statement warning
  Disable pylint eval-used warning
  Disable pylint invalid-name warning
  Disable pylint import-self warning
  Disable some pylint unused-import warnings
  Replace deprecated pylint >=0.27 pragma with new form
  Delete old warning disables removed from pylint 1.6
  Fix pylint >1.4 pycurl no-member warnings
  Cleanup: Remove unused/duplicate module/fn import
  Cleanup: Fix unidiomatic-typecheck
  Cleanup: Remove some unneeded pylint disables
  Cleanup: Iterate dict rather than key list
  Cleanup: Remove unused format key
  Cleanup: StartInstance and RebootInstance return None
  Cleanup: Fix for/else with no break in AddAuthorizedKeys
  Cleanup: Replace map/filters with list comprehensions
  Cleanup: del is a statement not a function
  Cleanup: Use FOO not in BAR instead of not FOO in BAR
  Cleanup: Simplify boolean assignment
  Cleanup: Remove some unnecessary if (...) parens
  Fix invalid variable error for file-based disks
  FIX: Refactor DiagnoseOS to use a loop, not an inner fn
  FIX: Set INSTANCE_NICn_NETWORK_NAME only if net is defined
  StartInstance restores instance state if running
  Allow migrate --cleanup to adopt an instance
  Make migrate --cleanup more robust
  Make finalize_migration_{src,dst} a single op
  Make FinalizeMigration{Src,Dst} more robust
  Fix instance state detection in _Shutdowninstance
  Code cleanup in hypervisor backend
  Fix for incorrect parsing of DRBD versions
  Fix for instance reinstall not updating config
  Change a few errors to report names, not UUIDs
  Give atomicWriteFile temp filenames a more distinct pattern
  LV check failure should print instance name
  Add ganeti-noded and ganeti-rapi --max-clients options
  Disable logging CallRPCMethod timings in non-debug configs
  568 Update hv_kvm to handle output from qemu >= 1.6.0
  Improve cluster verify ssh key errors
  Fix inconsistent spaces vs tabs indent in makefile

* stable-2.13
  Bugfix: migrate needs HypervisorClass, not an instance

Fix easy merge conflict in lib/backend.py -- dead code removed in 2.15

Signed-off-by: Brian Foley <bpfoley@google.com>
Reviewed-by: Federico Pareschi <morg@google.com>

26 files changed:
1  2 
Makefile.am
lib/backend.py
lib/bootstrap.py
lib/cli.py
lib/cli_opts.py
lib/client/gnt_cluster.py
lib/cmdlib/cluster/__init__.py
lib/cmdlib/cluster/verify.py
lib/cmdlib/common.py
lib/cmdlib/misc.py
lib/config/__init__.py
lib/masterd/iallocator.py
lib/objects.py
lib/rpc_defs.py
lib/server/noded.py
lib/ssh.py
lib/tools/common.py
lib/tools/node_cleanup.py
lib/utils/process.py
lib/utils/retry.py
lib/watcher/__init__.py
src/Ganeti/THH/PyRPC.hs
src/Ganeti/Utils/Atomic.hs
test/py/testutils/config_mock.py
tools/cluster-merge
tools/move-instance

diff --combined Makefile.am
@@@ -151,7 -151,6 +151,7 @@@ HS_DIRS = 
        src/Ganeti/JQueue \
        src/Ganeti/Locking \
        src/Ganeti/Logging \
 +      src/Ganeti/MaintD \
        src/Ganeti/Monitoring \
        src/Ganeti/Metad \
        src/Ganeti/Objects \
@@@ -235,17 -234,17 +235,17 @@@ DIRS = 
        test/data/ovfdata \
        test/data/ovfdata/other \
        test/data/cgroup_root \
-         test/data/cgroup_root/memory \
-         test/data/cgroup_root/memory/lxc \
-         test/data/cgroup_root/memory/lxc/instance1 \
-         test/data/cgroup_root/cpuset \
-         test/data/cgroup_root/cpuset/some_group \
-         test/data/cgroup_root/cpuset/some_group/lxc \
-         test/data/cgroup_root/cpuset/some_group/lxc/instance1 \
-         test/data/cgroup_root/devices \
-         test/data/cgroup_root/devices/some_group \
-         test/data/cgroup_root/devices/some_group/lxc \
-         test/data/cgroup_root/devices/some_group/lxc/instance1 \
+       test/data/cgroup_root/memory \
+       test/data/cgroup_root/memory/lxc \
+       test/data/cgroup_root/memory/lxc/instance1 \
+       test/data/cgroup_root/cpuset \
+       test/data/cgroup_root/cpuset/some_group \
+       test/data/cgroup_root/cpuset/some_group/lxc \
+       test/data/cgroup_root/cpuset/some_group/lxc/instance1 \
+       test/data/cgroup_root/devices \
+       test/data/cgroup_root/devices/some_group \
+       test/data/cgroup_root/devices/some_group/lxc \
+       test/data/cgroup_root/devices/some_group/lxc/instance1 \
        test/py \
        test/py/testutils \
        test/py/cmdlib \
@@@ -303,8 -302,6 +303,8 @@@ CLEANFILES = 
        $(addsuffix /*.py[co],$(DIRS)) \
        $(addsuffix /*.hi,$(HS_DIRS)) \
        $(addsuffix /*.o,$(HS_DIRS)) \
 +      $(addsuffix /*.dyn_hi,$(HS_DIRS)) \
 +      $(addsuffix /*.dyn_o,$(HS_DIRS)) \
        $(addsuffix /*.$(HTEST_SUFFIX)_hi,$(HS_DIRS)) \
        $(addsuffix /*.$(HTEST_SUFFIX)_o,$(HS_DIRS)) \
        $(HASKELL_PACKAGE_VERSIONS_FILE) \
        src/ganeti-confd \
        src/ganeti-wconfd \
        src/ganeti-luxid \
 +      src/ganeti-maintd \
        src/ganeti-metad \
        src/ganeti-mond \
        .hpc/*.mix src/*.tix test/hs/*.tix *.tix \
@@@ -375,7 -371,7 +375,7 @@@ clean-local
  HS_GENERATED_FILES = $(HS_PROGS) src/hluxid src/ganeti-luxid \
        src/hconfd src/ganeti-confd
  if ENABLE_MOND
 -HS_GENERATED_FILES += src/ganeti-mond
 +HS_GENERATED_FILES += src/ganeti-mond src/ganeti-maintd
  endif
  if ENABLE_METADATA
  HS_GENERATED_FILES += src/ganeti-metad
@@@ -414,7 -410,6 +414,7 @@@ BUILT_EXAMPLES = 
        doc/examples/systemd/ganeti-kvmd.service \
        doc/examples/systemd/ganeti-luxid.service \
        doc/examples/systemd/ganeti-metad.service \
 +      doc/examples/systemd/ganeti-maintd.service \
        doc/examples/systemd/ganeti-mond.service \
        doc/examples/systemd/ganeti-noded.service \
        doc/examples/systemd/ganeti-rapi.service \
@@@ -540,9 -535,9 +540,9 @@@ hypervisor_PYTHON = 
        lib/hypervisor/hv_xen.py
  
  hypervisor_hv_kvm_PYTHON = \
-   lib/hypervisor/hv_kvm/__init__.py \
-   lib/hypervisor/hv_kvm/monitor.py \
-   lib/hypervisor/hv_kvm/netdev.py
+       lib/hypervisor/hv_kvm/__init__.py \
+       lib/hypervisor/hv_kvm/monitor.py \
+       lib/hypervisor/hv_kvm/netdev.py
  
  jqueue_PYTHON = \
        lib/jqueue/__init__.py \
@@@ -665,7 -660,6 +665,7 @@@ docinput = 
        doc/design-2.14.rst \
        doc/design-2.15.rst \
        doc/design-2.16.rst \
 +      doc/design-2.17.rst \
        doc/design-allocation-efficiency.rst \
        doc/design-autorepair.rst \
        doc/design-bulk-create.rst \
        doc/design-location.rst \
        doc/design-linuxha.rst \
        doc/design-lu-generated-jobs.rst \
 +      doc/design-macvtap.rst \
 +      doc/design-memory-over-commitment.rst \
 +      doc/design-migration-speed-hbal.rst \
        doc/design-monitoring-agent.rst \
        doc/design-move-instance-improvements.rst \
        doc/design-multi-reloc.rst \
        doc/design-multi-storage-htools.rst \
        doc/design-multi-version-tests.rst \
 +      doc/design-n-m-redundancy.rst \
        doc/design-network.rst \
        doc/design-network2.rst \
        doc/design-node-add.rst \
@@@ -779,7 -769,7 +779,7 @@@ HS_COMPILE_PROGS = 
        src/hs2py \
        src/rpc-test
  if ENABLE_MOND
 -HS_COMPILE_PROGS += src/ganeti-mond
 +HS_COMPILE_PROGS += src/ganeti-mond src/ganeti-maintd
  endif
  if ENABLE_METADATA
  HS_COMPILE_PROGS += src/ganeti-metad
@@@ -895,7 -885,6 +895,7 @@@ HPCEXCL = --exclude Main 
        $(patsubst src.%,--exclude Test.%,$(subst /,.,$(patsubst %.hs,%, $(HS_LIB_SRCS))))
  
  HS_LIB_SRCS = \
 +      src/Ganeti/Prelude.hs \
        src/Ganeti/BasicTypes.hs \
        src/Ganeti/Codec.hs \
        src/Ganeti/Common.hs \
        src/Ganeti/DataCollectors.hs \
        src/Ganeti/DataCollectors/CLI.hs \
        src/Ganeti/DataCollectors/CPUload.hs \
 +      src/Ganeti/DataCollectors/Diagnose.hs \
        src/Ganeti/DataCollectors/Diskstats.hs \
        src/Ganeti/DataCollectors/Drbd.hs \
        src/Ganeti/DataCollectors/InstStatus.hs \
        src/Ganeti/DataCollectors/InstStatusTypes.hs \
 +      src/Ganeti/DataCollectors/KvmRSS.hs \
        src/Ganeti/DataCollectors/Lv.hs \
        src/Ganeti/DataCollectors/Program.hs \
        src/Ganeti/DataCollectors/Types.hs \
        src/Ganeti/HTools/Cluster/AllocationSolution.hs \
        src/Ganeti/HTools/Cluster/Evacuate.hs \
        src/Ganeti/HTools/Cluster/Metrics.hs \
 +      src/Ganeti/HTools/Cluster/MetricsComponents.hs \
 +      src/Ganeti/HTools/Cluster/MetricsTH.hs \
        src/Ganeti/HTools/Cluster/Moves.hs \
        src/Ganeti/HTools/Cluster/Utils.hs \
        src/Ganeti/HTools/Container.hs \
        src/Ganeti/HTools/Program/Hsqueeze.hs \
        src/Ganeti/HTools/Program/Hroller.hs \
        src/Ganeti/HTools/Program/Main.hs \
 +      src/Ganeti/HTools/RedundancyLevel.hs \
 +      src/Ganeti/HTools/Repair.hs \
        src/Ganeti/HTools/Tags.hs \
        src/Ganeti/HTools/Tags/Constants.hs \
        src/Ganeti/HTools/Types.hs \
        src/Ganeti/Logging/Lifted.hs \
        src/Ganeti/Logging/WriterLog.hs \
        src/Ganeti/Luxi.hs \
 +      src/Ganeti/MaintD/Autorepairs.hs \
 +      src/Ganeti/MaintD/Balance.hs \
 +      src/Ganeti/MaintD/CleanupIncidents.hs \
 +      src/Ganeti/MaintD/CollectIncidents.hs \
 +      src/Ganeti/MaintD/FailIncident.hs \
 +      src/Ganeti/MaintD/HandleIncidents.hs \
 +        src/Ganeti/MaintD/MemoryState.hs \
 +      src/Ganeti/MaintD/Server.hs \
 +      src/Ganeti/MaintD/Utils.hs \
        src/Ganeti/Network.hs \
        src/Ganeti/Objects.hs \
        src/Ganeti/Objects/BitArray.hs \
        src/Ganeti/Objects/Disk.hs \
        src/Ganeti/Objects/Instance.hs \
 +      src/Ganeti/Objects/HvState.hs \
        src/Ganeti/Objects/Lens.hs \
 +      src/Ganeti/Objects/Maintenance.hs \
        src/Ganeti/Objects/Nic.hs \
        src/Ganeti/OpCodes.hs \
        src/Ganeti/OpCodes/Lens.hs \
        src/Ganeti/Utils.hs \
        src/Ganeti/Utils/Atomic.hs \
        src/Ganeti/Utils/AsyncWorker.hs \
 +      src/Ganeti/Utils/Http.hs \
        src/Ganeti/Utils/IORef.hs \
        src/Ganeti/Utils/Livelock.hs \
        src/Ganeti/Utils/Monad.hs \
@@@ -1233,11 -1204,11 +1233,11 @@@ endi
        PYTHONPATH=. ENABLE_MANPAGES=$(ENABLE_MANPAGES) COPY_DOC=1 \
          HTML_THEME=$(SPHINX_HTML_THEME) \
        $(RUN_IN_TEMPDIR) autotools/sphinx-wrapper $(SPHINX) -q -W -b html \
-           -d . \
-           -D version="$(VERSION_MAJOR).$(VERSION_MINOR)" \
-           -D release="$(PACKAGE_VERSION)" \
-           -D graphviz_dot="$(DOT)" \
-           doc $(CURDIR)/$$dir && \
+               -d . \
+               -D version="$(VERSION_MAJOR).$(VERSION_MINOR)" \
+               -D release="$(PACKAGE_VERSION)" \
+               -D graphviz_dot="$(DOT)" \
+       doc $(CURDIR)/$$dir && \
        rm -f $$dir/.buildinfo $$dir/objects.inv
        touch $@
  
@@@ -1515,10 -1486,10 +1515,10 @@@ dist_sbin_SCRIPTS = 
  
  nodist_sbin_SCRIPTS = \
        daemons/ganeti-cleaner \
-   src/ganeti-kvmd \
-   src/ganeti-luxid \
-   src/ganeti-confd \
-   src/ganeti-wconfd
+       src/ganeti-kvmd \
+       src/ganeti-luxid \
+       src/ganeti-confd \
+       src/ganeti-wconfd
  
  src/ganeti-luxid: src/hluxid
        cp -f $< $@
@@@ -1532,7 -1503,7 +1532,7 @@@ src/ganeti-confd: src/hconf
        cp -f $< $@
  
  if ENABLE_MOND
 -nodist_sbin_SCRIPTS += src/ganeti-mond
 +nodist_sbin_SCRIPTS += src/ganeti-mond src/ganeti-maintd
  endif
  
  if ENABLE_METADATA
@@@ -1572,7 -1543,7 +1572,7 @@@ dist_tools_python_SCRIPTS = 
  
  nodist_tools_python_SCRIPTS = \
        tools/node-cleanup \
-         $(python_scripts_shebang)
+       $(python_scripts_shebang)
  
  tools_python_basenames = \
        $(patsubst shebang/%,%,\
@@@ -1643,7 -1614,6 +1643,7 @@@ EXTRA_DIST += 
        daemons/ganeti-cleaner.in \
        $(pkglib_python_scripts) \
        devel/build_chroot \
 +      devel/cert_digest.py \
        devel/upload \
        devel/webserver \
        tools/kvm-ifup.in \
        tools/vif-ganeti-metad.in \
        tools/net-common.in \
        tools/vcluster-setup.in \
-         $(python_scripts) \
+       $(python_scripts) \
        $(docinput) \
        doc/html \
        $(BUILT_EXAMPLES:%=%.in) \
@@@ -1744,9 -1714,6 +1744,9 @@@ TEST_FILES = 
        test/autotools/autotools-check-news.test \
        test/data/htools/clean-nonzero-score.data \
        test/data/htools/common-suffix.data \
 +      test/data/htools/dyn1.json \
 +      test/data/htools/dyn2.json \
 +      test/data/htools/dyn3.json \
        test/data/htools/empty-cluster.data \
        test/data/htools/hail-alloc-dedicated-1.json \
        test/data/htools/hail-alloc-desired-location.json \
        test/data/htools/hail-alloc-secondary.json \
        test/data/htools/hail-alloc-spindles.json \
        test/data/htools/hail-alloc-twodisks.json \
 +      test/data/htools/hail-alloc-memory-over-commitment.json \
        test/data/htools/hail-change-group.json \
        test/data/htools/hail-invalid-reloc.json \
        test/data/htools/hail-node-evac.json \
        test/data/htools/hail-reloc-drbd.json \
        test/data/htools/hail-reloc-drbd-crowded.json \
 +      test/data/htools/hbal-avoid-disk-moves.data \
        test/data/htools/hbal-cpu-speed.data \
        test/data/htools/hbal-desiredlocation-1.data \
        test/data/htools/hbal-desiredlocation-2.data \
        test/data/htools/hbal-desiredlocation-3.data \
        test/data/htools/hbal-desiredlocation-4.data \
        test/data/htools/hbal-dyn.data \
 +      test/data/htools/hbal-dyn2.data \
        test/data/htools/hbal-evac.data \
        test/data/htools/hbal-excl-tags.data \
        test/data/htools/hbal-forth.data \
        test/data/htools/hbal-location-1.data \
        test/data/htools/hbal-location-exclusion.data \
        test/data/htools/hbal-location-2.data \
 +      test/data/htools/hbal-memory-over-commitment.data \
 +      test/data/htools/hbal-memory-over-commitment-2.data \
        test/data/htools/hbal-migration-1.data \
        test/data/htools/hbal-migration-2.data \
        test/data/htools/hbal-migration-3.data \
        test/data/bdev-rbd/output_invalid.txt \
        test/data/cert1.pem \
        test/data/cert2.pem \
-         test/data/cgroup_root/memory/lxc/instance1/memory.limit_in_bytes \
-         test/data/cgroup_root/cpuset/some_group/lxc/instance1/cpuset.cpus \
-         test/data/cgroup_root/devices/some_group/lxc/instance1/devices.list \
+       test/data/cgroup_root/memory/lxc/instance1/memory.limit_in_bytes \
+       test/data/cgroup_root/cpuset/some_group/lxc/instance1/cpuset.cpus \
+       test/data/cgroup_root/devices/some_group/lxc/instance1/devices.list \
        test/data/cluster_config_2.7.json \
        test/data/cluster_config_2.8.json \
        test/data/cluster_config_2.9.json \
        test/data/cluster_config_2.13.json \
        test/data/cluster_config_2.14.json \
        test/data/cluster_config_2.15.json \
 +      test/data/cluster_config_2.16.json \
 +      test/data/cluster_config_2.17.json \
        test/data/instance-minor-pairing.txt \
        test/data/instance-disks.txt \
        test/data/ip-addr-show-dummy0.txt \
        test/data/ovfdata/wrong_ova.ova \
        test/data/ovfdata/wrong_xml.ovf \
        test/data/proc_cgroup.txt \
-         test/data/proc_diskstats.txt \
+       test/data/proc_diskstats.txt \
        test/data/proc_drbd8.txt \
        test/data/proc_drbd80-emptyline.txt \
        test/data/proc_drbd80-emptyversion.txt \
@@@ -2757,13 -2717,27 +2757,27 @@@ PEP8_EXCLUDE = $(subst $(space),$(comma
  
  # A space-separated list of pylint warnings to completely ignore:
  # I0013 = disable warnings for ignoring whole files
- LINT_DISABLE = I0013
+ # R0912 = disable too many branches warning. It's useful, but ganeti requires
+ #         a lot of refactoring to fix this.
+ # R0204 = disable redefined-variable-type warning. There are a large number of
+ #         cases where Ganeti assigns multiple types (eg set/list, float/int) to
+ #         the same variable, and these are benign.
+ # C0325 = disable superfluous-parens. There are a lot of cases where this is
+ #         overzealous, eg where we use parens to make it clear that we're
+ #         deliberately doing a comparison that should yield bool, or are using
+ #         parens clarify precedence or to allow multi-line expressions.
+ # C0330 = disable wrong indentation warnings. pylint is much more strict than
+ #         pep8, and it would be too invasive to fix all these.
+ LINT_DISABLE = I0013 R0912 R0204 C0325 C0330
  # Additional pylint options
  LINT_OPTS =
  # The combined set of pylint options
  LINT_OPTS_ALL = $(LINT_OPTS) \
    $(addprefix --disable=,$(LINT_DISABLE))
  
+ # Whitelist loading pycurl C extension for attribute checking
+ LINT_OPTS_ALL += --extension-pkg-whitelist=pycurl
  LINT_TARGETS = pylint pylint-qa pylint-test
  if HAS_PEP8
  LINT_TARGETS += pep8
@@@ -2964,7 -2938,7 +2978,7 @@@ TAGS: $(GENERATED_FILES
          $(filter-out -O -Werror,$(HFLAGS)) \
                -osuf tags.o \
                -hisuf tags.hi \
-     -lcurl \
+       -lcurl \
          $(HS_LIBTEST_SRCS)
        find . -path './lib/*.py' -o -path './scripts/gnt-*' -o \
          -path './daemons/ganeti-*' -o -path './tools/*' -o \
diff --combined lib/backend.py
  
  
  import base64
+ import contextlib
+ import collections
  import errno
  import logging
  import os
  import os.path
- import pycurl
  import random
  import re
  import shutil
@@@ -60,8 -61,8 +61,8 @@@ import sta
  import tempfile
  import time
  import zlib
- import contextlib
- import collections
+ import pycurl
  
  from ganeti import errors
  from ganeti import http
@@@ -1018,16 -1019,17 +1019,17 @@@ def _VerifySshSetup(node_status_list, m
      missing_uuids = set([])
      if pub_uuids_set != pot_mc_uuids_set:
        unknown_uuids = pub_uuids_set - pot_mc_uuids_set
+       pub_key_path = "%s:%s" % (my_name, ganeti_pub_keys_file)
        if unknown_uuids:
-         result.append("The following node UUIDs are listed in the public key"
-                       " file on node '%s', but are not potential master"
-                       " candidates: %s."
-                       % (my_name, ", ".join(list(unknown_uuids))))
+         result.append("The following node UUIDs are listed in the shared public"
+                       " keys file %s, but are not potential master"
+                       " candidates: %s." %
+                       (pub_key_path, ", ".join(list(unknown_uuids))))
        missing_uuids = pot_mc_uuids_set - pub_uuids_set
        if missing_uuids:
          result.append("The following node UUIDs of potential master candidates"
-                       " are missing in the public key file on node %s: %s."
-                       % (my_name, ", ".join(list(missing_uuids))))
+                       " are missing in the shared public keys file %s: %s." %
+                       (pub_key_path, ", ".join(list(missing_uuids))))
  
      (_, key_files) = \
        ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
      my_keys = pub_keys[my_uuid]
  
      node_pub_key = utils.ReadFile(node_pub_key_file)
+     node_pub_key_path = "%s:%s" % (my_name, node_pub_key_file)
      if node_pub_key.strip() not in my_keys:
-       result.append("The dsa key of node %s does not match this node's key"
-                     " in the pub key file." % my_name)
+       result.append("The key for node %s in the cluster config does not match"
+                     " this node's key in the node public key file %s." %
+                     (my_name, node_pub_key_path))
      if len(my_keys) != 1:
-       result.append("There is more than one key for node %s in the public key"
-                     " file." % my_name)
+       result.append("There is more than one key for node %s in the node public"
+                     " key file %s." % (my_name, node_pub_key_path))
    else:
      if len(pub_keys.keys()) > 0:
-       result.append("The public key file of node '%s' is not empty, although"
-                     " the node is not a potential master candidate."
-                     % my_name)
+       result.append("The public key file %s is not empty, although"
+                     " the node is not a potential master candidate." %
+                     node_pub_key_path)
  
    # Check that all master candidate keys are in the authorized_keys file
    (auth_key_file, _) = \
@@@ -1463,9 -1467,7 +1467,9 @@@ def AddNodeSshKey(node_uuid, node_name
                    pub_key_file=pathutils.SSH_PUB_KEYS,
                    ssconf_store=None,
                    noded_cert_file=pathutils.NODED_CERT_FILE,
 -                  run_cmd_fn=ssh.RunSshCmdWithStdin):
 +                  run_cmd_fn=ssh.RunSshCmdWithStdin,
 +                  ssh_update_debug=False,
 +                  ssh_update_verbose=False):
    """Distributes a node's public SSH key across the cluster.
  
    Note that this function should only be executed on the master node, which
                             pub_key_file=pub_key_file,
                             ssconf_store=ssconf_store,
                             noded_cert_file=noded_cert_file,
 -                           run_cmd_fn=run_cmd_fn)
 +                           run_cmd_fn=run_cmd_fn,
 +                           ssh_update_debug=ssh_update_debug,
 +                           ssh_update_verbose=ssh_update_verbose)
  
  
  # Node info named tuple specifically for the use with AddNodeSshKeyBulk
@@@ -1521,9 -1521,7 +1525,9 @@@ def AddNodeSshKeyBulk(node_list
                        pub_key_file=pathutils.SSH_PUB_KEYS,
                        ssconf_store=None,
                        noded_cert_file=pathutils.NODED_CERT_FILE,
 -                      run_cmd_fn=ssh.RunSshCmdWithStdin):
 +                      run_cmd_fn=ssh.RunSshCmdWithStdin,
 +                      ssh_update_debug=False,
 +                      ssh_update_verbose=False):
    """Distributes a node's public SSH key across the cluster.
  
    Note that this function should only be executed on the master node, which
          (constants.SSHS_OVERRIDE, all_keys)
  
        try:
 +        backoff = 5  # seconds
          utils.RetryByNumberOfTimes(
 -            constants.SSHS_MAX_RETRIES,
 +            constants.SSHS_MAX_RETRIES, backoff,
              errors.SshUpdateError,
              run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
              ssh_port_map.get(node_info.name), node_data,
 -            debug=False, verbose=False, use_cluster_key=False,
 -            ask_key=False, strict_host_check=False)
 +            debug=ssh_update_debug, verbose=ssh_update_verbose,
 +            use_cluster_key=False, ask_key=False, strict_host_check=False)
        except errors.SshUpdateError as e:
          # Clean up the master's public key file if adding key fails
          if node_info.to_public_keys:
      if node in potential_master_candidates:
        logging.debug("Updating SSH key files of node '%s'.", node)
        try:
 +        backoff = 5  # seconds
          utils.RetryByNumberOfTimes(
 -            constants.SSHS_MAX_RETRIES,
 -            errors.SshUpdateError,
 +            constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
              run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
              ssh_port_map.get(node), pot_mc_data,
 -            debug=False, verbose=False, use_cluster_key=False,
 -            ask_key=False, strict_host_check=False)
 +            debug=ssh_update_debug, verbose=ssh_update_verbose,
 +            use_cluster_key=False, ask_key=False, strict_host_check=False)
        except errors.SshUpdateError as last_exception:
          error_msg = ("When adding the key of node '%s', updating SSH key"
                       " files of node '%s' failed after %s retries."
        if to_authorized_keys:
          run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
                     ssh_port_map.get(node), base_data,
 -                   debug=False, verbose=False, use_cluster_key=False,
 -                   ask_key=False, strict_host_check=False)
 +                   debug=ssh_update_debug, verbose=ssh_update_verbose,
 +                   use_cluster_key=False, ask_key=False,
 +                   strict_host_check=False)
  
    return node_errors
  
  
 +# TODO: will be fixed with pending patch series.
 +# pylint: disable=R0913
  def RemoveNodeSshKey(node_uuid, node_name,
                       master_candidate_uuids,
                       potential_master_candidates,
                       ssconf_store=None,
                       noded_cert_file=pathutils.NODED_CERT_FILE,
                       readd=False,
 -                     run_cmd_fn=ssh.RunSshCmdWithStdin):
 +                     run_cmd_fn=ssh.RunSshCmdWithStdin,
 +                     ssh_update_debug=False,
 +                     ssh_update_verbose=False):
    """Removes the node's SSH keys from the key files and distributes those.
  
    Note that at least one of the flags C{from_authorized_keys},
                                ssconf_store=ssconf_store,
                                noded_cert_file=noded_cert_file,
                                readd=readd,
 -                              run_cmd_fn=run_cmd_fn)
 +                              run_cmd_fn=run_cmd_fn,
 +                              ssh_update_debug=ssh_update_debug,
 +                              ssh_update_verbose=ssh_update_verbose)
  
  
  # Node info named tuple specifically for the use with RemoveNodeSshKeyBulk
@@@ -1773,9 -1763,7 +1777,9 @@@ def RemoveNodeSshKeyBulk(node_list
                           ssconf_store=None,
                           noded_cert_file=pathutils.NODED_CERT_FILE,
                           readd=False,
 -                         run_cmd_fn=ssh.RunSshCmdWithStdin):
 +                         run_cmd_fn=ssh.RunSshCmdWithStdin,
 +                         ssh_update_debug=False,
 +                         ssh_update_verbose=False):
    """Removes the node's SSH keys from the key files and distributes those.
  
    Note that at least one of the flags C{from_authorized_keys},
          if master_uuid:
            master_keys = ssh.QueryPubKeyFile([master_uuid],
                                              key_file=pub_key_file)
-           for master_key in master_keys:
-             if master_key in keys[node_info.uuid]:
-               keys[node_info.uuid].remove(master_key)
+           # Remove any master keys from the list of keys to remove from the node
+           keys[node_info.uuid] = list(
+               set(keys[node_info.uuid]) - set(master_keys))
  
        all_keys_to_remove.update(keys)
  
          error_msg_final = ("When removing the key of node '%s', updating the"
                             " SSH key files of node '%s' failed. Last error"
                             " was: %s.")
-         if node in potential_master_candidates:
-           logging.debug("Updating key setup of potential master candidate node"
-                         " %s.", node)
+         if node in potential_master_candidates or from_authorized_keys:
+           if node in potential_master_candidates:
+             node_desc = "potential master candidate"
+           else:
+             node_desc = "normal"
+           logging.debug("Updating key setup of %s node %s.", node_desc, node)
            try:
 +            backoff = 5  # seconds
              utils.RetryByNumberOfTimes(
 -                constants.SSHS_MAX_RETRIES,
 -                errors.SshUpdateError,
 +                constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
                  run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
                  ssh_port, pot_mc_data,
 -                debug=False, verbose=False, use_cluster_key=False,
 -                ask_key=False, strict_host_check=False)
 +                debug=ssh_update_debug, verbose=ssh_update_verbose,
 +                use_cluster_key=False, ask_key=False, strict_host_check=False)
            except errors.SshUpdateError as last_exception:
              error_msg = error_msg_final % (
                  node_info.name, node, last_exception)
              result_msgs.append((node, error_msg))
              logging.error(error_msg)
  
-         else:
-           if from_authorized_keys:
-             logging.debug("Updating key setup of normal node %s.", node)
-             try:
-               backoff = 5  # seconds
-               utils.RetryByNumberOfTimes(
-                   constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError,
-                   run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
-                   ssh_port, base_data,
-                   debug=ssh_update_debug, verbose=ssh_update_verbose,
-                   use_cluster_key=False, ask_key=False, strict_host_check=False)
-             except errors.SshUpdateError as last_exception:
-               error_msg = error_msg_final % (
-                   node_info.name, node, last_exception)
-               result_msgs.append((node, error_msg))
-               logging.error(error_msg)
    for node_info in node_list:
      if node_info.clear_authorized_keys or node_info.from_public_keys or \
          node_info.clear_public_keys:
        logging.debug("Updating SSH key setup of target node '%s'.",
                      node_info.name)
        try:
 +        backoff = 5  # seconds
          utils.RetryByNumberOfTimes(
 -            constants.SSHS_MAX_RETRIES,
 +            constants.SSHS_MAX_RETRIES, backoff,
              errors.SshUpdateError,
              run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE,
              ssh_port, data,
 -            debug=False, verbose=False, use_cluster_key=False,
 -            ask_key=False, strict_host_check=False)
 +            debug=ssh_update_debug, verbose=ssh_update_verbose,
 +            use_cluster_key=False, ask_key=False, strict_host_check=False)
        except errors.SshUpdateError as last_exception:
          result_msgs.append(
              (node_info.name,
        ssh.RemovePublicKey(node_uuid, key_file=pub_key_file)
  
    return result_msgs
 +# pylint: enable=R0913
 +
 +
 +def RemoveSshKeyFromPublicKeyFile(node_name,
 +                                  pub_key_file=pathutils.SSH_PUB_KEYS,
 +                                  ssconf_store=None):
 +  """Removes a SSH key from the master's public key file.
 +
 +  This is an operation that is only used to clean up after failed operations
 +  (for example failed hooks before adding a node). To avoid abuse of this
 +  function (and the matching RPC call), we add a safety check to make sure
 +  that only stray keys can be removed that belong to nodes that are not
 +  in the cluster (anymore).
 +
 +  @type node_name: string
 +  @param node_name: the name of the node whose key is removed
 +
 +  """
 +  if not ssconf_store:
 +    ssconf_store = ssconf.SimpleStore()
 +
 +  node_list = ssconf_store.GetNodeList()
 +
 +  if node_name in node_list:
 +    raise errors.SshUpdateError("Cannot remove key of node '%s',"
 +                                " because it still belongs to the cluster."
 +                                % node_name)
 +
 +  keys_by_name = ssh.QueryPubKeyFile([node_name], key_file=pub_key_file)
 +  if not keys_by_name or node_name not in keys_by_name:
 +    logging.info("The node '%s' whose key is supposed to be removed does not"
 +                 " have an entry in the public key file. Hence, there is"
 +                 " nothing left to do.", node_name)
  
 +  ssh.RemovePublicKey(node_name, key_file=pub_key_file)
  
 -def _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, ssh_key_type,
 -                        ssh_key_bits, pub_key_file=pathutils.SSH_PUB_KEYS,
 +
 +def _GenerateNodeSshKey(node_name, ssh_port_map, ssh_key_type, ssh_key_bits,
                          ssconf_store=None,
                          noded_cert_file=pathutils.NODED_CERT_FILE,
                          run_cmd_fn=ssh.RunSshCmdWithStdin,
 -                        suffix=""):
 +                        suffix="",
 +                        ssh_update_debug=False,
 +                        ssh_update_verbose=False):
    """Generates the root SSH key pair on the node.
  
 -  @type node_uuid: str
 -  @param node_uuid: UUID of the node whose key is removed
    @type node_name: str
    @param node_name: name of the node whose key is remove
    @type ssh_port_map: dict of str to int
    if not ssconf_store:
      ssconf_store = ssconf.SimpleStore()
  
 -  keys_by_uuid = ssh.QueryPubKeyFile([node_uuid], key_file=pub_key_file)
 -  if not keys_by_uuid or node_uuid not in keys_by_uuid:
 -    raise errors.SshUpdateError("Node %s (UUID: %s) whose key is requested to"
 -                                " be regenerated is not registered in the"
 -                                " public keys file." % (node_name, node_uuid))
 -
    data = {}
    _InitSshUpdateData(data, noded_cert_file, ssconf_store)
    cluster_name = data[constants.SSHS_CLUSTER_NAME]
  
    run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
               ssh_port_map.get(node_name), data,
 -             debug=False, verbose=False, use_cluster_key=False,
 -             ask_key=False, strict_host_check=False)
 +             debug=ssh_update_debug, verbose=ssh_update_verbose,
 +             use_cluster_key=False, ask_key=False, strict_host_check=False)
  
  
  def _GetMasterNodeUUID(node_uuid_name_map, master_node_name):
@@@ -2100,15 -2047,58 +2092,15 @@@ def _GetOldMasterKeys(master_node_uuid
    return old_master_keys_by_uuid
  
  
 -def _GetNewMasterKey(root_keyfiles, master_node_uuid):
 -  new_master_keys = []
 -  for (_, (_, public_key_file)) in root_keyfiles.items():
 -    public_key_dir = os.path.dirname(public_key_file)
 -    public_key_file_tmp_filename = \
 -        os.path.splitext(os.path.basename(public_key_file))[0] \
 -        + constants.SSHS_MASTER_SUFFIX + ".pub"
 -    public_key_path_tmp = os.path.join(public_key_dir,
 -                                       public_key_file_tmp_filename)
 -    if os.path.exists(public_key_path_tmp):
 -      # for some key types, there might not be any keys
 -      key = utils.ReadFile(public_key_path_tmp)
 -      new_master_keys.append(key)
 -  if not new_master_keys:
 -    raise errors.SshUpdateError("Cannot find any type of temporary SSH key.")
 -  return {master_node_uuid: new_master_keys}
 -
 -
 -def _ReplaceMasterKeyOnMaster(root_keyfiles):
 -  number_of_moves = 0
 -  for (_, (private_key_file, public_key_file)) in root_keyfiles.items():
 -    key_dir = os.path.dirname(public_key_file)
 -    private_key_file_tmp = \
 -      os.path.basename(private_key_file) + constants.SSHS_MASTER_SUFFIX
 -    public_key_file_tmp = private_key_file_tmp + ".pub"
 -    private_key_path_tmp = os.path.join(key_dir,
 -                                        private_key_file_tmp)
 -    public_key_path_tmp = os.path.join(key_dir,
 -                                       public_key_file_tmp)
 -    if os.path.exists(public_key_file):
 -      utils.CreateBackup(public_key_file)
 -      utils.RemoveFile(public_key_file)
 -    if os.path.exists(private_key_file):
 -      utils.CreateBackup(private_key_file)
 -      utils.RemoveFile(private_key_file)
 -    if os.path.exists(public_key_path_tmp) and \
 -        os.path.exists(private_key_path_tmp):
 -      # for some key types, there might not be any keys
 -      shutil.move(public_key_path_tmp, public_key_file)
 -      shutil.move(private_key_path_tmp, private_key_file)
 -      number_of_moves += 1
 -  if not number_of_moves:
 -    raise errors.SshUpdateError("Could not move at least one master SSH key.")
 -
 -
  def RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
                   potential_master_candidates, old_key_type, new_key_type,
                   new_key_bits,
                   ganeti_pub_keys_file=pathutils.SSH_PUB_KEYS,
                   ssconf_store=None,
                   noded_cert_file=pathutils.NODED_CERT_FILE,
 -                 run_cmd_fn=ssh.RunSshCmdWithStdin):
 +                 run_cmd_fn=ssh.RunSshCmdWithStdin,
 +                 ssh_update_debug=False,
 +                 ssh_update_verbose=False):
    """Renews all SSH keys and updates authorized_keys and ganeti_pub_keys.
  
    @type node_uuids: list of str
      raise errors.ProgrammerError("List of nodes UUIDs and node names"
                                   " does not match in length.")
  
 -  (_, root_keyfiles) = \
 -    ssh.GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
 -  (_, old_pub_keyfile) = root_keyfiles[old_key_type]
 -  (_, new_pub_keyfile) = root_keyfiles[new_key_type]
 -  old_master_key = utils.ReadFile(old_pub_keyfile)
 +  old_pub_keyfile = ssh.GetSshPubKeyFilename(old_key_type)
 +  new_pub_keyfile = ssh.GetSshPubKeyFilename(new_key_type)
 +  old_master_key = ssh.ReadLocalSshPubKeys([old_key_type])
  
    node_uuid_name_map = zip(node_uuids, node_names)
  
      node_list.append((node_uuid, node_name, master_candidate,
                        potential_master_candidate))
  
 -    keys_by_uuid = ssh.QueryPubKeyFile([node_uuid],
 -                                       key_file=ganeti_pub_keys_file)
 -    if not keys_by_uuid:
 -      raise errors.SshUpdateError("No public key of node %s (UUID %s) found,"
 -                                  " not generating a new key."
 -                                  % (node_name, node_uuid))
 -
      if master_candidate:
        logging.debug("Fetching old SSH key from node '%s'.", node_name)
 -      old_pub_key = ssh.ReadRemoteSshPubKeys(old_pub_keyfile,
 -                                             node_name, cluster_name,
 -                                             ssh_port_map[node_name],
 -                                             False, # ask_key
 -                                             False) # key_check
 +      old_pub_key = ssh.ReadRemoteSshPubKey(old_pub_keyfile,
 +                                            node_name, cluster_name,
 +                                            ssh_port_map[node_name],
 +                                            False, # ask_key
 +                                            False) # key_check
        if old_pub_key != old_master_key:
          # If we are already in a multi-key setup (that is past Ganeti 2.12),
          # we can safely remove the old key of the node. Otherwise, we cannot
          node_info_to_remove,
          master_candidate_uuids,
          potential_master_candidates,
 -        master_uuid=master_node_uuid)
 +        master_uuid=master_node_uuid,
 +        pub_key_file=ganeti_pub_keys_file,
 +        ssconf_store=ssconf_store,
 +        noded_cert_file=noded_cert_file,
 +        run_cmd_fn=run_cmd_fn,
 +        ssh_update_debug=ssh_update_debug,
 +        ssh_update_verbose=ssh_update_verbose)
      if node_errors:
        all_node_errors = all_node_errors + node_errors
  
        in node_list:
  
      logging.debug("Generating new SSH key for node '%s'.", node_name)
 -    _GenerateNodeSshKey(node_uuid, node_name, ssh_port_map, new_key_type,
 -                        new_key_bits, pub_key_file=ganeti_pub_keys_file,
 +    _GenerateNodeSshKey(node_name, ssh_port_map, new_key_type, new_key_bits,
                          ssconf_store=ssconf_store,
                          noded_cert_file=noded_cert_file,
 -                        run_cmd_fn=run_cmd_fn)
 +                        run_cmd_fn=run_cmd_fn,
 +                        ssh_update_verbose=ssh_update_verbose,
 +                        ssh_update_debug=ssh_update_debug)
  
      try:
        logging.debug("Fetching newly created SSH key from node '%s'.", node_name)
 -      pub_key = ssh.ReadRemoteSshPubKeys(new_pub_keyfile,
 -                                         node_name, cluster_name,
 -                                         ssh_port_map[node_name],
 -                                         False, # ask_key
 -                                         False) # key_check
 +      pub_key = ssh.ReadRemoteSshPubKey(new_pub_keyfile,
 +                                        node_name, cluster_name,
 +                                        ssh_port_map[node_name],
 +                                        False, # ask_key
 +                                        False) # key_check
      except:
        raise errors.SshUpdateError("Could not fetch key of node %s"
                                    " (UUID %s)" % (node_name, node_uuid))
        node_keys_to_add, potential_master_candidates,
        pub_key_file=ganeti_pub_keys_file, ssconf_store=ssconf_store,
        noded_cert_file=noded_cert_file,
 -      run_cmd_fn=run_cmd_fn)
 +      run_cmd_fn=run_cmd_fn,
 +      ssh_update_debug=ssh_update_debug,
 +      ssh_update_verbose=ssh_update_verbose)
    if node_errors:
      all_node_errors = all_node_errors + node_errors
  
  
    # Generate a new master key with a suffix, don't touch the old one for now
    logging.debug("Generate new ssh key of master.")
 -  _GenerateNodeSshKey(master_node_uuid, master_node_name, ssh_port_map,
 +  _GenerateNodeSshKey(master_node_name, ssh_port_map,
                        new_key_type, new_key_bits,
                        ssconf_store=ssconf_store,
                        noded_cert_file=noded_cert_file,
                        run_cmd_fn=run_cmd_fn,
 -                      suffix=constants.SSHS_MASTER_SUFFIX)
 +                      suffix=constants.SSHS_MASTER_SUFFIX,
 +                      ssh_update_debug=ssh_update_debug,
 +                      ssh_update_verbose=ssh_update_verbose)
    # Read newly created master key
 -  new_master_key_dict = _GetNewMasterKey(root_keyfiles, master_node_uuid)
 +  new_master_keys = ssh.ReadLocalSshPubKeys(
 +      [new_key_type], suffix=constants.SSHS_MASTER_SUFFIX)
  
    # Replace master key in the master nodes' public key file
    ssh.RemovePublicKey(master_node_uuid, key_file=ganeti_pub_keys_file)
 -  for pub_key in new_master_key_dict[master_node_uuid]:
 +  for pub_key in new_master_keys:
      ssh.AddPublicKey(master_node_uuid, pub_key, key_file=ganeti_pub_keys_file)
  
    # Add new master key to all node's public and authorized keys
        to_authorized_keys=True, to_public_keys=True,
        get_public_keys=False, pub_key_file=ganeti_pub_keys_file,
        ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
 -      run_cmd_fn=run_cmd_fn)
 +      run_cmd_fn=run_cmd_fn,
 +      ssh_update_debug=ssh_update_debug,
 +      ssh_update_verbose=ssh_update_verbose)
    if node_errors:
      all_node_errors = all_node_errors + node_errors
  
    # Remove the old key file and rename the new key to the non-temporary filename
 -  _ReplaceMasterKeyOnMaster(root_keyfiles)
 +  ssh.ReplaceSshKeys(new_key_type, new_key_type,
 +                     src_key_suffix=constants.SSHS_MASTER_SUFFIX)
  
    # Remove old key from authorized keys
    (auth_key_file, _) = \
        potential_master_candidates,
        keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
        from_public_keys=False, clear_authorized_keys=False,
 -      clear_public_keys=False)
 +      clear_public_keys=False,
 +      pub_key_file=ganeti_pub_keys_file,
 +      ssconf_store=ssconf_store,
 +      noded_cert_file=noded_cert_file,
 +      run_cmd_fn=run_cmd_fn,
 +      ssh_update_debug=ssh_update_debug,
 +      ssh_update_verbose=ssh_update_verbose)
    if node_errors:
      all_node_errors = all_node_errors + node_errors
  
@@@ -2921,15 -2900,21 +2913,21 @@@ def StartInstance(instance, startup_pau
    @rtype: None
  
    """
-   instance_info = _GetInstanceInfo(instance)
+   try:
+     instance_info = _GetInstanceInfo(instance)
+     hyper = hypervisor.GetHypervisor(instance.hypervisor)
  
-   if instance_info and not _IsInstanceUserDown(instance_info):
-     logging.info("Instance '%s' already running, not starting", instance.name)
-     return
+     if instance_info and not _IsInstanceUserDown(instance_info):
+       logging.info("Instance '%s' already running, not starting", instance.name)
+       if hyper.VerifyInstance(instance):
+         return
+       logging.info("Instance '%s' hypervisor config out of date. Restoring.",
+                    instance.name)
+       block_devices = _GatherAndLinkBlockDevs(instance)
+       hyper.RestoreInstance(instance, block_devices)
+       return
  
-   try:
      block_devices = _GatherAndLinkBlockDevs(instance)
-     hyper = hypervisor.GetHypervisor(instance.hypervisor)
      hyper.StartInstance(instance, block_devices, startup_paused)
      if store_reason:
        _StoreInstReasonTrail(instance.name, reason)
@@@ -3050,9 -3035,8 +3048,8 @@@ def InstanceReboot(instance, reboot_typ
    elif reboot_type == constants.INSTANCE_REBOOT_HARD:
      try:
        InstanceShutdown(instance, shutdown_timeout, reason, store_reason=False)
-       result = StartInstance(instance, False, reason, store_reason=False)
+       StartInstance(instance, False, reason, store_reason=False)
        _StoreInstReasonTrail(instance.name, reason)
-       return result
      except errors.HypervisorError, err:
        _Fail("Failed to hard reboot instance '%s': %s", instance.name, err)
    else:
@@@ -5788,25 -5772,18 +5785,25 @@@ def _PrepareRestrictedCmd(path, cmd
    return _verify_cmd(path, cmd)
  
  
 -def RunRestrictedCmd(cmd,
 -                     _lock_timeout=_RCMD_LOCK_TIMEOUT,
 -                     _lock_file=pathutils.RESTRICTED_COMMANDS_LOCK_FILE,
 -                     _path=pathutils.RESTRICTED_COMMANDS_DIR,
 -                     _sleep_fn=time.sleep,
 -                     _prepare_fn=_PrepareRestrictedCmd,
 -                     _runcmd_fn=utils.RunCmd,
 -                     _enabled=constants.ENABLE_RESTRICTED_COMMANDS):
 -  """Executes a restricted command after performing strict tests.
 +def RunConstrainedCmd(cmd,
 +                      lock_file,
 +                      path,
 +                      inp=None,
 +                      _lock_timeout=_RCMD_LOCK_TIMEOUT,
 +                      _sleep_fn=time.sleep,
 +                      _prepare_fn=_PrepareRestrictedCmd,
 +                      _runcmd_fn=utils.RunCmd,
 +                      _enabled=constants.ENABLE_RESTRICTED_COMMANDS):
 +  """Executes a command after performing strict tests.
  
    @type cmd: string
    @param cmd: Command name
 +  @type lock_file: string
 +  @param lock_file: path to the lock file
 +  @type path: string
 +  @param path: path to the directory in which the command is present
 +  @type inp: string
 +  @param inp: Input to be passed to the command
    @rtype: string
    @return: Command output
    @raise RPCFail: In case of an error
    try:
      cmdresult = None
      try:
 -      lock = utils.FileLock.Open(_lock_file)
 +      lock = utils.FileLock.Open(lock_file)
        lock.Exclusive(blocking=True, timeout=_lock_timeout)
  
 -      (status, value) = _prepare_fn(_path, cmd)
 +      (status, value) = _prepare_fn(path, cmd)
  
        if status:
 +        if inp:
 +          input_fd = tempfile.TemporaryFile()
 +          input_fd.write(inp)
 +          input_fd.flush()
 +          input_fd.seek(0)
 +        else:
 +          input_fd = None
          cmdresult = _runcmd_fn([value], env={}, reset_env=True,
 -                               postfork_fn=lambda _: lock.Unlock())
 +                               postfork_fn=lambda _: lock.Unlock(),
 +                               input_fd=input_fd)
 +        if input_fd:
 +          input_fd.close()
        else:
          logging.error(value)
      except Exception: # pylint: disable=W0703
diff --combined lib/bootstrap.py
@@@ -679,7 -679,7 +679,7 @@@ def InitCluster(cluster_name, mac_prefi
    for template, dt_params in diskparams.items():
      param_keys = set(dt_params.keys())
      default_param_keys = set(constants.DISK_DT_DEFAULTS[template].keys())
-     if not (param_keys <= default_param_keys):
+     if param_keys > default_param_keys:
        unknown_params = param_keys - default_param_keys
        raise errors.OpPrereqError("Invalid parameters for disk template %s:"
                                   " %s" % (template,
@@@ -867,7 -867,6 +867,7 @@@ def InitConfig(version, cluster_config
      default_nodegroup.uuid: default_nodegroup,
      }
    now = time.time()
 +  maintenance = objects.Maintenance(serial_no=1, ctime=now, mtime=now)
    config_data = objects.ConfigData(version=version,
                                     cluster=cluster_config,
                                     nodegroups=nodegroups,
                                     networks={},
                                     disks={},
                                     filters={},
 +                                   maintenance=maintenance,
                                     serial_no=1,
                                     ctime=now, mtime=now)
    utils.WriteFile(cfg_file,
@@@ -936,8 -934,6 +936,8 @@@ def SetupNodeDaemon(opts, cluster_name
      constants.NDS_CLUSTER_NAME: cluster_name,
      constants.NDS_NODE_DAEMON_CERTIFICATE:
        utils.ReadFile(pathutils.NODED_CERT_FILE),
 +    constants.NDS_HMAC:
 +      utils.ReadFile(pathutils.CONFD_HMAC_KEY),
      constants.NDS_SSCONF: ssconf.SimpleStore().ReadAll(),
      constants.NDS_START_NODE_DAEMON: True,
      constants.NDS_NODE_NAME: node,
diff --combined lib/cli.py
@@@ -39,7 -39,9 +39,9 @@@ import loggin
  import errno
  import itertools
  import shlex
  from cStringIO import StringIO
+ from optparse import (OptionParser, TitledHelpFormatter)
  
  from ganeti import utils
  from ganeti import errors
@@@ -55,12 -57,10 +57,10 @@@ from ganeti import pathutil
  from ganeti import serializer
  import ganeti.cli_opts
  # Import constants
- from ganeti.cli_opts import *  # pylint: disable=W0401
+ from ganeti.cli_opts import *  # pylint: disable=W0401,W0614
  
  from ganeti.runtime import (GetClient)
  
- from optparse import (OptionParser, TitledHelpFormatter)
  
  __all__ = [
    # Generic functions for CLI programs
@@@ -1715,8 -1715,8 +1715,8 @@@ def GenerateTable(headers, fields, sepa
    if unitfields is None:
      unitfields = []
  
-   numfields = utils.FieldSet(*numfields)   # pylint: disable=W0142
-   unitfields = utils.FieldSet(*unitfields) # pylint: disable=W0142
+   numfields = utils.FieldSet(*numfields)
+   unitfields = utils.FieldSet(*unitfields)
  
    format_fields = []
    for field in fields:
@@@ -2341,10 -2341,9 +2341,9 @@@ def GetNodesSshPorts(nodes, cl)
    @rtype: a list of tuples
  
    """
-   return [t[0] for t in
-              cl.QueryNodes(names=nodes,
-                            fields=["ndp/ssh_port"],
-                            use_locking=False)]
+   return [t[0] for t in cl.QueryNodes(names=nodes,
+                                       fields=["ndp/ssh_port"],
+                                       use_locking=False)]
  
  
  def GetNodeUUIDs(nodes, cl):
    @rtype: a list of tuples
  
    """
-   return [t[0] for t in
-              cl.QueryNodes(names=nodes,
-                            fields=["uuid"],
-                            use_locking=False)]
+   return [t[0] for t in cl.QueryNodes(names=nodes,
+                                       fields=["uuid"],
+                                       use_locking=False)]
  
  
  def _ToStream(stream, txt, *args):
@@@ -2781,7 -2779,7 +2779,7 @@@ def _InitISpecsFromSplitOpts(ipolicy, i
    else:
      forced_type = TISPECS_CLUSTER_TYPES
    for specs in ispecs_transposed.values():
-     assert type(specs) is dict
+     assert isinstance(specs, dict)
      utils.ForceDictType(specs, forced_type)
  
    # then transpose
@@@ -2877,7 -2875,6 +2875,7 @@@ def CreateIPolicyFromOpts(ispecs_mem_si
                            ipolicy_disk_templates=None,
                            ipolicy_vcpu_ratio=None,
                            ipolicy_spindle_ratio=None,
 +                          ipolicy_memory_ratio=None,
                            group_ipolicy=False,
                            allowed_values=None,
                            fill_all=False):
  
    split_specs = (ispecs_mem_size or ispecs_cpu_count or ispecs_disk_count or
                   ispecs_disk_size or ispecs_nic_count)
-   if (split_specs and (minmax_ispecs is not None or std_ispecs is not None)):
+   if split_specs and (minmax_ispecs is not None or std_ispecs is not None):
      raise errors.OpPrereqError("A --specs-xxx option cannot be specified"
                                 " together with any --ipolicy-xxx-specs option",
                                 errors.ECODE_INVAL)
      _InitISpecsFromSplitOpts(ipolicy_out, ispecs_mem_size, ispecs_cpu_count,
                               ispecs_disk_count, ispecs_disk_size,
                               ispecs_nic_count, group_ipolicy, fill_all)
-   elif (minmax_ispecs is not None or std_ispecs is not None):
+   elif minmax_ispecs is not None or std_ispecs is not None:
      _InitISpecsFromFullOpts(ipolicy_out, minmax_ispecs, std_ispecs,
                              group_ipolicy, allowed_values)
  
      ipolicy_out[constants.IPOLICY_VCPU_RATIO] = ipolicy_vcpu_ratio
    if ipolicy_spindle_ratio is not None:
      ipolicy_out[constants.IPOLICY_SPINDLE_RATIO] = ipolicy_spindle_ratio
 +  if ipolicy_memory_ratio is not None:
 +    ipolicy_out[constants.IPOLICY_MEMORY_RATIO] = ipolicy_memory_ratio
  
    assert not (frozenset(ipolicy_out.keys()) - constants.IPOLICY_ALL_KEYS)
  
@@@ -2932,7 -2927,7 +2930,7 @@@ def _NotAContainer(data)
    @rtype: bool
  
    """
-   return not (isinstance(data, (list, dict, tuple)))
+   return not isinstance(data, (list, dict, tuple))
  
  
  def _GetAlignmentMapping(data):
diff --combined lib/cli_opts.py
@@@ -31,6 -31,9 +31,9 @@@
  """Module containing Ganeti's command line parsing options"""
  
  import re
+ from optparse import (Option, OptionValueError)
  import simplejson
  
  from ganeti import utils
@@@ -40,8 -43,6 +43,6 @@@ from ganeti import compa
  from ganeti import pathutils
  from ganeti import serializer
  
- from optparse import (Option, OptionValueError)
  
  __all__ = [
    "ABSOLUTE_OPT",
@@@ -82,7 -83,6 +83,7 @@@
    "DST_NODE_OPT",
    "EARLY_RELEASE_OPT",
    "ENABLED_DATA_COLLECTORS_OPT",
 +  "DIAGNOSE_DATA_COLLECTOR_FILENAME_OPT",
    "ENABLED_DISK_TEMPLATES_OPT",
    "ENABLED_HV_OPT",
    "ENABLED_USER_SHUTDOWN_OPT",
    "IGNORE_SOFT_ERRORS_OPT",
    "IGNORE_SIZE_OPT",
    "INCLUDEDEFAULTS_OPT",
 +  "INPUT_OPT",
    "INSTALL_IMAGE_OPT",
    "INSTANCE_COMMUNICATION_NETWORK_OPT",
    "INSTANCE_COMMUNICATION_OPT",
    "IPOLICY_STD_SPECS_OPT",
    "IPOLICY_STD_SPECS_STR",
    "IPOLICY_VCPU_RATIO",
 +  "IPOLICY_MEMORY_RATIO",
    "LONG_SLEEP_OPT",
    "MAC_PREFIX_OPT",
 +  "MAINT_BALANCE_OPT",
 +  "MAINT_BALANCE_THRESHOLD_OPT",
 +  "MAINT_INTERVAL_OPT",
    "MAINTAIN_NODE_HEALTH_OPT",
    "MASTER_NETDEV_OPT",
    "MASTER_NETMASK_OPT",
    "MC_OPT",
    "MIGRATION_MODE_OPT",
    "MODIFY_ETCHOSTS_OPT",
 +  "MODIFY_SSH_SETUP_OPT",
    "NET_OPT",
    "NETWORK6_OPT",
    "NETWORK_OPT",
@@@ -560,7 -554,7 +561,7 @@@ class CliOption(Option)
  
  
  # optparse.py sets make_option, so we do it for our own option class, too
- cli_option = CliOption
+ cli_option = CliOption # pylint: disable=C0103
  
  
  _YORNO = "yes|no"
@@@ -814,13 -808,6 +815,13 @@@ IPOLICY_SPINDLE_RATIO = cli_option("--i
                                     help=("The maximum allowed instances to"
                                           " spindle ratio"))
  
 +IPOLICY_MEMORY_RATIO = cli_option("--ipolicy-memory-ratio",
 +                                   dest="ipolicy_memory_ratio",
 +                                   type="maybefloat", default=None,
 +                                   help=("The maximum allowed used memory to"
 +                                         " physicall memory ratio (in terms of"
 +                                         " memory overcommitment)"))
 +
  HYPERVISOR_OPT = cli_option("-H", "--hypervisor-parameters", dest="hypervisor",
                              help="Hypervisor and hypervisor options, in the"
                              " format hypervisor:option=value,option=value,...",
@@@ -1114,21 -1101,6 +1115,21 @@@ COMPRESSION_TOOLS_OPT = 
                 help="Comma-separated list of compression tools which are"
                      " allowed to be used by Ganeti in various operations")
  
 +MAINT_INTERVAL_OPT = \
 +  cli_option("--maintenance-interval", dest="maint_round_delay", type="int",
 +             default=None, help="Minimal time in seconds, the maintenance"
 +             " daemon waits between rounds")
 +
 +MAINT_BALANCE_OPT = \
 +  cli_option("--auto-balance-cluster", dest="maint_balance", type="bool",
 +             default=None, metavar=_YORNO, help="Whether the maintenance"
 +             " daemon should balance the cluster")
 +
 +MAINT_BALANCE_THRESHOLD_OPT = \
 +  cli_option("--auto-balance-threshold", dest="maint_balance_threshold",
 +             type="float", default=None, metavar="CLUSTERSCORE",
 +             help="Minimal gain for an auto-balancing step to be taken")
 +
  VG_NAME_OPT = cli_option("--vg-name", dest="vg_name",
                           help=("Enables LVM and specifies the volume group"
                                 " name (cluster-wide) for disk allocation"
@@@ -1207,12 -1179,6 +1208,12 @@@ NOMODIFY_SSH_SETUP_OPT = cli_option("--
                                      help="Don't initialize SSH keys",
                                      action="store_false", default=True)
  
 +MODIFY_SSH_SETUP_OPT = \
 + cli_option("--modify-ssh-setup", dest="modify_ssh_setup", metavar=_YORNO,
 +            default=None, type="bool",
 +            help="Defines whether the cluster should update node SSH keys"
 +            " on node add and on renew-crypto")
 +
  ERROR_CODES_OPT = cli_option("--error-codes", dest="error_codes",
                               help="Enable parseable error messages",
                               action="store_true", default=False)
@@@ -1623,17 -1589,6 +1624,17 @@@ ENABLED_DATA_COLLECTORS_OPT = 
                 "in the format collector=bool, where collector is one of %s."
                 % ", ".join(constants.DATA_COLLECTOR_NAMES))
  
 +DIAGNOSE_DATA_COLLECTOR_FILENAME_OPT = \
 +    cli_option("--diagnose-data-collector-filename",
 +                         dest="diagnose_data_collector_filename",
 +                         help=("Set's the file name of the script"
 +                               " diagnose data collector should run"
 +                               " If this value is empty string, the collector"
 +                               " will return a success value"
 +                               " without running anything"),
 +                         type="string")
 +
 +
  VERIFY_CLUTTER_OPT = cli_option(
      "--verify-ssh-clutter", default=False, dest="verify_clutter",
      help="Verify that Ganeti did not clutter"
@@@ -1643,11 -1598,6 +1644,11 @@@ LONG_SLEEP_OPT = cli_option
      "--long-sleep", default=False, dest="long_sleep",
      help="Allow long shutdowns when backing up instances", action="store_true")
  
 +INPUT_OPT = cli_option("--input", dest="input", default=None,
 +                       help=("input to be passed as stdin"
 +                             " to the repair command"),
 +                       type="string")
 +
  SSH_KEY_TYPE_OPT = \
      cli_option("--ssh-key-type", default=None,
                 choices=list(constants.SSHK_ALL), dest="ssh_key_type",
@@@ -1705,7 -1655,6 +1706,7 @@@ INSTANCE_POLICY_OPTS = 
    IPOLICY_DISK_TEMPLATES,
    IPOLICY_VCPU_RATIO,
    IPOLICY_SPINDLE_RATIO,
 +  IPOLICY_MEMORY_RATIO,
    ]
  
  # instance policy split specs options
  # W0614: Unused import %s from wildcard import (since we need cli)
  # C0103: Invalid name gnt-cluster
  
- from cStringIO import StringIO
+ import itertools
  import os
  import time
- import OpenSSL
  import tempfile
- import itertools
+ from cStringIO import StringIO
+ import OpenSSL
  
  from ganeti.cli import *
  from ganeti import bootstrap
@@@ -59,7 -61,6 +61,7 @@@ from ganeti import sscon
  from ganeti import ssh
  from ganeti import uidpool
  from ganeti import utils
 +from ganeti import wconfd
  from ganeti.client import base
  
  
@@@ -95,10 -96,6 +97,10 @@@ DATA_COLLECTOR_INTERVAL_OPT = cli_optio
      "--data-collector-interval", default={}, type="keyval",
      help="Set collection intervals in seconds of data collectors.")
  
 +STRICT_OPT = cli_option("--no-strict", default=False,
 +                        dest="no_strict", action="store_true",
 +                        help="Do not run group verify in strict mode")
 +
  _EPO_PING_INTERVAL = 30 # 30 seconds between pings
  _EPO_PING_TIMEOUT = 1 # 1 second
  _EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes
@@@ -203,7 -200,7 +205,7 @@@ def InitCluster(opts, args)
    # check the disk template types here, as we cannot rely on the type check done
    # by the opcode parameter types
    diskparams_keys = set(diskparams.keys())
-   if not (diskparams_keys <= constants.DISK_TEMPLATES):
+   if diskparams_keys > constants.DISK_TEMPLATES:
      unknown = utils.NiceSort(diskparams_keys - constants.DISK_TEMPLATES)
      ToStderr("Disk templates unknown: %s" % utils.CommaJoin(unknown))
      return 1
      ipolicy_disk_templates=opts.ipolicy_disk_templates,
      ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
      ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
 +    ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
      fill_all=True)
  
    if opts.candidate_pool_size is None:
  
    default_ialloc_params = opts.default_iallocator_params
  
-   if opts.enabled_user_shutdown:
-     enabled_user_shutdown = True
-   else:
-     enabled_user_shutdown = False
+   enabled_user_shutdown = bool(opts.enabled_user_shutdown)
  
    if opts.ssh_key_type:
      ssh_key_type = opts.ssh_key_type
@@@ -804,8 -797,7 +803,8 @@@ def VerifyDisks(opts, args)
    """
    cl = GetClient()
  
 -  op = opcodes.OpClusterVerifyDisks(group_name=opts.nodegroup)
 +  op = opcodes.OpClusterVerifyDisks(group_name=opts.nodegroup,
 +                                    is_strict=not opts.no_strict)
  
    result = SubmitOpCode(op, cl=cl, opts=opts)
  
          if all_missing:
            ToStdout("Instance %s cannot be verified as it lives on"
                     " broken nodes", iname)
-         else:
-           ToStdout("Instance %s has missing logical volumes:", iname)
-           ival.sort()
-           for node, vol in ival:
-             if node in bad_nodes:
-               ToStdout("\tbroken node %s /dev/%s", node, vol)
-             else:
-               ToStdout("\t%s /dev/%s", node, vol)
+           continue
+         ToStdout("Instance %s has missing logical volumes:", iname)
+         ival.sort()
+         for node, vol in ival:
+           if node in bad_nodes:
+             ToStdout("\tbroken node %s /dev/%s", node, vol)
+           else:
+             ToStdout("\t%s /dev/%s", node, vol)
  
        ToStdout("You need to replace or recreate disks for all the above"
                 " instances if this message persists after fixing broken nodes.")
@@@ -1218,9 -1211,7 +1218,9 @@@ def _RenewCrypto(new_cluster_cert, new_
          node_certificates=new_node_cert or new_cluster_cert,
          renew_ssh_keys=new_ssh_keys,
          ssh_key_type=ssh_key_type,
 -        ssh_key_bits=ssh_key_bits)
 +        ssh_key_bits=ssh_key_bits,
 +        verbose=verbose,
 +        debug=debug)
      SubmitOpCode(renew_op, cl=cl)
  
    ToStdout("All requested certificates and keys have been replaced."
@@@ -1277,10 -1268,10 +1277,10 @@@ def _BuildGanetiPubKeys(options, pub_ke
  
    # get the key files of all non-master nodes
    for node in nonmaster_nodes:
 -    pub_key = ssh.ReadRemoteSshPubKeys(pub_key_filename, node, cluster_name,
 -                                       ssh_port_map[node],
 -                                       options.ssh_key_check,
 -                                       options.ssh_key_check)
 +    pub_key = ssh.ReadRemoteSshPubKey(pub_key_filename, node, cluster_name,
 +                                      ssh_port_map[node],
 +                                      options.ssh_key_check,
 +                                      options.ssh_key_check)
      ssh.AddPublicKey(node_uuid_map[node], pub_key, key_file=pub_key_file)
  
  
@@@ -1400,9 -1391,7 +1400,9 @@@ def SetClusterParams(opts, args)
            opts.ipolicy_disk_templates is not None or
            opts.ipolicy_vcpu_ratio is not None or
            opts.ipolicy_spindle_ratio is not None or
 +          opts.ipolicy_memory_ratio is not None or
            opts.modify_etc_hosts is not None or
 +          opts.modify_ssh_setup is not None or
            opts.file_storage_dir is not None or
            opts.install_image is not None or
            opts.instance_communication_network is not None or
            opts.compression_tools is not None or
            opts.shared_file_storage_dir is not None or
            opts.enabled_user_shutdown is not None or
 +          opts.maint_round_delay is not None or
 +          opts.maint_balance is not None or
 +          opts.maint_balance_threshold is not None or
            opts.data_collector_interval or
 +          opts.diagnose_data_collector_filename is not None or
            opts.enabled_data_collectors):
      ToStderr("Please give at least one of the parameters.")
      return 1
      ipolicy_disk_templates=opts.ipolicy_disk_templates,
      ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
      ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
 +    ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
      )
  
    mnh = opts.maintain_node_health
      max_tracked_jobs=opts.max_tracked_jobs,
      maintain_node_health=mnh,
      modify_etc_hosts=opts.modify_etc_hosts,
 +    modify_ssh_setup=opts.modify_ssh_setup,
      uid_pool=uid_pool,
      add_uids=add_uids,
      remove_uids=remove_uids,
      shared_file_storage_dir=opts.shared_file_storage_dir,
      compression_tools=compression_tools,
      enabled_user_shutdown=opts.enabled_user_shutdown,
 +    maint_round_delay=opts.maint_round_delay,
 +    maint_balance=opts.maint_balance,
 +    maint_balance_threshold=opts.maint_balance_threshold,
      enabled_data_collectors=enabled_data_collectors,
      data_collector_interval=data_collector_interval,
 +    diagnose_data_collector_filename=opts.diagnose_data_collector_filename
      )
    return base.GetResult(None, opts, SubmitOrSend(op, opts))
  
@@@ -1962,21 -1941,6 +1962,21 @@@ def Epo(opts, args, qcl=None, _on_fn=_E
      return _off_fn(opts, node_list, inst_map)
  
  
 +def RemoveRepair(opts, args):
 +  """Unconditionally remove a repair event
 +
 +  @param opts: the command line options selected by the user (ignored)
 +  @type args: list
 +  @param args: one element, the uuid of the event to remove
 +  @rtype: int
 +  @return: the desired exit code
 +
 +  """
 +  uuid = args[0]
 +  wconfd.Client().RmMaintdIncident(uuid)
 +  return 0
 +
 +
  def _GetCreateCommand(info):
    buf = StringIO()
    buf.write("gnt-cluster init")
@@@ -2529,7 -2493,7 +2529,7 @@@ commands = 
       VERIFY_CLUTTER_OPT],
      "", "Does a check on the cluster configuration"),
    "verify-disks": (
 -    VerifyDisks, ARGS_NONE, [PRIORITY_OPT, NODEGROUP_OPT],
 +    VerifyDisks, ARGS_NONE, [PRIORITY_OPT, NODEGROUP_OPT, STRICT_OPT],
      "", "Does a check on the cluster disk status"),
    "repair-disk-sizes": (
      RepairDiskSizes, ARGS_MANY_INSTANCES, [DRY_RUN_OPT, PRIORITY_OPT],
       PREALLOC_WIPE_DISKS_OPT, NODE_PARAMS_OPT, USE_EXTERNAL_MIP_SCRIPT,
       DISK_PARAMS_OPT, HV_STATE_OPT, DISK_STATE_OPT] + SUBMIT_OPTS +
       [ENABLED_DISK_TEMPLATES_OPT, IPOLICY_STD_SPECS_OPT, MODIFY_ETCHOSTS_OPT,
 -      ENABLED_USER_SHUTDOWN_OPT] +
 +      MODIFY_SSH_SETUP_OPT, ENABLED_USER_SHUTDOWN_OPT] +
       INSTANCE_POLICY_OPTS +
       [GLOBAL_FILEDIR_OPT, GLOBAL_SHARED_FILEDIR_OPT, ZEROING_IMAGE_OPT,
        COMPRESSION_TOOLS_OPT] +
 -     [ENABLED_DATA_COLLECTORS_OPT, DATA_COLLECTOR_INTERVAL_OPT],
 +     [ENABLED_DATA_COLLECTORS_OPT, DATA_COLLECTOR_INTERVAL_OPT,
 +      DIAGNOSE_DATA_COLLECTOR_FILENAME_OPT,
 +      MAINT_INTERVAL_OPT, MAINT_BALANCE_OPT, MAINT_BALANCE_THRESHOLD_OPT],
      "[opts...]",
      "Alters the parameters of the cluster"),
    "renew-crypto": (
    "upgrade": (
      UpgradeGanetiCommand, ARGS_NONE, [TO_OPT, RESUME_OPT], "",
      "Upgrade (or downgrade) to a new Ganeti version"),
 +  "remove-repair": (
 +    RemoveRepair, [ArgUnknown()], [], "<uuid>",
 +    "Remove a repair event from the list of pending events"),
    }
  
  
@@@ -67,7 -67,7 +67,7 @@@ from ganeti.cmdlib.common import ShareA
    CheckIpolicyVsDiskTemplates, CheckDiskAccessModeValidity, \
    CheckDiskAccessModeConsistency, GetClientCertDigest, \
    AddInstanceCommunicationNetworkOp, ConnectInstanceCommunicationNetworkOp, \
-   CheckImageValidity, CheckDiskAccessModeConsistency, EnsureKvmdOnNodes
+   CheckImageValidity, EnsureKvmdOnNodes
  
  import ganeti.masterd.instance
  
@@@ -191,9 -191,7 +191,9 @@@ class LUClusterRenewCrypto(NoHooksLU)
        potential_master_candidates,
        cluster_info.ssh_key_type, # Old key type
        self.ssh_key_type,         # New key type
 -      self.ssh_key_bits)         # New key bits
 +      self.ssh_key_bits,         # New key bits
 +      self.op.debug,
 +      self.op.verbose)
      result[master_uuid].Raise("Could not renew the SSH keys of all nodes")
  
      # After the keys have been successfully swapped, time to commit the change
@@@ -1481,20 -1479,6 +1481,20 @@@ class LUClusterSetParams(LogicalUnit)
          feedback_fn("Cluster LVM configuration already in desired"
                      " state, not changing")
  
 +  def _SetDiagnoseDataCollectorFilename(self, feedback_fn):
 +    """Determines and sets the filename of the script
 +    diagnose data collector should run.
 +
 +    """
 +    if self.op.diagnose_data_collector_filename is not None:
 +      fn = self.op.diagnose_data_collector_filename
 +      if fn != self.cfg.GetDiagnoseDataCollectorFilename():
 +        self.cfg.SetDiagnoseDataCollectorFilename(fn)
 +      else:
 +        feedback_fn("Diagnose data collector filename"
 +                    " configuration already in desired"
 +                    " state, not changing")
 +
    def _SetFileStorageDir(self, feedback_fn):
      """Set the file storage directory.
  
      self._SetSharedFileStorageDir(feedback_fn)
      self.cfg.Update(self.cluster, feedback_fn)
      self._SetDrbdHelper(feedback_fn)
 +    self._SetDiagnoseDataCollectorFilename(feedback_fn)
  
      # re-read the fresh configuration again
      self.cluster = self.cfg.GetClusterInfo()
      if self.op.modify_etc_hosts is not None:
        self.cluster.modify_etc_hosts = self.op.modify_etc_hosts
  
 +    if self.op.modify_ssh_setup is not None:
 +      if (self.op.modify_ssh_setup and
 +          not self.cfg.GetClusterInfo().modify_ssh_setup):
 +        feedback_fn(
 +          "Enabling modify_ssh_setup for cluster. You may need to run"
 +          " 'gnt-cluster renew-crypto --new-ssh-keys --no-ssh-key-check'"
 +          " to redistribute the ssh public key settings for each node.")
 +      self.cluster.modify_ssh_setup = self.op.modify_ssh_setup
 +
      if self.op.prealloc_wipe_disks is not None:
        self.cluster.prealloc_wipe_disks = self.op.prealloc_wipe_disks
  
      if self.op.compression_tools is not None:
        self.cfg.SetCompressionTools(self.op.compression_tools)
  
 +    if self.op.maint_round_delay is not None:
 +      self.cfg.SetMaintdRoundDelay(self.op.maint_round_delay)
 +
 +    if self.op.maint_balance is not None:
 +      self.cfg.SetMaintdBalance(self.op.maint_balance)
 +
 +    if self.op.maint_balance_threshold is not None:
 +      self.cfg.SetMaintdBalanceThreshold(self.op.maint_balance_threshold)
 +
      network_name = self.op.instance_communication_network
      if network_name is not None:
        return self._ModifyInstanceCommunicationNetwork(self.cfg,
@@@ -141,7 -141,6 +141,6 @@@ class _VerifyErrors(object)
      # Report messages via the feedback_fn
      # pylint: disable=E1101
      self._feedback_fn(constants.ELOG_MESSAGE_LIST, prefixed_list)
-     # pylint: enable=E1101
  
      # do not mark the operation as failed for WARN cases only
      if log_type == self.ETYPE_ERROR:
@@@ -210,7 -209,7 +209,7 @@@ class LUClusterVerify(NoHooksLU)
        for group in groups)
  
      # Fix up all parameters
-     for op in itertools.chain(*jobs): # pylint: disable=W0142
+     for op in itertools.chain(*jobs):
        op.debug_simulate_errors = self.op.debug_simulate_errors
        op.verbose = self.op.verbose
        op.error_codes = self.op.error_codes
@@@ -259,10 -258,8 +258,10 @@@ class LUClusterVerifyDisks(NoHooksLU)
        return ResultWithJobs([])
      else:
        # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
 -      return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
 -                             for group in group_names])
 +      return ResultWithJobs(
 +          [[opcodes.OpGroupVerifyDisks(group_name=group,
 +                                       is_strict=self.op.is_strict)]
 +           for group in group_names])
  
  
  class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
@@@ -390,8 -387,6 +389,8 @@@ class LUClusterVerifyGroup(LogicalUnit
      @ivar sbp: dictionary of {primary-node: list of instances} for all
          instances for which this node is secondary (config)
      @ivar mfree: free memory, as reported by hypervisor (runtime)
 +    @ivar mtotal: total memory, as reported by hypervisor (runtime)
 +    @ivar mdom0: domain0 memory, as reported by hypervisor (runtime)
      @ivar dfree: free disk, as reported by the node (runtime)
      @ivar offline: the offline status (config)
      @type rpc_fail: boolean
        self.sinst = []
        self.sbp = {}
        self.mfree = 0
 +      self.mtotal = 0
 +      self.mdom0 = 0
        self.dfree = 0
        self.offline = offline
        self.vm_capable = vm_capable
  
      # We detect here the nodes that will need the extra RPC calls for verifying
      # split LV volumes; they should be locked.
-     extra_lv_nodes = set()
+     extra_lv_nodes = {}
  
      for inst in self.my_inst_info.values():
        disks = self.cfg.GetInstanceDisks(inst.uuid)
          inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)
          for nuuid in inst_nodes:
            if self.all_node_info[nuuid].group != self.group_uuid:
-             extra_lv_nodes.add(nuuid)
+             if nuuid in extra_lv_nodes:
+               extra_lv_nodes[nuuid].append(inst.name)
+             else:
+               extra_lv_nodes[nuuid] = [inst.name]
  
+     extra_lv_nodes_set = set(extra_lv_nodes.iterkeys())
      unlocked_lv_nodes = \
-         extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
+         extra_lv_nodes_set.difference(self.owned_locks(locking.LEVEL_NODE))
  
      if unlocked_lv_nodes:
+       node_strings = ['%s: [%s]' % (
+           self.cfg.GetNodeName(node), utils.CommaJoin(extra_lv_nodes[node]))
+             for node in unlocked_lv_nodes]
        raise errors.OpPrereqError("Missing node locks for LV check: %s" %
-                                  utils.CommaJoin(unlocked_lv_nodes),
+                                  utils.CommaJoin(node_strings),
                                   errors.ECODE_STATE)
-     self.extra_lv_nodes = list(extra_lv_nodes)
+     self.extra_lv_nodes = list(extra_lv_nodes_set)
  
    def _VerifyNode(self, ninfo, nresult):
      """Perform some basic validation on data returned from a node.
      # exclusive_storage wants all PVs to have the same size (approximately),
      # if the smallest and the biggest ones are okay, everything is fine.
      # pv_min is None iff pv_max is None
-     vals = filter((lambda ni: ni.pv_min is not None), node_image.values())
+     vals = [ni for ni in node_image.values() if ni.pv_min is not None]
      if not vals:
        return
      (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals)
  
      """
      cluster_info = self.cfg.GetClusterInfo()
 +    ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster_info,
 +                                                            self.group_info)
 +    memory_ratio = ipolicy[constants.IPOLICY_MEMORY_RATIO]
 +
      for node_uuid, n_img in node_image.items():
        # This code checks that every node which is now listed as
        # secondary has enough memory to host all instances it is
        # WARNING: we currently take into account down instances as well
        # as up ones, considering that even if they're down someone
        # might want to start them even in the event of a node failure.
 +      node_cfg = self.all_node_info[node_uuid]
        if n_img.offline or \
 -         self.all_node_info[node_uuid].group != self.group_uuid:
 +         node_cfg.group != self.group_uuid:
          # we're skipping nodes marked offline and nodes in other groups from
          # the N+1 warning, since most likely we don't have good memory
          # information from them; we already list instances living on such
            bep = cluster_info.FillBE(all_insts[inst_uuid])
            if bep[constants.BE_AUTO_BALANCE]:
              needed_mem += bep[constants.BE_MINMEM]
 -        test = n_img.mfree < needed_mem
 +        mnode = n_img.mdom0
 +        (hv, hv_state) = self.cfg.GetFilledHvStateParams(node_cfg).items()[0]
 +        if hv != constants.HT_XEN_PVM and hv != constants.HT_XEN_HVM:
 +          mnode = hv_state["mem_node"]
 +        # minimum allowed free memory (it's negative due to over-commitment)
 +        mem_treshold = (n_img.mtotal - mnode) * (memory_ratio - 1)
 +        test = n_img.mfree - needed_mem < mem_treshold
          self._ErrorIf(test, constants.CV_ENODEN1,
                        self.cfg.GetNodeName(node_uuid),
                        "not enough memory to accomodate instance failovers"
      """
      # try to read free memory (from the hypervisor)
      hv_info = nresult.get(constants.NV_HVINFO, None)
 -    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
 +    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info \
 +                                         or "memory_total" not in hv_info \
 +                                         or "memory_dom0" not in hv_info
      self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
                    "rpc call to node failed (hvinfo)")
      if not test:
        try:
          nimg.mfree = int(hv_info["memory_free"])
 +        nimg.mtotal = int(hv_info["memory_total"])
 +        nimg.mdom0 = int(hv_info["memory_dom0"])
        except (ValueError, TypeError):
          self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
                        "node returned invalid nodeinfo, check hypervisor")
diff --combined lib/cmdlib/common.py
@@@ -125,10 -125,11 +125,11 @@@ def CheckNodeGroupInstances(cfg, group_
    wanted_instances = frozenset(cfg.GetInstanceNames(
                                   cfg.GetNodeGroupInstances(group_uuid)))
    if owned_instance_names != wanted_instances:
+     group_name = cfg.GetNodeGroup(group_uuid).name
      raise errors.OpPrereqError("Instances in node group '%s' changed since"
                                 " locks were acquired, wanted '%s', have '%s';"
                                 " retry the operation" %
-                                (group_uuid,
+                                (group_name,
                                  utils.CommaJoin(wanted_instances),
                                  utils.CommaJoin(owned_instance_names)),
                                 errors.ECODE_STATE)
@@@ -482,9 -483,7 +483,9 @@@ def AddMasterCandidateSshKey
      potential_master_candidates,
      True, # add node's key to all node's 'authorized_keys'
      True, # all nodes are potential master candidates
 -    False) # do not update the node's public keys
 +    False, # do not update the node's public keys
 +    lu.op.debug,
 +    lu.op.verbose)
    ssh_result[master_node].Raise(
      "Could not update the SSH setup of node '%s' after promotion"
      " (UUID: %s)." % (node.name, node.uuid))
@@@ -1588,15 -1587,15 +1589,15 @@@ def EnsureKvmdOnNodes(lu, feedback_fn, 
    if start_nodes:
      results = lu.rpc.call_node_ensure_daemon(start_nodes, constants.KVMD, True)
      for node_uuid in start_nodes:
-       results[node_uuid].Warn("Failed to start KVM daemon in node '%s'" %
-                               node_uuid, feedback_fn)
+       results[node_uuid].Warn("Failed to start KVM daemon on node '%s'" %
+                               lu.cfg.GetNodeName(node_uuid), feedback_fn)
  
    # Stop KVM where necessary
    if stop_nodes:
      results = lu.rpc.call_node_ensure_daemon(stop_nodes, constants.KVMD, False)
      for node_uuid in stop_nodes:
-       results[node_uuid].Warn("Failed to stop KVM daemon in node '%s'" %
-                               node_uuid, feedback_fn)
+       results[node_uuid].Warn("Failed to stop KVM daemon on node '%s'" %
+                               lu.cfg.GetNodeName(node_uuid), feedback_fn)
  
  
  def WarnAboutFailedSshUpdates(result, master_uuid, feedback_fn):
diff --combined lib/cmdlib/misc.py
@@@ -40,11 -40,7 +40,11 @@@ from ganeti import qlan
  from ganeti import query
  from ganeti import utils
  from ganeti.cmdlib.base import NoHooksLU, QueryBase
 -from ganeti.cmdlib.common import GetWantedNodes, SupportsOob
 +from ganeti.cmdlib.common import (
 +  GetWantedNodes,
 +  SupportsOob,
 +  ExpandNodeUuidAndName
 +)
  
  
  class LUOobCommand(NoHooksLU):
          self.LogWarning("Out-of-band RPC failed on node '%s': %s",
                          node.name, result.fail_msg)
          node_entry.append((constants.RS_NODATA, None))
+         continue
+       try:
+         self._CheckPayload(result)
+       except errors.OpExecError, err:
+         self.LogWarning("Payload returned by node '%s' is not valid: %s",
+                         node.name, err)
+         node_entry.append((constants.RS_NODATA, None))
        else:
-         try:
-           self._CheckPayload(result)
-         except errors.OpExecError, err:
-           self.LogWarning("Payload returned by node '%s' is not valid: %s",
-                           node.name, err)
-           node_entry.append((constants.RS_NODATA, None))
-         else:
-           if self.op.command == constants.OOB_HEALTH:
-             # For health we should log important events
-             for item, status in result.payload:
-               if status in [constants.OOB_STATUS_WARNING,
-                             constants.OOB_STATUS_CRITICAL]:
-                 self.LogWarning("Item '%s' on node '%s' has status '%s'",
-                                 item, node.name, status)
-           if self.op.command == constants.OOB_POWER_ON:
-             node.powered = True
-           elif self.op.command == constants.OOB_POWER_OFF:
-             node.powered = False
-           elif self.op.command == constants.OOB_POWER_STATUS:
-             powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
-             if powered != node.powered:
-               logging.warning(("Recorded power state (%s) of node '%s' does not"
-                                " match actual power state (%s)"), node.powered,
-                               node.name, powered)
-           # For configuration changing commands we should update the node
-           if self.op.command in (constants.OOB_POWER_ON,
-                                  constants.OOB_POWER_OFF):
-             self.cfg.Update(node, feedback_fn)
-           node_entry.append((constants.RS_NORMAL, result.payload))
-           if (self.op.command == constants.OOB_POWER_ON and
-               idx < len(self.nodes) - 1):
-             time.sleep(self.op.power_delay)
+         if self.op.command == constants.OOB_HEALTH:
+           # For health we should log important events
+           for item, status in result.payload:
+             if status in [constants.OOB_STATUS_WARNING,
+                           constants.OOB_STATUS_CRITICAL]:
+               self.LogWarning("Item '%s' on node '%s' has status '%s'",
+                               item, node.name, status)
+         if self.op.command == constants.OOB_POWER_ON:
+           node.powered = True
+         elif self.op.command == constants.OOB_POWER_OFF:
+           node.powered = False
+         elif self.op.command == constants.OOB_POWER_STATUS:
+           powered = result.payload[constants.OOB_POWER_STATUS_POWERED]
+           if powered != node.powered:
+             logging.warning(("Recorded power state (%s) of node '%s' does not"
+                              " match actual power state (%s)"), node.powered,
+                             node.name, powered)
+         # For configuration changing commands we should update the node
+         if self.op.command in (constants.OOB_POWER_ON,
+                                constants.OOB_POWER_OFF):
+           self.cfg.Update(node, feedback_fn)
+         node_entry.append((constants.RS_NORMAL, result.payload))
+         if (self.op.command == constants.OOB_POWER_ON and
+             idx < len(self.nodes) - 1):
+           time.sleep(self.op.power_delay)
  
      return ret
  
@@@ -422,35 -419,3 +423,35 @@@ class LURestrictedCommand(NoHooksLU)
          result.append((True, nres.payload))
  
      return result
 +
 +
 +class LURepairCommand(NoHooksLU):
 +  """Logical unit for executing repair commands.
 +
 +  """
 +  REQ_BGL = False
 +
 +  def ExpandNames(self):
 +    self.node_uuid, _ = ExpandNodeUuidAndName(self.cfg, None, self.op.node_name)
 +
 +    self.needed_locks = {
 +      locking.LEVEL_NODE: self.node_uuid,
 +      }
 +    self.share_locks = {
 +      locking.LEVEL_NODE: False,
 +      }
 +
 +  def CheckPrereq(self):
 +    """Check prerequisites.
 +
 +    """
 +
 +  def Exec(self, feedback_fn):
 +    """Execute restricted command and return output.
 +
 +    """
 +    owned_nodes = frozenset(self.owned_locks(locking.LEVEL_NODE))
 +    assert self.node_uuid in owned_nodes
 +    return self.rpc.call_repair_command(self.op.node_name,
 +                                            self.op.command,
 +                                            self.op.input).data[1]
diff --combined lib/config/__init__.py
@@@ -225,30 -225,6 +225,30 @@@ class ConfigWriter(object)
      """
      return self._UnlockedGetNdParams(node)
  
 +  def _UnlockedGetFilledHvStateParams(self, node):
 +    cfg = self._ConfigData()
 +    cluster_hv_state = cfg.cluster.hv_state_static
 +    def_hv = self._UnlockedGetHypervisorType()
 +    cluster_fv = constants.HVST_DEFAULTS if def_hv not in cluster_hv_state \
 +                                         else cluster_hv_state[def_hv]
 +    group_hv_state = self._UnlockedGetNodeGroup(node.group).hv_state_static
 +    group_fv = cluster_fv if def_hv not in group_hv_state else \
 +               objects.FillDict(cluster_fv, group_hv_state[def_hv])
 +    node_fv = group_fv if def_hv not in node.hv_state_static else \
 +              objects.FillDict(group_fv, node.hv_state_static[def_hv])
 +    return {def_hv: node_fv}
 +
 +  @ConfigSync(shared=1)
 +  def GetFilledHvStateParams(self, node):
 +    """Get the node params populated with cluster defaults.
 +
 +    @type node: L{objects.Node}
 +    @param node: The node we want to know the params for
 +    @return: A dict with the filled in node hv_state params for the default hv
 +
 +    """
 +    return self._UnlockedGetFilledHvStateParams(node)
 +
    @ConfigSync(shared=1)
    def GetNdGroupParams(self, nodegroup):
      """Get the node groups params populated with cluster defaults.
      """
      return self._ConfigData().cluster.gluster_storage_dir
  
 +  def _UnlockedGetHypervisorType(self):
 +    """Get the hypervisor type for this cluster.
 +
 +    """
 +    return self._ConfigData().cluster.enabled_hypervisors[0]
 +
    @ConfigSync(shared=1)
    def GetHypervisorType(self):
      """Get the hypervisor type for this cluster.
  
      """
 -    return self._ConfigData().cluster.enabled_hypervisors[0]
 +    return self._UnlockedGetHypervisorType()
  
    @ConfigSync(shared=1)
    def GetRsaHostKey(self):
  
      if expanded_name is not None:
        # there has to be exactly one instance with that name
-       inst = (filter(lambda n: n.name == expanded_name, all_insts)[0])
+       inst = [n for n in all_insts if n.name == expanded_name][0]
        return (inst.uuid, inst.name)
      else:
        return (None, None)
  
      if expanded_name is not None:
        # there has to be exactly one node with that name
-       node = (filter(lambda n: n.name == expanded_name, all_nodes)[0])
+       node = [n for n in all_nodes if n.name == expanded_name][0]
        return (node.uuid, node.name)
      else:
        return (None, None)
  
      # Update timestamps and serials (only once per node/group object)
      now = time.time()
-     for obj in frozenset(itertools.chain(*resmod)): # pylint: disable=W0142
+     for obj in frozenset(itertools.chain(*resmod)):
        obj.serial_no += 1
        obj.mtime = now
  
      self._ConfigData().cluster.serial_no += 1
  
    @ConfigSync(shared=1)
 +  def GetDiagnoseDataCollectorFilename(self):
 +    """Return the diagnose data collector filename
 +
 +    """
 +    return self._ConfigData().cluster.diagnose_data_collector_filename
 +
 +  @ConfigSync()
 +  def SetDiagnoseDataCollectorFilename(self, fn):
 +    """Set the diagnose data collector filename.
 +
 +    """
 +    self._ConfigData().cluster.diagnose_data_collector_filename = fn
 +    self._ConfigData().cluster.serial_no += 1
 +
 +  @ConfigSync(shared=1)
    def GetDRBDHelper(self):
      """Return DRBD usermode helper.
  
        if disk_uuid in inst_info.disks:
          return inst_uuid
  
 +  def SetMaintdRoundDelay(self, delay):
 +    """Set the minimal time the maintenance daemon should wait between rounds"""
 +    utils.SimpleRetry(True, self._wconfd.SetMaintdRoundDelay, 0.1, 30,
 +                      args=[delay])
 +
 +  def SetMaintdBalance(self, flag):
 +    """Enable/disable auto-balancing by the maintenance daemon"""
 +    utils.SimpleRetry(True, self._wconfd.SetMaintdBalance, 0.1, 30,
 +                      args=[flag])
 +
 +  def SetMaintdBalanceThreshold(self, score):
 +    """Set the minimal score improvement per move for balancing steps"""
 +    utils.SimpleRetry(True, self._wconfd.SetMaintdBalanceThreshold, 0.1, 30,
 +                      args=[score])
 +
  
  class DetachedConfig(ConfigWriter):
    """Read-only snapshot of the config."""
  
  """Module implementing the iallocator code."""
  
+ import logging
  from ganeti import compat
  from ganeti import constants
  from ganeti import errors
  from ganeti import ht
  from ganeti import outils
  from ganeti import opcodes
  from ganeti import serializer
  from ganeti import utils
  
+ import ganeti.rpc.node as rpc
  import ganeti.masterd.instance as gmi
  
- import logging
  _STRING_LIST = ht.TListOf(ht.TString)
  _JOB_LIST = ht.TListOf(ht.TListOf(ht.TStrictDict(True, False, {
     # pylint: disable=E1101
@@@ -156,7 -156,7 +156,7 @@@ class IARequestBase(outils.ValidatedSlo
      @raises ResultValidationError: If validation fails
  
      """
-     if ia.success and not self.REQ_RESULT(result):
+     if ia.success and not self.REQ_RESULT(result): # pylint: disable=E1102
        raise errors.ResultValidationError("iallocator returned invalid result,"
                                           " expected %s, got %s" %
                                           (self.REQ_RESULT, result))
@@@ -572,7 -572,6 +572,7 @@@ class IAllocator(object)
        "master_capable": ninfo.master_capable,
        "vm_capable": ninfo.vm_capable,
        "ndparams": cfg.GetNdParams(ninfo),
 +      "hv_state": cfg.GetFilledHvStateParams(ninfo)
        })
        for ninfo in node_cfg.values())
  
diff --combined lib/objects.py
@@@ -35,11 -35,14 +35,14 @@@ pass to and from external parties
  
  """
  
- # pylint: disable=E0203,W0201,R0902
+ # pylint: disable=E0203,E0237,W0201,R0902
  
  # E0203: Access to member %r before its definition, since we use
  # objects.py which doesn't explicitly initialise its members
  
+ # E0237: Assigning to attribute not defined in class slots. pylint doesn't
+ # appear to notice many of the slots defined in __slots__ for several objects.
  # W0201: Attribute '%s' defined outside __init__
  
  # R0902: Allow instances of these objects to have more than 20 attributes
@@@ -50,6 -53,7 +53,7 @@@ import cop
  import logging
  import time
  from cStringIO import StringIO
+ from socket import AF_INET
  
  from ganeti import errors
  from ganeti import constants
@@@ -58,12 -62,10 +62,10 @@@ from ganeti import outil
  from ganeti import utils
  from ganeti import serializer
  
  
  __all__ = ["ConfigObject", "ConfigData", "NIC", "Disk", "Instance",
             "OS", "Node", "NodeGroup", "Cluster", "FillDict", "Network",
 -           "Filter"]
 +           "Filter", "Maintenance"]
  
  _TIMESTAMPS = ["ctime", "mtime"]
  _UUID = ["uuid"]
@@@ -278,7 -280,7 +280,7 @@@ class ConfigObject(outils.ValidatedSlot
        raise errors.ConfigurationError("Invalid object passed to FromDict:"
                                        " expected dict, got %s" % type(val))
      val_str = dict([(str(k), v) for k, v in val.iteritems()])
-     obj = cls(**val_str) # pylint: disable=W0142
+     obj = cls(**val_str)
      return obj
  
    def Copy(self):
@@@ -416,7 -418,6 +418,7 @@@ class ConfigData(ConfigObject)
      "networks",
      "disks",
      "filters",
 +    "maintenance",
      "serial_no",
      ] + _TIMESTAMPS
  
      """
      mydict = super(ConfigData, self).ToDict(_with_private=_with_private)
      mydict["cluster"] = mydict["cluster"].ToDict()
 +    mydict["maintenance"] = mydict["maintenance"].ToDict()
      for key in ("nodes", "instances", "nodegroups", "networks", "disks",
                  "filters"):
        mydict[key] = outils.ContainerToDicts(mydict[key])
      obj.networks = outils.ContainerFromDicts(obj.networks, dict, Network)
      obj.disks = outils.ContainerFromDicts(obj.disks, dict, Disk)
      obj.filters = outils.ContainerFromDicts(obj.filters, dict, Filter)
 +    obj.maintenance = Maintenance.FromDict(obj.maintenance)
      return obj
  
    def DisksOfType(self, dev_type):
        disk.UpgradeConfig()
      if self.filters is None:
        self.filters = {}
 +    if self.maintenance is None:
 +      self.maintenance = Maintenance.FromDict({})
 +    self.maintenance.UpgradeConfig()
  
    def _UpgradeEnabledDiskTemplates(self):
      """Upgrade the cluster's enabled disk templates by inspecting the currently
@@@ -555,20 -551,6 +557,20 @@@ class Filter(ConfigObject)
                 "predicates", "action", "reason_trail"] + _UUID
  
  
 +class Maintenance(ConfigObject):
 +  """Config object representing the state of the maintenance daemon"""
 +  __slots__ = ["roundDelay", "jobs", "evacuated", "balance", "balanceThreshold",
 +               "incidents", "serial_no"] + _TIMESTAMPS
 +
 +  def UpgradeConfig(self):
 +    if self.serial_no is None:
 +      self.serial_no = 1
 +    if self.mtime is None:
 +      self.mtime = time.time()
 +    if self.ctime is None:
 +      self.ctime = time.time()
 +
 +
  class Disk(ConfigObject):
    """Config object representing a block device."""
    __slots__ = [
@@@ -1275,7 -1257,7 +1277,7 @@@ class Instance(TaggableObject)
      if _with_private:
        bo["osparams_private"] = self.osparams_private.Unprivate()
  
-     for attr in "nics", :
+     for attr in ("nics",):
        alist = bo.get(attr, None)
        if alist:
          nlist = outils.ContainerToDicts(alist)
@@@ -1513,11 -1495,6 +1515,11 @@@ class Node(TaggableObject)
      if self.powered is None:
        self.powered = True
  
 +    if self.hv_state_static is None:
 +      self.hv_state_static = {}
 +    if self.disk_state_static is None:
 +      self.disk_state_static = {}
 +
    def ToDict(self, _with_private=False):
      """Custom function for serializing.
  
@@@ -1615,11 -1592,6 +1617,11 @@@ class NodeGroup(TaggableObject)
      if self.ipolicy is None:
        self.ipolicy = MakeEmptyIPolicy()
  
 +    if self.hv_state_static is None:
 +      self.hv_state_static = {}
 +    if self.disk_state_static is None:
 +      self.disk_state_static = {}
 +
      if self.networks is None:
        self.networks = {}
  
@@@ -1705,7 -1677,6 +1707,7 @@@ class Cluster(TaggableObject)
      "compression_tools",
      "enabled_user_shutdown",
      "data_collectors",
 +    "diagnose_data_collector_filename",
      "ssh_key_type",
      "ssh_key_bits",
      ] + _TIMESTAMPS + _UUID
        - at public visibility:  {public}
        - at private visibility: {private}
        - at secret visibility:  {secret}
-       """.format(dupes=formatter(duplicate_keys),
-                  public=formatter(params_public & duplicate_keys),
+       """.format(public=formatter(params_public & duplicate_keys),
                   private=formatter(params_private & duplicate_keys),
                   secret=formatter(params_secret & duplicate_keys))
        raise errors.OpPrereqError(msg)
diff --combined lib/rpc_defs.py
@@@ -147,7 -147,7 +147,7 @@@ def _NodeInfoPreProc(node, args)
    assert len(args) == 2
    # The storage_units argument is either a dictionary with one value for each
    # node, or a fixed value to be used for all the nodes
-   if type(args[0]) is dict:
+   if isinstance(args[0], dict):
      return [args[0][node], args[1]]
    else:
      return args
@@@ -543,9 -543,7 +543,9 @@@ _NODE_CALLS = 
      ("to_public_keys", None, "Whether the node's key should be added"
       " to all nodes' public key file"),
      ("get_public_keys", None, "Whether the node should get the other nodes'"
 -     " public keys")],
 +     " public keys"),
 +    ("debug", None, "Set loglevel of ssh calls to 'debug'."),
 +    ("verbose", None, "Set loglevel of ssh calls to 'verbose'.")],
      None, None, "Distribute a new node's public SSH key on the cluster."),
    ("node_ssh_key_remove", MULTI, None, constants.RPC_TMO_FAST, [
      ("node_uuid", None, "UUID of the node whose key is removed"),
      ("clear_public_keys", None,
       "If the 'ganeti_pub_keys' file of the node should be cleared."),
      ("readd", None,
 -     "Whether this is a readd operation.")],
 +     "Whether this is a readd operation."),
 +    ("debug", None, "Set loglevel of ssh calls to 'debug'."),
 +    ("verbose", None, "Set loglevel of ssh calls to 'verbose'.")],
      None, None, "Remove a node's SSH key from the other nodes' key files."),
    ("node_ssh_keys_renew", MULTI, None, constants.RPC_TMO_4HRS, [
      ("node_uuids", None, "UUIDs of the nodes whose key is renewed"),
      ("potential_master_candidates", None, "Potential master candidates"),
      ("old_key_type", None, "The type of key previously used"),
      ("new_key_type", None, "The type of key to generate"),
 -    ("new_key_bits", None, "The length of the key to generate")],
 +    ("new_key_bits", None, "The length of the key to generate"),
 +    ("debug", None, "Set logging of SSH update tool to 'debug'."),
 +    ("verbose", None, "Set logging of SSH update tool to 'info'.")],
      None, None, "Renew all SSH key pairs of all nodes nodes."),
 +  ("node_ssh_key_remove_light", MULTI, None, constants.RPC_TMO_FAST, [
 +    ("node_name", None, "Name of the node whose key is removed")],
 +    None, None, "Remove a node's SSH key from the master's public key file."),
    ]
  
  _MISC_CALLS = [
    ("restricted_command", MULTI, None, constants.RPC_TMO_SLOW, [
      ("cmd", None, "Command name"),
      ], None, None, "Runs restricted command"),
 +  ("repair_command", SINGLE, None, constants.RPC_TMO_SLOW, [
 +    ("cmd", None, "Command name"),
 +    ("inp", None, "Input to be passed as stdin"),
 +    ], None, None, "Runs repair command"),
    ("run_oob", SINGLE, None, constants.RPC_TMO_NORMAL, [
      ("oob_program", None, None),
      ("command", None, None),
diff --combined lib/server/noded.py
  
  """Ganeti node daemon"""
  
- # pylint: disable=C0103,W0142
+ # pylint: disable=C0103
  
  # C0103: Functions in this module need to have a given name structure,
  # and the name of the daemon doesn't match
  
- # W0142: Used * or ** magic, since we do use it extensively in this
- # module
  import os
  import sys
  import logging
@@@ -203,7 -200,7 +200,7 @@@ class NodeRequestHandler(http.server.Ht
        # And return the error's arguments, which must be already in
        # correct tuple format
        result = err.args
-     except Exception, err:
+     except Exception, err: # pylint: disable=W0703
        logging.exception("Error in RPC call")
        result = (False, "Error while executing backend function: %s" % str(err))
  
  
      """
      (node_uuid, node_name, potential_master_candidates,
 -     to_authorized_keys, to_public_keys, get_public_keys) = params
 +     to_authorized_keys, to_public_keys, get_public_keys,
 +     debug, verbose) = params
      return backend.AddNodeSshKey(node_uuid, node_name,
                                   potential_master_candidates,
                                   to_authorized_keys=to_authorized_keys,
                                   to_public_keys=to_public_keys,
 -                                 get_public_keys=get_public_keys)
 +                                 get_public_keys=get_public_keys,
 +                                 ssh_update_debug=debug,
 +                                 ssh_update_verbose=verbose)
  
    @staticmethod
    def perspective_node_ssh_keys_renew(params):
      """
      (node_uuids, node_names, master_candidate_uuids,
       potential_master_candidates, old_key_type, new_key_type,
 -     new_key_bits) = params
 +     new_key_bits, debug, verbose) = params
      return backend.RenewSshKeys(node_uuids, node_names, master_candidate_uuids,
                                  potential_master_candidates, old_key_type,
 -                                new_key_type, new_key_bits)
 +                                new_key_type, new_key_bits,
 +                                ssh_update_debug=debug,
 +                                ssh_update_verbose=verbose)
  
    @staticmethod
    def perspective_node_ssh_key_remove(params):
      (node_uuid, node_name,
       master_candidate_uuids, potential_master_candidates,
       from_authorized_keys, from_public_keys, clear_authorized_keys,
 -     clear_public_keys, readd) = params
 +     clear_public_keys, readd, debug, verbose) = params
      return backend.RemoveNodeSshKey(node_uuid, node_name,
                                      master_candidate_uuids,
                                      potential_master_candidates,
                                      from_public_keys=from_public_keys,
                                      clear_authorized_keys=clear_authorized_keys,
                                      clear_public_keys=clear_public_keys,
 -                                    readd=readd)
 +                                    readd=readd,
 +                                    ssh_update_debug=debug,
 +                                    ssh_update_verbose=verbose)
 +
 +  @staticmethod
 +  def perspective_node_ssh_key_remove_light(params):
 +    """Removes a node's SSH key from the master's public key file.
 +
 +    """
 +    (node_name, ) = params
 +    return backend.RemoveSshKeyFromPublicKeyFile(node_name)
  
    # cluster --------------------------
  
      """
      (cmd, ) = params
  
 -    return backend.RunRestrictedCmd(cmd)
 +    return backend.RunConstrainedCmd(
 +      cmd,
 +      lock_file=pathutils.RESTRICTED_COMMANDS_LOCK_FILE,
 +      path=pathutils.RESTRICTED_COMMANDS_DIR)
 +
 +  @staticmethod
 +  def perspective_repair_command(params):
 +    """ Run a repair command.
 +
 +    """
 +    (cmd, inp, ) = params
 +
 +    return backend.RunConstrainedCmd(
 +      cmd,
 +      lock_file=pathutils.REPAIR_COMMANDS_LOCK_FILE,
 +      path=pathutils.REPAIR_COMMANDS_DIR,
 +      inp=inp)
  
    @staticmethod
    def perspective_write_ssconf_files(params):
      return backend.CleanupImportExport(params[0])
  
  
- def CheckNoded(_, args):
+ def CheckNoded(options, args):
    """Initial checks whether to run or exit with a failure.
  
    """
      print >> sys.stderr, ("Usage: %s [-f] [-d] [-p port] [-b ADDRESS]" %
                            sys.argv[0])
      sys.exit(constants.EXIT_FAILURE)
+   if options.max_clients < 1:
+     print >> sys.stderr, ("%s --max-clients argument must be >= 1" %
+                           sys.argv[0])
+     sys.exit(constants.EXIT_FAILURE)
    try:
      codecs.lookup("string-escape")
    except LookupError:
@@@ -1401,7 -1373,6 +1404,6 @@@ def SSLVerifyPeer(conn, cert, errnum, e
    else:
      logging.error("Invalid errdepth value: %s.", errdepth)
      return False
-   # pylint: enable=W0613
  
  
  def PrepNoded(options, _):
    handler = NodeRequestHandler()
  
    mainloop = daemon.Mainloop()
-   server = \
-     http.server.HttpServer(mainloop, options.bind_address, options.port,
-                            handler, ssl_params=ssl_params, ssl_verify_peer=True,
-                            request_executor_class=request_executor_class,
-                            ssl_verify_callback=SSLVerifyPeer)
+   server = http.server.HttpServer(
+       mainloop, options.bind_address, options.port, options.max_clients,
+       handler, ssl_params=ssl_params, ssl_verify_peer=True,
+       request_executor_class=request_executor_class,
+       ssl_verify_callback=SSLVerifyPeer)
    server.Start()
  
    return (mainloop, server)
@@@ -1468,6 -1439,10 +1470,10 @@@ def Main()
    parser.add_option("--no-mlock", dest="mlock",
                      help="Do not mlock the node memory in ram",
                      default=True, action="store_false")
+   parser.add_option("--max-clients", dest="max_clients",
+                     default=20, type="int",
+                     help="Number of simultaneous connections accepted"
+                     " by noded")
  
    daemon.GenericMain(constants.NODED, parser, CheckNoded, PrepNoded, ExecNoded,
                       default_ssl_cert=pathutils.NODED_CERT_FILE,
diff --combined lib/ssh.py
@@@ -35,7 -35,6 +35,7 @@@
  
  import logging
  import os
 +import shutil
  import tempfile
  
  from collections import namedtuple
@@@ -172,13 -171,13 +172,13 @@@ def AddAuthorizedKeys(file_obj, keys)
                             in key_field_list
                             if split_key != line_key]
        nl = line.endswith("\n")
-     else:
-       if not nl:
-         f.write("\n")
-       for (key, _) in key_field_list:
-         f.write(key.rstrip("\r\n"))
-         f.write("\n")
-       f.flush()
+     if not nl:
+       f.write("\n")
+     for (key, _) in key_field_list:
+       f.write(key.rstrip("\r\n"))
+       f.write("\n")
+     f.flush()
    finally:
      f.close()
  
@@@ -728,7 -727,7 +728,7 @@@ def InitPubKeyFile(master_uuid, key_typ
    AddPublicKey(master_uuid, key, key_file=key_file)
  
  
- class SshRunner:
+ class SshRunner(object):
    """Wrapper for SSH commands.
  
    """
@@@ -1074,8 -1073,8 +1074,8 @@@ def RunSshCmdWithStdin(cluster_name, no
                               (result.cmd, result.fail_reason))
  
  
 -def ReadRemoteSshPubKeys(pub_key_file, node, cluster_name, port, ask_key,
 -                         strict_host_check):
 +def ReadRemoteSshPubKey(pub_key_file, node, cluster_name, port, ask_key,
 +                        strict_host_check):
    """Fetches a public SSH key from a node via SSH.
  
    @type pub_key_file: string
    return result.stdout
  
  
 +def GetSshKeyFilenames(key_type, suffix=""):
 +  """Get filenames of the SSH key pair of the given type.
 +
 +  @type key_type: string
 +  @param key_type: type of SSH key, must be element of C{constants.SSHK_ALL}
 +  @type suffix: string
 +  @param suffix: optional suffix for the key filenames
 +  @rtype: tuple of (string, string)
 +  @returns: a tuple containing the name of the private key file and the
 +       public key file.
 +
 +  """
 +  if key_type not in constants.SSHK_ALL:
 +    raise errors.SshUpdateError("Unsupported key type '%s'. Supported key types"
 +                                " are: %s." % (key_type, constants.SSHK_ALL))
 +  (_, root_keyfiles) = \
 +      GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
 +  if not key_type in root_keyfiles.keys():
 +    raise errors.SshUpdateError("No keyfile for key type '%s' available."
 +                                % key_type)
 +
 +  key_filenames = root_keyfiles[key_type]
 +  if suffix:
 +    key_filenames = [_ComputeKeyFilePathWithSuffix(key_filename, suffix)
 +                     for key_filename in key_filenames]
 +
 +  return key_filenames
 +
 +
 +def GetSshPubKeyFilename(key_type, suffix=""):
 +  """Get filename of the public SSH key of the given type.
 +
 +  @type key_type: string
 +  @param key_type: type of SSH key, must be element of C{constants.SSHK_ALL}
 +  @type suffix: string
 +  @param suffix: optional suffix for the key filenames
 +  @rtype: string
 +  @returns: file name of the public key file
 +
 +  """
 +  return GetSshKeyFilenames(key_type, suffix=suffix)[1]
 +
 +
 +def _ComputeKeyFilePathWithSuffix(key_filepath, suffix):
 +  """Converts the given key filename to a key filename with a suffix.
 +
 +  @type key_filepath: string
 +  @param key_filepath: path of the key file
 +  @type suffix: string
 +  @param suffix: suffix to be appended to the basename of the file
 +
 +  """
 +  path = os.path.dirname(key_filepath)
 +  ext = os.path.splitext(os.path.basename(key_filepath))[1]
 +  basename = os.path.splitext(os.path.basename(key_filepath))[0]
 +  return os.path.join(path, basename + suffix + ext)
 +
 +
 +def ReplaceSshKeys(src_key_type, dest_key_type,
 +                   src_key_suffix="", dest_key_suffix=""):
 +  """Replaces an SSH key pair by another SSH key pair.
 +
 +  Note that both parts, the private and the public key, are replaced.
 +
 +  @type src_key_type: string
 +  @param src_key_type: key type of key pair that is replacing the other
 +      key pair
 +  @type dest_key_type: string
 +  @param dest_key_type: key type of the key pair that is being replaced
 +      by the source key pair
 +  @type src_key_suffix: string
 +  @param src_key_suffix: optional suffix of the key files of the source
 +      key pair
 +  @type dest_key_suffix: string
 +  @param dest_key_suffix: optional suffix of the key files of the
 +      destination key pair
 +
 +  """
 +  (src_priv_filename, src_pub_filename) = GetSshKeyFilenames(
 +      src_key_type, suffix=src_key_suffix)
 +  (dest_priv_filename, dest_pub_filename) = GetSshKeyFilenames(
 +      dest_key_type, suffix=dest_key_suffix)
 +
 +  if not (os.path.exists(src_priv_filename) and
 +          os.path.exists(src_pub_filename)):
 +    raise errors.SshUpdateError(
 +        "At least one of the source key files is missing: %s",
 +        ", ".join([src_priv_filename, src_pub_filename]))
 +
 +  for dest_file in [dest_priv_filename, dest_pub_filename]:
 +    if os.path.exists(dest_file):
 +      utils.CreateBackup(dest_file)
 +      utils.RemoveFile(dest_file)
 +
 +  shutil.move(src_priv_filename, dest_priv_filename)
 +  shutil.move(src_pub_filename, dest_pub_filename)
 +
 +
 +def ReadLocalSshPubKeys(key_types, suffix=""):
 +  """Reads the local root user SSH key.
 +
 +  @type key_types: list of string
 +  @param key_types: types of SSH keys. Must be subset of constants.SSHK_ALL. If
 +      'None' or [], all available keys are returned.
 +  @type suffix: string
 +  @param suffix: optional suffix to be attached to key names when reading
 +      them. Used for temporary key files.
 +  @rtype: list of string
 +  @return: list of public keys
 +
 +  """
 +  fetch_key_types = []
 +  if key_types:
 +    fetch_key_types += key_types
 +  else:
 +    fetch_key_types = constants.SSHK_ALL
 +
 +  (_, root_keyfiles) = \
 +      GetAllUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False)
 +
 +  result_keys = []
 +  for (public_key_type, (_, public_key_file)) in root_keyfiles.items():
 +
 +    if public_key_type not in fetch_key_types:
 +      continue
 +
 +    public_key_dir = os.path.dirname(public_key_file)
 +    public_key_filename = ""
 +    if suffix:
 +      public_key_filename = \
 +          os.path.splitext(os.path.basename(public_key_file))[0] \
 +          + suffix + ".pub"
 +    else:
 +      public_key_filename = public_key_file
 +    public_key_path = os.path.join(public_key_dir,
 +                                   public_key_filename)
 +
 +    if not os.path.exists(public_key_path):
 +      raise errors.SshUpdateError("Cannot find SSH public key of type '%s'."
 +                                  % public_key_type)
 +    else:
 +      key = utils.ReadFile(public_key_path)
 +      result_keys.append(key)
 +
 +  return result_keys
 +
 +
  # Update gnt-cluster.rst when changing which combinations are valid.
  KeyBitInfo = namedtuple('KeyBitInfo', ['default', 'validation_fn'])
  SSH_KEY_VALID_BITS = {
diff --combined lib/tools/common.py
  """
  
  import logging
- import OpenSSL
  import os
  import time
  from cStringIO import StringIO
  
+ import OpenSSL
  from ganeti import constants
  from ganeti import errors
  from ganeti import pathutils
@@@ -182,19 -184,6 +184,19 @@@ def VerifyClusterName(data, error_fn, c
    return name
  
  
 +def VerifyHmac(data, error_fn):
 +  """Verifies the presence of the hmac secret.
 +
 +  @type data: dict
 +
 +  """
 +  hmac = data.get(constants.NDS_HMAC)
 +  if not hmac:
 +    raise error_fn("Hmac key must be provided")
 +
 +  return hmac
 +
 +
  def LoadData(raw, data_check):
    """Parses and verifies input data.
  
@@@ -80,9 -80,7 +80,9 @@@ def Main()
    """
    opts = ParseOptions()
  
 -  utils.SetupToolLogging(opts.debug, opts.verbose)
 +  utils.SetupToolLogging(
 +      opts.debug, opts.verbose,
 +      toolname=os.path.splitext(os.path.basename(__file__))[0])
  
    try:
      # List of files to delete. Contains tuples consisting of the absolute path
@@@ -93,9 -91,8 +93,8 @@@
        (pathutils.CLUSTER_CONF_FILE, True),
        (pathutils.CLUSTER_DOMAIN_SECRET_FILE, True),
        ]
-     clean_files.extend((s, True) for s in pathutils.ALL_CERT_FILES)
-     clean_files.extend((s, False) for s in
-                            ssconf.SimpleStore().GetFileList())
+     clean_files.extend((f, True) for f in pathutils.ALL_CERT_FILES)
+     clean_files.extend((f, False) for f in ssconf.SimpleStore().GetFileList())
  
      if not opts.yes_do_it:
        cli.ToStderr("Cleaning a node is irreversible. If you really want to"
diff --combined lib/utils/process.py
@@@ -185,8 -185,7 +185,8 @@@ def RunCmd(cmd, env=None, output=None, 
    @type noclose_fds: list
    @param noclose_fds: list of additional (fd >=3) file descriptors to leave
                        open for the child process
 -  @type input_fd: C{file}-like object or numeric file descriptor
 +  @type input_fd: C{file}-like object containing an actual file descriptor
 +                  or numeric file descriptor
    @param input_fd: File descriptor for process' standard input
    @type postfork_fn: Callable receiving PID as parameter
    @param postfork_fn: Callback run after fork but before timeout
@@@ -361,15 -360,11 +361,11 @@@ def StartDaemon(cmd, env=None, cwd="/"
            # First fork
            pid = os.fork()
            if pid == 0:
-             try:
-               # Child process, won't return
-               _StartDaemonChild(errpipe_read, errpipe_write,
-                                 pidpipe_read, pidpipe_write,
-                                 cmd, cmd_env, cwd,
-                                 output, output_fd, pidfile)
-             finally:
-               # Well, maybe child process failed
-               os._exit(1) # pylint: disable=W0212
+             # Try to start child process, will either execve or exit on failure.
+             _StartDaemonChild(errpipe_read, errpipe_write,
+                               pidpipe_read, pidpipe_write,
+                               cmd, cmd_env, cwd,
+                               output, output_fd, pidfile)
          finally:
            utils_wrapper.CloseFdNoError(errpipe_write)
  
@@@ -527,8 -522,7 +523,8 @@@ def _RunCmdPipe(cmd, env, via_shell, cw
    @type noclose_fds: list
    @param noclose_fds: list of additional (fd >=3) file descriptors to leave
                        open for the child process
 -  @type input_fd: C{file}-like object or numeric file descriptor
 +  @type input_fd: C{file}-like object containing an actual file descriptor
 +                  or numeric file descriptor
    @param input_fd: File descriptor for process' standard input
    @type postfork_fn: Callable receiving PID as parameter
    @param postfork_fn: Function run after fork but before timeout
    @return: (out, err, status)
  
    """
+   # pylint: disable=R0101
    poller = select.poll()
  
    if interactive:
@@@ -936,12 -931,12 +933,12 @@@ def Daemonize(logfile)
  
    # this might fail
    pid = os.fork()
-   if (pid == 0):  # The first child.
+   if pid == 0:  # The first child.
      SetupDaemonEnv()
  
      # this might fail
      pid = os.fork() # Fork a second child.
-     if (pid == 0):  # The second child.
+     if pid == 0:  # The second child.
        utils_wrapper.CloseFdNoError(rpipe)
      else:
        # exit() or _exit()?  See below.
@@@ -1099,7 -1094,7 +1096,7 @@@ def CloseFDs(noclose_fds=None)
      MAXFD = 1024
  
    maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
-   if (maxfd == resource.RLIM_INFINITY):
+   if maxfd == resource.RLIM_INFINITY:
      maxfd = MAXFD
  
    # Iterate through and close all file descriptors (except the standard ones)
diff --combined lib/utils/retry.py
@@@ -170,7 -170,6 +170,6 @@@ def Retry(fn, delay, timeout, args=None
    while True:
      retry_args = []
      try:
-       # pylint: disable=W0142
        return fn(*args)
      except RetryAgain, err:
        retry_args = err.args
      remaining_time = end_time - _time_fn()
  
      if remaining_time <= 0.0:
-       # pylint: disable=W0142
        raise RetryTimeout(*retry_args)
  
      assert remaining_time > 0.0
@@@ -218,7 -216,6 +216,6 @@@ def SimpleRetry(expected, fn, delay, ti
    rdict = {}
  
    def helper(*innerargs):
-     # pylint: disable=W0142
      result = rdict["result"] = fn(*innerargs)
      if not ((callable(expected) and expected(result)) or result == expected):
        raise RetryAgain()
@@@ -253,8 -250,7 +250,8 @@@ def CountRetry(expected, fn, count, arg
                       wait_fn=inc_tries, _time_fn=get_tries)
  
  
 -def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, **kwargs):
 +def RetryByNumberOfTimes(max_retries, backoff, exception_class, fn, *args,
 +                         **kwargs):
    """Retries calling a function up to the specified number of times.
  
    @type max_retries: integer
    @type fn: callable
    @param fn: Function to be called (up to the specified maximum number of
              retries).
 +  @type backoff: int
 +  @param backoff: this enables and configures the back off behavior after
 +     failed tries. If value is '0', there will be no delay between failed
 +     tries. If the value is a positive integer, it is interpreted as the
 +     base length of the back off delay (in seconds). That means there will be a
 +     delay between failed tries of the length specified in this parameter. With
 +     each next retry, the delay is increased by the factor of two. For example,
 +     if the value is '2', the first delay is 2 seconds, the second 4 seconds,
 +     the third 8 seconds, until the max_retries are hit or the function call
 +     succeeds.
  
    """
 +  if backoff < 0:
 +    raise exception_class("Backoff must be a non-negative integer.")
 +
    last_exception = None
 +  delay = backoff
    for i in range(max_retries):
      try:
        fn(*args, **kwargs)
      except errors.OpExecError as e:
        logging.error("Error after retry no. %s: %s.", i, e)
        last_exception = e
 +      time.sleep(delay)
 +      delay *= 2
    else:
      if last_exception:
        raise exception_class("Error after %s retries. Last exception: %s."
diff --combined lib/watcher/__init__.py
@@@ -345,36 -345,12 +345,36 @@@ def _CheckForOfflineNodes(nodes, instan
    return compat.any(nodes[node_name].offline for node_name in instance.snodes)
  
  
 -def _VerifyDisks(cl, uuid, nodes, instances):
 +def _GetPendingVerifyDisks(cl, uuid):
 +  """Checks if there are any currently running or pending group verify jobs and
 +  if so, returns their id.
 +
 +  """
 +  qfilter = qlang.MakeSimpleFilter("status",
 +                                    frozenset([constants.JOB_STATUS_RUNNING,
 +                                               constants.JOB_STATUS_QUEUED,
 +                                               constants.JOB_STATUS_WAITING]))
 +  qresult = cl.Query(constants.QR_JOB, ["id", "summary"], qfilter)
 +
 +  ids = [jobid for ((_, jobid), (_, (job, ))) in qresult.data
 +         if job == ("GROUP_VERIFY_DISKS(%s)" % uuid)]
 +  return ids
 +
 +
 +def _VerifyDisks(cl, uuid, nodes, instances, is_strict):
    """Run a per-group "gnt-cluster verify-disks".
  
    """
 +
 +  existing_jobs = _GetPendingVerifyDisks(cl, uuid)
 +  if existing_jobs:
 +    logging.info("There are verify disks jobs already pending (%s), skipping "
 +                 "VerifyDisks step for %s.",
 +                 utils.CommaJoin(existing_jobs), uuid)
 +    return
 +
    op = opcodes.OpGroupVerifyDisks(
 -    group_name=uuid, priority=constants.OP_PRIO_LOW)
 +    group_name=uuid, priority=constants.OP_PRIO_LOW, is_strict=is_strict)
    op.reason = [(constants.OPCODE_REASON_SRC_WATCHER,
                  "Verifying disks of group %s" % uuid,
                  utils.EpochNano())]
@@@ -501,9 -477,6 +501,9 @@@ def ParseOptions()
                      help="Don't wait for child processes")
    parser.add_option("--no-verify-disks", dest="no_verify_disks", default=False,
                      action="store_true", help="Do not verify disk status")
 +  parser.add_option("--no-strict", dest="no_strict",
 +                    default=False, action="store_true",
 +                    help="Do not run group verify in strict mode")
    parser.add_option("--rapi-ip", dest="rapi_ip",
                      default=constants.IP4_ADDRESS_LOCALHOST,
                      help="Use this IP to talk to RAPI.")
@@@ -731,7 -704,6 +731,7 @@@ def _GlobalWatcher(opts)
    # we are on master now
    utils.EnsureDaemon(constants.RAPI)
    utils.EnsureDaemon(constants.WCONFD)
 +  utils.EnsureDaemon(constants.MAINTD)
  
    # If RAPI isn't responding to queries, try one restart
    logging.debug("Attempting to talk to remote API on %s",
@@@ -871,7 -843,7 +871,7 @@@ def _GroupWatcher(opts)
  
    logging.debug("Using state file %s", state_path)
  
 -  # Global watcher
 +  # Group watcher file lock
    statefile = state.OpenStateFile(state_path) # pylint: disable=E0602
    if not statefile:
      return constants.EXIT_FAILURE
  
      started = _CheckInstances(client, notepad, instances, locks)
      _CheckDisks(client, notepad, nodes, instances, started)
 -
 -    # Check if the nodegroup only has ext storage type
 -    only_ext = compat.all(i.disk_template == constants.DT_EXT
 -                          for i in instances.values())
 -
 -    # We skip current NodeGroup verification if there are only external storage
 -    # devices. Currently we provide an interface for external storage provider
 -    # for disk verification implementations, however current ExtStorageDevice
 -    # does not provide an API for this yet.
 -    #
 -    # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
 -    # is implemented.
 -    if not opts.no_verify_disks and not only_ext:
 -      _VerifyDisks(client, group_uuid, nodes, instances)
    except Exception, err:
      logging.info("Not updating status file due to failure: %s", err)
      raise
    else:
      # Save changes for next run
      notepad.Save(state_path)
 +    notepad.Close()
 +
 +  # Check if the nodegroup only has ext storage type
 +  only_ext = compat.all(i.disk_template == constants.DT_EXT
 +                        for i in instances.values())
 +
 +  # We skip current NodeGroup verification if there are only external storage
 +  # devices. Currently we provide an interface for external storage provider
 +  # for disk verification implementations, however current ExtStorageDevice
 +  # does not provide an API for this yet.
 +  #
 +  # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
 +  # is implemented.
 +  if not opts.no_verify_disks and not only_ext:
 +    is_strict = not opts.no_strict
 +    _VerifyDisks(client, group_uuid, nodes, instances, is_strict=is_strict)
  
    return constants.EXIT_SUCCESS
  
@@@ -964,7 -934,7 +964,7 @@@ def Main()
      logging.error("Job queue is full, can't query cluster state")
    except errors.JobQueueDrainError:
      logging.error("Job queue is drained, can't maintain cluster state")
-   except Exception, err:
+   except Exception, err: # pylint: disable=W0703
      logging.exception(str(err))
      return constants.EXIT_FAILURE
  
diff --combined src/Ganeti/THH/PyRPC.hs
@@@ -40,11 -40,9 +40,11 @@@ module Ganeti.THH.PyRP
    , genPyUDSRpcStubStr
    ) where
  
 -import Control.Monad
 +import Prelude ()
 +import Ganeti.Prelude
 +
 +import Control.Monad (liftM, zipWithM)
  import Data.Char (toLower, toUpper)
 -import Data.Functor
  import Data.Maybe (fromMaybe)
  import Language.Haskell.TH
  import Language.Haskell.TH.Syntax (liftString)
@@@ -183,7 -181,7 +183,7 @@@ genPyUDSRpcStub className constName = l
                                        namesToClass className stubCode
    where
      header = text "# This file is automatically generated, do not edit!" $+$
-              text "# pylint: disable-all"
+              text "# pylint: skip-file"
      stubCode =
        abstrMethod genericInvokeName [ text "method", text "*args"] $+$
        method socketPathName [] (
@@@ -43,7 -43,7 +43,7 @@@ module Ganeti.Utils.Atomi
  import qualified Control.Exception.Lifted as L
  import Control.Monad
  import Control.Monad.Base (MonadBase(..))
 -import Control.Monad.Error
 +import Control.Monad.Error.Class (MonadError)
  import Control.Monad.Trans.Control
  import System.FilePath.Posix (takeDirectory, takeBaseName)
  import System.IO
@@@ -80,8 -80,14 +80,14 @@@ fsyncFileChecked path 
  atomicUpdateFile :: (MonadBaseControl IO m)
                   => FilePath -> (FilePath -> Handle -> m a) -> m a
  atomicUpdateFile path action = do
+   -- Put a separator on the filename pattern to produce temporary filenames
+   -- such as job-1234-NNNNNN.tmp instead of job-1234NNNNNN. The latter can cause
+   -- problems (as well as user confusion) because temporary filenames have the
+   -- same format as real filenames, and anything that scans a directory won't be
+   -- able to tell them apart.
+   let filenameTemplate = takeBaseName path ++ "-.tmp"
    (tmppath, tmphandle) <- liftBase $ openBinaryTempFile (takeDirectory path)
-                                                         (takeBaseName path)
+                                                         filenameTemplate
    r <- L.finally (action tmppath tmphandle)
                   (liftBase (hClose tmphandle >> fsyncFileChecked tmppath))
    -- if all went well, rename the file
  -- | Opens a file in a R/W mode, locks it (blocking if needed) and runs
  -- a given action while the file is locked. Releases the lock and
  -- closes the file afterwards.
 -withLockedFile :: (MonadError e m, Error e, MonadBaseControl IO m)
 +withLockedFile :: (MonadError e m, FromString e, MonadBaseControl IO m)
                 => FilePath -> (Fd -> m a) -> m a
  withLockedFile path =
      L.bracket (openAndLock path) (liftBase . closeFd)
    where
 -    openAndLock :: (MonadError e m, Error e, MonadBaseControl IO m)
 +    openAndLock :: (MonadError e m, FromString e, MonadBaseControl IO m)
                  => FilePath -> m Fd
      openAndLock p = liftBase $ do
        fd <- openFd p ReadWrite Nothing defaultFileFlags
@@@ -64,7 -64,6 +64,7 @@@ def _UpdateIvNames(base_idx, disks)
  
  
  # pylint: disable=R0904
 +# pylint: disable=W0102
  class ConfigMock(config.ConfigWriter):
    """A mocked cluster configuration with added methods for easy customization.
  
                        ndparams=None,
                        diskparams=None,
                        ipolicy=None,
 -                      hv_state_static=None,
 +                      hv_state_static={},
                        disk_state_static=None,
                        alloc_policy=None,
                        networks=None):
                   ndparams=None,
                   powered=True,
                   hv_state=None,
 -                 hv_state_static=None,
 +                 hv_state_static={},
                   disk_state=None,
                   disk_state_static=None):
      """Add a new L{objects.Node} to the cluster configuration
      return net
  
    def AddOrphanDisk(self, **params):
-     disk = self.CreateDisk(**params)  # pylint: disable=W0142
+     disk = self.CreateDisk(**params)
      self._UnlockedAddDisk(disk)
  
    def ConnectNetworkToGroup(self, net, group, netparams=None):
diff --combined tools/cluster-merge
@@@ -448,10 -448,7 +448,7 @@@ class Merger(object)
        check_params_strict.append("shared_file_storage_dir")
      check_params.extend(check_params_strict)
  
-     if self.params == _PARAMS_STRICT:
-       params_strict = True
-     else:
-       params_strict = False
+     params_strict = (self.params == _PARAMS_STRICT)
  
      for param_name in check_params:
        my_param = getattr(my_cluster, param_name)
@@@ -807,9 -804,7 +804,9 @@@ def main()
  
    (options, args) = parser.parse_args()
  
 -  utils.SetupToolLogging(options.debug, options.verbose)
 +  utils.SetupToolLogging(
 +      options.debug, options.verbose,
 +      toolname=os.path.splitext(os.path.basename(__file__))[0])
  
    if not args:
      parser.error("No clusters specified")
diff --combined tools/move-instance
@@@ -51,8 -51,6 +51,6 @@@ from ganeti import compa
  from ganeti import rapi
  from ganeti import errors
  
- import ganeti.rapi.client # pylint: disable=W0611
- import ganeti.rapi.client_utils
  from ganeti.rapi.client import UsesRapiClient
  
  
@@@ -416,7 -414,7 +414,7 @@@ class MoveRuntime(object)
        errmsg = None
      except Abort:
        errmsg = "Aborted"
-     except Exception, err:
+     except Exception, err:  # pylint: disable=W0703
        logging.exception("Caught unhandled exception")
        errmsg = str(err)
  
@@@ -945,9 -943,9 +943,9 @@@ def _CheckInstanceOptions(parser, optio
        options.nics = cli.ParseNicOption(options.nics)
    else:
      # Moving more than one instance
-     if (options.dest_instance_name or options.dest_primary_node or
-         options.dest_secondary_node or options.hvparams or
-         options.beparams or options.osparams or options.nics):
+     if compat.any(options.dest_instance_name, options.dest_primary_node,
+                   options.dest_secondary_node, options.hvparams,
+                   options.beparams, options.osparams, options.nics):
        parser.error("The options --dest-instance-name, --dest-primary-node,"
                     " --dest-secondary-node, --hypervisor-parameters,"
                     " --backend-parameters, --os-parameters and --net can"
@@@ -1033,10 -1031,7 +1031,10 @@@ def main()
    """
    (parser, options, args) = ParseOptions()
  
 -  utils.SetupToolLogging(options.debug, options.verbose, threadname=True)
 +  utils.SetupToolLogging(
 +      options.debug, options.verbose, threadname=True,
 +      toolname=os.path.splitext(os.path.basename(__file__))[0],
 +      logfile=None)
  
    (src_cluster_name, dest_cluster_name, instance_names) = \
      CheckOptions(parser, options, args)