4 # Copyright (C) 2006, 2007, 2010, 2011, 2012, 2013, 2014 Google Inc.
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are
11 # 1. Redistributions of source code must retain the above copyright notice,
12 # this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 """Cluster related commands"""
32 # pylint: disable=W0401,W0613,W0614,C0103
33 # W0401: Wildcard import ganeti.cli
34 # W0613: Unused argument, since all functions follow the same API
35 # W0614: Unused import %s from wildcard import (since we need cli)
36 # C0103: Invalid name gnt-cluster
38 from cStringIO
import StringIO
45 from ganeti
.cli
import *
46 from ganeti
import bootstrap
47 from ganeti
import compat
48 from ganeti
import constants
49 from ganeti
import config
50 from ganeti
import errors
51 from ganeti
import netutils
52 from ganeti
import objects
53 from ganeti
import opcodes
54 from ganeti
import pathutils
55 from ganeti
import qlang
56 from ganeti
import serializer
57 from ganeti
import ssconf
58 from ganeti
import ssh
59 from ganeti
import uidpool
60 from ganeti
import utils
61 from ganeti
.client
import base
# Command line options shared by the cluster sub-commands defined below.
ON_OPT = cli_option("--on", default=False,
                    action="store_true", dest="on",
                    help="Recover from an EPO")

GROUPS_OPT = cli_option("--groups", default=False,
                        action="store_true", dest="groups",
                        help="Arguments are node groups instead of nodes")

FORCE_FAILOVER = cli_option("--yes-do-it", dest="yes_do_it",
                            help="Override interactive check for --no-voting",
                            default=False, action="store_true")

# NOTE(review): the tail of this help string was lost in extraction and has
# been restored from context -- confirm exact wording against upstream.
FORCE_DISTRIBUTION = cli_option("--yes-do-it", dest="yes_do_it",
                                help="Unconditionally distribute the"
                                " configuration, even if the queue"
                                " is drained",
                                default=False, action="store_true")

TO_OPT = cli_option("--to", default=None, type="string",
                    help="The Ganeti version to upgrade to")

RESUME_OPT = cli_option("--resume", default=False, action="store_true",
                        help="Resume any pending Ganeti upgrades")

DATA_COLLECTOR_INTERVAL_OPT = cli_option(
    "--data-collector-interval", default={}, type="keyval",
    help="Set collection intervals in seconds of data collectors.")

# Timing constants for the emergency power-off (EPO) helpers.
_EPO_PING_INTERVAL = 30  # 30 seconds between pings
_EPO_PING_TIMEOUT = 1  # 1 second
_EPO_REACHABLE_TIMEOUT = 15 * 60  # 15 minutes
97 def _InitEnabledDiskTemplates(opts
):
98 """Initialize the list of enabled disk templates.
101 if opts
.enabled_disk_templates
:
102 return opts
.enabled_disk_templates
.split(",")
104 return constants
.DEFAULT_ENABLED_DISK_TEMPLATES
def _InitVgName(opts, enabled_disk_templates):
  """Initialize the volume group name.

  @param opts: the command line options selected by the user
  @type enabled_disk_templates: list of strings
  @param enabled_disk_templates: cluster-wide enabled disk templates
  @return: the volume group name to use, or None when lvm is not in use
  @raise errors.OpPrereqError: if the vg name is explicitly unset (empty
      --vg-name) while an lvm-based disk template is enabled

  """
  # BUG FIX: the original flat if/elif chain tested
  # utils.IsLvmEnabled(enabled_disk_templates) in two consecutive elif
  # branches, making the second one unreachable.  The error branch only
  # applies when --vg-name was given as an empty string, so it must be
  # nested under the "--vg-name was given" case.
  vg_name = None
  if opts.vg_name is not None:
    vg_name = opts.vg_name
    if vg_name:
      if not utils.IsLvmEnabled(enabled_disk_templates):
        # Informational only: the vg is simply unused until an lvm-based
        # template is enabled.
        ToStdout("You specified a volume group with --vg-name, but you did not"
                 " enable any disk template that uses lvm.")
    elif utils.IsLvmEnabled(enabled_disk_templates):
      raise errors.OpPrereqError(
          "LVM disk templates are enabled, but vg name not set.")
  elif utils.IsLvmEnabled(enabled_disk_templates):
    # No --vg-name given: fall back to the default volume group.
    vg_name = constants.DEFAULT_VG
  return vg_name
def _InitDrbdHelper(opts, enabled_disk_templates):
  """Initialize the DRBD usermode helper.

  @param opts: the command line options selected by the user
  @type enabled_disk_templates: list of strings
  @param enabled_disk_templates: cluster-wide enabled disk templates
  @return: the DRBD usermode helper to use (possibly None when DRBD is
      disabled)
  @raise errors.OpPrereqError: if DRBD is enabled but the helper is
      explicitly unset (empty string)

  """
  drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates

  if not drbd_enabled and opts.drbd_helper is not None:
    # NOTE(review): message tail restored from context -- confirm wording.
    ToStdout("Note: You specified a DRBD usermode helper, while DRBD storage"
             " is not enabled.")

  if drbd_enabled:
    if opts.drbd_helper is None:
      return constants.DEFAULT_DRBD_HELPER
    if opts.drbd_helper == '':
      raise errors.OpPrereqError(
          "Unsetting the drbd usermode helper while enabling DRBD is not"
          " allowed.")

  return opts.drbd_helper
151 def InitCluster(opts
, args
):
152 """Initialize the cluster.
154 @param opts: the command line options selected by the user
156 @param args: should contain only one element, the desired
159 @return: the desired exit code
162 enabled_disk_templates
= _InitEnabledDiskTemplates(opts
)
165 vg_name
= _InitVgName(opts
, enabled_disk_templates
)
166 drbd_helper
= _InitDrbdHelper(opts
, enabled_disk_templates
)
167 except errors
.OpPrereqError
, e
:
171 master_netdev
= opts
.master_netdev
172 if master_netdev
is None:
173 nic_mode
= opts
.nicparams
.get(constants
.NIC_MODE
, None)
175 # default case, use bridging
176 master_netdev
= constants
.DEFAULT_BRIDGE
177 elif nic_mode
== constants
.NIC_MODE_OVS
:
178 # default ovs is different from default bridge
179 master_netdev
= constants
.DEFAULT_OVS
180 opts
.nicparams
[constants
.NIC_LINK
] = constants
.DEFAULT_OVS
182 hvlist
= opts
.enabled_hypervisors
184 hvlist
= constants
.DEFAULT_ENABLED_HYPERVISOR
185 hvlist
= hvlist
.split(",")
187 hvparams
= dict(opts
.hvparams
)
188 beparams
= opts
.beparams
189 nicparams
= opts
.nicparams
191 diskparams
= dict(opts
.diskparams
)
193 # check the disk template types here, as we cannot rely on the type check done
194 # by the opcode parameter types
195 diskparams_keys
= set(diskparams
.keys())
196 if not (diskparams_keys
<= constants
.DISK_TEMPLATES
):
197 unknown
= utils
.NiceSort(diskparams_keys
- constants
.DISK_TEMPLATES
)
198 ToStderr("Disk templates unknown: %s" % utils
.CommaJoin(unknown
))
201 # prepare beparams dict
202 beparams
= objects
.FillDict(constants
.BEC_DEFAULTS
, beparams
)
203 utils
.ForceDictType(beparams
, constants
.BES_PARAMETER_COMPAT
)
205 # prepare nicparams dict
206 nicparams
= objects
.FillDict(constants
.NICC_DEFAULTS
, nicparams
)
207 utils
.ForceDictType(nicparams
, constants
.NICS_PARAMETER_TYPES
)
209 # prepare ndparams dict
210 if opts
.ndparams
is None:
211 ndparams
= dict(constants
.NDC_DEFAULTS
)
213 ndparams
= objects
.FillDict(constants
.NDC_DEFAULTS
, opts
.ndparams
)
214 utils
.ForceDictType(ndparams
, constants
.NDS_PARAMETER_TYPES
)
216 # prepare hvparams dict
217 for hv
in constants
.HYPER_TYPES
:
218 if hv
not in hvparams
:
220 hvparams
[hv
] = objects
.FillDict(constants
.HVC_DEFAULTS
[hv
], hvparams
[hv
])
221 utils
.ForceDictType(hvparams
[hv
], constants
.HVS_PARAMETER_TYPES
)
223 # prepare diskparams dict
224 for templ
in constants
.DISK_TEMPLATES
:
225 if templ
not in diskparams
:
226 diskparams
[templ
] = {}
227 diskparams
[templ
] = objects
.FillDict(constants
.DISK_DT_DEFAULTS
[templ
],
229 utils
.ForceDictType(diskparams
[templ
], constants
.DISK_DT_TYPES
)
231 # prepare ipolicy dict
232 ipolicy
= CreateIPolicyFromOpts(
233 ispecs_mem_size
=opts
.ispecs_mem_size
,
234 ispecs_cpu_count
=opts
.ispecs_cpu_count
,
235 ispecs_disk_count
=opts
.ispecs_disk_count
,
236 ispecs_disk_size
=opts
.ispecs_disk_size
,
237 ispecs_nic_count
=opts
.ispecs_nic_count
,
238 minmax_ispecs
=opts
.ipolicy_bounds_specs
,
239 std_ispecs
=opts
.ipolicy_std_specs
,
240 ipolicy_disk_templates
=opts
.ipolicy_disk_templates
,
241 ipolicy_vcpu_ratio
=opts
.ipolicy_vcpu_ratio
,
242 ipolicy_spindle_ratio
=opts
.ipolicy_spindle_ratio
,
245 if opts
.candidate_pool_size
is None:
246 opts
.candidate_pool_size
= constants
.MASTER_POOL_SIZE_DEFAULT
248 if opts
.mac_prefix
is None:
249 opts
.mac_prefix
= constants
.DEFAULT_MAC_PREFIX
251 uid_pool
= opts
.uid_pool
252 if uid_pool
is not None:
253 uid_pool
= uidpool
.ParseUidPool(uid_pool
)
255 if opts
.prealloc_wipe_disks
is None:
256 opts
.prealloc_wipe_disks
= False
258 external_ip_setup_script
= opts
.use_external_mip_script
259 if external_ip_setup_script
is None:
260 external_ip_setup_script
= False
263 primary_ip_version
= int(opts
.primary_ip_version
)
264 except (ValueError, TypeError), err
:
265 ToStderr("Invalid primary ip version value: %s" % str(err
))
268 master_netmask
= opts
.master_netmask
270 if master_netmask
is not None:
271 master_netmask
= int(master_netmask
)
272 except (ValueError, TypeError), err
:
273 ToStderr("Invalid master netmask value: %s" % str(err
))
277 disk_state
= utils
.FlatToDict(opts
.disk_state
)
281 hv_state
= dict(opts
.hv_state
)
283 if opts
.install_image
:
284 install_image
= opts
.install_image
288 if opts
.zeroing_image
:
289 zeroing_image
= opts
.zeroing_image
293 compression_tools
= _GetCompressionTools(opts
)
295 default_ialloc_params
= opts
.default_iallocator_params
297 if opts
.enabled_user_shutdown
:
298 enabled_user_shutdown
= True
300 enabled_user_shutdown
= False
302 bootstrap
.InitCluster(cluster_name
=args
[0],
303 secondary_ip
=opts
.secondary_ip
,
305 mac_prefix
=opts
.mac_prefix
,
306 master_netmask
=master_netmask
,
307 master_netdev
=master_netdev
,
308 file_storage_dir
=opts
.file_storage_dir
,
309 shared_file_storage_dir
=opts
.shared_file_storage_dir
,
310 gluster_storage_dir
=opts
.gluster_storage_dir
,
311 enabled_hypervisors
=hvlist
,
316 diskparams
=diskparams
,
318 candidate_pool_size
=opts
.candidate_pool_size
,
319 modify_etc_hosts
=opts
.modify_etc_hosts
,
320 modify_ssh_setup
=opts
.modify_ssh_setup
,
321 maintain_node_health
=opts
.maintain_node_health
,
322 drbd_helper
=drbd_helper
,
324 default_iallocator
=opts
.default_iallocator
,
325 default_iallocator_params
=default_ialloc_params
,
326 primary_ip_version
=primary_ip_version
,
327 prealloc_wipe_disks
=opts
.prealloc_wipe_disks
,
328 use_external_mip_script
=external_ip_setup_script
,
330 disk_state
=disk_state
,
331 enabled_disk_templates
=enabled_disk_templates
,
332 install_image
=install_image
,
333 zeroing_image
=zeroing_image
,
334 compression_tools
=compression_tools
,
335 enabled_user_shutdown
=enabled_user_shutdown
,
337 op
= opcodes
.OpClusterPostInit()
338 SubmitOpCode(op
, opts
=opts
)
def DestroyCluster(opts, args):
  """Destroy the cluster.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  # Destroying a cluster is irreversible, so require the explicit
  # --yes-do-it safety flag instead of an interactive prompt.
  if not opts.yes_do_it:
    ToStderr("Destroying a cluster is irreversible. If you really want"
             " destroy this cluster, supply the --yes-do-it option.")
    return 1

  op = opcodes.OpClusterDestroy()
  master_uuid = SubmitOpCode(op, opts=opts)
  # if we reached this, the opcode didn't fail; we can proceed to
  # shutdown all the daemons
  bootstrap.FinalizeClusterDestroy(master_uuid)
  return 0
def RenameCluster(opts, args):
  """Rename the cluster.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the new cluster name
  @rtype: int
  @return: the desired exit code

  """
  cl = GetClient()

  (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])

  new_name = args[0]
  # NOTE(review): the --force bypass below was restored from context --
  # confirm the option name against the command definition.
  if not opts.force:
    usertext = ("This will rename the cluster from '%s' to '%s'. If you are"
                " connected over the network to the cluster name, the"
                " operation is very dangerous as the IP address will be"
                " removed from the node and the change may not go through."
                " Continue?") % (cluster_name, new_name)
    if not AskUser(usertext):
      return 1

  op = opcodes.OpClusterRename(name=new_name)
  result = SubmitOpCode(op, opts=opts, cl=cl)

  if result:
    ToStdout("Cluster renamed from '%s' to '%s'", cluster_name, result)

  return 0
def ActivateMasterIp(opts, args):
  """Activates the master IP.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  op = opcodes.OpClusterActivateMasterIp()
  SubmitOpCode(op, opts=opts)
  return 0
def DeactivateMasterIp(opts, args):
  """Deactivates the master IP.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  # Disabling the master IP drops every connection that uses it, so always
  # ask unless confirmation was given on the command line.
  # NOTE(review): the opts.confirm guard and the trailing " Continue?" were
  # restored from context -- confirm against the command definition.
  if not opts.confirm:
    usertext = ("This will disable the master IP. All the open connections to"
                " the master IP will be closed. To reach the master you will"
                " need to use its node IP."
                " Continue?")
    if not AskUser(usertext):
      return 1

  op = opcodes.OpClusterDeactivateMasterIp()
  SubmitOpCode(op, opts=opts)
  return 0
def RedistributeConfig(opts, args):
  """Forces push of the cluster configuration.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: empty list
  @rtype: int
  @return: the desired exit code

  """
  op = opcodes.OpClusterRedistConf()
  if opts.yes_do_it:
    # --yes-do-it distributes the configuration even while the job queue
    # is drained.
    SubmitOpCodeToDrainedQueue(op)
  else:
    SubmitOrSend(op, opts)
  return 0
def ShowClusterVersion(opts, args):
  """Write version of ganeti software to the standard output.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  cl = GetClient()
  result = cl.QueryClusterInfo()
  ToStdout("Software version: %s", result["software_version"])
  ToStdout("Internode protocol: %s", result["protocol_version"])
  ToStdout("Configuration format: %s", result["config_version"])
  ToStdout("OS api version: %s", result["os_api_version"])
  ToStdout("Export interface: %s", result["export_version"])
  ToStdout("VCS version: %s", result["vcs_version"])
  return 0
def ShowClusterMaster(opts, args):
  """Write name of master node to the standard output.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  master = bootstrap.GetMaster()
  ToStdout(master)
  return 0
479 def _FormatGroupedParams(paramsdict
, roman
=False):
480 """Format Grouped parameters (be, nic, disk) by group.
482 @type paramsdict: dict of dicts
483 @param paramsdict: {group: {param: value, ...}, ...}
484 @rtype: dict of dicts
485 @return: copy of the input dictionaries with strings as values
489 for (item
, val
) in paramsdict
.items():
490 if isinstance(val
, dict):
491 ret
[item
] = _FormatGroupedParams(val
, roman
=roman
)
492 elif roman
and isinstance(val
, int):
493 ret
[item
] = compat
.TryToRoman(val
)
def _FormatDataCollectors(paramsdict):
  """Format data collector parameters, grouped per collector.

  @type paramsdict: dict of dicts
  @param paramsdict: response of QueryClusterInfo
  @rtype: dict of dicts
  @return: parameter grouped by data collector

  """
  enabled = paramsdict[constants.DATA_COLLECTORS_ENABLED_NAME]
  interval = paramsdict[constants.DATA_COLLECTORS_INTERVAL_NAME]

  ret = {}
  for key in enabled:
    # Intervals are stored in microseconds; display them in seconds.
    ret[key] = dict(active=enabled[key],
                    interval="%.3fs" % (interval[key] / 1e6))
  return ret
519 def ShowClusterConfig(opts
, args
):
520 """Shows cluster information.
522 @param opts: the command line options selected by the user
524 @param args: should be an empty list
526 @return: the desired exit code
530 result
= cl
.QueryClusterInfo()
533 tags
= utils
.CommaJoin(utils
.NiceSort(result
["tags"]))
536 if result
["reserved_lvs"]:
537 reserved_lvs
= utils
.CommaJoin(result
["reserved_lvs"])
539 reserved_lvs
= "(none)"
541 enabled_hv
= result
["enabled_hypervisors"]
542 hvparams
= dict((k
, v
) for k
, v
in result
["hvparams"].iteritems()
546 ("Cluster name", result
["name"]),
547 ("Cluster UUID", result
["uuid"]),
549 ("Creation time", utils
.FormatTime(result
["ctime"])),
550 ("Modification time", utils
.FormatTime(result
["mtime"])),
552 ("Master node", result
["master"]),
554 ("Architecture (this node)",
555 "%s (%s)" % (result
["architecture"][0], result
["architecture"][1])),
559 ("Default hypervisor", result
["default_hypervisor"]),
560 ("Enabled hypervisors", utils
.CommaJoin(enabled_hv
)),
562 ("Hypervisor parameters", _FormatGroupedParams(hvparams
,
563 opts
.roman_integers
)),
565 ("OS-specific hypervisor parameters",
566 _FormatGroupedParams(result
["os_hvp"], opts
.roman_integers
)),
568 ("OS parameters", _FormatGroupedParams(result
["osparams"],
569 opts
.roman_integers
)),
571 ("Hidden OSes", utils
.CommaJoin(result
["hidden_os"])),
572 ("Blacklisted OSes", utils
.CommaJoin(result
["blacklisted_os"])),
574 ("Cluster parameters", [
575 ("candidate pool size",
576 compat
.TryToRoman(result
["candidate_pool_size"],
577 convert
=opts
.roman_integers
)),
578 ("maximal number of jobs running simultaneously",
579 compat
.TryToRoman(result
["max_running_jobs"],
580 convert
=opts
.roman_integers
)),
581 ("maximal number of jobs simultaneously tracked by the scheduler",
582 compat
.TryToRoman(result
["max_tracked_jobs"],
583 convert
=opts
.roman_integers
)),
584 ("mac prefix", result
["mac_prefix"]),
585 ("master netdev", result
["master_netdev"]),
586 ("master netmask", compat
.TryToRoman(result
["master_netmask"],
587 opts
.roman_integers
)),
588 ("use external master IP address setup script",
589 result
["use_external_mip_script"]),
590 ("lvm volume group", result
["volume_group_name"]),
591 ("lvm reserved volumes", reserved_lvs
),
592 ("drbd usermode helper", result
["drbd_usermode_helper"]),
593 ("file storage path", result
["file_storage_dir"]),
594 ("shared file storage path", result
["shared_file_storage_dir"]),
595 ("gluster storage path", result
["gluster_storage_dir"]),
596 ("maintenance of node health", result
["maintain_node_health"]),
597 ("uid pool", uidpool
.FormatUidPool(result
["uid_pool"])),
598 ("default instance allocator", result
["default_iallocator"]),
599 ("default instance allocator parameters",
600 result
["default_iallocator_params"]),
601 ("primary ip version", compat
.TryToRoman(result
["primary_ip_version"],
602 opts
.roman_integers
)),
603 ("preallocation wipe disks", result
["prealloc_wipe_disks"]),
604 ("OS search path", utils
.CommaJoin(pathutils
.OS_SEARCH_PATH
)),
605 ("ExtStorage Providers search path",
606 utils
.CommaJoin(pathutils
.ES_SEARCH_PATH
)),
607 ("enabled disk templates",
608 utils
.CommaJoin(result
["enabled_disk_templates"])),
609 ("install image", result
["install_image"]),
610 ("instance communication network",
611 result
["instance_communication_network"]),
612 ("zeroing image", result
["zeroing_image"]),
613 ("compression tools", result
["compression_tools"]),
614 ("enabled user shutdown", result
["enabled_user_shutdown"]),
617 ("Default node parameters",
618 _FormatGroupedParams(result
["ndparams"], roman
=opts
.roman_integers
)),
620 ("Default instance parameters",
621 _FormatGroupedParams(result
["beparams"], roman
=opts
.roman_integers
)),
623 ("Default nic parameters",
624 _FormatGroupedParams(result
["nicparams"], roman
=opts
.roman_integers
)),
626 ("Default disk parameters",
627 _FormatGroupedParams(result
["diskparams"], roman
=opts
.roman_integers
)),
629 ("Instance policy - limits for instances",
630 FormatPolicyInfo(result
["ipolicy"], None, True, opts
.roman_integers
)),
631 ("Data collectors", _FormatDataCollectors(result
)),
634 PrintGenericInfo(info
)
638 def ClusterCopyFile(opts
, args
):
639 """Copy a file from master to some nodes.
641 @param opts: the command line options selected by the user
643 @param args: should contain only one element, the path of
644 the file to be copied
646 @return: the desired exit code
650 filename
= os
.path
.abspath(filename
)
652 if not os
.path
.exists(filename
):
653 raise errors
.OpPrereqError("No such filename '%s'" % filename
,
659 cluster_name
= cl
.QueryConfigValues(["cluster_name"])[0]
661 results
= GetOnlineNodes(nodes
=opts
.nodes
, cl
=qcl
, filter_master
=True,
662 secondary_ips
=opts
.use_replication_network
,
663 nodegroup
=opts
.nodegroup
)
664 ports
= GetNodesSshPorts(opts
.nodes
, qcl
)
669 srun
= ssh
.SshRunner(cluster_name
)
670 for (node
, port
) in zip(results
, ports
):
671 if not srun
.CopyFileToNode(node
, port
, filename
):
672 ToStderr("Copy of file %s to node %s:%d failed", filename
, node
, port
)
677 def RunClusterCommand(opts
, args
):
678 """Run a command on some nodes.
680 @param opts: the command line options selected by the user
682 @param args: should contain the command to be run and its arguments
684 @return: the desired exit code
690 command
= " ".join(args
)
692 nodes
= GetOnlineNodes(nodes
=opts
.nodes
, cl
=qcl
, nodegroup
=opts
.nodegroup
)
693 ports
= GetNodesSshPorts(nodes
, qcl
)
695 cluster_name
, master_node
= cl
.QueryConfigValues(["cluster_name",
698 srun
= ssh
.SshRunner(cluster_name
=cluster_name
)
700 # Make sure master node is at list end
701 if master_node
in nodes
:
702 nodes
.remove(master_node
)
703 nodes
.append(master_node
)
705 for (name
, port
) in zip(nodes
, ports
):
706 result
= srun
.Run(name
, constants
.SSH_LOGIN_USER
, command
, port
=port
)
708 if opts
.failure_only
and result
.exit_code
== constants
.EXIT_SUCCESS
:
709 # Do not output anything for successful commands
712 ToStdout("------------------------------------------------")
713 if opts
.show_machine_names
:
714 for line
in result
.output
.splitlines():
715 ToStdout("%s: %s", name
, line
)
717 ToStdout("node: %s", name
)
718 ToStdout("%s", result
.output
)
719 ToStdout("return code = %s", result
.exit_code
)
724 def VerifyCluster(opts
, args
):
725 """Verify integrity of cluster, performing various test on nodes.
727 @param opts: the command line options selected by the user
729 @param args: should be an empty list
731 @return: the desired exit code
736 if opts
.skip_nplusone_mem
:
737 skip_checks
.append(constants
.VERIFY_NPLUSONE_MEM
)
741 op
= opcodes
.OpClusterVerify(verbose
=opts
.verbose
,
742 error_codes
=opts
.error_codes
,
743 debug_simulate_errors
=opts
.simulate_errors
,
744 skip_checks
=skip_checks
,
745 ignore_errors
=opts
.ignore_errors
,
746 group_name
=opts
.nodegroup
,
747 verify_clutter
=opts
.verify_clutter
)
748 result
= SubmitOpCode(op
, cl
=cl
, opts
=opts
)
750 # Keep track of submitted jobs
751 jex
= JobExecutor(cl
=cl
, opts
=opts
)
753 for (status
, job_id
) in result
[constants
.JOB_IDS_KEY
]:
754 jex
.AddJobId(None, status
, job_id
)
756 results
= jex
.GetResults()
758 (bad_jobs
, bad_results
) = \
760 # Convert iterators to lists
763 map(compat
.partial(itertools
.ifilterfalse
, bool),
764 # Convert result to booleans in a tuple
765 zip(*((job_success
, len(op_results
) == 1 and op_results
[0])
766 for (job_success
, op_results
) in results
)))))
768 if bad_jobs
== 0 and bad_results
== 0:
769 rcode
= constants
.EXIT_SUCCESS
771 rcode
= constants
.EXIT_FAILURE
773 ToStdout("%s job(s) failed while verifying the cluster.", bad_jobs
)
778 def VerifyDisks(opts
, args
):
779 """Verify integrity of cluster disks.
781 @param opts: the command line options selected by the user
783 @param args: should be an empty list
785 @return: the desired exit code
790 op
= opcodes
.OpClusterVerifyDisks()
792 result
= SubmitOpCode(op
, cl
=cl
, opts
=opts
)
794 # Keep track of submitted jobs
795 jex
= JobExecutor(cl
=cl
, opts
=opts
)
797 for (status
, job_id
) in result
[constants
.JOB_IDS_KEY
]:
798 jex
.AddJobId(None, status
, job_id
)
800 retcode
= constants
.EXIT_SUCCESS
802 for (status
, result
) in jex
.GetResults():
804 ToStdout("Job failed: %s", result
)
807 ((bad_nodes
, instances
, missing
), ) = result
809 for node
, text
in bad_nodes
.items():
810 ToStdout("Error gathering data on node %s: %s",
811 node
, utils
.SafeEncode(text
[-400:]))
812 retcode
= constants
.EXIT_FAILURE
813 ToStdout("You need to fix these nodes first before fixing instances")
815 for iname
in instances
:
818 op
= opcodes
.OpInstanceActivateDisks(instance_name
=iname
)
820 ToStdout("Activating disks for instance '%s'", iname
)
821 SubmitOpCode(op
, opts
=opts
, cl
=cl
)
822 except errors
.GenericError
, err
:
823 nret
, msg
= FormatError(err
)
825 ToStderr("Error activating disks for instance %s: %s", iname
, msg
)
828 for iname
, ival
in missing
.iteritems():
829 all_missing
= compat
.all(x
[0] in bad_nodes
for x
in ival
)
831 ToStdout("Instance %s cannot be verified as it lives on"
832 " broken nodes", iname
)
834 ToStdout("Instance %s has missing logical volumes:", iname
)
836 for node
, vol
in ival
:
837 if node
in bad_nodes
:
838 ToStdout("\tbroken node %s /dev/%s", node
, vol
)
840 ToStdout("\t%s /dev/%s", node
, vol
)
842 ToStdout("You need to replace or recreate disks for all the above"
843 " instances if this message persists after fixing broken nodes.")
844 retcode
= constants
.EXIT_FAILURE
846 ToStdout("No disks need to be activated.")
def RepairDiskSizes(opts, args):
  """Verify sizes of cluster disks.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: optional list of instances to restrict check to
  @rtype: int
  @return: the desired exit code

  """
  op = opcodes.OpClusterRepairDiskSizes(instances=args)
  SubmitOpCode(op, opts=opts)
def MasterFailover(opts, args):
  """Failover the master node.

  This command, when run on a non-master node, will cause the current
  master to cease being master, and the non-master to become new
  master.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  if not opts.no_voting:
    # Verify that a majority of nodes is still healthy
    if not bootstrap.MajorityHealthy():
      # NOTE(review): message tail restored from context -- confirm wording.
      ToStderr("Master-failover with voting is only possible if the majority"
               " of nodes is still healthy; use the --no-voting option after"
               " ensuring by other means that you won't end up in a dual-master"
               " situation.")
      return 1
  if opts.no_voting and not opts.yes_do_it:
    usertext = ("This will perform the failover even if most other nodes"
                " are down, or if this node is outdated. This is dangerous"
                " as it can lead to a non-consistent cluster. Check the"
                " gnt-cluster(8) man page before proceeding. Continue?")
    if not AskUser(usertext):
      return 1

  # BUG FIX: the local variable was misspelled "rvlaue".
  rvalue, msgs = bootstrap.MasterFailover(no_voting=opts.no_voting)
  for msg in msgs:
    ToStderr(msg)
  return rvalue
def MasterPing(opts, args):
  """Checks if the master is alive.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  try:
    cl = GetClient()
    cl.QueryClusterInfo()
    return 0
  except Exception:  # pylint: disable=W0703
    # Any failure to reach the master counts as "not alive".
    return 1
def SearchTags(opts, args):
  """Searches the tags on all the cluster.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the tag pattern
  @rtype: int
  @return: the desired exit code

  """
  op = opcodes.OpTagsSearch(pattern=args[0])
  result = SubmitOpCode(op, opts=opts)

  # NOTE(review): the empty-result guard and sort were restored from
  # context -- confirm against upstream.
  if not result:
    return 1

  result = list(result)
  result.sort()

  for path, tag in result:
    ToStdout("%s %s", path, tag)
def _ReadAndVerifyCert(cert_filename, verify_private_key=False):
  """Reads and verifies an X509 certificate.

  @type cert_filename: string
  @param cert_filename: the path of the file containing the certificate to
                        verify encoded in PEM format
  @type verify_private_key: bool
  @param verify_private_key: whether to verify the private key in addition to
                             the public certificate
  @rtype: string
  @return: a string containing the PEM-encoded certificate.
  @raise errors.X509CertError: if the file cannot be read or does not contain
      a loadable certificate (or private key, when requested)

  """
  # Using "except ... as err" (valid since Python 2.6) instead of the
  # Python-2-only "except ..., err" comma form.
  try:
    pem = utils.ReadFile(cert_filename)
  except IOError as err:
    raise errors.X509CertError(cert_filename,
                               "Unable to read certificate: %s" % str(err))

  try:
    OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM, pem)
  except Exception as err:
    raise errors.X509CertError(cert_filename,
                               "Unable to load certificate: %s" % str(err))

  if verify_private_key:
    try:
      OpenSSL.crypto.load_privatekey(OpenSSL.crypto.FILETYPE_PEM, pem)
    except Exception as err:
      raise errors.X509CertError(cert_filename,
                                 "Unable to load private key: %s" % str(err))

  return pem
975 def _RenewCrypto(new_cluster_cert
, new_rapi_cert
, # pylint: disable=R0911
976 rapi_cert_filename
, new_spice_cert
, spice_cert_filename
,
977 spice_cacert_filename
, new_confd_hmac_key
, new_cds
,
978 cds_filename
, force
, new_node_cert
, new_ssh_keys
,
980 """Renews cluster certificates, keys and secrets.
982 @type new_cluster_cert: bool
983 @param new_cluster_cert: Whether to generate a new cluster certificate
984 @type new_rapi_cert: bool
985 @param new_rapi_cert: Whether to generate a new RAPI certificate
986 @type rapi_cert_filename: string
987 @param rapi_cert_filename: Path to file containing new RAPI certificate
988 @type new_spice_cert: bool
989 @param new_spice_cert: Whether to generate a new SPICE certificate
990 @type spice_cert_filename: string
991 @param spice_cert_filename: Path to file containing new SPICE certificate
992 @type spice_cacert_filename: string
993 @param spice_cacert_filename: Path to file containing the certificate of the
994 CA that signed the SPICE certificate
995 @type new_confd_hmac_key: bool
996 @param new_confd_hmac_key: Whether to generate a new HMAC key
998 @param new_cds: Whether to generate a new cluster domain secret
999 @type cds_filename: string
1000 @param cds_filename: Path to file containing new cluster domain secret
1002 @param force: Whether to ask user for confirmation
1003 @type new_node_cert: bool
1004 @param new_node_cert: Whether to generate new node certificates
1005 @type new_ssh_keys: bool
1006 @param new_ssh_keys: Whether to generate new node SSH keys
1007 @type verbose: boolean
1008 @param verbose: show verbose output
1009 @type debug: boolean
1010 @param debug: show debug output
1013 ToStdout("Updating certificates now. Running \"gnt-cluster verify\" "
1014 " is recommended after this operation.")
1016 if new_rapi_cert
and rapi_cert_filename
:
1017 ToStderr("Only one of the --new-rapi-certificate and --rapi-certificate"
1018 " options can be specified at the same time.")
1021 if new_cds
and cds_filename
:
1022 ToStderr("Only one of the --new-cluster-domain-secret and"
1023 " --cluster-domain-secret options can be specified at"
1027 if new_spice_cert
and (spice_cert_filename
or spice_cacert_filename
):
1028 ToStderr("When using --new-spice-certificate, the --spice-certificate"
1029 " and --spice-ca-certificate must not be used.")
1032 if bool(spice_cacert_filename
) ^
bool(spice_cert_filename
):
1033 ToStderr("Both --spice-certificate and --spice-ca-certificate must be"
1037 rapi_cert_pem
, spice_cert_pem
, spice_cacert_pem
= (None, None, None)
1039 if rapi_cert_filename
:
1040 rapi_cert_pem
= _ReadAndVerifyCert(rapi_cert_filename
, True)
1041 if spice_cert_filename
:
1042 spice_cert_pem
= _ReadAndVerifyCert(spice_cert_filename
, True)
1043 spice_cacert_pem
= _ReadAndVerifyCert(spice_cacert_filename
)
1044 except errors
.X509CertError
, err
:
1045 ToStderr("Unable to load X509 certificate from %s: %s", err
[0], err
[1])
1050 cds
= utils
.ReadFile(cds_filename
)
1051 except Exception, err
: # pylint: disable=W0703
1052 ToStderr("Can't load new cluster domain secret from %s: %s" %
1053 (cds_filename
, str(err
)))
1059 usertext
= ("This requires all daemons on all nodes to be restarted and"
1060 " may take some time. Continue?")
1061 if not AskUser(usertext
):
1064 def _RenewCryptoInner(ctx
):
1065 ctx
.feedback_fn("Updating certificates and keys")
1067 bootstrap
.GenerateClusterCrypto(False,
1074 rapi_cert_pem
=rapi_cert_pem
,
1075 spice_cert_pem
=spice_cert_pem
,
1076 spice_cacert_pem
=spice_cacert_pem
,
1081 if new_rapi_cert
or rapi_cert_pem
:
1082 files_to_copy
.append(pathutils
.RAPI_CERT_FILE
)
1084 if new_spice_cert
or spice_cert_pem
:
1085 files_to_copy
.append(pathutils
.SPICE_CERT_FILE
)
1086 files_to_copy
.append(pathutils
.SPICE_CACERT_FILE
)
1088 if new_confd_hmac_key
:
1089 files_to_copy
.append(pathutils
.CONFD_HMAC_KEY
)
1092 files_to_copy
.append(pathutils
.CLUSTER_DOMAIN_SECRET_FILE
)
1095 for node_name
in ctx
.nonmaster_nodes
:
1096 port
= ctx
.ssh_ports
[node_name
]
1097 ctx
.feedback_fn("Copying %s to %s:%d" %
1098 (", ".join(files_to_copy
), node_name
, port
))
1099 for file_name
in files_to_copy
:
1100 ctx
.ssh
.CopyFileToNode(node_name
, port
, file_name
)
1102 def _RenewClientCerts(ctx
):
1103 ctx
.feedback_fn("Updating client SSL certificates.")
1105 cluster_name
= ssconf
.SimpleStore().GetClusterName()
1107 for node_name
in ctx
.nonmaster_nodes
+ [ctx
.master_node
]:
1108 ssh_port
= ctx
.ssh_ports
[node_name
]
1110 constants
.NDS_CLUSTER_NAME
: cluster_name
,
1111 constants
.NDS_NODE_DAEMON_CERTIFICATE
:
1112 utils
.ReadFile(pathutils
.NODED_CERT_FILE
),
1113 constants
.NDS_NODE_NAME
: node_name
,
1114 constants
.NDS_ACTION
: constants
.CRYPTO_ACTION_CREATE
,
1117 ssh
.RunSshCmdWithStdin(
1120 pathutils
.SSL_UPDATE
,
1124 verbose
=ctx
.verbose
,
1125 use_cluster_key
=True,
1127 strict_host_check
=True)
1129 # Create a temporary ssconf file using the master's client cert digest
1130 # and the 'bootstrap' keyword to enable distribution of all nodes' digests.
1131 master_digest
= utils
.GetCertificateDigest()
1132 ssconf_master_candidate_certs_filename
= os
.path
.join(
1133 pathutils
.DATA_DIR
, "%s%s" %
1134 (constants
.SSCONF_FILEPREFIX
, constants
.SS_MASTER_CANDIDATES_CERTS
))
1136 ssconf_master_candidate_certs_filename
,
1137 data
="%s=%s" % (constants
.CRYPTO_BOOTSTRAP
, master_digest
))
1138 for node_name
in ctx
.nonmaster_nodes
:
1139 port
= ctx
.ssh_ports
[node_name
]
1140 ctx
.feedback_fn("Copying %s to %s:%d" %
1141 (ssconf_master_candidate_certs_filename
, node_name
, port
))
1142 ctx
.ssh
.CopyFileToNode(node_name
, port
,
1143 ssconf_master_candidate_certs_filename
)
1145 # Write the boostrap entry to the config using wconfd.
1146 config_live_lock
= utils
.livelock
.LiveLock("renew_crypto")
1147 cfg
= config
.GetConfig(None, config_live_lock
)
1148 cfg
.AddNodeToCandidateCerts(constants
.CRYPTO_BOOTSTRAP
, master_digest
)
1149 cfg
.Update(cfg
.GetClusterInfo(), ctx
.feedback_fn
)
1151 def _RenewServerAndClientCerts(ctx
):
1152 ctx
.feedback_fn("Updating the cluster SSL certificate.")
1154 master_name
= ssconf
.SimpleStore().GetMasterNode()
1155 bootstrap
.GenerateClusterCrypto(True, # cluster cert
1158 False, # confd hmac key
1163 for node_name
in ctx
.nonmaster_nodes
:
1164 port
= ctx
.ssh_ports
[node_name
]
1165 server_cert
= pathutils
.NODED_CERT_FILE
1166 ctx
.feedback_fn("Copying %s to %s:%d" %
1167 (server_cert
, node_name
, port
))
1168 ctx
.ssh
.CopyFileToNode(node_name
, port
, server_cert
)
1170 _RenewClientCerts(ctx
)
1172 if new_rapi_cert
or new_spice_cert
or new_confd_hmac_key
or new_cds
:
1173 RunWhileClusterStopped(ToStdout
, _RenewCryptoInner
)
1175 # If only node certficates are recreated, call _RenewClientCerts only.
1176 if new_node_cert
and not new_cluster_cert
:
1177 RunWhileDaemonsStopped(ToStdout
, [constants
.NODED
, constants
.WCONFD
],
1178 _RenewClientCerts
, verbose
=verbose
, debug
=debug
)
1180 # If the cluster certificate are renewed, the client certificates need
1181 # to be renewed too.
1182 if new_cluster_cert
:
1183 RunWhileDaemonsStopped(ToStdout
, [constants
.NODED
, constants
.WCONFD
],
1184 _RenewServerAndClientCerts
, verbose
=verbose
,
1187 if new_node_cert
or new_cluster_cert
or new_ssh_keys
:
1189 renew_op
= opcodes
.OpClusterRenewCrypto(
1190 node_certificates
=new_node_cert
or new_cluster_cert
,
1191 ssh_keys
=new_ssh_keys
)
1192 SubmitOpCode(renew_op
, cl
=cl
)
1194 ToStdout("All requested certificates and keys have been replaced."
1195 " Running \"gnt-cluster verify\" now is recommended.")
1200 def _BuildGanetiPubKeys(options
, pub_key_file
=pathutils
.SSH_PUB_KEYS
, cl
=None,
1201 get_online_nodes_fn
=GetOnlineNodes
,
1202 get_nodes_ssh_ports_fn
=GetNodesSshPorts
,
1203 get_node_uuids_fn
=GetNodeUUIDs
,
1205 """Recreates the 'ganeti_pub_key' file by polling all nodes.
1208 if os
.path
.exists(pub_key_file
):
1209 utils
.CreateBackup(pub_key_file
)
1210 utils
.RemoveFile(pub_key_file
)
1212 ssh
.ClearPubKeyFile(pub_key_file
)
1217 (cluster_name
, master_node
) = \
1218 cl
.QueryConfigValues(["cluster_name", "master_node"])
1220 online_nodes
= get_online_nodes_fn([], cl
=cl
)
1221 ssh_ports
= get_nodes_ssh_ports_fn(online_nodes
+ [master_node
], cl
)
1222 ssh_port_map
= dict(zip(online_nodes
+ [master_node
], ssh_ports
))
1224 node_uuids
= get_node_uuids_fn(online_nodes
+ [master_node
], cl
)
1225 node_uuid_map
= dict(zip(online_nodes
+ [master_node
], node_uuids
))
1227 nonmaster_nodes
= [name
for name
in online_nodes
1228 if name
!= master_node
]
1230 _
, pub_key_filename
, _
= \
1231 ssh
.GetUserFiles(constants
.SSH_LOGIN_USER
, mkdir
=False, dircheck
=False,
1232 kind
=constants
.SSHK_DSA
, _homedir_fn
=homedir_fn
)
1234 # get the key file of the master node
1235 pub_key
= utils
.ReadFile(pub_key_filename
)
1236 ssh
.AddPublicKey(node_uuid_map
[master_node
], pub_key
,
1237 key_file
=pub_key_file
)
1239 # get the key files of all non-master nodes
1240 for node
in nonmaster_nodes
:
1241 pub_key
= ssh
.ReadRemoteSshPubKeys(pub_key_filename
, node
, cluster_name
,
1243 options
.ssh_key_check
,
1244 options
.ssh_key_check
)
1245 ssh
.AddPublicKey(node_uuid_map
[node
], pub_key
, key_file
=pub_key_file
)
1248 def RenewCrypto(opts
, args
):
1249 """Renews cluster certificates, keys and secrets.
1252 if opts
.new_ssh_keys
:
1253 _BuildGanetiPubKeys(opts
)
1254 return _RenewCrypto(opts
.new_cluster_cert
,
1257 opts
.new_spice_cert
,
1260 opts
.new_confd_hmac_key
,
1261 opts
.new_cluster_domain_secret
,
1262 opts
.cluster_domain_secret
,
1270 def _GetEnabledDiskTemplates(opts
):
1271 """Determine the list of enabled disk templates.
1274 if opts
.enabled_disk_templates
:
1275 return opts
.enabled_disk_templates
.split(",")
1280 def _GetVgName(opts
, enabled_disk_templates
):
1281 """Determine the volume group name.
1283 @type enabled_disk_templates: list of strings
1284 @param enabled_disk_templates: cluster-wide enabled disk-templates
1287 # consistency between vg name and enabled disk templates
1289 if opts
.vg_name
is not None:
1290 vg_name
= opts
.vg_name
1291 if enabled_disk_templates
:
1292 if vg_name
and not utils
.IsLvmEnabled(enabled_disk_templates
):
1293 ToStdout("You specified a volume group with --vg-name, but you did not"
1294 " enable any of the following lvm-based disk templates: %s" %
1295 utils
.CommaJoin(constants
.DTS_LVM
))
1299 def _GetDrbdHelper(opts
, enabled_disk_templates
):
1300 """Determine the DRBD usermode helper.
1303 drbd_helper
= opts
.drbd_helper
1304 if enabled_disk_templates
:
1305 drbd_enabled
= constants
.DT_DRBD8
in enabled_disk_templates
1306 if not drbd_enabled
and opts
.drbd_helper
:
1307 ToStdout("You specified a DRBD usermode helper with "
1308 " --drbd-usermode-helper while DRBD is not enabled.")
1312 def _GetCompressionTools(opts
):
1313 """Determine the list of custom compression tools.
1316 if opts
.compression_tools
:
1317 return opts
.compression_tools
.split(",")
1318 elif opts
.compression_tools
is None:
1319 return None # To note the parameter was not provided
1321 return constants
.IEC_DEFAULT_TOOLS
# Resetting to default
1324 def SetClusterParams(opts
, args
):
1325 """Modify the cluster.
1327 @param opts: the command line options selected by the user
1329 @param args: should be an empty list
1331 @return: the desired exit code
1334 if not (opts
.vg_name
is not None or
1335 opts
.drbd_helper
is not None or
1336 opts
.enabled_hypervisors
or opts
.hvparams
or
1337 opts
.beparams
or opts
.nicparams
or
1338 opts
.ndparams
or opts
.diskparams
or
1339 opts
.candidate_pool_size
is not None or
1340 opts
.max_running_jobs
is not None or
1341 opts
.max_tracked_jobs
is not None or
1342 opts
.uid_pool
is not None or
1343 opts
.maintain_node_health
is not None or
1344 opts
.add_uids
is not None or
1345 opts
.remove_uids
is not None or
1346 opts
.default_iallocator
is not None or
1347 opts
.default_iallocator_params
is not None or
1348 opts
.reserved_lvs
is not None or
1349 opts
.mac_prefix
is not None or
1350 opts
.master_netdev
is not None or
1351 opts
.master_netmask
is not None or
1352 opts
.use_external_mip_script
is not None or
1353 opts
.prealloc_wipe_disks
is not None or
1355 opts
.enabled_disk_templates
or
1357 opts
.ipolicy_bounds_specs
is not None or
1358 opts
.ipolicy_std_specs
is not None or
1359 opts
.ipolicy_disk_templates
is not None or
1360 opts
.ipolicy_vcpu_ratio
is not None or
1361 opts
.ipolicy_spindle_ratio
is not None or
1362 opts
.modify_etc_hosts
is not None or
1363 opts
.file_storage_dir
is not None or
1364 opts
.install_image
is not None or
1365 opts
.instance_communication_network
is not None or
1366 opts
.zeroing_image
is not None or
1367 opts
.shared_file_storage_dir
is not None or
1368 opts
.compression_tools
is not None or
1369 opts
.shared_file_storage_dir
is not None or
1370 opts
.enabled_user_shutdown
is not None or
1371 opts
.data_collector_interval
or
1372 opts
.enabled_data_collectors
):
1373 ToStderr("Please give at least one of the parameters.")
1376 enabled_disk_templates
= _GetEnabledDiskTemplates(opts
)
1377 vg_name
= _GetVgName(opts
, enabled_disk_templates
)
1380 drbd_helper
= _GetDrbdHelper(opts
, enabled_disk_templates
)
1381 except errors
.OpPrereqError
, e
:
1385 hvlist
= opts
.enabled_hypervisors
1386 if hvlist
is not None:
1387 hvlist
= hvlist
.split(",")
1389 # a list of (name, dict) we can pass directly to dict() (or [])
1390 hvparams
= dict(opts
.hvparams
)
1391 for hv_params
in hvparams
.values():
1392 utils
.ForceDictType(hv_params
, constants
.HVS_PARAMETER_TYPES
)
1394 diskparams
= dict(opts
.diskparams
)
1396 for dt_params
in diskparams
.values():
1397 utils
.ForceDictType(dt_params
, constants
.DISK_DT_TYPES
)
1399 beparams
= opts
.beparams
1400 utils
.ForceDictType(beparams
, constants
.BES_PARAMETER_COMPAT
)
1402 nicparams
= opts
.nicparams
1403 utils
.ForceDictType(nicparams
, constants
.NICS_PARAMETER_TYPES
)
1405 ndparams
= opts
.ndparams
1406 if ndparams
is not None:
1407 utils
.ForceDictType(ndparams
, constants
.NDS_PARAMETER_TYPES
)
1409 ipolicy
= CreateIPolicyFromOpts(
1410 minmax_ispecs
=opts
.ipolicy_bounds_specs
,
1411 std_ispecs
=opts
.ipolicy_std_specs
,
1412 ipolicy_disk_templates
=opts
.ipolicy_disk_templates
,
1413 ipolicy_vcpu_ratio
=opts
.ipolicy_vcpu_ratio
,
1414 ipolicy_spindle_ratio
=opts
.ipolicy_spindle_ratio
,
1417 mnh
= opts
.maintain_node_health
1419 uid_pool
= opts
.uid_pool
1420 if uid_pool
is not None:
1421 uid_pool
= uidpool
.ParseUidPool(uid_pool
)
1423 add_uids
= opts
.add_uids
1424 if add_uids
is not None:
1425 add_uids
= uidpool
.ParseUidPool(add_uids
)
1427 remove_uids
= opts
.remove_uids
1428 if remove_uids
is not None:
1429 remove_uids
= uidpool
.ParseUidPool(remove_uids
)
1431 if opts
.reserved_lvs
is not None:
1432 if opts
.reserved_lvs
== "":
1433 opts
.reserved_lvs
= []
1435 opts
.reserved_lvs
= utils
.UnescapeAndSplit(opts
.reserved_lvs
, sep
=",")
1437 if opts
.master_netmask
is not None:
1439 opts
.master_netmask
= int(opts
.master_netmask
)
1441 ToStderr("The --master-netmask option expects an int parameter.")
1444 ext_ip_script
= opts
.use_external_mip_script
1447 disk_state
= utils
.FlatToDict(opts
.disk_state
)
1451 hv_state
= dict(opts
.hv_state
)
1453 compression_tools
= _GetCompressionTools(opts
)
1455 enabled_data_collectors
= dict(
1456 (k
, v
.lower().startswith("t"))
1457 for k
, v
in opts
.enabled_data_collectors
.items())
1459 unrecognized_data_collectors
= [
1460 k
for k
in enabled_data_collectors
.keys()
1461 if k
not in constants
.DATA_COLLECTOR_NAMES
]
1462 if unrecognized_data_collectors
:
1463 ToStderr("Data collector names not recognized: %s" %
1464 ", ".join(unrecognized_data_collectors
))
1467 data_collector_interval
= dict(
1468 (k
, long(1e6
* float(v
)))
1469 for (k
, v
) in opts
.data_collector_interval
.items())
1471 ToStderr("Can't transform all values to integers: {}".format(
1472 opts
.data_collector_interval
))
1474 if any(v
<= 0 for v
in data_collector_interval
):
1475 ToStderr("Some interval times where not above zero.")
1478 op
= opcodes
.OpClusterSetParams(
1480 drbd_helper
=drbd_helper
,
1481 enabled_hypervisors
=hvlist
,
1485 nicparams
=nicparams
,
1487 diskparams
=diskparams
,
1489 candidate_pool_size
=opts
.candidate_pool_size
,
1490 max_running_jobs
=opts
.max_running_jobs
,
1491 max_tracked_jobs
=opts
.max_tracked_jobs
,
1492 maintain_node_health
=mnh
,
1493 modify_etc_hosts
=opts
.modify_etc_hosts
,
1496 remove_uids
=remove_uids
,
1497 default_iallocator
=opts
.default_iallocator
,
1498 default_iallocator_params
=opts
.default_iallocator_params
,
1499 prealloc_wipe_disks
=opts
.prealloc_wipe_disks
,
1500 mac_prefix
=opts
.mac_prefix
,
1501 master_netdev
=opts
.master_netdev
,
1502 master_netmask
=opts
.master_netmask
,
1503 reserved_lvs
=opts
.reserved_lvs
,
1504 use_external_mip_script
=ext_ip_script
,
1506 disk_state
=disk_state
,
1507 enabled_disk_templates
=enabled_disk_templates
,
1509 file_storage_dir
=opts
.file_storage_dir
,
1510 install_image
=opts
.install_image
,
1511 instance_communication_network
=opts
.instance_communication_network
,
1512 zeroing_image
=opts
.zeroing_image
,
1513 shared_file_storage_dir
=opts
.shared_file_storage_dir
,
1514 compression_tools
=compression_tools
,
1515 enabled_user_shutdown
=opts
.enabled_user_shutdown
,
1516 enabled_data_collectors
=enabled_data_collectors
,
1517 data_collector_interval
=data_collector_interval
,
1519 return base
.GetResult(None, opts
, SubmitOrSend(op
, opts
))
1522 def QueueOps(opts
, args
):
1523 """Queue operations.
1525 @param opts: the command line options selected by the user
1527 @param args: should contain only one element, the subcommand
1529 @return: the desired exit code
1533 client
= GetClient()
1534 if command
in ("drain", "undrain"):
1535 drain_flag
= command
== "drain"
1536 client
.SetQueueDrainFlag(drain_flag
)
1537 elif command
== "info":
1538 result
= client
.QueryConfigValues(["drain_flag"])
1543 ToStdout("The drain flag is %s" % val
)
1545 raise errors
.OpPrereqError("Command '%s' is not valid." % command
,
def _ShowWatcherPause(until):
  """Print a human-readable description of the watcher pause state.

  @param until: Unix timestamp until which the watcher is paused, or None

  """
  if until is None or until < time.time():
    ToStdout("The watcher is not paused.")
  else:
    ToStdout("The watcher is paused until %s.", time.ctime(until))
1558 def WatcherOps(opts
, args
):
1559 """Watcher operations.
1561 @param opts: the command line options selected by the user
1563 @param args: should contain only one element, the subcommand
1565 @return: the desired exit code
1569 client
= GetClient()
1571 if command
== "continue":
1572 client
.SetWatcherPause(None)
1573 ToStdout("The watcher is no longer paused.")
1575 elif command
== "pause":
1577 raise errors
.OpPrereqError("Missing pause duration", errors
.ECODE_INVAL
)
1579 result
= client
.SetWatcherPause(time
.time() + ParseTimespec(args
[1]))
1580 _ShowWatcherPause(result
)
1582 elif command
== "info":
1583 result
= client
.QueryConfigValues(["watcher_pause"])
1584 _ShowWatcherPause(result
[0])
1587 raise errors
.OpPrereqError("Command '%s' is not valid." % command
,
1593 def _OobPower(opts
, node_list
, power
):
1594 """Puts the node in the list to desired power state.
1596 @param opts: The command line options selected by the user
1597 @param node_list: The list of nodes to operate on
1598 @param power: True if they should be powered on, False otherwise
1599 @return: The success of the operation (none failed)
1603 command
= constants
.OOB_POWER_ON
1605 command
= constants
.OOB_POWER_OFF
1607 op
= opcodes
.OpOobCommand(node_names
=node_list
,
1610 timeout
=opts
.oob_timeout
,
1611 power_delay
=opts
.power_delay
)
1612 result
= SubmitOpCode(op
, opts
=opts
)
1614 for node_result
in result
:
1615 (node_tuple
, data_tuple
) = node_result
1616 (_
, node_name
) = node_tuple
1617 (data_status
, _
) = data_tuple
1618 if data_status
!= constants
.RS_NORMAL
:
1619 assert data_status
!= constants
.RS_UNAVAIL
1621 ToStderr("There was a problem changing power for %s, please investigate",
1630 def _InstanceStart(opts
, inst_list
, start
, no_remember
=False):
1631 """Puts the instances in the list to desired state.
1633 @param opts: The command line options selected by the user
1634 @param inst_list: The list of instances to operate on
1635 @param start: True if they should be started, False for shutdown
1636 @param no_remember: If the instance state should be remembered
1637 @return: The success of the operation (none failed)
1641 opcls
= opcodes
.OpInstanceStartup
1642 text_submit
, text_success
, text_failed
= ("startup", "started", "starting")
1644 opcls
= compat
.partial(opcodes
.OpInstanceShutdown
,
1645 timeout
=opts
.shutdown_timeout
,
1646 no_remember
=no_remember
)
1647 text_submit
, text_success
, text_failed
= ("shutdown", "stopped", "stopping")
1649 jex
= JobExecutor(opts
=opts
)
1651 for inst
in inst_list
:
1652 ToStdout("Submit %s of instance %s", text_submit
, inst
)
1653 op
= opcls(instance_name
=inst
)
1654 jex
.QueueJob(inst
, op
)
1656 results
= jex
.GetResults()
1657 bad_cnt
= len([1 for (success
, _
) in results
if not success
])
1660 ToStdout("All instances have been %s successfully", text_success
)
1662 ToStderr("There were errors while %s instances:\n"
1663 "%d error(s) out of %d instance(s)", text_failed
, bad_cnt
,
1670 class _RunWhenNodesReachableHelper(object):
1671 """Helper class to make shared internal state sharing easier.
1673 @ivar success: Indicates if all action_cb calls were successful
1676 def __init__(self
, node_list
, action_cb
, node2ip
, port
, feedback_fn
,
1677 _ping_fn
=netutils
.TcpPing
, _sleep_fn
=time
.sleep
):
1680 @param node_list: The list of nodes to be reachable
1681 @param action_cb: Callback called when a new host is reachable
1683 @param node2ip: Node to ip mapping
1684 @param port: The port to use for the TCP ping
1685 @param feedback_fn: The function used for feedback
1686 @param _ping_fn: Function to check reachabilty (for unittest use only)
1687 @param _sleep_fn: Function to sleep (for unittest use only)
1690 self
.down
= set(node_list
)
1692 self
.node2ip
= node2ip
1694 self
.action_cb
= action_cb
1696 self
.feedback_fn
= feedback_fn
1697 self
._ping_fn
= _ping_fn
1698 self
._sleep_fn
= _sleep_fn
1701 """When called we run action_cb.
1703 @raises utils.RetryAgain: When there are still down nodes
1706 if not self
.action_cb(self
.up
):
1707 self
.success
= False
1710 raise utils
.RetryAgain()
1714 def Wait(self
, secs
):
1715 """Checks if a host is up or waits remaining seconds.
1717 @param secs: The secs remaining
1721 for node
in self
.down
:
1722 if self
._ping_fn(self
.node2ip
[node
], self
.port
, timeout
=_EPO_PING_TIMEOUT
,
1723 live_port_needed
=True):
1724 self
.feedback_fn("Node %s became available" % node
)
1726 self
.down
-= self
.up
1727 # If we have a node available there is the possibility to run the
1728 # action callback successfully, therefore we don't wait and return
1731 self
._sleep_fn(max(0.0, start
+ secs
- time
.time()))
1734 def _RunWhenNodesReachable(node_list
, action_cb
, interval
):
1735 """Run action_cb when nodes become reachable.
1737 @param node_list: The list of nodes to be reachable
1738 @param action_cb: Callback called when a new host is reachable
1739 @param interval: The earliest time to retry
1742 client
= GetClient()
1743 cluster_info
= client
.QueryClusterInfo()
1744 if cluster_info
["primary_ip_version"] == constants
.IP4_VERSION
:
1745 family
= netutils
.IPAddress
.family
1747 family
= netutils
.IP6Address
.family
1749 node2ip
= dict((node
, netutils
.GetHostname(node
, family
=family
).ip
)
1750 for node
in node_list
)
1752 port
= netutils
.GetDaemonPort(constants
.NODED
)
1753 helper
= _RunWhenNodesReachableHelper(node_list
, action_cb
, node2ip
, port
,
1757 return utils
.Retry(helper
, interval
, _EPO_REACHABLE_TIMEOUT
,
1758 wait_fn
=helper
.Wait
)
1759 except utils
.RetryTimeout
:
1760 ToStderr("Time exceeded while waiting for nodes to become reachable"
1761 " again:\n - %s", " - ".join(helper
.down
))
1765 def _MaybeInstanceStartup(opts
, inst_map
, nodes_online
,
1766 _instance_start_fn
=_InstanceStart
):
1767 """Start the instances conditional based on node_states.
1769 @param opts: The command line options selected by the user
1770 @param inst_map: A dict of inst -> nodes mapping
1771 @param nodes_online: A list of nodes online
1772 @param _instance_start_fn: Callback to start instances (unittest use only)
1773 @return: Success of the operation on all instances
1776 start_inst_list
= []
1777 for (inst
, nodes
) in inst_map
.items():
1778 if not (nodes
- nodes_online
):
1779 # All nodes the instance lives on are back online
1780 start_inst_list
.append(inst
)
1782 for inst
in start_inst_list
:
1786 return _instance_start_fn(opts
, start_inst_list
, True)
def _EpoOn(opts, full_node_list, node_list, inst_map):
  """Does the actual power on.

  @param opts: The command line options selected by the user
  @param full_node_list: All nodes to operate on (includes nodes not supporting
    out-of-band)
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # NOTE(review): the power argument here is False although this is the
  # power-on path -- confirm against _OobPower's contract.
  if node_list and not _OobPower(opts, node_list, False):
    ToStderr("Not all nodes seem to get back up, investigate and start"
             " manually if needed")

  # Wait for the nodes to be back up
  action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))

  ToStdout("Waiting until all nodes are available again")
  if not _RunWhenNodesReachable(full_node_list, action_cb, _EPO_PING_INTERVAL):
    ToStderr("Please investigate and start stopped instances manually")
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
def _EpoOff(opts, node_list, inst_map):
  """Does the actual power off.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # Shut down all instances first; powering off nodes that still run
  # instances would risk data loss.
  if not _InstanceStart(opts, inst_map.keys(), False, no_remember=True):
    ToStderr("Please investigate and stop instances manually before continuing")
    return constants.EXIT_FAILURE

  if not node_list:
    return constants.EXIT_SUCCESS

  if _OobPower(opts, node_list, False):
    return constants.EXIT_SUCCESS
  else:
    return constants.EXIT_FAILURE
1839 def Epo(opts
, args
, qcl
=None, _on_fn
=_EpoOn
, _off_fn
=_EpoOff
,
1840 _confirm_fn
=ConfirmOperation
,
1841 _stdout_fn
=ToStdout
, _stderr_fn
=ToStderr
):
1844 @param opts: the command line options selected by the user
1846 @param args: should contain only one element, the subcommand
1848 @return: the desired exit code
1851 if opts
.groups
and opts
.show_all
:
1852 _stderr_fn("Only one of --groups or --all are allowed")
1853 return constants
.EXIT_FAILURE
1854 elif args
and opts
.show_all
:
1855 _stderr_fn("Arguments in combination with --all are not allowed")
1856 return constants
.EXIT_FAILURE
1864 itertools
.chain(*qcl
.QueryGroups(args
, ["node_list"], False))
1866 node_query_list
= args
1868 result
= qcl
.QueryNodes(node_query_list
, ["name", "master", "pinst_list",
1869 "sinst_list", "powered", "offline"],
1872 all_nodes
= map(compat
.fst
, result
)
1875 for (node
, master
, pinsts
, sinsts
, powered
, offline
) in result
:
1877 for inst
in (pinsts
+ sinsts
):
1878 if inst
in inst_map
:
1880 inst_map
[inst
].add(node
)
1882 inst_map
[inst
] = set()
1884 inst_map
[inst
] = set([node
])
1886 if master
and opts
.on
:
1887 # We ignore the master for turning on the machines, in fact we are
1888 # already operating on the master at this point :)
1890 elif master
and not opts
.show_all
:
1891 _stderr_fn("%s is the master node, please do a master-failover to another"
1892 " node not affected by the EPO or use --all if you intend to"
1893 " shutdown the whole cluster", node
)
1894 return constants
.EXIT_FAILURE
1895 elif powered
is None:
1896 _stdout_fn("Node %s does not support out-of-band handling, it can not be"
1897 " handled in a fully automated manner", node
)
1898 elif powered
== opts
.on
:
1899 _stdout_fn("Node %s is already in desired power state, skipping", node
)
1900 elif not offline
or (offline
and powered
):
1901 node_list
.append(node
)
1903 if not (opts
.force
or _confirm_fn(all_nodes
, "nodes", "epo")):
1904 return constants
.EXIT_FAILURE
1907 return _on_fn(opts
, all_nodes
, node_list
, inst_map
)
1909 return _off_fn(opts
, node_list
, inst_map
)
def _GetCreateCommand(info):
  """Build a 'gnt-cluster init' command line matching the given cluster info.

  @param info: dictionary of cluster properties (as returned by
      QueryClusterInfo); only the ipolicy and name are used

  """
  buf = StringIO()
  buf.write("gnt-cluster init")
  PrintIPolicyCommand(buf, info["ipolicy"], False)
  buf.write(" ")
  buf.write(info["name"])
  return buf.getvalue()
def ShowCreateCommand(opts, args):
  """Shows the command that can be used to re-create the cluster.

  Currently it works only for ipolicy specs.

  """
  cl = GetClient()
  result = cl.QueryClusterInfo()
  ToStdout(_GetCreateCommand(result))
def _RunCommandAndReport(cmd):
  """Run a command and report its output, iff it failed.

  @param cmd: the command to execute
  @type cmd: list
  @rtype: bool
  @return: False, if the execution failed.

  """
  result = utils.RunCmd(cmd)
  if result.failed:
    ToStderr("Command %s failed: %s; Output %s" %
             (cmd, result.fail_reason, result.output))
    return False
  return True
def _VerifyCommand(cmd):
  """Verify that a given command succeeds on all online nodes.

  As this function is intended to run during upgrades, it
  is implemented in such a way that it still works, if all Ganeti
  daemons are down.

  @param cmd: the command to execute
  @type cmd: list
  @rtype: list
  @return: the list of node names that are online where
      the command failed.

  """
  command = utils.text.ShellQuoteArgs([str(val) for val in cmd])

  # Read the node lists from ssconf directly, so this also works with
  # all daemons stopped.
  nodes = ssconf.SimpleStore().GetOnlineNodeList()
  master_node = ssconf.SimpleStore().GetMasterNode()
  cluster_name = ssconf.SimpleStore().GetClusterName()

  # If master node is in 'nodes', make sure master node is at list end
  if master_node in nodes:
    nodes.remove(master_node)
    nodes.append(master_node)

  failed = []

  srun = ssh.SshRunner(cluster_name=cluster_name)
  for name in nodes:
    result = srun.Run(name, constants.SSH_LOGIN_USER, command)
    if result.exit_code != 0:
      failed.append(name)

  return failed
def _VerifyVersionInstalled(versionstring):
  """Verify that the given version of ganeti is installed on all online nodes.

  Do nothing, if this is the case, otherwise print an appropriate
  message to stderr.

  @param versionstring: the version to check for
  @type versionstring: string
  @rtype: bool
  @return: True, if the version is installed on all online nodes

  """
  badnodes = _VerifyCommand(["test", "-d",
                             os.path.join(pathutils.PKGLIBDIR, versionstring)])
  if badnodes:
    ToStderr("Ganeti version %s not installed on nodes %s"
             % (versionstring, ", ".join(badnodes)))
    return False

  return True
2008 """Determine the list of running jobs.
2011 @return: the number of jobs still running
2015 qfilter
= qlang
.MakeSimpleFilter("status",
2016 frozenset([constants
.JOB_STATUS_RUNNING
]))
2017 return len(cl
.Query(constants
.QR_JOB
, [], qfilter
).data
)
def _SetGanetiVersion(versionstring):
  """Set the active version of ganeti to the given versionstring

  @type versionstring: string
  @rtype: list
  @return: the list of nodes where the version change failed

  """
  failed = []
  if constants.HAS_GNU_LN:
    # With GNU ln, -T lets us replace the symlink atomically.
    failed.extend(_VerifyCommand(
        ["ln", "-s", "-f", "-T",
         os.path.join(pathutils.PKGLIBDIR, versionstring),
         os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")]))
    failed.extend(_VerifyCommand(
        ["ln", "-s", "-f", "-T",
         os.path.join(pathutils.SHAREDIR, versionstring),
         os.path.join(pathutils.SYSCONFDIR, "ganeti/share")]))
  else:
    # Without -T the old link has to be removed first; not atomic.
    failed.extend(_VerifyCommand(
        ["rm", "-f", os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")]))
    failed.extend(_VerifyCommand(
        ["ln", "-s", "-f", os.path.join(pathutils.PKGLIBDIR, versionstring),
         os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")]))
    failed.extend(_VerifyCommand(
        ["rm", "-f", os.path.join(pathutils.SYSCONFDIR, "ganeti/share")]))
    failed.extend(_VerifyCommand(
        ["ln", "-s", "-f", os.path.join(pathutils.SHAREDIR, versionstring),
         os.path.join(pathutils.SYSCONFDIR, "ganeti/share")]))
  # Deduplicate node names: a node may fail more than one command.
  return list(set(failed))
2052 def _ExecuteCommands(fns
):
2053 """Execute a list of functions, in reverse order.
2055 @type fns: list of functions.
2056 @param fns: the functions to be executed.
2059 for fn
in reversed(fns
):
def _GetConfigVersion():
  """Determine the version the configuration file currently has.

  @rtype: tuple or None
  @return: (major, minor, revision) if the version can be determined,
      None otherwise

  """
  try:
    config_data = serializer.LoadJson(utils.ReadFile(pathutils.CLUSTER_CONF_FILE))
    config_version = config_data["version"]
  except Exception: # pylint: disable=W0703
    # Any read/parse failure simply means the version is unknown.
    return None
  return utils.SplitVersion(config_version)
def _ReadIntentToUpgrade():
  """Read the file documenting the intent to upgrade the cluster.

  @rtype: (string, string) or (None, None)
  @return: (old version, version to upgrade to), if the file exists,
      and (None, None) otherwise.

  """
  if not os.path.isfile(pathutils.INTENT_TO_UPGRADE):
    return (None, None)

  contentstring = utils.ReadFile(pathutils.INTENT_TO_UPGRADE)
  contents = utils.UnescapeAndSplit(contentstring)
  if len(contents) != 3:
    # file syntactically mal-formed
    return (None, None)
  return (contents[0], contents[1])
def _WriteIntentToUpgrade(version):
  """Write file documenting the intent to upgrade the cluster.

  @type version: string
  @param version: the version we intend to upgrade to

  """
  # Record the current version, the target version and our pid, so that
  # an interrupted upgrade can later be detected and resumed or rolled back.
  utils.WriteFile(pathutils.INTENT_TO_UPGRADE,
                  data=utils.EscapeAndJoin([constants.RELEASE_VERSION, version,
                                            "%d" % os.getpid()]))
2110 def _UpgradeBeforeConfigurationChange(versionstring
):
2112 Carry out all the tasks necessary for an upgrade that happen before
2113 the configuration file, or Ganeti version, changes.
2115 @type versionstring: string
2116 @param versionstring: the version to upgrade to
2117 @rtype: (bool, list)
2118 @return: tuple of a bool indicating success and a list of rollback tasks
2123 if not _VerifyVersionInstalled(versionstring
):
2124 return (False, rollback
)
2126 _WriteIntentToUpgrade(versionstring
)
2128 lambda: utils
.RunCmd(["rm", "-f", pathutils
.INTENT_TO_UPGRADE
]))
2130 ToStdoutAndLoginfo("Draining queue")
2131 client
= GetClient()
2132 client
.SetQueueDrainFlag(True)
2134 rollback
.append(lambda: GetClient().SetQueueDrainFlag(False))
2136 if utils
.SimpleRetry(0, _GetRunning
,
2137 constants
.UPGRADE_QUEUE_POLL_INTERVAL
,
2138 constants
.UPGRADE_QUEUE_DRAIN_TIMEOUT
):
2139 ToStderr("Failed to completely empty the queue.")
2140 return (False, rollback
)
2142 ToStdoutAndLoginfo("Pausing the watcher for one hour.")
2143 rollback
.append(lambda: GetClient().SetWatcherPause(None))
2144 GetClient().SetWatcherPause(time
.time() + 60 * 60)
2146 ToStdoutAndLoginfo("Stopping daemons on master node.")
2147 if not _RunCommandAndReport([pathutils
.DAEMON_UTIL
, "stop-all"]):
2148 return (False, rollback
)
2150 if not _VerifyVersionInstalled(versionstring
):
2151 utils
.RunCmd([pathutils
.DAEMON_UTIL
, "start-all"])
2152 return (False, rollback
)
2154 ToStdoutAndLoginfo("Stopping daemons everywhere.")
2155 rollback
.append(lambda: _VerifyCommand([pathutils
.DAEMON_UTIL
, "start-all"]))
2156 badnodes
= _VerifyCommand([pathutils
.DAEMON_UTIL
, "stop-all"])
2158 ToStderr("Failed to stop daemons on %s." % (", ".join(badnodes
),))
2159 return (False, rollback
)
2161 backuptar
= os
.path
.join(pathutils
.BACKUP_DIR
, "ganeti%d.tar" % time
.time())
2162 ToStdoutAndLoginfo("Backing up configuration as %s", backuptar
)
2163 if not _RunCommandAndReport(["mkdir", "-p", pathutils
.BACKUP_DIR
]):
2164 return (False, rollback
)
2166 # Create the archive in a safe manner, as it contains sensitive
2168 (_
, tmp_name
) = tempfile
.mkstemp(prefix
=backuptar
, dir=pathutils
.BACKUP_DIR
)
2169 if not _RunCommandAndReport(["tar", "-cf", tmp_name
,
2170 "--exclude=queue/archive",
2171 pathutils
.DATA_DIR
]):
2172 return (False, rollback
)
2174 os
.rename(tmp_name
, backuptar
)
2175 return (True, rollback
)
def _VersionSpecificDowngrade():
  """Perform any additional downgrade tasks that are version specific
  and need to be done just after the configuration downgrade. This
  function needs to be idempotent, so that it can be redone if the
  downgrade procedure gets interrupted after changing the
  configuration.

  Note that this function has to be reset with every version bump.

  @return: True upon success
  """
  ToStdoutAndLoginfo("Performing version-specific downgrade tasks.")

  return True
def _SwitchVersionAndConfig(versionstring, downgrade):
  """
  Switch to the new Ganeti version and change the configuration,
  in correct order.

  @type versionstring: string
  @param versionstring: the version to change to
  @type downgrade: bool
  @param downgrade: True, if the configuration should be downgraded
  @rtype: (bool, list)
  @return: tuple of a bool indicating success, and a list of
      additional rollback tasks

  """
  rollback = []
  if downgrade:
    ToStdoutAndLoginfo("Downgrading configuration")
    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "--downgrade", "-f"]):
      return (False, rollback)
    # Note: version specific downgrades need to be done before switching
    # binaries, so that we still have the knowledgeable binary if the downgrade
    # process gets interrupted at this point.
    if not _VersionSpecificDowngrade():
      return (False, rollback)

  # Configuration change is the point of no return. From then onwards, it is
  # safer to push through the up/downgrade than to try to roll it back.

  ToStdoutAndLoginfo("Switching to version %s on all nodes", versionstring)
  rollback.append(lambda: _SetGanetiVersion(constants.DIR_VERSION))
  badnodes = _SetGanetiVersion(versionstring)
  if badnodes:
    ToStderr("Failed to switch to Ganeti version %s on nodes %s"
             % (versionstring, ", ".join(badnodes)))
    # When downgrading, failure to switch some nodes is tolerated here,
    # as the configuration change below is the real point of no return.
    if not downgrade:
      return (False, rollback)

  # Now that we have changed to the new version of Ganeti we should
  # not communicate over luxi any more, as luxi might have changed in
  # incompatible ways. Therefore, manually call the corresponding ganeti
  # commands using their canonical (version independent) path.

  if not downgrade:
    ToStdoutAndLoginfo("Upgrading configuration")
    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "-f"]):
      return (False, rollback)

  return (True, rollback)
def _UpgradeAfterConfigurationChange(oldversion):
  """
  Carry out the upgrade actions necessary after switching to the new
  Ganeti version and updating the configuration.

  As this part is run at a time where the new version of Ganeti is already
  running, no communication should happen via luxi, as this is not a stable
  interface. Also, as the configuration change is the point of no return,
  all actions are pushed through, even if some of them fail.

  @param oldversion: the version the upgrade started from
  @type oldversion: string
  @rtype: int
  @return: the intended return value

  """
  returnvalue = 0

  ToStdoutAndLoginfo("Ensuring directories everywhere.")
  badnodes = _VerifyCommand([pathutils.ENSURE_DIRS])
  if badnodes:
    ToStderr("Warning: failed to ensure directories on %s." %
             (", ".join(badnodes)))
    returnvalue = 1

  ToStdoutAndLoginfo("Starting daemons everywhere.")
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
  if badnodes:
    ToStderr("Warning: failed to start daemons on %s." % (", ".join(badnodes),))
    returnvalue = 1

  ToStdoutAndLoginfo("Redistributing the configuration.")
  if not _RunCommandAndReport(["gnt-cluster", "redist-conf", "--yes-do-it"]):
    returnvalue = 1

  ToStdoutAndLoginfo("Restarting daemons everywhere.")
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
  badnodes.extend(_VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
  if badnodes:
    # Deduplicate, as a node may have failed both the stop and the start.
    ToStderr("Warning: failed to start daemons on %s." %
             (", ".join(list(set(badnodes))),))
    returnvalue = 1

  ToStdoutAndLoginfo("Undraining the queue.")
  if not _RunCommandAndReport(["gnt-cluster", "queue", "undrain"]):
    returnvalue = 1

  # Best effort: remove the upgrade marker; failure here is not fatal.
  _RunCommandAndReport(["rm", "-f", pathutils.INTENT_TO_UPGRADE])

  ToStdoutAndLoginfo("Running post-upgrade hooks")
  if not _RunCommandAndReport([pathutils.POST_UPGRADE, oldversion]):
    returnvalue = 1

  ToStdoutAndLoginfo("Unpausing the watcher.")
  if not _RunCommandAndReport(["gnt-cluster", "watcher", "continue"]):
    returnvalue = 1

  ToStdoutAndLoginfo("Verifying cluster.")
  if not _RunCommandAndReport(["gnt-cluster", "verify"]):
    returnvalue = 1

  return returnvalue
def UpgradeGanetiCommand(opts, args):
  """Upgrade a cluster to a new ganeti version.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  if ((not opts.resume and opts.to is None)
      or (opts.resume and opts.to is not None)):
    ToStderr("Precisely one of the options --to and --resume"
             " has to be given")
    return 1

  # If we're not told to resume, verify there is no upgrade
  # in progress.
  if not opts.resume:
    oldversion, versionstring = _ReadIntentToUpgrade()
    if versionstring is not None:
      # An upgrade is going on; verify whether the target matches
      if versionstring == opts.to:
        ToStderr("An upgrade is already in progress. Target version matches,"
                 " resuming.")
        opts.resume = True
        opts.to = None
      else:
        ToStderr("An upgrade from %s to %s is in progress; use --resume to"
                 " finish it first" % (oldversion, versionstring))
        return 1

  utils.SetupLogging(pathutils.LOG_COMMANDS, 'gnt-cluster upgrade', debug=1)

  oldversion = constants.RELEASE_VERSION

  if opts.resume:
    ssconf.CheckMaster(False)
    oldversion, versionstring = _ReadIntentToUpgrade()
    if versionstring is None:
      # Nothing to resume.
      return 0
    version = utils.version.ParseVersion(versionstring)
    if version is None:
      return 1
    configversion = _GetConfigVersion()
    if configversion is None:
      return 1
    # If the upgrade we resume was an upgrade between compatible
    # versions (like 2.10.0 to 2.10.1), the correct configversion
    # does not guarantee that the config has been updated.
    # However, in the case of a compatible update with the configuration
    # not touched, we are running a different dirversion with the same
    # config version.
    config_already_modified = \
      (utils.IsCorrectConfigVersion(version, configversion) and
       not (versionstring != constants.DIR_VERSION and
            configversion == (constants.CONFIG_MAJOR, constants.CONFIG_MINOR,
                              constants.CONFIG_REVISION)))
    if not config_already_modified:
      # We have to start from the beginning; however, some daemons might have
      # already been stopped, so the only way to get into a well-defined state
      # is by starting all daemons again.
      _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
  else:
    versionstring = opts.to
    config_already_modified = False
    version = utils.version.ParseVersion(versionstring)
    if version is None:
      ToStderr("Could not parse version string %s" % versionstring)
      return 1

  msg = utils.version.UpgradeRange(version)
  if msg is not None:
    ToStderr("Cannot upgrade to %s: %s" % (versionstring, msg))
    return 1

  if not config_already_modified:
    success, rollback = _UpgradeBeforeConfigurationChange(versionstring)
    if not success:
      _ExecuteCommands(rollback)
      return 1
  else:
    rollback = []

  downgrade = utils.version.ShouldCfgdowngrade(version)

  success, additionalrollback = \
    _SwitchVersionAndConfig(versionstring, downgrade)
  if not success:
    rollback.extend(additionalrollback)
    _ExecuteCommands(rollback)
    return 1

  return _UpgradeAfterConfigurationChange(oldversion)
#: dictionary mapping each gnt-cluster sub-command name to a tuple of
#: (handler function, positional-argument spec, option list, usage string,
#: help text), consumed by GenericMain below.
commands = {
  "init": (
    InitCluster, [ArgHost(min=1, max=1)],
    [BACKEND_OPT, CP_SIZE_OPT, ENABLED_HV_OPT, GLOBAL_FILEDIR_OPT,
     HVLIST_OPT, MAC_PREFIX_OPT, MASTER_NETDEV_OPT, MASTER_NETMASK_OPT,
     NIC_PARAMS_OPT, NOMODIFY_ETCHOSTS_OPT, NOMODIFY_SSH_SETUP_OPT,
     SECONDARY_IP_OPT, VG_NAME_OPT, MAINTAIN_NODE_HEALTH_OPT, UIDPOOL_OPT,
     DRBD_HELPER_OPT, DEFAULT_IALLOCATOR_OPT, DEFAULT_IALLOCATOR_PARAMS_OPT,
     PRIMARY_IP_VERSION_OPT, PREALLOC_WIPE_DISKS_OPT, NODE_PARAMS_OPT,
     GLOBAL_SHARED_FILEDIR_OPT, USE_EXTERNAL_MIP_SCRIPT, DISK_PARAMS_OPT,
     HV_STATE_OPT, DISK_STATE_OPT, ENABLED_DISK_TEMPLATES_OPT,
     IPOLICY_STD_SPECS_OPT, GLOBAL_GLUSTER_FILEDIR_OPT, INSTALL_IMAGE_OPT,
     ZEROING_IMAGE_OPT, COMPRESSION_TOOLS_OPT,
     ENABLED_USER_SHUTDOWN_OPT,
     ]
    + INSTANCE_POLICY_OPTS + SPLIT_ISPECS_OPTS,
    "[opts...] <cluster_name>", "Initialises a new cluster configuration"),
  "destroy": (
    DestroyCluster, ARGS_NONE, [YES_DOIT_OPT],
    "", "Destroy cluster"),
  "rename": (
    RenameCluster, [ArgHost(min=1, max=1)],
    [FORCE_OPT, DRY_RUN_OPT],
    "<new_name>",
    "Renames the cluster"),
  "redist-conf": (
    RedistributeConfig, ARGS_NONE, SUBMIT_OPTS +
    [DRY_RUN_OPT, PRIORITY_OPT, FORCE_DISTRIBUTION],
    "", "Forces a push of the configuration file and ssconf files"
    " to the nodes in the cluster"),
  "verify": (
    VerifyCluster, ARGS_NONE,
    [VERBOSE_OPT, DEBUG_SIMERR_OPT, ERROR_CODES_OPT, NONPLUS1_OPT,
     DRY_RUN_OPT, PRIORITY_OPT, NODEGROUP_OPT, IGNORE_ERRORS_OPT,
     VERIFY_CLUTTER_OPT],
    "", "Does a check on the cluster configuration"),
  "verify-disks": (
    VerifyDisks, ARGS_NONE, [PRIORITY_OPT],
    "", "Does a check on the cluster disk status"),
  "repair-disk-sizes": (
    RepairDiskSizes, ARGS_MANY_INSTANCES, [DRY_RUN_OPT, PRIORITY_OPT],
    "[instance...]", "Updates mismatches in recorded disk sizes"),
  "master-failover": (
    MasterFailover, ARGS_NONE, [NOVOTING_OPT, FORCE_FAILOVER],
    "", "Makes the current node the master"),
  "master-ping": (
    MasterPing, ARGS_NONE, [],
    "", "Checks if the master is alive"),
  "version": (
    ShowClusterVersion, ARGS_NONE, [],
    "", "Shows the cluster version"),
  "getmaster": (
    ShowClusterMaster, ARGS_NONE, [],
    "", "Shows the cluster master"),
  "copyfile": (
    ClusterCopyFile, [ArgFile(min=1, max=1)],
    [NODE_LIST_OPT, USE_REPL_NET_OPT, NODEGROUP_OPT],
    "[-n node...] <filename>", "Copies a file to all (or only some) nodes"),
  "command": (
    RunClusterCommand, [ArgCommand(min=1)],
    [NODE_LIST_OPT, NODEGROUP_OPT, SHOW_MACHINE_OPT, FAILURE_ONLY_OPT],
    "[-n node...] <command>", "Runs a command on all (or only some) nodes"),
  "info": (
    ShowClusterConfig, ARGS_NONE, [ROMAN_OPT],
    "[--roman]", "Show cluster configuration"),
  "list-tags": (
    ListTags, ARGS_NONE, [], "", "List the tags of the cluster"),
  "add-tags": (
    AddTags, [ArgUnknown()], [TAG_SRC_OPT, PRIORITY_OPT] + SUBMIT_OPTS,
    "tag...", "Add tags to the cluster"),
  "remove-tags": (
    RemoveTags, [ArgUnknown()], [TAG_SRC_OPT, PRIORITY_OPT] + SUBMIT_OPTS,
    "tag...", "Remove tags from the cluster"),
  "search-tags": (
    SearchTags, [ArgUnknown(min=1, max=1)], [PRIORITY_OPT], "",
    "Searches the tags on all objects on"
    " the cluster for a given pattern (regex)"),
  "queue": (
    QueueOps,
    [ArgChoice(min=1, max=1, choices=["drain", "undrain", "info"])],
    [], "drain|undrain|info", "Change queue properties"),
  "watcher": (
    WatcherOps,
    [ArgChoice(min=1, max=1, choices=["pause", "continue", "info"]),
     ArgSuggest(min=0, max=1, choices=["30m", "1h", "4h"])],
    [],
    "{pause <timespec>|continue|info}", "Change watcher properties"),
  "modify": (
    SetClusterParams, ARGS_NONE,
    [FORCE_OPT,
     BACKEND_OPT, CP_SIZE_OPT, RQL_OPT, MAX_TRACK_OPT, INSTALL_IMAGE_OPT,
     INSTANCE_COMMUNICATION_NETWORK_OPT, ENABLED_HV_OPT, HVLIST_OPT,
     MAC_PREFIX_OPT, MASTER_NETDEV_OPT, MASTER_NETMASK_OPT, NIC_PARAMS_OPT,
     VG_NAME_OPT, MAINTAIN_NODE_HEALTH_OPT, UIDPOOL_OPT, ADD_UIDS_OPT,
     REMOVE_UIDS_OPT, DRBD_HELPER_OPT, DEFAULT_IALLOCATOR_OPT,
     DEFAULT_IALLOCATOR_PARAMS_OPT, RESERVED_LVS_OPT, DRY_RUN_OPT, PRIORITY_OPT,
     PREALLOC_WIPE_DISKS_OPT, NODE_PARAMS_OPT, USE_EXTERNAL_MIP_SCRIPT,
     DISK_PARAMS_OPT, HV_STATE_OPT, DISK_STATE_OPT] + SUBMIT_OPTS +
    [ENABLED_DISK_TEMPLATES_OPT, IPOLICY_STD_SPECS_OPT, MODIFY_ETCHOSTS_OPT,
     ENABLED_USER_SHUTDOWN_OPT] +
    INSTANCE_POLICY_OPTS +
    [GLOBAL_FILEDIR_OPT, GLOBAL_SHARED_FILEDIR_OPT, ZEROING_IMAGE_OPT,
     COMPRESSION_TOOLS_OPT] +
    [ENABLED_DATA_COLLECTORS_OPT, DATA_COLLECTOR_INTERVAL_OPT],
    "[opts...]",
    "Alters the parameters of the cluster"),
  "renew-crypto": (
    RenewCrypto, ARGS_NONE,
    [NEW_CLUSTER_CERT_OPT, NEW_RAPI_CERT_OPT, RAPI_CERT_OPT,
     NEW_CONFD_HMAC_KEY_OPT, FORCE_OPT,
     NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT,
     NEW_SPICE_CERT_OPT, SPICE_CERT_OPT, SPICE_CACERT_OPT,
     NEW_NODE_CERT_OPT, NEW_SSH_KEY_OPT, NOSSH_KEYCHECK_OPT,
     VERBOSE_OPT],
    "[opts...]",
    "Renews cluster certificates, keys and secrets"),
  "epo": (
    Epo, [ArgUnknown()],
    [FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT,
     SHUTDOWN_TIMEOUT_OPT, POWER_DELAY_OPT],
    "[opts...] [args]",
    "Performs an emergency power-off on given args"),
  "activate-master-ip": (
    ActivateMasterIp, ARGS_NONE, [], "", "Activates the master IP"),
  "deactivate-master-ip": (
    DeactivateMasterIp, ARGS_NONE, [CONFIRM_OPT], "",
    "Deactivates the master IP"),
  "show-ispecs-cmd": (
    ShowCreateCommand, ARGS_NONE, [], "",
    "Show the command line to re-create the cluster"),
  "upgrade": (
    UpgradeGanetiCommand, ARGS_NONE, [TO_OPT, RESUME_OPT], "",
    "Upgrade (or downgrade) to a new Ganeti version"),
  }
#: dictionary with aliases for commands
aliases = {
  "masterfailover": "master-failover",
  "show": "info",
}
2549 return GenericMain(commands
, override
={"tag_type": constants
.TAG_CLUSTER
},