Implement predictive queue cluster parameter
[ganeti-github.git] / lib / client / gnt_cluster.py
1 #
2 #
3
4 # Copyright (C) 2006, 2007, 2010, 2011, 2012, 2013, 2014 Google Inc.
5 # All rights reserved.
6 #
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are
9 # met:
10 #
11 # 1. Redistributions of source code must retain the above copyright notice,
12 # this list of conditions and the following disclaimer.
13 #
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 """Cluster related commands"""
31
32 # pylint: disable=W0401,W0613,W0614,C0103
33 # W0401: Wildcard import ganeti.cli
34 # W0613: Unused argument, since all functions follow the same API
35 # W0614: Unused import %s from wildcard import (since we need cli)
36 # C0103: Invalid name gnt-cluster
37
38 import itertools
39 import os
40 import time
41 import tempfile
42
43 from cStringIO import StringIO
44
45 import OpenSSL
46
47 from ganeti.cli import *
48 from ganeti import bootstrap
49 from ganeti import compat
50 from ganeti import constants
51 from ganeti import config
52 from ganeti import errors
53 from ganeti import netutils
54 from ganeti import objects
55 from ganeti import opcodes
56 from ganeti import pathutils
57 from ganeti import qlang
58 from ganeti.rpc.node import RunWithRPC
59 from ganeti import serializer
60 from ganeti import ssconf
61 from ganeti import ssh
62 from ganeti import uidpool
63 from ganeti import utils
64 from ganeti import wconfd
65 from ganeti.client import base
66
67
# Recover from an EPO (emergency power off) instead of initiating one.
ON_OPT = cli_option("--on", default=False,
                    action="store_true", dest="on",
                    help="Recover from an EPO")

# Interpret the positional arguments as node group names instead of nodes.
GROUPS_OPT = cli_option("--groups", default=False,
                        action="store_true", dest="groups",
                        help="Arguments are node groups instead of nodes")

# Skip the interactive confirmation of a --no-voting master failover.
FORCE_FAILOVER = cli_option("--yes-do-it", dest="yes_do_it",
                            help="Override interactive check for --no-voting",
                            default=False, action="store_true")

# NOTE(review): unlike the other boolean flags in this module this option has
# no action="store_true"; with optparse that makes it a value-taking option
# whose default is True -- confirm this is intentional.
IGNORE_OFFLINE_NODES_FAILOVER = cli_option(
  "--ignore-offline-nodes", dest="ignore_offline_nodes",
  help="Ignores offline nodes for master failover voting", default=True)

# Push the configuration even while the job queue is drained.
FORCE_DISTRIBUTION = cli_option("--yes-do-it", dest="yes_do_it",
                                help="Unconditionally distribute the"
                                " configuration, even if the queue"
                                " is drained",
                                default=False, action="store_true")

# Target version for "gnt-cluster upgrade".
TO_OPT = cli_option("--to", default=None, type="string",
                    help="The Ganeti version to upgrade to")

# Resume a previously interrupted upgrade instead of starting a new one.
RESUME_OPT = cli_option("--resume", default=False, action="store_true",
                        help="Resume any pending Ganeti upgrades")

# keyval mapping of data collector name -> collection interval in seconds.
DATA_COLLECTOR_INTERVAL_OPT = cli_option(
  "--data-collector-interval", default={}, type="keyval",
  help="Set collection intervals in seconds of data collectors.")

# Run "gnt-cluster verify-disks" group verification in non-strict mode.
STRICT_OPT = cli_option("--no-strict", default=False,
                        dest="no_strict", action="store_true",
                        help="Do not run group verify in strict mode")

# Timing constants used by the EPO (emergency power off) commands.
_EPO_PING_INTERVAL = 30 # 30 seconds between pings
_EPO_PING_TIMEOUT = 1 # 1 second
_EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes
107
108
109 def _InitEnabledDiskTemplates(opts):
110 """Initialize the list of enabled disk templates.
111
112 """
113 if opts.enabled_disk_templates:
114 return opts.enabled_disk_templates.split(",")
115 else:
116 return constants.DEFAULT_ENABLED_DISK_TEMPLATES
117
118
def _InitVgName(opts, enabled_disk_templates):
  """Determine the volume group name to use at cluster init time.

  @type enabled_disk_templates: list of strings
  @param enabled_disk_templates: cluster-wide enabled disk templates

  """
  lvm_needed = utils.IsLvmEnabled(enabled_disk_templates)

  if opts.vg_name is None:
    # No --vg-name given at all: fall back to the default vg, but only
    # if some lvm-based disk template is actually enabled.
    if lvm_needed:
      return constants.DEFAULT_VG
    return None

  vg_name = opts.vg_name
  if not vg_name:
    # Explicitly unset (empty) vg name while lvm templates are enabled.
    if lvm_needed:
      raise errors.OpPrereqError(
          "LVM disk templates are enabled, but vg name not set.")
  elif not lvm_needed:
    # A vg was named although nothing will use it; warn but accept it.
    ToStdout("You specified a volume group with --vg-name, but you did not"
             " enable any disk template that uses lvm.")
  return vg_name
139
140
def _InitDrbdHelper(opts, enabled_disk_templates, feedback_fn=ToStdout):
  """Determine the DRBD usermode helper to use at cluster init time.

  @type enabled_disk_templates: list of strings
  @param enabled_disk_templates: cluster-wide enabled disk templates
  @param feedback_fn: function used to emit informational messages

  """
  helper = opts.drbd_helper

  if constants.DT_DRBD8 not in enabled_disk_templates:
    # DRBD is off; a helper given anyway is harmless, just mention it.
    if helper is not None:
      feedback_fn("Note: You specified a DRBD usermode helper, while DRBD"
                  " storage is not enabled.")
    return helper

  if helper is None:
    return constants.DEFAULT_DRBD_HELPER
  if helper == '':
    raise errors.OpPrereqError(
        "Unsetting the drbd usermode helper while enabling DRBD is not"
        " allowed.")
  return helper
160
161
@RunWithRPC
def InitCluster(opts, args):
  """Initialize the cluster.

  Translates the command line options into the keyword arguments of
  L{bootstrap.InitCluster}, filling in cluster-wide defaults where no
  explicit value was given, and finally submits an OpClusterPostInit
  opcode.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the desired
      cluster name
  @rtype: int
  @return: the desired exit code

  """
  enabled_disk_templates = _InitEnabledDiskTemplates(opts)

  try:
    vg_name = _InitVgName(opts, enabled_disk_templates)
    drbd_helper = _InitDrbdHelper(opts, enabled_disk_templates)
  except errors.OpPrereqError, e:
    ToStderr(str(e))
    return 1

  # Derive the master network device from the NIC mode if not given.
  master_netdev = opts.master_netdev
  if master_netdev is None:
    nic_mode = opts.nicparams.get(constants.NIC_MODE, None)
    if not nic_mode:
      # default case, use bridging
      master_netdev = constants.DEFAULT_BRIDGE
    elif nic_mode == constants.NIC_MODE_OVS:
      # default ovs is different from default bridge
      master_netdev = constants.DEFAULT_OVS
      opts.nicparams[constants.NIC_LINK] = constants.DEFAULT_OVS

  hvlist = opts.enabled_hypervisors
  if hvlist is None:
    hvlist = constants.DEFAULT_ENABLED_HYPERVISOR
  hvlist = hvlist.split(",")

  hvparams = dict(opts.hvparams)
  beparams = opts.beparams
  nicparams = opts.nicparams

  diskparams = dict(opts.diskparams)

  # check the disk template types here, as we cannot rely on the type check done
  # by the opcode parameter types
  diskparams_keys = set(diskparams.keys())
  if diskparams_keys > constants.DISK_TEMPLATES:
    unknown = utils.NiceSort(diskparams_keys - constants.DISK_TEMPLATES)
    ToStderr("Disk templates unknown: %s" % utils.CommaJoin(unknown))
    return 1

  # prepare beparams dict
  beparams = objects.FillDict(constants.BEC_DEFAULTS, beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_COMPAT)

  # prepare nicparams dict
  nicparams = objects.FillDict(constants.NICC_DEFAULTS, nicparams)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  # prepare ndparams dict
  if opts.ndparams is None:
    ndparams = dict(constants.NDC_DEFAULTS)
  else:
    ndparams = objects.FillDict(constants.NDC_DEFAULTS, opts.ndparams)
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)

  # prepare hvparams dict: fill every known hypervisor with its defaults
  for hv in constants.HYPER_TYPES:
    if hv not in hvparams:
      hvparams[hv] = {}
    hvparams[hv] = objects.FillDict(constants.HVC_DEFAULTS[hv], hvparams[hv])
    utils.ForceDictType(hvparams[hv], constants.HVS_PARAMETER_TYPES)

  # prepare diskparams dict: fill every known template with its defaults
  for templ in constants.DISK_TEMPLATES:
    if templ not in diskparams:
      diskparams[templ] = {}
    diskparams[templ] = objects.FillDict(constants.DISK_DT_DEFAULTS[templ],
                                         diskparams[templ])
    utils.ForceDictType(diskparams[templ], constants.DISK_DT_TYPES)

  # prepare ipolicy dict
  ipolicy = CreateIPolicyFromOpts(
    ispecs_mem_size=opts.ispecs_mem_size,
    ispecs_cpu_count=opts.ispecs_cpu_count,
    ispecs_disk_count=opts.ispecs_disk_count,
    ispecs_disk_size=opts.ispecs_disk_size,
    ispecs_nic_count=opts.ispecs_nic_count,
    minmax_ispecs=opts.ipolicy_bounds_specs,
    std_ispecs=opts.ipolicy_std_specs,
    ipolicy_disk_templates=opts.ipolicy_disk_templates,
    ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
    ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
    ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
    fill_all=True)

  if opts.candidate_pool_size is None:
    opts.candidate_pool_size = constants.MASTER_POOL_SIZE_DEFAULT

  if opts.mac_prefix is None:
    opts.mac_prefix = constants.DEFAULT_MAC_PREFIX

  uid_pool = opts.uid_pool
  if uid_pool is not None:
    uid_pool = uidpool.ParseUidPool(uid_pool)

  if opts.prealloc_wipe_disks is None:
    opts.prealloc_wipe_disks = False

  external_ip_setup_script = opts.use_external_mip_script
  if external_ip_setup_script is None:
    external_ip_setup_script = False

  try:
    primary_ip_version = int(opts.primary_ip_version)
  except (ValueError, TypeError), err:
    ToStderr("Invalid primary ip version value: %s" % str(err))
    return 1

  master_netmask = opts.master_netmask
  try:
    if master_netmask is not None:
      master_netmask = int(master_netmask)
  except (ValueError, TypeError), err:
    ToStderr("Invalid master netmask value: %s" % str(err))
    return 1

  if opts.disk_state:
    disk_state = utils.FlatToDict(opts.disk_state)
  else:
    disk_state = {}

  hv_state = dict(opts.hv_state)

  if opts.install_image:
    install_image = opts.install_image
  else:
    install_image = ""

  if opts.zeroing_image:
    zeroing_image = opts.zeroing_image
  else:
    zeroing_image = ""

  compression_tools = _GetCompressionTools(opts)

  default_ialloc_params = opts.default_iallocator_params

  enabled_user_shutdown = bool(opts.enabled_user_shutdown)

  if opts.enabled_predictive_queue is not None:
    enabled_predictive_queue = bool(opts.enabled_predictive_queue)
  else:
    enabled_predictive_queue = True # Predictive queue is enabled by default.

  if opts.ssh_key_type:
    ssh_key_type = opts.ssh_key_type
  else:
    ssh_key_type = constants.SSH_DEFAULT_KEY_TYPE

  ssh_key_bits = ssh.DetermineKeyBits(ssh_key_type, opts.ssh_key_bits, None,
                                      None)

  bootstrap.InitCluster(cluster_name=args[0],
                        secondary_ip=opts.secondary_ip,
                        vg_name=vg_name,
                        mac_prefix=opts.mac_prefix,
                        master_netmask=master_netmask,
                        master_netdev=master_netdev,
                        file_storage_dir=opts.file_storage_dir,
                        shared_file_storage_dir=opts.shared_file_storage_dir,
                        gluster_storage_dir=opts.gluster_storage_dir,
                        enabled_hypervisors=hvlist,
                        hvparams=hvparams,
                        beparams=beparams,
                        nicparams=nicparams,
                        ndparams=ndparams,
                        diskparams=diskparams,
                        ipolicy=ipolicy,
                        candidate_pool_size=opts.candidate_pool_size,
                        modify_etc_hosts=opts.modify_etc_hosts,
                        modify_ssh_setup=opts.modify_ssh_setup,
                        maintain_node_health=opts.maintain_node_health,
                        drbd_helper=drbd_helper,
                        uid_pool=uid_pool,
                        default_iallocator=opts.default_iallocator,
                        default_iallocator_params=default_ialloc_params,
                        primary_ip_version=primary_ip_version,
                        prealloc_wipe_disks=opts.prealloc_wipe_disks,
                        use_external_mip_script=external_ip_setup_script,
                        hv_state=hv_state,
                        disk_state=disk_state,
                        enabled_disk_templates=enabled_disk_templates,
                        install_image=install_image,
                        zeroing_image=zeroing_image,
                        compression_tools=compression_tools,
                        enabled_user_shutdown=enabled_user_shutdown,
                        ssh_key_type=ssh_key_type,
                        ssh_key_bits=ssh_key_bits,
                        enabled_predictive_queue=enabled_predictive_queue,
                        )
  op = opcodes.OpClusterPostInit()
  SubmitOpCode(op, opts=opts)
  return 0
366
367
@RunWithRPC
def DestroyCluster(opts, args):
  """Destroy the cluster.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  if not opts.yes_do_it:
    # Fix: the original message read "really want destroy" (missing "to").
    ToStderr("Destroying a cluster is irreversible. If you really want to"
             " destroy this cluster, supply the --yes-do-it option.")
    return 1

  op = opcodes.OpClusterDestroy()
  master_uuid = SubmitOpCode(op, opts=opts)
  # if we reached this, the opcode didn't fail; we can proceed to
  # shutdown all the daemons
  bootstrap.FinalizeClusterDestroy(master_uuid)
  return 0
390
391
def RenameCluster(opts, args):
  """Rename the cluster.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the new cluster name
  @rtype: int
  @return: the desired exit code

  """
  cl = GetClient()
  (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
  new_name = args[0]

  # Renaming is dangerous if the caller is connected via the cluster IP,
  # so require explicit confirmation unless --force was given.
  if not opts.force:
    warning = ("This will rename the cluster from '%s' to '%s'. If you are"
               " connected over the network to the cluster name, the"
               " operation is very dangerous as the IP address will be"
               " removed from the node and the change may not go through."
               " Continue?") % (cluster_name, new_name)
    if not AskUser(warning):
      return 1

  rename_result = SubmitOpCode(opcodes.OpClusterRename(name=new_name),
                               opts=opts, cl=cl)
  if rename_result:
    ToStdout("Cluster renamed from '%s' to '%s'", cluster_name, rename_result)

  return 0
423
424
def ActivateMasterIp(opts, args):
  """Activates the master IP.

  @rtype: int
  @return: the desired exit code

  """
  SubmitOpCode(opcodes.OpClusterActivateMasterIp())
  return 0
432
433
def DeactivateMasterIp(opts, args):
  """Deactivates the master IP.

  @rtype: int
  @return: the desired exit code

  """
  # Dropping the master IP kills existing connections to it, so ask
  # for confirmation unless --confirm was given.
  if not opts.confirm:
    question = ("This will disable the master IP. All the open connections to"
                " the master IP will be closed. To reach the master you will"
                " need to use its node IP."
                " Continue?")
    if not AskUser(question):
      return 1

  SubmitOpCode(opcodes.OpClusterDeactivateMasterIp())
  return 0
449
450
def RedistributeConfig(opts, args):
  """Forces push of the cluster configuration.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: empty list
  @rtype: int
  @return: the desired exit code

  """
  redist_op = opcodes.OpClusterRedistConf()
  if opts.yes_do_it:
    # --yes-do-it deliberately bypasses a drained job queue.
    SubmitOpCodeToDrainedQueue(redist_op)
  else:
    SubmitOrSend(redist_op, opts)
  return 0
467
468
def ShowClusterVersion(opts, args):
  """Write version of ganeti software to the standard output.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  info = GetClient().QueryClusterInfo()
  # (output format string, QueryClusterInfo result key)
  for fmt, key in [
      ("Software version: %s", "software_version"),
      ("Internode protocol: %s", "protocol_version"),
      ("Configuration format: %s", "config_version"),
      ("OS api version: %s", "os_api_version"),
      ("Export interface: %s", "export_version"),
      ("VCS version: %s", "vcs_version"),
      ]:
    ToStdout(fmt, info[key])
  return 0
488
489
def ShowClusterMaster(opts, args):
  """Write name of master node to the standard output.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  ToStdout(bootstrap.GetMaster())
  return 0
503
504
505 def _FormatGroupedParams(paramsdict, roman=False):
506 """Format Grouped parameters (be, nic, disk) by group.
507
508 @type paramsdict: dict of dicts
509 @param paramsdict: {group: {param: value, ...}, ...}
510 @rtype: dict of dicts
511 @return: copy of the input dictionaries with strings as values
512
513 """
514 ret = {}
515 for (item, val) in paramsdict.items():
516 if isinstance(val, dict):
517 ret[item] = _FormatGroupedParams(val, roman=roman)
518 elif roman and isinstance(val, int):
519 ret[item] = compat.TryToRoman(val)
520 else:
521 ret[item] = str(val)
522 return ret
523
524
def _FormatDataCollectors(paramsdict):
  """Format Grouped parameters (be, nic, disk) by group.

  @type paramsdict: dict of dicts
  @param paramsdict: response of QueryClusterInfo
  @rtype: dict of dicts
  @return: parameter grouped by data collector

  """
  enabled = paramsdict[constants.DATA_COLLECTORS_ENABLED_NAME]
  interval = paramsdict[constants.DATA_COLLECTORS_INTERVAL_NAME]

  # Intervals are stored in microseconds; render them in seconds.
  return dict((name, dict(active=enabled[name],
                          interval="%.3fs" % (interval[name] / 1e6)))
              for name in enabled)
543
544
def ShowClusterConfig(opts, args):
  """Shows cluster information.

  Queries the master for the full cluster description and pretty-prints
  it via L{PrintGenericInfo}.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  cl = GetClient()
  result = cl.QueryClusterInfo()

  if result["tags"]:
    tags = utils.CommaJoin(utils.NiceSort(result["tags"]))
  else:
    tags = "(none)"
  if result["reserved_lvs"]:
    reserved_lvs = utils.CommaJoin(result["reserved_lvs"])
  else:
    reserved_lvs = "(none)"

  # Only display hypervisor parameters of hypervisors that are enabled
  enabled_hv = result["enabled_hypervisors"]
  hvparams = dict((k, v) for k, v in result["hvparams"].iteritems()
                  if k in enabled_hv)

  # List of (label, value) pairs; nested lists become indented sub-sections
  # in PrintGenericInfo's output.
  info = [
    ("Cluster name", result["name"]),
    ("Cluster UUID", result["uuid"]),

    ("Creation time", utils.FormatTime(result["ctime"])),
    ("Modification time", utils.FormatTime(result["mtime"])),

    ("Master node", result["master"]),

    ("Architecture (this node)",
     "%s (%s)" % (result["architecture"][0], result["architecture"][1])),

    ("Tags", tags),

    ("Default hypervisor", result["default_hypervisor"]),
    ("Enabled hypervisors", utils.CommaJoin(enabled_hv)),

    ("Hypervisor parameters", _FormatGroupedParams(hvparams,
                                                   opts.roman_integers)),

    ("OS-specific hypervisor parameters",
     _FormatGroupedParams(result["os_hvp"], opts.roman_integers)),

    ("OS parameters", _FormatGroupedParams(result["osparams"],
                                           opts.roman_integers)),

    ("Hidden OSes", utils.CommaJoin(result["hidden_os"])),
    ("Blacklisted OSes", utils.CommaJoin(result["blacklisted_os"])),

    ("Cluster parameters", [
      ("candidate pool size",
       compat.TryToRoman(result["candidate_pool_size"],
                         convert=opts.roman_integers)),
      ("maximal number of jobs running simultaneously",
       compat.TryToRoman(result["max_running_jobs"],
                         convert=opts.roman_integers)),
      ("maximal number of jobs simultaneously tracked by the scheduler",
       compat.TryToRoman(result["max_tracked_jobs"],
                         convert=opts.roman_integers)),
      ("mac prefix", result["mac_prefix"]),
      ("master netdev", result["master_netdev"]),
      ("master netmask", compat.TryToRoman(result["master_netmask"],
                                           opts.roman_integers)),
      ("use external master IP address setup script",
       result["use_external_mip_script"]),
      ("lvm volume group", result["volume_group_name"]),
      ("lvm reserved volumes", reserved_lvs),
      ("drbd usermode helper", result["drbd_usermode_helper"]),
      ("file storage path", result["file_storage_dir"]),
      ("shared file storage path", result["shared_file_storage_dir"]),
      ("gluster storage path", result["gluster_storage_dir"]),
      ("maintenance of node health", result["maintain_node_health"]),
      ("uid pool", uidpool.FormatUidPool(result["uid_pool"])),
      ("default instance allocator", result["default_iallocator"]),
      ("default instance allocator parameters",
       result["default_iallocator_params"]),
      ("primary ip version", compat.TryToRoman(result["primary_ip_version"],
                                               opts.roman_integers)),
      ("preallocation wipe disks", result["prealloc_wipe_disks"]),
      ("OS search path", utils.CommaJoin(pathutils.OS_SEARCH_PATH)),
      ("ExtStorage Providers search path",
       utils.CommaJoin(pathutils.ES_SEARCH_PATH)),
      ("enabled disk templates",
       utils.CommaJoin(result["enabled_disk_templates"])),
      ("install image", result["install_image"]),
      ("instance communication network",
       result["instance_communication_network"]),
      ("zeroing image", result["zeroing_image"]),
      ("compression tools", result["compression_tools"]),
      ("enabled user shutdown", result["enabled_user_shutdown"]),
      ("modify ssh setup", result["modify_ssh_setup"]),
      ("ssh_key_type", result["ssh_key_type"]),
      ("ssh_key_bits", result["ssh_key_bits"]),
      ("enabled predictive queue", result["enabled_predictive_queue"])
      ]),

    ("Default node parameters",
     _FormatGroupedParams(result["ndparams"], roman=opts.roman_integers)),

    ("Default instance parameters",
     _FormatGroupedParams(result["beparams"], roman=opts.roman_integers)),

    ("Default nic parameters",
     _FormatGroupedParams(result["nicparams"], roman=opts.roman_integers)),

    ("Default disk parameters",
     _FormatGroupedParams(result["diskparams"], roman=opts.roman_integers)),

    ("Instance policy - limits for instances",
     FormatPolicyInfo(result["ipolicy"], None, True, opts.roman_integers)),
    ("Data collectors", _FormatDataCollectors(result)),
    ]

  PrintGenericInfo(info)
  return 0
666
667
def ClusterCopyFile(opts, args):
  """Copy a file from master to some nodes.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the path of
      the file to be copied
  @rtype: int
  @return: the desired exit code

  """
  filename = args[0]
  filename = os.path.abspath(filename)

  if not os.path.exists(filename):
    raise errors.OpPrereqError("No such filename '%s'" % filename,
                               errors.ECODE_INVAL)

  cl = GetClient()
  qcl = GetClient()
  try:
    cluster_name = cl.QueryConfigValues(["cluster_name"])[0]

    # Resolve the target node list (master excluded) before closing the
    # clients, as the SSH copy loop below does not need them anymore.
    results = GetOnlineNodes(nodes=opts.nodes, cl=qcl, filter_master=True,
                             secondary_ips=opts.use_replication_network,
                             nodegroup=opts.nodegroup)
    # NOTE(review): ports are looked up for opts.nodes while the loop below
    # iterates the filtered "results" list; if the two differ in length or
    # order, zip() pairs nodes with the wrong port (or drops nodes) --
    # confirm whether this should be GetNodesSshPorts(results, qcl).
    ports = GetNodesSshPorts(opts.nodes, qcl)
  finally:
    cl.Close()
    qcl.Close()

  srun = ssh.SshRunner(cluster_name)
  for (node, port) in zip(results, ports):
    if not srun.CopyFileToNode(node, port, filename):
      ToStderr("Copy of file %s to node %s:%d failed", filename, node, port)

  return 0
705
706
def RunClusterCommand(opts, args):
  """Run a command on some nodes.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain the command to be run and its arguments
  @rtype: int
  @return: the desired exit code

  """
  cl = GetClient()
  qcl = GetClient()

  cmd = " ".join(args)

  node_names = GetOnlineNodes(nodes=opts.nodes, cl=qcl,
                              nodegroup=opts.nodegroup)
  # NOTE(review): ports are resolved before the master node is moved to the
  # end of the list below; if per-node SSH ports differ, zip() may pair a
  # node with another node's port -- confirm this ordering is intended.
  ssh_ports = GetNodesSshPorts(node_names, qcl)

  (cluster_name, master_node) = cl.QueryConfigValues(["cluster_name",
                                                      "master_node"])

  runner = ssh.SshRunner(cluster_name=cluster_name)

  # Run on the master last, e.g. in case the command restarts daemons there.
  if master_node in node_names:
    node_names.remove(master_node)
    node_names.append(master_node)

  for (name, port) in zip(node_names, ssh_ports):
    result = runner.Run(name, constants.SSH_LOGIN_USER, cmd, port=port)

    if opts.failure_only and result.exit_code == constants.EXIT_SUCCESS:
      # Do not output anything for successful commands
      continue

    ToStdout("------------------------------------------------")
    if opts.show_machine_names:
      for line in result.output.splitlines():
        ToStdout("%s: %s", name, line)
    else:
      ToStdout("node: %s", name)
      ToStdout("%s", result.output)
    ToStdout("return code = %s", result.exit_code)

  return 0
752
753
def VerifyCluster(opts, args):
  """Verify integrity of cluster, performing various test on nodes.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  skip_checks = []
  if opts.skip_nplusone_mem:
    skip_checks.append(constants.VERIFY_NPLUSONE_MEM)

  cl = GetClient()

  verify_op = opcodes.OpClusterVerify(verbose=opts.verbose,
                                      error_codes=opts.error_codes,
                                      debug_simulate_errors=opts.simulate_errors,
                                      skip_checks=skip_checks,
                                      ignore_errors=opts.ignore_errors,
                                      group_name=opts.nodegroup,
                                      verify_clutter=opts.verify_clutter)
  submit_result = SubmitOpCode(verify_op, cl=cl, opts=opts)

  # Keep track of submitted jobs
  jex = JobExecutor(cl=cl, opts=opts)
  for (status, job_id) in submit_result[constants.JOB_IDS_KEY]:
    jex.AddJobId(None, status, job_id)

  job_results = jex.GetResults()

  bad_jobs = sum(1 for (job_success, _) in job_results if not job_success)
  bad_results = sum(1 for (_, op_res) in job_results
                    if not (op_res and op_res[0]))

  if bad_jobs or bad_results:
    if bad_jobs:
      ToStdout("%s job(s) failed while verifying the cluster.", bad_jobs)
    return constants.EXIT_FAILURE
  return constants.EXIT_SUCCESS
799
800
def VerifyDisks(opts, args):
  """Verify integrity of cluster disks.

  Submits an OpClusterVerifyDisks, then activates disks for instances
  reported with inactive disks and reports any missing logical volumes.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  cl = GetClient()

  op = opcodes.OpClusterVerifyDisks(group_name=opts.nodegroup,
                                    is_strict=not opts.no_strict)

  result = SubmitOpCode(op, cl=cl, opts=opts)

  # Keep track of submitted jobs
  jex = JobExecutor(cl=cl, opts=opts)

  for (status, job_id) in result[constants.JOB_IDS_KEY]:
    jex.AddJobId(None, status, job_id)

  retcode = constants.EXIT_SUCCESS

  # "result" is rebound below to each per-job result tuple.
  for (status, result) in jex.GetResults():
    if not status:
      ToStdout("Job failed: %s", result)
      continue

    # Each job returns a single (bad_nodes, instances, missing) triple.
    ((bad_nodes, instances, missing), ) = result

    for node, text in bad_nodes.items():
      ToStdout("Error gathering data on node %s: %s",
               node, utils.SafeEncode(text[-400:]))
      retcode = constants.EXIT_FAILURE
      ToStdout("You need to fix these nodes first before fixing instances")

    # Re-activate disks for instances that are not missing volumes.
    for iname in instances:
      if iname in missing:
        continue
      op = opcodes.OpInstanceActivateDisks(instance_name=iname)
      try:
        ToStdout("Activating disks for instance '%s'", iname)
        SubmitOpCode(op, opts=opts, cl=cl)
      except errors.GenericError, err:
        nret, msg = FormatError(err)
        # Accumulate the worst error code across instances.
        retcode |= nret
        ToStderr("Error activating disks for instance %s: %s", iname, msg)

    if missing:
      for iname, ival in missing.iteritems():
        # Volumes that only live on broken nodes cannot be judged here.
        all_missing = compat.all(x[0] in bad_nodes for x in ival)
        if all_missing:
          ToStdout("Instance %s cannot be verified as it lives on"
                   " broken nodes", iname)
          continue

        ToStdout("Instance %s has missing logical volumes:", iname)
        ival.sort()
        for node, vol in ival:
          if node in bad_nodes:
            ToStdout("\tbroken node %s /dev/%s", node, vol)
          else:
            ToStdout("\t%s /dev/%s", node, vol)

      ToStdout("You need to replace or recreate disks for all the above"
               " instances if this message persists after fixing broken nodes.")
      retcode = constants.EXIT_FAILURE
    elif not instances:
      ToStdout("No disks need to be activated.")

  return retcode
874
875
def RepairDiskSizes(opts, args):
  """Verify sizes of cluster disks.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: optional list of instances to restrict check to
  @rtype: int
  @return: the desired exit code

  """
  op = opcodes.OpClusterRepairDiskSizes(instances=args)
  SubmitOpCode(op, opts=opts)
  # Return an explicit success code, matching both the docstring contract
  # and the other commands in this module (the previous implicit None also
  # resulted in exit status 0).
  return 0
888
889
@RunWithRPC
def MasterFailover(opts, args):
  """Failover the master node.

  This command, when run on a non-master node, will cause the current
  master to cease being master, and the non-master to become new
  master.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  if not opts.no_voting:
    # Verify that a majority of nodes are still healthy
    (majority_healthy, unhealthy_nodes) = bootstrap.MajorityHealthy(
      opts.ignore_offline_nodes)
    if not majority_healthy:
      ToStderr("Master-failover with voting is only possible if the majority"
               " of nodes are still healthy; use the --no-voting option after"
               " ensuring by other means that you won't end up in a dual-master"
               " scenario. Unhealthy nodes: %s" % unhealthy_nodes)
      return 1
  elif not opts.yes_do_it:
    # --no-voting without --yes-do-it: make the user confirm interactively.
    usertext = ("This will perform the failover even if most other nodes"
                " are down, or if this node is outdated. This is dangerous"
                " as it can lead to a non-consistent cluster. Check the"
                " gnt-cluster(8) man page before proceeding. Continue?")
    if not AskUser(usertext):
      return 1

  rvalue, msgs = bootstrap.MasterFailover(no_voting=opts.no_voting)
  for msg in msgs:
    ToStderr(msg)

  return rvalue
930
931
def MasterPing(opts, args):
  """Checks if the master is alive.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  try:
    GetClient().QueryClusterInfo()
  except Exception: # pylint: disable=W0703
    # Any failure to reach/query the master counts as "not alive".
    return 1
  return 0
948
949
def SearchTags(opts, args):
  """Searches the tags on all the cluster.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the tag pattern
  @rtype: int
  @return: the desired exit code

  """
  matches = SubmitOpCode(opcodes.OpTagsSearch(pattern=args[0]), opts=opts)
  if not matches:
    return 1
  for (path, tag) in sorted(matches):
    ToStdout("%s %s", path, tag)
968
969
def _ReadAndVerifyCert(cert_filename, verify_private_key=False):
  """Reads and verifies an X509 certificate.

  @type cert_filename: string
  @param cert_filename: the path of the file containing the certificate to
                        verify encoded in PEM format
  @type verify_private_key: bool
  @param verify_private_key: whether to verify the private key in addition to
                             the public certificate
  @rtype: string
  @return: a string containing the PEM-encoded certificate.
  @raise errors.X509CertError: if the file cannot be read or does not parse
      as a certificate (or, optionally, as a private key)

  """
  try:
    pem = utils.ReadFile(cert_filename)
  except IOError, err:
    raise errors.X509CertError(cert_filename,
                               "Unable to read certificate: %s" % str(err))

  # Parse only to validate; the loaded object is discarded and the raw PEM
  # text is what gets returned.
  try:
    OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM, pem)
  except Exception, err:
    raise errors.X509CertError(cert_filename,
                               "Unable to load certificate: %s" % str(err))

  if verify_private_key:
    # The private key is expected to live in the same PEM file.
    try:
      OpenSSL.crypto.load_privatekey(OpenSSL.crypto.FILETYPE_PEM, pem)
    except Exception, err:
      raise errors.X509CertError(cert_filename,
                                 "Unable to load private key: %s" % str(err))

  return pem
1003
1004
# pylint: disable=R0913
def _RenewCrypto(new_cluster_cert, new_rapi_cert, # pylint: disable=R0911
                 rapi_cert_filename, new_spice_cert, spice_cert_filename,
                 spice_cacert_filename, new_confd_hmac_key, new_cds,
                 cds_filename, force, new_node_cert, new_ssh_keys,
                 ssh_key_type, ssh_key_bits, verbose, debug):
  """Renews cluster certificates, keys and secrets.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type rapi_cert_filename: string
  @param rapi_cert_filename: Path to file containing new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type spice_cert_filename: string
  @param spice_cert_filename: Path to file containing new SPICE certificate
  @type spice_cacert_filename: string
  @param spice_cacert_filename: Path to file containing the certificate of the
                                CA that signed the SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type cds_filename: string
  @param cds_filename: Path to file containing new cluster domain secret
  @type force: bool
  @param force: Whether to ask user for confirmation
  @type new_node_cert: bool
  @param new_node_cert: Whether to generate new node certificates
  @type new_ssh_keys: bool
  @param new_ssh_keys: Whether to generate new node SSH keys
  @type ssh_key_type: One of L{constants.SSHK_ALL}
  @param ssh_key_type: The type of SSH key to be generated
  @type ssh_key_bits: int
  @param ssh_key_bits: The length of the key to be generated
  @type verbose: boolean
  @param verbose: Show verbose output
  @type debug: boolean
  @param debug: Show debug output

  """
  # NOTE(review): this concatenation produces a double space before "is"
  ToStdout("Updating certificates now. Running \"gnt-cluster verify\" "
           " is recommended after this operation.")

  # Reject mutually exclusive or incomplete option combinations up front,
  # before any state is touched.
  if new_rapi_cert and rapi_cert_filename:
    ToStderr("Only one of the --new-rapi-certificate and --rapi-certificate"
             " options can be specified at the same time.")
    return 1

  if new_cds and cds_filename:
    ToStderr("Only one of the --new-cluster-domain-secret and"
             " --cluster-domain-secret options can be specified at"
             " the same time.")
    return 1

  if new_spice_cert and (spice_cert_filename or spice_cacert_filename):
    ToStderr("When using --new-spice-certificate, the --spice-certificate"
             " and --spice-ca-certificate must not be used.")
    return 1

  # A user-supplied SPICE certificate requires the matching CA certificate
  # and vice versa (exactly both or neither).
  if bool(spice_cacert_filename) ^ bool(spice_cert_filename):
    ToStderr("Both --spice-certificate and --spice-ca-certificate must be"
             " specified.")
    return 1

  # Load and syntax-check any user-supplied certificates before stopping
  # anything, so bad input aborts early.
  rapi_cert_pem, spice_cert_pem, spice_cacert_pem = (None, None, None)
  try:
    if rapi_cert_filename:
      rapi_cert_pem = _ReadAndVerifyCert(rapi_cert_filename, True)
    if spice_cert_filename:
      spice_cert_pem = _ReadAndVerifyCert(spice_cert_filename, True)
      spice_cacert_pem = _ReadAndVerifyCert(spice_cacert_filename)
  except errors.X509CertError, err:
    ToStderr("Unable to load X509 certificate from %s: %s", err[0], err[1])
    return 1

  if cds_filename:
    try:
      cds = utils.ReadFile(cds_filename)
    except Exception, err: # pylint: disable=W0703
      ToStderr("Can't load new cluster domain secret from %s: %s" %
               (cds_filename, str(err)))
      return 1
  else:
    cds = None

  if not force:
    usertext = ("This requires all daemons on all nodes to be restarted and"
                " may take some time. Continue?")
    if not AskUser(usertext):
      return 1

  def _RenewCryptoInner(ctx):
    """Regenerates the selected secrets and copies them to all other nodes.

    Runs while the whole cluster is stopped (see RunWhileClusterStopped
    below).  C{ctx} is the helper context object supplied by the runner —
    presumably providing feedback_fn, ssh, ssh_ports and nonmaster_nodes;
    confirm against the RunWhileClusterStopped implementation.

    """
    ctx.feedback_fn("Updating certificates and keys")

    # Note that the cluster certificate and the client certificate
    # (first and sixth argument) are deliberately NOT renewed here.
    bootstrap.GenerateClusterCrypto(False,
                                    new_rapi_cert,
                                    new_spice_cert,
                                    new_confd_hmac_key,
                                    new_cds,
                                    False,
                                    None,
                                    rapi_cert_pem=rapi_cert_pem,
                                    spice_cert_pem=spice_cert_pem,
                                    spice_cacert_pem=spice_cacert_pem,
                                    cds=cds)

    # Distribute every file that was (re)generated to all non-master nodes
    files_to_copy = []

    if new_rapi_cert or rapi_cert_pem:
      files_to_copy.append(pathutils.RAPI_CERT_FILE)

    if new_spice_cert or spice_cert_pem:
      files_to_copy.append(pathutils.SPICE_CERT_FILE)
      files_to_copy.append(pathutils.SPICE_CACERT_FILE)

    if new_confd_hmac_key:
      files_to_copy.append(pathutils.CONFD_HMAC_KEY)

    if new_cds or cds:
      files_to_copy.append(pathutils.CLUSTER_DOMAIN_SECRET_FILE)

    if files_to_copy:
      for node_name in ctx.nonmaster_nodes:
        port = ctx.ssh_ports[node_name]
        ctx.feedback_fn("Copying %s to %s:%d" %
                        (", ".join(files_to_copy), node_name, port))
        for file_name in files_to_copy:
          ctx.ssh.CopyFileToNode(node_name, port, file_name)

  def _RenewClientCerts(ctx):
    """Renews the client SSL certificate on every node via ssl-update.

    Runs with noded/wconfd stopped; invokes the SSL update tool over SSH
    on each node (including the master) and then seeds the master
    candidate certificate map with a 'bootstrap' entry.

    """
    ctx.feedback_fn("Updating client SSL certificates.")

    cluster_name = ssconf.SimpleStore().GetClusterName()

    for node_name in ctx.nonmaster_nodes + [ctx.master_node]:
      ssh_port = ctx.ssh_ports[node_name]
      data = {
        constants.NDS_CLUSTER_NAME: cluster_name,
        constants.NDS_NODE_DAEMON_CERTIFICATE:
          utils.ReadFile(pathutils.NODED_CERT_FILE),
        constants.NDS_NODE_NAME: node_name,
        constants.NDS_ACTION: constants.CRYPTO_ACTION_CREATE,
        }

      ssh.RunSshCmdWithStdin(
        cluster_name,
        node_name,
        pathutils.SSL_UPDATE,
        ssh_port,
        data,
        debug=ctx.debug,
        verbose=ctx.verbose,
        use_cluster_key=True,
        ask_key=False,
        strict_host_check=True)

    # Create a temporary ssconf file using the master's client cert digest
    # and the 'bootstrap' keyword to enable distribution of all nodes' digests.
    master_digest = utils.GetCertificateDigest()
    ssconf_master_candidate_certs_filename = os.path.join(
      pathutils.DATA_DIR, "%s%s" %
      (constants.SSCONF_FILEPREFIX, constants.SS_MASTER_CANDIDATES_CERTS))
    utils.WriteFile(
      ssconf_master_candidate_certs_filename,
      data="%s=%s" % (constants.CRYPTO_BOOTSTRAP, master_digest))
    for node_name in ctx.nonmaster_nodes:
      port = ctx.ssh_ports[node_name]
      ctx.feedback_fn("Copying %s to %s:%d" %
                      (ssconf_master_candidate_certs_filename, node_name, port))
      ctx.ssh.CopyFileToNode(node_name, port,
                             ssconf_master_candidate_certs_filename)

    # Write the boostrap entry to the config using wconfd.
    config_live_lock = utils.livelock.LiveLock("renew_crypto")
    cfg = config.GetConfig(None, config_live_lock)
    cfg.AddNodeToCandidateCerts(constants.CRYPTO_BOOTSTRAP, master_digest)
    cfg.Update(cfg.GetClusterInfo(), ctx.feedback_fn)

  def _RenewServerAndClientCerts(ctx):
    """Renews the cluster (server) certificate and then the client certs.

    Regenerating the cluster certificate invalidates the client
    certificates, hence _RenewClientCerts is chained at the end.

    """
    ctx.feedback_fn("Updating the cluster SSL certificate.")

    master_name = ssconf.SimpleStore().GetMasterNode()
    bootstrap.GenerateClusterCrypto(True, # cluster cert
                                    False, # rapi cert
                                    False, # spice cert
                                    False, # confd hmac key
                                    False, # cds
                                    True, # client cert
                                    master_name)

    # Distribute the new server certificate to all other nodes
    for node_name in ctx.nonmaster_nodes:
      port = ctx.ssh_ports[node_name]
      server_cert = pathutils.NODED_CERT_FILE
      ctx.feedback_fn("Copying %s to %s:%d" %
                      (server_cert, node_name, port))
      ctx.ssh.CopyFileToNode(node_name, port, server_cert)

    _RenewClientCerts(ctx)

  # Renewing these secrets requires the whole cluster to be stopped
  if new_rapi_cert or new_spice_cert or new_confd_hmac_key or new_cds:
    RunWhileClusterStopped(ToStdout, _RenewCryptoInner)

  # If only node certficates are recreated, call _RenewClientCerts only.
  if new_node_cert and not new_cluster_cert:
    RunWhileDaemonsStopped(ToStdout, [constants.NODED, constants.WCONFD],
                           _RenewClientCerts, verbose=verbose, debug=debug)

  # If the cluster certificate are renewed, the client certificates need
  # to be renewed too.
  if new_cluster_cert:
    RunWhileDaemonsStopped(ToStdout, [constants.NODED, constants.WCONFD],
                           _RenewServerAndClientCerts, verbose=verbose,
                           debug=debug)

  # Finally let the master daemon propagate/verify the new material
  if new_node_cert or new_cluster_cert or new_ssh_keys:
    cl = GetClient()
    renew_op = opcodes.OpClusterRenewCrypto(
      node_certificates=new_node_cert or new_cluster_cert,
      renew_ssh_keys=new_ssh_keys,
      ssh_key_type=ssh_key_type,
      ssh_key_bits=ssh_key_bits,
      verbose=verbose,
      debug=debug)
    SubmitOpCode(renew_op, cl=cl)

  ToStdout("All requested certificates and keys have been replaced."
           " Running \"gnt-cluster verify\" now is recommended.")

  return 0
1237
1238
def _BuildGanetiPubKeys(options, pub_key_file=pathutils.SSH_PUB_KEYS, cl=None,
                        get_online_nodes_fn=GetOnlineNodes,
                        get_nodes_ssh_ports_fn=GetNodesSshPorts,
                        get_node_uuids_fn=GetNodeUUIDs,
                        homedir_fn=None):
  """Recreates the 'ganeti_pub_key' file by polling all nodes.

  @param options: the command line options selected by the user; only
    C{options.ssh_key_check} is read here
  @type pub_key_file: string
  @param pub_key_file: path of the public key file to rebuild
  @param cl: client to query the cluster with; a new one is created when
    not given
  @param get_online_nodes_fn: hook returning the online nodes (for tests)
  @param get_nodes_ssh_ports_fn: hook returning the nodes' SSH ports
    (for tests)
  @param get_node_uuids_fn: hook returning the nodes' UUIDs (for tests)
  @param homedir_fn: home directory lookup function, passed through to
    L{ssh.GetUserFiles} (for tests)

  """

  if not cl:
    cl = GetClient()

  (cluster_name, master_node, modify_ssh_setup, ssh_key_type) = \
    cl.QueryConfigValues(["cluster_name", "master_node", "modify_ssh_setup",
                          "ssh_key_type"])

  # In case Ganeti is not supposed to modify the SSH setup, simply exit and do
  # not update this file.
  if not modify_ssh_setup:
    return

  # Keep a backup of the old file, then rebuild from scratch
  if os.path.exists(pub_key_file):
    utils.CreateBackup(pub_key_file)
    utils.RemoveFile(pub_key_file)

  ssh.ClearPubKeyFile(pub_key_file)

  # Collect SSH ports and UUIDs for all online nodes plus the master
  online_nodes = get_online_nodes_fn([], cl=cl)
  ssh_ports = get_nodes_ssh_ports_fn(online_nodes + [master_node], cl)
  ssh_port_map = dict(zip(online_nodes + [master_node], ssh_ports))

  node_uuids = get_node_uuids_fn(online_nodes + [master_node], cl)
  node_uuid_map = dict(zip(online_nodes + [master_node], node_uuids))

  nonmaster_nodes = [name for name in online_nodes
                     if name != master_node]

  _, pub_key_filename, _ = \
    ssh.GetUserFiles(constants.SSH_LOGIN_USER, mkdir=False, dircheck=False,
                     kind=ssh_key_type, _homedir_fn=homedir_fn)

  # get the key file of the master node
  pub_key = utils.ReadFile(pub_key_filename)
  ssh.AddPublicKey(node_uuid_map[master_node], pub_key,
                   key_file=pub_key_file)

  # get the key files of all non-master nodes
  # NOTE(review): options.ssh_key_check is passed twice — presumably for
  # both the key-check and host-check arguments; confirm against
  # ssh.ReadRemoteSshPubKey's signature.
  for node in nonmaster_nodes:
    pub_key = ssh.ReadRemoteSshPubKey(pub_key_filename, node, cluster_name,
                                      ssh_port_map[node],
                                      options.ssh_key_check,
                                      options.ssh_key_check)
    ssh.AddPublicKey(node_uuid_map[node], pub_key, key_file=pub_key_file)
1292
1293
def RenewCrypto(opts, args):
  """Renews cluster certificates, keys and secrets.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @return: the exit code of L{_RenewCrypto}

  """
  if opts.new_ssh_keys:
    # Rebuild the public key file before new keys are generated/distributed
    _BuildGanetiPubKeys(opts)

  # Positional arguments of _RenewCrypto, in its declaration order
  renew_args = (opts.new_cluster_cert,
                opts.new_rapi_cert,
                opts.rapi_cert,
                opts.new_spice_cert,
                opts.spice_cert,
                opts.spice_cacert,
                opts.new_confd_hmac_key,
                opts.new_cluster_domain_secret,
                opts.cluster_domain_secret,
                opts.force,
                opts.new_node_cert,
                opts.new_ssh_keys,
                opts.ssh_key_type,
                opts.ssh_key_bits,
                opts.verbose,
                opts.debug > 0)
  return _RenewCrypto(*renew_args)
1316
1317
1318 def _GetEnabledDiskTemplates(opts):
1319 """Determine the list of enabled disk templates.
1320
1321 """
1322 if opts.enabled_disk_templates:
1323 return opts.enabled_disk_templates.split(",")
1324 else:
1325 return None
1326
1327
def _GetVgName(opts, enabled_disk_templates):
  """Determine the volume group name.

  @type enabled_disk_templates: list of strings
  @param enabled_disk_templates: cluster-wide enabled disk-templates
  @return: the volume group name from the command line (possibly None);
    a warning is printed when a name is given without any lvm-based
    template enabled

  """
  vg_name = opts.vg_name  # None when --vg-name was not given

  # Warn about a vg name that is useless without an lvm-based template
  if (enabled_disk_templates and vg_name and
      not utils.IsLvmEnabled(enabled_disk_templates)):
    ToStdout("You specified a volume group with --vg-name, but you did not"
             " enable any of the following lvm-based disk templates: %s" %
             utils.CommaJoin(constants.DTS_LVM))

  return vg_name
1345
1346
def _GetDrbdHelper(opts, enabled_disk_templates):
  """Determine the DRBD usermode helper.

  @type enabled_disk_templates: list of strings
  @param enabled_disk_templates: cluster-wide enabled disk templates
  @return: the DRBD usermode helper given on the command line (possibly
    None); a warning is printed when a helper is given while DRBD is not
    among the enabled disk templates

  """
  drbd_helper = opts.drbd_helper
  if enabled_disk_templates:
    drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates
    if not drbd_enabled and opts.drbd_helper:
      # FIX: the previous string concatenation ended one fragment with a
      # space and started the next with one, printing a double space
      # ("with  --drbd-usermode-helper").
      ToStdout("You specified a DRBD usermode helper with"
               " --drbd-usermode-helper while DRBD is not enabled.")
  return drbd_helper
1358
1359
1360 def _GetCompressionTools(opts):
1361 """Determine the list of custom compression tools.
1362
1363 """
1364 if opts.compression_tools:
1365 return opts.compression_tools.split(",")
1366 elif opts.compression_tools is None:
1367 return None # To note the parameter was not provided
1368 else:
1369 return constants.IEC_DEFAULT_TOOLS # Resetting to default
1370
1371
1372 def SetClusterParams(opts, args):
1373 """Modify the cluster.
1374
1375 @param opts: the command line options selected by the user
1376 @type args: list
1377 @param args: should be an empty list
1378 @rtype: int
1379 @return: the desired exit code
1380
1381 """
1382 if not (opts.vg_name is not None or
1383 opts.drbd_helper is not None or
1384 opts.enabled_hypervisors or opts.hvparams or
1385 opts.beparams or opts.nicparams or
1386 opts.ndparams or opts.diskparams or
1387 opts.candidate_pool_size is not None or
1388 opts.max_running_jobs is not None or
1389 opts.max_tracked_jobs is not None or
1390 opts.uid_pool is not None or
1391 opts.maintain_node_health is not None or
1392 opts.add_uids is not None or
1393 opts.remove_uids is not None or
1394 opts.default_iallocator is not None or
1395 opts.default_iallocator_params is not None or
1396 opts.reserved_lvs is not None or
1397 opts.mac_prefix is not None or
1398 opts.master_netdev is not None or
1399 opts.master_netmask is not None or
1400 opts.use_external_mip_script is not None or
1401 opts.prealloc_wipe_disks is not None or
1402 opts.hv_state or
1403 opts.enabled_disk_templates or
1404 opts.disk_state or
1405 opts.ipolicy_bounds_specs is not None or
1406 opts.ipolicy_std_specs is not None or
1407 opts.ipolicy_disk_templates is not None or
1408 opts.ipolicy_vcpu_ratio is not None or
1409 opts.ipolicy_spindle_ratio is not None or
1410 opts.ipolicy_memory_ratio is not None or
1411 opts.modify_etc_hosts is not None or
1412 opts.modify_ssh_setup is not None or
1413 opts.file_storage_dir is not None or
1414 opts.install_image is not None or
1415 opts.instance_communication_network is not None or
1416 opts.zeroing_image is not None or
1417 opts.shared_file_storage_dir is not None or
1418 opts.compression_tools is not None or
1419 opts.shared_file_storage_dir is not None or
1420 opts.enabled_user_shutdown is not None or
1421 opts.maint_round_delay is not None or
1422 opts.maint_balance is not None or
1423 opts.maint_balance_threshold is not None or
1424 opts.data_collector_interval or
1425 opts.diagnose_data_collector_filename is not None or
1426 opts.enabled_data_collectors or
1427 opts.enabled_predictive_queue is not None):
1428 ToStderr("Please give at least one of the parameters.")
1429 return 1
1430
1431 enabled_disk_templates = _GetEnabledDiskTemplates(opts)
1432 vg_name = _GetVgName(opts, enabled_disk_templates)
1433
1434 try:
1435 drbd_helper = _GetDrbdHelper(opts, enabled_disk_templates)
1436 except errors.OpPrereqError, e:
1437 ToStderr(str(e))
1438 return 1
1439
1440 hvlist = opts.enabled_hypervisors
1441 if hvlist is not None:
1442 hvlist = hvlist.split(",")
1443
1444 # a list of (name, dict) we can pass directly to dict() (or [])
1445 hvparams = dict(opts.hvparams)
1446 for hv_params in hvparams.values():
1447 utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
1448
1449 diskparams = dict(opts.diskparams)
1450
1451 for dt_params in diskparams.values():
1452 utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
1453
1454 beparams = opts.beparams
1455 utils.ForceDictType(beparams, constants.BES_PARAMETER_COMPAT)
1456
1457 nicparams = opts.nicparams
1458 utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)
1459
1460 ndparams = opts.ndparams
1461 if ndparams is not None:
1462 utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
1463
1464 ipolicy = CreateIPolicyFromOpts(
1465 minmax_ispecs=opts.ipolicy_bounds_specs,
1466 std_ispecs=opts.ipolicy_std_specs,
1467 ipolicy_disk_templates=opts.ipolicy_disk_templates,
1468 ipolicy_vcpu_ratio=opts.ipolicy_vcpu_ratio,
1469 ipolicy_spindle_ratio=opts.ipolicy_spindle_ratio,
1470 ipolicy_memory_ratio=opts.ipolicy_memory_ratio,
1471 )
1472
1473 mnh = opts.maintain_node_health
1474
1475 uid_pool = opts.uid_pool
1476 if uid_pool is not None:
1477 uid_pool = uidpool.ParseUidPool(uid_pool)
1478
1479 add_uids = opts.add_uids
1480 if add_uids is not None:
1481 add_uids = uidpool.ParseUidPool(add_uids)
1482
1483 remove_uids = opts.remove_uids
1484 if remove_uids is not None:
1485 remove_uids = uidpool.ParseUidPool(remove_uids)
1486
1487 if opts.reserved_lvs is not None:
1488 if opts.reserved_lvs == "":
1489 opts.reserved_lvs = []
1490 else:
1491 opts.reserved_lvs = utils.UnescapeAndSplit(opts.reserved_lvs, sep=",")
1492
1493 if opts.master_netmask is not None:
1494 try:
1495 opts.master_netmask = int(opts.master_netmask)
1496 except ValueError:
1497 ToStderr("The --master-netmask option expects an int parameter.")
1498 return 1
1499
1500 ext_ip_script = opts.use_external_mip_script
1501
1502 if opts.disk_state:
1503 disk_state = utils.FlatToDict(opts.disk_state)
1504 else:
1505 disk_state = {}
1506
1507 hv_state = dict(opts.hv_state)
1508
1509 compression_tools = _GetCompressionTools(opts)
1510
1511 enabled_data_collectors = dict(
1512 (k, v.lower().startswith("t"))
1513 for k, v in opts.enabled_data_collectors.items())
1514
1515 unrecognized_data_collectors = [
1516 k for k in enabled_data_collectors.keys()
1517 if k not in constants.DATA_COLLECTOR_NAMES]
1518 if unrecognized_data_collectors:
1519 ToStderr("Data collector names not recognized: %s" %
1520 ", ".join(unrecognized_data_collectors))
1521
1522 try:
1523 data_collector_interval = dict(
1524 (k, long(1e6 * float(v)))
1525 for (k, v) in opts.data_collector_interval.items())
1526 except ValueError:
1527 ToStderr("Can't transform all values to integers: {}".format(
1528 opts.data_collector_interval))
1529 return 1
1530 if any(v <= 0 for v in data_collector_interval):
1531 ToStderr("Some interval times where not above zero.")
1532 return 1
1533
1534 op = opcodes.OpClusterSetParams(
1535 vg_name=vg_name,
1536 drbd_helper=drbd_helper,
1537 enabled_hypervisors=hvlist,
1538 hvparams=hvparams,
1539 os_hvp=None,
1540 beparams=beparams,
1541 nicparams=nicparams,
1542 ndparams=ndparams,
1543 diskparams=diskparams,
1544 ipolicy=ipolicy,
1545 candidate_pool_size=opts.candidate_pool_size,
1546 max_running_jobs=opts.max_running_jobs,
1547 max_tracked_jobs=opts.max_tracked_jobs,
1548 maintain_node_health=mnh,
1549 modify_etc_hosts=opts.modify_etc_hosts,
1550 modify_ssh_setup=opts.modify_ssh_setup,
1551 uid_pool=uid_pool,
1552 add_uids=add_uids,
1553 remove_uids=remove_uids,
1554 default_iallocator=opts.default_iallocator,
1555 default_iallocator_params=opts.default_iallocator_params,
1556 prealloc_wipe_disks=opts.prealloc_wipe_disks,
1557 mac_prefix=opts.mac_prefix,
1558 master_netdev=opts.master_netdev,
1559 master_netmask=opts.master_netmask,
1560 reserved_lvs=opts.reserved_lvs,
1561 use_external_mip_script=ext_ip_script,
1562 hv_state=hv_state,
1563 disk_state=disk_state,
1564 enabled_disk_templates=enabled_disk_templates,
1565 force=opts.force,
1566 file_storage_dir=opts.file_storage_dir,
1567 install_image=opts.install_image,
1568 instance_communication_network=opts.instance_communication_network,
1569 zeroing_image=opts.zeroing_image,
1570 shared_file_storage_dir=opts.shared_file_storage_dir,
1571 compression_tools=compression_tools,
1572 enabled_user_shutdown=opts.enabled_user_shutdown,
1573 maint_round_delay=opts.maint_round_delay,
1574 maint_balance=opts.maint_balance,
1575 maint_balance_threshold=opts.maint_balance_threshold,
1576 enabled_data_collectors=enabled_data_collectors,
1577 data_collector_interval=data_collector_interval,
1578 diagnose_data_collector_filename=opts.diagnose_data_collector_filename,
1579 enabled_predictive_queue=opts.enabled_predictive_queue
1580 )
1581 return base.GetResult(None, opts, SubmitOrSend(op, opts))
1582
1583
def QueueOps(opts, args):
  """Queue operations.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the subcommand
  @rtype: int
  @return: the desired exit code

  """
  command = args[0]
  client = GetClient()

  if command == "drain":
    client.SetQueueDrainFlag(True)
  elif command == "undrain":
    client.SetQueueDrainFlag(False)
  elif command == "info":
    # QueryConfigValues returns one value per requested key
    drained = client.QueryConfigValues(["drain_flag"])[0]
    ToStdout("The drain flag is %s" % ("set" if drained else "unset"))
  else:
    raise errors.OpPrereqError("Command '%s' is not valid." % command,
                               errors.ECODE_INVAL)

  return 0
1611
1612
def _ShowWatcherPause(until):
  """Prints a human-readable description of the watcher pause state.

  @param until: timestamp until which the watcher is paused, or None

  """
  if until is not None and until >= time.time():
    ToStdout("The watcher is paused until %s.", time.ctime(until))
  else:
    # No pause set, or the pause already expired
    ToStdout("The watcher is not paused.")
1618
1619
def WatcherOps(opts, args):
  """Watcher operations.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the subcommand
  @rtype: int
  @return: the desired exit code

  """
  subcommand = args[0]
  client = GetClient()

  if subcommand == "continue":
    client.SetWatcherPause(None)
    ToStdout("The watcher is no longer paused.")
  elif subcommand == "pause":
    if len(args) < 2:
      raise errors.OpPrereqError("Missing pause duration", errors.ECODE_INVAL)
    # The pause end is "now" plus the human-readable duration (e.g. "1h")
    until = time.time() + ParseTimespec(args[1])
    _ShowWatcherPause(client.SetWatcherPause(until))
  elif subcommand == "info":
    _ShowWatcherPause(client.QueryConfigValues(["watcher_pause"])[0])
  else:
    raise errors.OpPrereqError("Command '%s' is not valid." % subcommand,
                               errors.ECODE_INVAL)

  return 0
1653
1654
def _OobPower(opts, node_list, power):
  """Puts the node in the list to desired power state.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on
  @param power: True if they should be powered on, False otherwise
  @return: The success of the operation (none failed)

  """
  oob_command = constants.OOB_POWER_ON if power else constants.OOB_POWER_OFF

  op = opcodes.OpOobCommand(node_names=node_list,
                            command=oob_command,
                            ignore_status=True,
                            timeout=opts.oob_timeout,
                            power_delay=opts.power_delay)
  result = SubmitOpCode(op, opts=opts)

  success = True
  for ((_, node_name), (data_status, _)) in result:
    if data_status != constants.RS_NORMAL:
      # Offline nodes were not skipped, so RS_UNAVAIL cannot occur here
      assert data_status != constants.RS_UNAVAIL
      success = False
      ToStderr("There was a problem changing power for %s, please investigate",
               node_name)

  return success
1690
1691
def _InstanceStart(opts, inst_list, start, no_remember=False):
  """Puts the instances in the list to desired state.

  @param opts: The command line options selected by the user
  @param inst_list: The list of instances to operate on
  @param start: True if they should be started, False for shutdown
  @param no_remember: If the instance state should be remembered
  @return: The success of the operation (none failed)

  """
  if start:
    opcls = opcodes.OpInstanceStartup
    (text_submit, text_success, text_failed) = \
      ("startup", "started", "starting")
  else:
    opcls = compat.partial(opcodes.OpInstanceShutdown,
                           timeout=opts.shutdown_timeout,
                           no_remember=no_remember)
    (text_submit, text_success, text_failed) = \
      ("shutdown", "stopped", "stopping")

  jex = JobExecutor(opts=opts)

  # Queue one job per instance and wait for all of them
  for instance in inst_list:
    ToStdout("Submit %s of instance %s", text_submit, instance)
    jex.QueueJob(instance, opcls(instance_name=instance))

  results = jex.GetResults()
  bad_cnt = sum(1 for (success, _) in results if not success)

  if bad_cnt:
    ToStderr("There were errors while %s instances:\n"
             "%d error(s) out of %d instance(s)", text_failed, bad_cnt,
             len(results))
    return False

  ToStdout("All instances have been %s successfully", text_success)
  return True
1730
1731
class _RunWhenNodesReachableHelper(object):
  """Helper class to make shared internal state sharing easier.

  Designed to be driven by L{utils.Retry}: the instance itself is the retry
  function and L{Wait} is the wait function.

  @ivar success: Indicates if all action_cb calls were successful

  """
  def __init__(self, node_list, action_cb, node2ip, port, feedback_fn,
               _ping_fn=netutils.TcpPing, _sleep_fn=time.sleep):
    """Init the object.

    @param node_list: The list of nodes to be reachable
    @param action_cb: Callback called when a new host is reachable
    @type node2ip: dict
    @param node2ip: Node to ip mapping
    @param port: The port to use for the TCP ping
    @param feedback_fn: The function used for feedback
    @param _ping_fn: Function to check reachabilty (for unittest use only)
    @param _sleep_fn: Function to sleep (for unittest use only)

    """
    # Nodes not yet seen up / nodes already seen up
    self.down = set(node_list)
    self.up = set()
    self.node2ip = node2ip
    self.success = True
    self.action_cb = action_cb
    self.port = port
    self.feedback_fn = feedback_fn
    self._ping_fn = _ping_fn
    self._sleep_fn = _sleep_fn

  def __call__(self):
    """When called we run action_cb.

    @raises utils.RetryAgain: When there are still down nodes

    """
    # One failing callback invocation marks the whole run unsuccessful
    if not self.action_cb(self.up):
      self.success = False

    if not self.down:
      return self.success
    raise utils.RetryAgain()

  def Wait(self, secs):
    """Checks if a host is up or waits remaining seconds.

    @param secs: The secs remaining

    """
    started_at = time.time()
    for node in self.down:
      reachable = self._ping_fn(self.node2ip[node], self.port,
                                timeout=_EPO_PING_TIMEOUT,
                                live_port_needed=True)
      if reachable:
        self.feedback_fn("Node %s became available" % node)
        self.up.add(node)
        self.down -= self.up
        # If we have a node available there is the possibility to run the
        # action callback successfully, therefore we don't wait and return
        return

    # No node came up: sleep whatever is left of the requested interval
    self._sleep_fn(max(0.0, started_at + secs - time.time()))
1794
1795
def _RunWhenNodesReachable(node_list, action_cb, interval):
  """Run action_cb when nodes become reachable.

  @param node_list: The list of nodes to be reachable
  @param action_cb: Callback called when a new host is reachable
  @param interval: The earliest time to retry

  """
  client = GetClient()
  cluster_info = client.QueryClusterInfo()

  # Resolve node names using the cluster's primary IP family
  if cluster_info["primary_ip_version"] == constants.IP4_VERSION:
    family = netutils.IPAddress.family
  else:
    family = netutils.IP6Address.family

  node2ip = dict((name, netutils.GetHostname(name, family=family).ip)
                 for name in node_list)

  daemon_port = netutils.GetDaemonPort(constants.NODED)
  helper = _RunWhenNodesReachableHelper(node_list, action_cb, node2ip,
                                        daemon_port, ToStdout)

  try:
    return utils.Retry(helper, interval, _EPO_REACHABLE_TIMEOUT,
                       wait_fn=helper.Wait)
  except utils.RetryTimeout:
    ToStderr("Time exceeded while waiting for nodes to become reachable"
             " again:\n - %s", " - ".join(helper.down))
    return False
1825
1826
def _MaybeInstanceStartup(opts, inst_map, nodes_online,
                          _instance_start_fn=_InstanceStart):
  """Start the instances conditional based on node_states.

  @param opts: The command line options selected by the user
  @param inst_map: A dict of inst -> nodes mapping
  @param nodes_online: A list of nodes online
  @param _instance_start_fn: Callback to start instances (unittest use only)
  @return: Success of the operation on all instances

  """
  # An instance can be started once every node it lives on is online
  start_inst_list = [inst for (inst, nodes) in inst_map.items()
                     if not (nodes - nodes_online)]

  # Forget the started instances so they are not submitted twice
  for inst in start_inst_list:
    del inst_map[inst]

  if not start_inst_list:
    return True
  return _instance_start_fn(opts, start_inst_list, True)
1851
1852
def _EpoOn(opts, full_node_list, node_list, inst_map):
  """Does the actual power on.

  @param opts: The command line options selected by the user
  @param full_node_list: All nodes to operate on (includes nodes not supporting
                         OOB)
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # FIX: power the nodes *on* (True); the previous call passed False, which
  # powered them off — contradicting this function's purpose and the
  # "get back up" message below (_EpoOff is the one that passes False).
  if node_list and not _OobPower(opts, node_list, True):
    ToStderr("Not all nodes seem to get back up, investigate and start"
             " manually if needed")

  # Wait for the nodes to be back up
  action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))

  ToStdout("Waiting until all nodes are available again")
  if not _RunWhenNodesReachable(full_node_list, action_cb, _EPO_PING_INTERVAL):
    ToStderr("Please investigate and start stopped instances manually")
    return constants.EXIT_FAILURE

  return constants.EXIT_SUCCESS
1877
1878
def _EpoOff(opts, node_list, inst_map):
  """Does the actual power off.

  @param opts: The command line options selected by the user
  @param node_list: The list of nodes to operate on (all need to support OOB)
  @param inst_map: A dict of inst -> nodes mapping
  @return: The desired exit status

  """
  # Stop all instances first; no_remember=True so their stopped state is
  # not recorded as the desired one
  if not _InstanceStart(opts, inst_map.keys(), False, no_remember=True):
    ToStderr("Please investigate and stop instances manually before continuing")
    return constants.EXIT_FAILURE

  # With no OOB-capable nodes there is nothing left to power off
  if not node_list or _OobPower(opts, node_list, False):
    return constants.EXIT_SUCCESS
  return constants.EXIT_FAILURE
1899
1900
def Epo(opts, args, qcl=None, _on_fn=_EpoOn, _off_fn=_EpoOff,
        _confirm_fn=ConfirmOperation,
        _stdout_fn=ToStdout, _stderr_fn=ToStderr):
  """EPO operations.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the subcommand
  @param qcl: query client to use (for tests); created when not given
  @param _on_fn: power-on implementation (injectable for unit tests)
  @param _off_fn: power-off implementation (injectable for unit tests)
  @param _confirm_fn: confirmation prompt (injectable for unit tests)
  @param _stdout_fn: stdout writer (injectable for unit tests)
  @param _stderr_fn: stderr writer (injectable for unit tests)
  @rtype: int
  @return: the desired exit code

  """
  # --groups and --all are mutually exclusive, and --all takes no arguments
  if opts.groups and opts.show_all:
    _stderr_fn("Only one of --groups or --all are allowed")
    return constants.EXIT_FAILURE
  elif args and opts.show_all:
    _stderr_fn("Arguments in combination with --all are not allowed")
    return constants.EXIT_FAILURE

  if qcl is None:
    # Query client
    qcl = GetClient()

  # With --groups, the arguments name node groups; expand them to nodes
  if opts.groups:
    node_query_list = \
      itertools.chain(*qcl.QueryGroups(args, ["node_list"], False))
  else:
    node_query_list = args

  result = qcl.QueryNodes(node_query_list, ["name", "master", "pinst_list",
                                            "sinst_list", "powered", "offline"],
                          False)

  all_nodes = map(compat.fst, result)
  node_list = []
  inst_map = {}
  for (node, master, pinsts, sinsts, powered, offline) in result:
    # Build inst -> set(nodes) for all instances on non-offline nodes.
    # NOTE(review): an instance that touches the master node gets an empty
    # node set — presumably so it is considered startable immediately
    # (the master is up while this command runs); confirm against
    # _MaybeInstanceStartup's "all nodes online" check.
    if not offline:
      for inst in (pinsts + sinsts):
        if inst in inst_map:
          if not master:
            inst_map[inst].add(node)
        elif master:
          inst_map[inst] = set()
        else:
          inst_map[inst] = set([node])

    if master and opts.on:
      # We ignore the master for turning on the machines, in fact we are
      # already operating on the master at this point :)
      continue
    elif master and not opts.show_all:
      _stderr_fn("%s is the master node, please do a master-failover to another"
                 " node not affected by the EPO or use --all if you intend to"
                 " shutdown the whole cluster", node)
      return constants.EXIT_FAILURE
    elif powered is None:
      # Nodes without OOB support cannot be powered on/off automatically
      _stdout_fn("Node %s does not support out-of-band handling, it can not be"
                 " handled in a fully automated manner", node)
    elif powered == opts.on:
      _stdout_fn("Node %s is already in desired power state, skipping", node)
    elif not offline or (offline and powered):
      node_list.append(node)

  # Ask for confirmation unless --force was given
  if not (opts.force or _confirm_fn(all_nodes, "nodes", "epo")):
    return constants.EXIT_FAILURE

  if opts.on:
    return _on_fn(opts, all_nodes, node_list, inst_map)
  else:
    return _off_fn(opts, node_list, inst_map)
1972
1973
def RemoveRepair(opts, args):
  """Unconditionally remove a repair event from the maintenance daemon.

  @param opts: the command line options selected by the user (ignored)
  @type args: list
  @param args: one element, the uuid of the event to remove
  @rtype: int
  @return: the desired exit code

  """
  event_uuid = args[0]
  wconfd.Client().RmMaintdIncident(event_uuid)
  return 0
1987
1988
def _GetCreateCommand(info):
  """Assemble the 'gnt-cluster init' command line for the given cluster info.

  Only the instance policy and the cluster name are reproduced.

  """
  out = StringIO()
  out.write("gnt-cluster init")
  PrintIPolicyCommand(out, info["ipolicy"], False)
  out.write(" ")
  out.write(info["name"])
  return out.getvalue()
1996
1997
def ShowCreateCommand(opts, args):
  """Shows the command that can be used to re-create the cluster.

  Currently it works only for ipolicy specs.

  """
  cluster_info = GetClient().QueryClusterInfo()
  ToStdout(_GetCreateCommand(cluster_info))
2007
2008
def _RunCommandAndReport(cmd):
  """Run a command and report its output, iff it failed.

  @param cmd: the command to execute
  @type cmd: list
  @rtype: bool
  @return: False, if the execution failed.

  """
  result = utils.RunCmd(cmd)
  if not result.failed:
    return True
  # Only failures are reported; successful runs stay silent.
  ToStderr("Command %s failed: %s; Output %s" %
           (cmd, result.fail_reason, result.output))
  return False
2024
2025
def _VerifyCommand(cmd):
  """Verify that a given command succeeds on all online nodes.

  As this function is intended to run during upgrades, it
  is implemented in such a way that it still works, if all Ganeti
  daemons are down.

  @param cmd: a list of unquoted shell arguments
  @type cmd: list
  @rtype: list
  @return: the list of node names that are online where
      the command failed.

  """
  # Stringify and shell-quote each argument before handing the joined
  # command to the raw runner.
  quoted = utils.text.ShellQuoteArgs([str(arg) for arg in cmd])
  return _VerifyCommandRaw(quoted)
2041
2042
def _VerifyCommandRaw(command):
  """Verify that a given command succeeds on all online nodes.

  As this function is intended to run during upgrades, it
  is implemented in such a way that it still works, if all Ganeti
  daemons are down.

  @param command: a bare string to pass to SSH. The caller must do their
      own shell/ssh escaping.
  @type command: string
  @rtype: list
  @return: the list of node names that are online where
      the command failed.

  """
  # A single SimpleStore instance suffices for all three lookups.
  ss = ssconf.SimpleStore()
  nodes = ss.GetOnlineNodeList()
  master_node = ss.GetMasterNode()
  cluster_name = ss.GetClusterName()

  # If master node is in 'nodes', make sure master node is at list end,
  # so a command that disrupts connectivity hits the node we are running
  # on only after all other nodes have been handled.
  if master_node in nodes:
    nodes.remove(master_node)
    nodes.append(master_node)

  failed = []

  srun = ssh.SshRunner(cluster_name=cluster_name)
  for name in nodes:
    result = srun.Run(name, constants.SSH_LOGIN_USER, command)
    if result.exit_code != 0:
      failed.append(name)

  return failed
2076
2077
def _VerifyVersionInstalled(versionstring):
  """Verify that the given version of ganeti is installed on all online nodes.

  Do nothing, if this is the case, otherwise print an appropriate
  message to stderr.

  @param versionstring: the version to check for
  @type versionstring: string
  @rtype: bool
  @return: True, if the version is installed on all online nodes

  """
  # The version is considered installed on a node iff its per-version
  # directory exists there.
  version_dir = os.path.join(pathutils.PKGLIBDIR, versionstring)
  badnodes = _VerifyCommand(["test", "-d", version_dir])
  if badnodes:
    ToStderr("Ganeti version %s not installed on nodes %s"
             % (versionstring, ", ".join(badnodes)))
    return False
  return True
2098
2099
def _GetRunning():
  """Determine the number of currently running jobs.

  @rtype: int
  @return: the number of jobs still running

  """
  cl = GetClient()
  # Count only jobs in the "running" state
  qfilter = qlang.MakeSimpleFilter("status",
                                   frozenset([constants.JOB_STATUS_RUNNING]))
  return len(cl.Query(constants.QR_JOB, [], qfilter).data)
2111
2112
def _SetGanetiVersionAndEnsure(versionstring):
  """Symlink the active version of ganeti to the given versionstring,
  and run the ensure-dirs script.

  @type versionstring: string
  @rtype: list
  @return: the list of nodes where the version change failed

  """
  lib_target = os.path.join(pathutils.PKGLIBDIR, versionstring)
  lib_link = os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")
  share_target = os.path.join(pathutils.SHAREDIR, versionstring)
  share_link = os.path.join(pathutils.SYSCONFDIR, "ganeti/share")

  # Update symlinks to point at the new version.
  if constants.HAS_GNU_LN:
    # GNU ln's -T replaces an existing symlink in place.
    cmds = [
      ["ln", "-s", "-f", "-T", lib_target, lib_link],
      ["ln", "-s", "-f", "-T", share_target, share_link],
    ]
  else:
    # Without GNU ln, remove the old links before recreating them.
    cmds = [
      ["rm", "-f", lib_link],
      ["ln", "-s", "-f", lib_target, lib_link],
      ["rm", "-f", share_link],
      ["ln", "-s", "-f", share_target, share_link],
    ]

  # Run the ensure-dirs script to verify the new version is OK.
  cmds.append([pathutils.ENSURE_DIRS])

  # Submit all commands to ssh, exiting on the first failure.
  # The command string is a single argument that's given to ssh to submit to
  # the remote shell, so it only needs enough escaping to satisfy the remote
  # shell, rather than the 2 levels of escaping usually required when using
  # ssh from the commandline.
  quoted_cmds = [utils.text.ShellQuoteArgs(c) for c in cmds]
  failed = _VerifyCommandRaw(" && ".join(quoted_cmds))
  return list(set(failed))
2159
2160
2161 def _ExecuteCommands(fns):
2162 """Execute a list of functions, in reverse order.
2163
2164 @type fns: list of functions.
2165 @param fns: the functions to be executed.
2166
2167 """
2168 for fn in reversed(fns):
2169 fn()
2170
2171
def _GetConfigVersion():
  """Determine the version the configuration file currently has.

  @rtype: tuple or None
  @return: (major, minor, revision) if the version can be determined,
      None otherwise

  """
  raw_config = utils.ReadFile(pathutils.CLUSTER_CONF_FILE)
  config_data = serializer.LoadJson(raw_config)
  if "version" not in config_data:
    # Configuration without a version field cannot be interpreted
    return None
  return utils.SplitVersion(config_data["version"])
2186
2187
def _ReadIntentToUpgrade():
  """Read the file documenting the intent to upgrade the cluster.

  @rtype: (string, string) or (None, None)
  @return: (old version, version to upgrade to), if the file exists,
      and (None, None) otherwise.

  """
  if not os.path.isfile(pathutils.INTENT_TO_UPGRADE):
    return (None, None)

  raw = utils.ReadFile(pathutils.INTENT_TO_UPGRADE)
  parts = utils.UnescapeAndSplit(raw)
  # The file holds exactly three fields: old version, target version, pid
  if len(parts) != 3:
    # file syntactically mal-formed
    return (None, None)
  return (parts[0], parts[1])
2205
2206
def _WriteIntentToUpgrade(version):
  """Write file documenting the intent to upgrade the cluster.

  @type version: string
  @param version: the version we intent to upgrade to

  """
  # Record current version, target version and our pid, escaped and joined
  content = utils.EscapeAndJoin(
    [constants.RELEASE_VERSION, version, "%d" % os.getpid()])
  utils.WriteFile(pathutils.INTENT_TO_UPGRADE, data=content)
2217
2218
def _UpgradeBeforeConfigurationChange(versionstring):
  """
  Carry out all the tasks necessary for an upgrade that happen before
  the configuration file, or Ganeti version, changes.

  Rollback actions are accumulated as the steps succeed, so the caller
  can undo exactly the steps that were completed.

  @type versionstring: string
  @param versionstring: the version to upgrade to
  @rtype: (bool, list)
  @return: tuple of a bool indicating success and a list of rollback tasks

  """
  rollback = []

  ToStdoutAndLoginfo("Verifying %s present on all nodes", versionstring)
  if not _VerifyVersionInstalled(versionstring):
    return (False, rollback)

  # Record the intent to upgrade so an interrupted run can be resumed
  _WriteIntentToUpgrade(versionstring)
  rollback.append(
    lambda: utils.RunCmd(["rm", "-f", pathutils.INTENT_TO_UPGRADE]))

  ToStdoutAndLoginfo("Draining queue")
  client = GetClient()
  client.SetQueueDrainFlag(True)

  rollback.append(lambda: GetClient().SetQueueDrainFlag(False))

  # Wait for running jobs to finish; a truthy result here indicates the
  # queue did not empty within the timeout (NOTE(review): confirm
  # SimpleRetry's exact return-value semantics against lib/utils)
  if utils.SimpleRetry(0, _GetRunning,
                       constants.UPGRADE_QUEUE_POLL_INTERVAL,
                       constants.UPGRADE_QUEUE_DRAIN_TIMEOUT):
    ToStderr("Failed to completely empty the queue.")
    return (False, rollback)

  ToStdoutAndLoginfo("Pausing the watcher for one hour.")
  rollback.append(lambda: GetClient().SetWatcherPause(None))
  GetClient().SetWatcherPause(time.time() + 60 * 60)

  ToStdoutAndLoginfo("Stopping daemons on master node.")
  if not _RunCommandAndReport([pathutils.DAEMON_UTIL, "stop-all"]):
    return (False, rollback)

  ToStdoutAndLoginfo("Stopping daemons everywhere.")
  rollback.append(lambda: _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
  if badnodes:
    ToStderr("Failed to stop daemons on %s." % (", ".join(badnodes),))
    return (False, rollback)

  backuptar = os.path.join(pathutils.BACKUP_DIR, "ganeti%d.tar" % time.time())
  ToStdoutAndLoginfo("Backing up configuration as %s", backuptar)
  if not _RunCommandAndReport(["mkdir", "-p", pathutils.BACKUP_DIR]):
    return (False, rollback)

  # Create the archive in a safe manner, as it contains sensitive
  # information.
  (_, tmp_name) = tempfile.mkstemp(prefix=backuptar, dir=pathutils.BACKUP_DIR)
  if not _RunCommandAndReport(["tar", "-cf", tmp_name,
                               "--exclude=queue/archive",
                               pathutils.DATA_DIR]):
    return (False, rollback)

  os.rename(tmp_name, backuptar)
  return (True, rollback)
2282
2283
def _VersionSpecificDowngrade():
  """
  Perform any additional downgrade tasks that are version specific
  and need to be done just after the configuration downgrade. This
  function needs to be idempotent, so that it can be redone if the
  downgrade procedure gets interrupted after changing the
  configuration.

  Note that this function has to be reset with every version bump.

  @return: True upon success
  """
  ToStdoutAndLoginfo("Performing version-specific downgrade tasks.")

  # Nothing version specific is needed for the current version pair.
  return True
2299
2300
def _SwitchVersionAndConfig(versionstring, downgrade):
  """
  Switch to the new Ganeti version and change the configuration,
  in correct order.

  @type versionstring: string
  @param versionstring: the version to change to
  @type downgrade: bool
  @param downgrade: True, if the configuration should be downgraded
  @rtype: (bool, list)
  @return: tuple of a bool indicating success, and a list of
      additional rollback tasks

  """
  rollback = []
  if downgrade:
    ToStdoutAndLoginfo("Downgrading configuration")
    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "--downgrade", "-f"]):
      return (False, rollback)
    # Note: version specific downgrades need to be done before switching
    # binaries, so that we still have the knowledgeable binary if the downgrade
    # process gets interrupted at this point.
    if not _VersionSpecificDowngrade():
      return (False, rollback)

  # Configuration change is the point of no return. From then onwards, it is
  # safer to push through the up/downgrade than to try to roll it back.

  ToStdoutAndLoginfo("Switching to version %s on all nodes", versionstring)
  rollback.append(lambda: _SetGanetiVersionAndEnsure(constants.DIR_VERSION))
  badnodes = _SetGanetiVersionAndEnsure(versionstring)
  if badnodes:
    ToStderr("Failed to switch to Ganeti version %s on nodes %s"
             % (versionstring, ", ".join(badnodes)))
    # On downgrades we deliberately push on despite failed nodes
    if not downgrade:
      return (False, rollback)

  # Now that we have changed to the new version of Ganeti we should
  # not communicate over luxi any more, as luxi might have changed in
  # incompatible ways. Therefore, manually call the corresponding ganeti
  # commands using their canonical (version independent) path.

  if not downgrade:
    ToStdoutAndLoginfo("Upgrading configuration")
    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "-f"]):
      return (False, rollback)

  return (True, rollback)
2349
2350
def _UpgradeAfterConfigurationChange(oldversion):
  """
  Carry out the upgrade actions necessary after switching to the new
  Ganeti version and updating the configuration.

  As this part is run at a time where the new version of Ganeti is already
  running, no communication should happen via luxi, as this is not a stable
  interface. Also, as the configuration change is the point of no return,
  all actions are pushed through, even if some of them fail.

  @param oldversion: the version the upgrade started from
  @type oldversion: string
  @rtype: int
  @return: the intended return value

  """
  returnvalue = 0

  ToStdoutAndLoginfo("Starting daemons everywhere.")
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
  if badnodes:
    ToStderr("Warning: failed to start daemons on %s." % (", ".join(badnodes),))
    returnvalue = 1

  ToStdoutAndLoginfo("Redistributing the configuration.")
  if not _RunCommandAndReport(["gnt-cluster", "redist-conf", "--yes-do-it"]):
    returnvalue = 1

  ToStdoutAndLoginfo("Restarting daemons everywhere.")
  # Failures from both the stop and the start phase are merged and
  # reported together below.
  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
  badnodes.extend(_VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
  if badnodes:
    ToStderr("Warning: failed to start daemons on %s." %
             (", ".join(list(set(badnodes))),))
    returnvalue = 1

  ToStdoutAndLoginfo("Undraining the queue.")
  if not _RunCommandAndReport(["gnt-cluster", "queue", "undrain"]):
    returnvalue = 1

  # Best-effort removal of the intent-to-upgrade marker; failure here does
  # not affect the return value.
  _RunCommandAndReport(["rm", "-f", pathutils.INTENT_TO_UPGRADE])

  ToStdoutAndLoginfo("Running post-upgrade hooks")
  if not _RunCommandAndReport([pathutils.POST_UPGRADE, oldversion]):
    returnvalue = 1

  ToStdoutAndLoginfo("Unpausing the watcher.")
  if not _RunCommandAndReport(["gnt-cluster", "watcher", "continue"]):
    returnvalue = 1

  ToStdoutAndLoginfo("Verifying cluster.")
  if not _RunCommandAndReport(["gnt-cluster", "verify"]):
    returnvalue = 1

  return returnvalue
2406
2407
def UpgradeGanetiCommand(opts, args):
  """Upgrade a cluster to a new ganeti version.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should be an empty list
  @rtype: int
  @return: the desired exit code

  """
  # --to and --resume are mutually exclusive, and one is required
  if ((not opts.resume and opts.to is None)
      or (opts.resume and opts.to is not None)):
    ToStderr("Precisely one of the options --to and --resume"
             " has to be given")
    return 1

  # If we're not told to resume, verify there is no upgrade
  # in progress.
  if not opts.resume:
    oldversion, versionstring = _ReadIntentToUpgrade()
    if versionstring is not None:
      # An upgrade is going on; verify whether the target matches
      if versionstring == opts.to:
        ToStderr("An upgrade is already in progress. Target version matches,"
                 " resuming.")
        opts.resume = True
        opts.to = None
      else:
        ToStderr("An upgrade from %s to %s is in progress; use --resume to"
                 " finish it first" % (oldversion, versionstring))
        return 1

  utils.SetupLogging(pathutils.LOG_COMMANDS, 'gnt-cluster upgrade', debug=1)

  oldversion = constants.RELEASE_VERSION

  if opts.resume:
    ssconf.CheckMaster(False)
    oldversion, versionstring = _ReadIntentToUpgrade()
    if versionstring is None:
      # No intent file: nothing to resume
      return 0
    version = utils.version.ParseVersion(versionstring)
    if version is None:
      return 1
    configversion = _GetConfigVersion()
    if configversion is None:
      return 1
    # If the upgrade we resume was an upgrade between compatible
    # versions (like 2.10.0 to 2.10.1), the correct configversion
    # does not guarantee that the config has been updated.
    # However, in the case of a compatible update with the configuration
    # not touched, we are running a different dirversion with the same
    # config version.
    config_already_modified = \
      (utils.IsCorrectConfigVersion(version, configversion) and
       not (versionstring != constants.DIR_VERSION and
            configversion == (constants.CONFIG_MAJOR, constants.CONFIG_MINOR,
                              constants.CONFIG_REVISION)))
    if not config_already_modified:
      # We have to start from the beginning; however, some daemons might have
      # already been stopped, so the only way to get into a well-defined state
      # is by starting all daemons again.
      _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
  else:
    versionstring = opts.to
    config_already_modified = False
    version = utils.version.ParseVersion(versionstring)
    if version is None:
      ToStderr("Could not parse version string %s" % versionstring)
      return 1

  msg = utils.version.UpgradeRange(version)
  if msg is not None:
    ToStderr("Cannot upgrade to %s: %s" % (versionstring, msg))
    return 1

  # Pre-configuration-change steps collect rollback actions; they are only
  # needed if the configuration was not already modified by an earlier run.
  if not config_already_modified:
    success, rollback = _UpgradeBeforeConfigurationChange(versionstring)
    if not success:
      _ExecuteCommands(rollback)
      return 1
  else:
    rollback = []

  downgrade = utils.version.ShouldCfgdowngrade(version)

  success, additionalrollback = \
    _SwitchVersionAndConfig(versionstring, downgrade)
  if not success:
    rollback.extend(additionalrollback)
    _ExecuteCommands(rollback)
    return 1

  return _UpgradeAfterConfigurationChange(oldversion)
2502
2503
#: Dictionary mapping each gnt-cluster subcommand name to its definition
#: tuple: (handler function, argument specification, supported options,
#: usage synopsis, description).  Consumed by GenericMain via Main().
commands = {
  "init": (
    InitCluster, [ArgHost(min=1, max=1)],
    [BACKEND_OPT, CP_SIZE_OPT, ENABLED_HV_OPT, GLOBAL_FILEDIR_OPT,
     HVLIST_OPT, MAC_PREFIX_OPT, MASTER_NETDEV_OPT, MASTER_NETMASK_OPT,
     NIC_PARAMS_OPT, NOMODIFY_ETCHOSTS_OPT, NOMODIFY_SSH_SETUP_OPT,
     SECONDARY_IP_OPT, VG_NAME_OPT, MAINTAIN_NODE_HEALTH_OPT, UIDPOOL_OPT,
     DRBD_HELPER_OPT, DEFAULT_IALLOCATOR_OPT, DEFAULT_IALLOCATOR_PARAMS_OPT,
     PRIMARY_IP_VERSION_OPT, PREALLOC_WIPE_DISKS_OPT, NODE_PARAMS_OPT,
     GLOBAL_SHARED_FILEDIR_OPT, USE_EXTERNAL_MIP_SCRIPT, DISK_PARAMS_OPT,
     HV_STATE_OPT, DISK_STATE_OPT, ENABLED_DISK_TEMPLATES_OPT,
     IPOLICY_STD_SPECS_OPT, GLOBAL_GLUSTER_FILEDIR_OPT, INSTALL_IMAGE_OPT,
     ZEROING_IMAGE_OPT, COMPRESSION_TOOLS_OPT,
     ENABLED_USER_SHUTDOWN_OPT, SSH_KEY_BITS_OPT, SSH_KEY_TYPE_OPT,
     ENABLED_PREDICTIVE_QUEUE_OPT,
    ]
    + INSTANCE_POLICY_OPTS + SPLIT_ISPECS_OPTS,
    "[opts...] <cluster_name>", "Initialises a new cluster configuration"),
  "destroy": (
    DestroyCluster, ARGS_NONE, [YES_DOIT_OPT],
    "", "Destroy cluster"),
  "rename": (
    RenameCluster, [ArgHost(min=1, max=1)],
    [FORCE_OPT, DRY_RUN_OPT],
    "<new_name>",
    "Renames the cluster"),
  "redist-conf": (
    RedistributeConfig, ARGS_NONE, SUBMIT_OPTS +
    [DRY_RUN_OPT, PRIORITY_OPT, FORCE_DISTRIBUTION],
    "", "Forces a push of the configuration file and ssconf files"
    " to the nodes in the cluster"),
  "verify": (
    VerifyCluster, ARGS_NONE,
    [VERBOSE_OPT, DEBUG_SIMERR_OPT, ERROR_CODES_OPT, NONPLUS1_OPT,
     PRIORITY_OPT, NODEGROUP_OPT, IGNORE_ERRORS_OPT, VERIFY_CLUTTER_OPT],
    "", "Does a check on the cluster configuration"),
  "verify-disks": (
    VerifyDisks, ARGS_NONE, [PRIORITY_OPT, NODEGROUP_OPT, STRICT_OPT],
    "", "Does a check on the cluster disk status"),
  "repair-disk-sizes": (
    RepairDiskSizes, ARGS_MANY_INSTANCES, [DRY_RUN_OPT, PRIORITY_OPT],
    "[instance...]", "Updates mismatches in recorded disk sizes"),
  "master-failover": (
    MasterFailover, ARGS_NONE,
    [NOVOTING_OPT, FORCE_FAILOVER, IGNORE_OFFLINE_NODES_FAILOVER],
    "", "Makes the current node the master"),
  "master-ping": (
    MasterPing, ARGS_NONE, [],
    "", "Checks if the master is alive"),
  "version": (
    ShowClusterVersion, ARGS_NONE, [],
    "", "Shows the cluster version"),
  "getmaster": (
    ShowClusterMaster, ARGS_NONE, [],
    "", "Shows the cluster master"),
  "copyfile": (
    ClusterCopyFile, [ArgFile(min=1, max=1)],
    [NODE_LIST_OPT, USE_REPL_NET_OPT, NODEGROUP_OPT],
    "[-n node...] <filename>", "Copies a file to all (or only some) nodes"),
  "command": (
    RunClusterCommand, [ArgCommand(min=1)],
    [NODE_LIST_OPT, NODEGROUP_OPT, SHOW_MACHINE_OPT, FAILURE_ONLY_OPT],
    "[-n node...] <command>", "Runs a command on all (or only some) nodes"),
  "info": (
    ShowClusterConfig, ARGS_NONE, [ROMAN_OPT],
    "[--roman]", "Show cluster configuration"),
  "list-tags": (
    ListTags, ARGS_NONE, [], "", "List the tags of the cluster"),
  "add-tags": (
    AddTags, [ArgUnknown()], [TAG_SRC_OPT, PRIORITY_OPT] + SUBMIT_OPTS,
    "tag...", "Add tags to the cluster"),
  "remove-tags": (
    RemoveTags, [ArgUnknown()], [TAG_SRC_OPT, PRIORITY_OPT] + SUBMIT_OPTS,
    "tag...", "Remove tags from the cluster"),
  "search-tags": (
    SearchTags, [ArgUnknown(min=1, max=1)], [PRIORITY_OPT], "",
    "Searches the tags on all objects on"
    " the cluster for a given pattern (regex)"),
  "queue": (
    QueueOps,
    [ArgChoice(min=1, max=1, choices=["drain", "undrain", "info"])],
    [], "drain|undrain|info", "Change queue properties"),
  "watcher": (
    WatcherOps,
    [ArgChoice(min=1, max=1, choices=["pause", "continue", "info"]),
     ArgSuggest(min=0, max=1, choices=["30m", "1h", "4h"])],
    [],
    "{pause <timespec>|continue|info}", "Change watcher properties"),
  "modify": (
    SetClusterParams, ARGS_NONE,
    [FORCE_OPT,
     BACKEND_OPT, CP_SIZE_OPT, RQL_OPT, MAX_TRACK_OPT, INSTALL_IMAGE_OPT,
     INSTANCE_COMMUNICATION_NETWORK_OPT, ENABLED_HV_OPT, HVLIST_OPT,
     MAC_PREFIX_OPT, MASTER_NETDEV_OPT, MASTER_NETMASK_OPT, NIC_PARAMS_OPT,
     VG_NAME_OPT, MAINTAIN_NODE_HEALTH_OPT, UIDPOOL_OPT, ADD_UIDS_OPT,
     REMOVE_UIDS_OPT, DRBD_HELPER_OPT, DEFAULT_IALLOCATOR_OPT,
     DEFAULT_IALLOCATOR_PARAMS_OPT, RESERVED_LVS_OPT, DRY_RUN_OPT, PRIORITY_OPT,
     PREALLOC_WIPE_DISKS_OPT, NODE_PARAMS_OPT, USE_EXTERNAL_MIP_SCRIPT,
     DISK_PARAMS_OPT, HV_STATE_OPT, DISK_STATE_OPT] + SUBMIT_OPTS +
    [ENABLED_DISK_TEMPLATES_OPT, IPOLICY_STD_SPECS_OPT, MODIFY_ETCHOSTS_OPT,
     MODIFY_SSH_SETUP_OPT, ENABLED_USER_SHUTDOWN_OPT,
     ENABLED_PREDICTIVE_QUEUE_OPT] +
    INSTANCE_POLICY_OPTS +
    [GLOBAL_FILEDIR_OPT, GLOBAL_SHARED_FILEDIR_OPT, ZEROING_IMAGE_OPT,
     COMPRESSION_TOOLS_OPT] +
    [ENABLED_DATA_COLLECTORS_OPT, DATA_COLLECTOR_INTERVAL_OPT,
     DIAGNOSE_DATA_COLLECTOR_FILENAME_OPT,
     MAINT_INTERVAL_OPT, MAINT_BALANCE_OPT, MAINT_BALANCE_THRESHOLD_OPT],
    "[opts...]",
    "Alters the parameters of the cluster"),
  "renew-crypto": (
    RenewCrypto, ARGS_NONE,
    [NEW_CLUSTER_CERT_OPT, NEW_RAPI_CERT_OPT, RAPI_CERT_OPT,
     NEW_CONFD_HMAC_KEY_OPT, FORCE_OPT,
     NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT,
     NEW_SPICE_CERT_OPT, SPICE_CERT_OPT, SPICE_CACERT_OPT,
     NEW_NODE_CERT_OPT, NEW_SSH_KEY_OPT, NOSSH_KEYCHECK_OPT,
     VERBOSE_OPT, SSH_KEY_BITS_OPT, SSH_KEY_TYPE_OPT],
    "[opts...]",
    "Renews cluster certificates, keys and secrets"),
  "epo": (
    Epo, [ArgUnknown()],
    [FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT,
     SHUTDOWN_TIMEOUT_OPT, POWER_DELAY_OPT],
    "[opts...] [args]",
    "Performs an emergency power-off on given args"),
  "activate-master-ip": (
    ActivateMasterIp, ARGS_NONE, [], "", "Activates the master IP"),
  "deactivate-master-ip": (
    DeactivateMasterIp, ARGS_NONE, [CONFIRM_OPT], "",
    "Deactivates the master IP"),
  "show-ispecs-cmd": (
    ShowCreateCommand, ARGS_NONE, [], "",
    "Show the command line to re-create the cluster"),
  "upgrade": (
    UpgradeGanetiCommand, ARGS_NONE, [TO_OPT, RESUME_OPT], "",
    "Upgrade (or downgrade) to a new Ganeti version"),
  "remove-repair": (
    RemoveRepair, [ArgUnknown()], [], "<uuid>",
    "Remove a repair event from the list of pending events"),
  }
2645
2646
#: dictionary with aliases for commands (alternative spellings accepted
#: on the command line and mapped to the canonical names in `commands`)
aliases = {
  "masterfailover": "master-failover",
  "show": "info",
}
2652
2653
def Main():
  """Entry point of the gnt-cluster tool.

  @rtype: int
  @return: the exit code of GenericMain

  """
  return GenericMain(commands, override={"tag_type": constants.TAG_CLUSTER},
                     aliases=aliases)