#
#

# Copyright (C) 2006, 2007, 2008, 2010, 2011, 2012 Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""Functions to bootstrap a new cluster.

"""

import os
import os.path
import re
import logging
import time
import tempfile

from ganeti.cmdlib import cluster
import ganeti.rpc.node as rpc
from ganeti import ssh
from ganeti import utils
from ganeti import errors
from ganeti import config
from ganeti import constants
from ganeti import objects
from ganeti import ssconf
from ganeti import serializer
from ganeti import hypervisor
from ganeti.storage import drbd
from ganeti.storage import filestorage
from ganeti import netutils
from ganeti import luxi
from ganeti import jstore
from ganeti import pathutils
from ganeti import runtime
from ganeti import vcluster


# ec_id for InitConfig's temporary reservation manager
_INITCONF_ECID = "initconfig-ecid"

#: Time (in seconds) within which a daemon must become responsive
_DAEMON_READY_TIMEOUT = 10.0


def _InitSSHSetup():
  """Setup the SSH configuration for the cluster.

  This generates a DSA keypair for root, adds the public key to the
  list of authorized keys and adds the host key to its own known hosts.

  """
  priv_key, pub_key, auth_keys = ssh.GetUserFiles(constants.SSH_LOGIN_USER)

  for name in priv_key, pub_key:
    if os.path.exists(name):
      utils.CreateBackup(name)
    utils.RemoveFile(name)

  result = utils.RunCmd(["ssh-keygen", "-t", "dsa",
                         "-f", priv_key,
                         "-q", "-N", ""])
  if result.failed:
    raise errors.OpExecError("Could not generate ssh keypair, error %s" %
                             result.output)

  utils.AddAuthorizedKey(auth_keys, utils.ReadFile(pub_key))


def GenerateHmacKey(file_name):
  """Writes a new HMAC key.

  @type file_name: str
  @param file_name: Path to output file

  """
  utils.WriteFile(file_name, data="%s\n" % utils.GenerateSecret(), mode=0400,
                  backup=True)

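# A minimal usage sketch for GenerateHmacKey (illustrative only; the path is
# a hypothetical example, not a Ganeti default):
#
#   GenerateHmacKey("/tmp/example-hmac.key")
#   # -> writes a freshly generated secret plus a newline with mode 0400,
#   #    backing up any pre-existing file at that path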

# pylint: disable=R0913
def GenerateClusterCrypto(new_cluster_cert, new_rapi_cert, new_spice_cert,
                          new_confd_hmac_key, new_cds, new_client_cert,
                          master_name,
                          rapi_cert_pem=None, spice_cert_pem=None,
                          spice_cacert_pem=None, cds=None,
                          nodecert_file=pathutils.NODED_CERT_FILE,
                          clientcert_file=pathutils.NODED_CLIENT_CERT_FILE,
                          rapicert_file=pathutils.RAPI_CERT_FILE,
                          spicecert_file=pathutils.SPICE_CERT_FILE,
                          spicecacert_file=pathutils.SPICE_CACERT_FILE,
                          hmackey_file=pathutils.CONFD_HMAC_KEY,
                          cds_file=pathutils.CLUSTER_DOMAIN_SECRET_FILE):
  """Updates the cluster certificates, keys and secrets.

  @type new_cluster_cert: bool
  @param new_cluster_cert: Whether to generate a new cluster certificate
  @type new_rapi_cert: bool
  @param new_rapi_cert: Whether to generate a new RAPI certificate
  @type new_spice_cert: bool
  @param new_spice_cert: Whether to generate a new SPICE certificate
  @type new_confd_hmac_key: bool
  @param new_confd_hmac_key: Whether to generate a new HMAC key
  @type new_cds: bool
  @param new_cds: Whether to generate a new cluster domain secret
  @type new_client_cert: bool
  @param new_client_cert: Whether to generate a new client certificate
  @type master_name: string
  @param master_name: FQDN of the master node
  @type rapi_cert_pem: string
  @param rapi_cert_pem: New RAPI certificate in PEM format
  @type spice_cert_pem: string
  @param spice_cert_pem: New SPICE certificate in PEM format
  @type spice_cacert_pem: string
  @param spice_cacert_pem: Certificate of the CA that signed the SPICE
    certificate, in PEM format
  @type cds: string
  @param cds: New cluster domain secret
  @type nodecert_file: string
  @param nodecert_file: optional override of the node cert file path
  @type rapicert_file: string
  @param rapicert_file: optional override of the rapi cert file path
  @type spicecert_file: string
  @param spicecert_file: optional override of the spice cert file path
  @type spicecacert_file: string
  @param spicecacert_file: optional override of the spice CA cert file path
  @type hmackey_file: string
  @param hmackey_file: optional override of the hmac key file path

  """
  # pylint: disable=R0913
  # noded SSL certificate
  utils.GenerateNewSslCert(
    new_cluster_cert, nodecert_file, 1,
    "Generating new cluster certificate at %s" % nodecert_file)

  # If the cluster certificate was renewed, the client cert has to be
  # renewed and resigned.
  if new_cluster_cert or new_client_cert:
    utils.GenerateNewClientSslCert(clientcert_file, nodecert_file,
                                   master_name)

  # confd HMAC key
  if new_confd_hmac_key or not os.path.exists(hmackey_file):
    logging.debug("Writing new confd HMAC key to %s", hmackey_file)
    GenerateHmacKey(hmackey_file)

  if rapi_cert_pem:
    # Assume rapi_pem contains a valid PEM-formatted certificate and key
    logging.debug("Writing RAPI certificate at %s", rapicert_file)
    utils.WriteFile(rapicert_file, data=rapi_cert_pem, backup=True)

  else:
    utils.GenerateNewSslCert(
      new_rapi_cert, rapicert_file, 1,
      "Generating new RAPI certificate at %s" % rapicert_file)

  # SPICE
  spice_cert_exists = os.path.exists(spicecert_file)
  spice_cacert_exists = os.path.exists(spicecacert_file)
  if spice_cert_pem:
    # spice_cert_pem implies also spice_cacert_pem
    logging.debug("Writing SPICE certificate at %s", spicecert_file)
    utils.WriteFile(spicecert_file, data=spice_cert_pem, backup=True)
    logging.debug("Writing SPICE CA certificate at %s", spicecacert_file)
    utils.WriteFile(spicecacert_file, data=spice_cacert_pem, backup=True)
  elif new_spice_cert or not spice_cert_exists:
    if spice_cert_exists:
      utils.CreateBackup(spicecert_file)
    if spice_cacert_exists:
      utils.CreateBackup(spicecacert_file)

    logging.debug("Generating new self-signed SPICE certificate at %s",
                  spicecert_file)
    (_, cert_pem) = utils.GenerateSelfSignedSslCert(spicecert_file, 1)

    # Self-signed certificate -> the public certificate is also the CA public
    # certificate
    logging.debug("Writing the public certificate to %s",
                  spicecacert_file)
    utils.io.WriteFile(spicecacert_file, mode=0400, data=cert_pem)

  # Cluster domain secret
  if cds:
    logging.debug("Writing cluster domain secret to %s", cds_file)
    utils.WriteFile(cds_file, data=cds, backup=True)

  elif new_cds or not os.path.exists(cds_file):
    logging.debug("Generating new cluster domain secret at %s", cds_file)
    GenerateHmacKey(cds_file)

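# A hedged sketch of how GenerateClusterCrypto could be called with keyword
# arguments; this mirrors the positional call in _InitGanetiServerSetup
# below (the master name is a hypothetical example):
#
#   GenerateClusterCrypto(new_cluster_cert=True, new_rapi_cert=False,
#                         new_spice_cert=False, new_confd_hmac_key=False,
#                         new_cds=False, new_client_cert=False,
#                         master_name="master.example.com")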

def _InitGanetiServerSetup(master_name):
  """Setup the necessary configuration for the initial node daemon.

  This creates the nodepass file containing the shared password for
  the cluster, generates the SSL certificate and starts the node daemon.

  @type master_name: str
  @param master_name: Name of the master node

  """
  # Generate cluster secrets
  GenerateClusterCrypto(True, False, False, False, False, False, master_name)

  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.NODED])
  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForNodeDaemon(master_name)


def _WaitForNodeDaemon(node_name):
  """Wait for node daemon to become responsive.

  """
  def _CheckNodeDaemon():
    # Pylint bug <http://www.logilab.org/ticket/35642>
    # pylint: disable=E1101
    result = rpc.BootstrapRunner().call_version([node_name])[node_name]
    if result.fail_msg:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckNodeDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Node daemon on %s didn't answer queries within"
                             " %s seconds" %
                             (node_name, _DAEMON_READY_TIMEOUT))

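# The _WaitFor*Daemon helpers all follow the same utils.Retry pattern: the
# check function raises utils.RetryAgain while the condition does not hold,
# and utils.Retry polls it once per second until _DAEMON_READY_TIMEOUT
# expires. A generic sketch (condition_holds is a hypothetical placeholder):
#
#   def _CheckCondition():
#     if not condition_holds():
#       raise utils.RetryAgain()
#
#   try:
#     utils.Retry(_CheckCondition, 1.0, _DAEMON_READY_TIMEOUT)
#   except utils.RetryTimeout:
#     raise errors.OpExecError("Condition not met within %s seconds" %
#                              _DAEMON_READY_TIMEOUT)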

def _WaitForMasterDaemon():
  """Wait for master daemon to become responsive.

  """
  def _CheckMasterDaemon():
    try:
      cl = luxi.Client()
      (cluster_name, ) = cl.QueryConfigValues(["cluster_name"])
    except Exception:
      raise utils.RetryAgain()

    logging.debug("Received cluster name %s from master", cluster_name)

  try:
    utils.Retry(_CheckMasterDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("Master daemon didn't answer queries within"
                             " %s seconds" % _DAEMON_READY_TIMEOUT)


def _WaitForSshDaemon(hostname, port):
  """Wait for SSH daemon to become responsive.

  """
  family = ssconf.SimpleStore().GetPrimaryIPFamily()
  hostip = netutils.GetHostname(name=hostname, family=family).ip

  def _CheckSshDaemon():
    if netutils.TcpPing(hostip, port, timeout=1.0, live_port_needed=True):
      logging.debug("SSH daemon on %s:%s (IP address %s) has become"
                    " responsive", hostname, port, hostip)
    else:
      raise utils.RetryAgain()

  try:
    utils.Retry(_CheckSshDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    raise errors.OpExecError("SSH daemon on %s:%s (IP address %s) didn't"
                             " become responsive within %s seconds" %
                             (hostname, port, hostip, _DAEMON_READY_TIMEOUT))


def RunNodeSetupCmd(cluster_name, node, basecmd, debug, verbose,
                    use_cluster_key, ask_key, strict_host_check,
                    port, data):
  """Runs a command to configure something on a remote machine.

  @type cluster_name: string
  @param cluster_name: Cluster name
  @type node: string
  @param node: Node name
  @type basecmd: string
  @param basecmd: Base command (path on the remote machine)
  @type debug: bool
  @param debug: Enable debug output
  @type verbose: bool
  @param verbose: Enable verbose output
  @type use_cluster_key: bool
  @param use_cluster_key: See L{ssh.SshRunner.BuildCmd}
  @type ask_key: bool
  @param ask_key: See L{ssh.SshRunner.BuildCmd}
  @type strict_host_check: bool
  @param strict_host_check: See L{ssh.SshRunner.BuildCmd}
  @type port: int
  @param port: The SSH port of the remote machine or None for the default
  @param data: JSON-serializable input data for script (passed to stdin)

  """
  cmd = [basecmd]

  # Pass --debug/--verbose to the external script if set on our invocation
  if debug:
    cmd.append("--debug")

  if verbose:
    cmd.append("--verbose")

  logging.debug("Node setup command: %s", cmd)

  version = constants.DIR_VERSION
  all_cmds = [["test", "-d", os.path.join(pathutils.PKGLIBDIR, version)]]
  if constants.HAS_GNU_LN:
    all_cmds.extend([["ln", "-s", "-f", "-T",
                      os.path.join(pathutils.PKGLIBDIR, version),
                      os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")],
                     ["ln", "-s", "-f", "-T",
                      os.path.join(pathutils.SHAREDIR, version),
                      os.path.join(pathutils.SYSCONFDIR, "ganeti/share")]])
  else:
    all_cmds.extend([["rm", "-f",
                      os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")],
                     ["ln", "-s", "-f",
                      os.path.join(pathutils.PKGLIBDIR, version),
                      os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")],
                     ["rm", "-f",
                      os.path.join(pathutils.SYSCONFDIR, "ganeti/share")],
                     ["ln", "-s", "-f",
                      os.path.join(pathutils.SHAREDIR, version),
                      os.path.join(pathutils.SYSCONFDIR, "ganeti/share")]])
  all_cmds.append(cmd)

  if port is None:
    port = netutils.GetDaemonPort(constants.SSH)

  srun = ssh.SshRunner(cluster_name)
  scmd = srun.BuildCmd(node, constants.SSH_LOGIN_USER,
                       utils.ShellQuoteArgs(
                         utils.ShellCombineCommands(all_cmds)),
                       batch=False, ask_key=ask_key, quiet=False,
                       strict_host_check=strict_host_check,
                       use_cluster_key=use_cluster_key,
                       port=port)

  tempfh = tempfile.TemporaryFile()
  try:
    tempfh.write(serializer.DumpJson(data))
    tempfh.seek(0)

    result = utils.RunCmd(scmd, interactive=True, input_fd=tempfh)
  finally:
    tempfh.close()

  if result.failed:
    raise errors.OpExecError("Command '%s' failed: %s" %
                             (result.cmd, result.fail_reason))

  _WaitForSshDaemon(node, port)

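# Roughly, RunNodeSetupCmd combines all_cmds into a single remote shell
# invocation; on a system with GNU ln this amounts to something like the
# following sequence (paths abbreviated, sketch only):
#
#   test -d $PKGLIBDIR/$VERSION
#   ln -s -f -T $PKGLIBDIR/$VERSION $SYSCONFDIR/ganeti/lib
#   ln -s -f -T $SHAREDIR/$VERSION $SYSCONFDIR/ganeti/share
#   <basecmd> [--debug] [--verbose]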

def _InitFileStorageDir(file_storage_dir):
387 """Initialize if needed the file storage.

  @param file_storage_dir: the user-supplied value
  @return: either empty string (if file storage was disabled at build
    time) or the normalized path to the storage directory

  """
  file_storage_dir = os.path.normpath(file_storage_dir)

  if not os.path.isabs(file_storage_dir):
    raise errors.OpPrereqError("File storage directory '%s' is not an absolute"
                               " path" % file_storage_dir, errors.ECODE_INVAL)

  if not os.path.exists(file_storage_dir):
    try:
      os.makedirs(file_storage_dir, 0750)
    except OSError, err:
      raise errors.OpPrereqError("Cannot create file storage directory"
                                 " '%s': %s" % (file_storage_dir, err),
                                 errors.ECODE_ENVIRON)

  if not os.path.isdir(file_storage_dir):
    raise errors.OpPrereqError("The file storage directory '%s' is not"
                               " a directory." % file_storage_dir,
                               errors.ECODE_ENVIRON)

  return file_storage_dir

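# A hedged usage sketch (the path is a hypothetical example):
#
#   storage_dir = _InitFileStorageDir("/srv/ganeti/file-storage")
#   # creates the directory with mode 0750 if missing and returns the
#   # normalized absolute path; a relative path or an existing
#   # non-directory raises OpPrereqError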

def _PrepareFileBasedStorage(
    enabled_disk_templates, file_storage_dir,
    default_dir, file_disk_template, _storage_path_acceptance_fn,
    init_fn=_InitFileStorageDir, acceptance_fn=None):
420 """Checks if a file-base storage type is enabled and inits the dir.

  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: list of enabled disk templates
  @type file_storage_dir: string
  @param file_storage_dir: the file storage directory
  @type default_dir: string
  @param default_dir: default file storage directory when C{file_storage_dir}
    is 'None'
  @type file_disk_template: string
  @param file_disk_template: a disk template whose storage type is 'ST_FILE',
    'ST_SHARED_FILE' or 'ST_GLUSTER'
  @type _storage_path_acceptance_fn: function
  @param _storage_path_acceptance_fn: checks whether the given file-based
    storage directory is acceptable
  @see: C{cluster.CheckFileBasedStoragePathVsEnabledDiskTemplates} for details

  @rtype: string
  @returns: the name of the actual file storage directory

  """
  assert (file_disk_template in utils.storage.GetDiskTemplatesOfStorageTypes(
      constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER
  ))

  if file_storage_dir is None:
    file_storage_dir = default_dir
  if not acceptance_fn:
    acceptance_fn = \
      lambda path: filestorage.CheckFileStoragePathAcceptance(
        path, exact_match_ok=True)

  _storage_path_acceptance_fn(logging.warning, file_storage_dir,
                              enabled_disk_templates)

  file_storage_enabled = file_disk_template in enabled_disk_templates
  if file_storage_enabled:
    try:
      acceptance_fn(file_storage_dir)
    except errors.FileStoragePathError as e:
      raise errors.OpPrereqError(str(e))
    result_file_storage_dir = init_fn(file_storage_dir)
  else:
    result_file_storage_dir = file_storage_dir
  return result_file_storage_dir


def _PrepareFileStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if file storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_FILE_STORAGE_DIR, constants.DT_FILE,
      cluster.CheckFileStoragePathVsEnabledDiskTemplates,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _PrepareSharedFileStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if shared file storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_SHARED_FILE_STORAGE_DIR, constants.DT_SHARED_FILE,
      cluster.CheckSharedFileStoragePathVsEnabledDiskTemplates,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _PrepareGlusterStorage(
    enabled_disk_templates, file_storage_dir, init_fn=_InitFileStorageDir,
    acceptance_fn=None):
  """Checks if gluster storage is enabled and inits the dir.

  @see: C{_PrepareFileBasedStorage}

  """
  return _PrepareFileBasedStorage(
      enabled_disk_templates, file_storage_dir,
      pathutils.DEFAULT_GLUSTER_STORAGE_DIR, constants.DT_GLUSTER,
      cluster.CheckGlusterStoragePathVsEnabledDiskTemplates,
      init_fn=init_fn, acceptance_fn=acceptance_fn)


def _InitCheckEnabledDiskTemplates(enabled_disk_templates):
  """Checks the sanity of the enabled disk templates.

  """
  if not enabled_disk_templates:
    raise errors.OpPrereqError("Enabled disk templates list must contain at"
                               " least one member", errors.ECODE_INVAL)
  invalid_disk_templates = \
    set(enabled_disk_templates) - constants.DISK_TEMPLATES
  if invalid_disk_templates:
    raise errors.OpPrereqError("Enabled disk templates list contains invalid"
                               " entries: %s" % invalid_disk_templates,
                               errors.ECODE_INVAL)


def _RestrictIpolicyToEnabledDiskTemplates(ipolicy, enabled_disk_templates):
  """Restricts the ipolicy's disk templates to the enabled ones.

  This function removes from the ipolicy's list of allowed disk templates
  those that are not enabled by the cluster.

  @type ipolicy: dict
  @param ipolicy: the instance policy
  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: the list of cluster-wide enabled disk
    templates

  """
  assert constants.IPOLICY_DTS in ipolicy
  allowed_disk_templates = ipolicy[constants.IPOLICY_DTS]
  restricted_disk_templates = list(set(allowed_disk_templates)
                                   .intersection(set(enabled_disk_templates)))
  ipolicy[constants.IPOLICY_DTS] = restricted_disk_templates

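# A worked example of the restriction above (template names illustrative):
#
#   ipolicy = {constants.IPOLICY_DTS: ["drbd", "plain", "file"]}
#   _RestrictIpolicyToEnabledDiskTemplates(ipolicy, ["plain", "drbd"])
#   # ipolicy[constants.IPOLICY_DTS] now holds only "drbd" and "plain", in
#   # arbitrary order since the intersection goes through a set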

def _InitCheckDrbdHelper(drbd_helper, drbd_enabled):
  """Checks the DRBD usermode helper.

  @type drbd_helper: string
  @param drbd_helper: name of the DRBD usermode helper that the system should
    use

  """
  if not drbd_enabled:
    return

  if drbd_helper is not None:
    try:
      curr_helper = drbd.DRBD8.GetUsermodeHelper()
    except errors.BlockDeviceError, err:
      raise errors.OpPrereqError("Error while checking drbd helper"
                                 " (disable drbd with --enabled-disk-templates"
                                 " if you are not using drbd): %s" % str(err),
                                 errors.ECODE_ENVIRON)
    if drbd_helper != curr_helper:
      raise errors.OpPrereqError("Error: requiring %s as drbd helper but %s"
                                 " is the current helper" % (drbd_helper,
                                                             curr_helper),
                                 errors.ECODE_INVAL)


def InitCluster(cluster_name, mac_prefix, # pylint: disable=R0913, R0914
                master_netmask, master_netdev, file_storage_dir,
                shared_file_storage_dir, gluster_storage_dir,
                candidate_pool_size, secondary_ip=None,
                vg_name=None, beparams=None, nicparams=None, ndparams=None,
                hvparams=None, diskparams=None, enabled_hypervisors=None,
                modify_etc_hosts=True, modify_ssh_setup=True,
                maintain_node_health=False, drbd_helper=None, uid_pool=None,
                default_iallocator=None, default_iallocator_params=None,
                primary_ip_version=None, ipolicy=None,
                prealloc_wipe_disks=False, use_external_mip_script=False,
                hv_state=None, disk_state=None, enabled_disk_templates=None,
                install_image=None, zeroing_image=None, compression_tools=None,
                enabled_user_shutdown=False):
  """Initialise the cluster.

  @type candidate_pool_size: int
  @param candidate_pool_size: master candidate pool size

  @type enabled_disk_templates: list of string
  @param enabled_disk_templates: list of disk_templates to be used in this
    cluster

  @type enabled_user_shutdown: bool
  @param enabled_user_shutdown: whether user shutdown is enabled cluster-wide

  """
  # TODO: complete the docstring
  if config.ConfigWriter.IsCluster():
    raise errors.OpPrereqError("Cluster is already initialised",
                               errors.ECODE_STATE)

  data_dir = vcluster.AddNodePrefix(pathutils.DATA_DIR)
  queue_dir = vcluster.AddNodePrefix(pathutils.QUEUE_DIR)
  archive_dir = vcluster.AddNodePrefix(pathutils.JOB_QUEUE_ARCHIVE_DIR)
  for ddir in [queue_dir, data_dir, archive_dir]:
    if os.path.isdir(ddir):
      for entry in os.listdir(ddir):
        if not os.path.isdir(os.path.join(ddir, entry)):
          raise errors.OpPrereqError(
614 "%s contains non-directory enries like %s. Remove left-overs of an"
615 " old cluster before initialising a new one" % (ddir, entry),
616 errors.ECODE_STATE)
617
618 if not enabled_hypervisors:
619 raise errors.OpPrereqError("Enabled hypervisors list must contain at"
620 " least one member", errors.ECODE_INVAL)
621 invalid_hvs = set(enabled_hypervisors) - constants.HYPER_TYPES
622 if invalid_hvs:
623 raise errors.OpPrereqError("Enabled hypervisors contains invalid"
624 " entries: %s" % invalid_hvs,
625 errors.ECODE_INVAL)
626
627 _InitCheckEnabledDiskTemplates(enabled_disk_templates)
628
629 try:
630 ipcls = netutils.IPAddress.GetClassFromIpVersion(primary_ip_version)
631 except errors.ProgrammerError:
632 raise errors.OpPrereqError("Invalid primary ip version: %d." %
633 primary_ip_version, errors.ECODE_INVAL)
634
635 hostname = netutils.GetHostname(family=ipcls.family)
636 if not ipcls.IsValid(hostname.ip):
637 raise errors.OpPrereqError("This host's IP (%s) is not a valid IPv%d"
638 " address." % (hostname.ip, primary_ip_version),
639 errors.ECODE_INVAL)
640
641 if ipcls.IsLoopback(hostname.ip):
642 raise errors.OpPrereqError("This host's IP (%s) resolves to a loopback"
643 " address. Please fix DNS or %s." %
644 (hostname.ip, pathutils.ETC_HOSTS),
645 errors.ECODE_ENVIRON)
646
647 if not ipcls.Own(hostname.ip):
648 raise errors.OpPrereqError("Inconsistency: this host's name resolves"
649 " to %s,\nbut this ip address does not"
650 " belong to this host" %
651 hostname.ip, errors.ECODE_ENVIRON)
652
653 clustername = netutils.GetHostname(name=cluster_name, family=ipcls.family)
654
655 if netutils.TcpPing(clustername.ip, constants.DEFAULT_NODED_PORT, timeout=5):
656 raise errors.OpPrereqError("Cluster IP already active",
657 errors.ECODE_NOTUNIQUE)
658
659 if not secondary_ip:
660 if primary_ip_version == constants.IP6_VERSION:
      raise errors.OpPrereqError("When using an IPv6 primary address, a valid"
662 " IPv4 address must be given as secondary",
663 errors.ECODE_INVAL)
664 secondary_ip = hostname.ip
665
666 if not netutils.IP4Address.IsValid(secondary_ip):
667 raise errors.OpPrereqError("Secondary IP address (%s) has to be a valid"
668 " IPv4 address." % secondary_ip,
669 errors.ECODE_INVAL)
670
671 if not netutils.IP4Address.Own(secondary_ip):
672 raise errors.OpPrereqError("You gave %s as secondary IP,"
673 " but it does not belong to this host." %
674 secondary_ip, errors.ECODE_ENVIRON)
675
676 if master_netmask is not None:
677 if not ipcls.ValidateNetmask(master_netmask):
678 raise errors.OpPrereqError("CIDR netmask (%s) not valid for IPv%s " %
679 (master_netmask, primary_ip_version),
680 errors.ECODE_INVAL)
681 else:
682 master_netmask = ipcls.iplen
683
684 if vg_name:
685 # Check if volume group is valid
686 vgstatus = utils.CheckVolumeGroupSize(utils.ListVolumeGroups(), vg_name,
687 constants.MIN_VG_SIZE)
688 if vgstatus:
689 raise errors.OpPrereqError("Error: %s" % vgstatus, errors.ECODE_INVAL)
690
691 drbd_enabled = constants.DT_DRBD8 in enabled_disk_templates
692 _InitCheckDrbdHelper(drbd_helper, drbd_enabled)
693
694 logging.debug("Stopping daemons (if any are running)")
695 result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-all"])
696 if result.failed:
697 raise errors.OpExecError("Could not stop daemons, command %s"
698 " had exitcode %s and error '%s'" %
699 (result.cmd, result.exit_code, result.output))
700
701 file_storage_dir = _PrepareFileStorage(enabled_disk_templates,
702 file_storage_dir)
703 shared_file_storage_dir = _PrepareSharedFileStorage(enabled_disk_templates,
704 shared_file_storage_dir)
705 gluster_storage_dir = _PrepareGlusterStorage(enabled_disk_templates,
706 gluster_storage_dir)
707
708 if not re.match("^[0-9a-z]{2}:[0-9a-z]{2}:[0-9a-z]{2}$", mac_prefix):
709 raise errors.OpPrereqError("Invalid mac prefix given '%s'" % mac_prefix,
710 errors.ECODE_INVAL)
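  # Note: the pattern above admits any lowercase [0-9a-z] pairs, not only
  # strictly hexadecimal ones; e.g. the (illustrative) prefix "aa:00:00"
  # passes, while "AA:00:00" or "aa:00:00:00" would be rejected.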

  if not nicparams.get('mode', None) == constants.NIC_MODE_OVS:
    # Do not do this check if mode=openvswitch, since the openvswitch is not
    # created yet
    result = utils.RunCmd(["ip", "link", "show", "dev", master_netdev])
    if result.failed:
      raise errors.OpPrereqError("Invalid master netdev given (%s): '%s'" %
                                 (master_netdev,
                                  result.output.strip()), errors.ECODE_INVAL)

  dirs = [(pathutils.RUN_DIR, constants.RUN_DIRS_MODE)]
  utils.EnsureDirs(dirs)

  objects.UpgradeBeParams(beparams)
  utils.ForceDictType(beparams, constants.BES_PARAMETER_TYPES)
  utils.ForceDictType(nicparams, constants.NICS_PARAMETER_TYPES)

  objects.NIC.CheckParameterSyntax(nicparams)

  full_ipolicy = objects.FillIPolicy(constants.IPOLICY_DEFAULTS, ipolicy)
  _RestrictIpolicyToEnabledDiskTemplates(full_ipolicy, enabled_disk_templates)

  if ndparams is not None:
    utils.ForceDictType(ndparams, constants.NDS_PARAMETER_TYPES)
  else:
    ndparams = dict(constants.NDC_DEFAULTS)

  # This is ugly, as we modify the dict itself
  # FIXME: Make utils.ForceDictType pure functional or write a wrapper
  # around it
  if hv_state:
    for hvname, hvs_data in hv_state.items():
      utils.ForceDictType(hvs_data, constants.HVSTS_PARAMETER_TYPES)
      hv_state[hvname] = objects.Cluster.SimpleFillHvState(hvs_data)
  else:
    hv_state = dict((hvname, constants.HVST_DEFAULTS)
                    for hvname in enabled_hypervisors)

  # FIXME: disk_state has no default values yet
  if disk_state:
    for storage, ds_data in disk_state.items():
      if storage not in constants.DS_VALID_TYPES:
        raise errors.OpPrereqError("Invalid storage type in disk state: %s" %
                                   storage, errors.ECODE_INVAL)
      for ds_name, state in ds_data.items():
        utils.ForceDictType(state, constants.DSS_PARAMETER_TYPES)
        ds_data[ds_name] = objects.Cluster.SimpleFillDiskState(state)

  # hvparams is a mapping of hypervisor->hvparams dict
  for hv_name, hv_params in hvparams.iteritems():
    utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
    hv_class = hypervisor.GetHypervisor(hv_name)
    hv_class.CheckParameterSyntax(hv_params)

  # diskparams is a mapping of disk-template->diskparams dict
  for template, dt_params in diskparams.items():
    param_keys = set(dt_params.keys())
    default_param_keys = set(constants.DISK_DT_DEFAULTS[template].keys())
    if not (param_keys <= default_param_keys):
      unknown_params = param_keys - default_param_keys
      raise errors.OpPrereqError("Invalid parameters for disk template %s:"
                                 " %s" % (template,
                                          utils.CommaJoin(unknown_params)),
                                 errors.ECODE_INVAL)
    utils.ForceDictType(dt_params, constants.DISK_DT_TYPES)
    if template == constants.DT_DRBD8 and vg_name is not None:
      # The default METAVG value is equal to the VG name set at init time,
      # if provided
      dt_params[constants.DRBD_DEFAULT_METAVG] = vg_name

  try:
    utils.VerifyDictOptions(diskparams, constants.DISK_DT_DEFAULTS)
  except errors.OpPrereqError, err:
    raise errors.OpPrereqError("While verifying diskparam options: %s" % err,
                               errors.ECODE_INVAL)

  # set up ssh config and /etc/hosts
  rsa_sshkey = ""
  dsa_sshkey = ""
  if os.path.isfile(pathutils.SSH_HOST_RSA_PUB):
    sshline = utils.ReadFile(pathutils.SSH_HOST_RSA_PUB)
    rsa_sshkey = sshline.split(" ")[1]
  if os.path.isfile(pathutils.SSH_HOST_DSA_PUB):
    sshline = utils.ReadFile(pathutils.SSH_HOST_DSA_PUB)
    dsa_sshkey = sshline.split(" ")[1]
  if not rsa_sshkey and not dsa_sshkey:
    raise errors.OpPrereqError("Failed to find SSH public keys",
                               errors.ECODE_ENVIRON)

  if modify_etc_hosts:
    utils.AddHostToEtcHosts(hostname.name, hostname.ip)

  if modify_ssh_setup:
    _InitSSHSetup()

  if default_iallocator is not None:
    alloc_script = utils.FindFile(default_iallocator,
                                  constants.IALLOCATOR_SEARCH_PATH,
                                  os.path.isfile)
    if alloc_script is None:
      raise errors.OpPrereqError("Invalid default iallocator script '%s'"
                                 " specified" % default_iallocator,
                                 errors.ECODE_INVAL)
  else:
    # default to htools
    if utils.FindFile(constants.IALLOC_HAIL,
                      constants.IALLOCATOR_SEARCH_PATH,
                      os.path.isfile):
      default_iallocator = constants.IALLOC_HAIL

  # check if we have all the users we need
  try:
    runtime.GetEnts()
  except errors.ConfigurationError, err:
    raise errors.OpPrereqError("Required system user/group missing: %s" %
                               err, errors.ECODE_ENVIRON)

  candidate_certs = {}

  now = time.time()

  if compression_tools is not None:
    cluster.CheckCompressionTools(compression_tools)

  # init of cluster config file
  cluster_config = objects.Cluster(
    serial_no=1,
    rsahostkeypub=rsa_sshkey,
    dsahostkeypub=dsa_sshkey,
    highest_used_port=(constants.FIRST_DRBD_PORT - 1),
    mac_prefix=mac_prefix,
    volume_group_name=vg_name,
    tcpudp_port_pool=set(),
    master_ip=clustername.ip,
    master_netmask=master_netmask,
    master_netdev=master_netdev,
    cluster_name=clustername.name,
    file_storage_dir=file_storage_dir,
    shared_file_storage_dir=shared_file_storage_dir,
    gluster_storage_dir=gluster_storage_dir,
    enabled_hypervisors=enabled_hypervisors,
    beparams={constants.PP_DEFAULT: beparams},
    nicparams={constants.PP_DEFAULT: nicparams},
    ndparams=ndparams,
    hvparams=hvparams,
    diskparams=diskparams,
    candidate_pool_size=candidate_pool_size,
    modify_etc_hosts=modify_etc_hosts,
    modify_ssh_setup=modify_ssh_setup,
    uid_pool=uid_pool,
    ctime=now,
    mtime=now,
    maintain_node_health=maintain_node_health,
    drbd_usermode_helper=drbd_helper,
    default_iallocator=default_iallocator,
    default_iallocator_params=default_iallocator_params,
    primary_ip_family=ipcls.family,
    prealloc_wipe_disks=prealloc_wipe_disks,
    use_external_mip_script=use_external_mip_script,
    ipolicy=full_ipolicy,
    hv_state_static=hv_state,
    disk_state_static=disk_state,
    enabled_disk_templates=enabled_disk_templates,
    candidate_certs=candidate_certs,
    osparams={},
    osparams_private_cluster={},
    install_image=install_image,
    zeroing_image=zeroing_image,
    compression_tools=compression_tools,
    enabled_user_shutdown=enabled_user_shutdown,
    )
  master_node_config = objects.Node(name=hostname.name,
                                    primary_ip=hostname.ip,
                                    secondary_ip=secondary_ip,
                                    serial_no=1,
                                    master_candidate=True,
                                    offline=False, drained=False,
                                    ctime=now, mtime=now,
                                    )
  InitConfig(constants.CONFIG_VERSION, cluster_config, master_node_config)
  cfg = config.ConfigWriter(offline=True)
  ssh.WriteKnownHostsFile(cfg, pathutils.SSH_KNOWN_HOSTS_FILE)
  cfg.Update(cfg.GetClusterInfo(), logging.error)
  ssconf.WriteSsconfFiles(cfg.GetSsconfValues())

  # set up the inter-node password and certificate
  _InitGanetiServerSetup(hostname.name)

  logging.debug("Starting daemons")
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start-all"])
  if result.failed:
    raise errors.OpExecError("Could not start daemons, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForMasterDaemon()


def InitConfig(version, cluster_config, master_node_config,
               cfg_file=pathutils.CLUSTER_CONF_FILE):
  """Create the initial cluster configuration.

  It will contain the current node, which will also be the master
  node, and no instances.

  @type version: int
  @param version: configuration version
  @type cluster_config: L{objects.Cluster}
  @param cluster_config: cluster configuration
  @type master_node_config: L{objects.Node}
  @param master_node_config: master node configuration
  @type cfg_file: string
  @param cfg_file: configuration file path

  """
  uuid_generator = config.TemporaryReservationManager()
  cluster_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                _INITCONF_ECID)
  master_node_config.uuid = uuid_generator.Generate([], utils.NewUUID,
                                                    _INITCONF_ECID)
  cluster_config.master_node = master_node_config.uuid
  nodes = {
    master_node_config.uuid: master_node_config,
    }
  default_nodegroup = objects.NodeGroup(
    uuid=uuid_generator.Generate([], utils.NewUUID, _INITCONF_ECID),
    name=constants.INITIAL_NODE_GROUP_NAME,
    members=[master_node_config.uuid],
    diskparams={},
    )
  nodegroups = {
    default_nodegroup.uuid: default_nodegroup,
    }
  now = time.time()
  config_data = objects.ConfigData(version=version,
                                   cluster=cluster_config,
                                   nodegroups=nodegroups,
                                   nodes=nodes,
                                   instances={},
                                   networks={},
                                   disks={},
                                   serial_no=1,
                                   ctime=now, mtime=now)
  utils.WriteFile(cfg_file,
                  data=serializer.Dump(config_data.ToDict()),
                  mode=0600)


def FinalizeClusterDestroy(master_uuid):
  """Execute the last steps of cluster destroy

  This function shuts down all the daemons, completing the destroy
  begun in cmdlib.LUDestroyOpcode.

  """
  livelock = utils.livelock.LiveLock("bootstrap_destroy")
  cfg = config.GetConfig(None, livelock)
  modify_ssh_setup = cfg.GetClusterInfo().modify_ssh_setup
  runner = rpc.BootstrapRunner()

  master_name = cfg.GetNodeName(master_uuid)

  master_params = cfg.GetMasterNetworkParameters()
  master_params.uuid = master_uuid
  ems = cfg.GetUseExternalMipScript()
  result = runner.call_node_deactivate_master_ip(master_name, master_params,
                                                 ems)

  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master IP: %s", msg)

  result = runner.call_node_stop_master(master_name)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not disable the master role: %s", msg)

  result = runner.call_node_leave_cluster(master_name, modify_ssh_setup)
  msg = result.fail_msg
  if msg:
    logging.warning("Could not shutdown the node daemon and cleanup"
                    " the node: %s", msg)


def SetupNodeDaemon(opts, cluster_name, node, ssh_port):
  """Add a node to the cluster.

  This function must be called before the actual opcode, and will ssh
  to the remote node, copy the needed files, and start ganeti-noded,
  allowing the master to do the rest via normal rpc calls.

  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_port: the SSH port of the new node

  """
  data = {
    constants.NDS_CLUSTER_NAME: cluster_name,
    constants.NDS_NODE_DAEMON_CERTIFICATE:
      utils.ReadFile(pathutils.NODED_CERT_FILE),
    constants.NDS_SSCONF: ssconf.SimpleStore().ReadAll(),
    constants.NDS_START_NODE_DAEMON: True,
    constants.NDS_NODE_NAME: node,
    }

  RunNodeSetupCmd(cluster_name, node, pathutils.NODE_DAEMON_SETUP,
                  opts.debug, opts.verbose,
                  True, opts.ssh_key_check, opts.ssh_key_check,
                  ssh_port, data)

  _WaitForNodeDaemon(node)


def MasterFailover(no_voting=False):
  """Failover the master node.

  This checks that we are not already the master, and will cause the
  current master to cease being master, and the non-master to become
  new master.

  @type no_voting: boolean
  @param no_voting: force the operation without remote nodes agreement
    (dangerous)

  @returns: the pair of an exit code and warnings to display
  """
  sstore = ssconf.SimpleStore()

  old_master, new_master = ssconf.GetMasterAndMyself(sstore)
  node_names = sstore.GetNodeList()
  mc_list = sstore.GetMasterCandidates()

  if old_master == new_master:
    raise errors.OpPrereqError("This command must be run on the node"
1045 " where you want the new master to be."
1046 " %s is already the master" %
1047 old_master, errors.ECODE_INVAL)
1048
1049 if new_master not in mc_list:
1050 mc_no_master = [name for name in mc_list if name != old_master]
1051 raise errors.OpPrereqError("This node is not among the nodes marked"
1052 " as master candidates. Only these nodes"
1053 " can become masters. Current list of"
1054 " master candidates is:\n"
1055 "%s" % ("\n".join(mc_no_master)),
1056 errors.ECODE_STATE)
1057
1058 if not no_voting:
1059 vote_list = GatherMasterVotes(node_names)
1060
1061 if vote_list:
1062 voted_master = vote_list[0][0]
1063 if voted_master is None:
1064 raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
1065 " not respond.", errors.ECODE_ENVIRON)
1066 elif voted_master != old_master:
1067 raise errors.OpPrereqError("I have a wrong configuration, I believe"
1068 " the master is %s but the other nodes"
1069 " voted %s. Please resync the configuration"
1070 " of this node." %
1071 (old_master, voted_master),
1072 errors.ECODE_STATE)
1073 # end checks
1074
1075 rcode = 0
1076 warnings = []
1077
1078 logging.info("Setting master to %s, old master: %s", new_master, old_master)
1079
1080 try:
1081 # Forcefully start WConfd so that we can access the configuration
1082 result = utils.RunCmd([pathutils.DAEMON_UTIL,
1083 "start", constants.WCONFD, "--force-node",
1084 "--no-voting", "--yes-do-it"])
1085 if result.failed:
1086 raise errors.OpPrereqError("Could not start the configuration daemon,"
1087 " command %s had exitcode %s and error %s" %
1088 (result.cmd, result.exit_code, result.output),
1089 errors.ECODE_NOENT)
1090
1091 # instantiate a real config writer, as we now know we have the
1092 # configuration data
1093 livelock = utils.livelock.LiveLock("bootstrap_failover")
1094 cfg = config.GetConfig(None, livelock, accept_foreign=True)
1095
1096 old_master_node = cfg.GetNodeInfoByName(old_master)
1097 if old_master_node is None:
1098 raise errors.OpPrereqError("Could not find old master node '%s' in"
1099 " cluster configuration." % old_master,
1100 errors.ECODE_NOENT)
1101
1102 cluster_info = cfg.GetClusterInfo()
1103 new_master_node = cfg.GetNodeInfoByName(new_master)
1104 if new_master_node is None:
1105 raise errors.OpPrereqError("Could not find new master node '%s' in"
1106 " cluster configuration." % new_master,
1107 errors.ECODE_NOENT)
1108
1109 cluster_info.master_node = new_master_node.uuid
1110 # this will also regenerate the ssconf files, since we updated the
1111 # cluster info
1112 cfg.Update(cluster_info, logging.error)
1113
1114 # if cfg.Update worked, then it means the old master daemon won't be
1115 # able now to write its own config file (we rely on locking in both
1116 # backend.UploadFile() and ConfigWriter._Write(); hence the next
1117 # step is to kill the old master
1118
1119 logging.info("Stopping the master daemon on node %s", old_master)
1120
1121 runner = rpc.BootstrapRunner()
1122 master_params = cfg.GetMasterNetworkParameters()
1123 master_params.uuid = old_master_node.uuid
1124 ems = cfg.GetUseExternalMipScript()
1125 result = runner.call_node_deactivate_master_ip(old_master,
1126 master_params, ems)
1127
1128 msg = result.fail_msg
1129 if msg:
1130 warning = "Could not disable the master IP: %s" % (msg,)
1131 logging.warning("%s", warning)
1132 warnings.append(warning)
1133
1134 result = runner.call_node_stop_master(old_master)
1135 msg = result.fail_msg
1136 if msg:
1137 warning = ("Could not disable the master role on the old master"
1138 " %s, please disable manually: %s" % (old_master, msg))
1139 logging.error("%s", warning)
1140 warnings.append(warning)
1141 except errors.ConfigurationError, err:
1142 logging.error("Error while trying to set the new master: %s",
1143 str(err))
1144 return 1, warnings
1145 finally:
1146 # stop WConfd again:
1147 result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop", constants.WCONFD])
1148 if result.failed:
1149 warning = ("Could not stop the configuration daemon,"
1150 " command %s had exitcode %s and error %s"
1151 % (result.cmd, result.exit_code, result.output))
1152 logging.error("%s", warning)
1153 rcode = 1
1154
1155 logging.info("Checking master IP non-reachability...")
1156
1157 master_ip = sstore.GetMasterIP()
1158 total_timeout = 30
1159
1160 # Here we have a phase where no master should be running
1161 def _check_ip(expected):
1162 if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT) != expected:
1163 raise utils.RetryAgain()
1164
1165 try:
1166 utils.Retry(_check_ip, (1, 1.5, 5), total_timeout, args=[False])
1167 except utils.RetryTimeout:
1168 warning = ("The master IP is still reachable after %s seconds,"
1169 " continuing but activating the master IP on the current"
1170 " node will probably fail" % total_timeout)
1171 logging.warning("%s", warning)
1172 warnings.append(warning)
1173 rcode = 1
1174
1175 if jstore.CheckDrainFlag():
1176 logging.info("Undraining job queue")
1177 jstore.SetDrainFlag(False)
1178
1179 logging.info("Starting the master daemons on the new master")
1180
1181 result = rpc.BootstrapRunner().call_node_start_master_daemons(new_master,
1182 no_voting)
1183 msg = result.fail_msg
1184 if msg:
1185 logging.error("Could not start the master role on the new master"
1186 " %s, please check: %s", new_master, msg)
1187 rcode = 1
1188
1189 # Finally verify that the new master managed to set up the master IP
1190 # and warn if it didn't.
1191 try:
1192 utils.Retry(_check_ip, (1, 1.5, 5), total_timeout, args=[True])
1193 except utils.RetryTimeout:
1194 warning = ("The master IP did not come up within %s seconds; the"
1195 " cluster should still be working and reachable via %s,"
1196 " but not via the master IP address"
1197 % (total_timeout, new_master))
1198 logging.warning("%s", warning)
1199 warnings.append(warning)
1200 rcode = 1
1201
1202 logging.info("Master failed over from %s to %s", old_master, new_master)
1203 return rcode, warnings
1204
1205
1206 def GetMaster():
1207 """Returns the current master node.
1208
1209 This is a separate function in bootstrap since it's needed by
1210 gnt-cluster, and instead of importing directly ssconf, it's better
1211 to abstract it in bootstrap, where we do use ssconf in other
1212 functions too.
1213
1214 """
1215 sstore = ssconf.SimpleStore()
1216
1217 old_master, _ = ssconf.GetMasterAndMyself(sstore)
1218
1219 return old_master
1220
1221
1222 def GatherMasterVotes(node_names):
1223 """Check the agreement on who is the master.
1224
1225 This function will return a list of (node, number of votes), ordered
1226 by the number of votes. Errors will be denoted by the key 'None'.
1227
1228 Note that the sum of votes is the number of nodes this machine
1229 knows, whereas the number of entries in the list could be different
1230 (if some nodes vote for another master).
1231
1232 @type node_names: list
1233 @param node_names: the list of nodes to query for master info; the current
1234 node will be removed if it is in the list
1235 @rtype: list
1236 @return: list of (node, votes)
1237
1238 """
1239 if not node_names:
1240 # no nodes
1241 return []
1242 results = rpc.BootstrapRunner().call_master_node_name(node_names)
1243 if not isinstance(results, dict):
1244 # this should not happen (unless internal error in rpc)
1245 logging.critical("Can't complete rpc call, aborting master startup")
1246 return [(None, len(node_names))]
1247 votes = {}
1248 for node_name in results:
1249 nres = results[node_name]
1250 msg = nres.fail_msg
1251
1252 if msg:
1253 logging.warning("Error contacting node %s: %s", node_name, msg)
1254 node = None
1255 else:
1256 node = nres.payload
1257
1258 if node not in votes:
1259 votes[node] = 1
1260 else:
1261 votes[node] += 1
1262
1263 vote_list = [v for v in votes.items()]
1264 # sort first on number of votes then on name, since we want None
1265 # sorted later if we have the half of the nodes not responding, and
1266 # half voting all for the same master
1267 vote_list.sort(key=lambda x: (x[1], x[0]), reverse=True)
1268
1269 return vote_list
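
# A worked vote-tally example (node names hypothetical): if node1 and node2
# both report "node1" as master and node3 fails to answer, votes ends up as
# {"node1": 2, None: 1} and the returned list is [("node1", 2), (None, 1)].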