f76a05b1e73cf05357c63ecad04a89abd2ebb826
[ganeti-github.git] / src / Ganeti / HTools / Cluster.hs
1 {-| Implementation of cluster-wide logic.
2
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
5
6 -}
7
8 {-
9
10 Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
11 All rights reserved.
12
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are
15 met:
16
17 1. Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 2. Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
25 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
26 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
27 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
28 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
32 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
33 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
36 -}
37
module Ganeti.HTools.Cluster
  (
    -- * Types
    AllocSolution(..)
  , EvacSolution(..)
  , Table(..)
  , CStats(..)
  , AllocNodes
  , AllocResult
  , AllocMethod
  , AllocSolutionList
  -- * Generic functions
  , totalResources
  , computeAllocationDelta
  -- * First phase functions
  , computeBadItems
  -- * Second phase functions
  , printSolutionLine
  , formatCmds
  , involvedNodes
  , getMoves
  , splitJobs
  -- * Display functions
  , printNodes
  , printInsts
  -- * Balancing functions
  , checkMove
  , doNextBalance
  , tryBalance
  , compCV
  , compCVNodes
  , compDetailedCV
  , printStats
  , iMoveToJob
  -- * IAllocator functions
  , genAllocNodes
  , tryAlloc
  , tryMGAlloc
  , tryNodeEvac
  , tryChangeGroup
  , collapseFailures
  , allocList
  -- * Allocation functions
  , iterateAlloc
  , tieredAlloc
  -- * Node group functions
  , instanceGroup
  , findSplitInstances
  , splitCluster
  ) where
88
89 import Control.Applicative (liftA2)
90 import Control.Arrow ((&&&))
91 import qualified Data.IntSet as IntSet
92 import Data.List
93 import Data.Maybe (fromJust, fromMaybe, isJust, isNothing)
94 import Data.Ord (comparing)
95 import Text.Printf (printf)
96
97 import Ganeti.BasicTypes
98 import qualified Ganeti.HTools.Container as Container
99 import qualified Ganeti.HTools.Instance as Instance
100 import qualified Ganeti.HTools.Nic as Nic
101 import qualified Ganeti.HTools.Node as Node
102 import qualified Ganeti.HTools.Group as Group
103 import Ganeti.HTools.Types
104 import Ganeti.Compat
105 import qualified Ganeti.OpCodes as OpCodes
106 import Ganeti.Utils
107 import Ganeti.Utils.Statistics
108 import Ganeti.Types (EvacMode(..), mkNonEmpty)
109
110 -- * Types
111
-- | Allocation\/relocation solution.
--
-- Accumulated over many attempted placements: failures are collected
-- per attempt, while only the single best placement (by cluster
-- score) is retained in 'asSolution'.
data AllocSolution = AllocSolution
  { asFailures :: [FailMode]              -- ^ Failure counts
  , asAllocs   :: Int                     -- ^ Good allocation count
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
  , asLog      :: [String]                -- ^ Informational messages
  }
119
-- | Node evacuation/group change iallocator result type. This result
-- type consists of actual opcodes (a restricted subset) that are
-- transmitted back to Ganeti.
data EvacSolution = EvacSolution
  { esMoved   :: [(Idx, Gdx, [Ndx])]  -- ^ Instances moved successfully
  , esFailed  :: [(Idx, String)]      -- ^ Instances which were not
                                      -- relocated (with failure reason)
  , esOpCodes :: [[OpCodes.OpCode]]   -- ^ List of jobs, one opcode list
                                      -- per job
  } deriving (Show)
129
-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc':
-- failure statistics, final node/instance lists, the instances
-- allocated so far and the running per-step cluster statistics.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])
133
-- | Type alias for easier handling: each requested instance paired
-- with its (attempted) allocation solution.
type AllocSolutionList = [(Instance.Instance, AllocSolution)]
136
-- | A type denoting the valid allocation mode/pairs.
--
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
-- for a two-node allocation, this will be a @Right [('Ndx',
-- ['Ndx'])]@. In the latter case, the list is basically an
-- association list, grouped by primary node and holding the potential
-- secondary nodes in the sub-list.
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]
145
-- | The empty solution we start with when computing allocations:
-- no failures, no successful allocations, no best element, no log.
emptyAllocSolution :: AllocSolution
emptyAllocSolution = AllocSolution [] 0 Nothing []
150
-- | The empty evac solution: nothing moved, nothing failed, no jobs.
emptyEvacSolution :: EvacSolution
emptyEvacSolution = EvacSolution [] [] []
157
-- | The complete state for the balancing solution: the node and
-- instance maps, the current cluster score and the list of placements
-- performed so far.
data Table = Table Node.List Instance.List Score [Placement]
             deriving (Show)
161
-- | Cluster statistics data type.
--
-- Accumulated node-by-node via 'updateCStats'; see 'totalResources'.
data CStats = CStats
  { csFmem  :: Integer -- ^ Cluster free mem
  , csFdsk  :: Integer -- ^ Cluster free disk
  , csFspn  :: Integer -- ^ Cluster free spindles
  , csAmem  :: Integer -- ^ Cluster allocatable mem
  , csAdsk  :: Integer -- ^ Cluster allocatable disk
  , csAcpu  :: Integer -- ^ Cluster allocatable cpus
  , csMmem  :: Integer -- ^ Max node allocatable mem
  , csMdsk  :: Integer -- ^ Max node allocatable disk
  , csMcpu  :: Integer -- ^ Max node allocatable cpu
  , csImem  :: Integer -- ^ Instance used mem
  , csIdsk  :: Integer -- ^ Instance used disk
  , csIspn  :: Integer -- ^ Instance used spindles
  , csIcpu  :: Integer -- ^ Instance used cpu
  , csTmem  :: Double  -- ^ Cluster total mem
  , csTdsk  :: Double  -- ^ Cluster total disk
  , csTspn  :: Double  -- ^ Cluster total spindles
  , csTcpu  :: Double  -- ^ Cluster total cpus
  , csVcpu  :: Integer -- ^ Cluster total virtual cpus
  , csNcpu  :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
                       -- physical CPUs, i.e. normalised used phys CPUs
  , csXmem  :: Integer -- ^ Unaccounted for mem
  , csNmem  :: Integer -- ^ Node own memory
  , csScore :: Score   -- ^ The cluster score
  , csNinst :: Int     -- ^ The total number of instances
  } deriving (Show)
189
-- | A simple type for allocation functions.
type AllocMethod =  Node.List           -- ^ Node list
                 -> Instance.List       -- ^ Instance list
                 -> Maybe Int           -- ^ Optional allocation limit
                 -> Instance.Instance   -- ^ Instance spec for allocation
                 -> AllocNodes          -- ^ Which nodes we should allocate on
                 -> [Instance.Instance] -- ^ Allocated instances
                 -> [CStats]            -- ^ Running cluster stats
                 -> Result AllocResult  -- ^ Allocation result
199
-- | A simple type for the running solution of evacuations: either a
-- failure message, or the updated node list together with the moved
-- instance, the resulting score and the chosen target node index.
type EvacInnerState =
  Either String (Node.List, Instance.Instance, Score, Ndx)
203
204 -- * Utility functions
205
-- | Verifies the N+1 status and return the affected nodes.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 nodes = [node | node <- nodes, Node.failN1 node]
209
{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes.

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
  let bad_nodes = verifyN1 $ getOnline nl
      -- de-duplicate by sorting and collapsing equal runs; this is
      -- O(n log n) versus the quadratic 'sort . nub' ('group' never
      -- produces empty sublists, so 'head' is safe here)
      bad_instances = map (`Container.find` il) .
                      map head . group . sort $
                      concatMap (\ n -> Node.sList n ++ Node.pList n) bad_nodes
  in
    (bad_nodes, bad_instances)
226
-- | Extracts the node pairs for an instance. This can fail if the
-- instance is single-homed. FIXME: this needs to be improved,
-- together with the general enhancement for handling non-DRBD moves.
instanceNodes :: Node.List -> Instance.Instance ->
                 (Ndx, Ndx, Node.Node, Node.Node)
instanceNodes nl inst =
  (pdx, sdx, Container.find pdx nl, Container.find sdx nl)
  where pdx = Instance.pNode inst
        sdx = Instance.sNode inst
238
-- | Zero-initializer for the CStats type, written with record syntax
-- so that each field is named explicitly.
emptyCStats :: CStats
emptyCStats = CStats { csFmem = 0, csFdsk = 0, csFspn = 0
                     , csAmem = 0, csAdsk = 0, csAcpu = 0
                     , csMmem = 0, csMdsk = 0, csMcpu = 0
                     , csImem = 0, csIdsk = 0, csIspn = 0, csIcpu = 0
                     , csTmem = 0, csTdsk = 0, csTspn = 0, csTcpu = 0
                     , csVcpu = 0, csNcpu = 0
                     , csXmem = 0, csNmem = 0
                     , csScore = 0, csNinst = 0
                     }
242
-- | Update stats with data from a new node.
--
-- Folds one node's free/total/used resource figures into the running
-- 'CStats' accumulator; used by 'totalResources'.
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
               csVcpu = x_vcpu, csNcpu = x_ncpu,
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst,
               csFspn = x_fspn, csIspn = x_ispn, csTspn = x_tspn
             }
        = cs
      -- allocatable memory is free minus reserved, clamped at zero
      inc_amem = Node.fMem node - Node.rMem node
      inc_amem' = if inc_amem > 0 then inc_amem else 0
      inc_adsk = Node.availDisk node
      -- instance-used memory: node total minus node daemon, unaccounted
      -- and free memory
      inc_imem = truncate (Node.tMem node) - Node.nMem node
                 - Node.xMem node - Node.fMem node
      inc_icpu = Node.uCpu node
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
      inc_ispn = Node.tSpindles node - Node.fSpindles node
      inc_vcpu = Node.hiCpu node
      inc_acpu = Node.availCpu node
      -- used physical CPUs, normalised by the policy's vcpu ratio
      inc_ncpu = fromIntegral (Node.uCpu node) /
                 iPolicyVcpuRatio (Node.iPolicy node)
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
        , csFspn = x_fspn + fromIntegral (Node.fSpindles node)
        , csAmem = x_amem + fromIntegral inc_amem'
        , csAdsk = x_adsk + fromIntegral inc_adsk
        , csAcpu = x_acpu + fromIntegral inc_acpu
        -- the "max" fields track the single largest node, not a sum
        , csMmem = max x_mmem (fromIntegral inc_amem')
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
        , csImem = x_imem + fromIntegral inc_imem
        , csIdsk = x_idsk + fromIntegral inc_idsk
        , csIspn = x_ispn + fromIntegral inc_ispn
        , csIcpu = x_icpu + fromIntegral inc_icpu
        , csTmem = x_tmem + Node.tMem node
        , csTdsk = x_tdsk + Node.tDsk node
        , csTspn = x_tspn + fromIntegral (Node.tSpindles node)
        , csTcpu = x_tcpu + Node.tCpu node
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
        , csNcpu = x_ncpu + inc_ncpu
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
        , csNinst = x_ninst + length (Node.pList node)
        }
291
-- | Compute the total free disk and memory in the cluster.
totalResources :: Node.List -> CStats
totalResources nl =
  let accumulated = foldl' updateCStats emptyCStats (Container.elems nl)
  in accumulated { csScore = compCV nl }
297
-- | Compute the delta between two cluster state.
--
-- This is used when doing allocations, to understand better the
-- available cluster resources. The return value is a triple of the
-- current used values, the delta that was still allocated, and what
-- was left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
              csNcpu = i_ncpu, csIspn = i_ispn } = cini
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
              csNcpu = f_ncpu, csTcpu = f_tcpu,
              csIspn = f_ispn, csTspn = t_spn } = cfin
      -- resources in use in the initial state
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
                       , allocInfoNCpus = i_ncpu
                       , allocInfoMem   = fromIntegral i_imem
                       , allocInfoDisk  = fromIntegral i_idsk
                       , allocInfoSpn   = fromIntegral i_ispn
                       }
      -- resources newly allocated between the two states (final - initial)
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
                       , allocInfoNCpus = f_ncpu - i_ncpu
                       , allocInfoMem   = fromIntegral (f_imem - i_imem)
                       , allocInfoDisk  = fromIntegral (f_idsk - i_idsk)
                       , allocInfoSpn   = fromIntegral (f_ispn - i_ispn)
                       }
      -- resources still unallocated: cluster totals minus final usage
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
                       , allocInfoNCpus = f_tcpu - f_ncpu
                       , allocInfoMem   = truncate t_mem - fromIntegral f_imem
                       , allocInfoDisk  = truncate t_dsk - fromIntegral f_idsk
                       , allocInfoSpn   = truncate t_spn - fromIntegral f_ispn
                       }
  in (rini, rfin, runa)
331
-- | The names and weights of the individual elements in the CV list, together
-- with their statistical accumulation function and a bit to decide whether it
-- is a statistics for online nodes.
--
-- NOTE: the order of this list must match the order of the metrics
-- produced by 'compDetailedCVNode', since the two are zipped together.
detailedCVInfoExt :: [((Double, String), ([Double] -> Statistics, Bool))]
detailedCVInfoExt = [ ((1,  "free_mem_cv"), (getStdDevStatistics, True))
                    , ((1,  "free_disk_cv"), (getStdDevStatistics, True))
                    , ((1,  "n1_cnt"), (getSumStatistics, True))
                    , ((1,  "reserved_mem_cv"), (getStdDevStatistics, True))
                    , ((4,  "offline_all_cnt"), (getSumStatistics, False))
                    , ((16, "offline_pri_cnt"), (getSumStatistics, False))
                    , ((1,  "vcpu_ratio_cv"), (getStdDevStatistics, True))
                    , ((1,  "cpu_load_cv"), (getStdDevStatistics, True))
                    , ((1,  "mem_load_cv"), (getStdDevStatistics, True))
                    , ((1,  "disk_load_cv"), (getStdDevStatistics, True))
                    , ((1,  "net_load_cv"), (getStdDevStatistics, True))
                    , ((2,  "pri_tags_score"), (getSumStatistics, True))
                    , ((1,  "spindles_cv"), (getStdDevStatistics, True))
                    ]
350
-- | The names and weights of the individual elements in the CV list.
detailedCVInfo :: [(Double, String)]
detailedCVInfo = [ weightAndName | (weightAndName, _) <- detailedCVInfoExt ]
354
-- | Holds the weights used by 'compCVNodes' for each metric.
detailedCVWeights :: [Double]
detailedCVWeights = [ weight | (weight, _) <- detailedCVInfo ]
358
-- | The aggregation functions for the weights, paired with the
-- online/offline flag of each metric.
detailedCVAggregation :: [([Double] -> Statistics, Bool)]
detailedCVAggregation = [ aggr | (_, aggr) <- detailedCVInfoExt ]
362
-- | The bit vector describing which parts of the statistics are
-- for online nodes.
detailedCVOnlineStatus :: [Bool]
detailedCVOnlineStatus = [ online | (_, online) <- detailedCVAggregation ]
367
-- | Compute statistical measures of a single node.
--
-- The resulting list must stay in the exact order declared by
-- 'detailedCVInfoExt', as the two lists are combined positionally.
compDetailedCVNode :: Node.Node -> [Double]
compDetailedCVNode node =
  let mem = Node.pMem node
      dsk = Node.pDsk node
      -- N+1 failure weight: number of instances that would be affected
      n1 = fromIntegral
           $ if Node.failN1 node
               then length (Node.sList node) + length (Node.pList node)
               else 0
      res = Node.pRem node
      ipri = fromIntegral . length $ Node.pList node
      isec = fromIntegral . length $ Node.sList node
      ioff = ipri + isec
      cpu = Node.pCpuEff node
      DynUtil c1 m1 d1 nn1 = Node.utilLoad node
      DynUtil c2 m2 d2 nn2 = Node.utilPool node
      -- dynamic load expressed as a fraction of the available pool
      (c_load, m_load, d_load, n_load) = (c1/c2, m1/m2, d1/d2, nn1/nn2)
      pri_tags = fromIntegral $ Node.conflictingPrimaries node
      spindles = Node.instSpindles node / Node.hiSpindles node
  in [ mem, dsk, n1, res, ioff, ipri, cpu
     , c_load, m_load, d_load, n_load
     , pri_tags, spindles
     ]
391
-- | Compute the statistics of a cluster.
--
-- Online and offline nodes are aggregated separately; each metric in
-- 'detailedCVAggregation' declares which of the two populations it
-- should be computed over.
compClusterStatistics :: [Node.Node] -> [Statistics]
compClusterStatistics all_nodes =
  let (offline, nodes) = partition Node.offline all_nodes
      offline_values = transpose (map compDetailedCVNode offline)
                       ++ repeat []
      -- transpose of an empty list is empty and not k times the empty list, as
      -- would be the transpose of a 0 x k matrix; the appended 'repeat []'
      -- pads the per-metric columns so the zip below never runs short
      online_values = transpose $ map compDetailedCVNode nodes
      aggregate (f, True) (onNodes, _) = f onNodes
      aggregate (f, False) (_, offNodes) = f offNodes
  in zipWith aggregate detailedCVAggregation
       $ zip online_values offline_values
405
-- | Update a cluster statistics by replacing the contribution of one
-- node by that of another.
--
-- Only the statistics whose online/offline flag matches the (old)
-- node's status are touched; the rest are passed through unchanged.
-- Assumes old and new share the same online/offline status.
updateClusterStatistics :: [Statistics]
                           -> (Node.Node, Node.Node) -> [Statistics]
updateClusterStatistics stats (old, new) =
  let update = zip (compDetailedCVNode old) (compDetailedCVNode new)
      online = not $ Node.offline old
      updateStat forOnline stat upd = if forOnline == online
                                        then updateStatistics stat upd
                                        else stat
  in zipWith3 updateStat detailedCVOnlineStatus stats update
417
-- | Update a cluster statistics twice, applying the first node
-- replacement and then the second one on the intermediate result.
updateClusterStatisticsTwice :: [Statistics]
                                -> (Node.Node, Node.Node)
                                -> (Node.Node, Node.Node)
                                -> [Statistics]
updateClusterStatisticsTwice stats firstPair secondPair =
  updateClusterStatistics (updateClusterStatistics stats firstPair) secondPair
425
-- | Compute cluster statistics
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV nodes = map getStatisticValue (compClusterStatistics nodes)
429
-- | Compute the cluster score from its statistics
compCVfromStats :: [Statistics] -> Double
compCVfromStats stats =
  sum $ zipWith (*) detailedCVWeights (map getStatisticValue stats)
433
-- | Compute the /total/ variance.
compCVNodes :: [Node.Node] -> Double
compCVNodes nodes = sum $ zipWith (*) detailedCVWeights (compDetailedCV nodes)
437
-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
compCV nl = compCVNodes (Container.elems nl)
441
-- | Compute online nodes from a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline nl = [node | node <- Container.elems nl, not (Node.offline node)]
445
446 -- * Balancing functions
447
-- | Compute best table. Note that the ordering of the arguments is
-- important: on a tie the first argument wins.
compareTables :: Table -> Table -> Table
compareTables left@(Table _ _ left_cv _) right@(Table _ _ right_cv _)
  | left_cv > right_cv = right
  | otherwise          = left
452
-- | Applies an instance move to a given node list and instance.
--
-- Each equation handles one 'IMove' constructor; on success it
-- returns the updated node list, the updated instance, and the new
-- primary and secondary node indices.
applyMove :: Node.List -> Instance.Instance
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
-- Failover (f)
applyMove nl inst Failover =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      new_nl = do -- Maybe monad
        -- primary and secondary simply swap roles
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
        new_s <- Node.addSecExEx (Node.offline old_p) (Node.offline old_p)
                 int_p inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx old_pdx
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                new_inst, old_sdx, old_pdx)
  in new_nl

-- Failover to any (fa)
applyMove nl inst (FailoverToAny new_pdx) = do
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
      new_pnode = Container.find new_pdx nl
      -- an offline primary cannot veto the move
      force_failover = Node.offline old_pnode
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
  let old_pnode' = Node.removePri old_pnode inst
      inst' = Instance.setPri inst new_pdx
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
  return (nl', inst', new_pdx, old_sdx)

-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
        -- check that the current secondary can host the instance
        -- during the migration
        tmp_s <- Node.addPriEx force_p int_s inst
        let tmp_s' = Node.removePri tmp_s inst
        new_p <- Node.addPriEx force_p tgt_n inst
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
        let new_inst = Instance.setPri inst new_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx int_p old_sdx new_s nl,
                new_inst, new_pdx, old_sdx)
  in new_nl

-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_sdx nl
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_inst = Instance.setSec inst new_sdx
      -- primary is untouched; only the secondary changes
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
               \new_s -> return (Container.addTwo new_sdx
                                 new_s old_sdx int_s nl,
                                 new_inst, old_pdx, new_sdx)
  in new_nl

-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_nl = do -- Maybe monad
        new_p <- Node.addPri tgt_n inst
        new_s <- Node.addSecEx force_s int_p inst new_pdx
        let new_inst = Instance.setBoth inst new_pdx old_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx new_s old_sdx int_s nl,
                new_inst, new_pdx, old_pdx)
  in new_nl

-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_sdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
        new_p <- Node.addPriEx force_p int_s inst
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx new_sdx
        return (Container.add new_sdx new_s $
                Container.addTwo old_sdx new_p old_pdx int_p nl,
                new_inst, old_sdx, new_sdx)
  in new_nl
547
-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst new_pdx = do
  let node = Container.find new_pdx nl
      placed_inst = Instance.setBoth inst new_pdx Node.noSecondary
  -- the policy check runs first so violations surface as failures
  Instance.instMatchesPolicy inst (Node.iPolicy node) (Node.exclStorage node)
  node' <- Node.addPri node inst
  let nl' = Container.add new_pdx node' nl
  return (nl', placed_inst, [node'], compCV nl')
560
-- | Tries to allocate an instance on a given pair of nodes.
--
-- The incoming statistics are updated incrementally via
-- 'updateClusterStatisticsTwice' instead of recomputing the whole
-- cluster score, which keeps pair allocation cheap.
allocateOnPair :: [Statistics]
               -> Node.List -> Instance.Instance -> Ndx -> Ndx
               -> OpResult Node.AllocElement
allocateOnPair stats nl inst new_pdx new_sdx =
  let tgt_p = Container.find new_pdx nl
      tgt_s = Container.find new_sdx nl
  in do
    -- policy check first, so violations are reported as failures
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
      (Node.exclStorage tgt_p)
    new_p <- Node.addPri tgt_p inst
    new_s <- Node.addSec tgt_s inst new_pdx
    let new_inst = Instance.setBoth inst new_pdx new_sdx
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
        new_stats = updateClusterStatisticsTwice stats
                      (tgt_p, new_p) (tgt_s, new_s)
    return (new_nl, new_inst, [new_p, new_s], compCVfromStats new_stats)
578
-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
checkSingleStep :: Table              -- ^ The original table
                -> Instance.Instance  -- ^ The instance to move
                -> Table              -- ^ The current best table
                -> IMove              -- ^ The move to apply
                -> Table              -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
  let Table ini_nl ini_il _ ini_plc = ini_tbl
      tmp_resu = applyMove ini_nl target move
  in case tmp_resu of
       -- an invalid move leaves the current best untouched
       Bad _ -> cur_tbl
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
         let tgt_idx = Instance.idx target
             upd_cvar = compCV upd_nl
             upd_il = Container.add tgt_idx new_inst ini_il
             -- record the placement on top of the original placement list
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
         in compareTables cur_tbl upd_tbl
598
-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- an instance.
--
-- NOTE: equation order matters here, since pattern matching selects
-- the first applicable case.
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
              -> Bool       -- ^ Whether the secondary node is a valid new node
              -> Bool       -- ^ Whether we can change the primary node
              -> (Bool, Bool) -- ^ Whether migration is restricted and whether
                              -- the instance primary is offline
              -> Ndx        -- ^ Target node candidate
              -> [IMove]    -- ^ List of valid result moves

-- unmirrored instances cannot be moved at all
possibleMoves MirrorNone _ _ _ _ = []

-- externally-mirrored: only a full failover is possible, and only
-- when primary changes are allowed
possibleMoves MirrorExternal _ False _ _ = []

possibleMoves MirrorExternal _ True _ tdx =
  [ FailoverToAny tdx ]

-- DRBD with fixed primary: we can only re-point the secondary
possibleMoves MirrorInternal _ False _ tdx =
  [ ReplaceSecondary tdx ]

-- restricted migration with an online primary: avoid any move that
-- would require migrating the instance
possibleMoves MirrorInternal _ _ (True, False) tdx =
  [ ReplaceSecondary tdx
  ]

possibleMoves MirrorInternal True True (False, _) tdx =
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  , ReplacePrimary tdx
  , FailoverAndReplace tdx
  ]

-- restricted migration but primary offline: failover-based moves are
-- acceptable since the instance is down anyway
possibleMoves MirrorInternal True True (True, True) tdx =
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  , FailoverAndReplace tdx
  ]

possibleMoves MirrorInternal False True _ tdx =
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  ]
641
-- | Compute the best move for a given instance.
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
                  -> Bool              -- ^ Whether disk moves are allowed
                  -> Bool              -- ^ Whether instance moves are allowed
                  -> Bool              -- ^ Whether migration is restricted
                  -> Table             -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table             -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves inst_moves rest_mig
                  ini_tbl@(Table nl _ _ _) target =
  let opdx = Instance.pNode target
      osdx = Instance.sNode target
      -- the instance's own nodes are never valid targets
      bad_nodes = [opdx, osdx]
      nodes = filter (`notElem` bad_nodes) nodes_idx
      mir_type = Instance.mirrorType target
      use_secondary = elem osdx nodes_idx && inst_moves
      aft_failover = if mir_type == MirrorInternal && use_secondary
                       -- if drbd and allowed to failover
                       then checkSingleStep ini_tbl target ini_tbl Failover
                       else ini_tbl
      primary_drained = Node.offline
                        . flip Container.find nl
                        $ Instance.pNode target
      all_moves =
        if disk_moves
          then concatMap (possibleMoves mir_type use_secondary inst_moves
                          (rest_mig, primary_drained))
               nodes
          else []
  in
    -- iterate over the possible nodes for this instance
    foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
674
-- | Compute the best next move.
checkMove :: [Ndx]               -- ^ Allowed target node indices
          -> Bool                -- ^ Whether disk moves are allowed
          -> Bool                -- ^ Whether instance moves are allowed
          -> Bool                -- ^ Whether migration is restricted
          -> Table               -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table               -- ^ The new solution
checkMove nodes_idx disk_moves inst_moves rest_mig ini_tbl victims =
  let Table _ _ _ ini_plc = ini_tbl
      -- we're using rwhnf from the Control.Parallel.Strategies
      -- package; we don't need to use rnf as that would force too
      -- much evaluation in single-threaded cases, and in
      -- multi-threaded case the weak head normal form is enough to
      -- spark the evaluation
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
                             inst_moves rest_mig ini_tbl)
               victims
      -- iterate over all instances, computing the best move
      best_tbl = foldl' compareTables ini_tbl tables
      Table _ _ _ best_plc = best_tbl
  -- a longer placement list means some instance was actually moved
  in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
       else best_tbl
699
-- | Check if we are allowed to go deeper in the balancing.
doNextBalance :: Table -- ^ The starting table
              -> Int   -- ^ Remaining length
              -> Score -- ^ Score at which to stop
              -> Bool  -- ^ Whether another balancing round may run
doNextBalance (Table _ _ cur_cv placements) max_rounds min_score
  | cur_cv <= min_score = False          -- goal score already reached
  | max_rounds < 0      = True           -- negative limit means unlimited
  | otherwise           = length placements < max_rounds
709
-- | Run a balance move.
--
-- Returns 'Just' the improved table when this round found a move
-- whose gain satisfies the thresholds, 'Nothing' otherwise.
tryBalance :: Table       -- ^ The starting table
           -> Bool        -- ^ Allow disk moves
           -> Bool        -- ^ Allow instance moves
           -> Bool        -- ^ Only evacuate moves
           -> Bool        -- ^ Restrict migration
           -> Score       -- ^ Min gain threshold
           -> Score       -- ^ Min gain
           -> Maybe Table -- ^ The resulting table and commands
tryBalance ini_tbl disk_moves inst_moves evac_mode rest_mig mg_limit min_gain =
  let Table ini_nl ini_il ini_cv _ = ini_tbl
      all_inst = Container.elems ini_il
      all_nodes = Container.elems ini_nl
      (offline_nodes, online_nodes) = partition Node.offline all_nodes
      -- in evacuation mode, restrict to instances touching offline nodes
      all_inst' = if evac_mode
                    then let bad_nodes = map Node.idx offline_nodes
                         in filter (any (`elem` bad_nodes) .
                                    Instance.allNodes) all_inst
                    else all_inst
      reloc_inst = filter (\i -> Instance.movable i &&
                                 Instance.autoBalance i) all_inst'
      node_idx = map Node.idx online_nodes
      fin_tbl = checkMove node_idx disk_moves inst_moves rest_mig
                ini_tbl reloc_inst
      (Table _ _ fin_cv _) = fin_tbl
  in
    -- accept only when the score improved and the gain is either above
    -- the absolute limit or at least min_gain
    if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
      then Just fin_tbl -- this round made success, return the new table
      else Nothing
739
740 -- * Allocation functions
741
-- | Build failure stats out of a list of failures: for each possible
-- failure mode, count its occurrences in the input list.
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
  [ (fmode, length (filter (== fmode) flst)) | fmode <- [minBound..maxBound] ]
747
-- | Compares two Maybe AllocElement and chooses the best score
-- (lower cluster score wins; a present element beats an absent one).
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement lhs Nothing = lhs
bestAllocElement Nothing rhs = rhs
bestAllocElement lhs@(Just (_, _, _, lscore)) rhs@(Just (_, _, _, rscore))
  | lscore < rscore = lhs
  | otherwise       = rhs
756
-- | Update current Allocation solution and failure stats with new
-- elements.
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
-- a failed attempt only records the failure mode
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }

concatAllocs as (Ok ns) =
  let -- Choose the old or new solution, based on the cluster score
    cntok = asAllocs as
    osols = asSolution as
    nsols = bestAllocElement osols (Just ns)
    nsuc = cntok + 1
    -- Note: we force evaluation of nsols here in order to keep the
    -- memory profile low - we know that we will need nsols for sure
    -- in the next cycle, so we force evaluation of nsols, since the
    -- foldl' in the caller will only evaluate the tuple, but not the
    -- elements of the tuple
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }
774
-- | Sums two 'AllocSolution' structures.
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
          (AllocSolution bFails bAllocs bSols bLog) =
  -- note: we add b first, since usually it will be smaller; when
  -- fold'ing, a will grow and grow whereas b is the per-group
  -- result, hence smaller
  AllocSolution (bFails ++ aFails)
                (aAllocs + bAllocs)
                (bestAllocElement aSols bSols)
                (bLog ++ aLog)
787
-- | Given a solution, generates a reasonable description for it.
describeSolution :: AllocSolution -> String
describeSolution as =
  let fcnt = asFailures as
      sols = asSolution as
      -- human-readable "mode: count" summary of non-zero failure modes
      freasons =
        intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
        filter ((> 0) . snd) . collapseFailures $ fcnt
  in case sols of
       Nothing -> "No valid allocation solutions, failure reasons: " ++
                  (if null fcnt then "unknown reasons" else freasons)
       Just (_, _, nodes, cv) ->
         printf ("score: %.8f, successes %d, failures %d (%s)" ++
                 " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
               (intercalate "/" . map Node.name $ nodes)
803
-- | Prepend a human-readable description of a solution to its own log.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution sol = sol { asLog = summary : asLog sol }
  where summary = describeSolution sol
807
-- | Reverses an evacuation solution.
--
-- Results are accumulated by prepending to the lists, so before the
-- solution is handed out for jobset execution every list has to be
-- put back into chronological order.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution (EvacSolution failed moved opcodes) =
  EvacSolution (reverse failed) (reverse moved) (reverse opcodes)
815
-- | Generate the valid node allocation singles or pairs for a new instance.
genAllocNodes :: Group.List        -- ^ Group list
              -> Node.List         -- ^ The node map
              -> Int               -- ^ The number of nodes required
              -> Bool              -- ^ Whether to drop or not
                                   -- unallocable nodes
              -> Result AllocNodes -- ^ The (monadic) result
genAllocNodes gl nl count drop_unalloc =
  case count of
    1 -> Ok . Left $ map Node.idx usable
    2 -> Ok . Right $ filter (not . null . snd) pairs
    _ -> Bad "Unsupported number of nodes, only one or two supported"
  where
    -- Does the node's group accept allocations?
    allocable = Group.isAllocable . flip Container.find gl . Node.group
    -- Online nodes, optionally restricted to allocable groups.
    usable = if drop_unalloc
               then filter allocable (getOnline nl)
               else getOnline nl
    -- For a given primary, all distinct candidate secondaries in the
    -- same node group.
    partners p = [ Node.idx s
                 | s <- usable
                 , Node.idx p /= Node.idx s
                 , Node.group p == Node.group s ]
    pairs = [ (Node.idx p, partners p) | p <- usable ]
838
-- | Try to allocate an instance on the cluster.
--
-- Depending on the shape of the allocation targets this either works
-- on (primary, secondary) pairs or on single nodes. The pair variant
-- evaluates the per-primary searches in parallel via 'parMap'; both
-- variants annotate the final solution with a description.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> AllocNodes        -- ^ The allocation targets
         -> m AllocSolution   -- ^ Possible solution list
tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
tryAlloc nl _ inst (Right ok_pairs) =
  -- Pair allocation: for every primary, fold the allocation attempt
  -- over its candidate secondaries; rwhnf keeps the parallel sparks
  -- cheap (each per-primary solution is evaluated to WHNF only).
  let cstat = compClusterStatistics $ Container.elems nl
      psols = parMap rwhnf (\(p, ss) ->
                              foldl' (\cstate ->
                                        concatAllocs cstate .
                                        allocateOnPair cstat nl inst p)
                              emptyAllocSolution ss) ok_pairs
      -- Merge the per-primary partial solutions into one.
      sols = foldl' sumAllocs emptyAllocSolution psols
  in return $ annotateSolution sols

tryAlloc _ _ _ (Left []) = fail "No online nodes"
tryAlloc nl _ inst (Left all_nodes) =
  -- Single-node allocation: fold the attempt over all candidates.
  let sols = foldl' (\cstate ->
                       concatAllocs cstate . allocateOnSingle nl inst
                    ) emptyAllocSolution all_nodes
  in return $ annotateSolution sols
863
-- | Given a group/result, describe it as a nice (list of) messages.
solutionDescription :: (Group.Group, Result AllocSolution)
                    -> [String]
solutionDescription (grp, result) =
  case result of
    -- One message per log entry of the solution.
    Ok sol  -> map (\msg -> printf "Group %s (%s): %s" gname pol msg)
                   (asLog sol)
    Bad why -> [printf "Group %s: error %s" gname why]
  where gname = Group.name grp
        pol = allocPolicyToRaw (Group.allocPolicy grp)
873
-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result. Note that the result will be
-- reversed compared to the original list.
filterMGResults :: [(Group.Group, Result AllocSolution)]
                -> [(Group.Group, AllocSolution)]
filterMGResults = foldl' pick []
  where
    -- Keep only successful results that found an actual allocation
    -- and whose group accepts allocations.
    pick acc (grp, rsol) =
      case rsol of
        Ok sol | Group.isAllocable grp && isJust (asSolution sol) ->
          (grp, sol) : acc
        _ -> acc
887
-- | Sort multigroup results based on policy and score.
sortMGResults :: [(Group.Group, AllocSolution)]
              -> [(Group.Group, AllocSolution)]
sortMGResults = sortBy (comparing rank)
  where
    -- Rank first by allocation policy, then by solution score. The
    -- 'fromJust' is safe for inputs coming from 'filterMGResults',
    -- which only keeps solutions with a 'Just' allocation.
    rank (grp, sol) = (Group.allocPolicy grp, score sol)
    score = (\(_, _, _, cv) -> cv) . fromJust . asSolution
896
-- | Removes node groups which can't accommodate the instance
filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
                  -> Instance.Instance
                  -> ([(Group.Group, (Node.List, Instance.List))], [String])
filterValidGroups [] _ = ([], [])
filterValidGroups (ng:ngs) inst
  | connected = (ng:rest, msgs)
  | otherwise = (rest, errmsg:msgs)
  where
    (rest, msgs) = filterValidGroups ngs inst
    -- A NIC with no network set poses no constraint on the group.
    nicOk nic = case Nic.network nic of
                  Just net -> net `elem` Group.networks (fst ng)
                  Nothing -> True
    connected = all nicOk (Instance.nics inst)
    errmsg = "group " ++ Group.name (fst ng) ++
             " is not connected to a network required by instance " ++
             Instance.name inst
914
-- | Finds the best group for an instance on a multi-group cluster.
--
-- Only solutions in @preferred@ and @last_resort@ groups will be
-- accepted as valid, and additionally if the allowed groups parameter
-- is not null then allocation will only be run for those group
-- indices.
findBestAllocGroup :: Group.List           -- ^ The group list
                   -> Node.List            -- ^ The node list
                   -> Instance.List        -- ^ The instance list
                   -> Maybe [Gdx]          -- ^ The allowed groups
                   -> Instance.Instance    -- ^ The instance to allocate
                   -> Int                  -- ^ Required number of nodes
                   -> Result (Group.Group, AllocSolution, [String])
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
  let groups_by_idx = splitCluster mgnl mgil
      -- Resolve each group index into its full 'Group.Group' record.
      groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
      -- Restrict to explicitly allowed groups, if any were given.
      groups' = maybe groups
                (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
                allowed_gdxs
      -- Drop groups lacking a network the instance requires.
      (groups'', filter_group_msgs) = filterValidGroups groups' inst
      -- Run a full allocation attempt in every remaining group.
      sols = map (\(gr, (nl, il)) ->
                   (gr, genAllocNodes mggl nl cnt False >>=
                        tryAlloc nl il inst))
             groups''::[(Group.Group, Result AllocSolution)]
      all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
      goodSols = filterMGResults sols
      sortedSols = sortMGResults goodSols
  in case sortedSols of
       [] -> Bad $ if null groups'
                     then "no groups for evacuation: allowed groups was " ++
                          show allowed_gdxs ++ ", all groups: " ++
                          show (map fst groups)
                     else intercalate ", " all_msgs
       -- Solutions are sorted best-first, so the head wins.
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
949
-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt =
  findBestAllocGroup mggl mgnl mgil Nothing inst cnt >>=
  \(grp, sol, msgs) ->
    -- Record which group won on top of the collected messages.
    let note = "Selected group: " ++ Group.name grp
    in return sol { asLog = note : msgs }
963
-- | Calculate the new instance list after allocation solution.
updateIl :: Instance.List            -- ^ The original instance list
         -> Maybe Node.AllocElement  -- ^ The result of the allocation attempt
         -> Instance.List            -- ^ The updated instance list
updateIl il alloc =
  case alloc of
    Nothing -> il
    -- A freshly allocated instance gets the next free index, which is
    -- the current container size.
    Just (_, newInst, _, _) -> Container.add (Container.size il) newInst il
970
-- | Extract the new node list from the allocation solution.
extractNl :: Node.List                -- ^ The original node list
          -> Maybe Node.AllocElement  -- ^ The result of the allocation attempt
          -> Node.List                -- ^ The new node list
extractNl nl = maybe nl (\(newNl, _, _, _) -> newNl)
977
-- | Try to allocate a list of instances on a multi-group cluster.
allocList :: Group.List                  -- ^ The group list
          -> Node.List                   -- ^ The node list
          -> Instance.List               -- ^ The instance list
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
          -> AllocSolutionList           -- ^ Possible solution list
          -> Result (Node.List, Instance.List,
                     AllocSolutionList)  -- ^ The final solution list
allocList _ nl il [] result = Ok (nl, il, result)
allocList gl nl il ((inst, cnt):rest) acc = do
  -- Allocate the head instance, then thread the updated node and
  -- instance lists through the recursion for the remaining ones.
  asol <- tryMGAlloc gl nl il inst cnt
  let chosen = asSolution asol
  allocList gl (extractNl nl chosen) (updateIl il chosen)
            rest ((inst, asol):acc)
993
-- | Function which fails if the requested mode is change secondary.
--
-- Except for DRBD, no disk template can execute a change-secondary
-- move, so callers use this guard to reject that mode up front; once
-- it has passed, whatever mode remains is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange ChangeSecondary dt =
  fail ("Instances with disk template '" ++ diskTemplateToRaw dt ++
        "' can't execute change secondary")
failOnSecondaryChange _ _ = return ()
1005
-- | Run evacuation for a single instance.
--
-- /Note:/ this function should correctly execute both intra-group
-- evacuations (in all modes) and inter-group evacuations (in the
-- 'ChangeAll' mode). Of course, this requires that the correct list
-- of target nodes is passed.
--
-- Dispatch is by disk template: templates without a secondary node
-- (diskless, shared-file, block, RBD, ext, gluster) can only move
-- their single node; plain and file instances cannot be relocated at
-- all; DRBD supports all three evacuation modes.
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
                 -> Instance.List     -- ^ Instance list (cluster-wide)
                 -> EvacMode          -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
                 -> Gdx               -- ^ The group we're targeting
                 -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
-- Diskless: no secondary, so only a single-node move is possible.
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTDiskless})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

-- Plain (lvm) disks are tied to their node and cannot move.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTPlain}) _ _ =
                  fail "Instances of type plain cannot be relocated"

-- Local file storage is likewise tied to its node.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTFile}) _ _ =
                  fail "Instances of type file cannot be relocated"

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTSharedFile})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTBlock})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTRbd})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTExt})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTGluster})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

-- DRBD, changing the primary: a plain failover to the existing
-- secondary is all that's needed.
nodeEvacInstance nl il ChangePrimary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 _ _ =
  do
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx Failover
    return (nl', il', ops)

-- DRBD, changing the secondary: pick the best replacement node.
nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  evacOneNodeOnly nl il inst gdx avail_nodes

-- The algorithm for ChangeAll is as follows:
--
-- * generate all (primary, secondary) node pairs for the target groups
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
--   the final node list state and group score
-- * select the best choice via a foldl that uses the same Either
--   String solution as the ChangeSecondary mode
nodeEvacInstance nl il ChangeAll
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  do
    let no_nodes = Left "no nodes available"
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
    (nl', il', ops, _) <-
        annotateResult "Can't find any good nodes for relocation" .
        eitherToResult $
        foldl'
        (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
                          Bad msg ->
                            case accu of
                              Right _ -> accu
                              -- we don't need more details (which
                              -- nodes, etc.) as we only selected
                              -- this group if we can allocate on
                              -- it, hence failures will not
                              -- propagate out of this fold loop
                              Left _ -> Left $ "Allocation failed: " ++ msg
                          Ok result@(_, _, _, new_cv) ->
                            let new_accu = Right result in
                            case accu of
                              Left _ -> new_accu
                              Right (_, _, _, old_cv) ->
                                -- Keep whichever candidate has the
                                -- lower (better) cluster score.
                                if old_cv < new_cv
                                  then accu
                                  else new_accu
        ) no_nodes node_pairs

    return (nl', il', ops)
1117
-- | Generic function for changing one node of an instance.
--
-- This is similar to 'nodeEvacInstance' but will be used in a few of
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
-- over the list of available nodes, which results in the best choice
-- for relocation.
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
                -> Instance.List     -- ^ Instance list (cluster-wide)
                -> Instance.Instance -- ^ The instance to be evacuated
                -> Gdx               -- ^ The group we're targeting
                -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
evacOneNodeOnly nl il inst gdx avail_nodes = do
  -- Choose the move constructor based on the mirroring type:
  -- internally mirrored (DRBD) instances replace their secondary,
  -- externally mirrored ones fail over to an arbitrary node;
  -- non-mirrored instances cannot be moved at all.
  op_fn <- case Instance.mirrorType inst of
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
             MirrorInternal -> Ok ReplaceSecondary
             MirrorExternal -> Ok FailoverToAny
  -- Fold over all candidate nodes, keeping the best-scoring result;
  -- a Left value means no valid target was found.
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
                          eitherToResult $
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
                          (Left "no nodes available") avail_nodes
  let idx = Instance.idx inst
      il' = Container.add idx inst' il
      ops = iMoveToJob nl' il' idx (op_fn ndx)
  return (nl', il', ops)
1144
-- | Inner fold function for changing one node of an instance.
--
-- Depending on the instance disk template, this will either change
-- the secondary (for DRBD) or the primary node (for shared
-- storage). However, the operation is generic otherwise.
--
-- The running solution is either a @Left String@, which means we
-- don't have yet a working solution, or a @Right (...)@, which
-- represents a valid solution; it holds the modified node list, the
-- modified instance (after evacuation), the score of that solution,
-- and the new secondary node index.
evacOneNodeInner :: Node.List         -- ^ Cluster node list
                 -> Instance.Instance -- ^ Instance being evacuated
                 -> Gdx               -- ^ The group index of the instance
                 -> (Ndx -> IMove)    -- ^ Operation constructor
                 -> EvacInnerState    -- ^ Current best solution
                 -> Ndx               -- ^ Node we're evaluating as target
                 -> EvacInnerState    -- ^ New best solution
evacOneNodeInner nl inst gdx op_fn accu ndx =
  case applyMove nl inst (op_fn ndx) of
    -- A failed move only replaces the accumulator if we had no valid
    -- solution yet (i.e. the accumulator is still a Left).
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
                             " failed: " ++ show fm
              in either (const $ Left fail_msg) (const accu) accu
    Ok (nl', inst', _, _) ->
      let nodes = Container.elems nl'
          -- The fromJust below is ugly (it can fail nastily), but
          -- at this point we should not have any internal mismatches,
          -- and adding a monad here would be quite involved
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
          -- Score only the nodes of the target group.
          new_cv = compCVNodes grpnodes
          new_accu = Right (nl', inst', new_cv, ndx)
      in case accu of
           Left _ -> new_accu
           Right (_, _, old_cv, _) ->
             -- Lower score is better; ties keep the earlier result.
             if old_cv < new_cv
               then accu
               else new_accu
1182
-- | Compute result of changing all nodes of a DRBD instance.
--
-- Given the target primary and secondary node (which might be in a
-- different group or not), this function will 'execute' all the
-- required steps and assuming all operations succeed, will return
-- the modified node and instance lists, the opcodes needed for this
-- and the new group score.
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
                 -> Instance.List     -- ^ Cluster instance list
                 -> Instance.Instance -- ^ The instance to be moved
                 -> Gdx               -- ^ The target group index
                                      -- (which can differ from the
                                      -- current group of the
                                      -- instance)
                 -> (Ndx, Ndx)        -- ^ Tuple of new
                                      -- primary\/secondary nodes
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
  let primary = Container.find (Instance.pNode inst) nl
      idx = Instance.idx inst
  -- if the primary is offline, then we first failover
  (nl1, inst1, ops1) <-
    if Node.offline primary
      then do
        (nl', inst', _, _) <-
          annotateResult "Failing over to the secondary" .
          opToResult $ applyMove nl inst Failover
        return (nl', inst', [Failover])
      else return (nl, inst, [])
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
                      Failover,
                      ReplaceSecondary t_sdx)
  -- we now need to execute a replace secondary to the future
  -- primary node
  (nl2, inst2, _, _) <-
    annotateResult "Changing secondary to new primary" .
    opToResult $
    applyMove nl1 inst1 o1
  let ops2 = o1:ops1
  -- we now execute another failover, the primary stays fixed now
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
                        opToResult $ applyMove nl2 inst2 o2
  let ops3 = o2:ops2
  -- and finally another replace secondary, to the final secondary
  (nl4, inst4, _, _) <-
    annotateResult "Changing secondary to final secondary" .
    opToResult $
    applyMove nl3 inst3 o3
  let ops4 = o3:ops3
      il' = Container.add idx inst4 il
      -- Moves were accumulated newest-first; reverse before turning
      -- them into jobs so they execute in the right order.
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
  let nodes = Container.elems nl4
      -- The fromJust below is ugly (it can fail nastily), but
      -- at this point we should not have any internal mismatches,
      -- and adding a monad here would be quite involved
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
      new_cv = compCVNodes grpnodes
  return (nl4, il', ops, new_cv)
1241
-- | Computes the nodes in a given group which are available for
-- allocation.
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
                    -> Gdx            -- ^ The group for which we
                                      -- query the nodes
                    -> Result [Ndx]   -- ^ List of available node indices
availableGroupNodes group_nodes excl_ndx gdx =
  case lookup gdx group_nodes of
    Nothing -> Bad $ "Can't find group with index " ++ show gdx
    Just members ->
      -- Keep only the members not present in the exclusion set.
      Ok [ndx | ndx <- members, not (ndx `IntSet.member` excl_ndx)]
1254
-- | Updates the evac solution with the results of an instance
-- evacuation.
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
                   -> Idx
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
                   -> (Node.List, Instance.List, EvacSolution)
-- Failure: keep the cluster state untouched, record the reason.
updateEvacSolution (nl, il, es) idx (Bad msg) =
  (nl, il, es { esFailed = (idx, msg):esFailed es})
-- Success: adopt the new cluster state and record the move plus its
-- opcodes.
updateEvacSolution (_, _, es) idx (Ok (nl, il, opcodes)) =
  let inst = Container.find idx il
      moved = (idx,
               instancePriGroup nl inst,
               Instance.allNodes inst)
  in (nl, il, es { esMoved = moved:esMoved es
                 , esOpCodes = opcodes:esOpCodes es })
1270
-- | Node-evacuation IAllocator mode main function.
--
-- Folds 'nodeEvacInstance' over the requested instances, threading
-- the evolving cluster state, and reverses the accumulated solution
-- at the end so jobs come out in execution order.
tryNodeEvac :: Group.List    -- ^ The cluster groups
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
            -> Instance.List -- ^ Instance list (cluster-wide)
            -> EvacMode      -- ^ The evacuation mode
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
            -> Result (Node.List, Instance.List, EvacSolution)
tryNodeEvac _ ini_nl ini_il mode idxs =
  let -- Nodes that currently host the instances being evacuated,
      -- plus all offline nodes, may not receive anything.
      evac_ndx = nodesToEvacuate ini_il mode idxs
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
      -- Per-group association list of member node indices.
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                           (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  let gdx = instancePriGroup nl inst
                      pdx = Instance.pNode inst in
                  -- Also exclude the instance's own primary node as a
                  -- target, then attempt the evacuation.
                  updateEvacSolution state (Instance.idx inst) $
                  availableGroupNodes group_ndx
                    (IntSet.insert pdx excl_ndx) gdx >>=
                      nodeEvacInstance nl il mode inst gdx
               )
        (ini_nl, ini_il, emptyEvacSolution)
        (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1297
-- | Change-group IAllocator mode main function.
--
-- This is very similar to 'tryNodeEvac', the only difference is that
-- we don't choose as target group the current instance group, but
-- instead:
--
-- 1. at the start of the function, we compute which are the target
-- groups; either no groups were passed in, in which case we choose
-- all groups out of which we don't evacuate instance, or there were
-- some groups passed, in which case we use those
--
-- 2. for each instance, we use 'findBestAllocGroup' to choose the
-- best group to hold the instance, and then we do what
-- 'tryNodeEvac' does, except for this group instead of the current
-- instance group.
--
-- Note that the correct behaviour of this function relies on the
-- function 'nodeEvacInstance' to be able to do correctly both
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
tryChangeGroup :: Group.List    -- ^ The cluster groups
               -> Node.List     -- ^ The node list (cluster-wide)
               -> Instance.List -- ^ Instance list (cluster-wide)
               -> [Gdx]         -- ^ Target groups; if empty, any
                                -- groups not being evacuated
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
               -> Result (Node.List, Instance.List, EvacSolution)
tryChangeGroup gl ini_nl ini_il gdxs idxs =
  let -- Groups the evacuated instances currently live in; note that
      -- 'nub' is quadratic, acceptable for the small number of groups.
      evac_gdxs = nub $ map (instancePriGroup ini_nl .
                             flip Container.find ini_il) idxs
      -- Target groups are never the source groups.
      target_gdxs = (if null gdxs
                       then Container.keys gl
                       else gdxs) \\ evac_gdxs
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                           (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  -- For each instance: find the best target group,
                  -- then evacuate into it in ChangeAll mode.
                  let solution = do
                        let ncnt = Instance.requiredNodes $
                                   Instance.diskTemplate inst
                        (grp, _, _) <- findBestAllocGroup gl nl il
                                       (Just target_gdxs) inst ncnt
                        let gdx = Group.idx grp
                        av_nodes <- availableGroupNodes group_ndx
                                    excl_ndx gdx
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
                  in updateEvacSolution state (Instance.idx inst) solution
               )
        (ini_nl, ini_il, emptyEvacSolution)
        (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1351
-- | Standard-sized allocation method.
--
-- This places instances of the same size on the cluster until we're
-- out of space. The result will be a list of identically-sized
-- instances.
iterateAlloc :: AllocMethod
iterateAlloc nl il limit newinst allocnodes ixes cstats =
  let depth = length ixes
      -- Each new instance gets a synthetic name and the next free index.
      newname = printf "new-%d" depth::String
      newidx = Container.size il
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
      -- Count down the optional allocation limit.
      newlimit = fmap (flip (-) 1) limit
  in case tryAlloc nl il newi2 allocnodes of
       Bad s -> Bad s
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
         case sols3 of
           -- No placement found: stop with the state accumulated so far.
           Nothing -> newsol
           Just (xnl, xi, _, _) ->
             if limit == Just 0
               then newsol
               -- Otherwise recurse with the updated cluster state and
               -- per-step resource statistics.
               else iterateAlloc xnl (Container.add newidx xi il)
                      newlimit newinst allocnodes (xi:ixes)
                      (totalResources xnl:cstats)
1376
-- | Predicate whether shrinking a single resource can lead to a valid
-- allocation.
--
-- Repeatedly shrinks the instance along the given failure mode and
-- returns the first shrunken instance for which the allocation
-- function finds a solution; the search stops as soon as the failure
-- mode no longer appears (and no solution exists), yielding Nothing.
sufficesShrinking :: (Instance.Instance -> AllocSolution) -> Instance.Instance
                  -> FailMode -> Maybe Instance.Instance
sufficesShrinking allocFn inst fm =
  case dropWhile (isNothing . asSolution . fst)
       . takeWhile (liftA2 (||) (elem fm . asFailures . fst)
                                (isJust . asSolution . fst))
       . map (allocFn &&& id) $
       iterateOk (`Instance.shrinkByType` fm) inst
  of x:_ -> Just . snd $ x
     _ -> Nothing
1389
-- | Tiered allocation method.
--
-- This places instances on the cluster, and decreases the spec until
-- we can allocate again. The result will be a list of decreasing
-- instance specs.
tieredAlloc :: AllocMethod
tieredAlloc nl il limit newinst allocnodes ixes cstats =
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
    Bad s -> Bad s
    Ok (errs, nl', il', ixes', cstats') ->
      let newsol = Ok (errs, nl', il', ixes', cstats')
          ixes_cnt = length ixes'
          (stop, newlimit) = case limit of
                               Nothing -> (False, Nothing)
                               Just n -> (n <= ixes_cnt,
                                            Just (n - ixes_cnt))
          -- Failure modes ordered by frequency, most common last.
          sortedErrs = map fst $ sortBy (comparing snd) errs
          suffShrink = sufficesShrinking (fromMaybe emptyAllocSolution
                                          . flip (tryAlloc nl' il') allocnodes)
                       newinst
          bigSteps = filter isJust . map suffShrink . reverse $ sortedErrs
          -- Progress means the recursive attempt placed strictly more
          -- instances than we have now.
          progress (Ok (_, _, _, newil', _)) (Ok (_, _, _, newil, _)) =
            length newil' > length newil
          progress _ _ = False
      in if stop then newsol else
           -- NOTE(review): 'last sortedErrs' is partial; this appears
           -- to rely on iterateAlloc stopping only with at least one
           -- recorded failure when the limit was not hit -- confirm.
           let newsol' = case Instance.shrinkByType newinst . last
                              $ sortedErrs of
                           Bad _ -> newsol
                           Ok newinst' -> tieredAlloc nl' il' newlimit
                                          newinst' allocnodes ixes' cstats'
           in if progress newsol' newsol then newsol' else
                -- Shrinking by the most common failure did not help;
                -- fall back to the precomputed bigger shrink steps.
                case bigSteps of
                  Just newinst':_ -> tieredAlloc nl' il' newlimit
                                     newinst' allocnodes ixes' cstats'
                  _ -> newsol
1425
1426 -- * Formatting functions
1427
1428 -- | Given the original and final nodes, computes the relocation description.
1429 computeMoves :: Instance.Instance -- ^ The instance to be moved
1430 -> String -- ^ The instance name
1431 -> IMove -- ^ The move being performed
1432 -> String -- ^ New primary
1433 -> String -- ^ New secondary
1434 -> (String, [String])
1435 -- ^ Tuple of moves and commands list; moves is containing
1436 -- either @/f/@ for failover or @/r:name/@ for replace
1437 -- secondary, while the command list holds gnt-instance
1438 -- commands (without that prefix), e.g \"@failover instance1@\"
1439 computeMoves i inam mv c d =
1440 case mv of
1441 Failover -> ("f", [mig])
1442 FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
1443 FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
1444 ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
1445 ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
1446 ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
1447 where morf = if Instance.isRunning i then "migrate" else "failover"
1448 mig = printf "%s -f %s" morf inam::String
1449 mig_any = printf "%s -f -n %s %s" morf c inam::String
1450 rep n = printf "replace-disks -n %s %s" n inam::String
1451
-- | Converts a placement to string format.
--
-- Produces a formatted table row (old location => new location,
-- score, move description) plus the gnt-instance command list for the
-- move.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
  let -- Width of a "primary:secondary" column.
      pmlen = (2*nmlen + 1)
      (i, p, s, mv, c) = plc
      old_sec = Instance.sNode inst
      inst = Container.find i il
      inam = Instance.alias inst
      npri = Node.alias $ Container.find p nl
      nsec = Node.alias $ Container.find s nl
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
      osec = Node.alias $ Container.find old_sec nl
      (moves, cmds) = computeMoves inst inam mv npri nsec
      -- FIXME: this should check instead/also the disk template
      ostr = if old_sec == Node.noSecondary
               then printf "%s" opri::String
               else printf "%s:%s" opri osec::String
      nstr = if s == Node.noSecondary
               then printf "%s" npri::String
               else printf "%s:%s" npri nsec::String
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
      pos imlen inam pmlen ostr pmlen nstr c moves,
      cmds)
1482
-- | Return the instance and involved nodes in an instance move.
--
-- Note that the output list length can vary, and is not required nor
-- guaranteed to be of any specific length.
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
                               -- the instance from its index; note
                               -- that this /must/ be the original
                               -- instance list, so that we can
                               -- retrieve the old nodes
              -> Placement     -- ^ The placement we're investigating,
                               -- containing the new nodes and
                               -- instance index
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il (iidx, new_pri, new_sec, _, _) =
  -- Old and new nodes combined, deduplicated, with the sentinel
  -- "no secondary" indices (negative) dropped.
  nub $ filter (>= 0) (new_pri : new_sec : Instance.allNodes inst)
  where inst = Container.find iidx il
1500
-- | From two adjacent cluster tables get the list of moves that
-- transitions from one to the other.
getMoves :: (Table, Table) -> [MoveJob]
getMoves (Table _ initial_il _ initial_plc, Table final_nl _ _ final_plc) =
  map toMoveJob fresh_plcs
  where
    -- Placements accumulate at the head of the list, so the moves new
    -- in the final table are what remains after stripping the initial
    -- placements off the reversed final list.
    fresh_plcs = reverse . drop (length initial_plc) $ reverse final_plc
    toMoveJob plc@(idx, p, s, mv, _) =
      let inst = Container.find idx initial_il
          instName = Instance.name inst
          affected = involvedNodes initial_il plc
          newPri = Node.alias $ Container.find p final_nl
          newSec = Node.alias $ Container.find s final_nl
          (_, cmds) = computeMoves inst instName mv newPri newSec
      in (affected, idx, mv, cmds)
1515
-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
--
-- The second accumulator component is the set of node indices touched
-- by the current jobset; a job can join the jobset only if it shares
-- no nodes with it (i.e. the jobs are independent).
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
  -- Independent of the current jobset: join it and widen the buffer.
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
  -- Conflicting nodes: start a fresh jobset seeded with this job.
  | otherwise = ([n]:cjs, ndx)
1523
-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
--
-- Uses the strict 'foldl'' (matching every other fold in this
-- module) instead of the lazy 'foldl', which would build up a chain
-- of thunks proportional to the number of moves.
splitJobs :: [MoveJob] -> [JobSet]
splitJobs = fst . foldl' mergeJobs ([], [])
1528
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
--
-- Renders one job as shell lines; the first job of a jobset (sn == 1)
-- is preceded by a jobset banner.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
  let out =
        printf "  echo job %d/%d" jsn sn:
        printf "  check":
        map ("  gnt-instance " ++) cmds
  in if sn == 1
       then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ out
       else out
1540
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatCmds :: [JobSet] -> String
formatCmds jobsets =
  unlines $ concatMap render (zip [1..] jobsets)
  where
    -- Render one jobset: each contained job is numbered and knows the
    -- size of the set it belongs to.
    render (setnum, jobs) =
      concatMap (formatJob setnum (length jobs)) (zip [1..] jobs)
1549
-- | Print the node list.
printNodes :: Node.List -> [String] -> String
printNodes nl fs =
  printTable "" header rows isnum
  where
    -- "+" as first element means: default fields plus the rest.
    fields = case fs of
               []       -> Node.defaultFields
               "+":rest -> Node.defaultFields ++ rest
               _        -> fs
    sorted = sortBy (comparing Node.idx) (Container.elems nl)
    (header, isnum) = unzip $ map Node.showHeader fields
    rows = map (Node.list fields) sorted
1560
-- | Print the instance list.
--
-- Builds one table row per instance (sorted by instance index):
-- running flag, name, primary/secondary node names, auto-balance
-- flag, resource figures and the four dynamic utilisation values.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il =
  let sil = sortBy (comparing Instance.idx) (Container.elems il)
      -- Row builder for a single instance.
      helper inst = [ if Instance.isRunning inst then "R" else " "
                    , Instance.name inst
                    , Container.nameOf nl (Instance.pNode inst)
                    -- Instances without a secondary node show an empty cell.
                    , let sdx = Instance.sNode inst
                      in if sdx == Node.noSecondary
                           then ""
                           else Container.nameOf nl sdx
                    , if Instance.autoBalance inst then "Y" else "N"
                    , printf "%3d" $ Instance.vcpus inst
                    , printf "%5d" $ Instance.mem inst
                    -- disk scaled down by 1024 for display (MiB -> GiB,
                    -- presumably -- verify against Instance.dsk's unit)
                    , printf "%5d" $ Instance.dsk inst `div` 1024
                    , printf "%5.3f" lC
                    , printf "%5.3f" lM
                    , printf "%5.3f" lD
                    , printf "%5.3f" lN
                    ]
        where DynUtil lC lM lD lN = Instance.util inst
      header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
               , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
      -- First five columns are text, the remaining ones numeric
      -- (controls alignment in printTable).
      isnum = False:False:False:False:False:repeat True
  in printTable "" header (map helper sil) isnum
1586
-- | Shows statistics for a given node list.
printStats :: String -> Node.List -> String
printStats lp nl =
  let values = compDetailedCV $ Container.elems nl
      (weights, names) = unzip detailedCVInfo
      -- Pad weights/names with defaults so extra coefficients beyond
      -- the known ones still get a row.
      rows = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") values
      render (w, h, val) = [ h
                           , printf "%.8f" val
                           , printf "x%.2f" w
                           ]
  in printTable lp [ "Field", "Value", "Weight" ]
       (map render rows) (False:repeat True)
1600
-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List        -- ^ The node list; only used for node
                               -- names, so any version is good
                               -- (before or after the operation)
           -> Instance.List    -- ^ The instance list; also used for
                               -- names only
           -> Idx              -- ^ The index of the instance being
                               -- moved
           -> IMove            -- ^ The actual move to be described
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
                               -- the given move
iMoveToJob nl il idx move =
  let inst = Container.find idx il
      iname = Instance.name inst
      -- Resolve a node index to its (non-empty) name; an empty node
      -- name is a programming error, hence the hard 'error'.
      lookNode  n = case mkNonEmpty (Container.nameOf nl n) of
                      -- FIXME: convert htools codebase to non-empty strings
                      Bad msg -> error $ "Empty node name for idx " ++
                                 show n ++ ": " ++ msg ++ "??"
                      Ok ne -> Just ne
      -- Template migrate opcode (no explicit target node).
      opF = OpCodes.OpInstanceMigrate
              { OpCodes.opInstanceName        = iname
              , OpCodes.opInstanceUuid        = Nothing
              , OpCodes.opMigrationMode       = Nothing -- default
              , OpCodes.opOldLiveMode         = Nothing -- default as well
              , OpCodes.opTargetNode          = Nothing -- this is drbd
              , OpCodes.opTargetNodeUuid      = Nothing
              , OpCodes.opAllowRuntimeChanges = False
              , OpCodes.opIgnoreIpolicy       = False
              , OpCodes.opMigrationCleanup    = False
              , OpCodes.opIallocator          = Nothing
              , OpCodes.opAllowFailover       = True }
      -- Migrate/failover to a specific node (non-DRBD case).
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
      -- Replace-disks opcode targeting node n as the new secondary.
      opR n = OpCodes.OpInstanceReplaceDisks
                { OpCodes.opInstanceName     = iname
                , OpCodes.opInstanceUuid     = Nothing
                , OpCodes.opEarlyRelease     = False
                , OpCodes.opIgnoreIpolicy    = False
                , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
                , OpCodes.opReplaceDisksList = []
                , OpCodes.opRemoteNode       = lookNode n
                , OpCodes.opRemoteNodeUuid   = Nothing
                , OpCodes.opIallocator       = Nothing
                }
  -- Each move variant expands to the opcode sequence implementing it;
  -- compound moves combine failover and replace-disks steps in order.
  in case move of
       Failover -> [ opF ]
       FailoverToAny np -> [ opFA np ]
       ReplacePrimary np -> [ opF, opR np, opF ]
       ReplaceSecondary ns -> [ opR ns ]
       ReplaceAndFailover np -> [ opR np, opF ]
       FailoverAndReplace ns -> [ opF, opR ns ]
1651
1652 -- * Node group functions
1653
-- | Computes the group of an instance.
--
-- Looks up the groups of the primary and (if any) secondary node;
-- returns the common group, or fails if the instance spans two node
-- groups. Instances without a secondary use the primary's group for
-- both sides, so they always succeed.
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      snode = if sidx == Node.noSecondary
                then pnode
                else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
       -- fixed typo in the user-visible error message
       -- ("accross" -> "across")
       then fail ("Instance placed across two node groups, primary " ++
                  show pgroup ++ ", secondary " ++ show sgroup)
       else return pgroup
1668
-- | Computes the group of an instance per the primary node.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl i = Node.group $ Container.find (Instance.pNode i) nl
1674
-- | Compute the list of badly allocated instances (split across node
-- groups).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl il =
  [ inst | inst <- Container.elems il
         , not . isOk $ instanceGroup nl inst ]
1680
-- | Splits a cluster into the component node groups.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il =
  let build (gdx, nodes) =
        let idxs = map Node.idx nodes
            -- An instance belongs to the group of its primary node.
            onGroup = (`elem` idxs) . Instance.pNode
        in (gdx, ( Container.fromList (zip idxs nodes)
                 , Container.filter onGroup il))
  in map build $ Node.computeGroups (Container.elems nl)
1691
-- | Compute the list of nodes that are to be evacuated, given a list
-- of instances and an evacuation mode.
nodesToEvacuate :: Instance.List  -- ^ The cluster-wide instance list
                -> EvacMode       -- ^ The evacuation mode we're using
                -> [Idx]          -- ^ List of instance indices being evacuated
                -> IntSet.IntSet  -- ^ Set of node indices
nodesToEvacuate il mode idxs =
  -- Drop the sentinel index so instances without a real secondary
  -- cannot contribute it to the result set.
  IntSet.delete Node.noSecondary $ foldl' step IntSet.empty idxs
  where
    step acc idx =
      let inst = Container.find idx il
          pdx = Instance.pNode inst
          sdx = Instance.sNode inst
          -- Only DRBD instances have a meaningful secondary node.
          withSecondary = case Instance.diskTemplate inst of
                            DTDrbd8 -> IntSet.insert sdx acc
                            _ -> acc
      in case mode of
           ChangePrimary   -> IntSet.insert pdx acc
           ChangeSecondary -> withSecondary
           ChangeAll       -> IntSet.insert pdx withSecondary