Bugfix in checkInstanceMove function in Cluster.hs
[ganeti-github.git] / src / Ganeti / HTools / Cluster.hs
1 {-| Implementation of cluster-wide logic.
2
3 This module holds all pure cluster-logic; I\/O related functionality
4 goes into the /Main/ module for the individual binaries.
5
6 -}
7
8 {-
9
10 Copyright (C) 2009, 2010, 2011, 2012, 2013 Google Inc.
11 All rights reserved.
12
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are
15 met:
16
17 1. Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 2. Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
25 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
26 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
27 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
28 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
32 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
33 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
36 -}
37
module Ganeti.HTools.Cluster
  (
    -- * Types
    AllocSolution(..)
  , EvacSolution(..)
  , Table(..)
  , CStats(..)
  , AllocNodes
  , AllocResult
  , AllocMethod
  , AllocSolutionList
  -- * Generic functions
  , totalResources
  , computeAllocationDelta
  -- * First phase functions
  , computeBadItems
  -- * Second phase functions
  , printSolutionLine
  , formatCmds
  , involvedNodes
  , getMoves
  , splitJobs
  -- * Display functions
  , printNodes
  , printInsts
  -- * Balancing functions
  , checkMove
  , doNextBalance
  , tryBalance
  , compCV
  , compCVNodes
  , compDetailedCV
  , printStats
  , iMoveToJob
  -- * IAllocator functions
  , genAllocNodes
  , tryAlloc
  , tryMGAlloc
  , tryNodeEvac
  , tryChangeGroup
  , collapseFailures
  , allocList
  -- * Allocation functions
  , iterateAlloc
  , tieredAlloc
  -- * Node group functions
  , instanceGroup
  , findSplitInstances
  , splitCluster
  ) where
88
89 import Control.Applicative (liftA2)
90 import Control.Arrow ((&&&))
91 import qualified Data.IntSet as IntSet
92 import Data.List
93 import Data.Maybe (fromJust, fromMaybe, isJust, isNothing)
94 import Data.Ord (comparing)
95 import Text.Printf (printf)
96
97 import Ganeti.BasicTypes
98 import qualified Ganeti.HTools.Container as Container
99 import qualified Ganeti.HTools.Instance as Instance
100 import qualified Ganeti.HTools.Nic as Nic
101 import qualified Ganeti.HTools.Node as Node
102 import qualified Ganeti.HTools.Group as Group
103 import Ganeti.HTools.Types
104 import Ganeti.Compat
105 import qualified Ganeti.OpCodes as OpCodes
106 import Ganeti.Utils
107 import Ganeti.Utils.Statistics
108 import Ganeti.Types (EvacMode(..), mkNonEmpty)
109
110 -- * Types
111
-- | Allocation\/relocation solution.
data AllocSolution = AllocSolution
  { asFailures :: [FailMode]              -- ^ Failure counts
  , asAllocs   :: Int                     -- ^ Good allocation count
  , asSolution :: Maybe Node.AllocElement -- ^ The actual allocation result
  , asLog      :: [String]                -- ^ Informational messages
  }

-- | Node evacuation/group change iallocator result type. This result
-- type consists of actual opcodes (a restricted subset) that are
-- transmitted back to Ganeti.
data EvacSolution = EvacSolution
  { esMoved   :: [(Idx, Gdx, [Ndx])] -- ^ Instances moved successfully
  , esFailed  :: [(Idx, String)]     -- ^ Instances which were not
                                     -- relocated
  , esOpCodes :: [[OpCodes.OpCode]]  -- ^ List of jobs
  } deriving (Show)

-- | Allocation results, as used in 'iterateAlloc' and 'tieredAlloc'.
type AllocResult = (FailStats, Node.List, Instance.List,
                    [Instance.Instance], [CStats])

-- | Type alias for easier handling.
type AllocSolutionList = [(Instance.Instance, AllocSolution)]

-- | A type denoting the valid allocation mode/pairs.
--
-- For a one-node allocation, this will be a @Left ['Ndx']@, whereas
-- for a two-node allocation, this will be a @Right [('Ndx',
-- ['Ndx'])]@. In the latter case, the list is basically an
-- association list, grouped by primary node and holding the potential
-- secondary nodes in the sub-list.
type AllocNodes = Either [Ndx] [(Ndx, [Ndx])]

-- | The empty solution we start with when computing allocations.
emptyAllocSolution :: AllocSolution
emptyAllocSolution = AllocSolution { asFailures = [], asAllocs = 0
                                   , asSolution = Nothing, asLog = [] }

-- | The empty evac solution.
emptyEvacSolution :: EvacSolution
emptyEvacSolution = EvacSolution { esMoved = []
                                 , esFailed = []
                                 , esOpCodes = []
                                 }
157
-- | The complete state for the balancing solution.
data Table = Table Node.List Instance.List Score [Placement]
             deriving (Show)

-- | Cluster statistics data type.
data CStats = CStats
  { csFmem  :: Integer -- ^ Cluster free mem
  , csFdsk  :: Integer -- ^ Cluster free disk
  , csFspn  :: Integer -- ^ Cluster free spindles
  , csAmem  :: Integer -- ^ Cluster allocatable mem
  , csAdsk  :: Integer -- ^ Cluster allocatable disk
  , csAcpu  :: Integer -- ^ Cluster allocatable cpus
  , csMmem  :: Integer -- ^ Max node allocatable mem
  , csMdsk  :: Integer -- ^ Max node allocatable disk
  , csMcpu  :: Integer -- ^ Max node allocatable cpu
  , csImem  :: Integer -- ^ Instance used mem
  , csIdsk  :: Integer -- ^ Instance used disk
  , csIspn  :: Integer -- ^ Instance used spindles
  , csIcpu  :: Integer -- ^ Instance used cpu
  , csTmem  :: Double  -- ^ Cluster total mem
  , csTdsk  :: Double  -- ^ Cluster total disk
  , csTspn  :: Double  -- ^ Cluster total spindles
  , csTcpu  :: Double  -- ^ Cluster total cpus
  , csVcpu  :: Integer -- ^ Cluster total virtual cpus
  , csNcpu  :: Double  -- ^ Equivalent to 'csIcpu' but in terms of
                       -- physical CPUs, i.e. normalised used phys CPUs
  , csXmem  :: Integer -- ^ Unaccounted for mem
  , csNmem  :: Integer -- ^ Node own memory
  , csScore :: Score   -- ^ The cluster score
  , csNinst :: Int     -- ^ The total number of instances
  } deriving (Show)

-- | A simple type for allocation functions.
type AllocMethod =  Node.List           -- ^ Node list
                 -> Instance.List       -- ^ Instance list
                 -> Maybe Int           -- ^ Optional allocation limit
                 -> Instance.Instance   -- ^ Instance spec for allocation
                 -> AllocNodes          -- ^ Which nodes we should allocate on
                 -> [Instance.Instance] -- ^ Allocated instances
                 -> [CStats]            -- ^ Running cluster stats
                 -> Result AllocResult  -- ^ Allocation result

-- | A simple type for the running solution of evacuations.
type EvacInnerState =
  Either String (Node.List, Instance.Instance, Score, Ndx)
203
204 -- * Utility functions
205
-- | Verifies the N+1 status and return the affected nodes.
verifyN1 :: [Node.Node] -> [Node.Node]
verifyN1 = filter Node.failN1
209
{-| Computes the pair of bad nodes and instances.

The bad node list is computed via a simple 'verifyN1' check, and the
bad instance list is the list of primary and secondary instances of
those nodes.

-}
computeBadItems :: Node.List -> Instance.List ->
                   ([Node.Node], [Instance.Instance])
computeBadItems nl il =
  let bad_nodes = verifyN1 $ getOnline nl
      -- sorted, duplicate-free list of affected instance indices;
      -- 'map head . group . sort' is the O(n log n) equivalent of
      -- 'sort . nub' (group never produces empty sublists, so the
      -- 'head' is safe)
      bad_instances = map (`Container.find` il) .
                      map head . group . sort $
                      concatMap (\n -> Node.sList n ++ Node.pList n) bad_nodes
  in
    (bad_nodes, bad_instances)
226
-- | Extracts the node pairs for an instance. This can fail if the
-- instance is single-homed. FIXME: this needs to be improved,
-- together with the general enhancement for handling non-DRBD moves.
instanceNodes :: Node.List -> Instance.Instance ->
                 (Ndx, Ndx, Node.Node, Node.Node)
instanceNodes nl inst =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_p = Container.find old_pdx nl
      old_s = Container.find old_sdx nl
  in (old_pdx, old_sdx, old_p, old_s)
238
-- | Zero-initializer for the CStats type (one zero per field).
emptyCStats :: CStats
emptyCStats = CStats 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
242
-- | Update stats with data from a new node.
updateCStats :: CStats -> Node.Node -> CStats
updateCStats cs node =
  let CStats { csFmem = x_fmem, csFdsk = x_fdsk,
               csAmem = x_amem, csAcpu = x_acpu, csAdsk = x_adsk,
               csMmem = x_mmem, csMdsk = x_mdsk, csMcpu = x_mcpu,
               csImem = x_imem, csIdsk = x_idsk, csIcpu = x_icpu,
               csTmem = x_tmem, csTdsk = x_tdsk, csTcpu = x_tcpu,
               csVcpu = x_vcpu, csNcpu = x_ncpu,
               csXmem = x_xmem, csNmem = x_nmem, csNinst = x_ninst,
               csFspn = x_fspn, csIspn = x_ispn, csTspn = x_tspn
             }
        = cs
      -- allocatable memory is free minus reserved, floored at zero
      inc_amem = Node.fMem node - Node.rMem node
      inc_amem' = if inc_amem > 0 then inc_amem else 0
      inc_adsk = Node.availDisk node
      -- instance-used memory: total minus node, unaccounted and free
      inc_imem = truncate (Node.tMem node) - Node.nMem node
                 - Node.xMem node - Node.fMem node
      inc_icpu = Node.uCpu node
      inc_idsk = truncate (Node.tDsk node) - Node.fDsk node
      inc_ispn = Node.tSpindles node - Node.fSpindles node
      inc_vcpu = Node.hiCpu node
      inc_acpu = Node.availCpu node
      inc_ncpu = fromIntegral (Node.uCpu node) /
                 iPolicyVcpuRatio (Node.iPolicy node)
  in cs { csFmem = x_fmem + fromIntegral (Node.fMem node)
        , csFdsk = x_fdsk + fromIntegral (Node.fDsk node)
        , csFspn = x_fspn + fromIntegral (Node.fSpindles node)
        , csAmem = x_amem + fromIntegral inc_amem'
        , csAdsk = x_adsk + fromIntegral inc_adsk
        , csAcpu = x_acpu + fromIntegral inc_acpu
        , csMmem = max x_mmem (fromIntegral inc_amem')
        , csMdsk = max x_mdsk (fromIntegral inc_adsk)
        , csMcpu = max x_mcpu (fromIntegral inc_acpu)
        , csImem = x_imem + fromIntegral inc_imem
        , csIdsk = x_idsk + fromIntegral inc_idsk
        , csIspn = x_ispn + fromIntegral inc_ispn
        , csIcpu = x_icpu + fromIntegral inc_icpu
        , csTmem = x_tmem + Node.tMem node
        , csTdsk = x_tdsk + Node.tDsk node
        , csTspn = x_tspn + fromIntegral (Node.tSpindles node)
        , csTcpu = x_tcpu + Node.tCpu node
        , csVcpu = x_vcpu + fromIntegral inc_vcpu
        , csNcpu = x_ncpu + inc_ncpu
        , csXmem = x_xmem + fromIntegral (Node.xMem node)
        , csNmem = x_nmem + fromIntegral (Node.nMem node)
        , csNinst = x_ninst + length (Node.pList node)
        }
291
-- | Compute the total free disk and memory in the cluster.
totalResources :: Node.List -> CStats
totalResources nl =
  let cs = foldl' updateCStats emptyCStats . Container.elems $ nl
  in cs { csScore = compCV nl }
297
-- | Compute the delta between two cluster state.
--
-- This is used when doing allocations, to understand better the
-- available cluster resources. The return value is a triple of the
-- current used values, the delta that was still allocated, and what
-- was left unallocated.
computeAllocationDelta :: CStats -> CStats -> AllocStats
computeAllocationDelta cini cfin =
  let CStats {csImem = i_imem, csIdsk = i_idsk, csIcpu = i_icpu,
              csNcpu = i_ncpu, csIspn = i_ispn } = cini
      CStats {csImem = f_imem, csIdsk = f_idsk, csIcpu = f_icpu,
              csTmem = t_mem, csTdsk = t_dsk, csVcpu = f_vcpu,
              csNcpu = f_ncpu, csTcpu = f_tcpu,
              csIspn = f_ispn, csTspn = t_spn } = cfin
      -- resources used in the initial state
      rini = AllocInfo { allocInfoVCpus = fromIntegral i_icpu
                       , allocInfoNCpus = i_ncpu
                       , allocInfoMem = fromIntegral i_imem
                       , allocInfoDisk = fromIntegral i_idsk
                       , allocInfoSpn = fromIntegral i_ispn
                       }
      -- resources allocated between the initial and final state
      rfin = AllocInfo { allocInfoVCpus = fromIntegral (f_icpu - i_icpu)
                       , allocInfoNCpus = f_ncpu - i_ncpu
                       , allocInfoMem = fromIntegral (f_imem - i_imem)
                       , allocInfoDisk = fromIntegral (f_idsk - i_idsk)
                       , allocInfoSpn = fromIntegral (f_ispn - i_ispn)
                       }
      -- resources still unallocated in the final state
      runa = AllocInfo { allocInfoVCpus = fromIntegral (f_vcpu - f_icpu)
                       , allocInfoNCpus = f_tcpu - f_ncpu
                       , allocInfoMem = truncate t_mem - fromIntegral f_imem
                       , allocInfoDisk = truncate t_dsk - fromIntegral f_idsk
                       , allocInfoSpn = truncate t_spn - fromIntegral f_ispn
                       }
  in (rini, rfin, runa)
331
-- | The names and weights of the individual elements in the CV list, together
-- with their statistical accumulation function and a bit to decide whether it
-- is a statistics for online nodes.
detailedCVInfoExt :: [((Double, String), ([Double] -> Statistics, Bool))]
detailedCVInfoExt = [ ((1,  "free_mem_cv"), (getStdDevStatistics, True))
                    , ((1,  "free_disk_cv"), (getStdDevStatistics, True))
                    , ((1,  "n1_cnt"), (getSumStatistics, True))
                    , ((1,  "reserved_mem_cv"), (getStdDevStatistics, True))
                    , ((4,  "offline_all_cnt"), (getSumStatistics, False))
                    , ((16, "offline_pri_cnt"), (getSumStatistics, False))
                    , ((1,  "vcpu_ratio_cv"), (getStdDevStatistics, True))
                    , ((1,  "cpu_load_cv"), (getStdDevStatistics, True))
                    , ((1,  "mem_load_cv"), (getStdDevStatistics, True))
                    , ((1,  "disk_load_cv"), (getStdDevStatistics, True))
                    , ((1,  "net_load_cv"), (getStdDevStatistics, True))
                    , ((2,  "pri_tags_score"), (getSumStatistics, True))
                    , ((1,  "spindles_cv"), (getStdDevStatistics, True))
                    ]

-- | The names and weights of the individual elements in the CV list.
detailedCVInfo :: [(Double, String)]
detailedCVInfo = map fst detailedCVInfoExt

-- | Holds the weights used by 'compCVNodes' for each metric.
detailedCVWeights :: [Double]
detailedCVWeights = map fst detailedCVInfo

-- | The aggregation functions for the weights.
detailedCVAggregation :: [([Double] -> Statistics, Bool)]
detailedCVAggregation = map snd detailedCVInfoExt

-- | The bit vector describing which parts of the statistics are
-- for online nodes.
detailedCVOnlineStatus :: [Bool]
detailedCVOnlineStatus = map snd detailedCVAggregation
367
-- | Compute statistical measures of a single node.
--
-- The result list must stay in the same order as 'detailedCVInfoExt'.
compDetailedCVNode :: Node.Node -> [Double]
compDetailedCVNode node =
  let mem = Node.pMem node
      dsk = Node.pDsk node
      -- instance count on a node failing N+1, zero otherwise
      n1 = fromIntegral
           $ if Node.failN1 node
               then length (Node.sList node) + length (Node.pList node)
               else 0
      res = Node.pRem node
      ipri = fromIntegral . length $ Node.pList node
      isec = fromIntegral . length $ Node.sList node
      ioff = ipri + isec
      cpu = Node.pCpuEff node
      DynUtil c1 m1 d1 nn1 = Node.utilLoad node
      DynUtil c2 m2 d2 nn2 = Node.utilPool node
      -- dynamic load relative to the available pool
      (c_load, m_load, d_load, n_load) = (c1/c2, m1/m2, d1/d2, nn1/nn2)
      pri_tags = fromIntegral $ Node.conflictingPrimaries node
      spindles = Node.instSpindles node / Node.hiSpindles node
  in [ mem, dsk, n1, res, ioff, ipri, cpu
     , c_load, m_load, d_load, n_load
     , pri_tags, spindles
     ]
391
-- | Compute the statistics of a cluster.
compClusterStatistics :: [Node.Node] -> [Statistics]
compClusterStatistics all_nodes =
  let (offline, nodes) = partition Node.offline all_nodes
      offline_values = transpose (map compDetailedCVNode offline)
                       ++ repeat []
      -- transpose of an empty list is empty and not k times the empty list,
      -- as would be the transpose of a 0 x k matrix; hence the 'repeat []'
      online_values = transpose $ map compDetailedCVNode nodes
      aggregate (f, True) (onNodes, _) = f onNodes
      aggregate (f, False) (_, offNodes) = f offNodes
  in zipWith aggregate detailedCVAggregation
       $ zip online_values offline_values
405
-- | Update a cluster statistics by replacing the contribution of one
-- node by that of another.
updateClusterStatistics :: [Statistics]
                        -> (Node.Node, Node.Node) -> [Statistics]
updateClusterStatistics stats (old, new) =
  let update = zip (compDetailedCVNode old) (compDetailedCVNode new)
      online = not $ Node.offline old
      -- only touch the statistics that track nodes of the same
      -- online/offline class as the replaced node
      updateStat forOnline stat upd = if forOnline == online
                                        then updateStatistics stat upd
                                        else stat
  in zipWith3 updateStat detailedCVOnlineStatus stats update

-- | Update a cluster statistics twice.
updateClusterStatisticsTwice :: [Statistics]
                             -> (Node.Node, Node.Node)
                             -> (Node.Node, Node.Node)
                             -> [Statistics]
updateClusterStatisticsTwice s a =
  updateClusterStatistics (updateClusterStatistics s a)
425
-- | Compute cluster statistics.
compDetailedCV :: [Node.Node] -> [Double]
compDetailedCV = map getStatisticValue . compClusterStatistics

-- | Compute the cluster score from its statistics.
compCVfromStats :: [Statistics] -> Double
compCVfromStats = sum . zipWith (*) detailedCVWeights . map getStatisticValue

-- | Compute the /total/ variance.
compCVNodes :: [Node.Node] -> Double
compCVNodes = sum . zipWith (*) detailedCVWeights . compDetailedCV

-- | Wrapper over 'compCVNodes' for callers that have a 'Node.List'.
compCV :: Node.List -> Double
compCV = compCVNodes . Container.elems

-- | Compute online nodes from a 'Node.List'.
getOnline :: Node.List -> [Node.Node]
getOnline = filter (not . Node.offline) . Container.elems
445
446 -- * Balancing functions
447
-- | Compute best table. Note that the ordering of the arguments is important:
-- on a tie the first table wins.
compareTables :: Table -> Table -> Table
compareTables a@(Table _ _ a_cv _) b@(Table _ _ b_cv _ ) =
  if a_cv > b_cv then b else a
452
-- | Applies an instance move to a given node list and instance.
applyMove :: Node.List -> Instance.Instance
          -> IMove -> OpResult (Node.List, Instance.Instance, Ndx, Ndx)
-- Failover (f)
applyMove nl inst Failover =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      new_nl = do -- Maybe monad
        -- the roles of the two nodes are swapped; an offline old
        -- primary forces the move
        new_p <- Node.addPriEx (Node.offline old_p) int_s inst
        new_s <- Node.addSecExEx (Node.offline old_p) (Node.offline old_p)
                 int_p inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx old_pdx
        return (Container.addTwo old_pdx new_s old_sdx new_p nl,
                new_inst, old_sdx, old_pdx)
  in new_nl

-- Failover to any (fa)
applyMove nl inst (FailoverToAny new_pdx) = do
  let (old_pdx, old_sdx, old_pnode, _) = instanceNodes nl inst
      new_pnode = Container.find new_pdx nl
      force_failover = Node.offline old_pnode
  new_pnode' <- Node.addPriEx force_failover new_pnode inst
  let old_pnode' = Node.removePri old_pnode inst
      inst' = Instance.setPri inst new_pdx
      nl' = Container.addTwo old_pdx old_pnode' new_pdx new_pnode' nl
  return (nl', inst', new_pdx, old_sdx)

-- Replace the primary (f:, r:np, f)
applyMove nl inst (ReplacePrimary new_pdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
        -- check that the current secondary can host the instance
        -- during the migration
        tmp_s <- Node.addPriEx force_p int_s inst
        let tmp_s' = Node.removePri tmp_s inst
        new_p <- Node.addPriEx force_p tgt_n inst
        new_s <- Node.addSecEx force_p tmp_s' inst new_pdx
        let new_inst = Instance.setPri inst new_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx int_p old_sdx new_s nl,
                new_inst, new_pdx, old_sdx)
  in new_nl

-- Replace the secondary (r:ns)
applyMove nl inst (ReplaceSecondary new_sdx) =
  let old_pdx = Instance.pNode inst
      old_sdx = Instance.sNode inst
      old_s = Container.find old_sdx nl
      tgt_n = Container.find new_sdx nl
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_inst = Instance.setSec inst new_sdx
      new_nl = Node.addSecEx force_s tgt_n inst old_pdx >>=
               \new_s -> return (Container.addTwo new_sdx
                                 new_s old_sdx int_s nl,
                                 new_inst, old_pdx, new_sdx)
  in new_nl

-- Replace the secondary and failover (r:np, f)
applyMove nl inst (ReplaceAndFailover new_pdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_pdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_s = Node.offline old_s
      new_nl = do -- Maybe monad
        new_p <- Node.addPri tgt_n inst
        new_s <- Node.addSecEx force_s int_p inst new_pdx
        let new_inst = Instance.setBoth inst new_pdx old_pdx
        return (Container.add new_pdx new_p $
                Container.addTwo old_pdx new_s old_sdx int_s nl,
                new_inst, new_pdx, old_pdx)
  in new_nl

-- Failover and replace the secondary (f, r:ns)
applyMove nl inst (FailoverAndReplace new_sdx) =
  let (old_pdx, old_sdx, old_p, old_s) = instanceNodes nl inst
      tgt_n = Container.find new_sdx nl
      int_p = Node.removePri old_p inst
      int_s = Node.removeSec old_s inst
      force_p = Node.offline old_p
      new_nl = do -- Maybe monad
        new_p <- Node.addPriEx force_p int_s inst
        new_s <- Node.addSecEx force_p tgt_n inst old_sdx
        let new_inst = Instance.setBoth inst old_sdx new_sdx
        return (Container.add new_sdx new_s $
                Container.addTwo old_sdx new_p old_pdx int_p nl,
                new_inst, old_sdx, new_sdx)
  in new_nl
547
-- | Tries to allocate an instance on one given node.
allocateOnSingle :: Node.List -> Instance.Instance -> Ndx
                 -> OpResult Node.AllocElement
allocateOnSingle nl inst new_pdx =
  let p = Container.find new_pdx nl
      new_inst = Instance.setBoth inst new_pdx Node.noSecondary
  in do
    -- the instance must fit the target node's policy before we try
    -- the actual placement
    Instance.instMatchesPolicy inst (Node.iPolicy p) (Node.exclStorage p)
    new_p <- Node.addPri p inst
    let new_nl = Container.add new_pdx new_p nl
        new_score = compCV new_nl
    return (new_nl, new_inst, [new_p], new_score)

-- | Tries to allocate an instance on a given pair of nodes.
allocateOnPair :: [Statistics]
               -> Node.List -> Instance.Instance -> Ndx -> Ndx
               -> OpResult Node.AllocElement
allocateOnPair stats nl inst new_pdx new_sdx =
  let tgt_p = Container.find new_pdx nl
      tgt_s = Container.find new_sdx nl
  in do
    Instance.instMatchesPolicy inst (Node.iPolicy tgt_p)
      (Node.exclStorage tgt_p)
    new_p <- Node.addPri tgt_p inst
    new_s <- Node.addSec tgt_s inst new_pdx
    let new_inst = Instance.setBoth inst new_pdx new_sdx
        new_nl = Container.addTwo new_pdx new_p new_sdx new_s nl
        -- incrementally update the cluster statistics instead of
        -- recomputing them from scratch
        new_stats = updateClusterStatisticsTwice stats
                      (tgt_p, new_p) (tgt_s, new_s)
    return (new_nl, new_inst, [new_p, new_s], compCVfromStats new_stats)
578
-- | Tries to perform an instance move and returns the best table
-- between the original one and the new one.
checkSingleStep :: Table -- ^ The original table
                -> Instance.Instance -- ^ The instance to move
                -> Table -- ^ The current best table
                -> IMove -- ^ The move to apply
                -> Table -- ^ The final best table
checkSingleStep ini_tbl target cur_tbl move =
  let Table ini_nl ini_il _ ini_plc = ini_tbl
      tmp_resu = applyMove ini_nl target move
  in case tmp_resu of
       -- an inapplicable move leaves the current best table unchanged
       Bad _ -> cur_tbl
       Ok (upd_nl, new_inst, pri_idx, sec_idx) ->
         let tgt_idx = Instance.idx target
             upd_cvar = compCV upd_nl
             upd_il = Container.add tgt_idx new_inst ini_il
             upd_plc = (tgt_idx, pri_idx, sec_idx, move, upd_cvar):ini_plc
             upd_tbl = Table upd_nl upd_il upd_cvar upd_plc
         in compareTables cur_tbl upd_tbl
598
-- | Given the status of the current secondary as a valid new node and
-- the current candidate target node, generate the possible moves for
-- an instance.
possibleMoves :: MirrorType -- ^ The mirroring type of the instance
              -> Bool       -- ^ Whether the secondary node is a valid new node
              -> Bool       -- ^ Whether we can change the primary node
              -> Bool       -- ^ Whether we are allowed to move disks
              -> (Bool, Bool) -- ^ Whether migration is restricted and whether
                              -- the instance primary is offline
              -> Ndx        -- ^ Target node candidate
              -> [IMove]    -- ^ List of valid result moves

-- unmirrored instances cannot be moved at all
possibleMoves MirrorNone _ _ _ _ _ = []

possibleMoves MirrorExternal _ False _ _ _ = []

possibleMoves MirrorExternal _ True _ _ tdx =
  [ FailoverToAny tdx ]

possibleMoves MirrorInternal _ _ False _ _ = []

possibleMoves MirrorInternal _ False True _ tdx =
  [ ReplaceSecondary tdx ]

-- with restricted migration and an online primary, only the disk
-- move (which needs no migration) is allowed
possibleMoves MirrorInternal _ _ True (True, False) tdx =
  [ ReplaceSecondary tdx
  ]

possibleMoves MirrorInternal True True True (False, _) tdx =
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  , ReplacePrimary tdx
  , FailoverAndReplace tdx
  ]

possibleMoves MirrorInternal True True True (True, True) tdx =
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  , FailoverAndReplace tdx
  ]

possibleMoves MirrorInternal False True True _ tdx =
  [ ReplaceSecondary tdx
  , ReplaceAndFailover tdx
  ]
644
-- | Compute the best move for a given instance.
checkInstanceMove :: [Ndx]             -- ^ Allowed target node indices
                  -> Bool              -- ^ Whether disk moves are allowed
                  -> Bool              -- ^ Whether instance moves are allowed
                  -> Bool              -- ^ Whether migration is restricted
                  -> Table             -- ^ Original table
                  -> Instance.Instance -- ^ Instance to move
                  -> Table             -- ^ Best new table for this instance
checkInstanceMove nodes_idx disk_moves inst_moves rest_mig
                  ini_tbl@(Table nl _ _ _) target =
  let opdx = Instance.pNode target
      osdx = Instance.sNode target
      -- the instance's own nodes are never valid targets
      bad_nodes = [opdx, osdx]
      nodes = filter (`notElem` bad_nodes) nodes_idx
      mir_type = Instance.mirrorType target
      use_secondary = elem osdx nodes_idx && inst_moves
      -- if drbd and allowed to failover, also consider the plain
      -- failover as a starting point
      aft_failover = if mir_type == MirrorInternal && use_secondary
                       then checkSingleStep ini_tbl target ini_tbl Failover
                       else ini_tbl
      primary_drained = Node.offline
                        . flip Container.find nl
                        $ Instance.pNode target
      all_moves = concatMap (possibleMoves mir_type use_secondary inst_moves
                             disk_moves (rest_mig, primary_drained)) nodes
  in
    -- iterate over the possible nodes for this instance
    foldl' (checkSingleStep ini_tbl target) aft_failover all_moves
673
-- | Compute the best next move.
checkMove :: [Ndx]               -- ^ Allowed target node indices
          -> Bool                -- ^ Whether disk moves are allowed
          -> Bool                -- ^ Whether instance moves are allowed
          -> Bool                -- ^ Whether migration is restricted
          -> Table               -- ^ The current solution
          -> [Instance.Instance] -- ^ List of instances still to move
          -> Table               -- ^ The new solution
checkMove nodes_idx disk_moves inst_moves rest_mig ini_tbl victims =
  let Table _ _ _ ini_plc = ini_tbl
      -- we're using rwhnf from the Control.Parallel.Strategies
      -- package; we don't need to use rnf as that would force too
      -- much evaluation in single-threaded cases, and in
      -- multi-threaded case the weak head normal form is enough to
      -- spark the evaluation
      tables = parMap rwhnf (checkInstanceMove nodes_idx disk_moves
                             inst_moves rest_mig ini_tbl)
               victims
      -- iterate over all instances, computing the best move
      best_tbl = foldl' compareTables ini_tbl tables
      Table _ _ _ best_plc = best_tbl
  in if length best_plc == length ini_plc
       then ini_tbl -- no advancement
       else best_tbl
698
-- | Check if we are allowed to go deeper in the balancing.
doNextBalance :: Table -- ^ The starting table
              -> Int   -- ^ Remaining length
              -> Score -- ^ Score at which to stop
              -> Bool  -- ^ Whether we are allowed to balance further
doNextBalance ini_tbl max_rounds min_score =
  let Table _ _ ini_cv ini_plc = ini_tbl
      ini_plc_len = length ini_plc
      -- a negative max_rounds means "unlimited rounds"
  in (max_rounds < 0 || ini_plc_len < max_rounds) && ini_cv > min_score
708
-- | Run a balance move.
tryBalance :: Table       -- ^ The starting table
           -> Bool        -- ^ Allow disk moves
           -> Bool        -- ^ Allow instance moves
           -> Bool        -- ^ Only evacuate moves
           -> Bool        -- ^ Restrict migration
           -> Score       -- ^ Min gain threshold
           -> Score       -- ^ Min gain
           -> Maybe Table -- ^ The improved table, or Nothing if no
                          -- sufficiently improving move was found
tryBalance ini_tbl disk_moves inst_moves evac_mode rest_mig mg_limit min_gain =
  let Table ini_nl ini_il ini_cv _ = ini_tbl
      all_inst = Container.elems ini_il
      all_nodes = Container.elems ini_nl
      (offline_nodes, online_nodes) = partition Node.offline all_nodes
      -- in evacuation mode, only instances located on an offline node
      -- are candidates for moving
      all_inst' = if evac_mode
                    then let bad_nodes = map Node.idx offline_nodes
                         in filter (any (`elem` bad_nodes) .
                                    Instance.allNodes) all_inst
                    else all_inst
      reloc_inst = filter (\i -> Instance.movable i &&
                                 Instance.autoBalance i) all_inst'
      node_idx = map Node.idx online_nodes
      fin_tbl = checkMove node_idx disk_moves inst_moves rest_mig
                ini_tbl reloc_inst
      (Table _ _ fin_cv _) = fin_tbl
  in
    if fin_cv < ini_cv && (ini_cv > mg_limit || ini_cv - fin_cv >= min_gain)
      then Just fin_tbl -- this round made success, return the new table
      else Nothing
738
739 -- * Allocation functions
740
-- | Build failure stats out of a list of failures, counting each
-- possible failure mode (including modes that never occurred).
collapseFailures :: [FailMode] -> FailStats
collapseFailures flst =
  map (\k -> (k, foldl' (\a e -> if e == k then a + 1 else a) 0 flst))
    [minBound..maxBound]

-- | Compares two Maybe AllocElement and chooses the best score
-- (lower is better); a 'Just' always wins over 'Nothing'.
bestAllocElement :: Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
                 -> Maybe Node.AllocElement
bestAllocElement a Nothing = a
bestAllocElement Nothing b = b
bestAllocElement a@(Just (_, _, _, ascore)) b@(Just (_, _, _, bscore)) =
  if ascore < bscore then a else b
755
-- | Update current Allocation solution and failure stats with new
-- elements.
concatAllocs :: AllocSolution -> OpResult Node.AllocElement -> AllocSolution
concatAllocs as (Bad reason) = as { asFailures = reason : asFailures as }

concatAllocs as (Ok ns) =
  let -- Choose the old or new solution, based on the cluster score
      cntok = asAllocs as
      osols = asSolution as
      nsols = bestAllocElement osols (Just ns)
      nsuc = cntok + 1
      -- Note: we force evaluation of nsols here in order to keep the
      -- memory profile low - we know that we will need nsols for sure
      -- in the next cycle, so we force evaluation of nsols, since the
      -- foldl' in the caller will only evaluate the tuple, but not the
      -- elements of the tuple
  in nsols `seq` nsuc `seq` as { asAllocs = nsuc, asSolution = nsols }

-- | Sums two 'AllocSolution' structures.
sumAllocs :: AllocSolution -> AllocSolution -> AllocSolution
sumAllocs (AllocSolution aFails aAllocs aSols aLog)
          (AllocSolution bFails bAllocs bSols bLog) =
  -- note: we add b first, since usually it will be smaller; when
  -- fold'ing, a will grow and grow whereas b is the per-group
  -- result, hence smaller
  let nFails = bFails ++ aFails
      nAllocs = aAllocs + bAllocs
      nSols = bestAllocElement aSols bSols
      nLog = bLog ++ aLog
  in AllocSolution nFails nAllocs nSols nLog
786
-- | Given a solution, generates a reasonable description for it.
describeSolution :: AllocSolution -> String
describeSolution as = maybe failMsg okMsg (asSolution as)
  where
    fcnt = asFailures as
    -- Human-readable summary of the (collapsed) failure counters,
    -- keeping only reasons that actually occurred.
    freasons =
      intercalate ", " . map (\(a, b) -> printf "%s: %d" (show a) b) .
      filter ((> 0) . snd) . collapseFailures $ fcnt
    -- No allocation succeeded at all.
    failMsg = "No valid allocation solutions, failure reasons: " ++
              (if null fcnt then "unknown reasons" else freasons)
    -- At least one allocation succeeded: report score and statistics.
    okMsg (_, _, nodes, cv) =
      printf ("score: %.8f, successes %d, failures %d (%s)" ++
              " for node(s) %s") cv (asAllocs as) (length fcnt) freasons
        (intercalate "/" . map Node.name $ nodes)
802
-- | Annotates a solution with the appropriate string.
annotateSolution :: AllocSolution -> AllocSolution
annotateSolution sol =
  let summary = describeSolution sol
  in sol { asLog = summary : asLog sol }
806
-- | Reverses an evacuation solution.
--
-- Rationale: we always concat the results to the top of the lists, so
-- for proper jobset execution, we should reverse all lists.
reverseEvacSolution :: EvacSolution -> EvacSolution
reverseEvacSolution es =
  EvacSolution { esFailed = reverse (esFailed es)
               , esMoved = reverse (esMoved es)
               , esOpCodes = reverse (esOpCodes es)
               }
814
-- | Generate the valid node allocation singles or pairs for a new instance.
genAllocNodes :: Group.List        -- ^ Group list
              -> Node.List         -- ^ The node map
              -> Int               -- ^ The number of nodes required
              -> Bool              -- ^ Whether to drop or not
                                   -- unallocable nodes
              -> Result AllocNodes -- ^ The (monadic) result
genAllocNodes gl nl count drop_unalloc =
  case count of
    1 -> Ok . Left $ map Node.idx nodes
    2 -> Ok . Right $ filter (not . null . snd) pairs
    _ -> Bad "Unsupported number of nodes, only one or two supported"
  where
    -- Online nodes, optionally restricted to allocable groups.
    nodes
      | drop_unalloc = filter (Group.isAllocable .
                               flip Container.find gl . Node.group)
                       (getOnline nl)
      | otherwise = getOnline nl
    -- Every (primary, candidate secondaries) combination; secondaries
    -- must be distinct from the primary and in the same group.
    pairs = [ (Node.idx p,
               [ Node.idx s | s <- nodes
                            , Node.idx p /= Node.idx s
                            , Node.group p == Node.group s ])
            | p <- nodes ]
837
-- | Try to allocate an instance on the cluster.
--
-- The allocation targets are either node pairs ('Right', for
-- mirrored disk templates) or single nodes ('Left').  Candidate
-- placements are scored via 'allocateOnPair'\/'allocateOnSingle' and
-- reduced to the single best solution by 'concatAllocs'\/'sumAllocs'.
tryAlloc :: (Monad m) =>
            Node.List         -- ^ The node list
         -> Instance.List     -- ^ The instance list
         -> Instance.Instance -- ^ The instance to allocate
         -> AllocNodes        -- ^ The allocation targets
         -> m AllocSolution   -- ^ Possible solution list
tryAlloc _ _ _ (Right []) = fail "Not enough online nodes"
tryAlloc nl _ inst (Right ok_pairs) =
  -- Pair mode: evaluate each primary against all of its candidate
  -- secondaries, in parallel per primary (parMap rwhnf), then merge
  -- the per-primary bests into one global best.
  let cstat = compClusterStatistics $ Container.elems nl
      psols = parMap rwhnf (\(p, ss) ->
                              foldl' (\cstate ->
                                        concatAllocs cstate .
                                        allocateOnPair cstat nl inst p)
                              emptyAllocSolution ss) ok_pairs
      sols = foldl' sumAllocs emptyAllocSolution psols
  in return $ annotateSolution sols

tryAlloc _ _ _ (Left []) = fail "No online nodes"
tryAlloc nl _ inst (Left all_nodes) =
  -- Single-node mode: a plain strict fold over all candidate nodes.
  let sols = foldl' (\cstate ->
                       concatAllocs cstate . allocateOnSingle nl inst
                    ) emptyAllocSolution all_nodes
  in return $ annotateSolution sols
862
-- | Given a group/result, describe it as a nice (list of) messages.
solutionDescription :: (Group.Group, Result AllocSolution)
                    -> [String]
solutionDescription (grp, result) =
  let gname = Group.name grp
      pol = allocPolicyToRaw (Group.allocPolicy grp)
  in case result of
       Ok sol -> map (printf "Group %s (%s): %s" gname pol) (asLog sol)
       Bad msg -> [printf "Group %s: error %s" gname msg]
872
-- | From a list of possibly bad and possibly empty solutions, filter
-- only the groups with a valid result. Note that the result will be
-- reversed compared to the original list.
filterMGResults :: [(Group.Group, Result AllocSolution)]
                -> [(Group.Group, AllocSolution)]
filterMGResults = foldl' pick []
  where
    -- Keep only successful, non-empty allocations on allocable groups.
    pick accu (grp, rasol) =
      case rasol of
        Ok sol | Group.isAllocable grp && isJust (asSolution sol) ->
          (grp, sol) : accu
        _ -> accu
886
-- | Sort multigroup results based on policy and score.
sortMGResults :: [(Group.Group, AllocSolution)]
              -> [(Group.Group, AllocSolution)]
sortMGResults = sortBy (comparing solKey)
  where
    -- Primary key: allocation policy; secondary key: cluster score.
    -- The fromJust is safe here: callers only pass solutions that
    -- went through 'filterMGResults', which drops empty ones.
    solKey (grp, sol) =
      let (_, _, _, score) = fromJust (asSolution sol)
      in (Group.allocPolicy grp, score)
895
-- | Removes node groups which can't accommodate the instance
filterValidGroups :: [(Group.Group, (Node.List, Instance.List))]
                  -> Instance.Instance
                  -> ([(Group.Group, (Node.List, Instance.List))], [String])
filterValidGroups ngs inst = foldr step ([], []) ngs
  where
    -- A group qualifies iff every NIC's network (when set) is among
    -- the group's connected networks; NICs without a network always
    -- qualify.
    step ng (valid, msgs)
      | all (nicOk ng) (Instance.nics inst) = (ng : valid, msgs)
      | otherwise = (valid, reject ng : msgs)
    nicOk ng nic = case Nic.network nic of
                     Just net -> net `elem` Group.networks (fst ng)
                     Nothing -> True
    reject ng = "group " ++ Group.name (fst ng) ++
                " is not connected to a network required by instance " ++
                Instance.name inst
913
-- | Finds the best group for an instance on a multi-group cluster.
--
-- Only solutions in @preferred@ and @last_resort@ groups will be
-- accepted as valid, and additionally if the allowed groups parameter
-- is not null then allocation will only be run for those group
-- indices.
findBestAllocGroup :: Group.List           -- ^ The group list
                   -> Node.List            -- ^ The node list
                   -> Instance.List        -- ^ The instance list
                   -> Maybe [Gdx]          -- ^ The allowed groups
                   -> Instance.Instance    -- ^ The instance to allocate
                   -> Int                  -- ^ Required number of nodes
                   -> Result (Group.Group, AllocSolution, [String])
findBestAllocGroup mggl mgnl mgil allowed_gdxs inst cnt =
  let -- Split the cluster into per-group (node, instance) views.
      groups_by_idx = splitCluster mgnl mgil
      groups = map (\(gid, d) -> (Container.find gid mggl, d)) groups_by_idx
      -- Restrict to the allowed groups, if any were given.
      groups' = maybe groups
                (\gs -> filter ((`elem` gs) . Group.idx . fst) groups)
                allowed_gdxs
      -- Drop groups lacking a network required by the instance.
      (groups'', filter_group_msgs) = filterValidGroups groups' inst
      -- Run a full allocation attempt per remaining group.
      sols = map (\(gr, (nl, il)) ->
                    (gr, genAllocNodes mggl nl cnt False >>=
                         tryAlloc nl il inst))
             groups''::[(Group.Group, Result AllocSolution)]
      all_msgs = filter_group_msgs ++ concatMap solutionDescription sols
      goodSols = filterMGResults sols
      sortedSols = sortMGResults goodSols
  in case sortedSols of
       [] -> Bad $ if null groups'
               then "no groups for evacuation: allowed groups was " ++
                    show allowed_gdxs ++ ", all groups: " ++
                    show (map fst groups)
               else intercalate ", " all_msgs
       -- The head of the sorted list is the best policy/score pair.
       (final_group, final_sol):_ -> return (final_group, final_sol, all_msgs)
948
-- | Try to allocate an instance on a multi-group cluster.
tryMGAlloc :: Group.List           -- ^ The group list
           -> Node.List            -- ^ The node list
           -> Instance.List        -- ^ The instance list
           -> Instance.Instance    -- ^ The instance to allocate
           -> Int                  -- ^ Required number of nodes
           -> Result AllocSolution -- ^ Possible solution list
tryMGAlloc mggl mgnl mgil inst cnt = do
  (grp, sol, msgs) <- findBestAllocGroup mggl mgnl mgil Nothing inst cnt
  -- Prepend the selected group to the collected messages.
  let note = "Selected group: " ++ Group.name grp
  return sol { asLog = note : msgs }
962
-- | Calculate the new instance list after allocation solution.
updateIl :: Instance.List           -- ^ The original instance list
         -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
         -> Instance.List           -- ^ The updated instance list
updateIl il =
  maybe il (\(_, inst, _, _) -> Container.add (Container.size il) inst il)
969
-- | Extract the new node list from the allocation solution.
extractNl :: Node.List              -- ^ The original node list
          -> Maybe Node.AllocElement -- ^ The result of the allocation attempt
          -> Node.List              -- ^ The new node list
extractNl nl = maybe nl (\(nl', _, _, _) -> nl')
976
-- | Try to allocate a list of instances on a multi-group cluster.
allocList :: Group.List                  -- ^ The group list
          -> Node.List                   -- ^ The node list
          -> Instance.List               -- ^ The instance list
          -> [(Instance.Instance, Int)]  -- ^ The instance to allocate
          -> AllocSolutionList           -- ^ Possible solution list
          -> Result (Node.List, Instance.List,
                     AllocSolutionList)  -- ^ The final solution list
allocList _ nl il [] sols = Ok (nl, il, sols)
allocList gl nl il ((inst, cnt):rest) sols = do
  -- Allocate one instance, thread the updated cluster state through
  -- the recursion, and prepend the result to the solution list.
  res <- tryMGAlloc gl nl il inst cnt
  let elt = asSolution res
  allocList gl (extractNl nl elt) (updateIl il elt) rest ((inst, res):sols)
992
-- | Function which fails if the requested mode is change secondary.
--
-- This is useful since except DRBD, no other disk template can
-- execute change secondary; thus, we can just call this function
-- instead of always checking for secondary mode. After the call to
-- this function, whatever mode we have is just a primary change.
failOnSecondaryChange :: (Monad m) => EvacMode -> DiskTemplate -> m ()
failOnSecondaryChange mode dt =
  case mode of
    ChangeSecondary -> fail $ "Instances with disk template '" ++
                       diskTemplateToRaw dt ++
                       "' can't execute change secondary"
    _ -> return ()
1004
-- | Run evacuation for a single instance.
--
-- /Note:/ this function should correctly execute both intra-group
-- evacuations (in all modes) and inter-group evacuations (in the
-- 'ChangeAll' mode). Of course, this requires that the correct list
-- of target nodes is passed.
nodeEvacInstance :: Node.List         -- ^ The node list (cluster-wide)
                 -> Instance.List     -- ^ Instance list (cluster-wide)
                 -> EvacMode          -- ^ The evacuation mode
                 -> Instance.Instance -- ^ The instance to be evacuated
                 -> Gdx               -- ^ The group we're targetting
                 -> [Ndx]             -- ^ The list of available nodes
                                      -- for allocation
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode])
-- Diskless and externally-mirrored templates can only change their
-- single (primary) node; 'failOnSecondaryChange' rejects the
-- ChangeSecondary mode for them.
nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTDiskless})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

-- Locally-stored templates cannot be relocated at all.
nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTPlain}) _ _ =
                  fail "Instances of type plain cannot be relocated"

nodeEvacInstance _ _ _ (Instance.Instance
                        {Instance.diskTemplate = DTFile}) _ _ =
                  fail "Instances of type file cannot be relocated"

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTSharedFile})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTBlock})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTRbd})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTExt})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

nodeEvacInstance nl il mode inst@(Instance.Instance
                                  {Instance.diskTemplate = dt@DTGluster})
                 gdx avail_nodes =
                   failOnSecondaryChange mode dt >>
                   evacOneNodeOnly nl il inst gdx avail_nodes

-- DRBD, primary change: a plain failover to the existing secondary.
nodeEvacInstance nl il ChangePrimary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 _ _ =
  do
    (nl', inst', _, _) <- opToResult $ applyMove nl inst Failover
    let idx = Instance.idx inst
        il' = Container.add idx inst' il
        ops = iMoveToJob nl' il' idx Failover
    return (nl', il', ops)

-- DRBD, secondary change: pick the best replace-secondary target.
nodeEvacInstance nl il ChangeSecondary
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  evacOneNodeOnly nl il inst gdx avail_nodes

-- The algorithm for ChangeAll is as follows:
--
-- * generate all (primary, secondary) node pairs for the target groups
-- * for each pair, execute the needed moves (r:s, f, r:s) and compute
--   the final node list state and group score
-- * select the best choice via a foldl that uses the same Either
--   String solution as the ChangeSecondary mode
nodeEvacInstance nl il ChangeAll
                 inst@(Instance.Instance {Instance.diskTemplate = DTDrbd8})
                 gdx avail_nodes =
  do
    let no_nodes = Left "no nodes available"
        node_pairs = [(p,s) | p <- avail_nodes, s <- avail_nodes, p /= s]
    (nl', il', ops, _) <-
      annotateResult "Can't find any good nodes for relocation" .
      eitherToResult $
      foldl'
      (\accu nodes -> case evacDrbdAllInner nl il inst gdx nodes of
                        Bad msg ->
                          case accu of
                            Right _ -> accu
                            -- we don't need more details (which
                            -- nodes, etc.) as we only selected
                            -- this group if we can allocate on
                            -- it, hence failures will not
                            -- propagate out of this fold loop
                            Left _ -> Left $ "Allocation failed: " ++ msg
                        Ok result@(_, _, _, new_cv) ->
                          let new_accu = Right result in
                          case accu of
                            Left _ -> new_accu
                            Right (_, _, _, old_cv) ->
                              -- Keep the solution with the lower
                              -- (better) cluster score.
                              if old_cv < new_cv
                                then accu
                                else new_accu
      ) no_nodes node_pairs

    return (nl', il', ops)
1116
-- | Generic function for changing one node of an instance.
--
-- This is similar to 'nodeEvacInstance' but will be used in a few of
-- its sub-patterns. It folds the inner function 'evacOneNodeInner'
-- over the list of available nodes, which results in the best choice
-- for relocation.
evacOneNodeOnly :: Node.List         -- ^ The node list (cluster-wide)
                -> Instance.List     -- ^ Instance list (cluster-wide)
                -> Instance.Instance -- ^ The instance to be evacuated
                -> Gdx               -- ^ The group we're targetting
                -> [Ndx]             -- ^ The list of available nodes
                                     -- for allocation
                -> Result (Node.List, Instance.List, [OpCodes.OpCode])
evacOneNodeOnly nl il inst gdx avail_nodes = do
  -- Pick the move constructor matching the mirroring type: internal
  -- mirrors (DRBD) replace their secondary, external mirrors fail
  -- over to an arbitrary node; non-mirrored instances cannot move.
  op_fn <- case Instance.mirrorType inst of
             MirrorNone -> Bad "Can't relocate/evacuate non-mirrored instances"
             MirrorInternal -> Ok ReplaceSecondary
             MirrorExternal -> Ok FailoverToAny
  -- Fold over all candidate nodes, keeping the best (lowest group
  -- score) successful move; 'Left' means no solution found yet.
  (nl', inst', _, ndx) <- annotateResult "Can't find any good node" .
                          eitherToResult $
                          foldl' (evacOneNodeInner nl inst gdx op_fn)
                          (Left "no nodes available") avail_nodes
  let idx = Instance.idx inst
      il' = Container.add idx inst' il
      ops = iMoveToJob nl' il' idx (op_fn ndx)
  return (nl', il', ops)
1143
-- | Inner fold function for changing one node of an instance.
--
-- Depending on the instance disk template, this will either change
-- the secondary (for DRBD) or the primary node (for shared
-- storage). However, the operation is generic otherwise.
--
-- The running solution is either a @Left String@, which means we
-- don't have yet a working solution, or a @Right (...)@, which
-- represents a valid solution; it holds the modified node list, the
-- modified instance (after evacuation), the score of that solution,
-- and the new secondary node index.
evacOneNodeInner :: Node.List         -- ^ Cluster node list
                 -> Instance.Instance -- ^ Instance being evacuated
                 -> Gdx               -- ^ The group index of the instance
                 -> (Ndx -> IMove)    -- ^ Operation constructor
                 -> EvacInnerState    -- ^ Current best solution
                 -> Ndx               -- ^ Node we're evaluating as target
                 -> EvacInnerState    -- ^ New best solution
evacOneNodeInner nl inst gdx op_fn accu ndx =
  case applyMove nl inst (op_fn ndx) of
    -- A failed move only replaces the accumulator when we have no
    -- working solution yet; an existing solution is kept as-is.
    Bad fm -> let fail_msg = "Node " ++ Container.nameOf nl ndx ++
                             " failed: " ++ show fm
              in either (const $ Left fail_msg) (const accu) accu
    Ok (nl', inst', _, _) ->
      let nodes = Container.elems nl'
          -- The fromJust below is ugly (it can fail nastily), but
          -- at this point we should not have any internal mismatches,
          -- and adding a monad here would be quite involved
          grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
          -- Score only the target group, not the whole cluster.
          new_cv = compCVNodes grpnodes
          new_accu = Right (nl', inst', new_cv, ndx)
      in case accu of
           Left _ -> new_accu
           Right (_, _, old_cv, _) ->
             -- Keep whichever solution has the lower (better) score.
             if old_cv < new_cv
               then accu
               else new_accu
1181
-- | Compute result of changing all nodes of a DRBD instance.
--
-- Given the target primary and secondary node (which might be in a
-- different group or not), this function will 'execute' all the
-- required steps and assuming all operations succeed, will return
-- the modified node and instance lists, the opcodes needed for this
-- and the new group score.
evacDrbdAllInner :: Node.List         -- ^ Cluster node list
                 -> Instance.List     -- ^ Cluster instance list
                 -> Instance.Instance -- ^ The instance to be moved
                 -> Gdx               -- ^ The target group index
                                      -- (which can differ from the
                                      -- current group of the
                                      -- instance)
                 -> (Ndx, Ndx)        -- ^ Tuple of new
                                      -- primary\/secondary nodes
                 -> Result (Node.List, Instance.List, [OpCodes.OpCode], Score)
evacDrbdAllInner nl il inst gdx (t_pdx, t_sdx) = do
  let primary = Container.find (Instance.pNode inst) nl
      idx = Instance.idx inst
  -- if the primary is offline, then we first failover
  (nl1, inst1, ops1) <-
    if Node.offline primary
      then do
        (nl', inst', _, _) <-
          annotateResult "Failing over to the secondary" .
          opToResult $ applyMove nl inst Failover
        return (nl', inst', [Failover])
      else return (nl, inst, [])
  let (o1, o2, o3) = (ReplaceSecondary t_pdx,
                      Failover,
                      ReplaceSecondary t_sdx)
  -- we now need to execute a replace secondary to the future
  -- primary node
  (nl2, inst2, _, _) <-
    annotateResult "Changing secondary to new primary" .
    opToResult $
    applyMove nl1 inst1 o1
  let ops2 = o1:ops1
  -- we now execute another failover, the primary stays fixed now
  (nl3, inst3, _, _) <- annotateResult "Failing over to new primary" .
                        opToResult $ applyMove nl2 inst2 o2
  let ops3 = o2:ops2
  -- and finally another replace secondary, to the final secondary
  (nl4, inst4, _, _) <-
    annotateResult "Changing secondary to final secondary" .
    opToResult $
    applyMove nl3 inst3 o3
  let ops4 = o3:ops3
      il' = Container.add idx inst4 il
      -- ops were accumulated in reverse order, so restore execution
      -- order before converting them to job opcodes
      ops = concatMap (iMoveToJob nl4 il' idx) $ reverse ops4
  let nodes = Container.elems nl4
      -- The fromJust below is ugly (it can fail nastily), but
      -- at this point we should not have any internal mismatches,
      -- and adding a monad here would be quite involved
      grpnodes = fromJust (gdx `lookup` Node.computeGroups nodes)
      -- Score only the target group, matching 'evacOneNodeInner'.
      new_cv = compCVNodes grpnodes
  return (nl4, il', ops, new_cv)
1240
-- | Computes the nodes in a given group which are available for
-- allocation.
availableGroupNodes :: [(Gdx, [Ndx])] -- ^ Group index/node index assoc list
                    -> IntSet.IntSet  -- ^ Nodes that are excluded
                    -> Gdx            -- ^ The group for which we
                                      -- query the nodes
                    -> Result [Ndx]   -- ^ List of available node indices
availableGroupNodes group_nodes excl_ndx gdx =
  case lookup gdx group_nodes of
    Nothing -> Bad $ "Can't find group with index " ++ show gdx
    -- Drop every node in the exclusion set (offline/evacuated nodes).
    Just ndxs -> Ok $ filter (\n -> not (IntSet.member n excl_ndx)) ndxs
1253
-- | Updates the evac solution with the results of an instance
-- evacuation.
updateEvacSolution :: (Node.List, Instance.List, EvacSolution)
                   -> Idx
                   -> Result (Node.List, Instance.List, [OpCodes.OpCode])
                   -> (Node.List, Instance.List, EvacSolution)
-- Failure: keep the cluster state, record the failure reason.
updateEvacSolution (nl, il, es) idx (Bad msg) =
  (nl, il, es { esFailed = (idx, msg):esFailed es })
-- Success: adopt the new cluster state and record the move.
updateEvacSolution (_, _, es) idx (Ok (nl', il', ops)) =
  let inst = Container.find idx il'
      moved = (idx,
               instancePriGroup nl' inst,
               Instance.allNodes inst)
  in (nl', il', es { esMoved = moved:esMoved es
                   , esOpCodes = ops:esOpCodes es })
1269
-- | Node-evacuation IAllocator mode main function.
tryNodeEvac :: Group.List    -- ^ The cluster groups
            -> Node.List     -- ^ The node list (cluster-wide, not per group)
            -> Instance.List -- ^ Instance list (cluster-wide)
            -> EvacMode      -- ^ The evacuation mode
            -> [Idx]         -- ^ List of instance (indices) to be evacuated
            -> Result (Node.List, Instance.List, EvacSolution)
tryNodeEvac _ ini_nl ini_il mode idxs =
  let -- Nodes being evacuated must not be used as targets...
      evac_ndx = nodesToEvacuate ini_il mode idxs
      -- ...and neither must offline nodes.
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) evac_ndx offline
      -- Per-group candidate node indices.
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                                (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      -- Evacuate instances one at a time, threading the evolving
      -- cluster state and solution through the fold.
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  let gdx = instancePriGroup nl inst
                      pdx = Instance.pNode inst in
                  updateEvacSolution state (Instance.idx inst) $
                  availableGroupNodes group_ndx
                  (IntSet.insert pdx excl_ndx) gdx >>=
                  nodeEvacInstance nl il mode inst gdx
               )
        (ini_nl, ini_il, emptyEvacSolution)
        (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1296
-- | Change-group IAllocator mode main function.
--
-- This is very similar to 'tryNodeEvac', the only difference is that
-- we don't choose as target group the current instance group, but
-- instead:
--
-- 1. at the start of the function, we compute which are the target
-- groups; either no groups were passed in, in which case we choose
-- all groups out of which we don't evacuate instance, or there were
-- some groups passed, in which case we use those
--
-- 2. for each instance, we use 'findBestAllocGroup' to choose the
-- best group to hold the instance, and then we do what
-- 'tryNodeEvac' does, except for this group instead of the current
-- instance group.
--
-- Note that the correct behaviour of this function relies on the
-- function 'nodeEvacInstance' to be able to do correctly both
-- intra-group and inter-group moves when passed the 'ChangeAll' mode.
tryChangeGroup :: Group.List    -- ^ The cluster groups
               -> Node.List     -- ^ The node list (cluster-wide)
               -> Instance.List -- ^ Instance list (cluster-wide)
               -> [Gdx]         -- ^ Target groups; if empty, any
                                -- groups not being evacuated
               -> [Idx]         -- ^ List of instance (indices) to be evacuated
               -> Result (Node.List, Instance.List, EvacSolution)
tryChangeGroup gl ini_nl ini_il gdxs idxs =
  let -- Groups the selected instances currently live in; these are
      -- never valid targets.
      evac_gdxs = nub $ map (instancePriGroup ini_nl .
                             flip Container.find ini_il) idxs
      target_gdxs = (if null gdxs
                       then Container.keys gl
                       else gdxs) \\ evac_gdxs
      offline = map Node.idx . filter Node.offline $ Container.elems ini_nl
      excl_ndx = foldl' (flip IntSet.insert) IntSet.empty offline
      group_ndx = map (\(gdx, (nl, _)) -> (gdx, map Node.idx
                                                (Container.elems nl))) $
                  splitCluster ini_nl ini_il
      (fin_nl, fin_il, esol) =
        foldl' (\state@(nl, il, _) inst ->
                  -- For each instance: pick the best target group,
                  -- then evacuate into it with 'ChangeAll'.
                  let solution = do
                        let ncnt = Instance.requiredNodes $
                                   Instance.diskTemplate inst
                        (grp, _, _) <- findBestAllocGroup gl nl il
                                       (Just target_gdxs) inst ncnt
                        let gdx = Group.idx grp
                        av_nodes <- availableGroupNodes group_ndx
                                    excl_ndx gdx
                        nodeEvacInstance nl il ChangeAll inst gdx av_nodes
                  in updateEvacSolution state (Instance.idx inst) solution
               )
        (ini_nl, ini_il, emptyEvacSolution)
        (map (`Container.find` ini_il) idxs)
  in return (fin_nl, fin_il, reverseEvacSolution esol)
1350
-- | Standard-sized allocation method.
--
-- This places instances of the same size on the cluster until we're
-- out of space. The result will be a list of identically-sized
-- instances.
iterateAlloc :: AllocMethod
iterateAlloc nl il limit newinst allocnodes ixes cstats =
  let depth = length ixes
      -- Synthetic name for the next instance, based on recursion depth.
      newname = printf "new-%d" depth::String
      newidx = Container.size il
      newi2 = Instance.setIdx (Instance.setName newinst newname) newidx
      -- Decrement the (optional) remaining-allocations limit.
      newlimit = fmap (flip (-) 1) limit
  in case tryAlloc nl il newi2 allocnodes of
       Bad s -> Bad s
       Ok (AllocSolution { asFailures = errs, asSolution = sols3 }) ->
         -- 'newsol' is the result if we stop at this depth.
         let newsol = Ok (collapseFailures errs, nl, il, ixes, cstats) in
         case sols3 of
           Nothing -> newsol
           Just (xnl, xi, _, _) ->
             if limit == Just 0
               then newsol
               -- Recurse with the updated cluster state and stats.
               else iterateAlloc xnl (Container.add newidx xi il)
                    newlimit newinst allocnodes (xi:ixes)
                    (totalResources xnl:cstats)
1375
-- | Predicate whether shrinking a single resource can lead to a valid
-- allocation.
--
-- Starting from @inst@, repeatedly shrink the resource given by the
-- failure mode @fm@ ('iterateOk'), pairing each candidate with its
-- allocation result.  We walk the sequence while the candidates keep
-- failing for reason @fm@ (or succeed), and return the first candidate
-- that actually allocates, if any.
sufficesShrinking :: (Instance.Instance -> AllocSolution) -> Instance.Instance
                  -> FailMode -> Maybe Instance.Instance
sufficesShrinking allocFn inst fm =
  case dropWhile (isNothing . asSolution . fst)
       . takeWhile (liftA2 (||) (elem fm . asFailures . fst)
                    (isJust . asSolution . fst))
       . map (allocFn &&& id) $
       iterateOk (`Instance.shrinkByType` fm) inst
  of x:_ -> Just . snd $ x
     _ -> Nothing
1388
-- | Tiered allocation method.
--
-- This places instances on the cluster, and decreases the spec until
-- we can allocate again. The result will be a list of decreasing
-- instance specs.
tieredAlloc :: AllocMethod
tieredAlloc nl il limit newinst allocnodes ixes cstats =
  case iterateAlloc nl il limit newinst allocnodes ixes cstats of
    Bad s -> Bad s
    Ok (errs, nl', il', ixes', cstats') ->
      let newsol = Ok (errs, nl', il', ixes', cstats')
          ixes_cnt = length ixes'
          (stop, newlimit) = case limit of
                               Nothing -> (False, Nothing)
                               Just n -> (n <= ixes_cnt,
                                          Just (n - ixes_cnt))
          -- Failure modes ordered by increasing frequency.
          sortedErrs = map fst $ sortBy (comparing snd) errs
          suffShrink = sufficesShrinking (fromMaybe emptyAllocSolution
                                          . flip (tryAlloc nl' il') allocnodes)
                       newinst
          -- Candidate "big step" shrinks, most frequent failure first.
          bigSteps = filter isJust . map suffShrink . reverse $ sortedErrs
          -- A recursion step made progress iff it allocated more
          -- instances than we already had.
          progress (Ok (_, _, _, newil', _)) (Ok (_, _, _, newil, _)) =
            length newil' > length newil
          progress _ _ = False
      in if stop then newsol else
           -- First try a single shrink along the most frequent
           -- failure mode ('last sortedErrs'; assumes errs is
           -- non-empty when allocation stopped short).
           let newsol' = case Instance.shrinkByType newinst . last
                              $ sortedErrs of
                           Bad _ -> newsol
                           Ok newinst' -> tieredAlloc nl' il' newlimit
                                          newinst' allocnodes ixes' cstats'
           in if progress newsol' newsol then newsol' else
                -- Otherwise fall back to the first shrink known to
                -- suffice for a successful allocation.
                case bigSteps of
                  Just newinst':_ -> tieredAlloc nl' il' newlimit
                                     newinst' allocnodes ixes' cstats'
                  _ -> newsol
1424
1425 -- * Formatting functions
1426
-- | Given the original and final nodes, computes the relocation description.
computeMoves :: Instance.Instance -- ^ The instance to be moved
             -> String -- ^ The instance name
             -> IMove  -- ^ The move being performed
             -> String -- ^ New primary
             -> String -- ^ New secondary
             -> (String, [String])
                -- ^ Tuple of moves and commands list; moves is containing
                -- either @/f/@ for failover or @/r:name/@ for replace
                -- secondary, while the command list holds gnt-instance
                -- commands (without that prefix), e.g \"@failover instance1@\"
computeMoves i inam mv c d =
  case mv of
    Failover -> ("f", [mig])
    FailoverToAny _ -> (printf "fa:%s" c, [mig_any])
    FailoverAndReplace _ -> (printf "f r:%s" d, [mig, rep d])
    ReplaceSecondary _ -> (printf "r:%s" d, [rep d])
    ReplaceAndFailover _ -> (printf "r:%s f" c, [rep c, mig])
    ReplacePrimary _ -> (printf "f r:%s f" c, [mig, rep c, mig])
  where -- Running instances are migrated; stopped ones failed over.
        morf = if Instance.isRunning i then "migrate" else "failover"
        mig = printf "%s -f %s" morf inam::String
        mig_any = printf "%s -f -n %s %s" morf c inam::String
        rep n = printf "replace-disks -n %s %s" n inam::String
1450
-- | Converts a placement to string format.
printSolutionLine :: Node.List     -- ^ The node list
                  -> Instance.List -- ^ The instance list
                  -> Int           -- ^ Maximum node name length
                  -> Int           -- ^ Maximum instance name length
                  -> Placement     -- ^ The current placement
                  -> Int           -- ^ The index of the placement in
                                   -- the solution
                  -> (String, [String])
printSolutionLine nl il nmlen imlen plc pos =
  let pmlen = (2*nmlen + 1)
      (i, p, s, mv, c) = plc
      old_sec = Instance.sNode inst
      inst = Container.find i il
      inam = Instance.alias inst
      -- New and old primary/secondary node aliases.
      npri = Node.alias $ Container.find p nl
      nsec = Node.alias $ Container.find s nl
      opri = Node.alias $ Container.find (Instance.pNode inst) nl
      osec = Node.alias $ Container.find old_sec nl
      (moves, cmds) = computeMoves inst inam mv npri nsec
      -- FIXME: this should check instead/also the disk template
      ostr = if old_sec == Node.noSecondary
               then printf "%s" opri::String
               else printf "%s:%s" opri osec::String
      nstr = if s == Node.noSecondary
               then printf "%s" npri::String
               else printf "%s:%s" npri nsec::String
  in (printf "  %3d. %-*s %-*s => %-*s %12.8f a=%s"
      pos imlen inam pmlen ostr pmlen nstr c moves,
      cmds)
1481
-- | Return the instance and involved nodes in an instance move.
--
-- Note that the output list length can vary, and is not required nor
-- guaranteed to be of any specific length.
involvedNodes :: Instance.List -- ^ Instance list, used for retrieving
                               -- the instance from its index; note
                               -- that this /must/ be the original
                               -- instance list, so that we can
                               -- retrieve the old nodes
              -> Placement     -- ^ The placement we're investigating,
                               -- containing the new nodes and
                               -- instance index
              -> [Ndx]         -- ^ Resulting list of node indices
involvedNodes il (idx, new_pri, new_sec, _, _) =
  let moved = Container.find idx il
  -- Combine new and old nodes, dropping duplicates and the
  -- "no node" sentinel (negative indices).
  in nub . filter (>= 0) $ new_pri : new_sec : Instance.allNodes moved
1499
-- | From two adjacent cluster tables get the list of moves that transitions
-- from to the other
getMoves :: (Table, Table) -> [MoveJob]
getMoves (Table _ initial_il _ initial_plc, Table final_nl _ _ final_plc) =
  let
    plctoMoves (plc@(idx, p, s, mv, _)) =
      let inst = Container.find idx initial_il
          inst_name = Instance.name inst
          affected = involvedNodes initial_il plc
          np = Node.alias $ Container.find p final_nl
          ns = Node.alias $ Container.find s final_nl
          (_, cmds) = computeMoves inst inst_name mv np ns
      in (affected, idx, mv, cmds)
  -- Placements are prepended as they are found, so only the newly
  -- added ones (beyond the initial table's) are kept, in
  -- chronological order.
  in map plctoMoves . reverse . drop (length initial_plc) $ reverse final_plc
1514
-- | Inner function for splitJobs, that either appends the next job to
-- the current jobset, or starts a new jobset.
--
-- The accumulator is the jobsets built so far plus a buffer of all
-- node indices touched by the current jobset.
mergeJobs :: ([JobSet], [Ndx]) -> MoveJob -> ([JobSet], [Ndx])
-- First job: open the initial jobset and remember its nodes.
mergeJobs ([], _) n@(ndx, _, _, _) = ([[n]], ndx)
mergeJobs (cjs@(j:js), nbuf) n@(ndx, _, _, _)
  -- No node overlap with the current jobset: the job is independent
  -- and can be added to it (run in parallel).
  | null (ndx `intersect` nbuf) = ((n:j):js, ndx ++ nbuf)
  -- Otherwise start a fresh jobset (and a fresh node buffer).
  | otherwise = ([n]:cjs, ndx)
1522
-- | Break a list of moves into independent groups. Note that this
-- will reverse the order of jobs.
--
-- Uses a strict left fold ('foldl'') so the accumulator is not built
-- up as a chain of thunks on long move lists.
splitJobs :: [MoveJob] -> [JobSet]
splitJobs = fst . foldl' mergeJobs ([], [])
1527
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatJob :: Int -> Int -> (Int, MoveJob) -> [String]
formatJob jsn jsl (sn, (_, _, _, cmds)) =
  -- Only the first job of a jobset gets the jobset banner prepended.
  if sn == 1
    then ["", printf "echo jobset %d, %d jobs" jsn jsl] ++ body
    else body
  where body = printf "  echo job %d/%d" jsn sn:
               printf "  check":
               map ("  gnt-instance " ++) cmds
1539
-- | Given a list of commands, prefix them with @gnt-instance@ and
-- also beautify the display a little.
formatCmds :: [JobSet] -> String
formatCmds jsets =
  unlines
    [ line
    | (jsn, js) <- zip [1..] jsets
    , numbered <- zip [1..] js
    , line <- formatJob jsn (length js) numbered
    ]
1548
-- | Print the node list.
printNodes :: Node.List -> [String] -> String
printNodes nl fs = printTable "" hdr rows nums
  where
    -- Empty selection means the defaults; a leading "+" extends the
    -- defaults with the remaining fields; anything else is literal.
    fields = case fs of
               [] -> Node.defaultFields
               "+":rest -> Node.defaultFields ++ rest
               _ -> fs
    ordered = sortBy (comparing Node.idx) (Container.elems nl)
    (hdr, nums) = unzip $ map Node.showHeader fields
    rows = map (Node.list fields) ordered
1559
-- | Print the instance list as a formatted table.
printInsts :: Node.List -> Instance.List -> String
printInsts nl il = printTable "" header (map showInst sorted_insts) isnum
  where
    sorted_insts = sortBy (comparing Instance.idx) (Container.elems il)
    header = [ "F", "Name", "Pri_node", "Sec_node", "Auto_bal"
             , "vcpu", "mem" , "dsk", "lCpu", "lMem", "lDsk", "lNet" ]
    -- the first five columns are textual, the rest numeric
    isnum = False:False:False:False:False:repeat True
    -- one table row per instance
    showInst inst =
      let DynUtil lC lM lD lN = Instance.util inst
          sdx = Instance.sNode inst
          -- instances without a secondary get an empty column
          sec_name = if sdx == Node.noSecondary
                       then ""
                       else Container.nameOf nl sdx
      in [ if Instance.isRunning inst then "R" else " "
         , Instance.name inst
         , Container.nameOf nl (Instance.pNode inst)
         , sec_name
         , if Instance.autoBalance inst then "Y" else "N"
         , printf "%3d" $ Instance.vcpus inst
         , printf "%5d" $ Instance.mem inst
         , printf "%5d" $ Instance.dsk inst `div` 1024
         , printf "%5.3f" lC
         , printf "%5.3f" lM
         , printf "%5.3f" lD
         , printf "%5.3f" lN
         ]
1585
-- | Shows statistics for a given node list.
printStats :: String -> Node.List -> String
printStats lp nl = printTable lp header rows (False:repeat True)
  where
    dcvs = compDetailedCV $ Container.elems nl
    (weights, names) = unzip detailedCVInfo
    -- pad with neutral weight/placeholder name in case there are
    -- more metrics than entries in detailedCVInfo
    stats = zip3 (weights ++ repeat 1) (names ++ repeat "unknown") dcvs
    header = [ "Field", "Value", "Weight" ]
    rows = [ [ name, printf "%.8f" val, printf "x%.2f" weight ]
           | (weight, name, val) <- stats ]
1599
-- | Convert a placement into a list of OpCodes (basically a job).
iMoveToJob :: Node.List        -- ^ The node list; only used for node
                               -- names, so any version is good
                               -- (before or after the operation)
           -> Instance.List    -- ^ The instance list; also used for
                               -- names only
           -> Idx              -- ^ The index of the instance being
                               -- moved
           -> IMove            -- ^ The actual move to be described
           -> [OpCodes.OpCode] -- ^ The list of opcodes equivalent to
                               -- the given move
iMoveToJob nl il idx move =
  let inst = Container.find idx il
      iname = Instance.name inst
      -- Resolve a node index to a non-empty node name wrapped in
      -- Just (the record fields below expect Maybe); the Bad branch
      -- should be unreachable since node names are never empty.
      lookNode n = case mkNonEmpty (Container.nameOf nl n) of
                     -- FIXME: convert htools codebase to non-empty strings
                     Bad msg -> error $ "Empty node name for idx " ++
                                show n ++ ": " ++ msg ++ "??"
                     Ok ne -> Just ne
      -- Migration opcode without an explicit target node; used for
      -- DRBD, where the target is implied by the mirror.
      opF = OpCodes.OpInstanceMigrate
              { OpCodes.opInstanceName = iname
              , OpCodes.opInstanceUuid = Nothing
              , OpCodes.opMigrationMode = Nothing -- default
              , OpCodes.opOldLiveMode = Nothing -- default as well
              , OpCodes.opTargetNode = Nothing -- this is drbd
              , OpCodes.opTargetNodeUuid = Nothing
              , OpCodes.opAllowRuntimeChanges = False
              , OpCodes.opIgnoreIpolicy = False
              , OpCodes.opMigrationCleanup = False
              , OpCodes.opIallocator = Nothing
              , OpCodes.opAllowFailover = True }
      -- Migration to an explicitly-named target node.
      opFA n = opF { OpCodes.opTargetNode = lookNode n } -- not drbd
      -- Replace-disks opcode making node n the new secondary.
      opR n = OpCodes.OpInstanceReplaceDisks
                { OpCodes.opInstanceName = iname
                , OpCodes.opInstanceUuid = Nothing
                , OpCodes.opEarlyRelease = False
                , OpCodes.opIgnoreIpolicy = False
                , OpCodes.opReplaceDisksMode = OpCodes.ReplaceNewSecondary
                , OpCodes.opReplaceDisksList = []
                , OpCodes.opRemoteNode = lookNode n
                , OpCodes.opRemoteNodeUuid = Nothing
                , OpCodes.opIallocator = Nothing
                }
  -- Each move expands into the opcode sequence implementing it;
  -- multi-opcode sequences rely on the job executing them in order.
  in case move of
       Failover -> [ opF ]
       FailoverToAny np -> [ opFA np ]
       ReplacePrimary np -> [ opF, opR np, opF ]
       ReplaceSecondary ns -> [ opR ns ]
       ReplaceAndFailover np -> [ opR np, opF ]
       FailoverAndReplace ns -> [ opF, opR ns ]
1650
1651 -- * Node group functions
1652
-- | Computes the group of an instance, failing if its primary and
-- secondary nodes live in different node groups.
instanceGroup :: Node.List -> Instance.Instance -> Result Gdx
instanceGroup nl i =
  let sidx = Instance.sNode i
      pnode = Container.find (Instance.pNode i) nl
      -- an instance without a secondary is considered to live
      -- entirely in its primary node's group
      snode = if sidx == Node.noSecondary
                then pnode
                else Container.find sidx nl
      pgroup = Node.group pnode
      sgroup = Node.group snode
  in if pgroup /= sgroup
       -- fixed typo in the error message ("accross" -> "across")
       then fail ("Instance placed across two node groups, primary " ++
                  show pgroup ++ ", secondary " ++ show sgroup)
       else return pgroup
1667
-- | Computes the group of an instance per the primary node.
instancePriGroup :: Node.List -> Instance.Instance -> Gdx
instancePriGroup nl inst =
  Node.group $ Container.find (Instance.pNode inst) nl
1673
-- | Compute the list of badly allocated instances (split across node
-- groups).
findSplitInstances :: Node.List -> Instance.List -> [Instance.Instance]
findSplitInstances nl il =
  [ inst
  | inst <- Container.elems il
  , not . isOk $ instanceGroup nl inst
  ]
1679
-- | Splits a cluster into the component node groups.
splitCluster :: Node.List -> Instance.List ->
                [(Gdx, (Node.List, Instance.List))]
splitCluster nl il = map perGroup ngroups
  where
    ngroups = Node.computeGroups (Container.elems nl)
    -- restrict the node and instance containers to a single group;
    -- an instance belongs to the group of its primary node
    perGroup (gdx, nodes) =
      let nidxs = map Node.idx nodes
          group_nl = Container.fromList (zip nidxs nodes)
          group_il = Container.filter ((`elem` nidxs) . Instance.pNode) il
      in (gdx, (group_nl, group_il))
1690
-- | Compute the set of nodes that are to be evacuated, given a list
-- of instances and an evacuation mode.
nodesToEvacuate :: Instance.List -- ^ The cluster-wide instance list
                -> EvacMode      -- ^ The evacuation mode we're using
                -> [Idx]         -- ^ List of instance indices being evacuated
                -> IntSet.IntSet -- ^ Set of node indices
nodesToEvacuate il mode idxs =
  IntSet.delete Node.noSecondary $ foldl' addInstance IntSet.empty idxs
  where
    -- fold one instance's nodes into the accumulated set, according
    -- to the evacuation mode
    addInstance ns idx =
      let inst = Container.find idx il
          pdx = Instance.pNode inst
          sdx = Instance.sNode inst
          -- only DRBD instances contribute a meaningful secondary
          withSecondary = case Instance.diskTemplate inst of
                            DTDrbd8 -> IntSet.insert sdx ns
                            _ -> ns
      in case mode of
           ChangePrimary -> IntSet.insert pdx ns
           ChangeSecondary -> withSecondary
           ChangeAll -> IntSet.insert pdx withSecondary