Merge branch 'stable-2.16' into stable-2.17
[ganeti-github.git] / src / Ganeti / HTools / Program / Harep.hs
1 {-# LANGUAGE TupleSections #-}
2
3 {-| Auto-repair tool for Ganeti.
4
5 -}
6
7 {-
8
9 Copyright (C) 2013 Google Inc.
10 All rights reserved.
11
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are
14 met:
15
16 1. Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 2. Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
24 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
25 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
27 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
35 -}
36
37 module Ganeti.HTools.Program.Harep
38 ( main
39 , arguments
40 , options) where
41
42 import Control.Exception (bracket)
43 import Control.Lens (over)
44 import Control.Monad
45 import Data.Maybe
46 import System.Time
47 import qualified Data.Map as Map
48
49 import Ganeti.BasicTypes
50 import Ganeti.Common
51 import Ganeti.Errors
52 import Ganeti.JQueue (currentTimestamp, reasonTrailTimestamp)
53 import Ganeti.JQueue.Objects (Timestamp)
54 import Ganeti.Jobs
55 import Ganeti.OpCodes
56 import Ganeti.OpCodes.Lens (metaParamsL, opReasonL)
57 import Ganeti.Types
58 import Ganeti.Utils
59 import qualified Ganeti.Luxi as L
60 import qualified Ganeti.Path as Path
61
62 import Ganeti.HTools.CLI
63 import qualified Ganeti.HTools.Container as Container
64 import Ganeti.HTools.Loader
65 import Ganeti.HTools.ExtLoader
66 import Ganeti.HTools.Repair
67 import Ganeti.HTools.Types
68 import qualified Ganeti.HTools.Instance as Instance
69
70 import Ganeti.Version (version)
71
72
73 -- | Options list and functions.
74 options :: IO [OptType]
75 options = do
76 luxi <- oLuxiSocket
77 return
78 [ luxi
79 , oJobDelay
80 , oReason
81 ]
82
83 arguments :: [ArgCompletion]
84 arguments = []
85
86 -- | Wraps an 'OpCode' in a 'MetaOpCode' while also adding a comment
87 -- about what generated the opcode.
88 annotateOpCode :: Maybe String -> Timestamp -> OpCode -> MetaOpCode
89 annotateOpCode reason ts =
90 over (metaParamsL . opReasonL)
91 (++ [( "harep", fromMaybe ("harep " ++ version ++ " called") reason
92 , reasonTrailTimestamp ts)])
93 . setOpComment ("automated repairs by harep " ++ version)
94 . wrapOpCode
95
96 -- | Query jobs of a pending repair, returning the new instance data.
97 processPending :: Options -> L.Client -> InstanceData -> IO InstanceData
98 processPending opts client instData =
99 case arState instData of
100 (ArPendingRepair arData) -> do
101 sts <- L.queryJobsStatus client $ arJobs arData
102 time <- getClockTime
103 case sts of
104 Bad e -> exitErr $ "could not check job status: " ++ formatError e
105 Ok sts' ->
106 if any (<= JOB_STATUS_RUNNING) sts' then
107 return instData -- (no change)
108 else do
109 let iname = Instance.name $ arInstance instData
110 srcSt = arStateName $ arState instData
111 destSt = arStateName arState'
112 putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++
113 show destSt)
114 commitChange opts client instData'
115 where
116 instData' =
117 instData { arState = arState'
118 , tagsToRemove = delCurTag instData
119 }
120 arState' =
121 if all (== JOB_STATUS_SUCCESS) sts' then
122 ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess
123 , arTime = time })
124 else
125 ArFailedRepair (updateTag $ arData { arResult = Just ArFailure
126 , arTime = time })
127
128 _ -> return instData
129
130 -- | Apply and remove tags from an instance as indicated by 'InstanceData'.
131 --
132 -- If the /arState/ of the /InstanceData/ record has an associated
133 -- 'AutoRepairData', add its tag to the instance object. Additionally, if
134 -- /tagsToRemove/ is not empty, remove those tags from the instance object. The
135 -- returned /InstanceData/ object always has an empty /tagsToRemove/.
136 commitChange :: Options -> L.Client -> InstanceData -> IO InstanceData
137 commitChange opts client instData = do
138 now <- currentTimestamp
139 let iname = Instance.name $ arInstance instData
140 arData = getArData $ arState instData
141 rmTags = tagsToRemove instData
142 execJobsWaitOk' opcodes = do
143 res <- execJobsWaitOk
144 [map (annotateOpCode (optReason opts) now) opcodes] client
145 case res of
146 Ok _ -> return ()
147 Bad e -> exitErr e
148
149 when (isJust arData) $ do
150 let tag = arTag $ fromJust arData
151 putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag)
152 execJobsWaitOk' [OpTagsSet TagKindInstance [tag] (Just iname)]
153
154 unless (null rmTags) $ do
155 putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++
156 unlines (map show rmTags))
157 execJobsWaitOk' [OpTagsDel TagKindInstance rmTags (Just iname)]
158
159 return instData { tagsToRemove = [] }
160
161 -- | Perform the suggested repair on an instance if its policy allows it.
162 doRepair :: Options
163 -> L.Client -- ^ The Luxi client
164 -> Double -- ^ Delay to insert before the first repair opcode
165 -> InstanceData -- ^ The instance data
166 -> (AutoRepairType, [OpCode]) -- ^ The repair job to perform
167 -> IO InstanceData -- ^ The updated instance data
168 doRepair opts client delay instData (rtype, opcodes) =
169 let inst = arInstance instData
170 ipol = Instance.arPolicy inst
171 iname = Instance.name inst
172 in
173 case ipol of
174 ArEnabled maxtype ->
175 if rtype > maxtype then do
176 uuid <- newUUID
177 time <- getClockTime
178
179 let arState' = ArNeedsRepair (
180 updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "")
181 instData' = instData { arState = arState'
182 , tagsToRemove = delCurTag instData
183 }
184
185 putStrLn ("Not performing a repair of type " ++ show rtype ++ " on " ++
186 iname ++ " because only repairs up to " ++ show maxtype ++
187 " are allowed")
188 commitChange opts client instData' -- Adds "enoperm" result label.
189 else do
190 now <- currentTimestamp
191 putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname)
192
193 -- After submitting the job, we must write an autorepair:pending tag,
194 -- that includes the repair job IDs so that they can be checked later.
195 -- One problem we run into is that the repair job immediately grabs
196 -- locks for the affected instance, and the subsequent TAGS_SET job is
197 -- blocked, introducing an unnecessary delay for the end-user. One
198 -- alternative would be not to wait for the completion of the TAGS_SET
199 -- job, contrary to what commitChange normally does; but we insist on
200 -- waiting for the tag to be set so as to abort in case of failure,
201 -- because the cluster is left in an invalid state in that case.
202 --
203 -- The proper solution (in 2.9+) would be not to use tags for storing
204 -- autorepair data, or make the TAGS_SET opcode not grab an instance's
205 -- locks (if that's deemed safe). In the meantime, we introduce an
206 -- artificial delay in the repair job (via a TestDelay opcode) so that
207 -- once we have the job ID, the TAGS_SET job can complete before the
208 -- repair job actually grabs the locks. (Please note that this is not
209 -- about synchronization, but merely about speeding up the execution of
210 -- the harep tool. If this TestDelay opcode is removed, the program is
211 -- still correct.)
212 let opcodes' =
213 if delay > 0 then
214 OpTestDelay { opDelayDuration = delay
215 , opDelayOnMaster = True
216 , opDelayOnNodes = []
217 , opDelayOnNodeUuids = Nothing
218 , opDelayRepeat = fromJust $ mkNonNegative 0
219 , opDelayInterruptible = False
220 , opDelayNoLocks = False
221 } : opcodes
222 else
223 opcodes
224
225 uuid <- newUUID
226 time <- getClockTime
227 jids <- submitJobs [map (annotateOpCode (optReason opts) now) opcodes']
228 client
229
230 case jids of
231 Bad e -> exitErr e
232 Ok jids' ->
233 let arState' = ArPendingRepair (
234 updateTag $ AutoRepairData rtype uuid time jids' Nothing "")
235 instData' = instData { arState = arState'
236 , tagsToRemove = delCurTag instData
237 }
238 in
239 commitChange opts client instData' -- Adds "pending" label.
240
241 otherSt -> do
242 putStrLn ("Not repairing " ++ iname ++ " because it's in state " ++
243 show otherSt)
244 return instData
245
246 -- | Main function.
247 main :: Options -> [String] -> IO ()
248 main opts args = do
249 unless (null args) $
250 exitErr "this program doesn't take any arguments."
251
252 luxiDef <- Path.defaultQuerySocket
253 let master = fromMaybe luxiDef $ optLuxi opts
254 opts' = opts { optLuxi = Just master }
255
256 (ClusterData _ nl il _ _) <- loadExternalData opts'
257
258 let iniDataRes = mapM setInitialState $ Container.elems il
259 iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes
260
261 -- First step: check all pending repairs, see if they are completed.
262 iniData' <- bracket (L.getLuxiClient master) L.closeClient $
263 forM iniData . processPending opts
264
265 -- Second step: detect any problems.
266 let repairs = map (detectBroken nl . arInstance) iniData'
267
268 -- Third step: create repair jobs for broken instances that are in ArHealthy.
269 let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r
270 jobDelay = optJobDelay opts
271 repairHealthy c i = case arState i of
272 ArHealthy _ -> doRepair opts c jobDelay i
273 _ -> const (return i)
274
275 repairDone <- bracket (L.getLuxiClient master) L.closeClient $
276 forM (zip iniData' repairs) . maybeRepair
277
278 -- Print some stats and exit.
279 let states = map ((, 1 :: Int) . arStateName . arState) repairDone
280 counts = Map.fromListWith (+) states
281
282 putStrLn "---------------------"
283 putStrLn "Instance status count"
284 putStrLn "---------------------"
285 putStr . unlines . Map.elems $
286 Map.mapWithKey (\k v -> k ++ ": " ++ show v) counts