Merge branch 'stable-2.16' into stable-2.17
[ganeti-github.git] / src / Ganeti / HTools / Program / Harep.hs
1 {-# LANGUAGE TupleSections #-}
2
3 {-| Auto-repair tool for Ganeti.
4
5 -}
6
7 {-
8
9 Copyright (C) 2013 Google Inc.
10 All rights reserved.
11
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are
14 met:
15
16 1. Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 2. Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
24 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
25 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
27 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
35 -}
36
37 module Ganeti.HTools.Program.Harep
38 ( main
39 , arguments
40 , options) where
41
42 import Control.Exception (bracket)
43 import Control.Lens (over)
44 import Control.Monad
45 import Data.Maybe
46 import System.Time
47 import qualified Data.Map as Map
48 import qualified Text.JSON as J
49
50 import Ganeti.BasicTypes
51 import Ganeti.Common
52 import Ganeti.Errors
53 import Ganeti.JQueue (currentTimestamp, reasonTrailTimestamp)
54 import Ganeti.JQueue.Objects (Timestamp)
55 import Ganeti.Jobs
56 import Ganeti.OpCodes
57 import Ganeti.OpCodes.Lens (metaParamsL, opReasonL)
58 import Ganeti.Types
59 import Ganeti.Utils
60 import qualified Ganeti.Luxi as L
61 import qualified Ganeti.Path as Path
62
63 import Ganeti.HTools.CLI
64 import qualified Ganeti.HTools.Container as Container
65 import Ganeti.HTools.Loader
66 import Ganeti.HTools.ExtLoader
67 import Ganeti.HTools.Repair
68 import Ganeti.HTools.Types
69 import qualified Ganeti.HTools.Instance as Instance
70
71 import Ganeti.Version (version)
72
73
74 -- | Options list and functions.
75 options :: IO [OptType]
76 options = do
77 luxi <- oLuxiSocket
78 return
79 [ luxi
80 , oJobDelay
81 , oReason
82 , oDryRun
83 ]
84
85 arguments :: [ArgCompletion]
86 arguments = []
87
88 -- | Wraps an 'OpCode' in a 'MetaOpCode' while also adding a comment
89 -- about what generated the opcode.
90 annotateOpCode :: Maybe String -> Timestamp -> OpCode -> MetaOpCode
91 annotateOpCode reason ts =
92 over (metaParamsL . opReasonL)
93 (++ [( "harep", fromMaybe ("harep " ++ version ++ " called") reason
94 , reasonTrailTimestamp ts)])
95 . setOpComment ("automated repairs by harep " ++ version)
96 . wrapOpCode
97
98 -- | Query jobs of a pending repair, returning the new instance data.
99 processPending :: Options -> L.Client -> InstanceData -> IO InstanceData
100 processPending opts client instData =
101 case arState instData of
102 (ArPendingRepair arData) -> do
103 sts <- L.queryJobsStatus client $ arJobs arData
104 time <- getClockTime
105 case sts of
106 Bad e -> exitErr $ "could not check job status: " ++ formatError e
107 Ok sts' ->
108 if any (<= JOB_STATUS_RUNNING) sts' then
109 return instData -- (no change)
110 else do
111 let iname = Instance.name $ arInstance instData
112 srcSt = arStateName $ arState instData
113 destSt = arStateName arState'
114 putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++
115 show destSt)
116 commitChange opts client instData'
117 where
118 instData' =
119 instData { arState = arState'
120 , tagsToRemove = delCurTag instData
121 }
122 arState' =
123 if all (== JOB_STATUS_SUCCESS) sts' then
124 ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess
125 , arTime = time })
126 else
127 ArFailedRepair (updateTag $ arData { arResult = Just ArFailure
128 , arTime = time })
129
130 _ -> return instData
131
132 -- | Apply and remove tags from an instance as indicated by 'InstanceData'.
133 --
134 -- If the /arState/ of the /InstanceData/ record has an associated
135 -- 'AutoRepairData', add its tag to the instance object. Additionally, if
136 -- /tagsToRemove/ is not empty, remove those tags from the instance object. The
137 -- returned /InstanceData/ object always has an empty /tagsToRemove/.
138 commitChange :: Options -> L.Client -> InstanceData -> IO InstanceData
139 commitChange opts client instData = do
140 now <- currentTimestamp
141 let iname = Instance.name $ arInstance instData
142 arData = getArData $ arState instData
143 rmTags = tagsToRemove instData
144 execJobsWaitOk' opcodes = unless (optDryRun opts) $ do
145 res <- execJobsWaitOk
146 [map (annotateOpCode (optReason opts) now) opcodes] client
147 case res of
148 Ok _ -> return ()
149 Bad e -> exitErr e
150
151 when (isJust arData) $ do
152 let tag = arTag $ fromJust arData
153 putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag)
154 execJobsWaitOk' [OpTagsSet TagKindInstance [tag] (Just iname)]
155
156 unless (null rmTags) $ do
157 putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++
158 unlines (map show rmTags))
159 execJobsWaitOk' [OpTagsDel TagKindInstance rmTags (Just iname)]
160
161 return instData { tagsToRemove = [] }
162
163 -- | Submit jobs, unless a dry-run is requested; in this case, just report
164 -- the job that would be submitted.
165 submitJobs' :: Options -> [[MetaOpCode]] -> L.Client -> IO (Result [JobId])
166 submitJobs' opts jobs client =
167 if optDryRun opts
168 then do
169 putStrLn . (++) "jobs: " . J.encode $ map (map metaOpCode) jobs
170 return $ Ok []
171 else
172 submitJobs jobs client
173
174 -- | Perform the suggested repair on an instance if its policy allows it.
175 doRepair :: Options
176 -> L.Client -- ^ The Luxi client
177 -> Double -- ^ Delay to insert before the first repair opcode
178 -> InstanceData -- ^ The instance data
179 -> (AutoRepairType, [OpCode]) -- ^ The repair job to perform
180 -> IO InstanceData -- ^ The updated instance data
181 doRepair opts client delay instData (rtype, opcodes) =
182 let inst = arInstance instData
183 ipol = Instance.arPolicy inst
184 iname = Instance.name inst
185 in
186 case ipol of
187 ArEnabled maxtype ->
188 if rtype > maxtype then do
189 uuid <- newUUID
190 time <- getClockTime
191
192 let arState' = ArNeedsRepair (
193 updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "")
194 instData' = instData { arState = arState'
195 , tagsToRemove = delCurTag instData
196 }
197
198 putStrLn ("Not performing a repair of type " ++ show rtype ++ " on " ++
199 iname ++ " because only repairs up to " ++ show maxtype ++
200 " are allowed")
201 commitChange opts client instData' -- Adds "enoperm" result label.
202 else do
203 now <- currentTimestamp
204 putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname)
205
206 -- After submitting the job, we must write an autorepair:pending tag,
207 -- that includes the repair job IDs so that they can be checked later.
208 -- One problem we run into is that the repair job immediately grabs
209 -- locks for the affected instance, and the subsequent TAGS_SET job is
210 -- blocked, introducing an unnecessary delay for the end-user. One
211 -- alternative would be not to wait for the completion of the TAGS_SET
212 -- job, contrary to what commitChange normally does; but we insist on
213 -- waiting for the tag to be set so as to abort in case of failure,
214 -- because the cluster is left in an invalid state in that case.
215 --
216 -- The proper solution (in 2.9+) would be not to use tags for storing
217 -- autorepair data, or make the TAGS_SET opcode not grab an instance's
218 -- locks (if that's deemed safe). In the meantime, we introduce an
219 -- artificial delay in the repair job (via a TestDelay opcode) so that
220 -- once we have the job ID, the TAGS_SET job can complete before the
221 -- repair job actually grabs the locks. (Please note that this is not
222 -- about synchronization, but merely about speeding up the execution of
223 -- the harep tool. If this TestDelay opcode is removed, the program is
224 -- still correct.)
225 let opcodes' =
226 if delay > 0 then
227 OpTestDelay { opDelayDuration = delay
228 , opDelayOnMaster = True
229 , opDelayOnNodes = []
230 , opDelayOnNodeUuids = Nothing
231 , opDelayRepeat = fromJust $ mkNonNegative 0
232 , opDelayInterruptible = False
233 , opDelayNoLocks = False
234 } : opcodes
235 else
236 opcodes
237
238 uuid <- newUUID
239 time <- getClockTime
240 jids <- submitJobs'
241 opts
242 [map (annotateOpCode (optReason opts) now) opcodes']
243 client
244
245 case jids of
246 Bad e -> exitErr e
247 Ok jids' ->
248 let arState' = ArPendingRepair (
249 updateTag $ AutoRepairData rtype uuid time jids' Nothing "")
250 instData' = instData { arState = arState'
251 , tagsToRemove = delCurTag instData
252 }
253 in
254 commitChange opts client instData' -- Adds "pending" label.
255
256 otherSt -> do
257 putStrLn ("Not repairing " ++ iname ++ " because it's in state " ++
258 show otherSt)
259 return instData
260
261 -- | Main function.
262 main :: Options -> [String] -> IO ()
263 main opts args = do
264 unless (null args) $
265 exitErr "this program doesn't take any arguments."
266
267 luxiDef <- Path.defaultQuerySocket
268 let master = fromMaybe luxiDef $ optLuxi opts
269 opts' = opts { optLuxi = Just master }
270
271 (ClusterData _ nl il _ _) <- loadExternalData opts'
272
273 let iniDataRes = mapM setInitialState $ Container.elems il
274 iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes
275
276 -- First step: check all pending repairs, see if they are completed.
277 iniData' <- bracket (L.getLuxiClient master) L.closeClient $
278 forM iniData . processPending opts
279
280 -- Second step: detect any problems.
281 let repairs = map (detectBroken nl . arInstance) iniData'
282
283 -- Third step: create repair jobs for broken instances that are in ArHealthy.
284 let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r
285 jobDelay = optJobDelay opts
286 repairHealthy c i = case arState i of
287 ArHealthy _ -> doRepair opts c jobDelay i
288 _ -> const (return i)
289
290 repairDone <- bracket (L.getLuxiClient master) L.closeClient $
291 forM (zip iniData' repairs) . maybeRepair
292
293 -- Print some stats and exit.
294 let states = map ((, 1 :: Int) . arStateName . arState) repairDone
295 counts = Map.fromListWith (+) states
296
297 putStrLn "---------------------"
298 putStrLn "Instance status count"
299 putStrLn "---------------------"
300 putStr . unlines . Map.elems $
301 Map.mapWithKey (\k v -> k ++ ": " ++ show v) counts