Coverage for curator/helpers/waiters.py: 100%

146 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-20 21:00 -0600

1"""The function that waits 

2 

3...and its helpers 

4""" 

5import logging 

6from time import localtime, sleep, strftime 

7from datetime import datetime 

8from curator.exceptions import ( 

9 ActionTimeout, ConfigurationError, CuratorException, FailedReindex, MissingArgument) 

10from curator.helpers.utils import chunk_index_list 

11 

def health_check(client, **kwargs):
    """
    This function calls `client.cluster.` :py:meth:`~.elasticsearch.client.ClusterClient.health`
    and, based on the params provided, will return ``True`` or ``False`` depending on whether that
    particular keyword appears in the output, and has the expected value.

    If multiple keys are provided, all must match for a ``True`` response.

    :param client: A client connection object

    :type client: :py:class:`~.elasticsearch.Elasticsearch`

    :rtype: bool
    """
    logger = logging.getLogger(__name__)
    logger.debug('KWARGS= "%s"', kwargs)
    klist = list(kwargs.keys())
    if not klist:
        raise MissingArgument('Must provide at least one keyword argument')
    hc_data = client.cluster.health()
    response = True

    for k in klist:
        # First, verify that all kwargs are in the list
        if k not in hc_data:
            # Bug fix: the original message was a bare '{0}' placeholder that
            # was never formatted, so the offending key was never reported.
            raise ConfigurationError(f'Key "{k}" not in cluster health output')
        if not hc_data[k] == kwargs[k]:
            # Bug fix: the original message printed the expected *value* where
            # the key name belonged; report key, expected, and actual clearly.
            msg = f'NO MATCH: Key "{k}": expected "{kwargs[k]}", health check data: {hc_data[k]}'
            logger.debug(msg)
            response = False
        else:
            msg = f'MATCH: Key "{k}": expected "{kwargs[k]}", health check data: {hc_data[k]}'
            logger.debug(msg)
    if response:
        logger.info('Health Check for all provided keys passed.')
    return response

48 

def relocate_check(client, index):
    """
    Check whether every shard of ``index`` has reached the ``STARTED`` state.

    Calls `client.cluster.` :py:meth:`~.elasticsearch.client.ClusterClient.state`
    for the given index and returns ``True`` only when all primary and replica
    shards report ``STARTED``; any shard in a different state yields ``False``.

    :param client: A client connection object
    :param index: The index name

    :type client: :py:class:`~.elasticsearch.Elasticsearch`
    :type index: str

    :rtype: bool
    """
    logger = logging.getLogger(__name__)
    routing = client.cluster.state(index=index)['routing_table']['indices']
    shard_groups = routing[index]['shards']
    done = True
    # Walk every shard copy (primaries and replicas) of every shard group.
    for group in shard_groups.values():
        for shard in group:
            if shard['state'] != "STARTED":
                done = False
    if done:
        logger.info('Relocate Check for index: "%s" has passed.', index)
    return done

79 

def restore_check(client, index_list):
    """
    This function calls `client.indices.` :py:meth:`~.elasticsearch.client.IndicesClient.recovery`
    with the list of indices to check for complete recovery. It will return ``True`` if recovery
    of those indices is complete, and ``False`` otherwise. It is designed to fail fast: if a
    single shard is encountered that is still recovering (not in ``DONE`` stage), it will
    immediately return ``False``, rather than complete iterating over the rest of the response.

    :param client: A client connection object
    :param index_list: The list of indices to verify having been restored.

    :type client: :py:class:`~.elasticsearch.Elasticsearch`
    :type index_list: list

    :rtype: bool
    """
    logger = logging.getLogger(__name__)
    response = {}
    # Query recovery data in chunks so a very long index list does not
    # produce an oversized request.
    for chunk in chunk_index_list(index_list):
        try:
            chunk_response = client.indices.recovery(index=chunk, human=True)
        except Exception as err:
            msg = f'Unable to obtain recovery information for specified indices. Error: {err}'
            raise CuratorException(msg) from err
        if chunk_response == {}:
            # An empty response means recovery has not begun reporting yet.
            logger.info('_recovery returned an empty response. Trying again.')
            return False
        response.update(chunk_response)
    logger.info('Provided indices: %s', index_list)
    logger.info('Found indices: %s', list(response.keys()))
    # Iterate shards directly (idiomatic replacement for range(len(...)) and
    # the key-only loop that needed a pylint suppression).
    for index, data in response.items():
        for shard in data['shards']:
            stage = shard['stage']
            if stage != 'DONE':
                logger.info('Index "%s" is still in stage "%s"', index, stage)
                return False

    # If we've gotten here, all of the indices have recovered
    return True

120 

def snapshot_check(client, snapshot=None, repository=None):
    """
    Poll snapshot status via `client.snapshot.` :py:meth:`~.elasticsearch.client.SnapshotClient.get`.

    Returns ``False`` while the snapshot is still ``IN_PROGRESS`` and ``True``
    once it has reached any terminal state. The outcome is logged at a level
    matching its severity: ``SUCCESS`` at ``INFO``, ``PARTIAL`` at ``WARNING``,
    ``FAILED`` at ``ERROR``, and anything else at ``WARNING``.

    :param client: A client connection object
    :param snapshot: The snapshot name
    :param repository: The repository name

    :type client: :py:class:`~.elasticsearch.Elasticsearch`
    :type snapshot: str
    :type repository: str

    :rtype: bool
    """
    logger = logging.getLogger(__name__)
    logger.debug('SNAPSHOT: %s', snapshot)
    logger.debug('REPOSITORY: %s', repository)
    try:
        result = client.snapshot.get(repository=repository, snapshot=snapshot)
        logger.debug('RESULT: %s', result)
    except Exception as err:
        raise CuratorException(
            f'Unable to obtain information for snapshot "{snapshot}" in repository '
            f'"{repository}". Error: {err}'
        ) from err
    state = result['snapshots'][0]['state']
    logger.debug('Snapshot state = %s', state)
    if state == 'IN_PROGRESS':
        # Not yet a terminal state: the caller should keep waiting.
        logger.info('Snapshot %s still in progress.', snapshot)
        return False
    if state == 'SUCCESS':
        logger.info('Snapshot %s successfully completed.', snapshot)
    elif state == 'PARTIAL':
        logger.warning('Snapshot %s completed with state PARTIAL.', snapshot)
    elif state == 'FAILED':
        logger.error('Snapshot %s completed with state FAILED.', snapshot)
    else:
        logger.warning('Snapshot %s completed with state: %s', snapshot, state)
    return True

165 

def task_check(client, task_id=None):
    """
    Look up ``task_id`` via `client.tasks.` :py:meth:`~.elasticsearch.client.TasksClient.get`
    and report whether that task has finished.

    Returns ``True`` when the task data contains ``'completed': True``.
    Otherwise it logs how long the task has been running and returns ``False``.
    A reindex task whose response carries failures raises
    :py:exc:`~.curator.exceptions.FailedReindex`.

    :param client: A client connection object
    :param task_id: The task id

    :type client: :py:class:`~.elasticsearch.Elasticsearch`
    :type task_id: str

    :rtype: bool
    """
    logger = logging.getLogger(__name__)
    try:
        task_data = client.tasks.get(task_id=task_id)
    except Exception as err:
        msg = f'Unable to obtain task information for task_id "{task_id}". Exception {err}'
        raise CuratorException(msg) from err
    task = task_data['task']
    completed = task_data['completed']
    if task['action'] == 'indices:data/write/reindex':
        logger.debug('It\'s a REINDEX TASK')
        logger.debug('TASK_DATA: %s', task_data)
        logger.debug('TASK_DATA keys: %s', list(task_data.keys()))
        if 'response' in task_data:
            response = task_data['response']
            if response['failures']:
                msg = f'Failures found in reindex response: {response["failures"]}'
                raise FailedReindex(msg)
    # running_time_in_nanos -> seconds
    running_time = 0.000000001 * task['running_time_in_nanos']
    logger.debug('Running time: %s seconds', running_time)
    descr = task['description']

    if not completed:
        # Still running: log the full task status and the elapsed time.
        logger.debug('Full Task Data: %s', task_data)
        msg = (
            f'Task "{descr}" with task_id "{task_id}" has been running for {running_time} seconds'
        )
        logger.info(msg)
        return False
    # Completed: derive the wall-clock completion timestamp from start + runtime.
    completion_time = (running_time * 1000) + task['start_time_in_millis']
    time_string = strftime('%Y-%m-%dT%H:%M:%SZ', localtime(completion_time/1000))
    logger.info('Task "%s" completed at %s.', descr, time_string)
    return True

216 

# pylint: disable=too-many-locals, too-many-arguments
def wait_for_it(
        client, action, task_id=None, snapshot=None, repository=None, index=None, index_list=None,
        wait_interval=9, max_wait=-1
    ):
    """
    This function becomes one place to do all ``wait_for_completion`` type behaviors

    :param client: A client connection object
    :param action: The action name that will identify how to wait
    :param task_id: If the action provided a task_id, this is where it must be declared.
    :param snapshot: The name of the snapshot.
    :param repository: The Elasticsearch snapshot repository to use
    :param index: The index name (required by the ``relocate`` action)
    :param index_list: The list of indices to verify having been restored (``restore`` action)
    :param wait_interval: Seconds to wait between completion checks.
    :param max_wait: Maximum number of seconds to ``wait_for_completion``

    :type client: :py:class:`~.elasticsearch.Elasticsearch`
    :type action: str
    :type task_id: str
    :type snapshot: str
    :type repository: str
    :type index: str
    :type index_list: list
    :type wait_interval: int
    :type max_wait: int
    :rtype: None
    """
    logger = logging.getLogger(__name__)
    # Map each action name to the check function that decides completion and
    # the keyword arguments that function needs.
    action_map = {
        'allocation':{'function': health_check, 'args': {'relocating_shards':0}},
        'replicas':{'function': health_check, 'args': {'status':'green'}},
        'cluster_routing':{'function': health_check, 'args': {'relocating_shards':0}},
        'snapshot':{
            'function':snapshot_check, 'args':{'snapshot':snapshot, 'repository':repository}},
        'restore':{'function':restore_check, 'args':{'index_list':index_list}},
        'reindex':{'function':task_check, 'args':{'task_id':task_id}},
        'shrink':{'function': health_check, 'args': {'status':'green'}},
        'relocate':{'function': relocate_check, 'args': {'index':index}},
    }
    wait_actions = list(action_map.keys())

    if action not in wait_actions:
        raise ConfigurationError(f'"action" must be one of {wait_actions}')
    if action == 'reindex' and task_id is None:
        raise MissingArgument(f'A task_id must accompany "action" {action}')
    if action == 'snapshot' and ((snapshot is None) or (repository is None)):
        raise MissingArgument(
            f'A snapshot and repository must accompany "action" {action}. snapshot: '
            f'{snapshot}, repository: {repository}'
        )
    if action == 'restore' and index_list is None:
        raise MissingArgument(f'An index_list must accompany "action" {action}')
    if action == 'relocate' and index is None:
        # Consistency fix: every other argument-dependent action validates its
        # required arguments up front; without this check a missing index only
        # surfaced later as an opaque KeyError inside relocate_check.
        raise MissingArgument(f'An index must accompany "action" {action}')
    if action == 'reindex':
        try:
            _ = client.tasks.get(task_id=task_id)
        except Exception as err:
            # This exception should only exist in API usage. It should never
            # occur in regular Curator usage.
            raise CuratorException(f'Unable to find task_id {task_id}. Exception: {err}') from err

    # Now with this mapped, we can perform the wait as indicated.
    start_time = datetime.now()
    result = False
    while True:
        elapsed = int((datetime.now() - start_time).total_seconds())
        logger.debug('Elapsed time: %s seconds', elapsed)
        response = action_map[action]['function'](client, **action_map[action]['args'])
        logger.debug('Response: %s', response)
        # Success
        if response:
            logger.debug(
                'Action "%s" finished executing (may or may not have been successful)', action)
            result = True
            break
        # Not success, and reached maximum wait (if defined)
        if (max_wait != -1) and (elapsed >= max_wait):
            msg = f'Unable to complete action "{action}" within max_wait ({max_wait}) seconds.'
            logger.error(msg)
            break
        # Not success, so we wait.
        msg = (
            f'Action "{action}" not yet complete, {elapsed} total seconds elapsed. '
            f'Waiting {wait_interval} seconds before checking again.'
        )
        logger.debug(msg)
        sleep(wait_interval)

    logger.debug('Result: %s', result)
    if not result:
        raise ActionTimeout(
            f'Action "{action}" failed to complete in the max_wait period of {max_wait} seconds'
        )