Coverage for curator/helpers/waiters.py: 100% (146 statements)
« prev ^ index » next — coverage.py v7.2.7, created at 2023-07-20 21:00 -0600
1"""The function that waits
3...and its helpers
4"""
5import logging
6from time import localtime, sleep, strftime
7from datetime import datetime
8from curator.exceptions import (
9 ActionTimeout, ConfigurationError, CuratorException, FailedReindex, MissingArgument)
10from curator.helpers.utils import chunk_index_list
def health_check(client, **kwargs):
    """
    This function calls `client.cluster.` :py:meth:`~.elasticsearch.client.ClusterClient.health`
    and, based on the params provided, will return ``True`` or ``False`` depending on whether that
    particular keyword appears in the output, and has the expected value.

    If multiple keys are provided, all must match for a ``True`` response.

    :param client: A client connection object

    :type client: :py:class:`~.elasticsearch.Elasticsearch`

    :raises MissingArgument: if no keyword arguments are provided
    :raises ConfigurationError: if a provided key is not present in the health output

    :rtype: bool
    """
    logger = logging.getLogger(__name__)
    logger.debug('KWARGS= "%s"', kwargs)
    klist = list(kwargs.keys())
    if not klist:
        raise MissingArgument('Must provide at least one keyword argument')
    hc_data = client.cluster.health()
    response = True

    for k in klist:
        # First, verify that all kwargs are in the list
        if not k in list(hc_data.keys()):
            # Bug fix: the original message contained a literal, unformatted "{0}"
            # placeholder, so the offending key was never reported.
            raise ConfigurationError(f'Key "{k}" not in cluster health output')
        if not hc_data[k] == kwargs[k]:
            # Bug fix: the original logged the expected value where the key name
            # belonged; log the key and both values so mismatches are diagnosable.
            msg = f'NO MATCH: Value for key "{k}": expected {kwargs[k]}, health check data: {hc_data[k]}'
            logger.debug(msg)
            response = False
        else:
            msg = f'MATCH: Value for key "{k}": expected {kwargs[k]}, health check data: {hc_data[k]}'
            logger.debug(msg)
    if response:
        logger.info('Health Check for all provided keys passed.')
    return response
def relocate_check(client, index):
    """
    This function calls `client.cluster.` :py:meth:`~.elasticsearch.client.ClusterClient.state`
    with a given index to check if all of the shards for that index are in the ``STARTED`` state.
    It will return ``True`` if all primary and replica shards are in the ``STARTED`` state, and it
    will return ``False`` if any shard is in a different state.

    :param client: A client connection object
    :param index: The index name

    :type client: :py:class:`~.elasticsearch.Elasticsearch`
    :type index: str

    :rtype: bool
    """
    logger = logging.getLogger(__name__)
    routing = client.cluster.state(index=index)['routing_table']['indices'][index]['shards']
    # Walk every shard group (primary + replicas); any shard not yet STARTED
    # means relocation is still in progress.
    all_started = True
    for shard_group in routing.values():
        for shard_info in shard_group:
            if shard_info['state'] != "STARTED":
                all_started = False
    if all_started:
        logger.info('Relocate Check for index: "%s" has passed.', index)
    return all_started
def restore_check(client, index_list):
    """
    This function calls `client.indices.` :py:meth:`~.elasticsearch.client.IndicesClient.recovery`
    with the list of indices to check for complete recovery. It will return ``True`` if recovery
    of those indices is complete, and ``False`` otherwise. It is designed to fail fast: if a
    single shard is encountered that is still recovering (not in ``DONE`` stage), it will
    immediately return ``False``, rather than complete iterating over the rest of the response.

    :param client: A client connection object
    :param index_list: The list of indices to verify having been restored.

    :type client: :py:class:`~.elasticsearch.Elasticsearch`
    :type index_list: list

    :rtype: bool
    """
    logger = logging.getLogger(__name__)
    recovery_data = {}
    # Query recovery status in chunks so very large index lists don't exceed
    # the maximum URL length.
    for batch in chunk_index_list(index_list):
        try:
            batch_response = client.indices.recovery(index=batch, human=True)
        except Exception as err:
            msg = f'Unable to obtain recovery information for specified indices. Error: {err}'
            raise CuratorException(msg) from err
        if batch_response == {}:
            logger.info('_recovery returned an empty response. Trying again.')
            return False
        recovery_data.update(batch_response)
    logger.info('Provided indices: %s', index_list)
    logger.info('Found indices: %s', list(recovery_data.keys()))
    for idx, idx_data in recovery_data.items():
        for shard_info in idx_data['shards']:
            stage = shard_info['stage']
            if stage != 'DONE':
                # Fail fast on the first still-recovering shard.
                logger.info('Index "%s" is still in stage "%s"', idx, stage)
                return False

    # If we've gotten here, all of the indices have recovered
    return True
def snapshot_check(client, snapshot=None, repository=None):
    """
    This function calls `client.snapshot.` :py:meth:`~.elasticsearch.client.SnapshotClient.get` and
    tests to see whether the snapshot is complete, and if so, with what status. It will log errors
    according to the result. If the snapshot is still ``IN_PROGRESS``, it will return ``False``.
    ``SUCCESS`` will be an ``INFO`` level message, ``PARTIAL`` nets a ``WARNING`` message,
    ``FAILED`` is an ``ERROR``, message, and all others will be a ``WARNING`` level message.

    :param client: A client connection object
    :param snapshot: The snapshot name
    :param repository: The repository name

    :type client: :py:class:`~.elasticsearch.Elasticsearch`
    :type snapshot: str
    :type repository: str

    :rtype: bool
    """
    logger = logging.getLogger(__name__)
    logger.debug('SNAPSHOT: %s', snapshot)
    logger.debug('REPOSITORY: %s', repository)
    try:
        result = client.snapshot.get(repository=repository, snapshot=snapshot)
        logger.debug('RESULT: %s', result)
    except Exception as err:
        raise CuratorException(
            f'Unable to obtain information for snapshot "{snapshot}" in repository '
            f'"{repository}". Error: {err}'
        ) from err
    state = result['snapshots'][0]['state']
    logger.debug('Snapshot state = %s', state)
    # Only IN_PROGRESS means "keep waiting"; every terminal state returns True,
    # logged at a severity matching how good/bad the outcome was.
    if state == 'IN_PROGRESS':
        logger.info('Snapshot %s still in progress.', snapshot)
        return False
    if state == 'SUCCESS':
        logger.info('Snapshot %s successfully completed.', snapshot)
    elif state == 'PARTIAL':
        logger.warning('Snapshot %s completed with state PARTIAL.', snapshot)
    elif state == 'FAILED':
        logger.error('Snapshot %s completed with state FAILED.', snapshot)
    else:
        logger.warning('Snapshot %s completed with state: %s', snapshot, state)
    return True
def task_check(client, task_id=None):
    """
    This function calls `client.tasks.` :py:meth:`~.elasticsearch.client.TasksClient.get` with the
    provided ``task_id``. If the task data contains ``'completed': True``, then it will return
    ``True``. If the task is not completed, it will log some information about the task and return
    ``False``

    :param client: A client connection object
    :param task_id: The task id

    :type client: :py:class:`~.elasticsearch.Elasticsearch`
    :type task_id: str

    :raises CuratorException: if the task information cannot be retrieved
    :raises FailedReindex: if a reindex task response reports failures

    :rtype: bool
    """
    logger = logging.getLogger(__name__)
    try:
        task_data = client.tasks.get(task_id=task_id)
    except Exception as err:
        msg = f'Unable to obtain task information for task_id "{task_id}". Exception {err}'
        raise CuratorException(msg) from err
    task_info = task_data['task']
    is_done = task_data['completed']
    # Reindex tasks can "complete" with per-document failures; surface those
    # as an exception rather than reporting success.
    if task_info['action'] == 'indices:data/write/reindex':
        logger.debug('It\'s a REINDEX TASK')
        logger.debug('TASK_DATA: %s', task_data)
        logger.debug('TASK_DATA keys: %s', list(task_data.keys()))
        if 'response' in task_data:
            reindex_response = task_data['response']
            if reindex_response['failures']:
                msg = f'Failures found in reindex response: {reindex_response["failures"]}'
                raise FailedReindex(msg)
    seconds_running = 0.000000001 * task_info['running_time_in_nanos']
    logger.debug('Running time: %s seconds', seconds_running)
    description = task_info['description']

    if not is_done:
        # Log the task status here.
        logger.debug('Full Task Data: %s', task_data)
        logger.info(
            f'Task "{description}" with task_id "{task_id}" has been running for '
            f'{seconds_running} seconds'
        )
        return False
    finished_millis = (seconds_running * 1000) + task_info['start_time_in_millis']
    stamp = strftime('%Y-%m-%dT%H:%M:%SZ', localtime(finished_millis/1000))
    logger.info('Task "%s" completed at %s.', description, stamp)
    return True
# pylint: disable=too-many-locals, too-many-arguments
def wait_for_it(
    client, action, task_id=None, snapshot=None, repository=None, index=None, index_list=None,
    wait_interval=9, max_wait=-1
    ):
    """
    This function becomes one place to do all ``wait_for_completion`` type behaviors

    :param client: A client connection object
    :param action: The action name that will identify how to wait
    :param task_id: If the action provided a task_id, this is where it must be declared.
    :param snapshot: The name of the snapshot.
    :param repository: The Elasticsearch snapshot repository to use
    :param index: The index name (required by the ``relocate`` action)
    :param index_list: The list of indices to check (required by the ``restore`` action)
    :param wait_interval: Seconds to wait between completion checks.
    :param max_wait: Maximum number of seconds to ``wait_for_completion``. A value of ``-1``
        means wait indefinitely.

    :type client: :py:class:`~.elasticsearch.Elasticsearch`
    :type action: str
    :type task_id: str
    :type snapshot: str
    :type repository: str
    :type index: str
    :type index_list: list
    :type wait_interval: int
    :type max_wait: int

    :raises ConfigurationError: if ``action`` is not a recognized wait action
    :raises MissingArgument: if the arguments required by ``action`` are absent
    :raises CuratorException: if ``task_id`` cannot be found
    :raises ActionTimeout: if the action does not complete within ``max_wait`` seconds

    :rtype: None
    """
    logger = logging.getLogger(__name__)
    # Each action maps to the check function that decides completion, plus the
    # keyword arguments that function needs.
    action_map = {
        'allocation':{'function': health_check, 'args': {'relocating_shards':0}},
        'replicas':{'function': health_check, 'args': {'status':'green'}},
        'cluster_routing':{'function': health_check, 'args': {'relocating_shards':0}},
        'snapshot':{
            'function':snapshot_check, 'args':{'snapshot':snapshot, 'repository':repository}},
        'restore':{'function':restore_check, 'args':{'index_list':index_list}},
        'reindex':{'function':task_check, 'args':{'task_id':task_id}},
        'shrink':{'function': health_check, 'args': {'status':'green'}},
        'relocate':{'function': relocate_check, 'args': {'index':index}},
    }
    wait_actions = list(action_map.keys())

    # Validate up front that the chosen action has every argument it requires,
    # so we fail fast instead of erroring mid-wait.
    if action not in wait_actions:
        raise ConfigurationError(f'"action" must be one of {wait_actions}')
    if action == 'reindex' and task_id is None:
        raise MissingArgument(f'A task_id must accompany "action" {action}')
    if action == 'snapshot' and ((snapshot is None) or (repository is None)):
        raise MissingArgument(
            f'A snapshot and repository must accompany "action" {action}. snapshot: '
            f'{snapshot}, repository: {repository}'
        )
    if action == 'restore' and index_list is None:
        raise MissingArgument(f'An index_list must accompany "action" {action}')
    if action == 'reindex':
        try:
            # Probe once so a bad task_id surfaces immediately.
            _ = client.tasks.get(task_id=task_id)
        except Exception as err:
            # This exception should only exist in API usage. It should never
            # occur in regular Curator usage.
            raise CuratorException(f'Unable to find task_id {task_id}. Exception: {err}') from err

    # Now with this mapped, we can perform the wait as indicated.
    start_time = datetime.now()
    result = False
    while True:
        elapsed = int((datetime.now() - start_time).total_seconds())
        logger.debug('Elapsed time: %s seconds', elapsed)
        response = action_map[action]['function'](client, **action_map[action]['args'])
        logger.debug('Response: %s', response)
        # Success
        if response:
            logger.debug(
                'Action "%s" finished executing (may or may not have been successful)', action)
            result = True
            break
        # Not success, and reached maximum wait (if defined)
        if (max_wait != -1) and (elapsed >= max_wait):
            msg = f'Unable to complete action "{action}" within max_wait ({max_wait}) seconds.'
            logger.error(msg)
            break
        # Not success, so we wait.
        msg = (
            f'Action "{action}" not yet complete, {elapsed} total seconds elapsed. '
            f'Waiting {wait_interval} seconds before checking again.'
        )
        logger.debug(msg)
        sleep(wait_interval)

    logger.debug('Result: %s', result)
    if not result:
        raise ActionTimeout(
            f'Action "{action}" failed to complete in the max_wait period of {max_wait} seconds'
        )