Coverage for curator/actions/shrink.py: 97%
273 statements
coverage.py v7.2.7, created at 2023-07-20 21:00 -0600
1"""Reindex action class"""
2import logging
3# pylint: disable=broad-except
4from curator.exceptions import ActionError, ConfigurationError
5from curator.helpers.getters import index_size, name_to_node_id, node_id_to_name, node_roles
6from curator.helpers.testers import verify_index_list
7from curator.helpers.utils import chunk_index_list, report_failure
8from curator.helpers.waiters import health_check, wait_for_it
class Shrink:
    """Shrink Action Class"""
    def __init__(
        self, ilo, shrink_node='DETERMINISTIC', node_filters=None, number_of_shards=1,
        number_of_replicas=1, shrink_prefix='', shrink_suffix='-shrink', copy_aliases=False,
        delete_after=True, post_allocation=None, wait_for_active_shards=1,
        wait_for_rebalance=True, extra_settings=None, wait_for_completion=True, wait_interval=9,
        max_wait=-1
    ):
        """
        :param ilo: An IndexList Object
        :param shrink_node: The node name to use as the shrink target, or ``DETERMINISTIC``,
            which will use the values in ``node_filters`` to determine which node will be the
            shrink node.
        :param node_filters: If the value of ``shrink_node`` is ``DETERMINISTIC``, the values in
            ``node_filters`` will be used while determining which node to allocate the shards on
            before performing the shrink.
        :param number_of_shards: The number of shards the shrunk index should have
        :param number_of_replicas: The number of replicas for the shrunk index
        :param shrink_prefix: Prepend this value to the name of the shrunk index
        :param shrink_suffix: Append this value to the name of the shrunk index
            (Default: ``-shrink``)
        :param copy_aliases: Whether to copy each source index's aliases to the target index
            after shrinking. The aliases will be added to the target index and deleted from the
            source index at the same time. (Default: ``False``)
        :param delete_after: Whether to delete each source index after shrinking.
            (Default: ``True``)
        :param post_allocation: If populated, the ``allocation_type``, ``key``, and ``value``
            will be applied to the shrunk index to re-route it.
        :param extra_settings: Permitted root keys are ``settings`` and ``aliases``.
        :param wait_for_active_shards: Wait for this many active shards before returning.
        :param wait_for_rebalance: Wait for rebalance. (Default: ``True``)
        :param wait_for_completion: Wait for completion before returning.
        :param wait_interval: Seconds to wait between completion checks.
        :param max_wait: Maximum number of seconds to ``wait_for_completion``

        :type ilo: :py:class:`~.curator.indexlist.IndexList`
        :type shrink_node: str
        :type node_filters: dict
        :type number_of_shards: int
        :type number_of_replicas: int
        :type shrink_prefix: str
        :type shrink_suffix: str
        :type copy_aliases: bool
        :type delete_after: bool
        :type post_allocation: dict
        :type extra_settings: dict
        :type wait_for_active_shards: int
        :type wait_for_rebalance: bool
        :type wait_for_completion: bool
        :type wait_interval: int
        :type max_wait: int
        """
        if node_filters is None:
            node_filters = {}
        if post_allocation is None:
            post_allocation = {}
        if extra_settings is None:
            extra_settings = {}
        self.loggit = logging.getLogger('curator.actions.shrink')
        verify_index_list(ilo)
        if 'permit_masters' not in node_filters:
            node_filters['permit_masters'] = False
        #: The :py:class:`~.curator.indexlist.IndexList` object passed from param ``ilo``
        self.index_list = ilo
        #: The :py:class:`~.elasticsearch.Elasticsearch` client object derived from
        #: :py:attr:`index_list`
        self.client = ilo.client
        #: Object attribute that gets the value of param ``shrink_node``.
        self.shrink_node = shrink_node
        #: Object attribute that gets the value of param ``node_filters``.
        self.node_filters = node_filters
        #: Object attribute that gets the value of param ``shrink_prefix``.
        self.shrink_prefix = shrink_prefix
        #: Object attribute that gets the value of param ``shrink_suffix``.
        self.shrink_suffix = shrink_suffix
        #: Object attribute that gets the value of param ``copy_aliases``.
        self.copy_aliases = copy_aliases
        #: Object attribute that gets the value of param ``delete_after``.
        self.delete_after = delete_after
        #: Object attribute that gets the value of param ``post_allocation``.
        self.post_allocation = post_allocation
        #: Object attribute that gets the value of param ``wait_for_rebalance``.
        self.wait_for_rebalance = wait_for_rebalance
        #: Object attribute that gets the value of param ``wait_for_completion``.
        self.wfc = wait_for_completion
        #: Object attribute that gets the value of param ``wait_interval``.
        self.wait_interval = wait_interval
        #: Object attribute that gets the value of param ``max_wait``.
        self.max_wait = max_wait
        #: Object attribute that gets the value of param ``number_of_shards``.
        self.number_of_shards = number_of_shards
        #: Object attribute that gets the value of param ``wait_for_active_shards``.
        self.wait_for_active_shards = wait_for_active_shards

        #: Object attribute that represents the target node for shrinking.
        self.shrink_node_name = None
        #: Object attribute that holds the available disk space (in bytes) on
        #: :py:attr:`shrink_node_name`
        self.shrink_node_avail = None
        #: Object attribute that represents the node_id of :py:attr:`shrink_node_name`
        self.shrink_node_id = None

        #: Object attribute that gets values from params ``number_of_shards`` and
        #: ``number_of_replicas``.
        self.settings = {
            'index.number_of_shards': number_of_shards,
            'index.number_of_replicas': number_of_replicas,
        }

        if extra_settings:
            self._merge_extra_settings(extra_settings)

        self._merge_extra_settings({
            'settings': {
                'index.routing.allocation.require._name': None,
                'index.blocks.write': None
            }})
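
    # A minimal usage sketch (assumes an already-built
    # :py:class:`~.curator.indexlist.IndexList` named ``ilo``; the node name
    # "data-node-1" is illustrative, not part of this module):
    #
    #   shrink = Shrink(ilo, shrink_node='data-node-1', number_of_shards=1)
    #   shrink.do_dry_run()  # preview what would happen
    #   shrink.do_action()   # relocate, shrink, and clean up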
    def _merge_extra_settings(self, extra_settings):
        self.loggit.debug('Adding extra_settings to shrink body: %s', extra_settings)
        # Pop these here, otherwise we could overwrite our default number of
        # shards and replicas
        if 'settings' in extra_settings:
            settings = extra_settings.pop('settings')
            try:
                self.settings.update(settings)
            except Exception as exc:
                raise ConfigurationError(
                    f'Unable to apply extra settings "{settings}" '
                    f'to shrink body. Exception: {exc}'
                ) from exc
        if extra_settings:
            try:  # Apply any remaining keys, should there be any.
                self.settings.update(extra_settings)
            except Exception as exc:
                raise ConfigurationError(
                    f'Unable to apply extra settings "{extra_settings}" '
                    f'to shrink body. Exception: {exc}'
                ) from exc
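
    # Illustrative merge behavior (sketch, values invented): given
    #   extra_settings = {'settings': {'index.codec': 'best_compression'}}
    # the 'index.codec' key is folded into self.settings alongside the shard
    # and replica counts, while any remaining root keys (e.g. 'aliases') are
    # applied to self.settings by the second update() above.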
    def _data_node(self, node_id):
        roles = node_roles(self.client, node_id)
        name = node_id_to_name(self.client, node_id)
        if 'data' not in roles:
            self.loggit.info('Skipping node "%s": non-data node', name)
            return False
        if 'master' in roles and not self.node_filters['permit_masters']:
            self.loggit.info('Skipping node "%s": master node', name)
            return False
        elif 'master' in roles and self.node_filters['permit_masters']:
            msg = (
                f'Not skipping node "{name}" which is a master node (not recommended), but '
                f'permit_masters is True'
            )
            self.loggit.warning(msg)
            return True
        else:  # It does have `data` as a role.
            return True
    def _exclude_node(self, name):
        if 'exclude_nodes' in self.node_filters:
            if name in self.node_filters['exclude_nodes']:
                self.loggit.info('Excluding node "%s" due to node_filters', name)
                return True
        return False
    def _shrink_target(self, name):
        return f'{self.shrink_prefix}{name}{self.shrink_suffix}'
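    # With the defaults (empty prefix, '-shrink' suffix), an index named
    # 'logs-2023.07' maps to the target 'logs-2023.07-shrink' (index name
    # illustrative only).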
    def qualify_single_node(self):
        """Qualify a single node as a shrink target"""
        node_id = name_to_node_id(self.client, self.shrink_node)
        if node_id:
            self.shrink_node_id = node_id
            self.shrink_node_name = self.shrink_node
        else:
            raise ConfigurationError(
                f'Unable to find node named: "{self.shrink_node}"')
        if self._exclude_node(self.shrink_node):
            raise ConfigurationError(
                f'Node "{self.shrink_node}" listed for exclusion')
        if not self._data_node(node_id):
            raise ActionError(
                f'Node "{self.shrink_node}" is not usable as a shrink node')
        self.shrink_node_avail = (
            self.client.nodes.stats()['nodes'][node_id]['fs']['total']['available_in_bytes']
        )
    def most_available_node(self):
        """
        Determine which data node has the most available free space and also meets the
        ``node_filters`` settings.
        """
        mvn_avail = 0
        # mvn_total = 0
        mvn_name = None
        mvn_id = None
        nodes = self.client.nodes.stats()['nodes']
        for node_id in nodes:
            name = nodes[node_id]['name']
            if self._exclude_node(name):
                self.loggit.debug('Node "%s" excluded by node filters', name)
                continue
            if not self._data_node(node_id):
                self.loggit.debug('Node "%s" is not a data node', name)
                continue
            value = nodes[node_id]['fs']['total']['available_in_bytes']
            if value > mvn_avail:
                mvn_name = name
                mvn_id = node_id
                mvn_avail = value
        self.shrink_node_name = mvn_name
        self.shrink_node_id = mvn_id
        self.shrink_node_avail = mvn_avail
    def route_index(self, idx, allocation_type, key, value):
        """Apply the indicated shard routing allocation"""
        bkey = f'index.routing.allocation.{allocation_type}.{key}'
        routing = {bkey: value}
        try:
            self.client.indices.put_settings(index=idx, settings=routing)
            if self.wait_for_rebalance:
                wait_for_it(
                    self.client, 'allocation', wait_interval=self.wait_interval,
                    max_wait=self.max_wait
                )
            else:
                wait_for_it(
                    self.client, 'relocate', index=idx, wait_interval=self.wait_interval,
                    max_wait=self.max_wait
                )
        except Exception as err:
            report_failure(err)
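
    # For example (names illustrative), route_index('logs-2023', 'require',
    # '_name', 'data-node-1') puts the setting
    #   {'index.routing.allocation.require._name': 'data-node-1'}
    # on the index, pinning all of its shards to that node.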
    def __log_action(self, error_msg, dry_run=False):
        if not dry_run:
            raise ActionError(error_msg)
        else:
            self.loggit.warning('DRY-RUN: %s', error_msg)
    def _block_writes(self, idx):
        block = {'index.blocks.write': True}
        self.client.indices.put_settings(index=idx, settings=block)

    def _unblock_writes(self, idx):
        unblock = {'index.blocks.write': False}
        self.client.indices.put_settings(index=idx, settings=unblock)
    def _check_space(self, idx, dry_run=False):
        # Disk watermark calculation is already baked into `available_in_bytes`
        size = index_size(self.client, idx, value='primaries')
        padded = (size * 2) + (32 * 1024)
        if padded < self.shrink_node_avail:
            msg = (
                f'Sufficient space available for 2x the size of index "{idx}". '
                f'Required: {padded}, available: {self.shrink_node_avail}'
            )
            self.loggit.debug(msg)
        else:
            error_msg = (
                f'Insufficient space available for 2x the size of index "{idx}", shrinking '
                f'will exceed space available. Required: {padded}, '
                f'available: {self.shrink_node_avail}'
            )
            self.__log_action(error_msg, dry_run)
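
    # Worked example (numbers invented): a 10 GiB index needs
    #   padded = 2 * 10_737_418_240 + 32_768 = 21_474_869_248 bytes
    # of free space on the shrink node before the check above passes.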
    def _check_node(self):
        if self.shrink_node != 'DETERMINISTIC':
            if not self.shrink_node_name:
                self.qualify_single_node()
        else:
            self.most_available_node()
        # At this point, we should have the three shrink-node identifying
        # instance variables:
        # - self.shrink_node_name
        # - self.shrink_node_id
        # - self.shrink_node_avail
        # - self.shrink_node_total (only if needed in the future)
    def _check_target_exists(self, idx, dry_run=False):
        target = self._shrink_target(idx)
        if self.client.indices.exists(index=target):
            error_msg = f'Target index "{target}" already exists'
            self.__log_action(error_msg, dry_run)
    def _check_doc_count(self, idx, dry_run=False):
        max_docs = 2147483519
        doc_count = (
            self.client.indices.stats(index=idx)['indices'][idx]['primaries']['docs']['count']
        )
        if doc_count > (max_docs * self.number_of_shards):
            error_msg = (
                f'Too many documents ({doc_count}) to fit in {self.number_of_shards} '
                f'shard(s). Maximum number of docs per shard is {max_docs}'
            )
            self.__log_action(error_msg, dry_run)
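
    # Illustration: with number_of_shards=1 the ceiling is 2,147,483,519 docs
    # (Lucene's per-shard document limit); with number_of_shards=2 it doubles
    # to 4,294,967,038.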
    def _check_shard_count(self, idx, src_shards, dry_run=False):
        if self.number_of_shards >= src_shards:
            error_msg = (
                f'Target number of shards ({self.number_of_shards}) must be less than '
                f'current number of shards ({src_shards}) in index "{idx}"'
            )
            self.__log_action(error_msg, dry_run)
    def _check_shard_factor(self, idx, src_shards, dry_run=False):
        # Find the list of factors of src_shards
        factors = [x for x in range(1, src_shards + 1) if src_shards % x == 0]
        # Pop the last one, because it will be the value of src_shards
        factors.pop()
        if self.number_of_shards not in factors:
            error_msg = (
                f'"{self.number_of_shards}" is not a valid factor of {src_shards} shards of '
                f'index {idx}. Valid values are {factors}'
            )
            self.__log_action(error_msg, dry_run)
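
    # Example: for src_shards=8 the valid targets are [1, 2, 4]; requesting
    # number_of_shards=3 would trip the error above, since the target shard
    # count must evenly divide the source shard count.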
    def _check_all_shards(self, idx):
        shards = self.client.cluster.state(index=idx)['routing_table']['indices'][idx]['shards']
        found = []
        for shardnum in shards:
            for shard_idx in range(0, len(shards[shardnum])):
                if shards[shardnum][shard_idx]['node'] == self.shrink_node_id:
                    found.append(
                        {'shard': shardnum, 'primary': shards[shardnum][shard_idx]['primary']})
        if len(shards) != len(found):
            self.loggit.debug(
                'Found these shards on node "%s": %s', self.shrink_node_name, found)
            raise ActionError(
                f'Unable to shrink index "{idx}" as not all shards were found on the designated '
                f'shrink node ({self.shrink_node_name}): {found}'
            )
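
    # The routing table consulted above has roughly this shape (sketch, ids
    # invented):
    #   {'0': [{'node': 'abc123', 'primary': True, ...},
    #          {'node': 'def456', 'primary': False, ...}],
    #    '1': [...]}
    # so the check passes only when one copy of every shard number sits on
    # the shrink node.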
    def pre_shrink_check(self, idx, dry_run=False):
        """Do a shrink preflight check"""
        self.loggit.debug('BEGIN PRE_SHRINK_CHECK')
        self.loggit.debug('Check that target index does not already exist')
        self._check_target_exists(idx, dry_run)
        self.loggit.debug('Check doc count constraints')
        self._check_doc_count(idx, dry_run)
        self.loggit.debug('Check shard count')
        src_shards = int(
            self.client.indices.get(index=idx)[idx]['settings']['index']['number_of_shards'])
        self._check_shard_count(idx, src_shards, dry_run)
        self.loggit.debug('Check shard factor')
        self._check_shard_factor(idx, src_shards, dry_run)
        self.loggit.debug('Check node availability')
        self._check_node()
        self.loggit.debug('Check available disk space')
        self._check_space(idx, dry_run)
        self.loggit.debug('FINISH PRE_SHRINK_CHECK')
    def do_copy_aliases(self, source_idx, target_idx):
        """Copy the aliases to the shrunk index"""
        alias_actions = []
        aliases = self.client.indices.get_alias(index=source_idx)
        for alias in aliases[source_idx]['aliases']:
            self.loggit.debug('alias: %s', alias)
            alias_actions.append({'remove': {'index': source_idx, 'alias': alias}})
            alias_actions.append({'add': {'index': target_idx, 'alias': alias}})
        if alias_actions:
            self.loggit.info('Copy alias actions: %s', alias_actions)
            self.client.indices.update_aliases(actions=alias_actions)
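
    # For a source index with a single alias 'current-logs', alias_actions
    # would look like (names illustrative):
    #   [{'remove': {'index': 'logs-2023.07', 'alias': 'current-logs'}},
    #    {'add': {'index': 'logs-2023.07-shrink', 'alias': 'current-logs'}}]
    # Both steps go in one update_aliases call, so the swap is atomic.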
    def do_dry_run(self):
        """Show what a regular run would do, but don't actually do it."""
        self.index_list.filter_closed()
        self.index_list.filter_by_shards(number_of_shards=self.number_of_shards)
        self.index_list.empty_list_check()
        try:
            index_lists = chunk_index_list(self.index_list.indices)
            for lst in index_lists:
                for idx in lst:  # Shrink can only be done one at a time...
                    target = self._shrink_target(idx)
                    self.pre_shrink_check(idx, dry_run=True)
                    self.loggit.info(
                        'DRY-RUN: Moving shards to shrink node: "%s"', self.shrink_node_name)
                    msg = (
                        f'DRY-RUN: Shrinking index "{idx}" to "{target}" with settings: '
                        f'{self.settings}, wait_for_active_shards={self.wait_for_active_shards}'
                    )
                    self.loggit.info(msg)
                    if self.post_allocation:
                        submsg = (
                            f"index.routing.allocation.{self.post_allocation['allocation_type']}."
                            f"{self.post_allocation['key']}:{self.post_allocation['value']}"
                        )
                        msg = (
                            f'DRY-RUN: Applying post-shrink allocation rule "{submsg}" to index '
                            f'"{target}"'
                        )
                        self.loggit.info(msg)
                    if self.copy_aliases:
                        msg = (
                            f'DRY-RUN: Copy source index aliases '
                            f'"{self.client.indices.get_alias(index=idx)}"'
                        )
                        self.loggit.info(msg)
                    if self.delete_after:
                        self.loggit.info('DRY-RUN: Deleting source index "%s"', idx)
        except Exception as err:
            report_failure(err)
    def do_action(self):
        """
        :py:meth:`~.elasticsearch.client.IndicesClient.shrink` the indices in
        :py:attr:`index_list`
        """
        self.index_list.filter_closed()
        self.index_list.filter_by_shards(number_of_shards=self.number_of_shards)
        self.index_list.empty_list_check()
        msg = (
            f'Shrinking {len(self.index_list.indices)} selected indices: '
            f'{self.index_list.indices}'
        )
        self.loggit.info(msg)
        try:
            index_lists = chunk_index_list(self.index_list.indices)
            for lst in index_lists:
                for idx in lst:  # Shrink can only be done one at a time...
                    target = self._shrink_target(idx)
                    self.loggit.info('Source index: %s -- Target index: %s', idx, target)
                    # Pre-check ensures disk space available for each pass of the loop
                    self.pre_shrink_check(idx)
                    # Route the index to the shrink node
                    self.loggit.info('Moving shards to shrink node: "%s"', self.shrink_node_name)
                    self.route_index(idx, 'require', '_name', self.shrink_node_name)
                    # Ensure a copy of each shard is present
                    self._check_all_shards(idx)
                    # Block writes on the index
                    self._block_writes(idx)
                    # Do a final health check
                    if not health_check(self.client, status='green'):
                        raise ActionError(
                            'Unable to proceed with shrink action. '
                            'Cluster health is not "green"')
                    # Do the shrink
                    msg = (
                        f'Shrinking index "{idx}" to "{target}" with settings: {self.settings}, '
                        f'wait_for_active_shards={self.wait_for_active_shards}'
                    )
                    self.loggit.info(msg)
                    try:
                        self.client.indices.shrink(
                            index=idx, target=target, settings=self.settings,
                            wait_for_active_shards=self.wait_for_active_shards
                        )
                        # Wait for it to complete
                        if self.wfc:
                            self.loggit.debug(
                                'Wait for shards to complete allocation for index: %s', target)
                            if self.wait_for_rebalance:
                                wait_for_it(
                                    self.client, 'shrink', wait_interval=self.wait_interval,
                                    max_wait=self.max_wait
                                )
                            else:
                                wait_for_it(
                                    self.client, 'relocate', index=target,
                                    wait_interval=self.wait_interval, max_wait=self.max_wait
                                )
                    except Exception as exc:
                        if self.client.indices.exists(index=target):
                            msg = (
                                f'Deleting target index "{target}" due to failure to complete '
                                f'shrink'
                            )
                            self.loggit.error(msg)
                            self.client.indices.delete(index=target)
                        raise ActionError(
                            f'Unable to shrink index "{idx}" -- Error: {exc}') from exc
                    self.loggit.info('Index "%s" successfully shrunk to "%s"', idx, target)
                    # Do post-shrink steps
                    # Unblock writes on the source index (just in case)
                    self._unblock_writes(idx)
                    ## Post-allocation, if enabled
                    if self.post_allocation:
                        submsg = (
                            f"index.routing.allocation.{self.post_allocation['allocation_type']}."
                            f"{self.post_allocation['key']}:{self.post_allocation['value']}"
                        )
                        msg = (
                            f'Applying post-shrink allocation rule "{submsg}" to index "{target}"'
                        )
                        self.loggit.info(msg)
                        self.route_index(
                            target, self.post_allocation['allocation_type'],
                            self.post_allocation['key'], self.post_allocation['value']
                        )
                    ## Copy aliases, if flagged
                    if self.copy_aliases:
                        self.loggit.info('Copy source index aliases "%s"', idx)
                        self.do_copy_aliases(idx, target)
                    ## Delete source index, if flagged
                    if self.delete_after:
                        self.loggit.info('Deleting source index "%s"', idx)
                        self.client.indices.delete(index=idx)
                    else:  # Otherwise, unset the routing we applied here.
                        self.loggit.info('Unassigning routing for source index: "%s"', idx)
                        self.route_index(idx, 'require', '_name', '')
        except Exception as err:
            # Just in case it fails after attempting to meet this condition
            self._unblock_writes(idx)
            report_failure(err)
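
# A fuller end-to-end sketch (assumptions: a reachable cluster at
# http://localhost:9200, and that IndexList is importable as shown; filter
# kinds and index patterns are illustrative only):
#
#   from elasticsearch import Elasticsearch
#   from curator.indexlist import IndexList
#
#   client = Elasticsearch('http://localhost:9200')
#   ilo = IndexList(client)
#   ilo.filter_by_regex(kind='prefix', value='logs-')
#   Shrink(ilo, shrink_node='DETERMINISTIC', number_of_shards=1).do_action()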