Coverage for curator/actions/reindex.py: 95%
167 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-20 21:00 -0600
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-20 21:00 -0600
1"""Reindex action class"""
2import logging
3from copy import deepcopy
4# pylint: disable=broad-except
5from es_client.builder import ClientArgs, OtherArgs, Builder
6from es_client.helpers.utils import ensure_list, verify_url_schema
7from es_client.exceptions import ConfigurationError
8from curator.exceptions import CuratorException, FailedExecution, NoIndices
9from curator.exceptions import ConfigurationError as CuratorConfigError # Separate from es_client
10from curator.helpers.testers import verify_index_list
11from curator.helpers.utils import report_failure
12from curator.helpers.waiters import wait_for_it
13from curator import IndexList
class Reindex:
    """Reindex Action Class"""
    def __init__(
        self, ilo, request_body, refresh=True, requests_per_second=-1, slices=1, timeout=60,
        wait_for_active_shards=1, wait_for_completion=True, max_wait=-1, wait_interval=9,
        remote_certificate=None, remote_client_cert=None, remote_client_key=None,
        remote_filters=None, migration_prefix='', migration_suffix=''
    ):
        """
        :param ilo: An IndexList Object
        :param request_body: The body to send to :py:meth:`~.elasticsearch.Elasticsearch.reindex`,
            which must be complete and usable, as Curator will do no vetting of the request_body.
            If it fails to function, Curator will return an exception.
        :param refresh: Whether to refresh the entire target index after the operation is complete.
        :param requests_per_second: The throttle to set on this request in sub-requests per second.
            ``-1`` means set no throttle as does ``unlimited`` which is the only non-float this
            accepts.
        :param slices: The number of slices this task should be divided into. ``1`` means the task
            will not be sliced into subtasks. (Default: ``1``)
        :param timeout: The length in seconds each individual bulk request should wait for shards
            that are unavailable. (default: ``60``)
        :param wait_for_active_shards: Sets the number of shard copies that must be active before
            proceeding with the reindex operation. (Default: ``1``) means the primary shard only.
            Set to ``all`` for all shard copies, otherwise set to any non-negative value less than
            or equal to the total number of copies for the shard (number of replicas + 1)
        :param wait_for_completion: Wait for completion before returning.
        :param wait_interval: Seconds to wait between completion checks.
        :param max_wait: Maximum number of seconds to ``wait_for_completion``
        :param remote_certificate: Path to SSL/TLS certificate
        :param remote_client_cert: Path to SSL/TLS client certificate (public key)
        :param remote_client_key: Path to SSL/TLS private key
        :param remote_filters: Filters to select indices on the remote side when the source index
            is ``REINDEX_SELECTION``. (Default: ``{}``)
        :param migration_prefix: When migrating, prepend this value to the index name.
        :param migration_suffix: When migrating, append this value to the index name.

        :type ilo: :py:class:`~.curator.indexlist.IndexList`
        :type request_body: dict
        :type refresh: bool
        :type requests_per_second: int
        :type slices: int
        :type timeout: int
        :type wait_for_active_shards: int
        :type wait_for_completion: bool
        :type wait_interval: int
        :type max_wait: int
        :type remote_certificate: str
        :type remote_client_cert: str
        :type remote_client_key: str
        :type migration_prefix: str
        :type migration_suffix: str
        """
        # Mutable default guard: a fresh dict per call, never a shared default.
        if remote_filters is None:
            remote_filters = {}
        self.loggit = logging.getLogger('curator.actions.reindex')
        verify_index_list(ilo)
        if not isinstance(request_body, dict):
            raise CuratorConfigError('"request_body" is not of type dictionary')
        #: Object attribute that gets the value of param ``request_body``.
        # NOTE(review): stored by reference, and the selection-token substitution below
        # mutates it — the caller's dict is modified. Confirm callers do not reuse it.
        self.body = request_body
        self.loggit.debug('REQUEST_BODY = %s', request_body)
        #: The :py:class:`~.curator.indexlist.IndexList` object passed from param ``ilo``
        self.index_list = ilo
        #: The :py:class:`~.elasticsearch.Elasticsearch` client object derived from
        #: :py:attr:`index_list`
        self.client = ilo.client
        #: Object attribute that gets the value of param ``refresh``.
        self.refresh = refresh
        #: Object attribute that gets the value of param ``requests_per_second``.
        self.requests_per_second = requests_per_second
        #: Object attribute that gets the value of param ``slices``.
        self.slices = slices
        #: Object attribute that gets the value of param ``timeout``, convert to :py:class:`str`
        #: and add ``s`` for seconds.
        self.timeout = f'{timeout}s'
        #: Object attribute that gets the value of param ``wait_for_active_shards``.
        self.wait_for_active_shards = wait_for_active_shards
        #: Object attribute that gets the value of param ``wait_for_completion``.
        self.wfc = wait_for_completion
        #: Object attribute that gets the value of param ``wait_interval``.
        self.wait_interval = wait_interval
        #: Object attribute that gets the value of param ``max_wait``.
        self.max_wait = max_wait
        #: Object attribute that gets the value of param ``migration_prefix``.
        self.mpfx = migration_prefix
        #: Object attribute that gets the value of param ``migration_suffix``.
        self.msfx = migration_suffix

        #: Object attribute that is set ``False`` unless :py:attr:`body` has
        #: ``{'source': {'remote': {}}}``, then it is set ``True``
        self.remote = False
        if 'remote' in self.body['source']:
            self.remote = True

        #: Object attribute that is set ``False`` unless :py:attr:`body` has
        #: ``{'dest': {'index': 'MIGRATION'}}``, then it is set ``True``
        self.migration = False
        if self.body['dest']['index'] == 'MIGRATION':
            self.migration = True

        if self.migration:
            if not self.remote and not self.mpfx and not self.msfx:
                raise CuratorConfigError(
                    'MIGRATION can only be used locally with one or both of '
                    'migration_prefix or migration_suffix.'
                )

        # REINDEX_SELECTION is the designated token. If you use this for the
        # source "index," it will be replaced with the list of indices from the
        # provided 'ilo' (index list object).
        if self.body['source']['index'] == 'REINDEX_SELECTION' \
                and not self.remote:
            self.body['source']['index'] = self.index_list.indices

        # Remote section
        elif self.remote:
            rclient_args = ClientArgs()
            rother_args = OtherArgs()
            self.loggit.debug('Remote reindex request detected')
            if 'host' not in self.body['source']['remote']:
                raise CuratorConfigError('Missing remote "host"')
            try:
                rclient_args.hosts = verify_url_schema(self.body['source']['remote']['host'])
            except ConfigurationError as exc:
                # Re-raise as Curator's own ConfigurationError for a uniform caller contract
                raise CuratorConfigError(exc) from exc

            # Now that the URL schema is verified, these will pass.
            # e.g. 'https://host:9200' -> split(':')[-2] == '//host' -> split('/')[2] == 'host'
            self.remote_host = rclient_args.hosts.split(':')[-2]
            self.remote_host = self.remote_host.split('/')[2]
            self.remote_port = rclient_args.hosts.split(':')[-1]

            if 'username' in self.body['source']['remote']:
                rother_args.username = self.body['source']['remote']['username']
            if 'password' in self.body['source']['remote']:
                rother_args.password = self.body['source']['remote']['password']
            if remote_certificate:
                rclient_args.ca_certs = remote_certificate
            if remote_client_cert:
                rclient_args.client_cert = remote_client_cert
            if remote_client_key:
                rclient_args.client_key = remote_client_key

            # Let's set a decent remote timeout for initially reading
            # the indices on the other side, and collecting their metadata
            rclient_args.remote_timeout = 180

            # The rest only applies if using filters for remote indices
            if self.body['source']['index'] == 'REINDEX_SELECTION':
                self.loggit.debug('Filtering indices from remote')
                msg = (
                    f'Remote client args: '
                    f'hosts={rclient_args.hosts} '
                    f'username=REDACTED '
                    f'password=REDACTED '
                    f'certificate={remote_certificate} '
                    f'client_cert={remote_client_cert} '
                    f'client_key={remote_client_key} '
                    f'request_timeout={rclient_args.remote_timeout} '
                    f'skip_version_test=True'
                )
                self.loggit.debug(msg)
                remote_config = {
                    'elasticsearch': {
                        'client': rclient_args.asdict(),
                        'other_settings': rother_args.asdict()
                    }
                }
                try:  # let's try to build a remote connection with these!
                    builder = Builder(configdict=remote_config, version_min=(1, 0, 0))
                    builder.connect()
                    rclient = builder.client
                except Exception as err:
                    self.loggit.error(
                        'Unable to establish connection to remote Elasticsearch'
                        ' with provided credentials/certificates/settings.'
                    )
                    report_failure(err)
                try:
                    rio = IndexList(rclient)
                    rio.iterate_filters({'filters': remote_filters})
                    try:
                        rio.empty_list_check()
                    except NoIndices as exc:
                        raise FailedExecution(
                            'No actionable remote indices selected after applying filters.'
                        ) from exc
                    self.body['source']['index'] = rio.indices
                except Exception as err:
                    self.loggit.error('Unable to get/filter list of remote indices.')
                    report_failure(err)

        self.loggit.debug('Reindexing indices: %s', self.body['source']['index'])
206 def _get_request_body(self, source, dest):
207 body = deepcopy(self.body)
208 body['source']['index'] = source
209 body['dest']['index'] = dest
210 return body
212 def _get_reindex_args(self, source, dest):
213 # Always set wait_for_completion to False. Let 'wait_for_it' do its
214 # thing if wait_for_completion is set to True. Report the task_id
215 # either way.
216 reindex_args = {
217 'refresh':self.refresh,
218 'requests_per_second': self.requests_per_second,
219 'slices': self.slices,
220 'timeout': self.timeout,
221 'wait_for_active_shards': self.wait_for_active_shards,
222 'wait_for_completion': False,
223 }
224 for keyname in ['dest', 'source', 'conflicts', 'max_docs', 'size', '_source', 'script']:
225 if keyname in self.body:
226 reindex_args[keyname] = self.body[keyname]
227 # Mimic the _get_request_body(source, dest) behavior by casting these values here instead
228 reindex_args['dest']['index'] = dest
229 reindex_args['source']['index'] = source
230 return reindex_args
232 def get_processed_items(self, task_id):
233 """
234 This function calls :py:func:`~.elasticsearch.client.TasksClient.get` with the provided
235 ``task_id``. It will get the value from ``'response.total'`` as the total number of
236 elements processed during reindexing. If the value is not found, it will return ``-1``
238 :param task_id: A task_id which ostensibly matches a task searchable in the tasks API.
239 """
240 try:
241 task_data = self.client.tasks.get(task_id=task_id)
242 except Exception as exc:
243 raise CuratorException(
244 f'Unable to obtain task information for task_id "{task_id}". Exception {exc}'
245 ) from exc
246 total_processed_items = -1
247 task = task_data['task']
248 if task['action'] == 'indices:data/write/reindex':
249 self.loggit.debug("It's a REINDEX TASK'")
250 self.loggit.debug('TASK_DATA: %s', task_data)
251 self.loggit.debug('TASK_DATA keys: %s', list(task_data.keys()))
252 if 'response' in task_data:
253 response = task_data['response']
254 total_processed_items = response['total']
255 self.loggit.debug('total_processed_items = %s', total_processed_items)
256 return total_processed_items
258 def _post_run_quick_check(self, index_name, task_id):
259 # Check whether any documents were processed
260 # if no documents processed, the target index "dest" won't exist
261 processed_items = self.get_processed_items(task_id)
262 if processed_items == 0:
263 msg = f'No items were processed. Will not check if target index "{index_name}" exists'
264 self.loggit.info(msg)
265 else:
266 # Verify the destination index is there after the fact
267 index_exists = self.client.indices.exists(index=index_name)
268 alias_instead = self.client.indices.exists_alias(name=index_name)
269 if not index_exists and not alias_instead:
270 # pylint: disable=logging-fstring-interpolation
271 self.loggit.error(
272 f'The index described as "{index_name}" was not found after the reindex '
273 f'operation. Check Elasticsearch logs for more '
274 f'information.'
275 )
276 if self.remote:
277 # pylint: disable=logging-fstring-interpolation
278 self.loggit.error(
279 f'Did you forget to add "reindex.remote.whitelist: '
280 f'{self.remote_host}:{self.remote_port}" to the elasticsearch.yml file on '
281 f'the "dest" node?'
282 )
283 raise FailedExecution(
284 f'Reindex failed. The index or alias identified by "{index_name}" was '
285 f'not found.'
286 )
288 def sources(self):
289 """Generator for Reindexing ``sources`` & ``dests``"""
290 dest = self.body['dest']['index']
291 source_list = ensure_list(self.body['source']['index'])
292 self.loggit.debug('source_list: %s', source_list)
293 if not source_list or source_list == ['REINDEX_SELECTED']: # Empty list
294 raise NoIndices
295 if not self.migration:
296 yield self.body['source']['index'], dest
298 # Loop over all sources (default will only be one)
299 else:
300 for source in source_list:
301 if self.migration:
302 dest = self.mpfx + source + self.msfx
303 yield source, dest
305 def show_run_args(self, source, dest):
306 """Show what will run"""
307 return (
308 f'request body: {self._get_request_body(source, dest)} with arguments: '
309 f'refresh={self.refresh} '
310 f'requests_per_second={self.requests_per_second} '
311 f'slices={self.slices} '
312 f'timeout={self.timeout} '
313 f'wait_for_active_shards={self.wait_for_active_shards} '
314 f'wait_for_completion={self.wfc}'
315 )
317 def do_dry_run(self):
318 """Log what the output would be, but take no action."""
319 self.loggit.info('DRY-RUN MODE. No changes will be made.')
320 for source, dest in self.sources():
321 self.loggit.info('DRY-RUN: REINDEX: %s', self.show_run_args(source, dest))
323 def do_action(self):
324 """
325 Execute :py:meth:`~.elasticsearch.Elasticsearch.reindex` operation with the
326 ``request_body`` from :py:meth:`_get_request_body` and arguments :py:attr:`refresh`,
327 :py:attr:`requests_per_second`, :py:attr:`slices`, :py:attr:`timeout`,
328 :py:attr:`wait_for_active_shards`, and :py:attr:`wfc`.
329 """
330 try:
331 # Loop over all sources (default will only be one)
332 for source, dest in self.sources():
333 self.loggit.info('Commencing reindex operation')
334 self.loggit.debug('REINDEX: %s', self.show_run_args(source, dest))
335 response = self.client.reindex(**self._get_reindex_args(source, dest))
337 self.loggit.debug('TASK ID = %s', response['task'])
338 if self.wfc:
339 wait_for_it(
340 self.client, 'reindex', task_id=response['task'],
341 wait_interval=self.wait_interval, max_wait=self.max_wait
342 )
343 self._post_run_quick_check(dest, response['task'])
345 else:
346 msg = (
347 f'"wait_for_completion" set to {self.wfc}. Remember to check task_id '
348 f"\"{response['task']}\" for successful completion manually."
349 )
350 self.loggit.warning(msg)
351 except NoIndices as exc:
352 raise NoIndices(
353 'Source index must be list of actual indices. It must not be an empty list.'
354 ) from exc
355 except Exception as exc:
356 report_failure(exc)