Coverage for curator/actions/reindex.py: 95%

167 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-20 21:00 -0600

1"""Reindex action class""" 

2import logging 

3from copy import deepcopy 

4# pylint: disable=broad-except 

5from es_client.builder import ClientArgs, OtherArgs, Builder 

6from es_client.helpers.utils import ensure_list, verify_url_schema 

7from es_client.exceptions import ConfigurationError 

8from curator.exceptions import CuratorException, FailedExecution, NoIndices 

9from curator.exceptions import ConfigurationError as CuratorConfigError # Separate from es_client 

10from curator.helpers.testers import verify_index_list 

11from curator.helpers.utils import report_failure 

12from curator.helpers.waiters import wait_for_it 

13from curator import IndexList 

14 

class Reindex:
    """Reindex Action Class"""
    def __init__(
        self, ilo, request_body, refresh=True, requests_per_second=-1, slices=1, timeout=60,
        wait_for_active_shards=1, wait_for_completion=True, max_wait=-1, wait_interval=9,
        remote_certificate=None, remote_client_cert=None, remote_client_key=None,
        remote_filters=None, migration_prefix='', migration_suffix=''
    ):
        """
        :param ilo: An IndexList Object
        :param request_body: The body to send to :py:meth:`~.elasticsearch.Elasticsearch.reindex`,
            which must be complete and usable, as Curator will do no vetting of the request_body.
            If it fails to function, Curator will return an exception.
        :param refresh: Whether to refresh the entire target index after the operation is complete.
        :param requests_per_second: The throttle to set on this request in sub-requests per second.
            ``-1`` means set no throttle as does ``unlimited`` which is the only non-float this
            accepts.
        :param slices: The number of slices this task should be divided into. ``1`` means the task
            will not be sliced into subtasks. (Default: ``1``)
        :param timeout: The length in seconds each individual bulk request should wait for shards
            that are unavailable. (default: ``60``)
        :param wait_for_active_shards: Sets the number of shard copies that must be active before
            proceeding with the reindex operation. (Default: ``1``) means the primary shard only.
            Set to ``all`` for all shard copies, otherwise set to any non-negative value less than
            or equal to the total number of copies for the shard (number of replicas + 1)
        :param wait_for_completion: Wait for completion before returning.
        :param wait_interval: Seconds to wait between completion checks.
        :param max_wait: Maximum number of seconds to ``wait_for_completion``
        :param remote_certificate: Path to SSL/TLS certificate
        :param remote_client_cert: Path to SSL/TLS client certificate (public key)
        :param remote_client_key: Path to SSL/TLS private key
        :param remote_filters: Filters to apply to the remote index list when the remote source
            index is the ``REINDEX_SELECTION`` token.
        :param migration_prefix: When migrating, prepend this value to the index name.
        :param migration_suffix: When migrating, append this value to the index name.

        :type ilo: :py:class:`~.curator.indexlist.IndexList`
        :type request_body: dict
        :type refresh: bool
        :type requests_per_second: int
        :type slices: int
        :type timeout: int
        :type wait_for_active_shards: int
        :type wait_for_completion: bool
        :type wait_interval: int
        :type max_wait: int
        :type remote_certificate: str
        :type remote_client_cert: str
        :type remote_client_key: str
        :type remote_filters: dict
        :type migration_prefix: str
        :type migration_suffix: str
        """
        # None sentinel avoids the mutable-default-argument pitfall
        if remote_filters is None:
            remote_filters = {}
        self.loggit = logging.getLogger('curator.actions.reindex')
        verify_index_list(ilo)
        if not isinstance(request_body, dict):
            raise CuratorConfigError('"request_body" is not of type dictionary')
        #: Object attribute that gets the value of param ``request_body``.
        self.body = request_body
        self.loggit.debug('REQUEST_BODY = %s', request_body)
        #: The :py:class:`~.curator.indexlist.IndexList` object passed from param ``ilo``
        self.index_list = ilo
        #: The :py:class:`~.elasticsearch.Elasticsearch` client object derived from
        #: :py:attr:`index_list`
        self.client = ilo.client
        #: Object attribute that gets the value of param ``refresh``.
        self.refresh = refresh
        #: Object attribute that gets the value of param ``requests_per_second``.
        self.requests_per_second = requests_per_second
        #: Object attribute that gets the value of param ``slices``.
        self.slices = slices
        #: Object attribute that gets the value of param ``timeout``, convert to :py:class:`str`
        #: and add ``s`` for seconds.
        self.timeout = f'{timeout}s'
        #: Object attribute that gets the value of param ``wait_for_active_shards``.
        self.wait_for_active_shards = wait_for_active_shards
        #: Object attribute that gets the value of param ``wait_for_completion``.
        self.wfc = wait_for_completion
        #: Object attribute that gets the value of param ``wait_interval``.
        self.wait_interval = wait_interval
        #: Object attribute that gets the value of param ``max_wait``.
        self.max_wait = max_wait
        #: Object attribute that gets the value of param ``migration_prefix``.
        self.mpfx = migration_prefix
        #: Object attribute that gets the value of param ``migration_suffix``.
        self.msfx = migration_suffix

        #: Object attribute that is set ``False`` unless :py:attr:`body` has
        #: ``{'source': {'remote': {}}}``, then it is set ``True``
        self.remote = False
        if 'remote' in self.body['source']:
            self.remote = True

        #: Object attribute that is set ``False`` unless :py:attr:`body` has
        #: ``{'dest': {'index': 'MIGRATION'}}``, then it is set ``True``
        self.migration = False
        if self.body['dest']['index'] == 'MIGRATION':
            self.migration = True

        if self.migration:
            # A MIGRATION dest only makes sense when the new name can be derived
            # (prefix/suffix) or the source is remote.
            if not self.remote and not self.mpfx and not self.msfx:
                raise CuratorConfigError(
                    'MIGRATION can only be used locally with one or both of '
                    'migration_prefix or migration_suffix.'
                )

        # REINDEX_SELECTION is the designated token. If you use this for the
        # source "index," it will be replaced with the list of indices from the
        # provided 'ilo' (index list object).
        if self.body['source']['index'] == 'REINDEX_SELECTION' \
                and not self.remote:
            self.body['source']['index'] = self.index_list.indices

        # Remote section
        elif self.remote:
            rclient_args = ClientArgs()
            rother_args = OtherArgs()
            self.loggit.debug('Remote reindex request detected')
            if 'host' not in self.body['source']['remote']:
                raise CuratorConfigError('Missing remote "host"')
            try:
                rclient_args.hosts = verify_url_schema(self.body['source']['remote']['host'])
            except ConfigurationError as exc:
                # Re-raise as Curator's own ConfigurationError type
                raise CuratorConfigError(exc) from exc

            # Now that the URL schema is verified, these will pass.
            # NOTE(review): this assumes hosts is of the form scheme://host:port,
            # which verify_url_schema should guarantee — confirm.
            self.remote_host = rclient_args.hosts.split(':')[-2]
            self.remote_host = self.remote_host.split('/')[2]
            self.remote_port = rclient_args.hosts.split(':')[-1]

            if 'username' in self.body['source']['remote']:
                rother_args.username = self.body['source']['remote']['username']
            if 'password' in self.body['source']['remote']:
                rother_args.password = self.body['source']['remote']['password']
            if remote_certificate:
                rclient_args.ca_certs = remote_certificate
            if remote_client_cert:
                rclient_args.client_cert = remote_client_cert
            if remote_client_key:
                rclient_args.client_key = remote_client_key

            # Let's set a decent remote timeout for initially reading
            # the indices on the other side, and collecting their metadata
            rclient_args.remote_timeout = 180

            # The rest only applies if using filters for remote indices
            if self.body['source']['index'] == 'REINDEX_SELECTION':
                self.loggit.debug('Filtering indices from remote')
                msg = (
                    f'Remote client args: '
                    f'hosts={rclient_args.hosts} '
                    f'username=REDACTED '
                    f'password=REDACTED '
                    f'certificate={remote_certificate} '
                    f'client_cert={remote_client_cert} '
                    f'client_key={remote_client_key} '
                    f'request_timeout={rclient_args.remote_timeout} '
                    f'skip_version_test=True'
                )
                self.loggit.debug(msg)
                remote_config = {
                    'elasticsearch': {
                        'client': rclient_args.asdict(),
                        'other_settings': rother_args.asdict()
                    }
                }
                try:  # let's try to build a remote connection with these!
                    builder = Builder(configdict=remote_config, version_min=(1, 0, 0))
                    builder.connect()
                    rclient = builder.client
                except Exception as err:
                    self.loggit.error(
                        'Unable to establish connection to remote Elasticsearch'
                        ' with provided credentials/certificates/settings.'
                    )
                    report_failure(err)
                try:
                    rio = IndexList(rclient)
                    rio.iterate_filters({'filters': remote_filters})
                    try:
                        rio.empty_list_check()
                    except NoIndices as exc:
                        raise FailedExecution(
                            'No actionable remote indices selected after applying filters.'
                        ) from exc
                    self.body['source']['index'] = rio.indices
                except Exception as err:
                    self.loggit.error('Unable to get/filter list of remote indices.')
                    report_failure(err)

        self.loggit.debug('Reindexing indices: %s', self.body['source']['index'])

    def _get_request_body(self, source, dest):
        """Return a deep copy of :py:attr:`body` with source/dest indices overridden."""
        body = deepcopy(self.body)
        body['source']['index'] = source
        body['dest']['index'] = dest
        return body

    def _get_reindex_args(self, source, dest):
        """Build the keyword arguments for the reindex API call."""
        # Always set wait_for_completion to False. Let 'wait_for_it' do its
        # thing if wait_for_completion is set to True. Report the task_id
        # either way.
        reindex_args = {
            'refresh': self.refresh,
            'requests_per_second': self.requests_per_second,
            'slices': self.slices,
            'timeout': self.timeout,
            'wait_for_active_shards': self.wait_for_active_shards,
            'wait_for_completion': False,
        }
        for keyname in ['dest', 'source', 'conflicts', 'max_docs', 'size', '_source', 'script']:
            if keyname in self.body:
                # deepcopy so that overriding 'index' below cannot mutate
                # self.body (mimics _get_request_body, which also deep-copies).
                # Previously these were reference copies, so assigning into
                # reindex_args['dest']/['source'] clobbered self.body between
                # loop iterations in do_action.
                reindex_args[keyname] = deepcopy(self.body[keyname])
        reindex_args['dest']['index'] = dest
        reindex_args['source']['index'] = source
        return reindex_args

    def get_processed_items(self, task_id):
        """
        This function calls :py:func:`~.elasticsearch.client.TasksClient.get` with the provided
        ``task_id``. It will get the value from ``'response.total'`` as the total number of
        elements processed during reindexing. If the value is not found, it will return ``-1``

        :param task_id: A task_id which ostensibly matches a task searchable in the tasks API.
        """
        try:
            task_data = self.client.tasks.get(task_id=task_id)
        except Exception as exc:
            raise CuratorException(
                f'Unable to obtain task information for task_id "{task_id}". Exception {exc}'
            ) from exc
        total_processed_items = -1
        task = task_data['task']
        if task['action'] == 'indices:data/write/reindex':
            self.loggit.debug("It's a REINDEX TASK")
            self.loggit.debug('TASK_DATA: %s', task_data)
            self.loggit.debug('TASK_DATA keys: %s', list(task_data.keys()))
            if 'response' in task_data:
                response = task_data['response']
                total_processed_items = response['total']
                self.loggit.debug('total_processed_items = %s', total_processed_items)
        return total_processed_items

    def _post_run_quick_check(self, index_name, task_id):
        """Sanity-check that the reindex target exists after the operation.

        Raises :py:exc:`~.curator.exceptions.FailedExecution` if documents were
        processed but neither an index nor an alias named ``index_name`` exists.
        """
        # Check whether any documents were processed
        # if no documents processed, the target index "dest" won't exist
        processed_items = self.get_processed_items(task_id)
        if processed_items == 0:
            msg = f'No items were processed. Will not check if target index "{index_name}" exists'
            self.loggit.info(msg)
        else:
            # Verify the destination index is there after the fact
            index_exists = self.client.indices.exists(index=index_name)
            alias_instead = self.client.indices.exists_alias(name=index_name)
            if not index_exists and not alias_instead:
                # pylint: disable=logging-fstring-interpolation
                self.loggit.error(
                    f'The index described as "{index_name}" was not found after the reindex '
                    f'operation. Check Elasticsearch logs for more '
                    f'information.'
                )
                if self.remote:
                    # pylint: disable=logging-fstring-interpolation
                    self.loggit.error(
                        f'Did you forget to add "reindex.remote.whitelist: '
                        f'{self.remote_host}:{self.remote_port}" to the elasticsearch.yml file on '
                        f'the "dest" node?'
                    )
                raise FailedExecution(
                    f'Reindex failed. The index or alias identified by "{index_name}" was '
                    f'not found.'
                )

    def sources(self):
        """Generator for Reindexing ``sources`` & ``dests``"""
        dest = self.body['dest']['index']
        source_list = ensure_list(self.body['source']['index'])
        self.loggit.debug('source_list: %s', source_list)
        # An empty list, or the still-unreplaced designated token, means there
        # is nothing actionable to reindex. (The token is REINDEX_SELECTION,
        # matching the substitutions performed in __init__; the previous
        # comparison against 'REINDEX_SELECTED' could never match.)
        if not source_list or source_list == ['REINDEX_SELECTION']:  # Empty list
            raise NoIndices
        if not self.migration:
            yield self.body['source']['index'], dest
        else:
            # Loop over all sources (default will only be one), deriving each
            # migration destination from the prefix/suffix.
            for source in source_list:
                dest = self.mpfx + source + self.msfx
                yield source, dest

    def show_run_args(self, source, dest):
        """Show what will run"""
        return (
            f'request body: {self._get_request_body(source, dest)} with arguments: '
            f'refresh={self.refresh} '
            f'requests_per_second={self.requests_per_second} '
            f'slices={self.slices} '
            f'timeout={self.timeout} '
            f'wait_for_active_shards={self.wait_for_active_shards} '
            f'wait_for_completion={self.wfc}'
        )

    def do_dry_run(self):
        """Log what the output would be, but take no action."""
        self.loggit.info('DRY-RUN MODE. No changes will be made.')
        for source, dest in self.sources():
            self.loggit.info('DRY-RUN: REINDEX: %s', self.show_run_args(source, dest))

    def do_action(self):
        """
        Execute :py:meth:`~.elasticsearch.Elasticsearch.reindex` operation with the
        ``request_body`` from :py:meth:`_get_request_body` and arguments :py:attr:`refresh`,
        :py:attr:`requests_per_second`, :py:attr:`slices`, :py:attr:`timeout`,
        :py:attr:`wait_for_active_shards`, and :py:attr:`wfc`.
        """
        try:
            # Loop over all sources (default will only be one)
            for source, dest in self.sources():
                self.loggit.info('Commencing reindex operation')
                self.loggit.debug('REINDEX: %s', self.show_run_args(source, dest))
                response = self.client.reindex(**self._get_reindex_args(source, dest))

                self.loggit.debug('TASK ID = %s', response['task'])
                if self.wfc:
                    wait_for_it(
                        self.client, 'reindex', task_id=response['task'],
                        wait_interval=self.wait_interval, max_wait=self.max_wait
                    )
                    self._post_run_quick_check(dest, response['task'])

                else:
                    msg = (
                        f'"wait_for_completion" set to {self.wfc}. Remember to check task_id '
                        f"\"{response['task']}\" for successful completion manually."
                    )
                    self.loggit.warning(msg)
        except NoIndices as exc:
            raise NoIndices(
                'Source index must be list of actual indices. It must not be an empty list.'
            ) from exc
        except Exception as exc:
            report_failure(exc)