Coverage for curator/actions/shrink.py: 97%

273 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-20 21:00 -0600

1"""Reindex action class""" 

2import logging 

3# pylint: disable=broad-except 

4from curator.exceptions import ActionError, ConfigurationError 

5from curator.helpers.getters import index_size, name_to_node_id, node_id_to_name, node_roles 

6from curator.helpers.testers import verify_index_list 

7from curator.helpers.utils import chunk_index_list, report_failure 

8from curator.helpers.waiters import health_check, wait_for_it 

9 

10 

11class Shrink: 

12 """Shrink Action Class""" 

    def __init__(
        self, ilo, shrink_node='DETERMINISTIC', node_filters=None, number_of_shards=1,
        number_of_replicas=1, shrink_prefix='', shrink_suffix='-shrink', copy_aliases=False,
        delete_after=True, post_allocation=None, wait_for_active_shards=1,
        wait_for_rebalance=True, extra_settings=None, wait_for_completion=True, wait_interval=9,
        max_wait=-1
    ):
        """
        :param ilo: An IndexList Object
        :param shrink_node: The node name to use as the shrink target, or ``DETERMINISTIC``, which
            will use the values in ``node_filters`` to determine which node will be the shrink
            node.
        :param node_filters: If the value of ``shrink_node`` is ``DETERMINISTIC``, the values in
            ``node_filters`` will be used while determining which node to allocate the shards on
            before performing the shrink.
        :param number_of_shards: The number of shards the shrunk index should have
        :param number_of_replicas: The number of replicas for the shrunk index
        :param shrink_prefix: Prepend the shrunk index with this value
        :param shrink_suffix: Append the value to the shrunk index (Default: ``-shrink``)
        :param copy_aliases: Whether to copy each source index aliases to target index after
            shrinking. The aliases will be added to target index and deleted from source index at
            the same time. (Default: ``False``)
        :param delete_after: Whether to delete each index after shrinking. (Default: ``True``)
        :param post_allocation: If populated, the ``allocation_type``, ``key``, and ``value`` will
            be applied to the shrunk index to re-route it.
        :param extra_settings: Permitted root keys are ``settings`` and ``aliases``.
        :param wait_for_active_shards: Wait for this many active shards before returning.
        :param wait_for_rebalance: Wait for rebalance. (Default: ``True``)
        :param wait_for_completion: Wait for completion before returning.
        :param wait_interval: Seconds to wait between completion checks.
        :param max_wait: Maximum number of seconds to ``wait_for_completion``

        :type ilo: :py:class:`~.curator.indexlist.IndexList`
        :type shrink_node: str
        :type node_filters: dict
        :type number_of_shards: int
        :type number_of_replicas: int
        :type shrink_prefix: str
        :type shrink_suffix: str
        :type copy_aliases: bool
        :type delete_after: bool
        :type post_allocation: dict
        :type extra_settings: dict
        :type wait_for_active_shards: int
        :type wait_for_rebalance: bool
        :type wait_for_completion: bool
        :type wait_interval: int
        :type max_wait: int
        """
        # Mutable defaults are created per-instance (never as shared dict defaults)
        if node_filters is None:
            node_filters = {}
        if post_allocation is None:
            post_allocation = {}
        if extra_settings is None:
            extra_settings = {}
        self.loggit = logging.getLogger('curator.actions.shrink')
        verify_index_list(ilo)
        # Master nodes are not eligible shrink targets unless explicitly permitted
        if 'permit_masters' not in node_filters:
            node_filters['permit_masters'] = False
        #: The :py:class:`~.curator.indexlist.IndexList` object passed from param ``ilo``
        self.index_list = ilo
        #: The :py:class:`~.elasticsearch.Elasticsearch` client object derived from
        #: :py:attr:`index_list`
        self.client = ilo.client
        #: Object attribute that gets the value of param ``shrink_node``.
        self.shrink_node = shrink_node
        #: Object attribute that gets the value of param ``node_filters``.
        self.node_filters = node_filters
        #: Object attribute that gets the value of param ``shrink_prefix``.
        self.shrink_prefix = shrink_prefix
        #: Object attribute that gets the value of param ``shrink_suffix``.
        self.shrink_suffix = shrink_suffix
        #: Object attribute that gets the value of param ``copy_aliases``.
        self.copy_aliases = copy_aliases
        #: Object attribute that gets the value of param ``delete_after``.
        self.delete_after = delete_after
        #: Object attribute that gets the value of param ``post_allocation``.
        self.post_allocation = post_allocation
        #: Object attribute that gets the value of param ``wait_for_rebalance``.
        self.wait_for_rebalance = wait_for_rebalance
        #: Object attribute that gets the value of param ``wait_for_completion``.
        self.wfc = wait_for_completion
        #: Object attribute that gets the value of param ``wait_interval``.
        self.wait_interval = wait_interval
        #: Object attribute that gets the value of param ``max_wait``.
        self.max_wait = max_wait
        #: Object attribute that gets the value of param ``number_of_shards``.
        self.number_of_shards = number_of_shards
        #: Object attribute that gets the value of param ``wait_for_active_shards``.
        self.wait_for_active_shards = wait_for_active_shards

        #: Object attribute that represents the target node for shrinking.
        self.shrink_node_name = None
        #: Object attribute that represents whether :py:attr:`shrink_node_name` is available
        self.shrink_node_avail = None
        #: Object attribute that represents the node_id of :py:attr:`shrink_node_name`
        self.shrink_node_id = None

        #: Object attribute that gets values from params ``number_of_shards`` and
        #: ``number_of_replicas``.
        self.settings = {
            'index.number_of_shards' : number_of_shards,
            'index.number_of_replicas' : number_of_replicas,
        }

        if extra_settings:
            self._merge_extra_settings(extra_settings)

        # Placeholder keys (None) for the routing/write-block settings this
        # action manages itself during the shrink workflow
        self._merge_extra_settings({
            'settings': {
                'index.routing.allocation.require._name': None,
                'index.blocks.write': None
            }})

126 

127 def _merge_extra_settings(self, extra_settings): 

128 self.loggit.debug('Adding extra_settings to shrink body: %s', extra_settings) 

129 # Pop these here, otherwise we could overwrite our default number of 

130 # shards and replicas 

131 if 'settings' in extra_settings: 

132 settings = extra_settings.pop('settings') 

133 try: 

134 self.settings.update(settings) 

135 except Exception as exc: 

136 raise ConfigurationError( 

137 f"Unable to apply extra settings \"{{'settings':settings}}\" " 

138 f"to shrink body. Exception: {exc}" 

139 ) from exc 

140 if extra_settings: 

141 try: # Apply any remaining keys, should there be any. 

142 self.settings.update(extra_settings) 

143 except Exception as exc: 

144 raise ConfigurationError( 

145 f'Unable to apply extra settings "{extra_settings}" ' 

146 f'to shrink body. Exception: {exc}' 

147 ) from exc 

148 

149 def _data_node(self, node_id): 

150 roles = node_roles(self.client, node_id) 

151 name = node_id_to_name(self.client, node_id) 

152 if not 'data' in roles: 

153 self.loggit.info('Skipping node "%s": non-data node', name) 

154 return False 

155 if 'master' in roles and not self.node_filters['permit_masters']: 

156 self.loggit.info('Skipping node "%s": master node', name) 

157 return False 

158 elif 'master' in roles and self.node_filters['permit_masters']: 

159 msg = ( 

160 f'Not skipping node "{name}" which is a master node (not recommended), but ' 

161 f'permit_masters is True' 

162 ) 

163 self.loggit.warning(msg) 

164 return True 

165 else: # It does have `data` as a role. 

166 return True 

167 

168 def _exclude_node(self, name): 

169 if 'exclude_nodes' in self.node_filters: 

170 if name in self.node_filters['exclude_nodes']: 

171 self.loggit.info('Excluding node "%s" due to node_filters', name) 

172 return True 

173 return False 

174 

175 def _shrink_target(self, name): 

176 return f'{self.shrink_prefix}{name}{self.shrink_suffix}' 

177 

178 def qualify_single_node(self): 

179 """Qualify a single node as a shrink target""" 

180 node_id = name_to_node_id(self.client, self.shrink_node) 

181 if node_id: 

182 self.shrink_node_id = node_id 

183 self.shrink_node_name = self.shrink_node 

184 else: 

185 raise ConfigurationError( 

186 f'Unable to find node named: "{self.shrink_node}"') 

187 if self._exclude_node(self.shrink_node): 

188 raise ConfigurationError( 

189 f'Node "{self.shrink_node}" listed for exclusion') 

190 if not self._data_node(node_id): 

191 raise ActionError( 

192 f'Node "{self.shrink_node}" is not usable as a shrink node') 

193 self.shrink_node_avail = ( 

194 self.client.nodes.stats()['nodes'][node_id]['fs']['total']['available_in_bytes'] 

195 ) 

196 

197 def most_available_node(self): 

198 """ 

199 Determine which data node name has the most available free space, and meets the other node 

200 filters settings. 

201 """ 

202 mvn_avail = 0 

203 # mvn_total = 0 

204 mvn_name = None 

205 mvn_id = None 

206 nodes = self.client.nodes.stats()['nodes'] 

207 for node_id in nodes: 

208 name = nodes[node_id]['name'] 

209 if self._exclude_node(name): 

210 self.loggit.debug('Node "%s" excluded by node filters', name) 

211 continue 

212 if not self._data_node(node_id): 

213 self.loggit.debug('Node "%s" is not a data node', name) 

214 continue 

215 value = nodes[node_id]['fs']['total']['available_in_bytes'] 

216 if value > mvn_avail: 

217 mvn_name = name 

218 mvn_id = node_id 

219 mvn_avail = value 

220 self.shrink_node_name = mvn_name 

221 self.shrink_node_id = mvn_id 

222 self.shrink_node_avail = mvn_avail 

223 

224 def route_index(self, idx, allocation_type, key, value): 

225 """Apply the indicated shard routing allocation""" 

226 bkey = f'index.routing.allocation.{allocation_type}.{key}' 

227 routing = {bkey : value} 

228 try: 

229 self.client.indices.put_settings(index=idx, settings=routing) 

230 if self.wait_for_rebalance: 

231 wait_for_it( 

232 self.client, 'allocation', wait_interval=self.wait_interval, 

233 max_wait=self.max_wait 

234 ) 

235 else: 

236 wait_for_it( 

237 self.client, 'relocate', index=idx, wait_interval=self.wait_interval, 

238 max_wait=self.max_wait 

239 ) 

240 except Exception as err: 

241 report_failure(err) 

242 

243 def __log_action(self, error_msg, dry_run=False): 

244 if not dry_run: 

245 raise ActionError(error_msg) 

246 else: 

247 self.loggit.warning('DRY-RUN: %s', error_msg) 

248 

249 def _block_writes(self, idx): 

250 block = {'index.blocks.write': True} 

251 self.client.indices.put_settings(index=idx, settings=block) 

252 

253 def _unblock_writes(self, idx): 

254 unblock = {'index.blocks.write': False} 

255 self.client.indices.put_settings(index=idx, settings=unblock) 

256 

257 def _check_space(self, idx, dry_run=False): 

258 # Disk watermark calculation is already baked into `available_in_bytes` 

259 size = index_size(self.client, idx, value='primaries') 

260 padded = (size * 2) + (32 * 1024) 

261 if padded < self.shrink_node_avail: 

262 msg = ( 

263 f'Sufficient space available for 2x the size of index "{idx}". ' 

264 f'Required: {padded}, available: {self.shrink_node_avail}' 

265 ) 

266 self.loggit.debug(msg) 

267 else: 

268 error_msg = ( 

269 f'Insufficient space available for 2x the size of index "{idx}", shrinking will ' 

270 f'exceed space available. Required: {padded}, available: {self.shrink_node_avail}' 

271 ) 

272 self.__log_action(error_msg, dry_run) 

273 

274 def _check_node(self): 

275 if self.shrink_node != 'DETERMINISTIC': 

276 if not self.shrink_node_name: 

277 self.qualify_single_node() 

278 else: 

279 self.most_available_node() 

280 # At this point, we should have the three shrink-node identifying 

281 # instance variables: 

282 # - self.shrink_node_name 

283 # - self.shrink_node_id 

284 # - self.shrink_node_avail 

285 # # - self.shrink_node_total - only if needed in the future 

286 

287 def _check_target_exists(self, idx, dry_run=False): 

288 target = self._shrink_target(idx) 

289 if self.client.indices.exists(index=target): 

290 error_msg = f'Target index "{target}" already exists' 

291 self.__log_action(error_msg, dry_run) 

292 

293 def _check_doc_count(self, idx, dry_run=False): 

294 max_docs = 2147483519 

295 doc_count = self.client.indices.stats(index=idx)['indices'][idx]['primaries']['docs']['count'] 

296 if doc_count > (max_docs * self.number_of_shards): 

297 error_msg = ( 

298 f'Too many documents ({doc_count}) to fit in {self.number_of_shards} shard(s). ' 

299 f'Maximum number of docs per shard is {max_docs}' 

300 ) 

301 self.__log_action(error_msg, dry_run) 

302 

303 def _check_shard_count(self, idx, src_shards, dry_run=False): 

304 if self.number_of_shards >= src_shards: 

305 error_msg = ( 

306 f'Target number of shards ({self.number_of_shards}) must be less than current ' 

307 f'number of shards ({src_shards}) in index "{idx}"' 

308 ) 

309 self.__log_action(error_msg, dry_run) 

310 

311 def _check_shard_factor(self, idx, src_shards, dry_run=False): 

312 # Find the list of factors of src_shards 

313 factors = [x for x in range(1, src_shards+1) if src_shards % x == 0] 

314 # Pop the last one, because it will be the value of src_shards 

315 factors.pop() 

316 if not self.number_of_shards in factors: 

317 error_msg = ( 

318 f'"{self.number_of_shards}" is not a valid factor of {src_shards} shards of ' 

319 f'index {idx}. Valid values are {factors}' 

320 ) 

321 self.__log_action(error_msg, dry_run) 

322 

323 def _check_all_shards(self, idx): 

324 shards = self.client.cluster.state(index=idx)['routing_table']['indices'][idx]['shards'] 

325 found = [] 

326 for shardnum in shards: 

327 for shard_idx in range(0, len(shards[shardnum])): 

328 if shards[shardnum][shard_idx]['node'] == self.shrink_node_id: 

329 found.append( 

330 {'shard': shardnum, 'primary': shards[shardnum][shard_idx]['primary']}) 

331 if len(shards) != len(found): 

332 self.loggit.debug( 

333 'Found these shards on node "%s": %s', self.shrink_node_name, found) 

334 raise ActionError( 

335 f'Unable to shrink index "{idx}" as not all shards were found on the designated ' 

336 f'shrink node ({self.shrink_node_name}): {found}' 

337 ) 

338 

    def pre_shrink_check(self, idx, dry_run=False):
        """Do a shrink preflight check

        Runs every validation before any shrink work begins: target existence,
        doc count, shard count, shard factor, shrink-node qualification, and
        available disk space. In dry-run mode violations are logged rather
        than raised.

        :param idx: The source index name
        :param dry_run: If ``True``, only log constraint violations
        :type idx: str
        :type dry_run: bool
        """
        self.loggit.debug('BEGIN PRE_SHRINK_CHECK')
        self.loggit.debug('Check that target exists')
        self._check_target_exists(idx, dry_run)
        self.loggit.debug('Check doc count constraints')
        self._check_doc_count(idx, dry_run)
        self.loggit.debug('Check shard count')
        # Source shard count comes back as a string from the index settings
        src_shards = int(self.client.indices.get(index=idx)[idx]['settings']['index']['number_of_shards'])
        self._check_shard_count(idx, src_shards, dry_run)
        self.loggit.debug('Check shard factor')
        self._check_shard_factor(idx, src_shards, dry_run)
        self.loggit.debug('Check node availability')
        self._check_node()
        self.loggit.debug('Check available disk space')
        # Must come after _check_node, which sets shrink_node_avail
        self._check_space(idx, dry_run)
        self.loggit.debug('FINISH PRE_SHRINK_CHECK')

356 

357 def do_copy_aliases(self, source_idx, target_idx): 

358 """Copy the aliases to the shrunk index""" 

359 alias_actions = [] 

360 aliases = self.client.indices.get_alias(index=source_idx) 

361 for alias in aliases[source_idx]['aliases']: 

362 self.loggit.debug('alias: %s', alias) 

363 alias_actions.append({'remove': {'index': source_idx, 'alias': alias}}) 

364 alias_actions.append({'add': {'index': target_idx, 'alias': alias}}) 

365 if alias_actions: 

366 self.loggit.info('Copy alias actions: %s', alias_actions) 

367 self.client.indices.update_aliases(actions=alias_actions) 

368 

    def do_dry_run(self):
        """Show what a regular run would do, but don't actually do it.

        Applies the same index filters as :py:meth:`do_action`, runs the
        preflight checks in dry-run mode, and logs each step that a real run
        would perform.
        """
        # Closed indices and indices already at the target shard count are skipped
        self.index_list.filter_closed()
        self.index_list.filter_by_shards(number_of_shards=self.number_of_shards)
        self.index_list.empty_list_check()
        try:
            index_lists = chunk_index_list(self.index_list.indices)
            for lst in index_lists:
                for idx in lst:  # Shrink can only be done one at a time...
                    target = self._shrink_target(idx)
                    # dry_run=True: preflight violations are logged, not raised
                    self.pre_shrink_check(idx, dry_run=True)
                    self.loggit.info(
                        'DRY-RUN: Moving shards to shrink node: "%s"', self.shrink_node_name)
                    msg = (
                        f'DRY-RUN: Shrinking index "{idx}" to "{target}" with settings: '
                        f'{self.settings}, wait_for_active_shards={self.wait_for_active_shards}'
                    )
                    self.loggit.info(msg)
                    if self.post_allocation:
                        submsg = (
                            f"index.routing.allocation.{self.post_allocation['allocation_type']}."
                            f"{self.post_allocation['key']}:{self.post_allocation['value']}"
                        )
                        msg = (
                            f'DRY-RUN: Applying post-shrink allocation rule "{submsg}" to index '
                            f'"{target}"'
                        )
                        self.loggit.info(msg)
                    if self.copy_aliases:
                        msg = (
                            f'DRY-RUN: Copy source index aliases '
                            f'"{self.client.indices.get_alias(index=idx)}"'
                        )
                        self.loggit.info(msg)
                    if self.delete_after:
                        self.loggit.info('DRY-RUN: Deleting source index "%s"', idx)
        except Exception as err:
            report_failure(err)

407 

    def do_action(self):
        """
        :py:meth:`~.elasticsearch.client.IndicesClient.shrink` the indices in :py:attr:`index_list`

        For each index: preflight-check, route all shards to the shrink node,
        block writes, verify green cluster health, shrink, then perform the
        post-shrink steps (unblock writes, optional re-allocation, optional
        alias copy, delete or un-route the source index).
        """
        # Closed indices and indices already at the target shard count are skipped
        self.index_list.filter_closed()
        self.index_list.filter_by_shards(number_of_shards=self.number_of_shards)
        self.index_list.empty_list_check()
        msg = (
            f'Shrinking {len(self.index_list.indices)} selected indices: {self.index_list.indices}'
        )
        self.loggit.info(msg)
        try:
            index_lists = chunk_index_list(self.index_list.indices)
            for lst in index_lists:
                for idx in lst:  # Shrink can only be done one at a time...
                    target = self._shrink_target(idx)
                    self.loggit.info('Source index: %s -- Target index: %s', idx, target)
                    # Pre-check ensures disk space available for each pass of the loop
                    self.pre_shrink_check(idx)
                    # Route the index to the shrink node
                    self.loggit.info('Moving shards to shrink node: "%s"', self.shrink_node_name)
                    self.route_index(idx, 'require', '_name', self.shrink_node_name)
                    # Ensure a copy of each shard is present
                    self._check_all_shards(idx)
                    # Block writes on index (required by the shrink API)
                    self._block_writes(idx)
                    # Do final health check
                    if not health_check(self.client, status='green'):
                        raise ActionError(
                            'Unable to proceed with shrink action. Cluster health is not "green"')
                    # Do the shrink
                    msg = (
                        f'Shrinking index "{idx}" to "{target}" with settings: {self.settings}, '
                        f'wait_for_active_shards={self.wait_for_active_shards}'
                    )
                    self.loggit.info(msg)
                    try:
                        self.client.indices.shrink(
                            index=idx, target=target, settings=self.settings,
                            wait_for_active_shards=self.wait_for_active_shards
                        )
                        # Wait for it to complete
                        if self.wfc:
                            self.loggit.debug(
                                'Wait for shards to complete allocation for index: %s', target)
                            if self.wait_for_rebalance:
                                wait_for_it(
                                    self.client, 'shrink', wait_interval=self.wait_interval,
                                    max_wait=self.max_wait
                                )
                            else:
                                wait_for_it(
                                    self.client, 'relocate', index=target,
                                    wait_interval=self.wait_interval, max_wait=self.max_wait
                                )
                    except Exception as exc:
                        # Clean up a partially-created target so a retry can succeed
                        if self.client.indices.exists(index=target):
                            msg = (
                                f'Deleting target index "{target}" due to failure to complete '
                                f'shrink'
                            )
                            self.loggit.error(msg)
                            self.client.indices.delete(index=target)
                        raise ActionError(
                            f'Unable to shrink index "{idx}" -- Error: {exc}') from exc
                    self.loggit.info('Index "%s" successfully shrunk to "%s"', idx, target)
                    # Do post-shrink steps
                    # Unblock writes on index (just in case)
                    self._unblock_writes(idx)
                    ## Post-allocation, if enabled
                    if self.post_allocation:
                        submsg = (
                            f"index.routing.allocation.{self.post_allocation['allocation_type']}."
                            f"{self.post_allocation['key']}:{self.post_allocation['value']}"
                        )
                        msg = (
                            f'Applying post-shrink allocation rule "{submsg}" to index "{target}"'
                        )
                        self.loggit.info(msg)
                        self.route_index(
                            target, self.post_allocation['allocation_type'],
                            self.post_allocation['key'], self.post_allocation['value']
                        )
                    ## Copy aliases, if flagged
                    if self.copy_aliases:
                        self.loggit.info('Copy source index aliases "%s"', idx)
                        self.do_copy_aliases(idx, target)
                    ## Delete, if flagged
                    if self.delete_after:
                        self.loggit.info('Deleting source index "%s"', idx)
                        self.client.indices.delete(index=idx)
                    else:  # Let's unset the routing we applied here.
                        self.loggit.info('Unassigning routing for source index: "%s"', idx)
                        self.route_index(idx, 'require', '_name', '')

        except Exception as err:
            # Just in case it fails after attempting to meet this condition
            self._unblock_writes(idx)
            report_failure(err)