Coverage for /Users/buh/.pyenv/versions/3.12.2/envs/pii/lib/python3.12/site-packages/es_pii_tool/redacters/index.py: 68%

122 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-10-01 16:39 -0600

1"""Redact data from an Elasticsearch index""" 

2 

3import typing as t 

4import logging 

5from dotmap import DotMap # type: ignore 

6from es_pii_tool.exceptions import ( 

7 BadClientResult, 

8 FatalError, 

9 MissingIndex, 

10) 

11from es_pii_tool.task import Task 

12from es_pii_tool.helpers.utils import ( 

13 exception_msgmaker, 

14 get_field_matches, 

15) 

16from es_pii_tool.helpers import elastic_api as api 

17from es_pii_tool.redacters.snapshot import RedactSnapshot 

18 

19if t.TYPE_CHECKING: 

20 from es_pii_tool.job import Job 

21 

22logger = logging.getLogger(__name__) 

23 

24 

class RedactIndex:
    """Redact index per settings

    Wraps the redaction of a single index in a :class:`Task` belonging to the
    supplied job. The index is verified at construction time; :meth:`run`
    then queries for matching documents, checks the expected fields, looks up
    the ILM phase and dispatches to either :meth:`snapshot_redact` (phases
    ``cold`` and ``frozen``) or :meth:`normal_redact` (everything else).

    The outcome is exposed through the :attr:`success` property.
    """

    def __init__(self, index: str, job: 'Job', counter: int):
        """Create the task for ``index`` and verify the index is present.

        :param index: Name of the index to redact.
        :param job: The job this redaction belongs to.
        :param counter: Number of indices processed so far; incremented by
            this object when its index has been processed.

        :raises ValueError: via :meth:`verify_index` if the index cannot be
            verified.
        """
        self.task = Task(job, index=index, id_suffix='REDACT-INDEX')
        self.index = index
        self.counter = counter
        self.data = DotMap()
        # Seed the backing attribute so ``success`` can be read before run()
        # has been called instead of raising AttributeError.
        self._success = False
        self.verify_index()

    @property
    def success(self) -> bool:
        """Was the redaction a success?"""
        return self._success

    @success.setter
    def success(self, value: bool) -> None:
        self._success = value

    def end_in_failure(
        self,
        exception: 't.Union[BadClientResult, MissingIndex]',
        reraise: bool = False,
        func: t.Union[t.Callable, None] = None,
        kwargs: t.Union[t.Dict[str, t.Union[bool, str]], None] = None,
    ) -> None:
        """For steps and checks that end in failure, we lump you into this method

        The exception is turned into a message and logged at CRITICAL level.

        :param exception: The exception that caused the failure.
        :param reraise: If ``True``, raise :class:`FatalError` (chained to
            ``exception``) after ``func`` has been called.
        :param func: Optional callable to invoke with ``kwargs`` (for
            example the task ender).
        :param kwargs: Keyword arguments for ``func``. If it contains a
            ``logmsg`` key, that value is replaced with the generated
            message. The caller's dict is not modified.

        :raises FatalError: if ``reraise`` is true.
        """
        msg = exception_msgmaker(exception)
        logger.critical(msg)
        if func:
            call_kwargs: t.Dict[str, t.Union[bool, str]]
            if kwargs is None:
                logger.error('Empty kwargs passed')
                call_kwargs = {}
            else:
                # Work on a copy so the caller's dict is left untouched
                call_kwargs = dict(kwargs)
            if 'logmsg' in call_kwargs:  # For the task ender
                call_kwargs['logmsg'] = msg
            func(**call_kwargs)
        if reraise:
            raise FatalError(msg, exception) from exception

    def verify_index(self) -> None:
        """Verify the index exists

        :raises ValueError: if the index changed or is missing.
        """
        # If the index name changed because of an ILM phase shift from hot to
        # cold or cold to frozen, then we should verify the name change here.
        # We should raise an exception if the name of the index changed or it
        # disappeared.
        if api.verify_index(self.task.job.client, self.index):
            return
        msg = f'Halting execution: Index {self.index} changed or is missing.'
        logger.critical(msg)
        self.success = False
        raise ValueError(msg, 'index not found as expected', self.index)

    def run_query(self) -> None:
        """Run the query

        Stores the search response in ``self.data.result`` and the total hit
        count in ``self.data.hits``. When there are no hits, the counter is
        incremented and the task is ended as completed, since an index can
        be in the pattern without containing any matches.
        """
        response = api.do_search(
            self.task.job.client,
            self.index,
            self.task.job.config['query'],
            size=10000,
        )
        self.data.result = DotMap(dict(response))
        self.data.hits = self.data.result.hits.total.value
        logger.debug('Checking document fields on index: %s...', self.index)
        if self.data.hits == 0:
            self.counter += 1
            logger.debug(
                'Documents matching redaction query not found on index: %s',
                self.index,
            )
            msg = f'Index {self.counter} of {self.task.job.total} processed...'
            logger.info(msg)
            # Record success for this task but send msg to the log field
            # An index could be in the pattern but have no matches.
            self.task.end(True, logmsg=msg)
        self.task.add_log(f"Hits: {self.data.hits}")

    def verify_fields(self) -> None:
        """Verify the fields in the query results match what we expect

        If no field matches are found the task is ended as completed; this
        is logged as a warning and is not treated as a fatal error.
        """
        matches = get_field_matches(self.task.job.config, self.data.result.toDict())
        if matches > 0:
            return
        msg = f'Fields required for redaction not found on index: {self.index}'
        logger.warning(msg)
        self.task.end(completed=True, logmsg=msg)
        logger.warning(
            'Not a fatal error. Index in pattern does not have the specified fields'
        )

    def get_phase(self) -> None:
        """Get the ILM phase (if any) for the index

        Stores the phase in ``self.data.phase``, falling back to a
        placeholder string when the lookup returns a falsy value.

        :raises FatalError: if the index is missing; the task is ended with
            errors first.
        """
        nope = 'Not assigned an ILM Phase'
        try:
            self.data.phase = api.get_phase(self.task.job.client, self.index) or nope
        except MissingIndex as exc:
            kwargs = {'completed': False, 'errors': True, 'logmsg': 'replaceme'}
            self.end_in_failure(exc, reraise=True, func=self.task.end, kwargs=kwargs)
        logger.debug('Index in phase: %s', self.data.phase.upper())
        self.task.add_log(f'ILM Phase: {self.data.phase}')

    def normal_redact(self) -> bool:
        """Redact data from a normal (not searchable-snapshot) index

        :returns: ``True`` if the redaction call succeeded or was skipped
            because the job is a dry-run; ``False`` if the redaction failed,
            in which case the task has already been ended with errors.
        """
        msg = 'Initiating redaction of data from writeable index...'
        logger.info(msg)
        self.task.add_log(msg)
        # As the redact_from_index function doesn't track dry-run, we have to
        # do it
        if self.task.job.dry_run:
            msg = f'DRY-RUN: Will not redact data from {self.index}'
            logger.info(msg)
            self.task.add_log(msg)
            return True
        msg = f'Redacting data from {self.index}'
        logger.info(msg)
        self.task.add_log(msg)
        try:
            api.redact_from_index(
                self.task.job.client, self.index, self.task.job.config
            )
        except (MissingIndex, BadClientResult) as exc:
            kwargs = {'completed': False, 'errors': True, 'logmsg': 'replaceme'}
            # Deliberately not re-raised: the failure is recorded on the task
            # and reported to the caller through the return value.
            self.end_in_failure(exc, reraise=False, func=self.task.end, kwargs=kwargs)
            return False
        return True

    def snapshot_redact(self) -> None:
        """Redact data from searchable snapshot-backed index

        Any exception raised while building or running the
        :class:`RedactSnapshot` object is logged at CRITICAL level and
        re-raised unchanged.
        """
        msg = 'Initiating redaction of data from mounted searchable snapshot...'
        logger.info(msg)
        self.task.add_log(msg)
        try:
            snp = RedactSnapshot(self.index, self.task.job, self.data.phase)
        except Exception as exc:
            logger.critical('Unable to build RedactSnapshot object. Exception: %s', exc)
            raise
        try:
            snp.run()
        except Exception as exc:
            logger.critical('Unable to run RedactSnapshot object. Exception: %s', exc)
            raise

    def run(self) -> None:
        """Do the actual run

        Sets :attr:`success` to ``True`` when the task was already finished,
        when an earlier step ended the task as completed (no hits or no
        matching fields), or when the redaction itself went through. Sets it
        to ``False`` when :meth:`normal_redact` reports a failure.
        """
        if self.task.finished():
            self.success = True
            return
        # Log task start time
        self.task.begin()
        self.run_query()
        if self.task.completed:
            self.success = True
            return
        self.verify_fields()
        if self.task.completed:
            self.success = True
            return
        self.get_phase()
        if self.data.phase in ('cold', 'frozen'):
            self.snapshot_redact()
        elif not self.normal_redact():
            # The task was already ended with errors; do not overwrite that
            # record with a completed/'DONE' entry or report success.
            self.success = False
            return
        # If we have reached this point, we've succeeded.
        self.counter += 1
        msg = f'Index {self.counter} of {self.task.job.total} processed...'
        logger.info(msg)
        self.task.add_log(msg)
        self.task.end(completed=True, logmsg='DONE')
        self.success = True