Coverage for intelligence_toolkit/tests/unit/anonymize_case_data/test_api.py: 100%

175 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3 

4import pytest 

5import pandas as pd 

6import math 

7from unittest.mock import MagicMock, patch, Mock 

8from intelligence_toolkit.anonymize_case_data.api import AnonymizeCaseData 

9from intelligence_toolkit.anonymize_case_data.synthesizability_statistics import ( 

10 SynthesizabilityStatistics, 

11) 

12 

13 

def test_anonymize_case_data_initialization():
    """Check the default state of a newly constructed AnonymizeCaseData.

    The two numeric attributes must equal zero and every data/report
    attribute must be a pandas DataFrame.
    """
    anonymizer = AnonymizeCaseData()

    # Numeric defaults.
    assert anonymizer.protected_number_of_records == 0
    assert anonymizer.delta == 0

    # Data holders and error reports are all expected to be DataFrames.
    frame_attributes = (
        "sensitive_df",
        "aggregate_df",
        "synthetic_aggregate_df",
        "synthetic_df",
        "aggregate_error_report",
        "synthetic_error_report",
    )
    for attribute in frame_attributes:
        assert isinstance(getattr(anonymizer, attribute), pd.DataFrame)

25 

26 

def test_fabrication_strategy_enum():
    """Check that FabricationStrategy exposes each expected member name."""
    expected_members = ("BALANCED", "PROGRESSIVE", "MINIMIZED", "UNCONTROLLED")
    for member_name in expected_members:
        assert hasattr(AnonymizeCaseData.FabricationStrategy, member_name)

33 

34 

def test_analyze_synthesizability_basic():
    """Analyze a small two-column frame and sanity-check the returned statistics."""
    columns = {
        "Color": ["Red", "Blue", "Red"],
        "Size": ["Large", "Small", "Large"],
    }
    frame = pd.DataFrame(columns)

    result = AnonymizeCaseData().analyze_synthesizability(frame)

    assert isinstance(result, SynthesizabilityStatistics)
    assert result.num_cols == 2
    assert result.overall_att_count > 0
    assert result.possible_combinations > 0

50 

51 

def test_analyze_synthesizability_with_empty_values():
    """Analyze a frame containing empty strings; the call must still return statistics."""
    columns = {
        "Color": ["Red", "", "Blue"],
        "Size": ["Large", "Small", ""],
    }
    frame = pd.DataFrame(columns)

    result = AnonymizeCaseData().analyze_synthesizability(frame)

    # Empty strings are expected to be ignored as values; only the result
    # type and the column count are asserted here.
    assert isinstance(result, SynthesizabilityStatistics)
    assert result.num_cols == 2

66 

67 

def test_analyze_synthesizability_with_nan():
    """Analyze a frame containing None entries; the call must still return statistics."""
    columns = {
        "Color": ["Red", None, "Blue"],
        "Size": ["Large", "Small", None],
    }
    frame = pd.DataFrame(columns)

    result = AnonymizeCaseData().analyze_synthesizability(frame)

    # Missing values are expected to be ignored; only the result type is asserted.
    assert isinstance(result, SynthesizabilityStatistics)

81 

82 

def test_analyze_synthesizability_distinct_counts():
    """With one distinct value per column there is exactly one possible combination."""
    columns = {
        "Color": ["Red", "Red", "Red"],
        "Size": ["Large", "Large", "Large"],
    }
    frame = pd.DataFrame(columns)

    result = AnonymizeCaseData().analyze_synthesizability(frame)

    # 1 distinct color * 1 distinct size
    assert result.possible_combinations == 1

96 

97 

def test_analyze_synthesizability_calculates_combinations():
    """Check the total and per-row combination counts for a 2x2 frame."""
    columns = {
        "Color": ["Red", "Blue"],
        "Size": ["Large", "Small"],
    }
    frame = pd.DataFrame(columns)

    result = AnonymizeCaseData().analyze_synthesizability(frame)

    # Two colors times two sizes gives four combinations over two rows.
    assert result.possible_combinations == 4
    assert result.possible_combinations_per_row == 2.0

112 

113 

def test_analyze_synthesizability_mean_vals_per_record():
    """Check the mean number of non-empty values per record."""
    columns = {
        "A": ["X", "Y"],
        "B": ["1", ""],
        "C": ["", "2"],
    }
    frame = pd.DataFrame(columns)

    result = AnonymizeCaseData().analyze_synthesizability(frame)

    # First row holds (X, 1) and second row holds (Y, 2): two values each.
    assert result.mean_vals_per_record == 2.0

128 

129 

def test_analyze_synthesizability_excess_combinations_ratio():
    """Check that a positive excess_combinations_ratio is reported."""
    columns = {
        "A": ["X", "Y"],
        "B": ["1", "2"],
    }
    frame = pd.DataFrame(columns)

    result = AnonymizeCaseData().analyze_synthesizability(frame)

    assert result.excess_combinations_ratio > 0

143 

144 

@patch("intelligence_toolkit.anonymize_case_data.api.DpAggregateSeededSynthesizer")
@patch("intelligence_toolkit.anonymize_case_data.api.Dataset")
@patch("intelligence_toolkit.anonymize_case_data.api.df_functions.fix_null_ints")
def test_anonymize_case_data_method(mock_fix_null_ints, mock_dataset, mock_synth_class):
    """Run anonymize_case_data with the synthesizer, Dataset and null-int fixer mocked.

    Patches are applied bottom-up, so the mock arguments arrive in the
    reverse order of the decorators. The test checks that the synthesizer
    mock was fitted and sampled, and that the mocked record count (100)
    ends up in ``protected_number_of_records``.
    """
    # Synthesizer stand-in returned by the patched class.
    synthesizer = MagicMock()
    synthesizer.get_dp_number_of_records.return_value = 100
    synthesizer.get_dp_aggregates.return_value = {"A:1": 10}
    synthesizer.sample.return_value = MagicMock()
    mock_synth_class.return_value = synthesizer

    # Dataset stand-ins: one for from_data_frame(...), one for Dataset(...).
    dataset_from_frame = MagicMock()
    dataset_from_frame.get_aggregates.return_value = {"A:1": 10}
    mock_dataset.from_data_frame.return_value = dataset_from_frame
    mock_dataset.return_value.get_aggregates.return_value = {"A:1": 9}
    mock_dataset.raw_data_to_data_frame.return_value = pd.DataFrame(
        {"A": [1], "B": [3]}
    )

    mock_fix_null_ints.return_value = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

    anonymizer = AnonymizeCaseData()
    input_frame = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

    anonymizer.anonymize_case_data(input_frame, epsilon=1.0)

    assert synthesizer.fit.called
    assert synthesizer.sample.called
    assert anonymizer.protected_number_of_records == 100

173 

174 

def test_get_data_schema():
    """Check that the schema built from synthetic_df contains each column name."""
    anonymizer = AnonymizeCaseData()
    anonymizer.synthetic_df = pd.DataFrame(
        {
            "Color": ["Red", "Blue"],
            "Size": ["Large", "Small"],
        }
    )

    data_schema = anonymizer.get_data_schema()

    for column_name in ("Color", "Size"):
        assert column_name in data_schema

183 

184 

@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_aggregate_graph")
def test_compute_aggregate_graph_df(mock_compute):
    """Check that compute_aggregate_graph_df invokes the patched query and returns a DataFrame."""
    mock_compute.return_value = pd.DataFrame({"Source": ["A"], "Target": ["B"]})

    anonymizer = AnonymizeCaseData()
    anonymizer.aggregate_df = pd.DataFrame(
        {"selections": ["A;B"], "protected_count": [10]}
    )

    graph_df = anonymizer.compute_aggregate_graph_df([], "source", "target", "")

    assert mock_compute.called
    assert isinstance(graph_df, pd.DataFrame)

196 

197 

@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_synthetic_graph")
def test_compute_synthetic_graph_df(mock_compute):
    """Check that compute_synthetic_graph_df invokes the patched query and returns a DataFrame."""
    mock_compute.return_value = pd.DataFrame({"Source": ["A"], "Target": ["B"]})

    anonymizer = AnonymizeCaseData()
    anonymizer.synthetic_df = pd.DataFrame({"source": ["A"], "target": ["B"]})

    graph_df = anonymizer.compute_synthetic_graph_df([], "source", "target", "")

    assert mock_compute.called
    assert isinstance(graph_df, pd.DataFrame)

209 

210 

@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_time_series_query")
def test_compute_time_series_query_df(mock_compute):
    """Check that compute_time_series_query_df invokes the patched query and returns a DataFrame."""
    mock_compute.return_value = pd.DataFrame({"Year": ["2020"], "Count": [10]})

    anonymizer = AnonymizeCaseData()
    anonymizer.synthetic_df = pd.DataFrame({"Year": ["2020"], "Value": [10]})
    anonymizer.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})

    series_df = anonymizer.compute_time_series_query_df([], "Year", ["Value"])

    assert mock_compute.called
    assert isinstance(series_df, pd.DataFrame)

223 

224 

@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_top_attributes_query")
def test_compute_top_attributes_query_df(mock_compute):
    """Check that compute_top_attributes_query_df invokes the patched query and returns a DataFrame."""
    mock_compute.return_value = pd.DataFrame({"Attribute": ["Color"], "Count": [10]})

    anonymizer = AnonymizeCaseData()
    anonymizer.synthetic_df = pd.DataFrame({"Color": ["Red"]})
    anonymizer.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})

    top_df = anonymizer.compute_top_attributes_query_df([], ["Color"], 10)

    assert mock_compute.called
    assert isinstance(top_df, pd.DataFrame)

237 

238 

@patch("intelligence_toolkit.anonymize_case_data.api.visuals.get_bar_chart")
def test_get_bar_chart_fig(mock_get_chart):
    """Check that get_bar_chart_fig returns the figure produced by the patched chart helper."""
    expected_figure = MagicMock()
    mock_get_chart.return_value = expected_figure

    anonymizer = AnonymizeCaseData()
    anonymizer.synthetic_df = pd.DataFrame({"Color": ["Red", "Blue"]})
    anonymizer.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})

    # The second element of the returned pair is not inspected by this test.
    figure, _chart_df = anonymizer.get_bar_chart_fig(
        [], ["Color"], "record", 800, 600, ["#ff0000"], 10
    )

    assert mock_get_chart.called
    assert figure == expected_figure

252 

253 

@patch("intelligence_toolkit.anonymize_case_data.api.visuals.get_line_chart")
@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_time_series_query")
def test_get_line_chart_fig(mock_compute, mock_get_chart):
    """Check that get_line_chart_fig returns the figure produced by the patched chart helper.

    The innermost patch (the time-series query) is passed as the first
    mock argument and the chart helper as the second.
    """
    expected_figure = MagicMock()
    mock_compute.return_value = pd.DataFrame({"Year": ["2020"], "Count": [10]})
    mock_get_chart.return_value = expected_figure

    anonymizer = AnonymizeCaseData()
    anonymizer.synthetic_df = pd.DataFrame({"Year": ["2020"], "Value": [10]})
    anonymizer.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})

    # The second element of the returned pair is not inspected by this test.
    figure, _chart_df = anonymizer.get_line_chart_fig(
        [], ["Value"], "record", "Year", 800, 600, ["#ff0000"]
    )

    assert mock_get_chart.called
    assert figure == expected_figure

269 

270 

@patch("intelligence_toolkit.anonymize_case_data.api.visuals.get_flow_chart")
@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_aggregate_graph")
def test_get_flow_chart_fig_with_aggregate(mock_compute_agg, mock_get_chart):
    """Check that get_flow_chart_fig uses the aggregate graph query when nothing is selected."""
    expected_figure = MagicMock()
    mock_compute_agg.return_value = pd.DataFrame({"Source": ["A"], "Target": ["B"]})
    mock_get_chart.return_value = expected_figure

    anonymizer = AnonymizeCaseData()
    anonymizer.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})
    anonymizer.synthetic_df = pd.DataFrame({"source": ["A"], "target": ["B"]})

    # Only the source and target attributes are involved (no selection),
    # so the aggregate path is expected to be taken.
    figure, _chart_df = anonymizer.get_flow_chart_fig(
        [], "source", "target", "", 800, 600, "record", ["#ff0000"]
    )

    assert mock_compute_agg.called
    assert figure == expected_figure

287 

288 

@patch("intelligence_toolkit.anonymize_case_data.api.visuals.get_flow_chart")
@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_synthetic_graph")
def test_get_flow_chart_fig_with_synthetic(mock_compute_syn, mock_get_chart):
    """Check that get_flow_chart_fig uses the synthetic graph query for a larger selection."""
    expected_figure = MagicMock()
    mock_compute_syn.return_value = pd.DataFrame({"Source": ["A"], "Target": ["B"]})
    mock_get_chart.return_value = expected_figure

    anonymizer = AnonymizeCaseData()
    anonymizer.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})
    anonymizer.synthetic_df = pd.DataFrame({"source": ["A"], "target": ["B"]})

    # Three selected attributes plus source and target exceed four
    # attributes in total, so the synthetic path is expected to be taken.
    selection = [
        {"attribute": attribute, "value": value}
        for attribute, value in (("A", "1"), ("B", "2"), ("C", "3"))
    ]
    figure, _chart_df = anonymizer.get_flow_chart_fig(
        selection, "source", "target", "", 800, 600, "record", ["#ff0000"]
    )

    assert mock_compute_syn.called
    assert figure == expected_figure

312 

313 

def test_analyze_synthesizability_single_row():
    """Check the column count and per-row combinations for a one-row frame."""
    single_row = pd.DataFrame({"A": [1], "B": [2]})

    result = AnonymizeCaseData().analyze_synthesizability(single_row)

    assert result.num_cols == 2
    # One possible combination divided by one row.
    assert result.possible_combinations_per_row == 1.0