Coverage for intelligence_toolkit/tests/unit/anonymize_case_data/test_queries.py: 100%

201 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3 

4import pytest 

5import pandas as pd 

6from collections import defaultdict 

7from intelligence_toolkit.anonymize_case_data.queries import ( 

8 get_data_schema, 

9 compute_aggregate_graph, 

10 compute_synthetic_graph, 

11 compute_top_attributes_query, 

12 compute_time_series_query, 

13) 

14 

15 

def test_get_data_schema_with_dataframe():
    """Check that the schema maps each column to its distinct non-empty values."""
    frame = pd.DataFrame(
        {"Color": ["Red", "Blue", "Red"], "Size": ["Large", "Small", ""]}
    )

    schema = get_data_schema(frame)

    # Both input columns must show up as schema keys.
    assert "Color" in schema
    assert "Size" in schema
    for colour in ("Red", "Blue"):
        assert colour in schema["Color"]
    for size in ("Large", "Small"):
        assert size in schema["Size"]
    # The repeated "Red" is expected once; the empty Size string is expected
    # to be dropped, leaving two entries per column.
    assert len(schema["Color"]) == 2
    assert len(schema["Size"]) == 2

29 

30 

def test_get_data_schema_with_none():
    """Check that passing None yields an empty defaultdict schema."""
    empty_schema = get_data_schema(None)

    assert isinstance(empty_schema, defaultdict)
    assert len(empty_schema) == 0

36 

37 

def test_get_data_schema_empty_dataframe():
    """Check that a DataFrame without columns yields an empty schema."""
    schema = get_data_schema(pd.DataFrame())

    assert len(schema) == 0

44 

45 

def test_get_data_schema_sorts_values():
    """Check that a column's values are returned sorted, not in input order."""
    unsorted_letters = ["Z", "A", "M", "B"]

    schema = get_data_schema(pd.DataFrame({"Letter": unsorted_letters}))

    assert schema["Letter"] == ["A", "B", "M", "Z"]

52 

53 

def test_compute_aggregate_graph_basic():
    """Check the return type and column layout of compute_aggregate_graph."""
    aggregates = pd.DataFrame(
        {
            "selections": [
                "source:A;target:B",
                "source:A;target:C",
                "source:B;target:C",
            ],
            "protected_count": [10, 5, 8],
        }
    )

    # Arguments: no filters, "source" -> "target" edges, no highlight.
    graph = compute_aggregate_graph(aggregates, [], "source", "target", None)

    expected_columns = [
        "Source",
        "Target",
        "Count",
        "Highlight",
        "Proportion",
        "Dataset",
    ]
    assert isinstance(graph, pd.DataFrame)
    assert list(graph.columns) == expected_columns

83 

84 

def test_compute_aggregate_graph_with_filters():
    """Check that a "color:Red" filter leaves only the two Red edges."""
    aggregates = pd.DataFrame(
        {
            "selections": [
                "color:Red;source:A;target:B",
                "color:Blue;source:A;target:C",
                "color:Red;source:B;target:C",
            ],
            "protected_count": [10, 5, 8],
        }
    )

    graph = compute_aggregate_graph(
        aggregates, ["color:Red"], "source", "target", None
    )

    # Two of the three selections carry color:Red.
    assert len(graph) == 2
    assert (graph["Dataset"] == "Aggregate").all()

108 

109 

def test_compute_aggregate_graph_with_highlight():
    """Check that a highlighted aggregate graph exposes its highlight columns."""
    aggregates = pd.DataFrame(
        {
            "selections": [
                "source:A;target:B",
                "highlight:Yes;source:A;target:B",
                "source:A;target:C",
            ],
            "protected_count": [10, 3, 5],
        }
    )

    graph = compute_aggregate_graph(
        aggregates, [], "source", "target", "highlight:Yes"
    )

    # NOTE(review): only column presence is verified here; the highlight
    # counts and proportions themselves are not asserted.
    for column in ("Highlight", "Proportion"):
        assert column in graph.columns

133 

134 

def test_compute_aggregate_graph_zero_counts_filtered():
    """Check that an edge with a protected count of 0 is left out of the graph."""
    aggregates = pd.DataFrame(
        {
            "selections": ["source:A;target:B", "source:C;target:D"],
            "protected_count": [10, 0],
        }
    )

    graph = compute_aggregate_graph(aggregates, [], "source", "target", None)

    # Only the A -> B edge (count 10) is expected to survive.
    assert len(graph) == 1
    surviving_edge = graph.iloc[0]
    assert surviving_edge["Count"] == 10

154 

155 

def test_compute_synthetic_graph_basic():
    """Check the type, column layout and dataset label of a synthetic graph."""
    synthetic = pd.DataFrame(
        {
            "source": ["A", "A", "B"],
            "target": ["X", "Y", "X"],
        }
    )

    # Arguments: no filters, "source" -> "target" edges, empty highlight.
    graph = compute_synthetic_graph(synthetic, [], "source", "target", "")

    expected_columns = [
        "Source",
        "Target",
        "Count",
        "Highlight",
        "Proportion",
        "Dataset",
    ]
    assert isinstance(graph, pd.DataFrame)
    assert list(graph.columns) == expected_columns
    assert (graph["Dataset"] == "Synthetic").all()

182 

183 

def test_compute_synthetic_graph_with_filters():
    """Check that a "color:Red" filter restricts the synthetic graph's edges."""
    synthetic = pd.DataFrame(
        {
            "color": ["Red", "Red", "Blue"],
            "source": ["A", "A", "B"],
            "target": ["X", "Y", "X"],
        }
    )

    graph = compute_synthetic_graph(
        synthetic, ["color:Red"], "source", "target", ""
    )

    # The two Red rows give the edges A -> X and A -> Y.
    assert len(graph) == 2

203 

204 

def test_compute_synthetic_graph_filters_empty_values():
    """Check that rows with an empty source or target produce no edge."""
    synthetic = pd.DataFrame(
        {
            "source": ["A", "", "B"],
            "target": ["X", "Y", ""],
        }
    )

    graph = compute_synthetic_graph(synthetic, [], "source", "target", "")

    # Only the first row has both endpoints filled in.
    assert len(graph) == 1
    only_edge = graph.iloc[0]
    assert only_edge["Source"] == "A"
    assert only_edge["Target"] == "X"

225 

226 

def test_compute_synthetic_graph_with_highlight():
    """Check edge and highlight counts for the A -> X edge."""
    synthetic = pd.DataFrame(
        {
            "source": ["A", "A", "B"],
            "target": ["X", "X", "X"],
            "status": ["Active", "Inactive", "Active"],
        }
    )

    graph = compute_synthetic_graph(
        synthetic, [], "source", "target", "status:Active"
    )

    from_a = graph["Source"] == "A"
    to_x = graph["Target"] == "X"
    ax_rows = graph[from_a & to_x]
    assert len(ax_rows) == 1
    ax_edge = ax_rows.iloc[0]
    # Two input rows form A -> X; only one of them has status Active.
    assert ax_edge["Count"] == 2
    assert ax_edge["Highlight"] == 1

249 

250 

def test_compute_top_attributes_query_basic():
    """Check the shape of the result for an unfiltered top-attributes query."""
    synthetic = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Red"],
            "Size": ["Large", "Small", "Large"],
        }
    )
    no_aggregates = pd.DataFrame({"selections": [], "protected_count": []})

    # Arguments: empty query, no attribute restriction, num_values of 0.
    result = compute_top_attributes_query([], synthetic, no_aggregates, [], 0)

    assert isinstance(result, pd.DataFrame)
    assert list(result.columns) == [
        "Attribute",
        "Attribute Value",
        "Count",
        "Dataset",
    ]
    assert len(result) > 0
    # With no aggregate rows supplied, every row is expected to be synthetic.
    assert (result["Dataset"] == "Synthetic").all()

269 

270 

def test_compute_top_attributes_query_with_filter():
    """Check that a Color=Red query still produces a non-empty result."""
    synthetic = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Red", "Green"],
            "Size": ["Large", "Small", "Large", "Medium"],
        }
    )
    no_aggregates = pd.DataFrame({"selections": [], "protected_count": []})
    red_only = [{"attribute": "Color", "value": "Red"}]

    result = compute_top_attributes_query(
        red_only, synthetic, no_aggregates, [], 0
    )

    # NOTE(review): the intent is presumably to count only Color=Red rows,
    # but this test asserts nothing beyond a non-empty result.
    assert len(result) > 0

287 

288 

def test_compute_top_attributes_query_show_specific_attributes():
    """Check that only the attributes listed in show_attributes are reported."""
    synthetic = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Red"],
            "Size": ["Large", "Small", "Large"],
            "Weight": ["Heavy", "Light", "Heavy"],
        }
    )
    no_aggregates = pd.DataFrame({"selections": [], "protected_count": []})

    result = compute_top_attributes_query(
        [], synthetic, no_aggregates, ["Color", "Size"], 0
    )

    # Weight was not requested, so it must not appear among the attributes.
    assert result["Attribute"].isin(["Color", "Size"]).all()
    assert "Weight" not in result["Attribute"].values

307 

308 

def test_compute_top_attributes_query_limit_num_values():
    """Check that num_values=2 caps the result at two rows."""
    synthetic = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Green", "Yellow", "Purple"],
        }
    )
    no_aggregates = pd.DataFrame({"selections": [], "protected_count": []})

    result = compute_top_attributes_query([], synthetic, no_aggregates, [], 2)

    # Five distinct colours go in; at most two rows may come out.
    assert len(result) <= 2

324 

325 

def test_compute_top_attributes_query_filters_empty_values():
    """Check that empty-string values never appear as an attribute value."""
    synthetic = pd.DataFrame(
        {
            "Color": ["Red", "", "Blue"],
            "Size": ["Large", "Small", ""],
        }
    )
    no_aggregates = pd.DataFrame({"selections": [], "protected_count": []})

    result = compute_top_attributes_query([], synthetic, no_aggregates, [], 0)

    reported_values = result["Attribute Value"].values
    assert "" not in reported_values

342 

343 

def test_compute_time_series_query_basic():
    """Check the result type and the columns of a time-series query."""
    synthetic = pd.DataFrame(
        {
            "Year": ["2020", "2020", "2021", "2021"],
            "Color": ["Red", "Blue", "Red", "Green"],
        }
    )
    no_aggregates = pd.DataFrame({"selections": [], "protected_count": []})
    time_attribute = "Year"

    result = compute_time_series_query(
        [], synthetic, no_aggregates, time_attribute, ["Color"]
    )

    assert isinstance(result, pd.DataFrame)
    # The time attribute's own column must sit alongside the fixed columns.
    for column in (
        time_attribute,
        "Attribute",
        "Attribute Value",
        "Count",
        "Dataset",
    ):
        assert column in result.columns

366 

367 

def test_compute_time_series_query_fills_missing_times():
    """Check that a value absent in one period still gets a row for it."""
    synthetic = pd.DataFrame(
        {
            "Year": ["2020", "2020", "2021"],
            "Color": ["Red", "Blue", "Red"],
        }
    )
    no_aggregates = pd.DataFrame({"selections": [], "protected_count": []})

    result = compute_time_series_query(
        [], synthetic, no_aggregates, "Year", ["Color"]
    )

    # Blue occurs only in 2020 in the input, yet a 2021 row is expected.
    # NOTE(review): that row is presumably meant to carry a count of 0, but
    # the count itself is not asserted here.
    in_2021 = result["Year"] == "2021"
    is_blue = result["Attribute Value"].str.contains("Blue")
    blue_2021 = result[in_2021 & is_blue]
    assert len(blue_2021) > 0

389 

390 

def test_compute_time_series_query_filters_empty_times():
    """Check that rows with an empty time value are left out of the series."""
    synthetic = pd.DataFrame(
        {
            "Year": ["2020", "", "2021"],
            "Color": ["Red", "Blue", "Green"],
        }
    )
    no_aggregates = pd.DataFrame({"selections": [], "protected_count": []})

    result = compute_time_series_query(
        [], synthetic, no_aggregates, "Year", ["Color"]
    )

    reported_years = result["Year"].values
    assert "" not in reported_years

409 

410 

def test_compute_top_attributes_query_with_selection():
    """Check that a time-series query with a selection returns some rows.

    NOTE(review): despite its name, this test calls
    compute_time_series_query, not compute_top_attributes_query. The name is
    kept so existing test IDs stay stable.
    """
    synthetic = pd.DataFrame(
        {
            "Year": ["2020", "2020", "2021", "2021"],
            "Color": ["Red", "Blue", "Red", "Green"],
        }
    )
    no_aggregates = pd.DataFrame({"selections": [], "protected_count": []})
    # NOTE(review): the selection names "Size", which is not a column of the
    # synthetic frame built above, so the assertion below does not show any
    # filtering to "Large" items. Confirm the intended scenario.
    selection = [{"attribute": "Size", "value": "Large"}]

    result = compute_time_series_query(
        selection, synthetic, no_aggregates, "Year", ["Color"]
    )

    assert len(result) > 0

429 

430 

def test_compute_top_attributes_query_with_unions():
    """Check a query that lists two values for the same attribute."""
    synthetic = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Green", "Red", "Blue"],
            "Size": ["Large", "Small", "Large", "Medium", "Small"],
        }
    )
    no_aggregates = pd.DataFrame({"selections": [], "protected_count": []})
    # Two values for Color; this is meant to exercise the has_unions path.
    red_or_blue = [
        {"attribute": "Color", "value": "Red"},
        {"attribute": "Color", "value": "Blue"},
    ]

    result = compute_top_attributes_query(
        red_or_blue, synthetic, no_aggregates, [], 0
    )

    assert len(result) > 0
    # Union queries are expected to report synthetic counts only.
    assert (result["Dataset"] == "Synthetic").all()

453 

454 

def test_compute_top_attributes_query_with_aggregate_counts():
    """Check a union-free query when matching aggregate counts are supplied."""
    synthetic = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Red"],
            "Size": ["Large", "Small", "Large"],
        }
    )
    # Aggregate selections that name values present in the synthetic frame.
    aggregates = pd.DataFrame(
        {
            "selections": ["Color:Red", "Color:Blue", "Size:Large"],
            "protected_count": [15, 8, 12],
        }
    )

    result = compute_top_attributes_query([], synthetic, aggregates, [], 0)

    # NOTE(review): this passes when EITHER dataset label is present; if both
    # labels are required, the condition needs "and". Confirm the intent.
    dataset_labels = result["Dataset"].values
    assert "Aggregate" in dataset_labels or "Synthetic" in dataset_labels