Coverage for intelligence_toolkit/tests/unit/anonymize_case_data/test_queries.py: 100%
201 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
4import pytest
5import pandas as pd
6from collections import defaultdict
7from intelligence_toolkit.anonymize_case_data.queries import (
8 get_data_schema,
9 compute_aggregate_graph,
10 compute_synthetic_graph,
11 compute_top_attributes_query,
12 compute_time_series_query,
13)
def test_get_data_schema_with_dataframe():
    """Expect the schema to list each column's distinct, non-empty values."""
    frame = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Red"],
            "Size": ["Large", "Small", ""],
        }
    )

    schema = get_data_schema(frame)

    expectations = (
        ("Color", ("Red", "Blue")),
        ("Size", ("Large", "Small")),
    )
    for column, values in expectations:
        assert column in schema
        for value in values:
            assert value in schema[column]
        # The duplicate "Red" collapses and the empty string is dropped,
        # so each column ends up with exactly two values.
        assert len(schema[column]) == 2
def test_get_data_schema_with_none():
    """Expect an empty defaultdict when no DataFrame is supplied."""
    result = get_data_schema(None)

    assert isinstance(result, defaultdict)
    assert not len(result)
def test_get_data_schema_empty_dataframe():
    """Expect a DataFrame with no columns to produce a schema with no entries."""
    schema = get_data_schema(pd.DataFrame())

    assert not len(schema)
def test_get_data_schema_sorts_values():
    """Expect the values of a column to come back in ascending order."""
    shuffled = ["Z", "A", "M", "B"]

    schema = get_data_schema(pd.DataFrame({"Letter": shuffled}))

    expected = ["A", "B", "M", "Z"]
    assert schema["Letter"] == expected
def test_compute_aggregate_graph_basic():
    """Check the return type and column layout of compute_aggregate_graph.

    Runs with three source/target selections, no filters and no highlight.
    """
    selections = [
        "source:A;target:B",
        "source:A;target:C",
        "source:B;target:C",
    ]
    aggregate_df = pd.DataFrame(
        {"selections": selections, "protected_count": [10, 5, 8]}
    )

    graph = compute_aggregate_graph(aggregate_df, [], "source", "target", None)

    assert isinstance(graph, pd.DataFrame)
    expected_columns = [
        "Source",
        "Target",
        "Count",
        "Highlight",
        "Proportion",
        "Dataset",
    ]
    assert list(graph.columns) == expected_columns
def test_compute_aggregate_graph_with_filters():
    """Expect a "color:Red" filter to leave two rows, all labelled Aggregate."""
    selections = [
        "color:Red;source:A;target:B",
        "color:Blue;source:A;target:C",
        "color:Red;source:B;target:C",
    ]
    aggregate_df = pd.DataFrame(
        {"selections": selections, "protected_count": [10, 5, 8]}
    )
    active_filters = ["color:Red"]

    graph = compute_aggregate_graph(
        aggregate_df,
        active_filters,
        "source",
        "target",
        None,
    )

    # Two of the three selections carry color:Red.
    assert len(graph) == 2
    assert (graph["Dataset"] == "Aggregate").all()
def test_compute_aggregate_graph_with_highlight():
    """Run compute_aggregate_graph with a "highlight:Yes" highlight attribute.

    Only the presence of the "Highlight" and "Proportion" columns is asserted;
    their values are not checked here.
    """
    selections = [
        "source:A;target:B",
        "highlight:Yes;source:A;target:B",
        "source:A;target:C",
    ]
    aggregate_df = pd.DataFrame(
        {"selections": selections, "protected_count": [10, 3, 5]}
    )

    graph = compute_aggregate_graph(
        aggregate_df,
        [],
        "source",
        "target",
        "highlight:Yes",
    )

    for column in ("Highlight", "Proportion"):
        assert column in graph.columns
def test_compute_aggregate_graph_zero_counts_filtered():
    """Expect selections with a protected count of zero to be left out."""
    aggregate_df = pd.DataFrame(
        {
            "selections": ["source:A;target:B", "source:C;target:D"],
            "protected_count": [10, 0],
        }
    )

    graph = compute_aggregate_graph(aggregate_df, [], "source", "target", None)

    # Only the A->B link (count 10) should survive.
    assert len(graph) == 1
    surviving_row = graph.iloc[0]
    assert surviving_row["Count"] == 10
def test_compute_synthetic_graph_basic():
    """Check the return type, column layout and dataset label of the synthetic graph.

    Runs with no filters and an empty highlight attribute.
    """
    synthetic_df = pd.DataFrame(
        {
            "source": ["A", "A", "B"],
            "target": ["X", "Y", "X"],
        }
    )

    graph = compute_synthetic_graph(synthetic_df, [], "source", "target", "")

    assert isinstance(graph, pd.DataFrame)
    expected_columns = [
        "Source",
        "Target",
        "Count",
        "Highlight",
        "Proportion",
        "Dataset",
    ]
    assert list(graph.columns) == expected_columns
    assert (graph["Dataset"] == "Synthetic").all()
def test_compute_synthetic_graph_with_filters():
    """Expect a "color:Red" filter to leave only the links from Red rows."""
    synthetic_df = pd.DataFrame(
        {
            "color": ["Red", "Red", "Blue"],
            "source": ["A", "A", "B"],
            "target": ["X", "Y", "X"],
        }
    )
    active_filters = ["color:Red"]

    graph = compute_synthetic_graph(
        synthetic_df,
        active_filters,
        "source",
        "target",
        "",
    )

    # The two Red rows give the links A->X and A->Y.
    assert len(graph) == 2
def test_compute_synthetic_graph_filters_empty_values():
    """Expect rows with an empty source or target to be left out of the graph."""
    synthetic_df = pd.DataFrame(
        {
            "source": ["A", "", "B"],
            "target": ["X", "Y", ""],
        }
    )

    graph = compute_synthetic_graph(synthetic_df, [], "source", "target", "")

    # Only the first row has both endpoints populated.
    assert len(graph) == 1
    only_link = graph.iloc[0]
    assert only_link["Source"] == "A"
    assert only_link["Target"] == "X"
def test_compute_synthetic_graph_with_highlight():
    """Expect the A->X link to count two rows, one of them highlighted.

    The highlight attribute is "status:Active"; of the two A->X rows only
    one has status "Active".
    """
    synthetic_df = pd.DataFrame(
        {
            "source": ["A", "A", "B"],
            "target": ["X", "X", "X"],
            "status": ["Active", "Inactive", "Active"],
        }
    )

    graph = compute_synthetic_graph(
        synthetic_df,
        [],
        "source",
        "target",
        "status:Active",
    )

    is_a_to_x = (graph["Source"] == "A") & (graph["Target"] == "X")
    a_to_x = graph[is_a_to_x]
    assert len(a_to_x) == 1
    link = a_to_x.iloc[0]
    assert link["Count"] == 2
    assert link["Highlight"] == 1
def test_compute_top_attributes_query_basic():
    """Check type, columns and dataset label with an empty query and no aggregates."""
    synthetic_df = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Red"],
            "Size": ["Large", "Small", "Large"],
        }
    )
    empty_aggregates = pd.DataFrame({"selections": [], "protected_count": []})

    top = compute_top_attributes_query([], synthetic_df, empty_aggregates, [], 0)

    assert isinstance(top, pd.DataFrame)
    expected_columns = ["Attribute", "Attribute Value", "Count", "Dataset"]
    assert list(top.columns) == expected_columns
    assert len(top) >= 1
    assert (top["Dataset"] == "Synthetic").all()
def test_compute_top_attributes_query_with_filter():
    """Run the top-attributes query restricted to Color=Red.

    Only a non-empty result is asserted; the counts themselves are not checked.
    """
    synthetic_df = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Red", "Green"],
            "Size": ["Large", "Small", "Large", "Medium"],
        }
    )
    empty_aggregates = pd.DataFrame({"selections": [], "protected_count": []})
    red_only = [{"attribute": "Color", "value": "Red"}]

    top = compute_top_attributes_query(
        red_only, synthetic_df, empty_aggregates, [], 0
    )

    assert len(top) >= 1
def test_compute_top_attributes_query_show_specific_attributes():
    """Expect only the requested attributes (Color, Size) to be reported."""
    synthetic_df = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Red"],
            "Size": ["Large", "Small", "Large"],
            "Weight": ["Heavy", "Light", "Heavy"],
        }
    )
    empty_aggregates = pd.DataFrame({"selections": [], "protected_count": []})
    requested = ["Color", "Size"]

    top = compute_top_attributes_query(
        [], synthetic_df, empty_aggregates, requested, 0
    )

    reported = top["Attribute"]
    assert reported.isin(["Color", "Size"]).all()
    # Weight was not requested, so it must not be reported.
    assert "Weight" not in reported.values
def test_compute_top_attributes_query_limit_num_values():
    """Expect num_values=2 to cap the result at two rows for five distinct colors."""
    synthetic_df = pd.DataFrame(
        {"Color": ["Red", "Blue", "Green", "Yellow", "Purple"]}
    )
    empty_aggregates = pd.DataFrame({"selections": [], "protected_count": []})
    max_values = 2

    top = compute_top_attributes_query(
        [], synthetic_df, empty_aggregates, [], max_values
    )

    assert len(top) <= 2
def test_compute_top_attributes_query_filters_empty_values():
    """Expect empty-string attribute values to be absent from the result."""
    synthetic_df = pd.DataFrame(
        {
            "Color": ["Red", "", "Blue"],
            "Size": ["Large", "Small", ""],
        }
    )
    empty_aggregates = pd.DataFrame({"selections": [], "protected_count": []})

    top = compute_top_attributes_query([], synthetic_df, empty_aggregates, [], 0)

    reported_values = top["Attribute Value"].values
    assert "" not in reported_values
def test_compute_time_series_query_basic():
    """Check the return type and the columns of compute_time_series_query."""
    synthetic_df = pd.DataFrame(
        {
            "Year": ["2020", "2020", "2021", "2021"],
            "Color": ["Red", "Blue", "Red", "Green"],
        }
    )
    empty_aggregates = pd.DataFrame({"selections": [], "protected_count": []})
    time_attribute = "Year"

    series = compute_time_series_query(
        [], synthetic_df, empty_aggregates, time_attribute, ["Color"]
    )

    assert isinstance(series, pd.DataFrame)
    required_columns = (
        time_attribute,
        "Attribute",
        "Attribute Value",
        "Count",
        "Dataset",
    )
    for column in required_columns:
        assert column in series.columns
def test_compute_time_series_query_fills_missing_times():
    """Expect a row for Blue in 2021 even though the data has no such record.

    Only the presence of the row is asserted; its count is not checked.
    """
    synthetic_df = pd.DataFrame(
        {
            "Year": ["2020", "2020", "2021"],
            "Color": ["Red", "Blue", "Red"],
        }
    )
    empty_aggregates = pd.DataFrame({"selections": [], "protected_count": []})

    series = compute_time_series_query(
        [], synthetic_df, empty_aggregates, "Year", ["Color"]
    )

    in_2021 = series["Year"] == "2021"
    mentions_blue = series["Attribute Value"].str.contains("Blue")
    blue_2021 = series[in_2021 & mentions_blue]
    assert len(blue_2021) >= 1
def test_compute_time_series_query_filters_empty_times():
    """Expect rows with an empty time value to be absent from the result."""
    synthetic_df = pd.DataFrame(
        {
            "Year": ["2020", "", "2021"],
            "Color": ["Red", "Blue", "Green"],
        }
    )
    empty_aggregates = pd.DataFrame({"selections": [], "protected_count": []})

    series = compute_time_series_query(
        [], synthetic_df, empty_aggregates, "Year", ["Color"]
    )

    reported_years = series["Year"].values
    assert "" not in reported_years
def test_compute_top_attributes_query_with_selection():
    """Run compute_time_series_query with a Size=Large query entry.

    Only a non-empty result is asserted.

    NOTE(review): despite the test name, this exercises
    compute_time_series_query, not compute_top_attributes_query. The query
    also names a "Size" attribute that is not a column of the data built
    here, so confirm the intended scenario.
    """
    synthetic_df = pd.DataFrame(
        {
            "Year": ["2020", "2020", "2021", "2021"],
            "Color": ["Red", "Blue", "Red", "Green"],
        }
    )
    empty_aggregates = pd.DataFrame({"selections": [], "protected_count": []})
    selection = [{"attribute": "Size", "value": "Large"}]

    series = compute_time_series_query(
        selection, synthetic_df, empty_aggregates, "Year", ["Color"]
    )

    assert len(series) >= 1
def test_compute_top_attributes_query_with_unions():
    """Query two values of the same attribute (Color=Red and Color=Blue).

    Expects a non-empty result whose rows are all labelled Synthetic.
    """
    synthetic_df = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Green", "Red", "Blue"],
            "Size": ["Large", "Small", "Large", "Medium", "Small"],
        }
    )
    empty_aggregates = pd.DataFrame({"selections": [], "protected_count": []})
    # Two entries for the same attribute form the union under test.
    red_or_blue = [
        {"attribute": "Color", "value": "Red"},
        {"attribute": "Color", "value": "Blue"},
    ]

    top = compute_top_attributes_query(
        red_or_blue, synthetic_df, empty_aggregates, [], 0
    )

    assert len(top) >= 1
    assert (top["Dataset"] == "Synthetic").all()
def test_compute_top_attributes_query_with_aggregate_counts():
    """Run the top-attributes query with populated aggregate data and no query.

    The check passes when at least one of the "Aggregate" or "Synthetic"
    labels appears in the Dataset column; it does not require both.
    """
    synthetic_df = pd.DataFrame(
        {
            "Color": ["Red", "Blue", "Red"],
            "Size": ["Large", "Small", "Large"],
        }
    )
    # Aggregate selections named after values present in the synthetic data.
    aggregate_df = pd.DataFrame(
        {
            "selections": ["Color:Red", "Color:Blue", "Size:Large"],
            "protected_count": [15, 8, 12],
        }
    )

    top = compute_top_attributes_query([], synthetic_df, aggregate_df, [], 0)

    datasets = top["Dataset"].values
    assert any(label in datasets for label in ("Aggregate", "Synthetic"))