Coverage for intelligence_toolkit/tests/unit/anonymize_case_data/test_api.py: 100%
175 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
4import pytest
5import pandas as pd
6import math
7from unittest.mock import MagicMock, patch, Mock
8from intelligence_toolkit.anonymize_case_data.api import AnonymizeCaseData
9from intelligence_toolkit.anonymize_case_data.synthesizability_statistics import (
10 SynthesizabilityStatistics,
11)
def test_anonymize_case_data_initialization():
    """A new AnonymizeCaseData has zeroed counters and DataFrame-typed data attributes."""
    instance = AnonymizeCaseData()

    # Scalar state starts at zero.
    assert instance.protected_number_of_records == 0
    assert instance.delta == 0

    # Each data/report attribute must be a DataFrame straight after construction.
    frame_attributes = (
        "sensitive_df",
        "aggregate_df",
        "synthetic_aggregate_df",
        "synthetic_df",
        "aggregate_error_report",
        "synthetic_error_report",
    )
    for attribute in frame_attributes:
        assert isinstance(getattr(instance, attribute), pd.DataFrame)
def test_fabrication_strategy_enum():
    """FabricationStrategy exposes the four expected strategy attributes."""
    strategy_enum = AnonymizeCaseData.FabricationStrategy
    for member in ("BALANCED", "PROGRESSIVE", "MINIMIZED", "UNCONTROLLED"):
        assert hasattr(strategy_enum, member)
def test_analyze_synthesizability_basic():
    """analyze_synthesizability yields populated statistics for a small two-column frame."""
    columns = {
        "Color": ["Red", "Blue", "Red"],
        "Size": ["Large", "Small", "Large"],
    }
    analyzer = AnonymizeCaseData()

    result = analyzer.analyze_synthesizability(pd.DataFrame(columns))

    assert isinstance(result, SynthesizabilityStatistics)
    assert result.num_cols == 2
    assert result.overall_att_count > 0
    assert result.possible_combinations > 0
def test_analyze_synthesizability_with_empty_values():
    """Empty-string cells do not prevent analysis; both columns are still reported."""
    columns = {
        "Color": ["Red", "", "Blue"],
        "Size": ["Large", "Small", ""],
    }
    analyzer = AnonymizeCaseData()

    result = analyzer.analyze_synthesizability(pd.DataFrame(columns))

    # The empty strings are expected to be filtered out by the analysis; this test
    # only checks the result type and the column count.
    assert isinstance(result, SynthesizabilityStatistics)
    assert result.num_cols == 2
def test_analyze_synthesizability_with_nan():
    """Missing (None) cells do not prevent analysis from returning statistics."""
    columns = {
        "Color": ["Red", None, "Blue"],
        "Size": ["Large", "Small", None],
    }
    analyzer = AnonymizeCaseData()

    result = analyzer.analyze_synthesizability(pd.DataFrame(columns))

    # The missing values are expected to be filtered out by the analysis; this test
    # only checks the result type.
    assert isinstance(result, SynthesizabilityStatistics)
def test_analyze_synthesizability_distinct_counts():
    """Columns holding a single repeated value give exactly one possible combination."""
    columns = {
        "Color": ["Red", "Red", "Red"],
        "Size": ["Large", "Large", "Large"],
    }
    analyzer = AnonymizeCaseData()

    result = analyzer.analyze_synthesizability(pd.DataFrame(columns))

    # One distinct value in each column: 1 * 1 == 1.
    assert result.possible_combinations == 1
def test_analyze_synthesizability_calculates_combinations():
    """Possible combinations multiply across columns and are also reported per row."""
    columns = {
        "Color": ["Red", "Blue"],
        "Size": ["Large", "Small"],
    }
    analyzer = AnonymizeCaseData()

    result = analyzer.analyze_synthesizability(pd.DataFrame(columns))

    # Two colors times two sizes gives four combinations ...
    assert result.possible_combinations == 4
    # ... spread over two rows, i.e. 4 / 2.
    assert result.possible_combinations_per_row == 2.0
def test_analyze_synthesizability_mean_vals_per_record():
    """mean_vals_per_record averages the number of non-empty values in each row."""
    columns = {
        "A": ["X", "Y"],
        "B": ["1", ""],
        "C": ["", "2"],
    }
    analyzer = AnonymizeCaseData()

    result = analyzer.analyze_synthesizability(pd.DataFrame(columns))

    # First row holds (X, 1), second row holds (Y, 2): two values each, so the mean is 2.0.
    assert result.mean_vals_per_record == 2.0
def test_analyze_synthesizability_excess_combinations_ratio():
    """The excess_combinations_ratio statistic is positive for a 2x2 frame."""
    columns = {
        "A": ["X", "Y"],
        "B": ["1", "2"],
    }
    analyzer = AnonymizeCaseData()

    result = analyzer.analyze_synthesizability(pd.DataFrame(columns))

    # Only the sign is checked here, not the exact ratio.
    assert result.excess_combinations_ratio > 0
@patch("intelligence_toolkit.anonymize_case_data.api.DpAggregateSeededSynthesizer")
@patch("intelligence_toolkit.anonymize_case_data.api.Dataset")
@patch("intelligence_toolkit.anonymize_case_data.api.df_functions.fix_null_ints")
def test_anonymize_case_data_method(mock_fix_null_ints, mock_dataset, mock_synth_class):
    """anonymize_case_data fits and samples the synthesizer and records the protected count.

    The synthesizer, Dataset and fix_null_ints are all patched in the api module, so
    this test only observes how anonymize_case_data drives those collaborators.
    """
    input_columns = {"A": [1, 2], "B": [3, 4]}

    # fix_null_ints is stubbed to hand back a frame equal to the input.
    mock_fix_null_ints.return_value = pd.DataFrame(input_columns)

    # Dataset stubs: both the from_data_frame result and a directly constructed
    # Dataset report aggregates; raw_data_to_data_frame yields a one-row frame.
    dataset_stub = MagicMock()
    dataset_stub.get_aggregates.return_value = {"A:1": 10}
    mock_dataset.from_data_frame.return_value = dataset_stub
    mock_dataset.return_value.get_aggregates.return_value = {"A:1": 9}
    mock_dataset.raw_data_to_data_frame.return_value = pd.DataFrame({"A": [1], "B": [3]})

    # Synthesizer stub reporting 100 protected records.
    synthesizer_stub = MagicMock()
    synthesizer_stub.get_dp_number_of_records.return_value = 100
    synthesizer_stub.get_dp_aggregates.return_value = {"A:1": 10}
    synthesizer_stub.sample.return_value = MagicMock()
    mock_synth_class.return_value = synthesizer_stub

    instance = AnonymizeCaseData()
    instance.anonymize_case_data(pd.DataFrame(input_columns), epsilon=1.0)

    # The synthesizer must have been fit and sampled, and its record count stored.
    synthesizer_stub.fit.assert_called()
    synthesizer_stub.sample.assert_called()
    assert instance.protected_number_of_records == 100
def test_get_data_schema():
    """get_data_schema includes every column of the synthetic frame."""
    instance = AnonymizeCaseData()
    instance.synthetic_df = pd.DataFrame(
        {"Color": ["Red", "Blue"], "Size": ["Large", "Small"]}
    )

    schema = instance.get_data_schema()

    for column in ("Color", "Size"):
        assert column in schema
@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_aggregate_graph")
def test_compute_aggregate_graph_df(mock_compute):
    """compute_aggregate_graph_df calls queries.compute_aggregate_graph and returns a DataFrame."""
    mock_compute.return_value = pd.DataFrame({"Source": ["A"], "Target": ["B"]})

    instance = AnonymizeCaseData()
    instance.aggregate_df = pd.DataFrame(
        {"selections": ["A;B"], "protected_count": [10]}
    )

    graph_df = instance.compute_aggregate_graph_df([], "source", "target", "")

    mock_compute.assert_called()
    assert isinstance(graph_df, pd.DataFrame)
@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_synthetic_graph")
def test_compute_synthetic_graph_df(mock_compute):
    """compute_synthetic_graph_df calls queries.compute_synthetic_graph and returns a DataFrame."""
    mock_compute.return_value = pd.DataFrame({"Source": ["A"], "Target": ["B"]})

    instance = AnonymizeCaseData()
    instance.synthetic_df = pd.DataFrame({"source": ["A"], "target": ["B"]})

    graph_df = instance.compute_synthetic_graph_df([], "source", "target", "")

    mock_compute.assert_called()
    assert isinstance(graph_df, pd.DataFrame)
@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_time_series_query")
def test_compute_time_series_query_df(mock_compute):
    """compute_time_series_query_df calls the patched query helper and returns a DataFrame."""
    mock_compute.return_value = pd.DataFrame({"Year": ["2020"], "Count": [10]})

    instance = AnonymizeCaseData()
    instance.synthetic_df = pd.DataFrame({"Year": ["2020"], "Value": [10]})
    # Aggregate frame has the expected columns but no rows.
    instance.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})

    series_df = instance.compute_time_series_query_df([], "Year", ["Value"])

    mock_compute.assert_called()
    assert isinstance(series_df, pd.DataFrame)
@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_top_attributes_query")
def test_compute_top_attributes_query_df(mock_compute):
    """compute_top_attributes_query_df calls the patched query helper and returns a DataFrame."""
    mock_compute.return_value = pd.DataFrame({"Attribute": ["Color"], "Count": [10]})

    instance = AnonymizeCaseData()
    instance.synthetic_df = pd.DataFrame({"Color": ["Red"]})
    # Aggregate frame has the expected columns but no rows.
    instance.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})

    top_df = instance.compute_top_attributes_query_df([], ["Color"], 10)

    mock_compute.assert_called()
    assert isinstance(top_df, pd.DataFrame)
@patch("intelligence_toolkit.anonymize_case_data.api.visuals.get_bar_chart")
def test_get_bar_chart_fig(mock_get_chart):
    """get_bar_chart_fig returns the figure produced by visuals.get_bar_chart."""
    expected_fig = MagicMock()
    mock_get_chart.return_value = expected_fig

    instance = AnonymizeCaseData()
    instance.synthetic_df = pd.DataFrame({"Color": ["Red", "Blue"]})
    instance.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})

    # The call returns a (figure, chart data) pair; only the figure is checked.
    fig, _ = instance.get_bar_chart_fig(
        [], ["Color"], "record", 800, 600, ["#ff0000"], 10
    )

    mock_get_chart.assert_called()
    assert fig == expected_fig
@patch("intelligence_toolkit.anonymize_case_data.api.visuals.get_line_chart")
@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_time_series_query")
def test_get_line_chart_fig(mock_compute, mock_get_chart):
    """get_line_chart_fig returns the figure produced by visuals.get_line_chart."""
    expected_fig = MagicMock()
    mock_get_chart.return_value = expected_fig
    # The time-series query is stubbed too, so no real aggregation runs.
    mock_compute.return_value = pd.DataFrame({"Year": ["2020"], "Count": [10]})

    instance = AnonymizeCaseData()
    instance.synthetic_df = pd.DataFrame({"Year": ["2020"], "Value": [10]})
    instance.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})

    # The call returns a (figure, chart data) pair; only the figure is checked.
    fig, _ = instance.get_line_chart_fig(
        [], ["Value"], "record", "Year", 800, 600, ["#ff0000"]
    )

    mock_get_chart.assert_called()
    assert fig == expected_fig
@patch("intelligence_toolkit.anonymize_case_data.api.visuals.get_flow_chart")
@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_aggregate_graph")
def test_get_flow_chart_fig_with_aggregate(mock_compute_agg, mock_get_chart):
    """With an empty selection, get_flow_chart_fig builds its graph via the aggregate query."""
    expected_fig = MagicMock()
    mock_get_chart.return_value = expected_fig
    mock_compute_agg.return_value = pd.DataFrame({"Source": ["A"], "Target": ["B"]})

    instance = AnonymizeCaseData()
    instance.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})
    instance.synthetic_df = pd.DataFrame({"source": ["A"], "target": ["B"]})

    # Only source + target are involved (2 attributes), so the aggregate path is expected.
    fig, _ = instance.get_flow_chart_fig(
        [], "source", "target", "", 800, 600, "record", ["#ff0000"]
    )

    mock_compute_agg.assert_called()
    assert fig == expected_fig
@patch("intelligence_toolkit.anonymize_case_data.api.visuals.get_flow_chart")
@patch("intelligence_toolkit.anonymize_case_data.api.queries.compute_synthetic_graph")
def test_get_flow_chart_fig_with_synthetic(mock_compute_syn, mock_get_chart):
    """With three selected attributes, get_flow_chart_fig builds its graph via the synthetic query."""
    expected_fig = MagicMock()
    mock_get_chart.return_value = expected_fig
    mock_compute_syn.return_value = pd.DataFrame({"Source": ["A"], "Target": ["B"]})

    instance = AnonymizeCaseData()
    instance.aggregate_df = pd.DataFrame({"selections": [], "protected_count": []})
    instance.synthetic_df = pd.DataFrame({"source": ["A"], "target": ["B"]})

    # Three selections plus source and target exceed 4 attributes, so the
    # synthetic path is expected instead of the aggregate one.
    selection = [
        {"attribute": name, "value": value}
        for name, value in (("A", "1"), ("B", "2"), ("C", "3"))
    ]
    fig, _ = instance.get_flow_chart_fig(
        selection, "source", "target", "", 800, 600, "record", ["#ff0000"]
    )

    mock_compute_syn.assert_called()
    assert fig == expected_fig
def test_analyze_synthesizability_single_row():
    """A one-row, two-column frame reports two columns and one combination per row."""
    single_row = pd.DataFrame({"A": [1], "B": [2]})
    analyzer = AnonymizeCaseData()

    result = analyzer.analyze_synthesizability(single_row)

    assert result.num_cols == 2
    # One combination over one row.
    assert result.possible_combinations_per_row == 1.0