Coverage for intelligence_toolkit/tests/smoke/test_anonymize_case_data.py: 0%
62 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
3#
4import shutil
5from functools import wraps
6from pathlib import Path
7from typing import ClassVar
9import pandas as pd
10import pytest
12from intelligence_toolkit.anonymize_case_data.api import (
13 AnonymizeCaseData,
14 SynthesizabilityStatistics,
15)
16from intelligence_toolkit.anonymize_case_data.visuals import color_schemes
17from intelligence_toolkit.helpers import df_functions
19example_outputs_folder = "./example_outputs/anonymize_case_data"
def cleanup(skip: bool = False):
    """Build a decorator that removes a test's output folder after it runs.

    Args:
        skip: When True, nothing is deleted after the wrapped function runs.

    Returns:
        A decorator. The wrapped function's return value and any exception it
        raises pass through unchanged. Afterwards, unless ``skip`` is set,
        ``<input_path>/anonymize_case_data`` is removed, where ``input_path``
        is read from the keyword arguments of the call. If the call carries
        no ``input_path`` keyword, no removal is attempted.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            finally:
                # Look the path up defensively: a KeyError raised from this
                # ``finally`` clause would replace the wrapped function's own
                # result or failure with an unrelated error.
                input_path = None if skip else kwargs.get("input_path")
                if input_path is not None:
                    shutil.rmtree(
                        Path(input_path) / "anonymize_case_data",
                        ignore_errors=True,
                    )

        return wrapper

    return decorator
class TestACD:
    """Smoke test of AnonymizeCaseData against the customer complaints example data."""

    @pytest.fixture()
    def dataset(self):
        """Read the prepared customer complaints CSV into a DataFrame."""
        return pd.read_csv(
            f"{example_outputs_folder}/customer_complaints/customer_complaints_prepared.csv"
        )

    def test_anonymize_case_data(self, dataset):
        """Run suppression, synthesizability analysis, anonymization and charting in order."""
        workflow = AnonymizeCaseData()

        prepared = df_functions.suppress_boolean_binary(dataset)

        # After suppression no cell may still hold 0.0.
        zero_cells = prepared.isin([0.0])
        assert not zero_cells.any().any()

        stats = workflow.analyze_synthesizability(prepared)
        assert stats.num_cols == 9
        assert stats.overall_att_count == 101
        assert stats.possible_combinations == 27648
        assert stats.possible_combinations_per_row == 9.2
        assert stats.mean_vals_per_record == 5.409
        assert round(stats.max_combinations_per_record, 2) == 42.49
        assert round(stats.excess_combinations_ratio, 2) == 0.22

        # Anonymize the data
        workflow.anonymize_case_data(df=prepared, epsilon=12.0)

        assert len(workflow.aggregate_df) > 0

        selections = workflow.aggregate_df["selections"].to_list()
        for expected_selection in (
            "age_range:(30-40]",
            "record_count",
            "quality_issue:True",
            "age_range:(40-50];city:Mountainview;period:2023-H1",
        ):
            assert expected_selection in selections

        suppressed_pct = workflow.aggregate_error_report["Suppressed %"].to_list()
        assert "0.00 %" not in suppressed_pct

        count_error = workflow.aggregate_error_report["Count +/- Error"].to_list()

        # Each of the first five report rows must contain its expected figure.
        for row, fragment in enumerate(("160.66", "23.85", "6.85", "2.85", "6.88")):
            assert fragment in count_error[row]

        chart_options = {
            "selection": [],
            "show_attributes": [],
            "unit": "Customer",
            "width": 700,
            "height": 400,
            "scheme": color_schemes["Alphabet"],
            "num_values": 10,
        }
        fig, fig_df = workflow.get_bar_chart_fig(**chart_options)

        assert isinstance(fig_df, pd.DataFrame), "Expected a pandas DataFrame"
        assert len(fig_df) == 10, "Expected 10 rows in the DataFrame"

        expected_columns = ["Attribute", "Count", "Attribute Value"]
        missing = [name for name in expected_columns if name not in fig_df.columns]
        assert not missing, f"DataFrame should contain columns: {expected_columns}"

        assert fig.layout.width == 700, "Expected bar chart width of 700"
        assert fig.layout.height == 400, "Expected bar chart height of 400"
        assert len(fig.data) > 0, "Expected the chart to contain at least one data trace"