Coverage for intelligence_toolkit/tests/smoke/test_anonymize_case_data.py: 0%

62 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3# 

4import shutil 

5from functools import wraps 

6from pathlib import Path 

7from typing import ClassVar 

8 

9import pandas as pd 

10import pytest 

11 

12from intelligence_toolkit.anonymize_case_data.api import ( 

13 AnonymizeCaseData, 

14 SynthesizabilityStatistics, 

15) 

16from intelligence_toolkit.anonymize_case_data.visuals import color_schemes 

17from intelligence_toolkit.helpers import df_functions 

18 

# Folder holding the example datasets read by these tests; the path is
# relative, so it is resolved against the current working directory.
example_outputs_folder = "./example_outputs/anonymize_case_data"

20 

21 

def cleanup(skip: bool = False):
    """Build a decorator that deletes a test's ``anonymize_case_data`` folder.

    Once the wrapped callable finishes -- whether it returns or raises -- the
    directory ``<input_path>/anonymize_case_data`` is removed, where
    ``input_path`` is taken from the keyword arguments of the call.

    Args:
        skip: When True, nothing is deleted.

    Returns:
        A decorator. The function it produces forwards every argument to the
        wrapped callable and returns that callable's result unchanged.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            finally:
                # Look the path up defensively: a KeyError raised here would
                # replace the wrapped function's own result or exception.
                input_path = None if skip else kwargs.get("input_path")
                if input_path is not None:
                    # ignore_errors=True keeps cleanup best-effort, e.g. when
                    # the folder was never created.
                    shutil.rmtree(
                        Path(input_path) / "anonymize_case_data",
                        ignore_errors=True,
                    )

        return wrapper

    return decorator

40 

41 

class TestACD:
    """Smoke test of the anonymize-case-data API on the customer complaints example."""

    @pytest.fixture()
    def dataset(self):
        """Read the prepared customer complaints CSV into a DataFrame."""
        csv_file = f"{example_outputs_folder}/customer_complaints/customer_complaints_prepared.csv"
        return pd.read_csv(csv_file)

    def test_anonymize_case_data(self, dataset):
        """Run synthesizability analysis, anonymization and bar-chart creation."""
        acd = AnonymizeCaseData()

        prepared = df_functions.suppress_boolean_binary(dataset)
        zero_mask = prepared.isin([0.0])
        assert not zero_mask.any().any()

        # Synthesizability statistics: exact values first, then rounded ones.
        stats = acd.analyze_synthesizability(prepared)
        exact_expectations = {
            "num_cols": 9,
            "overall_att_count": 101,
            "possible_combinations": 27648,
            "possible_combinations_per_row": 9.2,
            "mean_vals_per_record": 5.409,
        }
        for attribute, expected_value in exact_expectations.items():
            assert getattr(stats, attribute) == expected_value
        assert round(stats.max_combinations_per_record, 2) == 42.49
        assert round(stats.excess_combinations_ratio, 2) == 0.22

        # Anonymize the data
        acd.anonymize_case_data(df=prepared, epsilon=12.0)

        assert len(acd.aggregate_df) > 0

        selections = acd.aggregate_df["selections"].to_list()
        for expected_selection in (
            "age_range:(30-40]",
            "record_count",
            "quality_issue:True",
            "age_range:(40-50];city:Mountainview;period:2023-H1",
        ):
            assert expected_selection in selections

        suppressed = acd.aggregate_error_report["Suppressed %"].to_list()
        assert "0.00 %" not in suppressed

        count_error = acd.aggregate_error_report["Count +/- Error"].to_list()
        # Index explicitly so a report that is too short still raises IndexError.
        for row, fragment in enumerate(("160.66", "23.85", "6.85", "2.85", "6.88")):
            assert fragment in count_error[row]

        chart_options = {
            "selection": [],
            "show_attributes": [],
            "unit": "Customer",
            "width": 700,
            "height": 400,
            "scheme": color_schemes["Alphabet"],
            "num_values": 10,
        }
        bar_chart, bar_chart_df = acd.get_bar_chart_fig(**chart_options)

        assert isinstance(bar_chart_df, pd.DataFrame), "Expected a pandas DataFrame"
        assert len(bar_chart_df) == 10, "Expected 10 rows in the DataFrame"
        expected_columns = ["Attribute", "Count", "Attribute Value"]
        missing_columns = [
            col for col in expected_columns if col not in bar_chart_df.columns
        ]
        assert not missing_columns, f"DataFrame should contain columns: {expected_columns}"

        layout = bar_chart.layout
        assert layout.width == 700, "Expected bar chart width of 700"
        assert layout.height == 400, "Expected bar chart height of 400"
        assert (
            len(bar_chart.data) > 0
        ), "Expected the chart to contain at least one data trace"

111 assert ( 

112 len(bar_chart.data) > 0 

113 ), "Expected the chart to contain at least one data trace"