Coverage for intelligence_toolkit/tests/unit/anonymize_case_data/test_error_report.py: 100%

89 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3 

4import pytest 

5import pandas as pd 

6import numpy as np 

7from collections import defaultdict 

8from intelligence_toolkit.anonymize_case_data.error_report import ErrorReport 

9 

10 

def test_error_report_initialization():
    """A new ErrorReport exposes aggregates equal to the ones passed in."""
    source_counts = {("A", "B"): 10, ("C",): 5}
    released_counts = {("A", "B"): 12, ("D",): 3}

    error_report = ErrorReport(source_counts, released_counts)

    assert error_report.src_aggregates == source_counts
    assert error_report.target_aggregates == released_counts

19 

20 

def test_calc_fabricated():
    """Target-only combinations are totalled overall and per combination length."""
    source_counts = {("A", "B"): 10, ("C",): 5}
    released_counts = {("A", "B"): 12, ("D",): 3, ("E", "F"): 7}
    error_report = ErrorReport(source_counts, released_counts)

    error_report.calc_fabricated()

    # ("D",) and ("E", "F") are absent from the source, hence fabricated.
    assert error_report.fabricated_count == 3 + 7
    by_length = error_report.fabricated_count_by_len
    assert by_length[1] == 3  # ("D",)
    assert by_length[2] == 7  # ("E", "F")

32 

33 

def test_calc_suppressed():
    """Source-only combinations are totalled overall and per combination length."""
    source_counts = {("A", "B"): 10, ("C",): 5, ("X", "Y", "Z"): 15}
    released_counts = {("A", "B"): 12}
    error_report = ErrorReport(source_counts, released_counts)

    error_report.calc_suppressed()

    # ("C",) and ("X", "Y", "Z") are absent from the target, hence suppressed.
    assert error_report.suppressed_count == 5 + 15
    by_length = error_report.suppressed_count_by_len
    assert by_length[1] == 5  # ("C",)
    assert by_length[3] == 15  # ("X", "Y", "Z")

45 

46 

def test_calc_mean():
    """Mean source counts are reported overall and per combination length."""
    source_counts = {("A", "B"): 10, ("C",): 5, ("D", "E"): 20, ("F",): 15}
    error_report = ErrorReport(source_counts, {})

    error_report.calc_mean()

    assert error_report.mean_count == (10 + 5 + 20 + 15) / 4
    by_length = error_report.mean_count_by_len
    assert by_length[1] == (5 + 15) / 2  # ("C",) and ("F",)
    assert by_length[2] == (10 + 20) / 2  # ("A", "B") and ("D", "E")

57 

58 

def test_calc_errors():
    """Mean absolute count error is reported overall and per combination length."""
    source_counts = {("A", "B"): 10, ("C",): 5, ("D", "E"): 20}
    released_counts = {("A", "B"): 12, ("C",): 3, ("D", "E"): 25}
    error_report = ErrorReport(source_counts, released_counts)

    error_report.calc_errors()

    # Absolute errors: |12-10| = 2, |3-5| = 2, |25-20| = 5 -> mean 9 / 3.
    assert error_report.mean_error == 9 / 3
    by_length = error_report.mean_error_by_len
    assert by_length[1] == 2.0  # ("C",) only
    assert by_length[2] == (2 + 5) / 2  # ("A", "B") and ("D", "E")

70 

71 

def test_calc_total():
    """calc_total returns the grand total and the totals keyed by combination length."""
    counts = {("A", "B"): 10, ("C",): 5, ("D", "E", "F"): 15, ("G",): 20}

    grand_total, totals_by_length = ErrorReport.calc_total(counts)

    assert grand_total == 10 + 5 + 15 + 20
    assert totals_by_length[1] == 5 + 20  # ("C",) and ("G",)
    assert totals_by_length[2] == 10  # ("A", "B")
    assert totals_by_length[3] == 15  # ("D", "E", "F")

81 

82 

def test_gen_creates_dataframe():
    """gen() returns a DataFrame with the four expected report columns."""
    expected_columns = [
        "Length",
        "Count +/- Error",
        "Suppressed %",
        "Fabricated %",
    ]
    source_counts = {("A", "B"): 10, ("C",): 5}
    released_counts = {("A", "B"): 12, ("D",): 3}

    frame = ErrorReport(source_counts, released_counts).gen()

    assert isinstance(frame, pd.DataFrame)
    assert list(frame.columns) == expected_columns
    # Expect per-length rows plus an overall row; only a lower bound is checked.
    assert len(frame) >= 2

99 

100 

def test_gen_with_multiple_lengths():
    """gen() yields one row per combination length plus an "Overall" row."""
    source_counts = {
        ("A",): 100,
        ("B",): 50,
        ("C", "D"): 200,
        ("E", "F"): 150,
        ("G", "H", "I"): 300,
    }
    released_counts = {
        ("A",): 95,
        ("B",): 55,
        ("C", "D"): 190,
        ("E", "F"): 160,
        ("G", "H", "I"): 310,
        ("J",): 10,  # present only in the target, i.e. fabricated
    }

    frame = ErrorReport(source_counts, released_counts).gen()

    # Three distinct lengths (1, 2, 3) and the overall summary.
    assert len(frame) == 4
    for label in ("1", "2", "3", "Overall"):
        assert label in frame["Length"].values

127 

128 

def test_gen_empty_aggregates():
    """Empty aggregates give zero fabricated and suppressed counts.

    Despite the name, this test does not call gen(); it only exercises
    calc_fabricated() and calc_suppressed().
    """
    error_report = ErrorReport({}, {})

    error_report.calc_fabricated()
    error_report.calc_suppressed()

    assert error_report.fabricated_count == 0
    assert error_report.suppressed_count == 0

139 

140 

def test_error_report_with_matching_aggregates():
    """Identical source and target aggregates report zero suppression and fabrication.

    The "Overall" row of the generated report is expected to show "0.00 %"
    in both the "Suppressed %" and "Fabricated %" columns.
    """
    aggs = {("A", "B"): 10, ("C",): 5}

    report = ErrorReport(aggs, aggs)
    result_df = report.gen()

    overall = result_df[result_df["Length"] == "Overall"]
    suppressed = overall["Suppressed %"].values[0]
    fabricated = overall["Fabricated %"].values[0]

    # A bare substring check ('"0.00 %" in value') would also accept
    # "10.00 %" or "100.00 %", so anchor the match at the start of the cell.
    assert suppressed.strip().startswith("0.00 %")
    assert fabricated.strip().startswith("0.00 %")

151 

152 

def test_calc_errors_with_no_overlap():
    """With no shared keys, the mean error is NaN and no per-length errors exist."""
    source_counts = {("A",): 10, ("B",): 5}
    released_counts = {("C",): 7, ("D",): 3}
    error_report = ErrorReport(source_counts, released_counts)

    error_report.calc_errors()

    # Source and target share no combination, so there is nothing to average.
    assert np.isnan(error_report.mean_error)
    assert len(error_report.mean_error_by_len) == 0

163 assert len(report.mean_error_by_len) == 0