Coverage for intelligence_toolkit/anonymize_case_data/error_report.py: 100%

56 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3 

4from collections import defaultdict 

5import numpy as np 

6import pandas as pd 

7 

8 

class ErrorReport:
    """Summarise how a target set of aggregate counts differs from a source set.

    Both ``src_aggregates`` and ``target_aggregates`` are mappings whose keys
    support ``len()`` (results are grouped by key length) and whose values are
    numeric counts that can be added and subtracted.

    Call :meth:`gen` to compute every statistic and obtain the report table;
    the individual ``calc_*`` methods store their results as attributes.
    """

    def __init__(self, src_aggregates, target_aggregates):
        """Store the two aggregate mappings; nothing is computed here."""
        self.src_aggregates = src_aggregates
        self.target_aggregates = target_aggregates

    @staticmethod
    def _mean(values):
        """Return the mean of ``values``, or NaN when ``values`` is empty.

        ``np.mean([])`` already evaluates to NaN but emits a RuntimeWarning;
        short-circuiting keeps the same result without the warning.
        """
        return np.mean(values) if values else np.nan

    @staticmethod
    def _percent(part, whole):
        """Return ``part`` as a percentage of ``whole`` (0.0 when ``whole`` is 0).

        Guarding the zero case keeps the report printable for empty aggregates
        instead of raising ZeroDivisionError.
        """
        if whole == 0:
            return 0.0
        return part * 100.0 / whole

    def calc_fabricated(self):
        """Sum the counts of target keys that are absent from the source.

        Sets ``fabricated_count`` and ``fabricated_count_by_len`` (keyed by
        ``len(key)``).
        """
        self.fabricated_count = 0
        self.fabricated_count_by_len = defaultdict(int)

        for key, count in self.target_aggregates.items():
            if key not in self.src_aggregates:
                self.fabricated_count += count
                self.fabricated_count_by_len[len(key)] += count

    def calc_suppressed(self):
        """Sum the counts of source keys that are absent from the target.

        Sets ``suppressed_count`` and ``suppressed_count_by_len`` (keyed by
        ``len(key)``).
        """
        self.suppressed_count = 0
        self.suppressed_count_by_len = defaultdict(int)

        for key, count in self.src_aggregates.items():
            if key not in self.target_aggregates:
                self.suppressed_count += count
                self.suppressed_count_by_len[len(key)] += count

    def calc_mean(self):
        """Compute the mean source count, overall and per key length.

        Sets ``mean_count`` (NaN when the source is empty) and
        ``mean_count_by_len``.
        """
        counts = []
        counts_by_len = defaultdict(list)

        for key, count in self.src_aggregates.items():
            counts.append(count)
            counts_by_len[len(key)].append(count)

        self.mean_count = self._mean(counts)
        self.mean_count_by_len = {
            length: self._mean(values) for length, values in counts_by_len.items()
        }

    def calc_errors(self):
        """Compute the mean absolute count difference over keys present in both.

        Only keys found in both mappings contribute. Sets ``mean_error`` (NaN
        when no key is shared) and ``mean_error_by_len``.
        """
        errors = []
        errors_by_len = defaultdict(list)

        for key, src_count in self.src_aggregates.items():
            if key in self.target_aggregates:
                err = abs(self.target_aggregates[key] - src_count)
                errors.append(err)
                errors_by_len[len(key)].append(err)

        self.mean_error = self._mean(errors)
        self.mean_error_by_len = {
            length: self._mean(values) for length, values in errors_by_len.items()
        }

    @staticmethod
    def calc_total(aggregates):
        """Return ``(total, total_by_len)`` for an aggregate mapping.

        ``total`` is the sum of all values; ``total_by_len`` is a
        ``defaultdict(int)`` of the same sums grouped by ``len(key)``.
        Declared static so it can be called on the class or on an instance.
        """
        total = 0
        total_by_len = defaultdict(int)

        for key, count in aggregates.items():
            total += count
            total_by_len[len(key)] += count

        return (total, total_by_len)

    def gen(self):
        """Run every calculation and return the report as a DataFrame.

        The table has one row per key length that has at least one key present
        in both mappings (lengths are taken from ``mean_error_by_len``),
        followed by an "Overall" row. Percentages whose denominator is zero
        are reported as ``0.00 %``.
        """
        self.calc_fabricated()
        self.calc_suppressed()
        self.calc_mean()
        self.calc_errors()
        self.src_total, self.src_total_by_len = ErrorReport.calc_total(
            self.src_aggregates
        )
        self.target_total, self.target_total_by_len = ErrorReport.calc_total(
            self.target_aggregates
        )

        rows = []
        for l in sorted(self.mean_error_by_len.keys()):
            # Direct indexing (not .get) is kept on purpose: the defaultdicts
            # gain a zero entry for lengths they had not seen, as before.
            suppressed_pct = self._percent(
                self.suppressed_count_by_len[l], self.src_total_by_len[l]
            )
            fabricated_pct = self._percent(
                self.fabricated_count_by_len[l], self.target_total_by_len[l]
            )
            rows.append([
                str(l),
                f"{self.mean_count_by_len[l]:.2f} +/- {self.mean_error_by_len[l]:.2f}",
                f"{suppressed_pct:.2f} %",
                f"{fabricated_pct:.2f} %",
            ])

        overall_suppressed = self._percent(self.suppressed_count, self.src_total)
        overall_fabricated = self._percent(self.fabricated_count, self.target_total)
        rows.append([
            "Overall",
            f"{self.mean_count:.2f} +/- {self.mean_error:.2f}",
            f"{overall_suppressed:.2f} %",
            f"{overall_fabricated:.2f} %",
        ])

        return pd.DataFrame(
            rows,
            columns=[
                "Length",
                "Count +/- Error",
                "Suppressed %",
                "Fabricated %",
            ],
        )