Coverage for intelligence_toolkit/anonymize_case_data/error_report.py: 100%
56 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
4from collections import defaultdict
5import numpy as np
6import pandas as pd
class ErrorReport:
    """Summarise how a target set of aggregate counts differs from a source set.

    Both ``src_aggregates`` and ``target_aggregates`` are mappings from a sized
    key (``len(key)`` is used to group results) to a numeric count.
    NOTE(review): keys are presumably attribute-value combinations and the
    target is presumably the anonymised data -- confirm against the callers.

    The ``calc_*`` methods store their results as attributes on the instance;
    ``gen`` runs all of them and renders the results as a ``pandas.DataFrame``.
    """

    def __init__(self, src_aggregates, target_aggregates):
        """Store the two aggregate mappings that will be compared."""
        self.src_aggregates = src_aggregates
        self.target_aggregates = target_aggregates

    @staticmethod
    def _mean(values):
        """Return the mean of ``values``, or NaN when ``values`` is empty.

        ``np.mean([])`` also yields NaN but emits a RuntimeWarning; handling
        the empty case here keeps the result while avoiding the warning.
        """
        return np.mean(values) if values else np.nan

    @staticmethod
    def _percent(part, whole):
        """Return ``part`` as a percentage of ``whole`` (0.0 when ``whole`` is 0)."""
        return part * 100.0 / whole if whole else 0.0

    def calc_fabricated(self):
        """Sum the counts of target keys that are absent from the source.

        Sets ``fabricated_count`` and ``fabricated_count_by_len`` (keyed by
        ``len(key)``).
        """
        self.fabricated_count = 0
        self.fabricated_count_by_len = defaultdict(int)

        for key, count in self.target_aggregates.items():
            if key not in self.src_aggregates:
                self.fabricated_count += count
                self.fabricated_count_by_len[len(key)] += count

    def calc_suppressed(self):
        """Sum the counts of source keys that are absent from the target.

        Sets ``suppressed_count`` and ``suppressed_count_by_len`` (keyed by
        ``len(key)``).
        """
        self.suppressed_count = 0
        self.suppressed_count_by_len = defaultdict(int)

        for key, count in self.src_aggregates.items():
            if key not in self.target_aggregates:
                self.suppressed_count += count
                self.suppressed_count_by_len[len(key)] += count

    def calc_mean(self):
        """Compute the mean source count, overall and per key length.

        Sets ``mean_count`` (NaN when the source is empty) and
        ``mean_count_by_len``.
        """
        counts = []
        counts_by_len = defaultdict(list)

        for key, count in self.src_aggregates.items():
            counts.append(count)
            counts_by_len[len(key)].append(count)

        self.mean_count = self._mean(counts)
        self.mean_count_by_len = {
            length: self._mean(values) for length, values in counts_by_len.items()
        }

    def calc_errors(self):
        """Compute the mean absolute count difference for keys in both mappings.

        Sets ``mean_error`` (NaN when no key is shared) and
        ``mean_error_by_len``, which only contains key lengths that have at
        least one shared key.
        """
        errors = []
        errors_by_len = defaultdict(list)

        for key, src_count in self.src_aggregates.items():
            if key in self.target_aggregates:
                error = abs(self.target_aggregates[key] - src_count)
                errors.append(error)
                errors_by_len[len(key)].append(error)

        self.mean_error = self._mean(errors)
        self.mean_error_by_len = {
            length: self._mean(values) for length, values in errors_by_len.items()
        }

    @staticmethod
    def calc_total(aggregates):
        """Return ``(total, total_by_len)`` for the counts in ``aggregates``.

        ``total_by_len`` is a ``defaultdict(int)`` keyed by ``len(key)``.
        Declared static so it can be called on the class or on an instance.
        """
        total = 0
        total_by_len = defaultdict(int)

        for key, count in aggregates.items():
            total += count
            total_by_len[len(key)] += count

        return (total, total_by_len)

    def gen(self):
        """Run every calculation and return the report as a DataFrame.

        The frame has one row per key length present in ``mean_error_by_len``
        (ascending), followed by an "Overall" row. Percentages whose
        denominator is zero are reported as 0.00 % instead of raising
        ``ZeroDivisionError``.
        """
        self.calc_fabricated()
        self.calc_suppressed()
        self.calc_mean()
        self.calc_errors()
        self.src_total, self.src_total_by_len = ErrorReport.calc_total(
            self.src_aggregates
        )
        self.target_total, self.target_total_by_len = ErrorReport.calc_total(
            self.target_aggregates
        )

        rows = []
        for length in sorted(self.mean_error_by_len):
            suppressed_pct = self._percent(
                self.suppressed_count_by_len[length],
                self.src_total_by_len[length],
            )
            fabricated_pct = self._percent(
                self.fabricated_count_by_len[length],
                self.target_total_by_len[length],
            )
            rows.append([
                str(length),
                f"{self.mean_count_by_len[length]:.2f} +/- {self.mean_error_by_len[length]:.2f}",
                f"{suppressed_pct:.2f} %",
                f"{fabricated_pct:.2f} %",
            ])

        overall_suppressed = self._percent(self.suppressed_count, self.src_total)
        overall_fabricated = self._percent(self.fabricated_count, self.target_total)
        rows.append([
            "Overall",
            f"{self.mean_count:.2f} +/- {self.mean_error:.2f}",
            f"{overall_suppressed:.2f} %",
            f"{overall_fabricated:.2f} %",
        ])

        return pd.DataFrame(
            rows,
            columns=[
                "Length",
                "Count +/- Error",
                "Suppressed %",
                "Fabricated %",
            ],
        )