Coverage for intelligence_toolkit/tests/unit/anonymize_case_data/test_error_report.py: 100%
89 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
4import pytest
5import pandas as pd
6import numpy as np
7from collections import defaultdict
8from intelligence_toolkit.anonymize_case_data.error_report import ErrorReport
def test_error_report_initialization():
    """The constructor stores both aggregate mappings on the instance."""
    source = {("A", "B"): 10, ("C",): 5}
    target = {("A", "B"): 12, ("D",): 3}

    error_report = ErrorReport(source, target)

    # Both inputs must be exposed unchanged under their attribute names.
    assert error_report.src_aggregates == source
    assert error_report.target_aggregates == target
def test_calc_fabricated():
    """Counts for keys present only in the target are reported as fabricated."""
    source = {("A", "B"): 10, ("C",): 5}
    target = {("A", "B"): 12, ("D",): 3, ("E", "F"): 7}

    error_report = ErrorReport(source, target)
    error_report.calc_fabricated()

    # ("D",) and ("E", "F") do not appear in the source mapping.
    by_len = error_report.fabricated_count_by_len
    assert error_report.fabricated_count == 10  # 3 + 7
    assert by_len[1] == 3  # ("D",)
    assert by_len[2] == 7  # ("E", "F")
def test_calc_suppressed():
    """Counts for keys present only in the source are reported as suppressed."""
    source = {("A", "B"): 10, ("C",): 5, ("X", "Y", "Z"): 15}
    target = {("A", "B"): 12}

    error_report = ErrorReport(source, target)
    error_report.calc_suppressed()

    # ("C",) and ("X", "Y", "Z") do not appear in the target mapping.
    by_len = error_report.suppressed_count_by_len
    assert error_report.suppressed_count == 20  # 5 + 15
    assert by_len[1] == 5  # ("C",)
    assert by_len[3] == 15  # ("X", "Y", "Z")
def test_calc_mean():
    """Mean counts match the averages of the source counts, overall and per length."""
    source = {("A", "B"): 10, ("C",): 5, ("D", "E"): 20, ("F",): 15}
    target = {}

    error_report = ErrorReport(source, target)
    error_report.calc_mean()

    by_len = error_report.mean_count_by_len
    assert error_report.mean_count == 12.5  # (10 + 5 + 20 + 15) / 4
    assert by_len[1] == 10.0  # (5 + 15) / 2
    assert by_len[2] == 15.0  # (10 + 20) / 2
def test_calc_errors():
    """Mean error matches the average absolute count difference over shared keys."""
    source = {("A", "B"): 10, ("C",): 5, ("D", "E"): 20}
    target = {("A", "B"): 12, ("C",): 3, ("D", "E"): 25}

    error_report = ErrorReport(source, target)
    error_report.calc_errors()

    # Absolute differences: |12-10| = 2, |3-5| = 2, |25-20| = 5 -> sum 9, mean 3.0
    by_len = error_report.mean_error_by_len
    assert error_report.mean_error == 3.0
    assert by_len[1] == 2.0  # ("C",): |3-5|
    assert by_len[2] == 3.5  # ("A", "B") and ("D", "E"): (2 + 5) / 2
def test_calc_total():
    """calc_total returns the grand total plus totals grouped by key length."""
    counts = {("A", "B"): 10, ("C",): 5, ("D", "E", "F"): 15, ("G",): 20}

    # Called on the class itself; no ErrorReport instance is constructed here.
    grand_total, per_length = ErrorReport.calc_total(counts)

    assert grand_total == 50  # 10 + 5 + 15 + 20
    assert per_length[1] == 25  # ("C",) + ("G",)
    assert per_length[2] == 10  # ("A", "B")
    assert per_length[3] == 15  # ("D", "E", "F")
def test_gen_creates_dataframe():
    """gen() returns a DataFrame with the four expected report columns."""
    source = {("A", "B"): 10, ("C",): 5}
    target = {("A", "B"): 12, ("D",): 3}
    expected_columns = [
        "Length",
        "Count +/- Error",
        "Suppressed %",
        "Fabricated %",
    ]

    report_df = ErrorReport(source, target).gen()

    assert isinstance(report_df, pd.DataFrame)
    assert list(report_df.columns) == expected_columns
    # At minimum: one per-length row plus the overall row.
    assert len(report_df) >= 2
def test_gen_with_multiple_lengths():
    """gen() emits one row per key length (1, 2, 3) plus an "Overall" row."""
    source = {
        ("A",): 100,
        ("B",): 50,
        ("C", "D"): 200,
        ("E", "F"): 150,
        ("G", "H", "I"): 300,
    }
    target = {
        ("A",): 95,
        ("B",): 55,
        ("C", "D"): 190,
        ("E", "F"): 160,
        ("G", "H", "I"): 310,
        ("J",): 10,  # Fabricated: absent from the source mapping
    }

    report_df = ErrorReport(source, target).gen()

    # Three distinct lengths plus the overall summary.
    assert len(report_df) == 4
    for label in ("1", "2", "3", "Overall"):
        assert label in report_df["Length"].values
def test_gen_empty_aggregates():
    """Empty source and target mappings give zero fabricated and suppressed counts.

    NOTE(review): despite the name, this test does not call gen(); it only
    exercises calc_fabricated() and calc_suppressed() on empty input.
    """
    empty_source = {}
    empty_target = {}

    error_report = ErrorReport(empty_source, empty_target)
    error_report.calc_fabricated()
    error_report.calc_suppressed()

    assert error_report.fabricated_count == 0
    assert error_report.suppressed_count == 0
def test_error_report_with_matching_aggregates():
    """Identical source and target give 0.00 % suppression and fabrication overall."""
    # The same mapping object is passed as both source and target.
    shared = {("A", "B"): 10, ("C",): 5}

    report_df = ErrorReport(shared, shared).gen()

    # Select the summary row once and check both percentage cells on it.
    overall_rows = report_df[report_df["Length"] == "Overall"]
    assert "0.00 %" in overall_rows["Suppressed %"].values[0]
    assert "0.00 %" in overall_rows["Fabricated %"].values[0]
def test_calc_errors_with_no_overlap():
    """With no shared keys, mean_error is NaN and the per-length map is empty."""
    source = {("A",): 10, ("B",): 5}
    target = {("C",): 7, ("D",): 3}

    error_report = ErrorReport(source, target)
    error_report.calc_errors()

    # No key is present in both mappings, so there is nothing to average.
    assert np.isnan(error_report.mean_error)
    assert len(error_report.mean_error_by_len) == 0