Coverage for intelligence_toolkit/tests/unit/match_entity_records/test_prepare_model.py: 100%
109 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
3#
5import polars as pl
6import pytest
8from intelligence_toolkit.match_entity_records.classes import RecordsModel
9from intelligence_toolkit.match_entity_records.prepare_model import (
10 build_attribute_options,
11 build_attributes_list,
12 format_model_df,
13)
class TestFormatModelDf:
    """Unit tests for ``format_model_df``."""

    @pytest.fixture()
    def selected_df(self) -> pl.DataFrame:
        """Provide a three-row frame: an ID, a name and three attribute columns."""
        data = {
            "ID1": [10, 20, 30],
            "Name": ["A", "B", "C"],
            "Attribute1": ["A1", "B1", "C1"],
            "Attribute6": ["DD2", "E22", "EF2"],
            "Attribute3": ["D2", "E2", "F2"],
        }
        return pl.DataFrame(data)

    def test_empty_df(self) -> None:
        """A model wrapping an empty frame formats to an empty frame."""
        empty_model = RecordsModel(
            dataframe=pl.DataFrame(),
            name_column="",
            columns=[],
        )
        assert format_model_df(empty_model).is_empty()

    def test_add_with_id(self, selected_df: pl.DataFrame) -> None:
        """With an explicit id column, its values become string entity IDs."""
        records = RecordsModel(
            dataframe=selected_df,
            id_column="ID1",
            name_column="Name",
            columns=["Attribute1"],
        )

        formatted = format_model_df(records)
        output_columns = formatted.columns

        for required in ("Entity ID", "Entity name"):
            assert required in output_columns
        assert formatted["Entity ID"].to_list() == ["10", "20", "30"]
        assert len(output_columns) == 3

    def test_add_no_id(self, selected_df: pl.DataFrame) -> None:
        """Without an id column, entity IDs are "0", "1", "2" for the three rows."""
        records = RecordsModel(
            dataframe=selected_df,
            name_column="Name",
            columns=["Attribute1"],
        )
        formatted = format_model_df(records)
        output_columns = formatted.columns

        for required in ("Attribute1", "Entity ID", "Entity name"):
            assert required in output_columns
        assert formatted["Entity ID"].to_list() == ["0", "1", "2"]
        assert len(output_columns) == 3

    def test_add_attributes_ordered(self, selected_df: pl.DataFrame) -> None:
        """Attributes requested as (3, 1) come back as (1, 3) after the entity columns."""
        records = RecordsModel(
            dataframe=selected_df,
            name_column="Name",
            columns=["Attribute3", "Attribute1"],
        )
        formatted = format_model_df(records)
        output_columns = formatted.columns

        for required in ("Attribute1", "Entity ID", "Entity name"):
            assert required in output_columns

        expected_order = ["Entity ID", "Entity name", "Attribute1", "Attribute3"]
        assert output_columns == expected_order

    def test_add_attributes_empty(self, selected_df: pl.DataFrame) -> None:
        """With no attribute columns selected, only the two entity columns remain."""
        records = RecordsModel(
            dataframe=selected_df,
            name_column="Name",
            columns=[],
        )
        formatted = format_model_df(records)
        output_columns = formatted.columns

        for required in ("Entity ID", "Entity name"):
            assert required in output_columns

        assert len(output_columns) == 2

    def test_add_attributes_no_columns(self, selected_df: pl.DataFrame) -> None:
        """With no attribute columns selected, all three input rows are kept."""
        records = RecordsModel(
            dataframe=selected_df,
            name_column="Name",
            columns=[],
        )
        formatted = format_model_df(records)

        assert formatted.height == 3

    def test_add_attributes_max_rows(self, selected_df: pl.DataFrame) -> None:
        """Passing ``max_rows=2`` yields a result with two rows."""
        records = RecordsModel(
            dataframe=selected_df,
            name_column="Name",
            columns=[],
        )
        row_limit = 2
        formatted = format_model_df(records, max_rows=row_limit)

        assert formatted.height == 2
class TestBuildAttributeOptions:
    """Unit tests for ``build_attribute_options``."""

    def test_empty(self) -> None:
        """No datasets means no attribute options."""
        options = build_attribute_options({})
        assert options == []

    def test_single(self) -> None:
        """One dataset yields one ``<attribute>::<dataset>`` option per attribute."""
        frame = pl.DataFrame(
            {
                "Entity ID": [1],
                "Entity name": ["A"],
                "Attribute1": ["A1"],
                "Attribute2": ["A2"],
            }
        )
        options = build_attribute_options({"dataset1": frame})

        expected = ["Attribute1::dataset1", "Attribute2::dataset1"]
        assert options == expected

    def test_multiple(self) -> None:
        """Two datasets contribute the options of both, each tagged with its dataset."""
        first = pl.DataFrame(
            {
                "Entity ID": [1],
                "Entity name": ["A"],
                "Attribute1": ["A1"],
                "Attribute2": ["A2"],
            }
        )
        second = pl.DataFrame(
            {
                "Entity ID": [1],
                "Entity name": ["A"],
                "Attribute3": ["A3"],
                "Attribute4": ["A4"],
            }
        )
        options = build_attribute_options({"dataset1": first, "dataset2": second})

        expected = [
            "Attribute1::dataset1",
            "Attribute2::dataset1",
            "Attribute3::dataset2",
            "Attribute4::dataset2",
        ]
        assert options == expected

    def test_order(self) -> None:
        """Options come back sorted even when datasets and columns are not."""
        # dataset2 is deliberately inserted first, with its attributes out of order.
        datasets = {
            "dataset2": pl.DataFrame(
                {
                    "Entity ID": [5],
                    "Entity name": ["A"],
                    "Attribute3": ["A3"],
                    "Attribute0": ["A4"],
                }
            ),
            "dataset1": pl.DataFrame(
                {
                    "Entity ID": [2, 3],
                    "Entity name": ["A", "B"],
                    "Attribute1": ["A1", "B1"],
                    "Attribute2": ["A2", "B2"],
                }
            ),
        }
        options = build_attribute_options(datasets)

        expected = [
            "Attribute0::dataset2",
            "Attribute1::dataset1",
            "Attribute2::dataset1",
            "Attribute3::dataset2",
        ]
        assert options == expected
        assert options == sorted(options)
class TestBuildAttributesList:
    """Unit tests for ``build_attributes_list``."""

    def test_empty_list(self) -> None:
        """An empty attribute list maps to an empty dict."""
        mapping = build_attributes_list([])
        assert mapping == {}

    def test_single_attribute(self) -> None:
        """A single labelled column is stored under its dataset."""
        spec = [{"label": "Name", "columns": ["name::dataset1"]}]
        mapping = build_attributes_list(spec)
        expected = {"dataset1": {"name": "Name"}}
        assert mapping == expected

    def test_multiple_attributes(self) -> None:
        """Several attributes are grouped per dataset, each column under its label."""
        name_attr = {
            "label": "Name",
            "columns": ["name::dataset1", "entity_name::dataset2"],
        }
        location_attr = {"label": "Location", "columns": ["location::dataset1"]}
        mapping = build_attributes_list([name_attr, location_attr])

        expected = {
            "dataset1": {"name": "Name", "location": "Location"},
            "dataset2": {"entity_name": "Name"},
        }
        assert mapping == expected

    def test_no_label_uses_column_name(self) -> None:
        """With an empty label, both columns still share one attribute name."""
        spec = [{"label": "", "columns": ["address::dataset1", "addr::dataset2"]}]
        mapping = build_attributes_list(spec)

        for dataset in ("dataset1", "dataset2"):
            assert dataset in mapping
        # Only checks that the two columns agree on a label; the fallback name
        # itself is not pinned by this test.
        assert mapping["dataset1"]["address"] == mapping["dataset2"]["addr"]

    def test_no_columns_skips_attribute(self) -> None:
        """An attribute with an empty column list adds nothing to the result."""
        without_columns = {"label": "Name", "columns": []}
        with_columns = {"label": "Location", "columns": ["location::dataset1"]}
        mapping = build_attributes_list([without_columns, with_columns])

        expected = {"dataset1": {"location": "Location"}}
        assert mapping == expected

    def test_none_columns_skips_attribute(self) -> None:
        """An attribute whose columns are ``None`` adds nothing to the result."""
        without_columns = {"label": "Name", "columns": None}
        with_columns = {"label": "Location", "columns": ["location::dataset1"]}
        mapping = build_attributes_list([without_columns, with_columns])

        expected = {"dataset1": {"location": "Location"}}
        assert mapping == expected

    def test_multiple_columns_same_dataset(self) -> None:
        """Two columns of one dataset can both map to the same label."""
        spec = [
            {
                "label": "Name",
                "columns": ["first_name::dataset1", "last_name::dataset1"],
            }
        ]
        mapping = build_attributes_list(spec)

        expected = {"dataset1": {"first_name": "Name", "last_name": "Name"}}
        assert mapping == expected

    def test_complex_scenario(self) -> None:
        """Three datasets and mixed labelled/unlabelled attributes map correctly.

        The unlabelled ``phone`` entry is part of the input but its resulting
        label is not asserted here.
        """
        person_attr = {
            "label": "PersonName",
            "columns": [
                "name::dataset1",
                "full_name::dataset2",
                "entity_name::dataset3",
            ],
        }
        address_attr = {
            "label": "Address",
            "columns": ["address::dataset1", "addr::dataset2"],
        }
        unlabelled_attr = {"label": "", "columns": ["phone::dataset1"]}
        mapping = build_attributes_list([person_attr, address_attr, unlabelled_attr])

        for dataset in ("dataset1", "dataset2", "dataset3"):
            assert dataset in mapping

        assert mapping["dataset1"]["name"] == "PersonName"
        assert mapping["dataset1"]["address"] == "Address"
        assert mapping["dataset2"]["full_name"] == "PersonName"
        assert mapping["dataset2"]["addr"] == "Address"
        assert mapping["dataset3"]["entity_name"] == "PersonName"