Coverage for intelligence_toolkit/tests/unit/match_entity_records/test_prepare_model.py: 100%

109 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3# 

4 

5import polars as pl 

6import pytest 

7 

8from intelligence_toolkit.match_entity_records.classes import RecordsModel 

9from intelligence_toolkit.match_entity_records.prepare_model import ( 

10 build_attribute_options, 

11 build_attributes_list, 

12 format_model_df, 

13) 

14 

15 

class TestFormatModelDf:
    """Checks for ``format_model_df`` driven by a small three-row frame."""

    @pytest.fixture()
    def selected_df(self) -> pl.DataFrame:
        """Provide a three-row frame with an id, a name and three attribute columns."""
        column_data = {
            "ID1": [10, 20, 30],
            "Name": ["A", "B", "C"],
            "Attribute1": ["A1", "B1", "C1"],
            "Attribute6": ["DD2", "E22", "EF2"],
            "Attribute3": ["D2", "E2", "F2"],
        }
        return pl.DataFrame(column_data)

    def test_empty_df(self) -> None:
        """An empty input frame is expected to produce an empty result."""
        empty_model = RecordsModel(
            dataframe=pl.DataFrame(),
            name_column="",
            columns=[],
        )
        assert format_model_df(empty_model).is_empty()

    def test_add_with_id(self, selected_df) -> None:
        """With an id column given, its values are expected as strings in "Entity ID"."""
        records = RecordsModel(
            dataframe=selected_df,
            id_column="ID1",
            name_column="Name",
            columns=["Attribute1"],
        )

        formatted = format_model_df(records)
        produced = formatted.columns

        assert {"Entity ID", "Entity name"} <= set(produced)
        assert formatted["Entity ID"].to_list() == ["10", "20", "30"]
        assert len(produced) == 3

    def test_add_no_id(self, selected_df) -> None:
        """Without an id column, "Entity ID" is expected to hold "0", "1", "2"."""
        records = RecordsModel(
            dataframe=selected_df,
            name_column="Name",
            columns=["Attribute1"],
        )

        formatted = format_model_df(records)
        produced = formatted.columns

        assert {"Attribute1", "Entity ID", "Entity name"} <= set(produced)
        assert formatted["Entity ID"].to_list() == ["0", "1", "2"]
        assert len(produced) == 3

    def test_add_attributes_ordered(self, selected_df) -> None:
        """Attributes requested as 3 then 1 are expected back as 1 then 3."""
        records = RecordsModel(
            dataframe=selected_df,
            name_column="Name",
            columns=["Attribute3", "Attribute1"],
        )

        produced = format_model_df(records).columns

        assert {"Attribute1", "Entity ID", "Entity name"} <= set(produced)

        expected_order = ["Entity ID", "Entity name", "Attribute1", "Attribute3"]
        assert produced == expected_order

    def test_add_attributes_empty(self, selected_df) -> None:
        """With no attribute columns, only the two entity columns are expected."""
        records = RecordsModel(
            dataframe=selected_df,
            name_column="Name",
            columns=[],
        )

        produced = format_model_df(records).columns

        assert {"Entity ID", "Entity name"} <= set(produced)
        assert len(produced) == 2

    def test_add_attributes_no_columns(self, selected_df) -> None:
        """With no attribute columns, all three input rows are still expected."""
        records = RecordsModel(
            dataframe=selected_df,
            name_column="Name",
            columns=[],
        )

        assert format_model_df(records).height == 3

    def test_add_attributes_max_rows(self, selected_df) -> None:
        """Passing ``max_rows=2`` is expected to limit the result to two rows."""
        records = RecordsModel(
            dataframe=selected_df,
            name_column="Name",
            columns=[],
        )
        row_cap = 2

        limited = format_model_df(records, max_rows=row_cap)

        assert limited.height == 2

115 

116 

class TestBuildAttributeOptions:
    """Checks for ``build_attribute_options`` over dataset-name -> frame mappings."""

    def test_empty(self) -> None:
        """No datasets are expected to yield no options."""
        assert build_attribute_options({}) == []

    def test_single(self) -> None:
        """One dataset is expected to yield one ``column::dataset`` option per attribute."""
        only_frame = pl.DataFrame(
            {
                "Entity ID": [1],
                "Entity name": ["A"],
                "Attribute1": ["A1"],
                "Attribute2": ["A2"],
            }
        )

        options = build_attribute_options({"dataset1": only_frame})

        expected = ["Attribute1::dataset1", "Attribute2::dataset1"]
        assert options == expected

    def test_multiple(self) -> None:
        """Two datasets are expected to contribute their own attribute options."""
        first_frame = pl.DataFrame(
            {
                "Entity ID": [1],
                "Entity name": ["A"],
                "Attribute1": ["A1"],
                "Attribute2": ["A2"],
            }
        )
        second_frame = pl.DataFrame(
            {
                "Entity ID": [1],
                "Entity name": ["A"],
                "Attribute3": ["A3"],
                "Attribute4": ["A4"],
            }
        )
        frames_by_dataset = {"dataset1": first_frame, "dataset2": second_frame}

        options = build_attribute_options(frames_by_dataset)

        expected = [
            "Attribute1::dataset1",
            "Attribute2::dataset1",
            "Attribute3::dataset2",
            "Attribute4::dataset2",
        ]
        assert options == expected

    def test_order(self) -> None:
        """Options are expected in sorted order even when inputs are not."""
        later_frame = pl.DataFrame(
            {
                "Entity ID": [5],
                "Entity name": ["A"],
                "Attribute3": ["A3"],
                "Attribute0": ["A4"],
            }
        )
        earlier_frame = pl.DataFrame(
            {
                "Entity ID": [2, 3],
                "Entity name": ["A", "B"],
                "Attribute1": ["A1", "B1"],
                "Attribute2": ["A2", "B2"],
            }
        )
        # "dataset2" is deliberately inserted first so the input is unsorted.
        frames_by_dataset = {"dataset2": later_frame, "dataset1": earlier_frame}

        options = build_attribute_options(frames_by_dataset)

        expected = [
            "Attribute0::dataset2",
            "Attribute1::dataset1",
            "Attribute2::dataset1",
            "Attribute3::dataset2",
        ]
        assert options == expected
        assert options == sorted(options)

194 

195 

class TestBuildAttributesList:
    """Checks for ``build_attributes_list``.

    Inputs are lists of ``{"label": ..., "columns": [...]}`` entries whose
    columns are written as ``"column::dataset"``; the expected results are
    keyed by dataset and then by column name.
    """

    def test_empty_list(self) -> None:
        """Test build_attributes_list with empty list."""
        attr_list = []
        result = build_attributes_list(attr_list)
        assert result == {}

    def test_single_attribute(self) -> None:
        """Test build_attributes_list with single attribute."""
        attr_list = [{"label": "Name", "columns": ["name::dataset1"]}]
        result = build_attributes_list(attr_list)
        assert result == {"dataset1": {"name": "Name"}}

    def test_multiple_attributes(self) -> None:
        """Test build_attributes_list with multiple attributes."""
        attr_list = [
            {"label": "Name", "columns": ["name::dataset1", "entity_name::dataset2"]},
            {"label": "Location", "columns": ["location::dataset1"]},
        ]
        result = build_attributes_list(attr_list)
        assert result == {
            "dataset1": {"name": "Name", "location": "Location"},
            "dataset2": {"entity_name": "Name"},
        }

    def test_no_label_uses_column_name(self) -> None:
        """Test that an empty label falls back to one of the column names."""
        attr_list = [{"label": "", "columns": ["address::dataset1", "addr::dataset2"]}]
        result = build_attributes_list(attr_list)
        assert "dataset1" in result
        assert "dataset2" in result
        # Both columns belong to one attribute, so they must share one label.
        fallback_label = result["dataset1"]["address"]
        assert fallback_label == result["dataset2"]["addr"]
        # Equality alone would also pass if both labels stayed empty, so pin
        # the fallback to one of the column names.
        assert fallback_label in {"addr", "address"}

    def test_no_columns_skips_attribute(self) -> None:
        """Test that attribute with no columns is skipped."""
        attr_list = [
            {"label": "Name", "columns": []},
            {"label": "Location", "columns": ["location::dataset1"]},
        ]
        result = build_attributes_list(attr_list)
        assert result == {"dataset1": {"location": "Location"}}

    def test_none_columns_skips_attribute(self) -> None:
        """Test that attribute with None columns is skipped."""
        attr_list = [
            {"label": "Name", "columns": None},
            {"label": "Location", "columns": ["location::dataset1"]},
        ]
        result = build_attributes_list(attr_list)
        assert result == {"dataset1": {"location": "Location"}}

    def test_multiple_columns_same_dataset(self) -> None:
        """Test multiple columns mapping to same dataset."""
        attr_list = [
            {
                "label": "Name",
                "columns": ["first_name::dataset1", "last_name::dataset1"],
            }
        ]
        result = build_attributes_list(attr_list)
        assert result == {"dataset1": {"first_name": "Name", "last_name": "Name"}}

    def test_complex_scenario(self) -> None:
        """Test complex scenario with multiple datasets and attributes."""
        attr_list = [
            {
                "label": "PersonName",
                "columns": [
                    "name::dataset1",
                    "full_name::dataset2",
                    "entity_name::dataset3",
                ],
            },
            {"label": "Address", "columns": ["address::dataset1", "addr::dataset2"]},
            {"label": "", "columns": ["phone::dataset1"]},
        ]
        result = build_attributes_list(attr_list)
        assert "dataset1" in result
        assert "dataset2" in result
        assert "dataset3" in result
        assert result["dataset1"]["name"] == "PersonName"
        assert result["dataset1"]["address"] == "Address"
        assert result["dataset2"]["full_name"] == "PersonName"
        assert result["dataset2"]["addr"] == "Address"
        assert result["dataset3"]["entity_name"] == "PersonName"
        # The unlabeled attribute was previously unchecked; with a single
        # column, the column-name fallback leaves "phone" as its label.
        assert result["dataset1"]["phone"] == "phone"