Coverage for intelligence_toolkit/tests/unit/generate_mock_data/test_data_generator.py: 100%

116 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3 

4import pytest 

5import pandas as pd 

6from unittest.mock import MagicMock, patch, AsyncMock 

7from intelligence_toolkit.generate_mock_data.data_generator import ( 

8 extract_array_fields, 

9 extract_df, 

10 merge_json_objects, 

11 select_random_records, 

12 sample_from_record_array, 

13) 

14 

15 

def test_extract_array_fields_simple():
    """A single top-level array property is reported as the path ``["items"]``."""
    element_schema = {"type": "object", "properties": {}}
    schema = {"properties": {"items": {"type": "array", "items": element_schema}}}

    paths = extract_array_fields(schema)

    assert isinstance(paths, list)
    assert len(paths) > 0
    assert ["items"] in paths

28 

29 

def test_extract_array_fields_nested():
    """An array nested inside an object property is reported with its full path."""
    # Build the schema from the innermost node outwards.
    level2 = {"type": "array", "items": {"type": "object", "properties": {}}}
    level1 = {"type": "object", "properties": {"level2": level2}}
    schema = {"properties": {"level1": level1}}

    paths = extract_array_fields(schema)

    assert ["level1", "level2"] in paths

48 

49 

def test_extract_array_fields_multiple_arrays():
    """Two sibling array properties are both present in the result."""
    array1 = {"type": "array", "items": {"type": "string"}}
    array2 = {"type": "array", "items": {"type": "number"}}
    schema = {"properties": {"array1": array1, "array2": array2}}

    paths = extract_array_fields(schema)

    assert len(paths) >= 2
    for expected_path in (["array1"], ["array2"]):
        assert expected_path in paths

63 

64 

def test_extract_array_fields_no_arrays():
    """A schema containing only scalar properties yields an empty list."""
    scalar_properties = {
        "field1": {"type": "string"},
        "field2": {"type": "number"},
    }

    paths = extract_array_fields({"properties": scalar_properties})

    assert paths == []

76 

77 

def test_extract_df_simple():
    """Two flat records under ``records`` become a two-row DataFrame with both columns."""
    people = [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]

    frame = extract_df({"records": people}, ["records"])

    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == 2
    for column in ("name", "age"):
        assert column in frame.columns

88 

89 

def test_extract_df_empty():
    """An empty record array still produces a DataFrame, with zero rows."""
    frame = extract_df({"records": []}, ["records"])

    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == 0

98 

99 

def test_merge_json_objects_simple():
    """Objects with disjoint keys merge into their union with no conflicts."""
    left = {"a": 1, "b": 2}
    right = {"c": 3, "d": 4}

    outcome = merge_json_objects(left, right)
    merged, conflicts = outcome

    assert merged == {"a": 1, "b": 2, "c": 3, "d": 4}
    assert conflicts == []

108 

109 

def test_merge_json_objects_with_arrays():
    """Arrays stored under the same key are concatenated and not flagged as conflicts."""
    merged, conflicts = merge_json_objects(
        {"items": [1, 2, 3]},
        {"items": [4, 5, 6]},
    )

    assert merged["items"] == [1, 2, 3, 4, 5, 6]
    assert conflicts == []

118 

119 

def test_merge_json_objects_with_conflicts():
    """Differing scalars under one key: the second object wins and the key is reported."""
    first = {"value": 10}
    second = {"value": 20}

    merged, conflicts = merge_json_objects(first, second)

    # The value from the second argument takes precedence.
    assert merged["value"] == 20
    assert "value" in conflicts

128 

129 

def test_merge_json_objects_nested():
    """Nested objects are merged key-by-key rather than replaced wholesale."""
    obj1 = {"nested": {"a": 1, "b": 2}}
    obj2 = {"nested": {"c": 3}}

    # The conflict list is not part of this check, so bind it to a throwaway name.
    merged, _conflicts = merge_json_objects(obj1, obj2)

    assert merged["nested"]["a"] == 1
    assert merged["nested"]["c"] == 3

138 

139 

def test_merge_json_objects_nested_arrays():
    """Arrays found inside nested objects are concatenated in argument order."""
    obj1 = {"data": {"items": [1, 2]}}
    obj2 = {"data": {"items": [3, 4]}}

    # The conflict list is not part of this check, so bind it to a throwaway name.
    merged, _conflicts = merge_json_objects(obj1, obj2)

    assert merged["data"]["items"] == [1, 2, 3, 4]

147 

148 

def test_select_random_records_single_category():
    """One category asking for 3 of 10 records gets three in-range indices."""
    selection = select_random_records(10, {"duplicates": 3})

    assert "duplicates" in selection
    assert len(selection["duplicates"]) == 3
    # Every chosen index must address one of the 10 records.
    for record_index in selection["duplicates"]:
        assert 0 <= record_index < 10

157 

158 

def test_select_random_records_multiple_categories():
    """Each requested category is present and receives its requested count."""
    requested = {"duplicates": 2, "relations": 3}

    selection = select_random_records(20, requested)

    assert "duplicates" in selection
    assert "relations" in selection
    assert len(selection["duplicates"]) == 2
    assert len(selection["relations"]) == 3

168 

169 

def test_select_random_records_no_overlap():
    """Indices handed to different categories never repeat."""
    selection = select_random_records(20, {"cat1": 5, "cat2": 5})

    # Any index shared between categories would shrink the de-duplicated set.
    combined = selection["cat1"] + selection["cat2"]
    unique_ids = set(combined)
    assert len(combined) == len(unique_ids)

178 

179 

@patch("intelligence_toolkit.generate_mock_data.data_generator.schema_builder.get_subobject")
def test_sample_from_record_array_sufficient_records(mock_get_subobject):
    """With five records available, requesting 3 returns three of those records."""
    available = (1, 2, 3, 4, 5)
    # Hand the code under test its own list so the reference values stay untouched.
    mock_get_subobject.return_value = list(available)

    sampled = sample_from_record_array({"records": []}, ["records"], 3)

    assert len(sampled) == 3
    for record in sampled:
        assert record in available

190 

191 

@patch("intelligence_toolkit.generate_mock_data.data_generator.schema_builder.get_subobject")
def test_sample_from_record_array_insufficient_records(mock_get_subobject):
    """Requesting more records than exist returns every available record."""
    mock_get_subobject.return_value = [1, 2]

    sampled = sample_from_record_array({"records": []}, ["records"], 5)

    # Only two records exist, so asking for five yields both of them.
    assert len(sampled) == 2
    assert sampled == [1, 2]

203 

204 

def test_extract_df_with_nested_data():
    """Records carrying a nested ``details`` object still yield one row per record."""
    users = [
        {"name": "Alice", "details": {"age": 30, "city": "NYC"}},
        {"name": "Bob", "details": {"age": 25, "city": "LA"}},
    ]

    frame = extract_df({"users": users}, ["users"])

    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == 2
    assert "name" in frame.columns

219 

220 

def test_merge_json_objects_preserves_both_sides():
    """Keys unique to either input survive; a shared key takes the second value and is flagged."""
    left = {"unique1": "value1", "shared": "old"}
    right = {"unique2": "value2", "shared": "new"}

    merged, conflicts = merge_json_objects(left, right)

    for key in ("unique1", "unique2"):
        assert key in merged
    assert merged["shared"] == "new"
    assert "shared" in conflicts