Coverage for intelligence_toolkit/tests/unit/generate_mock_data/test_data_generator.py: 100%
116 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
4import pytest
5import pandas as pd
6from unittest.mock import MagicMock, patch, AsyncMock
7from intelligence_toolkit.generate_mock_data.data_generator import (
8 extract_array_fields,
9 extract_df,
10 merge_json_objects,
11 select_random_records,
12 sample_from_record_array,
13)
def test_extract_array_fields_simple():
    """A single top-level array property is reported as the path ["items"]."""
    array_of_objects = {"type": "array", "items": {"type": "object", "properties": {}}}
    schema = {"properties": {"items": array_of_objects}}

    found_paths = extract_array_fields(schema)

    assert isinstance(found_paths, list)
    assert len(found_paths) >= 1
    assert ["items"] in found_paths
def test_extract_array_fields_nested():
    """An array nested inside an object is reported with its full key path."""
    # Build the schema inside-out so each nesting level is named.
    inner_array = {"type": "array", "items": {"type": "object", "properties": {}}}
    outer_object = {"type": "object", "properties": {"level2": inner_array}}
    schema = {"properties": {"level1": outer_object}}

    found_paths = extract_array_fields(schema)

    assert ["level1", "level2"] in found_paths
def test_extract_array_fields_multiple_arrays():
    """Every sibling array property shows up in the extracted paths."""
    string_array = {"type": "array", "items": {"type": "string"}}
    number_array = {"type": "array", "items": {"type": "number"}}
    schema = {"properties": {"array1": string_array, "array2": number_array}}

    found_paths = extract_array_fields(schema)

    assert len(found_paths) >= 2
    for expected_path in (["array1"], ["array2"]):
        assert expected_path in found_paths
def test_extract_array_fields_no_arrays():
    """A schema containing only scalar properties yields an empty list."""
    scalar_properties = {
        "field1": {"type": "string"},
        "field2": {"type": "number"},
    }

    found_paths = extract_array_fields({"properties": scalar_properties})

    assert found_paths == []
def test_extract_df_simple():
    """Two flat records under "records" become a two-row DataFrame."""
    people = [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]
    path_to_records = ["records"]

    frame = extract_df({"records": people}, path_to_records)

    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == 2
    for column in ("name", "age"):
        assert column in frame.columns
def test_extract_df_empty():
    """An empty record list still produces a (zero-row) DataFrame."""
    payload = {"records": []}

    frame = extract_df(payload, ["records"])

    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == 0
def test_merge_json_objects_simple():
    """Objects with disjoint keys merge into their union with no conflicts."""
    left = {"a": 1, "b": 2}
    right = {"c": 3, "d": 4}

    combined, clashes = merge_json_objects(left, right)

    expected = {"a": 1, "b": 2, "c": 3, "d": 4}
    assert combined == expected
    assert clashes == []
def test_merge_json_objects_with_arrays():
    """Lists stored under the same key are concatenated, not flagged as conflicts."""
    left = {"items": [1, 2, 3]}
    right = {"items": [4, 5, 6]}

    combined, clashes = merge_json_objects(left, right)

    assert combined["items"] == [1, 2, 3, 4, 5, 6]
    assert clashes == []
def test_merge_json_objects_with_conflicts():
    """Differing scalars under one key: the second object wins and the key is reported."""
    first = {"value": 10}
    second = {"value": 20}

    combined, clashes = merge_json_objects(first, second)

    # The value from the second argument is the one that survives.
    assert combined["value"] == 20
    assert "value" in clashes
def test_merge_json_objects_nested():
    """Nested dicts are merged key by key rather than replaced wholesale."""
    left = {"nested": {"a": 1, "b": 2}}
    right = {"nested": {"c": 3}}

    # Only the merged object is inspected here; the conflict list is ignored.
    combined, _ = merge_json_objects(left, right)

    inner = combined["nested"]
    assert inner["a"] == 1
    assert inner["c"] == 3
def test_merge_json_objects_nested_arrays():
    """Lists nested one level down are concatenated in argument order."""
    left = {"data": {"items": [1, 2]}}
    right = {"data": {"items": [3, 4]}}

    # Only the merged object is inspected here; the conflict list is ignored.
    combined, _ = merge_json_objects(left, right)

    assert combined["data"]["items"] == [1, 2, 3, 4]
def test_select_random_records_single_category():
    """One category requesting 3 of 10 records gets 3 indices, all within [0, 10)."""
    requested = {"duplicates": 3}

    selection = select_random_records(10, requested)

    assert "duplicates" in selection
    chosen = selection["duplicates"]
    assert len(chosen) == 3
    assert all(0 <= index < 10 for index in chosen)
def test_select_random_records_multiple_categories():
    """Each requested category is present and has exactly the requested size."""
    requested = {"duplicates": 2, "relations": 3}

    selection = select_random_records(20, requested)

    # Presence of both keys is checked before any size is inspected.
    assert "duplicates" in selection
    assert "relations" in selection
    assert len(selection["duplicates"]) == 2
    assert len(selection["relations"]) == 3
def test_select_random_records_no_overlap():
    """Indices handed to different categories never repeat."""
    requested = {"cat1": 5, "cat2": 5}

    selection = select_random_records(20, requested)

    # If any index were shared (or repeated), de-duplicating would shrink the pool.
    pooled_ids = selection["cat1"] + selection["cat2"]
    assert len(pooled_ids) == len(set(pooled_ids))
@patch("intelligence_toolkit.generate_mock_data.data_generator.schema_builder.get_subobject")
def test_sample_from_record_array_sufficient_records(mock_get_subobject):
    """Asking for 3 of 5 available records returns 3 items drawn from those 5."""
    available = (1, 2, 3, 4, 5)
    # The patched lookup supplies the records; the object passed in is a placeholder.
    mock_get_subobject.return_value = list(available)
    placeholder_object = {"records": []}
    path_to_records = ["records"]

    sampled = sample_from_record_array(placeholder_object, path_to_records, 3)

    assert len(sampled) == 3
    assert all(item in available for item in sampled)
@patch("intelligence_toolkit.generate_mock_data.data_generator.schema_builder.get_subobject")
def test_sample_from_record_array_insufficient_records(mock_get_subobject):
    """Asking for more records than exist returns every available record."""
    # The patched lookup supplies only two records while five are requested.
    mock_get_subobject.return_value = [1, 2]
    placeholder_object = {"records": []}
    path_to_records = ["records"]

    sampled = sample_from_record_array(placeholder_object, path_to_records, 5)

    # With k larger than the pool, the whole pool comes back unchanged.
    assert len(sampled) == 2
    assert sampled == [1, 2]
def test_extract_df_with_nested_data():
    """Records that contain nested dicts still yield one row each and a "name" column."""
    alice = {"name": "Alice", "details": {"age": 30, "city": "NYC"}}
    bob = {"name": "Bob", "details": {"age": 25, "city": "LA"}}
    payload = {"users": [alice, bob]}

    frame = extract_df(payload, ["users"])

    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == 2
    assert "name" in frame.columns
def test_merge_json_objects_preserves_both_sides():
    """Keys unique to either input survive; a shared key takes the second value and is reported."""
    first = {"unique1": "value1", "shared": "old"}
    second = {"unique2": "value2", "shared": "new"}

    combined, clashes = merge_json_objects(first, second)

    for kept_key in ("unique1", "unique2"):
        assert kept_key in combined
    assert combined["shared"] == "new"
    assert "shared" in clashes