Coverage for intelligence_toolkit/tests/unit/generate_mock_data/test_api.py: 100%

139 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3 

4import pytest 

5import pandas as pd 

6from unittest.mock import MagicMock, patch, AsyncMock 

7from intelligence_toolkit.generate_mock_data.api import GenerateMockData 

8 

9 

def test_generate_mock_data_initialization():
    """A freshly constructed GenerateMockData exposes empty containers."""
    instance = GenerateMockData()

    # Dict-valued attributes start out empty.
    assert instance.json_schema == {}
    assert instance.json_object == {}
    assert instance.array_dfs == {}
    # The list-valued attribute starts out empty as well.
    assert instance.record_arrays == []

17 

18 

def test_set_schema():
    """set_schema keeps the given schema and record_arrays stays a list."""
    records_spec = {"type": "array", "items": {"type": "object", "properties": {}}}
    schema = {"properties": {"records": records_spec}}
    instance = GenerateMockData()

    instance.set_schema(schema)

    assert instance.json_schema == schema
    assert isinstance(instance.record_arrays, list)

31 

32 

@patch("intelligence_toolkit.generate_mock_data.api.data_generator.extract_array_fields")
def test_set_schema_extracts_arrays(mock_extract):
    """After set_schema, the patched extractor was called and its result is stored."""
    mock_extract.return_value = [["records"], ["items"]]
    instance = GenerateMockData()

    instance.set_schema({"properties": {}})

    mock_extract.assert_called()
    # Compare against a fresh literal rather than the stubbed object itself.
    assert instance.record_arrays == [["records"], ["items"]]

44 

45 

def test_set_ai_configuration():
    """set_ai_configuration exposes the given object as ai_configuration."""
    fake_config = MagicMock()
    instance = GenerateMockData()

    instance.set_ai_configuration(fake_config)

    assert instance.ai_configuration == fake_config

53 

54 

@pytest.mark.asyncio
@patch("intelligence_toolkit.generate_mock_data.api.data_generator.generate_data")
async def test_generate_data_records_basic(mock_generate):
    """generate_data_records calls the patched generator and leaves dict results."""
    stub_json = {"records": []}
    stub_frames = {"records": pd.DataFrame()}
    mock_generate.return_value = (stub_json, stub_frames)

    instance = GenerateMockData()
    instance.set_ai_configuration(MagicMock())
    instance.set_schema({"properties": {}})

    batch_settings = {
        "num_records_overall": 10,
        "records_per_batch": 5,
        "duplicate_records_per_batch": 1,
        "related_records_per_batch": 1,
    }
    await instance.generate_data_records(**batch_settings)

    mock_generate.assert_called()
    assert isinstance(instance.json_object, dict)
    assert isinstance(instance.array_dfs, dict)

77 

78 

@pytest.mark.asyncio
@patch("intelligence_toolkit.generate_mock_data.api.data_generator.generate_data")
async def test_generate_data_records_with_guidance(mock_generate):
    """Guidance text and temperature reach the patched generator as keyword arguments."""
    mock_generate.return_value = ({}, {})

    instance = GenerateMockData()
    instance.set_ai_configuration(MagicMock())
    instance.set_schema({"properties": {}})

    await instance.generate_data_records(
        num_records_overall=5,
        records_per_batch=5,
        duplicate_records_per_batch=0,
        related_records_per_batch=0,
        generation_guidance="Generate realistic data",
        temperature=0.7,
    )

    # Keyword arguments of the most recent call to the patched generator.
    forwarded = mock_generate.call_args.kwargs
    assert forwarded["generation_guidance"] == "Generate realistic data"
    assert forwarded["temperature"] == 0.7

100 

101 

@pytest.mark.asyncio
@patch("intelligence_toolkit.generate_mock_data.api.data_generator.generate_data")
async def test_generate_data_records_with_callbacks(mock_generate):
    """Both callbacks reach the patched generator under their keyword names."""
    mock_generate.return_value = ({}, {})
    on_df_update = MagicMock()
    on_batch = MagicMock()

    instance = GenerateMockData()
    instance.set_ai_configuration(MagicMock())
    instance.set_schema({"properties": {}})

    await instance.generate_data_records(
        num_records_overall=10,
        records_per_batch=5,
        duplicate_records_per_batch=1,
        related_records_per_batch=1,
        df_update_callback=on_df_update,
        callback_batch=on_batch,
    )

    forwarded = mock_generate.call_args.kwargs
    assert forwarded["df_update_callback"] == on_df_update
    assert forwarded["callback_batch"] == on_batch

125 

126 

@pytest.mark.asyncio
@patch("intelligence_toolkit.generate_mock_data.api.data_generator.generate_data")
async def test_generate_data_records_parallel_batches(mock_generate):
    """The parallel_batches value reaches the patched generator unchanged."""
    mock_generate.return_value = ({}, {})

    instance = GenerateMockData()
    instance.set_ai_configuration(MagicMock())
    instance.set_schema({"properties": {}})

    await instance.generate_data_records(
        num_records_overall=50,
        records_per_batch=10,
        duplicate_records_per_batch=2,
        related_records_per_batch=2,
        parallel_batches=10,
    )

    forwarded = mock_generate.call_args.kwargs
    assert forwarded["parallel_batches"] == 10

146 

147 

@pytest.mark.asyncio
@patch("intelligence_toolkit.generate_mock_data.api.text_generator.generate_text_data")
async def test_generate_text_data_basic(mock_generate):
    """generate_text_data calls the patched generator and sets text_list / text_df."""
    stub_texts = ["Text 1", "Text 2"]
    stub_frame = pd.DataFrame({"mock_text": ["Text 1", "Text 2"]})
    mock_generate.return_value = (stub_texts, stub_frame)

    instance = GenerateMockData()
    instance.set_ai_configuration(MagicMock())
    people = pd.DataFrame({"name": ["Alice", "Bob"], "age": [30, 25]})

    await instance.generate_text_data(people)

    mock_generate.assert_called()
    # Both result attributes must exist and carry the expected container types.
    assert hasattr(instance, "text_list")
    assert hasattr(instance, "text_df")
    assert isinstance(instance.text_list, list)
    assert isinstance(instance.text_df, pd.DataFrame)

168 

169 

@pytest.mark.asyncio
@patch("intelligence_toolkit.generate_mock_data.api.text_generator.generate_text_data")
async def test_generate_text_data_converts_rows_to_json(mock_generate):
    """A one-row frame yields a one-element input_texts list for the generator.

    NOTE(review): the test name implies each row is serialized to a JSON
    string, but only the list type and length are asserted here.
    """
    mock_generate.return_value = (["Text"], pd.DataFrame())

    instance = GenerateMockData()
    instance.set_ai_configuration(MagicMock())
    single_row = pd.DataFrame({"col1": ["value1"], "col2": ["value2"]})

    await instance.generate_text_data(single_row)

    # Should convert each row to JSON string
    input_texts = mock_generate.call_args.kwargs["input_texts"]
    assert isinstance(input_texts, list)
    assert len(input_texts) == 1

187 

188 

@pytest.mark.asyncio
@patch("intelligence_toolkit.generate_mock_data.api.text_generator.generate_text_data")
async def test_generate_text_data_with_parameters(mock_generate):
    """Guidance text and temperature reach the patched text generator as keyword arguments."""
    mock_generate.return_value = ([], pd.DataFrame())

    instance = GenerateMockData()
    instance.set_ai_configuration(MagicMock())
    frame = pd.DataFrame({"data": [1]})

    await instance.generate_text_data(
        frame,
        generation_guidance="Be concise",
        temperature=0.3,
    )

    forwarded = mock_generate.call_args.kwargs
    assert forwarded["generation_guidance"] == "Be concise"
    assert forwarded["temperature"] == 0.3

208 

209 

@pytest.mark.asyncio
@patch("intelligence_toolkit.generate_mock_data.api.text_generator.generate_text_data")
async def test_generate_text_data_with_callback(mock_generate):
    """The df_update_callback reaches the patched text generator by keyword."""
    mock_generate.return_value = ([], pd.DataFrame())
    on_df_update = MagicMock()

    instance = GenerateMockData()
    instance.set_ai_configuration(MagicMock())
    frame = pd.DataFrame({"data": [1]})

    await instance.generate_text_data(frame, df_update_callback=on_df_update)

    forwarded = mock_generate.call_args.kwargs
    assert forwarded["df_update_callback"] == on_df_update

225 

226 

@pytest.mark.asyncio
@patch("intelligence_toolkit.generate_mock_data.api.text_generator.generate_text_data")
async def test_generate_text_data_empty_dataframe(mock_generate):
    """An empty DataFrame results in an empty input_texts list for the generator."""
    mock_generate.return_value = ([], pd.DataFrame())

    instance = GenerateMockData()
    instance.set_ai_configuration(MagicMock())
    no_rows = pd.DataFrame()

    await instance.generate_text_data(no_rows)

    forwarded = mock_generate.call_args.kwargs
    assert forwarded["input_texts"] == []

241 

242 

def test_schema_persistence():
    """Each set_schema call makes json_schema equal to the latest schema given."""
    first_schema = {"properties": {"field1": {}}}
    second_schema = {"properties": {"field2": {}}}
    instance = GenerateMockData()

    instance.set_schema(first_schema)
    assert instance.json_schema == first_schema

    # A second call replaces the earlier schema.
    instance.set_schema(second_schema)
    assert instance.json_schema == second_schema