Coverage for intelligence_toolkit/generate_mock_data/api.py: 100%

22 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3# 

4import pandas as pd 

5 

6import intelligence_toolkit.generate_mock_data.data_generator as data_generator 

7import intelligence_toolkit.generate_mock_data.text_generator as text_generator 

8from intelligence_toolkit.AI.openai_configuration import OpenAIConfiguration 

9 

10 

class GenerateMockData:
    """Stateful entry point for generating mock records and mock text.

    The instance holds the JSON schema, the AI configuration and the outputs
    of the most recent generation calls. A configuration must be supplied via
    ``set_ai_configuration`` before either ``generate_*`` coroutine is
    awaited, because both read ``self.ai_configuration``.
    """

    def __init__(self) -> None:
        """Create an instance with an empty schema and empty results."""
        self.json_schema = {}
        self.record_arrays: list[list[str]] = []
        self.json_object = {}
        self.array_dfs = {}
        # Text-generation results get empty placeholders, mirroring
        # json_object/array_dfs above, so they can be read before
        # generate_text_data() has run instead of raising AttributeError.
        # NOTE(review): placeholder types assume text_generator returns a
        # list and a DataFrame -- confirm against text_generator.
        self.text_list = []
        self.text_df = pd.DataFrame()
        # ai_configuration is intentionally NOT defaulted here: it is a
        # required input, and defaulting it would change how a missing
        # configuration fails in the generate_* methods.

    def set_schema(self, json_schema: dict) -> None:
        """Store the JSON schema and the array fields extracted from it.

        Args:
            json_schema (dict): The JSON schema describing the records
        """
        self.json_schema = json_schema
        self.record_arrays = data_generator.extract_array_fields(json_schema)

    def set_ai_configuration(
        self, ai_configuration: "OpenAIConfiguration"
    ) -> None:
        """Store the AI configuration used by the ``generate_*`` methods.

        Args:
            ai_configuration (OpenAIConfiguration): The configuration to use
        """
        self.ai_configuration = ai_configuration

    async def generate_data_records(
        self,
        num_records_overall: int,
        records_per_batch: int,
        duplicate_records_per_batch: int,
        related_records_per_batch: int,
        generation_guidance: str = "",
        temperature: float = 0.5,
        df_update_callback=None,
        callback_batch=None,
        parallel_batches: int = 5,
    ) -> None:
        """
        Generates structured data records according to the JSON schema

        All arguments are forwarded to ``data_generator.generate_data``
        together with the stored schema and AI configuration; the returned
        pair is stored on ``self.json_object`` and ``self.array_dfs``.

        Args:
            num_records_overall (int): The total number of records to generate
            records_per_batch (int): The number of records to generate per batch
            duplicate_records_per_batch (int): The number of duplicate records to generate per batch
            related_records_per_batch (int): The number of related records to generate per batch
            generation_guidance (str): Optional guidance to provide to the model
            temperature (float): The temperature to use when generating data
            df_update_callback (function): A callback function to update the dataframe
            callback_batch (function): A callback function to update the batch
            parallel_batches (int): The number of parallel batches to generate

        Raises:
            AttributeError: If ``set_ai_configuration`` has not been called
        """
        self.json_object, self.array_dfs = await data_generator.generate_data(
            ai_configuration=self.ai_configuration,
            generation_guidance=generation_guidance,
            data_schema=self.json_schema,
            num_records_overall=num_records_overall,
            records_per_batch=records_per_batch,
            duplicate_records_per_batch=duplicate_records_per_batch,
            related_records_per_batch=related_records_per_batch,
            temperature=temperature,
            df_update_callback=df_update_callback,
            callback_batch=callback_batch,
            parallel_batches=parallel_batches,
        )

    async def generate_text_data(
        self,
        df: pd.DataFrame,
        generation_guidance: str = "",
        temperature: float = 0.5,
        df_update_callback=None,
    ) -> None:
        """
        Generates text data based on the input dataframe

        Each row of ``df`` is serialised with ``Series.to_json`` and the
        resulting strings are passed to ``text_generator.generate_text_data``;
        the returned pair is stored on ``self.text_list`` and ``self.text_df``.

        Args:
            df (pandas.DataFrame): The input dataframe
            generation_guidance (str): Optional guidance to provide to the model
            temperature (float): The temperature to use when generating data
            df_update_callback (function): A callback function to update the dataframe

        Raises:
            AttributeError: If ``set_ai_configuration`` has not been called
        """
        # One JSON document per input row, in row order.
        input_texts = [row.to_json() for _, row in df.iterrows()]
        self.text_list, self.text_df = await text_generator.generate_text_data(
            ai_configuration=self.ai_configuration,
            input_texts=input_texts,
            generation_guidance=generation_guidance,
            temperature=temperature,
            df_update_callback=df_update_callback,
        )