Coverage for intelligence_toolkit/generate_mock_data/api.py: 100%
22 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
3#
4import pandas as pd
6import intelligence_toolkit.generate_mock_data.data_generator as data_generator
7import intelligence_toolkit.generate_mock_data.text_generator as text_generator
8from intelligence_toolkit.AI.openai_configuration import OpenAIConfiguration
class GenerateMockData:
    """Facade over the mock-data generators.

    Holds a JSON schema and an AI configuration, delegates generation to
    ``data_generator`` / ``text_generator``, and keeps the latest results
    on the instance.
    """

    def __init__(self):
        self.json_schema = {}
        self.record_arrays = []
        self.json_object = {}
        self.array_dfs = {}
        # Declared up front so they can be read before the matching
        # setter/generator has run instead of raising AttributeError.
        self.ai_configuration = None  # assigned by set_ai_configuration()
        self.text_list = None  # assigned by generate_text_data()
        self.text_df = None  # assigned by generate_text_data()

    def set_schema(self, json_schema: dict):
        """Store the schema and recompute its array fields.

        Args:
            json_schema (dict): The JSON schema describing the records
        """
        self.json_schema = json_schema
        self.record_arrays: list[list[str]] = data_generator.extract_array_fields(
            json_schema
        )

    def set_ai_configuration(self, ai_configuration: OpenAIConfiguration):
        """Store the AI configuration used by the generation methods.

        Args:
            ai_configuration (OpenAIConfiguration): The configuration to use
        """
        self.ai_configuration = ai_configuration

    def _require_ai_configuration(self):
        """Return the stored AI configuration or fail with a clear message."""
        if self.ai_configuration is None:
            # AttributeError is kept deliberately: it is what an unconfigured
            # instance raised before, so existing handlers keep working.
            raise AttributeError(
                "ai_configuration has not been set; "
                "call set_ai_configuration() first"
            )
        return self.ai_configuration

    async def generate_data_records(
        self,
        num_records_overall: int,
        records_per_batch: int,
        duplicate_records_per_batch: int,
        related_records_per_batch: int,
        generation_guidance: str = "",
        temperature: float = 0.5,
        df_update_callback=None,
        callback_batch=None,
        parallel_batches: int = 5,
    ):
        """
        Generates structured data records according to the JSON schema

        The two values returned by ``data_generator.generate_data`` are stored
        on ``self.json_object`` and ``self.array_dfs``; nothing is returned.

        Args:
            num_records_overall (int): The total number of records to generate
            records_per_batch (int): The number of records to generate per batch
            duplicate_records_per_batch (int): The number of duplicate records to generate per batch
            related_records_per_batch (int): The number of related records to generate per batch
            generation_guidance (str): Optional guidance to provide to the model
            temperature (float): The temperature to use when generating data
            df_update_callback (function): A callback function to update the dataframe
            callback_batch (function): A callback function to update the batch
            parallel_batches (int): The number of parallel batches to generate

        Raises:
            AttributeError: If no AI configuration has been set
        """
        ai_configuration = self._require_ai_configuration()
        self.json_object, self.array_dfs = await data_generator.generate_data(
            ai_configuration=ai_configuration,
            generation_guidance=generation_guidance,
            data_schema=self.json_schema,
            num_records_overall=num_records_overall,
            records_per_batch=records_per_batch,
            duplicate_records_per_batch=duplicate_records_per_batch,
            related_records_per_batch=related_records_per_batch,
            temperature=temperature,
            df_update_callback=df_update_callback,
            callback_batch=callback_batch,
            parallel_batches=parallel_batches,
        )

    async def generate_text_data(
        self,
        df: pd.DataFrame,
        generation_guidance: str = "",
        temperature: float = 0.5,
        df_update_callback=None,
    ):
        """
        Generates text data based on the input dataframe

        Each row of ``df`` is serialised with ``row.to_json()`` and the
        resulting strings are passed to ``text_generator.generate_text_data``.
        Its two return values are stored on ``self.text_list`` and
        ``self.text_df``; nothing is returned.

        Args:
            df (pandas.DataFrame): The input dataframe
            generation_guidance (str): Optional guidance to provide to the model
            temperature (float): The temperature to use when generating data
            df_update_callback (function): A callback function to update the dataframe

        Raises:
            AttributeError: If no AI configuration has been set
        """
        ai_configuration = self._require_ai_configuration()
        # One JSON string per row, in dataframe order.
        input_texts = [row.to_json() for _, row in df.iterrows()]
        self.text_list, self.text_df = await text_generator.generate_text_data(
            ai_configuration=ai_configuration,
            input_texts=input_texts,
            generation_guidance=generation_guidance,
            temperature=temperature,
            df_update_callback=df_update_callback,
        )