Coverage for intelligence_toolkit/extract_record_data/api.py: 0%
17 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1import intelligence_toolkit.extract_record_data.data_extractor as data_extractor
2import intelligence_toolkit.generate_mock_data.data_generator as data_generator
3from intelligence_toolkit.AI.openai_configuration import OpenAIConfiguration
4import pandas as pd
class ExtractRecordData:
    """Stateful entry point for extracting schema-shaped records from texts.

    The instance stores the JSON schema, the AI configuration and the results
    of the most recent extraction. The extraction itself is delegated to
    ``data_extractor.extract_record_data``.
    """

    def __init__(self) -> None:
        """Create an instance with no schema, no AI configuration, no results."""
        self.json_schema: dict = {}
        # Output of data_generator.extract_array_fields() for the current
        # schema; refreshed by set_schema().
        self.record_arrays: list[list[str]] = []
        # Declared here so the attribute exists on every instance, even when
        # set_ai_configuration() has not been called yet.
        self.ai_configuration: OpenAIConfiguration | None = None
        # Both are overwritten by extract_record_data().
        self.json_object = {}
        self.array_dfs = {}

    def set_schema(self, json_schema: dict) -> None:
        """Store ``json_schema`` and recompute ``record_arrays`` from it.

        Args:
            json_schema (dict): The JSON schema that extracted records follow
        """
        self.json_schema = json_schema
        self.record_arrays = data_generator.extract_array_fields(json_schema)

    def set_ai_configuration(self, ai_configuration: OpenAIConfiguration) -> None:
        """Store the AI configuration passed on to the extractor.

        Args:
            ai_configuration (OpenAIConfiguration): The configuration to use
        """
        self.ai_configuration = ai_configuration

    async def extract_record_data(
        self,
        input_texts: list[str],
        generation_guidance: str = "",
        df_update_callback=None,
        callback_batch=None,
    ) -> None:
        """
        Extracts structured data records from input texts according to the JSON schema

        The pair returned by ``data_extractor.extract_record_data`` is stored
        in ``self.json_object`` and ``self.array_dfs``; nothing is returned.

        NOTE(review): if ``set_ai_configuration`` was never called,
        ``ai_configuration=None`` is forwarded to the extractor - confirm that
        the extractor accepts ``None``.

        Args:
            input_texts (list[str]): The list of input texts to extract data from
            generation_guidance (str): Optional guidance to provide to the model
            df_update_callback (function): A callback function to update the dataframe
            callback_batch (function): A callback function to update the batch
        """
        extraction = await data_extractor.extract_record_data(
            ai_configuration=self.ai_configuration,
            input_texts=input_texts,
            data_schema=self.json_schema,
            record_arrays=self.record_arrays,
            generation_guidance=generation_guidance,
            df_update_callback=df_update_callback,
            callback_batch=callback_batch,
        )
        self.json_object, self.array_dfs = extraction