Coverage for intelligence_toolkit/helpers/document_processor.py: 100%
45 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
2from collections import defaultdict
3import pandas as pd
4from json import dumps, loads
5from pypdf import PdfReader
6import io
7from intelligence_toolkit.AI.text_splitter import TextSplitter
def convert_files_to_chunks(
    input_filepaths,
    chunk_size,
    callbacks=None,
):
    """Read each input file and split its text into JSON-encoded chunks.

    The file type is chosen from the (sanitized) file name suffix:

    * ``.csv``  - every row becomes its own document, rendered as
      ``"col: value; col: value"`` and keyed ``"<filename>_<row index + 1>"``.
    * ``.json`` - a top-level list yields one document per element, keyed
      ``"<filename>_<position + 1>"``; any other value is a single document
      keyed by the file name.
    * ``.pdf``  - the extracted text of all pages is joined with spaces
      into a single document keyed by the file name.
    * anything else is read as plain text keyed by the file name.

    Args:
        input_filepaths: Sequence of "/"-separated path strings to process.
        chunk_size: Passed to ``TextSplitter(chunk_size=...)``.
        callbacks: Optional iterable of objects exposing
            ``on_batch_change(current, total)``; each one is invoked once
            per file, before that file is read. ``None`` means no callbacks.

    Returns:
        A ``defaultdict(list)`` mapping each document key to a list of JSON
        strings, each encoding ``{"title", "text_chunk", "chunk_id"}`` with
        ``chunk_id`` starting at 1.
    """
    # None instead of a shared mutable list as the default value.
    if callbacks is None:
        callbacks = ()

    text_to_chunks = defaultdict(list)

    def add_chunks(doc_name, text, chunk_size):
        # Split one document and store each piece as a JSON string.
        splitter = TextSplitter(chunk_size=chunk_size)
        for index, piece in enumerate(splitter.split(text)):
            chunk = {"title": doc_name, "text_chunk": piece, "chunk_id": index + 1}
            text_to_chunks[doc_name].append(
                dumps(chunk, indent=2, ensure_ascii=False)
            )

    total_files = len(input_filepaths)
    for fx, filepath in enumerate(input_filepaths):
        # Keep only the last path component and drop characters that are
        # awkward in keys: parentheses are removed, spaces become "_".
        filename = filepath.split("/")[-1]
        filename = filename.replace("(", "").replace(")", "").replace(" ", "_")

        for cb in callbacks:
            cb.on_batch_change(fx + 1, total_files)

        if filename.endswith(".csv"):
            df = pd.read_csv(filepath)
            cols = df.columns.values
            for ix, row in df.iterrows():
                rec_text = "; ".join(f"{col}: {str(row[col])}" for col in cols)
                add_chunks(f"{filename}_{ix+1}", rec_text, chunk_size)
        elif filename.endswith(".json"):
            # Context manager so the handle is closed even if parsing fails.
            with open(filepath) as json_file:
                json_obj = loads(json_file.read())
            if isinstance(json_obj, list):
                for ix, js_rec in enumerate(json_obj):
                    add_chunks(f"{filename}_{ix+1}", dumps(js_rec), chunk_size)
            else:
                add_chunks(filename, dumps(json_obj), chunk_size)
        elif filename.endswith(".pdf"):
            pdf_reader = PdfReader(filepath)
            page_texts = [page.extract_text() for page in pdf_reader.pages]
            add_chunks(filename, " ".join(page_texts), chunk_size)
        else:
            with open(filepath) as text_file:
                text = text_file.read()
            add_chunks(filename, text, chunk_size)

    return text_to_chunks