Coverage for intelligence_toolkit/helpers/document_processor.py: 100%

45 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1 

2from collections import defaultdict 

3import pandas as pd 

4from json import dumps, loads 

5from pypdf import PdfReader 

6import io 

7from intelligence_toolkit.AI.text_splitter import TextSplitter 

8 

def convert_files_to_chunks(
    input_filepaths,
    chunk_size,
    callbacks=None,
):
    """Read each input file and split its text into JSON-encoded chunks.

    The file type is chosen from the (sanitised) file name suffix:

    * ``.csv``  - every row becomes its own record, rendered as
      ``"col: value; col: value"`` and stored under ``"<filename>_<index+1>"``.
    * ``.json`` - a top-level list yields one record per element, stored
      under ``"<filename>_<position>"`` (1-based); any other JSON value is
      serialised as a whole and stored under ``"<filename>"``.
    * ``.pdf``  - the extracted text of all pages is joined with single
      spaces and stored under ``"<filename>"``.
    * anything else is read as plain text and stored under ``"<filename>"``.

    Args:
        input_filepaths: Paths of the files to process. The file name is the
            part after the last ``"/"``; parentheses are removed from it and
            spaces are replaced by underscores.
        chunk_size: Chunk size forwarded to ``TextSplitter``.
        callbacks: Optional iterable of objects exposing
            ``on_batch_change(current, total)``; each is called once per file
            with the 1-based file position and the number of input files.

    Returns:
        A ``defaultdict(list)`` mapping each record key to a list of JSON
        strings, each encoding ``{"title", "text_chunk", "chunk_id"}`` with
        ``chunk_id`` starting at 1.
    """
    # A None default avoids sharing one mutable list object between calls.
    if callbacks is None:
        callbacks = []

    text_to_chunks = defaultdict(list)

    def add_chunks(title, text, chunk_size):
        # Split one record's text and append the serialised chunks under its key.
        splitter = TextSplitter(chunk_size=chunk_size)
        for chunk_id, piece in enumerate(splitter.split(text), start=1):
            chunk = {"title": title, "text_chunk": piece, "chunk_id": chunk_id}
            text_to_chunks[title].append(
                dumps(chunk, indent=2, ensure_ascii=False)
            )

    for fx, filepath in enumerate(input_filepaths):
        filename = filepath.split("/")[-1]
        filename = filename.replace("(", "").replace(")", "").replace(" ", "_")

        # Report progress before the (potentially slow) file is processed.
        for cb in callbacks:
            cb.on_batch_change(fx + 1, len(input_filepaths))

        if filename.endswith(".csv"):
            df = pd.read_csv(filepath)
            cols = df.columns.values
            for ix, row in df.iterrows():
                rec_text = "; ".join([f"{col}: {str(row[col])}" for col in cols])
                add_chunks(f"{filename}_{ix+1}", rec_text, chunk_size)
        elif filename.endswith(".json"):
            # Context manager guarantees the handle is closed even if
            # parsing fails. NOTE(review): the platform default encoding is
            # kept to preserve existing behaviour - confirm whether UTF-8
            # should be forced.
            with open(filepath) as json_file:
                json_obj = loads(json_file.read())
            if isinstance(json_obj, list):
                # One record per list element, keyed by 1-based position.
                for ix, js_rec in enumerate(json_obj):
                    add_chunks(f"{filename}_{ix+1}", dumps(js_rec), chunk_size)
            else:
                add_chunks(filename, dumps(json_obj), chunk_size)
        elif filename.endswith(".pdf"):
            pdf_reader = PdfReader(filepath)
            page_texts = [
                pdf_reader.pages[px].extract_text()
                for px in range(pdf_reader.get_num_pages())
            ]
            add_chunks(filename, " ".join(page_texts), chunk_size)
        else:
            # Fallback: treat the file as plain text.
            with open(filepath) as text_file:
                text = text_file.read()
            add_chunks(filename, text, chunk_size)

    return text_to_chunks