Coverage for intelligence_toolkit/tests/unit/helpers/test_document_processor.py: 100%
91 statements
« prev ^ index » next · coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
3#
4import json
5import tempfile
6from pathlib import Path
7from unittest.mock import MagicMock, patch
9import pandas as pd
10import pytest
12from intelligence_toolkit.helpers.document_processor import convert_files_to_chunks
@pytest.fixture
def temp_dir():
    """Yield the path of a temporary directory for the duration of a test.

    The directory is managed by ``tempfile.TemporaryDirectory`` and is
    cleaned up when the context manager exits after the test finishes.
    """
    scratch = tempfile.TemporaryDirectory()
    with scratch as scratch_path:
        yield scratch_path
def test_convert_files_to_chunks_txt_file(temp_dir):
    """A plain-text file yields a non-empty chunk list keyed by its file name."""
    # Write a small text file to convert.
    source_path = Path(temp_dir, "test.txt")
    source_path.write_text("This is a test document with some content.")

    chunks_by_file = convert_files_to_chunks([str(source_path)], chunk_size=5)

    assert "test.txt" in chunks_by_file
    assert len(chunks_by_file["test.txt"]) > 0
def test_convert_files_to_chunks_csv_file(temp_dir):
    """A two-row CSV is expected to produce one result key per row."""
    # Build a two-row CSV file on disk.
    rows = pd.DataFrame(
        {
            "col1": ["value1", "value2"],
            "col2": ["value3", "value4"],
        }
    )
    csv_path = Path(temp_dir, "test.csv")
    rows.to_csv(csv_path, index=False)

    chunked = convert_files_to_chunks([str(csv_path)], chunk_size=50)

    # Each row should get its own entry, suffixed with a 1-based row number.
    for expected_key in ("test.csv_1", "test.csv_2"):
        assert expected_key in chunked
def test_convert_files_to_chunks_json_list(temp_dir):
    """A JSON file holding a two-item list is expected to produce one key per item."""
    # Serialize a top-level JSON list with two objects.
    payload = json.dumps([{"key1": "value1"}, {"key2": "value2"}])
    json_path = Path(temp_dir, "test.json")
    json_path.write_text(payload)

    chunked = convert_files_to_chunks([str(json_path)], chunk_size=50)

    for expected_key in ("test.json_1", "test.json_2"):
        assert expected_key in chunked
def test_convert_files_to_chunks_json_object(temp_dir):
    """A JSON file holding a single object is keyed by the plain file name."""
    # Serialize a single top-level JSON object (with one nested object).
    document = {"key": "value", "nested": {"inner": "data"}}
    json_path = Path(temp_dir, "test.json")
    json_path.write_text(json.dumps(document))

    chunked = convert_files_to_chunks([str(json_path)], chunk_size=50)

    assert "test.json" in chunked
def test_convert_files_to_chunks_pdf_file(temp_dir):
    """A ``.pdf`` path is converted using a mocked ``PdfReader``.

    The reader is patched because creating a real PDF is complex; the file
    on disk only needs to exist with a ``.pdf`` name.
    """
    pdf_path = Path(temp_dir, "test.pdf")
    pdf_path.write_text("dummy pdf content")

    # Two fake pages, each returning fixed text from extract_text().
    fake_pages = []
    for page_text in ("Page 1 content", "Page 2 content"):
        fake_page = MagicMock()
        fake_page.extract_text.return_value = page_text
        fake_pages.append(fake_page)

    fake_pdf = MagicMock()
    fake_pdf.get_num_pages.return_value = 2
    fake_pdf.pages = fake_pages

    reader_target = "intelligence_toolkit.helpers.document_processor.PdfReader"
    with patch(reader_target) as reader_cls:
        reader_cls.return_value = fake_pdf

        # The conversion must run while the patch is active.
        chunked = convert_files_to_chunks([str(pdf_path)], chunk_size=50)

    assert "test.pdf" in chunked
def test_convert_files_to_chunks_multiple_files(temp_dir):
    """Each input file gets its own key when several files are converted together."""
    # Create the input files in a fixed order.
    contents_by_name = {
        "test1.txt": "Content of file 1",
        "test2.txt": "Content of file 2",
    }
    input_paths = []
    for file_name, body in contents_by_name.items():
        file_path = Path(temp_dir, file_name)
        file_path.write_text(body)
        input_paths.append(str(file_path))

    chunked = convert_files_to_chunks(input_paths, chunk_size=50)

    assert "test1.txt" in chunked
    assert "test2.txt" in chunked
def test_convert_files_to_chunks_with_callbacks(temp_dir):
    """A supplied callback has ``on_batch_change`` called at least once."""
    txt_file = Path(temp_dir) / "test.txt"
    txt_file.write_text("Test content")

    callback = MagicMock()
    callback.on_batch_change = MagicMock()

    # Only the callback side effect is checked, so the return value is
    # deliberately not bound to a name (it was an unused local before).
    convert_files_to_chunks([str(txt_file)], chunk_size=50, callbacks=[callback])

    callback.on_batch_change.assert_called()
def test_convert_files_to_chunks_filename_sanitization(temp_dir):
    """Result keys use a sanitized form of a file name with spaces and parentheses."""
    # File name deliberately contains spaces and parentheses.
    awkward_path = Path(temp_dir, "test (with spaces).txt")
    awkward_path.write_text("Test content")

    chunked = convert_files_to_chunks([str(awkward_path)], chunk_size=50)

    # Parentheses and spaces should be removed/replaced in the key.
    sanitized_key = "test_with_spaces.txt"
    assert sanitized_key in chunked
def test_convert_files_to_chunks_chunk_structure(temp_dir):
    """The first chunk is a JSON string with title, text and a chunk id of 1."""
    source_path = Path(temp_dir, "test.txt")
    source_path.write_text("Short text")

    chunked = convert_files_to_chunks([str(source_path)], chunk_size=50)

    # Each chunk is stored as a JSON-encoded string; decode the first one.
    first_chunk = json.loads(chunked["test.txt"][0])
    for required_field in ("title", "text_chunk", "chunk_id"):
        assert required_field in first_chunk
    assert first_chunk["chunk_id"] == 1
def test_convert_files_to_chunks_empty_list():
    """Converting an empty list of files gives a result of length zero."""
    no_files = []

    chunked = convert_files_to_chunks(no_files, chunk_size=50)

    assert len(chunked) == 0
def test_convert_files_to_chunks_csv_row_format(temp_dir):
    """A CSV row's chunk text contains ``column: value`` pairs for its cells."""
    single_row = pd.DataFrame({"name": ["Alice"], "age": [30]})
    csv_path = Path(temp_dir, "test.csv")
    single_row.to_csv(csv_path, index=False)

    chunked = convert_files_to_chunks([str(csv_path)], chunk_size=100)

    # Decode the first chunk of the first row and inspect its text.
    chunk_text = json.loads(chunked["test.csv_1"][0])["text_chunk"]
    assert "name: Alice" in chunk_text
    assert "age: 30" in chunk_text