Coverage for intelligence_toolkit/tests/unit/helpers/test_document_processor.py: 100%

91 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3# 

4import json 

5import tempfile 

6from pathlib import Path 

7from unittest.mock import MagicMock, patch 

8 

9import pandas as pd 

10import pytest 

11 

12from intelligence_toolkit.helpers.document_processor import convert_files_to_chunks 

13 

14 

@pytest.fixture
def temp_dir():
    """Yield the path of a scratch directory that is cleaned up afterwards."""
    scratch = tempfile.TemporaryDirectory()
    with scratch as scratch_path:
        yield scratch_path

19 

20 

def test_convert_files_to_chunks_txt_file(temp_dir):
    """A text file yields a non-empty entry keyed by its file name."""
    # Write a small text file into the scratch directory.
    source_path = Path(temp_dir, "test.txt")
    source_path.write_text("This is a test document with some content.")

    chunks_by_name = convert_files_to_chunks([str(source_path)], chunk_size=5)

    assert "test.txt" in chunks_by_name
    assert len(chunks_by_name["test.txt"]) > 0

30 

31 

def test_convert_files_to_chunks_csv_file(temp_dir):
    """A two-row CSV produces result keys suffixed with _1 and _2."""
    # Write a two-row, two-column CSV without the index column.
    csv_path = Path(temp_dir, "test.csv")
    columns = {
        "col1": ["value1", "value2"],
        "col2": ["value3", "value4"],
    }
    pd.DataFrame(columns).to_csv(csv_path, index=False)

    chunks_by_name = convert_files_to_chunks([str(csv_path)], chunk_size=50)

    # Each row is expected under its own numbered key.
    for expected_key in ("test.csv_1", "test.csv_2"):
        assert expected_key in chunks_by_name

46 

47 

def test_convert_files_to_chunks_json_list(temp_dir):
    """A JSON file holding a two-item list produces keys suffixed _1 and _2."""
    # Serialize a list of two objects to disk.
    json_path = Path(temp_dir, "test.json")
    records = [{"key1": "value1"}, {"key2": "value2"}]
    serialized = json.dumps(records)
    json_path.write_text(serialized)

    chunks_by_name = convert_files_to_chunks([str(json_path)], chunk_size=50)

    for expected_key in ("test.json_1", "test.json_2"):
        assert expected_key in chunks_by_name

61 

62 

def test_convert_files_to_chunks_json_object(temp_dir):
    """A JSON file holding a single object is keyed by the plain file name."""
    # Serialize one (nested) object to disk.
    json_path = Path(temp_dir, "test.json")
    payload = {"key": "value", "nested": {"inner": "data"}}
    json_path.write_text(json.dumps(payload))

    chunks_by_name = convert_files_to_chunks([str(json_path)], chunk_size=50)

    assert "test.json" in chunks_by_name

72 

73 

def test_convert_files_to_chunks_pdf_file(temp_dir):
    """A .pdf input is keyed by its file name when PdfReader is mocked."""
    # Mock PDF reading since creating real PDFs is complex; the file on disk
    # is only a placeholder.
    pdf_path = Path(temp_dir, "test.pdf")
    pdf_path.write_text("dummy pdf content")

    # Two fake pages, each returning fixed text from extract_text().
    fake_pages = []
    for page_text in ("Page 1 content", "Page 2 content"):
        page = MagicMock()
        page.extract_text.return_value = page_text
        fake_pages.append(page)

    # Fake reader instance exposing the page count and the page list.
    fake_pdf = MagicMock()
    fake_pdf.get_num_pages.return_value = 2
    fake_pdf.pages = fake_pages

    reader_target = "intelligence_toolkit.helpers.document_processor.PdfReader"
    with patch(reader_target, return_value=fake_pdf):
        chunks_by_name = convert_files_to_chunks([str(pdf_path)], chunk_size=50)

    assert "test.pdf" in chunks_by_name

92 

93 

def test_convert_files_to_chunks_multiple_files(temp_dir):
    """Two text files passed together each get their own result key."""
    # Create multiple test files, remembering their paths in order.
    contents_by_name = {
        "test1.txt": "Content of file 1",
        "test2.txt": "Content of file 2",
    }
    file_paths = []
    for file_name, body in contents_by_name.items():
        target = Path(temp_dir, file_name)
        target.write_text(body)
        file_paths.append(str(target))

    chunks_by_name = convert_files_to_chunks(file_paths, chunk_size=50)

    assert "test1.txt" in chunks_by_name
    assert "test2.txt" in chunks_by_name

106 

107 

def test_convert_files_to_chunks_with_callbacks(temp_dir):
    """A supplied callback has its on_batch_change invoked at least once."""
    txt_file = Path(temp_dir) / "test.txt"
    txt_file.write_text("Test content")

    # MagicMock creates the on_batch_change child mock on first access, so
    # no explicit attribute assignment is required.
    callback = MagicMock()

    # Only the callback side effect is checked; the return value is not
    # needed, so it is deliberately left unbound.
    convert_files_to_chunks([str(txt_file)], chunk_size=50, callbacks=[callback])

    callback.on_batch_change.assert_called()

118 

119 

def test_convert_files_to_chunks_filename_sanitization(temp_dir):
    """A name with spaces and parentheses appears under a sanitized key."""
    # File name deliberately contains special characters.
    awkward_path = Path(temp_dir, "test (with spaces).txt")
    awkward_path.write_text("Test content")

    chunks_by_name = convert_files_to_chunks([str(awkward_path)], chunk_size=50)

    # Parentheses and spaces should be removed/replaced in the result key.
    sanitized_key = "test_with_spaces.txt"
    assert sanitized_key in chunks_by_name

129 

130 

def test_convert_files_to_chunks_chunk_structure(temp_dir):
    """The first chunk is JSON with title, text_chunk and chunk_id == 1."""
    source_path = Path(temp_dir, "test.txt")
    source_path.write_text("Short text")

    chunks_by_name = convert_files_to_chunks([str(source_path)], chunk_size=50)

    # Each chunk is stored as a JSON string; decode the first one.
    first_chunk = json.loads(chunks_by_name["test.txt"][0])
    for required_field in ("title", "text_chunk", "chunk_id"):
        assert required_field in first_chunk
    assert first_chunk["chunk_id"] == 1

143 

144 

def test_convert_files_to_chunks_empty_list():
    """An empty list of input files produces an empty result."""
    no_files = []
    outcome = convert_files_to_chunks(no_files, chunk_size=50)

    assert len(outcome) == 0

149 

150 

def test_convert_files_to_chunks_csv_row_format(temp_dir):
    """A CSV row's chunk text contains "column: value" pairs."""
    # Single-row CSV with one string column and one integer column.
    csv_path = Path(temp_dir, "test.csv")
    single_row = pd.DataFrame({"name": ["Alice"], "age": [30]})
    single_row.to_csv(csv_path, index=False)

    chunks_by_name = convert_files_to_chunks([str(csv_path)], chunk_size=100)

    # Check that the chunk contains the formatted row data.
    row_text = json.loads(chunks_by_name["test.csv_1"][0])["text_chunk"]
    assert "name: Alice" in row_text
    assert "age: 30" in row_text

164 assert "age: 30" in chunk_json["text_chunk"]