lightudq.utils
import os

import pymupdf


def read_markdown_to_text(file_path: str) -> str:
    """Return the full contents of a Markdown file as text.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents decoded as UTF-8.
    """
    with open(file_path, encoding="utf-8") as f:
        return f.read()


def read_pdf_to_text(file_path: str) -> str:
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to open with ``pymupdf``.

    Returns:
        The text of all pages joined together with no separator.
    """
    # The context manager closes the document even if text extraction fails,
    # so the underlying file handle is not leaked.
    with pymupdf.open(file_path) as doc:
        # Join once instead of growing a string with ``+=`` page by page.
        return "".join(page.get_text() for page in doc)


def read_document(file_path: str) -> str:
    """Read document content based on file extension.

    The extension is compared case-insensitively. ``.md`` and ``.txt`` files
    are read as UTF-8 text; ``.pdf`` files go through ``read_pdf_to_text``.

    Args:
        file_path: Path of the document to read.

    Returns:
        The document text, or an empty string when the extension is not
        supported (a message is printed in that case).
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == ".md":
        return read_markdown_to_text(file_path)
    elif file_extension == ".txt":
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding and matches the Markdown reader.
        with open(file_path, encoding="utf-8") as file:
            return file.read()
    elif file_extension == ".pdf":
        return read_pdf_to_text(file_path)
    else:
        # Best-effort behaviour kept for callers: report and return "".
        print(f"Unsupported file format: {file_extension}")
        return ""
def
read_markdown_to_text(file_path: str) -> str:
def
read_pdf_to_text(file_path: str) -> str:
def
read_document(file_path: str) -> str:
def read_document(file_path: str) -> str:
    """Read document content based on file extension.

    The extension is compared case-insensitively. ``.md`` files are passed to
    ``read_markdown_to_text``, ``.pdf`` files to ``read_pdf_to_text``, and
    ``.txt`` files are read directly as UTF-8 text.

    Args:
        file_path: Path of the document to read.

    Returns:
        The document text, or an empty string when the extension is not
        supported (a message is printed in that case).
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == ".md":
        return read_markdown_to_text(file_path)
    elif file_extension == ".txt":
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding and matches the Markdown reader.
        with open(file_path, encoding="utf-8") as file:
            return file.read()
    elif file_extension == ".pdf":
        return read_pdf_to_text(file_path)
    else:
        # Best-effort behaviour kept for callers: report and return "".
        print(f"Unsupported file format: {file_extension}")
        return ""
Read document content based on file extension