lightudq.utils

 1import os
 2
 3import pymupdf
 4
 5
 6def read_markdown_to_text(file_path: str) -> str:
 7    with open(file_path, encoding="utf-8") as f:
 8        return f.read()
 9
10
def read_pdf_to_text(file_path: str) -> str:
    """Extract the plain text of every page of a PDF file.

    Args:
        file_path: Path of the PDF document to open with ``pymupdf``.

    Returns:
        The text of all pages concatenated in page order; no separator is
        inserted between pages.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction raises; previously the handle was never released.
    with pymupdf.open(file_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)
17
18
def read_document(file_path: str) -> str:
    """Read document content based on file extension.

    The extension is compared case-insensitively. ``.md`` files are handed to
    ``read_markdown_to_text``, ``.pdf`` files to ``read_pdf_to_text``, and
    ``.txt`` files are read directly as UTF-8 text.

    Args:
        file_path: Path of the document to read.

    Returns:
        The document text, or an empty string when the extension is not one
        of the supported formats (a message is printed in that case).
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == ".md":
        return read_markdown_to_text(file_path)
    if file_extension == ".txt":
        # Decode explicitly as UTF-8, matching the markdown reader, instead of
        # relying on the platform's locale-dependent default encoding.
        with open(file_path, encoding="utf-8") as file:
            return file.read()
    if file_extension == ".pdf":
        return read_pdf_to_text(file_path)

    # Best-effort fallback kept from the original: report and return nothing.
    print(f"Unsupported file format: {file_extension}")
    return ""
def read_markdown_to_text(file_path: str) -> str:
def read_markdown_to_text(file_path: str) -> str:
    """Return the entire contents of the file at ``file_path``, decoded as UTF-8."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def read_pdf_to_text(file_path: str) -> str:
def read_pdf_to_text(file_path: str) -> str:
    """Extract the plain text of every page of a PDF file.

    Args:
        file_path: Path of the PDF document to open with ``pymupdf``.

    Returns:
        The text of all pages concatenated in page order; no separator is
        inserted between pages.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction raises; previously the handle was never released.
    with pymupdf.open(file_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)
def read_document(file_path: str) -> str:
def read_document(file_path: str) -> str:
    """Read document content based on file extension.

    The extension is compared case-insensitively. ``.md`` files are handed to
    ``read_markdown_to_text``, ``.pdf`` files to ``read_pdf_to_text``, and
    ``.txt`` files are read directly as UTF-8 text.

    Args:
        file_path: Path of the document to read.

    Returns:
        The document text, or an empty string when the extension is not one
        of the supported formats (a message is printed in that case).
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == ".md":
        return read_markdown_to_text(file_path)
    if file_extension == ".txt":
        # Decode explicitly as UTF-8, matching the markdown reader, instead of
        # relying on the platform's locale-dependent default encoding.
        with open(file_path, encoding="utf-8") as file:
            return file.read()
    if file_extension == ".pdf":
        return read_pdf_to_text(file_path)

    # Best-effort fallback kept from the original: report and return nothing.
    print(f"Unsupported file format: {file_extension}")
    return ""

Read document content based on the file extension (`.md`, `.txt`, or `.pdf`); any other extension prints a message and returns an empty string.