# Unconditional requirements: every line here precedes the first [section] header.
# NOTE(review): the layout (bare PEP 508 specifiers followed by [name] sections) looks
# like a setuptools-generated requires.txt rather than TOML -- confirm. If it is
# generated, lasting changes belong in the project's dependency declaration, not here.
#
# Version operators used below (PEP 440):
#   >=X     lower bound only; any later release is accepted.
#   ~=X.Y   "compatible release": >=X.Y and ==X.*   (e.g. ~=2.11 accepts 2.11 .. <3.0).
#   ~=X.Y.Z "compatible release": >=X.Y.Z and ==X.Y.* (patch releases only).
#   <X      upper bound only.
GitPython>=3.1
tqdm>=4.67
pydantic~=2.11
requests>=2.32
# Only entry constrained by an upper bound alone; the reason is not visible in this file.
datasets<4.0
lxml>=5.4
# Three-component ~= pin: restricted to 1.38.x patch releases.
polars~=1.38.1
gdown~=5.2
# Three-component ~= pin: restricted to 23.0.x patch releases.
pyarrow~=23.0.1
langcodes~=3.5
tokenizers~=0.21
hf_xet~=1.1
regex>=2026.2.28
# Lower bound is a pre-release (beta) version identifier.
bibtexparser>=2.0.0b9
huggingface-hub>=0.36.2
transformers>=4.57.6

# Requirements that apply only under the `dev` section; all are lower bounds (>=) with
# no upper cap.
# NOTE(review): presumably an optional extra for development tooling -- confirm against
# the project's dependency declaration.
[dev]
pytest>=8.1.1
pre-commit>=3.7.0
ruff>=0.11.3
types-lxml>=2025.3.30
# NOTE(review): this floor (19.3) is far below the `pyarrow~=23.0.1` pin in the
# unsectioned list -- TODO confirm the stubs actually installed match that pyarrow line.
pyarrow-stubs>=19.3

# Requirements that apply only under the `docs` section.
# NOTE(review): presumably the documentation-build toolchain -- confirm.
[docs]
zensical>=0.0.24
# The bracketed `[python]` is a PEP 508 extra: it requests mkdocstrings together with
# its optional `python` extra.
mkdocstrings[python]>=1.0

# Requirements that apply only under the `sentence-segmentation` section.
[sentence-segmentation]
# Three-component ~= pin (PEP 440 compatible release): >=2.2.0 and ==2.2.*.
wtpsplit~=2.2.0

# Requirements that apply only under the `word-tokenization` section.
[word-tokenization]
# Requests datatrove with two of its extras (`multilingual`, `processing`).
# `~=0.6` is a PEP 440 compatible-release pin: >=0.6 and ==0.*.
datatrove[multilingual,processing]~=0.6
