.dockerignore
.gitignore
.pre-commit-config.yaml
LICENSE
MANIFEST.in
README.md
pyproject.toml
uv.lock
.github/workflows/ci.yml
.github/workflows/release.yml
dalla_data_processing/__init__.py
dalla_data_processing/_version.py
dalla_data_processing/cli.py
dalla_data_processing.egg-info/PKG-INFO
dalla_data_processing.egg-info/SOURCES.txt
dalla_data_processing.egg-info/dependency_links.txt
dalla_data_processing.egg-info/entry_points.txt
dalla_data_processing.egg-info/not-zip-safe
dalla_data_processing.egg-info/requires.txt
dalla_data_processing.egg-info/top_level.txt
dalla_data_processing/core/README.md
dalla_data_processing/core/__init__.py
dalla_data_processing/core/dataset.py
dalla_data_processing/core/parallel.py
dalla_data_processing/deduplication/README.md
dalla_data_processing/deduplication/__init__.py
dalla_data_processing/deduplication/onion_wrapper.py
dalla_data_processing/deduplication/postprocessing.py
dalla_data_processing/deduplication/preprocessing.py
dalla_data_processing/deduplication/bin/.gitignore
dalla_data_processing/deduplication/onion/COPYING
dalla_data_processing/deduplication/onion/Makefile
dalla_data_processing/deduplication/onion/Makefile.config
dalla_data_processing/deduplication/onion/README.md
dalla_data_processing/deduplication/onion/src/Makefile
dalla_data_processing/deduplication/onion/src/Makefile.g
dalla_data_processing/deduplication/onion/src/buzhash.c
dalla_data_processing/deduplication/onion/src/buzhash.h
dalla_data_processing/deduplication/onion/src/hashdup.c
dalla_data_processing/deduplication/onion/src/hashgen.c
dalla_data_processing/deduplication/onion/src/onion
dalla_data_processing/deduplication/onion/src/onion.c
dalla_data_processing/deduplication/onion/src/onion_dup.c
dalla_data_processing/deduplication/onion/src/version.c
dalla_data_processing/deduplication/onion/src/version.h
dalla_data_processing/deduplication/onion/src_sc/.gitignore
dalla_data_processing/deduplication/onion/src_sc/Makefile
dalla_data_processing/deduplication/onion/src_sc/Makefile.g
dalla_data_processing/deduplication/onion/src_sc/buzhash.c
dalla_data_processing/deduplication/onion/src_sc/buzhash.h
dalla_data_processing/deduplication/onion/src_sc/hashdup
dalla_data_processing/deduplication/onion/src_sc/hashdup.c
dalla_data_processing/deduplication/onion/src_sc/hashgen
dalla_data_processing/deduplication/onion/src_sc/hashgen.c
dalla_data_processing/deduplication/onion/src_sc/onion.c
dalla_data_processing/deduplication/onion/src_sc/onion_dup.c
dalla_data_processing/deduplication/onion/src_sc/version.c
dalla_data_processing/deduplication/onion/src_sc/version.h
dalla_data_processing/packing/README.md
dalla_data_processing/packing/__init__.py
dalla_data_processing/packing/dataset_packer.py
dalla_data_processing/packing/pack_config.example.yaml
dalla_data_processing/quality/README.md
dalla_data_processing/quality/__init__.py
dalla_data_processing/quality/checker.py
dalla_data_processing/readability/README.md
dalla_data_processing/readability/__init__.py
dalla_data_processing/readability/ranking.py
dalla_data_processing/readability/scorer.py
dalla_data_processing/stemming/README.md
dalla_data_processing/stemming/__init__.py
dalla_data_processing/stemming/stemmer.py
dalla_data_processing/stemming/data/words_al.txt
dalla_data_processing/stemming/data/words_al_t.txt
dalla_data_processing/stemming/data/words_t.txt
dalla_data_processing/utils/__init__.py
dalla_data_processing/utils/logger.py
dalla_data_processing/utils/tokenize.py
scripts/build_onion.sh
scripts/release.sh