LICENSE
README.md
setup.cfg
setup.py
data_modori/__init__.py
data_modori/analysis/__init__.py
data_modori/analysis/column_wise_analysis.py
data_modori/analysis/diversity_analysis.py
data_modori/analysis/overall_analysis.py
data_modori/config/__init__.py
data_modori/config/config.py
data_modori/core/__init__.py
data_modori/core/analyser.py
data_modori/core/data.py
data_modori/core/executor.py
data_modori/core/exporter.py
data_modori/core/ray_executor.py
data_modori/core/tracer.py
data_modori/format/__init__.py
data_modori/format/csv_formatter.py
data_modori/format/formatter.py
data_modori/format/json_formatter.py
data_modori/format/load.py
data_modori/format/mixture_formatter.py
data_modori/format/parquet_formatter.py
data_modori/format/text_formatter.py
data_modori/format/tsv_formatter.py
data_modori/ops/__init__.py
data_modori/ops/base_op.py
data_modori/ops/load.py
data_modori/ops/op_fusion.py
data_modori/ops/common/__init__.py
data_modori/ops/common/aws_s3_downloader.py
data_modori/ops/common/helper_func.py
data_modori/ops/common/special_characters.py
data_modori/ops/deduplicator/__init__.py
data_modori/ops/deduplicator/document_deduplicator.py
data_modori/ops/deduplicator/document_minhash_deduplicator.py
data_modori/ops/deduplicator/document_simhash_deduplicator.py
data_modori/ops/filter/__init__.py
data_modori/ops/filter/alphanumeric_filter.py
data_modori/ops/filter/average_line_length_filter.py
data_modori/ops/filter/character_repetition_filter.py
data_modori/ops/filter/flagged_words_filter.py
data_modori/ops/filter/language_id_score_filter.py
data_modori/ops/filter/maximum_line_length_filter.py
data_modori/ops/filter/perplexity_filter.py
data_modori/ops/filter/special_characters_filter.py
data_modori/ops/filter/specified_field_filter.py
data_modori/ops/filter/specified_numeric_field_filter.py
data_modori/ops/filter/stopwords_filter.py
data_modori/ops/filter/suffix_filter.py
data_modori/ops/filter/text_length_filter.py
data_modori/ops/filter/token_num_filter.py
data_modori/ops/filter/word_num_filter.py
data_modori/ops/filter/word_repetition_filter.py
data_modori/ops/mapper/__init__.py
data_modori/ops/mapper/clean_copyright_mapper.py
data_modori/ops/mapper/clean_email_mapper.py
data_modori/ops/mapper/clean_html_mapper.py
data_modori/ops/mapper/clean_ip_mapper.py
data_modori/ops/mapper/clean_links_mapper.py
data_modori/ops/mapper/expand_macro_mapper.py
data_modori/ops/mapper/fix_unicode_mapper.py
data_modori/ops/mapper/nlpaug_en_mapper.py
data_modori/ops/mapper/punctuation_normalization_mapper.py
data_modori/ops/mapper/remove_bibliography_mapper.py
data_modori/ops/mapper/remove_comments_mapper.py
data_modori/ops/mapper/remove_header_mapper.py
data_modori/ops/mapper/remove_long_words_mapper.py
data_modori/ops/mapper/remove_specific_chars_mapper.py
data_modori/ops/mapper/remove_table_text_mapper.py
data_modori/ops/mapper/remove_words_with_incorrect_substrings_mapper.py
data_modori/ops/mapper/sentence_split_mapper.py
data_modori/ops/mapper/whitespace_normalization_mapper.py
data_modori/ops/selector/__init__.py
data_modori/ops/selector/frequency_specified_field_selector.py
data_modori/ops/selector/topk_specified_field_selector.py
data_modori/utils/__init__.py
data_modori/utils/asset_utils.py
data_modori/utils/cache_utils.py
data_modori/utils/ckpt_utils.py
data_modori/utils/compress.py
data_modori/utils/constant.py
data_modori/utils/file_utils.py
data_modori/utils/fingerprint_utils.py
data_modori/utils/logger_utils.py
data_modori/utils/mm_utils.py
data_modori/utils/model_utils.py
data_modori/utils/registry.py
py_data_modori.egg-info/PKG-INFO
py_data_modori.egg-info/SOURCES.txt
py_data_modori.egg-info/dependency_links.txt
py_data_modori.egg-info/entry_points.txt
py_data_modori.egg-info/requires.txt
py_data_modori.egg-info/top_level.txt
tools/__init__.py
tools/analyze_data.py
tools/process_data.py