Metadata-Version: 2.4
Name: nemo-curator
Version: 1.2.0
Summary: Scalable Data Preprocessing Tool for Training Large Language Models
Author-email: Ayush Dattagupta <adattagupta@nvidia.com>, Abhinav Garg <abhgarg@nvidia.com>, Praateek Mahajan <praateekm@nvidia.com>, Sarah Yurick <syurick@nvidia.com>, Vibhu Jawa <vjawa@nvidia.com>
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: <3.13,>=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: absl-py<3.0.0,>=2.0.0
Requires-Dist: comment_parser
Requires-Dist: cosmos-xenna==0.2.0
Requires-Dist: fsspec
Requires-Dist: hydra-core
Requires-Dist: jieba==0.42.1
Requires-Dist: loguru
Requires-Dist: mecab-python3
Requires-Dist: omegaconf
Requires-Dist: openai>=1.0.0
Requires-Dist: pandas>=2.1.0
Requires-Dist: pyarrow
Requires-Dist: ray[data,default]>=2.54
Requires-Dist: torch
Requires-Dist: transformers
Provides-Extra: cuda12
Requires-Dist: gpustat; extra == "cuda12"
Requires-Dist: nvidia-ml-py; extra == "cuda12"
Provides-Extra: vllm
Requires-Dist: vllm>=0.14.1; (platform_machine == "x86_64" and platform_system != "Darwin") and extra == "vllm"
Provides-Extra: inference-server
Requires-Dist: nemo_curator[cuda12]; extra == "inference-server"
Requires-Dist: nemo_curator[vllm]; extra == "inference-server"
Requires-Dist: boto3>=1.35; extra == "inference-server"
Requires-Dist: nixl-cu12>=0.10.0; (platform_machine == "x86_64" and platform_system != "Darwin") and extra == "inference-server"
Requires-Dist: ray[llm,serve]>=2.54; extra == "inference-server"
Requires-Dist: vllm<0.16.0; (platform_machine == "x86_64" and platform_system != "Darwin") and extra == "inference-server"
Provides-Extra: deduplication-cuda12
Requires-Dist: cudf-cu12==25.10.*; extra == "deduplication-cuda12"
Requires-Dist: cuml-cu12==25.10.*; extra == "deduplication-cuda12"
Requires-Dist: scikit-learn<1.8.0; extra == "deduplication-cuda12"
Requires-Dist: pylibcugraph-cu12==25.10.*; extra == "deduplication-cuda12"
Requires-Dist: pylibraft-cu12==25.10.*; extra == "deduplication-cuda12"
Requires-Dist: raft-dask-cu12==25.10.*; extra == "deduplication-cuda12"
Requires-Dist: rapidsmpf-cu12==25.10.*; extra == "deduplication-cuda12"
Provides-Extra: audio-common
Requires-Dist: nemo_toolkit[asr]>=2.7.2; (platform_machine == "x86_64" and platform_system != "Darwin") and extra == "audio-common"
Requires-Dist: soundfile>=0.12.0; extra == "audio-common"
Requires-Dist: torchaudio; extra == "audio-common"
Requires-Dist: onnx>=1.19.0; extra == "audio-common"
Requires-Dist: silero-vad; extra == "audio-common"
Requires-Dist: librosa; extra == "audio-common"
Requires-Dist: scipy; extra == "audio-common"
Requires-Dist: pydub>=0.25.1; extra == "audio-common"
Provides-Extra: audio-cpu
Requires-Dist: nemo_curator[audio_common]; extra == "audio-cpu"
Requires-Dist: onnxruntime<1.24,>=1.20.1; extra == "audio-cpu"
Provides-Extra: audio-cuda12
Requires-Dist: nemo_curator[audio_common]; extra == "audio-cuda12"
Requires-Dist: nemo_curator[cuda12]; extra == "audio-cuda12"
Requires-Dist: nvidia-cudnn-cu12; extra == "audio-cuda12"
Requires-Dist: onnxruntime-gpu<1.24,>=1.20.1; platform_machine == "x86_64" and extra == "audio-cuda12"
Requires-Dist: torchcodec; (platform_machine == "x86_64" and platform_system != "Darwin") and extra == "audio-cuda12"
Provides-Extra: image-cpu
Requires-Dist: Pillow; extra == "image-cpu"
Requires-Dist: torchvision; extra == "image-cpu"
Provides-Extra: image-cuda12
Requires-Dist: nemo_curator[image_cpu]; extra == "image-cuda12"
Requires-Dist: nemo_curator[cuda12]; extra == "image-cuda12"
Requires-Dist: nemo_curator[deduplication_cuda12]; extra == "image-cuda12"
Requires-Dist: nvidia-dali-cuda120; extra == "image-cuda12"
Provides-Extra: text-cpu
Requires-Dist: beautifulsoup4; extra == "text-cpu"
Requires-Dist: justext; extra == "text-cpu"
Requires-Dist: lxml>=6.1.0; extra == "text-cpu"
Requires-Dist: pycld2; extra == "text-cpu"
Requires-Dist: resiliparse; extra == "text-cpu"
Requires-Dist: s5cmd; extra == "text-cpu"
Requires-Dist: trafilatura==2.0.0; extra == "text-cpu"
Requires-Dist: warcio; extra == "text-cpu"
Requires-Dist: fasttext==0.9.3; extra == "text-cpu"
Requires-Dist: sentencepiece; extra == "text-cpu"
Requires-Dist: mwparserfromhell==0.6.5; extra == "text-cpu"
Requires-Dist: peft; extra == "text-cpu"
Requires-Dist: ftfy; extra == "text-cpu"
Requires-Dist: sentence-transformers; extra == "text-cpu"
Provides-Extra: text-cuda12
Requires-Dist: nemo_curator[cuda12]; extra == "text-cuda12"
Requires-Dist: nemo_curator[deduplication_cuda12]; extra == "text-cuda12"
Requires-Dist: nemo_curator[text_cpu]; extra == "text-cuda12"
Requires-Dist: nemo_curator[vllm]; extra == "text-cuda12"
Provides-Extra: video-cpu
Requires-Dist: av==15.1.0; extra == "video-cpu"
Requires-Dist: opencv-python; extra == "video-cpu"
Requires-Dist: torchvision; extra == "video-cpu"
Requires-Dist: einops; extra == "video-cpu"
Requires-Dist: easydict; extra == "video-cpu"
Provides-Extra: video-cuda12
Requires-Dist: nemo_curator[video_cpu]; extra == "video-cuda12"
Requires-Dist: nemo_curator[cuda12]; extra == "video-cuda12"
Requires-Dist: nemo_curator[vllm]; extra == "video-cuda12"
Requires-Dist: cvcuda_cu12; extra == "video-cuda12"
Requires-Dist: flash-attn<=2.8.3; (platform_machine == "x86_64" and platform_system != "Darwin") and extra == "video-cuda12"
Requires-Dist: pycuda; extra == "video-cuda12"
Requires-Dist: PyNvVideoCodec==2.0.2; (platform_machine == "x86_64" and platform_system != "Darwin") and extra == "video-cuda12"
Requires-Dist: torch<=2.9.1; extra == "video-cuda12"
Requires-Dist: torchaudio; extra == "video-cuda12"
Provides-Extra: math-cpu
Requires-Dist: nemo_curator[text_cpu]; extra == "math-cpu"
Requires-Dist: boto3>=1.35; extra == "math-cpu"
Provides-Extra: math-cuda12
Requires-Dist: nemo_curator[math_cpu]; extra == "math-cuda12"
Requires-Dist: nemo_curator[cuda12]; extra == "math-cuda12"
Requires-Dist: nemo_curator[deduplication_cuda12]; extra == "math-cuda12"
Requires-Dist: vllm>=0.13; (platform_machine == "x86_64" and platform_system != "Darwin") and extra == "math-cuda12"
Provides-Extra: interleaved-cpu
Requires-Dist: albumentations; extra == "interleaved-cpu"
Requires-Dist: open_clip_torch; extra == "interleaved-cpu"
Requires-Dist: opencv-python; extra == "interleaved-cpu"
Requires-Dist: Pillow; extra == "interleaved-cpu"
Requires-Dist: pypdfium2; extra == "interleaved-cpu"
Requires-Dist: s3fs>=2024.12.0; extra == "interleaved-cpu"
Requires-Dist: timm; extra == "interleaved-cpu"
Provides-Extra: interleaved-cuda12
Requires-Dist: nemo_curator[interleaved_cpu]; extra == "interleaved-cuda12"
Requires-Dist: nemo_curator[cuda12]; extra == "interleaved-cuda12"
Requires-Dist: nemo_curator[vllm]; extra == "interleaved-cuda12"
Provides-Extra: sdg-cpu
Requires-Dist: data-designer==0.5.5; extra == "sdg-cpu"
Provides-Extra: sdg-cuda12
Requires-Dist: nemo_curator[cuda12]; extra == "sdg-cuda12"
Requires-Dist: nemo_curator[sdg_cpu]; extra == "sdg-cuda12"
Requires-Dist: nemo_curator[inference_server]; extra == "sdg-cuda12"
Provides-Extra: all
Requires-Dist: nemo_curator[audio_cuda12]; extra == "all"
Requires-Dist: nemo_curator[image_cuda12]; extra == "all"
Requires-Dist: nemo_curator[inference_server]; extra == "all"
Requires-Dist: nemo_curator[interleaved_cuda12]; extra == "all"
Requires-Dist: nemo_curator[math_cuda12]; extra == "all"
Requires-Dist: nemo_curator[sdg_cuda12]; extra == "all"
Requires-Dist: nemo_curator[text_cuda12]; extra == "all"
Requires-Dist: nemo_curator[video_cuda12]; extra == "all"
Dynamic: license-file

<div align="center">

  <a href="https://github.com/NVIDIA-NeMo/Curator/blob/main/LICENSE">![License](https://img.shields.io/github/license/NVIDIA-NeMo/Curator)</a>
  <a href="https://codecov.io/github/NVIDIA-NeMo/Curator">![codecov](https://codecov.io/github/NVIDIA-NeMo/Curator/graph/badge.svg)</a>
  <a href="https://pypi.org/project/nemo-curator/">![Supported Python versions](https://img.shields.io/pypi/pyversions/nemo-curator.svg)</a>
  <a href="https://github.com/NVIDIA-NeMo/Curator/graphs/contributors">![Contributors](https://img.shields.io/github/contributors/NVIDIA-NeMo/Curator)</a>
  <a href="https://github.com/NVIDIA-NeMo/Curator/releases">![Latest release](https://img.shields.io/github/release/NVIDIA-NeMo/Curator)</a>
  <a href="https://pypi.org/project/nemo-curator/">![Open source](https://badgen.net/badge/open%20source/❤/blue?icon=github)</a>

</div>

# NVIDIA NeMo Curator

**GPU-accelerated data curation for training better AI models, faster.** Scale from laptop to multi-node clusters with modular pipelines for text, images, video, and audio.

> *Part of the [NVIDIA NeMo](https://www.nvidia.com/en-us/ai-data-science/products/nemo/) software suite for managing the AI agent lifecycle.*

## What You Can Do

| Modality | Key Capabilities | Get Started |
|----------|-----------------|-------------|
| **Text** | Deduplication • Classification • Quality Filtering • Language Detection | [Text Guide](https://docs.nvidia.com/nemo/curator/latest/get-started/text.html) |
| **Image** | Aesthetic Filtering • NSFW Detection • Embedding Generation • Deduplication | [Image Guide](https://docs.nvidia.com/nemo/curator/latest/get-started/image.html) |
| **Video** | Scene Detection • Clip Extraction • Motion Filtering • Deduplication | [Video Guide](https://docs.nvidia.com/nemo/curator/latest/get-started/video.html) |
| **Audio** | ASR Transcription • Quality Assessment • WER Filtering | [Audio Guide](https://docs.nvidia.com/nemo/curator/latest/get-started/audio.html) |

## Quick Start

```bash
# Install for your modality
uv pip install "nemo-curator[text_cuda12]"

# Run the quickstart example
python tutorials/quickstart.py
```

**Full setup:** [Installation Guide](https://docs.nvidia.com/nemo/curator/latest/admin/installation.html) • [Docker](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo-curator) • [Tutorials](https://github.com/NVIDIA-NeMo/Curator/tree/main/tutorials)

---

## Features by Modality

### Text Curation

Process and curate high-quality text datasets for large language model (LLM) training with multilingual support.

| Category | Features | Documentation |
|----------|----------|---------------|
| **Data Sources** | Common Crawl • Wikipedia • ArXiv • Custom datasets | [Load Data](https://docs.nvidia.com/nemo/curator/latest/curate-text/load-data/index.html) |
| **Quality Filtering** | 30+ heuristic filters • fastText classification • GPU-accelerated classifiers for domain, quality, safety, and content type | [Quality Assessment](https://docs.nvidia.com/nemo/curator/latest/curate-text/process-data/quality-assessment/heuristic.html) |
| **Deduplication** | Exact • Fuzzy (MinHash LSH) • Semantic (GPU-accelerated) | [Deduplication](https://docs.nvidia.com/nemo/curator/latest/curate-text/process-data/deduplication/index.html) |
| **Processing** | Text cleaning • Language identification | [Content Processing](https://docs.nvidia.com/nemo/curator/latest/curate-text/process-data/content-processing/text-cleaning.html) |

---

### Image Curation

Curate large-scale image datasets for vision language models (VLMs) and generative AI training.

| Category | Features | Documentation |
|----------|----------|---------------|
| **Data Loading** | WebDataset format • Large-scale image-text pairs | [Load Data](https://docs.nvidia.com/nemo/curator/latest/curate-images/load-data/index.html) |
| **Embeddings** | CLIP embeddings for semantic analysis | [Embeddings](https://docs.nvidia.com/nemo/curator/latest/curate-images/process-data/embeddings/index.html) |
| **Filtering** | Aesthetic quality scoring • NSFW detection | [Filters](https://docs.nvidia.com/nemo/curator/latest/curate-images/process-data/filters/index.html) |

---

### Video Curation

Process large-scale video corpora with distributed, GPU-accelerated pipelines for world foundation models (WFMs).

| Category | Features | Documentation |
|----------|----------|---------------|
| **Data Loading** | Local paths • S3-compatible storage • HTTP(S) URLs | [Load Data](https://docs.nvidia.com/nemo/curator/latest/curate-video/load-data/index.html) |
| **Clipping** | Fixed-stride splitting • Scene-change detection (TransNetV2) | [Clipping](https://docs.nvidia.com/nemo/curator/latest/curate-video/process-data/clipping.html) |
| **Processing** | GPU H.264 encoding • Frame extraction • Motion filtering • Aesthetic filtering | [Processing](https://docs.nvidia.com/nemo/curator/latest/curate-video/process-data/filtering.html) |
| **Embeddings** | Cosmos-Embed1 for clip-level embeddings | [Embeddings](https://docs.nvidia.com/nemo/curator/latest/curate-video/process-data/embeddings.html) |
| **Deduplication** | K-means clustering • Pairwise similarity for near-duplicates | [Deduplication](https://docs.nvidia.com/nemo/curator/latest/curate-video/process-data/dedup.html) |

---

### Audio Curation

Prepare high-quality speech datasets for automatic speech recognition (ASR) and multimodal AI training.

| Category | Features | Documentation |
|----------|----------|---------------|
| **Data Loading** | Local files • Custom manifests • Public datasets (FLEURS) | [Load Data](https://docs.nvidia.com/nemo/curator/latest/curate-audio/load-data/index.html) |
| **ASR Processing** | NeMo Framework pretrained models • Automatic transcription | [ASR Inference](https://docs.nvidia.com/nemo/curator/latest/curate-audio/process-data/asr-inference/index.html) |
| **Quality Assessment** | Word Error Rate (WER) calculation • Duration analysis • Quality-based filtering | [Quality Assessment](https://docs.nvidia.com/nemo/curator/latest/curate-audio/process-data/quality-assessment/index.html) |
| **Integration** | Text curation workflow integration for multimodal pipelines | [Text Integration](https://docs.nvidia.com/nemo/curator/latest/curate-audio/process-data/text-integration/index.html) |

---

## Why NeMo Curator?

### Performance at Scale

NeMo Curator leverages NVIDIA RAPIDS™ libraries such as cuDF, cuML, and cuGraph along with Ray to scale workloads across multi-node, multi-GPU environments.

**Proven Results:**
- **16× faster** fuzzy deduplication on 8 TB RedPajama v2 (1.78 trillion tokens)
- **40% lower** total cost of ownership (TCO) compared to CPU-based alternatives
- **Near-linear scaling** from one to four H100 80 GB nodes (2.05 hrs → 0.50 hrs)

**Real-World Recipe:** The [Nemotron-CC curation pipeline](https://github.com/NVIDIA-NeMo/Nemotron/tree/main/src/nemotron/recipes/data_curation/nemotron-cc) uses NeMo Curator end-to-end — from Common Crawl extraction through language identification, exact/fuzzy/substring deduplication, ensemble quality classification, and LLM-based synthetic data generation (SDG) — to reproduce the [Nemotron-CC datasets](https://huggingface.co/datasets/nvidia/Nemotron-CC-v2). The SDG stage is also available as an [in-repo tutorial](https://github.com/NVIDIA-NeMo/Curator/tree/main/tutorials/synthetic/nemotron_cc/).

<p align="center">
  <img src="https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/main/docs/_images/text-benchmarks.png" alt="Performance benchmarks showing 16x speed improvement, 40% cost savings, and near-linear scaling" width="700"/>
</p>

### Quality Improvements

Data curation modules measurably improve model performance. In ablation studies using a 357M-parameter GPT model trained on curated Common Crawl data:

<p align="center">
  <img src="https://raw.githubusercontent.com/NVIDIA-NeMo/Curator/main/docs/_images/ablation.png" alt="Model accuracy improvements across curation pipeline stages" width="700"/>
</p>

**Results:** Progressive improvements in zero-shot downstream task performance through text cleaning, deduplication, and quality filtering stages.

---

## Learn More

| Resource | Links |
|----------|-------|
| **Documentation** | [Main Docs](https://docs.nvidia.com/nemo/curator/latest/) • [API Reference](https://docs.nvidia.com/nemo/curator/latest/apidocs/index.html) • [Concepts](https://docs.nvidia.com/nemo/curator/latest/about/concepts/index.html) |
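| **Tutorials** | [Text](https://github.com/NVIDIA-NeMo/Curator/tree/main/tutorials/text/) • [Image](https://github.com/NVIDIA-NeMo/Curator/tree/main/tutorials/image/) • [Video](https://github.com/NVIDIA-NeMo/Curator/tree/main/tutorials/video/) • [Audio](https://github.com/NVIDIA-NeMo/Curator/tree/main/tutorials/audio/) |
| **Recipes** | [Nemotron-CC: end-to-end web data curation](https://github.com/NVIDIA-NeMo/Nemotron/tree/main/src/nemotron/recipes/data_curation/nemotron-cc) • [SDG tutorial (in-repo)](https://github.com/NVIDIA-NeMo/Curator/tree/main/tutorials/synthetic/nemotron_cc/) |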
| **Deployment** | [Installation](https://docs.nvidia.com/nemo/curator/latest/admin/installation.html) • [Infrastructure](https://docs.nvidia.com/nemo/curator/latest/reference/infrastructure/index.html) |
| **Community** | [GitHub Discussions](https://github.com/NVIDIA-NeMo/Curator/discussions) • [Issues](https://github.com/NVIDIA-NeMo/Curator/issues) |

---

## Contribute

We welcome community contributions! Please refer to [CONTRIBUTING.md](https://github.com/NVIDIA-NeMo/Curator/blob/main/CONTRIBUTING.md) for guidelines.
