# AttackLM .gitignore
# ==================
# This file controls what NOT to commit to the AttackLM public repo.
# Rationale per section is below.

# ----- Python -----
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# src/attacklm package — keep tracked, ignore build artifacts only
# NOTE(review): no earlier pattern in this file matches these three paths
# (*.py[cod] covers .pyc/.pyo/.pyd, not .py), so the negations below are
# currently no-ops. A negation also cannot re-include a file whose parent
# directory is ignored.
!src/attacklm/__init__.py
!src/attacklm/__version__.py
!src/attacklm/cli.py

# ----- Virtual environments -----
.venv/
venv/
env/
ENV/
env.bak/
venv.bak/

# ----- Environment / secrets -----
.env
.envrc
.env.*
!.env.example
*.pem
*.key

# ----- IDE / editor -----
.idea/
.vscode/
*.swp
*.swo
*~
.DS_Store
Thumbs.db
*.sublime-project
*.sublime-workspace

# ----- Logs (keep the directory, exclude content) -----
logs/*.log
logs/*.json
logs/*.tmp
!logs/.gitkeep

# ----- HPO trial outputs (keep hpo_state.json, exclude trial CSVs and adapter dirs) -----
hpo_runs/*_trial*/
hpo_runs/*.csv
hpo_runs/*.dataset_stats.json
!hpo_runs/.gitkeep
!hpo_runs/hpo_state.json

# ----- Trained models (large, regenerable) -----
# No leading slash: this matches a directory named models/ at any depth.
models/
# Backups of previous runs (v0.1.6+ round-2 SFT backup tarballs)
# Lives at models/.backups/{name}.tar.gz; 4-5 GB each, never commit.
# Redundant with models/ above (that rule already ignores everything inside).
models/.backups/
# Heretic / abliteration output (also large, regenerable, may appear at any path)
# NOTE(review): n/ ignores every directory named "n" at any depth — this
# looks like a truncated or mistyped directory name; confirm what was intended.
n/
uncensored/
decensored/
# Weight / checkpoint file formats, ignored wherever they appear.
*.gguf
*.safetensors
*.bin
*.pt
*.pth
*.onnx
checkpoints/
# Directories whose names start with adapter_ or merged_.
adapter_*/
merged_*/

# ----- MLflow / wandb / tensorboard -----
mlruns/
wandb/
*.tfevents.*
runs/

# ----- OS / misc -----
# Scratch, backup and patch-leftover files.
# (*.swp is already ignored in the IDE / editor section, so it is not repeated here.)
*.tmp
*.bak
*.orig
*.rej
nohup.out
# Linux FUSE / desktop trash leftovers.
.fuse_hidden*
.Trash-*

# ----- Coverage / test artifacts -----
.coverage
.coverage.*
htmlcov/
.pytest_cache/
.tox/
.cache
coverage.xml
*.cover
.hypothesis/

# ----- Mypy / lint caches -----
.mypy_cache/
.dmypy.json
dmypy.json
.pyre/
.pytype/
.ruff_cache/

# ----- Local notes (HANDOFF.md / TASKS.md are ignored too; delete their lines below to track them) -----
*.local.md
notes.md
HANDOFF.md
TASKS.md

# ============================================================================
# GENERATED / REGENERABLE DATA — excluded (users generate these themselves)
# ============================================================================
data/datasets/synthetic/
data/datasets/hybrid/
data/datasets/balanced/
data/datasets/combined/*.jsonl
data/*.json


# ============================================================================
# DATA DIRECTORY — explicit per-source rules
# ============================================================================
#
# The data/ tree has two parts:
#   1. CLONED upstream repos (~1.5GB total, have their own .git directories)
#      These are EXCLUDED — clone them yourself with scripts/clone_repos.sh
#   2. DERIVED training data (data/datasets/buckets/*/data.jsonl, ~5MB)
#      These are INCLUDED — they're the project's actual training data
#
# Why exclude the cloned repos?
#   - Massive (1.5GB) — would bloat the repo
#   - Have their own .git history (would confuse git)
#   - Available via clone_repos.sh — one command to get them
#   - Have their own LICENSE files (preserved in derived data)
#
# Why include the derived JSONL?
#   - Small (5MB) — fits in any repo
#   - This IS the training data — the most important artifact
#   - Has source/license attribution (see /ATTRIBUTION.md)
#   - Saves users from running 6 extractors just to start training

# Cloned upstream repos and other local-only directories under data/ — EXCLUDED
data/atomic-red-team/
data/stockpile/
data/sigma/
data/metasploit-framework/
data/infection_monkey/
data/RTA/
data/arsenal/
data/manx/
data/access/
data/ai_tools/
data/manifests/
# Individual files under data/ — also EXCLUDED
# (tool_knowledge.json is also matched by the data/*.json rule; listed explicitly.)
data/tool_knowledge.json
data/DATASET_EXPANSION_REPORT.md
# Bucket layout backup snapshots (created by migrate_buckets_to_v021.py)
data/.bucket_layout_backup/

# Derived training data — INCLUDED
# (no ignore rule is needed for these paths)
# data/datasets/buckets/*/data.jsonl     ← included
# data/datasets/buckets/manifest.json    ← included
# data/ATTRIBUTION.md                    ← included
# data/.gitkeep                          ← included (preserves the directory)
#
# Regenerable dataset outputs — EXCLUDED by the rules in the
# "GENERATED / REGENERABLE DATA" section earlier in this file:
# data/datasets/combined/*.jsonl         ← excluded (regenerable, ~15-17MB each)
# data/datasets/balanced/                ← excluded (regenerable, output of
#                                           scripts/balance_buckets.py)

# HuggingFace dataset exports — regenerable by hf/scripts/build_hf_dataset.py
hf/data/*.jsonl
hf/data/*.parquet

# ----- LFS (don't accidentally commit big files) -----
# NOTE(review): this only ignores files whose names end in ".lfs". It does not
# stop other large files from being committed; Git LFS tracking itself is
# configured in .gitattributes, not here.
*.lfs

# ============================================================================
# ARCHIVE — kept locally for reference, not distributed
# ============================================================================
# 2026-06-11: archived the Textual-based TUI (attacklm-tui) and the legacy
# flat bucket layout (data/datasets/buckets/<bucket>/...). The TUI was
# archived after it caused a 12x slowdown and persistent layout issues; the
# user reverted to the v0.2.3 CLI baseline. The flat layout was archived
# after the per-source layout (data/datasets/buckets/sources/<source>/...)
# became canonical in v0.3.0.
# 2026-06-11: archived three high-risk source buckets (endgameinc/RTA,
# guardicore/infection_monkey, TheBigPromptLibrary) under
# archive/restricted-sources/ — they are NOT redistributed as part of
# AttackLM. See data/LEGAL.md and data/REMOVAL.md.
#
# All archive contents are preserved locally but never pushed to remote.
archive/
