LICENSE
MANIFEST.in
README.md
pyproject.toml
assets/logo.png
configs/claude-haiku-4.5.yaml
configs/gemma4-31b-it.yaml
configs/gpt-4o-mini.yaml
configs/gpt-5.4-mini.yaml
configs/granite-guardian-3.1-8b.yaml
configs/granite-guardian-3.2-5b.yaml
configs/llama-guard-2.yaml
configs/llama-guard-3-1b.yaml
configs/llama-guard-3-8b.yaml
configs/llama-prompt-guard-22m.yaml
configs/md-judge.yaml
configs/prompt-guard-86m.yaml
configs/qwen3guard-0.6b.yaml
configs/qwen3guard-4b.yaml
configs/shieldgemma-9b.yaml
configs/toxicchat-t5.yaml
configs/wildguard.yaml
docs/index.md
docs/assets/logo.png
docs/cli/reference.md
docs/datasets/image.md
docs/datasets/local.md
docs/datasets/overview.md
docs/datasets/text.md
docs/developer/adding-datasets.md
docs/developer/adding-models.md
docs/developer/architecture.md
docs/developer/plugins-and-presets.md
docs/getting-started/installation.md
docs/getting-started/quickstart.md
docs/getting-started/run-modes.md
docs/getting-started/troubleshooting.md
docs/metrics/overview.md
docs/models/anthropic.md
docs/models/http.md
docs/models/huggingface.md
docs/models/openai-compatible.md
docs/models/openai-moderation.md
docs/models/overview.md
docs/models/vllm.md
docs/stylesheets/extra.css
docs/user-guide/benchmark-packs.md
docs/user-guide/benchmark-selection.md
docs/user-guide/common-workflows.md
docs/user-guide/configuration.md
docs/user-guide/resume.md
docs/user-guide/run-artifacts.md
examples/llavaguard-local-image-jsonl.yaml
examples/openai-compatible-local-image-jsonl.yaml
examples/openai-moderation-safe-vs-unsafe-image-edits.yaml
examples/run-hf-mock-jsonl.yaml
examples/run-mock-csv.yaml
examples/run-mock-jsonl-auto-resume.yaml
examples/run-mock-jsonl.yaml
examples/run-openai-mock-jsonl.yaml
examples/run-vllm-llama-guard.yaml
examples/shieldgemma2-local-image-dir.yaml
examples/datasets/local_images.jsonl
examples/datasets/mock_samples.csv
examples/datasets/mock_samples.jsonl
examples/datasets/audio/silence.wav
examples/datasets/local_directory/metadata.json
examples/datasets/local_directory/test.jsonl
examples/datasets/local_image_dir/safe/chart.png
examples/datasets/local_image_dir/safe/landscape.png
examples/datasets/local_image_dir/unsafe/policy_violation.png
examples/datasets/local_images/ambiguous_label.png
examples/datasets/local_images/safe_chart.png
examples/datasets/local_images/safe_landscape.png
src/geh.egg-info/PKG-INFO
src/geh.egg-info/SOURCES.txt
src/geh.egg-info/dependency_links.txt
src/geh.egg-info/entry_points.txt
src/geh.egg-info/requires.txt
src/geh.egg-info/top_level.txt
src/guard_eval_harness/__init__.py
src/guard_eval_harness/__main__.py
src/guard_eval_harness/judgment.py
src/guard_eval_harness/benchmarks/__init__.py
src/guard_eval_harness/benchmarks/packs.py
src/guard_eval_harness/benchmarks/presets.py
src/guard_eval_harness/cli/__init__.py
src/guard_eval_harness/cli/main.py
src/guard_eval_harness/config/__init__.py
src/guard_eval_harness/config/loading.py
src/guard_eval_harness/config/models.py
src/guard_eval_harness/datasets/__init__.py
src/guard_eval_harness/datasets/aart.py
src/guard_eval_harness/datasets/advbench_behaviors.py
src/guard_eval_harness/datasets/advbench_strings.py
src/guard_eval_harness/datasets/aegis_ai_content_safety_dataset_2.py
src/guard_eval_harness/datasets/agent_harm.py
src/guard_eval_harness/datasets/ai_vs_real.py
src/guard_eval_harness/datasets/anthropic_hh_rlhf.py
src/guard_eval_harness/datasets/base.py
src/guard_eval_harness/datasets/beaver_tails_330k.py
src/guard_eval_harness/datasets/bot_adversarial_dialogue.py
src/guard_eval_harness/datasets/cat_qa.py
src/guard_eval_harness/datasets/circleguardbench_public.py
src/guard_eval_harness/datasets/civil_comments.py
src/guard_eval_harness/datasets/convabuse.py
src/guard_eval_harness/datasets/decodingtrust_stereotypes.py
src/guard_eval_harness/datasets/dices_350.py
src/guard_eval_harness/datasets/dices_990.py
src/guard_eval_harness/datasets/do_anything_now_questions.py
src/guard_eval_harness/datasets/do_not_answer.py
src/guard_eval_harness/datasets/dynahate.py
src/guard_eval_harness/datasets/ethos.py
src/guard_eval_harness/datasets/guardrailsai_jailbreak.py
src/guard_eval_harness/datasets/harm_eval.py
src/guard_eval_harness/datasets/harmbench_behaviors.py
src/guard_eval_harness/datasets/harmful_q.py
src/guard_eval_harness/datasets/harmful_qa.py
src/guard_eval_harness/datasets/harmful_qa_questions.py
src/guard_eval_harness/datasets/hate_speech_offensive.py
src/guard_eval_harness/datasets/hatecheck.py
src/guard_eval_harness/datasets/hateful_memes.py
src/guard_eval_harness/datasets/hatemoji_check.py
src/guard_eval_harness/datasets/hatexplain.py
src/guard_eval_harness/datasets/hex_phi.py
src/guard_eval_harness/datasets/holisafe_bench.py
src/guard_eval_harness/datasets/i_cona.py
src/guard_eval_harness/datasets/i_controversial.py
src/guard_eval_harness/datasets/i_malicious_instructions.py
src/guard_eval_harness/datasets/i_physical_safety.py
src/guard_eval_harness/datasets/imagenet1k_val_safe.py
src/guard_eval_harness/datasets/implicit_hate.py
src/guard_eval_harness/datasets/jailbreakbench.py
src/guard_eval_harness/datasets/jailbreakv_28k.py
src/guard_eval_harness/datasets/jbb_behaviors.py
src/guard_eval_harness/datasets/jigsaw_toxicity.py
src/guard_eval_harness/datasets/local_csv.py
src/guard_eval_harness/datasets/local_image_dir.py
src/guard_eval_harness/datasets/local_image_jsonl.py
src/guard_eval_harness/datasets/local_jsonl.py
src/guard_eval_harness/datasets/malicious_instruct.py
src/guard_eval_harness/datasets/measuring_hate_speech.py
src/guard_eval_harness/datasets/media_cache.py
src/guard_eval_harness/datasets/mitre.py
src/guard_eval_harness/datasets/mlcommons_ailuminate.py
src/guard_eval_harness/datasets/mm_safetybench.py
src/guard_eval_harness/datasets/msts.py
src/guard_eval_harness/datasets/multimodal_base.py
src/guard_eval_harness/datasets/multimodal_source_backed.py
src/guard_eval_harness/datasets/niche_hazard_qa.py
src/guard_eval_harness/datasets/olid.py
src/guard_eval_harness/datasets/openai_moderation_dataset.py
src/guard_eval_harness/datasets/openai_moderation_eval.py
src/guard_eval_harness/datasets/or_bench.py
src/guard_eval_harness/datasets/pku_safe_rlhf.py
src/guard_eval_harness/datasets/prosocial_dialog.py
src/guard_eval_harness/datasets/real_toxicity_prompts.py
src/guard_eval_harness/datasets/safe_text.py
src/guard_eval_harness/datasets/safe_vs_unsafe_image_edits.py
src/guard_eval_harness/datasets/salad_bench.py
src/guard_eval_harness/datasets/sample_cache.py
src/guard_eval_harness/datasets/self_harm_image_dataset.py
src/guard_eval_harness/datasets/simple_safety_tests.py
src/guard_eval_harness/datasets/social_bias_frames.py
src/guard_eval_harness/datasets/sorry_bench.py
src/guard_eval_harness/datasets/source_backed.py
src/guard_eval_harness/datasets/strong_reject_instructions.py
src/guard_eval_harness/datasets/tdc_red_teaming.py
src/guard_eval_harness/datasets/tech_hazard_qa.py
src/guard_eval_harness/datasets/toxic_chat.py
src/guard_eval_harness/datasets/toxigen.py
src/guard_eval_harness/datasets/tweet_eval_hate.py
src/guard_eval_harness/datasets/unsafebench.py
src/guard_eval_harness/datasets/violence_image_dataset.py
src/guard_eval_harness/datasets/vlsbench.py
src/guard_eval_harness/datasets/wildguardmix.py
src/guard_eval_harness/datasets/wildjailbreak.py
src/guard_eval_harness/datasets/xstest.py
src/guard_eval_harness/execution/__init__.py
src/guard_eval_harness/execution/artifacts.py
src/guard_eval_harness/execution/runner.py
src/guard_eval_harness/exports/__init__.py
src/guard_eval_harness/exports/summary.py
src/guard_eval_harness/metrics/__init__.py
src/guard_eval_harness/metrics/binary.py
src/guard_eval_harness/models/__init__.py
src/guard_eval_harness/models/_async_dispatch.py
src/guard_eval_harness/models/anthropic.py
src/guard_eval_harness/models/base.py
src/guard_eval_harness/models/catalog.py
src/guard_eval_harness/models/granite_guardian.py
src/guard_eval_harness/models/hf_gemma4_vlm.py
src/guard_eval_harness/models/hf_image_classifier.py
src/guard_eval_harness/models/hf_safeqwen_vlm.py
src/guard_eval_harness/models/hf_shieldgemma2.py
src/guard_eval_harness/models/hf_vlm_guard.py
src/guard_eval_harness/models/http.py
src/guard_eval_harness/models/huggingface.py
src/guard_eval_harness/models/llama_guard.py
src/guard_eval_harness/models/mock.py
src/guard_eval_harness/models/multimodal.py
src/guard_eval_harness/models/openai_compatible.py
src/guard_eval_harness/models/openai_moderation.py
src/guard_eval_harness/models/templates.py
src/guard_eval_harness/models/vllm_adapter.py
src/guard_eval_harness/models/catalog/claude-haiku-4.5.yaml
src/guard_eval_harness/models/catalog/gemma4-31b-it.yaml
src/guard_eval_harness/models/catalog/gpt-4o-mini.yaml
src/guard_eval_harness/models/catalog/gpt-5.4-mini.yaml
src/guard_eval_harness/models/catalog/granite-guardian-3.1-8b.yaml
src/guard_eval_harness/models/catalog/granite-guardian-3.2-5b.yaml
src/guard_eval_harness/models/catalog/llama-guard-2.yaml
src/guard_eval_harness/models/catalog/llama-guard-3-1b.yaml
src/guard_eval_harness/models/catalog/llama-guard-3-8b.yaml
src/guard_eval_harness/models/catalog/llama-prompt-guard-22m.yaml
src/guard_eval_harness/models/catalog/md-judge.yaml
src/guard_eval_harness/models/catalog/prompt-guard-86m.yaml
src/guard_eval_harness/models/catalog/qwen3guard-0.6b.yaml
src/guard_eval_harness/models/catalog/qwen3guard-4b.yaml
src/guard_eval_harness/models/catalog/shieldgemma-9b.yaml
src/guard_eval_harness/models/catalog/toxicchat-t5.yaml
src/guard_eval_harness/models/catalog/wildguard.yaml
src/guard_eval_harness/models/vllm_plugins/__init__.py
src/guard_eval_harness/plugins/__init__.py
src/guard_eval_harness/plugins/discovery.py
src/guard_eval_harness/registry/__init__.py
src/guard_eval_harness/registry/core.py
src/guard_eval_harness/reports/__init__.py
src/guard_eval_harness/reports/summary.py
src/guard_eval_harness/schemas/__init__.py
src/guard_eval_harness/schemas/core.py
tests/__init__.py
tests/test_benchmark_presets.py
tests/test_cli.py
tests/test_cli_commands.py
tests/test_config.py
tests/test_datasets_builtin.py
tests/test_datasets_holisafe_bench.py
tests/test_datasets_image_phase2.py
tests/test_datasets_imagenet1k.py
tests/test_datasets_local.py
tests/test_datasets_local_csv.py
tests/test_datasets_local_image_phase4.py
tests/test_datasets_local_jsonl.py
tests/test_datasets_phase4.py
tests/test_datasets_source_backed.py
tests/test_exports_summary.py
tests/test_media_cache.py
tests/test_metrics_binary.py
tests/test_model_catalog.py
tests/test_models_anthropic.py
tests/test_models_guardreasoner_vl.py
tests/test_models_hf_image_classifier.py
tests/test_models_hf_safeqwen_vlm.py
tests/test_models_hf_shieldgemma2.py
tests/test_models_hf_vlm_guard.py
tests/test_models_http.py
tests/test_models_huggingface.py
tests/test_models_llama_guard.py
tests/test_models_omni_moderation_image.py
tests/test_models_openai_compatible.py
tests/test_models_openai_moderation.py
tests/test_models_vllm.py
tests/test_models_vlm_adapter_imports.py
tests/test_packs.py
tests/test_registry.py
tests/test_runner.py
tests/test_sample_cache.py
tests/test_schemas.py
tests/test_templates.py