.dockerignore
.gitignore
.pre-commit-config.yaml
.python-version
ATTRIBUTIONS.md
CLAUDE.md
CODE_OF_CONDUCT.md
CONTRIBUTING.md
LICENSE
Makefile
README.md
SECURITY.md
codecov.yml
pyproject.toml
uv.lock
.claude/skills/add-benchmark/SKILL.md
.claude/skills/add-benchmark/references/patterns.md
.claude/skills/nemo-gym-debugging/SKILL.md
.claude/skills/nemo-gym-debugging/references/diagnostic-snippets.md
.claude/skills/nemo-gym-debugging/references/error-profiles.md
.claude/skills/nemo-gym-debugging/references/request-boundary-visibility.md
.claude/skills/nemo-gym-debugging/references/vllm-tool-call-schema-checks.md
.claude/skills/nemo-gym-debugging/scripts/check_tool_call_jsonl.py
.claude/skills/nemo-gym-docs/SKILL.md
.claude/skills/nemo-gym-pivot-datasets/SKILL.md
.claude/skills/nemo-gym-pivot-datasets/references/config-training-and-agent-ref.md
.claude/skills/nemo-gym-pivot-datasets/references/conversion-patterns.md
.claude/skills/nemo-gym-pivot-datasets/references/row-contract.md
.claude/skills/nemo-gym-pivot-datasets/scripts/validate_pivot_dataset.py
.claude/skills/nemo-gym-pivot-datasets/scripts/reference/chat_messages_to_pivot_dataset_reference.py
.claude/skills/nemo-gym-pivot-datasets/scripts/reference/conversational_messages_to_pivot_dataset_reference.py
.claude/skills/nemo-gym-pivot-datasets/scripts/reference/generic_pivot_dataset_reference.py
.claude/skills/nemo-gym-pivot-datasets/scripts/reference/tool_messages_to_pivot_dataset_reference.py
.claude/skills/nemo-gym-reward-profiling/SKILL.md
.claude/skills/nemo-gym-reward-profiling/references/output-format.md
.claude/skills/nemo-gym-reward-profiling/references/quick-start.md
.codex/skills/nemo-gym-debugging/SKILL.md
.codex/skills/nemo-gym-debugging/agents/openai.yaml
.codex/skills/nemo-gym-debugging/references/diagnostic-snippets.md
.codex/skills/nemo-gym-debugging/references/error-profiles.md
.codex/skills/nemo-gym-debugging/references/request-boundary-visibility.md
.codex/skills/nemo-gym-debugging/references/vllm-tool-call-schema-checks.md
.codex/skills/nemo-gym-debugging/scripts/check_tool_call_jsonl.py
.codex/skills/nemo-gym-pivot-datasets/SKILL.md
.codex/skills/nemo-gym-pivot-datasets/agents/openai.yaml
.codex/skills/nemo-gym-pivot-datasets/references/config-training-and-agent-ref.md
.codex/skills/nemo-gym-pivot-datasets/references/conversion-patterns.md
.codex/skills/nemo-gym-pivot-datasets/references/row-contract.md
.codex/skills/nemo-gym-pivot-datasets/scripts/validate_pivot_dataset.py
.codex/skills/nemo-gym-pivot-datasets/scripts/reference/chat_messages_to_pivot_dataset_reference.py
.codex/skills/nemo-gym-pivot-datasets/scripts/reference/conversational_messages_to_pivot_dataset_reference.py
.codex/skills/nemo-gym-pivot-datasets/scripts/reference/generic_pivot_dataset_reference.py
.codex/skills/nemo-gym-pivot-datasets/scripts/reference/tool_messages_to_pivot_dataset_reference.py
.codex/skills/nemo-gym-reward-profiling/SKILL.md
.codex/skills/nemo-gym-reward-profiling/agents/openai.yaml
.codex/skills/nemo-gym-reward-profiling/references/output-format.md
.codex/skills/nemo-gym-reward-profiling/references/quick-start.md
.github/CODEOWNERS
.github/copy-pr-bot.yaml
.github/ISSUE_TEMPLATE/bug.md
.github/ISSUE_TEMPLATE/feature.md
.github/workflows/_build_container.yml
.github/workflows/cherry-pick-release-commit.yml
.github/workflows/close-inactive-issue-pr.yml
.github/workflows/code-linting.yml
.github/workflows/community-bot.yml
.github/workflows/copyright-check.yml
.github/workflows/fern-docs-ci.yml
.github/workflows/fern-docs-preview-build.yml
.github/workflows/fern-docs-preview-comment.yml
.github/workflows/full-test-suite.yml
.github/workflows/publish-fern-docs.yml
.github/workflows/release-freeze.yml
.github/workflows/release.yaml
.github/workflows/request-nvskills-ci.yml
.github/workflows/secrets-detector.yml
.github/workflows/unit-tests.yml
.github/workflows/config/.secrets.baseline
.github/workflows/config/changelog-config.json
benchmarks/.gitignore
benchmarks/aalcr/README.md
benchmarks/aalcr/__init__.py
benchmarks/aalcr/config.yaml
benchmarks/aalcr/prepare.py
benchmarks/aalcr/data/.gitignore
benchmarks/aalcr/data/aalcr_benchmark_metrics.json
benchmarks/aime24/__init__.py
benchmarks/aime24/config.yaml
benchmarks/aime24/prepare.py
benchmarks/aime24-x/README.md
benchmarks/aime24-x/__init__.py
benchmarks/aime24-x/aime24_x_utils.py
benchmarks/aime24-x/config.yaml
benchmarks/aime24-x/prepare.py
benchmarks/aime24-x/data/.gitignore
benchmarks/aime24/data/.gitignore
benchmarks/aime24/data/aime24_benchmark_metrics.json
benchmarks/aime25/__init__.py
benchmarks/aime25/config.yaml
benchmarks/aime25/prepare.py
benchmarks/aime25-x/README.md
benchmarks/aime25-x/__init__.py
benchmarks/aime25-x/aime25_x_utils.py
benchmarks/aime25-x/config.yaml
benchmarks/aime25-x/prepare.py
benchmarks/aime25-x/data/.gitignore
benchmarks/aime25/data/.gitignore
benchmarks/aime25/data/aime25_benchmark_metrics.json
benchmarks/aime26/README.md
benchmarks/aime26/__init__.py
benchmarks/aime26/config.yaml
benchmarks/aime26/prepare.py
benchmarks/aime26/data/.gitignore
benchmarks/answer-judge/README.md
benchmarks/answer-judge/__init__.py
benchmarks/answer-judge/config.yaml
benchmarks/answer-judge/prepare.py
benchmarks/answer-judge/data/.gitignore
benchmarks/apex_shortlist/README.md
benchmarks/apex_shortlist/__init__.py
benchmarks/apex_shortlist/config.yaml
benchmarks/apex_shortlist/prepare.py
benchmarks/apex_shortlist/data/.gitignore
benchmarks/arena_hard/README.md
benchmarks/arena_hard/__init__.py
benchmarks/arena_hard/config.yaml
benchmarks/arena_hard/prepare.py
benchmarks/arena_hard/data/.gitignore
benchmarks/arena_hard_v2/README.md
benchmarks/arena_hard_v2/__init__.py
benchmarks/arena_hard_v2/config.yaml
benchmarks/arena_hard_v2/prepare.py
benchmarks/arena_hard_v2/data/.gitignore
benchmarks/asr_leaderboard/README.md
benchmarks/asr_leaderboard/__init__.py
benchmarks/asr_leaderboard/config.yaml
benchmarks/asr_leaderboard/prepare.py
benchmarks/asr_leaderboard/data/.gitignore
benchmarks/asr_leaderboard/prompts/default.yaml
benchmarks/bigcodebench/README.md
benchmarks/bigcodebench/__init__.py
benchmarks/bigcodebench/config.yaml
benchmarks/bigcodebench/prepare.py
benchmarks/bigcodebench/data/.gitignore
benchmarks/birdbench/README.md
benchmarks/birdbench/__init__.py
benchmarks/birdbench/config.yaml
benchmarks/birdbench/prepare.py
benchmarks/birdbench/data/.gitignore
benchmarks/birdbench/prompts/default.yaml
benchmarks/browsecomp/README.md
benchmarks/browsecomp/__init__.py
benchmarks/browsecomp/config.yaml
benchmarks/browsecomp/prepare.py
benchmarks/browsecomp/data/.gitignore
benchmarks/browsecomp/data/browsecomp_benchmark_metrics.json
benchmarks/finance_sec_search/README.md
benchmarks/finance_sec_search/__init__.py
benchmarks/finance_sec_search/config_no_web_search.yaml
benchmarks/finance_sec_search/config_web_search.yaml
benchmarks/finance_sec_search/prepare.py
benchmarks/finance_sec_search/prepare_web_search.py
benchmarks/finance_sec_search/data/.gitignore
benchmarks/flores200/README.md
benchmarks/flores200/__init__.py
benchmarks/flores200/config.yaml
benchmarks/flores200/prepare.py
benchmarks/flores200/data/.gitignore
benchmarks/flores200/data/example_rollouts.jsonl
benchmarks/flores200/data/flores200_devtest_benchmark_metrics.json
benchmarks/flores200/prompts/default.yaml
benchmarks/frontierscience_olympiad/README.md
benchmarks/frontierscience_olympiad/__init__.py
benchmarks/frontierscience_olympiad/config.yaml
benchmarks/frontierscience_olympiad/prepare.py
benchmarks/frontierscience_olympiad/data/.gitignore
benchmarks/gdpval/README.md
benchmarks/gdpval/__init__.py
benchmarks/gdpval/config.yaml
benchmarks/gdpval/prepare.py
benchmarks/gdpval/data/.gitignore
benchmarks/global-piqa/README.md
benchmarks/global-piqa/__init__.py
benchmarks/global-piqa/config.yaml
benchmarks/global-piqa/prepare.py
benchmarks/global-piqa/data/.gitignore
benchmarks/global-piqa/prompts/default.yaml
benchmarks/gpqa/README.md
benchmarks/gpqa/__init__.py
benchmarks/gpqa/config.yaml
benchmarks/gpqa/prepare.py
benchmarks/gpqa-x/README.md
benchmarks/gpqa-x/__init__.py
benchmarks/gpqa-x/config.yaml
benchmarks/gpqa-x/gpqa_x_utils.py
benchmarks/gpqa-x/prepare.py
benchmarks/gpqa-x/data/.gitignore
benchmarks/gpqa/data/.gitignore
benchmarks/gpqa/data/gpqa_diamond_benchmark_metrics.json
benchmarks/gsm8k/README.md
benchmarks/gsm8k/__init__.py
benchmarks/gsm8k/config.yaml
benchmarks/gsm8k/prepare.py
benchmarks/gsm8k/data/.gitignore
benchmarks/hendrycks_math/README.md
benchmarks/hendrycks_math/__init__.py
benchmarks/hendrycks_math/config.yaml
benchmarks/hendrycks_math/prepare.py
benchmarks/hendrycks_math/data/.gitignore
benchmarks/hle/README.md
benchmarks/hle/__init__.py
benchmarks/hle/config.yaml
benchmarks/hle/prepare.py
benchmarks/hle/data/.gitignore
benchmarks/hle/prompts/default.yaml
benchmarks/hle/prompts/judge.txt
benchmarks/hmmt_feb25/README.md
benchmarks/hmmt_feb25/__init__.py
benchmarks/hmmt_feb25/config.yaml
benchmarks/hmmt_feb25/prepare.py
benchmarks/hmmt_feb25/data/.gitignore
benchmarks/hmmt_nov25/README.md
benchmarks/hmmt_nov25/__init__.py
benchmarks/hmmt_nov25/config.yaml
benchmarks/hmmt_nov25/prepare.py
benchmarks/hmmt_nov25/data/.gitignore
benchmarks/hotpotqa_closedbook/README.md
benchmarks/hotpotqa_closedbook/__init__.py
benchmarks/hotpotqa_closedbook/config.yaml
benchmarks/hotpotqa_closedbook/prepare.py
benchmarks/hotpotqa_closedbook/data/.gitignore
benchmarks/hotpotqa_closedbook/prompts/default.yaml
benchmarks/hotpotqa_closedbook/tests/__init__.py
benchmarks/hotpotqa_closedbook/tests/test_prepare.py
benchmarks/human_eval/README.md
benchmarks/human_eval/__init__.py
benchmarks/human_eval/config.yaml
benchmarks/human_eval/prepare.py
benchmarks/human_eval/data/.gitignore
benchmarks/human_eval_infilling/README.md
benchmarks/human_eval_infilling/__init__.py
benchmarks/human_eval_infilling/config.yaml
benchmarks/human_eval_infilling/prepare.py
benchmarks/human_eval_infilling/data/.gitignore
benchmarks/human_eval_infilling/prompts/default.yaml
benchmarks/ifbench/README.md
benchmarks/ifbench/__init__.py
benchmarks/ifbench/config.yaml
benchmarks/ifbench/prepare.py
benchmarks/ifbench/data/.gitignore
benchmarks/ifbench/data/ifbench_benchmark_metrics.json
benchmarks/ifbench/prompts/default.yaml
benchmarks/ifeval/README.md
benchmarks/ifeval/__init__.py
benchmarks/ifeval/config.yaml
benchmarks/ifeval/prepare.py
benchmarks/ifeval/data/.gitignore
benchmarks/imo_answerbench/README.md
benchmarks/imo_answerbench/__init__.py
benchmarks/imo_answerbench/config.yaml
benchmarks/imo_answerbench/prepare.py
benchmarks/imo_answerbench/data/.gitignore
benchmarks/imo_gradingbench/README.md
benchmarks/imo_gradingbench/__init__.py
benchmarks/imo_gradingbench/config.yaml
benchmarks/imo_gradingbench/prepare.py
benchmarks/imo_gradingbench/data/.gitignore
benchmarks/imo_gradingbench/prompts/default.yaml
benchmarks/imo_gradingbench/tests/test_prepare.py
benchmarks/imo_proofbench/README.md
benchmarks/imo_proofbench/__init__.py
benchmarks/imo_proofbench/config.yaml
benchmarks/imo_proofbench/prepare.py
benchmarks/imo_proofbench/data/.gitignore
benchmarks/imo_proofbench/prompts/default.yaml
benchmarks/ioi/README.md
benchmarks/ioi/__init__.py
benchmarks/ioi/config.yaml
benchmarks/ioi/prepare.py
benchmarks/ioi/data/.gitignore
benchmarks/labbench2_vlm/README.md
benchmarks/labbench2_vlm/__init__.py
benchmarks/labbench2_vlm/config.yaml
benchmarks/labbench2_vlm/prepare.py
benchmarks/librispeech_pc/README.md
benchmarks/librispeech_pc/__init__.py
benchmarks/librispeech_pc/config.yaml
benchmarks/librispeech_pc/prepare.py
benchmarks/librispeech_pc/data/.gitignore
benchmarks/librispeech_pc/prompts/default.yaml
benchmarks/livecodebench/__init__.py
benchmarks/livecodebench/prepare_utils.py
benchmarks/livecodebench-x/README.md
benchmarks/livecodebench-x/__init__.py
benchmarks/livecodebench-x/config.yaml
benchmarks/livecodebench-x/livecodebench_x_utils.py
benchmarks/livecodebench-x/prepare.py
benchmarks/livecodebench-x/data/.gitignore
benchmarks/livecodebench/prompts/cascade.yaml
benchmarks/livecodebench/v5_2408_2502/__init__.py
benchmarks/livecodebench/v5_2408_2502/cascade.yaml
benchmarks/livecodebench/v5_2408_2502/config.yaml
benchmarks/livecodebench/v5_2408_2502/prepare.py
benchmarks/livecodebench/v5_2408_2502/prepare_cascade.py
benchmarks/livecodebench/v5_2408_2502/data/.gitignore
benchmarks/livecodebench/v6_2408_2505/__init__.py
benchmarks/livecodebench/v6_2408_2505/cascade.yaml
benchmarks/livecodebench/v6_2408_2505/config.yaml
benchmarks/livecodebench/v6_2408_2505/prepare.py
benchmarks/livecodebench/v6_2408_2505/prepare_cascade.py
benchmarks/livecodebench/v6_2408_2505/data/.gitignore
benchmarks/longbench_v2/README.md
benchmarks/longbench_v2/__init__.py
benchmarks/longbench_v2/config.yaml
benchmarks/longbench_v2/prepare.py
benchmarks/longbench_v2/data/.gitignore
benchmarks/longbench_v2/prompts/default.yaml
benchmarks/longcodebench/README.md
benchmarks/longcodebench/__init__.py
benchmarks/longcodebench/config.yaml
benchmarks/longcodebench/prepare.py
benchmarks/longcodebench/data/.gitignore
benchmarks/m_arena_hard/README.md
benchmarks/m_arena_hard/__init__.py
benchmarks/m_arena_hard/config.yaml
benchmarks/m_arena_hard/prepare.py
benchmarks/m_arena_hard/data/.gitignore
benchmarks/m_arena_hard_v2/README.md
benchmarks/m_arena_hard_v2/__init__.py
benchmarks/m_arena_hard_v2/config.yaml
benchmarks/m_arena_hard_v2/prepare.py
benchmarks/m_arena_hard_v2/data/.gitignore
benchmarks/math-500/README.md
benchmarks/math-500/__init__.py
benchmarks/math-500/config.yaml
benchmarks/math-500/prepare.py
benchmarks/math-500/data/.gitignore
benchmarks/mbpp/README.md
benchmarks/mbpp/__init__.py
benchmarks/mbpp/config.yaml
benchmarks/mbpp/prepare.py
benchmarks/mbpp/data/.gitignore
benchmarks/mbpp/prompts/default.yaml
benchmarks/minif2f/README.md
benchmarks/minif2f/__init__.py
benchmarks/minif2f/config.yaml
benchmarks/minif2f/prepare.py
benchmarks/minif2f/data/.gitignore
benchmarks/mmlu/README.md
benchmarks/mmlu/config.yaml
benchmarks/mmlu/prepare.py
benchmarks/mmlu-redux/README.md
benchmarks/mmlu-redux/config.yaml
benchmarks/mmlu-redux/prepare.py
benchmarks/mmlu-redux/data/.gitignore
benchmarks/mmlu/data/.gitignore
benchmarks/mmlu_pro/README.md
benchmarks/mmlu_pro/__init__.py
benchmarks/mmlu_pro/config.yaml
benchmarks/mmlu_pro/prepare.py
benchmarks/mmlu_pro/data/.gitignore
benchmarks/mmlu_pro/data/mmlu_pro_benchmark_metrics.json
benchmarks/mmlu_prox/README.md
benchmarks/mmlu_prox/__init__.py
benchmarks/mmlu_prox/config.yaml
benchmarks/mmlu_prox/prepare.py
benchmarks/mmlu_prox/data/.gitignore
benchmarks/mmlu_prox/data/mmlu_prox_benchmark_metrics.json
benchmarks/mmmlu/README.md
benchmarks/mmmlu/config.yaml
benchmarks/mmmlu/mmmlu_utils.py
benchmarks/mmmlu/prepare.py
benchmarks/mmmlu/data/.gitignore
benchmarks/mobench/README.md
benchmarks/mobench/__init__.py
benchmarks/mobench/config.yaml
benchmarks/mobench/prepare.py
benchmarks/mobench/data/.gitignore
benchmarks/mrcr/README.md
benchmarks/mrcr/__init__.py
benchmarks/mrcr/config.yaml
benchmarks/mrcr/prepare.py
benchmarks/mrcr/data/.gitignore
benchmarks/musan/README.md
benchmarks/musan/__init__.py
benchmarks/musan/config.yaml
benchmarks/musan/prepare.py
benchmarks/musan/data/.gitignore
benchmarks/musan/prompts/default.yaml
benchmarks/musan/tests/__init__.py
benchmarks/musan/tests/test_prepare.py
benchmarks/nemotron_3_ultra/README.md
benchmarks/nemotron_3_ultra/benchmarks_long.yaml
benchmarks/nemotron_3_ultra/benchmarks_no_external.yaml
benchmarks/nemotron_3_ultra/benchmarks_short.yaml
benchmarks/nemotron_3_ultra/judge_local_endpoints.yaml
benchmarks/nemotron_3_ultra/judge_remote_endpoints.yaml
benchmarks/nemotron_3_ultra/ultra_local_endpoint.yaml
benchmarks/nemotron_3_ultra/ultra_remote_endpoint.yaml
benchmarks/numb3rs/README.md
benchmarks/numb3rs/__init__.py
benchmarks/numb3rs/config.yaml
benchmarks/numb3rs/prepare.py
benchmarks/numb3rs/data/.gitignore
benchmarks/numb3rs/prompts/default.yaml
benchmarks/numb3rs/tests/__init__.py
benchmarks/numb3rs/tests/test_prepare.py
benchmarks/omniscience/README.md
benchmarks/omniscience/__init__.py
benchmarks/omniscience/config.yaml
benchmarks/omniscience/judge_gptoss20b.yaml
benchmarks/omniscience/prepare.py
benchmarks/omniscience/data/.gitignore
benchmarks/omniscience/prompts/generation.yaml
benchmarks/physics/README.md
benchmarks/physics/__init__.py
benchmarks/physics/config.yaml
benchmarks/physics/prepare.py
benchmarks/physics/data/.gitignore
benchmarks/physics/prompts/default.yaml
benchmarks/polymath/README.md
benchmarks/polymath/__init__.py
benchmarks/polymath/config.yaml
benchmarks/polymath/prepare.py
benchmarks/polymath/data/.gitignore
benchmarks/polymath/tests/test_prepare.py
benchmarks/prompts/eval/aai/mcq-10choices.yaml
benchmarks/prompts/eval/aai/mcq-4choices-boxed.yaml
benchmarks/prompts/eval/aai/mcq-4choices.yaml
benchmarks/prompts/eval/bigcodebench/codegen.yaml
benchmarks/prompts/eval/livecodebench/default_reasoning.yaml
benchmarks/prompts/generic/codegen.yaml
benchmarks/prompts/generic/default.yaml
benchmarks/prompts/generic/general-boxed.yaml
benchmarks/prompts/generic/math.yaml
benchmarks/prompts/judge/math-proof-judge.yaml
benchmarks/prompts/judge/math.yaml
benchmarks/prompts/lean4/formal-proof-deepseek-prover-v2.yaml
benchmarks/proof-arena-judge/README.md
benchmarks/proof-arena-judge/__init__.py
benchmarks/proof-arena-judge/config.yaml
benchmarks/proof-arena-judge/prepare.py
benchmarks/proof-arena-judge/data/.gitignore
benchmarks/proof-arena-judge/gemini_imo_2025/1.txt
benchmarks/proof-arena-judge/gemini_imo_2025/2.txt
benchmarks/proof-arena-judge/gemini_imo_2025/3.txt
benchmarks/proof-arena-judge/gemini_imo_2025/4.txt
benchmarks/proof-arena-judge/gemini_imo_2025/5.txt
benchmarks/proof_bench_judge/README.md
benchmarks/proof_bench_judge/__init__.py
benchmarks/proof_bench_judge/config.yaml
benchmarks/proof_bench_judge/prepare.py
benchmarks/proof_bench_judge/data/.gitignore
benchmarks/proofnet/README.md
benchmarks/proofnet/__init__.py
benchmarks/proofnet/config.yaml
benchmarks/proofnet/prepare.py
benchmarks/proofnet/data/.gitignore
benchmarks/putnam_bench/README.md
benchmarks/putnam_bench/__init__.py
benchmarks/putnam_bench/config.yaml
benchmarks/putnam_bench/prepare.py
benchmarks/putnam_bench/data/.gitignore
benchmarks/ruler/.gitignore
benchmarks/ruler/README.md
benchmarks/ruler/__init__.py
benchmarks/ruler/config_nemotron_3_256k.yaml
benchmarks/ruler/prepare_nemotron_3_256k.py
benchmarks/ruler/prepare_utils.py
benchmarks/ruler/ruler_prepare_script.py
benchmarks/ruler/ruler_thread_unsafe.py
benchmarks/ruler/data/.gitignore
benchmarks/simpleqa/README.md
benchmarks/simpleqa/__init__.py
benchmarks/simpleqa/config.yaml
benchmarks/simpleqa/judge_gptoss20b.yaml
benchmarks/simpleqa/prepare.py
benchmarks/simpleqa/data/.gitignore
benchmarks/simpleqa/tests/__init__.py
benchmarks/speed-bench/README.md
benchmarks/speed-bench/__init__.py
benchmarks/speed-bench/_prepare_common.py
benchmarks/speed-bench/config_qualitative.yaml
benchmarks/speed-bench/config_throughput_2k.yaml
benchmarks/speed-bench/prepare.py
benchmarks/speed-bench/prepare_throughput_2k.py
benchmarks/speed-bench/data/.gitignore
benchmarks/spider2_lite/README.md
benchmarks/spider2_lite/__init__.py
benchmarks/spider2_lite/config.yaml
benchmarks/spider2_lite/prepare.py
benchmarks/spider2_lite/data/.gitignore
benchmarks/spider2_lite/data/spider2_lite_benchmark_metrics.json
benchmarks/supergpqa/README.md
benchmarks/supergpqa/__init__.py
benchmarks/supergpqa/config.yaml
benchmarks/supergpqa/prepare.py
benchmarks/supergpqa/data/.gitignore
benchmarks/tau2/.gitignore
benchmarks/tau2/__init__.py
benchmarks/tau2/config.yaml
benchmarks/tau2/prepare.py
benchmarks/tau2/data/.gitignore
benchmarks/ugphysics/README.md
benchmarks/ugphysics/__init__.py
benchmarks/ugphysics/config.yaml
benchmarks/ugphysics/judge_gptoss20b.yaml
benchmarks/ugphysics/prepare.py
benchmarks/ugphysics/data/.gitignore
benchmarks/ugphysics/prompts/default.yaml
benchmarks/ugphysics/tests/__init__.py
benchmarks/ugphysics/tests/test_prepare.py
benchmarks/wmt24pp/README.md
benchmarks/wmt24pp/__init__.py
benchmarks/wmt24pp/config.yaml
benchmarks/wmt24pp/prepare.py
benchmarks/wmt24pp/data/.gitignore
benchmarks/wmt24pp/prompts/default.yaml
benchmarks/xstest/README.md
benchmarks/xstest/__init__.py
benchmarks/xstest/config.yaml
benchmarks/xstest/prepare.py
benchmarks/xstest/data/.gitignore
cache/.gitignore
cache/nemo_gym.egg-info/PKG-INFO
cache/nemo_gym.egg-info/SOURCES.txt
cache/nemo_gym.egg-info/dependency_links.txt
cache/nemo_gym.egg-info/entry_points.txt
cache/nemo_gym.egg-info/requires.txt
cache/nemo_gym.egg-info/top_level.txt
data/.gitignore
docs/README.md
environments/__init__.py
environments/workplace_assistant/README.md
environments/workplace_assistant/__init__.py
environments/workplace_assistant/config.yaml
environments/workplace_assistant/prepare.py
environments/workplace_assistant/data/.gitignore
environments/workplace_assistant/data/example.jsonl
environments/workplace_assistant/data/example_metrics.json
environments/workplace_assistant/data/example_rollouts.jsonl
example_environments/__init__.py
example_environments/example_single_tool_call/README.md
example_environments/example_single_tool_call/__init__.py
example_environments/example_single_tool_call/config.yaml
example_environments/example_single_tool_call/data/example.jsonl
example_environments/example_single_tool_call/data/example_metrics.json
example_environments/example_single_tool_call/data/example_rollouts.jsonl
fern/.gitignore
fern/README.md
fern/docs.yml
fern/fern.config.json
fern/package.json
fern/assets/images/env-training-library.png
fern/assets/images/eval-improve-loop.png
fern/assets/images/model-vs-agent-eval.png
fern/assets/images/product_overview.png
fern/assets/images/product_overview.svg
fern/components/NavButton.tsx
fern/versions/latest.yml
fern/versions/main.yml
fern/versions/v0.2.1.yml
fern/versions/v0.3.0.yml
fern/versions/latest/pages/about/architecture.mdx
fern/versions/latest/pages/about/ecosystem.mdx
fern/versions/latest/pages/about/index.mdx
fern/versions/latest/pages/about/release-notes.mdx
fern/versions/latest/pages/about/concepts/environments.mdx
fern/versions/latest/pages/about/concepts/evaluation.mdx
fern/versions/latest/pages/about/concepts/index.mdx
fern/versions/latest/pages/about/concepts/key-terminology.mdx
fern/versions/latest/pages/about/concepts/training.mdx
fern/versions/latest/pages/agent-server/index.mdx
fern/versions/latest/pages/api-reference/index.mdx
fern/versions/latest/pages/contribute/development-setup.mdx
fern/versions/latest/pages/contribute/index.mdx
fern/versions/latest/pages/contribute/environments/index.mdx
fern/versions/latest/pages/contribute/environments/new-environment.mdx
fern/versions/latest/pages/contribute/rl-framework-integration/generation-backend-and-openai-compatible-http-server.mdx
fern/versions/latest/pages/contribute/rl-framework-integration/gym-integration-footprint-and-form-factor.mdx
fern/versions/latest/pages/contribute/rl-framework-integration/gym-rl-framework-integration-success-criteria.mdx
fern/versions/latest/pages/contribute/rl-framework-integration/index.mdx
fern/versions/latest/pages/contribute/rl-framework-integration/openai-compatible-http-server-on-policy-correction.mdx
fern/versions/latest/pages/data/download-huggingface.mdx
fern/versions/latest/pages/data/index.mdx
fern/versions/latest/pages/data/prepare-validate.mdx
fern/versions/latest/pages/data/prompt-config.mdx
fern/versions/latest/pages/environment-tutorials/adding-a-benchmark.mdx
fern/versions/latest/pages/environment-tutorials/aggregate-metrics.mdx
fern/versions/latest/pages/environment-tutorials/index.mdx
fern/versions/latest/pages/environment-tutorials/integrate-external-environments.mdx
fern/versions/latest/pages/environment-tutorials/multi-step-environment.mdx
fern/versions/latest/pages/environment-tutorials/single-step-environment.mdx
fern/versions/latest/pages/environment-tutorials/stateful-environment.mdx
fern/versions/latest/pages/environment-tutorials/real-world-environment/generating-training-data.mdx
fern/versions/latest/pages/environment-tutorials/real-world-environment/index.mdx
fern/versions/latest/pages/environment-tutorials/real-world-environment/resources-server-implementation.mdx
fern/versions/latest/pages/environment-tutorials/verification-patterns/index.mdx
fern/versions/latest/pages/environment-tutorials/verification-patterns/llm-as-judge.mdx
fern/versions/latest/pages/get-started/installation.mdx
fern/versions/latest/pages/get-started/prerequisites.mdx
fern/versions/latest/pages/get-started/quickstart.mdx
fern/versions/latest/pages/infrastructure/deployment-topology.mdx
fern/versions/latest/pages/infrastructure/index.mdx
fern/versions/latest/pages/infrastructure/engineering-notes/aiohttp-vs-httpx.mdx
fern/versions/latest/pages/infrastructure/engineering-notes/index.mdx
fern/versions/latest/pages/infrastructure/engineering-notes/responses-api-evolution.mdx
fern/versions/latest/pages/infrastructure/engineering-notes/swe-rl-case-study.mdx
fern/versions/latest/pages/infrastructure/engineering-notes/system-design.mdx
fern/versions/latest/pages/model-recipes/index.mdx
fern/versions/latest/pages/model-recipes/nemotron-3-nano.mdx
fern/versions/latest/pages/model-recipes/nemotron-3-super.mdx
fern/versions/latest/pages/model-server/index.mdx
fern/versions/latest/pages/model-server/vllm.mdx
fern/versions/latest/pages/reference/cli-commands.mdx
fern/versions/latest/pages/reference/configuration.mdx
fern/versions/latest/pages/reference/faq.mdx
fern/versions/latest/pages/reference/index.mdx
fern/versions/latest/pages/reference/rl-framework-compatibility.mdx
fern/versions/latest/pages/resources-server/index.mdx
fern/versions/latest/pages/training-tutorials/index.mdx
fern/versions/latest/pages/training-tutorials/multi-environment-training.mdx
fern/versions/latest/pages/training-tutorials/offline-training-w-rollouts.mdx
fern/versions/latest/pages/training-tutorials/unsloth.mdx
fern/versions/latest/pages/training-tutorials/verl.mdx
fern/versions/latest/pages/training-tutorials/nemo-rl-grpo/about-workplace-assistant.mdx
fern/versions/latest/pages/training-tutorials/nemo-rl-grpo/gym-configuration.mdx
fern/versions/latest/pages/training-tutorials/nemo-rl-grpo/index.mdx
fern/versions/latest/pages/training-tutorials/nemo-rl-grpo/multi-node-training.mdx
fern/versions/latest/pages/training-tutorials/nemo-rl-grpo/nemo-rl-configuration.mdx
fern/versions/latest/pages/training-tutorials/nemo-rl-grpo/setup.mdx
fern/versions/latest/pages/training-tutorials/nemo-rl-grpo/single-node-training.mdx
fern/versions/latest/pages/troubleshooting/configuration.mdx
fern/versions/latest/pages/troubleshooting/index.mdx
fern/versions/v0.2.1/pages/index.mdx
fern/versions/v0.2.1/pages/about/ecosystem.mdx
fern/versions/v0.2.1/pages/about/index.mdx
fern/versions/v0.2.1/pages/about/concepts/architecture.mdx
fern/versions/v0.2.1/pages/about/concepts/configuration.mdx
fern/versions/v0.2.1/pages/about/concepts/core-components.mdx
fern/versions/v0.2.1/pages/about/concepts/index.mdx
fern/versions/v0.2.1/pages/about/concepts/key-terminology.mdx
fern/versions/v0.2.1/pages/about/concepts/task-verification.mdx
fern/versions/v0.2.1/pages/about/concepts/training-approaches.mdx
fern/versions/v0.2.1/pages/agent-server/index.mdx
fern/versions/v0.2.1/pages/api-reference/index.mdx
fern/versions/v0.2.1/pages/benchmarks/adding-a-benchmark.mdx
fern/versions/v0.2.1/pages/benchmarks/designing-customer-evaluation.mdx
fern/versions/v0.2.1/pages/benchmarks/index.mdx
fern/versions/v0.2.1/pages/benchmarks/run-benchmark-suite.mdx
fern/versions/v0.2.1/pages/contribute/development-setup.mdx
fern/versions/v0.2.1/pages/contribute/index.mdx
fern/versions/v0.2.1/pages/contribute/environments/index.mdx
fern/versions/v0.2.1/pages/contribute/environments/new-environment.mdx
fern/versions/v0.2.1/pages/contribute/rl-framework-integration/generation-backend-and-openai-compatible-http-server.mdx
fern/versions/v0.2.1/pages/contribute/rl-framework-integration/gym-integration-footprint-and-form-factor.mdx
fern/versions/v0.2.1/pages/contribute/rl-framework-integration/gym-rl-framework-integration-success-criteria.mdx
fern/versions/v0.2.1/pages/contribute/rl-framework-integration/index.mdx
fern/versions/v0.2.1/pages/contribute/rl-framework-integration/openai-compatible-http-server-on-policy-correction.mdx
fern/versions/v0.2.1/pages/data/download-huggingface.mdx
fern/versions/v0.2.1/pages/data/index.mdx
fern/versions/v0.2.1/pages/data/prepare-validate.mdx
fern/versions/v0.2.1/pages/data/prompt-config.mdx
fern/versions/v0.2.1/pages/environment-tutorials/aggregate-metrics.mdx
fern/versions/v0.2.1/pages/environment-tutorials/index.mdx
fern/versions/v0.2.1/pages/environment-tutorials/integrate-external-environments.mdx
fern/versions/v0.2.1/pages/environment-tutorials/llm-as-judge-verification.mdx
fern/versions/v0.2.1/pages/environment-tutorials/multi-step-environment.mdx
fern/versions/v0.2.1/pages/environment-tutorials/single-step-environment.mdx
fern/versions/v0.2.1/pages/environment-tutorials/stateful-environment.mdx
fern/versions/v0.2.1/pages/environment-tutorials/real-world-environment/generating-training-data.mdx
fern/versions/v0.2.1/pages/environment-tutorials/real-world-environment/index.mdx
fern/versions/v0.2.1/pages/environment-tutorials/real-world-environment/resources-server-implementation.mdx
fern/versions/v0.2.1/pages/get-started/detailed-setup.mdx
fern/versions/v0.2.1/pages/get-started/index.mdx
fern/versions/v0.2.1/pages/get-started/pypi-install.mdx
fern/versions/v0.2.1/pages/get-started/quickstart.mdx
fern/versions/v0.2.1/pages/get-started/rollout-collection.mdx
fern/versions/v0.2.1/pages/infrastructure/deployment-topology.mdx
fern/versions/v0.2.1/pages/infrastructure/index.mdx
fern/versions/v0.2.1/pages/infrastructure/engineering-notes/aiohttp-vs-httpx.mdx
fern/versions/v0.2.1/pages/infrastructure/engineering-notes/index.mdx
fern/versions/v0.2.1/pages/infrastructure/engineering-notes/responses-api-evolution.mdx
fern/versions/v0.2.1/pages/infrastructure/engineering-notes/swe-rl-case-study.mdx
fern/versions/v0.2.1/pages/model-recipes/index.mdx
fern/versions/v0.2.1/pages/model-recipes/nemotron-3-nano.mdx
fern/versions/v0.2.1/pages/model-recipes/nemotron-3-super.mdx
fern/versions/v0.2.1/pages/model-server/index.mdx
fern/versions/v0.2.1/pages/model-server/vllm.mdx
fern/versions/v0.2.1/pages/reference/cli-commands.mdx
fern/versions/v0.2.1/pages/reference/configuration.mdx
fern/versions/v0.2.1/pages/reference/faq.mdx
fern/versions/v0.2.1/pages/reference/index.mdx
fern/versions/v0.2.1/pages/reference/rl-framework-compatibility.mdx
fern/versions/v0.2.1/pages/resources-server/index.mdx
fern/versions/v0.2.1/pages/training-tutorials/index.mdx
fern/versions/v0.2.1/pages/training-tutorials/multi-environment-training.mdx
fern/versions/v0.2.1/pages/training-tutorials/offline-training-w-rollouts.mdx
fern/versions/v0.2.1/pages/training-tutorials/unsloth.mdx
fern/versions/v0.2.1/pages/training-tutorials/nemo-rl-grpo/about-workplace-assistant.mdx
fern/versions/v0.2.1/pages/training-tutorials/nemo-rl-grpo/gym-configuration.mdx
fern/versions/v0.2.1/pages/training-tutorials/nemo-rl-grpo/index.mdx
fern/versions/v0.2.1/pages/training-tutorials/nemo-rl-grpo/multi-node-training.mdx
fern/versions/v0.2.1/pages/training-tutorials/nemo-rl-grpo/nemo-rl-configuration.mdx
fern/versions/v0.2.1/pages/training-tutorials/nemo-rl-grpo/setup.mdx
fern/versions/v0.2.1/pages/training-tutorials/nemo-rl-grpo/single-node-training.mdx
fern/versions/v0.2.1/pages/troubleshooting/configuration.mdx
fern/versions/v0.2.1/pages/troubleshooting/index.mdx
fern/versions/v0.3.0/pages/about/architecture.mdx
fern/versions/v0.3.0/pages/about/ecosystem.mdx
fern/versions/v0.3.0/pages/about/index.mdx
fern/versions/v0.3.0/pages/about/release-notes.mdx
fern/versions/v0.3.0/pages/about/concepts/environments.mdx
fern/versions/v0.3.0/pages/about/concepts/evaluation.mdx
fern/versions/v0.3.0/pages/about/concepts/index.mdx
fern/versions/v0.3.0/pages/about/concepts/key-terminology.mdx
fern/versions/v0.3.0/pages/about/concepts/training.mdx
fern/versions/v0.3.0/pages/agent-server/index.mdx
fern/versions/v0.3.0/pages/api-reference/index.mdx
fern/versions/v0.3.0/pages/contribute/development-setup.mdx
fern/versions/v0.3.0/pages/contribute/index.mdx
fern/versions/v0.3.0/pages/contribute/environments/index.mdx
fern/versions/v0.3.0/pages/contribute/environments/new-environment.mdx
fern/versions/v0.3.0/pages/contribute/rl-framework-integration/generation-backend-and-openai-compatible-http-server.mdx
fern/versions/v0.3.0/pages/contribute/rl-framework-integration/gym-integration-footprint-and-form-factor.mdx
fern/versions/v0.3.0/pages/contribute/rl-framework-integration/gym-rl-framework-integration-success-criteria.mdx
fern/versions/v0.3.0/pages/contribute/rl-framework-integration/index.mdx
fern/versions/v0.3.0/pages/contribute/rl-framework-integration/openai-compatible-http-server-on-policy-correction.mdx
fern/versions/v0.3.0/pages/data/download-huggingface.mdx
fern/versions/v0.3.0/pages/data/index.mdx
fern/versions/v0.3.0/pages/data/prepare-validate.mdx
fern/versions/v0.3.0/pages/data/prompt-config.mdx
fern/versions/v0.3.0/pages/environment-tutorials/adding-a-benchmark.mdx
fern/versions/v0.3.0/pages/environment-tutorials/aggregate-metrics.mdx
fern/versions/v0.3.0/pages/environment-tutorials/index.mdx
fern/versions/v0.3.0/pages/environment-tutorials/integrate-external-environments.mdx
fern/versions/v0.3.0/pages/environment-tutorials/multi-step-environment.mdx
fern/versions/v0.3.0/pages/environment-tutorials/single-step-environment.mdx
fern/versions/v0.3.0/pages/environment-tutorials/stateful-environment.mdx
fern/versions/v0.3.0/pages/environment-tutorials/real-world-environment/generating-training-data.mdx
fern/versions/v0.3.0/pages/environment-tutorials/real-world-environment/index.mdx
fern/versions/v0.3.0/pages/environment-tutorials/real-world-environment/resources-server-implementation.mdx
fern/versions/v0.3.0/pages/environment-tutorials/verification-patterns/index.mdx
fern/versions/v0.3.0/pages/environment-tutorials/verification-patterns/llm-as-judge.mdx
fern/versions/v0.3.0/pages/get-started/installation.mdx
fern/versions/v0.3.0/pages/get-started/prerequisites.mdx
fern/versions/v0.3.0/pages/get-started/quickstart.mdx
fern/versions/v0.3.0/pages/infrastructure/deployment-topology.mdx
fern/versions/v0.3.0/pages/infrastructure/index.mdx
fern/versions/v0.3.0/pages/infrastructure/engineering-notes/aiohttp-vs-httpx.mdx
fern/versions/v0.3.0/pages/infrastructure/engineering-notes/index.mdx
fern/versions/v0.3.0/pages/infrastructure/engineering-notes/responses-api-evolution.mdx
fern/versions/v0.3.0/pages/infrastructure/engineering-notes/swe-rl-case-study.mdx
fern/versions/v0.3.0/pages/infrastructure/engineering-notes/system-design.mdx
fern/versions/v0.3.0/pages/model-recipes/index.mdx
fern/versions/v0.3.0/pages/model-recipes/nemotron-3-nano.mdx
fern/versions/v0.3.0/pages/model-recipes/nemotron-3-super.mdx
fern/versions/v0.3.0/pages/model-server/index.mdx
fern/versions/v0.3.0/pages/model-server/vllm.mdx
fern/versions/v0.3.0/pages/reference/cli-commands.mdx
fern/versions/v0.3.0/pages/reference/configuration.mdx
fern/versions/v0.3.0/pages/reference/faq.mdx
fern/versions/v0.3.0/pages/reference/index.mdx
fern/versions/v0.3.0/pages/reference/rl-framework-compatibility.mdx
fern/versions/v0.3.0/pages/resources-server/index.mdx
fern/versions/v0.3.0/pages/training-tutorials/index.mdx
fern/versions/v0.3.0/pages/training-tutorials/multi-environment-training.mdx
fern/versions/v0.3.0/pages/training-tutorials/offline-training-w-rollouts.mdx
fern/versions/v0.3.0/pages/training-tutorials/unsloth.mdx
fern/versions/v0.3.0/pages/training-tutorials/verl.mdx
fern/versions/v0.3.0/pages/training-tutorials/nemo-rl-grpo/about-workplace-assistant.mdx
fern/versions/v0.3.0/pages/training-tutorials/nemo-rl-grpo/gym-configuration.mdx
fern/versions/v0.3.0/pages/training-tutorials/nemo-rl-grpo/index.mdx
fern/versions/v0.3.0/pages/training-tutorials/nemo-rl-grpo/multi-node-training.mdx
fern/versions/v0.3.0/pages/training-tutorials/nemo-rl-grpo/nemo-rl-configuration.mdx
fern/versions/v0.3.0/pages/training-tutorials/nemo-rl-grpo/setup.mdx
fern/versions/v0.3.0/pages/training-tutorials/nemo-rl-grpo/single-node-training.mdx
fern/versions/v0.3.0/pages/troubleshooting/configuration.mdx
fern/versions/v0.3.0/pages/troubleshooting/index.mdx
nemo_gym/__init__.py
nemo_gym/base_resources_server.py
nemo_gym/base_responses_api_agent.py
nemo_gym/base_responses_api_model.py
nemo_gym/benchmarks.py
nemo_gym/cli.py
nemo_gym/cli_setup_command.py
nemo_gym/config_types.py
nemo_gym/dataset_orchestrator.py
nemo_gym/gitlab_utils.py
nemo_gym/global_config.py
nemo_gym/hf_utils.py
nemo_gym/openai_utils.py
nemo_gym/package_info.py
nemo_gym/profiling.py
nemo_gym/prompt.py
nemo_gym/reward_profile.py
nemo_gym/rollout_collection.py
nemo_gym/server_metadata.py
nemo_gym/server_status.py
nemo_gym/server_utils.py
nemo_gym/train_data_utils.py
nemo_gym/resources/resources_server_template.py
nemo_gym/resources/resources_server_test_template.py
resources_servers/.gitignore
resources_servers/aalcr/README.md
resources_servers/aalcr/app.py
resources_servers/aalcr/requirements.txt
resources_servers/aalcr/configs/aalcr.yaml
resources_servers/aalcr/data/.gitignore
resources_servers/aalcr/data/example.jsonl
resources_servers/aalcr/data/example_metrics.json
resources_servers/aalcr/data/example_rollouts.jsonl
resources_servers/aalcr/tests/test_app.py
resources_servers/abstention/README.md
resources_servers/abstention/app.py
resources_servers/abstention/dataset_preprocess.py
resources_servers/abstention/requirements.txt
resources_servers/abstention/configs/abstention.yaml
resources_servers/abstention/data/.gitignore
resources_servers/abstention/data/example.jsonl
resources_servers/abstention/data/example_metrics.json
resources_servers/abstention/data/example_rollouts.jsonl
resources_servers/abstention/tests/__init__.py
resources_servers/abstention/tests/test_app.py
resources_servers/arc_agi/README.md
resources_servers/arc_agi/app.py
resources_servers/arc_agi/create_dataset.py
resources_servers/arc_agi/requirements.txt
resources_servers/arc_agi/configs/arc_agi.yaml
resources_servers/arc_agi/data/example.jsonl
resources_servers/arc_agi/data/example_metrics.json
resources_servers/arc_agi/data/example_prepare.jsonl
resources_servers/arc_agi/data/example_rollouts.jsonl
resources_servers/arc_agi/tests/__init__.py
resources_servers/arc_agi/tests/test_app.py
resources_servers/arena_judge/README.md
resources_servers/arena_judge/__init__.py
resources_servers/arena_judge/app.py
resources_servers/arena_judge/requirements.txt
resources_servers/arena_judge/configs/arena_judge.yaml
resources_servers/arena_judge/data/.gitignore
resources_servers/arena_judge/data/example.jsonl
resources_servers/arena_judge/data/example_metrics.json
resources_servers/arena_judge/data/example_rollouts.jsonl
resources_servers/arena_judge/prompts/arena.yaml
resources_servers/arena_judge/prompts/arena_creative.yaml
resources_servers/arena_judge/tests/__init__.py
resources_servers/arena_judge/tests/test_app.py
resources_servers/asr_with_pc/README.md
resources_servers/asr_with_pc/__init__.py
resources_servers/asr_with_pc/app.py
resources_servers/asr_with_pc/generate_example_data.py
resources_servers/asr_with_pc/requirements.txt
resources_servers/asr_with_pc/configs/asr_with_pc.yaml
resources_servers/asr_with_pc/data/.gitignore
resources_servers/asr_with_pc/data/example.jsonl
resources_servers/asr_with_pc/data/example_metrics.json
resources_servers/asr_with_pc/data/example_rollouts.jsonl
resources_servers/asr_with_pc/data/example_rollouts_agent_metrics.json
resources_servers/asr_with_pc/data/example_rollouts_aggregate_metrics.json
resources_servers/asr_with_pc/data/example_rollouts_materialized_inputs.jsonl
resources_servers/asr_with_pc/data/example_rollouts_reward_profiling.jsonl
resources_servers/asr_with_pc/tests/__init__.py
resources_servers/asr_with_pc/tests/test_app.py
resources_servers/aviary/README.md
resources_servers/aviary/__init__.py
resources_servers/aviary/app.py
resources_servers/aviary/client_app.py
resources_servers/aviary/gsm8k_app.py
resources_servers/aviary/hotpotqa_app.py
resources_servers/aviary/hypotest_app.py
resources_servers/aviary/notebook_app.py
resources_servers/aviary/requirements.txt
resources_servers/aviary/schemas.py
resources_servers/aviary/configs/aviary.yaml
resources_servers/aviary/configs/bbh_bundled.yaml
resources_servers/aviary/configs/bbh_remote.yaml
resources_servers/aviary/configs/bixbench_aviary.yaml
resources_servers/aviary/configs/gsm8k_aviary.yaml
resources_servers/aviary/configs/hotpotqa_aviary.yaml
resources_servers/aviary/data/.gitignore
resources_servers/aviary/data/bbh_train_example.jsonl
resources_servers/aviary/data/bixbench_example.jsonl
resources_servers/aviary/data/example.jsonl
resources_servers/aviary/data/example_metrics.json
resources_servers/aviary/data/example_rollouts.jsonl
resources_servers/aviary/data/gsm8k_example.jsonl
resources_servers/aviary/data/hotpotqa_example.jsonl
resources_servers/aviary/tests/test_app.py
resources_servers/bigcodebench/.gitignore
resources_servers/bigcodebench/README.md
resources_servers/bigcodebench/app.py
resources_servers/bigcodebench/bcb_runner.py
resources_servers/bigcodebench/code_extraction.py
resources_servers/bigcodebench/requirements.txt
resources_servers/bigcodebench/setup_bcb_venv.py
resources_servers/bigcodebench/configs/bigcodebench.yaml
resources_servers/bigcodebench/data/.gitignore
resources_servers/bigcodebench/data/example.jsonl
resources_servers/bigcodebench/data/example_metrics.json
resources_servers/bigcodebench/data/example_rollouts.jsonl
resources_servers/bigcodebench/tests/__init__.py
resources_servers/bigcodebench/tests/test_app.py
resources_servers/bird_sql/README.md
resources_servers/bird_sql/app.py
resources_servers/bird_sql/eval_utils.py
resources_servers/bird_sql/requirements.txt
resources_servers/bird_sql/setup_bird_sql.py
resources_servers/bird_sql/configs/bird_sql.yaml
resources_servers/bird_sql/data/.gitignore
resources_servers/bird_sql/data/example.jsonl
resources_servers/bird_sql/data/example_metrics.json
resources_servers/bird_sql/data/example_rollouts.jsonl
resources_servers/bird_sql/tests/__init__.py
resources_servers/bird_sql/tests/test_app.py
resources_servers/blackjack/README.md
resources_servers/blackjack/app.py
resources_servers/blackjack/requirements.txt
resources_servers/blackjack/configs/blackjack.yaml
resources_servers/blackjack/data/.gitignore
resources_servers/blackjack/data/example.jsonl
resources_servers/blackjack/data/example_metrics.json
resources_servers/blackjack/data/example_rollouts.jsonl
resources_servers/blackjack/data/example_rollouts_aggregate_metrics.json
resources_servers/blackjack/data/example_rollouts_materialized_inputs.jsonl
resources_servers/blackjack/tests/__init__.py
resources_servers/blackjack/tests/test_app.py
resources_servers/browsecomp_advanced_harness/README.md
resources_servers/browsecomp_advanced_harness/app.py
resources_servers/browsecomp_advanced_harness/judge_prompt.py
resources_servers/browsecomp_advanced_harness/requirements.txt
resources_servers/browsecomp_advanced_harness/configs/browsecomp_advanced_harness.yaml
resources_servers/browsecomp_advanced_harness/data/.gitignore
resources_servers/browsecomp_advanced_harness/data/example.jsonl
resources_servers/browsecomp_advanced_harness/data/example_metrics.json
resources_servers/browsecomp_advanced_harness/data/example_rollouts.jsonl
resources_servers/browsecomp_advanced_harness/tests/dummy_exclude_domains_file.json
resources_servers/browsecomp_advanced_harness/tests/test_app.py
resources_servers/calendar/README.md
resources_servers/calendar/app.py
resources_servers/calendar/client.py
resources_servers/calendar/create_synth_conversations.py
resources_servers/calendar/dataset_preprocess.py
resources_servers/calendar/generate_rollouts.py
resources_servers/calendar/prompts.py
resources_servers/calendar/requirements.txt
resources_servers/calendar/utils.py
resources_servers/calendar/configs/calendar.yaml
resources_servers/calendar/configs/calendar_v2.yaml
resources_servers/calendar/data/.gitignore
resources_servers/calendar/data/example.jsonl
resources_servers/calendar/data/example_metrics.json
resources_servers/calendar/data/example_rollouts.jsonl
resources_servers/calendar/tests/README.md
resources_servers/calendar/tests/test_app.py
resources_servers/circle_click/README.md
resources_servers/circle_click/app.py
resources_servers/circle_click/generate_data.py
resources_servers/circle_click/requirements.txt
resources_servers/circle_click/view_rollouts.py
resources_servers/circle_click/configs/circle_click.yaml
resources_servers/circle_click/data/example.jsonl
resources_servers/circle_click/data/example_metrics.json
resources_servers/circle_click/data/example_rollouts.jsonl
resources_servers/circle_click/data/example_rollouts_agent_metrics.json
resources_servers/circle_click/data/example_rollouts_materialized_inputs.jsonl
resources_servers/circle_click/data/example_rollouts_reward_profiling.jsonl
resources_servers/circle_click/tests/__init__.py
resources_servers/circle_click/tests/test_app.py
resources_servers/circle_count/README.md
resources_servers/circle_count/app.py
resources_servers/circle_count/generate_data.py
resources_servers/circle_count/requirements.txt
resources_servers/circle_count/view_rollouts.py
resources_servers/circle_count/configs/circle_count.yaml
resources_servers/circle_count/data/.gitignore
resources_servers/circle_count/data/example.jsonl
resources_servers/circle_count/data/example_metrics.json
resources_servers/circle_count/data/example_rollouts.jsonl
resources_servers/circle_count/data/example_rollouts_aggregate_metrics.json
resources_servers/circle_count/data/example_rollouts_materialized_inputs.jsonl
resources_servers/circle_count/tests/__init__.py
resources_servers/circle_count/tests/test_app.py
resources_servers/code_fim/README.md
resources_servers/code_fim/app.py
resources_servers/code_fim/pyproject.toml
resources_servers/code_fim/configs/code_fim.yaml
resources_servers/code_fim/data/.gitignore
resources_servers/code_fim/data/example.jsonl
resources_servers/code_fim/data/example_metrics.json
resources_servers/code_fim/data/example_rollouts.jsonl
resources_servers/code_fim/human_eval_infilling_integration/__init__.py
resources_servers/code_fim/human_eval_infilling_integration/runner.py
resources_servers/code_fim/tests/__init__.py
resources_servers/code_fim/tests/test_app.py
resources_servers/code_gen/README.md
resources_servers/code_gen/analyze_test_cases.py
resources_servers/code_gen/app.py
resources_servers/code_gen/livecodebench_accuracy_test.py
resources_servers/code_gen/livecodebench_accuracy_test_prep.py
resources_servers/code_gen/requirements.txt
resources_servers/code_gen/configs/code_gen.yaml
resources_servers/code_gen/data/.gitignore
resources_servers/code_gen/data/example.jsonl
resources_servers/code_gen/data/example_metrics.json
resources_servers/code_gen/data/example_rollouts.jsonl
resources_servers/code_gen/data/livecodebench_v5_2024-07-01_2025-02-01_validation_metrics.json
resources_servers/code_gen/data/opencodereasoning_filtered_25k_train_metrics.json
resources_servers/code_gen/data/train_metrics.json
resources_servers/code_gen/lcb_integration/README.md
resources_servers/code_gen/lcb_integration/__init__.py
resources_servers/code_gen/lcb_integration/compute_code_generation_metrics.py
resources_servers/code_gen/lcb_integration/extraction_utils.py
resources_servers/code_gen/lcb_integration/lm_styles.py
resources_servers/code_gen/lcb_integration/pass_k_utils.py
resources_servers/code_gen/lcb_integration/testing_util.py
resources_servers/code_gen/scripts/preprocess_train_dataset.py
resources_servers/code_gen/tests/__init__.py
resources_servers/code_gen/tests/test_app.py
resources_servers/code_gen/tests/test_compute_code_generation_metrics.py
resources_servers/competitive_coding_challenges/README.md
resources_servers/competitive_coding_challenges/app.py
resources_servers/competitive_coding_challenges/ccc_eval.py
resources_servers/competitive_coding_challenges/requirements.txt
resources_servers/competitive_coding_challenges/configs/competitive_coding_challenges.yaml
resources_servers/competitive_coding_challenges/data/example.jsonl
resources_servers/competitive_coding_challenges/data/example_metrics.json
resources_servers/competitive_coding_challenges/data/example_rollouts.jsonl
resources_servers/competitive_coding_challenges/data/pass_rate_histogram.png
resources_servers/competitive_coding_challenges/tests/test_app.py
resources_servers/cvdp/README.md
resources_servers/cvdp/app.py
resources_servers/cvdp/requirements.txt
resources_servers/cvdp/configs/cvdp.yaml
resources_servers/cvdp/cvdp_lib/__init__.py
resources_servers/cvdp/cvdp_lib/cvdp_constants.py
resources_servers/cvdp/cvdp_lib/cvdp_report_lib.py
resources_servers/cvdp/cvdp_lib/cvdp_run_reporter.py
resources_servers/cvdp/cvdp_lib/model_helpers.py
resources_servers/cvdp/cvdp_lib/subjective.py
resources_servers/cvdp/data/.gitignore
resources_servers/cvdp/data/example.jsonl
resources_servers/cvdp/data/example_metrics.json
resources_servers/cvdp/data/example_rollouts.jsonl
resources_servers/cvdp/data/example_rollouts_aggregate_metrics.json
resources_servers/cvdp/data/example_rollouts_materialized_inputs.jsonl
resources_servers/cvdp/docs/data-lineage.md
resources_servers/cvdp/scripts/convert_to_gym.py
resources_servers/cvdp/scripts/cvdp_pass_at_k_report.py
resources_servers/cvdp/scripts/cvdp_report.py
resources_servers/cvdp/tests/__init__.py
resources_servers/cvdp/tests/test_app.py
resources_servers/equivalence_llm_judge/README.md
resources_servers/equivalence_llm_judge/app.py
resources_servers/equivalence_llm_judge/prepare_sciq.py
resources_servers/equivalence_llm_judge/requirements.txt
resources_servers/equivalence_llm_judge/configs/equivalence_llm_judge.yaml
resources_servers/equivalence_llm_judge/configs/lc_judge.yaml
resources_servers/equivalence_llm_judge/configs/nl2bash-equivalency.yaml
resources_servers/equivalence_llm_judge/data/example.jsonl
resources_servers/equivalence_llm_judge/data/example_metrics.json
resources_servers/equivalence_llm_judge/data/example_metrics_openqa.json
resources_servers/equivalence_llm_judge/data/example_nl2bash.jsonl
resources_servers/equivalence_llm_judge/data/example_openqa.jsonl
resources_servers/equivalence_llm_judge/data/example_openqa_metrics.json
resources_servers/equivalence_llm_judge/data/example_prepare.jsonl
resources_servers/equivalence_llm_judge/data/example_rollouts.jsonl
resources_servers/equivalence_llm_judge/data/example_rollouts_openqa.jsonl
resources_servers/equivalence_llm_judge/prompt_templates/equivalence_llm_judge.txt
resources_servers/equivalence_llm_judge/prompt_templates/lc.txt
resources_servers/equivalence_llm_judge/prompt_templates/lc_judge.txt
resources_servers/equivalence_llm_judge/tests/test_app.py
resources_servers/equivalence_rule/README.md
resources_servers/equivalence_rule/app.py
resources_servers/equivalence_rule/requirements.txt
resources_servers/equivalence_rule/configs/lc.yaml
resources_servers/equivalence_rule/data/.gitignore
resources_servers/equivalence_rule/data/example.jsonl
resources_servers/equivalence_rule/data/example_metrics.json
resources_servers/equivalence_rule/data/example_rollouts.jsonl
resources_servers/equivalence_rule/tests/__init__.py
resources_servers/equivalence_rule/tests/test_app.py
resources_servers/ether0/README.md
resources_servers/ether0/app.py
resources_servers/ether0/requirements.txt
resources_servers/ether0/setup_ether0.py
resources_servers/ether0/configs/ether0.yaml
resources_servers/ether0/data/example.jsonl
resources_servers/ether0/data/example_metrics.json
resources_servers/ether0/data/example_rollouts.jsonl
resources_servers/ether0/data/example_rollouts_agent_metrics.json
resources_servers/ether0/data/example_rollouts_materialized_inputs.jsonl
resources_servers/ether0/data/example_rollouts_reward_profiling.jsonl
resources_servers/ether0/scripts/prepare_ether0.py
resources_servers/ether0/tests/__init__.py
resources_servers/ether0/tests/conftest.py
resources_servers/ether0/tests/test_app.py
resources_servers/evalplus/README.md
resources_servers/evalplus/app.py
resources_servers/evalplus/requirements.txt
resources_servers/evalplus/configs/evalplus.yaml
resources_servers/evalplus/data/.gitignore
resources_servers/evalplus/data/example.jsonl
resources_servers/evalplus/data/example_metrics.json
resources_servers/evalplus/data/example_rollouts.jsonl
resources_servers/evalplus/evalplus_integration/__init__.py
resources_servers/evalplus/evalplus_integration/runner.py
resources_servers/evalplus/tests/__init__.py
resources_servers/evalplus/tests/test_app.py
resources_servers/example_multi_step/README.md
resources_servers/example_multi_step/app.py
resources_servers/example_multi_step/dataset_preprocess.py
resources_servers/example_multi_step/requirements.txt
resources_servers/example_multi_step/train_dataset_config.yaml
resources_servers/example_multi_step/configs/example_multi_step.yaml
resources_servers/example_multi_step/data/.gitignore
resources_servers/example_multi_step/data/example.jsonl
resources_servers/example_multi_step/data/example_metrics.json
resources_servers/example_multi_step/data/example_rollouts.jsonl
resources_servers/example_multi_step/data/train_metrics.json
resources_servers/example_multi_step/data/validation_metrics.json
resources_servers/example_multi_step/tests/__init__.py
resources_servers/example_multi_step/tests/test_app.py
resources_servers/example_multi_turn_gymnasium/README.md
resources_servers/example_multi_turn_gymnasium/app.py
resources_servers/example_multi_turn_gymnasium/requirements.txt
resources_servers/example_multi_turn_gymnasium/configs/example_multi_turn_gymnasium.yaml
resources_servers/example_multi_turn_gymnasium/data/.gitignore
resources_servers/example_multi_turn_gymnasium/data/example.jsonl
resources_servers/example_multi_turn_gymnasium/data/example_metrics.json
resources_servers/example_multi_turn_gymnasium/data/example_rollouts.jsonl
resources_servers/example_multi_turn_gymnasium/data/example_rollouts_aggregate_metrics.json
resources_servers/example_multi_turn_gymnasium/data/example_rollouts_materialized_inputs.jsonl
resources_servers/example_multi_turn_gymnasium/tests/__init__.py
resources_servers/example_multi_turn_gymnasium/tests/test_app.py
resources_servers/example_session_state_mgmt/README.md
resources_servers/example_session_state_mgmt/app.py
resources_servers/example_session_state_mgmt/client.py
resources_servers/example_session_state_mgmt/create_examples.py
resources_servers/example_session_state_mgmt/requirements.txt
resources_servers/example_session_state_mgmt/configs/example_session_state_mgmt.yaml
resources_servers/example_session_state_mgmt/data/.gitignore
resources_servers/example_session_state_mgmt/data/example.jsonl
resources_servers/example_session_state_mgmt/data/example_metrics.json
resources_servers/example_session_state_mgmt/data/example_rollouts.jsonl
resources_servers/example_session_state_mgmt/tests/test_app.py
resources_servers/example_single_tool_call/README.md
resources_servers/example_single_tool_call/app.py
resources_servers/example_single_tool_call/client.py
resources_servers/example_single_tool_call/create_examples.py
resources_servers/example_single_tool_call/requirements.txt
resources_servers/example_single_tool_call/configs/example_single_tool_call.yaml
resources_servers/example_single_tool_call/data/.gitignore
resources_servers/example_single_tool_call/data/example.jsonl
resources_servers/example_single_tool_call/data/example_metrics.json
resources_servers/example_single_tool_call/data/example_rollouts.jsonl
resources_servers/example_single_tool_call/tests/test_app.py
resources_servers/finance_sec_search/.gitignore
resources_servers/finance_sec_search/README.md
resources_servers/finance_sec_search/app.py
resources_servers/finance_sec_search/requirements.txt
resources_servers/finance_sec_search/configs/finance_sec_search.yaml
resources_servers/finance_sec_search/data/example.jsonl
resources_servers/finance_sec_search/data/example_metrics.json
resources_servers/finance_sec_search/data/example_questions.jsonl
resources_servers/finance_sec_search/data/example_rollouts.jsonl
resources_servers/finance_sec_search/data/example_rollouts_aggregate_metrics.json
resources_servers/finance_sec_search/prompt_templates/finance_sec_search_judge.yaml
resources_servers/finance_sec_search/prompt_templates/finance_sec_search_retrieval.yaml
resources_servers/finance_sec_search/scripts/convert_questions.py
resources_servers/finance_sec_search/scripts/prefetch_sec_metadata.py
resources_servers/finance_sec_search/scripts/prepare_secque_questions.py
resources_servers/finance_sec_search/tests/__init__.py
resources_servers/finance_sec_search/tests/test_app.py
resources_servers/format_verification/ARCHITECTURE.md
resources_servers/format_verification/README.md
resources_servers/format_verification/app.py
resources_servers/format_verification/requirements.txt
resources_servers/format_verification/configs/citation_format.yaml
resources_servers/format_verification/configs/freeform_formatting.yaml
resources_servers/format_verification/data/ds2_freeform_formatting_example.jsonl
resources_servers/format_verification/data/ds2_freeform_formatting_example_metrics.json
resources_servers/format_verification/data/ds3_citation_format_example.jsonl
resources_servers/format_verification/data/ds3_citation_format_example_metrics.json
resources_servers/format_verification/data/example.jsonl
resources_servers/format_verification/data/example_metrics.json
resources_servers/format_verification/data/example_rollouts.jsonl
resources_servers/format_verification/misc/breakdown_citation.py
resources_servers/format_verification/misc/breakdown_freeform.py
resources_servers/format_verification/misc/data_generation/prepare_data.py
resources_servers/format_verification/tests/__init__.py
resources_servers/format_verification/tests/test_app.py
resources_servers/frontierscience_judge/README.md
resources_servers/frontierscience_judge/app.py
resources_servers/frontierscience_judge/requirements.txt
resources_servers/frontierscience_judge/configs/frontierscience_judge.yaml
resources_servers/frontierscience_judge/data/.gitignore
resources_servers/frontierscience_judge/data/example.jsonl
resources_servers/frontierscience_judge/data/example_metrics.json
resources_servers/frontierscience_judge/data/example_rollouts.jsonl
resources_servers/frontierscience_judge/prompts/judge.yaml
resources_servers/frontierscience_judge/tests/__init__.py
resources_servers/frontierscience_judge/tests/test_app.py
resources_servers/gdpval/README.md
resources_servers/gdpval/__init__.py
resources_servers/gdpval/app.py
resources_servers/gdpval/comparison.py
resources_servers/gdpval/preconvert.py
resources_servers/gdpval/requirements.txt
resources_servers/gdpval/scoring.py
resources_servers/gdpval/setup_libreoffice.py
resources_servers/gdpval/configs/gdpval.yaml
resources_servers/gdpval/data/.gitignore
resources_servers/gdpval/data/example.jsonl
resources_servers/gdpval/data/example_metrics.json
resources_servers/gdpval/data/example_rollouts.jsonl
resources_servers/gdpval/prompts/judge_prompt.j2
resources_servers/gdpval/tests/__init__.py
resources_servers/gdpval/tests/test_app.py
resources_servers/gdpval/tests/test_preconvert.py
resources_servers/gdpval/tests/test_setup_libreoffice.py
resources_servers/genrm_compare/README.md
resources_servers/genrm_compare/__init__.py
resources_servers/genrm_compare/app.py
resources_servers/genrm_compare/comparison_strategies.py
resources_servers/genrm_compare/requirements.txt
resources_servers/genrm_compare/utils.py
resources_servers/genrm_compare/configs/genrm_compare.yaml
resources_servers/genrm_compare/data/example.jsonl
resources_servers/genrm_compare/data/example_metrics.json
resources_servers/genrm_compare/data/example_rollouts.jsonl
resources_servers/genrm_compare/tests/__init__.py
resources_servers/genrm_compare/tests/test_app.py
resources_servers/genrm_compare/tests/test_comparison_strategies.py
resources_servers/genrm_compare/tests/test_utils.py
resources_servers/google_search/README.md
resources_servers/google_search/app.py
resources_servers/google_search/client.py
resources_servers/google_search/requirements.txt
resources_servers/google_search/configs/google_search.yaml
resources_servers/google_search/data/.gitignore
resources_servers/google_search/data/example.jsonl
resources_servers/google_search/data/example_metrics.json
resources_servers/google_search/data/example_rollouts.jsonl
resources_servers/google_search/data/train_metrics.json
resources_servers/google_search/tests/test_app.py
resources_servers/gpqa_diamond/README.md
resources_servers/gpqa_diamond/app.py
resources_servers/gpqa_diamond/dataset_preprocess.py
resources_servers/gpqa_diamond/requirements.txt
resources_servers/gpqa_diamond/configs/gpqa_diamond.yaml
resources_servers/gpqa_diamond/data/.gitignore
resources_servers/gpqa_diamond/data/example.jsonl
resources_servers/gpqa_diamond/data/example_metrics.json
resources_servers/gpqa_diamond/data/example_rollouts.jsonl
resources_servers/gpqa_diamond/tests/test_app.py
resources_servers/grl_sokoban/README.md
resources_servers/grl_sokoban/app.py
resources_servers/grl_sokoban/generate_test_examples.py
resources_servers/grl_sokoban/requirements.txt
resources_servers/grl_sokoban/configs/grl_sokoban.yaml
resources_servers/grl_sokoban/data/.gitignore
resources_servers/grl_sokoban/data/example.jsonl
resources_servers/grl_sokoban/data/example_metrics.json
resources_servers/grl_sokoban/data/example_rollouts.jsonl
resources_servers/grl_sokoban/data/example_rollouts_aggregate_metrics.json
resources_servers/grl_sokoban/data/example_rollouts_materialized_inputs.jsonl
resources_servers/grl_sokoban/sokoban_env/__init__.py
resources_servers/grl_sokoban/sokoban_env/generation.py
resources_servers/grl_sokoban/sokoban_env/sokoban_env.py
resources_servers/grl_sokoban/tests/test_app.py
resources_servers/grl_tetris/README.md
resources_servers/grl_tetris/app.py
resources_servers/grl_tetris/generate_test_examples.py
resources_servers/grl_tetris/requirements.txt
resources_servers/grl_tetris/configs/grl_tetris.yaml
resources_servers/grl_tetris/data/.gitignore
resources_servers/grl_tetris/data/example.jsonl
resources_servers/grl_tetris/data/example_metrics.json
resources_servers/grl_tetris/data/example_rollouts.jsonl
resources_servers/grl_tetris/data/example_rollouts_aggregate_metrics.json
resources_servers/grl_tetris/data/example_rollouts_materialized_inputs.jsonl
resources_servers/grl_tetris/tests/test_app.py
resources_servers/grl_tetris/tetris_env/__init__.py
resources_servers/grl_tetris/tetris_env/tetris_env.py
resources_servers/gymnasium/README.md
resources_servers/gymnasium/__init__.py
resources_servers/gymnasium/base.py
resources_servers/gymnasium/requirements.txt
resources_servers/gymnasium/configs/gymnasium.yaml
resources_servers/gymnasium/data/.gitignore
resources_servers/gymnasium/data/example.jsonl
resources_servers/gymnasium/data/example_metrics.json
resources_servers/gymnasium/data/example_rollouts.jsonl
resources_servers/gymnasium/tests/__init__.py
resources_servers/gymnasium/tests/test_app.py
resources_servers/hotpotqa_qa/README.md
resources_servers/hotpotqa_qa/app.py
resources_servers/hotpotqa_qa/requirements.txt
resources_servers/hotpotqa_qa/scoring.py
resources_servers/hotpotqa_qa/configs/hotpotqa_qa.yaml
resources_servers/hotpotqa_qa/data/.gitignore
resources_servers/hotpotqa_qa/data/example.jsonl
resources_servers/hotpotqa_qa/data/example_metrics.json
resources_servers/hotpotqa_qa/data/example_rollouts.jsonl
resources_servers/hotpotqa_qa/tests/__init__.py
resources_servers/hotpotqa_qa/tests/test_app.py
resources_servers/ifbench/.gitignore
resources_servers/ifbench/README.md
resources_servers/ifbench/__init__.py
resources_servers/ifbench/app.py
resources_servers/ifbench/requirements.txt
resources_servers/ifbench/setup_ifbench.py
resources_servers/ifbench/configs/ifbench.yaml
resources_servers/ifbench/data/.gitignore
resources_servers/ifbench/data/example.jsonl
resources_servers/ifbench/data/example_metrics.json
resources_servers/ifbench/data/example_rollouts.jsonl
resources_servers/ifbench/tests/__init__.py
resources_servers/ifbench/tests/conftest.py
resources_servers/ifbench/tests/test_app.py
resources_servers/imo_gradingbench/README.md
resources_servers/imo_gradingbench/__init__.py
resources_servers/imo_gradingbench/app.py
resources_servers/imo_gradingbench/requirements.txt
resources_servers/imo_gradingbench/configs/imo_gradingbench.yaml
resources_servers/imo_gradingbench/data/.gitignore
resources_servers/imo_gradingbench/data/example.jsonl
resources_servers/imo_gradingbench/data/example_metrics.json
resources_servers/imo_gradingbench/data/example_rollouts.jsonl
resources_servers/imo_gradingbench/tests/__init__.py
resources_servers/imo_gradingbench/tests/test_app.py
resources_servers/imo_proofbench_judge/README.md
resources_servers/imo_proofbench_judge/app.py
resources_servers/imo_proofbench_judge/requirements.txt
resources_servers/imo_proofbench_judge/configs/imo_proofbench_judge.yaml
resources_servers/imo_proofbench_judge/data/.gitignore
resources_servers/imo_proofbench_judge/data/example.jsonl
resources_servers/imo_proofbench_judge/data/example_metrics.json
resources_servers/imo_proofbench_judge/data/example_rollouts.jsonl
resources_servers/imo_proofbench_judge/prompts/judge.yaml
resources_servers/imo_proofbench_judge/tests/__init__.py
resources_servers/imo_proofbench_judge/tests/test_app.py
resources_servers/indirect_prompt_injection/README.md
resources_servers/indirect_prompt_injection/app.py
resources_servers/indirect_prompt_injection/common_tools.py
resources_servers/indirect_prompt_injection/ecommerce_tools.py
resources_servers/indirect_prompt_injection/education_tools.py
resources_servers/indirect_prompt_injection/healthcare_tools.py
resources_servers/indirect_prompt_injection/hr_tools.py
resources_servers/indirect_prompt_injection/insurance_tools.py
resources_servers/indirect_prompt_injection/it_helpdesk_tools.py
resources_servers/indirect_prompt_injection/legal_tools.py
resources_servers/indirect_prompt_injection/logistics_tools.py
resources_servers/indirect_prompt_injection/real_estate_tools.py
resources_servers/indirect_prompt_injection/requirements.txt
resources_servers/indirect_prompt_injection/verifier.py
resources_servers/indirect_prompt_injection/configs/indirect_prompt_injection.yaml
resources_servers/indirect_prompt_injection/configs/nemotron_nano_2gpu.yaml
resources_servers/indirect_prompt_injection/data/.gitignore
resources_servers/indirect_prompt_injection/data/example.jsonl
resources_servers/indirect_prompt_injection/data/example_metrics.json
resources_servers/indirect_prompt_injection/data/example_rollouts.jsonl
resources_servers/indirect_prompt_injection/tests/__init__.py
resources_servers/indirect_prompt_injection/tests/test_app.py
resources_servers/indirect_prompt_injection/tests/test_common_tools.py
resources_servers/indirect_prompt_injection/tests/test_ecommerce_tools.py
resources_servers/indirect_prompt_injection/tests/test_education_tools.py
resources_servers/indirect_prompt_injection/tests/test_healthcare_tools.py
resources_servers/indirect_prompt_injection/tests/test_hr_tools.py
resources_servers/indirect_prompt_injection/tests/test_insurance_tools.py
resources_servers/indirect_prompt_injection/tests/test_it_helpdesk_tools.py
resources_servers/indirect_prompt_injection/tests/test_legal_tools.py
resources_servers/indirect_prompt_injection/tests/test_logistics_tools.py
resources_servers/indirect_prompt_injection/tests/test_real_estate_tools.py
resources_servers/indirect_prompt_injection/tests/test_verifier.py
resources_servers/instruction_following/README.md
resources_servers/instruction_following/app.py
resources_servers/instruction_following/requirements.txt
resources_servers/instruction_following/configs/instruction_following.yaml
resources_servers/instruction_following/data/.gitignore
resources_servers/instruction_following/data/example.jsonl
resources_servers/instruction_following/data/example_metrics.json
resources_servers/instruction_following/data/example_rollouts.jsonl
resources_servers/instruction_following/data/train_metrics.json
resources_servers/instruction_following/tests/__init__.py
resources_servers/instruction_following/tests/test_app.py
resources_servers/inverse_if/.gitignore
resources_servers/inverse_if/README.md
resources_servers/inverse_if/app.py
resources_servers/inverse_if/dataset_preprocess.py
resources_servers/inverse_if/requirements.txt
resources_servers/inverse_if/configs/inverse_if.yaml
resources_servers/inverse_if/data/.gitignore
resources_servers/inverse_if/data/README.md
resources_servers/inverse_if/data/example.jsonl
resources_servers/inverse_if/data/example_metrics.json
resources_servers/inverse_if/data/example_rollouts.jsonl
resources_servers/inverse_if/tests/__init__.py
resources_servers/inverse_if/tests/test_inverse_if.py
resources_servers/jailbreak_detection/README.md
resources_servers/jailbreak_detection/app.py
resources_servers/jailbreak_detection/requirements.txt
resources_servers/jailbreak_detection/util.py
resources_servers/jailbreak_detection/configs/jailbreak_detection_nemotron_combined_reward_tp8.yaml
resources_servers/jailbreak_detection/configs/safety_judge_model.yaml
resources_servers/jailbreak_detection/configs/verifier_prompt_templates.yaml
resources_servers/jailbreak_detection/data/.gitignore
resources_servers/jailbreak_detection/data/example.jsonl
resources_servers/jailbreak_detection/data/example_metrics.json
resources_servers/jailbreak_detection/data/example_rollouts.jsonl
resources_servers/jailbreak_detection/tests/__init__.py
resources_servers/jailbreak_detection/tests/test_app.py
resources_servers/labbench2_vlm/README.md
resources_servers/labbench2_vlm/app.py
resources_servers/labbench2_vlm/prepare_data.py
resources_servers/labbench2_vlm/requirements.txt
resources_servers/labbench2_vlm/configs/judge_model_openai.yaml
resources_servers/labbench2_vlm/configs/labbench2_vlm.yaml
resources_servers/labbench2_vlm/data/.gitignore
resources_servers/labbench2_vlm/data/example.jsonl
resources_servers/labbench2_vlm/data/example_metrics.json
resources_servers/labbench2_vlm/data/example_rollouts.jsonl
resources_servers/labbench2_vlm/data/test_media/figs/imgs/b60fdf79-25b2-4bf2-a5bb-cb553d83770f/figure.png
resources_servers/labbench2_vlm/data/test_media/figs/imgs/d9daf6c6-513a-4969-b5a6-86a1bd504c6c/figure.png
resources_servers/labbench2_vlm/data/test_media/figs/pdfs/b60fdf79-25b2-4bf2-a5bb-cb553d83770f/paper.pdf
resources_servers/labbench2_vlm/data/test_media/protocols/a68f494c-50de-4200-b12b-82108e9c1d8e/protocol.pdf
resources_servers/labbench2_vlm/data/test_media/protocols/e0759c5d-f4eb-4bb5-850e-55a0adaede9d/protocol.pdf
resources_servers/labbench2_vlm/data/test_media/tables/imgs/54f637f5-27cf-450b-8d9c-6069006d15d0/table.png
resources_servers/labbench2_vlm/data/test_media/tables/pdfs/54f637f5-27cf-450b-8d9c-6069006d15d0/paper.pdf
resources_servers/labbench2_vlm/prompt_templates/judge.txt
resources_servers/labbench2_vlm/prompt_templates/judge_protocol.txt
resources_servers/labbench2_vlm/tests/__init__.py
resources_servers/labbench2_vlm/tests/test_app.py
resources_servers/math_advanced_calculations/README.md
resources_servers/math_advanced_calculations/__init__.py
resources_servers/math_advanced_calculations/app.py
resources_servers/math_advanced_calculations/client.py
resources_servers/math_advanced_calculations/dataset_preprocess.py
resources_servers/math_advanced_calculations/math_advanced_calculations_tools.py
resources_servers/math_advanced_calculations/requirements.txt
resources_servers/math_advanced_calculations/seed_prompt_creation.py
resources_servers/math_advanced_calculations/configs/math_advanced_calculations.yaml
resources_servers/math_advanced_calculations/data/.gitignore
resources_servers/math_advanced_calculations/data/example.jsonl
resources_servers/math_advanced_calculations/data/example_metrics.json
resources_servers/math_advanced_calculations/data/example_rollouts.jsonl
resources_servers/math_advanced_calculations/data/train_metrics.json
resources_servers/math_advanced_calculations/tests/__init__.py
resources_servers/math_advanced_calculations/tests/test_app.py
resources_servers/math_formal_lean/README.md
resources_servers/math_formal_lean/__init__.py
resources_servers/math_formal_lean/app.py
resources_servers/math_formal_lean/prepare_minif2f.py
resources_servers/math_formal_lean/prepare_nemotron_math_proofs.py
resources_servers/math_formal_lean/proof_utils.py
resources_servers/math_formal_lean/requirements.txt
resources_servers/math_formal_lean/sandbox_client.py
resources_servers/math_formal_lean/configs/math_formal_lean.yaml
resources_servers/math_formal_lean/configs/math_formal_lean_multi_turn.yaml
resources_servers/math_formal_lean/configs/nemotron_clean_easy.yaml
resources_servers/math_formal_lean/configs/nemotron_first_try_hard.yaml
resources_servers/math_formal_lean/configs/nemotron_medium_500.yaml
resources_servers/math_formal_lean/configs/nemotron_very_easy.yaml
resources_servers/math_formal_lean/data/example.jsonl
resources_servers/math_formal_lean/data/example_metrics.json
resources_servers/math_formal_lean/data/example_rollouts.jsonl
resources_servers/math_formal_lean/data/minif2f_test.jsonl
resources_servers/math_formal_lean/data/minif2f_valid.jsonl
resources_servers/math_formal_lean/data/multi_turn_full_example.jsonl
resources_servers/math_formal_lean/data/multi_turn_success_examples.jsonl
resources_servers/math_formal_lean/tests/__init__.py
resources_servers/math_formal_lean/tests/test_app.py
resources_servers/math_formal_lean/tests/test_proof_utils.py
resources_servers/math_proof_judgement/README.md
resources_servers/math_proof_judgement/__init__.py
resources_servers/math_proof_judgement/app.py
resources_servers/math_proof_judgement/requirements.txt
resources_servers/math_proof_judgement/configs/math_proof_judgement.yaml
resources_servers/math_proof_judgement/data/.gitignore
resources_servers/math_proof_judgement/data/example.jsonl
resources_servers/math_proof_judgement/data/example_metrics.json
resources_servers/math_proof_judgement/data/example_rollouts.jsonl
resources_servers/math_proof_judgement/tests/__init__.py
resources_servers/math_proof_judgement/tests/test_app.py
resources_servers/math_with_autograder/README.md
resources_servers/math_with_autograder/app.py
resources_servers/math_with_autograder/requirements.txt
resources_servers/math_with_autograder/configs/judge_gptoss20b.yaml
resources_servers/math_with_autograder/configs/math_with_autograder.yaml
resources_servers/math_with_autograder/data/.gitignore
resources_servers/math_with_autograder/data/example.jsonl
resources_servers/math_with_autograder/data/example_metrics.json
resources_servers/math_with_autograder/data/example_rollouts.jsonl
resources_servers/math_with_autograder/prompts/judge.yaml
resources_servers/math_with_autograder/tests/__init__.py
resources_servers/math_with_autograder/tests/test_app.py
resources_servers/math_with_code/README.md
resources_servers/math_with_code/app.py
resources_servers/math_with_code/client.py
resources_servers/math_with_code/requirements.txt
resources_servers/math_with_code/configs/math_with_code.yaml
resources_servers/math_with_code/data/.gitignore
resources_servers/math_with_code/data/example.jsonl
resources_servers/math_with_code/data/example_metrics.json
resources_servers/math_with_code/data/example_rollouts.jsonl
resources_servers/math_with_code/data/train_metrics.json
resources_servers/math_with_code/tests/test_app.py
resources_servers/math_with_judge/README.md
resources_servers/math_with_judge/app.py
resources_servers/math_with_judge/client.py
resources_servers/math_with_judge/filter_for_mixed_rewards.py
resources_servers/math_with_judge/prepare_aime24.py
resources_servers/math_with_judge/prepare_bytedtsinghua_dapo17k_aime24.py
resources_servers/math_with_judge/prepare_dapo17k.py
resources_servers/math_with_judge/requirements.txt
resources_servers/math_with_judge/configs/dapo17k.yaml
resources_servers/math_with_judge/configs/math_stack_overflow.yaml
resources_servers/math_with_judge/configs/math_with_judge.yaml
resources_servers/math_with_judge/configs/math_with_judge_hermes_agent.yaml
resources_servers/math_with_judge/configs/math_with_local_judge.yaml
resources_servers/math_with_judge/data/.gitignore
resources_servers/math_with_judge/data/OpenMathReasoning_aime24_validation_metrics.json
resources_servers/math_with_judge/data/OpenMathReasoning_train_metrics.json
resources_servers/math_with_judge/data/aime24_bytedtsinghua_validation_metrics.json
resources_servers/math_with_judge/data/aime24_validation_metrics.json
resources_servers/math_with_judge/data/dapo17k_bytedtsinghua_train_metrics.json
resources_servers/math_with_judge/data/dapo17k_train_metrics.json
resources_servers/math_with_judge/data/example.jsonl
resources_servers/math_with_judge/data/example_metrics.json
resources_servers/math_with_judge/data/example_rollouts.jsonl
resources_servers/math_with_judge/data/math_stack_overflow_train_metrics.json
resources_servers/math_with_judge/data/train_metrics.json
resources_servers/math_with_judge/tests/test_app.py
resources_servers/mcqa/README.md
resources_servers/mcqa/app.py
resources_servers/mcqa/dataset_preprocess.py
resources_servers/mcqa/requirements.txt
resources_servers/mcqa/configs/mcqa.yaml
resources_servers/mcqa/data/.gitignore
resources_servers/mcqa/data/example.jsonl
resources_servers/mcqa/data/example_metrics.json
resources_servers/mcqa/data/example_rollouts.jsonl
resources_servers/mcqa/data/example_rollouts_with_template_metadata.jsonl
resources_servers/mcqa/data/example_with_template_metadata.jsonl
resources_servers/mcqa/data/example_with_template_metadata_metrics.json
resources_servers/mcqa/data/train_metrics.json
resources_servers/mcqa/tests/__init__.py
resources_servers/mcqa/tests/test_app.py
resources_servers/mrcr/README.md
resources_servers/mrcr/app.py
resources_servers/mrcr/requirements.txt
resources_servers/mrcr/configs/mrcr.yaml
resources_servers/mrcr/data/.gitignore
resources_servers/mrcr/data/example.jsonl
resources_servers/mrcr/data/example_metrics.json
resources_servers/mrcr/data/example_rollouts.jsonl
resources_servers/mrcr/tests/__init__.py
resources_servers/mrcr/tests/test_app.py
resources_servers/multichallenge/.gitignore
resources_servers/multichallenge/README.md
resources_servers/multichallenge/app.py
resources_servers/multichallenge/dataset_preprocess.py
resources_servers/multichallenge/requirements.txt
resources_servers/multichallenge/configs/multichallenge.yaml
resources_servers/multichallenge/configs/multichallenge_nrl.yaml
resources_servers/multichallenge/data/.gitignore
resources_servers/multichallenge/data/README.md
resources_servers/multichallenge/data/example.jsonl
resources_servers/multichallenge/data/example_metrics.json
resources_servers/multichallenge/data/example_rollouts.jsonl
resources_servers/multichallenge/tests/__init__.py
resources_servers/multichallenge/tests/test_multichallenge.py
resources_servers/newton_bench/README.md
resources_servers/newton_bench/app.py
resources_servers/newton_bench/client.py
resources_servers/newton_bench/generate_dataset.py
resources_servers/newton_bench/requirements.txt
resources_servers/newton_bench/setup_newton_bench.py
resources_servers/newton_bench/configs/newton_bench.yaml
resources_servers/newton_bench/data/example.jsonl
resources_servers/newton_bench/data/example_metrics.json
resources_servers/newton_bench/data/example_rollouts.jsonl
resources_servers/newton_bench/newton_bench_utils/prompt_utils.py
resources_servers/newton_bench/newton_bench_utils/sandbox.py
resources_servers/newton_bench/newton_bench_utils/schemas.py
resources_servers/newton_bench/tests/conftest.py
resources_servers/newton_bench/tests/test_app.py
resources_servers/ns_tools/README.md
resources_servers/ns_tools/app.py
resources_servers/ns_tools/prepare_dataset.py
resources_servers/ns_tools/requirements.txt
resources_servers/ns_tools/configs/ns_tools.yaml
resources_servers/ns_tools/data/compmath_test.jsonl
resources_servers/ns_tools/data/example.jsonl
resources_servers/ns_tools/data/example_metrics.json
resources_servers/ns_tools/data/example_rollouts.jsonl
resources_servers/ns_tools/tests/__init__.py
resources_servers/ns_tools/tests/test_app.py
resources_servers/nvarc/README.md
resources_servers/nvarc/app.py
resources_servers/nvarc/problem.py
resources_servers/nvarc/requirements.txt
resources_servers/nvarc/configs/inductive.yaml
resources_servers/nvarc/configs/transductive.yaml
resources_servers/nvarc/data/.gitignore
resources_servers/nvarc/data/example.jsonl
resources_servers/nvarc/data/example_metrics.json
resources_servers/nvarc/data/example_rollouts.jsonl
resources_servers/nvarc/tests/test_app.py
resources_servers/omniscience/README.md
resources_servers/omniscience/app.py
resources_servers/omniscience/requirements.txt
resources_servers/omniscience/configs/omniscience.yaml
resources_servers/omniscience/data/.gitignore
resources_servers/omniscience/data/example.jsonl
resources_servers/omniscience/data/example_metrics.json
resources_servers/omniscience/data/example_rollouts.jsonl
resources_servers/omniscience/prompts/judge.yaml
resources_servers/omniscience/tests/__init__.py
resources_servers/omniscience/tests/test_app.py
resources_servers/openenv/README.md
resources_servers/openenv/app.py
resources_servers/openenv/requirements.txt
resources_servers/openenv/configs/openenv_coding.yaml
resources_servers/openenv/configs/openenv_echo.yaml
resources_servers/openenv/configs/openenv_maze.yaml
resources_servers/openenv/data/example.jsonl
resources_servers/openenv/data/example_metrics.json
resources_servers/openenv/data/example_rollouts.jsonl
resources_servers/openenv/data/coding/example.jsonl
resources_servers/openenv/data/echo/example.jsonl
resources_servers/openenv/data/maze/example.jsonl
resources_servers/openenv/tests/__init__.py
resources_servers/openenv/tests/test_app.py
resources_servers/over_refusal_detection/README.md
resources_servers/over_refusal_detection/app.py
resources_servers/over_refusal_detection/requirements.txt
resources_servers/over_refusal_detection/util.py
resources_servers/over_refusal_detection/configs/over_refusal_detection.yaml
resources_servers/over_refusal_detection/configs/over_refusal_detection_nemotron.yaml
resources_servers/over_refusal_detection/configs/over_refusal_detection_nemotron_tp8.yaml
resources_servers/over_refusal_detection/data/.gitignore
resources_servers/over_refusal_detection/data/example.jsonl
resources_servers/over_refusal_detection/data/example_metrics.json
resources_servers/over_refusal_detection/data/example_rollouts.jsonl
resources_servers/over_refusal_detection/tests/__init__.py
resources_servers/over_refusal_detection/tests/test_app.py
resources_servers/physics_judge/README.md
resources_servers/physics_judge/app.py
resources_servers/physics_judge/requirements.txt
resources_servers/physics_judge/configs/judge_openai.yaml
resources_servers/physics_judge/configs/physics_judge.yaml
resources_servers/physics_judge/data/.gitignore
resources_servers/physics_judge/data/example.jsonl
resources_servers/physics_judge/data/example_metrics.json
resources_servers/physics_judge/data/example_rollouts.jsonl
resources_servers/physics_judge/prompts/judge.yaml
resources_servers/physics_judge/tests/__init__.py
resources_servers/physics_judge/tests/test_app.py
resources_servers/polymath/README.md
resources_servers/polymath/app.py
resources_servers/polymath/requirements.txt
resources_servers/polymath/configs/polymath.yaml
resources_servers/polymath/data/.gitignore
resources_servers/polymath/data/example.jsonl
resources_servers/polymath/data/example_metrics.json
resources_servers/polymath/data/example_rollouts.jsonl
resources_servers/polymath/data/example_rollouts_materialized_inputs.jsonl
resources_servers/polymath/tests/__init__.py
resources_servers/polymath/tests/test_app.py
resources_servers/proof_genselect/README.md
resources_servers/proof_genselect/__init__.py
resources_servers/proof_genselect/app.py
resources_servers/proof_genselect/prepare_data.py
resources_servers/proof_genselect/requirements.txt
resources_servers/proof_genselect/configs/proof_genselect.yaml
resources_servers/proof_genselect/data/example.jsonl
resources_servers/proof_genselect/data/example_metrics.json
resources_servers/proof_genselect/data/example_rollouts.jsonl
resources_servers/proof_genselect/data/example_rollouts_aggregate_metrics.json
resources_servers/proof_genselect/data/example_rollouts_materialized_inputs.jsonl
resources_servers/proof_genselect/prompt_templates/genselect.yaml
resources_servers/proof_genselect/tests/__init__.py
resources_servers/proof_genselect/tests/test_app.py
resources_servers/proof_judge/README.md
resources_servers/proof_judge/__init__.py
resources_servers/proof_judge/app.py
resources_servers/proof_judge/prepare_data.py
resources_servers/proof_judge/requirements.txt
resources_servers/proof_judge/configs/proof_judge.yaml
resources_servers/proof_judge/data/example.jsonl
resources_servers/proof_judge/data/example_metrics.json
resources_servers/proof_judge/data/example_rollouts.jsonl
resources_servers/proof_judge/data/example_rollouts_aggregate_metrics.json
resources_servers/proof_judge/data/example_rollouts_materialized_inputs.jsonl
resources_servers/proof_judge/prompt_templates/meta-verifier.yaml
resources_servers/proof_judge/prompt_templates/prover.yaml
resources_servers/proof_judge/prompt_templates/verifier.yaml
resources_servers/proof_judge/tests/__init__.py
resources_servers/proof_judge/tests/test_app.py
resources_servers/proof_verification/README.md
resources_servers/proof_verification/__init__.py
resources_servers/proof_verification/app.py
resources_servers/proof_verification/prepare_data.py
resources_servers/proof_verification/requirements.txt
resources_servers/proof_verification/configs/proof_verification.yaml
resources_servers/proof_verification/data/example.jsonl
resources_servers/proof_verification/data/example_metrics.json
resources_servers/proof_verification/data/example_rollouts.jsonl
resources_servers/proof_verification/data/example_rollouts_aggregate_metrics.json
resources_servers/proof_verification/data/example_rollouts_materialized_inputs.jsonl
resources_servers/proof_verification/prompt_templates/meta-verifier.yaml
resources_servers/proof_verification/prompt_templates/verifier.yaml
resources_servers/proof_verification/tests/__init__.py
resources_servers/proof_verification/tests/test_app.py
resources_servers/rdkit_chemistry/README.md
resources_servers/rdkit_chemistry/app.py
resources_servers/rdkit_chemistry/requirements.txt
resources_servers/rdkit_chemistry/sandbox_launcher.py
resources_servers/rdkit_chemistry/configs/rdkit_chemistry.yaml
resources_servers/rdkit_chemistry/data/example.jsonl
resources_servers/rdkit_chemistry/data/example_metrics.json
resources_servers/rdkit_chemistry/data/example_rollouts.jsonl
resources_servers/rdkit_chemistry/tests/__init__.py
resources_servers/rdkit_chemistry/tests/test_app.py
resources_servers/reasoning_gym/README.md
resources_servers/reasoning_gym/app.py
resources_servers/reasoning_gym/requirements.txt
resources_servers/reasoning_gym/configs/orchestrator_agent.yaml
resources_servers/reasoning_gym/configs/parallel_thinking_agent.yaml
resources_servers/reasoning_gym/configs/reasoning_gym.yaml
resources_servers/reasoning_gym/configs/reasoning_gym_claude_code_agent.yaml
resources_servers/reasoning_gym/configs/reflection_agent.yaml
resources_servers/reasoning_gym/configs/resources_only.yaml
resources_servers/reasoning_gym/configs/rewoo_agent.yaml
resources_servers/reasoning_gym/data/example.jsonl
resources_servers/reasoning_gym/data/example_metrics.json
resources_servers/reasoning_gym/data/example_rollouts.jsonl
resources_servers/reasoning_gym/scripts/create_dataset.py
resources_servers/reasoning_gym/tests/test_app.py
resources_servers/ruler/README.md
resources_servers/ruler/app.py
resources_servers/ruler/requirements.txt
resources_servers/ruler/configs/ruler.yaml
resources_servers/ruler/data/.gitignore
resources_servers/ruler/data/example.jsonl
resources_servers/ruler/data/example_metrics.json
resources_servers/ruler/data/example_rollouts.jsonl
resources_servers/ruler/tests/test_app.py
resources_servers/simpleqa/README.md
resources_servers/simpleqa/__init__.py
resources_servers/simpleqa/app.py
resources_servers/simpleqa/requirements.txt
resources_servers/simpleqa/configs/simpleqa.yaml
resources_servers/simpleqa/data/.gitignore
resources_servers/simpleqa/data/example.jsonl
resources_servers/simpleqa/data/example_metrics.json
resources_servers/simpleqa/data/example_rollouts.jsonl
resources_servers/simpleqa/prompts/judge.yaml
resources_servers/simpleqa/tests/__init__.py
resources_servers/simpleqa/tests/test_app.py
resources_servers/single_step_tool_use_with_argument_comparison/README.md
resources_servers/single_step_tool_use_with_argument_comparison/__init__.py
resources_servers/single_step_tool_use_with_argument_comparison/app.py
resources_servers/single_step_tool_use_with_argument_comparison/requirements.txt
resources_servers/single_step_tool_use_with_argument_comparison/common/__init__.py
resources_servers/single_step_tool_use_with_argument_comparison/common/response_utils.py
resources_servers/single_step_tool_use_with_argument_comparison/common/verification_utils.py
resources_servers/single_step_tool_use_with_argument_comparison/configs/droid_pivot_single_step_tool_use_with_argument_comparison.yaml
resources_servers/single_step_tool_use_with_argument_comparison/configs/search_pivot_single_step_tool_use_with_argument_comparison.yaml
resources_servers/single_step_tool_use_with_argument_comparison/configs/single_step_tool_use_with_argument_comparison.yaml
resources_servers/single_step_tool_use_with_argument_comparison/configs/swe_pivot_single_step_tool_use_with_argument_comparison.yaml
resources_servers/single_step_tool_use_with_argument_comparison/configs/toolcall_schema_single_step_tool_use_with_argument_comparison.yaml
resources_servers/single_step_tool_use_with_argument_comparison/data/.gitignore
resources_servers/single_step_tool_use_with_argument_comparison/data/example.jsonl
resources_servers/single_step_tool_use_with_argument_comparison/data/example_metrics.json
resources_servers/single_step_tool_use_with_argument_comparison/data/example_rollouts.jsonl
resources_servers/single_step_tool_use_with_argument_comparison/tests/__init__.py
resources_servers/single_step_tool_use_with_argument_comparison/tests/test_app.py
resources_servers/single_step_tool_use_with_argument_comparison/tests/common/__init__.py
resources_servers/single_step_tool_use_with_argument_comparison/tests/common/test_response_utils.py
resources_servers/single_step_tool_use_with_argument_comparison/tests/common/test_verification_utils.py
resources_servers/speed_bench/README.md
resources_servers/speed_bench/app.py
resources_servers/speed_bench/requirements.txt
resources_servers/speed_bench/configs/speed_bench.yaml
resources_servers/speed_bench/data/.gitignore
resources_servers/speed_bench/data/example.jsonl
resources_servers/speed_bench/data/example_metrics.json
resources_servers/speed_bench/data/example_rollouts.jsonl
resources_servers/speed_bench/tests/__init__.py
resources_servers/speed_bench/tests/test_app.py
resources_servers/spider2_lite/.gitignore
resources_servers/spider2_lite/README.md
resources_servers/spider2_lite/app.py
resources_servers/spider2_lite/eval_utils.py
resources_servers/spider2_lite/requirements.txt
resources_servers/spider2_lite/setup_spider2.py
resources_servers/spider2_lite/configs/spider2_lite.yaml
resources_servers/spider2_lite/data/.gitignore
resources_servers/spider2_lite/data/example.jsonl
resources_servers/spider2_lite/data/example_metrics.json
resources_servers/spider2_lite/data/example_rollouts.jsonl
resources_servers/spider2_lite/scripts/prepare_dataset.py
resources_servers/spider2_lite/scripts/run_eval.sh
resources_servers/spider2_lite/tests/__init__.py
resources_servers/spider2_lite/tests/conftest.py
resources_servers/spider2_lite/tests/test_app.py
resources_servers/structeval/README.md
resources_servers/structeval/app.py
resources_servers/structeval/requirements.txt
resources_servers/structeval/configs/structeval_nonrenderable.yaml
resources_servers/structeval/data/example.jsonl
resources_servers/structeval/data/example_metrics.json
resources_servers/structeval/data/example_rollouts.jsonl
resources_servers/structeval/data/example_rollouts_aggregate_metrics.json
resources_servers/structeval/data/structeval_nonrenderable_example.jsonl
resources_servers/structeval/data/structeval_nonrenderable_example_rollouts.jsonl
resources_servers/structeval/data/structeval_nonrenderable_example_rollouts_aggregate_metrics.json
resources_servers/structeval/misc/prepare_data.py
resources_servers/structeval/tests/__init__.py
resources_servers/structeval/tests/test_app.py
resources_servers/structured_outputs/README.md
resources_servers/structured_outputs/app.py
resources_servers/structured_outputs/client.py
resources_servers/structured_outputs/requirements.txt
resources_servers/structured_outputs/configs/structured_outputs_json.yaml
resources_servers/structured_outputs/configs/structured_outputs_json_yaml_xml_v1.yaml
resources_servers/structured_outputs/configs/structured_outputs_v3.yaml
resources_servers/structured_outputs/configs/structured_outputs_v4.yaml
resources_servers/structured_outputs/data/.gitignore
resources_servers/structured_outputs/data/example.jsonl
resources_servers/structured_outputs/data/example_metrics.json
resources_servers/structured_outputs/data/example_rollouts.jsonl
resources_servers/structured_outputs/data/structured_outputs_251027_nano_v3_sdg_json_train_metrics.json
resources_servers/structured_outputs/data/structured_outputs_251027_nano_v3_sdg_json_val_metrics.json
resources_servers/structured_outputs/data/structured_outputs_v3_example.jsonl
resources_servers/structured_outputs/data/structured_outputs_v3_example_metrics.json
resources_servers/structured_outputs/data/structured_outputs_v4_example.jsonl
resources_servers/structured_outputs/misc/breakdown_rollouts_metrics.py
resources_servers/structured_outputs/misc/check_tool_call_jsonl.py
resources_servers/structured_outputs/misc/structured-outputs-design.md
resources_servers/structured_outputs/misc/data_generation/.gitignore
resources_servers/structured_outputs/misc/data_generation/251027_nano_v3_sdg.py
resources_servers/structured_outputs/misc/data_generation/260309_nano_v3_sdg_all.py
resources_servers/structured_outputs/misc/data_generation/260309_nano_v3_sdg_json.py
resources_servers/structured_outputs/misc/data_generation/260309_nano_v3_sdg_xml.py
resources_servers/structured_outputs/misc/data_generation/260309_nano_v3_sdg_yaml.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v3/260409_augmented_sdg.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v3/README.md
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v3/prepare_verified_data.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v3/templates.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v3/generators/__init__.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v3/generators/direct.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v3/generators/error_correction.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v3/generators/multistep.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v3/generators/schema_only.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v3/generators/translation.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v4/260424_tool_call_sdg.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v4/README.md
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v4/plot_train_distribution.py
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v4/structured_outputs_v4_train_distribution.png
resources_servers/structured_outputs/misc/data_generation/structured_outputs_v4/vllm-tool-schema-compatibility.md
resources_servers/structured_outputs/tests/__init__.py
resources_servers/structured_outputs/tests/test_app.py
resources_servers/swe_pivot/README.md
resources_servers/swe_pivot/__init__.py
resources_servers/swe_pivot/app.py
resources_servers/swe_pivot/requirements.txt
resources_servers/swe_pivot/configs/swe_pivot.yaml
resources_servers/swe_pivot/data/.gitignore
resources_servers/swe_pivot/data/example.jsonl
resources_servers/swe_pivot/data/example_metrics.json
resources_servers/swe_pivot/data/example_rollouts.jsonl
resources_servers/swe_pivot/tests/__init__.py
resources_servers/swe_pivot/tests/test_verifier.py
resources_servers/swerl_gen/README.md
resources_servers/swerl_gen/app.py
resources_servers/swerl_gen/dataset_preprocess.py
resources_servers/swerl_gen/gen_eval_scripts.py
resources_servers/swerl_gen/prompts.py
resources_servers/swerl_gen/requirements.txt
resources_servers/swerl_gen/utils.py
resources_servers/swerl_gen/configs/swerl_gen.yaml
resources_servers/swerl_gen/data/.gitignore
resources_servers/swerl_gen/data/example.jsonl
resources_servers/swerl_gen/data/example_metrics.json
resources_servers/swerl_gen/data/example_rollouts.jsonl
resources_servers/swerl_gen/eval/__init__.py
resources_servers/swerl_gen/eval/eval_instance.py
resources_servers/swerl_gen/eval/process_patch.py
resources_servers/swerl_gen/eval/reward_functions.py
resources_servers/swerl_gen/eval/singularity_utils.py
resources_servers/swerl_gen/tests/__init__.py
resources_servers/swerl_gen/tests/test_app.py
resources_servers/swerl_llm_judge/README.md
resources_servers/swerl_llm_judge/app.py
resources_servers/swerl_llm_judge/dataset_preprocess.py
resources_servers/swerl_llm_judge/prompts.py
resources_servers/swerl_llm_judge/requirements.txt
resources_servers/swerl_llm_judge/utils.py
resources_servers/swerl_llm_judge/configs/swerl_llm_judge.yaml
resources_servers/swerl_llm_judge/data/example.jsonl
resources_servers/swerl_llm_judge/data/example_metrics.json
resources_servers/swerl_llm_judge/data/example_rollouts.jsonl
resources_servers/swerl_llm_judge/tests/__init__.py
resources_servers/swerl_llm_judge/tests/test_app.py
resources_servers/tavily_search/.gitignore
resources_servers/tavily_search/README.md
resources_servers/tavily_search/app.py
resources_servers/tavily_search/client.py
resources_servers/tavily_search/judge_prompt.py
resources_servers/tavily_search/plot_session_id_metrics.py
resources_servers/tavily_search/requirements.txt
resources_servers/tavily_search/configs/tavily_search_judge_openai_model.yaml
resources_servers/tavily_search/configs/tavily_search_judge_vllm_model.yaml
resources_servers/tavily_search/data/.gitignore
resources_servers/tavily_search/data/example.jsonl
resources_servers/tavily_search/data/example_metrics.json
resources_servers/tavily_search/data/example_rollouts.jsonl
resources_servers/tavily_search/data/preprocess_dataset/convert_simple_evals.py
resources_servers/tavily_search/data/preprocess_dataset/upload_datasets.sh
resources_servers/tavily_search/tests/dummy_exclude_domains_file.json
resources_servers/tavily_search/tests/test_app.py
resources_servers/terminal_multi_harness/README.md
resources_servers/terminal_multi_harness/__init__.py
resources_servers/terminal_multi_harness/app.py
resources_servers/terminal_multi_harness/requirements.txt
resources_servers/terminal_multi_harness/common/__init__.py
resources_servers/terminal_multi_harness/common/response_utils.py
resources_servers/terminal_multi_harness/common/verification_utils.py
resources_servers/terminal_multi_harness/configs/terminal_multi_harness_agent006.yaml
resources_servers/terminal_multi_harness/configs/terminal_multi_harness_codex.yaml
resources_servers/terminal_multi_harness/configs/terminal_multi_harness_opencode.yaml
resources_servers/terminal_multi_harness/configs/terminal_multi_harness_stirrup.yaml
resources_servers/terminal_multi_harness/data/.gitignore
resources_servers/terminal_multi_harness/data/example.jsonl
resources_servers/terminal_multi_harness/data/example_metrics.json
resources_servers/terminal_multi_harness/data/example_rollouts.jsonl
resources_servers/terminal_multi_harness/docs/AGENTS.md
resources_servers/terminal_multi_harness/docs/README.md
resources_servers/terminal_multi_harness/docs/development-workflow.md
resources_servers/terminal_multi_harness/docs/harness-extension-architecture.md
resources_servers/terminal_multi_harness/docs/rulebooks/agent006-match-rules.md
resources_servers/terminal_multi_harness/docs/rulebooks/codex-match-rules.md
resources_servers/terminal_multi_harness/docs/rulebooks/opencode-match-rules.md
resources_servers/terminal_multi_harness/docs/rulebooks/stirrup-match-rules.md
resources_servers/terminal_multi_harness/tests/__init__.py
resources_servers/terminal_multi_harness/tests/test_app.py
resources_servers/terminal_multi_harness/tests/common/__init__.py
resources_servers/terminal_multi_harness/tests/common/test_response_utils.py
resources_servers/terminal_multi_harness/tests/common/test_verification_utils.py
resources_servers/terminus_judge/README.md
resources_servers/terminus_judge/app.py
resources_servers/terminus_judge/requirements.txt
resources_servers/terminus_judge/schemas.py
resources_servers/terminus_judge/configs/terminus_judge.yaml
resources_servers/terminus_judge/configs/terminus_judge_simple.yaml
resources_servers/terminus_judge/configs/terminus_judge_string_only.yaml
resources_servers/terminus_judge/data/.gitignore
resources_servers/terminus_judge/data/example.jsonl
resources_servers/terminus_judge/data/example_metrics.json
resources_servers/terminus_judge/data/example_rollouts.jsonl
resources_servers/terminus_judge/prompt_templates/rubrics_v4.txt
resources_servers/terminus_judge/prompt_templates/terminus_prompt.txt
resources_servers/terminus_judge/scripts/README.md
resources_servers/terminus_judge/scripts/prepare.py
resources_servers/terminus_judge/tests/test_app.py
resources_servers/text_to_sql/README.md
resources_servers/text_to_sql/app.py
resources_servers/text_to_sql/prompts.py
resources_servers/text_to_sql/requirements.txt
resources_servers/text_to_sql/configs/text_to_sql.yaml
resources_servers/text_to_sql/data/.gitignore
resources_servers/text_to_sql/data/example.jsonl
resources_servers/text_to_sql/data/example_metrics.json
resources_servers/text_to_sql/data/example_rollouts.jsonl
resources_servers/text_to_sql/tests/__init__.py
resources_servers/text_to_sql/tests/test_app.py
resources_servers/ugphysics_judge/README.md
resources_servers/ugphysics_judge/__init__.py
resources_servers/ugphysics_judge/app.py
resources_servers/ugphysics_judge/requirements.txt
resources_servers/ugphysics_judge/configs/ugphysics_judge.yaml
resources_servers/ugphysics_judge/data/.gitignore
resources_servers/ugphysics_judge/data/example.jsonl
resources_servers/ugphysics_judge/data/example_metrics.json
resources_servers/ugphysics_judge/data/example_rollouts.jsonl
resources_servers/ugphysics_judge/prompts/judge.yaml
resources_servers/ugphysics_judge/tests/__init__.py
resources_servers/ugphysics_judge/tests/test_app.py
resources_servers/verifif/README.md
resources_servers/verifif/app.py
resources_servers/verifif/requirements.txt
resources_servers/verifif/configs/verifif.yaml
resources_servers/verifif/data/example.jsonl
resources_servers/verifif/data/example_metrics.json
resources_servers/verifif/data/example_rollouts.jsonl
resources_servers/verifif/tests/__init__.py
resources_servers/verifif/tests/test_app.py
resources_servers/verifif/vif_validators/__init__.py
resources_servers/verifif/vif_validators/data_loader.py
resources_servers/verifif/vif_validators/evaluation_modes.csv
resources_servers/verifif/vif_validators/instruction_definition.csv
resources_servers/verifif/vif_validators/subinstruction_definition.csv
resources_servers/verifif/vif_validators/validator.py
resources_servers/verifif/vif_validators/ar/__init__.py
resources_servers/verifif/vif_validators/ar/validator.py
resources_servers/verifif/vif_validators/de/__init__.py
resources_servers/verifif/vif_validators/de/validator.py
resources_servers/verifif/vif_validators/en/__init__.py
resources_servers/verifif/vif_validators/en/validator.py
resources_servers/verifif/vif_validators/en/wrapper.py
resources_servers/verifif/vif_validators/es/__init__.py
resources_servers/verifif/vif_validators/es/validator.py
resources_servers/verifif/vif_validators/fre/__init__.py
resources_servers/verifif/vif_validators/fre/validator.py
resources_servers/verifif/vif_validators/hi/__init__.py
resources_servers/verifif/vif_validators/hi/validator.py
resources_servers/verifif/vif_validators/it/__init__.py
resources_servers/verifif/vif_validators/it/validator.py
resources_servers/verifif/vif_validators/ja/__init__.py
resources_servers/verifif/vif_validators/ja/validator.py
resources_servers/verifif/vif_validators/ko/__init__.py
resources_servers/verifif/vif_validators/ko/validator.py
resources_servers/verifif/vif_validators/pt_br/__init__.py
resources_servers/verifif/vif_validators/pt_br/validator.py
resources_servers/verifif/vif_validators/zh/__init__.py
resources_servers/verifif/vif_validators/zh/validator.py
resources_servers/vlm_eval_kit/.gitignore
resources_servers/vlm_eval_kit/README.md
resources_servers/vlm_eval_kit/app.py
resources_servers/vlm_eval_kit/call_aggregate_metrics.py
resources_servers/vlm_eval_kit/prepare_data.py
resources_servers/vlm_eval_kit/pyproject.toml
resources_servers/vlm_eval_kit/configs/MMBench_DEV_EN_V11.yaml
resources_servers/vlm_eval_kit/configs/OCRBench.yaml
resources_servers/vlm_eval_kit/configs/vlm_eval_kit.yaml
resources_servers/vlm_eval_kit/data/.gitignore
resources_servers/vlm_eval_kit/data/MMBench_DEV_EN_V11_validation_metrics.json
resources_servers/vlm_eval_kit/data/OCRBench_validation_metrics.json
resources_servers/vlm_eval_kit/data/example.jsonl
resources_servers/vlm_eval_kit/data/example_metrics.json
resources_servers/vlm_eval_kit/data/example_rollouts.jsonl
resources_servers/vlm_eval_kit/tests/test_app.py
resources_servers/wmt_translation/README.md
resources_servers/wmt_translation/__init__.py
resources_servers/wmt_translation/app.py
resources_servers/wmt_translation/requirements.txt
resources_servers/wmt_translation/configs/wmt_translation.yaml
resources_servers/wmt_translation/data/.gitignore
resources_servers/wmt_translation/data/example.jsonl
resources_servers/wmt_translation/data/example_metrics.json
resources_servers/wmt_translation/data/example_rollouts.jsonl
resources_servers/wmt_translation/tests/__init__.py
resources_servers/wmt_translation/tests/test_app.py
resources_servers/workplace_assistant/README.md
resources_servers/workplace_assistant/app.py
resources_servers/workplace_assistant/client.py
resources_servers/workplace_assistant/dataset_preprocess.py
resources_servers/workplace_assistant/requirements.txt
resources_servers/workplace_assistant/utils.py
resources_servers/workplace_assistant/configs/workplace_assistant.yaml
resources_servers/workplace_assistant/csv_data/processed/analytics_data.csv
resources_servers/workplace_assistant/csv_data/processed/calendar_events.csv
resources_servers/workplace_assistant/csv_data/processed/customer_relationship_manager_data.csv
resources_servers/workplace_assistant/csv_data/processed/emails.csv
resources_servers/workplace_assistant/csv_data/processed/project_tasks.csv
resources_servers/workplace_assistant/csv_data/raw/email_addresses.csv
resources_servers/workplace_assistant/csv_data/raw/events.csv
resources_servers/workplace_assistant/data/.gitignore
resources_servers/workplace_assistant/data/example.jsonl
resources_servers/workplace_assistant/data/example_metrics.json
resources_servers/workplace_assistant/data/example_rollouts.jsonl
resources_servers/workplace_assistant/data/train_metrics.json
resources_servers/workplace_assistant/data/validation_metrics.json
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/README.md
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/multistep-toolcalling-sdg.ipynb
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/requirements.txt
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/tools/analytics.json
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/tools/calendar.json
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/tools/company_directory.json
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/tools/customer_relationship_manager.json
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/tools/email.json
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/tools/environment.json
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/tools/project_management.json
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/utils/__init__.py
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/utils/convert_to_nemo_gym_format.py
resources_servers/workplace_assistant/notebooks/synthetic-data-generation/utils/quality_filtering.py
resources_servers/workplace_assistant/tests/test_app.py
resources_servers/workplace_assistant/workplace_assistant_tools/analytics.py
resources_servers/workplace_assistant/workplace_assistant_tools/calendar.py
resources_servers/workplace_assistant/workplace_assistant_tools/company_directory.py
resources_servers/workplace_assistant/workplace_assistant_tools/customer_relationship_manager.py
resources_servers/workplace_assistant/workplace_assistant_tools/email.py
resources_servers/workplace_assistant/workplace_assistant_tools/project_management.py
resources_servers/xlam_fc/README.md
resources_servers/xlam_fc/app.py
resources_servers/xlam_fc/generate_dataset.py
resources_servers/xlam_fc/requirements.txt
resources_servers/xlam_fc/configs/xlam_fc.yaml
resources_servers/xlam_fc/data/example.jsonl
resources_servers/xlam_fc/data/example_metrics.json
resources_servers/xlam_fc/data/example_rollouts.jsonl
resources_servers/xlam_fc/tests/__init__.py
resources_servers/xlam_fc/tests/test_app.py
resources_servers/xstest/README.md
resources_servers/xstest/app.py
resources_servers/xstest/requirements.txt
resources_servers/xstest/configs/xstest.yaml
resources_servers/xstest/data/.gitignore
resources_servers/xstest/data/example.jsonl
resources_servers/xstest/data/example_metrics.json
resources_servers/xstest/data/example_rollouts.jsonl
resources_servers/xstest/prompt_templates/wildguard_judge.txt
resources_servers/xstest/prompt_templates/xstest_judge.txt
resources_servers/xstest/scripts/aggregate_results.py
resources_servers/xstest/tests/__init__.py
resources_servers/xstest/tests/test_app.py
responses_api_agents/aviary_agent/README.md
responses_api_agents/aviary_agent/app.py
responses_api_agents/aviary_agent/requirements.txt
responses_api_agents/aviary_agent/tests/test_app.py
responses_api_agents/browsecomp_agent/README.md
responses_api_agents/browsecomp_agent/__init__.py
responses_api_agents/browsecomp_agent/app.py
responses_api_agents/browsecomp_agent/requirements.txt
responses_api_agents/browsecomp_agent/configs/browsecomp_agent.yaml
responses_api_agents/browsecomp_agent/tests/__init__.py
responses_api_agents/browsecomp_agent/tests/test_app.py
responses_api_agents/claude_code_agent/README.md
responses_api_agents/claude_code_agent/__init__.py
responses_api_agents/claude_code_agent/app.py
responses_api_agents/claude_code_agent/requirements.txt
responses_api_agents/claude_code_agent/setup_claude_code.py
responses_api_agents/claude_code_agent/configs/claude_code_agent.yaml
responses_api_agents/claude_code_agent/data/example_math_rollouts.jsonl
responses_api_agents/claude_code_agent/data/example_math_rollouts_aggregate_metrics.json
responses_api_agents/claude_code_agent/data/example_math_rollouts_materialized_inputs.jsonl
responses_api_agents/claude_code_agent/tests/__init__.py
responses_api_agents/claude_code_agent/tests/test_app.py
responses_api_agents/cvdp_agent/README.md
responses_api_agents/cvdp_agent/__init__.py
responses_api_agents/cvdp_agent/app.py
responses_api_agents/cvdp_agent/requirements.txt
responses_api_agents/cvdp_agent/configs/cvdp_agent.yaml
responses_api_agents/cvdp_agent/tests/test_app.py
responses_api_agents/finance_agent/README.md
responses_api_agents/finance_agent/__init__.py
responses_api_agents/finance_agent/app.py
responses_api_agents/finance_agent/requirements.txt
responses_api_agents/finance_agent/tests/__init__.py
responses_api_agents/finance_agent/tests/test_app.py
responses_api_agents/gymnasium_agent/README.md
responses_api_agents/gymnasium_agent/__init__.py
responses_api_agents/gymnasium_agent/app.py
responses_api_agents/gymnasium_agent/requirements.txt
responses_api_agents/gymnasium_agent/configs/gymnasium_agent.yaml
responses_api_agents/gymnasium_agent/tests/__init__.py
responses_api_agents/gymnasium_agent/tests/test_app.py
responses_api_agents/harbor_agent/.gitignore
responses_api_agents/harbor_agent/README.md
responses_api_agents/harbor_agent/__init__.py
responses_api_agents/harbor_agent/app.py
responses_api_agents/harbor_agent/client.py
responses_api_agents/harbor_agent/requirements.txt
responses_api_agents/harbor_agent/utils.py
responses_api_agents/harbor_agent/configs/harbor_agent.yaml
responses_api_agents/harbor_agent/configs/harbor_agent_daytona.yaml
responses_api_agents/harbor_agent/custom_agents/terminus_2_nemo_gym.py
responses_api_agents/harbor_agent/custom_agents/llms/nemo_gym_llm.py
responses_api_agents/harbor_agent/custom_agents/llms/test_nemo_gym_llm.py
responses_api_agents/harbor_agent/custom_envs/singularity/server.py
responses_api_agents/harbor_agent/custom_envs/singularity/singularity.py
responses_api_agents/harbor_agent/custom_envs/singularity/scripts/build_and_push_images.py
responses_api_agents/harbor_agent/custom_envs/singularity/scripts/rewrite_task_tomls.py
responses_api_agents/harbor_agent/custom_envs/singularity/scripts/write_min_setup_sh.py
responses_api_agents/harbor_agent/example/example_input.jsonl
responses_api_agents/harbor_agent/example/example_output.jsonl
responses_api_agents/harbor_agent/example/terminal_bench_daytona_input.jsonl
responses_api_agents/harbor_agent/example/terminal_bench_daytona_output.jsonl
responses_api_agents/harbor_agent/tests/__init__.py
responses_api_agents/harbor_agent/tests/test_app.py
responses_api_agents/hermes_agent/README.md
responses_api_agents/hermes_agent/__init__.py
responses_api_agents/hermes_agent/app.py
responses_api_agents/hermes_agent/requirements.txt
responses_api_agents/hermes_agent/configs/hermes_agent.yaml
responses_api_agents/hermes_agent/data/example_math_rollouts.jsonl
responses_api_agents/hermes_agent/data/example_math_rollouts_aggregate_metrics.json
responses_api_agents/hermes_agent/data/example_math_rollouts_materialized_inputs.jsonl
responses_api_agents/hermes_agent/tests/__init__.py
responses_api_agents/hermes_agent/tests/test_app.py
responses_api_agents/hermes_agent/tests/test_config.py
responses_api_agents/labbench2_vlm_agent/README.md
responses_api_agents/labbench2_vlm_agent/app.py
responses_api_agents/labbench2_vlm_agent/requirements.txt
responses_api_agents/labbench2_vlm_agent/tests/test_app.py
responses_api_agents/langgraph_agent/README.md
responses_api_agents/langgraph_agent/app.py
responses_api_agents/langgraph_agent/orchestrator_agent.py
responses_api_agents/langgraph_agent/parallel_thinking_agent.py
responses_api_agents/langgraph_agent/reflection_agent.py
responses_api_agents/langgraph_agent/requirements.txt
responses_api_agents/langgraph_agent/rewoo_agent.py
responses_api_agents/langgraph_agent/configs/orchestrator_agent.yaml
responses_api_agents/langgraph_agent/configs/parallel_thinking_agent.yaml
responses_api_agents/langgraph_agent/configs/reflection_agent.yaml
responses_api_agents/langgraph_agent/configs/rewoo_agent.yaml
responses_api_agents/langgraph_agent/tests/__init__.py
responses_api_agents/langgraph_agent/tests/test_app.py
responses_api_agents/mini_swe_agent/.gitignore
responses_api_agents/mini_swe_agent/README.md
responses_api_agents/mini_swe_agent/__init__.py
responses_api_agents/mini_swe_agent/app.py
responses_api_agents/mini_swe_agent/client.py
responses_api_agents/mini_swe_agent/requirements.txt
responses_api_agents/mini_swe_agent/utils.py
responses_api_agents/mini_swe_agent/assets/miniswe_qwen_coder.png
responses_api_agents/mini_swe_agent/configs/mini_swe_agent.yaml
responses_api_agents/mini_swe_agent/data/.gitignore
responses_api_agents/mini_swe_agent/data/example.jsonl
responses_api_agents/mini_swe_agent/tests/test_app.py
responses_api_agents/non_executing_simple_agent/README.md
responses_api_agents/non_executing_simple_agent/__init__.py
responses_api_agents/non_executing_simple_agent/app.py
responses_api_agents/non_executing_simple_agent/requirements.txt
responses_api_agents/non_executing_simple_agent/configs/non_executing_simple_agent.yaml
responses_api_agents/non_executing_simple_agent/tests/test_app.py
responses_api_agents/proof_refinement_agent/README.md
responses_api_agents/proof_refinement_agent/__init__.py
responses_api_agents/proof_refinement_agent/app.py
responses_api_agents/proof_refinement_agent/requirements.txt
responses_api_agents/proof_refinement_agent/configs/proof_refinement_agent.yaml
responses_api_agents/proof_refinement_agent/tests/test_app.py
responses_api_agents/simple_agent/README.md
responses_api_agents/simple_agent/__init__.py
responses_api_agents/simple_agent/app.py
responses_api_agents/simple_agent/client.py
responses_api_agents/simple_agent/requirements.txt
responses_api_agents/simple_agent/configs/simple_agent.yaml
responses_api_agents/simple_agent/tests/test_app.py
responses_api_agents/speed_bench_agent/README.md
responses_api_agents/speed_bench_agent/__init__.py
responses_api_agents/speed_bench_agent/app.py
responses_api_agents/speed_bench_agent/requirements.txt
responses_api_agents/speed_bench_agent/configs/speed_bench_agent.yaml
responses_api_agents/speed_bench_agent/tests/__init__.py
responses_api_agents/speed_bench_agent/tests/test_app.py
responses_api_agents/stirrup_agent/README.md
responses_api_agents/stirrup_agent/__init__.py
responses_api_agents/stirrup_agent/app.py
responses_api_agents/stirrup_agent/apptainer_provider.py
responses_api_agents/stirrup_agent/client.py
responses_api_agents/stirrup_agent/file_reader.py
responses_api_agents/stirrup_agent/finish_tool_coercing.py
responses_api_agents/stirrup_agent/nemo_agent.py
responses_api_agents/stirrup_agent/nemo_client.py
responses_api_agents/stirrup_agent/requirements.txt
responses_api_agents/stirrup_agent/stirrup_utils.py
responses_api_agents/stirrup_agent/task_strategy.py
responses_api_agents/stirrup_agent/tavily_search.py
responses_api_agents/stirrup_agent/configs/stirrup_gdpval.yaml
responses_api_agents/stirrup_agent/containers/gdpval.def
responses_api_agents/stirrup_agent/data/.gitignore
responses_api_agents/stirrup_agent/data/example.jsonl
responses_api_agents/stirrup_agent/prompts/gdpval_user_prompt.txt
responses_api_agents/stirrup_agent/prompts/system_prompt.j2
responses_api_agents/stirrup_agent/prompts/user_prompt.j2
responses_api_agents/stirrup_agent/setup_scripts/gdpval.sh
responses_api_agents/stirrup_agent/tasks/__init__.py
responses_api_agents/stirrup_agent/tasks/gdpval.py
responses_api_agents/stirrup_agent/tests/__init__.py
responses_api_agents/stirrup_agent/tests/test_app.py
responses_api_agents/stirrup_agent/tests/test_apptainer_provider.py
responses_api_agents/stirrup_agent/tests/test_finish_tool_coercing.py
responses_api_agents/stirrup_agent/tests/test_nemo_client.py
responses_api_agents/stirrup_agent/tests/test_tasks_gdpval.py
responses_api_agents/stirrup_agent/tests/test_tavily_search.py
responses_api_agents/swe_agents/.gitignore
responses_api_agents/swe_agents/README.md
responses_api_agents/swe_agents/__init__.py
responses_api_agents/swe_agents/app.py
responses_api_agents/swe_agents/client.py
responses_api_agents/swe_agents/requirements.txt
responses_api_agents/swe_agents/configs/__init__.py
responses_api_agents/swe_agents/configs/oh_config.toml
responses_api_agents/swe_agents/configs/swe_agent_config.yaml
responses_api_agents/swe_agents/configs/swe_agent_tools_openai_format.json
responses_api_agents/swe_agents/configs/swebench_multi_tools.yaml
responses_api_agents/swe_agents/configs/swebench_openhands.yaml
responses_api_agents/swe_agents/configs/swebench_openhands_training.yaml
responses_api_agents/swe_agents/configs/swebench_swe_agent.yaml
responses_api_agents/swe_agents/data/.gitignore
responses_api_agents/swe_agents/data/example.jsonl
responses_api_agents/swe_agents/data/example_dummy_swebench_response.json
responses_api_agents/swe_agents/data/example_metrics.json
responses_api_agents/swe_agents/prompts/breadth_first/system_prompt.j2
responses_api_agents/swe_agents/prompts/breadth_first/user_prompt.j2
responses_api_agents/swe_agents/prompts/codex/system_prompt.j2
responses_api_agents/swe_agents/prompts/codex/user_prompt.j2
responses_api_agents/swe_agents/prompts/divide_and_conquer/system_prompt.j2
responses_api_agents/swe_agents/prompts/divide_and_conquer/user_prompt.j2
responses_api_agents/swe_agents/prompts/explore_plan_execute/system_prompt.j2
responses_api_agents/swe_agents/prompts/explore_plan_execute/user_prompt.j2
responses_api_agents/swe_agents/prompts/hypothesis_driven/system_prompt.j2
responses_api_agents/swe_agents/prompts/hypothesis_driven/user_prompt.j2
responses_api_agents/swe_agents/prompts/incremental/system_prompt.j2
responses_api_agents/swe_agents/prompts/incremental/user_prompt.j2
responses_api_agents/swe_agents/prompts/minimalist/system_prompt.j2
responses_api_agents/swe_agents/prompts/minimalist/user_prompt.j2
responses_api_agents/swe_agents/prompts/opencode/system_prompt.j2
responses_api_agents/swe_agents/prompts/opencode/user_prompt.j2
responses_api_agents/swe_agents/prompts/openhands/system_prompt.j2
responses_api_agents/swe_agents/prompts/openhands/user_prompt.j2
responses_api_agents/swe_agents/prompts/plan_and_execute/system_prompt.j2
responses_api_agents/swe_agents/prompts/plan_and_execute/user_prompt.j2
responses_api_agents/swe_agents/prompts/root_cause/system_prompt.j2
responses_api_agents/swe_agents/prompts/root_cause/user_prompt.j2
responses_api_agents/swe_agents/prompts/surgical/system_prompt.j2
responses_api_agents/swe_agents/prompts/surgical/user_prompt.j2
responses_api_agents/swe_agents/prompts/terminus/system_prompt.j2
responses_api_agents/swe_agents/prompts/terminus/user_prompt.j2
responses_api_agents/swe_agents/prompts/test_driven/system_prompt.j2
responses_api_agents/swe_agents/prompts/test_driven/user_prompt.j2
responses_api_agents/swe_agents/prompts/verify_first/system_prompt.j2
responses_api_agents/swe_agents/prompts/verify_first/user_prompt.j2
responses_api_agents/swe_agents/setup_scripts/openhands.sh
responses_api_agents/swe_agents/setup_scripts/r2e_gym.sh
responses_api_agents/swe_agents/setup_scripts/swe_rebench.sh
responses_api_agents/swe_agents/setup_scripts/swebench.sh
responses_api_agents/swe_agents/setup_scripts/swebench_multilingual.sh
responses_api_agents/swe_agents/swe_bench_ext/__init__.py
responses_api_agents/swe_agents/swe_bench_ext/frameworks.py
responses_api_agents/swe_agents/swe_bench_ext/parsing.py
responses_api_agents/swe_agents/swe_bench_ext/utils.py
responses_api_agents/swe_agents/tests/__init__.py
responses_api_agents/swe_agents/tests/test_app.py
responses_api_agents/tau2/.gitignore
responses_api_agents/tau2/README.md
responses_api_agents/tau2/__init__.py
responses_api_agents/tau2/app.py
responses_api_agents/tau2/requirements.txt
responses_api_agents/tau2/configs/tau2_agent.yaml
responses_api_agents/tau2/data/.gitignore
responses_api_agents/tau2/data/example.jsonl
responses_api_agents/tau2/data/example_metrics.json
responses_api_agents/tau2/data/example_rollouts.jsonl
responses_api_agents/tau2/tests/test_app.py
responses_api_agents/tau2/tests/test_data.json
responses_api_agents/tool_simulation_agent/README.md
responses_api_agents/tool_simulation_agent/__init__.py
responses_api_agents/tool_simulation_agent/app.py
responses_api_agents/tool_simulation_agent/requirements.txt
responses_api_agents/tool_simulation_agent/configs/tool_simulation_agent.yaml
responses_api_agents/tool_simulation_agent/tests/__init__.py
responses_api_agents/tool_simulation_agent/tests/test_app.py
responses_api_agents/verifiers_agent/README.md
responses_api_agents/verifiers_agent/__init__.py
responses_api_agents/verifiers_agent/app.py
responses_api_agents/verifiers_agent/requirements.txt
responses_api_agents/verifiers_agent/configs/acereason-math.yaml
responses_api_agents/verifiers_agent/data/acereason-math-example.jsonl
responses_api_agents/verifiers_agent/scripts/create_dataset.py
responses_api_agents/verifiers_agent/tests/__init__.py
responses_api_agents/verifiers_agent/tests/test_app.py
responses_api_models/azure_openai_model/README.md
responses_api_models/azure_openai_model/app.py
responses_api_models/azure_openai_model/client.py
responses_api_models/azure_openai_model/requirements.txt
responses_api_models/azure_openai_model/configs/azure_openai_model.yaml
responses_api_models/azure_openai_model/tests/test_app.py
responses_api_models/genrm_model/README.md
responses_api_models/genrm_model/__init__.py
responses_api_models/genrm_model/app.py
responses_api_models/genrm_model/pyproject.toml
responses_api_models/genrm_model/setup.py
responses_api_models/genrm_model/configs/genrm_model.yaml
responses_api_models/genrm_model/tests/__init__.py
responses_api_models/genrm_model/tests/test_app.py
responses_api_models/local_vllm_model/README.md
responses_api_models/local_vllm_model/__init__.py
responses_api_models/local_vllm_model/app.py
responses_api_models/local_vllm_model/local_vllm_model_actor.py
responses_api_models/local_vllm_model/pyproject.toml
responses_api_models/local_vllm_model/setup.py
responses_api_models/local_vllm_model/configs/Qwen/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
responses_api_models/local_vllm_model/configs/Qwen/Qwen3-30B-A3B-Instruct-2507-ngram-specdec.yaml
responses_api_models/local_vllm_model/configs/Qwen/Qwen3-30B-A3B-Instruct-2507.yaml
responses_api_models/local_vllm_model/configs/Qwen/Qwen3-30B-A3B-Thinking-2507.yaml
responses_api_models/local_vllm_model/configs/Qwen/Qwen3.5-122B-A10B.yaml
responses_api_models/local_vllm_model/configs/Qwen/Qwen3.5-27B.yaml
responses_api_models/local_vllm_model/configs/Qwen/Qwen3.5-35B-A3B.yaml
responses_api_models/local_vllm_model/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
responses_api_models/local_vllm_model/configs/nvidia/nemotron_3_ultra_dev_nemorl_gb200.yaml
responses_api_models/local_vllm_model/configs/openai/gpt-oss-120b-reasoning-high.yaml
responses_api_models/local_vllm_model/configs/openai/gpt-oss-120b-reasoning-low.yaml
responses_api_models/local_vllm_model/configs/openai/gpt-oss-120b-reasoning-medium.yaml
responses_api_models/local_vllm_model/configs/openai/gpt-oss-20b-reasoning-high.yaml
responses_api_models/local_vllm_model/scripts/launch_vllm_server.sh
responses_api_models/local_vllm_model/scripts/nvidia_smi_cluster.py
responses_api_models/local_vllm_model/test_scripts/1_node/1_instance_1x8.sh
responses_api_models/local_vllm_model/test_scripts/1_node/1_instance_2x4.sh
responses_api_models/local_vllm_model/test_scripts/1_node/2_instances_1x4.sh
responses_api_models/local_vllm_model/test_scripts/2_nodes/1_instance_1x16.sh
responses_api_models/local_vllm_model/test_scripts/2_nodes/2_instances_1x8.sh
responses_api_models/local_vllm_model/test_scripts/2_nodes/2_instances_2x4.sh
responses_api_models/local_vllm_model/test_scripts/4_nodes/1_instance_2x16.sh
responses_api_models/local_vllm_model/test_scripts/4_nodes/2_instances_1x16.sh
responses_api_models/local_vllm_model/test_scripts/8_nodes/2_instances_2x16.sh
responses_api_models/local_vllm_model/tests/__init__.py
responses_api_models/local_vllm_model/tests/test_app.py
responses_api_models/local_vllm_model_proxy/README.md
responses_api_models/local_vllm_model_proxy/__init__.py
responses_api_models/local_vllm_model_proxy/app.py
responses_api_models/local_vllm_model_proxy/requirements.txt
responses_api_models/local_vllm_model_proxy/configs/local_vllm_model_proxy.yaml
responses_api_models/local_vllm_model_proxy/tests/test_app.py
responses_api_models/openai_model/README.md
responses_api_models/openai_model/__init__.py
responses_api_models/openai_model/app.py
responses_api_models/openai_model/client.py
responses_api_models/openai_model/requirements.txt
responses_api_models/openai_model/configs/openai_model.yaml
responses_api_models/openai_model/configs/OpenAI/gpt-4.1-2025-04-14.yaml
responses_api_models/openai_model/configs/OpenAI/gpt-5-2025-08-07.yaml
responses_api_models/openai_model/configs/OpenAI/gpt-5-nano-2025-08-07.yaml
responses_api_models/openai_model/configs/OpenAI/gpt-5.2-2025-12-11.yaml
responses_api_models/openai_model/tests/test_app.py
responses_api_models/vllm_model/README.md
responses_api_models/vllm_model/__init__.py
responses_api_models/vllm_model/app.py
responses_api_models/vllm_model/client.py
responses_api_models/vllm_model/pyproject.toml
responses_api_models/vllm_model/configs/vllm_model.yaml
responses_api_models/vllm_model/configs/vllm_model_for_training.yaml
responses_api_models/vllm_model/tests/__init__.py
responses_api_models/vllm_model/tests/round_trip_test_data.json
responses_api_models/vllm_model/tests/test_app.py
results/.gitignore
scripts/add_verified_flag.py
scripts/print_aggregate_results.py
scripts/update_env_list.py
scripts/wait_for_servers.sh
tests/__init__.py
tests/conftest.py
tests/functional_tests/L2_Functional_Tests_GPU.sh
tests/functional_tests/__init__.py
tests/unit_tests/__init__.py
tests/unit_tests/test_aggregate_metrics.py
tests/unit_tests/test_base_resources_server.py
tests/unit_tests/test_base_responses_api_agent.py
tests/unit_tests/test_base_responses_api_model.py
tests/unit_tests/test_benchmarks.py
tests/unit_tests/test_cli.py
tests/unit_tests/test_cli_setup_command.py
tests/unit_tests/test_config_types_help.py
tests/unit_tests/test_dataset_orchestrator.py
tests/unit_tests/test_gitlab_utils.py
tests/unit_tests/test_global_config.py
tests/unit_tests/test_hf_utils.py
tests/unit_tests/test_openai_utils.py
tests/unit_tests/test_placeholder.py
tests/unit_tests/test_profiling.py
tests/unit_tests/test_prompt.py
tests/unit_tests/test_reward_profile.py
tests/unit_tests/test_rollout_collection.py
tests/unit_tests/test_server_status.py
tests/unit_tests/test_server_utils.py
tests/unit_tests/test_train_data_utils.py