LICENSE
README.md
__init__.py
__main__.py
cli.py
pyproject.toml
samplers.py
utils.py
./__init__.py
./__main__.py
./cli.py
./samplers.py
./utils.py
llm_eval_framework.egg-info/PKG-INFO
llm_eval_framework.egg-info/SOURCES.txt
llm_eval_framework.egg-info/dependency_links.txt
llm_eval_framework.egg-info/entry_points.txt
llm_eval_framework.egg-info/requires.txt
llm_eval_framework.egg-info/top_level.txt
tasks/__init__.py
tasks/alpaca_eval_task.py
tasks/arena_hard_task.py
tasks/healthbench_task.py
tasks/ifbench_task.py
tasks/ifeval_task.py
tasks/pairwise_base.py
tasks/writingbench_task.py
tasks/alpaca_eval/__init__.py
tasks/alpaca_eval/data/alpaca_eval_gpt4_baseline.json
tasks/arena_hard/__init__.py
tasks/arena_hard/data/arena-hard-v2.0/question.jsonl
tasks/arena_hard/data/arena-hard-v2.0/model_answer/gemini-2.0-flash-001.jsonl
tasks/arena_hard/data/arena-hard-v2.0/model_answer/o3-mini-2025-01-31.jsonl
tasks/healthbench/__init__.py
tasks/healthbench/data/healthbench_eval.jsonl
tasks/ifbench/__init__.py
tasks/ifbench/data/IFBench_test.jsonl
tasks/ifeval/__init__.py
tasks/ifeval/evaluation.py
tasks/ifeval/instructions.py
tasks/ifeval/instructions_registry.py
tasks/ifeval/instructions_util.py
tasks/ifeval/data/input_data.jsonl
tasks/writingbench/__init__.py
tasks/writingbench/calculate_scores.py
tasks/writingbench/prompt.py
tasks/writingbench/benchmark_query/benchmark_all.jsonl
tasks/writingbench/benchmark_query/benchmark_single_prompt.jsonl
tasks/writingbench/benchmark_query/requirement/format/format_subset.jsonl
tasks/writingbench/benchmark_query/requirement/format/format_subset_C.jsonl
tasks/writingbench/benchmark_query/requirement/length/length_subset.jsonl
tasks/writingbench/benchmark_query/requirement/length/length_subset_C.jsonl
tasks/writingbench/benchmark_query/requirement/style/style_subset.jsonl
tasks/writingbench/benchmark_query/requirement/style/style_subset_C.jsonl
tests/test_arena_hard_bundled.py