LICENSE
README.md
pyproject.toml
api/app/__init__.py
api/app/main.py
api/app/api/__init__.py
api/app/api/api_keys.py
api/app/api/auth.py
api/app/api/datasets.py
api/app/api/graders.py
api/app/api/permissions.py
api/app/api/scenarios.py
api/app/api/tasks.py
api/app/api/tenants.py
api/app/api/users.py
api/app/core/__init__.py
api/app/core/config.py
api/app/core/database.py
api/app/models/__init__.py
api/app/models/api_key.py
api/app/models/permission.py
api/app/models/task.py
api/app/models/tenant.py
api/app/models/user.py
api/app/schemas/__init__.py
api/app/schemas/api_key.py
api/app/schemas/permission.py
api/app/schemas/task.py
api/app/schemas/tenant.py
api/app/schemas/user.py
api/app/services/__init__.py
api/app/services/evaluation.py
api/app/services/storage.py
api/app/services/task_executor.py
api/app/utils/__init__.py
api/app/utils/deps.py
api/app/utils/permissions.py
api/app/utils/security.py
cookbooks/data_refinement/refinement.py
cookbooks/grader_validation/accuracy.py
cookbooks/grader_validation/grader_validator.py
cookbooks/grader_validation/rewardbench2.py
cookbooks/integrations/langsmith.py
cookbooks/pairwise_evaluation/pairwise_evaluation.py
cookbooks/training_judge_model/bradley-terry/dataset.py
cookbooks/training_judge_model/bradley-terry/trainer.py
cookbooks/training_judge_model/grpo/chat_rl_dataset.py
cookbooks/training_judge_model/grpo/pairwise/reward_fn.py
cookbooks/training_judge_model/grpo/pointwise/reward_fn.py
cookbooks/zero_shot_evaluation/__main__.py
cookbooks/zero_shot_evaluation/chart_generator.py
cookbooks/zero_shot_evaluation/query_generator.py
cookbooks/zero_shot_evaluation/report_generator.py
cookbooks/zero_shot_evaluation/response_collector.py
cookbooks/zero_shot_evaluation/schema.py
cookbooks/zero_shot_evaluation/zero_shot_pipeline.py
examples/scenario_evaluation_example.py
modelselect/__init__.py
modelselect/analyzer/__init__.py
modelselect/analyzer/base_analyzer.py
modelselect/analyzer/pairwise_analyzer.py
modelselect/analyzer/statistical/__init__.py
modelselect/analyzer/statistical/consistency_analyzer.py
modelselect/analyzer/statistical/distribution_analyzer.py
modelselect/analyzer/validation/__init__.py
modelselect/analyzer/validation/accuracy_analyzer.py
modelselect/analyzer/validation/base_validation_analyzer.py
modelselect/analyzer/validation/correlation_analyzer.py
modelselect/analyzer/validation/f1_score_analyzer.py
modelselect/analyzer/validation/false_negative_analyzer.py
modelselect/analyzer/validation/false_positive_analyzer.py
modelselect/analyzer/validation/precision_analyzer.py
modelselect/analyzer/validation/recall_analyzer.py
modelselect/generator/__init__.py
modelselect/generator/base_generator.py
modelselect/generator/llm_grader_generator.py
modelselect/generator/iterative_rubric/__init__.py
modelselect/generator/iterative_rubric/categorizer.py
modelselect/generator/iterative_rubric/generator.py
modelselect/generator/iterative_rubric/mcr_selector.py
modelselect/generator/iterative_rubric/query_rubric_generator.py
modelselect/generator/simple_rubric/__init__.py
modelselect/generator/simple_rubric/generator.py
modelselect/generator/simple_rubric/rubric_generator.py
modelselect/graders/__init__.py
modelselect/graders/base_grader.py
modelselect/graders/function_grader.py
modelselect/graders/llm_grader.py
modelselect/graders/schema.py
modelselect/graders/agent/__init__.py
modelselect/graders/agent/utils.py
modelselect/graders/agent/action/__init__.py
modelselect/graders/agent/action/action_alignment.py
modelselect/graders/agent/action/action_loop.py
modelselect/graders/agent/memory/__init__.py
modelselect/graders/agent/memory/memory_accuracy.py
modelselect/graders/agent/memory/memory_detail_preservation.py
modelselect/graders/agent/memory/memory_retrieval_effectiveness.py
modelselect/graders/agent/observation/__init__.py
modelselect/graders/agent/observation/observation_information_gain.py
modelselect/graders/agent/plan/__init__.py
modelselect/graders/agent/plan/plan_feasibility.py
modelselect/graders/agent/reflection/__init__.py
modelselect/graders/agent/reflection/reflection_accuracy.py
modelselect/graders/agent/reflection/reflection_outcome_understanding.py
modelselect/graders/agent/reflection/reflection_progress_awareness.py
modelselect/graders/agent/tool/__init__.py
modelselect/graders/agent/tool/tool_call_accuracy.py
modelselect/graders/agent/tool/tool_call_precision_recall_match.py
modelselect/graders/agent/tool/tool_call_step_sequence_match.py
modelselect/graders/agent/tool/tool_call_success.py
modelselect/graders/agent/tool/tool_parameter_check.py
modelselect/graders/agent/tool/tool_selection.py
modelselect/graders/agent/trajectory/__init__.py
modelselect/graders/agent/trajectory/trajectory_comprehensive.py
modelselect/graders/code/__init__.py
modelselect/graders/code/code_execution.py
modelselect/graders/code/code_style.py
modelselect/graders/code/patch_similarity.py
modelselect/graders/code/syntax_checker.py
modelselect/graders/code/_utils/__init__.py
modelselect/graders/code/_utils/testing_util.py
modelselect/graders/code/_utils/utils.py
modelselect/graders/common/__init__.py
modelselect/graders/common/correctness.py
modelselect/graders/common/hallucination.py
modelselect/graders/common/harmfulness.py
modelselect/graders/common/instruction_following.py
modelselect/graders/common/relevance.py
modelselect/graders/format/__init__.py
modelselect/graders/format/length_penalty.py
modelselect/graders/format/ngram_repetition_penalty.py
modelselect/graders/format/reasoning_format.py
modelselect/graders/format/reasoning_tool_format.py
modelselect/graders/format/json/__init__.py
modelselect/graders/format/json/json_match.py
modelselect/graders/format/json/json_validator.py
modelselect/graders/math/__init__.py
modelselect/graders/math/math_expression_verify.py
modelselect/graders/multimodal/__init__.py
modelselect/graders/multimodal/image_coherence.py
modelselect/graders/multimodal/image_helpfulness.py
modelselect/graders/multimodal/text_to_image.py
modelselect/graders/multimodal/_internal/__init__.py
modelselect/graders/multimodal/_internal/context_utils.py
modelselect/graders/multimodal/_internal/criteria_utils.py
modelselect/graders/multimodal/_internal/schema.py
modelselect/graders/text/__init__.py
modelselect/graders/text/number_accuracy.py
modelselect/graders/text/similarity.py
modelselect/graders/text/string_match.py
modelselect/graders/text/_utils/__init__.py
modelselect/graders/text/_utils/compute.py
modelselect/graders/text/_utils/normalization.py
modelselect/graders/text/_utils/setup_nltk_data.py
modelselect/graders/text/_utils/string_match_compute.py
modelselect/graders/text/_utils/tokenization.py
modelselect/models/__init__.py
modelselect/models/base_chat_model.py
modelselect/models/openai_chat_model.py
modelselect/models/qwen_vl_model.py
modelselect/models/formatter/__init__.py
modelselect/models/formatter/base_formatter.py
modelselect/models/formatter/dashscope_formatter.py
modelselect/models/schema/__init__.py
modelselect/models/schema/prompt_template.py
modelselect/models/schema/oai/__init__.py
modelselect/models/schema/oai/message.py
modelselect/models/schema/oai/response.py
modelselect/models/schema/qwen/__init__.py
modelselect/models/schema/qwen/mllmImage.py
modelselect/runner/__init__.py
modelselect/runner/base_runner.py
modelselect/runner/grading_runner.py
modelselect/runner/aggregator/__init__.py
modelselect/runner/aggregator/base_aggregator.py
modelselect/runner/aggregator/weighted_sum_aggregator.py
modelselect/utils/__init__.py
modelselect/utils/concurrency.py
modelselect/utils/grader_info.py
modelselect/utils/instance.py
modelselect/utils/mapping.py
modelselect/utils/tokenizer.py
modelselect/utils/utils.py
ms_modelselect.egg-info/PKG-INFO
ms_modelselect.egg-info/SOURCES.txt
ms_modelselect.egg-info/dependency_links.txt
ms_modelselect.egg-info/requires.txt
ms_modelselect.egg-info/top_level.txt
tests/test_batch_scenario_evaluation.py
tests/test_modelselect_final.py
tests/test_modelselect_integration.py
tests/test_scenario_evaluation.py
tests/analyzer/statistical/test_distribution_analyzer.py
tests/analyzer/validation/test_accuracy_analyzer.py
tests/analyzer/validation/test_consistency_analyzer.py
tests/analyzer/validation/test_correlation_analyzer.py
tests/analyzer/validation/test_f1_score_analyzer.py
tests/analyzer/validation/test_false_negative_analyzer.py
tests/analyzer/validation/test_false_positive_analyzer.py
tests/analyzer/validation/test_precision_analyzer.py
tests/analyzer/validation/test_recall_analyzer.py
tests/benchmarks/test_rewardbench2.py
tests/data/run_grader.py
tests/data/run_grader_eval_bfcl_dataset.py
tests/data/utils/tool_call/generate_bfcl_tool_call_data.py
tests/data/utils/tool_call/generate_new_cases.py
tests/data/utils/tool_call/llm_select_tools.py
tests/data/utils/tool_call/process_bfcl_tool_call_data.py
tests/docs/test_building_graders_custom.py
tests/docs/test_building_graders_overview.py
tests/generator/test_iterative_rubric.py
tests/generator/test_simple_rubric.py
tests/graders/test_llm_grader.py
tests/graders/agent/action/test_action_alignment.py
tests/graders/agent/action/test_action_loop.py
tests/graders/agent/memory/test_memory_accuracy.py
tests/graders/agent/memory/test_memory_detail_preservation.py
tests/graders/agent/memory/test_memory_retrieval_effectiveness.py
tests/graders/agent/observation/test_observation_information_gain.py
tests/graders/agent/plan/test_plan_feasibility.py
tests/graders/agent/reflection/test_reflection_accuracy.py
tests/graders/agent/reflection/test_reflection_outcome_understanding.py
tests/graders/agent/reflection/test_reflection_progress_awareness.py
tests/graders/agent/tool/test_tool_call_accuracy.py
tests/graders/agent/tool/test_tool_call_precision_recall_match.py
tests/graders/agent/tool/test_tool_call_step_sequence_match.py
tests/graders/agent/tool/test_tool_call_success.py
tests/graders/agent/tool/test_tool_parameter_check.py
tests/graders/agent/tool/test_tool_selection.py
tests/graders/agent/trajectory/test_trajectory_comprehensive.py
tests/graders/common/test_correctness.py
tests/graders/common/test_function_grader.py
tests/graders/common/test_hallucination.py
tests/graders/common/test_harmfulness.py
tests/graders/common/test_instruction_following.py
tests/graders/common/test_relevance.py
tests/graders/format/test_json_match.py
tests/graders/format/test_json_validator.py
tests/graders/multimodal/test_image_coherence.py
tests/graders/multimodal/test_image_helpfulness.py
tests/graders/multimodal/test_text_to_image.py
tests/graders/text/similarity/__init__.py
tests/graders/text/similarity/test_bleu.py
tests/graders/text/similarity/test_f1_score.py
tests/graders/text/similarity/test_fuzzy_match.py
tests/graders/text/similarity/test_rouge.py
tests/graders/text/string/test_string_match.py
tests/models/test_openai_chat_model.py
tests/models/schema/test_prompt_template.py
tests/runner/test_grading_runner.py
tests/runner/aggregator/test_weighted_sum_aggregator.py
tests/utils/test_grader_info.py
tests/utils/test_mapping.py