╔═══════════════════════════════════════════════════════════════════════════════╗
║                   SWE-BENCH TEST COVERAGE GAP ANALYSIS                        ║
║                            QUICK REFERENCE                                    ║
╚═══════════════════════════════════════════════════════════════════════════════╝

OVERALL METRICS:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
  Total Functions:        58 functions across 6 modules
  Functions Tested:       25 (43.1%)
  Functions NOT Tested:   33 (56.9%)
  
BY MODULE:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
  types.py                5/6   (83.3%)  ✓ Good
  score_official.py       3/4   (75.0%)  ✓ Good
  smoke.py                1/2   (50.0%)  ⚠ Moderate
  io.py                  10/25  (40.0%)  ✗ CRITICAL
  generate.py             5/14  (35.7%)  ✗ CRITICAL
  cli.py                  1/7   (14.3%)  ✗ CRITICAL


CRITICAL EXECUTION PATHS IN generate_lite_predictions (implemented in _generate_for_condition):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. SYNC PATH (jobs=1)
   Status:        ⚠ PARTIALLY TESTED (implicit via default)
   Code:          generate.py lines 293-325
   Key Functions: _propose_prediction(), _write_prediction_map()
   Issue:         _propose_prediction NOT UNIT TESTED
   Impact:        Medium - Patch generation bugs may not be caught

2. THREADPOOL PATH (jobs>1)  
   Status:        ✗ COMPLETELY UNTESTED
   Code:          generate.py lines 326-368
   Test Coverage: 0%
   Issue:         No tests with jobs > 1
   Impact:        CRITICAL - Race conditions and synchronization issues would go undetected
   Required Test: generate_lite_predictions(..., jobs=4)

3. ASYNC PATH (openai_async=True)
   Status:        ✓ TESTED
   Code:          generate.py lines 270-292
   Test Coverage: ~80% (via test_generate_lite_predictions_prefers_async_path_when_enabled)
   Impact:        Low - Main path tested

4. RESUME LOGIC (resume=True)
   Status:        ✗ COMPLETELY UNTESTED
   Code:          generate.py lines 255-262
   Test Coverage: 0%
   Issue:         No actual resume-with-partial-predictions test
   Impact:        CRITICAL - Data loss risk on resume
   Required Test: Write partial predictions, resume=True, validate counts


MOST CRITICAL UNTESTED FUNCTIONS:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

FROM generate.py:
  ✗ _generate_for_condition()        [LINE 238-380] MAIN ORCHESTRATION FUNCTION
  ✗ _propose_prediction()            [LINE 390-437] SYNC PATCH GENERATION
  ✗ _build_prompt()                  [LINE 606-640] CONSTRUCTS LLM PROMPT
  ✗ _write_prediction_map()          [LINE 596-603] WRITES PREDICTIONS FILE
  ✗ _select_instance_ids()           [LINE 208-235] INSTANCE FILTERING
  ✗ _load_snapshot_text()            [LINE 643-656] LOADS DEBUG SNAPSHOTS

FROM io.py:
  ✗ read_predictions()               [LINE 144-177] READS PREDICTION JSONL
  ✗ load_instance_ids()              [LINE 118-129] READS INSTANCE LIST
  ✗ ensure_run_layout()              [LINE 99-103]  CREATES DIRECTORY STRUCTURE
  ✗ load_json()                      [LINE 111-115] LOADS JSON FILES
  ✗ validate_run_id()                [LINE 27-35]   VALIDATES RUN ID

FROM cli.py:
  ✗ _add_generate_arguments()        [LINE 30-63]   ARGUMENT SETUP
  ✗ _add_score_arguments()           [LINE 70-81]   ARGUMENT SETUP
  ✗ _add_prepare_arguments()         [LINE 65-68]   ARGUMENT SETUP


RECOMMENDED TESTS TO ADD (PRIORITY ORDER):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. [HIGH] Test ThreadPoolExecutor path with jobs > 1
   File:       test_evals_swebench_generate.py
   Effort:     1-2 hours
   Impact:     Catches race conditions, thread safety issues

2. [HIGH] Test resume logic with partial predictions
   File:       test_evals_swebench_generate.py
   Effort:     1-2 hours  
   Impact:     Prevents data loss on resume

3. [HIGH] Unit test _propose_prediction() with error cases
   File:       test_evals_swebench_generate.py
   Effort:     30 minutes
   Impact:     Catches sync patch generation bugs

4. [MEDIUM] Unit test read_predictions() with edge cases
   File:       test_evals_swebench_io.py
   Effort:     45 minutes
   Impact:     Validates JSONL parsing robustness

5. [MEDIUM] Unit test _build_prompt() with snapshots
   File:       test_evals_swebench_generate.py
   Effort:     45 minutes
   Impact:     Ensures correct LLM prompt construction

6. [MEDIUM] Unit test _select_instance_ids() with max_instances
   File:       test_evals_swebench_generate.py
   Effort:     30 minutes
   Impact:     Validates instance filtering

7. [MEDIUM] Unit test load_instance_ids() with malformed data
   File:       test_evals_swebench_io.py
   Effort:     45 minutes
   Impact:     Validates instance list robustness


SPECIFIC CODE LOCATIONS FOR FOCUS:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

generate.py:
  Lines 39-167    generate_lite_predictions() - MAIN API (partial coverage)
  Lines 238-380   _generate_for_condition() - ORCHESTRATION (no direct tests; async branch covered indirectly)
  Lines 270-292   if openai_async: branch (tested)
  Lines 293-325   elif jobs == 1: branch (implicit test only)
  Lines 326-368   else: ThreadPoolExecutor branch (NOT TESTED!)
  Lines 390-437   _propose_prediction() (NOT TESTED!)
  Lines 255-262   Resume logic (NOT TESTED!)

io.py:
  Lines 144-177   read_predictions() - CRITICAL (no tests)
  Lines 118-129   load_instance_ids() (no tests)
  Lines 99-103    ensure_run_layout() (no tests)


TEST FILE LOCATIONS:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
  tests/test_evals_swebench_io.py                     (138 lines)
  tests/test_evals_swebench_generate.py               (50 lines - sparse!)
  tests/test_evals_swebench_generate_async.py         (106 lines)
  tests/test_evals_swebench_generate_health.py        (59 lines)
  tests/test_evals_swebench_cli.py                    (173 lines)
  tests/test_evals_swebench_smoke.py                  (74 lines)
  tests/test_evals_swebench_score_cmd.py              (128 lines)
