Decode distribution drift detected: the candidate generates different tokens than fp16 on short-context prompts.
Brittleness to small input changes: candidate's output drifts more than fp16's under typo / casing / punctuation perturbations.
| length \ position | 0.10 | 0.50 | 0.90 |
|---|---|---|---|
| 4096 | 1.00 / 1.00 | 1.00 / 1.00 | 1.00 / 1.00 |
| 8192 | 1.00 / 1.00 | 1.00 / 1.00 | 1.00 / 1.00 |
| 16384 | n/a | n/a | n/a |
| perturbation | score | band | distribution | |
|---|---|---|---|---|
| typo | 70.65 | Degraded | ||
| case | 78.88 | Degraded | ||
| punct | 80.55 | Pass | ||
| paraphrase | skipped | N/A | didn't apply on these prompts | |
python3 -m refract.cli score --model Qwen2.5-7B-Instruct-Q8_0.gguf --reference ctk=f16,ctv=f16 --candidate ctk=q8_0,ctv=turbo4 --prompts refract/prompts/v0.1.jsonl --full --rniah-up-to 16384 --json-out report.json --html-out report.html
{
"schema": "refract.report.v0.3.1",
"framework_version": "0.3.2",
"environment": {},
"repro_command": "",
"timestamp": "2026-04-30T16:23:41",
"score_direction": "higher_is_better",
"score_range": [
0,
100
],
"model": "/Users/tom/local_llms/models/Qwen2.5-7B-Instruct-Q8_0.gguf",
"reference": "ctk=f16,ctv=f16",
"candidate": "ctk=q8_0,ctv=turbo4",
"composite": 77.97857717848223,
"band": "DEGRADED",
"summary": "Visible drift. Audit on your workload before deploying.",
"diagnosis": [
"Decode distribution drift detected: the candidate generates different tokens than fp16 on short-context prompts.",
"Brittleness to small input changes: candidate's output drifts more than fp16's under typo / casing / punctuation perturbations."
],
"composite_detail": {
"gtm_score": 55.13333333333333,
"kld_score": 98.75274353103794,
"rniah_score": 100.0,
"plad_score": 76.734244212101,
"floor_score": null,
"floor_ok": null,
"floor_min": 99.5,
"notes": []
},
"axes": {
"gtm": {
"score": 55.13333333333333,
"full_match_rate": 0.36666666666666664,
"median_first_divergence": 6,
"mean_prefix_agreement_length": 27.566666666666666,
"mean_cand_length": 50.0,
"mean_ref_length": 48.56666666666667,
"n_prompts": 30,
"n_tokens_each": 50,
"per_prompt": [],
"notes": [],
"band": "FAIL",
"description": "Token-level agreement with the fp16 reference."
},
"kld": {
"score": 98.75274353103794,
"mean_kld": 0.012551,
"ppl": null,
"rms_dp_pct": null,
"same_topp_pct": null,
"base_path": "",
"chunks": 32,
"ctx": 512,
"is_self_reference": false,
"corpus": null,
"band": "EXCELLENT",
"description": "Distribution-level divergence from the fp16 reference."
},
"rniah": {
"score": 100.0,
"n_cells": 9,
"cells": [
{
"length": 4096,
"position": 0.1,
"n_trials": 1,
"base_acc": 1.0,
"cand_acc": 1.0,
"degradation": 0.0,
"base_hits": 1,
"cand_hits": 1
},
{
"length": 4096,
"position": 0.5,
"n_trials": 1,
"base_acc": 1.0,
"cand_acc": 1.0,
"degradation": 0.0,
"base_hits": 1,
"cand_hits": 1
},
{
"length": 4096,
"position": 0.9,
"n_trials": 1,
"base_acc": 1.0,
"cand_acc": 1.0,
"degradation": 0.0,
"base_hits": 1,
"cand_hits": 1
},
{
"length": 8192,
"position": 0.1,
"n_trials": 1,
"base_acc": 1.0,
"cand_acc": 1.0,
"degradation": 0.0,
"base_hits": 1,
"cand_hits": 1
},
{
"length": 8192,
"position": 0.5,
"n_trials": 1,
"base_acc": 1.0,
"cand_acc": 1.0,
"degradation": 0.0,
"base_hits": 1,
"cand_hits": 1
},
{
"length": 8192,
"position": 0.9,
"n_trials": 1,
"base_acc": 1.0,
"cand_acc": 1.0,
"degradation": 0.0,
"base_hits": 1,
"cand_hits": 1
},
{
"length": 16384,
"position": 0.1,
"n_trials": 1,
"base_acc": 0.0,
"cand_acc": 0.0,
"degradation": 0.0,
"base_hits": 0,
"cand_hits": 0
},
{
"length": 16384,
"position": 0.5,
"n_trials": 1,
"base_acc": 0.0,
"cand_acc": 0.0,
"degradation": 0.0,
"base_hits": 0,
"cand_hits": 0
},
{
"length": 16384,
"position": 0.9,
"n_trials": 1,
"base_acc": 0.0,
"cand_acc": 0.0,
"degradation": 0.0,
"base_hits": 0,
"cand_hits": 0
}
],
"skipped_cells": [],
"needle": "Note: APRICOT-7-BLUE is the rare paint color featured in this article.",
"password_keyword": "APRICOT-7-BLUE",
"notes": [],
"confidence": "ok",
"base_acc_avg": 0.6666666666666666,
"band": "EXCELLENT",
"description": "Long-context retrieval quality vs the reference."
},
"plad": {
"score": 76.734244212101,
"per_perturbation_score": {
"typo": 70.64992294338457,
"case": 78.88327659567271,
"punct": 80.55044933047421,
"paraphrase": NaN
},
"per_prompt": [],
"n_prompts": 30,
"n_perturbations": 4,
"notes": [
"36 (prompt, perturbation) pairs were skipped (perturbation could not apply, e.g. no \u22654-char word for typo)."
],
"skipped_perturbations": [
"paraphrase"
],
"confidence": "partial",
"band": "DEGRADED",
"description": "Robustness to small prompt changes vs the reference."
}
},
"extras": {}
}